diff --git a/Android.bp b/Android.bp index 14290b9e1c9d0926add9eb57a6c0054187e7c025..768120577050042ad02419034385febd838a8c7e 100644 --- a/Android.bp +++ b/Android.bp @@ -28,12 +28,6 @@ opencl_srcs = [ "src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl", "src/core/CL/cl_kernels/common/elementwise_unary.cl", "src/core/CL/cl_kernels/common/elementwise_unary_quantized.cl", - "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h", - "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl", - "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl", - "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl", - "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h", - "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h", "src/core/CL/cl_kernels/common/fft.cl", "src/core/CL/cl_kernels/common/fft_digit_reverse.cl", "src/core/CL/cl_kernels/common/fft_scale.cl", @@ -53,6 +47,7 @@ opencl_srcs = [ "src/core/CL/cl_kernels/common/mat_mul.cl", "src/core/CL/cl_kernels/common/mat_mul_mmul.cl", "src/core/CL/cl_kernels/common/mat_mul_quantized.cl", + "src/core/CL/cl_kernels/common/mat_mul_quantized_mmul.cl", "src/core/CL/cl_kernels/common/mean_stddev_normalization.cl", "src/core/CL/cl_kernels/common/memset.cl", "src/core/CL/cl_kernels/common/minmax_layer.cl", @@ -73,7 +68,6 @@ opencl_srcs = [ "src/core/CL/cl_kernels/common/select.cl", "src/core/CL/cl_kernels/common/slice_ops.cl", "src/core/CL/cl_kernels/common/softmax_layer.cl", - "src/core/CL/cl_kernels/common/softmax_layer_quantized.cl", "src/core/CL/cl_kernels/common/stack_layer.cl", "src/core/CL/cl_kernels/common/tile.cl", "src/core/CL/cl_kernels/common/transpose.cl", @@ -218,9 +212,12 @@ cc_library_static { "src/core/AccessWindowAutoPadding.cpp", "src/core/AccessWindowStatic.cpp", "src/core/AccessWindowTranspose.cpp", + "src/core/CL/CLCommandBuffer.cpp", + "src/core/CL/CLCompatCommandBuffer.cpp", "src/core/CL/CLCompileContext.cpp", "src/core/CL/CLHelpers.cpp", "src/core/CL/CLKernelLibrary.cpp", + "src/core/CL/CLMutableCommandBuffer.cpp", "src/core/CL/CLUtils.cpp", "src/core/CL/DefaultLWSHeuristics.cpp", "src/core/CL/ICLKernel.cpp", @@ -396,6 +393,7 @@ cc_library_static { "src/core/Validate.cpp", "src/core/Version.cpp", "src/core/helpers/SoftmaxHelpers.cpp", + "src/core/helpers/Utils.cpp", "src/core/helpers/WindowHelpers.cpp", "src/core/utils/ActivationFunctionUtils.cpp", "src/core/utils/AssemblyUtils.cpp", @@ -403,6 +401,7 @@ cc_library_static { "src/core/utils/DataTypeUtils.cpp", "src/core/utils/FormatUtils.cpp", "src/core/utils/InterpolationPolicyUtils.cpp", + "src/core/utils/Math.cpp", "src/core/utils/ScaleUtils.cpp", "src/core/utils/StringUtils.cpp", "src/core/utils/helpers/fft.cpp", @@ -485,11 +484,9 @@ cc_library_static { "src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp", "src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp", "src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp", - "src/cpu/kernels/cast/generic/neon/bfloat16.cpp", "src/cpu/kernels/cast/generic/neon/fp16.cpp", "src/cpu/kernels/crop/generic/neon/fp16.cpp", "src/cpu/kernels/crop/generic/neon/fp32.cpp", - "src/cpu/kernels/crop/generic/neon/impl.cpp", "src/cpu/kernels/crop/generic/neon/integer.cpp", "src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp", 
"src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp", @@ -497,8 +494,11 @@ cc_library_static { "src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp", "src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp", "src/cpu/kernels/directconv2d/nchw/all.cpp", + "src/cpu/kernels/directconv2d/nchw/fp16.cpp", + "src/cpu/kernels/directconv2d/nhwc/neon/fp16.cpp", "src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp", "src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp", + "src/cpu/kernels/directconv2d/nhwc/neon/qasymm8.cpp", "src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp", "src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp", "src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp", @@ -506,7 +506,6 @@ cc_library_static { "src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp", "src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp", "src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp", - "src/cpu/kernels/elementwise_unary/generic/neon/impl.cpp", "src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp", "src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp", "src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp", @@ -515,11 +514,9 @@ cc_library_static { "src/cpu/kernels/floor/neon/fp32.cpp", "src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp", "src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp", - "src/cpu/kernels/fuse_batch_normalization/generic/impl.cpp", "src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp", "src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp", "src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp", - "src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.cpp", "src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp", "src/cpu/kernels/gemm_matrix_add/generic/neon/fp32.cpp", "src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp", @@ -537,11 +534,9 @@ cc_library_static { "src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp", "src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp", "src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp", - "src/cpu/kernels/l2normlayer/generic/neon/impl.cpp", "src/cpu/kernels/lut/generic/neon/u8.cpp", "src/cpu/kernels/maxunpool/generic/neon/fp16.cpp", "src/cpu/kernels/maxunpool/generic/neon/fp32.cpp", - "src/cpu/kernels/maxunpool/generic/neon/impl.cpp", "src/cpu/kernels/maxunpool/generic/neon/qasymm8.cpp", "src/cpu/kernels/maxunpool/generic/neon/qasymm8_signed.cpp", "src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp", @@ -555,16 +550,13 @@ cc_library_static { "src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp", "src/cpu/kernels/pool3d/neon/fp16.cpp", "src/cpu/kernels/pool3d/neon/fp32.cpp", - "src/cpu/kernels/pool3d/neon/impl.cpp", "src/cpu/kernels/pool3d/neon/qasymm8.cpp", "src/cpu/kernels/pool3d/neon/qasymm8_signed.cpp", "src/cpu/kernels/range/generic/neon/fp16.cpp", "src/cpu/kernels/range/generic/neon/fp32.cpp", - "src/cpu/kernels/range/generic/neon/impl.cpp", "src/cpu/kernels/range/generic/neon/integer.cpp", "src/cpu/kernels/roialign/generic/neon/fp16.cpp", "src/cpu/kernels/roialign/generic/neon/fp32.cpp", - "src/cpu/kernels/roialign/generic/neon/impl.cpp", "src/cpu/kernels/roialign/generic/neon/qasymm8.cpp", "src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp", "src/cpu/kernels/scale/neon/fp16.cpp", @@ -573,13 +565,13 @@ cc_library_static { "src/cpu/kernels/scale/neon/qasymm8_signed.cpp", "src/cpu/kernels/select/generic/neon/fp16.cpp", "src/cpu/kernels/select/generic/neon/fp32.cpp", - "src/cpu/kernels/select/generic/neon/impl.cpp", 
"src/cpu/kernels/select/generic/neon/integer.cpp", "src/cpu/kernels/softmax/generic/neon/fp16.cpp", "src/cpu/kernels/softmax/generic/neon/fp32.cpp", "src/cpu/kernels/softmax/generic/neon/impl.cpp", "src/cpu/kernels/softmax/generic/neon/qasymm8.cpp", "src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp", + "src/cpu/kernels/sub/neon/fp16.cpp", "src/cpu/kernels/sub/neon/qasymm8.cpp", "src/cpu/kernels/sub/neon/qasymm8_signed.cpp", "src/cpu/kernels/sub/neon/qsymm16.cpp", @@ -628,6 +620,7 @@ cc_library_static { "src/dynamic_fusion/sketch/attributes/ClampAttributes.cpp", "src/dynamic_fusion/sketch/attributes/Conv2dAttributes.cpp", "src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp", + "src/dynamic_fusion/sketch/attributes/MatMulAttributes.cpp", "src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp", "src/dynamic_fusion/sketch/attributes/ReshapeAttributes.cpp", "src/dynamic_fusion/sketch/attributes/ResizeAttributes.cpp", @@ -647,8 +640,12 @@ cc_library_static { "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp", "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp", "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp", + "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp", "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp", "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp", + "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp", + "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp", + "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp", "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp", "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp", "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp", @@ -657,6 +654,7 @@ cc_library_static { "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp", "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp", "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp", + "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp", "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp", "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp", "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp", @@ -666,6 +664,7 @@ cc_library_static { "src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp", "src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp", "src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp", + "src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp", "src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp", "src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp", "src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp", @@ -729,6 +728,7 @@ cc_library_static { "src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp", "src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp", "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp", + "src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp", "src/gpu/cl/kernels/ClMatMulNativeKernel.cpp", "src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp", "src/gpu/cl/kernels/ClMulKernel.cpp", @@ -756,6 +756,7 @@ cc_library_static { "src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp", "src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp", 
"src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp", + "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp", "src/gpu/cl/operators/ClActivation.cpp", "src/gpu/cl/operators/ClAdd.cpp", "src/gpu/cl/operators/ClCast.cpp", @@ -1310,6 +1311,7 @@ cc_library_static { "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/a55.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp16fp32fp16_dot_16VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp", diff --git a/CMakeLists.txt b/CMakeLists.txt index bf029a7e9e15d032c4f6da46f74ccd9f34b0783c..9dd3e2cef7b01bdef701b586dcb119646d6fdc07 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,7 +28,7 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR) list(APPEND CMAKE_MESSAGE_CONTEXT ArmCompute) project( ArmCompute - VERSION 32.0.0 + VERSION 33.0.0 DESCRIPTION "The Arm Compute Library is a collection of low-level machine learning functions optimized for Arm® Cortex®-A CPU and Arm® Mali™ GPU architectures" LANGUAGES C CXX ASM) diff --git a/README.md b/README.md index a8f0def7a18b9ae2a940dcda026142f00c86b9a9..9b06dbeabf145377e4d87bf3a018111f660ef9c9 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@

-# Compute Library ![](https://img.shields.io/badge/latest_release-23.08-green)
+# Compute Library ![](https://img.shields.io/badge/latest_release-23.11-green)
 The Compute Library is a collection of low-level machine learning functions optimized for Arm® Cortex®-A, Arm® Neoverse® and Arm® Mali™ GPUs architectures.
@@ -44,7 +44,7 @@ Key Features:
## Documentation -[![Documentation](https://img.shields.io/badge/documentation-23.08-green)](https://arm-software.github.io/ComputeLibrary/latest) +[![Documentation](https://img.shields.io/badge/documentation-23.11-green)](https://arm-software.github.io/ComputeLibrary/latest) > Note: The documentation includes the reference API, changelogs, build guide, contribution guide, errata, etc. @@ -57,24 +57,24 @@ All the binaries can be downloaded from [here](https://github.com/ARM-software/C | Platform | Operating System | Release archive (Download) | | -------------- | ---------------- | -------------------------- | -| Raspberry Pi 4 | Linux® 32bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.08/arm_compute-v23.08-bin-linux-armv7a-neon.tar.gz) | -| Raspberry Pi 4 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.08/arm_compute-v23.08-bin-linux-arm64-v8a-neon.tar.gz) | -| Odroid N2 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.08/arm_compute-v23.08-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.08/arm_compute-v23.08-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.08/arm_compute-v23.08-bin-linux-arm64-v8a-neon-cl.tar.gz) | -| HiKey960 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.08/arm_compute-v23.08-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.08/arm_compute-v23.08-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.08/arm_compute-v23.08-bin-linux-arm64-v8a-neon-cl.tar.gz) | +| Raspberry Pi 4 | Linux® 32bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-armv7a-neon.tar.gz) | +| Raspberry Pi 4 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-neon.tar.gz) | +| Odroid N2 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-neon-cl.tar.gz) | +| HiKey960 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-neon.tar.gz) 
[![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-neon-cl.tar.gz) |
| Architecture | Operating System | Release archive (Download) | | ------------ | ---------------- | -------------------------- | -| armv7 | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.08/arm_compute-v23.08-bin-linux-armv7a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.08/arm_compute-v23.08-bin-linux-armv7a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.08/arm_compute-v23.08-bin-linux-armv7a-neon-cl.tar.gz) | -| arm64-v8a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.08/arm_compute-v23.08-bin-android-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.08/arm_compute-v23.08-bin-android-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.08/arm_compute-v23.08-bin-android-arm64-v8a-neon-cl.tar.gz) | -| arm64-v8a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.08/arm_compute-v23.08-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.08/arm_compute-v23.08-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.08/arm_compute-v23.08-bin-linux-arm64-v8a-neon-cl.tar.gz) | -| arm64-v8.2-a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.08/arm_compute-v23.08-bin-android-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.08/arm_compute-v23.08-bin-android-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.08/arm_compute-v23.08-bin-android-arm64-v8.2-a-neon-cl.tar.gz) | -| arm64-v8.2-a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.08/arm_compute-v23.08-bin-linux-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.08/arm_compute-v23.08-bin-linux-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.08/arm_compute-v23.08-bin-linux-arm64-v8.2-a-neon-cl.tar.gz) | +| armv7 | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-armv7a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-armv7a-cl.tar.gz) 
[![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-armv7a-neon-cl.tar.gz) | +| arm64-v8a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-android-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-android-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-android-arm64-v8a-neon-cl.tar.gz) | +| arm64-v8a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-neon-cl.tar.gz) | +| arm64-v8.2-a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-android-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-android-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-android-arm64-v8.2-a-neon-cl.tar.gz) | +| arm64-v8.2-a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8.2-a-neon-cl.tar.gz) |
-Please refer to the following link for more pre-built binaries: [![](https://img.shields.io/badge/v23.08-bins-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/tag/v23.08) +Please refer to the following link for more pre-built binaries: [![](https://img.shields.io/badge/v23.11-bins-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/tag/v23.11) Pre-build binaries are generated with the following security / good coding practices related flags: > -Wall, -Wextra, -Wformat=2, -Winit-self, -Wstrict-overflow=2, -Wswitch-default, -Woverloaded-virtual, -Wformat-security, -Wctor-dtor-privacy, -Wsign-promo, -Weffc++, -pedantic, -fstack-protector-strong diff --git a/SConscript b/SConscript index 467f84cb55ae335738975332b2859aa56c10be97..099ff706add8f8254f9c68af0a8c07a5998e9eca 100644 --- a/SConscript +++ b/SConscript @@ -38,8 +38,8 @@ warn(DeprecatedWarning, " Link your application only to libarm_compute for core library functionality" ) -VERSION = "v23.08" -LIBRARY_VERSION_MAJOR = 32 +VERSION = "v23.11" +LIBRARY_VERSION_MAJOR = 33 LIBRARY_VERSION_MINOR = 0 LIBRARY_VERSION_PATCH = 0 SONAME_VERSION = str(LIBRARY_VERSION_MAJOR) + "." + str(LIBRARY_VERSION_MINOR) + "." + str(LIBRARY_VERSION_PATCH) @@ -222,7 +222,7 @@ def resolve_includes(target, source, env): found = pattern.search(line) if found: # Only get the header file name and discard the relative path. - # E.g. "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h" -> "fp_mixed_precision_helpers.h" + # E.g. "src/core/CL/cl_kernels/activation_float_helpers.h" -> "activation_float_helpers.h" include_file = found.group(1).split('/')[-1] data = files_dict[include_file].file_contents updated_file.extend(data) @@ -387,9 +387,6 @@ if env['opencl'] and env['embed_kernels']: 'src/core/CL/cl_kernels/tile_helpers.h', 'src/core/CL/cl_kernels/types.h', 'src/core/CL/cl_kernels/warp_helpers.h', - 'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h', - 'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h', - 'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h', ] # Common kernels @@ -414,9 +411,6 @@ if env['opencl'] and env['embed_kernels']: 'src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl', 'src/core/CL/cl_kernels/common/elementwise_unary.cl', 'src/core/CL/cl_kernels/common/elementwise_unary_quantized.cl', - 'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl', - 'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl', - 'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl', 'src/core/CL/cl_kernels/common/fft_digit_reverse.cl', 'src/core/CL/cl_kernels/common/fft.cl', 'src/core/CL/cl_kernels/common/fft_scale.cl', @@ -436,6 +430,7 @@ if env['opencl'] and env['embed_kernels']: 'src/core/CL/cl_kernels/common/mat_mul.cl', 'src/core/CL/cl_kernels/common/mat_mul_mmul.cl', 'src/core/CL/cl_kernels/common/mat_mul_quantized.cl', + 'src/core/CL/cl_kernels/common/mat_mul_quantized_mmul.cl', 'src/core/CL/cl_kernels/common/mean_stddev_normalization.cl', 'src/core/CL/cl_kernels/common/memset.cl', 'src/core/CL/cl_kernels/common/minmax_layer.cl', @@ -456,7 +451,6 @@ if env['opencl'] and env['embed_kernels']: 'src/core/CL/cl_kernels/common/select.cl', 'src/core/CL/cl_kernels/common/slice_ops.cl', 
'src/core/CL/cl_kernels/common/softmax_layer.cl', - 'src/core/CL/cl_kernels/common/softmax_layer_quantized.cl', 'src/core/CL/cl_kernels/common/stack_layer.cl', 'src/core/CL/cl_kernels/common/tile.cl', 'src/core/CL/cl_kernels/common/transpose.cl', diff --git a/arm_compute/Acl.h b/arm_compute/Acl.h index 1cd45d5756aeddc9102c730c9d19ae9068b91b8f..3e99a00cbe326deaf9cc130a3d3c1f166b381a2b 100644 --- a/arm_compute/Acl.h +++ b/arm_compute/Acl.h @@ -25,7 +25,8 @@ #define ARM_COMPUTE_ACL_H_ #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif /* __cplusplus */ /* Core headers */ diff --git a/arm_compute/Acl.hpp b/arm_compute/Acl.hpp index 55e04e876d2b529981a7e72d9c620e565728e6b8..6a9d585c142d1bb03e29964f2064b0e798b46849 100644 --- a/arm_compute/Acl.hpp +++ b/arm_compute/Acl.hpp @@ -75,7 +75,7 @@ struct ObjectDeleter #define OBJECT_DELETER(obj, func) \ template <> \ struct ObjectDeleter \ - \ + \ { \ static inline AclStatus Destroy(obj v) \ { \ @@ -171,7 +171,7 @@ protected: ObjectBase() = default; protected: - std::shared_ptr _object{ nullptr }; /**< Library object */ + std::shared_ptr _object{nullptr}; /**< Library object */ }; /** Equality operator for library object @@ -221,8 +221,7 @@ public: * @param[in] status Status returned * @param[in] msg Error message to be bound with the exception */ - Status(StatusCode status, const std::string &msg) - : _status(status), _msg(msg) + Status(StatusCode status, const std::string &msg) : _status(status), _msg(msg) { } /** Returns an explanatory exception message @@ -266,7 +265,7 @@ private: */ static inline void report_status(StatusCode status, const std::string &msg) { - if(status != StatusCode::Success) + if (status != StatusCode::Success) { throw Status(status, msg); } @@ -299,7 +298,8 @@ enum class Target /**< Available execution modes */ enum class ExecutionMode { - FastRerun = AclPreferFastRerun, /**< Prefer minimum latency in consecutive runs, might introduce higher startup times */ + FastRerun = + AclPreferFastRerun, /**< Prefer minimum latency in consecutive runs, might introduce higher startup times */ FastStart = AclPreferFastStart, /**< Prefer minimizing startup time */ }; @@ -372,8 +372,7 @@ public: * @param[in] target Target to create context for * @param[out] status Status information if requested */ - explicit Context(Target target, StatusCode *status = nullptr) - : Context(target, Options(), status) + explicit Context(Target target, StatusCode *status = nullptr) : Context(target, Options(), status) { } /** Constructor @@ -385,10 +384,11 @@ public: Context(Target target, const Options &options, StatusCode *status = nullptr) { AclContext ctx; - const auto st = detail::as_enum(AclCreateContext(&ctx, detail::as_cenum(target), &options.copts)); + const auto st = + detail::as_enum(AclCreateContext(&ctx, detail::as_cenum(target), &options.copts)); reset(ctx); report_status(st, "[Compute Library] Failed to create context"); - if(status) + if (status) { *status = st; } @@ -424,15 +424,13 @@ public: * As default options, no tuning will be performed, and the number of scheduling units will * depends on internal device discovery functionality */ - Options() - : opts{ AclTuningModeNone, 0 } {}; + Options() : opts{AclTuningModeNone, 0} {}; /** Constructor * * @param[in] mode Tuning mode to be used * @param[in] compute_units Number of scheduling units to be used */ - Options(TuningMode mode, int32_t compute_units) - : opts{ detail::as_cenum(mode), compute_units } + Options(TuningMode mode, int32_t compute_units) : opts{detail::as_cenum(mode), 
compute_units} { } @@ -448,8 +446,7 @@ public: * @param[in] ctx Context to create queue for * @param[out] status Status information if requested */ - explicit Queue(Context &ctx, StatusCode *status = nullptr) - : Queue(ctx, Options(), status) + explicit Queue(Context &ctx, StatusCode *status = nullptr) : Queue(ctx, Options(), status) { } /** Constructor @@ -466,7 +463,7 @@ public: const auto st = detail::as_enum(AclCreateQueue(&queue, ctx.get(), &options.opts)); reset(queue); report_status(st, "[Compute Library] Failed to create queue!"); - if(status) + if (status) { *status = st; } @@ -508,8 +505,7 @@ public: * @param[in] shape Shape of the tensor * @param[in] data_type Data type of the tensor */ - TensorDescriptor(const std::vector &shape, DataType data_type) - : _shape(shape), _data_type(data_type) + TensorDescriptor(const std::vector &shape, DataType data_type) : _shape(shape), _data_type(data_type) { _cdesc.ndims = _shape.size(); _cdesc.shape = _shape.data(); @@ -526,7 +522,7 @@ public: _cdesc = desc; _data_type = detail::as_enum(desc.data_type); _shape.reserve(desc.ndims); - for(int32_t d = 0; d < desc.ndims; ++d) + for (int32_t d = 0; d < desc.ndims; ++d) { _shape.emplace_back(desc.shape[d]); } @@ -552,9 +548,9 @@ public: is_same &= _data_type == other._data_type; is_same &= _shape.size() == other._shape.size(); - if(is_same) + if (is_same) { - for(uint32_t d = 0; d < _shape.size(); ++d) + for (uint32_t d = 0; d < _shape.size(); ++d) { is_same &= _shape[d] == other._shape[d]; } @@ -592,8 +588,7 @@ public: * @param[in] desc Tensor descriptor to be used * @param[out] status Status information if requested */ - Tensor(Context &ctx, const TensorDescriptor &desc, StatusCode *status = nullptr) - : Tensor(ctx, desc, true, status) + Tensor(Context &ctx, const TensorDescriptor &desc, StatusCode *status = nullptr) : Tensor(ctx, desc, true, status) { } /** Constructor @@ -609,7 +604,7 @@ public: const auto st = detail::as_enum(AclCreateTensor(&tensor, ctx.get(), desc.get(), allocate)); reset(tensor); report_status(st, "[Compute Library] Failed to create tensor!"); - if(status) + if (status) { *status = st; } @@ -646,7 +641,8 @@ public: */ StatusCode import(void *handle, ImportType type) { - const auto st = detail::as_enum(AclTensorImport(_object.get(), handle, detail::as_cenum(type))); + const auto st = detail::as_enum( + AclTensorImport(_object.get(), handle, detail::as_cenum(type))); report_status(st, "[Compute Library] Failed to import external memory to tensor!"); return st; } @@ -658,7 +654,7 @@ public: */ uint64_t get_size() { - uint64_t size{ 0 }; + uint64_t size{0}; const auto st = detail::as_enum(AclGetTensorSize(_object.get(), &size)); report_status(st, "[Compute Library] Failed to get the size of the tensor"); return size; @@ -692,13 +688,12 @@ public: * @param[in] tensor_ Tensor to pack * @param[in] slot_id_ Slot identification of the tensor in respect with the operator */ - PackPair(Tensor *tensor_, int32_t slot_id_) - : tensor(tensor_), slot_id(slot_id_) + PackPair(Tensor *tensor_, int32_t slot_id_) : tensor(tensor_), slot_id(slot_id_) { } - Tensor *tensor{ nullptr }; /**< Tensor object */ - int32_t slot_id{ AclSlotUnknown }; /**< Slot id in respect with the operator */ + Tensor *tensor{nullptr}; /**< Tensor object */ + int32_t slot_id{AclSlotUnknown}; /**< Slot id in respect with the operator */ }; public: @@ -713,7 +708,7 @@ public: const auto st = detail::as_enum(AclCreateTensorPack(&pack, ctx.get())); reset(pack); report_status(st, "[Compute Library] Failure during tensor 
pack creation"); - if(status) + if (status) { *status = st; } @@ -741,7 +736,7 @@ public: std::vector slots(size); std::vector tensors(size); int i = 0; - for(auto &p : packed) + for (auto &p : packed) { slots[i] = p.slot_id; tensors[i] = AclTensor(p.tensor); @@ -780,13 +775,17 @@ using ActivationDesc = AclActivationDescriptor; class Activation : public Operator { public: - Activation(Context &ctx, const TensorDescriptor &src, const TensorDescriptor &dst, const ActivationDesc &desc, StatusCode *status = nullptr) + Activation(Context &ctx, + const TensorDescriptor &src, + const TensorDescriptor &dst, + const ActivationDesc &desc, + StatusCode *status = nullptr) { AclOperator op; const auto st = detail::as_enum(AclActivation(&op, ctx.get(), src.get(), dst.get(), desc)); reset(op); report_status(st, "[Compute Library] Failure during Activation operator creation"); - if(status) + if (status) { *status = st; } diff --git a/arm_compute/AclDescriptors.h b/arm_compute/AclDescriptors.h index a564bd2141b1023bbdccd0e4a5e4afe972213c59..cdaf7c0dc80f45750bf73df5eb3cc0b477c97db7 100644 --- a/arm_compute/AclDescriptors.h +++ b/arm_compute/AclDescriptors.h @@ -25,37 +25,38 @@ #define ARM_COMPUTE_ACL_DESCRIPTORS_H_ #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif /** __cplusplus */ -/**< Supported activation types */ -typedef enum -{ - AclActivationTypeNone = 0, /**< No activation */ - AclIdentity = 1, /**< Identity */ - AclLogistic = 2, /**< Logistic */ - AclTanh = 3, /**< Hyperbolic tangent */ - AclRelu = 4, /**< Rectifier */ - AclBoundedRelu = 5, /**< Upper Bounded Rectifier */ - AclLuBoundedRelu = 6, /**< Lower and Upper Bounded Rectifier */ - AclLeakyRelu = 7, /**< Leaky Rectifier */ - AclSoftRelu = 8, /**< Soft Rectifier */ - AclElu = 9, /**< Exponential Linear Unit */ - AclAbs = 10, /**< Absolute */ - AclSquare = 11, /**< Square */ - AclSqrt = 12, /**< Square root */ - AclLinear = 13, /**< Linear */ - AclHardSwish = 14, /**< Hard-swish */ -} AclActivationType; + /**< Supported activation types */ + typedef enum + { + AclActivationTypeNone = 0, /**< No activation */ + AclIdentity = 1, /**< Identity */ + AclLogistic = 2, /**< Logistic */ + AclTanh = 3, /**< Hyperbolic tangent */ + AclRelu = 4, /**< Rectifier */ + AclBoundedRelu = 5, /**< Upper Bounded Rectifier */ + AclLuBoundedRelu = 6, /**< Lower and Upper Bounded Rectifier */ + AclLeakyRelu = 7, /**< Leaky Rectifier */ + AclSoftRelu = 8, /**< Soft Rectifier */ + AclElu = 9, /**< Exponential Linear Unit */ + AclAbs = 10, /**< Absolute */ + AclSquare = 11, /**< Square */ + AclSqrt = 12, /**< Square root */ + AclLinear = 13, /**< Linear */ + AclHardSwish = 14, /**< Hard-swish */ + } AclActivationType; -/**< Activation layer descriptor */ -typedef struct -{ - AclActivationType type; /**< Activation type */ - float a; /**< Factor &alpha used by some activations */ - float b; /**< Factor &beta used by some activations */ - bool inplace; /**< Hint that src and dst tensors will be the same */ -} AclActivationDescriptor; + /**< Activation layer descriptor */ + typedef struct + { + AclActivationType type; /**< Activation type */ + float a; /**< Factor &alpha used by some activations */ + float b; /**< Factor &beta used by some activations */ + bool inplace; /**< Hint that src and dst tensors will be the same */ + } AclActivationDescriptor; #ifdef __cplusplus } #endif /** __cplusplus */ diff --git a/arm_compute/AclEntrypoints.h b/arm_compute/AclEntrypoints.h index ca3a911f5d4bc52c5f31855155347f83935d8deb..0d4902a3d5002da48010c95faa531c6445a8146d 100644 
--- a/arm_compute/AclEntrypoints.h +++ b/arm_compute/AclEntrypoints.h @@ -27,10 +27,11 @@ #include "arm_compute/AclTypes.h" #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif /** __cplusplus */ -/** Create a context object + /** Create a context object * * Context is responsible for retaining internal information and work as an aggregate service mechanism * @@ -46,11 +47,9 @@ extern "C" { * - @ref AclUnsupportedTarget if the requested target is unsupported * - @ref AclInvalidArgument if a given argument is invalid */ -AclStatus AclCreateContext(AclContext *ctx, - AclTarget target, - const AclContextOptions *options); + AclStatus AclCreateContext(AclContext *ctx, AclTarget target, const AclContextOptions *options); -/** Destroy a given context object + /** Destroy a given context object * * @param[in] ctx A valid context object to destroy * @@ -60,9 +59,9 @@ AclStatus AclCreateContext(AclContext *ctx, * - @ref AclSuccess if functions was completed successfully * - @ref AclInvalidArgument if the provided context is invalid */ -AclStatus AclDestroyContext(AclContext ctx); + AclStatus AclDestroyContext(AclContext ctx); -/** Create an operator queue + /** Create an operator queue * * Queue is responsible for any scheduling related activities * @@ -78,9 +77,9 @@ AclStatus AclDestroyContext(AclContext ctx); * - @ref AclUnsupportedTarget if the requested target is unsupported * - @ref AclInvalidArgument if a given argument is invalid */ -AclStatus AclCreateQueue(AclQueue *queue, AclContext ctx, const AclQueueOptions *options); + AclStatus AclCreateQueue(AclQueue *queue, AclContext ctx, const AclQueueOptions *options); -/** Wait until all elements on the queue have been completed + /** Wait until all elements on the queue have been completed * * @param[in] queue Queue to wait on completion * @@ -91,9 +90,9 @@ AclStatus AclCreateQueue(AclQueue *queue, AclContext ctx, const AclQueueOptions * - @ref AclInvalidArgument if the provided queue is invalid * - @ref AclRuntimeError on any other runtime related error */ -AclStatus AclQueueFinish(AclQueue queue); + AclStatus AclQueueFinish(AclQueue queue); -/** Destroy a given queue object + /** Destroy a given queue object * * @param[in] queue A valid context object to destroy * @@ -103,9 +102,9 @@ AclStatus AclQueueFinish(AclQueue queue); * - @ref AclSuccess if functions was completed successfully * - @ref AclInvalidArgument if the provided context is invalid */ -AclStatus AclDestroyQueue(AclQueue queue); + AclStatus AclDestroyQueue(AclQueue queue); -/** Create a Tensor object + /** Create a Tensor object * * Tensor is a generalized matrix construct that can represent up to ND dimensionality (where N = 6 for Compute Library) * The object holds a backing memory along-side to operate on @@ -121,9 +120,9 @@ AclStatus AclDestroyQueue(AclQueue queue); * - @ref AclUnsupportedTarget if the requested target is unsupported * - @ref AclInvalidArgument if a given argument is invalid */ -AclStatus AclCreateTensor(AclTensor *tensor, AclContext ctx, const AclTensorDescriptor *desc, bool allocate); + AclStatus AclCreateTensor(AclTensor *tensor, AclContext ctx, const AclTensorDescriptor *desc, bool allocate); -/** Map a tensor's backing memory to the host + /** Map a tensor's backing memory to the host * * @param[in] tensor Tensor to be mapped * @param[in, out] handle A handle to the underlying backing memory @@ -134,9 +133,9 @@ AclStatus AclCreateTensor(AclTensor *tensor, AclContext ctx, const AclTensorDesc * - @ref AclSuccess if function was completed successfully * - 
@ref AclInvalidArgument if a given argument is invalid */ -AclStatus AclMapTensor(AclTensor tensor, void **handle); + AclStatus AclMapTensor(AclTensor tensor, void **handle); -/** Unmap the tensor's backing memory + /** Unmap the tensor's backing memory * * @param[in] tensor tensor to unmap memory from * @param[in] handle Backing memory to be unmapped @@ -147,9 +146,9 @@ AclStatus AclMapTensor(AclTensor tensor, void **handle); * - @ref AclSuccess if function was completed successfully * - @ref AclInvalidArgument if a given argument is invalid */ -AclStatus AclUnmapTensor(AclTensor tensor, void *handle); + AclStatus AclUnmapTensor(AclTensor tensor, void *handle); -/** Import external memory to a given tensor object + /** Import external memory to a given tensor object * * @param[in, out] tensor Tensor to import memory to * @param[in] handle Backing memory to be imported @@ -159,9 +158,9 @@ AclStatus AclUnmapTensor(AclTensor tensor, void *handle); * - @ref AclSuccess if function was completed successfully * - @ref AclInvalidArgument if a given argument is invalid */ -AclStatus AclTensorImport(AclTensor tensor, void *handle, AclImportMemoryType type); + AclStatus AclTensorImport(AclTensor tensor, void *handle, AclImportMemoryType type); -/** Destroy a given tensor object + /** Destroy a given tensor object * * @param[in,out] tensor A valid tensor object to be destroyed * @@ -171,9 +170,9 @@ AclStatus AclTensorImport(AclTensor tensor, void *handle, AclImportMemoryType ty * - @ref AclSuccess if function was completed successfully * - @ref AclInvalidArgument if the provided tensor is invalid */ -AclStatus AclDestroyTensor(AclTensor tensor); + AclStatus AclDestroyTensor(AclTensor tensor); -/** Creates a tensor pack + /** Creates a tensor pack * * Tensor packs are used to create a collection of tensors that can be passed around for operator execution * @@ -187,9 +186,9 @@ AclStatus AclDestroyTensor(AclTensor tensor); * - @ref AclOutOfMemory if there was a failure allocating memory resources * - @ref AclInvalidArgument if a given argument is invalid */ -AclStatus AclCreateTensorPack(AclTensorPack *pack, AclContext ctx); + AclStatus AclCreateTensorPack(AclTensorPack *pack, AclContext ctx); -/** Add a tensor to a tensor pack + /** Add a tensor to a tensor pack * * @param[in,out] pack Pack to append a tensor to * @param[in] tensor Tensor to pack @@ -202,9 +201,9 @@ AclStatus AclCreateTensorPack(AclTensorPack *pack, AclContext ctx); * - @ref AclOutOfMemory if there was a failure allocating memory resources * - @ref AclInvalidArgument if a given argument is invalid */ -AclStatus AclPackTensor(AclTensorPack pack, AclTensor tensor, int32_t slot_id); + AclStatus AclPackTensor(AclTensorPack pack, AclTensor tensor, int32_t slot_id); -/** A list of tensors to a tensor pack + /** A list of tensors to a tensor pack * * @param[in,out] pack Pack to append the tensors to * @param[in] tensors Tensors to append to the pack @@ -218,9 +217,9 @@ AclStatus AclPackTensor(AclTensorPack pack, AclTensor tensor, int32_t slot_id); * - @ref AclOutOfMemory if there was a failure allocating memory resources * - @ref AclInvalidArgument if a given argument is invalid */ -AclStatus AclPackTensors(AclTensorPack pack, AclTensor *tensors, int32_t *slot_ids, size_t num_tensors); + AclStatus AclPackTensors(AclTensorPack pack, AclTensor *tensors, int32_t *slot_ids, size_t num_tensors); -/** Destroy a given tensor pack object + /** Destroy a given tensor pack object * * @param[in,out] pack A valid tensor pack object to destroy * @@ -230,9 
+229,9 @@ AclStatus AclPackTensors(AclTensorPack pack, AclTensor *tensors, int32_t *slot_i * - @ref AclSuccess if functions was completed successfully * - @ref AclInvalidArgument if the provided context is invalid */ -AclStatus AclDestroyTensorPack(AclTensorPack pack); + AclStatus AclDestroyTensorPack(AclTensorPack pack); -/** Eager execution of a given operator on a list of inputs and outputs + /** Eager execution of a given operator on a list of inputs and outputs * * @param[in] op Operator to execute * @param[in] queue Queue to schedule the operator on @@ -247,9 +246,9 @@ AclStatus AclDestroyTensorPack(AclTensorPack pack); * - @ref AclInvalidArgument if a given argument is invalid * - @ref AclRuntimeError on any other runtime related error */ -AclStatus AclRunOperator(AclOperator op, AclQueue queue, AclTensorPack tensors); + AclStatus AclRunOperator(AclOperator op, AclQueue queue, AclTensorPack tensors); -/** Destroy a given operator object + /** Destroy a given operator object * * @param[in,out] op A valid operator object to destroy * @@ -259,7 +258,7 @@ AclStatus AclRunOperator(AclOperator op, AclQueue queue, AclTensorPack tensors); * - @ref AclSuccess if functions was completed successfully * - @ref AclInvalidArgument if the provided context is invalid */ -AclStatus AclDestroyOperator(AclOperator op); + AclStatus AclDestroyOperator(AclOperator op); #ifdef __cplusplus } #endif /* __cplusplus */ diff --git a/arm_compute/AclOpenClExt.h b/arm_compute/AclOpenClExt.h index ef80fd24434d9bb97e3a6c86a5cd7bc7b497c464..28e918d371d6f6c50ee2d0993f54bacefc60f065 100644 --- a/arm_compute/AclOpenClExt.h +++ b/arm_compute/AclOpenClExt.h @@ -37,10 +37,11 @@ #pragma GCC diagnostic pop #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif /* __cplusplus */ -/** Extract the underlying OpenCL context used by a given Compute Library context object + /** Extract the underlying OpenCL context used by a given Compute Library context object * * @note @ref AclContext should be of an OpenCL backend target * @@ -49,9 +50,9 @@ extern "C" { * * @return Status code */ -AclStatus AclGetClContext(AclContext ctx, cl_context *opencl_context); + AclStatus AclGetClContext(AclContext ctx, cl_context *opencl_context); -/** Extract the underlying OpenCL device id used by a given Compute Library context object + /** Extract the underlying OpenCL device id used by a given Compute Library context object * * @note @ref AclContext should be of an OpenCL backend target * @@ -60,9 +61,9 @@ AclStatus AclGetClContext(AclContext ctx, cl_context *opencl_context); * * @return Status code */ -AclStatus AclGetClDevice(AclContext ctx, cl_device_id *opencl_device); + AclStatus AclGetClDevice(AclContext ctx, cl_device_id *opencl_device); -/** Set the underlying OpenCL context to be used by a given Compute Library context object + /** Set the underlying OpenCL context to be used by a given Compute Library context object * * @note @ref AclContext should be of an OpenCL backend target * @@ -71,9 +72,9 @@ AclStatus AclGetClDevice(AclContext ctx, cl_device_id *opencl_device); * * @return Status code */ -AclStatus AclSetClContext(AclContext ctx, cl_context opencl_context); + AclStatus AclSetClContext(AclContext ctx, cl_context opencl_context); -/** Extract the underlying OpenCL queue used by a given Compute Library queue object + /** Extract the underlying OpenCL queue used by a given Compute Library queue object * * @note @ref AclQueue should be of an OpenCL backend target * @note @ref AclQueue refcount should be 0, meaning not used by other 
objects @@ -83,9 +84,9 @@ AclStatus AclSetClContext(AclContext ctx, cl_context opencl_context); * * @return Status code */ -AclStatus AclGetClQueue(AclQueue queue, cl_command_queue *opencl_queue); + AclStatus AclGetClQueue(AclQueue queue, cl_command_queue *opencl_queue); -/** Set the underlying OpenCL queue to be used by a given Compute Library queue object + /** Set the underlying OpenCL queue to be used by a given Compute Library queue object * * @note @ref AclQueue should be of an OpenCL backend target * @note opecl_queue needs to be created from the same context that the AclContext that the queue will use @@ -95,16 +96,16 @@ AclStatus AclGetClQueue(AclQueue queue, cl_command_queue *opencl_queue); * * @return Status code */ -AclStatus AclSetClQueue(AclQueue queue, cl_command_queue opencl_queue); + AclStatus AclSetClQueue(AclQueue queue, cl_command_queue opencl_queue); -/** Extract the underlying OpenCL memory object by a given Compute Library tensor object + /** Extract the underlying OpenCL memory object by a given Compute Library tensor object * * @param[in] tensor A valid non-zero tensor * @param[out] opencl_mem Underlyig OpenCL memory object * * @return Status code */ -AclStatus AclGetClMem(AclTensor tensor, cl_mem *opencl_mem); + AclStatus AclGetClMem(AclTensor tensor, cl_mem *opencl_mem); #ifdef __cplusplus } diff --git a/arm_compute/AclOperators.h b/arm_compute/AclOperators.h index bfdd7b1b9b1dad5f753c23806b7e14d992e06998..4f6f46e9c8630cd06f19d247e0530bcd05a4a6a8 100644 --- a/arm_compute/AclOperators.h +++ b/arm_compute/AclOperators.h @@ -31,10 +31,11 @@ #define ARM_COMPUTE_VALIDATE_OPERATOR_SUPPORT ((AclOperator *)(size_t)-1) #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif /** __cplusplus */ -/** Create an activation operator + /** Create an activation operator * * Applies an activation function to a given tensor . * Compute Library supports a wide list of activation functions @ref AclActivationType. 
@@ -75,11 +76,11 @@ extern "C" { * - @ref AclUnsupportedTarget if operator for the requested target is unsupported * - @ref AclInvalidArgument if a given argument is invalid */ -AclStatus AclActivation(AclOperator *op, - AclContext ctx, - const AclTensorDescriptor *src, - const AclTensorDescriptor *dst, - const AclActivationDescriptor info); + AclStatus AclActivation(AclOperator *op, + AclContext ctx, + const AclTensorDescriptor *src, + const AclTensorDescriptor *dst, + const AclActivationDescriptor info); #ifdef __cplusplus } #endif /** __cplusplus */ diff --git a/arm_compute/AclTypes.h b/arm_compute/AclTypes.h index 368be1292b197d2d3a6fba73a4931f2894aaa732..9a002ad22c8cd1bfd871fe43fc93a4ffb5cddc80 100644 --- a/arm_compute/AclTypes.h +++ b/arm_compute/AclTypes.h @@ -28,185 +28,186 @@ #include #ifdef __cplusplus -extern "C" { -#endif /* __cplusplus */ - -/**< Opaque Context object */ -typedef struct AclContext_ *AclContext; -/**< Opaque Queue object */ -typedef struct AclQueue_ *AclQueue; -/**< Opaque Tensor object */ -typedef struct AclTensor_ *AclTensor; -/**< Opaque Tensor pack object */ -typedef struct AclTensorPack_ *AclTensorPack; -/**< Opaque Operator object */ -typedef struct AclOperator_ *AclOperator; - -// Capabilities bitfield (Note: if multiple are enabled ComputeLibrary will pick the best possible) -typedef uint64_t AclTargetCapabilities; - -/**< Error codes returned by the public entry-points */ -typedef enum AclStatus : int32_t -{ - AclSuccess = 0, /**< Call succeeded, leading to valid state for all involved objects/data */ - AclRuntimeError = 1, /**< Call failed during execution */ - AclOutOfMemory = 2, /**< Call failed due to failure to allocate resources */ - AclUnimplemented = 3, /**< Call failed as requested capability is not implemented */ - AclUnsupportedTarget = 4, /**< Call failed as an invalid backend was requested */ - AclInvalidTarget = 5, /**< Call failed as invalid argument was passed */ - AclInvalidArgument = 6, /**< Call failed as invalid argument was passed */ - AclUnsupportedConfig = 7, /**< Call failed as configuration is unsupported */ - AclInvalidObjectState = 8, /**< Call failed as an object has invalid state */ -} AclStatus; - -/**< Supported CPU targets */ -typedef enum AclTarget -{ - AclCpu = 0, /**< Cpu target that uses SIMD extensions */ - AclGpuOcl = 1, /**< OpenCL target for GPU */ -} AclTarget; - -/** Execution mode types */ -typedef enum AclExecutionMode -{ - AclPreferFastRerun = 0, /**< Prioritize performance when multiple iterations are performed */ - AclPreferFastStart = 1, /**< Prioritize performance when a single iterations is expected to be performed */ -} AclExecutionMode; - -/** Available CPU capabilities */ -typedef enum AclCpuCapabilities +extern "C" { - AclCpuCapabilitiesAuto = 0, /**< Automatic discovery of capabilities */ - - AclCpuCapabilitiesNeon = (1 << 0), /**< Enable NEON optimized paths */ - AclCpuCapabilitiesSve = (1 << 1), /**< Enable SVE optimized paths */ - AclCpuCapabilitiesSve2 = (1 << 2), /**< Enable SVE2 optimized paths */ - // Reserve 3, 4, 5, 6 - - AclCpuCapabilitiesFp16 = (1 << 7), /**< Enable float16 data-type support */ - AclCpuCapabilitiesBf16 = (1 << 8), /**< Enable bfloat16 data-type support */ - // Reserve 9, 10, 11, 12 - - AclCpuCapabilitiesDot = (1 << 13), /**< Enable paths that use the udot/sdot instructions */ - AclCpuCapabilitiesMmlaInt8 = (1 << 14), /**< Enable paths that use the mmla integer instructions */ - AclCpuCapabilitiesMmlaFp = (1 << 15), /**< Enable paths that use the mmla float instructions 
*/ - - AclCpuCapabilitiesAll = ~0 /**< Enable all paths */ -} AclCpuCapabilities; +#endif /* __cplusplus */ -/**< Allocator interface that can be passed to a context */ -typedef struct AclAllocator -{ - /** Allocate a block of size bytes of memory. + /**< Opaque Context object */ + typedef struct AclContext_ *AclContext; + /**< Opaque Queue object */ + typedef struct AclQueue_ *AclQueue; + /**< Opaque Tensor object */ + typedef struct AclTensor_ *AclTensor; + /**< Opaque Tensor pack object */ + typedef struct AclTensorPack_ *AclTensorPack; + /**< Opaque Operator object */ + typedef struct AclOperator_ *AclOperator; + + // Capabilities bitfield (Note: if multiple are enabled ComputeLibrary will pick the best possible) + typedef uint64_t AclTargetCapabilities; + + /**< Error codes returned by the public entry-points */ + typedef enum AclStatus : int32_t + { + AclSuccess = 0, /**< Call succeeded, leading to valid state for all involved objects/data */ + AclRuntimeError = 1, /**< Call failed during execution */ + AclOutOfMemory = 2, /**< Call failed due to failure to allocate resources */ + AclUnimplemented = 3, /**< Call failed as requested capability is not implemented */ + AclUnsupportedTarget = 4, /**< Call failed as an invalid backend was requested */ + AclInvalidTarget = 5, /**< Call failed as invalid argument was passed */ + AclInvalidArgument = 6, /**< Call failed as invalid argument was passed */ + AclUnsupportedConfig = 7, /**< Call failed as configuration is unsupported */ + AclInvalidObjectState = 8, /**< Call failed as an object has invalid state */ + } AclStatus; + + /**< Supported CPU targets */ + typedef enum AclTarget + { + AclCpu = 0, /**< Cpu target that uses SIMD extensions */ + AclGpuOcl = 1, /**< OpenCL target for GPU */ + } AclTarget; + + /** Execution mode types */ + typedef enum AclExecutionMode + { + AclPreferFastRerun = 0, /**< Prioritize performance when multiple iterations are performed */ + AclPreferFastStart = 1, /**< Prioritize performance when a single iterations is expected to be performed */ + } AclExecutionMode; + + /** Available CPU capabilities */ + typedef enum AclCpuCapabilities + { + AclCpuCapabilitiesAuto = 0, /**< Automatic discovery of capabilities */ + + AclCpuCapabilitiesNeon = (1 << 0), /**< Enable NEON optimized paths */ + AclCpuCapabilitiesSve = (1 << 1), /**< Enable SVE optimized paths */ + AclCpuCapabilitiesSve2 = (1 << 2), /**< Enable SVE2 optimized paths */ + // Reserve 3, 4, 5, 6 + + AclCpuCapabilitiesFp16 = (1 << 7), /**< Enable float16 data-type support */ + AclCpuCapabilitiesBf16 = (1 << 8), /**< Enable bfloat16 data-type support */ + // Reserve 9, 10, 11, 12 + + AclCpuCapabilitiesDot = (1 << 13), /**< Enable paths that use the udot/sdot instructions */ + AclCpuCapabilitiesMmlaInt8 = (1 << 14), /**< Enable paths that use the mmla integer instructions */ + AclCpuCapabilitiesMmlaFp = (1 << 15), /**< Enable paths that use the mmla float instructions */ + + AclCpuCapabilitiesAll = ~0 /**< Enable all paths */ + } AclCpuCapabilities; + + /**< Allocator interface that can be passed to a context */ + typedef struct AclAllocator + { + /** Allocate a block of size bytes of memory. * * @param[in] user_data User provided data that can be used by the allocator * @param[in] size Size of the allocation * * @return A pointer to the allocated block if successfull else NULL */ - void *(*alloc)(void *user_data, size_t size); - /** Release a block of size bytes of memory. 
+ void *(*alloc)(void *user_data, size_t size); + /** Release a block of size bytes of memory. * * @param[in] user_data User provided data that can be used by the allocator * @param[in] size Size of the allocation */ - void (*free)(void *user_data, void *ptr); - /** Allocate a block of size bytes of memory. + void (*free)(void *user_data, void *ptr); + /** Allocate a block of size bytes of memory. * * @param[in] user_data User provided data that can be used by the allocator * @param[in] size Size of the allocation * * @return A pointer to the allocated block if successfull else NULL */ - void *(*aligned_alloc)(void *user_data, size_t size, size_t alignment); - /** Allocate a block of size bytes of memory. + void *(*aligned_alloc)(void *user_data, size_t size, size_t alignment); + /** Allocate a block of size bytes of memory. * * @param[in] user_data User provided data that can be used by the allocator * @param[in] size Size of the allocation */ - void (*aligned_free)(void *user_data, void *ptr); - - /**< User provided information */ - void *user_data; -} AclAllocator; - -/**< Context options */ -typedef struct AclContextOptions -{ - AclExecutionMode mode; /**< Execution mode to use */ - AclTargetCapabilities capabilities; /**< Target capabilities */ - bool enable_fast_math; /**< Allow precision loss */ - const char *kernel_config_file; /**< Kernel cofiguration file */ - int32_t max_compute_units; /**< Max compute units that can be used by a queue created from the context. + void (*aligned_free)(void *user_data, void *ptr); + + /**< User provided information */ + void *user_data; + } AclAllocator; + + /**< Context options */ + typedef struct AclContextOptions + { + AclExecutionMode mode; /**< Execution mode to use */ + AclTargetCapabilities capabilities; /**< Target capabilities */ + bool enable_fast_math; /**< Allow precision loss */ + const char *kernel_config_file; /**< Kernel cofiguration file */ + int32_t max_compute_units; /**< Max compute units that can be used by a queue created from the context. 
If <=0 the system will use the hw concurency insted */ - AclAllocator *allocator; /**< Allocator to be used by all the memory internally */ -} AclContextOptions; - -/**< Supported tuning modes */ -typedef enum -{ - AclTuningModeNone = 0, /**< No tuning */ - AclRapid = 1, /**< Fast tuning mode, testing a small portion of the tuning space */ - AclNormal = 2, /**< Normal tuning mode, gives a good balance between tuning mode and performance */ - AclExhaustive = 3, /**< Exhaustive tuning mode, increased tuning time but with best results */ -} AclTuningMode; - -/**< Queue options */ -typedef struct -{ - AclTuningMode mode; /**< Tuning mode */ - int32_t compute_units; /**< Compute Units that the queue will deploy */ -} AclQueueOptions; - -/**< Supported data types */ -typedef enum AclDataType -{ - AclDataTypeUnknown = 0, /**< Unknown data type */ - AclUInt8 = 1, /**< 8-bit unsigned integer */ - AclInt8 = 2, /**< 8-bit signed integer */ - AclUInt16 = 3, /**< 16-bit unsigned integer */ - AclInt16 = 4, /**< 16-bit signed integer */ - AclUint32 = 5, /**< 32-bit unsigned integer */ - AclInt32 = 6, /**< 32-bit signed integer */ - AclFloat16 = 7, /**< 16-bit floating point */ - AclBFloat16 = 8, /**< 16-bit brain floating point */ - AclFloat32 = 9, /**< 32-bit floating point */ -} AclDataType; - -/**< Supported data layouts for operations */ -typedef enum AclDataLayout -{ - AclDataLayoutUnknown = 0, /**< Unknown data layout */ - AclNhwc = 1, /**< Native, performant, Compute Library data layout */ - AclNchw = 2, /**< Data layout where width is the fastest changing dimension */ -} AclDataLayout; - -/** Type of memory to be imported */ -typedef enum AclImportMemoryType -{ - AclHostPtr = 0 /**< Host allocated memory */ -} AclImportMemoryType; - -/**< Tensor Descriptor */ -typedef struct AclTensorDescriptor -{ - int32_t ndims; /**< Number or dimensions */ - int32_t *shape; /**< Tensor Shape */ - AclDataType data_type; /**< Tensor Data type */ - int64_t *strides; /**< Strides on each dimension. 
Linear memory is assumed if nullptr */ - int64_t boffset; /**< Offset in terms of bytes for the first element */ -} AclTensorDescriptor; - -/**< Slot type of a tensor */ -typedef enum -{ - AclSlotUnknown = -1, - AclSrc = 0, - AclSrc0 = 0, - AclSrc1 = 1, - AclDst = 30, - AclSrcVec = 256, -} AclTensorSlot; + AclAllocator *allocator; /**< Allocator to be used by all the memory internally */ + } AclContextOptions; + + /**< Supported tuning modes */ + typedef enum + { + AclTuningModeNone = 0, /**< No tuning */ + AclRapid = 1, /**< Fast tuning mode, testing a small portion of the tuning space */ + AclNormal = 2, /**< Normal tuning mode, gives a good balance between tuning mode and performance */ + AclExhaustive = 3, /**< Exhaustive tuning mode, increased tuning time but with best results */ + } AclTuningMode; + + /**< Queue options */ + typedef struct + { + AclTuningMode mode; /**< Tuning mode */ + int32_t compute_units; /**< Compute Units that the queue will deploy */ + } AclQueueOptions; + + /**< Supported data types */ + typedef enum AclDataType + { + AclDataTypeUnknown = 0, /**< Unknown data type */ + AclUInt8 = 1, /**< 8-bit unsigned integer */ + AclInt8 = 2, /**< 8-bit signed integer */ + AclUInt16 = 3, /**< 16-bit unsigned integer */ + AclInt16 = 4, /**< 16-bit signed integer */ + AclUint32 = 5, /**< 32-bit unsigned integer */ + AclInt32 = 6, /**< 32-bit signed integer */ + AclFloat16 = 7, /**< 16-bit floating point */ + AclBFloat16 = 8, /**< 16-bit brain floating point */ + AclFloat32 = 9, /**< 32-bit floating point */ + } AclDataType; + + /**< Supported data layouts for operations */ + typedef enum AclDataLayout + { + AclDataLayoutUnknown = 0, /**< Unknown data layout */ + AclNhwc = 1, /**< Native, performant, Compute Library data layout */ + AclNchw = 2, /**< Data layout where width is the fastest changing dimension */ + } AclDataLayout; + + /** Type of memory to be imported */ + typedef enum AclImportMemoryType + { + AclHostPtr = 0 /**< Host allocated memory */ + } AclImportMemoryType; + + /**< Tensor Descriptor */ + typedef struct AclTensorDescriptor + { + int32_t ndims; /**< Number or dimensions */ + int32_t *shape; /**< Tensor Shape */ + AclDataType data_type; /**< Tensor Data type */ + int64_t *strides; /**< Strides on each dimension. Linear memory is assumed if nullptr */ + int64_t boffset; /**< Offset in terms of bytes for the first element */ + } AclTensorDescriptor; + + /**< Slot type of a tensor */ + typedef enum + { + AclSlotUnknown = -1, + AclSrc = 0, + AclSrc0 = 0, + AclSrc1 = 1, + AclDst = 30, + AclSrcVec = 256, + } AclTensorSlot; #ifdef __cplusplus } diff --git a/arm_compute/AclUtils.h b/arm_compute/AclUtils.h index ef5fa427080399e64f3e385ce894676cc1dcec9e..61a93e606040a45794e3bb322720da1126f478a7 100644 --- a/arm_compute/AclUtils.h +++ b/arm_compute/AclUtils.h @@ -27,10 +27,11 @@ #include "arm_compute/AclTypes.h" #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif /** __cplusplus */ -/** Get the size of the existing tensor in byte + /** Get the size of the existing tensor in byte * * @note The size isn't based on allocated memory, but based on information in its descriptor (dimensions, data type, etc.). 
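Since the hunks above only re-indent the capability, option and descriptor types, a small usage sketch may help when reading them together. The helper below is hypothetical; in particular, treating a null allocator as "use the library's internal allocator" is an assumption rather than something this header states.

```cpp
#include "arm_compute/AclTypes.h"

// Hypothetical helper: request the CPU paths we care about and keep everything
// else at its most conservative setting.
AclContextOptions make_cpu_context_options()
{
    AclContextOptions options{};
    options.mode               = AclPreferFastRerun; // many iterations expected
    options.capabilities       = AclCpuCapabilitiesNeon | AclCpuCapabilitiesFp16;
    // options.capabilities    = AclCpuCapabilitiesAuto; // alternative: automatic discovery
    options.enable_fast_math   = false;   // do not allow precision loss
    options.kernel_config_file = nullptr;
    options.max_compute_units  = 0;       // <= 0: fall back to the hardware concurrency
    options.allocator          = nullptr; // assumption: null selects the internal allocator
    return options;
}
```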
* @@ -42,9 +43,9 @@ extern "C" { * - @ref AclSuccess if function was completed successfully * - @ref AclInvalidArgument if a given argument is invalid */ -AclStatus AclGetTensorSize(AclTensor tensor, uint64_t *size); + AclStatus AclGetTensorSize(AclTensor tensor, uint64_t *size); -/** Get the descriptor of this tensor + /** Get the descriptor of this tensor * * @param[in] tensor A tensor in interest * @param[out] desc The descriptor of the tensor @@ -54,7 +55,7 @@ AclStatus AclGetTensorSize(AclTensor tensor, uint64_t *size); * - @ref AclSuccess if function was completed successfully * - @ref AclInvalidArgument if a given argument is invalid */ -AclStatus AclGetTensorDescriptor(AclTensor tensor, AclTensorDescriptor *desc); + AclStatus AclGetTensorDescriptor(AclTensor tensor, AclTensorDescriptor *desc); #ifdef __cplusplus } diff --git a/arm_compute/AclVersion.h b/arm_compute/AclVersion.h index 0b05a5e7dc7e666d6bb1ac8b18556c2ac0f680dc..6eed13b924a8fb5c1d7da88cc78c4b22efd53c94 100644 --- a/arm_compute/AclVersion.h +++ b/arm_compute/AclVersion.h @@ -25,17 +25,18 @@ #define ARM_COMPUTE_ACL_VERSION_H_ #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif /* __cplusplus */ -/** Semantic versioning information */ -typedef struct AclVersion -{ - int major; /**< Major version, is increased on API incompatible changes */ - int minor; /**< Minor version, is increased on adding back-ward compatible functionality */ - int patch; /**< Patch version, is increased when doing backward compatible fixes */ - const char *build_info; /**< Build related information */ -} AclVersion; + /** Semantic versioning information */ + typedef struct AclVersion + { + int major; /**< Major version, is increased on API incompatible changes */ + int minor; /**< Minor version, is increased on adding back-ward compatible functionality */ + int patch; /**< Patch version, is increased when doing backward compatible fixes */ + const char *build_info; /**< Build related information */ + } AclVersion; /**< Major version, is increased on API incompatible changes */ #define ARM_COMPUTE_LIBRARY_VERSION_MAJOR 0 @@ -44,11 +45,11 @@ typedef struct AclVersion /**< Patch version, is increased when doing backward compatible fixes */ #define ARM_COMPUTE_LIBRARY_VERSION_PATCH 0 -/** Get library's version meta-data + /** Get library's version meta-data * * @return Version information */ -const AclVersion *AclVersionInfo(); + const AclVersion *AclVersionInfo(); #ifdef __cplusplus } diff --git a/arm_compute/core/CL/CLCompileContext.h b/arm_compute/core/CL/CLCompileContext.h index 60e0f95f8322b404d764561e8baf890ca0a15f2a..dcd3b456701b37c6309f357991b4127602453c2f 100644 --- a/arm_compute/core/CL/CLCompileContext.h +++ b/arm_compute/core/CL/CLCompileContext.h @@ -250,8 +250,12 @@ public: * * @return The created kernel. */ - Kernel create_kernel(const std::string &kernel_name, const std::string &program_name, const std::string &program_source, - const std::string &kernel_path, const StringSet &build_options_set, bool is_binary) const; + Kernel create_kernel(const std::string &kernel_name, + const std::string &program_name, + const std::string &program_source, + const std::string &kernel_path, + const StringSet &build_options_set, + bool is_binary) const; /** Clear the library's cache of binary programs */ @@ -323,7 +327,8 @@ private: * @param[in] program_source Source of the program. * @param[in] is_binary Flag to indicate if the program source is binary. 
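The entry-points re-indented in AclUtils.h and AclVersion.h are easiest to read next to a call-site. A minimal sketch follows; it assumes a valid AclTensor obtained through creation entry-points that are outside this diff.

```cpp
#include "arm_compute/AclTypes.h"
#include "arm_compute/AclUtils.h"
#include "arm_compute/AclVersion.h"

#include <cstdint>
#include <cstdio>

// Print the library version plus the size and rank of an existing tensor.
void print_tensor_info(AclTensor tensor)
{
    const AclVersion *version = AclVersionInfo();
    std::printf("Compute Library %d.%d.%d\n", version->major, version->minor, version->patch);

    uint64_t size = 0;
    if (AclGetTensorSize(tensor, &size) != AclSuccess)
    {
        std::printf("AclGetTensorSize failed\n");
        return;
    }

    AclTensorDescriptor desc{};
    if (AclGetTensorDescriptor(tensor, &desc) == AclSuccess)
    {
        // The size is derived from the descriptor (shape, data type), not from allocated memory.
        std::printf("ndims=%d size=%llu bytes\n", static_cast<int>(desc.ndims),
                    static_cast<unsigned long long>(size));
    }
}
```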
*/ - const Program &load_program(const std::string &program_name, const std::string &program_source, bool is_binary) const; + const Program & + load_program(const std::string &program_name, const std::string &program_source, bool is_binary) const; /** Generates the build options given a string of user defined ones * @@ -343,11 +348,11 @@ private: */ std::string stringify_set(const StringSet &s, const std::string &kernel_path) const; - cl::Context _context; /**< Underlying CL context. */ - CLDevice _device; /**< Underlying CL device. */ + cl::Context _context; /**< Underlying CL context. */ + CLDevice _device; /**< Underlying CL device. */ mutable std::map _programs_map; /**< Map with all already loaded program data. */ mutable std::map _built_programs_map; /**< Map with all already built program data. */ - bool _is_wbsm_supported; /**< Support of worksize batch size modifier support boolean*/ + bool _is_wbsm_supported; /**< Support of worksize batch size modifier support boolean*/ }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLCOMPILECONTEXT_H */ diff --git a/arm_compute/core/CL/CLDevice.h b/arm_compute/core/CL/CLDevice.h index 5e0f86e6d9fe8d4862cf234cd6509adc8f463c5c..ded6bb84938055b53647899429e82b7796f3b342 100644 --- a/arm_compute/core/CL/CLDevice.h +++ b/arm_compute/core/CL/CLDevice.h @@ -44,8 +44,7 @@ class CLDevice : public IDevice { public: /** Default Constructor */ - CLDevice() - : _device(cl::Device()), _options() + CLDevice() : _device(cl::Device()), _options() { } @@ -53,8 +52,7 @@ public: * * @param[in] cl_device OpenCL device */ - CLDevice(const cl::Device &cl_device) - : _device(), _options() + CLDevice(const cl::Device &cl_device) : _device(), _options() { _device = cl_device; @@ -66,13 +64,13 @@ public: std::string extensions = _device.getInfo(); std::istringstream iss(extensions); - for(std::string s; iss >> s;) + for (std::string s; iss >> s;) { _options.extensions.insert(s); } // SW workaround for G76 - if(_options.gpu_target == GPUTarget::G76) + if (_options.gpu_target == GPUTarget::G76) { _options.extensions.insert("cl_arm_integer_dot_product_int8"); } @@ -153,15 +151,15 @@ public: */ std::tuple is_non_uniform_workgroup_supported() const { - if(version() == CLVersion::CL30 && get_cl_non_uniform_work_group_supported(_device)) + if (version() == CLVersion::CL30 && get_cl_non_uniform_work_group_supported(_device)) { return {true, " -cl-std=CL3.0 "}; } - else if(version() == CLVersion::CL20) + else if (version() == CLVersion::CL20) { return {true, " -cl-std=CL2.0 "}; } - else if(supported("cl_arm_non_uniform_work_group_size")) + else if (supported("cl_arm_non_uniform_work_group_size")) { return {true, " -cl-arm-non-uniform-work-group-size "}; } diff --git a/arm_compute/core/CL/CLHelpers.h b/arm_compute/core/CL/CLHelpers.h index a162b1c1182cf6623f60232160ee4540e09ce096..1a639e47f92cd35c19d153afcd1a0d57bfd0e36b 100644 --- a/arm_compute/core/CL/CLHelpers.h +++ b/arm_compute/core/CL/CLHelpers.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2022 Arm Limited. + * Copyright (c) 2016-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -179,7 +179,9 @@ bool dot8_acc_supported(const cl::Device &device); * * @return True if the configuration is supported */ -bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Size2D &kernel_size, DataLayout data_layout); +bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, + const Size2D &kernel_size, + DataLayout data_layout); /** Helper function to get the preferred native vector width size for built-in scalar types that can be put into vectors * @@ -215,7 +217,9 @@ bool image2d_from_buffer_supported(const cl::Device &device); * * @return An opencl kernel */ -cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_name, const std::set &build_opts = std::set()); +cl::Kernel create_kernel(const CLCompileContext &ctx, + const std::string &kernel_name, + const std::set &build_opts = std::set()); /** Creates a suitable LWS hint object for parallel implementations. Sets the number of WG based on the input size. * If input width is smaller than 128 we can use fewer threads than 8. @@ -267,5 +271,22 @@ void set_unroll_with_pragma(CLBuildOptions &built_opts, std::initializer_list(max_num_values), _mapping(nullptr) + explicit ICLArray(size_t max_num_values) : IArray(max_num_values), _mapping(nullptr) { } @@ -125,5 +124,5 @@ using ICLInt16Array = ICLArray; using ICLInt32Array = ICLArray; /** Interface for OpenCL Array of floats. */ using ICLFloatArray = ICLArray; -} +} // namespace arm_compute #endif /*ARM_COMPUTE_ICLARRAY_H*/ diff --git a/arm_compute/core/CL/ICLTensor.h b/arm_compute/core/CL/ICLTensor.h index 78d3757e593af9e5d38faa3e646847a429518dc0..8de5423762090b81dd26c5861f6c06eda69c0121 100644 --- a/arm_compute/core/CL/ICLTensor.h +++ b/arm_compute/core/CL/ICLTensor.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_ICLTENSOR_H #define ARM_COMPUTE_ICLTENSOR_H -#include "arm_compute/core/ITensor.h" - #include "arm_compute/core/CL/CLTypes.h" +#include "arm_compute/core/ITensor.h" #include @@ -34,7 +33,7 @@ namespace cl { class Buffer; class CommandQueue; -} +} // namespace cl namespace arm_compute { @@ -113,5 +112,5 @@ private: }; using ICLImage = ICLTensor; -} +} // namespace arm_compute #endif /*ARM_COMPUTE_ICLTENSOR_H */ diff --git a/arm_compute/core/CL/OpenCL.h b/arm_compute/core/CL/OpenCL.h index 1e1f5291d9690e2c7dbb9d4d8f1b570ba98ae76d..a5c4e39df2d9f7e42238c05e7ceb33afc425de2d 100644 --- a/arm_compute/core/CL/OpenCL.h +++ b/arm_compute/core/CL/OpenCL.h @@ -31,8 +31,8 @@ #ifndef ARM_COMPUTE_NO_EXCEPTIONS #define CL_HPP_ENABLE_EXCEPTIONS #endif // ARM_COMPUTE_NO_EXCEPTIONS -#define CL_TARGET_OPENCL_VERSION 300 -#define CL_HPP_TARGET_OPENCL_VERSION 110 +#define CL_TARGET_OPENCL_VERSION 300 +#define CL_HPP_TARGET_OPENCL_VERSION 110 #define CL_HPP_MINIMUM_OPENCL_VERSION 110 #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Weffc++" @@ -40,7 +40,7 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #if defined(__GNUG__) && __GNUG__ >= 8 #pragma GCC diagnostic ignored "-Wcatch-value" -#endif // defined(__GNUG__) && __GNUG__ >= 8 +#endif // defined(__GNUG__) && __GNUG__ >= 8 #include // include new hpp header instead of cl2.hpp #pragma GCC diagnostic pop @@ -88,8 +88,7 @@ public: */ bool load_default(); -#define DECLARE_FUNCTION_PTR(func_name) \ - std::function func_name##_ptr = nullptr +#define DECLARE_FUNCTION_PTR(func_name) std::function func_name##_ptr = nullptr DECLARE_FUNCTION_PTR(clCreateContext); DECLARE_FUNCTION_PTR(clCreateContextFromType); @@ -141,6 +140,16 @@ public: 
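For context on the CLDevice hunk above, here is a rough sketch of querying non-uniform work-group support. It assumes the collapsed std::tuple return type is std::tuple<bool, std::string> (consistent with the {flag, build-option} initializer lists in the hunk) and uses cl::Device::getDefault() from the OpenCL C++ bindings to obtain a device.

```cpp
#include "arm_compute/core/CL/CLDevice.h"

#include <iostream>
#include <string>
#include <tuple>

// Report whether the default OpenCL device supports non-uniform work-groups,
// and which -cl-std / extension flag the library would append to kernel builds.
void report_non_uniform_workgroup_support()
{
    arm_compute::CLDevice device(cl::Device::getDefault());

    bool        supported = false;
    std::string build_option;
    std::tie(supported, build_option) = device.is_non_uniform_workgroup_supported();

    std::cout << "non-uniform work-groups: " << std::boolalpha << supported
              << " (build option:" << build_option << ")\n";
}
```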
DECLARE_FUNCTION_PTR(clCreateImage); DECLARE_FUNCTION_PTR(clSetKernelExecInfo); + // Command buffer and mutable dispatch command buffer extensions + DECLARE_FUNCTION_PTR(clCreateCommandBufferKHR); + DECLARE_FUNCTION_PTR(clRetainCommandBufferKHR); + DECLARE_FUNCTION_PTR(clReleaseCommandBufferKHR); + DECLARE_FUNCTION_PTR(clFinalizeCommandBufferKHR); + DECLARE_FUNCTION_PTR(clEnqueueCommandBufferKHR); + DECLARE_FUNCTION_PTR(clCommandNDRangeKernelKHR); + + DECLARE_FUNCTION_PTR(clUpdateMutableCommandsKHR); + // Third-party extensions DECLARE_FUNCTION_PTR(clImportMemoryARM); diff --git a/arm_compute/core/CPP/CPPTypes.h b/arm_compute/core/CPP/CPPTypes.h index e4cbd9ff9b7626d02009c3edd22ee47627955975..b080a86938e8f808749e49074c0a8ff72d079277 100644 --- a/arm_compute/core/CPP/CPPTypes.h +++ b/arm_compute/core/CPP/CPPTypes.h @@ -78,10 +78,10 @@ public: /* Delete move and copy constructors and assignment operator s */ - CPUInfo(CPUInfo const &) = delete; // Copy construct - CPUInfo(CPUInfo &&) = delete; // Move construct + CPUInfo(CPUInfo const &) = delete; // Copy construct + CPUInfo(CPUInfo &&) = delete; // Move construct CPUInfo &operator=(CPUInfo const &) = delete; // Copy assign - CPUInfo &operator=(CPUInfo &&) = delete; // Move assign + CPUInfo &operator=(CPUInfo &&) = delete; // Move assign /** Checks if the cpu model supports fp16. * @@ -179,9 +179,9 @@ private: /** Information about executing thread and CPU. */ struct ThreadInfo { - int thread_id{ 0 }; - int num_threads{ 1 }; - const CPUInfo *cpu_info{ nullptr }; + int thread_id{0}; + int num_threads{1}; + const CPUInfo *cpu_info{nullptr}; }; } // namespace arm_compute #endif /* ARM_COMPUTE_CPP_TYPES_H */ diff --git a/arm_compute/core/CPP/ICPPKernel.h b/arm_compute/core/CPP/ICPPKernel.h index 00a10555e32a78dc59926e19d4c8e80317e2d4d4..03967a536d6f4ec36194948e2f2afb805728c168 100644 --- a/arm_compute/core/CPP/ICPPKernel.h +++ b/arm_compute/core/CPP/ICPPKernel.h @@ -25,9 +25,9 @@ #define ARM_COMPUTE_ICPPKERNEL_H #include "arm_compute/core/CPP/CPPTypes.h" +#include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/IKernel.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/experimental/Types.h" namespace arm_compute { @@ -38,7 +38,7 @@ class ITensor; class ICPPKernel : public IKernel { public: - static constexpr size_t default_mws = 1; /* Default minimum workload size value - no impact */ + static constexpr size_t default_mws = 1; /* Default minimum workload size value - no impact */ /** Default destructor */ virtual ~ICPPKernel() = default; diff --git a/arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h b/arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h index 068b37d80cf4777bca0bd441910fb6acc80cc638..dd91595ea67a8f65401ebaf0523e88570df504cf 100644 --- a/arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h +++ b/arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h @@ -63,8 +63,16 @@ public: * @param[out] keeps_size (Optional) Number of filtered indices per class tensor of size [num_classes]. Data types supported: U32 * @param[in] info (Optional) BoxNMSLimitInfo information. 
*/ - void configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes, - ITensor *batch_splits_out = nullptr, ITensor *keeps = nullptr, ITensor *keeps_size = nullptr, const BoxNMSLimitInfo info = BoxNMSLimitInfo()); + void configure(const ITensor *scores_in, + const ITensor *boxes_in, + const ITensor *batch_splits_in, + ITensor *scores_out, + ITensor *boxes_out, + ITensor *classes, + ITensor *batch_splits_out = nullptr, + ITensor *keeps = nullptr, + ITensor *keeps_size = nullptr, + const BoxNMSLimitInfo info = BoxNMSLimitInfo()); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; @@ -74,9 +82,9 @@ public: void run_nmslimit(); private: - const ITensor *_scores_in; - const ITensor *_boxes_in; - const ITensor *_batch_splits_in; + const ITensor *_scores_in; + const ITensor *_boxes_in; + const ITensor *_batch_splits_in; ITensor *_scores_out; ITensor *_boxes_out; ITensor *_classes; diff --git a/arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h b/arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h index e32b5d8f7bc3f5e18542dcc9b825576e536125b6..d1f7f8670f72d4da40ab772786f8e16edb5c687b 100644 --- a/arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h +++ b/arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_CPP_NONMAXIMUMSUPPRESSIONKERNEL_LAYER_H #define ARM_COMPUTE_CPP_NONMAXIMUMSUPPRESSIONKERNEL_LAYER_H -#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h" namespace arm_compute { @@ -65,7 +64,12 @@ public: * @param[in] iou_threshold The threshold used in non maximum suppression. * */ - void configure(const ITensor *input_bboxes, const ITensor *input_scores, ITensor *output_indices, unsigned int max_output_size, const float score_threshold, const float iou_threshold); + void configure(const ITensor *input_bboxes, + const ITensor *input_scores, + ITensor *output_indices, + unsigned int max_output_size, + const float score_threshold, + const float iou_threshold); /** Static function to check if given arguments will lead to a valid configuration of @ref CPPNonMaximumSuppressionKernel * @@ -77,8 +81,12 @@ public: * @param[in] iou_threshold The threshold used in non maximum suppression. 
* */ - static Status validate(const ITensorInfo *input_bboxes, const ITensorInfo *input_scores, const ITensorInfo *output_indices, unsigned int max_output_size, - const float score_threshold, const float iou_threshold); + static Status validate(const ITensorInfo *input_bboxes, + const ITensorInfo *input_scores, + const ITensorInfo *output_indices, + unsigned int max_output_size, + const float score_threshold, + const float iou_threshold); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/arm_compute/core/CPP/kernels/CPPTopKVKernel.h b/arm_compute/core/CPP/kernels/CPPTopKVKernel.h index 1245dbc14c401f1ab7c6be1eeb7d6c7e2f63b868..7326a10e2ffed837c5cd2f20fbd9800183edf497 100644 --- a/arm_compute/core/CPP/kernels/CPPTopKVKernel.h +++ b/arm_compute/core/CPP/kernels/CPPTopKVKernel.h @@ -69,7 +69,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k); + static Status + validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/arm_compute/core/Coordinates.h b/arm_compute/core/Coordinates.h index f6e1f4d2822d673367e5ddc2e3bd39a6414024bf..d1240bb10a757c00053ef5e21d2708af83af1139 100644 --- a/arm_compute/core/Coordinates.h +++ b/arm_compute/core/Coordinates.h @@ -42,8 +42,7 @@ public: * @param[in] coords Values to initialize the dimensions. */ template - constexpr Coordinates(Ts... coords) - : Dimensions{ coords... } + constexpr Coordinates(Ts... coords) : Dimensions{coords...} { } /** Allow instances of this class to be copy constructed */ @@ -57,5 +56,5 @@ public: /** Default destructor */ ~Coordinates() = default; }; -} +} // namespace arm_compute #endif /*ARM_COMPUTE_COORDINATES_H*/ diff --git a/arm_compute/core/CoreTypes.h b/arm_compute/core/CoreTypes.h index 4a48a366518b99cc82893a102ee62360a8f3da10..1a9db1937c5f5cbe5ecf721035d5b915a754fb14 100644 --- a/arm_compute/core/CoreTypes.h +++ b/arm_compute/core/CoreTypes.h @@ -25,6 +25,7 @@ #define ACL_ARM_COMPUTE_CORE_CORETYPES #include "arm_compute/core/Strides.h" + #include "support/Half.h" /** CoreTypes.h groups together essential small types that are used across functions */ @@ -146,9 +147,11 @@ public: * @param[in] pad_y (Optional) Padding, in elements, across y. Defaults to 0. * @param[in] round (Optional) Dimensions rounding. Defaults to @ref DimensionRoundingType::FLOOR. */ - PadStrideInfo(unsigned int stride_x = 1, unsigned int stride_y = 1, - unsigned int pad_x = 0, unsigned int pad_y = 0, - DimensionRoundingType round = DimensionRoundingType::FLOOR) + PadStrideInfo(unsigned int stride_x = 1, + unsigned int stride_y = 1, + unsigned int pad_x = 0, + unsigned int pad_y = 0, + DimensionRoundingType round = DimensionRoundingType::FLOOR) : _stride(std::make_pair(stride_x, stride_y)), _pad_left(pad_x), _pad_top(pad_y), @@ -167,9 +170,12 @@ public: * @param[in] pad_bottom Padding across y on the bottom, in elements. * @param[in] round Dimensions rounding. 
*/ - PadStrideInfo(unsigned int stride_x, unsigned int stride_y, - unsigned int pad_left, unsigned int pad_right, - unsigned int pad_top, unsigned int pad_bottom, + PadStrideInfo(unsigned int stride_x, + unsigned int stride_y, + unsigned int pad_left, + unsigned int pad_right, + unsigned int pad_top, + unsigned int pad_bottom, DimensionRoundingType round) : _stride(std::make_pair(stride_x, stride_y)), _pad_left(pad_left), @@ -243,10 +249,10 @@ public: private: std::pair _stride; - unsigned int _pad_left; - unsigned int _pad_top; - unsigned int _pad_right; - unsigned int _pad_bottom; + unsigned int _pad_left; + unsigned int _pad_top; + unsigned int _pad_right; + unsigned int _pad_bottom; DimensionRoundingType _round_type; }; diff --git a/arm_compute/core/Dimensions.h b/arm_compute/core/Dimensions.h index 2ebfcd7f839326f5fa336b4d2850aeaf44c815cb..bb8692d70acf2cd1637174ca8b089aa49b5fcdae 100644 --- a/arm_compute/core/Dimensions.h +++ b/arm_compute/core/Dimensions.h @@ -50,8 +50,7 @@ public: * @param[in] dims Values to initialize the dimensions. */ template - explicit Dimensions(Ts... dims) - : _id{ { static_cast(dims)... } }, _num_dimensions{ sizeof...(dims) } + explicit Dimensions(Ts... dims) : _id{{static_cast(dims)...}}, _num_dimensions{sizeof...(dims)} { } @@ -78,7 +77,7 @@ public: ARM_COMPUTE_ERROR_ON(dimension >= num_max_dimensions); _id[dimension] = value; // Don't increase the number of dimensions if the new dimension is 1 - if(increase_dim_unit || value != 1) + if (increase_dim_unit || value != 1) { _num_dimensions = std::max(_num_dimensions, dimension + 1); } @@ -108,7 +107,7 @@ public: void increment(size_t dim, T step = 1) { ARM_COMPUTE_ERROR_ON(dim >= _num_dimensions); - if((std::numeric_limits::max() - _id[dim]) >= step) + if ((std::numeric_limits::max() - _id[dim]) >= step) { _id[dim] += step; } @@ -162,7 +161,7 @@ public: const size_t last = std::min(_num_dimensions, first + n); - if(last > (first + 1)) + if (last > (first + 1)) { // Collapse dimensions into the first _id[first] = std::accumulate(&_id[first], &_id[last], 1, std::multiplies()); @@ -196,7 +195,7 @@ public: void remove(size_t idx) { ARM_COMPUTE_ERROR_ON(_num_dimensions < 1); - if(idx >= _num_dimensions) + if (idx >= _num_dimensions) { return; } @@ -262,7 +261,7 @@ protected: ~Dimensions() = default; std::array _id; - size_t _num_dimensions{ 0 }; + size_t _num_dimensions{0}; }; /** Check that given dimensions are equal. 
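The CoreTypes.h and Coordinates.h hunks only re-wrap constructor parameter lists; as a reminder of how those constructors are typically called, a small sketch follows. The stride and padding numbers are arbitrary illustration, not values used by the library.

```cpp
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/CoreTypes.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

// 3x3 kernel at stride 2 with symmetric padding of 1, using the short
// (stride_x, stride_y, pad_x, pad_y, rounding) constructor.
const PadStrideInfo symmetric_info(2, 2, 1, 1, DimensionRoundingType::FLOOR);

// Asymmetric padding uses the longer overload reformatted above:
// (stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, rounding).
const PadStrideInfo asymmetric_info(1, 1, 0, 1, 0, 1, DimensionRoundingType::CEIL);

// Coordinates is a small variadic wrapper over Dimensions: element (x=3, y=0, z=1).
const Coordinates element_coord(3, 0, 1);
```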
@@ -289,5 +288,5 @@ inline bool operator!=(const Dimensions &lhs, const Dimensions &rhs) { return !(lhs == rhs); } -} +} // namespace arm_compute #endif /*ARM_COMPUTE_DIMENSIONS_H*/ diff --git a/arm_compute/core/Error.h b/arm_compute/core/Error.h index 0854f2c5274eddb3a5ac5052c4732548719d6d95..7a7033805a1d29c10723ba13248bdffbc5fe1547 100644 --- a/arm_compute/core/Error.h +++ b/arm_compute/core/Error.h @@ -53,8 +53,7 @@ class Status { public: /** Default Constructor **/ - Status() - : _code(ErrorCode::OK), _error_description(" ") + Status() : _code(ErrorCode::OK), _error_description(" ") { } /** Default Constructor @@ -101,7 +100,7 @@ public: /** Throws a runtime exception in case it contains a valid error status */ void throw_if_error() const { - if(!bool(*this)) + if (!bool(*this)) { internal_throw_on_error(); } @@ -141,7 +140,7 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file * @param[in] err Error status */ [[noreturn]] void throw_error(Status err); -} +} // namespace arm_compute /** To avoid unused variables warnings * * This is useful if for example a variable is only used @@ -156,7 +155,8 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file * @param[in] error_code Error code. * @param[in] msg Message to encapsulate. */ -#define ARM_COMPUTE_CREATE_ERROR(error_code, msg) arm_compute::create_error_msg(error_code, __func__, __FILE__, __LINE__, msg) +#define ARM_COMPUTE_CREATE_ERROR(error_code, msg) \ + arm_compute::create_error_msg(error_code, __func__, __FILE__, __LINE__, msg) /** Creates an error on location with a given message * @@ -166,7 +166,8 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file * @param[in] line Line in which the error occurred. * @param[in] msg Message to display before abandoning. */ -#define ARM_COMPUTE_CREATE_ERROR_LOC(error_code, func, file, line, msg) arm_compute::create_error_msg(error_code, func, file, line, msg) +#define ARM_COMPUTE_CREATE_ERROR_LOC(error_code, func, file, line, msg) \ + arm_compute::create_error_msg(error_code, func, file, line, msg) /** Creates an error on location with a given message. Accepts a message format * and a variable list of arguments matching the format description. @@ -178,14 +179,14 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file * @param[in] msg Error description message format. * @param[in] ... List of arguments matching the format description. */ -#define ARM_COMPUTE_CREATE_ERROR_LOC_VAR(error_code, func, file, line, msg, ...) \ - do \ - { \ - std::array out{ 0 }; \ - int offset = snprintf(out.data(), out.size(), "in %s %s:%d: ", func, file, line); \ - snprintf(out.data() + offset, out.size() - offset, msg, __VA_ARGS__); \ - arm_compute::create_error(error_code, std::string(out.data())); \ - } while(false) +#define ARM_COMPUTE_CREATE_ERROR_LOC_VAR(error_code, func, file, line, msg, ...) \ + do \ + { \ + std::array out{0}; \ + int offset = snprintf(out.data(), out.size(), "in %s %s:%d: ", func, file, line); \ + snprintf(out.data() + offset, out.size() - offset, msg, __VA_ARGS__); \ + arm_compute::create_error(error_code, std::string(out.data())); \ + } while (false) /** An error is returned with the given description. 
* @@ -195,7 +196,7 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file do \ { \ return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, __VA_ARGS__); \ - } while(false) + } while (false) /** Checks if a status contains an error and returns it * @@ -205,18 +206,17 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file do \ { \ const auto s = status; \ - if(!bool(s)) \ + if (!bool(s)) \ { \ return s; \ } \ - } while(false) + } while (false) /** Checks if an error value is valid if not throws an exception with the error * * @param[in] error Error value to check. */ -#define ARM_COMPUTE_THROW_ON_ERROR(error) \ - error.throw_if_error(); +#define ARM_COMPUTE_THROW_ON_ERROR(error) error.throw_if_error(); /** If the condition is true, an error is returned. Accepts a message format * and a variable list of arguments matching the format description. @@ -228,28 +228,29 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file #define ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(cond, msg, ...) \ do \ { \ - if(cond) \ + if (cond) \ { \ - std::array out{ 0 }; \ + std::array out{0}; \ int offset = snprintf(out.data(), out.size(), "in %s %s:%d: ", __func__, __FILE__, __LINE__); \ snprintf(out.data() + offset, out.size() - offset, msg, __VA_ARGS__); \ return arm_compute::create_error(arm_compute::ErrorCode::RUNTIME_ERROR, std::string(out.data())); \ } \ - } while(false) + } while (false) /** If the condition is true, an error is returned * * @param[in] cond Condition to evaluate. * @param[in] msg Error description message */ -#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg) \ - do \ - { \ - if(cond) \ - { \ - return arm_compute::create_error_msg(arm_compute::ErrorCode::RUNTIME_ERROR, __func__, __FILE__, __LINE__, msg); \ - } \ - } while(false) +#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg) \ + do \ + { \ + if (cond) \ + { \ + return arm_compute::create_error_msg(arm_compute::ErrorCode::RUNTIME_ERROR, __func__, __FILE__, __LINE__, \ + msg); \ + } \ + } while (false) /** If the condition is true, an error is thrown. Accepts a message format * and a variable list of arguments matching the format description. @@ -261,17 +262,17 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file * @param[in] msg Error description message format. * @param[in] ... List of arguments matching the format description. */ -#define ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(cond, func, file, line, msg, ...) \ - do \ - { \ - if(cond) \ - { \ - std::array out{ 0 }; \ - int offset = snprintf(out.data(), out.size(), "in %s %s:%d: ", func, file, line); \ - snprintf(out.data() + offset, out.size() - offset, msg, __VA_ARGS__); \ - return arm_compute::create_error(ErrorCode::RUNTIME_ERROR, std::string(out.data())); \ - } \ - } while(false) +#define ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(cond, func, file, line, msg, ...) \ + do \ + { \ + if (cond) \ + { \ + std::array out{0}; \ + int offset = snprintf(out.data(), out.size(), "in %s %s:%d: ", func, file, line); \ + snprintf(out.data() + offset, out.size() - offset, msg, __VA_ARGS__); \ + return arm_compute::create_error(ErrorCode::RUNTIME_ERROR, std::string(out.data())); \ + } \ + } while (false) /** If the condition is true, an error is thrown. 
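The Error.h macros re-flowed in this region are usually seen through the library's validate/configure idiom. The sketch below shows that pattern with an invented helper (validate_same_shape); only the macros and the Status type themselves come from this header.

```cpp
#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensorInfo.h"

// Invented helper in the library's usual validate() style: each macro
// early-returns a Status carrying file/line information and the failed condition.
arm_compute::Status validate_same_shape(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst)
{
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src == nullptr || dst == nullptr, "Tensor info must not be nullptr");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->tensor_shape() != dst->tensor_shape(), "Shapes must match");
    return arm_compute::Status{};
}

void configure_checked(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst)
{
    // Turns a failed Status into an exception (behaviour depends on the library's assert/exception build flags).
    ARM_COMPUTE_ERROR_THROW_ON(validate_same_shape(src, dst));
}
```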
* @@ -284,18 +285,17 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file #define ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(cond, func, file, line, msg) \ do \ { \ - if(cond) \ + if (cond) \ { \ return arm_compute::create_error_msg(ErrorCode::RUNTIME_ERROR, func, file, line, msg); \ } \ - } while(false) + } while (false) /** If the condition is true, an error is returned * * @param[in] cond Condition to evaluate */ -#define ARM_COMPUTE_RETURN_ERROR_ON(cond) \ - ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, #cond) +#define ARM_COMPUTE_RETURN_ERROR_ON(cond) ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, #cond) /** If the condition is true, an error is returned * @@ -314,11 +314,12 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file * @param[in] line Line in which the error occurred. * @param[in] msg Message to display. */ -#define ARM_COMPUTE_THROW_ERROR(func, file, line, msg) \ - do \ - { \ - arm_compute::throw_error(arm_compute::create_error_msg(arm_compute::ErrorCode::RUNTIME_ERROR, func, file, line, msg)); \ - } while(false) +#define ARM_COMPUTE_THROW_ERROR(func, file, line, msg) \ + do \ + { \ + arm_compute::throw_error( \ + arm_compute::create_error_msg(arm_compute::ErrorCode::RUNTIME_ERROR, func, file, line, msg)); \ + } while (false) /** Print the given message then throw an std::runtime_error. Accepts a message format * and a variable list of arguments matching the format description. @@ -332,11 +333,11 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file #define ARM_COMPUTE_THROW_ERROR_VAR(func, file, line, msg, ...) \ do \ { \ - std::array out{ 0 }; \ - int offset = snprintf(out.data(), out.size(), "in %s %s:%d: ", func, file, line); \ + std::array out{0}; \ + int offset = snprintf(out.data(), out.size(), "in %s %s:%d: ", func, file, line); \ snprintf(out.data() + offset, out.size() - offset, msg, __VA_ARGS__); \ arm_compute::throw_error(arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, std::string(out.data()))); \ - } while(false) + } while (false) /** Print the given message then throw an std::runtime_error. Accepts a message format * and a variable list of arguments matching the format description. @@ -361,7 +362,8 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file * @param[in] msg Error description message format. * @param[in] ... List of arguments matching the format description. */ -#define ARM_COMPUTE_ERROR_LOC_VAR(func, file, line, msg, ...) ARM_COMPUTE_THROW_ERROR_VAR(func, file, line, msg, __VA_ARGS__) // NOLINT +#define ARM_COMPUTE_ERROR_LOC_VAR(func, file, line, msg, ...) \ + ARM_COMPUTE_THROW_ERROR_VAR(func, file, line, msg, __VA_ARGS__) // NOLINT /** Print the given message then throw an std::runtime_error. * @@ -380,11 +382,11 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file #define ARM_COMPUTE_EXIT_ON_MSG(cond, msg) \ do \ { \ - if(cond) \ + if (cond) \ { \ ARM_COMPUTE_ERROR(msg); \ } \ - } while(false) + } while (false) /** If the condition is true, the given message is printed and program exits. Accepts a message format * and a variable list of arguments matching the format description. @@ -396,27 +398,25 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file #define ARM_COMPUTE_EXIT_ON_MSG_VAR(cond, msg, ...) 
\ do \ { \ - if(cond) \ + if (cond) \ { \ ARM_COMPUTE_ERROR_VAR(msg, __VA_ARGS__); \ } \ - } while(false) + } while (false) #ifdef ARM_COMPUTE_ASSERTS_ENABLED /** Checks if a status value is valid if not throws an exception with the error * * @param[in] status Status value to check. */ -#define ARM_COMPUTE_ERROR_THROW_ON(status) \ - status.throw_if_error() +#define ARM_COMPUTE_ERROR_THROW_ON(status) status.throw_if_error() /** If the condition is true, the given message is printed and an exception is thrown * * @param[in] cond Condition to evaluate. * @param[in] msg Message to display. */ -#define ARM_COMPUTE_ERROR_ON_MSG(cond, msg) \ - ARM_COMPUTE_EXIT_ON_MSG(cond, msg) +#define ARM_COMPUTE_ERROR_ON_MSG(cond, msg) ARM_COMPUTE_EXIT_ON_MSG(cond, msg) /** If the condition is true, the given message is printed and an exception is thrown. Accepts a message format * and a variable list of arguments matching the format description. @@ -425,8 +425,7 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file * @param[in] msg Error description message format. * @param[in] ... List of arguments matching the format description. */ -#define ARM_COMPUTE_ERROR_ON_MSG_VAR(cond, msg, ...) \ - ARM_COMPUTE_EXIT_ON_MSG_VAR(cond, msg, __VA_ARGS__) +#define ARM_COMPUTE_ERROR_ON_MSG_VAR(cond, msg, ...) ARM_COMPUTE_EXIT_ON_MSG_VAR(cond, msg, __VA_ARGS__) /** If the condition is true, the given message is printed and an exception is thrown. * @@ -439,11 +438,11 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file #define ARM_COMPUTE_ERROR_ON_LOC_MSG(cond, func, file, line, ...) \ do \ { \ - if(cond) \ + if (cond) \ { \ ARM_COMPUTE_ERROR_LOC_VAR(func, file, line, __VA_ARGS__); \ } \ - } while(false) + } while (false) /** If the condition is true, the given message is printed and an exception is thrown, otherwise value is returned * @@ -464,8 +463,7 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file * * @param[in] cond Condition to evaluate. */ -#define ARM_COMPUTE_ERROR_ON(cond) \ - ARM_COMPUTE_ERROR_ON_MSG(cond, #cond) +#define ARM_COMPUTE_ERROR_ON(cond) ARM_COMPUTE_ERROR_ON_MSG(cond, #cond) /** If the condition is true then an error message is printed and an exception thrown * diff --git a/arm_compute/core/Helpers.h b/arm_compute/core/Helpers.h index f19e1e12e054847e1c1edc9820599b79d9869b4b..960201510a126280d562eff4725c11e38b665258 100644 --- a/arm_compute/core/Helpers.h +++ b/arm_compute/core/Helpers.h @@ -96,7 +96,6 @@ public: void reset(size_t dimension); private: - /** Initialize a container iterator for the tensor with the specified number of dimensions, stride, buffer pointer and window. * * @param[in] num_dims The number of dimensions. @@ -112,8 +111,7 @@ private: class Dimension { public: - constexpr Dimension() - : _dim_start(0), _stride(0) + constexpr Dimension() : _dim_start(0), _stride(0) { } @@ -133,7 +131,7 @@ private: * @param[in,out] iterators Tensor iterators which will be updated by this function before calling lambda_function. */ template -inline void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... 
iterators); +inline void execute_window_loop(const Window &w, L &&lambda_function, Ts &&...iterators); /** Permutes given Dimensions according to a permutation vector * @@ -146,7 +144,7 @@ template inline void permute(Dimensions &dimensions, const PermutationVector &perm) { auto dimensions_copy = utility::make_array::num_max_dimensions>(dimensions.begin(), dimensions.end()); - for(unsigned int i = 0; i < perm.num_dimensions(); ++i) + for (unsigned int i = 0; i < perm.num_dimensions(); ++i) { T dimension_val = (perm[i] < dimensions.num_dimensions()) ? dimensions_copy[perm[i]] : 0; dimensions.set(i, dimension_val); @@ -163,7 +161,7 @@ inline void permute(Dimensions &dimensions, const PermutationVector &perm) inline void permute(TensorShape &shape, const PermutationVector &perm) { TensorShape shape_copy = shape; - for(unsigned int i = 0; i < perm.num_dimensions(); ++i) + for (unsigned int i = 0; i < perm.num_dimensions(); ++i) { size_t dimension_val = (perm[i] < shape.num_dimensions()) ? shape_copy[perm[i]] : 1; shape.set(i, dimension_val, false, false); // Avoid changes in _num_dimension @@ -180,8 +178,11 @@ inline void permute(TensorShape &shape, const PermutationVector &perm) * * @return The corresponding valid region */ -ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const TensorShape &dst_shape, - InterpolationPolicy interpolate_policy, SamplingPolicy sampling_policy, bool border_undefined); +ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, + const TensorShape &dst_shape, + InterpolationPolicy interpolate_policy, + SamplingPolicy sampling_policy, + bool border_undefined); /** Convert a linear index into n-dimensional coordinates. * @@ -224,7 +225,8 @@ const std::map> &get_layout_map(); * * @return The int conversion of the requested data layout index. */ -inline size_t get_data_layout_dimension_index(const DataLayout &data_layout, const DataLayoutDimension &data_layout_dimension); +inline size_t get_data_layout_dimension_index(const DataLayout &data_layout, + const DataLayoutDimension &data_layout_dimension); /** Get the DataLayoutDimension of a given index and layout. 
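Helpers.h and Helpers.inl in this part of the patch reformat the window-iteration machinery. As a reference for how execute_window_loop and Iterator are used, here is a minimal sketch with the runtime Tensor class (declared in headers not touched by this diff); it assumes the tensor has already been allocated.

```cpp
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

// Fill every element of an already-allocated FP32 tensor with its linear index.
void fill_with_index(Tensor &tensor)
{
    // One iteration per element over the full tensor shape.
    Window window;
    window.use_tensor_dimensions(tensor.info()->tensor_shape());

    Iterator it(&tensor, window);

    int idx = 0;
    execute_window_loop(
        window,
        [&](const Coordinates &)
        {
            // it.ptr() points at the element selected by the current coordinates.
            *reinterpret_cast<float *>(it.ptr()) = static_cast<float>(idx++);
        },
        it);
}
```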
* @@ -245,10 +247,17 @@ inline DataLayoutDimension get_index_data_layout_dimension(const DataLayout &dat * * @return the number of output tiles along the x and y directions of size "output_tile_size" */ -inline Size2D compute_winograd_convolution_tiles(const Size2D &in_dims, const Size2D &kernel_size, const Size2D &output_tile_size, const PadStrideInfo &conv_info) +inline Size2D compute_winograd_convolution_tiles(const Size2D &in_dims, + const Size2D &kernel_size, + const Size2D &output_tile_size, + const PadStrideInfo &conv_info) { - int num_tiles_x = std::ceil((in_dims.width - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right()) / static_cast(output_tile_size.width)); - int num_tiles_y = std::ceil((in_dims.height - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom()) / static_cast(output_tile_size.height)); + int num_tiles_x = + std::ceil((in_dims.width - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right()) / + static_cast(output_tile_size.width)); + int num_tiles_y = + std::ceil((in_dims.height - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom()) / + static_cast(output_tile_size.height)); // Clamp in case we provide paddings but we have 1D convolution num_tiles_x = std::min(num_tiles_x, static_cast(in_dims.width)); @@ -277,7 +286,7 @@ inline T wrap_around(T x, T m) */ inline Coordinates &convert_negative_axis(Coordinates &coords, int max_value) { - for(unsigned int i = 0; i < coords.num_dimensions(); ++i) + for (unsigned int i = 0; i < coords.num_dimensions(); ++i) { coords[i] = wrap_around(coords[i], max_value); } diff --git a/arm_compute/core/Helpers.inl b/arm_compute/core/Helpers.inl index ff902bba203a6c699281705e4a9c1ff60c793afd..60a21e94181c09b3c763a454ff800aefdcae27f6 100644 --- a/arm_compute/core/Helpers.inl +++ b/arm_compute/core/Helpers.inl @@ -32,12 +32,9 @@ template struct IncrementIterators { template - static void unroll(T &&it, Ts &&... iterators) + static void unroll(T &&it, Ts &&...iterators) { - auto increment = [](T && it) - { - it.increment(dimension); - }; + auto increment = [](T &&it) { it.increment(dimension); }; utility::for_each(increment, std::forward(it), std::forward(iterators)...); } static void unroll() @@ -50,14 +47,14 @@ template struct ForEachDimension { template - static void unroll(const Window &w, Coordinates &id, L &&lambda_function, Ts &&... iterators) + static void unroll(const Window &w, Coordinates &id, L &&lambda_function, Ts &&...iterators) { const auto &d = w[dim - 1]; - for(auto v = d.start(); v < d.end(); v += d.step(), IncrementIterators < dim - 1 >::unroll(iterators...)) + for (auto v = d.start(); v < d.end(); v += d.step(), IncrementIterators::unroll(iterators...)) { id.set(dim - 1, v); - ForEachDimension < dim - 1 >::unroll(w, id, lambda_function, iterators...); + ForEachDimension::unroll(w, id, lambda_function, iterators...); } } }; @@ -66,7 +63,7 @@ template <> struct ForEachDimension<0> { template - static void unroll(const Window &w, Coordinates &id, L &&lambda_function, Ts &&... iterators) + static void unroll(const Window &w, Coordinates &id, L &&lambda_function, Ts &&...iterators) { ARM_COMPUTE_UNUSED(w, iterators...); lambda_function(id); @@ -74,31 +71,31 @@ struct ForEachDimension<0> }; template -inline void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... 
iterators) +inline void execute_window_loop(const Window &w, L &&lambda_function, Ts &&...iterators) { w.validate(); - for(unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i) + for (unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_ERROR_ON(w[i].step() == 0); } Coordinates id; - ForEachDimension::unroll(w, id, std::forward(lambda_function), std::forward(iterators)...); + ForEachDimension::unroll(w, id, std::forward(lambda_function), + std::forward(iterators)...); } -inline constexpr Iterator::Iterator() - : _ptr(nullptr), _dims() +inline constexpr Iterator::Iterator() : _ptr(nullptr), _dims() { } -inline Iterator::Iterator(const ITensor *tensor, const Window &win) - : Iterator() +inline Iterator::Iterator(const ITensor *tensor, const Window &win) : Iterator() { ARM_COMPUTE_ERROR_ON(tensor == nullptr); ARM_COMPUTE_ERROR_ON(tensor->info() == nullptr); - initialize(tensor->info()->num_dimensions(), tensor->info()->strides_in_bytes(), tensor->buffer(), tensor->info()->offset_first_element_in_bytes(), win); + initialize(tensor->info()->num_dimensions(), tensor->info()->strides_in_bytes(), tensor->buffer(), + tensor->info()->offset_first_element_in_bytes(), win); } inline Iterator::Iterator(size_t num_dims, const Strides &strides, uint8_t *buffer, size_t offset, const Window &win) @@ -107,21 +104,22 @@ inline Iterator::Iterator(size_t num_dims, const Strides &strides, uint8_t *buff initialize(num_dims, strides, buffer, offset, win); } -inline void Iterator::initialize(size_t num_dims, const Strides &strides, uint8_t *buffer, size_t offset, const Window &win) +inline void +Iterator::initialize(size_t num_dims, const Strides &strides, uint8_t *buffer, size_t offset, const Window &win) { ARM_COMPUTE_ERROR_ON(buffer == nullptr); _ptr = buffer + offset; //Initialize the stride for each dimension and calculate the position of the first element of the iteration: - for(unsigned int n = 0; n < num_dims; ++n) + for (unsigned int n = 0; n < num_dims; ++n) { _dims[n]._stride = win[n].step() * strides[n]; std::get<0>(_dims)._dim_start += static_cast(strides[n]) * win[n].start(); } //Copy the starting point to all the dimensions: - for(unsigned int n = 1; n < Coordinates::num_max_dimensions; ++n) + for (unsigned int n = 1; n < Coordinates::num_max_dimensions; ++n) { _dims[n]._dim_start = std::get<0>(_dims)._dim_start; } @@ -135,7 +133,7 @@ inline void Iterator::increment(const size_t dimension) _dims[dimension]._dim_start += _dims[dimension]._stride; - for(unsigned int n = 0; n < dimension; ++n) + for (unsigned int n = 0; n < dimension; ++n) { _dims[n]._dim_start = _dims[dimension]._dim_start; } @@ -157,7 +155,7 @@ inline void Iterator::reset(const size_t dimension) _dims[dimension]._dim_start = _dims[dimension + 1]._dim_start; - for(unsigned int n = 0; n < dimension; ++n) + for (unsigned int n = 0; n < dimension; ++n) { _dims[n]._dim_start = _dims[dimension]._dim_start; } @@ -170,9 +168,9 @@ inline Coordinates index2coords(const TensorShape &shape, int index) ARM_COMPUTE_ERROR_ON_MSG(index < 0 || index >= num_elements, "Index has to be in [0, num_elements]!"); ARM_COMPUTE_ERROR_ON_MSG(num_elements == 0, "Cannot create coordinate from empty shape!"); - Coordinates coord{ 0 }; + Coordinates coord{0}; - for(int d = shape.num_dimensions() - 1; d >= 0; --d) + for (int d = shape.num_dimensions() - 1; d >= 0; --d) { num_elements /= shape[d]; coord.set(d, index / num_elements); @@ -191,7 +189,7 @@ inline int coords2index(const TensorShape &shape, const Coordinates &coord) int 
index = 0; int stride = 1; - for(unsigned int d = 0; d < coord.num_dimensions(); ++d) + for (unsigned int d = 0; d < coord.num_dimensions(); ++d) { index += coord[d] * stride; stride *= shape[d]; @@ -200,9 +198,11 @@ inline int coords2index(const TensorShape &shape, const Coordinates &coord) return index; } -inline size_t get_data_layout_dimension_index(const DataLayout &data_layout, const DataLayoutDimension &data_layout_dimension) +inline size_t get_data_layout_dimension_index(const DataLayout &data_layout, + const DataLayoutDimension &data_layout_dimension) { - ARM_COMPUTE_ERROR_ON_MSG(data_layout == DataLayout::UNKNOWN, "Cannot retrieve the dimension index for an unknown layout!"); + ARM_COMPUTE_ERROR_ON_MSG(data_layout == DataLayout::UNKNOWN, + "Cannot retrieve the dimension index for an unknown layout!"); const auto &dims = get_layout_map().at(data_layout); const auto &it = std::find(dims.cbegin(), dims.cend(), data_layout_dimension); ARM_COMPUTE_ERROR_ON_MSG(it == dims.cend(), "Invalid dimension for the given layout."); @@ -211,7 +211,8 @@ inline size_t get_data_layout_dimension_index(const DataLayout &data_layout, con inline DataLayoutDimension get_index_data_layout_dimension(const DataLayout &data_layout, const size_t index) { - ARM_COMPUTE_ERROR_ON_MSG(data_layout == DataLayout::UNKNOWN, "Cannot retrieve the layout dimension for an unknown layout!"); + ARM_COMPUTE_ERROR_ON_MSG(data_layout == DataLayout::UNKNOWN, + "Cannot retrieve the layout dimension for an unknown layout!"); const auto &dims = get_layout_map().at(data_layout); ARM_COMPUTE_ERROR_ON_MSG(index >= dims.size(), "Invalid index for the given layout."); return dims[index]; diff --git a/arm_compute/core/IAccessWindow.h b/arm_compute/core/IAccessWindow.h index 880f6d6b271faf04f7c8232e870cac6c16f2ccb9..9c9fb909150e965bd7ecda2e14c80dc08b83718e 100644 --- a/arm_compute/core/IAccessWindow.h +++ b/arm_compute/core/IAccessWindow.h @@ -100,7 +100,10 @@ public: * @return a valid region. * */ - virtual ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const = 0; + virtual ValidRegion compute_valid_region(const Window &window, + ValidRegion input_valid_region, + bool border_undefined, + BorderSize border_size) const = 0; }; /** Implementation of a rectangular access pattern. */ @@ -161,7 +164,10 @@ public: * @param[in] border_undefined (Optional) Undefined borders are excluded from the valid region. * @param[in] border_size (Optional) Size of the border around the XY-plane of the tensor. */ - void set_valid_region(const Window &window, const ValidRegion &input_valid_region, bool border_undefined = false, const BorderSize &border_size = BorderSize(0)); + void set_valid_region(const Window &window, + const ValidRegion &input_valid_region, + bool border_undefined = false, + const BorderSize &border_size = BorderSize(0)); /** Compute the valid region based on access pattern, valid region of the inputs and border mode. * @@ -189,7 +195,10 @@ public: * @return a valid region. 
* */ - ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override; + ValidRegion compute_valid_region(const Window &window, + ValidRegion input_valid_region, + bool border_undefined, + BorderSize border_size) const override; bool update_window_if_needed(Window &window) const override; bool update_padding_if_needed(const Window &window) override; diff --git a/arm_compute/core/IArray.h b/arm_compute/core/IArray.h index 6edbc1d5d56037817a84b67a594839f7bcf18bcf..3471fc9a86c4e5c631bdfa011840c6aa708bfc18 100644 --- a/arm_compute/core/IArray.h +++ b/arm_compute/core/IArray.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_IARRAY_H #include "arm_compute/core/Error.h" + #include #include @@ -36,14 +37,12 @@ class IArray { public: /** Default constructor */ - IArray() - : _num_values(0), _max_size(0) {}; + IArray() : _num_values(0), _max_size(0){}; /** Constructor: initializes an array which can contain up to max_num_points values * * @param[in] max_num_values Maximum number of values the array will be able to stored */ - IArray(size_t max_num_values) - : _num_values(0), _max_size(max_num_values) + IArray(size_t max_num_values) : _num_values(0), _max_size(max_num_values) { } /** Maximum number of values which can be stored in this array @@ -73,7 +72,7 @@ public: bool push_back(const T &val) { ARM_COMPUTE_ERROR_ON(0 == _max_size); - if(_num_values >= max_num_values()) + if (_num_values >= max_num_values()) { _num_values = max_num_values() + 1; return false; @@ -142,5 +141,5 @@ using IInt16Array = IArray; using IInt32Array = IArray; /** Interface for Array of floats. */ using IFloatArray = IArray; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_IARRAY_H */ diff --git a/arm_compute/core/IKernel.h b/arm_compute/core/IKernel.h index 98fd18cc9104adce056c1b84b01fe5840ee206b0..403a2c724e8245cb882bfa1a856f671300803f73 100644 --- a/arm_compute/core/IKernel.h +++ b/arm_compute/core/IKernel.h @@ -73,5 +73,5 @@ protected: private: Window _window; }; -} +} // namespace arm_compute #endif /*ARM_COMPUTE_IKERNEL_H */ diff --git a/arm_compute/core/ITensor.h b/arm_compute/core/ITensor.h index 32b93576bddc6461a72052b5df94b31ca175f030..aad831326104a68b71ad466733af4c566a54e3ec 100644 --- a/arm_compute/core/ITensor.h +++ b/arm_compute/core/ITensor.h @@ -94,9 +94,9 @@ public: void mark_as_used() const; private: - mutable bool _is_used = { true }; /**< Flag that marks if the tensor is used or not */ + mutable bool _is_used = {true}; /**< Flag that marks if the tensor is used or not */ }; using IImage = ITensor; -} +} // namespace arm_compute #endif /*ARM_COMPUTE_ITENSOR_H */ diff --git a/arm_compute/core/ITensorInfo.h b/arm_compute/core/ITensorInfo.h index e7c0b182c69e5f192dafb2d33f8498a815f33d38..c42f4b57a1ed5fb6e200358b75feae49b585babe 100644 --- a/arm_compute/core/ITensorInfo.h +++ b/arm_compute/core/ITensorInfo.h @@ -29,6 +29,7 @@ #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Utility.h" + #include "support/ICloneable.h" #include @@ -328,23 +329,23 @@ public: * not broadcast compatible. */ template - static std::pair broadcast_shape_and_valid_region(const Infos &... 
infos) + static std::pair broadcast_shape_and_valid_region(const Infos &...infos) { TensorShape bc_shape = TensorShape::broadcast_shape(infos.tensor_shape()...); - ValidRegion bc_valid_region{ Coordinates(), bc_shape }; + ValidRegion bc_valid_region{Coordinates(), bc_shape}; - auto broadcast_valid_region = [&bc_valid_region](const ITensorInfo & info) + auto broadcast_valid_region = [&bc_valid_region](const ITensorInfo &info) { - if(info.num_dimensions() != 0) + if (info.num_dimensions() != 0) { - for(size_t d = 0; d < bc_valid_region.shape.num_dimensions(); ++d) + for (size_t d = 0; d < bc_valid_region.shape.num_dimensions(); ++d) { const bool is_broadcast = (info.tensor_shape()[d] == 1); const int anchor_max = std::max(bc_valid_region.anchor[d], info.valid_region().anchor[d]); const size_t valid_min = std::min(bc_valid_region.shape[d], info.valid_region().shape[d]); - if(!is_broadcast || (valid_min == 0)) + if (!is_broadcast || (valid_min == 0)) { bc_valid_region.anchor.set(d, anchor_max); bc_valid_region.shape.set(d, valid_min); diff --git a/arm_compute/core/ITensorPack.h b/arm_compute/core/ITensorPack.h index 17b72418621b4f33aeb5176134cb22a437b1fcb1..f456c50769ed0699284b832b58f804fe75981921 100644 --- a/arm_compute/core/ITensorPack.h +++ b/arm_compute/core/ITensorPack.h @@ -42,18 +42,16 @@ public: struct PackElement { PackElement() = default; - PackElement(int id, ITensor *tensor) - : id(id), tensor(tensor), ctensor(nullptr) + PackElement(int id, ITensor *tensor) : id(id), tensor(tensor), ctensor(nullptr) { } - PackElement(int id, const ITensor *ctensor) - : id(id), tensor(nullptr), ctensor(ctensor) + PackElement(int id, const ITensor *ctensor) : id(id), tensor(nullptr), ctensor(ctensor) { } - int id{ -1 }; - ITensor *tensor{ nullptr }; - const ITensor *ctensor{ nullptr }; + int id{-1}; + ITensor *tensor{nullptr}; + const ITensor *ctensor{nullptr}; }; public: diff --git a/arm_compute/core/KernelDescriptors.h b/arm_compute/core/KernelDescriptors.h index 305766e8251721935efcd581388951674921ffc7..168a06a55cfe15e788b606aa73ed7940cc3252fc 100644 --- a/arm_compute/core/KernelDescriptors.h +++ b/arm_compute/core/KernelDescriptors.h @@ -21,12 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ACL_ARM_COMPUTE_CORE_KERNELDESCRIPTORS -#define ACL_ARM_COMPUTE_CORE_KERNELDESCRIPTORS +#ifndef ACL_ARM_COMPUTE_CORE_KERNELDESCRIPTORS_H +#define ACL_ARM_COMPUTE_CORE_KERNELDESCRIPTORS_H #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/experimental/IPostOp.h" #include "arm_compute/function_info/ActivationLayerInfo.h" namespace arm_compute @@ -34,24 +33,24 @@ namespace arm_compute /** Descriptor for FFT scale kernels */ struct FFTScaleKernelInfo { - float scale{ 0.f }; /**< Axis to perform the kernel on. */ - bool conjugate{ true }; /**< Flag to conjugate the output/ */ + float scale{0.f}; /**< Axis to perform the kernel on. */ + bool conjugate{true}; /**< Flag to conjugate the output/ */ }; /** Descriptor for FFT digit reverse kernels */ struct FFTDigitReverseKernelInfo { - unsigned int axis{ 0 }; /**< Axis to perform the kernel on. */ - bool conjugate{ false }; /**< Flag to conjugate the output/ */ + unsigned int axis{0}; /**< Axis to perform the kernel on. */ + bool conjugate{false}; /**< Flag to conjugate the output/ */ }; /** Descriptor used by the FFT core kernels */ struct FFTRadixStageKernelInfo { - unsigned int axis{ 0 }; /**< Axis to run the kernel on. 
*/ - unsigned int radix{ 0 }; /**< Radix to use. */ - unsigned int Nx{ 0 }; /**< Nx coefficient. */ - bool is_first_stage{ false }; /**< Flags if the FFT kernels is the first stage of a decomposed FFT. */ + unsigned int axis{0}; /**< Axis to run the kernel on. */ + unsigned int radix{0}; /**< Radix to use. */ + unsigned int Nx{0}; /**< Nx coefficient. */ + bool is_first_stage{false}; /**< Flags if the FFT kernels is the first stage of a decomposed FFT. */ }; class ITensorInfo; @@ -59,92 +58,102 @@ class ITensorInfo; struct GEMMKernelInfo { GEMMKernelInfo() = default; - GEMMKernelInfo( - unsigned int im, - unsigned int in, - unsigned int ik, - unsigned int idepth_output_gemm3d, - bool ireinterpret_input_as_3d, - bool ibroadcast_bias, - bool ifp_mixed_precision, - bool ihas_pad_y, - ActivationLayerInfo iactivation_info, - int inmult_transpose1xW_width, - int imult_interleave4x4_height, - GEMMLHSMatrixInfo ilhs_info, - GEMMRHSMatrixInfo irhs_info, - int32_t ina_offset, - int32_t inb_offset, - const experimental::PostOpList &ipost_ops = experimental::PostOpList {}) - : m(im), n(in), k(ik), depth_output_gemm3d(idepth_output_gemm3d), reinterpret_input_as_3d(ireinterpret_input_as_3d), broadcast_bias(ibroadcast_bias), fp_mixed_precision(ifp_mixed_precision), - has_pad_y(ihas_pad_y), activation_info(iactivation_info), mult_transpose1xW_width(inmult_transpose1xW_width), mult_interleave4x4_height(imult_interleave4x4_height), lhs_info(ilhs_info), - rhs_info(irhs_info), a_offset(ina_offset), b_offset(inb_offset), post_ops(ipost_ops) + GEMMKernelInfo(unsigned int im, + unsigned int in, + unsigned int ik, + unsigned int idepth_output_gemm3d, + bool ireinterpret_input_as_3d, + bool ibroadcast_bias, + bool ifp_mixed_precision, + bool ihas_pad_y, + ActivationLayerInfo iactivation_info, + int inmult_transpose1xW_width, + int imult_interleave4x4_height, + GEMMLHSMatrixInfo ilhs_info, + GEMMRHSMatrixInfo irhs_info, + int32_t ina_offset, + int32_t inb_offset) + : m(im), + n(in), + k(ik), + depth_output_gemm3d(idepth_output_gemm3d), + reinterpret_input_as_3d(ireinterpret_input_as_3d), + broadcast_bias(ibroadcast_bias), + fp_mixed_precision(ifp_mixed_precision), + has_pad_y(ihas_pad_y), + activation_info(iactivation_info), + mult_transpose1xW_width(inmult_transpose1xW_width), + mult_interleave4x4_height(imult_interleave4x4_height), + lhs_info(ilhs_info), + rhs_info(irhs_info), + a_offset(ina_offset), + b_offset(inb_offset) { } - unsigned int m{ 0 }; /**< Number of LHS rows*/ - unsigned int n{ 0 }; /**< Number of RHS columns*/ - unsigned int k{ 0 }; /**< Number of LHS columns or RHS rows */ - unsigned int depth_output_gemm3d{ 0 }; /**< Depth of the output tensor in case is reinterpreted as 3D */ - bool reinterpret_input_as_3d{ false }; /**< Flag used to reinterpret the input as 3D */ - bool broadcast_bias{ false }; /**< Flag used to broadcast the bias addition */ - bool fp_mixed_precision{ false }; /**< Flag used to indicate wider accumulators (32 bit instead of 16 for FP16). 
*/ - bool has_pad_y{ false }; /**< Flag used to indicate if the input/output tensors have internal pad on the y direction */ - ActivationLayerInfo activation_info{}; /**< Activation function to perform after the matrix multiplication */ - int mult_transpose1xW_width{ 1 }; /**< Multiplication factor for the width of the 1xW transposed block */ - int mult_interleave4x4_height{ 1 }; /**< Multiplication factor for the height of the 4x4 interleaved block */ - GEMMLHSMatrixInfo lhs_info{}; /**< LHS matrix information used to retrieve the number of rows processed by each thread */ - GEMMRHSMatrixInfo rhs_info{}; /**< RHS matrix information used for reshaping the RHS matrix */ - int32_t a_offset{ 0 }; /**< Offset to be added to each element of the matrix A */ - int32_t b_offset{ 0 }; /**< Offset to be added to each element of the matrix B */ - GEMMLowpOutputStageInfo output_stage{}; /**< GEMMLowp output stage information */ - experimental::PostOpList post_ops{}; /**< (EXPERIMENTAL_POST_OPS) Specifies a list of post ops to be fused after the main op. Note unsupported post ops would not be executed. - * If specified, automatically disable the @ref activation_info */ + unsigned int m{0}; /**< Number of LHS rows*/ + unsigned int n{0}; /**< Number of RHS columns*/ + unsigned int k{0}; /**< Number of LHS columns or RHS rows */ + unsigned int depth_output_gemm3d{0}; /**< Depth of the output tensor in case is reinterpreted as 3D */ + bool reinterpret_input_as_3d{false}; /**< Flag used to reinterpret the input as 3D */ + bool broadcast_bias{false}; /**< Flag used to broadcast the bias addition */ + bool fp_mixed_precision{false}; /**< Flag used to indicate wider accumulators (32 bit instead of 16 for FP16). */ + bool has_pad_y{ + false}; /**< Flag used to indicate if the input/output tensors have internal pad on the y direction */ + ActivationLayerInfo activation_info{}; /**< Activation function to perform after the matrix multiplication */ + int mult_transpose1xW_width{1}; /**< Multiplication factor for the width of the 1xW transposed block */ + int mult_interleave4x4_height{1}; /**< Multiplication factor for the height of the 4x4 interleaved block */ + GEMMLHSMatrixInfo + lhs_info{}; /**< LHS matrix information used to retrieve the number of rows processed by each thread */ + GEMMRHSMatrixInfo rhs_info{}; /**< RHS matrix information used for reshaping the RHS matrix */ + int32_t a_offset{0}; /**< Offset to be added to each element of the matrix A */ + int32_t b_offset{0}; /**< Offset to be added to each element of the matrix B */ + GEMMLowpOutputStageInfo output_stage{}; /**< GEMMLowp output stage information */ }; /** Compute descriptor used by the depthwise convolution native kernel */ struct DWCComputeKernelInfo { - unsigned int n0{ 1 }; /**< Number of columns processed by each thread */ - unsigned int m0{ 1 }; /**< Number of rows processed by each thread */ - bool export_input_to_cl_image{ false }; /**< Export input to cl_image */ - bool export_weights_to_cl_image{ false }; /**< Export the weights to cl_image */ + unsigned int n0{1}; /**< Number of columns processed by each thread */ + unsigned int m0{1}; /**< Number of rows processed by each thread */ + bool export_input_to_cl_image{false}; /**< Export input to cl_image */ + bool export_weights_to_cl_image{false}; /**< Export the weights to cl_image */ }; /** Compute descriptor used by the direct convolution kernel */ struct DirectConvComputeKernelInfo { - int32_t m0{ 1 }; /**< Number of rows to be processed by the kernel */ - int32_t n0{ 1 }; /**< 
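With the experimental PostOpList argument removed from GEMMKernelInfo in the hunk above, the descriptor is built from the fifteen remaining parameters. A hedged usage sketch, assuming the arm_compute headers are on the include path and behave as shown in these hunks:

    #include "arm_compute/core/KernelDescriptors.h"

    using namespace arm_compute;

    int main()
    {
        GEMMLHSMatrixInfo lhs_info{}; // block sizes left at their defaults
        GEMMRHSMatrixInfo rhs_info{}; // (see the GEMMLHS/GEMMRHSMatrixInfo hunks in Types.h below)

        // m, n, k, depth_output_gemm3d, reinterpret_input_as_3d, broadcast_bias,
        // fp_mixed_precision, has_pad_y, activation_info, mult_transpose1xW_width,
        // mult_interleave4x4_height, lhs_info, rhs_info, a_offset, b_offset
        const GEMMKernelInfo info(64, 64, 32, 0, false, false, false, false,
                                  ActivationLayerInfo(), 1, 1, lhs_info, rhs_info, 0, 0);
        return info.m == 64 ? 0 : 1;
    }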
Number of columns to be processed by the kernel */ - int32_t k0{ 1 }; /**< Number of partial accumulations to be processed in a single iteration by the kernel */ - bool export_weights_to_cl_image{ false }; /**< Flag to export the weights to cl_image */ - bool export_output_to_cl_image{ false }; /**< Flag to export the output to cl_image */ - bool export_input_to_cl_image{ false }; /**< Flag to export the input to cl_image */ + int32_t m0{1}; /**< Number of rows to be processed by the kernel */ + int32_t n0{1}; /**< Number of columns to be processed by the kernel */ + int32_t k0{1}; /**< Number of partial accumulations to be processed in a single iteration by the kernel */ + bool export_weights_to_cl_image{false}; /**< Flag to export the weights to cl_image */ + bool export_output_to_cl_image{false}; /**< Flag to export the output to cl_image */ + bool export_input_to_cl_image{false}; /**< Flag to export the input to cl_image */ }; /** Descriptor used by the softmax kernels */ struct SoftmaxKernelInfo { - float beta{ 1.f }; /**< A scaling factor for the exponent with default value 1.0 */ - bool is_log{ false }; /**< Flag used to perform Log Softmax operation */ - DataType input_data_type{ DataType::UNKNOWN }; /**< Input tensor data type */ - int32_t axis{ 0 }; /**< The dimension in which to apply softmax. */ + float beta{1.f}; /**< A scaling factor for the exponent with default value 1.0 */ + bool is_log{false}; /**< Flag used to perform Log Softmax operation */ + DataType input_data_type{DataType::UNKNOWN}; /**< Input tensor data type */ + int32_t axis{0}; /**< The dimension in which to apply softmax. */ }; /** Descriptor used by the direct convolution layer output stage kernels */ struct DirectConvolutionLayerOutputStageKernelInfo { - int32_t result_fixedpoint_multiplier{ 0 }; /**< Result output stage multiplier used for quantizing */ - int32_t result_shift{ 0 }; /**< Result output stage shift used for quantizing */ - int32_t result_offset_after_shift{ 0 }; /**< Result offset used for quantizing */ - DataType output_data_type{ DataType::UNKNOWN }; /**< Output tensor data type to use if the output is not initialized */ + int32_t result_fixedpoint_multiplier{0}; /**< Result output stage multiplier used for quantizing */ + int32_t result_shift{0}; /**< Result output stage shift used for quantizing */ + int32_t result_offset_after_shift{0}; /**< Result offset used for quantizing */ + DataType output_data_type{ + DataType::UNKNOWN}; /**< Output tensor data type to use if the output is not initialized */ }; struct InstanceNormalizationLayerKernelInfo { /** Default constructor */ - InstanceNormalizationLayerKernelInfo() - : InstanceNormalizationLayerKernelInfo(1.f, 0.f, 1e-12, true) + InstanceNormalizationLayerKernelInfo() : InstanceNormalizationLayerKernelInfo(1.f, 0.f, 1e-12, true) { } /** Constructor @@ -181,10 +190,10 @@ struct GEMMLowpReductionKernelInfo { } - int32_t k{ 0 }; /**< Number of matrix columns/rows */ - bool is_reshaped{ false }; /**< True if the input tensor has been reshaped */ - int32_t scalar{ 0 }; /**< Scalar value to multiply each reduced column/row by */ - bool mul_by_scalar{ false }; /**< True if each column/row reduction has to be multiplied by a scalar value */ + int32_t k{0}; /**< Number of matrix columns/rows */ + bool is_reshaped{false}; /**< True if the input tensor has been reshaped */ + int32_t scalar{0}; /**< Scalar value to multiply each reduced column/row by */ + bool mul_by_scalar{false}; /**< True if each column/row reduction has to be multiplied by a scalar 
value */ }; struct ScaleKernelInfo @@ -206,13 +215,13 @@ struct ScaleKernelInfo bool use_padding = true, bool align_corners = false, DataLayout data_layout = DataLayout::UNKNOWN) noexcept - : interpolation_policy{ interpolation_policy }, - border_mode{ border_mode }, - constant_border_value{ constant_border_value }, - sampling_policy{ sampling_policy }, - use_padding{ use_padding }, - align_corners{ align_corners }, - data_layout{ data_layout } + : interpolation_policy{interpolation_policy}, + border_mode{border_mode}, + constant_border_value{constant_border_value}, + sampling_policy{sampling_policy}, + use_padding{use_padding}, + align_corners{align_corners}, + data_layout{data_layout} { } @@ -228,16 +237,17 @@ struct ScaleKernelInfo struct MatMulKernelInfo { MatMulKernelInfo() = default; - MatMulKernelInfo(bool adj_lhs, bool adj_rhs, int m0 = 1, int n0 = 1, int k0 = 1, bool export_rhs_to_cl_image = false) - : adj_lhs{ adj_lhs }, adj_rhs{ adj_rhs }, m0{ m0 }, n0{ n0 }, k0{ k0 }, export_rhs_to_cl_image{ export_rhs_to_cl_image } + MatMulKernelInfo( + bool adj_lhs, bool adj_rhs, int m0 = 1, int n0 = 1, int k0 = 1, bool export_rhs_to_cl_image = false) + : adj_lhs{adj_lhs}, adj_rhs{adj_rhs}, m0{m0}, n0{n0}, k0{k0}, export_rhs_to_cl_image{export_rhs_to_cl_image} { } - bool adj_lhs{ false }; /**< Get Adjoint LHS flag value */ - bool adj_rhs{ false }; /**< Get Adjoint RHS flag value */ - int m0{ 1 }; /**< Number of output rows processed by each work-item*/ - int n0{ 1 }; /**< Number of output columns processed by each work-item*/ - int k0{ 1 }; /**< Number of inner accumulations */ - bool export_rhs_to_cl_image{ false }; /**< Flag to know whether the RHS tensor should be exported to cl_image*/ + bool adj_lhs{false}; /**< Get Adjoint LHS flag value */ + bool adj_rhs{false}; /**< Get Adjoint RHS flag value */ + int m0{1}; /**< Number of output rows processed by each work-item*/ + int n0{1}; /**< Number of output columns processed by each work-item*/ + int k0{1}; /**< Number of inner accumulations */ + bool export_rhs_to_cl_image{false}; /**< Flag to know whether the RHS tensor should be exported to cl_image*/ }; } // namespace arm_compute -#endif /* ACL_ARM_COMPUTE_CORE_KERNELDESCRIPTORS */ +#endif // ACL_ARM_COMPUTE_CORE_KERNELDESCRIPTORS_H diff --git a/arm_compute/core/Log.h b/arm_compute/core/Log.h index bc0ecb802e0d83484db704a4500d13fdaf83a856..03b861f765c34e8c24641a3f540603fe67d59ae4 100644 --- a/arm_compute/core/Log.h +++ b/arm_compute/core/Log.h @@ -34,11 +34,11 @@ #define ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER() \ do \ { \ - if(arm_compute::logging::LoggerRegistry::get().logger("CORE") == nullptr) \ + if (arm_compute::logging::LoggerRegistry::get().logger("CORE") == nullptr) \ { \ arm_compute::logging::LoggerRegistry::get().create_reserved_loggers(); \ } \ - } while(false) + } while (false) #else /* ARM_COMPUTE_LOGGING_ENABLED */ #define ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER() #endif /* ARM_COMPUTE_LOGGING_ENABLED */ @@ -53,7 +53,7 @@ { \ ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER(); \ ARM_COMPUTE_LOG_MSG("CORE", log_level, msg); \ - } while(false) + } while (false) /** Log a message with format to the core system logger * @@ -66,7 +66,7 @@ { \ ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER(); \ ARM_COMPUTE_LOG_MSG_WITH_FORMAT("CORE", log_level, fmt, __VA_ARGS__); \ - } while(false) + } while (false) /** Log a stream to the core system logger * @@ -78,7 +78,7 @@ { \ ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER(); \ ARM_COMPUTE_LOG_STREAM("CORE", log_level, ss); \ - } while(false) + } while (false) /** Log 
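The Log.h hunks above only reformat the do { ... } while (false) wrappers, but the idiom is worth spelling out: it turns a multi-statement macro into a single statement so it composes safely with unbraced if/else. A minimal generic illustration (not the library macros):

    #include <cstdio>

    // The do/while(false) wrapper makes the macro expand to exactly one
    // statement, so the trailing semicolon at the call site is consumed cleanly.
    #define LOG_TWICE(msg)  \
        do                  \
        {                   \
            std::puts(msg); \
            std::puts(msg); \
        } while (false)

    int main()
    {
        const bool verbose = true;
        if (verbose)
            LOG_TWICE("hello"); // still pairs correctly with the else below
        else
            std::puts("quiet");
        return 0;
    }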
information level message to the core system logger * @@ -89,7 +89,7 @@ { \ ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER(); \ ARM_COMPUTE_LOG_MSG_CORE(arm_compute::logging::LogLevel::INFO, msg); \ - } while(false) + } while (false) /** Log information level formatted message to the core system logger * @@ -101,7 +101,7 @@ { \ ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER(); \ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_CORE(arm_compute::logging::LogLevel::INFO, #fmt, __VA_ARGS__); \ - } while(false) + } while (false) /** Log information level stream to the core system logger * @@ -112,6 +112,6 @@ { \ ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER(); \ ARM_COMPUTE_LOG_STREAM_CORE(arm_compute::logging::LogLevel::INFO, ss); \ - } while(false) + } while (false) #endif /* ARM_COMPUTE_LOGGING_MACROS_H */ diff --git a/arm_compute/core/PixelValue.h b/arm_compute/core/PixelValue.h index 790f58a7935ab3f210a1038695506c177e5dbd6f..0b4df4f2e2e68c8fb49ed4580f4ea1b4212c0dee 100644 --- a/arm_compute/core/PixelValue.h +++ b/arm_compute/core/PixelValue.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_PIXELVALUE_H #define ARM_COMPUTE_PIXELVALUE_H -#include "arm_compute/core/Types.h" #include "arm_compute/core/QuantizationInfo.h" +#include "arm_compute/core/Types.h" #include @@ -36,11 +36,7 @@ class PixelValue { public: /** Default constructor: value initialized to 0 */ - PixelValue() noexcept - : value - { - int64_t(0) - } + PixelValue() noexcept : value{int64_t(0)} { } /** Initialize the union with a pixel value of chosen datatype @@ -49,10 +45,9 @@ public: * @param[in] datatype DataType that @p v have to be stored * @param[in] qinfo (Optional) QuantizationInfo to apply in case of quantized data types to @p v */ - PixelValue(double v, DataType datatype, QuantizationInfo qinfo = QuantizationInfo()) - : PixelValue() + PixelValue(double v, DataType datatype, QuantizationInfo qinfo = QuantizationInfo()) : PixelValue() { - switch(datatype) + switch (datatype) { case DataType::U8: value.u8 = static_cast(v); @@ -112,8 +107,7 @@ public: * * @param[in] v S8 value. */ - PixelValue(int8_t v) - : PixelValue() + PixelValue(int8_t v) : PixelValue() { value.s8 = v; } @@ -121,8 +115,7 @@ public: * * @param[in] v U8 value. */ - PixelValue(uint8_t v) - : PixelValue() + PixelValue(uint8_t v) : PixelValue() { value.u8 = v; } @@ -130,8 +123,7 @@ public: * * @param[in] v U16 value. */ - PixelValue(uint16_t v) - : PixelValue() + PixelValue(uint16_t v) : PixelValue() { value.u16 = v; } @@ -139,8 +131,7 @@ public: * * @param[in] v S16 value. */ - PixelValue(int16_t v) - : PixelValue() + PixelValue(int16_t v) : PixelValue() { value.s16 = v; } @@ -148,8 +139,7 @@ public: * * @param[in] v U32 value. */ - PixelValue(uint32_t v) - : PixelValue() + PixelValue(uint32_t v) : PixelValue() { value.u32 = v; } @@ -157,8 +147,7 @@ public: * * @param[in] v S32 value. */ - PixelValue(int32_t v) - : PixelValue() + PixelValue(int32_t v) : PixelValue() { value.s32 = v; } @@ -167,8 +156,7 @@ public: * * @param[in] v U64 value. */ - PixelValue(uint64_t v) - : PixelValue() + PixelValue(uint64_t v) : PixelValue() { value.u64 = v; } @@ -176,8 +164,7 @@ public: * * @param[in] v S64 value. */ - PixelValue(int64_t v) - : PixelValue() + PixelValue(int64_t v) : PixelValue() { value.s64 = v; } @@ -185,8 +172,7 @@ public: * * @param[in] v F16 value. */ - PixelValue(bfloat16 v) - : PixelValue() + PixelValue(bfloat16 v) : PixelValue() { value.bf16 = v; } @@ -194,8 +180,7 @@ public: * * @param[in] v F16 value. 
*/ - PixelValue(half v) - : PixelValue() + PixelValue(half v) : PixelValue() { value.f16 = v; } @@ -203,8 +188,7 @@ public: * * @param[in] v F32 value. */ - PixelValue(float v) - : PixelValue() + PixelValue(float v) : PixelValue() { value.f32 = v; } @@ -212,8 +196,7 @@ public: * * @param[in] v F64 value. */ - PixelValue(double v) - : PixelValue() + PixelValue(double v) : PixelValue() { value.f64 = v; } @@ -221,23 +204,23 @@ public: * Use the field corresponding to the image format */ union - { - uint64_t u64; /**< Single channel U64 */ - int64_t s64; /**< Single channel S64 */ - uint8_t rgb[3]; /**< 3 channels: RGB888 */ - uint8_t yuv[3]; /**< 3 channels: Any YUV format */ - uint8_t rgbx[4]; /**< 4 channels: RGBX8888 */ - double f64; /**< Single channel double */ - float f32; /**< Single channel float 32 */ - half f16; /**< Single channel F16 */ - bfloat16 bf16; /**< Single channel brain floating-point number */ - uint8_t u8; /**< Single channel U8 */ - int8_t s8; /**< Single channel S8 */ - uint16_t u16; /**< Single channel U16 */ - int16_t s16; /**< Single channel S16 */ - uint32_t u32; /**< Single channel U32 */ - int32_t s32; /**< Single channel S32 */ - } value; + { + uint64_t u64; /**< Single channel U64 */ + int64_t s64; /**< Single channel S64 */ + uint8_t rgb[3]; /**< 3 channels: RGB888 */ + uint8_t yuv[3]; /**< 3 channels: Any YUV format */ + uint8_t rgbx[4]; /**< 4 channels: RGBX8888 */ + double f64; /**< Single channel double */ + float f32; /**< Single channel float 32 */ + half f16; /**< Single channel F16 */ + bfloat16 bf16; /**< Single channel brain floating-point number */ + uint8_t u8; /**< Single channel U8 */ + int8_t s8; /**< Single channel S8 */ + uint16_t u16; /**< Single channel U16 */ + int16_t s16; /**< Single channel S16 */ + uint32_t u32; /**< Single channel U32 */ + int32_t s32; /**< Single channel S32 */ + } value; /** Interpret the pixel value as a U8 * * @param[out] v Returned value diff --git a/arm_compute/core/QuantizationInfo.h b/arm_compute/core/QuantizationInfo.h index 8fa513eee1e61e442eae70589bcbe6166da27fd3..471b8c57aba6100f8962e270659ebd252968fd57 100644 --- a/arm_compute/core/QuantizationInfo.h +++ b/arm_compute/core/QuantizationInfo.h @@ -26,6 +26,7 @@ #include "arm_compute/core/Rounding.h" #include "arm_compute/core/utils/misc/Utility.h" + #include "support/ToolchainSupport.h" #include @@ -41,8 +42,7 @@ using qasymm16_t = uint16_t; /**< 16 bit quantized asymmetric scalar value struct UniformQuantizationInfo { /** Default constructor */ - UniformQuantizationInfo() - : scale(0.f), offset(0) + UniformQuantizationInfo() : scale(0.f), offset(0) { } /** Constructor @@ -50,8 +50,7 @@ struct UniformQuantizationInfo * @param[in] scale Quantization scale * @param[in] offset Quantization offset */ - UniformQuantizationInfo(float scale, int32_t offset) - : scale(scale), offset(offset) + UniformQuantizationInfo(float scale, int32_t offset) : scale(scale), offset(offset) { } /** Checks if the scale and offset are both zero */ @@ -69,9 +68,7 @@ class QuantizationInfo { public: /** Default constructor */ - QuantizationInfo() noexcept - : _scale(), - _offset() + QuantizationInfo() noexcept : _scale(), _offset() { } /** Construct quantization info. @@ -80,8 +77,7 @@ public: * * @param[in] scale Scale. */ - QuantizationInfo(float scale) - : _scale(1, scale), _offset() + QuantizationInfo(float scale) : _scale(1, scale), _offset() { } /** Construct quantization info. @@ -91,8 +87,7 @@ public: * @param[in] scale Scale. * @param[in] offset Offset. 
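The PixelValue hunks above keep the same behaviour while compacting the constructors: each typed constructor writes the matching union member, and the (double, DataType) overload converts the value to the requested type. A small usage sketch, assuming the arm_compute headers are available as shown:

    #include "arm_compute/core/PixelValue.h"

    using namespace arm_compute;

    int main()
    {
        PixelValue a(int16_t{-3});       // typed constructor: a.value.s16 == -3
        PixelValue b(1.0, DataType::U8); // converting constructor: b.value.u8 == 1
        return (a.value.s16 == -3 && b.value.u8 == 1) ? 0 : 1;
    }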
*/ - QuantizationInfo(float scale, int offset) - : _scale(1, scale), _offset(1, offset) + QuantizationInfo(float scale, int offset) : _scale(1, scale), _offset(1, offset) { } /** Construct quantization info. @@ -101,8 +96,7 @@ public: * * @param[in] scale Scale. */ - QuantizationInfo(std::vector scale) - : _scale(scale), _offset() + QuantizationInfo(std::vector scale) : _scale(scale), _offset() { } /** Construct quantization info. @@ -112,8 +106,7 @@ public: * @param[in] scale Scale. * @param[in] offset Offset. */ - QuantizationInfo(std::vector scale, std::vector offset) - : _scale(scale), _offset(offset) + QuantizationInfo(std::vector scale, std::vector offset) : _scale(scale), _offset(offset) { } /** Scale vector accessor @@ -208,8 +201,7 @@ inline bool operator!=(const UniformQuantizationInfo &lhs, const UniformQuantiza template struct Qasymm8QuantizationHelper { - static_assert(std::is_same::value - || std::is_same::value, + static_assert(std::is_same::value || std::is_same::value, "quantized type should be either uint8_t or int8_t."); /** Quantize a value given a 8-bit asymmetric quantization scheme @@ -234,9 +226,10 @@ struct Qasymm8QuantizationHelper * * @return Quantized value */ - static inline QUANTIZED_TYPE quantize(float value, const UniformQuantizationInfo &qinfo, RoundingPolicy rounding_policy) + static inline QUANTIZED_TYPE + quantize(float value, const UniformQuantizationInfo &qinfo, RoundingPolicy rounding_policy) { - if(rounding_policy == RoundingPolicy::TO_NEAREST_UP) + if (rounding_policy == RoundingPolicy::TO_NEAREST_UP) { return quantize(value, qinfo); } @@ -254,7 +247,8 @@ struct Qasymm8QuantizationHelper * * @return Quantized value */ - static inline QUANTIZED_TYPE quantize(float value, const QuantizationInfo &qinfo, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP) + static inline QUANTIZED_TYPE + quantize(float value, const QuantizationInfo &qinfo, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP) { const UniformQuantizationInfo uqinfo = qinfo.uniform(); ARM_COMPUTE_ERROR_ON(uqinfo.scale == 0); @@ -297,7 +291,8 @@ struct Qasymm8QuantizationHelper * @return Quantized value */ template -inline uint8_t quantize_qasymm8(float value, const INFO_TYPE &qinfo, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP) +inline uint8_t +quantize_qasymm8(float value, const INFO_TYPE &qinfo, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP) { return Qasymm8QuantizationHelper::quantize(value, qinfo, rounding_policy); } @@ -311,7 +306,9 @@ inline uint8_t quantize_qasymm8(float value, const INFO_TYPE &qinfo, RoundingPol * @return Quantized value */ template -inline int8_t quantize_qasymm8_signed(float value, const INFO_TYPE &qinfo, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP) +inline int8_t quantize_qasymm8_signed(float value, + const INFO_TYPE &qinfo, + RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP) { return Qasymm8QuantizationHelper::quantize(value, qinfo, rounding_policy); } @@ -441,7 +438,9 @@ inline float dequantize(uint16_t value, float scale, int32_t offset) * * @return Quantized value */ -inline int16_t quantize_qsymm16(float value, const UniformQuantizationInfo &qinfo, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP) +inline int16_t quantize_qsymm16(float value, + const UniformQuantizationInfo &qinfo, + RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP) { int quantized = arm_compute::round(value / qinfo.scale, rounding_policy); quantized = 
arm_compute::utility::clamp(quantized); @@ -492,7 +491,9 @@ inline float dequantize_qsymm16(int16_t value, const QuantizationInfo &qinfo) * * @return Quantized value */ -inline uint16_t quantize_qasymm16(float value, const UniformQuantizationInfo &qinfo, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP) +inline uint16_t quantize_qasymm16(float value, + const UniformQuantizationInfo &qinfo, + RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP) { int quantized = arm_compute::round(value / qinfo.scale, rounding_policy) + qinfo.offset; quantized = arm_compute::utility::clamp(quantized); @@ -565,7 +566,8 @@ inline float dequantize_qasymm16(uint16_t value, const QuantizationInfo &qinfo) * z_n = - z_i * s_i / s_o + z_o * */ -inline UniformQuantizationInfo compute_requantization_scale_offset(const UniformQuantizationInfo &uqinfo_in, const UniformQuantizationInfo &uqinfo_out) +inline UniformQuantizationInfo compute_requantization_scale_offset(const UniformQuantizationInfo &uqinfo_in, + const UniformQuantizationInfo &uqinfo_out) { float scale_to_apply = uqinfo_out.scale; int32_t offset_to_apply = uqinfo_out.offset; diff --git a/arm_compute/core/Rounding.h b/arm_compute/core/Rounding.h index b6817b5107ae5119dc3a3d3867fad39d3878ff4e..30a5a0fe9dbd91989e2a868e827d6ac65da9d5c6 100644 --- a/arm_compute/core/Rounding.h +++ b/arm_compute/core/Rounding.h @@ -42,5 +42,5 @@ enum class RoundingPolicy * @return Rounded value of the argument x. */ int round(float x, RoundingPolicy rounding_policy); -} +} // namespace arm_compute #endif /*ARM_COMPUTE_ROUNDING_H */ diff --git a/arm_compute/core/Size2D.h b/arm_compute/core/Size2D.h index f3e9bea4c7f594cba73c56a6f082c203b450ee8d..672b392050152b24baa89ef19305a8bf7b89e899 100644 --- a/arm_compute/core/Size2D.h +++ b/arm_compute/core/Size2D.h @@ -41,9 +41,7 @@ public: * @param[in] w Width of the image or rectangle * @param[in] h Height of the image or rectangle */ - Size2D(size_t w, size_t h) noexcept - : width(w), - height(h) + Size2D(size_t w, size_t h) noexcept : width(w), height(h) { } /** The area of the image or rectangle calculated as (width * height) @@ -90,5 +88,5 @@ public: size_t width = {}; /**< Width of the image region or rectangle */ size_t height = {}; /**< Height of the image region or rectangle */ }; -} +} // namespace arm_compute #endif /*ARM_COMPUTE_SIZE2D_H */ diff --git a/arm_compute/core/Size3D.h b/arm_compute/core/Size3D.h index 4241ed4f7e0afc4ea68ac44e5a2e7f04eefd4658..e2dc6fe012870c8c06b660a46ff87a0d0dc58eb2 100644 --- a/arm_compute/core/Size3D.h +++ b/arm_compute/core/Size3D.h @@ -40,8 +40,7 @@ public: * @param[in] h Height of the 3D shape or object * @param[in] d Depth of the 3D shape or object */ - Size3D(size_t w, size_t h, size_t d) noexcept - : width(w), height(h), depth(d) + Size3D(size_t w, size_t h, size_t d) noexcept : width(w), height(h), depth(d) { } diff --git a/arm_compute/core/Steps.h b/arm_compute/core/Steps.h index 208fc4b294f45545240e5860d78ebd9bfcce7cf0..6b261becc0382d3663e8f6e90d968dfb1cbf6218 100644 --- a/arm_compute/core/Steps.h +++ b/arm_compute/core/Steps.h @@ -45,8 +45,7 @@ public: * @param[in] steps Values to initialize the steps. */ template - Steps(Ts... steps) - : Dimensions{ steps... } + Steps(Ts... 
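The QuantizationInfo.h hunks above reflow the quantize helpers without changing the arithmetic: quantize divides by the scale, rounds, adds the offset and clamps; dequantize inverts it. A standalone sketch of that arithmetic for 8-bit asymmetric data (illustrative, not the library functions; rounding here is plain round-half-away-from-zero):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // q = clamp(round(v / scale) + offset, 0, 255)
    uint8_t quantize_u8(float v, float scale, int32_t offset)
    {
        const int32_t q = static_cast<int32_t>(std::lround(v / scale)) + offset;
        return static_cast<uint8_t>(std::min<int32_t>(255, std::max<int32_t>(0, q)));
    }

    // v = scale * (q - offset)
    float dequantize_u8(uint8_t q, float scale, int32_t offset)
    {
        return scale * static_cast<float>(static_cast<int32_t>(q) - offset);
    }

    int main()
    {
        const float   scale  = 0.5f;
        const int32_t offset = 10;
        const uint8_t q      = quantize_u8(2.0f, scale, offset); // round(2/0.5) + 10 = 14
        std::printf("q=%u back=%.1f\n", static_cast<unsigned>(q),
                    dequantize_u8(q, scale, offset)); // q=14 back=2.0
        return 0;
    }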
steps) : Dimensions{steps...} { // Initialize empty dimensions to 1 std::fill(_id.begin() + _num_dimensions, _id.end(), 1); @@ -62,5 +61,5 @@ public: /** Default destructor */ ~Steps() = default; }; -} +} // namespace arm_compute #endif /*ARM_COMPUTE_STEPS_H*/ diff --git a/arm_compute/core/Strides.h b/arm_compute/core/Strides.h index b582d066f7174475b849852ec9493d0434ee045f..627b219987b054d5e8ca10a2f03583ce3de70233 100644 --- a/arm_compute/core/Strides.h +++ b/arm_compute/core/Strides.h @@ -43,8 +43,7 @@ public: * @param[in] strides Values to initialize the strides. */ template - constexpr Strides(Ts... strides) - : Dimensions{ strides... } + constexpr Strides(Ts... strides) : Dimensions{strides...} { } /** Allow instances of this class to be copy constructed */ diff --git a/arm_compute/core/SubTensorInfo.h b/arm_compute/core/SubTensorInfo.h index 21703b0d9359d88d954415f8fc69e7c41061a644..7a3ee2cfd0f2c6588955589d935da03925c05d65 100644 --- a/arm_compute/core/SubTensorInfo.h +++ b/arm_compute/core/SubTensorInfo.h @@ -24,10 +24,9 @@ #ifndef ARM_COMPUTE_SUBTENSORINFO_H #define ARM_COMPUTE_SUBTENSORINFO_H -#include "arm_compute/core/ITensorInfo.h" - #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/Strides.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/TensorShape.h" @@ -73,7 +72,7 @@ public: // Inherited methods overridden: std::unique_ptr clone() const override; - ITensorInfo &set_data_type(DataType data_type) override + ITensorInfo &set_data_type(DataType data_type) override { ARM_COMPUTE_ERROR_ON(_parent == nullptr); _parent->set_data_type(data_type); @@ -143,7 +142,7 @@ public: return _parent->offset_element_in_bytes(_coords); } int32_t offset_element_in_bytes(const Coordinates &pos) const override; - size_t element_size() const override + size_t element_size() const override { ARM_COMPUTE_ERROR_ON(_parent == nullptr); return _parent->element_size(); @@ -227,7 +226,7 @@ public: { ARM_COMPUTE_ERROR_ON(_parent == nullptr); // Check if subtensor is valid if parent is configured - if(_parent->tensor_shape().total_size() != 0) + if (_parent->tensor_shape().total_size() != 0) { ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR_VALID_REGION(_parent->valid_region(), valid_region); } diff --git a/arm_compute/core/TensorInfo.h b/arm_compute/core/TensorInfo.h index e738a797b298ff3d468a8e55f74fa627350e5ccf..b18f750427a1a57bbac98f469129640fcda1d4ab 100644 --- a/arm_compute/core/TensorInfo.h +++ b/arm_compute/core/TensorInfo.h @@ -24,15 +24,14 @@ #ifndef ARM_COMPUTE_TENSORINFO_H #define ARM_COMPUTE_TENSORINFO_H -#include "arm_compute/core/ITensorInfo.h" - -#include "ITensorInfo.h" #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/Strides.h" #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/Types.h" +#include "ITensorInfo.h" #include #include @@ -112,7 +111,10 @@ public: * @param[in] data_type Data type to use for each tensor element * @param[in] quantization_info The quantization settings for the tensor data. */ - TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, QuantizationInfo quantization_info); + TensorInfo(const TensorShape &tensor_shape, + size_t num_channels, + DataType data_type, + QuantizationInfo quantization_info); /** Initialize the tensor info with just a format. 
* @@ -136,7 +138,11 @@ public: * @param[in] offset_first_element_in_bytes Offset in bytes from the beginning of memory allocation to access the first element. * @param[in] total_size_in_bytes Size in bytes of the memory allocation (including the offset to the first element). */ - void init(const TensorShape &tensor_shape, Format format, const Strides &strides_in_bytes, size_t offset_first_element_in_bytes, size_t total_size_in_bytes); + void init(const TensorShape &tensor_shape, + Format format, + const Strides &strides_in_bytes, + size_t offset_first_element_in_bytes, + size_t total_size_in_bytes); /** Initialize the tensor info with just a format. * @@ -164,8 +170,12 @@ public: * @param[in] offset_first_element_in_bytes Offset in bytes from the beginning of memory allocation to access the first element. * @param[in] total_size_in_bytes Size in bytes of the memory allocation (including the offset to the first element). */ - void init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, const Strides &strides_in_bytes, size_t offset_first_element_in_bytes, - size_t total_size_in_bytes); + void init(const TensorShape &tensor_shape, + size_t num_channels, + DataType data_type, + const Strides &strides_in_bytes, + size_t offset_first_element_in_bytes, + size_t total_size_in_bytes); /** Initialize the metadata structure for the given tensor shape and single-plane format, (Padding is automatically calculated) * * @note The padding used by this method is really conservative so that the tensor can be used for most functions. @@ -191,19 +201,19 @@ public: // Inherited methods overridden: std::unique_ptr clone() const override; - ITensorInfo &set_data_type(DataType data_type) override; - ITensorInfo &set_num_channels(int num_channels) override; - ITensorInfo &set_format(Format format) override; - ITensorInfo &set_tensor_shape(const TensorShape &shape) override; - ITensorInfo &set_tensor_dims_state(const TensorDimsState &state) override; - ITensorInfo &set_quantization_info(const QuantizationInfo &quantization_info) override; - ITensorInfo &set_data_layout(const DataLayout &data_layout) override; - ITensorInfo &reset_padding() override; - bool auto_padding() override; - ITensorInfo &set_lock_paddings(bool flag) override; - bool lock_paddings() const override; - bool extend_padding(const PaddingSize &padding) override; - size_t dimension(size_t index) const override + ITensorInfo &set_data_type(DataType data_type) override; + ITensorInfo &set_num_channels(int num_channels) override; + ITensorInfo &set_format(Format format) override; + ITensorInfo &set_tensor_shape(const TensorShape &shape) override; + ITensorInfo &set_tensor_dims_state(const TensorDimsState &state) override; + ITensorInfo &set_quantization_info(const QuantizationInfo &quantization_info) override; + ITensorInfo &set_data_layout(const DataLayout &data_layout) override; + ITensorInfo &reset_padding() override; + bool auto_padding() override; + ITensorInfo &set_lock_paddings(bool flag) override; + bool lock_paddings() const override; + bool extend_padding(const PaddingSize &padding) override; + size_t dimension(size_t index) const override { return _tensor_shape[index]; } @@ -220,7 +230,7 @@ public: return _offset_first_element_in_bytes; } int32_t offset_element_in_bytes(const Coordinates &pos) const override; - size_t element_size() const override + size_t element_size() const override { return data_size_from_type(_data_type) * _num_channels; } @@ -266,7 +276,8 @@ public: } bool is_dynamic() const override { - 
return std::find(std::cbegin(_dims_state), std::cend(_dims_state), get_dynamic_state_value()) != std::cend(_dims_state); + return std::find(std::cbegin(_dims_state), std::cend(_dims_state), get_dynamic_state_value()) != + std::cend(_dims_state); } bool are_values_constant() const override { @@ -343,11 +354,15 @@ private: */ inline bool operator==(const TensorInfo &lhs, const TensorInfo &rhs) { - return (lhs._total_size == rhs._total_size) && (lhs._offset_first_element_in_bytes == rhs._offset_first_element_in_bytes) && (lhs._strides_in_bytes == rhs._strides_in_bytes) - && (lhs._num_channels == rhs._num_channels) && (lhs._tensor_shape == rhs._tensor_shape) && (lhs._dims_state == rhs._dims_state) && (lhs._data_type == rhs._data_type) && (lhs._format == rhs._format) - && (lhs._is_resizable == rhs._is_resizable) && (lhs._valid_region == rhs._valid_region) && (lhs._padding == rhs._padding) && (lhs._quantization_info == rhs._quantization_info) - && (lhs._data_layout == rhs._data_layout) && (lhs._are_values_constant == rhs._are_values_constant) - && (lhs._id == rhs._id); + return (lhs._total_size == rhs._total_size) && + (lhs._offset_first_element_in_bytes == rhs._offset_first_element_in_bytes) && + (lhs._strides_in_bytes == rhs._strides_in_bytes) && (lhs._num_channels == rhs._num_channels) && + (lhs._tensor_shape == rhs._tensor_shape) && (lhs._dims_state == rhs._dims_state) && + (lhs._data_type == rhs._data_type) && (lhs._format == rhs._format) && + (lhs._is_resizable == rhs._is_resizable) && (lhs._valid_region == rhs._valid_region) && + (lhs._padding == rhs._padding) && (lhs._quantization_info == rhs._quantization_info) && + (lhs._data_layout == rhs._data_layout) && (lhs._are_values_constant == rhs._are_values_constant) && + (lhs._id == rhs._id); } } // namespace arm_compute #endif /*ARM_COMPUTE_TENSORINFO_H */ diff --git a/arm_compute/core/TensorShape.h b/arm_compute/core/TensorShape.h index 4c9186ac64cfb5c847692781de6ab0d7f2faed6a..c1707e262f1e2364d5ce58f173dd58e3771cedbf 100644 --- a/arm_compute/core/TensorShape.h +++ b/arm_compute/core/TensorShape.h @@ -44,11 +44,10 @@ public: * @param[in] dims Values to initialize the dimensions. */ template - TensorShape(Ts... dims) - : Dimensions{ dims... } + TensorShape(Ts... dims) : Dimensions{dims...} { // Initialize unspecified dimensions to 1 - if(_num_dimensions > 0) + if (_num_dimensions > 0) { std::fill(_id.begin() + _num_dimensions, _id.end(), 1); } @@ -79,7 +78,7 @@ public: TensorShape &set(size_t dimension, size_t value, bool apply_dim_correction = true, bool increase_dim_unit = true) { // Clear entire shape if one dimension is zero - if(value == 0) + if (value == 0) { _num_dimensions = 0; std::fill(_id.begin(), _id.end(), 0); @@ -94,7 +93,7 @@ public: Dimensions::set(dimension, value, increase_dim_unit); // Correct number dimensions to ignore trailing dimensions of size 1 - if(apply_dim_correction) + if (apply_dim_correction) { apply_dimension_correction(); } @@ -123,7 +122,7 @@ public: std::fill(_id.begin() + _num_dimensions, _id.end(), 1); // Correct number dimensions to ignore trailing dimensions of size 1 - if(apply_dim_correction) + if (apply_dim_correction) { apply_dimension_correction(); } @@ -212,26 +211,26 @@ public: * @return The broadcasted shape or an empty shape if the shapes are not broadcast compatible. */ template - static TensorShape broadcast_shape(const Shapes &... 
shapes) + static TensorShape broadcast_shape(const Shapes &...shapes) { TensorShape bc_shape; - auto broadcast = [&bc_shape](const TensorShape & other) + auto broadcast = [&bc_shape](const TensorShape &other) { - if(bc_shape.num_dimensions() == 0) + if (bc_shape.num_dimensions() == 0) { bc_shape = other; } - else if(other.num_dimensions() != 0) + else if (other.num_dimensions() != 0) { - for(size_t d = 0; d < TensorShape::num_max_dimensions; ++d) + for (size_t d = 0; d < TensorShape::num_max_dimensions; ++d) { const size_t dim_min = std::min(bc_shape[d], other[d]); const size_t dim_max = std::max(bc_shape[d], other[d]); - if((dim_min != 1) && (dim_min != dim_max)) + if ((dim_min != 1) && (dim_min != dim_max)) { - bc_shape = TensorShape{ 0U }; + bc_shape = TensorShape{0U}; break; } @@ -249,9 +248,9 @@ private: /** Remove trailing dimensions of size 1 from the reported number of dimensions. */ void apply_dimension_correction() { - for(int i = static_cast(_num_dimensions) - 1; i > 0; --i) + for (int i = static_cast(_num_dimensions) - 1; i > 0; --i) { - if(_id[i] == 1) + if (_id[i] == 1) { --_num_dimensions; } @@ -262,5 +261,5 @@ private: } } }; -} +} // namespace arm_compute #endif /*ARM_COMPUTE_TENSORSHAPE_H*/ diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h index 12d860205ead9c91301fc4af498d248e75e0556b..6b51af17d4bb060805864baf4e4a5d48211d41b7 100644 --- a/arm_compute/core/Types.h +++ b/arm_compute/core/Types.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ACL_ARM_COMPUTE_CORE_TYPES -#define ACL_ARM_COMPUTE_CORE_TYPES +#ifndef ACL_ARM_COMPUTE_CORE_TYPES_H +#define ACL_ARM_COMPUTE_CORE_TYPES_H /** The following symbols have been moved to: * half @@ -59,14 +59,13 @@ /** The following symbols have been moved to: * MatMulInfo */ -#include "arm_compute/function_info/MatMulInfo.h" - #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Size2D.h" #include "arm_compute/core/Size3D.h" #include "arm_compute/core/TensorShape.h" -#include "arm_compute/core/experimental/IPostOp.h" #include "arm_compute/core/utils/misc/Macros.h" +#include "arm_compute/function_info/MatMulInfo.h" + #include "support/Bfloat16.h" #include @@ -144,8 +143,7 @@ enum class ComparisonOperation struct ValidRegion { /** Default constructor */ - ValidRegion() - : anchor{}, shape{} + ValidRegion() : anchor{}, shape{} { } @@ -166,8 +164,7 @@ struct ValidRegion * @param[in] a_shape Shape of the valid region. * */ - ValidRegion(const Coordinates &an_anchor, const TensorShape &a_shape) - : anchor{ an_anchor }, shape{ a_shape } + ValidRegion(const Coordinates &an_anchor, const TensorShape &a_shape) : anchor{an_anchor}, shape{a_shape} { anchor.set_num_dimensions(std::max(anchor.num_dimensions(), shape.num_dimensions())); } @@ -180,7 +177,7 @@ struct ValidRegion * */ ValidRegion(const Coordinates &an_anchor, const TensorShape &a_shape, size_t num_dimensions) - : anchor{ an_anchor }, shape{ a_shape } + : anchor{an_anchor}, shape{a_shape} { ARM_COMPUTE_ERROR_ON(num_dimensions < std::max(anchor.num_dimensions(), shape.num_dimensions())); anchor.set_num_dimensions(num_dimensions); @@ -242,32 +239,24 @@ enum class BorderMode struct BorderSize { /** Empty border, i.e. 
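The TensorShape::broadcast_shape() hunk above applies the usual rule per dimension: extents are compatible when equal or when one of them is 1, and any mismatch collapses the whole result to an empty shape (TensorShape{0U}). A standalone sketch of that rule:

    #include <algorithm>
    #include <array>
    #include <cstddef>
    #include <cstdio>

    // Returns the broadcast of two shapes, or an all-zero array when the shapes
    // are not broadcast compatible (mirroring "bc_shape = TensorShape{0U}").
    template <std::size_t N>
    std::array<std::size_t, N> broadcast(std::array<std::size_t, N> a, std::array<std::size_t, N> b)
    {
        std::array<std::size_t, N> out{};
        for (std::size_t d = 0; d < N; ++d)
        {
            const std::size_t lo = std::min(a[d], b[d]);
            const std::size_t hi = std::max(a[d], b[d]);
            if (lo != 1 && lo != hi)
            {
                return {}; // incompatible along dimension d
            }
            out[d] = hi;
        }
        return out;
    }

    int main()
    {
        const auto ok  = broadcast<3>({4, 1, 3}, {4, 5, 3}); // {4, 5, 3}
        const auto bad = broadcast<3>({4, 2, 3}, {4, 5, 3}); // {0, 0, 0}
        std::printf("%zu %zu %zu | %zu\n", ok[0], ok[1], ok[2], bad[0]);
        return 0;
    }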
no border */ - constexpr BorderSize() noexcept - : top{ 0 }, - right{ 0 }, - bottom{ 0 }, - left{ 0 } + constexpr BorderSize() noexcept : top{0}, right{0}, bottom{0}, left{0} { } /** Border with equal size around the 2D plane */ - explicit constexpr BorderSize(unsigned int size) noexcept - : top{ size }, - right{ size }, - bottom{ size }, - left{ size } + explicit constexpr BorderSize(unsigned int size) noexcept : top{size}, right{size}, bottom{size}, left{size} { } /** Border with same size for top/bottom and left/right */ constexpr BorderSize(unsigned int top_bottom, unsigned int left_right) - : top{ top_bottom }, right{ left_right }, bottom{ top_bottom }, left{ left_right } + : top{top_bottom}, right{left_right}, bottom{top_bottom}, left{left_right} { } /** Border with different sizes */ constexpr BorderSize(unsigned int top, unsigned int right, unsigned int bottom, unsigned int left) - : top{ top }, right{ right }, bottom{ bottom }, left{ left } + : top{top}, right{right}, bottom{bottom}, left{left} { } @@ -372,7 +361,7 @@ enum class InterpolationPolicy { NEAREST_NEIGHBOR, /**< Output values are defined to match the source pixel whose center is nearest to the sample position */ BILINEAR, /**< Output values are defined by bilinear interpolation between the pixels */ - AREA, /**< Output values are determined by averaging the source pixels whose areas fall under the area of the destination pixel, projected onto the source image */ + AREA, /**< Output values are determined by averaging the source pixels whose areas fall under the area of the destination pixel, projected onto the source image */ }; /** Bilinear Interpolation method used by LKTracker */ @@ -479,12 +468,12 @@ enum class NormType */ struct DetectionWindow { - uint16_t x{ 0 }; /**< Top-left x coordinate */ - uint16_t y{ 0 }; /**< Top-left y coordinate */ - uint16_t width{ 0 }; /**< Width of the detection window */ - uint16_t height{ 0 }; /**< Height of the detection window */ - uint16_t idx_class{ 0 }; /**< Index of the class */ - float score{ 0.f }; /**< Confidence value for the detection window */ + uint16_t x{0}; /**< Top-left x coordinate */ + uint16_t y{0}; /**< Top-left y coordinate */ + uint16_t width{0}; /**< Width of the detection window */ + uint16_t height{0}; /**< Height of the detection window */ + uint16_t idx_class{0}; /**< Index of the class */ + float score{0.f}; /**< Confidence value for the detection window */ }; /** Available pooling types */ @@ -521,12 +510,28 @@ public: * @param[in] im_width (Optional) Boxes whose centers (on the x axis) is beyond im_width will be filtered. Defaults to 1 * @param[in] im_height (Optional) Boxes whose centers (on the y axis) is beyond im_height will be filtered. 
Defaults to 1 */ - BoxNMSLimitInfo(float score_thresh = 0.05f, float nms = 0.3f, - int detections = 100, bool soft_nms_enabled = false, - NMSType soft_nms_method = NMSType::LINEAR, - float soft_nms_sigma = 0.5f, float soft_nms_min_score_thres = 0.001f, bool suppress_size = false, float min_size = 1.0f, float im_width = 1.0f, float im_height = 1.0f) - : _score_thresh(score_thresh), _nms(nms), _detections_per_im(detections), _soft_nms_enabled(soft_nms_enabled), _soft_nms_method(soft_nms_method), _soft_nms_sigma(soft_nms_sigma), - _soft_nms_min_score_thres(soft_nms_min_score_thres), _suppress_size(suppress_size), _min_size(min_size), _im_width(im_width), _im_height(im_height) + BoxNMSLimitInfo(float score_thresh = 0.05f, + float nms = 0.3f, + int detections = 100, + bool soft_nms_enabled = false, + NMSType soft_nms_method = NMSType::LINEAR, + float soft_nms_sigma = 0.5f, + float soft_nms_min_score_thres = 0.001f, + bool suppress_size = false, + float min_size = 1.0f, + float im_width = 1.0f, + float im_height = 1.0f) + : _score_thresh(score_thresh), + _nms(nms), + _detections_per_im(detections), + _soft_nms_enabled(soft_nms_enabled), + _soft_nms_method(soft_nms_method), + _soft_nms_sigma(soft_nms_sigma), + _soft_nms_min_score_thres(soft_nms_min_score_thres), + _suppress_size(suppress_size), + _min_size(min_size), + _im_width(im_width), + _im_height(im_height) { } /** Get the score threshold */ @@ -604,14 +609,13 @@ private: struct Padding2D { Padding2D() = default; - Padding2D(size_t left, size_t right, size_t top, size_t bottom) - : left(left), right(right), top(top), bottom(bottom) + Padding2D(size_t left, size_t right, size_t top, size_t bottom) : left(left), right(right), top(top), bottom(bottom) { } - size_t left = { 0 }; /**< Padding across the width dimension on the left, in elements. */ - size_t right = { 0 }; /**< Padding across the width dimension on the right, in elements. */ - size_t top = { 0 }; /**< Padding across the height dimension on the top, in elements. */ - size_t bottom = { 0 }; /**< Padding across the height dimension on the bottom, in elements. */ + size_t left = {0}; /**< Padding across the width dimension on the left, in elements. */ + size_t right = {0}; /**< Padding across the width dimension on the right, in elements. */ + size_t top = {0}; /**< Padding across the height dimension on the top, in elements. */ + size_t bottom = {0}; /**< Padding across the height dimension on the bottom, in elements. */ }; /** Padding information for 3D operations like Conv3d */ @@ -631,12 +635,12 @@ struct Padding3D { } - size_t left = { 0 }; /**< Padding across the width dimenstion on the left, in elements. */ - size_t right = { 0 }; /**< Padding across the width dimenstion on the right, in elements. */ - size_t top = { 0 }; /**< Padding across the height dimenstion on the top, in elements. */ - size_t bottom = { 0 }; /**< Padding across the height dimenstion on the bottom, in elements. */ - size_t front = { 0 }; /**< Padding across the depth dimenstion on the front, in elements. */ - size_t back = { 0 }; /**< Padding across the depth dimenstion on the back, in elements. */ + size_t left = {0}; /**< Padding across the width dimenstion on the left, in elements. */ + size_t right = {0}; /**< Padding across the width dimenstion on the right, in elements. */ + size_t top = {0}; /**< Padding across the height dimenstion on the top, in elements. */ + size_t bottom = {0}; /**< Padding across the height dimenstion on the bottom, in elements. 
*/ + size_t front = {0}; /**< Padding across the depth dimenstion on the front, in elements. */ + size_t back = {0}; /**< Padding across the depth dimenstion on the back, in elements. */ }; /** PriorBox layer info */ @@ -668,9 +672,15 @@ public: * @param[in] img_size (Optional) Image size. * @param[in] steps (Optional) Step values. */ - PriorBoxLayerInfo(const std::vector &min_sizes, const std::vector &variances, float offset, bool flip = true, bool clip = false, - const std::vector &max_sizes = {}, const std::vector &aspect_ratios = {}, - const Coordinates2D &img_size = Coordinates2D{ 0, 0 }, const std::array &steps = { { 0.f, 0.f } }) + PriorBoxLayerInfo(const std::vector &min_sizes, + const std::vector &variances, + float offset, + bool flip = true, + bool clip = false, + const std::vector &max_sizes = {}, + const std::vector &aspect_ratios = {}, + const Coordinates2D &img_size = Coordinates2D{0, 0}, + const std::array &steps = {{0.f, 0.f}}) : _min_sizes(min_sizes), _variances(variances), _offset(offset), @@ -682,22 +692,22 @@ public: _steps(steps) { _aspect_ratios.push_back(1.); - for(unsigned int i = 0; i < aspect_ratios.size(); ++i) + for (unsigned int i = 0; i < aspect_ratios.size(); ++i) { float ar = aspect_ratios[i]; bool already_exist = false; - for(auto ar_new : _aspect_ratios) + for (auto ar_new : _aspect_ratios) { - if(fabs(ar - ar_new) < 1e-6) + if (fabs(ar - ar_new) < 1e-6) { already_exist = true; break; } } - if(!already_exist) + if (!already_exist) { _aspect_ratios.push_back(ar); - if(flip) + if (flip) { _aspect_ratios.push_back(1.f / ar); } @@ -751,14 +761,14 @@ public: } private: - std::vector _min_sizes; - std::vector _variances; - float _offset; - bool _flip; - bool _clip; - std::vector _max_sizes; - std::vector _aspect_ratios; - Coordinates2D _img_size; + std::vector _min_sizes; + std::vector _variances; + float _offset; + bool _flip; + bool _clip; + std::vector _max_sizes; + std::vector _aspect_ratios; + Coordinates2D _img_size; std::array _steps; }; @@ -809,8 +819,16 @@ public: * @param[in] variance_encoded_in_target (Optional) If true, variance is encoded in target. Otherwise we need to adjust the predicted offset accordingly.Default set to false. * @param[in] eta (Optional) Eta. */ - DetectionOutputLayerInfo(int num_classes, bool share_location, DetectionOutputLayerCodeType code_type, int keep_top_k, float nms_threshold, int top_k = -1, int background_label_id = -1, - float confidence_threshold = std::numeric_limits::lowest(), bool variance_encoded_in_target = false, float eta = 1) + DetectionOutputLayerInfo(int num_classes, + bool share_location, + DetectionOutputLayerCodeType code_type, + int keep_top_k, + float nms_threshold, + int top_k = -1, + int background_label_id = -1, + float confidence_threshold = std::numeric_limits::lowest(), + bool variance_encoded_in_target = false, + float eta = 1) : _num_classes(num_classes), _share_location(share_location), _code_type(code_type), @@ -924,8 +942,15 @@ public: * @param[in] detection_per_class (Optional) Number of detection per class. Used in the Regular Non-Max-Suppression. Defaults to 100. * @param[in] dequantize_scores (Optional) If the scores need to be dequantized. Defaults to true. 
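The PriorBoxLayerInfo constructor hunk above also shows how aspect ratios are expanded: 1.0 is always included, near-duplicates (within 1e-6) are skipped, and with flip enabled each ratio additionally contributes its reciprocal. A standalone sketch of that expansion:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Expand the user-supplied aspect ratios the way the constructor above does.
    std::vector<float> expand_aspect_ratios(const std::vector<float> &ratios, bool flip)
    {
        std::vector<float> out{1.f};
        for (float ar : ratios)
        {
            bool exists = false;
            for (float seen : out)
            {
                if (std::fabs(ar - seen) < 1e-6f)
                {
                    exists = true; // near-duplicate: skip it
                    break;
                }
            }
            if (!exists)
            {
                out.push_back(ar);
                if (flip)
                {
                    out.push_back(1.f / ar); // flipped ratio
                }
            }
        }
        return out;
    }

    int main()
    {
        for (float ar : expand_aspect_ratios({2.f, 2.f, 3.f}, true))
        {
            std::printf("%.3f ", ar); // 1.000 2.000 0.500 3.000 0.333
        }
        std::printf("\n");
        return 0;
    }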
*/ - DetectionPostProcessLayerInfo(unsigned int max_detections, unsigned int max_classes_per_detection, float nms_score_threshold, float iou_threshold, unsigned int num_classes, - std::array scales_values, bool use_regular_nms = false, unsigned int detection_per_class = 100, bool dequantize_scores = true) + DetectionPostProcessLayerInfo(unsigned int max_detections, + unsigned int max_classes_per_detection, + float nms_score_threshold, + float iou_threshold, + unsigned int num_classes, + std::array scales_values, + bool use_regular_nms = false, + unsigned int detection_per_class = 100, + bool dequantize_scores = true) : _max_detections(max_detections), _max_classes_per_detection(max_classes_per_detection), _nms_score_threshold(nms_score_threshold), @@ -1003,15 +1028,15 @@ public: } private: - unsigned int _max_detections; - unsigned int _max_classes_per_detection; - float _nms_score_threshold; - float _iou_threshold; - unsigned int _num_classes; + unsigned int _max_detections; + unsigned int _max_classes_per_detection; + float _nms_score_threshold; + float _iou_threshold; + unsigned int _num_classes; std::array _scales_values; - bool _use_regular_nms; - unsigned int _detection_per_class; - bool _dequantize_scores; + bool _use_regular_nms; + unsigned int _detection_per_class; + bool _dequantize_scores; }; /** Pooling Layer Information struct*/ @@ -1241,8 +1266,14 @@ public: * @param[in] spatial_scale Spatial scale to be applied to the ROI coordinates and dimensions. * @param[in] sampling_ratio Number of samples to include in each pooling region (if set to zero, a ceil(roi_dims/pooling_dims)) */ - ROIPoolingLayerInfo(unsigned int pooled_width, unsigned int pooled_height, float spatial_scale, unsigned int sampling_ratio = 0) - : _pooled_width(pooled_width), _pooled_height(pooled_height), _spatial_scale(spatial_scale), _sampling_ratio(sampling_ratio) + ROIPoolingLayerInfo(unsigned int pooled_width, + unsigned int pooled_height, + float spatial_scale, + unsigned int sampling_ratio = 0) + : _pooled_width(pooled_width), + _pooled_height(pooled_height), + _spatial_scale(spatial_scale), + _sampling_ratio(sampling_ratio) { } /** Get the pooled width of the layer */ @@ -1289,10 +1320,24 @@ public: * @param[in] min_size (Optional)Size used to validate the anchors produced. Defaults to 16. * @param[in] values_per_roi (Optional)Values used to represent a ROI(Region of interest). Defaults to 4. */ - GenerateProposalsInfo(float im_width, float im_height, float im_scale, float spatial_scale = 1.0, int pre_nms_topN = 6000, int post_nms_topN = 300, float nms_thres = 0.7, float min_size = 16.0, + GenerateProposalsInfo(float im_width, + float im_height, + float im_scale, + float spatial_scale = 1.0, + int pre_nms_topN = 6000, + int post_nms_topN = 300, + float nms_thres = 0.7, + float min_size = 16.0, size_t values_per_roi = 4) - : _im_height(im_height), _im_width(im_width), _im_scale(im_scale), _spatial_scale(spatial_scale), _pre_nms_topN(pre_nms_topN), _post_nms_topN(post_nms_topN), _nms_thres(nms_thres), - _min_size(min_size), _values_per_roi(values_per_roi) + : _im_height(im_height), + _im_width(im_width), + _im_scale(im_scale), + _spatial_scale(spatial_scale), + _pre_nms_topN(pre_nms_topN), + _post_nms_topN(post_nms_topN), + _nms_thres(nms_thres), + _min_size(min_size), + _values_per_roi(values_per_roi) { } @@ -1418,11 +1463,20 @@ public: * @param[in] correct_transform_coords (Optional)Correct bounding box transform coordinates. 
Defaults to false * @param[in] bbox_xform_clip (Optional)Minimum bounding box width and height after bounding box transformation in log-space. Defaults to log(1000/16) */ - BoundingBoxTransformInfo(float img_width, float img_height, float scale, bool apply_scale = false, const std::array weights = { { 1.f, 1.f, 1.f, 1.f } }, bool correct_transform_coords = - false, - float bbox_xform_clip = - 4.135166556742356f) - : _img_width(img_width), _img_height(img_height), _scale(scale), _apply_scale(apply_scale), _correct_transform_coords(correct_transform_coords), _weights(weights), _bbox_xform_clip(bbox_xform_clip) + BoundingBoxTransformInfo(float img_width, + float img_height, + float scale, + bool apply_scale = false, + const std::array weights = {{1.f, 1.f, 1.f, 1.f}}, + bool correct_transform_coords = false, + float bbox_xform_clip = 4.135166556742356f) + : _img_width(img_width), + _img_height(img_height), + _scale(scale), + _apply_scale(apply_scale), + _correct_transform_coords(correct_transform_coords), + _weights(weights), + _bbox_xform_clip(bbox_xform_clip) { } @@ -1462,13 +1516,13 @@ public: } private: - float _img_width; - float _img_height; - float _scale; - bool _apply_scale; - bool _correct_transform_coords; + float _img_width; + float _img_height; + float _scale; + bool _apply_scale; + bool _correct_transform_coords; std::array _weights; - float _bbox_xform_clip; + float _bbox_xform_clip; }; /** Normalization Layer Information class */ @@ -1485,7 +1539,12 @@ public: * @param[in] is_scaled (Optional) Boolean that specifies if alpha will be scaled by the normalization size or not. * Should be false to follow [Krichevksy 2012]. */ - NormalizationLayerInfo(NormType type, uint32_t norm_size = 5, float alpha = 0.0001f, float beta = 0.5f, float kappa = 1.f, bool is_scaled = true) + NormalizationLayerInfo(NormType type, + uint32_t norm_size = 5, + float alpha = 0.0001f, + float beta = 0.5f, + float kappa = 1.f, + bool is_scaled = true) : _type(type), _norm_size(norm_size), _alpha(alpha), _beta(beta), _kappa(kappa), _is_scaled(is_scaled) { } @@ -1613,7 +1672,12 @@ class WeightsInfo public: /** Default constructor */ WeightsInfo() - : _are_reshaped(false), _kernel_width(0), _kernel_height(0), _num_kernels(0), _retain_internal_weights(false), _weight_format(arm_compute::WeightFormat::UNSPECIFIED) + : _are_reshaped(false), + _kernel_width(0), + _kernel_height(0), + _num_kernels(0), + _retain_internal_weights(false), + _weight_format(arm_compute::WeightFormat::UNSPECIFIED) { } /** Constructor @@ -1625,9 +1689,18 @@ public: * @param[in] retain_internal_weights (Optional) True if internal reshaped weights must be retained. Used for reconfiguration purposes. Default is false. * @param[in] weight_format (Optional) arm_gemm:WeightFormat enumeration requested by the user. Default is arm_compute::WeightFormat::UNSPECIFIED. 
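The bbox_xform_clip default of 4.135166556742356f in the BoundingBoxTransformInfo hunk above is exactly the documented log(1000/16), since ln(1000/16) = ln(62.5) = 4.135166556742356... A one-line check:

    #include <cassert>
    #include <cmath>

    int main()
    {
        // ln(1000/16) matches the default clip value quoted above.
        assert(std::fabs(std::log(1000.0 / 16.0) - 4.135166556742356) < 1e-12);
        return 0;
    }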
*/ - WeightsInfo(bool are_reshaped, unsigned int kernel_width, unsigned int kernel_height, unsigned int num_kernels, bool retain_internal_weights = false, - arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED) - : _are_reshaped(are_reshaped), _kernel_width(kernel_width), _kernel_height(kernel_height), _num_kernels(num_kernels), _retain_internal_weights(retain_internal_weights), _weight_format(weight_format) + WeightsInfo(bool are_reshaped, + unsigned int kernel_width, + unsigned int kernel_height, + unsigned int num_kernels, + bool retain_internal_weights = false, + arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED) + : _are_reshaped(are_reshaped), + _kernel_width(kernel_width), + _kernel_height(kernel_height), + _num_kernels(num_kernels), + _retain_internal_weights(retain_internal_weights), + _weight_format(weight_format) { } /** Flag which specifies if the weights tensor has been reshaped. @@ -1699,7 +1772,14 @@ class GEMMReshapeInfo final public: /** Default constructor */ GEMMReshapeInfo() - : _m(1), _n(1), _k(1), _mult_transpose1xW_width(1), _mult_interleave4x4_height(1), _depth_output_gemm3d(0), _reinterpret_input_as_3d(false), _broadcast_bias(false) + : _m(1), + _n(1), + _k(1), + _mult_transpose1xW_width(1), + _mult_interleave4x4_height(1), + _depth_output_gemm3d(0), + _reinterpret_input_as_3d(false), + _broadcast_bias(false) { } /** Constructor @@ -1715,9 +1795,22 @@ public: * to perform 1x1 convolutions with the NHWC data layout) * @param[in] broadcast_bias (Optional) Broadcast the shape of the bias tensor from a vector to a matrix. */ - GEMMReshapeInfo(int m, int n, int k, int mult_transpose1xW_width = 1, int mult_interleave4x4_height = 1, int depth_output_gemm3d = 0, bool reinterpret_input_as_3d = false, bool broadcast_bias = false) - : _m(m), _n(n), _k(k), _mult_transpose1xW_width(mult_transpose1xW_width), _mult_interleave4x4_height(mult_interleave4x4_height), _depth_output_gemm3d(depth_output_gemm3d), - _reinterpret_input_as_3d(reinterpret_input_as_3d), _broadcast_bias(broadcast_bias) + GEMMReshapeInfo(int m, + int n, + int k, + int mult_transpose1xW_width = 1, + int mult_interleave4x4_height = 1, + int depth_output_gemm3d = 0, + bool reinterpret_input_as_3d = false, + bool broadcast_bias = false) + : _m(m), + _n(n), + _k(k), + _mult_transpose1xW_width(mult_transpose1xW_width), + _mult_interleave4x4_height(mult_interleave4x4_height), + _depth_output_gemm3d(depth_output_gemm3d), + _reinterpret_input_as_3d(reinterpret_input_as_3d), + _broadcast_bias(broadcast_bias) { } /** Number of matrix A rows @@ -1807,11 +1900,11 @@ struct GEMMLHSMatrixInfo : m0(m), k0(k), v0(v), transpose(trans), interleave(inter) { } - unsigned int m0{ 1 }; /**< Number of rows processed by the matrix multiplication */ - unsigned int k0{ 1 }; /**< Number of partial accumulations performed by the matrix multiplication */ - unsigned int v0{ 1 }; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */ - bool transpose{ true }; /**< True if the (m0xk0) block has to be transposed before been stored */ - bool interleave{ true }; /**< True if the v0 (m0xk0) blocks have to be interleaved in the output row */ + unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */ + unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */ + unsigned int v0{1}; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */ + bool transpose{true}; /**< True if the 
(m0xk0) block has to be transposed before been stored */ + bool interleave{true}; /**< True if the v0 (m0xk0) blocks have to be interleaved in the output row */ }; /** GEMM RHS (Right Hand Side) matrix information */ @@ -1822,12 +1915,13 @@ struct GEMMRHSMatrixInfo : n0(n), k0(k), h0(h), transpose(trans), interleave(inter), export_to_cl_image(export_to_cl_img) { } - unsigned int n0{ 1 }; /**< Number of columns processed by the matrix multiplication */ - unsigned int k0{ 1 }; /**< Number of partial accumulations performed by the matrix multiplication */ - unsigned int h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ - bool transpose{ true }; /**< True if the (k0xn0) block has to be transposed before been stored */ - bool interleave{ true }; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */ - bool export_to_cl_image{ false }; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */ + unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */ + unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */ + unsigned int h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ + bool transpose{true}; /**< True if the (k0xn0) block has to be transposed before been stored */ + bool interleave{true}; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */ + bool export_to_cl_image{ + false}; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */ }; class ITensorInfo; @@ -1843,16 +1937,23 @@ struct WinogradInfo * @param[in] conv_info Convolution info (Pads, strides) * @param[in] data_layout Data layout to use for the output tensor once the convolution has been applied */ - WinogradInfo(Size2D output_tile_sz, Size2D kernel_sz, Size2D input_dims, PadStrideInfo conv_info, DataLayout data_layout) - : output_tile_size(output_tile_sz), kernel_size(kernel_sz), input_dimensions(input_dims), convolution_info(conv_info), output_data_layout(data_layout) - { - } - - Size2D output_tile_size{}; /**< Width and height of the output tile */ - Size2D kernel_size{}; /**< Width and height of the kernel*/ - Size2D input_dimensions{}; /**< Width and height of the input tensor before the convolution is applied */ - PadStrideInfo convolution_info{}; /**< Convolution info (Pads, strides,...) */ - DataLayout output_data_layout{ DataLayout::NCHW }; /**< Data layout to use for the output tensor once the convolution has been applied (NCHW or NHWC) */ + WinogradInfo( + Size2D output_tile_sz, Size2D kernel_sz, Size2D input_dims, PadStrideInfo conv_info, DataLayout data_layout) + : output_tile_size(output_tile_sz), + kernel_size(kernel_sz), + input_dimensions(input_dims), + convolution_info(conv_info), + output_data_layout(data_layout) + { + } + + Size2D output_tile_size{}; /**< Width and height of the output tile */ + Size2D kernel_size{}; /**< Width and height of the kernel*/ + Size2D input_dimensions{}; /**< Width and height of the input tensor before the convolution is applied */ + PadStrideInfo convolution_info{}; /**< Convolution info (Pads, strides,...) 
*/ + DataLayout output_data_layout{ + DataLayout:: + NCHW}; /**< Data layout to use for the output tensor once the convolution has been applied (NCHW or NHWC) */ }; /** IO formatting information class*/ @@ -1915,4 +2016,4 @@ struct IOFormatInfo /** Class for holding information related to cropping */ using CropInfo = Padding2D; } // namespace arm_compute -#endif /* ACL_ARM_COMPUTE_CORE_TYPES */ +#endif // ACL_ARM_COMPUTE_CORE_TYPES_H diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h index c5b50167bf6eaecd4b3d2f471dc6ac3877adc3c2..a2146522f76858a8fccbfa4dba06e60b522485fe 100644 --- a/arm_compute/core/Utils.h +++ b/arm_compute/core/Utils.h @@ -69,7 +69,7 @@ template <typename T> inline void permute_strides(Dimensions<T> &dimensions, const PermutationVector &perm) { const auto old_dim = utility::make_array<Dimensions<T>::num_max_dimensions>(dimensions.begin(), dimensions.end()); - for(unsigned int i = 0; i < perm.num_dimensions(); ++i) + for (unsigned int i = 0; i < perm.num_dimensions(); ++i) { T dimension_val = old_dim[i]; dimensions.set(perm[i], dimension_val); @@ -87,7 +87,11 @@ inline void permute_strides(Dimensions<T> &dimensions, const PermutationVector & * * @return PadStrideInfo for SAME padding */ -PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info, DataLayout data_layout = DataLayout::NCHW, const Size2D &dilation = Size2D(1u, 1u), +PadStrideInfo calculate_same_pad(TensorShape input_shape, + TensorShape weights_shape, + PadStrideInfo conv_info, + DataLayout data_layout = DataLayout::NCHW, + const Size2D &dilation = Size2D(1u, 1u), const DimensionRoundingType &rounding_type = DimensionRoundingType::FLOOR); /** Returns expected width and height of the deconvolution's output tensor. * @@ -100,8 +104,10 @@ PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_sh * * @return A pair with the new width in the first position and the new height in the second. */ -std::pair<unsigned int, unsigned int> deconvolution_output_dimensions(unsigned int in_width, unsigned int in_height, - unsigned int kernel_width, unsigned int kernel_height, +std::pair<unsigned int, unsigned int> deconvolution_output_dimensions(unsigned int in_width, + unsigned int in_height, + unsigned int kernel_width, + unsigned int kernel_height, const PadStrideInfo &pad_stride_info); /** Returns expected width and height of output scaled tensor depending on dimensions rounding mode. * @@ -115,8 +121,10 @@ std::pair<unsigned int, unsigned int> deconvolution_output_dimensions(unsigned i * * @return A pair with the new width in the first position and the new height in the second. */ -std::pair<unsigned int, unsigned int> scaled_dimensions(int width, int height, - int kernel_width, int kernel_height, +std::pair<unsigned int, unsigned int> scaled_dimensions(int width, + int height, + int kernel_width, + int kernel_height, const PadStrideInfo &pad_stride_info, const Size2D &dilation = Size2D(1U, 1U)); @@ -130,9 +138,8 @@ std::pair<unsigned int, unsigned int> scaled_dimensions(int width, int height, * * @return A pair with the new width in the first position and the new height in the second, returned values can be < 1 */ -std::pair<int, int> scaled_dimensions_signed(int width, int height, - int kernel_width, int kernel_height, - const PadStrideInfo &pad_stride_info); +std::pair<int, int> scaled_dimensions_signed( + int width, int height, int kernel_width, int kernel_height, const PadStrideInfo &pad_stride_info); /** Returns calculated width, height and depth of output scaled tensor depending on dimensions rounding mode.
* @@ -147,8 +154,12 @@ std::pair<int, int> scaled_dimensions_signed(int width, int height, * @return A tuple with the new width in the first position, the new height in the second, and the new depth in the third. * Returned values can be < 1 */ -std::tuple<int, int, int> scaled_3d_dimensions_signed(int width, int height, int depth, - int kernel_width, int kernel_height, int kernel_depth, +std::tuple<int, int, int> scaled_3d_dimensions_signed(int width, + int height, + int depth, + int kernel_width, + int kernel_height, + int kernel_depth, const Pooling3dLayerInfo &pool3d_info); /** Check if the given reduction operation should be handled in a serial way. * @@ -178,7 +189,9 @@ QuantizationInfo get_softmax_output_quantization_info(DataType input_type, bool * * @return The pair with minimum and maximum values */ -std::pair<int32_t, int32_t> get_quantized_activation_min_max(const ActivationLayerInfo &act_info, DataType data_type, UniformQuantizationInfo oq_info); +std::pair<int32_t, int32_t> get_quantized_activation_min_max(const ActivationLayerInfo &act_info, + DataType data_type, + UniformQuantizationInfo oq_info); /** Convert a channel identity into a string. * @@ -295,26 +308,27 @@ inline size_t num_of_elements_in_range(const float start, const float end, const * @param[in] element_delim (Optional) Delimeter among the consecutive elements. Defaults to space delimeter */ template <typename T> -void print_consecutive_elements_impl(std::ostream &s, const T *ptr, unsigned int n, int stream_width = 0, const std::string &element_delim = " ") +void print_consecutive_elements_impl( + std::ostream &s, const T *ptr, unsigned int n, int stream_width = 0, const std::string &element_delim = " ") { using print_type = typename std::conditional<std::is_floating_point<T>::value, T, int>::type; std::ios stream_status(nullptr); stream_status.copyfmt(s); - for(unsigned int i = 0; i < n; ++i) + for (unsigned int i = 0; i < n; ++i) { // Set stream width as it is not a "sticky" stream manipulator - if(stream_width != 0) + if (stream_width != 0) { s.width(stream_width); } - if(std::is_same<typename std::decay<T>::type, half>::value) + if (std::is_same<typename std::decay<T>::type, half>::value) { // We use T instead of print_type here is because the std::is_floating_point returns false and then the print_type becomes int. s << std::right << static_cast<T>(ptr[i]) << element_delim; } - else if(std::is_same<typename std::decay<T>::type, bfloat16>::value) + else if (std::is_same<typename std::decay<T>::type, bfloat16>::value) { // We use T instead of print_type here is because the std::is_floating_point returns false and then the print_type becomes int. s << std::right << float(ptr[i]) << element_delim; @@ -343,17 +357,17 @@ int max_consecutive_elements_display_width_impl(std::ostream &s, const T *ptr, u using print_type = typename std::conditional<std::is_floating_point<T>::value, T, int>::type; int max_width = -1; - for(unsigned int i = 0; i < n; ++i) + for (unsigned int i = 0; i < n; ++i) { std::stringstream ss; ss.copyfmt(s); - if(std::is_same<typename std::decay<T>::type, half>::value) + if (std::is_same<typename std::decay<T>::type, half>::value) { // We use T instead of print_type here is because the std::is_floating_point returns false and then the print_type becomes int. ss << static_cast<T>(ptr[i]); } - else if(std::is_same<typename std::decay<T>::type, bfloat16>::value) + else if (std::is_same<typename std::decay<T>::type, bfloat16>::value) { // We use T instead of print_type here is because the std::is_floating_point returns false and then the print_type becomes int. ss << float(ptr[i]); @@ -377,7 +391,12 @@ int max_consecutive_elements_display_width_impl(std::ostream &s, const T *ptr, u * @param[in] stream_width (Optional) Width of the stream. If set to 0 the element's width is used. Defaults to 0.
* @param[in] element_delim (Optional) Delimeter among the consecutive elements. Defaults to space delimeter */ -void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n, int stream_width, const std::string &element_delim = " "); +void print_consecutive_elements(std::ostream &s, + DataType dt, + const uint8_t *ptr, + unsigned int n, + int stream_width, + const std::string &element_delim = " "); /** Identify the maximum width of n consecutive elements. * @@ -390,5 +409,5 @@ void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr */ int max_consecutive_elements_display_width(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n); #endif /* ARM_COMPUTE_ASSERTS_ENABLED */ -} +} // namespace arm_compute #endif /*ARM_COMPUTE_UTILS_H */ diff --git a/arm_compute/core/Validate.h b/arm_compute/core/Validate.h index 5bffc16f3b56831fed298cc8ac39c582df4e2b5a..5550560affd059612361a4e96a367060edbe830f 100644 --- a/arm_compute/core/Validate.h +++ b/arm_compute/core/Validate.h @@ -24,13 +24,13 @@ #ifndef ARM_COMPUTE_VALIDATE_H #define ARM_COMPUTE_VALIDATE_H -#include "arm_compute/core/utils/DataLayoutUtils.h" -#include "arm_compute/core/utils/DataTypeUtils.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/utils/FormatUtils.h" #include "arm_compute/core/IKernel.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/QuantizationInfo.h" +#include "arm_compute/core/utils/DataLayoutUtils.h" +#include "arm_compute/core/utils/DataTypeUtils.h" +#include "arm_compute/core/utils/FormatUtils.h" #include "arm_compute/core/Window.h" #include @@ -50,9 +50,9 @@ namespace detail template inline bool have_different_dimensions(const Dimensions &dim1, const Dimensions &dim2, unsigned int upper_dim) { - for(unsigned int i = upper_dim; i < arm_compute::Dimensions::num_max_dimensions; ++i) + for (unsigned int i = upper_dim; i < arm_compute::Dimensions::num_max_dimensions; ++i) { - if(dim1[i] != dim2[i]) + if (dim1[i] != dim2[i]) { return true; } @@ -80,7 +80,7 @@ public: * @param[in] line Source code line. Used for error reporting. */ compare_dimension(const Dimensions &dim, const char *function, const char *file, int line) - : _dim{ dim }, _function{ function }, _file{ file }, _line{ line } + : _dim{dim}, _function{function}, _file{file}, _line{line} { } @@ -111,7 +111,7 @@ inline arm_compute::Status for_each_error(F &&) } template -inline arm_compute::Status for_each_error(F &&func, T &&arg, Ts &&... args) +inline arm_compute::Status for_each_error(F &&func, T &&arg, Ts &&...args) { ARM_COMPUTE_RETURN_ON_ERROR(func(arg)); ARM_COMPUTE_RETURN_ON_ERROR(for_each_error(func, args...)); @@ -148,13 +148,11 @@ struct get_tensor_info_t * @return Status */ template -inline arm_compute::Status error_on_nullptr(const char *function, const char *file, const int line, Ts &&... pointers) +inline arm_compute::Status error_on_nullptr(const char *function, const char *file, const int line, Ts &&...pointers) { - const std::array pointers_array{ { std::forward(pointers)... 
} }; - bool has_nullptr = std::any_of(pointers_array.begin(), pointers_array.end(), [&](const void *ptr) - { - return (ptr == nullptr); - }); + const std::array pointers_array{{std::forward(pointers)...}}; + bool has_nullptr = + std::any_of(pointers_array.begin(), pointers_array.end(), [&](const void *ptr) { return (ptr == nullptr); }); ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(has_nullptr, function, file, line, "Nullptr object!"); return arm_compute::Status{}; } @@ -178,8 +176,8 @@ inline arm_compute::Status error_on_nullptr(const char *function, const char *fi * * @return Status */ -arm_compute::Status error_on_mismatching_windows(const char *function, const char *file, const int line, - const Window &full, const Window &win); +arm_compute::Status error_on_mismatching_windows( + const char *function, const char *file, const int line, const Window &full, const Window &win); #define ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(f, w) \ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_windows(__func__, __FILE__, __LINE__, f, w)) #define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_WINDOWS(f, w) \ @@ -200,8 +198,8 @@ arm_compute::Status error_on_mismatching_windows(const char *function, const cha * * @return Status */ -arm_compute::Status error_on_invalid_subwindow(const char *function, const char *file, const int line, - const Window &full, const Window &sub); +arm_compute::Status error_on_invalid_subwindow( + const char *function, const char *file, const int line, const Window &full, const Window &sub); #define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s) \ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_invalid_subwindow(__func__, __FILE__, __LINE__, f, s)) #define ARM_COMPUTE_RETURN_ERROR_ON_INVALID_SUBWINDOW(f, s) \ @@ -220,12 +218,14 @@ arm_compute::Status error_on_invalid_subwindow(const char *function, const char * * @return Status */ -arm_compute::Status error_on_window_not_collapsable_at_dimension(const char *function, const char *file, const int line, - const Window &full, const Window &window, const int dim); +arm_compute::Status error_on_window_not_collapsable_at_dimension( + const char *function, const char *file, const int line, const Window &full, const Window &window, const int dim); #define ARM_COMPUTE_ERROR_ON_WINDOW_NOT_COLLAPSABLE_AT_DIMENSION(f, w, d) \ - ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_window_not_collapsable_at_dimension(__func__, __FILE__, __LINE__, f, w, d)) + ARM_COMPUTE_ERROR_THROW_ON( \ + ::arm_compute::error_on_window_not_collapsable_at_dimension(__func__, __FILE__, __LINE__, f, w, d)) #define ARM_COMPUTE_RETURN_ERROR_ON_WINDOW_NOT_COLLAPSABLE_AT_DIMENSION(f, w, d) \ - ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_window_not_collapsable_at_dimension(__func__, __FILE__, __LINE__, f, w, d)) + ARM_COMPUTE_RETURN_ON_ERROR( \ + ::arm_compute::error_on_window_not_collapsable_at_dimension(__func__, __FILE__, __LINE__, f, w, d)) /** Return an error if the passed coordinates have too many dimensions. 
* @@ -239,8 +239,8 @@ arm_compute::Status error_on_window_not_collapsable_at_dimension(const char *fun * * @return Status */ -arm_compute::Status error_on_coordinates_dimensions_gte(const char *function, const char *file, const int line, - const Coordinates &pos, unsigned int max_dim); +arm_compute::Status error_on_coordinates_dimensions_gte( + const char *function, const char *file, const int line, const Coordinates &pos, unsigned int max_dim); #define ARM_COMPUTE_ERROR_ON_COORDINATES_DIMENSIONS_GTE(p, md) \ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_coordinates_dimensions_gte(__func__, __FILE__, __LINE__, p, md)) #define ARM_COMPUTE_RETURN_ERROR_ON_COORDINATES_DIMENSIONS_GTE(p, md) \ @@ -258,8 +258,8 @@ arm_compute::Status error_on_coordinates_dimensions_gte(const char *function, co * * @return Status */ -arm_compute::Status error_on_window_dimensions_gte(const char *function, const char *file, const int line, - const Window &win, unsigned int max_dim); +arm_compute::Status error_on_window_dimensions_gte( + const char *function, const char *file, const int line, const Window &win, unsigned int max_dim); #define ARM_COMPUTE_ERROR_ON_WINDOW_DIMENSIONS_GTE(w, md) \ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_window_dimensions_gte(__func__, __FILE__, __LINE__, w, md)) #define ARM_COMPUTE_RETURN_ERROR_ON_WINDOW_DIMENSIONS_GTE(w, md) \ @@ -277,16 +277,23 @@ arm_compute::Status error_on_window_dimensions_gte(const char *function, const c * @return Status */ template -arm_compute::Status error_on_mismatching_dimensions(const char *function, const char *file, int line, - const Dimensions &dim1, const Dimensions &dim2, Ts &&... dims) +arm_compute::Status error_on_mismatching_dimensions(const char *function, + const char *file, + int line, + const Dimensions &dim1, + const Dimensions &dim2, + Ts &&...dims) { - ARM_COMPUTE_RETURN_ON_ERROR(detail::for_each_error(detail::compare_dimension(dim1, function, file, line), dim2, std::forward(dims)...)); + ARM_COMPUTE_RETURN_ON_ERROR(detail::for_each_error(detail::compare_dimension(dim1, function, file, line), dim2, + std::forward(dims)...)); return arm_compute::Status{}; } #define ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(...) \ - ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_dimensions(__func__, __FILE__, __LINE__, __VA_ARGS__)) + ARM_COMPUTE_ERROR_THROW_ON( \ + ::arm_compute::error_on_mismatching_dimensions(__func__, __FILE__, __LINE__, __VA_ARGS__)) #define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(...) \ - ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_dimensions(__func__, __FILE__, __LINE__, __VA_ARGS__)) + ARM_COMPUTE_RETURN_ON_ERROR( \ + ::arm_compute::error_on_mismatching_dimensions(__func__, __FILE__, __LINE__, __VA_ARGS__)) /** Return true if the given format has horizontal subsampling. * @@ -296,7 +303,10 @@ arm_compute::Status error_on_mismatching_dimensions(const char *function, const */ inline bool has_format_horizontal_subsampling(Format format) { - return (format == Format::YUYV422 || format == Format::UYVY422 || format == Format::NV12 || format == Format::NV21 || format == Format::IYUV || format == Format::UV88) ? true : false; + return (format == Format::YUYV422 || format == Format::UYVY422 || format == Format::NV12 || + format == Format::NV21 || format == Format::IYUV || format == Format::UV88) + ? true + : false; } /** Return true if the given format has vertical subsampling. 
@@ -307,7 +317,9 @@ inline bool has_format_horizontal_subsampling(Format format) */ inline bool has_format_vertical_subsampling(Format format) { - return (format == Format::NV12 || format == Format::NV21 || format == Format::IYUV || format == Format::UV88) ? true : false; + return (format == Format::NV12 || format == Format::NV21 || format == Format::IYUV || format == Format::UV88) + ? true + : false; } /** Adjust tensor shape size if width or height are odd for a given multi-planar format. No modification is done for other formats. @@ -325,16 +337,16 @@ inline bool has_format_vertical_subsampling(Format format) */ inline TensorShape adjust_odd_shape(const TensorShape &shape, Format format) { - TensorShape output{ shape }; + TensorShape output{shape}; // Force width to be even for formats which require subsampling of the U and V channels - if(has_format_horizontal_subsampling(format)) + if (has_format_horizontal_subsampling(format)) { output.set(0, (output.x() + 1) & ~1U); } // Force height to be even for formats which require subsampling of the U and V channels - if(has_format_vertical_subsampling(format)) + if (has_format_vertical_subsampling(format)) { output.set(1, (output.y() + 1) & ~1U); } @@ -354,18 +366,20 @@ inline TensorShape adjust_odd_shape(const TensorShape &shape, Format format) * @return Status */ template -arm_compute::Status error_on_tensors_not_even(const char *function, const char *file, int line, - const Format &format, const ITensor *tensor1, Ts... tensors) +arm_compute::Status error_on_tensors_not_even( + const char *function, const char *file, int line, const Format &format, const ITensor *tensor1, Ts... tensors) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor1 == nullptr, function, file, line); ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, std::forward(tensors)...)); - const std::array < const ITensor *, 1 + sizeof...(Ts) > tensors_info_array{ { tensor1, std::forward(tensors)... } }; - ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensors_info_array.cbegin(), tensors_info_array.cend(), [&](const ITensor * tensor) - { - const TensorShape correct_shape = adjust_odd_shape(tensor->info()->tensor_shape(), format); - return detail::have_different_dimensions(tensor->info()->tensor_shape(), correct_shape, 2); - }), - function, file, line, "Tensor shape has odd dimensions"); + const std::array tensors_info_array{{tensor1, std::forward(tensors)...}}; + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG( + std::any_of(tensors_info_array.cbegin(), tensors_info_array.cend(), + [&](const ITensor *tensor) + { + const TensorShape correct_shape = adjust_odd_shape(tensor->info()->tensor_shape(), format); + return detail::have_different_dimensions(tensor->info()->tensor_shape(), correct_shape, 2); + }), + function, file, line, "Tensor shape has odd dimensions"); return arm_compute::Status{}; } @@ -382,21 +396,22 @@ arm_compute::Status error_on_tensors_not_even(const char *function, const char * * * @return The subsampled tensor shape. 
*/ -inline TensorShape calculate_subsampled_shape(const TensorShape &shape, Format format, Channel channel = Channel::UNKNOWN) +inline TensorShape +calculate_subsampled_shape(const TensorShape &shape, Format format, Channel channel = Channel::UNKNOWN) { - TensorShape output{ shape }; + TensorShape output{shape}; // Subsample shape only for U or V channel - if(Channel::U == channel || Channel::V == channel || Channel::UNKNOWN == channel) + if (Channel::U == channel || Channel::V == channel || Channel::UNKNOWN == channel) { // Subsample width for the tensor shape when channel is U or V - if(has_format_horizontal_subsampling(format)) + if (has_format_horizontal_subsampling(format)) { output.set(0, output.x() / 2U); } // Subsample height for the tensor shape when channel is U or V - if(has_format_vertical_subsampling(format)) + if (has_format_vertical_subsampling(format)) { output.set(1, output.y() / 2U); } @@ -418,25 +433,32 @@ inline TensorShape calculate_subsampled_shape(const TensorShape &shape, Format f * @return Status */ template -arm_compute::Status error_on_tensors_not_subsampled(const char *function, const char *file, int line, - const Format &format, const TensorShape &shape, const ITensor *tensor1, Ts... tensors) +arm_compute::Status error_on_tensors_not_subsampled(const char *function, + const char *file, + int line, + const Format &format, + const TensorShape &shape, + const ITensor *tensor1, + Ts... tensors) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor1 == nullptr, function, file, line); ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, std::forward(tensors)...)); - const TensorShape sub2_shape = calculate_subsampled_shape(shape, format); - const std::array < const ITensor *, 1 + sizeof...(Ts) > tensors_info_array{ { tensor1, std::forward(tensors)... } }; - ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensors_info_array.cbegin(), tensors_info_array.cend(), [&](const ITensor * tensor) - { - return detail::have_different_dimensions(tensor->info()->tensor_shape(), sub2_shape, 2); - }), - function, file, line, "Tensor shape has mismatch dimensions for sub-sampling"); + const TensorShape sub2_shape = calculate_subsampled_shape(shape, format); + const std::array tensors_info_array{{tensor1, std::forward(tensors)...}}; + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG( + std::any_of(tensors_info_array.cbegin(), tensors_info_array.cend(), + [&](const ITensor *tensor) + { return detail::have_different_dimensions(tensor->info()->tensor_shape(), sub2_shape, 2); }), + function, file, line, "Tensor shape has mismatch dimensions for sub-sampling"); return arm_compute::Status{}; } #define ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(...) \ - ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_tensors_not_subsampled(__func__, __FILE__, __LINE__, __VA_ARGS__)) + ARM_COMPUTE_ERROR_THROW_ON( \ + ::arm_compute::error_on_tensors_not_subsampled(__func__, __FILE__, __LINE__, __VA_ARGS__)) #define ARM_COMPUTE_RETURN_ERROR_ON_TENSORS_NOT_SUBSAMPLED(...) 
\ - ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_tensors_not_subsampled(__func__, __FILE__, __LINE__, __VA_ARGS__)) + ARM_COMPUTE_RETURN_ON_ERROR( \ + ::arm_compute::error_on_tensors_not_subsampled(__func__, __FILE__, __LINE__, __VA_ARGS__)) /** Return an error if the passed two tensor infos have different shapes from the given dimension * @@ -450,10 +472,15 @@ arm_compute::Status error_on_tensors_not_subsampled(const char *function, const * @return Status */ template -inline arm_compute::Status error_on_mismatching_shapes(const char *function, const char *file, const int line, - const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos) +inline arm_compute::Status error_on_mismatching_shapes(const char *function, + const char *file, + const int line, + const ITensorInfo *tensor_info_1, + const ITensorInfo *tensor_info_2, + Ts... tensor_infos) { - return error_on_mismatching_shapes(function, file, line, 0U, tensor_info_1, tensor_info_2, std::forward(tensor_infos)...); + return error_on_mismatching_shapes(function, file, line, 0U, tensor_info_1, tensor_info_2, + std::forward(tensor_infos)...); } /** Return an error if the passed two tensors have different shapes from the given dimension * @@ -467,8 +494,12 @@ inline arm_compute::Status error_on_mismatching_shapes(const char *function, con * @return Status */ template -inline arm_compute::Status error_on_mismatching_shapes(const char *function, const char *file, const int line, - const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors) +inline arm_compute::Status error_on_mismatching_shapes(const char *function, + const char *file, + const int line, + const ITensor *tensor_1, + const ITensor *tensor_2, + Ts... tensors) { return error_on_mismatching_shapes(function, file, line, 0U, tensor_1, tensor_2, std::forward(tensors)...); } @@ -485,19 +516,28 @@ inline arm_compute::Status error_on_mismatching_shapes(const char *function, con * @return Status */ template -inline arm_compute::Status error_on_mismatching_shapes(const char *function, const char *file, const int line, - unsigned int upper_dim, const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos) +inline arm_compute::Status error_on_mismatching_shapes(const char *function, + const char *file, + const int line, + unsigned int upper_dim, + const ITensorInfo *tensor_info_1, + const ITensorInfo *tensor_info_2, + Ts... tensor_infos) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info_1 == nullptr, function, file, line); ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info_2 == nullptr, function, file, line); ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, tensor_infos...)); - const std::array < const ITensorInfo *, 2 + sizeof...(Ts) > tensors_info_array{ { tensor_info_1, tensor_info_2, tensor_infos... 
} }; - ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(std::next(tensors_info_array.cbegin()), tensors_info_array.cend(), [&](const ITensorInfo * tensor_info) - { - return detail::have_different_dimensions((*tensors_info_array.cbegin())->tensor_shape(), tensor_info->tensor_shape(), upper_dim); - }), - function, file, line, "Tensors have different shapes"); + const std::array tensors_info_array{ + {tensor_info_1, tensor_info_2, tensor_infos...}}; + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(std::next(tensors_info_array.cbegin()), tensors_info_array.cend(), + [&](const ITensorInfo *tensor_info) + { + return detail::have_different_dimensions( + (*tensors_info_array.cbegin())->tensor_shape(), + tensor_info->tensor_shape(), upper_dim); + }), + function, file, line, "Tensors have different shapes"); return arm_compute::Status{}; } /** Return an error if the passed two tensors have different shapes from the given dimension @@ -513,14 +553,20 @@ inline arm_compute::Status error_on_mismatching_shapes(const char *function, con * @return Status */ template -inline arm_compute::Status error_on_mismatching_shapes(const char *function, const char *file, const int line, - unsigned int upper_dim, const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors) +inline arm_compute::Status error_on_mismatching_shapes(const char *function, + const char *file, + const int line, + unsigned int upper_dim, + const ITensor *tensor_1, + const ITensor *tensor_2, + Ts... tensors) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_1 == nullptr, function, file, line); ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_2 == nullptr, function, file, line); ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, tensors...)); - ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_shapes(function, file, line, upper_dim, tensor_1->info(), tensor_2->info(), - detail::get_tensor_info_t()(tensors)...)); + ARM_COMPUTE_RETURN_ON_ERROR( + ::arm_compute::error_on_mismatching_shapes(function, file, line, upper_dim, tensor_1->info(), tensor_2->info(), + detail::get_tensor_info_t()(tensors)...)); return arm_compute::Status{}; } #define ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(...) \ @@ -539,19 +585,18 @@ inline arm_compute::Status error_on_mismatching_shapes(const char *function, con * @return Status */ template -inline arm_compute::Status error_on_mismatching_data_layouts(const char *function, const char *file, const int line, - const ITensorInfo *tensor_info, Ts... tensor_infos) +inline arm_compute::Status error_on_mismatching_data_layouts( + const char *function, const char *file, const int line, const ITensorInfo *tensor_info, Ts... tensor_infos) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line); ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, tensor_infos...)); - DataLayout &&tensor_data_layout = tensor_info->data_layout(); - const std::array tensors_infos_array{ { tensor_infos... 
} }; - ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensors_infos_array.begin(), tensors_infos_array.end(), [&](const ITensorInfo * tensor_info_obj) - { - return tensor_info_obj->data_layout() != tensor_data_layout; - }), - function, file, line, "Tensors have different data layouts"); + DataLayout &&tensor_data_layout = tensor_info->data_layout(); + const std::array tensors_infos_array{{tensor_infos...}}; + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensors_infos_array.begin(), tensors_infos_array.end(), + [&](const ITensorInfo *tensor_info_obj) + { return tensor_info_obj->data_layout() != tensor_data_layout; }), + function, file, line, "Tensors have different data layouts"); return arm_compute::Status{}; } /** Return an error if the passed tensors have different data layouts @@ -565,19 +610,21 @@ inline arm_compute::Status error_on_mismatching_data_layouts(const char *functio * @return Status */ template -inline arm_compute::Status error_on_mismatching_data_layouts(const char *function, const char *file, const int line, - const ITensor *tensor, Ts... tensors) +inline arm_compute::Status error_on_mismatching_data_layouts( + const char *function, const char *file, const int line, const ITensor *tensor, Ts... tensors) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line); ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, std::forward(tensors)...)); - ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_data_layouts(function, file, line, tensor->info(), - detail::get_tensor_info_t()(tensors)...)); + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_data_layouts( + function, file, line, tensor->info(), detail::get_tensor_info_t()(tensors)...)); return arm_compute::Status{}; } #define ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_LAYOUT(...) \ - ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_data_layouts(__func__, __FILE__, __LINE__, __VA_ARGS__)) + ARM_COMPUTE_ERROR_THROW_ON( \ + ::arm_compute::error_on_mismatching_data_layouts(__func__, __FILE__, __LINE__, __VA_ARGS__)) #define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(...) \ - ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_data_layouts(__func__, __FILE__, __LINE__, __VA_ARGS__)) + ARM_COMPUTE_RETURN_ON_ERROR( \ + ::arm_compute::error_on_mismatching_data_layouts(__func__, __FILE__, __LINE__, __VA_ARGS__)) /** Return an error if the passed two tensor infos have different data types * @@ -590,19 +637,18 @@ inline arm_compute::Status error_on_mismatching_data_layouts(const char *functio * @return Status */ template -inline arm_compute::Status error_on_mismatching_data_types(const char *function, const char *file, const int line, - const ITensorInfo *tensor_info, Ts... tensor_infos) +inline arm_compute::Status error_on_mismatching_data_types( + const char *function, const char *file, const int line, const ITensorInfo *tensor_info, Ts... tensor_infos) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line); ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, tensor_infos...)); - DataType &&tensor_data_type = tensor_info->data_type(); - const std::array tensors_infos_array{ { tensor_infos... 
} }; - ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensors_infos_array.begin(), tensors_infos_array.end(), [&](const ITensorInfo * tensor_info_obj) - { - return tensor_info_obj->data_type() != tensor_data_type; - }), - function, file, line, "Tensors have different data types"); + DataType &&tensor_data_type = tensor_info->data_type(); + const std::array tensors_infos_array{{tensor_infos...}}; + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensors_infos_array.begin(), tensors_infos_array.end(), + [&](const ITensorInfo *tensor_info_obj) + { return tensor_info_obj->data_type() != tensor_data_type; }), + function, file, line, "Tensors have different data types"); return arm_compute::Status{}; } /** Return an error if the passed two tensors have different data types @@ -616,19 +662,21 @@ inline arm_compute::Status error_on_mismatching_data_types(const char *function, * @return Status */ template -inline arm_compute::Status error_on_mismatching_data_types(const char *function, const char *file, const int line, - const ITensor *tensor, Ts... tensors) +inline arm_compute::Status error_on_mismatching_data_types( + const char *function, const char *file, const int line, const ITensor *tensor, Ts... tensors) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line); ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, tensors...)); - ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_data_types(function, file, line, tensor->info(), - detail::get_tensor_info_t()(tensors)...)); + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_data_types( + function, file, line, tensor->info(), detail::get_tensor_info_t()(tensors)...)); return arm_compute::Status{}; } #define ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(...) \ - ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_data_types(__func__, __FILE__, __LINE__, __VA_ARGS__)) + ARM_COMPUTE_ERROR_THROW_ON( \ + ::arm_compute::error_on_mismatching_data_types(__func__, __FILE__, __LINE__, __VA_ARGS__)) #define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...) \ - ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_data_types(__func__, __FILE__, __LINE__, __VA_ARGS__)) + ARM_COMPUTE_RETURN_ON_ERROR( \ + ::arm_compute::error_on_mismatching_data_types(__func__, __FILE__, __LINE__, __VA_ARGS__)) /** Return an error if the passed tensor infos have different asymmetric quantized data types or different quantization info * @@ -644,28 +692,32 @@ inline arm_compute::Status error_on_mismatching_data_types(const char *function, * @return Status */ template -inline arm_compute::Status error_on_mismatching_quantization_info(const char *function, const char *file, const int line, - const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos) +inline arm_compute::Status error_on_mismatching_quantization_info(const char *function, + const char *file, + const int line, + const ITensorInfo *tensor_info_1, + const ITensorInfo *tensor_info_2, + Ts... tensor_infos) { DataType &&first_data_type = tensor_info_1->data_type(); const QuantizationInfo first_quantization_info = tensor_info_1->quantization_info(); - if(!is_data_type_quantized(first_data_type)) + if (!is_data_type_quantized(first_data_type)) { return arm_compute::Status{}; } - const std::array < const ITensorInfo *, 1 + sizeof...(Ts) > tensor_infos_array{ { tensor_info_2, std::forward(tensor_infos)... 
} }; - ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), [&](const ITensorInfo * tensor_info) - { - return tensor_info->data_type() != first_data_type; - }), - function, file, line, "Tensors have different asymmetric quantized data types"); - ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), [&](const ITensorInfo * tensor_info) - { - return tensor_info->quantization_info() != first_quantization_info; - }), - function, file, line, "Tensors have different quantization information"); + const std::array tensor_infos_array{ + {tensor_info_2, std::forward(tensor_infos)...}}; + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), + [&](const ITensorInfo *tensor_info) + { return tensor_info->data_type() != first_data_type; }), + function, file, line, "Tensors have different asymmetric quantized data types"); + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG( + std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), + [&](const ITensorInfo *tensor_info) + { return tensor_info->quantization_info() != first_quantization_info; }), + function, file, line, "Tensors have different quantization information"); return arm_compute::Status{}; } @@ -683,17 +735,24 @@ inline arm_compute::Status error_on_mismatching_quantization_info(const char *fu * @return Status */ template -inline arm_compute::Status error_on_mismatching_quantization_info(const char *function, const char *file, const int line, - const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors) +inline arm_compute::Status error_on_mismatching_quantization_info(const char *function, + const char *file, + const int line, + const ITensor *tensor_1, + const ITensor *tensor_2, + Ts... tensors) { - ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_quantization_info(function, file, line, tensor_1->info(), tensor_2->info(), - detail::get_tensor_info_t()(tensors)...)); + ARM_COMPUTE_RETURN_ON_ERROR( + ::arm_compute::error_on_mismatching_quantization_info(function, file, line, tensor_1->info(), tensor_2->info(), + detail::get_tensor_info_t()(tensors)...)); return arm_compute::Status{}; } #define ARM_COMPUTE_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(...) \ - ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_quantization_info(__func__, __FILE__, __LINE__, __VA_ARGS__)) + ARM_COMPUTE_ERROR_THROW_ON( \ + ::arm_compute::error_on_mismatching_quantization_info(__func__, __FILE__, __LINE__, __VA_ARGS__)) #define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(...) \ - ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_quantization_info(__func__, __FILE__, __LINE__, __VA_ARGS__)) + ARM_COMPUTE_RETURN_ON_ERROR( \ + ::arm_compute::error_on_mismatching_quantization_info(__func__, __FILE__, __LINE__, __VA_ARGS__)) /** Throw an error if the format of the passed tensor/multi-image does not match any of the formats provided. * @@ -705,8 +764,8 @@ inline arm_compute::Status error_on_mismatching_quantization_info(const char *fu * @param[in] formats (Optional) Further allowed formats. */ template -void error_on_format_not_in(const char *function, const char *file, const int line, - const T *object, F &&format, Fs &&... 
formats) +void error_on_format_not_in( + const char *function, const char *file, const int line, const T *object, F &&format, Fs &&...formats) { ARM_COMPUTE_ERROR_ON_LOC(object == nullptr, function, file, line); @@ -715,17 +774,17 @@ void error_on_format_not_in(const char *function, const char *file, const int li ARM_COMPUTE_ERROR_ON_LOC(object_format == Format::UNKNOWN, function, file, line); - const std::array formats_array{ { std::forward(formats)... } }; + const std::array formats_array{{std::forward(formats)...}}; ARM_COMPUTE_UNUSED(formats_array); - ARM_COMPUTE_ERROR_ON_LOC_MSG(object_format != format && std::none_of(formats_array.begin(), formats_array.end(), [&](const F & f) - { - return f == object_format; - }), - function, file, line, "Format %s not supported by this kernel", string_from_format(object_format).c_str()); + ARM_COMPUTE_ERROR_ON_LOC_MSG( + object_format != format && + std::none_of(formats_array.begin(), formats_array.end(), [&](const F &f) { return f == object_format; }), + function, file, line, "Format %s not supported by this kernel", string_from_format(object_format).c_str()); ARM_COMPUTE_UNUSED(function, format, file, line); } -#define ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(t, ...) ::arm_compute::error_on_format_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__) +#define ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(t, ...) \ + ::arm_compute::error_on_format_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__) /** Return an error if the data type of the passed tensor info does not match any of the data types provided. * @@ -739,20 +798,19 @@ void error_on_format_not_in(const char *function, const char *file, const int li * @return Status */ template -inline arm_compute::Status error_on_data_type_not_in(const char *function, const char *file, const int line, - const ITensorInfo *tensor_info, T &&dt, Ts &&... dts) +inline arm_compute::Status error_on_data_type_not_in( + const char *function, const char *file, const int line, const ITensorInfo *tensor_info, T &&dt, Ts &&...dts) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line); const DataType &tensor_dt = tensor_info->data_type(); //NOLINT ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_dt == DataType::UNKNOWN, function, file, line); - const std::array dts_array{ { std::forward(dts)... } }; - ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor_dt != dt && std::none_of(dts_array.begin(), dts_array.end(), [&](const T & d) - { - return d == tensor_dt; - }), - function, file, line, "ITensor data type %s not supported by this kernel", string_from_data_type(tensor_dt).c_str()); + const std::array dts_array{{std::forward(dts)...}}; + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR( + tensor_dt != dt && std::none_of(dts_array.begin(), dts_array.end(), [&](const T &d) { return d == tensor_dt; }), + function, file, line, "ITensor data type %s not supported by this kernel", + string_from_data_type(tensor_dt).c_str()); return arm_compute::Status{}; } /** Return an error if the data type of the passed tensor does not match any of the data types provided. @@ -767,11 +825,12 @@ inline arm_compute::Status error_on_data_type_not_in(const char *function, const * @return Status */ template -inline arm_compute::Status error_on_data_type_not_in(const char *function, const char *file, const int line, - const ITensor *tensor, T &&dt, Ts &&... 
dts) +inline arm_compute::Status error_on_data_type_not_in( + const char *function, const char *file, const int line, const ITensor *tensor, T &&dt, Ts &&...dts) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line); - ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_type_not_in(function, file, line, tensor->info(), std::forward(dt), std::forward(dts)...)); + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_type_not_in( + function, file, line, tensor->info(), std::forward(dt), std::forward(dts)...)); return arm_compute::Status{}; } #define ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(t, ...) \ @@ -791,20 +850,19 @@ inline arm_compute::Status error_on_data_type_not_in(const char *function, const * @return Status */ template -inline arm_compute::Status error_on_data_layout_not_in(const char *function, const char *file, const int line, - const ITensorInfo *tensor_info, T &&dl, Ts &&... dls) +inline arm_compute::Status error_on_data_layout_not_in( + const char *function, const char *file, const int line, const ITensorInfo *tensor_info, T &&dl, Ts &&...dls) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line); const DataLayout &tensor_dl = tensor_info->data_layout(); //NOLINT ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_dl == DataLayout::UNKNOWN, function, file, line); - const std::array dls_array{ { std::forward(dls)... } }; - ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor_dl != dl && std::none_of(dls_array.begin(), dls_array.end(), [&](const T & l) - { - return l == tensor_dl; - }), - function, file, line, "ITensor data layout %s not supported by this kernel", string_from_data_layout(tensor_dl).c_str()); + const std::array dls_array{{std::forward(dls)...}}; + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR( + tensor_dl != dl && std::none_of(dls_array.begin(), dls_array.end(), [&](const T &l) { return l == tensor_dl; }), + function, file, line, "ITensor data layout %s not supported by this kernel", + string_from_data_layout(tensor_dl).c_str()); return arm_compute::Status{}; } /** Return an error if the data layout of the passed tensor does not match any of the data layout provided. @@ -819,17 +877,19 @@ inline arm_compute::Status error_on_data_layout_not_in(const char *function, con * @return Status */ template -inline arm_compute::Status error_on_data_layout_not_in(const char *function, const char *file, const int line, - const ITensor *tensor, T &&dl, Ts &&... dls) +inline arm_compute::Status error_on_data_layout_not_in( + const char *function, const char *file, const int line, const ITensor *tensor, T &&dl, Ts &&...dls) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line); - ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_layout_not_in(function, file, line, tensor->info(), std::forward(dl), std::forward(dls)...)); + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_layout_not_in( + function, file, line, tensor->info(), std::forward(dl), std::forward(dls)...)); return arm_compute::Status{}; } #define ARM_COMPUTE_ERROR_ON_DATA_LAYOUT_NOT_IN(t, ...) \ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_data_layout_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__)) #define ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(t, ...) 
\ - ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_layout_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__)) + ARM_COMPUTE_RETURN_ON_ERROR( \ + ::arm_compute::error_on_data_layout_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__)) /** Return an error if the data type or the number of channels of the passed tensor info does not match any of the data types and number of channels provided. * @@ -844,12 +904,20 @@ inline arm_compute::Status error_on_data_layout_not_in(const char *function, con * @return Status */ template -inline arm_compute::Status error_on_data_type_channel_not_in(const char *function, const char *file, const int line, - const ITensorInfo *tensor_info, size_t num_channels, T &&dt, Ts &&... dts) +inline arm_compute::Status error_on_data_type_channel_not_in(const char *function, + const char *file, + const int line, + const ITensorInfo *tensor_info, + size_t num_channels, + T &&dt, + Ts &&...dts) { - ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_type_not_in(function, file, line, tensor_info, std::forward(dt), std::forward(dts)...)); + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_type_not_in( + function, file, line, tensor_info, std::forward(dt), std::forward(dts)...)); const size_t tensor_nc = tensor_info->num_channels(); - ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor_nc != num_channels, function, file, line, "Number of channels %zu. Required number of channels %zu", tensor_nc, num_channels); + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor_nc != num_channels, function, file, line, + "Number of channels %zu. Required number of channels %zu", tensor_nc, + num_channels); return arm_compute::Status{}; } /** Return an error if the data type or the number of channels of the passed tensor does not match any of the data types and number of channels provided. @@ -865,17 +933,25 @@ inline arm_compute::Status error_on_data_type_channel_not_in(const char *functio * @return Status */ template -inline arm_compute::Status error_on_data_type_channel_not_in(const char *function, const char *file, const int line, - const ITensor *tensor, size_t num_channels, T &&dt, Ts &&... dts) +inline arm_compute::Status error_on_data_type_channel_not_in(const char *function, + const char *file, + const int line, + const ITensor *tensor, + size_t num_channels, + T &&dt, + Ts &&...dts) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line); - ARM_COMPUTE_RETURN_ON_ERROR(error_on_data_type_channel_not_in(function, file, line, tensor->info(), num_channels, std::forward(dt), std::forward(dts)...)); + ARM_COMPUTE_RETURN_ON_ERROR(error_on_data_type_channel_not_in(function, file, line, tensor->info(), num_channels, + std::forward(dt), std::forward(dts)...)); return arm_compute::Status{}; } #define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c, ...) \ - ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_data_type_channel_not_in(__func__, __FILE__, __LINE__, t, c, __VA_ARGS__)) + ARM_COMPUTE_ERROR_THROW_ON( \ + ::arm_compute::error_on_data_type_channel_not_in(__func__, __FILE__, __LINE__, t, c, __VA_ARGS__)) #define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c, ...) 
\ - ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_type_channel_not_in(__func__, __FILE__, __LINE__, t, c, __VA_ARGS__)) + ARM_COMPUTE_RETURN_ON_ERROR( \ + ::arm_compute::error_on_data_type_channel_not_in(__func__, __FILE__, __LINE__, t, c, __VA_ARGS__)) /** Return an error if the data type of the passed tensor info is FP16 and FP16 extension is not supported by the device. * @@ -887,12 +963,12 @@ inline arm_compute::Status error_on_data_type_channel_not_in(const char *functio * * @return Status */ -inline arm_compute::Status error_on_unsupported_fp16(const char *function, const char *file, const int line, - const ITensorInfo *tensor_info, bool is_fp16_supported) +inline arm_compute::Status error_on_unsupported_fp16( + const char *function, const char *file, const int line, const ITensorInfo *tensor_info, bool is_fp16_supported) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line); - ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG((tensor_info->data_type() == DataType::F16 && !is_fp16_supported), - function, file, line, "FP16 not supported by the device"); + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG((tensor_info->data_type() == DataType::F16 && !is_fp16_supported), function, + file, line, "FP16 not supported by the device"); return arm_compute::Status{}; } @@ -906,11 +982,12 @@ inline arm_compute::Status error_on_unsupported_fp16(const char *function, const * * @return Status */ -inline arm_compute::Status error_on_unsupported_fp16(const char *function, const char *file, const int line, - const ITensor *tensor, bool is_fp16_supported) +inline arm_compute::Status error_on_unsupported_fp16( + const char *function, const char *file, const int line, const ITensor *tensor, bool is_fp16_supported) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line); - ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_fp16(function, file, line, tensor->info(), is_fp16_supported)); + ARM_COMPUTE_RETURN_ON_ERROR( + ::arm_compute::error_on_unsupported_fp16(function, file, line, tensor->info(), is_fp16_supported)); return arm_compute::Status{}; } @@ -923,8 +1000,8 @@ inline arm_compute::Status error_on_unsupported_fp16(const char *function, const * * @return Status */ -arm_compute::Status error_on_tensor_not_2d(const char *function, const char *file, const int line, - const ITensor *tensor); +arm_compute::Status +error_on_tensor_not_2d(const char *function, const char *file, const int line, const ITensor *tensor); /** Return an error if the tensor info is not 2D. * @@ -935,8 +1012,8 @@ arm_compute::Status error_on_tensor_not_2d(const char *function, const char *fil * * @return Status */ -arm_compute::Status error_on_tensor_not_2d(const char *function, const char *file, const int line, - const ITensorInfo *tensor); +arm_compute::Status +error_on_tensor_not_2d(const char *function, const char *file, const int line, const ITensorInfo *tensor); #define ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(t) \ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_tensor_not_2d(__func__, __FILE__, __LINE__, t)) @@ -955,17 +1032,15 @@ arm_compute::Status error_on_tensor_not_2d(const char *function, const char *fil * @return Status */ template -inline arm_compute::Status error_on_channel_not_in(const char *function, const char *file, const int line, - T cn, T &&channel, Ts &&... 
channels) +inline arm_compute::Status +error_on_channel_not_in(const char *function, const char *file, const int line, T cn, T &&channel, Ts &&...channels) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(cn == Channel::UNKNOWN, function, file, line); - const std::array channels_array{ { std::forward(channels)... } }; - ARM_COMPUTE_RETURN_ERROR_ON_LOC(channel != cn && std::none_of(channels_array.begin(), channels_array.end(), [&](const T & f) - { - return f == cn; - }), - function, file, line); + const std::array channels_array{{std::forward(channels)...}}; + ARM_COMPUTE_RETURN_ERROR_ON_LOC(channel != cn && std::none_of(channels_array.begin(), channels_array.end(), + [&](const T &f) { return f == cn; }), + function, file, line); return arm_compute::Status{}; } #define ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN(c, ...) \ @@ -983,8 +1058,8 @@ inline arm_compute::Status error_on_channel_not_in(const char *function, const c * * @return Status */ -arm_compute::Status error_on_channel_not_in_known_format(const char *function, const char *file, const int line, - Format fmt, Channel cn); +arm_compute::Status +error_on_channel_not_in_known_format(const char *function, const char *file, const int line, Format fmt, Channel cn); #define ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(f, c) \ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_channel_not_in_known_format(__func__, __FILE__, __LINE__, f, c)) #define ARM_COMPUTE_RETURN_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(f, c) \ @@ -999,8 +1074,8 @@ arm_compute::Status error_on_channel_not_in_known_format(const char *function, c * * @return Status */ -arm_compute::Status error_on_unconfigured_kernel(const char *function, const char *file, const int line, - const IKernel *kernel); +arm_compute::Status +error_on_unconfigured_kernel(const char *function, const char *file, const int line, const IKernel *kernel); #define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k) \ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unconfigured_kernel(__func__, __FILE__, __LINE__, k)) #define ARM_COMPUTE_RETURN_ERROR_ON_UNCONFIGURED_KERNEL(k) \ @@ -1017,8 +1092,12 @@ arm_compute::Status error_on_unconfigured_kernel(const char *function, const cha * * @return Status */ -arm_compute::Status error_on_invalid_subtensor(const char *function, const char *file, const int line, - const TensorShape &parent_shape, const Coordinates &coords, const TensorShape &shape); +arm_compute::Status error_on_invalid_subtensor(const char *function, + const char *file, + const int line, + const TensorShape &parent_shape, + const Coordinates &coords, + const TensorShape &shape); #define ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(p, c, s) \ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_invalid_subtensor(__func__, __FILE__, __LINE__, p, c, s)) #define ARM_COMPUTE_RETURN_ERROR_ON_INVALID_SUBTENSOR(p, c, s) \ @@ -1034,11 +1113,16 @@ arm_compute::Status error_on_invalid_subtensor(const char *function, const char * * @return Status */ -arm_compute::Status error_on_invalid_subtensor_valid_region(const char *function, const char *file, const int line, - const ValidRegion &parent_valid_region, const ValidRegion &valid_region); +arm_compute::Status error_on_invalid_subtensor_valid_region(const char *function, + const char *file, + const int line, + const ValidRegion &parent_valid_region, + const ValidRegion &valid_region); #define ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR_VALID_REGION(pv, sv) \ - ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_invalid_subtensor_valid_region(__func__, __FILE__, __LINE__, pv, sv)) + 
ARM_COMPUTE_ERROR_THROW_ON( \ + ::arm_compute::error_on_invalid_subtensor_valid_region(__func__, __FILE__, __LINE__, pv, sv)) #define ARM_COMPUTE_RETURN_ERROR_ON_INVALID_SUBTENSOR_VALID_REGION(pv, sv) \ - ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_invalid_subtensor_valid_region(__func__, __FILE__, __LINE__, pv, sv)) -} + ARM_COMPUTE_RETURN_ON_ERROR( \ + ::arm_compute::error_on_invalid_subtensor_valid_region(__func__, __FILE__, __LINE__, pv, sv)) +} // namespace arm_compute #endif /* ARM_COMPUTE_VALIDATE_H*/ diff --git a/arm_compute/core/Version.h b/arm_compute/core/Version.h index a4d307950aad10f72d9d97436c3b7ee693d6364b..44d400bad8e33c95db4250cd8ce37cf92a40d436 100644 --- a/arm_compute/core/Version.h +++ b/arm_compute/core/Version.h @@ -28,7 +28,7 @@ /* Macro utilities */ #define ARM_COMPUTE_STRINGIFY2(s) #s -#define ARM_COMPUTE_STRINGIFY(s) ARM_COMPUTE_STRINGIFY2(s) +#define ARM_COMPUTE_STRINGIFY(s) ARM_COMPUTE_STRINGIFY2(s) #define ARM_COMPUTE_VERSION_STR \ ARM_COMPUTE_STRINGIFY(ARM_COMPUTE_VERSION_MAJOR) \ diff --git a/arm_compute/core/Window.h b/arm_compute/core/Window.h index 8ae859f4b325902744fa67a40228035e6383e775..e93d2863c93dc96e8a8b6ffa8b1dfc4a52b82a79 100644 --- a/arm_compute/core/Window.h +++ b/arm_compute/core/Window.h @@ -21,18 +21,18 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_WINDOW_H -#define ARM_COMPUTE_WINDOW_H - -#include -#include -#include +#ifndef ACL_ARM_COMPUTE_CORE_WINDOW_H +#define ACL_ARM_COMPUTE_CORE_WINDOW_H #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/utils/math/Math.h" +#include +#include +#include + namespace arm_compute { /** Describe a multidimensional execution window. */ @@ -86,8 +86,7 @@ public: * @param[in] step Step between two elements of the dimension when iterating. * */ - constexpr Dimension(int start = 0, int end = 1, int step = 1) - : _start(start), _end(end), _step(step) + constexpr Dimension(int start = 0, int end = 1, int step = 1) : _start(start), _end(end), _step(step) { } Dimension(const Dimension &d) = default; @@ -214,15 +213,17 @@ public: */ void shift(size_t dimension, int shift_value); - /** Shift down all the dimensions of a window + /** Shift down all the dimensions of a window starting from the specified dimension. * - * i.e new_dims[n] = old_dims[n+shift_value]. + * new_dims[i] = old_dims[i] for all i < start_dim. + * new_dims[i] = old_dims[i+shift_value] for all i >= start_dim. * * @param[in] shift_value Number of dimensions to shift the window by. + * @param[in] start_dim The dimension from which the dimensions start to shift. * * @return The window with the shifted dimensions. */ - Window shift_dimensions(unsigned int shift_value) const; + Window shift_dimensions(unsigned int shift_value, unsigned int start_dim = 0) const; /** Adjust the start or end of a given dimension by the given value * @@ -373,7 +374,8 @@ public: * * @return Collapsed window. */ - Window collapse_if_possible(const Window &full_window, size_t first, size_t last, bool *has_collapsed = nullptr) const; + Window + collapse_if_possible(const Window &full_window, size_t first, size_t last, bool *has_collapsed = nullptr) const; /** Collapse the dimensions higher than @p first if possible. * @@ -441,7 +443,7 @@ private: * @return The first slice of the window. */ template - Window first_slice_window() const; + Window first_slice_window() const; /** Slide the passed window slice. 
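// A minimal usage sketch of the new start_dim parameter documented for Window::shift_dimensions()
// above: dimensions below start_dim stay in place, dimensions from start_dim upwards are shifted
// down by shift_value, and the trailing dimensions fall back to the default Dimension{0, 1, 1}.
// Assuming a 4D window [A, B, C, D] (the helper function name below is hypothetical):
//   win.shift_dimensions(1)    -> [B, C, D, {0,1,1}, ...]   // previous behaviour (start_dim = 0)
//   win.shift_dimensions(1, 1) -> [A, C, D, {0,1,1}, ...]   // dim 0 preserved, dim 1 removed
#include "arm_compute/core/Window.h"

arm_compute::Window drop_dimension_1(const arm_compute::Window &win)
{
    // Keep dimension 0 untouched and shift every higher dimension down by one.
    return win.shift_dimensions(/* shift_value = */ 1, /* start_dim = */ 1);
}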
* @@ -460,4 +462,4 @@ private: }; } // namespace arm_compute #include "Window.inl" -#endif /*ARM_COMPUTE_WINDOW_H */ +#endif // ACL_ARM_COMPUTE_CORE_WINDOW_H diff --git a/arm_compute/core/Window.inl b/arm_compute/core/Window.inl index 5ee4b57145b04cb47e2133b5eeea5db10bcc7403..0f7c4fbdd72ad0b2b703326f3a13e2bd1904822b 100644 --- a/arm_compute/core/Window.inl +++ b/arm_compute/core/Window.inl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, 2022 Arm Limited. + * Copyright (c) 2016-2020, 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,12 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ + +#ifndef ACL_ARM_COMPUTE_CORE_WINDOW_INL +#define ACL_ARM_COMPUTE_CORE_WINDOW_INL + namespace arm_compute { inline Window::Window(const Window &src) : _dims(), _is_broadcasted(utility::generate_array::value) { - for(size_t i = 0; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = 0; i < Coordinates::num_max_dimensions; ++i) { set(i, src[i]); _is_broadcasted[i] = src.is_broadcasted(i); @@ -65,32 +69,34 @@ inline bool Window::is_broadcasted(size_t dimension) const return _is_broadcasted[dimension]; } -inline Window Window::collapse_if_possible(const Window &full_window, const size_t first, - const size_t last, bool *has_collapsed) const +inline Window Window::collapse_if_possible(const Window &full_window, + const size_t first, + const size_t last, + bool *has_collapsed) const { Window collapsed(*this); bool is_collapsable = true; int collapsed_end = _dims[first].end(); - for(size_t d = first + 1; is_collapsable && (d < last); ++d) + for (size_t d = first + 1; is_collapsable && (d < last); ++d) { // The _dims's dimension must match the full _dims dimension to be collapsable: - is_collapsable = (_dims[d].start() == 0) && (full_window[d].start() == 0) && (_dims[d].step() <= 1) - && (full_window[d].end() == _dims[d].end()); + is_collapsable = (_dims[d].start() == 0) && (full_window[d].start() == 0) && (_dims[d].step() <= 1) && + (full_window[d].end() == _dims[d].end()); collapsed_end *= _dims[d].end(); } - if(is_collapsable) + if (is_collapsable) { collapsed._dims.at(first).set_end(collapsed_end); - for(size_t d = first + 1; is_collapsable && (d < last); ++d) + for (size_t d = first + 1; is_collapsable && (d < last); ++d) { collapsed.set(d, Dimension()); } } - if(has_collapsed != nullptr) + if (has_collapsed != nullptr) { *has_collapsed = is_collapsable; } @@ -98,13 +104,21 @@ inline Window Window::collapse_if_possible(const Window &full_window, const size return collapsed; } -inline Window Window::shift_dimensions(unsigned int shift_value) const +inline Window Window::shift_dimensions(unsigned int shift_value, unsigned int start_dim) const { Window shifted_window; - for(size_t n = 0; n < (Coordinates::num_max_dimensions - shift_value); n++) + size_t n = 0; + + for (; n < start_dim; ++n) + { + shifted_window.set(n, _dims[n]); + } + + for (; n < (Coordinates::num_max_dimensions - shift_value); n++) { shifted_window.set(n, _dims[n + shift_value]); } + return shifted_window; } @@ -120,9 +134,9 @@ inline Window Window::collapse(const Window &full_window, const size_t first, co inline Window Window::broadcast_if_dimension_le_one(const TensorShape &shape) const { Window broadcastWin(*this); - for(size_t d = 0; d < TensorShape::num_max_dimensions; ++d) + for (size_t d = 0; d < TensorShape::num_max_dimensions; ++d) { - if(shape[d] <= 1) + if (shape[d] <= 1) { broadcastWin.set_broadcasted(d); } @@ -142,7 +156,7 @@ inline void 
Window::adjust(size_t dimension, int adjust_value, bool is_at_start) ARM_COMPUTE_ERROR_ON(dimension >= Coordinates::num_max_dimensions); Window::Dimension &d = _dims[dimension]; - if(is_at_start) + if (is_at_start) { d = Window::Dimension(d.start() + adjust_value, d.end(), d.step()); } @@ -172,7 +186,7 @@ inline void Window::set_dimension_step(size_t dimension, int step) inline void Window::validate() const { - for(size_t i = 0; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = 0; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_ERROR_ON(_dims[i].end() < _dims[i].start()); ARM_COMPUTE_ERROR_ON((_dims[i].step() != 0) && (((_dims[i].end() - _dims[i].start()) % _dims[i].step()) != 0)); @@ -193,9 +207,9 @@ inline Window Window::split_window(size_t dimension, size_t id, size_t total) co Window out; - for(size_t d = 0; d < Coordinates::num_max_dimensions; ++d) + for (size_t d = 0; d < Coordinates::num_max_dimensions; ++d) { - if(d == dimension) + if (d == dimension) { int start = _dims[d].start(); int end = _dims[d].end(); @@ -207,7 +221,7 @@ inline Window Window::split_window(size_t dimension, size_t id, size_t total) co int it_start = work * id; - if(int(id) < rem) + if (int(id) < rem) { ++work; it_start += id; @@ -234,18 +248,18 @@ inline Window Window::split_window(size_t dimension, size_t id, size_t total) co template inline bool Window::slide_window_slice(Window &slice) const { - for(unsigned int n = window_dimension; n < Coordinates::num_max_dimensions; ++n) + for (unsigned int n = window_dimension; n < Coordinates::num_max_dimensions; ++n) { // Did we reach the end of this dimension? const int v = slice._dims[n].start() + 1; - if(v < _dims[n].end()) + if (v < _dims[n].end()) { // No: increment slice._dims[n] = Dimension(v, v + 1, 1); // Reset lower dimensions: - for(unsigned int lower = window_dimension; lower < n; ++lower) + for (unsigned int lower = window_dimension; lower < n; ++lower) { slice._dims[lower] = Dimension(_dims[lower].start(), _dims[lower].start() + 1, 1); } @@ -258,14 +272,14 @@ inline bool Window::slide_window_slice(Window &slice) const } template -inline Window Window::first_slice_window() const +inline Window Window::first_slice_window() const { Window slice; std::copy_n(_dims.begin(), window_dimension, slice._dims.begin()); //Initialise higher dimensions to be the first slice. 
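// A minimal sketch of the slice-iteration pattern that first_slice_window() and
// slide_window_slice() implement. Both are private templates; callers are assumed to go through
// public wrappers such as first_slice_window_3D()/slide_window_slice_3D() (not shown in this
// hunk) that forward to them. The function name below is hypothetical.
#include "arm_compute/core/Window.h"

void for_each_3d_slice(const arm_compute::Window &window)
{
    arm_compute::Window slice = window.first_slice_window_3D();
    do
    {
        // Run per-slice work on 'slice' here: dimensions 0..2 keep their configured ranges,
        // dimensions >= 3 are pinned to a single iteration and advanced by the slide call.
    } while (window.slide_window_slice_3D(slice));
}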
- for(unsigned int n = window_dimension; n < Coordinates::num_max_dimensions; ++n) + for (unsigned int n = window_dimension; n < Coordinates::num_max_dimensions; ++n) { slice._dims[n] = Dimension(_dims[n].start(), _dims[n].start() + 1, 1); } @@ -275,7 +289,7 @@ inline Window Window::first_slice_window() const inline void Window::use_tensor_dimensions(const TensorShape &shape, size_t first_dimension) { - for(unsigned int n = first_dimension; n < shape.num_dimensions(); ++n) + for (unsigned int n = first_dimension; n < shape.num_dimensions(); ++n) { set(n, Window::Dimension(0, std::max(shape[n], static_cast(1)))); } @@ -284,7 +298,7 @@ inline void Window::use_tensor_dimensions(const TensorShape &shape, size_t first inline TensorShape Window::shape() const { TensorShape shape; - for(size_t d = 0; d < TensorShape::num_max_dimensions; ++d) + for (size_t d = 0; d < TensorShape::num_max_dimensions; ++d) { shape.set(d, (_dims[d].end() - _dims[d].start()) / _dims[d].step()); } @@ -294,7 +308,7 @@ inline TensorShape Window::shape() const inline size_t Window::num_iterations_total() const { size_t total = 1; - for(size_t d = 0; d < Coordinates::num_max_dimensions; ++d) + for (size_t d = 0; d < Coordinates::num_max_dimensions; ++d) { total *= num_iterations(d); } @@ -311,3 +325,5 @@ inline bool operator==(const Window &lhs, const Window &rhs) return (lhs._dims == rhs._dims) && (lhs._is_broadcasted == rhs._is_broadcasted); } } // namespace arm_compute + +#endif // ACL_ARM_COMPUTE_CORE_WINDOW_INL diff --git a/arm_compute/core/WindowIterator.h b/arm_compute/core/WindowIterator.h index b1e399c87296d1ce755182da5effa03aded7a7ed..29302c410ae5819e01704272edfaee23b1da6397 100644 --- a/arm_compute/core/WindowIterator.h +++ b/arm_compute/core/WindowIterator.h @@ -28,7 +28,6 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Window.h" - namespace arm_compute { /** Convert an offset in window steps into absolute coordinates. @@ -41,7 +40,7 @@ namespace arm_compute inline Coordinates convert_window_coord_to_position(const Window &w, const Coordinates &offset) { Coordinates position; - for(unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i) + for (unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i) { position.set(i, w[i].start() + offset[i] * w[i].step()); } @@ -165,7 +164,7 @@ public: template void iterate_3D(M &&on_new_row_size) { - while(_end.z() != _position.z()) + while (_end.z() != _position.z()) { iterate_2D_internal(on_new_row_size, _w.x().end() - _w.x().step(), _w.y().end() - _w.y().step()); _position[2] += _w.z().step(); @@ -212,7 +211,7 @@ private: void iterate_2D_internal(M &&on_new_row_size, int end_x, int end_y) { //Is there more than one row to process ? - if(end_y == _position.y()) + if (end_y == _position.y()) { // Both start and end belong to the same row: iterate_over_dim0(end_x + _w.x().step(), on_new_row_size); @@ -220,7 +219,7 @@ private: else { // Do we start from the beginning of the row ? - if(_w.x().start() != _position.x()) + if (_w.x().start() != _position.x()) { //Start in the middle of a row: process left-over X iterate_over_dim0(_w.x().end(), on_new_row_size); @@ -229,7 +228,7 @@ private: //Middle rows bool no_leftover = end_x + _w.x().step() == _w.x().end(); - if(no_leftover) + if (no_leftover) { //Switch to full row size: on_new_row_size(_w[0].start(), _w.x().end()); @@ -241,7 +240,7 @@ private: else { // Are there full rows to process ? 
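// A minimal sketch of driving create_window_iterator(): the per-element lambda receives absolute
// Coordinates, while the callback passed to iterate_3D() is invoked whenever the x-range of the
// rows being processed changes. The start/end convention used here (inclusive end position built
// via convert_window_coord_to_position()) is an assumption of this example, as are the function
// and variable names.
#include "arm_compute/core/Window.h"
#include "arm_compute/core/WindowIterator.h"

#include <cstdint>

int64_t sum_of_coordinates(const arm_compute::Window &win)
{
    using namespace arm_compute;
    int64_t           acc   = 0;
    const Coordinates start = convert_window_coord_to_position(win, Coordinates(0, 0));
    const Coordinates end   = convert_window_coord_to_position(
        win, Coordinates(win.num_iterations(0) - 1, win.num_iterations(1) - 1));
    auto it = create_window_iterator(win, start, end,
                                     [&](const Coordinates &id) { acc += id.x() + id.y(); });
    it.iterate_3D([](int row_start_x, int row_end_x) { (void)row_start_x; (void)row_end_x; });
    return acc;
}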
- if(_position[1] != end_y) + if (_position[1] != end_y) { //Switch to full row size: on_new_row_size(_w[0].start(), _w.x().end()); @@ -261,7 +260,7 @@ private: */ void iterate_over_dim1(int end) { - for(; _position[1] != end; _position[1] += _w[1].step()) + for (; _position[1] != end; _position[1] += _w[1].step()) { _position[0] = _w[0].start(); iterate_over_dim0(_w[0].end()); @@ -288,7 +287,7 @@ private: { // Both start and end belong to the same row: ARM_COMPUTE_ERROR_ON(_position[0] > end); - for(; _position.x() < end; _position[0] += _w[0].step()) + for (; _position.x() < end; _position[0] += _w[0].step()) { _lambda_function(_position); } @@ -310,9 +309,10 @@ private: * @return A WindowIterator object. */ template -WindowIterator create_window_iterator(const Window &w, const Coordinates &start, const Coordinates &end, L &&lambda_function) +WindowIterator +create_window_iterator(const Window &w, const Coordinates &start, const Coordinates &end, L &&lambda_function) { return WindowIterator(w, start, end, std::move(lambda_function)); } -} +} // namespace arm_compute #endif /*ARM_COMPUTE_WINDOW_ITERATOR_H*/ diff --git a/arm_compute/core/experimental/IPostOp.h b/arm_compute/core/experimental/IPostOp.h deleted file mode 100644 index 567a4023c0cde1e2d065eaa9285204366c052bb5..0000000000000000000000000000000000000000 --- a/arm_compute/core/experimental/IPostOp.h +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_EXPERIMENTAL_IPOSTOP -#define ARM_COMPUTE_EXPERIMENTAL_IPOSTOP - -#include -#include -#include - -namespace arm_compute -{ -namespace experimental -{ -/** Type of Post Op */ -enum class PostOpType -{ - Activation, - Eltwise_Add, - Eltwise_PRelu -}; -/** An ordered sequence of type of Post Ops */ -using PostOpTypeSequence = std::vector; -/** An elementwise n-ary operation that can be appended to and fused with (at kernel-level) other operators - * It contains: - * 1. The attributes of the original operator. - * 2. Any additional tensor argument. - * 3. 
The position of the previous op's dst tensor in its argument list ( @ref prev_dst_pos ) - * - * For example, a series of chained ops: - * - * div(src1, relu(conv(src0, weights, bias, conv_info), act_info), div_info) - * - * translates to - * - * dst = conv(src0, weights, bias, conv_info) // main op - * dst = relu(dst, act_info) // previous dst is placed in the first (and only) argument - * dst = div(src1, dst, div_info) // previous dst is placed in the second argument - * - * which in turn translates to: - * - * main op: conv(src0, weights, bias, conv_info) - * post op1: relu(act_info, prev_dst_pos = 0) - * post op2: div(div_info, src1, prev_dst_pos = 1) - * - * @note: On Broadcasting - * For n-ary post ops, the tensor arguments must not "widen" the dst tensor of the main op - * For example, for a dst of shape [14, 1, 34]: - * * post_op_arg1 = [1, 1, 34] is allowed: broadcast in dim 0 - * * post_op_arg1 = [14, 1, 34] is allowed: no broadcast - * * post_op_arg1 = [1, 1, 34] is allowed: broadcast in dims 0 and 1 - * * post_op_arg1 = [14, 15, 34] is NOT allowed: broadcast widens the dst tensor - * - * @note: On Data layout - * All post ops are data layout agnostic. This means post ops do not have an inherent idea of "width", "height" and so on. - * Should we want to perform a post op with 2 tensors of different data layouts (where data layouts are significant to both), - * then we need to perform necessary permutation op beforehand to unify their data layout before they can be fused with a post op - * - * Note although post ops themselves should be able to support any data layout, the main op they fuse to may impose - * additional restrictions in the presence of post ops. For example, the implementation of a gemm op may only allow - * NHWC data layout if post ops are provided. Such restrictions are main op implementation specific. - * - * @note: PostOps do not own any resources pointed to by TensorRelatedT if it's a pointer type - * @note: If TensorRelatedT points to a resource, IPostOp assumes that resource is valid throughout its lifetime - * and the lifetime of its copies. 
This is almost guaranteed as IPostOp is only meant to be used at configure time - * after the ITensor or ITensorInfo objects are already constructed - */ -template -struct IPostOp -{ - /** Get the arity of the post op - * @note: that this is one fewer than the arity of the original op, because we implicitly pass the previous op's dst - * tensor as one of the arguments - */ - size_t arity() const - { - return arguments().size(); - } - /** The position of previous op's dst in current op's argument list */ - virtual int prev_dst_pos() const = 0; - /** The IPostOp type */ - virtual PostOpType type() const = 0; - /** The argument tensors - * The order of the argument tensor is strictly preserved - */ - virtual std::vector arguments() = 0; - virtual std::vector arguments() const = 0; - /** Clone method used in cases where PostOps are owned by unique_ptr - * @note: This performs a shallow copy of the TensorRelatedT if TensorRelatedT points to a resource - */ - virtual std::unique_ptr> clone() const = 0; - virtual ~IPostOp() - { - } -}; - -/** A sequence of PostOps that can be appended to the end of other operators */ -template -class PostOpList -{ -public: - /** Constructor */ - PostOpList() = default; - /** Destructor */ - ~PostOpList() = default; - PostOpList(const PostOpList &other) - { - for(const auto &op : other._post_ops) - { - this->_post_ops.push_back(op->clone()); - } - } - PostOpList &operator=(const PostOpList &other) - { - PostOpList tmp{ other }; - std::swap(tmp, *this); - return *this; - } - PostOpList(PostOpList &&other) = default; - PostOpList &operator=(PostOpList &&other) = default; - - /** Add a new post op at the end of the list */ - template - void push_back_op(Args &&... args) - { - _post_ops.push_back(std::make_unique(std::forward(args)...)); - } - - /** Number of post ops */ - size_t size() const - { - return _post_ops.size(); - } - - /** Total number of post ops */ - size_t total_num_arguments() const - { - return std::accumulate(_post_ops.begin(), _post_ops.end(), 0, [](size_t op1_arity, const auto & op2) - { - return op1_arity + op2->arity(); - }); - } - - /** Get the underlying post op list */ - std::vector>> &get_list() - { - return _post_ops; - } - const std::vector>> &get_list() const - { - return _post_ops; - } - -private: - std::vector>> _post_ops{}; -}; - -} // namespace experimental -} // namespace arm_compute -#endif //ARM_COMPUTE_EXPERIMENTAL_IPOSTOP diff --git a/arm_compute/core/experimental/PostOps.h b/arm_compute/core/experimental/PostOps.h deleted file mode 100644 index a5585bab5d6ea2e777d4e4703a6c30dc53b87adc..0000000000000000000000000000000000000000 --- a/arm_compute/core/experimental/PostOps.h +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Copyright (c) 2021, 2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_EXPERIMENTAL_POSTOPS -#define ARM_COMPUTE_EXPERIMENTAL_POSTOPS - -#include "arm_compute/core/experimental/IPostOp.h" - -#include "arm_compute/core/Types.h" -#include "arm_compute/function_info/ActivationLayerInfo.h" - -#include - -namespace arm_compute -{ -namespace experimental -{ -/** (EXPERIMENTAL_POST_OPS) - * Implementation of specific IPostOps -*/ - -template -struct PostOpAct : public IPostOp -{ -public: - PostOpAct(const ActivationLayerInfo &act_info) - : _act_info{ act_info } - { - } - // NOTE: PostOps do not own any resources pointed to by TensorRelatedT if it's a pointer type, thus allow shallow copy - ~PostOpAct() override = default; - PostOpAct(const PostOpAct &) = default; - PostOpAct &operator=(const PostOpAct &) = default; - PostOpAct(PostOpAct &&) = default; - PostOpAct &operator=(PostOpAct &&) = default; - - int prev_dst_pos() const override - { - return 0; - } - PostOpType type() const override - { - return PostOpType::Activation; - } - std::vector arguments() override - { - return {}; - } - std::vector arguments() const override - { - return {}; - } - std::unique_ptr> clone() const override - { - return std::make_unique>(*this); - } - ActivationLayerInfo _act_info; -}; - -template -struct PostOpEltwiseAdd : public IPostOp -{ -public: - PostOpEltwiseAdd(TensorRelatedT addend, int prev_dst_pos, ConvertPolicy policy) - : _addend{ addend }, - _prev_dst_pos{ prev_dst_pos }, - _policy{ policy } - { - } - // NOTE: PostOps do not own any resources pointed to by TensorRelatedT if it's a pointer type, thus allow shallow copy - ~PostOpEltwiseAdd() override = default; - PostOpEltwiseAdd(const PostOpEltwiseAdd &) = default; - PostOpEltwiseAdd &operator=(const PostOpEltwiseAdd &) = default; - PostOpEltwiseAdd(PostOpEltwiseAdd &&) = default; - PostOpEltwiseAdd &operator=(PostOpEltwiseAdd &&) = default; - int prev_dst_pos() const override - { - return _prev_dst_pos; - } - PostOpType type() const override - { - return PostOpType::Eltwise_Add; - } - std::vector arguments() override - { - return { &_addend }; - } - std::vector arguments() const override - { - return { &_addend }; - } - std::unique_ptr> clone() const override - { - return std::make_unique>(*this); - } - TensorRelatedT _addend; - int _prev_dst_pos; - ConvertPolicy _policy; -}; - -template -struct PostOpEltwisePRelu : public IPostOp -{ -public: - PostOpEltwisePRelu(TensorRelatedT alpha_param, int prev_dst_pos, ConvertPolicy policy) - : _alpha_param{ alpha_param }, - _prev_dst_pos{ prev_dst_pos }, - _policy{ policy } - { - } - // NOTE: PostOps do not own any resources pointed to by TensorRelatedT if it's a pointer type, thus allow shallow copy - ~PostOpEltwisePRelu() override = default; - PostOpEltwisePRelu(const PostOpEltwisePRelu &) = default; - PostOpEltwisePRelu &operator=(const PostOpEltwisePRelu &) = default; - PostOpEltwisePRelu(PostOpEltwisePRelu &&) = default; - PostOpEltwisePRelu &operator=(PostOpEltwisePRelu &&) = default; - int prev_dst_pos() const override - { - return _prev_dst_pos; - } - PostOpType 
type() const override - { - return PostOpType::Eltwise_PRelu; - } - std::vector arguments() override - { - return { &_alpha_param }; - } - std::vector arguments() const override - { - return { &_alpha_param }; - } - std::unique_ptr> clone() const override - { - return std::make_unique>(*this); - } - TensorRelatedT _alpha_param; - int _prev_dst_pos; - ConvertPolicy _policy; -}; -} // namespace experimental -} // namespace arm_compute -#endif //ARM_COMPUTE_EXPERIMENTAL_POSTOPS diff --git a/arm_compute/core/experimental/Types.h b/arm_compute/core/experimental/Types.h index 1995ab045e9b7b485d860aafac6b495148e9ae01..63a3a1a1ec968281466c3ccb99cb63fd1a9f3ec5 100644 --- a/arm_compute/core/experimental/Types.h +++ b/arm_compute/core/experimental/Types.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022 Arm Limited. + * Copyright (c) 2020-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_EXPERIMENTAL_TYPES_H -#define ARM_COMPUTE_EXPERIMENTAL_TYPES_H +#ifndef ACL_ARM_COMPUTE_CORE_EXPERIMENTAL_TYPES_H +#define ACL_ARM_COMPUTE_CORE_EXPERIMENTAL_TYPES_H #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/TensorShape.h" @@ -78,11 +78,6 @@ enum TensorType : int32_t ACL_VEC_COL_SUM = ACL_SRC_4, ACL_SHIFTS = ACL_SRC_5, ACL_MULTIPLIERS = ACL_SRC_6, - - // (EXPERIMENTAL_POST_OPS) Post ops arguments begin after everything else - EXPERIMENTAL_ACL_POST_OP_ARG = 2048, - EXPERIMENTAL_ACL_POST_OP_ARG_FIRST = EXPERIMENTAL_ACL_POST_OP_ARG, - EXPERIMENTAL_ACL_POST_OP_ARG_LAST = EXPERIMENTAL_ACL_POST_OP_ARG_FIRST + 1024, // Max number of post op arguments }; namespace experimental @@ -97,24 +92,18 @@ struct MemoryInfo { MemoryInfo() = default; - MemoryInfo(int slot, size_t size, size_t alignment = 0) noexcept - : slot(slot), - size(size), - alignment(alignment) + MemoryInfo(int slot, size_t size, size_t alignment = 0) noexcept : slot(slot), size(size), alignment(alignment) { } MemoryInfo(int slot, MemoryLifetime lifetime, size_t size, size_t alignment = 0) noexcept - : slot(slot), - lifetime(lifetime), - size(size), - alignment(alignment) + : slot(slot), lifetime(lifetime), size(size), alignment(alignment) { } bool merge(int slot, size_t new_size, size_t new_alignment = 0) noexcept { - if(slot != this->slot) + if (slot != this->slot) { return false; } @@ -125,13 +114,13 @@ struct MemoryInfo return true; } - int slot{ ACL_UNKNOWN }; - MemoryLifetime lifetime{ MemoryLifetime::Temporary }; - size_t size{ 0 }; - size_t alignment{ 64 }; + int slot{ACL_UNKNOWN}; + MemoryLifetime lifetime{MemoryLifetime::Temporary}; + size_t size{0}; + size_t alignment{64}; }; using MemoryRequirements = std::vector; } // namespace experimental } // namespace arm_compute -#endif /* ARM_COMPUTE_EXPERIMENTAL_TYPES_H */ +#endif // ACL_ARM_COMPUTE_CORE_EXPERIMENTAL_TYPES_H diff --git a/arm_compute/core/utils/ActivationFunctionUtils.h b/arm_compute/core/utils/ActivationFunctionUtils.h index 1cb66da13de6c17d9413268ca848498141260a12..c988efa256c13bcbde30e9610463f40d34e46e94 100644 --- a/arm_compute/core/utils/ActivationFunctionUtils.h +++ b/arm_compute/core/utils/ActivationFunctionUtils.h @@ -37,5 +37,5 @@ namespace arm_compute * @return The string describing the activation function. 
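// A minimal sketch of describing an operator workspace with the MemoryInfo / MemoryRequirements
// types kept in experimental/Types.h above. The slot ids are arbitrary placeholders, and
// MemoryLifetime::Persistent is assumed from the enum alongside the Temporary default.
#include "arm_compute/core/experimental/Types.h"

arm_compute::experimental::MemoryRequirements example_workspace()
{
    using namespace arm_compute::experimental;
    MemoryRequirements reqs;
    // Scratch buffer, only needed while the operator runs; 64-byte alignment is the default.
    reqs.push_back(MemoryInfo(/* slot = */ 0, MemoryLifetime::Temporary, /* size = */ 4096));
    // Buffer assumed to outlive a single run (e.g. reshaped weights).
    reqs.push_back(MemoryInfo(/* slot = */ 1, MemoryLifetime::Persistent, /* size = */ 1u << 20));
    return reqs;
}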
*/ const std::string &string_from_activation_func(const ActivationFunction &act); -} +} // namespace arm_compute #endif /*ARM_COMPUTE_CORE_UTILS_ACTIVATIONFUNCTIONUTILS_H */ diff --git a/arm_compute/core/utils/DataLayoutUtils.h b/arm_compute/core/utils/DataLayoutUtils.h index 399f55c63f6ad423ef5cac7d26c4c6202ef4629b..61839c9f91ab6f0ebdff9d4fd198e8ea69b9d225 100644 --- a/arm_compute/core/utils/DataLayoutUtils.h +++ b/arm_compute/core/utils/DataLayoutUtils.h @@ -36,5 +36,5 @@ namespace arm_compute * @return The string describing the data layout. */ const std::string &string_from_data_layout(DataLayout dl); -} +} // namespace arm_compute #endif /*ARM_COMPUTE_CORE_UTILS_DATALAYOUTUTILS_H */ diff --git a/arm_compute/core/utils/DataTypeUtils.h b/arm_compute/core/utils/DataTypeUtils.h index cbb409c8a10c7ca10caf28a935fbcba9ce85936c..7ea5eb767062b3eebfef45309a706422670f64f2 100644 --- a/arm_compute/core/utils/DataTypeUtils.h +++ b/arm_compute/core/utils/DataTypeUtils.h @@ -37,7 +37,7 @@ namespace arm_compute */ inline size_t data_size_from_type(DataType data_type) { - switch(data_type) + switch (data_type) { case DataType::U8: case DataType::S8: @@ -77,7 +77,7 @@ inline size_t data_size_from_type(DataType data_type) */ inline size_t element_size_from_data_type(DataType dt) { - switch(dt) + switch (dt) { case DataType::S8: case DataType::U8: @@ -114,7 +114,7 @@ inline size_t element_size_from_data_type(DataType dt) */ inline DataType data_type_from_format(Format format) { - switch(format) + switch (format) { case Format::U8: case Format::UV88: @@ -158,7 +158,7 @@ inline DataType data_type_from_format(Format format) */ inline DataType get_promoted_data_type(DataType dt) { - switch(dt) + switch (dt) { case DataType::U8: return DataType::U16; @@ -196,7 +196,7 @@ inline std::tuple get_min_max(DataType dt) { PixelValue min{}; PixelValue max{}; - switch(dt) + switch (dt) { case DataType::U8: case DataType::QASYMM8: @@ -303,7 +303,7 @@ inline ::std::istream &operator>>(::std::istream &stream, DataType &data_type) */ inline bool is_data_type_float(DataType dt) { - switch(dt) + switch (dt) { case DataType::F16: case DataType::F32: @@ -323,7 +323,7 @@ inline bool is_data_type_float(DataType dt) */ inline bool is_data_type_quantized(DataType dt) { - switch(dt) + switch (dt) { case DataType::QSYMM8: case DataType::QASYMM8: @@ -345,7 +345,7 @@ inline bool is_data_type_quantized(DataType dt) */ inline bool is_data_type_quantized_asymmetric(DataType dt) { - switch(dt) + switch (dt) { case DataType::QASYMM8: case DataType::QASYMM8_SIGNED: @@ -364,7 +364,7 @@ inline bool is_data_type_quantized_asymmetric(DataType dt) */ inline bool is_data_type_quantized_asymmetric_signed(DataType dt) { - switch(dt) + switch (dt) { case DataType::QASYMM8_SIGNED: return true; @@ -381,7 +381,7 @@ inline bool is_data_type_quantized_asymmetric_signed(DataType dt) */ inline bool is_data_type_quantized_symmetric(DataType dt) { - switch(dt) + switch (dt) { case DataType::QSYMM8: case DataType::QSYMM8_PER_CHANNEL: @@ -400,7 +400,7 @@ inline bool is_data_type_quantized_symmetric(DataType dt) */ inline bool is_data_type_quantized_per_channel(DataType dt) { - switch(dt) + switch (dt) { case DataType::QSYMM8_PER_CHANNEL: return true; @@ -420,12 +420,13 @@ inline bool is_data_type_quantized_per_channel(DataType dt) template bool check_value_range(T val, DataType dt, QuantizationInfo qinfo = QuantizationInfo()) { - switch(dt) + switch (dt) { case DataType::U8: { const auto val_u8 = static_cast(val); - return ((val_u8 == val) && val >= 
std::numeric_limits::lowest() && val <= std::numeric_limits::max()); + return ((val_u8 == val) && val >= std::numeric_limits::lowest() && + val <= std::numeric_limits::max()); } case DataType::QASYMM8: { @@ -436,29 +437,34 @@ bool check_value_range(T val, DataType dt, QuantizationInfo qinfo = Quantization case DataType::S8: { const auto val_s8 = static_cast(val); - return ((val_s8 == val) && val >= std::numeric_limits::lowest() && val <= std::numeric_limits::max()); + return ((val_s8 == val) && val >= std::numeric_limits::lowest() && + val <= std::numeric_limits::max()); } case DataType::U16: { const auto val_u16 = static_cast(val); - return ((val_u16 == val) && val >= std::numeric_limits::lowest() && val <= std::numeric_limits::max()); + return ((val_u16 == val) && val >= std::numeric_limits::lowest() && + val <= std::numeric_limits::max()); } case DataType::S16: { const auto val_s16 = static_cast(val); - return ((val_s16 == val) && val >= std::numeric_limits::lowest() && val <= std::numeric_limits::max()); + return ((val_s16 == val) && val >= std::numeric_limits::lowest() && + val <= std::numeric_limits::max()); } case DataType::U32: { const auto val_d64 = static_cast(val); const auto val_u32 = static_cast(val); - return ((val_u32 == val_d64) && val_d64 >= std::numeric_limits::lowest() && val_d64 <= std::numeric_limits::max()); + return ((val_u32 == val_d64) && val_d64 >= std::numeric_limits::lowest() && + val_d64 <= std::numeric_limits::max()); } case DataType::S32: { const auto val_d64 = static_cast(val); const auto val_s32 = static_cast(val); - return ((val_s32 == val_d64) && val_d64 >= std::numeric_limits::lowest() && val_d64 <= std::numeric_limits::max()); + return ((val_s32 == val_d64) && val_d64 >= std::numeric_limits::lowest() && + val_d64 <= std::numeric_limits::max()); } case DataType::BFLOAT16: return (val >= bfloat16::lowest() && val <= bfloat16::max()); @@ -482,7 +488,7 @@ inline std::string cpu_impl_dt(const DataType &data_type) { std::string ret = ""; - switch(data_type) + switch (data_type) { case DataType::F32: ret = "fp32"; @@ -521,5 +527,5 @@ inline std::string cpu_impl_dt(const DataType &data_type) return ret; } -} +} // namespace arm_compute #endif /*ARM_COMPUTE_CORE_UTILS_DATATYPEUTILS_H */ diff --git a/arm_compute/core/utils/FormatUtils.h b/arm_compute/core/utils/FormatUtils.h index afb0f78255cf806e2ee57ecec3d473906b392529..a8e96bd361841285fe944ac2b5647b48d0128c78 100644 --- a/arm_compute/core/utils/FormatUtils.h +++ b/arm_compute/core/utils/FormatUtils.h @@ -37,7 +37,7 @@ namespace arm_compute */ inline size_t pixel_size_from_format(Format format) { - switch(format) + switch (format) { case Format::U8: return 1; @@ -77,7 +77,7 @@ inline size_t pixel_size_from_format(Format format) */ inline int plane_idx_from_channel(Format format, Channel channel) { - switch(format) + switch (format) { // Single planar formats have a single plane case Format::U8: @@ -99,7 +99,7 @@ inline int plane_idx_from_channel(Format format, Channel channel) case Format::NV21: { // Channel U and V share the same plane of format UV88 - switch(channel) + switch (channel) { case Channel::Y: return 0; @@ -114,7 +114,7 @@ inline int plane_idx_from_channel(Format format, Channel channel) case Format::IYUV: case Format::YUV444: { - switch(channel) + switch (channel) { case Channel::Y: return 0; @@ -142,11 +142,11 @@ inline int plane_idx_from_channel(Format format, Channel channel) */ inline int channel_idx_from_format(Format format, Channel channel) { - switch(format) + switch (format) { case 
Format::RGB888: { - switch(channel) + switch (channel) { case Channel::R: return 0; @@ -161,7 +161,7 @@ inline int channel_idx_from_format(Format format, Channel channel) } case Format::RGBA8888: { - switch(channel) + switch (channel) { case Channel::R: return 0; @@ -178,7 +178,7 @@ inline int channel_idx_from_format(Format format, Channel channel) } case Format::YUYV422: { - switch(channel) + switch (channel) { case Channel::Y: return 0; @@ -193,7 +193,7 @@ inline int channel_idx_from_format(Format format, Channel channel) } case Format::UYVY422: { - switch(channel) + switch (channel) { case Channel::Y: return 1; @@ -208,7 +208,7 @@ inline int channel_idx_from_format(Format format, Channel channel) } case Format::NV12: { - switch(channel) + switch (channel) { case Channel::Y: return 0; @@ -223,7 +223,7 @@ inline int channel_idx_from_format(Format format, Channel channel) } case Format::NV21: { - switch(channel) + switch (channel) { case Channel::Y: return 0; @@ -239,7 +239,7 @@ inline int channel_idx_from_format(Format format, Channel channel) case Format::YUV444: case Format::IYUV: { - switch(channel) + switch (channel) { case Channel::Y: return 0; @@ -266,7 +266,7 @@ inline int channel_idx_from_format(Format format, Channel channel) */ inline size_t num_planes_from_format(Format format) { - switch(format) + switch (format) { case Format::U8: case Format::S16: @@ -301,7 +301,7 @@ inline size_t num_planes_from_format(Format format) */ inline size_t num_channels_from_format(Format format) { - switch(format) + switch (format) { case Format::U8: case Format::U16: @@ -340,5 +340,5 @@ inline size_t num_channels_from_format(Format format) * @return The string describing the format. */ const std::string &string_from_format(Format format); -} +} // namespace arm_compute #endif /*ARM_COMPUTE_CORE_UTILS_FORMATUTILS_H */ diff --git a/arm_compute/core/utils/InterpolationPolicyUtils.h b/arm_compute/core/utils/InterpolationPolicyUtils.h index 79f6e3aa5f14ac174d09ff63d1405cd006b74fea..8d4ae4321cbdcf2cfa652dba4e6920f896cd480a 100644 --- a/arm_compute/core/utils/InterpolationPolicyUtils.h +++ b/arm_compute/core/utils/InterpolationPolicyUtils.h @@ -37,5 +37,5 @@ namespace arm_compute * @return The string describing the interpolation policy. 
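// A minimal sketch of what the format/channel helpers above resolve to for a semi-planar NV12
// image (a Y plane followed by one interleaved UV88 plane); the expected values follow the
// switch cases in this header and are written as assertions for illustration only.
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/FormatUtils.h"

#include <cassert>

void nv12_layout_example()
{
    using namespace arm_compute;
    assert(num_planes_from_format(Format::NV12) == 2);              // Y plane + UV plane
    assert(plane_idx_from_channel(Format::NV12, Channel::Y) == 0);
    assert(plane_idx_from_channel(Format::NV12, Channel::U) == 1);  // U and V share the UV88 plane
    assert(channel_idx_from_format(Format::NV12, Channel::U) == 0); // U first within the plane...
    assert(channel_idx_from_format(Format::NV12, Channel::V) == 1); // ...V second (NV21 swaps them)
}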
*/ const std::string &string_from_interpolation_policy(InterpolationPolicy policy); -} +} // namespace arm_compute #endif /*ARM_COMPUTE_CORE_UTILS_INTERPOLATIONPOLICYUTILS_H */ diff --git a/arm_compute/core/utils/StringUtils.h b/arm_compute/core/utils/StringUtils.h index 41f29b0901686592ea8e817adb012d917ebda91b..c13cbaa334d906f5df68e4aef74528e8c23b303f 100644 --- a/arm_compute/core/utils/StringUtils.h +++ b/arm_compute/core/utils/StringUtils.h @@ -61,5 +61,5 @@ std::string float_to_string_with_full_precision(float val); * @return std::string */ std::string join(const std::vector strings, const std::string &sep); -} +} // namespace arm_compute #endif /*ARM_COMPUTE_CORE_UTILS_STRINGUTILS_H */ diff --git a/arm_compute/core/utils/helpers/AdjustVecSize.h b/arm_compute/core/utils/helpers/AdjustVecSize.h index bbb3048b84c14fdf99427338a30ba0da2a56988a..842e3b57d6394988a8c5c4924830b489304588dc 100644 --- a/arm_compute/core/utils/helpers/AdjustVecSize.h +++ b/arm_compute/core/utils/helpers/AdjustVecSize.h @@ -39,17 +39,17 @@ inline unsigned int adjust_vec_size(unsigned int vec_size, size_t dim0) { ARM_COMPUTE_ERROR_ON(vec_size > 16); - if((vec_size >= dim0) && (dim0 == 3)) + if ((vec_size >= dim0) && (dim0 == 3)) { return dim0; } - while(vec_size > dim0) + while (vec_size > dim0) { vec_size >>= 1; } return vec_size; } -} +} // namespace arm_compute #endif /*ARM_COMPUTE_UTILS_H */ diff --git a/arm_compute/core/utils/helpers/tensor_transform.h b/arm_compute/core/utils/helpers/tensor_transform.h index faa5b4433c4de620a0fe0ee5790d8fd540b1d4d5..7a61fa192a0dc05553a9c650368c35bb6abef5b7 100644 --- a/arm_compute/core/utils/helpers/tensor_transform.h +++ b/arm_compute/core/utils/helpers/tensor_transform.h @@ -52,7 +52,8 @@ int calculate_stride_on_index(int index, Coordinates strides); * * @return Absolute start position of a given index */ -int calculate_start_on_index(TensorShape input_shape, int index, Coordinates starts, Coordinates strides, int32_t begin_mask); +int calculate_start_on_index( + TensorShape input_shape, int index, Coordinates starts, Coordinates strides, int32_t begin_mask); /** Returns the absolute end position of a given index for a strided slice operation * @@ -68,8 +69,13 @@ int calculate_start_on_index(TensorShape input_shape, int index, Coordinates sta * * @return Absolute end position of a given index */ -int calculate_end_on_index(TensorShape input_shape, int index, int start_on_index, Coordinates ends, Coordinates strides, - int32_t end_mask = 0, int32_t shrink_axis_mask = 0); +int calculate_end_on_index(TensorShape input_shape, + int index, + int start_on_index, + Coordinates ends, + Coordinates strides, + int32_t end_mask = 0, + int32_t shrink_axis_mask = 0); /** Calculate start, end and stride coordinates for a strided slice * @@ -87,8 +93,12 @@ int calculate_end_on_index(TensorShape input_shape, int index, int start_on_inde * @return A tuple with */ std::tuple calculate_strided_slice_coords(TensorShape input_shape, - Coordinates starts, Coordinates ends, Coordinates strides, - int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0); + Coordinates starts, + Coordinates ends, + Coordinates strides, + int32_t begin_mask = 0, + int32_t end_mask = 0, + int32_t shrink_axis_mask = 0); /** Computes output shape of strided slice * @@ -109,9 +119,14 @@ std::tuple calculate_strided_slice_coords * * @return The output tensor shape */ -TensorShape compute_strided_slice_output_shape(TensorShape input_shape, Coordinates starts, Coordinates ends, Coordinates strides, - 
int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0, - bool return_unshrinked = false); +TensorShape compute_strided_slice_output_shape(TensorShape input_shape, + Coordinates starts, + Coordinates ends, + Coordinates strides, + int32_t begin_mask = 0, + int32_t end_mask = 0, + int32_t shrink_axis_mask = 0, + bool return_unshrinked = false); /** Constructs end mask in case we want to perform a slice operation using the strided slice interface * @@ -122,7 +137,7 @@ TensorShape compute_strided_slice_output_shape(TensorShape input_shape, Coordina * @return End mask */ int32_t construct_slice_end_mask(Coordinates ends); -} // namespace tensor_tranform +} // namespace tensor_transform } // namespace helpers } // namespace arm_compute #endif /* ARM_COMPUTE_UTILS_HELPERS_TENSOR_TRANSFORM_H */ diff --git a/arm_compute/core/utils/logging/FilePrinter.h b/arm_compute/core/utils/logging/FilePrinter.h index 0e5b84f0845bbcbca914081ab084166c9a97b6f2..a865aadddb3087de3d26d97498962b6f7d72198b 100644 --- a/arm_compute/core/utils/logging/FilePrinter.h +++ b/arm_compute/core/utils/logging/FilePrinter.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_LOGGING_FILE_PRINTER_H #define ARM_COMPUTE_LOGGING_FILE_PRINTER_H -#include "arm_compute/core/utils/logging/IPrinter.h" - #include "arm_compute/core/utils/io/FileHandler.h" +#include "arm_compute/core/utils/logging/IPrinter.h" namespace arm_compute { diff --git a/arm_compute/core/utils/logging/Helpers.h b/arm_compute/core/utils/logging/Helpers.h index 5f8b94859288512d4ed73ffa6674eb0722f58f7a..c3c2f0f0b81586a0e6af3edb36a250d60f22f7a1 100644 --- a/arm_compute/core/utils/logging/Helpers.h +++ b/arm_compute/core/utils/logging/Helpers.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_LOGGING_HELPERS_H #include "arm_compute/core/utils/logging/Types.h" + #include "support/ToolchainSupport.h" #include @@ -45,7 +46,7 @@ namespace logging * @return The formatted string */ template -inline std::string string_with_format(const std::string &fmt, Ts &&... args) +inline std::string string_with_format(const std::string &fmt, Ts &&...args) { size_t size = support::cpp11::snprintf(nullptr, 0, fmt.c_str(), args...) 
+ 1; auto char_str = std::make_unique(size); diff --git a/arm_compute/core/utils/logging/IPrinter.h b/arm_compute/core/utils/logging/IPrinter.h index 42dca58ea1b727253930f04d3c15a8de4793566f..7fde4d93020ae2168807d1e2131948cd3270c110 100644 --- a/arm_compute/core/utils/logging/IPrinter.h +++ b/arm_compute/core/utils/logging/IPrinter.h @@ -35,8 +35,7 @@ class Printer { public: /** Default Constructor */ - Printer() noexcept - : _mtx() + Printer() noexcept : _mtx() { } /** Prevent instances of this class from being copied */ diff --git a/arm_compute/core/utils/logging/LogMsgDecorators.h b/arm_compute/core/utils/logging/LogMsgDecorators.h index 9c9e62740f58961eea596335e8166121d31bf084..66a8180e2192db76c4db3a2baa0faf92b44af4bb 100644 --- a/arm_compute/core/utils/logging/LogMsgDecorators.h +++ b/arm_compute/core/utils/logging/LogMsgDecorators.h @@ -63,8 +63,7 @@ public: * * @param str Sting to append */ - StringDecorator(const std::string &str) - : _str(str) + StringDecorator(const std::string &str) : _str(str) { _str = angle_wrap_value(str); } @@ -103,7 +102,7 @@ private: auto time = std::chrono::system_clock::to_time_t(now); // TODO: use put_time for gcc > 4.9 - char buf[100] = { 0 }; + char buf[100] = {0}; std::strftime(buf, sizeof(buf), "%d-%m-%Y %I:%M:%S", std::localtime(&time)); return buf; } diff --git a/arm_compute/core/utils/logging/Logger.h b/arm_compute/core/utils/logging/Logger.h index 4fc9bb7dbf0c5948614b6a86b1c3f3c21718aefb..608db3913807e6df1fb3150ffbec8820c1942b71 100644 --- a/arm_compute/core/utils/logging/Logger.h +++ b/arm_compute/core/utils/logging/Logger.h @@ -88,7 +88,7 @@ public: * @param[in] args Message arguments */ template - void log(LogLevel log_level, const std::string &fmt, Ts &&... args); + void log(LogLevel log_level, const std::string &fmt, Ts &&...args); /** Sets log level of the logger * * @warning Not thread-safe @@ -159,11 +159,11 @@ private: }; template -inline void Logger::log(LogLevel log_level, const std::string &fmt, Ts &&... args) +inline void Logger::log(LogLevel log_level, const std::string &fmt, Ts &&...args) { // Return if message shouldn't be logged // i.e. if log level does not match the logger's - if(!is_loggable(log_level)) + if (!is_loggable(log_level)) { return; } diff --git a/arm_compute/core/utils/logging/LoggerRegistry.h b/arm_compute/core/utils/logging/LoggerRegistry.h index 7c9931a26032b03845c4829a32e4afbe56e64dc4..4e52a10935cb722940cc25b2d015352969a73957 100644 --- a/arm_compute/core/utils/logging/LoggerRegistry.h +++ b/arm_compute/core/utils/logging/LoggerRegistry.h @@ -27,6 +27,7 @@ #include "arm_compute/core/utils/logging/Logger.h" #include "arm_compute/core/utils/logging/Printers.h" #include "arm_compute/core/utils/logging/Types.h" + #include "support/Mutex.h" #include @@ -54,8 +55,9 @@ public: * @param[in] log_level Logger's log level. Defaults to INFO * @param[in] printers Printers to attach to the system loggers. Defaults with a @ref StdPrinter. */ - void create_logger(const std::string &name, LogLevel log_level = LogLevel::INFO, - const std::vector> &printers = { std::make_shared() }); + void create_logger(const std::string &name, + LogLevel log_level = LogLevel::INFO, + const std::vector> &printers = {std::make_shared()}); /** Remove a logger * * @param name Logger's name @@ -74,16 +76,17 @@ public: * @param[in] printers (Optional) Printers to attach to the system loggers. Defaults with a @ref StdPrinter. 
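// A minimal sketch of using the logging registry directly, mirroring what the ARM_COMPUTE_LOG_*
// macros in Macros.h expand to (the macros are additionally guarded by
// ARM_COMPUTE_LOGGING_ENABLED). The logger name and message below are placeholders.
#include "arm_compute/core/utils/logging/LoggerRegistry.h"

void logging_example()
{
    using namespace arm_compute::logging;
    // Registers a logger backed by the default StdPrinter at INFO level.
    LoggerRegistry::get().create_logger("EXAMPLE", LogLevel::INFO);
    auto logger = LoggerRegistry::get().logger("EXAMPLE");
    if (logger != nullptr)
    {
        // Printf-style overload declared above; formatted internally via snprintf.
        logger->log(LogLevel::INFO, "configured %d kernels in %.2f ms", 3, 0.25f);
    }
}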
*/ void create_reserved_loggers(LogLevel log_level = LogLevel::INFO, - const std::vector> &printers = { std::make_shared() }); + const std::vector> &printers = { + std::make_shared()}); private: /** Default constructor */ LoggerRegistry(); private: - arm_compute::Mutex _mtx; + arm_compute::Mutex _mtx; std::unordered_map> _loggers; - static std::set _reserved_loggers; + static std::set _reserved_loggers; }; } // namespace logging } // namespace arm_compute diff --git a/arm_compute/core/utils/logging/Macros.h b/arm_compute/core/utils/logging/Macros.h index 0ab17c4464b644abd8a03561d3958643a0a50542..4d5aa5fe2c5f28f547e97c84942ae74b88336e7e 100644 --- a/arm_compute/core/utils/logging/Macros.h +++ b/arm_compute/core/utils/logging/Macros.h @@ -48,48 +48,48 @@ inline std::string signature_name(const std::string &pretty_func) do \ { \ auto __logger = arm_compute::logging::LoggerRegistry::get().logger(logger_name); \ - if(__logger != nullptr) \ + if (__logger != nullptr) \ { \ __logger->log(log_level, msg); \ } \ - } while(false) + } while (false) #define ARM_COMPUTE_LOG_MSG_WITH_FUNCNAME(logger_name, log_level, msg) \ do \ { \ auto __logger = arm_compute::logging::LoggerRegistry::get().logger(logger_name); \ - if(__logger != nullptr) \ + if (__logger != nullptr) \ { \ std::ostringstream s; \ s << ARM_COMPUTE_SIGNATURE_NAME << " : " << msg; \ __logger->log(log_level, s.str()); \ } \ - } while(false) + } while (false) #define ARM_COMPUTE_LOG_MSG_WITH_FORMAT(logger_name, log_level, fmt, ...) \ do \ { \ auto __logger = arm_compute::logging::LoggerRegistry::get().logger(logger_name); \ - if(__logger != nullptr) \ + if (__logger != nullptr) \ { \ size_t size = ::snprintf(nullptr, 0, fmt, __VA_ARGS__) + 1; \ auto char_str = std::make_unique(size); \ ::snprintf(char_str.get(), size, fmt, __VA_ARGS__); \ __logger->log(log_level, std::string(char_str.get(), char_str.get() + size - 1)); \ } \ - } while(false) + } while (false) #define ARM_COMPUTE_LOG_STREAM(logger_name, log_level, stream) \ do \ { \ auto __logger = arm_compute::logging::LoggerRegistry::get().logger(logger_name); \ - if(__logger != nullptr) \ + if (__logger != nullptr) \ { \ std::ostringstream s; \ s << stream; \ __logger->log(log_level, s.str()); \ } \ - } while(false) + } while (false) #else /* ARM_COMPUTE_LOGGING_ENABLED */ diff --git a/arm_compute/core/utils/logging/Types.h b/arm_compute/core/utils/logging/Types.h index f0ddae6c849e854f391faff1ff41668943986233..64c567b9845786489e2c4b63226784d5f68e7bfb 100644 --- a/arm_compute/core/utils/logging/Types.h +++ b/arm_compute/core/utils/logging/Types.h @@ -44,8 +44,7 @@ enum class LogLevel struct LogMsg { /** Default constructor */ - LogMsg() - : raw_(), log_level_(LogLevel::OFF) + LogMsg() : raw_(), log_level_(LogLevel::OFF) { } /** Construct a log message @@ -53,8 +52,7 @@ struct LogMsg * @param[in] msg Message to log. * @param[in] log_level Logging level. 
Default: OFF */ - LogMsg(std::string msg, LogLevel log_level = LogLevel::OFF) - : raw_(msg), log_level_(log_level) + LogMsg(std::string msg, LogLevel log_level = LogLevel::OFF) : raw_(msg), log_level_(log_level) { } diff --git a/arm_compute/core/utils/math/Math.h b/arm_compute/core/utils/math/Math.h index c1dce7ff087a01a9abf6ad6aca2baa291956a030..e70337ba0f8ae63bdfdf6d95dba699d2527c66cb 100644 --- a/arm_compute/core/utils/math/Math.h +++ b/arm_compute/core/utils/math/Math.h @@ -67,5 +67,5 @@ inline auto floor_to_multiple(S value, T divisor) -> decltype((value / divisor) return (value / divisor) * divisor; } -} +} // namespace arm_compute #endif /*ARM_COMPUTE_UTILS_MATH_H */ diff --git a/arm_compute/core/utils/math/SafeOps.h b/arm_compute/core/utils/math/SafeOps.h index dc928a0e5d4fda75494e48d5553d1d2ff6055aae..ef8bcf7e14ba41a0b3347a86d26cc43510d70766 100644 --- a/arm_compute/core/utils/math/SafeOps.h +++ b/arm_compute/core/utils/math/SafeOps.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_UTILS_MATH_SAFE_OPS #include "arm_compute/core/Error.h" + #include "support/AclRequires.h" #include @@ -51,11 +52,11 @@ T safe_integer_add(T val_a, T val_b) { T result = 0; - if((val_b > 0) && (val_a > std::numeric_limits::max() - val_b)) + if ((val_b > 0) && (val_a > std::numeric_limits::max() - val_b)) { result = std::numeric_limits::max(); } - else if((val_b < 0) && (val_a < std::numeric_limits::min() - val_b)) + else if ((val_b < 0) && (val_a < std::numeric_limits::min() - val_b)) { result = std::numeric_limits::min(); } @@ -83,11 +84,11 @@ T safe_integer_sub(T val_a, T val_b) { T result = 0; - if((val_b < 0) && (val_a > std::numeric_limits::max() + val_b)) + if ((val_b < 0) && (val_a > std::numeric_limits::max() + val_b)) { result = std::numeric_limits::max(); } - else if((val_b > 0) && (val_a < std::numeric_limits::min() + val_b)) + else if ((val_b > 0) && (val_a < std::numeric_limits::min() + val_b)) { result = std::numeric_limits::min(); } @@ -115,13 +116,13 @@ T safe_integer_mul(T val_a, T val_b) { T result = 0; - if(val_a > 0) + if (val_a > 0) { - if((val_b > 0) && (val_a > (std::numeric_limits::max() / val_b))) + if ((val_b > 0) && (val_a > (std::numeric_limits::max() / val_b))) { result = std::numeric_limits::max(); } - else if(val_b < (std::numeric_limits::min() / val_a)) + else if (val_b < (std::numeric_limits::min() / val_a)) { result = std::numeric_limits::min(); } @@ -132,11 +133,11 @@ T safe_integer_mul(T val_a, T val_b) } else { - if((val_b > 0) && (val_a < (std::numeric_limits::min() / val_b))) + if ((val_b > 0) && (val_a < (std::numeric_limits::min() / val_b))) { result = std::numeric_limits::max(); } - else if((val_a != 0) && (val_b < (std::numeric_limits::max() / val_a))) + else if ((val_a != 0) && (val_b < (std::numeric_limits::max() / val_a))) { result = std::numeric_limits::min(); } @@ -165,7 +166,7 @@ T safe_integer_div(T val_a, T val_b) { T result = 0; - if((val_b == 0) || ((val_a == std::numeric_limits::min()) && (val_b == -1))) + if ((val_b == 0) || ((val_a == std::numeric_limits::min()) && (val_b == -1))) { result = std::numeric_limits::min(); } @@ -176,7 +177,7 @@ T safe_integer_div(T val_a, T val_b) return result; } -} // namespace cast +} // namespace math } // namespace utils } // namespace arm_compute #endif /* ARM_COMPUTE_UTILS_MATH_SAFE_OPS */ diff --git a/arm_compute/core/utils/misc/InfoHelpers.h b/arm_compute/core/utils/misc/InfoHelpers.h index ced0d24b566e6beb5f117351d4ce9d5bf2de970a..1d1b4ea8d783ac7e0d5d104f478c3eb385a9b7d8 100644 --- 
a/arm_compute/core/utils/misc/InfoHelpers.h +++ b/arm_compute/core/utils/misc/InfoHelpers.h @@ -53,10 +53,12 @@ inline bool is_relu(ActivationLayerInfo activation_info) */ inline bool is_relu6(ActivationLayerInfo activation_info) { - const bool is_lu_bounded_relu = activation_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - && activation_info.a() == 6.f && activation_info.b() == 0.f; - const bool is_bounded_relu = activation_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU - && activation_info.a() == 6.f; + const bool is_lu_bounded_relu = + activation_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU && + activation_info.a() == 6.f && activation_info.b() == 0.f; + const bool is_bounded_relu = + activation_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && + activation_info.a() == 6.f; return activation_info.enabled() && (is_lu_bounded_relu || is_bounded_relu); } @@ -68,34 +70,37 @@ inline bool is_relu6(ActivationLayerInfo activation_info) * */ template -inline void build_lstm_params_tensor_info(const LSTMParams &lstm_params, - LSTMParams *lstm_params_info) +inline void build_lstm_params_tensor_info(const LSTMParams &lstm_params, LSTMParams *lstm_params_info) { - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_ERROR_ON_NULLPTR(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); - lstm_params_info->set_peephole_params(lstm_params.cell_to_forget_weights()->info(), lstm_params.cell_to_output_weights()->info()); + lstm_params_info->set_peephole_params(lstm_params.cell_to_forget_weights()->info(), + lstm_params.cell_to_output_weights()->info()); } - if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { ARM_COMPUTE_ERROR_ON_NULLPTR(lstm_params.projection_weights()); - lstm_params_info->set_projection_params(lstm_params.projection_weights()->info(), - lstm_params.projection_bias() != nullptr ? lstm_params.projection_bias()->info() : nullptr); + lstm_params_info->set_projection_params( + lstm_params.projection_weights()->info(), + lstm_params.projection_bias() != nullptr ? lstm_params.projection_bias()->info() : nullptr); } - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { - ARM_COMPUTE_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); + ARM_COMPUTE_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), + lstm_params.input_gate_bias()); - ITensorInfo *cell_to_input_weights_info = (lstm_params.has_peephole_opt()) ? lstm_params.cell_to_input_weights()->info() : nullptr; - lstm_params_info->set_cifg_params(lstm_params.input_to_input_weights()->info(), lstm_params.recurrent_to_input_weights()->info(), - cell_to_input_weights_info, lstm_params.input_gate_bias()->info()); + ITensorInfo *cell_to_input_weights_info = + (lstm_params.has_peephole_opt()) ? 
lstm_params.cell_to_input_weights()->info() : nullptr; + lstm_params_info->set_cifg_params(lstm_params.input_to_input_weights()->info(), + lstm_params.recurrent_to_input_weights()->info(), cell_to_input_weights_info, + lstm_params.input_gate_bias()->info()); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { - ARM_COMPUTE_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), - lstm_params.output_layer_norm_weights(), + ARM_COMPUTE_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), lstm_params.output_layer_norm_weights(), lstm_params.cell_layer_norm_weights()); - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { ARM_COMPUTE_ERROR_ON_NULLPTR(lstm_params.input_layer_norm_weights()); } @@ -103,15 +108,14 @@ inline void build_lstm_params_tensor_info(const LSTMParams &lstm_params, ITensorInfo *forget_info = lstm_params.forget_layer_norm_weights()->info(); ITensorInfo *cell_info = lstm_params.cell_layer_norm_weights()->info(); ITensorInfo *output_info = lstm_params.output_layer_norm_weights()->info(); - ITensorInfo *input_info = lstm_params.has_cifg_opt() ? nullptr : lstm_params.input_layer_norm_weights()->info(); + ITensorInfo *input_info = lstm_params.has_cifg_opt() ? nullptr : lstm_params.input_layer_norm_weights()->info(); lstm_params_info->set_layer_normalization_params(input_info, forget_info, cell_info, output_info); } - lstm_params_info->set_matmul_scale_params(lstm_params.input_intermediate_scale(), - lstm_params.forget_intermediate_scale(), - lstm_params.cell_intermediate_scale(), - lstm_params.output_intermediate_scale()); + lstm_params_info->set_matmul_scale_params( + lstm_params.input_intermediate_scale(), lstm_params.forget_intermediate_scale(), + lstm_params.cell_intermediate_scale(), lstm_params.output_intermediate_scale()); lstm_params_info->set_hidden_state_params(lstm_params.hidden_state_zero(), lstm_params.hidden_state_scale()); } diff --git a/arm_compute/core/utils/misc/Macros.h b/arm_compute/core/utils/misc/Macros.h index de66b6a52fde1b4c8074affa2cbbf55f31797dc0..fa861fa4423be5c227b5a54e305045694278815d 100644 --- a/arm_compute/core/utils/misc/Macros.h +++ b/arm_compute/core/utils/misc/Macros.h @@ -26,15 +26,16 @@ #if defined(__cplusplus) && (__cplusplus >= 201402L) -#define ARM_COMPUTE_DEPRECATED [[deprecated]] -#define ARM_COMPUTE_DEPRECATED_REL(rel) [[deprecated("Deprecated in : " #rel)]] +#define ARM_COMPUTE_DEPRECATED [[deprecated]] +#define ARM_COMPUTE_DEPRECATED_REL(rel) [[deprecated("Deprecated in : " #rel)]] #define ARM_COMPUTE_DEPRECATED_REL_REPLACE(rel, replace) [[deprecated("Deprecated in : " #rel " - Use : " #replace)]] #elif defined(__GNUC__) || defined(__clang__) -#define ARM_COMPUTE_DEPRECATED __attribute__((deprecated)) +#define ARM_COMPUTE_DEPRECATED __attribute__((deprecated)) #define ARM_COMPUTE_DEPRECATED_REL(rel) __attribute__((deprecated("Deprecated in : " #rel))) -#define ARM_COMPUTE_DEPRECATED_REL_REPLACE(rel, replace) __attribute__((deprecated("Deprecated in : " #rel " - Use : " #replace))) +#define ARM_COMPUTE_DEPRECATED_REL_REPLACE(rel, replace) \ + __attribute__((deprecated("Deprecated in : " #rel " - Use : " #replace))) #else // defined(__cplusplus) && (__cplusplus >= 201402L) diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h index 4c2037ab8d2be7b83735bf4a49196ef9c2f67bf9..86dcfdc3d05a5cf47a2013b6c77a903381202028 100644 --- a/arm_compute/core/utils/misc/ShapeCalculator.h +++ b/arm_compute/core/utils/misc/ShapeCalculator.h @@ -21,18 
+21,17 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ACL_ARM_COMPUTE_CORE_UTILS_MISC_SHAPECALCULATOR -#define ACL_ARM_COMPUTE_CORE_UTILS_MISC_SHAPECALCULATOR +#ifndef ACL_ARM_COMPUTE_CORE_UTILS_MISC_SHAPECALCULATOR_H +#define ACL_ARM_COMPUTE_CORE_UTILS_MISC_SHAPECALCULATOR_H #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/helpers/tensor_transform.h" #include "arm_compute/function_info/ConvolutionInfo.h" #include "arm_compute/runtime/FunctionDescriptors.h" -#include "arm_compute/core/utils/helpers/tensor_transform.h" - #include namespace arm_compute @@ -57,12 +56,12 @@ inline TensorShape calculate_reduce_mean_shape(ITensorInfo *input, const Coordin convert_negative_axis(axis_local, input_dims); TensorShape out_shape = input->tensor_shape(); // Configure reshape layer if we want to drop the dimensions - if(!keep_dims) + if (!keep_dims) { // We have to sort the reduction axis vectors in order for remove_dimension // to work properly std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); - for(int i = 0; i < reduction_ops; ++i) + for (int i = 0; i < reduction_ops; ++i) { out_shape.remove_dimension(axis_local[i] - i, false); } @@ -70,7 +69,7 @@ inline TensorShape calculate_reduce_mean_shape(ITensorInfo *input, const Coordin } else { - for(int i = 0; i < reduction_ops; ++i) + for (int i = 0; i < reduction_ops; ++i) { out_shape.set(axis_local[i], 1); } @@ -86,7 +85,10 @@ inline TensorShape calculate_reduce_mean_shape(ITensorInfo *input, const Coordin * * @return the calculated shape */ -inline TensorShape compute_vector_to_tensor_output_shape(const TensorShape &input, size_t conv_w, size_t conv_h, const DataLayout &data_layout) +inline TensorShape compute_vector_to_tensor_output_shape(const TensorShape &input, + size_t conv_w, + size_t conv_h, + const DataLayout &data_layout) { const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); @@ -128,10 +130,12 @@ inline TensorShape compute_reorg_output_shape(const ITensorInfo &input, int32_t const size_t idx_channel = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_ERROR_ON(stride <= 0); - ARM_COMPUTE_ERROR_ON_MSG((input.tensor_shape()[idx_width] % stride != 0), "The width of the input tensor must be a multiple of stride"); - ARM_COMPUTE_ERROR_ON_MSG((input.tensor_shape()[idx_height] % stride != 0), "The height of the input tensor must be a multiple of stride"); + ARM_COMPUTE_ERROR_ON_MSG((input.tensor_shape()[idx_width] % stride != 0), + "The width of the input tensor must be a multiple of stride"); + ARM_COMPUTE_ERROR_ON_MSG((input.tensor_shape()[idx_height] % stride != 0), + "The height of the input tensor must be a multiple of stride"); - TensorShape output_shape{ input.tensor_shape() }; + TensorShape output_shape{input.tensor_shape()}; output_shape.set(idx_width, output_shape[idx_width] / stride); output_shape.set(idx_height, output_shape[idx_height] / stride); @@ -148,7 +152,8 @@ inline TensorShape compute_reorg_output_shape(const ITensorInfo &input, int32_t * * @return the calculated shape of the reshaped weights */ -inline TensorShape compute_weights_reshaped_shape(const ITensorInfo &weights, bool has_bias = false, unsigned int num_groups = 1) 
+inline TensorShape +compute_weights_reshaped_shape(const ITensorInfo &weights, bool has_bias = false, unsigned int num_groups = 1) { // Number of groups greater than one are only supported for NCHW data layout, and the number of weights must be a multiple of it. ARM_COMPUTE_ERROR_ON(num_groups == 0); @@ -156,14 +161,14 @@ inline TensorShape compute_weights_reshaped_shape(const ITensorInfo &weights, bo ARM_COMPUTE_ERROR_ON((weights.dimension(3) % num_groups) != 0); // Calculate output shape - TensorShape weights_reshaped{ weights.tensor_shape() }; + TensorShape weights_reshaped{weights.tensor_shape()}; weights_reshaped.set(3, weights_reshaped[3] / num_groups); weights_reshaped.collapse(3); const size_t tmp_dim = weights_reshaped[0]; weights_reshaped.set(0, weights_reshaped[1]); weights_reshaped.set(1, tmp_dim + (has_bias ? 1 : 0)); - if(weights.num_dimensions() < 5) + if (weights.num_dimensions() < 5) { weights_reshaped.set(2, num_groups); } @@ -179,7 +184,9 @@ inline TensorShape compute_weights_reshaped_shape(const ITensorInfo &weights, bo * * @return the calculated shape */ -inline TensorShape compute_lhs_reshaped_shape(const ITensorInfo &a, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d = false) +inline TensorShape compute_lhs_reshaped_shape(const ITensorInfo &a, + const GEMMLHSMatrixInfo &lhs_info, + bool reinterpret_input_as_3d = false) { ARM_COMPUTE_ERROR_ON(lhs_info.m0 == 0); ARM_COMPUTE_ERROR_ON(lhs_info.k0 == 0); @@ -200,11 +207,11 @@ inline TensorShape compute_lhs_reshaped_shape(const ITensorInfo &a, const GEMMLH const unsigned int output_width = block_size * num_horiz_blocks * lhs_info.v0; const unsigned int output_height = std::ceil(num_vert_blocks / static_cast(lhs_info.v0)); - TensorShape lhs_shape{ a.tensor_shape() }; + TensorShape lhs_shape{a.tensor_shape()}; lhs_shape.set(0, output_width); lhs_shape.set(1, output_height); - if((reinterpret_input_as_3d) && (lhs_shape.num_dimensions() > 2)) + if ((reinterpret_input_as_3d) && (lhs_shape.num_dimensions() > 2)) { // When the data format is NHWC and the shapes are Nx1x1 // the tensor shape num_dimensions is automatically set to 1 instead of 3. 
@@ -244,7 +251,7 @@ inline TensorShape compute_rhs_reshaped_shape(const ITensorInfo &a, const GEMMRH const unsigned int output_width = block_size * num_vert_blocks * rhs_info.h0; const unsigned int output_height = std::ceil(num_horiz_blocks / static_cast(rhs_info.h0)); - TensorShape rhs_shape{ a.tensor_shape() }; + TensorShape rhs_shape{a.tensor_shape()}; rhs_shape.set(0, output_width); rhs_shape.set(1, output_height); @@ -259,14 +266,15 @@ inline TensorShape compute_rhs_reshaped_shape(const ITensorInfo &a, const GEMMRH * * @return the calculated shape */ -inline TensorShape compute_interleaved_shape(const ITensorInfo &a, int mult_interleave4x4_height = 1, bool reinterpret_input_as_3d = false) +inline TensorShape +compute_interleaved_shape(const ITensorInfo &a, int mult_interleave4x4_height = 1, bool reinterpret_input_as_3d = false) { // The interleaved output matrix will have the following shape: [ a_height * W, ceil(a_width / W) ] where W = 4 * mult_interleave4x4_height ARM_COMPUTE_ERROR_ON(mult_interleave4x4_height < 1); const int interleave_width = 4 * mult_interleave4x4_height; - TensorShape shape_interleaved_a{ a.tensor_shape() }; + TensorShape shape_interleaved_a{a.tensor_shape()}; shape_interleaved_a.set(0, a.dimension(0) * interleave_width); - if(reinterpret_input_as_3d) + if (reinterpret_input_as_3d) { const int M = a.dimension(1) * a.dimension(2); const int height = std::ceil(M / static_cast(interleave_width)); @@ -276,7 +284,7 @@ inline TensorShape compute_interleaved_shape(const ITensorInfo &a, int mult_inte // the tensor shape num_dimensions is automatically set to 1 instead of 3. // To avoid failures by removing a dimension that doesn't exist // check if the number of dimensions is greater than 2. - if(shape_interleaved_a.num_dimensions() > 2) + if (shape_interleaved_a.num_dimensions() > 2) { shape_interleaved_a.remove_dimension(2); } @@ -298,7 +306,7 @@ inline TensorShape compute_interleaved_shape(const ITensorInfo &a, int mult_inte inline TensorShape compute_transpose1xW_shape(const ITensorInfo &b) { // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] - TensorShape shape_transposed1xW_b{ b.tensor_shape() }; + TensorShape shape_transposed1xW_b{b.tensor_shape()}; shape_transposed1xW_b.set(0, b.dimension(1) * 16); shape_transposed1xW_b.set(1, std::ceil(b.dimension(0) / 16.f)); @@ -318,7 +326,7 @@ inline TensorShape compute_transpose1xW_with_element_size_shape(const ITensorInf // The transpose1xW output matrix will have the following shape: // [ b_height * W, ceil(b_width / W) ] where W = (16 / element size of the tensor) * mult_transpose1xW_width ARM_COMPUTE_ERROR_ON(mult_transpose1xW_width < 1); - TensorShape shape_transposed1xW_b{ b.tensor_shape() }; + TensorShape shape_transposed1xW_b{b.tensor_shape()}; const size_t transpose_width = (16 / b.element_size()) * mult_transpose1xW_width; shape_transposed1xW_b.set(0, b.dimension(1) * transpose_width); shape_transposed1xW_b.set(1, static_cast(std::ceil(b.dimension(0) / static_cast(transpose_width)))); @@ -334,8 +342,8 @@ inline TensorShape compute_transpose1xW_with_element_size_shape(const ITensorInf */ inline TensorShape compute_reductionA_shape(const ITensorInfo &b) { - TensorShape shape_vector_sum_col{ b.tensor_shape() }; - if(shape_vector_sum_col.num_dimensions() > 1) + TensorShape shape_vector_sum_col{b.tensor_shape()}; + if (shape_vector_sum_col.num_dimensions() > 1) { shape_vector_sum_col.remove_dimension(1); } @@ -351,9 +359,9 @@ inline TensorShape 
compute_reductionA_shape(const ITensorInfo &b) */ inline TensorShape compute_reductionB_shape(const ITensorInfo &a) { - TensorShape shape_vector_sum_row{ a.tensor_shape() }; + TensorShape shape_vector_sum_row{a.tensor_shape()}; shape_vector_sum_row.set(Window::DimX, a.dimension(1)); - if(shape_vector_sum_row.num_dimensions() > 1) + if (shape_vector_sum_row.num_dimensions() > 1) { shape_vector_sum_row.remove_dimension(1); } @@ -370,7 +378,10 @@ inline TensorShape compute_reductionB_shape(const ITensorInfo &a) * * @return the calculated shape */ -inline TensorShape compute_col2im_shape(const ITensorInfo &input, const Size2D &convolved_dims, bool batch_size_on_z, unsigned int num_groups = 1) +inline TensorShape compute_col2im_shape(const ITensorInfo &input, + const Size2D &convolved_dims, + bool batch_size_on_z, + unsigned int num_groups = 1) { ARM_COMPUTE_ERROR_ON(num_groups == 0); ARM_COMPUTE_ERROR_ON(input.tensor_shape()[1] != (convolved_dims.area())); @@ -381,10 +392,10 @@ inline TensorShape compute_col2im_shape(const ITensorInfo &input, const Size2D & const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - TensorShape col2im_shape{ input.tensor_shape() }; + TensorShape col2im_shape{input.tensor_shape()}; // If batches start on 3rd dimension shift dimensions right by 1 to retain upper tensor shape, // as first three will be override by H,W,C data - if(batch_size_on_z && num_groups == 1) + if (batch_size_on_z && num_groups == 1) { col2im_shape.shift_right(1); } @@ -403,7 +414,7 @@ inline TensorShape compute_col2im_shape(const ITensorInfo &input, const Size2D & */ inline TensorShape compute_transposed_shape(const ITensorInfo &input) { - TensorShape shape_transposed{ input.tensor_shape() }; + TensorShape shape_transposed{input.tensor_shape()}; shape_transposed.set(0, input.dimension(1), false); shape_transposed.set(1, input.dimension(0), false); @@ -419,10 +430,11 @@ inline TensorShape compute_transposed_shape(const ITensorInfo &input) * * @return the calculated shape */ -inline TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, const ConvolutionInfo &info) +inline TensorShape +compute_depthwise_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, const ConvolutionInfo &info) { - const TensorShape input_shape{ input.tensor_shape() }; - const TensorShape weights_shape{ weights.tensor_shape() }; + const TensorShape input_shape{input.tensor_shape()}; + const TensorShape weights_shape{weights.tensor_shape()}; const DataLayout data_layout = input.data_layout(); const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); @@ -430,16 +442,16 @@ inline TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input, const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); const DataLayout weights_data_layout = weights.data_layout(); - const int weights_width_idx = get_data_layout_dimension_index(weights_data_layout, DataLayoutDimension::WIDTH); - const int weights_height_idx = get_data_layout_dimension_index(weights_data_layout, DataLayoutDimension::HEIGHT); + const int weights_width_idx = get_data_layout_dimension_index(weights_data_layout, DataLayoutDimension::WIDTH); + const int weights_height_idx = get_data_layout_dimension_index(weights_data_layout, DataLayoutDimension::HEIGHT); unsigned int 
output_width = 0; unsigned int output_height = 0; - std::tie(output_width, output_height) = scaled_dimensions(input_shape[width_idx], input_shape[height_idx], - weights_shape[weights_width_idx], weights_shape[weights_height_idx], - info.pad_stride_info, info.dilation); + std::tie(output_width, output_height) = + scaled_dimensions(input_shape[width_idx], input_shape[height_idx], weights_shape[weights_width_idx], + weights_shape[weights_height_idx], info.pad_stride_info, info.dilation); - TensorShape output_shape{ input_shape }; + TensorShape output_shape{input_shape}; output_shape.set(width_idx, output_width); output_shape.set(height_idx, output_height); output_shape.set(channel_idx, input_shape[channel_idx] * info.depth_multiplier); @@ -447,6 +459,37 @@ inline TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input, return output_shape; } +/** Calculate padding required for deconvolution + * + * @param[in] input Input tensor info + * @param[in] weights Weights tensor shape + * @param[in] sx Stride on x axis + * @param[in] sy Stride on y axis + * @param[in] out_dims Output shape dimensions + * + * @return the padding required + */ +inline std::pair compute_deconvolution_padding(const ITensorInfo &input, + const ITensorInfo &weights, + int32_t sx, + int32_t sy, + std::pair out_dims) +{ + const DataLayout data_layout = input.data_layout(); + const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + // Find the upsampled dimensions + int32_t out_x = (static_cast(input.dimension(idx_w)) - 1) * sx + 1; + int32_t out_y = (static_cast(input.dimension(idx_h)) - 1) * sy + 1; + + // Find the padding needed for the convolution with stride 1 in order to match output shape + int32_t padx = out_dims.first - (out_x - static_cast(weights.dimension(idx_w)) + 1); + int32_t pady = out_dims.second - (out_y - static_cast(weights.dimension(idx_h)) + 1); + + return std::make_pair(padx, pady); +} + /** Calculate the upsampled output shape used for deconvolution * * @param[in] input Input tensor info @@ -459,20 +502,28 @@ inline TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input, * * @return the calculated shape */ -inline TensorShape compute_deconvolution_upsampled_shape(const ITensorInfo &input, const ITensorInfo &weights, unsigned int sx, unsigned int sy, - std::pair &out_dims, uint32_t &padx, uint32_t &pady) +inline TensorShape compute_deconvolution_upsampled_shape(const ITensorInfo &input, + const ITensorInfo &weights, + unsigned int sx, + unsigned int sy, + std::pair &out_dims, + uint32_t &padx, + uint32_t &pady) { + // Find the padding needed for the convolution with stride 1 in order to match output shape + const auto padxy = + compute_deconvolution_padding(input, weights, static_cast(sx), static_cast(sy), out_dims); + padx = static_cast(padxy.first); + pady = static_cast(padxy.second); + const DataLayout data_layout = input.data_layout(); const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); // Find the upsampled dimensions - unsigned int out_x = (input.dimension(idx_w) - 1) * sx + 1; - unsigned int out_y = (input.dimension(idx_h) - 1) * sy + 1; + uint32_t out_x = (input.dimension(idx_w) - 1) * sx + 1; + uint32_t out_y = (input.dimension(idx_h) - 1) * sy + 1; - // Find the padding needed for the 
convolution with stride 1 in order to match output shape - padx = out_dims.first - (out_x - weights.dimension(idx_w) + 1); - pady = out_dims.second - (out_y - weights.dimension(idx_h) + 1); out_x += padx; out_y += pady; @@ -491,10 +542,12 @@ inline TensorShape compute_deconvolution_upsampled_shape(const ITensorInfo &inpu * * @return the calculated shape */ -inline TensorShape compute_deconvolution_output_shape(const std::pair &out_dims, const ITensorInfo &input, const ITensorInfo &weights) +inline TensorShape compute_deconvolution_output_shape(const std::pair &out_dims, + const ITensorInfo &input, + const ITensorInfo &weights) { - const TensorShape input_shape{ input.tensor_shape() }; - const TensorShape weights_shape{ weights.tensor_shape() }; + const TensorShape input_shape{input.tensor_shape()}; + const TensorShape weights_shape{weights.tensor_shape()}; const DataLayout data_layout = input.data_layout(); const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); @@ -502,7 +555,7 @@ inline TensorShape compute_deconvolution_output_shape(const std::pair 1 && input->data_layout() != DataLayout::NCHW); ARM_COMPUTE_ERROR_ON(num_groups > 1 && batch_size_on_z); - TensorShape output_shape{ input->tensor_shape() }; + TensorShape output_shape{input->tensor_shape()}; const DataLayout data_layout = input->data_layout(); const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - std::pair out_dims = scaled_dimensions(output_shape[width_idx], output_shape[height_idx], kernel_dims.width, kernel_dims.height, conv_info, dilation); - output_shape.set(0, ((output_shape[channel_idx] + input_pad_right) / num_groups * kernel_dims.area() + (has_bias ? 1 : 0))); // NOLINT + std::pair out_dims = scaled_dimensions( + output_shape[width_idx], output_shape[height_idx], kernel_dims.width, kernel_dims.height, conv_info, dilation); + output_shape.set(0, ((output_shape[channel_idx] + input_pad_right) / num_groups * kernel_dims.area() + + (has_bias ? 1 : 0))); // NOLINT output_shape.set(1, (out_dims.first * out_dims.second)); - if(batch_size_on_z && output_shape.num_dimensions() >= 3) + if (batch_size_on_z && output_shape.num_dimensions() >= 3) { output_shape.remove_dimension(2); } @@ -564,7 +625,7 @@ inline TensorShape compute_flatten_shape(const ITensorInfo *input) { // The output shape will be the flatten version of the input (i.e. [ width * height * channels, num_batches, ... ] ). Used for FlattenLayer and FullyConnectedLayer. - TensorShape output_shape{ input->tensor_shape() }; + TensorShape output_shape{input->tensor_shape()}; output_shape.collapse(3); @@ -586,7 +647,7 @@ inline TensorShape compute_softmax_shape(const ITensorInfo *input, size_t axis = // - [x,y,z,w] and axis 3 will return [x*y*z, w] TensorShape shape2D = input->tensor_shape(); - if(axis < input->num_dimensions()) + if (axis < input->num_dimensions()) { // Collapse from axis onward (this changes the shape) shape2D.collapse_from(axis); @@ -600,7 +661,7 @@ inline TensorShape compute_softmax_shape(const ITensorInfo *input, size_t axis = shape2D.collapse(shape2D.num_dimensions()); } - if(axis == 0) + if (axis == 0) { // If axis is zero the first dim should be one. 
Since // collapse is an inclusive operation we need to shift @@ -619,15 +680,17 @@ inline TensorShape compute_softmax_shape(const ITensorInfo *input, size_t axis = */ inline TensorShape compute_winograd_filter_transform_shape(const ITensorInfo &input, const WinogradInfo &winograd_info) { - TensorShape tensor_shape{ input.tensor_shape() }; + TensorShape tensor_shape{input.tensor_shape()}; const Size2D kernel_size = winograd_info.kernel_size; const Size2D output_tile_size = winograd_info.output_tile_size; - const Size2D input_tile_size = Size2D(output_tile_size.width + kernel_size.width - 1, output_tile_size.height + kernel_size.height - 1); + const Size2D input_tile_size = + Size2D(output_tile_size.width + kernel_size.width - 1, output_tile_size.height + kernel_size.height - 1); tensor_shape.remove_dimension(get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::WIDTH)); tensor_shape.set(Window::DimX, input.dimension(3)); - tensor_shape.set(Window::DimY, input.dimension(get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::CHANNEL))); + tensor_shape.set(Window::DimY, input.dimension(get_data_layout_dimension_index(input.data_layout(), + DataLayoutDimension::CHANNEL))); tensor_shape.set(Window::DimZ, input_tile_size.area()); return tensor_shape; @@ -645,23 +708,22 @@ inline TensorShape compute_winograd_input_transform_shape(const ITensorInfo &inp const PadStrideInfo conv_info = winograd_info.convolution_info; const Size2D kernel_size = winograd_info.kernel_size; const Size2D output_tile_size = winograd_info.output_tile_size; - const Size2D input_tile_size = Size2D(output_tile_size.width + kernel_size.width - 1, output_tile_size.height + kernel_size.height - 1); + const Size2D input_tile_size = + Size2D(output_tile_size.width + kernel_size.width - 1, output_tile_size.height + kernel_size.height - 1); const size_t idx_w = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::WIDTH); const size_t idx_h = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::HEIGHT); const size_t idx_c = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::CHANNEL); // Compute the number of output tiles along the x and y direction of size "output_tile_size" - const Size2D num_tiles = compute_winograd_convolution_tiles(Size2D(input.tensor_shape()[idx_w], input.tensor_shape()[idx_h]), - kernel_size, - output_tile_size, - conv_info); + const Size2D num_tiles = compute_winograd_convolution_tiles( + Size2D(input.tensor_shape()[idx_w], input.tensor_shape()[idx_h]), kernel_size, output_tile_size, conv_info); const unsigned int width = input.tensor_shape()[idx_c]; const unsigned int height = num_tiles.area(); const unsigned int depth = input_tile_size.area(); - TensorShape output_shape{ input.tensor_shape() }; + TensorShape output_shape{input.tensor_shape()}; output_shape.set(0, width); output_shape.set(1, height); output_shape.set(2, depth); @@ -684,12 +746,12 @@ inline TensorShape compute_winograd_output_transform_shape(const ITensorInfo &in const DataLayout data_layout = winograd_info.output_data_layout; // Compute output shape - unsigned int output_width = 0; - unsigned int output_height = 0; + unsigned int output_width = 0; + unsigned int output_height = 0; std::tie(output_width, output_height) = scaled_dimensions(input_dimensions.width, input_dimensions.height, kernel_size.width, kernel_size.height, conv_info); - TensorShape tensor_shape{ input.tensor_shape() }; + TensorShape tensor_shape{input.tensor_shape()}; // 
Output dimension const unsigned int out_w = output_width; @@ -712,7 +774,10 @@ inline TensorShape compute_winograd_output_transform_shape(const ITensorInfo &in * * @return the calculated shape */ -inline TensorShape compute_deep_convolution_shape(const TensorShape &input_shape, DataLayout input_data_layout, const TensorShape &weights_shape, const PadStrideInfo &conv_info) +inline TensorShape compute_deep_convolution_shape(const TensorShape &input_shape, + DataLayout input_data_layout, + const TensorShape &weights_shape, + const PadStrideInfo &conv_info) { const size_t idx_width = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::WIDTH); const size_t idx_height = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::HEIGHT); @@ -725,9 +790,10 @@ inline TensorShape compute_deep_convolution_shape(const TensorShape &input_shape const unsigned int weights_out_channel = weights_shape[3]; unsigned int output_width = 0; unsigned int output_height = 0; - std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, weights_width, weights_height, conv_info); + std::tie(output_width, output_height) = + scaled_dimensions(input_width, input_height, weights_width, weights_height, conv_info); - TensorShape output_shape{ input_shape }; + TensorShape output_shape{input_shape}; output_shape.set(idx_width, output_width); output_shape.set(idx_height, output_height); output_shape.set(idx_channel, weights_out_channel); @@ -743,7 +809,8 @@ inline TensorShape compute_deep_convolution_shape(const TensorShape &input_shape * * @return the calculated shape */ -inline TensorShape compute_deep_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info) +inline TensorShape +compute_deep_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info) { return compute_deep_convolution_shape(input.tensor_shape(), input.data_layout(), weights.tensor_shape(), conv_info); } @@ -758,7 +825,10 @@ inline TensorShape compute_deep_convolution_shape(const ITensorInfo &input, cons * * @return the calculated shape */ -inline TensorShape compute_indirect_buffer_shape(const TensorShape &input_shape, DataLayout input_data_layout, const TensorShape &weights_shape, const PadStrideInfo &conv_info, +inline TensorShape compute_indirect_buffer_shape(const TensorShape &input_shape, + DataLayout input_data_layout, + const TensorShape &weights_shape, + const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_ERROR_ON_MSG(input_data_layout != DataLayout::NHWC, "The data layout can only be NHWC"); @@ -768,7 +838,8 @@ inline TensorShape compute_indirect_buffer_shape(const TensorShape &input_shape, const unsigned int kw = weights_shape[1]; const unsigned int kh = weights_shape[2]; - TensorShape output_conv2d_shape = compute_deep_convolution_shape(input_shape, input_data_layout, weights_shape, conv_info); + TensorShape output_conv2d_shape = + compute_deep_convolution_shape(input_shape, input_data_layout, weights_shape, conv_info); const unsigned int output_w = m0 * kw * kh; const unsigned int output_h = DIV_CEIL(output_conv2d_shape[1] * output_conv2d_shape[2], m0); @@ -785,7 +856,7 @@ inline TensorShape compute_indirect_buffer_shape(const TensorShape &input_shape, */ inline TensorShape compute_min_max_shape(const ITensorInfo *input) { - TensorShape output_shape{ input->tensor_shape() }; + TensorShape output_shape{input->tensor_shape()}; output_shape.set(Window::DimX, 2); 
output_shape.remove_dimension(1); output_shape.remove_dimension(1); @@ -805,7 +876,7 @@ inline TensorShape compute_pool_shape(const ITensorInfo &input, PoolingLayerInfo int pooled_w = 0; int pooled_h = 0; - TensorShape output_shape{ input.tensor_shape() }; + TensorShape output_shape{input.tensor_shape()}; const bool is_global_pooling = pool_info.is_global_pooling; const int idx_width = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::WIDTH); @@ -815,9 +886,8 @@ inline TensorShape compute_pool_shape(const ITensorInfo &input, PoolingLayerInfo const int pool_size_x = is_global_pooling ? output_shape[idx_width] : pool_info.pool_size.width; const int pool_size_y = is_global_pooling ? output_shape[idx_height] : pool_info.pool_size.height; - std::tie(pooled_w, pooled_h) = scaled_dimensions_signed(input_width, input_height, - pool_size_x, pool_size_y, - pool_info.pad_stride_info); + std::tie(pooled_w, pooled_h) = + scaled_dimensions_signed(input_width, input_height, pool_size_x, pool_size_y, pool_info.pad_stride_info); ARM_COMPUTE_ERROR_ON_MSG((pooled_w < 1 || pooled_h < 1), "Calculated output dimension size is invalid"); @@ -850,8 +920,10 @@ inline TensorShape compute_unpool_shape(const ITensorInfo &input, PoolingLayerIn const int pad_bottom = pad_stride_info.pad_bottom(); TensorShape output_shape = input_shape; - const unsigned int out_width = (input_shape[idx_width] - 1) * stride_x - pad_left - pad_right + pool_info.pool_size.width; - const unsigned int out_height = (input_shape[idx_height] - 1) * stride_y - pad_top - pad_bottom + pool_info.pool_size.height; + const unsigned int out_width = + (input_shape[idx_width] - 1) * stride_x - pad_left - pad_right + pool_info.pool_size.width; + const unsigned int out_height = + (input_shape[idx_height] - 1) * stride_y - pad_top - pad_bottom + pool_info.pool_size.height; output_shape.set(idx_width, out_width); output_shape.set(idx_height, out_height); @@ -866,9 +938,10 @@ inline TensorShape compute_unpool_shape(const ITensorInfo &input, PoolingLayerIn * * @return the calculated shape */ -inline TensorShape compute_roi_align_shape(const ITensorInfo &input, const ITensorInfo &rois, ROIPoolingLayerInfo pool_info) +inline TensorShape +compute_roi_align_shape(const ITensorInfo &input, const ITensorInfo &rois, ROIPoolingLayerInfo pool_info) { - TensorShape output_shape{ input.tensor_shape() }; + TensorShape output_shape{input.tensor_shape()}; const unsigned int idx_width = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::WIDTH); const unsigned int idx_height = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::HEIGHT); @@ -889,7 +962,7 @@ inline TensorShape compute_roi_align_shape(const ITensorInfo &input, const ITens */ inline TensorShape compute_rnn_shape(const ITensorInfo *input, const unsigned int batch_size) { - TensorShape output_shape{ input->tensor_shape() }; + TensorShape output_shape{input->tensor_shape()}; output_shape.set(1, batch_size); return output_shape; @@ -904,15 +977,21 @@ inline TensorShape compute_rnn_shape(const ITensorInfo *input, const unsigned in * * @return the calculated shape */ -inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo &input1, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info) +inline TensorShape compute_mm_shape(const ITensorInfo &input0, + const ITensorInfo &input1, + bool is_interleaved_transposed, + const GEMMReshapeInfo &reshape_info) { ARM_COMPUTE_ERROR_ON_MSG(input0.num_dimensions() > 4, "The 
number of dimensions for the matrix A must be <= 4"); - ARM_COMPUTE_ERROR_ON_MSG(is_interleaved_transposed && reshape_info.reinterpret_input_as_3d(), "The first input tensor cannot be reinterpreted as 3D if is_interleaved_transposed is true"); + ARM_COMPUTE_ERROR_ON_MSG( + is_interleaved_transposed && reshape_info.reinterpret_input_as_3d(), + "The first input tensor cannot be reinterpreted as 3D if is_interleaved_transposed is true"); const bool reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d(); const bool reinterpret_output_as_3d = reshape_info.depth_output_gemm3d() != 0; const int depth_output_gemm3d = reinterpret_output_as_3d ? reshape_info.depth_output_gemm3d() : 1; - const int m = reshape_info.reinterpret_input_as_3d() ? input0.dimension(1) * input0.dimension(2) : input0.dimension(1); + const int m = + reshape_info.reinterpret_input_as_3d() ? input0.dimension(1) * input0.dimension(2) : input0.dimension(1); // If the output of GEMM has to be reinterpreted as 3D, the number of input0 rows (M) is obtained collapsing the second and third // dimension of the output tensor @@ -921,7 +1000,7 @@ inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo const int dim2 = reinterpret_input_as_3d ? input0.tensor_shape()[3] : input0.tensor_shape()[2]; const int dim3 = reinterpret_input_as_3d ? 1 : input0.tensor_shape()[3]; - TensorShape output_shape{ input0.tensor_shape() }; + TensorShape output_shape{input0.tensor_shape()}; output_shape.set(0, dim0); output_shape.set(1, dim1); @@ -940,7 +1019,8 @@ inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo * * @return the calculated shape */ -inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo &input1, const GEMMReshapeInfo &gemm_info) +inline TensorShape +compute_mm_shape(const ITensorInfo &input0, const ITensorInfo &input1, const GEMMReshapeInfo &gemm_info) { ARM_COMPUTE_UNUSED(input1); ARM_COMPUTE_ERROR_ON_MSG(input0.num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4"); @@ -949,9 +1029,9 @@ inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo const bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d() != 0; const int depth_output_gemm3d = reinterpret_output_as_3d ? gemm_info.depth_output_gemm3d() : 1; - TensorShape output_shape{ input0.tensor_shape() }; + TensorShape output_shape{input0.tensor_shape()}; - if(!reinterpret_input_as_3d && !reinterpret_output_as_3d) + if (!reinterpret_input_as_3d && !reinterpret_output_as_3d) { output_shape.set(0, gemm_info.n()); output_shape.set(1, gemm_info.m()); @@ -978,7 +1058,8 @@ inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo * * @return the calculated shape */ -inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo &input1, const GEMMKernelInfo &gemm_info) +inline TensorShape +compute_mm_shape(const ITensorInfo &input0, const ITensorInfo &input1, const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_UNUSED(input1); ARM_COMPUTE_ERROR_ON_MSG(input0.num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4"); @@ -987,9 +1068,9 @@ inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo const bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; const unsigned int depth_output_gemm3d = reinterpret_output_as_3d ? 
gemm_info.depth_output_gemm3d : 1; - TensorShape output_shape{ input0.tensor_shape() }; + TensorShape output_shape{input0.tensor_shape()}; - if(!reinterpret_input_as_3d && !reinterpret_output_as_3d) + if (!reinterpret_input_as_3d && !reinterpret_output_as_3d) { output_shape.set(0, gemm_info.n); output_shape.set(1, gemm_info.m); @@ -1016,16 +1097,17 @@ inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo * * @return the calculated shape */ -inline TensorShape compute_matmul_shape(const TensorShape &input0, const TensorShape &input1, const MatMulKernelInfo &matmul_info) +inline TensorShape +compute_matmul_shape(const TensorShape &input0, const TensorShape &input1, const MatMulKernelInfo &matmul_info) { - TensorShape output_shape{ input0 }; + TensorShape output_shape{input0}; - if(matmul_info.adj_lhs) + if (matmul_info.adj_lhs) { output_shape.set(1, input0[0]); // The vertical (M) dimension } - if(matmul_info.adj_rhs) + if (matmul_info.adj_rhs) { output_shape.set(0, input1[1]); // The horizontal (N) dimension } @@ -1044,14 +1126,15 @@ inline TensorShape compute_matmul_shape(const TensorShape &input0, const TensorS * * @return the calculated shape */ -inline TensorShape compute_output_stage_shape(const ITensorInfo &input, unsigned int gemm_3d_depth = 1, bool batch_size_on_z = false) +inline TensorShape +compute_output_stage_shape(const ITensorInfo &input, unsigned int gemm_3d_depth = 1, bool batch_size_on_z = false) { ARM_COMPUTE_ERROR_ON(input.data_layout() != DataLayout::NHWC && gemm_3d_depth > 1); TensorShape output_shape = input.tensor_shape(); - if(gemm_3d_depth > 1) + if (gemm_3d_depth > 1) { - if(batch_size_on_z) + if (batch_size_on_z) { output_shape.shift_right(1); } @@ -1076,11 +1159,16 @@ inline TensorShape compute_output_stage_shape(const ITensorInfo &input, unsigned * @return the calculated shape */ inline TensorShape compute_strided_slice_shape(const ITensorInfo &input, - const Coordinates &starts, const Coordinates &ends, const Coordinates &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) + const Coordinates &starts, + const Coordinates &ends, + const Coordinates &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { using namespace arm_compute::helpers::tensor_transform; - return compute_strided_slice_output_shape(input.tensor_shape(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + return compute_strided_slice_output_shape(input.tensor_shape(), starts, ends, strides, begin_mask, end_mask, + shrink_axis_mask); } /** Calculate the slice output shape of a tensor @@ -1091,13 +1179,13 @@ inline TensorShape compute_strided_slice_shape(const ITensorInfo &input, * * @return the calculated shape */ -inline TensorShape compute_slice_shape(const TensorShape &input_shape, const Coordinates &starts, const Coordinates &ends) +inline TensorShape +compute_slice_shape(const TensorShape &input_shape, const Coordinates &starts, const Coordinates &ends) { using namespace arm_compute::helpers::tensor_transform; - return compute_strided_slice_output_shape(input_shape, - starts, ends, BiStrides(), - 0, construct_slice_end_mask(ends), 0); + return compute_strided_slice_output_shape(input_shape, starts, ends, BiStrides(), 0, construct_slice_end_mask(ends), + 0); } /** Calculate the batch to space output shape of a tensor @@ -1110,7 +1198,8 @@ inline TensorShape compute_slice_shape(const TensorShape &input_shape, const Coo * * @return the calculated shape */ -inline TensorShape 
compute_batch_to_space_shape(DataLayout data_layout, const TensorShape &input, int block_x, int block_y, const CropInfo &crop_info = CropInfo{}) +inline TensorShape compute_batch_to_space_shape( + DataLayout data_layout, const TensorShape &input, int block_x, int block_y, const CropInfo &crop_info = CropInfo{}) { ARM_COMPUTE_ERROR_ON(block_x < 1 || block_y < 1); @@ -1118,7 +1207,7 @@ inline TensorShape compute_batch_to_space_shape(DataLayout data_layout, const Te const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); const int idx_batch = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); - TensorShape output_shape{ input }; + TensorShape output_shape{input}; unsigned int new_width = input[idx_width] * static_cast(block_x); unsigned int new_height = input[idx_height] * static_cast(block_y); @@ -1152,7 +1241,7 @@ inline TensorShape compute_depth_to_space_shape(const TensorShape &input_shape, const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - TensorShape output_shape{ input_shape }; + TensorShape output_shape{input_shape}; output_shape.set(idx_width, input_shape[idx_width] * block); output_shape.set(idx_height, input_shape[idx_height] * block); output_shape.set(idx_channel, input_shape[idx_channel] / (block * block)); @@ -1173,10 +1262,10 @@ inline TensorShape compute_split_shape(const ITensorInfo *input, unsigned int ax TensorShape empty_shape; empty_shape.set(0, 0); - TensorShape out_shape{ input->tensor_shape() }; + TensorShape out_shape{input->tensor_shape()}; // Return empty shape if axis is invalid - if(axis > input->tensor_shape().num_dimensions()) + if (axis > input->tensor_shape().num_dimensions()) { return empty_shape; } @@ -1184,7 +1273,7 @@ inline TensorShape compute_split_shape(const ITensorInfo *input, unsigned int ax size_t axis_size = out_shape[axis]; // Return empty shape if num_split is not valid - if(axis_size % num_splits) + if (axis_size % num_splits) { return empty_shape; } @@ -1203,9 +1292,10 @@ inline TensorShape compute_split_shape(const ITensorInfo *input, unsigned int ax * * @return the calculated shape */ -inline TensorShape compute_space_to_batch_shape(const ITensorInfo *input, int block_x, int block_y, const Size2D &padding_left, const Size2D &padding_right) +inline TensorShape compute_space_to_batch_shape( + const ITensorInfo *input, int block_x, int block_y, const Size2D &padding_left, const Size2D &padding_right) { - TensorShape output_shape{ input->tensor_shape() }; + TensorShape output_shape{input->tensor_shape()}; const DataLayout data_layout = input->data_layout(); const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); @@ -1231,7 +1321,7 @@ inline TensorShape compute_space_to_batch_shape(const ITensorInfo *input, int bl */ inline TensorShape compute_space_to_depth_shape(const ITensorInfo *input, int32_t block_shape) { - TensorShape output_shape{ input->tensor_shape() }; + TensorShape output_shape{input->tensor_shape()}; const DataLayout data_layout = input->data_layout(); const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); @@ -1276,7 +1366,7 @@ inline TensorShape compute_prior_box_shape(const ITensorInfo &input, const Prior inline TensorShape compute_padded_shape(const TensorShape &input_shape, const PaddingList &padding) { TensorShape padded_shape = 
input_shape; - for(size_t dim = 0; dim < padding.size(); ++dim) + for (size_t dim = 0; dim < padding.size(); ++dim) { const auto &padding_pair = padding[dim]; const uint32_t shape_on_index = (padded_shape.num_dimensions() <= dim) ? 1 : input_shape[dim]; @@ -1295,7 +1385,7 @@ inline TensorShape compute_padded_shape(const TensorShape &input_shape, const Pa inline TensorShape compute_tiled_shape(const TensorShape &input_shape, const Multiples &multiples) { TensorShape tiled_shape = input_shape; - for(size_t dim = 0; dim < multiples.size(); ++dim) + for (size_t dim = 0; dim < multiples.size(); ++dim) { tiled_shape.set(dim, input_shape[dim] * multiples[dim]); } @@ -1312,9 +1402,9 @@ inline TensorShape compute_tiled_shape(const TensorShape &input_shape, const Mul */ inline TensorShape compute_reduced_shape(const TensorShape &input, unsigned int axis, bool keep_dims = true) { - TensorShape output_shape{ input }; + TensorShape output_shape{input}; - if(!keep_dims) + if (!keep_dims) { output_shape.remove_dimension(axis); } @@ -1407,14 +1497,14 @@ inline TensorShape calculate_concatenate_shape(const std::vector &input, si #if defined(ARM_COMPUTE_ASSERTS_ENABLED) // All dimensions must match except the axis one - for(unsigned int i = 0; i < MAX_DIMS; ++i) + for (unsigned int i = 0; i < MAX_DIMS; ++i) { - if(i == axis) + if (i == axis) { continue; } - for(const auto &tensor : input) + for (const auto &tensor : input) { ARM_COMPUTE_ERROR_ON(tensor == nullptr); const TensorShape shape = extract_shape(tensor); @@ -1425,7 +1515,7 @@ inline TensorShape calculate_concatenate_shape(const std::vector &input, si // Calculate output shape size_t new_size = 0; - for(const auto &tensor : input) + for (const auto &tensor : input) { const TensorShape shape = extract_shape(tensor); new_size += shape[axis]; @@ -1448,14 +1538,14 @@ inline TensorShape compute_stack_shape(const ITensorInfo &a, unsigned int axis, ARM_COMPUTE_ERROR_ON(axis > a.num_dimensions()); ARM_COMPUTE_ERROR_ON(a.num_dimensions() > 4); - TensorShape shape_out{ a.tensor_shape() }; + TensorShape shape_out{a.tensor_shape()}; shape_out.set(axis, num_tensors); unsigned int i_shift = 0; - for(unsigned int i = 0; i < a.num_dimensions(); ++i) + for (unsigned int i = 0; i < a.num_dimensions(); ++i) { - if(i == axis) + if (i == axis) { i_shift++; } @@ -1473,7 +1563,8 @@ inline TensorShape compute_stack_shape(const ITensorInfo &a, unsigned int axis, * * @return the calculated shape */ -inline TensorShape compute_conv3d_shape(const TensorShape &src, const TensorShape &weights, const Conv3dInfo &conv3d_info) +inline TensorShape +compute_conv3d_shape(const TensorShape &src, const TensorShape &weights, const Conv3dInfo &conv3d_info) { // Weight tensor shape indices (D H W Cin Cout) constexpr unsigned int weights_depth_dim = 4u; @@ -1488,7 +1579,7 @@ inline TensorShape compute_conv3d_shape(const TensorShape &src, const TensorShap constexpr unsigned int width_dim = 1u; constexpr unsigned int channel_dim = 0u; - TensorShape output_shape{ src }; + TensorShape output_shape{src}; const size_t pad_left = conv3d_info.padding.left; const size_t pad_right = conv3d_info.padding.right; const size_t pad_top = conv3d_info.padding.top; @@ -1506,17 +1597,41 @@ inline TensorShape compute_conv3d_shape(const TensorShape &src, const TensorShap int output_height_size = 0; int output_depth_size = 0; - switch(conv3d_info.round_type) + switch (conv3d_info.round_type) { case DimensionRoundingType::FLOOR: - output_width_size = static_cast(std::floor((static_cast(src[width_dim] + pad_left + 
pad_right - (dilation_x * (weights[weights_width_dim] - 1) + 1)) / stride_x) + 1)); - output_height_size = static_cast(std::floor((static_cast(src[height_dim] + pad_top + pad_bottom - (dilation_y * (weights[weights_height_dim] - 1) + 1)) / stride_y) + 1)); - output_depth_size = static_cast(std::floor((static_cast(src[depth_dim] + pad_front + pad_back - (dilation_z * (weights[weights_depth_dim] - 1) + 1)) / stride_z) + 1)); + output_width_size = + static_cast(std::floor((static_cast(src[width_dim] + pad_left + pad_right - + (dilation_x * (weights[weights_width_dim] - 1) + 1)) / + stride_x) + + 1)); + output_height_size = + static_cast(std::floor((static_cast(src[height_dim] + pad_top + pad_bottom - + (dilation_y * (weights[weights_height_dim] - 1) + 1)) / + stride_y) + + 1)); + output_depth_size = + static_cast(std::floor((static_cast(src[depth_dim] + pad_front + pad_back - + (dilation_z * (weights[weights_depth_dim] - 1) + 1)) / + stride_z) + + 1)); break; case DimensionRoundingType::CEIL: - output_width_size = static_cast(std::ceil((static_cast(src[width_dim] + pad_left + pad_right - (dilation_x * (weights[weights_width_dim] - 1) + 1)) / stride_x) + 1)); - output_height_size = static_cast(std::ceil((static_cast(src[height_dim] + pad_top + pad_bottom - (dilation_y * (weights[weights_height_dim] - 1) + 1)) / stride_y) + 1)); - output_depth_size = static_cast(std::ceil((static_cast(src[depth_dim] + pad_front + pad_back - (dilation_z * (weights[weights_depth_dim] - 1) + 1)) / stride_z) + 1)); + output_width_size = + static_cast(std::ceil((static_cast(src[width_dim] + pad_left + pad_right - + (dilation_x * (weights[weights_width_dim] - 1) + 1)) / + stride_x) + + 1)); + output_height_size = + static_cast(std::ceil((static_cast(src[height_dim] + pad_top + pad_bottom - + (dilation_y * (weights[weights_height_dim] - 1) + 1)) / + stride_y) + + 1)); + output_depth_size = + static_cast(std::ceil((static_cast(src[depth_dim] + pad_front + pad_back - + (dilation_z * (weights[weights_depth_dim] - 1) + 1)) / + stride_z) + + 1)); break; default: ARM_COMPUTE_ERROR("Unsupported rounding type"); @@ -1539,7 +1654,7 @@ inline TensorShape compute_conv3d_shape(const TensorShape &src, const TensorShap */ inline TensorShape compute_pool3d_shape(const TensorShape &src, Pooling3dLayerInfo pool3d_info) { - TensorShape output_shape{ src }; + TensorShape output_shape{src}; const auto data_layout = DataLayout::NDHWC; const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); @@ -1552,10 +1667,12 @@ inline TensorShape compute_pool3d_shape(const TensorShape &src, Pooling3dLayerIn int output_height = 0; int output_depth = 0; - std::tie(output_width, output_height, output_depth) = scaled_3d_dimensions_signed(src[idx_width], src[idx_height], src[idx_depth], pool_size_width, pool_size_height, - pool_size_depth, pool3d_info); + std::tie(output_width, output_height, output_depth) = + scaled_3d_dimensions_signed(src[idx_width], src[idx_height], src[idx_depth], pool_size_width, pool_size_height, + pool_size_depth, pool3d_info); - ARM_COMPUTE_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1), "Calculated output dimension size is invalid"); + ARM_COMPUTE_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1), + "Calculated output dimension size is invalid"); output_shape.set(idx_width, static_cast(output_width)); output_shape.set(idx_height, static_cast(output_height)); @@ -1576,7 +1693,8 @@ inline TensorShape compute_pool3d_shape(const TensorShape &src, 
Pooling3dLayerIn * * @return the calculated shape */ -inline TensorShape compute_gather_shape(const TensorShape &input_shape, const TensorShape &indices_shape, uint32_t actual_axis) +inline TensorShape +compute_gather_shape(const TensorShape &input_shape, const TensorShape &indices_shape, uint32_t actual_axis) { const auto input_num_dims = input_shape.num_dimensions(); const auto indices_num_dims = indices_shape.num_dimensions(); @@ -1587,26 +1705,27 @@ inline TensorShape compute_gather_shape(const TensorShape &input_shape, const Te TensorShape output_shape; size_t dim_no = 0; - for(; dim_no < actual_axis; ++dim_no) + for (; dim_no < actual_axis; ++dim_no) { output_shape.set(dim_no, input_shape[dim_no]); } - for(; dim_no < actual_axis + indices_num_dims; ++dim_no) + for (; dim_no < actual_axis + indices_num_dims; ++dim_no) { output_shape.set(dim_no, indices_shape[dim_no - actual_axis]); } - for(; dim_no < input_num_dims + indices_num_dims - 1; ++dim_no) + for (; dim_no < input_num_dims + indices_num_dims - 1; ++dim_no) { output_shape.set(dim_no, input_shape[dim_no + 1 - indices_num_dims]); } - ARM_COMPUTE_ERROR_ON(input_shape.total_size() * indices_shape.total_size() != output_shape.total_size() * input_shape[actual_axis]); + ARM_COMPUTE_ERROR_ON(input_shape.total_size() * indices_shape.total_size() != + output_shape.total_size() * input_shape[actual_axis]); return output_shape; } } // namespace shape_calculator } // namespace misc } // namespace arm_compute -#endif /* ACL_ARM_COMPUTE_CORE_UTILS_MISC_SHAPECALCULATOR */ +#endif // ACL_ARM_COMPUTE_CORE_UTILS_MISC_SHAPECALCULATOR_H diff --git a/arm_compute/core/utils/misc/Traits.h b/arm_compute/core/utils/misc/Traits.h index 933922f63cc6358763fa4631557f7676c9be62f7..944fcb95f9ab02b1abd9d83f14db83c522859ec8 100644 --- a/arm_compute/core/utils/misc/Traits.h +++ b/arm_compute/core/utils/misc/Traits.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_UTILS_TRAITS_TRAITS_H #include "arm_compute/core/Types.h" + #include namespace arm_compute diff --git a/arm_compute/core/utils/misc/Utility.h b/arm_compute/core/utils/misc/Utility.h index e3e20d719fa2259f5066bb9a88e9c1105f9f056f..22f10d74ccdd5a184a6bc7a90a63117f2c96d05b 100644 --- a/arm_compute/core/utils/misc/Utility.h +++ b/arm_compute/core/utils/misc/Utility.h @@ -44,7 +44,7 @@ struct index_sequence }; template -struct index_sequence_generator : index_sequence_generator < N - 1, N - 1, S... > +struct index_sequence_generator : index_sequence_generator { }; @@ -58,17 +58,17 @@ template using index_sequence_t = typename index_sequence_generator::type; template -struct generate_array : generate_array < T, N - 1, val, val, vals... > +struct generate_array : generate_array { }; template struct generate_array { - static constexpr std::array value{ vals... }; + static constexpr std::array value{vals...}; }; -template +template constexpr std::array generate_array::value; /** @endcond */ @@ -79,7 +79,7 @@ template ::value_type, sizeof...(S)>> T make_array(Iterator first, index_sequence) { - return T{ { first[S]... } }; + return T{{first[S]...}}; } } // namespace detail @@ -87,7 +87,7 @@ template std::array::value_type, N> make_array(Iterator first, Iterator last) { ARM_COMPUTE_UNUSED(last); - return detail::make_array(first, index_sequence_t {}); + return detail::make_array(first, index_sequence_t{}); } /** Performs clamping among a lower and upper value. @@ -119,7 +119,7 @@ inline void for_each(F &&) * @param[in] args Remaining arguments */ template -inline void for_each(F &&func, T &&arg, Ts &&... 
args) +inline void for_each(F &&func, T &&arg, Ts &&...args) { func(std::forward(arg)); for_each(std::forward(func), std::forward(args)...); @@ -143,9 +143,11 @@ inline T &&foldl(F &&, T &&value) * @param[in] values Remaining arguments */ template -inline auto foldl(F &&func, T &&initial, U &&value, Us &&... values) -> decltype(func(std::forward(initial), std::forward(value))) +inline auto foldl(F &&func, T &&initial, U &&value, Us &&...values) + -> decltype(func(std::forward(initial), std::forward(value))) { - return foldl(std::forward(func), func(std::forward(initial), std::forward(value)), std::forward(values)...); + return foldl(std::forward(func), func(std::forward(initial), std::forward(value)), + std::forward(values)...); } /** Perform an index sort of a given vector. @@ -160,11 +162,7 @@ std::vector sort_indices(const std::vector &v) std::vector idx(v.size()); std::iota(idx.begin(), idx.end(), 0); - std::sort(idx.begin(), idx.end(), - [&v](size_t i1, size_t i2) - { - return v[i1] < v[i2]; - }); + std::sort(idx.begin(), idx.end(), [&v](size_t i1, size_t i2) { return v[i1] < v[i2]; }); return idx; } @@ -178,7 +176,7 @@ std::vector sort_indices(const std::vector &v) */ inline bool endswith(const std::string &str, const std::string &suffix) { - if(str.size() < suffix.size()) + if (str.size() < suffix.size()) { return false; } @@ -205,10 +203,7 @@ inline bool check_aligned(void *ptr, const size_t alignment) */ inline std::string tolower(std::string string) { - std::transform(string.begin(), string.end(), string.begin(), [](unsigned char c) - { - return std::tolower(c); - }); + std::transform(string.begin(), string.end(), string.begin(), [](unsigned char c) { return std::tolower(c); }); return string; } @@ -227,7 +222,7 @@ inline std::string getenv(const std::string &env_name) return std::string{}; #else // BARE_METAL const auto env_chr = std::getenv(env_name.c_str()); - return env_chr == nullptr ? std::string{} : std::string{ env_chr }; + return env_chr == nullptr ? std::string{} : std::string{env_chr}; #endif // BARE_METAL } } // namespace utility diff --git a/arm_compute/core/utils/quantization/AsymmHelpers.h b/arm_compute/core/utils/quantization/AsymmHelpers.h index a15f3e5cde3f7dbe8ca5dcc66990450c377bf733..2324fe1838dee48946832426fcca6876bab0fb43 100644 --- a/arm_compute/core/utils/quantization/AsymmHelpers.h +++ b/arm_compute/core/utils/quantization/AsymmHelpers.h @@ -41,7 +41,10 @@ namespace quantization * * @return a status */ -Status calculate_quantized_multiplier(float multiplier, int32_t *quant_multiplier, int32_t *shift, bool ignore_epsilon = false); +Status calculate_quantized_multiplier(float multiplier, + int32_t *quant_multiplier, + int32_t *shift, + bool ignore_epsilon = false); /** Calculate quantized representation of multiplier with value less than one. * * @param[in] multiplier Real multiplier. @@ -51,7 +54,10 @@ Status calculate_quantized_multiplier(float multiplier, int32_t *quant_multiplie * * @return a status */ -Status calculate_quantized_multiplier_less_than_one(float multiplier, int32_t *quant_multiplier, int32_t *right_shift, bool ignore_epsilon = false); +Status calculate_quantized_multiplier_less_than_one(float multiplier, + int32_t *quant_multiplier, + int32_t *right_shift, + bool ignore_epsilon = false); /** Calculate quantized representation of multiplier having value greater than one. * * @param[in] multiplier Real multiplier. 
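
As a point of reference for the arm_compute::utility helpers touched above (foldl, sort_indices, endswith), the following is a minimal usage sketch; it is illustrative only and not part of the patch, and the input values are arbitrary.

#include "arm_compute/core/utils/misc/Utility.h"

#include <iostream>
#include <vector>

int main()
{
    using namespace arm_compute::utility;

    // foldl() left-folds the callable over its arguments: ((1 + 2) + 3) + 4 == 10.
    const int sum = foldl([](int a, int b) { return a + b; }, 1, 2, 3, 4);

    // sort_indices() returns the index permutation that would sort the vector,
    // leaving the vector itself untouched: here {1, 2, 0}.
    const std::vector<float> v{3.f, 1.f, 2.f};
    const auto               idx = sort_indices(v);

    // endswith() is a simple suffix test on std::string.
    const bool is_header = endswith("Utility.h", ".h"); // true

    std::cout << sum << ' ' << idx[0] << ' ' << is_header << '\n';
    return 0;
}
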
@@ -60,7 +66,8 @@ Status calculate_quantized_multiplier_less_than_one(float multiplier, int32_t *q * * @return a status */ -Status calculate_quantized_multiplier_greater_than_one(float multiplier, int32_t *quantized_multiplier, int32_t *left_shift); +Status +calculate_quantized_multiplier_greater_than_one(float multiplier, int32_t *quantized_multiplier, int32_t *left_shift); /** Calculate quantized representation of per-channel multipliers * @@ -71,9 +78,9 @@ Status calculate_quantized_multiplier_greater_than_one(float multiplier, int32_t * * @return a status */ -Status calculate_quantized_multipliers(const QuantizationInfo &iq_info, - const QuantizationInfo &wq_info, - const QuantizationInfo &oq_info, +Status calculate_quantized_multipliers(const QuantizationInfo &iq_info, + const QuantizationInfo &wq_info, + const QuantizationInfo &oq_info, GEMMLowpOutputStageInfo &stage_info); /** Get minimum and maximum values for the input quantized data type @@ -147,7 +154,10 @@ int32_t saturating_rounding_multiply_by_pow2(int32_t exponent, int32_t v); * @param[out] output_shift Shift for inverse square root * */ -void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift, int32_t &output_inv_sqrt, int32_t &output_shift); +void get_invsqrt_quantized_multiplier_exp(int32_t input, + int32_t reverse_shift, + int32_t &output_inv_sqrt, + int32_t &output_shift); } // namespace quantization } // namespace arm_compute diff --git a/arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h b/arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h index b5af589cd23bc36c39c9cdcebcbd4fab86675dc2..3deaff74fc06924badf7d55c32cc3e38c4087786 100644 --- a/arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h +++ b/arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h @@ -26,6 +26,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h" + #include #include diff --git a/arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h b/arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h index 3daedd4efbef4085d0227d3cb58f17c4c83ca26b..e2a34e4424206a0c0b98963ffbe3a7e35e07e11a 100644 --- a/arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h +++ b/arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h @@ -53,10 +53,12 @@ namespace dynamic_fusion enum class MemoryType { /** Both User and Auxiliary types are of Alloc type. Since they require memory allocation */ - User = 0, /**< Memory coming directly from users, e.g. for argument tensors */ - Auxiliary = 1, /**< Additional memory required by the workload tensor, e.g. for tensors holding temporary results between kernels */ + User = 0, /**< Memory coming directly from users, e.g. for argument tensors */ + Auxiliary = + 1, /**< Additional memory required by the workload tensor, e.g. for tensors holding temporary results between kernels */ /** Virtual type is of No-Alloc type. Since it doesn't require memory allocation */ - Virtual = 2, /**< Temporary tile which is not allocated as a whole tensor in the memory. It is mainly used at sketch time to link operators; there should be no Virtual tensors at runtime */ + Virtual = + 2, /**< Temporary tile which is not allocated as a whole tensor in the memory. It is mainly used at sketch time to link operators; there should be no Virtual tensors at runtime */ }; /** Memory information for tensors with @ref MemoryType::Auxiliary. 
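
For the quantization helper declared above, a usage sketch is shown below: it decomposes a real rescale factor into the fixed-point multiplier/shift pair consumed by the GEMMLowp output stage. The value 0.0072f is an arbitrary example, and the exact sign convention of the shift follows the library's GEMMLowp implementation.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"

#include <cstdint>
#include <cstdio>

int main()
{
    // Split a real multiplier into a 32-bit fixed-point multiplier and a power-of-two shift.
    const float real_multiplier  = 0.0072f;
    int32_t     quant_multiplier = 0;
    int32_t     shift            = 0;

    const arm_compute::Status status =
        arm_compute::quantization::calculate_quantized_multiplier(real_multiplier, &quant_multiplier, &shift);
    ARM_COMPUTE_UNUSED(status); // a non-OK Status signals an invalid multiplier

    std::printf("quant_multiplier=%d shift=%d\n", static_cast<int>(quant_multiplier), static_cast<int>(shift));
    return 0;
}
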
@@ -66,9 +68,7 @@ struct AuxMemoryInfo { AuxMemoryInfo() = default; - AuxMemoryInfo(size_t size, size_t alignment = 0) noexcept - : size(size), - alignment(alignment) + AuxMemoryInfo(size_t size, size_t alignment = 0) noexcept : size(size), alignment(alignment) { } @@ -76,8 +76,8 @@ struct AuxMemoryInfo { return info0.size == info1.size && info0.alignment == info1.alignment; } - size_t size{ 0 }; /**< Total memory size in bytes */ - size_t alignment{ 0 }; /**< Memory alignment in bytes */ + size_t size{0}; /**< Total memory size in bytes */ + size_t alignment{0}; /**< Memory alignment in bytes */ }; /** Descriptor of a workload tensor memory */ diff --git a/arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h b/arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h index 59efc8bd5d4c0925aad3e68bcaf082a6670ce211..ba2f658c7cdebcd7a3359883bd4b7d501a674625 100644 --- a/arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h +++ b/arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h @@ -49,8 +49,8 @@ public: ConvertPolicy convert_policy() const; private: - DataType _data_type{}; /**< Data Type to be casted to */ - ConvertPolicy _convert_policy{ ConvertPolicy::SATURATE }; /**< Convert Policy */ + DataType _data_type{}; /**< Data Type to be casted to */ + ConvertPolicy _convert_policy{ConvertPolicy::SATURATE}; /**< Convert Policy */ }; } // namespace dynamic_fusion } // namespace experimental diff --git a/arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h b/arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h index 58102d8d88c9d168f46839667c777df805692e49..a98ef0363ee19d6d4a52ccf2c5b292df325184a3 100644 --- a/arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h +++ b/arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h @@ -26,6 +26,7 @@ #include "arm_compute/core/Size2D.h" #include "arm_compute/core/Types.h" + #include namespace arm_compute @@ -55,9 +56,9 @@ public: Size2D dilation() const; private: - Padding2D _pad{}; /**< Padding */ - Size2D _stride{ 1U, 1U }; /**< Stride */ - Size2D _dilation{ 1U, 1U }; /**< Dilation */ + Padding2D _pad{}; /**< Padding */ + Size2D _stride{1U, 1U}; /**< Stride */ + Size2D _dilation{1U, 1U}; /**< Dilation */ }; } // namespace dynamic_fusion } // namespace experimental diff --git a/arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h b/arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h index 6d05e9e4d6ed0ac9a8d44948da6e0ee18a4a6012..c46b25cb5dab44e331827c7a260b874d61148712 100644 --- a/arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h +++ b/arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h @@ -26,6 +26,7 @@ #include "arm_compute/core/Size2D.h" #include "arm_compute/core/Types.h" + #include namespace arm_compute @@ -63,11 +64,11 @@ public: DimensionRoundingType dimension_rounding_type() const; private: - Padding2D _pad{}; /**< Padding */ - Size2D _stride{ 1U, 1U }; /**< Stride */ - Size2D _dilation{ 1U, 1U }; /**< Dilation */ - uint32_t _depth_multiplier{ 1U }; /**< Depth multiplier */ - DimensionRoundingType _dimension_rounding_type{ DimensionRoundingType::FLOOR }; /**< Dimension rounding type */ + Padding2D _pad{}; /**< Padding */ + Size2D _stride{1U, 1U}; /**< Stride */ + Size2D _dilation{1U, 1U}; /**< Dilation */ + uint32_t _depth_multiplier{1U}; /**< Depth multiplier */ + DimensionRoundingType _dimension_rounding_type{DimensionRoundingType::FLOOR}; /**< Dimension rounding type */ }; } // namespace 
dynamic_fusion } // namespace experimental diff --git a/arm_compute/dynamic_fusion/sketch/attributes/MatMulAttributes.h b/arm_compute/dynamic_fusion/sketch/attributes/MatMulAttributes.h new file mode 100644 index 0000000000000000000000000000000000000000..fc512e9ff9148b3a5eb8f257b62acc93b503ee64 --- /dev/null +++ b/arm_compute/dynamic_fusion/sketch/attributes/MatMulAttributes.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_MATMULATTRIBUTES_H +#define ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_MATMULATTRIBUTES_H + +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** Attributes are backend-agnostic parameters (in addition to the input/output tensors) of an operator. 
+ */ + +/** MatMul attributes */ +class MatMulAttributes +{ +public: + /* Get adjoint LHS flag value (transpose LHS before multiplication) */ + bool adj_lhs() const; + + /* Get adjoint RHS flag value (transpose RHS before multiplication) */ + bool adj_rhs() const; + + /* Set adjoint LHS flag value (transpose LHS before multiplication) */ + MatMulAttributes adj_lhs(bool adj_lhs); + + /* Set adjoint RHS flag value (transpose RHS before multiplication) */ + MatMulAttributes adj_rhs(bool adj_rhs); + +private: + bool _adj_lhs{false}; + bool _adj_rhs{false}; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif // ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_MATMULATTRIBUTES_H diff --git a/arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h b/arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h index be30781d86c25845e400a0f1d69e43184e35a388..19d8b96dcf85d5ca974fc971db1c1a2c92a4c30e 100644 --- a/arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h +++ b/arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h @@ -72,8 +72,8 @@ private: PoolingType _pool_type{}; Padding2D _pad{}; Size2D _pool_size{}; - Size2D _stride{ 1U, 1U }; - bool _exclude_padding{ true }; + Size2D _stride{1U, 1U}; + bool _exclude_padding{true}; }; } // namespace dynamic_fusion diff --git a/arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h b/arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h index 8992693cd1fcc3e932ebafb868522d5b895eab95..7410cc7e704e0c913659f7c508e59bb3ce483c77 100644 --- a/arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h +++ b/arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h @@ -27,6 +27,7 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Types.h" + #include namespace arm_compute @@ -75,9 +76,9 @@ public: private: int32_t _output_width{}; int32_t _output_height{}; - InterpolationPolicy _interpolation_policy{ InterpolationPolicy::BILINEAR }; - SamplingPolicy _sampling_policy{ SamplingPolicy::CENTER }; - bool _align_corners{ false }; + InterpolationPolicy _interpolation_policy{InterpolationPolicy::BILINEAR}; + SamplingPolicy _sampling_policy{SamplingPolicy::CENTER}; + bool _align_corners{false}; }; } // namespace dynamic_fusion diff --git a/arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h b/arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h index fc50aa0f683ccfb0f7a492ca35fad1ef3b28bd1f..61654bfa275f6bc94083e423f22b2cfe348f4df3 100644 --- a/arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h +++ b/arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h @@ -51,7 +51,7 @@ public: int axis() const; private: - float _beta{ 1.f }; /**< Scaling factor for the exponent */ + float _beta{1.f}; /**< Scaling factor for the exponent */ bool _is_log_softmax{}; /**< True if operation is log-softmax */ int _axis{}; /**< Axis/Dimension to perform the operation */ }; diff --git a/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h b/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h index 0b6089973448002da8b0985ddc8867e345af43e5..38b350c7eb6bc6c7d257386fdeee7f21228d6c02 100644 --- a/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h +++ b/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h @@ -85,7 +85,7 @@ public: * @return TensorInfo Newly created tensor info */ template - TensorInfo create_tensor_info(TArgs &&... 
args) + TensorInfo create_tensor_info(TArgs &&...args) { auto tensor_info = TensorInfo(std::forward(args)...); register_user_tensor(tensor_info); diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h index 33eded4dff1a486cf13e9229ec6efd8c6bcff1a0..5b6c1b90ab6173ec7861d90d34c71a7b74436c37 100644 --- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h +++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h @@ -65,9 +65,7 @@ public: * * @return Pointer for the destination tensor info */ - static ITensorInfo *create_op(GpuWorkloadSketch &sketch, - ITensorInfo *lhs, - ITensorInfo *rhs); + static ITensorInfo *create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs); /** Check if the operator configuration is supported, irrespective of fusion * * @param[in] context Workload context within which the operator is running @@ -76,18 +74,14 @@ public: * * @return Status */ - static Status is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *lhs, - const ITensorInfo *rhs); + static Status is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs); /** Validate the operator and check if the its configuration is supported and if it can be fused into the workload sketch. * * Parameters are similar to @ref GpuAdd::create_op() * * @return Status */ - static Status validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *rhs, - const ITensorInfo *lhs); + static Status validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *rhs, const ITensorInfo *lhs); }; } // namespace dynamic_fusion } // namespace experimental diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h index 83b004b8b89b663da9c0966ef4d07baabd9e5483..1593cec804ca7a0a1a1450054d71207696f4e9f5 100644 --- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h +++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h @@ -68,9 +68,7 @@ public: * * @return Pointer for the destination tensor info */ - static ITensorInfo *create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - const Attributes &attributes); + static ITensorInfo *create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const Attributes &attributes); /** Check if the operator configuration is supported, irrespective of fusion * * @param[in] context Workload context within which the operator is running @@ -79,18 +77,15 @@ public: * * @return Status */ - static Status is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *src, - const Attributes &attributes); + static Status + is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const Attributes &attributes); /** Validate the operator and check if the its configuration is supported and if it can be fused into the workload sketch. 
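
For context, a sketch of how the simplified single-line GpuAdd entry point above is called; the workload sketch and tensor infos are assumed to have been created and initialised by the caller, so this is an illustration of the call shape rather than a complete program.

#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h"

using namespace arm_compute;
using namespace arm_compute::experimental::dynamic_fusion;

// Fuse an elementwise addition into an existing workload sketch.
// `sketch`, `lhs` and `rhs` are assumed to be valid and already set up.
ITensorInfo *fuse_add(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs)
{
    // The returned pointer describes the destination tensor info created by the operator.
    return GpuAdd::create_op(sketch, lhs, rhs);
}
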
* * Parameters are similar to @ref GpuCast::create_op() * * @return Status */ - static Status validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *src, - const Attributes &attributes); + static Status validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const Attributes &attributes); }; } // namespace dynamic_fusion } // namespace experimental diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuClamp.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuClamp.h index e96251196aedd044a6715da17f68c468ba97d5c1..5dd77bdc8e39ee13cdef938889baf2c2c19a52b3 100644 --- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuClamp.h +++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuClamp.h @@ -62,9 +62,7 @@ public: * * @return Pointer for the destination tensor info */ - static ITensorInfo *create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - const Attributes &attributes); + static ITensorInfo *create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const Attributes &attributes); /** Check if the operator configuration is supported, irrespective of fusion * @@ -74,9 +72,8 @@ public: * * @return Status */ - static Status is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *src, - const Attributes &attributes); + static Status + is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const Attributes &attributes); /** Validate the operator and check if it can be fused into the workload sketch. * @@ -84,9 +81,7 @@ public: * * @return Status */ - static Status validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *src, - const Attributes &attributes); + static Status validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const Attributes &attributes); }; } // namespace dynamic_fusion } // namespace experimental diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h index 612cc83a1f6ba492f5a175859dfd6c07d58ad156..da7e860757d9bc6e893941645ceb89cf06176814 100644 --- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h +++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h @@ -64,11 +64,8 @@ public: * * @return Pointer for the destination tensor info */ - static ITensorInfo *create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - ITensorInfo *wei, - ITensorInfo *bia, - const Attributes &attributes); + static ITensorInfo *create_op( + GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *wei, ITensorInfo *bia, const Attributes &attributes); /** Check if the operator configuration is supported, irrespective of fusion * * @param[in] context Workload context within which the operator is running diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h index a0cb29273013abc13d3928bd9010a452b1184d44..958569efd788d967b9be75eebcc2c9d8a43b627d 100644 --- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h +++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h @@ -63,11 +63,8 @@ public: * * @return Pointer for the destination tensor info */ - static ITensorInfo *create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - ITensorInfo *wei, - ITensorInfo *bia, - const Attributes &attributes); + static ITensorInfo *create_op( + GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *wei, ITensorInfo *bia, const Attributes &attributes); /** Check if the operator 
configuration is supported, irrespective of fusion * diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMatMul.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMatMul.h new file mode 100644 index 0000000000000000000000000000000000000000..6ba5b1db93c875f5e61406278ba1219c9956758b --- /dev/null +++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMatMul.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUMATMUL_H +#define ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUMATMUL_H + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/dynamic_fusion/sketch/attributes/MatMulAttributes.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** Forward declaration */ +class GpuWorkloadContext; +class GpuWorkloadSketch; + +/** Operator backend specific settings +*/ +class GpuMatMulSettings +{ +public: + /** Set N0: number of columns processed by each work-item */ + GpuMatMulSettings &n0(int n0); + /** Get N0: number of columns processed by each work-item */ + int n0() const; + + /** Set M0: number of rows processed by each work-item */ + GpuMatMulSettings &m0(int m0); + /** Get M0: number of rows processed by each work-item */ + int m0() const; + + /** Set K0: number of inner accumulations */ + GpuMatMulSettings &k0(int k0); + /** Get K0: number of inner accumulations */ + int k0() const; + +private: + int _n0{0}; /**< Number of columns processed by each work-item */ + int _m0{0}; /**< Number of rows processed by each work-item */ + int _k0{0}; /**< Number of inner accumulations */ +}; + +/** Operator interface. */ +class GpuMatMul final +{ +public: + /* Attributes are a set of backend-agnostic parameters that define what an operator does */ + using Attributes = MatMulAttributes; + /* Settings are a set of backend-specific parameters that influence the implementation of a operator */ + using Settings = GpuMatMulSettings; + + /* Create an operator and fuse it into the workload sketch. + * @note If @ref validate_op() fails, the creation also fails and may throw an error. + * @note If @ref validate_op() fails, @p sketch remains unchanged and valid. 
+ * + * Valid data type configurations: + * |LHS |RHS |dst | + * |:--------------|:--------------|:--------------| + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | + * + * + * @param[in,out] sketch Workload sketch into which the operator will be fused + * @param[in] lhs Input tensor info for the LHS matrix. Data type supported: F32/F16. Dimensions above 2 are collapsed onto dimension 2 and represent the batch. + * @param[in] rhs Input tensor info for the RHS matrix. Data type supported: same as @p lhs. Dimensions above 2 are collapsed onto dimension 2 and represent the batch. + * @param[in] attributes Operator attributes + * @param[in] settings Operator settings + * + * @return Pointer for the destination tensor info + */ + static ITensorInfo *create_op(GpuWorkloadSketch &sketch, + ITensorInfo *lhs, + ITensorInfo *rhs, + const Attributes &attributes, + const Settings &settings); + + /* Check if the operator configuration is supported, irrespective of fusion + * + * @param[in] context Workload context within which the operator is running + * @param[in] lhs Input tensor info for the LHS matrix. + * @param[in] rhs Input tensor info for the RHS matrix. + * @param[in] attributes Operator attributes + * @param[in] settings Operator settings + * + * @return Status + */ + static Status is_supported_op(const GpuWorkloadContext &context, + const ITensorInfo *lhs, + const ITensorInfo *rhs, + const Attributes &attributes, + const Settings &settings); + + /* Check if the operator configuration is supported and if it can be fused into the workload sketch. + * + * Parameters are similar to @ref GpuMatMul::create_op() + * + * @return Status + */ + static Status validate_op(const GpuWorkloadSketch &sketch, + const ITensorInfo *lhs, + const ITensorInfo *rhs, + const Attributes &attributes, + const Settings &settings); +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif // ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUMATMUL_H diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h index 3e0ebdd96c477c2dce5a0e0d530f3ee8871be3e6..d13e4a3cad8c0265d90cd9e9c1453e2f8eda0c80 100644 --- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h +++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h @@ -62,9 +62,7 @@ public: * * @return Pointer for the destination tensor info */ - static ITensorInfo *create_op(GpuWorkloadSketch &sketch, - ITensorInfo *lhs, - ITensorInfo *rhs); + static ITensorInfo *create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs); /** Check if the operator configuration is supported, irrespective of fusion * @@ -74,9 +72,7 @@ public: * * @return Status */ - static Status is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *lhs, - const ITensorInfo *rhs); + static Status is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs); /** Validate the operator and check if the configuration is supported and if it can be fused into the workload sketch. 
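
To illustrate the new MatMul operator surface introduced above, a minimal sketch that only builds the attribute and settings objects follows; the block sizes 4/4/4 are illustrative, and the workload sketch/context set-up is omitted.

#include "arm_compute/dynamic_fusion/sketch/attributes/MatMulAttributes.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMatMul.h"

using namespace arm_compute::experimental::dynamic_fusion;

int main()
{
    // Backend-agnostic attributes: neither operand is transposed before the multiplication.
    const GpuMatMul::Attributes attributes = MatMulAttributes().adj_lhs(false).adj_rhs(false);

    // Backend-specific settings: rows (M0), columns (N0) and inner accumulations (K0)
    // processed by each work-item.
    GpuMatMul::Settings settings;
    settings.m0(4).n0(4).k0(4);

    // attributes/settings would then be passed, together with a GpuWorkloadSketch and the
    // LHS/RHS ITensorInfo objects, to GpuMatMul::validate_op() / GpuMatMul::create_op().
    return 0;
}
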
* @@ -84,9 +80,7 @@ public: * * @return Status */ - static Status validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *rhs, - const ITensorInfo *lhs); + static Status validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *rhs, const ITensorInfo *lhs); }; } // namespace dynamic_fusion diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h index 06317511cdd197a767715b9193814639003314c4..deb5559b9d98513b2192dd5ec6f7fc0dbcd198bd 100644 --- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h +++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h @@ -56,9 +56,7 @@ public: * @param[in, out] dst Destination tensor info. * If an uninitialized ITensorInfo is passed in, it will be auto-initialized. */ - static void create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - ITensorInfo *dst); + static void create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *dst); /** Check if the operator configuration is supported, irrespective of fusion. * @@ -68,9 +66,7 @@ public: * * @return Status */ - static Status is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *src, - const ITensorInfo *dst); + static Status is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const ITensorInfo *dst); /** Validate the operator and check if the its configuration is supported and if it can be fused into the workload sketch. * @@ -78,9 +74,7 @@ public: * * @return Status */ - static Status validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *src, - const ITensorInfo *dst); + static Status validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const ITensorInfo *dst); }; } // namespace dynamic_fusion diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h index 6e1bcdbbfd26e116ab442608f599016ae4a5f2be..4d2db0e89c09bda606dd43e33172ccc78f972fd5 100644 --- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h +++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUPOOL2D -#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUPOOL2D +#ifndef ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUPOOL2D_H +#define ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUPOOL2D_H #include "arm_compute/core/Error.h" #include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h" @@ -55,8 +55,8 @@ public: GpuPool2dSettings use_inf_as_limit(bool use_inf_as_limit); private: - bool _mixed_precision{ false }; - bool _use_inf_as_limit{ true }; + bool _mixed_precision{false}; + bool _use_inf_as_limit{true}; }; /** Operator interface. 
*/ @@ -83,27 +83,20 @@ public: * * @param[in,out] sketch Workload sketch into which the operator will be fused * @param[in] src Source tensor - * @param[out] dst Destination tensor * @param[in] attributes Operator attributes * @param[in] settings Operator settings */ - static void create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - ITensorInfo *dst, - const Attributes &attributes, - const Settings &settings); + static ITensorInfo * + create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const Attributes &attributes, const Settings &settings); /** Check if the operator configuration is supported, irrespective of fusion * - * @param[in] context Workload context within which the operator is running - * @param[in] src Left hand side tensor info. Data types supported: F16/F32. - * @param[out] dst Destination tensor info. Data types supported: F16/F32. - * If an uninitialized ITensorInfo is passed in, it will be auto-initialized - * @param[in] attributes Operator attributes - * @param[in] settings Operator settings + * @param[in] context Workload context within which the operator is running + * @param[in] src Left hand side tensor info. Data types supported: F16/F32. + * @param[in] attributes Operator attributes + * @param[in] settings Operator settings */ static Status is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, - const ITensorInfo *dst, const Attributes &attributes, const Settings &settings); /** Validate the operator and check if it can be fused into the workload sketch. @@ -111,11 +104,10 @@ public: */ static Status validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, - const ITensorInfo *dst, const Attributes &attributes, const Settings &settings); }; } // namespace dynamic_fusion } // namespace experimental } // namespace arm_compute -#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUPOOL2D */ +#endif // ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUPOOL2D_H diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuReshape.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuReshape.h index 0f50127199ed888e89cc2624b8724023e3aa6a1d..dc194fcadbcab4ce326c4ea31e6380ef018a1cd4 100644 --- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuReshape.h +++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuReshape.h @@ -62,9 +62,7 @@ public: * * @return Pointer for the destination tensor info */ - static ITensorInfo *create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - const Attributes &attributes); + static ITensorInfo *create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const Attributes &attributes); /** Check if the operator configuration is supported, irrespective of fusion * * @param[in] context Workload context within which the operator is running @@ -73,18 +71,15 @@ public: * * @return Status */ - static Status is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *src, - const Attributes &attributes); + static Status + is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const Attributes &attributes); /** Validate the operator and check if the its configuration is supported and if it can be fused into the workload sketch. 
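
A sketch of the updated GpuPool2d call shape shown above, where the destination tensor info is now created by the operator and returned rather than passed in. All arguments are assumed to have been initialised by the caller, and the usual Attributes/Settings aliases of the operator class are used.

#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h"

using namespace arm_compute;
using namespace arm_compute::experimental::dynamic_fusion;

// Fuse a 2D pooling operator into an existing sketch and route its result to a user tensor.
void fuse_pool2d(GpuWorkloadSketch           &sketch,
                 ITensorInfo                 *src,
                 ITensorInfo                 *dst_user,
                 const GpuPool2d::Attributes &attributes,
                 const GpuPool2d::Settings   &settings)
{
    // The operator now creates and returns the destination tensor info itself.
    ITensorInfo *dst = GpuPool2d::create_op(sketch, src, attributes, settings);

    // Mark the pooled result as a workload output, written to the user-provided info.
    GpuOutput::create_op(sketch, dst, dst_user);
}
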
* * Parameters are similar to @ref GpuReshape::create_op() * * @return Status */ - static Status validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *src, - const Attributes &attributes); + static Status validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const Attributes &attributes); }; } // namespace dynamic_fusion diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuResize.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuResize.h index 2579d10f5bec8e0e8793feb0b78cc6bd329b60fe..e2ece80a1d97c7451f436109c3e39bc496b8f255 100644 --- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuResize.h +++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuResize.h @@ -67,9 +67,7 @@ public: * * @return Pointer for the destination tensor info */ - static ITensorInfo *create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - const Attributes &attributes); + static ITensorInfo *create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const Attributes &attributes); /** Check if the operator configuration is supported, irrespective of fusion * * @param[in] context Workload context within which the operator is running @@ -78,18 +76,15 @@ public: * * @return Status */ - static Status is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *src, - const Attributes &attributes); + static Status + is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const Attributes &attributes); /** Validate the operator and check if the its configuration is supported and if it can be fused into the workload sketch. * * Parameters are similar to @ref GpuResize::create_op() * * @return Status */ - static Status validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *src, - const Attributes &attributes); + static Status validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const Attributes &attributes); }; } // namespace dynamic_fusion diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.h index 616a61e614b93ef4ee4cb27261b3b64bafeb11f4..798b84b9066e74e075bfc7c2eac105b211d0eebd 100644 --- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.h +++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.h @@ -59,8 +59,7 @@ public: * * @return Pointer for the destination tensor info */ - static ITensorInfo *create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src); + static ITensorInfo *create_op(GpuWorkloadSketch &sketch, ITensorInfo *src); /** Check if the operator configuration is supported, irrespective of fusion * @@ -69,8 +68,7 @@ public: * * @return Status */ - static Status is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *src); + static Status is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src); /** Validate the operator and check if it can be fused into the workload sketch. 
* @@ -78,8 +76,7 @@ public: * * @return Status */ - static Status validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *src); + static Status validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src); }; } // namespace dynamic_fusion diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.h index e86ef91e6a96feee866cb2558c8a7756c8afda4e..66c2d773105a4ca29ea700505670835f0d7ed9b0 100644 --- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.h +++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.h @@ -62,10 +62,7 @@ public: * @param[in] dst Destination tensor info. * @param[in] attributes Operator attributes */ - static void create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - ITensorInfo *dst, - const Attributes &attributes); + static void create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *dst, const Attributes &attributes); /** Check if the operator configuration is supported, irrespective of fusion * * @param[in] context Workload context within which the operator is running diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h index 6f8c2d0b769029b59b85168b382d3c6274619ed7..2d9255fff24f18901a2295b5c1df8713b47fe519 100644 --- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h +++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h @@ -65,9 +65,7 @@ public: * * @return Pointer for the destination tensor info */ - static ITensorInfo *create_op(GpuWorkloadSketch &sketch, - ITensorInfo *lhs, - ITensorInfo *rhs); + static ITensorInfo *create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs); /** Check if the operator configuration is supported, irrespective of fusion * @@ -77,9 +75,7 @@ public: * * @return Status */ - static Status is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *lhs, - const ITensorInfo *rhs); + static Status is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs); /** Validate the operator and check if its configuration is supported and if it can be fused into the workload sketch. 
* @@ -87,9 +83,7 @@ public: * * @return Status */ - static Status validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *rhs, - const ITensorInfo *lhs); + static Status validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *rhs, const ITensorInfo *lhs); }; } // namespace dynamic_fusion } // namespace experimental diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuTanh.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuTanh.h index 08b5032e93b461eec83a03f470963dc7ede3c3ba..9c0ce6de028071316ebc30fd4aae9bf3b41362f1 100644 --- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuTanh.h +++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuTanh.h @@ -59,8 +59,7 @@ public: * * @return Pointer for the destination tensor info */ - static ITensorInfo *create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src); + static ITensorInfo *create_op(GpuWorkloadSketch &sketch, ITensorInfo *src); /** Check if the operator configuration is supported, irrespective of fusion * @@ -69,8 +68,7 @@ public: * * @return Status */ - static Status is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *src); + static Status is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src); /** Validate the operator and check if it can be fused into the workload sketch. * @@ -78,8 +76,7 @@ public: * * @return Status */ - static Status validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *src); + static Status validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src); }; } // namespace dynamic_fusion diff --git a/arm_compute/function_info/ActivationLayerInfo.h b/arm_compute/function_info/ActivationLayerInfo.h index 84e962cb3a229f76a33e829b9848ab51a30a22d5..195b67cf990f09db2bd758840fa578c502cc061d 100644 --- a/arm_compute/function_info/ActivationLayerInfo.h +++ b/arm_compute/function_info/ActivationLayerInfo.h @@ -39,17 +39,17 @@ enum class ActivationFunction RELU, /**< Rectifier ( \f$ f(x) = max(0,x) \f$ ) */ BOUNDED_RELU, /**< Upper Bounded Rectifier ( \f$ f(x) = min(a, max(0,x)) \f$ ) */ LU_BOUNDED_RELU, /**< Lower and Upper Bounded Rectifier ( \f$ f(x) = min(a, max(b,x)) \f$ ) */ - LEAKY_RELU, /**< Leaky Rectifier ( \f$ f(x) = \begin{cases} \alpha x & \quad \text{if } x \text{ < 0}\\ x & \quad \text{if } x \geq \text{ 0 } \end{cases} \f$ ) */ - SOFT_RELU, /**< Soft Rectifier ( \f$ f(x)= log(1+e^x) \f$ ) */ - ELU, /**< Exponential Linear Unit ( \f$ f(x) = \begin{cases} \alpha (exp(x) - 1) & \quad \text{if } x \text{ < 0}\\ x & \quad \text{if } x \geq \text{ 0 } \end{cases} \f$ ) */ - ABS, /**< Absolute ( \f$ f(x)= |x| \f$ ) */ - SQUARE, /**< Square ( \f$ f(x)= x^2 \f$ )*/ - SQRT, /**< Square root ( \f$ f(x) = \sqrt{x} \f$ )*/ - LINEAR, /**< Linear ( \f$ f(x)= ax + b \f$ ) */ - IDENTITY, /**< Identity ( \f$ f(x)= x \f$ ) */ - HARD_SWISH, /**< Hard-swish ( \f$ f(x) = (x \text{ReLU6}(x+3))/6 = x \min(\max(0,x+3),6)/6 \f$ ) */ - SWISH, /**< Swish ( \f$ f(x) = \frac{x}{1 + e^{-ax}} = x \text{logistic}(ax) \f$ ) */ - GELU /**< GELU ( \f$ f(x) = x * 1/2 * 1 + erf(x / \sqrt{2}) \f$ ) */ + LEAKY_RELU, /**< Leaky Rectifier ( \f$ f(x) = \begin{cases} \alpha x & \quad \text{if } x \text{ < 0}\\ x & \quad \text{if } x \geq \text{ 0 } \end{cases} \f$ ) */ + SOFT_RELU, /**< Soft Rectifier ( \f$ f(x)= log(1+e^x) \f$ ) */ + ELU, /**< Exponential Linear Unit ( \f$ f(x) = \begin{cases} \alpha (exp(x) - 1) & \quad \text{if } x \text{ < 0}\\ x & \quad \text{if } x \geq \text{ 0 } \end{cases} \f$ ) */ + ABS, /**< Absolute ( \f$ f(x)= |x| \f$ ) */ + SQUARE, 
/**< Square ( \f$ f(x)= x^2 \f$ )*/ + SQRT, /**< Square root ( \f$ f(x) = \sqrt{x} \f$ )*/ + LINEAR, /**< Linear ( \f$ f(x)= ax + b \f$ ) */ + IDENTITY, /**< Identity ( \f$ f(x)= x \f$ ) */ + HARD_SWISH, /**< Hard-swish ( \f$ f(x) = (x \text{ReLU6}(x+3))/6 = x \min(\max(0,x+3),6)/6 \f$ ) */ + SWISH, /**< Swish ( \f$ f(x) = \frac{x}{1 + e^{-ax}} = x \text{logistic}(ax) \f$ ) */ + GELU /**< GELU ( \f$ f(x) = x * 1/2 * 1 + erf(x / \sqrt{2}) \f$ ) */ }; /** Activation Layer Information class */ class ActivationLayerInfo @@ -68,8 +68,7 @@ public: * (@ref ActivationFunction::BOUNDED_RELU, @ref ActivationFunction::LU_BOUNDED_RELU, @ref ActivationFunction::LINEAR, @ref ActivationFunction::TANH). * @param[in] b (Optional) The beta parameter used by some activation functions (@ref ActivationFunction::LINEAR, @ref ActivationFunction::LU_BOUNDED_RELU, @ref ActivationFunction::TANH). */ - ActivationLayerInfo(ActivationFunction f, float a = 0.0f, float b = 0.0f) - : _act(f), _a(a), _b(b), _enabled(true) + ActivationLayerInfo(ActivationFunction f, float a = 0.0f, float b = 0.0f) : _act(f), _a(a), _b(b), _enabled(true) { } /** Get the type of activation function */ @@ -104,10 +103,10 @@ public: } #endif // __aarch64__ private: - ActivationFunction _act = { ActivationLayerInfo::ActivationFunction::IDENTITY }; + ActivationFunction _act = {ActivationLayerInfo::ActivationFunction::IDENTITY}; float _a = {}; float _b = {}; - bool _enabled = { false }; + bool _enabled = {false}; #ifdef __aarch64__ LookupTable256 _lut = {}; diff --git a/arm_compute/function_info/ConvolutionInfo.h b/arm_compute/function_info/ConvolutionInfo.h index c27dc523c868ddf4abb83eb471ab12a9431e6115..4830cae1374529ea7352013b540321ff719f408a 100644 --- a/arm_compute/function_info/ConvolutionInfo.h +++ b/arm_compute/function_info/ConvolutionInfo.h @@ -33,14 +33,18 @@ namespace arm_compute struct ConvolutionInfo { ConvolutionInfo() = default; - ConvolutionInfo(const PadStrideInfo &pad_stride_info, unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) + ConvolutionInfo(const PadStrideInfo &pad_stride_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) : pad_stride_info(pad_stride_info), depth_multiplier(depth_multiplier), act_info(act_info), dilation(dilation) { } - PadStrideInfo pad_stride_info{}; /**< Convolution info (Pads, strides,...) */ - unsigned int depth_multiplier{ 1 }; /**< Multiplier to apply to input's depth to retrieve the output depth. Defaults to 1 */ - ActivationLayerInfo act_info{}; /**< Fused activation to apply after convolution. */ - Size2D dilation{ Size2D(1, 1) }; /**< Dilation, in elements, across x and y. Defaults to (1, 1). */ + PadStrideInfo pad_stride_info{}; /**< Convolution info (Pads, strides,...) */ + unsigned int depth_multiplier{ + 1}; /**< Multiplier to apply to input's depth to retrieve the output depth. Defaults to 1 */ + ActivationLayerInfo act_info{}; /**< Fused activation to apply after convolution. */ + Size2D dilation{Size2D(1, 1)}; /**< Dilation, in elements, across x and y. Defaults to (1, 1). 
*/ }; } // namespace arm_compute #endif /* ACL_ARM_COMPUTE_FUNCTION_INFO_CONVOLUTIONINFO */ diff --git a/arm_compute/function_info/FullyConnectedLayerInfo.h b/arm_compute/function_info/FullyConnectedLayerInfo.h index 5f5578eaddb01e65e06fceaff2ade332db51dcda..e65daeb2d407492a0cb4c97267f0496e79395062 100644 --- a/arm_compute/function_info/FullyConnectedLayerInfo.h +++ b/arm_compute/function_info/FullyConnectedLayerInfo.h @@ -35,13 +35,13 @@ struct FullyConnectedLayerInfo /* Fused-activation parameters */ ActivationLayerInfo activation_info{}; /**< Fused activation to apply after the matrix multiplication. */ /* Information about weights */ - DataLayout weights_trained_layout{ DataLayout::NCHW }; /**< Layout that the weights have been trained with. */ - bool transpose_weights{ true }; /**< Transpose weights if true. */ - bool are_weights_reshaped{ false }; /**< @deprecated Reshape the weights tensor if false. */ - bool retain_internal_weights{ false }; /**< Retain internal reshaped weights. */ - bool enable_fast_math{ false }; /**< Enable fast math computation. */ + DataLayout weights_trained_layout{DataLayout::NCHW}; /**< Layout that the weights have been trained with. */ + bool transpose_weights{true}; /**< Transpose weights if true. */ + bool are_weights_reshaped{false}; /**< @deprecated Reshape the weights tensor if false. */ + bool retain_internal_weights{false}; /**< Retain internal reshaped weights. */ + bool enable_fast_math{false}; /**< Enable fast math computation. */ /* Other parameters */ - bool fp_mixed_precision{ false }; /**< Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy. */ + bool fp_mixed_precision{false}; /**< Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy. */ /** Sets the weights trained data layout * diff --git a/arm_compute/function_info/GEMMInfo.h b/arm_compute/function_info/GEMMInfo.h index daaf86243afe190b2a50f5b69580e7ab7be82a60..a827c79fda13ad6ebc8fc9cb9e3d11eaa0cf7e9f 100644 --- a/arm_compute/function_info/GEMMInfo.h +++ b/arm_compute/function_info/GEMMInfo.h @@ -21,12 +21,12 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
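
For illustration, constructing the ConvolutionInfo aggregate shown above with a fused bounded ReLU; the stride, padding, depth multiplier and dilation values are arbitrary examples rather than anything prescribed by the change.

#include "arm_compute/core/Types.h"
#include "arm_compute/function_info/ConvolutionInfo.h"

using namespace arm_compute;

int main()
{
    // 3x3-style convolution descriptor: stride 1, pad 1, depth multiplier 1,
    // fused ReLU6 activation and no dilation.
    const PadStrideInfo       pad_stride(1 /* stride_x */, 1 /* stride_y */, 1 /* pad_x */, 1 /* pad_y */);
    const ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f);
    const ConvolutionInfo     info(pad_stride, 1 /* depth_multiplier */, act, Size2D(1, 1));

    return static_cast<int>(info.depth_multiplier); // 1
}
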
*/ -#ifndef ACL_ARM_COMPUTE_FUNCTION_INFO_GEMMINFO -#define ACL_ARM_COMPUTE_FUNCTION_INFO_GEMMINFO +#ifndef ACL_ARM_COMPUTE_FUNCTION_INFO_GEMMINFO_H +#define ACL_ARM_COMPUTE_FUNCTION_INFO_GEMMINFO_H #include "arm_compute/core/CoreTypes.h" -#include "arm_compute/core/experimental/IPostOp.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include namespace arm_compute @@ -44,17 +44,22 @@ enum class GEMMLowpOutputStageType /** GEMMLowp output stage info */ struct GEMMLowpOutputStageInfo { - GEMMLowpOutputStageType type{ GEMMLowpOutputStageType::NONE }; /**< GEMMLowp output stage type */ - int32_t gemmlowp_offset{ 0 }; /**< GEMMLowp output stage offset used for quantizing to QASYMM8 */ - int32_t gemmlowp_multiplier{ 0 }; /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */ - int32_t gemmlowp_shift{ 0 }; /**< GEMMLowp output stage shift used for quantizing to uint8 */ - int32_t gemmlowp_min_bound{ std::numeric_limits::lowest() }; /**< GEMMLowp min value used to saturate down the output result before converting back to QASYMM8 */ - int32_t gemmlowp_max_bound{ std::numeric_limits::max() }; /**< GEMMLowp max value used to saturate down the output result before converting back to QASYMM8 */ - std::vector gemmlowp_multipliers{}; /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */ - std::vector gemmlowp_shifts{}; /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */ - float gemmlowp_real_multiplier{ 0 }; /**< GEMMLowp output stage real multiplier used for quantizing to QASYMM8 */ - bool is_quantized_per_channel{ false }; /**< GEMMLowp quantized per-channel flag */ - DataType output_data_type{ DataType::UNKNOWN }; /**< Output tensor data type to use if the output is not initialized */ + GEMMLowpOutputStageType type{GEMMLowpOutputStageType::NONE}; /**< GEMMLowp output stage type */ + int32_t gemmlowp_offset{0}; /**< GEMMLowp output stage offset used for quantizing to QASYMM8 */ + int32_t gemmlowp_multiplier{0}; /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */ + int32_t gemmlowp_shift{0}; /**< GEMMLowp output stage shift used for quantizing to uint8 */ + int32_t gemmlowp_min_bound{ + std::numeric_limits:: + lowest()}; /**< GEMMLowp min value used to saturate down the output result before converting back to QASYMM8 */ + int32_t gemmlowp_max_bound{ + std::numeric_limits:: + max()}; /**< GEMMLowp max value used to saturate down the output result before converting back to QASYMM8 */ + std::vector gemmlowp_multipliers{}; /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */ + std::vector gemmlowp_shifts{}; /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */ + float gemmlowp_real_multiplier{0}; /**< GEMMLowp output stage real multiplier used for quantizing to QASYMM8 */ + bool is_quantized_per_channel{false}; /**< GEMMLowp quantized per-channel flag */ + DataType output_data_type{ + DataType::UNKNOWN}; /**< Output tensor data type to use if the output is not initialized */ }; /** GEMM information class. This class stores the necessary information to compute GEMM functions * @@ -79,7 +84,6 @@ public: _pretranspose_A(false), _pretranspose_B(false), _activation_info(), - _post_ops(), _fixed_format(false), _weight_format(arm_compute::WeightFormat::UNSPECIFIED) { @@ -99,14 +103,24 @@ public: * @param[in] fast_math (Optional) Use a data type of shorter width to improve performance * @param[in] broadcast_bias (Optional) Broadcast the shape of the bias tensor from a vector to a matrix. 
* @param[in] activation_info (Optional) Activation to apply after the matrix multiplication - * @param[in] post_ops (Optional) A sequence of post operations that are performed after the main operation. * @param[in] fixed_format (Optional) Specify the selection of fixed format kernels for variable weights support in GEMM. These kernels expect the weights tensor to be in amemory format that is fixed by the kernel itself. For more information, see arm_compute::WeightFormat. * @param[in] weight_format (Optional) arm_gemm:WeightFormat enumeration requested by the user. Default is arm_compute::WeightFormat::UNSPECIFIED. + * @param[in] pretranspose_B (Optional) Pretranspose matrix B (transposition of its lowest 2 dimensions), in addition to and before, any further transformations of B */ - GEMMInfo(bool is_a_reshaped, bool is_b_reshaped, bool reshape_b_only_on_first_run, int depth_output_gemm3d = 0, bool reinterpret_input_as_3d = false, bool retain_internal_weights = false, - GEMMLowpOutputStageInfo gemmlowp_output_stage = GEMMLowpOutputStageInfo(), bool fp_mixed_precision = false, bool fast_math = false, bool broadcast_bias = false, - const ActivationLayerInfo &activation_info = ActivationLayerInfo(), const experimental::PostOpList &post_ops = experimental::PostOpList(), - bool fixed_format = false, arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED) noexcept + GEMMInfo(bool is_a_reshaped, + bool is_b_reshaped, + bool reshape_b_only_on_first_run, + int depth_output_gemm3d = 0, + bool reinterpret_input_as_3d = false, + bool retain_internal_weights = false, + GEMMLowpOutputStageInfo gemmlowp_output_stage = GEMMLowpOutputStageInfo(), + bool fp_mixed_precision = false, + bool fast_math = false, + bool broadcast_bias = false, + const ActivationLayerInfo &activation_info = ActivationLayerInfo(), + bool fixed_format = false, + arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED, + bool pretranspose_B = false) noexcept : _is_a_reshaped(is_a_reshaped), _is_b_reshaped(is_b_reshaped), _reshape_b_only_on_first_run(reshape_b_only_on_first_run), @@ -118,9 +132,8 @@ public: _fp_mixed_precision(fp_mixed_precision), _broadcast_bias(broadcast_bias), _pretranspose_A(false), - _pretranspose_B(false), + _pretranspose_B(pretranspose_B), _activation_info(activation_info), - _post_ops(post_ops), _fixed_format(fixed_format), _weight_format(weight_format) { @@ -240,6 +253,8 @@ public: _pretranspose_A = flag; } /** Flag which specifies whether b should be pre-transposed if supported. + * More concretely, the "pre-transpose" is the transposition of the b tensor's lowest 2 dimensions + * If specified true, this pre-transpose will occur in addition to and before, any further transformations of the b matrix * * @return True if b should be pre-transposed else false. */ @@ -271,22 +286,6 @@ public: { _activation_info = activation_info; } - /** Post operations to apply after the matrix multiplication - * - * @return experimental::PostOpList object - */ - const experimental::PostOpList &post_ops() const - { - return _post_ops; - } - /** Set post ops - * - * @param[in] post_ops experimental::PostOpList object to set - */ - void set_post_ops(const experimental::PostOpList &post_ops) - { - _post_ops = post_ops; - } /** Flag which specifies if the GEMM operation is running fixed-format kernels. * * @return True if the GEMM operation is running fixed-format kernel else false. 
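
A short sketch of the new pretranspose_B control described above: it can be requested either through the existing setter or through the extended constructor's trailing argument. All other constructor arguments keep their documented defaults here and are spelled out only to reach the trailing flag.

#include "arm_compute/function_info/GEMMInfo.h"

using namespace arm_compute;

int main()
{
    // Default-constructed info, then request pre-transposition of B's lowest two
    // dimensions via the setter.
    GEMMInfo info{};
    info.set_pretranspose_B(true);

    // Equivalent request through the extended constructor: every optional argument keeps
    // its default value except the trailing pretranspose_B flag.
    const GEMMInfo info2(false /* is_a_reshaped */, false /* is_b_reshaped */,
                         false /* reshape_b_only_on_first_run */, 0, false, false,
                         GEMMLowpOutputStageInfo(), false, false, false,
                         ActivationLayerInfo(), false /* fixed_format */,
                         WeightFormat::UNSPECIFIED, true /* pretranspose_B */);

    return info.pretranspose_B() && info2.pretranspose_B() ? 0 : 1;
}
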
@@ -320,22 +319,21 @@ public: } private: - bool _is_a_reshaped; - bool _is_b_reshaped; - bool _reshape_b_only_on_first_run; - int _depth_output_gemm3d; - bool _reinterpret_input_as_3d; - bool _retain_internal_weights; - GEMMLowpOutputStageInfo _gemmlowp_output_stage; - bool _fast_math; - bool _fp_mixed_precision; - bool _broadcast_bias; - bool _pretranspose_A; - bool _pretranspose_B; - ActivationLayerInfo _activation_info; - experimental::PostOpList _post_ops; - bool _fixed_format; - arm_compute::WeightFormat _weight_format; + bool _is_a_reshaped; + bool _is_b_reshaped; + bool _reshape_b_only_on_first_run; + int _depth_output_gemm3d; + bool _reinterpret_input_as_3d; + bool _retain_internal_weights; + GEMMLowpOutputStageInfo _gemmlowp_output_stage; + bool _fast_math; + bool _fp_mixed_precision; + bool _broadcast_bias; + bool _pretranspose_A; + bool _pretranspose_B; + ActivationLayerInfo _activation_info; + bool _fixed_format; + arm_compute::WeightFormat _weight_format; }; } //namespace arm_compute -#endif /* ACL_ARM_COMPUTE_FUNCTION_INFO_GEMMINFO */ +#endif // ACL_ARM_COMPUTE_FUNCTION_INFO_GEMMINFO_H diff --git a/arm_compute/function_info/MatMulInfo.h b/arm_compute/function_info/MatMulInfo.h index cd9ef1f4d957766c9c9a87450a27403210b0ef56..fc73efb44a76dcd0a398a9eef7a7fb61fde3ef65 100644 --- a/arm_compute/function_info/MatMulInfo.h +++ b/arm_compute/function_info/MatMulInfo.h @@ -55,8 +55,8 @@ public: } private: - bool _adj_lhs{ false }; - bool _adj_rhs{ false }; + bool _adj_lhs{false}; + bool _adj_rhs{false}; }; } // namespace arm_compute #endif /* ACL_ARM_COMPUTE_FUNCTION_INFO_MATMULINFO */ diff --git a/arm_compute/graph/DataLayerVisitor.h b/arm_compute/graph/DataLayerVisitor.h index ac7f1c84ee64396a07a22d4dfb1092ddbb339865..11d9f1ddc97c43d6a49e4bf8b71fcd31c7a0a1f1 100644 --- a/arm_compute/graph/DataLayerVisitor.h +++ b/arm_compute/graph/DataLayerVisitor.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_GRAPH_DATALAYERPRINTER_H -#define ARM_COMPUTE_GRAPH_DATALAYERPRINTER_H +#ifndef ACL_ARM_COMPUTE_GRAPH_DATALAYERVISITOR_H +#define ACL_ARM_COMPUTE_GRAPH_DATALAYERVISITOR_H #include "arm_compute/graph/IGraphPrinter.h" #include "arm_compute/graph/INodeVisitor.h" @@ -48,7 +48,6 @@ public: void visit(ConvolutionLayerNode &n) override; void visit(DepthwiseConvolutionLayerNode &n) override; void visit(FusedConvolutionBatchNormalizationNode &n) override; - void visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n) override; void visit(FusedDepthwiseConvolutionBatchNormalizationNode &n) override; void visit(OutputNode &n) override; @@ -59,4 +58,4 @@ private: }; } // namespace graph } // namespace arm_compute -#endif /* ARM_COMPUTE_GRAPH_DATALAYERPRINTER_H */ +#endif // ACL_ARM_COMPUTE_GRAPH_DATALAYERVISITOR_H diff --git a/arm_compute/graph/Edge.h b/arm_compute/graph/Edge.h index 5e81b9c52f85ad8b1731fdfae5845a85822f8ff6..7f5075d88526f56a012d88331c11126a341adde3 100644 --- a/arm_compute/graph/Edge.h +++ b/arm_compute/graph/Edge.h @@ -48,8 +48,18 @@ public: * @param[in] consumer_idx Consumer node input index * @param[in] tensor Tensor associated with the edge */ - Edge(EdgeID id, INode *producer, unsigned int producer_idx, INode *consumer, unsigned int consumer_idx, Tensor *tensor) - : _id(id), _producer(producer), _consumer(consumer), _producer_idx(producer_idx), _consumer_idx(consumer_idx), _tensor(tensor) + Edge(EdgeID id, + INode *producer, + unsigned int producer_idx, + INode *consumer, + unsigned int consumer_idx, + Tensor *tensor) + : _id(id), + _producer(producer), + _consumer(consumer), + _producer_idx(producer_idx), + _consumer_idx(consumer_idx), + _tensor(tensor) { } diff --git a/arm_compute/graph/Graph.h b/arm_compute/graph/Graph.h index 806d84c3fdb9daa4a50abc272a3928546ea4ee64..e6e173f5fa3371781911df95b1419d59c6e40086 100644 --- a/arm_compute/graph/Graph.h +++ b/arm_compute/graph/Graph.h @@ -79,7 +79,7 @@ public: * @return ID of the node */ template - NodeID add_node(Ts &&... args); + NodeID add_node(Ts &&...args); /** Remove the node with the given ID * * @param[in] nid ID of the node to remove @@ -221,17 +221,17 @@ private: TensorID create_tensor(const TensorDescriptor &desc = TensorDescriptor()); private: - GraphID _id = GraphID(0); /**< Graph id */ - std::string _name = {}; /**< Graph name */ - std::vector> _nodes = {}; /**< Graph nodes */ - std::vector> _edges = {}; /**< Graph edges */ - std::vector> _tensors = {}; /**< Graph tensors */ + GraphID _id = GraphID(0); /**< Graph id */ + std::string _name = {}; /**< Graph name */ + std::vector> _nodes = {}; /**< Graph nodes */ + std::vector> _edges = {}; /**< Graph edges */ + std::vector> _tensors = {}; /**< Graph tensors */ std::map> _tagged_nodes = {}; /**< Graph nodes map with the node type as key */ - arm_compute::Mutex _mtx = {}; /**< Mutex used for graph construction */ + arm_compute::Mutex _mtx = {}; /**< Mutex used for graph construction */ }; template -inline NodeID Graph::add_node(Ts &&... args) +inline NodeID Graph::add_node(Ts &&...args) { arm_compute::lock_guard lock(_mtx); @@ -245,7 +245,7 @@ inline NodeID Graph::add_node(Ts &&... 
args) _tagged_nodes[node->type()].push_back(nid); // Associate a new tensor with each output - for(auto &output : node->_outputs) + for (auto &output : node->_outputs) { output = create_tensor(); } diff --git a/arm_compute/graph/GraphBuilder.h b/arm_compute/graph/GraphBuilder.h index cb88c0e7aa5ce6bc23ed5d2d370c21a1417d5de6..118d06bddada680f534659d16146f0ab6178ef77 100644 --- a/arm_compute/graph/GraphBuilder.h +++ b/arm_compute/graph/GraphBuilder.h @@ -51,7 +51,8 @@ public: * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_const_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor = nullptr); + static NodeID + add_const_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor = nullptr); /** Adds an input layer node to the graph * * @param[in] g Graph to add the node to @@ -61,7 +62,8 @@ public: * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_input_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor = nullptr); + static NodeID + add_input_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor = nullptr); /** Adds an output layer node to the graph * * @param[in] g Graph to add the node to @@ -71,7 +73,8 @@ public: * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_output_node(Graph &g, NodeParams params, NodeIdxPair input, ITensorAccessorUPtr accessor = nullptr); + static NodeID + add_output_node(Graph &g, NodeParams params, NodeIdxPair input, ITensorAccessorUPtr accessor = nullptr); /** Adds an activation layer node to the graph * * @param[in] g Graph to add the node to @@ -82,7 +85,10 @@ public: * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_activation_node(Graph &g, NodeParams params, NodeIdxPair input, ActivationLayerInfo act_info, + static NodeID add_activation_node(Graph &g, + NodeParams params, + NodeIdxPair input, + ActivationLayerInfo act_info, const QuantizationInfo &out_quant_info = QuantizationInfo()); /** Adds an activation layer node to the graph * @@ -96,7 +102,11 @@ public: * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_arg_min_max_node(Graph &g, NodeParams params, NodeIdxPair input, ReductionOperation op, unsigned int axis, + static NodeID add_arg_min_max_node(Graph &g, + NodeParams params, + NodeIdxPair input, + ReductionOperation op, + unsigned int axis, DataType out_data_type = DataType::UNKNOWN, const QuantizationInfo &out_quant_info = QuantizationInfo()); /** Adds a batch normalization layer node to the graph @@ -112,9 +122,14 @@ public: * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_batch_normalization_node(Graph &g, NodeParams params, NodeIdxPair input, float epsilon, - ITensorAccessorUPtr mean_accessor = nullptr, ITensorAccessorUPtr var_accessor = nullptr, - ITensorAccessorUPtr beta_accessor = nullptr, ITensorAccessorUPtr gamma_accessor = nullptr); + static NodeID add_batch_normalization_node(Graph &g, + NodeParams params, + NodeIdxPair input, + float epsilon, + ITensorAccessorUPtr mean_accessor = nullptr, + ITensorAccessorUPtr var_accessor = nullptr, + ITensorAccessorUPtr beta_accessor = nullptr, + ITensorAccessorUPtr gamma_accessor = nullptr); /** Adds a bounding box transform layer node to the graph * * @param[in] g Graph to add the node to @@ 
-125,7 +140,8 @@ public: * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_bounding_box_transform_node(Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair deltas, BoundingBoxTransformInfo info); + static NodeID add_bounding_box_transform_node( + Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair deltas, BoundingBoxTransformInfo info); /** Adds an channel shuffle layer node to the graph * * @param[in] g Graph to add the node to @@ -154,10 +170,17 @@ public: * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_convolution_node(Graph &g, NodeParams params, NodeIdxPair input, - Size2D kernel_spatial_extend, unsigned int depth, PadStrideInfo conv_info, unsigned int num_groups = 1, - ConvolutionMethod method = ConvolutionMethod::Default, FastMathHint fast_math_hint = FastMathHint::Disabled, - ITensorAccessorUPtr weights_accessor = nullptr, ITensorAccessorUPtr bias_accessor = nullptr, + static NodeID add_convolution_node(Graph &g, + NodeParams params, + NodeIdxPair input, + Size2D kernel_spatial_extend, + unsigned int depth, + PadStrideInfo conv_info, + unsigned int num_groups = 1, + ConvolutionMethod method = ConvolutionMethod::Default, + FastMathHint fast_math_hint = FastMathHint::Disabled, + ITensorAccessorUPtr weights_accessor = nullptr, + ITensorAccessorUPtr bias_accessor = nullptr, const QuantizationInfo &weights_quant_info = QuantizationInfo(), const QuantizationInfo &out_quant_info = QuantizationInfo()); /** Adds a deconvolution layer node to the graph @@ -173,9 +196,14 @@ public: * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_deconvolution_node(Graph &g, NodeParams params, NodeIdxPair input, - Size2D kernel_spatial_extend, unsigned int depth, PadStrideInfo deconv_info, - ITensorAccessorUPtr weights_accessor = nullptr, ITensorAccessorUPtr bias_accessor = nullptr); + static NodeID add_deconvolution_node(Graph &g, + NodeParams params, + NodeIdxPair input, + Size2D kernel_spatial_extend, + unsigned int depth, + PadStrideInfo deconv_info, + ITensorAccessorUPtr weights_accessor = nullptr, + ITensorAccessorUPtr bias_accessor = nullptr); /** Adds a depth concatenate node to the graph * * @param[in] g Graph to add the node to @@ -185,7 +213,10 @@ public: * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_concatenate_node(Graph &g, NodeParams params, const std::vector &inputs, const descriptors::ConcatLayerDescriptor &concat_descriptor); + static NodeID add_concatenate_node(Graph &g, + NodeParams params, + const std::vector &inputs, + const descriptors::ConcatLayerDescriptor &concat_descriptor); /** Adds an depth to space layer node to the graph * * @param[in] g Graph to add the node to @@ -212,11 +243,18 @@ public: * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_depthwise_convolution_node(Graph &g, NodeParams params, NodeIdxPair input, - Size2D kernel_spatial_extend, PadStrideInfo conv_info, int depth_multiplier = 1, - DepthwiseConvolutionMethod method = DepthwiseConvolutionMethod::Default, - ITensorAccessorUPtr weights_accessor = nullptr, ITensorAccessorUPtr bias_accessor = nullptr, const QuantizationInfo &quant_info = QuantizationInfo(), - const QuantizationInfo &out_quant_info = QuantizationInfo()); + static NodeID + add_depthwise_convolution_node(Graph &g, + NodeParams params, + NodeIdxPair input, + Size2D kernel_spatial_extend, + PadStrideInfo conv_info, + int 
depth_multiplier = 1, + DepthwiseConvolutionMethod method = DepthwiseConvolutionMethod::Default, + ITensorAccessorUPtr weights_accessor = nullptr, + ITensorAccessorUPtr bias_accessor = nullptr, + const QuantizationInfo &quant_info = QuantizationInfo(), + const QuantizationInfo &out_quant_info = QuantizationInfo()); /** Adds an element-wise layer node to the graph * * @param[in] g Graph to add the node to @@ -227,7 +265,8 @@ public: * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_elementwise_node(Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, EltwiseOperation operation); + static NodeID add_elementwise_node( + Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, EltwiseOperation operation); /** Adds a dequantization node to the graph * * @param[in] g Graph to add the node to @@ -248,7 +287,12 @@ public: * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_detection_output_node(Graph &g, NodeParams params, NodeIdxPair input_loc, NodeIdxPair input_conf, NodeIdxPair input_priorbox, const DetectionOutputLayerInfo &detect_info); + static NodeID add_detection_output_node(Graph &g, + NodeParams params, + NodeIdxPair input_loc, + NodeIdxPair input_conf, + NodeIdxPair input_priorbox, + const DetectionOutputLayerInfo &detect_info); /** Adds a detection post process layer node to the graph * * @param[in] g Graph to add the node to @@ -261,8 +305,12 @@ public: * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_detection_post_process_node(Graph &g, NodeParams params, NodeIdxPair input_box_encoding, NodeIdxPair input_class_prediction, - const DetectionPostProcessLayerInfo &detect_info, ITensorAccessorUPtr anchors_accessor = nullptr, + static NodeID add_detection_post_process_node(Graph &g, + NodeParams params, + NodeIdxPair input_box_encoding, + NodeIdxPair input_class_prediction, + const DetectionPostProcessLayerInfo &detect_info, + ITensorAccessorUPtr anchors_accessor = nullptr, const QuantizationInfo &anchor_quant_info = QuantizationInfo()); /** Adds a Dummy node to the graph * @@ -299,8 +347,12 @@ public: * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_fully_connected_layer(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_outputs, - NodeID weights_nid, NodeID bias_nid = EmptyNodeID, + static NodeID add_fully_connected_layer(Graph &g, + NodeParams params, + NodeIdxPair input, + unsigned int num_outputs, + NodeID weights_nid, + NodeID bias_nid = EmptyNodeID, const FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), const QuantizationInfo &out_quant_info = QuantizationInfo(), FastMathHint fast_math_hint = FastMathHint::Disabled); @@ -319,9 +371,13 @@ public: * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_fully_connected_layer(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_outputs, - ITensorAccessorUPtr weights_accessor = nullptr, ITensorAccessorUPtr bias_accessor = nullptr, - const FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), + static NodeID add_fully_connected_layer(Graph &g, + NodeParams params, + NodeIdxPair input, + unsigned int num_outputs, + ITensorAccessorUPtr weights_accessor = nullptr, + ITensorAccessorUPtr bias_accessor = nullptr, + const FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), const QuantizationInfo &weights_quant_info = QuantizationInfo(), const QuantizationInfo 
&out_quant_info = QuantizationInfo(), FastMathHint fast_math_hint = FastMathHint::Disabled); @@ -336,8 +392,12 @@ public: * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_generate_proposals_node(Graph &g, NodeParams params, NodeIdxPair scores, NodeIdxPair deltas, - NodeIdxPair anchors, GenerateProposalsInfo info); + static NodeID add_generate_proposals_node(Graph &g, + NodeParams params, + NodeIdxPair scores, + NodeIdxPair deltas, + NodeIdxPair anchors, + GenerateProposalsInfo info); /** Adds a L2 Normalize layer node to the graph * * @param[in] g Graph to add the node to @@ -358,7 +418,8 @@ public: * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_normalization_node(Graph &g, NodeParams params, NodeIdxPair input, NormalizationLayerInfo norm_info); + static NodeID + add_normalization_node(Graph &g, NodeParams params, NodeIdxPair input, NormalizationLayerInfo norm_info); /** Adds a normalize planar YUV layer node to the graph * * @param[in] g Graph to add the node to @@ -369,8 +430,11 @@ public: * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_normalize_planar_yuv_node(Graph &g, NodeParams params, NodeIdxPair input, - ITensorAccessorUPtr mean_accessor = nullptr, ITensorAccessorUPtr std_accessor = nullptr); + static NodeID add_normalize_planar_yuv_node(Graph &g, + NodeParams params, + NodeIdxPair input, + ITensorAccessorUPtr mean_accessor = nullptr, + ITensorAccessorUPtr std_accessor = nullptr); /** Adds a pad layer node to the graph * * @param[in] g Graph to add the node to @@ -382,7 +446,11 @@ public: * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_pad_node(Graph &g, NodeParams params, NodeIdxPair input, const PaddingList &paddings, PixelValue pad_value = PixelValue()); + static NodeID add_pad_node(Graph &g, + NodeParams params, + NodeIdxPair input, + const PaddingList &paddings, + PixelValue pad_value = PixelValue()); /** Adds a permute layer node to the graph * * @param[in] g Graph to add the node to @@ -394,7 +462,11 @@ public: * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_permute_node(Graph &g, NodeParams params, NodeIdxPair input, PermutationVector perm, DataLayout layout = DataLayout::UNKNOWN); + static NodeID add_permute_node(Graph &g, + NodeParams params, + NodeIdxPair input, + PermutationVector perm, + DataLayout layout = DataLayout::UNKNOWN); /** Adds a pooling layer node to the graph * * @param[in] g Graph to add the node to @@ -426,8 +498,12 @@ public: * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_print_node(Graph &g, NodeParams params, NodeIdxPair input, std::ostream &stream, const IOFormatInfo &format_info = IOFormatInfo(), - const std::function transform = nullptr); + static NodeID add_print_node(Graph &g, + NodeParams params, + NodeIdxPair input, + std::ostream &stream, + const IOFormatInfo &format_info = IOFormatInfo(), + const std::function transform = nullptr); /** Adds a priorbox layer node to the graph * * @param[in] g Graph to add the node to @@ -438,7 +514,8 @@ public: * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_priorbox_node(Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, const PriorBoxLayerInfo &prior_info); + static NodeID add_priorbox_node( + Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, const 
PriorBoxLayerInfo &prior_info); /** Adds a quantization layer node to the graph * * @param[in] g Graph to add the node to @@ -448,7 +525,8 @@ public: * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_quantization_node(Graph &g, NodeParams params, NodeIdxPair input, const QuantizationInfo &out_quant_info); + static NodeID + add_quantization_node(Graph &g, NodeParams params, NodeIdxPair input, const QuantizationInfo &out_quant_info); /** Adds a reduction sum layer node to the graph * * @param[in] g Graph to add the node to @@ -460,7 +538,8 @@ public: * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_reduction_operation_node(Graph &g, NodeParams params, NodeIdxPair input, ReductionOperation op, int axis, bool keep_dims = true); + static NodeID add_reduction_operation_node( + Graph &g, NodeParams params, NodeIdxPair input, ReductionOperation op, int axis, bool keep_dims = true); /** Adds a reorg layer node to the graph * * @param[in] g Graph to add the node to @@ -492,7 +571,12 @@ public: * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_resize_node(Graph &g, NodeParams params, NodeIdxPair input, InterpolationPolicy policy, float width_scale, float height_scale); + static NodeID add_resize_node(Graph &g, + NodeParams params, + NodeIdxPair input, + InterpolationPolicy policy, + float width_scale, + float height_scale); /** Adds a ROI align layer node to the graph * * @param[in] g Graph to add the node to @@ -503,7 +587,8 @@ public: * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_roi_align_node(Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair rois, ROIPoolingLayerInfo pool_info); + static NodeID + add_roi_align_node(Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair rois, ROIPoolingLayerInfo pool_info); /** Adds a scale layer node to the graph * This layer computes a product of the input with a scale (read from mul_accessor) and it applies an offset (read from add_accessor). 
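The GraphBuilder declarations above are pure reformatting (one parameter per line), so call sites are unchanged. A condensed sketch of how a few of them chain together; Graph's (id, name) constructor and the field order of NodeParams do not appear in these hunks and are assumptions, and a real graph would also supply the weights/bias accessors that default to nullptr above:

    #include "arm_compute/graph/Graph.h"
    #include "arm_compute/graph/GraphBuilder.h"

    using namespace arm_compute;
    using namespace arm_compute::graph;

    void build_toy_graph()
    {
        Graph      g(0, "toy");                              // assumed (id, name) constructor
        NodeParams params{"toy_node", Target::UNSPECIFIED};  // assumed member order: name, target

        TensorDescriptor input_desc(TensorShape(224U, 224U, 3U, 1U), DataType::F32);

        NodeID in   = GraphBuilder::add_input_node(g, params, input_desc);
        NodeID conv = GraphBuilder::add_convolution_node(g, params, {in, 0}, Size2D(3, 3),
                                                         /* depth */ 64U, PadStrideInfo(1, 1, 1, 1));
        GraphBuilder::add_output_node(g, params, {conv, 0});
    }

Each helper returns the NodeID of the node it created, and a NodeIdxPair such as {in, 0} selects output 0 of that node as the next layer's input, which is what the reflowed signatures above all take.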
* output = input * mul_w + add_w @@ -516,8 +601,11 @@ public: * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_scale_layer(Graph &g, const NodeParams ¶ms, NodeIdxPair input, - ITensorAccessorUPtr mul_accessor = nullptr, ITensorAccessorUPtr add_accessor = nullptr); + static NodeID add_scale_layer(Graph &g, + const NodeParams ¶ms, + NodeIdxPair input, + ITensorAccessorUPtr mul_accessor = nullptr, + ITensorAccessorUPtr add_accessor = nullptr); /** Adds a softmax node to the graph * * @param[in] g Graph to add the node to @@ -538,7 +626,8 @@ public: * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_slice_node(Graph &g, NodeParams params, NodeIdxPair input, Coordinates &starts, Coordinates &ends); + static NodeID + add_slice_node(Graph &g, NodeParams params, NodeIdxPair input, Coordinates &starts, Coordinates &ends); /** Adds a split node to the graph * * @param[in] g Graph to add the node to @@ -549,7 +638,8 @@ public: * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_split_node(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_splits, unsigned int axis = 0); + static NodeID + add_split_node(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_splits, unsigned int axis = 0); /** Adds a stack layer node to the graph * * @param[in] g Graph to add the node to @@ -572,7 +662,13 @@ public: * * @return Node ID of the created node, EmptyNodeID in case of error */ - static NodeID add_strided_slice_node(Graph &g, NodeParams params, NodeIdxPair input, Coordinates &starts, Coordinates &ends, BiStrides &strides, StridedSliceLayerInfo info); + static NodeID add_strided_slice_node(Graph &g, + NodeParams params, + NodeIdxPair input, + Coordinates &starts, + Coordinates &ends, + BiStrides &strides, + StridedSliceLayerInfo info); /** Adds a yolo layer to the graph * * @param[in] g Graph to add the node to diff --git a/arm_compute/graph/GraphContext.h b/arm_compute/graph/GraphContext.h index 7beb598646c1fce3e9dc9fb3774fcb84c46a117f..68fbaf547892b6581805f5acdee56ab540a81322 100644 --- a/arm_compute/graph/GraphContext.h +++ b/arm_compute/graph/GraphContext.h @@ -25,7 +25,6 @@ #define ARM_COMPUTE_GRAPH_GRAPH_CONTEXT_H #include "arm_compute/graph/Types.h" - #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/IWeightsManager.h" @@ -39,18 +38,18 @@ namespace graph /** Contains structs required for memory management */ struct MemoryManagerContext { - Target target = { Target::UNSPECIFIED }; /**< Target */ - std::shared_ptr intra_mm = { nullptr }; /**< Intra-function memory manager */ - std::shared_ptr cross_mm = { nullptr }; /**< Cross-function memory manager */ - std::shared_ptr cross_group = { nullptr }; /**< Cross-function memory group */ - IAllocator *allocator = { nullptr }; /**< Backend allocator to use */ + Target target = {Target::UNSPECIFIED}; /**< Target */ + std::shared_ptr intra_mm = {nullptr}; /**< Intra-function memory manager */ + std::shared_ptr cross_mm = {nullptr}; /**< Cross-function memory manager */ + std::shared_ptr cross_group = {nullptr}; /**< Cross-function memory group */ + IAllocator *allocator = {nullptr}; /**< Backend allocator to use */ }; /** Contains structs required for weights management */ struct WeightsManagerContext { - Target target = { Target::UNSPECIFIED }; /**< Target */ - std::shared_ptr wm = { nullptr }; /**< Weights manager */ + Target target = {Target::UNSPECIFIED}; /**< Target */ + 
std::shared_ptr wm = {nullptr}; /**< Weights manager */ }; /** Graph context **/ @@ -125,7 +124,7 @@ public: void finalize(); private: - GraphConfig _config; /**< Graph configuration */ + GraphConfig _config; /**< Graph configuration */ std::map _memory_managers; /**< Memory managers for each target */ std::map _weights_managers; /**< Weights managers for each target */ }; diff --git a/arm_compute/graph/IDeviceBackend.h b/arm_compute/graph/IDeviceBackend.h index f84aac0ae03a39da1f42e451d7299e1636cf1e38..8ae92e3177f7113aca4cbe92c5e3b753820ccddb 100644 --- a/arm_compute/graph/IDeviceBackend.h +++ b/arm_compute/graph/IDeviceBackend.h @@ -88,7 +88,8 @@ public: * * @return Backend sub-tensor handle */ - virtual std::unique_ptr create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent) = 0; + virtual std::unique_ptr + create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent) = 0; /** Configure a backend Node * * @note This creates an appropriate configured backend function for the given node diff --git a/arm_compute/graph/INode.h b/arm_compute/graph/INode.h index becd672d9006a6df4991109562d77940d243605a..5646ea81d2bb78e1d5e6d644d473911aa20d022f 100644 --- a/arm_compute/graph/INode.h +++ b/arm_compute/graph/INode.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019,2021 Arm Limited. + * Copyright (c) 2018-2019,2021,2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_GRAPH_INODE_H -#define ARM_COMPUTE_GRAPH_INODE_H +#ifndef ACL_ARM_COMPUTE_GRAPH_INODE_H +#define ACL_ARM_COMPUTE_GRAPH_INODE_H #include "arm_compute/core/Error.h" #include "arm_compute/graph/LayerDescriptors.h" @@ -241,30 +241,19 @@ public: * @return Assigned target of this node */ Target assigned_target() const; - /** Post operator info list - * - * @return Post operator info list - */ - const std::list> &post_op_info_list() const; - /** Post operator info list - * - * @return Post operator info list - */ - std::list> &post_op_info_list(); protected: friend class Graph; protected: - Graph *_graph; /**< Backward reference to graph owning the node */ - NodeID _id; /**< Node ID */ - NodeParams _common_params; /**< Node common params */ - std::vector _outputs; /**< Output of the node */ - std::vector _input_edges; /**< Inputs edge set */ - std::set _output_edges; /**< Output edge set */ - Target _assigned_target; /**< Assigned target by the Graph executor */ - std::list> _post_op_info_list; /**< Post operator info list */ + Graph *_graph; /**< Backward reference to graph owning the node */ + NodeID _id; /**< Node ID */ + NodeParams _common_params; /**< Node common params */ + std::vector _outputs; /**< Output of the node */ + std::vector _input_edges; /**< Inputs edge set */ + std::set _output_edges; /**< Output edge set */ + Target _assigned_target; /**< Assigned target by the Graph executor */ }; } // namespace graph } // namespace arm_compute -#endif /* ARM_COMPUTE_GRAPH_INODE_H */ +#endif // ACL_ARM_COMPUTE_GRAPH_INODE_H diff --git a/arm_compute/graph/INodeVisitor.h b/arm_compute/graph/INodeVisitor.h index 97e95336ef50c72c7d28616dc7bd98e20019e727..efe191adfc16265d0ab7f67e452b84f5fae8b38f 100644 --- a/arm_compute/graph/INodeVisitor.h +++ b/arm_compute/graph/INodeVisitor.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. 
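GraphContext itself only changes in comment alignment here, but it is the object that owns the GraphConfig whose defaulted fields are reformatted further down in the Types.h hunk. A rough sketch of wiring the two together; set_config() is assumed from the rest of the class, since only the _config member and finalize() are visible in this hunk:

    #include "arm_compute/graph/GraphContext.h"

    using namespace arm_compute;
    using namespace arm_compute::graph;

    void configure_context()
    {
        GraphConfig cfg;                          // defaults are listed in the Types.h hunk below
        cfg.num_threads = 4;                      // thread-capable backends only
        cfg.use_tuner   = false;
        cfg.tuner_mode  = CLTunerMode::EXHAUSTIVE;

        GraphContext ctx;
        ctx.set_config(cfg);                      // assumed setter for the _config member above
        ctx.finalize();                           // declared in this hunk
    }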
* * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_GRAPH_INODEVISITOR_H -#define ARM_COMPUTE_GRAPH_INODEVISITOR_H +#ifndef ACL_ARM_COMPUTE_GRAPH_INODEVISITOR_H +#define ACL_ARM_COMPUTE_GRAPH_INODEVISITOR_H #include "arm_compute/graph/nodes/NodesFwd.h" @@ -106,16 +106,6 @@ public: * @param[in] n Node to visit. */ virtual void visit(FusedConvolutionBatchNormalizationNode &n) = 0; - /** Visit FusedConvolutionBatchNormalizationWithPostOpsNode. - * - * @param[in] n Node to visit. - */ - virtual void visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n) = 0; - /** Visit FusedConvolutionWithPostOpNode. - * - * @param[in] n Node to visit. - */ - virtual void visit(FusedConvolutionWithPostOpNode &n) = 0; /** Visit FusedDepthwiseConvolutionBatchNormalizationNode. * * @param[in] n Node to visit. @@ -215,8 +205,6 @@ public: virtual void visit(FlattenLayerNode &n) override; virtual void visit(FullyConnectedLayerNode &n) override; virtual void visit(FusedConvolutionBatchNormalizationNode &n) override; - virtual void visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n) override; - virtual void visit(FusedConvolutionWithPostOpNode &n) override; virtual void visit(FusedDepthwiseConvolutionBatchNormalizationNode &n) override; virtual void visit(InputNode &n) override; virtual void visit(NormalizationLayerNode &n) override; @@ -240,4 +228,4 @@ public: }; } // namespace graph } // namespace arm_compute -#endif /* ARM_COMPUTE_GRAPH_INODEVISITOR_H */ +#endif // ACL_ARM_COMPUTE_GRAPH_INODEVISITOR_H diff --git a/arm_compute/graph/LayerDescriptors.h b/arm_compute/graph/LayerDescriptors.h index c11174f2ce7959f6b693887336ef34aeb90fcc4b..d632ed9e783a1e3a25617d1ed25622dd1224331d 100644 --- a/arm_compute/graph/LayerDescriptors.h +++ b/arm_compute/graph/LayerDescriptors.h @@ -37,8 +37,7 @@ namespace descriptors struct ConcatLayerDescriptor { /** Default constructor */ - ConcatLayerDescriptor() - : axis(DataLayoutDimension::CHANNEL), output_qinfo() + ConcatLayerDescriptor() : axis(DataLayoutDimension::CHANNEL), output_qinfo() { } @@ -46,8 +45,7 @@ struct ConcatLayerDescriptor * * @param[in] axis Axis. */ - ConcatLayerDescriptor(DataLayoutDimension axis) - : axis(axis), output_qinfo() + ConcatLayerDescriptor(DataLayoutDimension axis) : axis(axis), output_qinfo() { } @@ -76,9 +74,16 @@ struct EltwiseLayerDescriptor * @param[in] r_policy (Optional) Rounding policy used for the operation. Defaults to @ref RoundingPolicy::TO_ZERO * @param[in] fused_activation (Optional) Fused activation information. Defaults to empty (identity) @ref ActivationLayerInfo */ - EltwiseLayerDescriptor(EltwiseOperation op, QuantizationInfo out_quant_info = QuantizationInfo(), ConvertPolicy c_policy = ConvertPolicy::SATURATE, RoundingPolicy r_policy = RoundingPolicy::TO_ZERO, + EltwiseLayerDescriptor(EltwiseOperation op, + QuantizationInfo out_quant_info = QuantizationInfo(), + ConvertPolicy c_policy = ConvertPolicy::SATURATE, + RoundingPolicy r_policy = RoundingPolicy::TO_ZERO, ActivationLayerInfo fused_activation = ActivationLayerInfo()) - : op(op), out_quant_info(out_quant_info), c_policy(c_policy), r_policy(r_policy), fused_activation(fused_activation) + : op(op), + out_quant_info(out_quant_info), + c_policy(c_policy), + r_policy(r_policy), + fused_activation(fused_activation) { } @@ -100,10 +105,16 @@ struct UnaryEltwiseLayerDescriptor * @param[in] r_policy (Optional) Rounding policy used for the operation. 
Defaults to @ref RoundingPolicy::TO_ZERO * @param[in] fused_activation (Optional) Fused activation information. Defaults to empty (identity) @ref ActivationLayerInfo */ - UnaryEltwiseLayerDescriptor(UnaryEltwiseOperation op, QuantizationInfo out_quant_info = QuantizationInfo(), ConvertPolicy c_policy = ConvertPolicy::SATURATE, - RoundingPolicy r_policy = RoundingPolicy::TO_ZERO, - ActivationLayerInfo fused_activation = ActivationLayerInfo()) - : op(op), out_quant_info(out_quant_info), c_policy(c_policy), r_policy(r_policy), fused_activation(fused_activation) + UnaryEltwiseLayerDescriptor(UnaryEltwiseOperation op, + QuantizationInfo out_quant_info = QuantizationInfo(), + ConvertPolicy c_policy = ConvertPolicy::SATURATE, + RoundingPolicy r_policy = RoundingPolicy::TO_ZERO, + ActivationLayerInfo fused_activation = ActivationLayerInfo()) + : op(op), + out_quant_info(out_quant_info), + c_policy(c_policy), + r_policy(r_policy), + fused_activation(fused_activation) { } @@ -130,7 +141,7 @@ struct DeconvolutionLayerDescriptor PadStrideInfo info; /**< Padding and stride information */ QuantizationInfo out_quant_info; /**< Output quantization information */ }; -} // namespace descriptor +} // namespace descriptors } // namespace graph } // namespace arm_compute -#endif /* ARM_COMPUTE_LAYER_DESCRIPTORS_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_LAYER_DESCRIPTORS_H */ diff --git a/arm_compute/graph/Logger.h b/arm_compute/graph/Logger.h index 872c650a1ae508c2bf283f631ba78cbbc3685a26..e83d5f4ddc8288180dad496486c992851d3b7df0 100644 --- a/arm_compute/graph/Logger.h +++ b/arm_compute/graph/Logger.h @@ -31,14 +31,14 @@ * * @note It will eventually create all default loggers in don't exist */ -#define ARM_COMPUTE_CREATE_DEFAULT_GRAPH_LOGGER() \ - do \ - { \ - if(arm_compute::logging::LoggerRegistry::get().logger("GRAPH") == nullptr) \ - { \ - arm_compute::logging::LoggerRegistry::get().create_reserved_loggers(); \ - } \ - } while(false) +#define ARM_COMPUTE_CREATE_DEFAULT_GRAPH_LOGGER() \ + do \ + { \ + if (arm_compute::logging::LoggerRegistry::get().logger("GRAPH") == nullptr) \ + { \ + arm_compute::logging::LoggerRegistry::get().create_reserved_loggers(); \ + } \ + } while (false) #else /* ARM_COMPUTE_LOGGING_ENABLED */ #define ARM_COMPUTE_CREATE_DEFAULT_GRAPH_LOGGER() #endif /* ARM_COMPUTE_LOGGING_ENABLED */ diff --git a/arm_compute/graph/Tensor.h b/arm_compute/graph/Tensor.h index de96c998bd1376bb0511daffc67d45f132809b16..0ffae28ecc67936992e3d19deecd056ff6984363 100644 --- a/arm_compute/graph/Tensor.h +++ b/arm_compute/graph/Tensor.h @@ -24,11 +24,10 @@ #ifndef ARM_COMPUTE_GRAPH_TENSOR_H #define ARM_COMPUTE_GRAPH_TENSOR_H -#include "arm_compute/graph/Types.h" - #include "arm_compute/graph/ITensorAccessor.h" #include "arm_compute/graph/ITensorHandle.h" #include "arm_compute/graph/TensorDescriptor.h" +#include "arm_compute/graph/Types.h" #include #include diff --git a/arm_compute/graph/TensorDescriptor.h b/arm_compute/graph/TensorDescriptor.h index 5fa155efc80a496922f1171642539932deeee8d5..46a6ab2c27263790eeceef9550070639a652bd4d 100644 --- a/arm_compute/graph/TensorDescriptor.h +++ b/arm_compute/graph/TensorDescriptor.h @@ -52,7 +52,11 @@ struct TensorDescriptor final : public misc::ICloneable QuantizationInfo tensor_quant_info = QuantizationInfo(), DataLayout tensor_data_layout = DataLayout::NCHW, Target tensor_target = Target::UNSPECIFIED) - : shape(tensor_shape), data_type(tensor_data_type), layout(tensor_data_layout), quant_info(tensor_quant_info), target(tensor_target) + : 
shape(tensor_shape), + data_type(tensor_data_type), + layout(tensor_data_layout), + quant_info(tensor_quant_info), + target(tensor_target) { } /** Sets tensor descriptor shape @@ -106,11 +110,11 @@ struct TensorDescriptor final : public misc::ICloneable return std::make_unique(*this); } - TensorShape shape{}; /**< Tensor shape */ - DataType data_type{ DataType::UNKNOWN }; /**< Data type */ - DataLayout layout{ DataLayout::NCHW }; /**< Data layout */ - QuantizationInfo quant_info{}; /**< Quantization info */ - Target target{ Target::UNSPECIFIED }; /**< Target */ + TensorShape shape{}; /**< Tensor shape */ + DataType data_type{DataType::UNKNOWN}; /**< Data type */ + DataLayout layout{DataLayout::NCHW}; /**< Data layout */ + QuantizationInfo quant_info{}; /**< Quantization info */ + Target target{Target::UNSPECIFIED}; /**< Target */ }; } // namespace graph } // namespace arm_compute diff --git a/arm_compute/graph/TypePrinter.h b/arm_compute/graph/TypePrinter.h index 8f97bbf845fb69903d0834c9369721c49f01a6d1..5e83820ab3bf66d20672858123c6d91f23673386 100644 --- a/arm_compute/graph/TypePrinter.h +++ b/arm_compute/graph/TypePrinter.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_GRAPH_TYPE_PRINTER_H -#define ARM_COMPUTE_GRAPH_TYPE_PRINTER_H +#ifndef ACL_ARM_COMPUTE_GRAPH_TYPEPRINTER_H +#define ACL_ARM_COMPUTE_GRAPH_TYPEPRINTER_H #include "arm_compute/core/Error.h" #include "arm_compute/core/Types.h" @@ -37,7 +37,7 @@ namespace graph /** Formatted output of the Target. */ inline ::std::ostream &operator<<(::std::ostream &os, const Target &target) { - switch(target) + switch (target) { case Target::UNSPECIFIED: os << "UNSPECIFIED"; @@ -60,7 +60,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const Target &target) inline ::std::ostream &operator<<(::std::ostream &os, const NodeType &node_type) { - switch(node_type) + switch (node_type) { case NodeType::ActivationLayer: os << "ActivationLayer"; @@ -116,12 +116,6 @@ inline ::std::ostream &operator<<(::std::ostream &os, const NodeType &node_type) case NodeType::FusedConvolutionBatchNormalizationLayer: os << "FusedConvolutionBatchNormalizationLayer"; break; - case NodeType::FusedConvolutionBatchNormalizationLayerWithPostOpsLayer: - os << "FusedConvolutionBatchNormalizationLayerWithPostOpsLayer"; - break; - case NodeType::FusedConvolutionWithPostOp: - os << "FusedConvolutionWithPostOp"; - break; case NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer: os << "FusedDepthwiseConvolutionBatchNormalizationLayer"; break; @@ -213,7 +207,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const NodeType &node_type) /** Formatted output of the EltwiseOperation type. */ inline ::std::ostream &operator<<(::std::ostream &os, const EltwiseOperation &eltwise_op) { - switch(eltwise_op) + switch (eltwise_op) { case EltwiseOperation::Add: os << "Add"; @@ -237,7 +231,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const EltwiseOperation &el /** Formatted output of the ConvolutionMethod type. 
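The TypePrinter overloads above only pick up clang-format's switch spacing; their output is unchanged. A tiny usage sketch restricted to enumerators that appear verbatim in the hunk:

    #include <iostream>

    #include "arm_compute/graph/TypePrinter.h"

    int main()
    {
        using namespace arm_compute::graph;
        // Both cases are handled explicitly in the switches above.
        std::cout << Target::UNSPECIFIED << " / " << EltwiseOperation::Add << std::endl; // "UNSPECIFIED / Add"
        return 0;
    }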
*/ inline ::std::ostream &operator<<(::std::ostream &os, const ConvolutionMethod &method) { - switch(method) + switch (method) { case ConvolutionMethod::Default: os << "Default"; @@ -261,7 +255,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const ConvolutionMethod &m /** Formatted output of the FastMathHint type. */ inline ::std::ostream &operator<<(::std::ostream &os, const FastMathHint &hint) { - switch(hint) + switch (hint) { case FastMathHint::Enabled: os << "Enabled"; @@ -279,7 +273,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const FastMathHint &hint) /** Formatted output of the DepthwiseConvolutionMethod type. */ inline ::std::ostream &operator<<(::std::ostream &os, const DepthwiseConvolutionMethod &method) { - switch(method) + switch (method) { case DepthwiseConvolutionMethod::Default: os << "DEFAULT"; @@ -295,4 +289,4 @@ inline ::std::ostream &operator<<(::std::ostream &os, const DepthwiseConvolution } } // namespace graph } // namespace arm_compute -#endif /* ARM_COMPUTE_GRAPH_TYPE_PRINTER_H */ +#endif // ACL_ARM_COMPUTE_GRAPH_TYPEPRINTER_H diff --git a/arm_compute/graph/Types.h b/arm_compute/graph/Types.h index 167f7388d4e132572ca8579965b38c192c9abe66..5541e3cbccfd67de7e5c69cdf78e66abf6734221 100644 --- a/arm_compute/graph/Types.h +++ b/arm_compute/graph/Types.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_GRAPH_TYPES_H -#define ARM_COMPUTE_GRAPH_TYPES_H +#ifndef ACL_ARM_COMPUTE_GRAPH_TYPES_H +#define ACL_ARM_COMPUTE_GRAPH_TYPES_H #include "arm_compute/core/Error.h" #include "arm_compute/core/PixelValue.h" @@ -41,32 +41,31 @@ namespace arm_compute { namespace graph { -using arm_compute::CLTunerMode; using arm_compute::CLBackendType; +using arm_compute::CLTunerMode; using arm_compute::Status; using arm_compute::Coordinates; -using arm_compute::DataType; using arm_compute::DataLayout; using arm_compute::DataLayoutDimension; -using arm_compute::TensorShape; -using arm_compute::Size2D; +using arm_compute::DataType; using arm_compute::PermutationVector; using arm_compute::PixelValue; +using arm_compute::Size2D; +using arm_compute::TensorShape; using arm_compute::ActivationLayerInfo; using arm_compute::DetectionOutputLayerInfo; using arm_compute::DetectionPostProcessLayerInfo; -using arm_compute::NormType; -using arm_compute::NormalizationLayerInfo; +using arm_compute::DimensionRoundingType; using arm_compute::FullyConnectedLayerInfo; +using arm_compute::InterpolationPolicy; +using arm_compute::NormalizationLayerInfo; +using arm_compute::NormType; using arm_compute::PadStrideInfo; using arm_compute::PoolingLayerInfo; using arm_compute::PoolingType; using arm_compute::PriorBoxLayerInfo; -using arm_compute::DimensionRoundingType; -using arm_compute::InterpolationPolicy; -using arm_compute::experimental::PostOpType; using GraphID = unsigned int; using TensorID = unsigned int; @@ -87,17 +86,18 @@ struct TensorDescriptor; /** Graph configuration structure */ struct GraphConfig { - bool use_function_memory_manager{ true }; /**< Use a memory manager to manage per-function auxilary memory */ - bool use_function_weights_manager{ true }; /**< Use a weights manager to manage transformed weights */ - bool use_transition_memory_manager{ true }; /**< Use a memory manager to manager transition buffer memory */ - bool use_tuner{ false }; /**< Use a tuner in tunable backends */ - bool use_synthetic_type{ false }; /**< Convert graph to a synthetic graph for a data type */ - DataType 
synthetic_type{ DataType::QASYMM8 }; /**< The data type of the synthetic graph */ - CLTunerMode tuner_mode{ CLTunerMode::EXHAUSTIVE }; /**< Tuner mode to be used by the CL tuner */ - int num_threads{ -1 }; /**< Number of threads to use (thread capable backends), if 0 the backend will auto-initialize, if -1 the backend will stay as it is. */ - std::string tuner_file{ "acl_tuner.csv" }; /**< File to load/store tuning values from */ - std::string mlgo_file{ "heuristics.mlgo" }; /**< Filename to load MLGO heuristics from */ - CLBackendType backend_type{ CLBackendType::Native }; /**< CL backend type to use */ + bool use_function_memory_manager{true}; /**< Use a memory manager to manage per-function auxilary memory */ + bool use_function_weights_manager{true}; /**< Use a weights manager to manage transformed weights */ + bool use_transition_memory_manager{true}; /**< Use a memory manager to manager transition buffer memory */ + bool use_tuner{false}; /**< Use a tuner in tunable backends */ + bool use_synthetic_type{false}; /**< Convert graph to a synthetic graph for a data type */ + DataType synthetic_type{DataType::QASYMM8}; /**< The data type of the synthetic graph */ + CLTunerMode tuner_mode{CLTunerMode::EXHAUSTIVE}; /**< Tuner mode to be used by the CL tuner */ + int num_threads{ + -1}; /**< Number of threads to use (thread capable backends), if 0 the backend will auto-initialize, if -1 the backend will stay as it is. */ + std::string tuner_file{"acl_tuner.csv"}; /**< File to load/store tuning values from */ + std::string mlgo_file{"heuristics.mlgo"}; /**< Filename to load MLGO heuristics from */ + CLBackendType backend_type{CLBackendType::Native}; /**< CL backend type to use */ }; /**< Device target types */ @@ -150,55 +150,6 @@ enum class FastMathHint Disabled, /**< Fast math disabled for Convolution layer */ }; -/** Convolution post operator info */ -class ConvPostOpInfo -{ -public: - /** Returns post op type - * - * @return Post op type - */ - virtual PostOpType type() const = 0; - virtual ~ConvPostOpInfo() - { - } -}; - -class ConvPostOpInfoActivation : public ConvPostOpInfo -{ -public: - ConvPostOpInfoActivation(const ActivationLayerInfo &act) - : _act(act) - { - } - ~ConvPostOpInfoActivation() override - { - } - PostOpType type() const override - { - return PostOpType::Activation; - } - ActivationLayerInfo _act; -}; - -class ConvPostOpInfoEltwiseAdd : public ConvPostOpInfo -{ -public: - ConvPostOpInfoEltwiseAdd(int arg_pos, const ConvertPolicy &policy) - : _prev_op_dst_pos(arg_pos), _policy(policy) - { - } - PostOpType type() const override - { - return PostOpType::Eltwise_Add; - } - ~ConvPostOpInfoEltwiseAdd() override - { - } - int _prev_op_dst_pos; - ConvertPolicy _policy; -}; - /** Supported nodes */ enum class NodeType { @@ -219,8 +170,6 @@ enum class NodeType FlattenLayer, FullyConnectedLayer, FusedConvolutionBatchNormalizationLayer, - FusedConvolutionWithPostOp, - FusedConvolutionBatchNormalizationLayerWithPostOpsLayer, FusedDepthwiseConvolutionBatchNormalizationLayer, GenerateProposalsLayer, L2NormalizeLayer, @@ -278,4 +227,4 @@ struct NodeParams }; } // namespace graph } // namespace arm_compute -#endif /* ARM_COMPUTE_GRAPH_TYPES_H */ +#endif // ACL_ARM_COMPUTE_GRAPH_TYPES_H diff --git a/arm_compute/graph/Utils.h b/arm_compute/graph/Utils.h index a3d9012ee905131adf4effeff03880bcbd43cbff..9813ff05c7720c098fba9839b98ff10b9b926678 100644 --- a/arm_compute/graph/Utils.h +++ b/arm_compute/graph/Utils.h @@ -36,7 +36,7 @@ class GraphContext; inline bool is_utility_node(INode *node) 
{ - std::set utility_node_types = { NodeType::PrintLayer }; + std::set utility_node_types = {NodeType::PrintLayer}; return utility_node_types.find(node->type()) != utility_node_types.end(); } diff --git a/arm_compute/graph/Workload.h b/arm_compute/graph/Workload.h index 5b4533cb6f673cd5b02b3921e74e1108b868e5f6..8ff0a548aef203222d22c67504efaad37c14e926 100644 --- a/arm_compute/graph/Workload.h +++ b/arm_compute/graph/Workload.h @@ -69,8 +69,7 @@ public: */ struct ExecutionTask { - ExecutionTask(std::unique_ptr &&f, INode *n) - : task(std::move(f)), node(n) + ExecutionTask(std::unique_ptr &&f, INode *n) : task(std::move(f)), node(n) { } /** Prevent instances of this class from being copied (As this class contains pointers) */ @@ -97,11 +96,11 @@ struct ExecutionTask /** Execution workload */ struct ExecutionWorkload { - std::vector inputs = {}; /**< Input handles */ - std::vector outputs = {}; /**< Output handles */ - std::vector tasks = {}; /**< Execution workload */ - Graph *graph = { nullptr }; /**< Graph bound to the workload */ - GraphContext *ctx = { nullptr }; /**< Graph execution context */ + std::vector inputs = {}; /**< Input handles */ + std::vector outputs = {}; /**< Output handles */ + std::vector tasks = {}; /**< Execution workload */ + Graph *graph = {nullptr}; /**< Graph bound to the workload */ + GraphContext *ctx = {nullptr}; /**< Graph execution context */ }; } // namespace graph } // namespace arm_compute diff --git a/arm_compute/graph/backends/BackendRegistrar.h b/arm_compute/graph/backends/BackendRegistrar.h index 902c12b0a6613141ee49c68a58a1054a2cbe3aa0..2879361fefb6b2a974218722a57586bf9910c6fa 100644 --- a/arm_compute/graph/backends/BackendRegistrar.h +++ b/arm_compute/graph/backends/BackendRegistrar.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_GRAPH_BACKEND_REGISTRAR_H #define ARM_COMPUTE_GRAPH_BACKEND_REGISTRAR_H -#include "arm_compute/graph/Types.h" #include "arm_compute/graph/backends/BackendRegistry.h" +#include "arm_compute/graph/Types.h" #include @@ -58,4 +58,4 @@ inline BackendRegistrar::BackendRegistrar(Target target) } // namespace backends } // namespace graph } // namespace arm_compute -#endif /* ARM_COMPUTE_GRAPH_BACKEND_REGISTRAR_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_GRAPH_BACKEND_REGISTRAR_H */ diff --git a/arm_compute/graph/backends/CL/CLDeviceBackend.h b/arm_compute/graph/backends/CL/CLDeviceBackend.h index 63674ad794f23970cc7ed125ceed1eb18fc041ec..09e19d7688826fb706c8ae35ceb4089d24e3722c 100644 --- a/arm_compute/graph/backends/CL/CLDeviceBackend.h +++ b/arm_compute/graph/backends/CL/CLDeviceBackend.h @@ -25,7 +25,6 @@ #define ARM_COMPUTE_GRAPH_CLDEVICEBACKEND_H #include "arm_compute/graph/IDeviceBackend.h" - #include "arm_compute/runtime/CL/CLBufferAllocator.h" #include "arm_compute/runtime/CL/CLGEMMHeuristicsHandle.h" #include "arm_compute/runtime/CL/CLTuner.h" @@ -59,22 +58,23 @@ public: void set_kernel_tuning_mode(CLTunerMode tuning_mode); // Inherited overridden methods - void initialize_backend() override; - void setup_backend_context(GraphContext &ctx) override; - void release_backend_context(GraphContext &ctx) override; + void initialize_backend() override; + void setup_backend_context(GraphContext &ctx) override; + void release_backend_context(GraphContext &ctx) override; bool is_backend_supported() override; IAllocator *backend_allocator() override; std::unique_ptr create_tensor(const Tensor &tensor) override; - std::unique_ptr create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent) 
override; - std::unique_ptr configure_node(INode &node, GraphContext &ctx) override; - Status validate_node(INode &node) override; - std::shared_ptr create_memory_manager(MemoryManagerAffinity affinity) override; + std::unique_ptr + create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent) override; + std::unique_ptr configure_node(INode &node, GraphContext &ctx) override; + Status validate_node(INode &node) override; + std::shared_ptr create_memory_manager(MemoryManagerAffinity affinity) override; std::shared_ptr create_weights_manager() override; void sync() override; private: - int _context_count; /**< Counts how many contexts are currently using the backend */ - CLTuner _tuner; /**< CL kernel tuner */ + int _context_count; /**< Counts how many contexts are currently using the backend */ + CLTuner _tuner; /**< CL kernel tuner */ CLGEMMHeuristicsHandle _gemm_heuristics; /**< GEMM heuristics */ std::unique_ptr _allocator; /**< CL buffer affinity allocator */ std::string _tuner_file; /**< Filename to load/store the tuner's values from */ diff --git a/arm_compute/graph/backends/CL/CLSubTensorHandle.h b/arm_compute/graph/backends/CL/CLSubTensorHandle.h index 3750fc85eede82ca0ba1a6c803193057fa45b030..85eebec63997346f74a3ac3cb8160f2450bd37b0 100644 --- a/arm_compute/graph/backends/CL/CLSubTensorHandle.h +++ b/arm_compute/graph/backends/CL/CLSubTensorHandle.h @@ -25,7 +25,6 @@ #define ARM_COMPUTE_GRAPH_CLSUBTENSORHANDLE_H #include "arm_compute/graph/ITensorHandle.h" - #include "arm_compute/runtime/CL/CLSubTensor.h" namespace arm_compute @@ -45,7 +44,10 @@ public: * @param[in] coords Starting coordinates * @param[in] extend_parent Extends parent shape if true */ - CLSubTensorHandle(ITensorHandle *parent_handle, const TensorShape &shape, const Coordinates &coords, bool extend_parent = false); + CLSubTensorHandle(ITensorHandle *parent_handle, + const TensorShape &shape, + const Coordinates &coords, + bool extend_parent = false); /** Destructor: free the tensor's memory */ ~CLSubTensorHandle() = default; /** Allow instances of this class to be move constructed */ @@ -58,10 +60,10 @@ public: CLSubTensorHandle &operator=(const CLSubTensorHandle &) = delete; // Inherited overridden methods - void allocate() override; - void free() override; - void manage(IMemoryGroup *mg) override; - void map(bool blocking) override; + void allocate() override; + void free() override; + void manage(IMemoryGroup *mg) override; + void map(bool blocking) override; void unmap() override; void release_if_unused() override; arm_compute::ITensor &tensor() override; diff --git a/arm_compute/graph/backends/CL/CLTensorHandle.h b/arm_compute/graph/backends/CL/CLTensorHandle.h index 16e30efc435369318cc1537e3e353e031fd098a7..57e9794ec34c2812016bfb3fd78927eb6b22f300 100644 --- a/arm_compute/graph/backends/CL/CLTensorHandle.h +++ b/arm_compute/graph/backends/CL/CLTensorHandle.h @@ -25,7 +25,6 @@ #define ARM_COMPUTE_GRAPH_CLTENSORHANDLE_H #include "arm_compute/graph/ITensorHandle.h" - #include "arm_compute/runtime/CL/CLTensor.h" namespace arm_compute @@ -51,10 +50,10 @@ public: CLTensorHandle &operator=(CLTensorHandle &&) = default; // Inherited overridden methods - void allocate() override; - void free() override; - void manage(IMemoryGroup *mg) override; - void map(bool blocking) override; + void allocate() override; + void free() override; + void manage(IMemoryGroup *mg) override; + void map(bool blocking) override; void unmap() override; void release_if_unused() override; 
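CLSubTensorHandle and CLTensorHandle only have their override blocks re-aligned; the ITensorHandle interface they implement is untouched. A small sketch of the lifecycle those overrides serve, written against the base interface so it is backend-agnostic (constructing a concrete handle is backend-specific and not shown in these hunks):

    #include <cstring>

    #include "arm_compute/core/ITensor.h"
    #include "arm_compute/graph/ITensorHandle.h"

    // Zero-fill the memory behind any backend tensor handle.
    void fill_zero(arm_compute::graph::ITensorHandle &handle)
    {
        handle.allocate();                  // back the handle with device memory
        handle.map(true);                   // blocking map so the host can write
        arm_compute::ITensor &t = handle.tensor();
        std::memset(t.buffer(), 0, t.info()->total_size());
        handle.unmap();                     // return the buffer to the device
    }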
arm_compute::ITensor &tensor() override; diff --git a/arm_compute/graph/backends/FunctionHelpers.h b/arm_compute/graph/backends/FunctionHelpers.h index 803283e20dfedcd842e6224b008c53624e11233a..fd8b6b5a693a9ac24942013bafa402560b4d58d7 100644 --- a/arm_compute/graph/backends/FunctionHelpers.h +++ b/arm_compute/graph/backends/FunctionHelpers.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,25 +21,22 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_GRAPH_BACKENDS_DETAIL_FUNCTION_HELPERS_H -#define ARM_COMPUTE_GRAPH_BACKENDS_DETAIL_FUNCTION_HELPERS_H +#ifndef ACL_ARM_COMPUTE_GRAPH_BACKENDS_FUNCTIONHELPERS_H +#define ACL_ARM_COMPUTE_GRAPH_BACKENDS_FUNCTIONHELPERS_H -#include "arm_compute/core/experimental/IPostOp.h" -#include "arm_compute/core/experimental/PostOps.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/graph/backends/FusedConvolutionBatchNormalizationFunction.h" +#include "arm_compute/graph/backends/FusedDepthwiseConvolutionBatchNormalizationFunction.h" +#include "arm_compute/graph/backends/Utils.h" #include "arm_compute/graph/Logger.h" +#include "arm_compute/graph/nodes/Nodes.h" #include "arm_compute/graph/Tensor.h" #include "arm_compute/graph/TypePrinter.h" #include "arm_compute/graph/Types.h" #include "arm_compute/graph/Utils.h" -#include "arm_compute/graph/backends/FusedConvolutionBatchNormalizationFunction.h" -#include "arm_compute/graph/backends/FusedConvolutionBatchNormalizationWithPostOpsFunction.h" -#include "arm_compute/graph/backends/FusedDepthwiseConvolutionBatchNormalizationFunction.h" -#include "arm_compute/graph/backends/Utils.h" -#include "arm_compute/graph/nodes/Nodes.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorInfo.h" #include "support/Cast.h" namespace arm_compute @@ -62,13 +59,16 @@ template typename TargetInfo::TensorType *get_backing_tensor(arm_compute::graph::Tensor *tensor) { typename TargetInfo::TensorType *backing_tensor = nullptr; - if(tensor != nullptr) + if (tensor != nullptr) { ARM_COMPUTE_ERROR_ON(tensor->desc().target != TargetInfo::TargetType); // Get backing tensor handle ITensorHandle *tensor_handle = tensor->handle(); // Get backing tensor - backing_tensor = (tensor_handle != nullptr) ? arm_compute::utils::cast::polymorphic_cast(&tensor_handle->tensor()) : nullptr; + backing_tensor = (tensor_handle != nullptr) + ? 
arm_compute::utils::cast::polymorphic_cast( + &tensor_handle->tensor()) + : nullptr; } return backing_tensor; @@ -77,11 +77,8 @@ typename TargetInfo::TensorType *get_backing_tensor(arm_compute::graph::Tensor * template void validate_node(const INode &node, size_t num_expected_inputs, size_t num_expected_outputs) { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating " << node.type() - << " Target: " << TargetInfo::TargetType - << " ID: " << node.id() - << node.name() - << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating " << node.type() << " Target: " << TargetInfo::TargetType + << " ID: " << node.id() << node.name() << std::endl); ARM_COMPUTE_ERROR_ON(TargetInfo::TargetType != node.assigned_target()); ARM_COMPUTE_ERROR_ON(node.num_inputs() != num_expected_inputs); @@ -112,19 +109,13 @@ std::unique_ptr create_activation_layer(ActivationLayerNode &node) auto func = std::make_unique(); func->configure(input, output, act_info); - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() - << " Shape: " << input->info()->tensor_shape() - << " Activation function: " << act_info.activation() - << " a: " << act_info.a() - << " b: " << act_info.b() - << " InPlace : " << is_in_place_operation(input, output) - << std::endl); + ARM_COMPUTE_LOG_GRAPH_INFO( + "Instantiated " << node.name() << " Type: " << node.type() << " Target: " << TargetInfo::TargetType + << " Data Type: " << input->info()->data_type() << " Shape: " << input->info()->tensor_shape() + << " Activation function: " << act_info.activation() << " a: " << act_info.a() << " b: " + << act_info.b() << " InPlace : " << is_in_place_operation(input, output) << std::endl); - return std::move(func); + return func; } /** Creates a backend argminmax layer function @@ -151,17 +142,12 @@ std::unique_ptr create_arg_min_max_layer(ArgMinMaxLayerNode &node) auto func = std::make_unique(); func->configure(input, axis, output, op); - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() - << " Shape: " << input->info()->tensor_shape() - << " Reduction Operation: " << op - << " axis: " << axis - << std::endl); + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: " + << TargetInfo::TargetType << " Data Type: " << input->info()->data_type() + << " Shape: " << input->info()->tensor_shape() + << " Reduction Operation: " << op << " axis: " << axis << std::endl); - return std::move(func); + return func; } /** Create a backend batch normalization layer function @@ -194,18 +180,13 @@ std::unique_ptr create_batch_normalization_layer(BatchNormalizationLa func->configure(input, output, mean, var, beta, gamma, epsilon, fused_act); // Log info - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() - << " Shape: " << input->info()->tensor_shape() - << " Epsilon: " << epsilon << " " - << (fused_act.enabled() ? 
to_string(fused_act.activation()) : "") - << " InPlace: " << is_in_place_operation(input, output) - << std::endl); + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: " + << TargetInfo::TargetType << " Data Type: " << input->info()->data_type() + << " Shape: " << input->info()->tensor_shape() << " Epsilon: " << epsilon + << " " << (fused_act.enabled() ? to_string(fused_act.activation()) : "") + << " InPlace: " << is_in_place_operation(input, output) << std::endl); - return std::move(func); + return func; } /** Create a backend batch normalization layer function @@ -219,7 +200,8 @@ std::unique_ptr create_batch_normalization_layer(BatchNormalizationLa * @return Backend batch normalization layer function */ template -std::unique_ptr create_fused_convolution_batch_normalization_layer(FusedConvolutionBatchNormalizationNode &node, GraphContext &ctx) +std::unique_ptr +create_fused_convolution_batch_normalization_layer(FusedConvolutionBatchNormalizationNode &node, GraphContext &ctx) { validate_node(node, 7 /* expected inputs */, 1 /* expected outputs */); @@ -249,20 +231,17 @@ std::unique_ptr create_fused_convolution_batch_normalization_layer(Fu // Create and configure function std::tie(func, func_name) = create_named_memory_managed_function( - std::string("FusedConvolutionBatchNormalizationLayer"), mm, input, weights, biases, output, mean, var, beta, gamma, epsilon, conv_info, num_groups, fast_math, fused_act); + std::string("FusedConvolutionBatchNormalizationLayer"), mm, input, weights, biases, output, mean, var, beta, + gamma, epsilon, conv_info, num_groups, fast_math, fused_act); // Log info ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() - << " Input shape: " << input->info()->tensor_shape() - << " Weights shape: " << weights->info()->tensor_shape() + << node.name() << " Type: " << node.type() << " Target: " << TargetInfo::TargetType + << " Data Type: " << input->info()->data_type() << " Input shape: " + << input->info()->tensor_shape() << " Weights shape: " << weights->info()->tensor_shape() << " Output shape: " << output->info()->tensor_shape() - << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "") - << std::endl); - return std::move(func); + << (fused_act.enabled() ? 
" " + to_string(fused_act.activation()) : "") << std::endl); + return func; } /** Create a backend fused depthwise convolution batch normalization layer function @@ -276,7 +255,9 @@ std::unique_ptr create_fused_convolution_batch_normalization_layer(Fu * @return Backend fused depthwise convolution batch normalization layer function */ template -std::unique_ptr create_fused_depthwise_convolution_batch_normalization_layer(FusedDepthwiseConvolutionBatchNormalizationNode &node, GraphContext &ctx) +std::unique_ptr +create_fused_depthwise_convolution_batch_normalization_layer(FusedDepthwiseConvolutionBatchNormalizationNode &node, + GraphContext &ctx) { validate_node(node, 7 /* expected inputs */, 1 /* expected outputs */); @@ -305,20 +286,17 @@ std::unique_ptr create_fused_depthwise_convolution_batch_normalizatio // Create and configure function std::tie(func, func_name) = create_named_memory_managed_function( - std::string("FusedDepthwiseConvolutionBatchNormalizationLayer"), mm, input, weights, biases, output, mean, var, beta, gamma, epsilon, conv_info, depth_multiplier, fused_act); + std::string("FusedDepthwiseConvolutionBatchNormalizationLayer"), mm, input, weights, biases, output, mean, var, + beta, gamma, epsilon, conv_info, depth_multiplier, fused_act); // Log info ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() - << " Input shape: " << input->info()->tensor_shape() - << " Weights shape: " << weights->info()->tensor_shape() + << node.name() << " Type: " << node.type() << " Target: " << TargetInfo::TargetType + << " Data Type: " << input->info()->data_type() << " Input shape: " + << input->info()->tensor_shape() << " Weights shape: " << weights->info()->tensor_shape() << " Output shape: " << output->info()->tensor_shape() - << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "") - << std::endl); - return std::move(func); + << (fused_act.enabled() ? 
" " + to_string(fused_act.activation()) : "") << std::endl); + return func; } /** Create a backend bounding box transform layer function @@ -346,15 +324,11 @@ std::unique_ptr create_bounding_box_transform_layer(BoundingBoxTransf func->configure(input, output, deltas, bbox_info); // Log info - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() - << " Shape: " << input->info()->tensor_shape() - << " BoundingBox Info img W: " << bbox_info.img_width() << " " - << " BoundingBox Info img H: " << bbox_info.img_height() << " " - << std::endl); + ARM_COMPUTE_LOG_GRAPH_INFO( + "Instantiated " << node.name() << " Type: " << node.type() << " Target: " << TargetInfo::TargetType + << " Data Type: " << input->info()->data_type() << " Shape: " << input->info()->tensor_shape() + << " BoundingBox Info img W: " << bbox_info.img_width() << " " + << " BoundingBox Info img H: " << bbox_info.img_height() << " " << std::endl); return std::move(func); } @@ -382,16 +356,12 @@ std::unique_ptr create_channel_shuffle_layer(ChannelShuffleLayerNode auto func = std::make_unique(); func->configure(input, output, num_groups); - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() - << " Shape: " << input->info()->tensor_shape() - << " Num groups: " << num_groups - << std::endl); + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: " + << TargetInfo::TargetType << " Data Type: " << input->info()->data_type() + << " Shape: " << input->info()->tensor_shape() + << " Num groups: " << num_groups << std::endl); - return std::move(func); + return func; } /** Create a backend layer concatenate function @@ -406,24 +376,25 @@ std::unique_ptr create_channel_shuffle_layer(ChannelShuffleLayerNode template std::unique_ptr create_concatenate_layer(ConcatenateLayerNode &node) { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating Concatenate node with ID : " << node.id() << " and Name: " << node.name() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating Concatenate node with ID : " << node.id() << " and Name: " << node.name() + << std::endl); ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1); // Return nullptr if depth concatenate is switched off - if(!node.is_enabled()) + if (!node.is_enabled()) { return nullptr; } // Extract IO and info std::vector inputs; - for(unsigned int i = 0; i < node.num_inputs(); ++i) + for (unsigned int i = 0; i < node.num_inputs(); ++i) { inputs.push_back(get_backing_tensor(node.input(i))); } - typename TargetInfo::TensorType *output = get_backing_tensor(node.output(0)); - const DataLayout data_layout = node.output(0) != nullptr ? node.output(0)->desc().layout : DataLayout::UNKNOWN; - const size_t concat_axis = get_dimension_idx(data_layout, node.concatenation_axis()); + typename TargetInfo::TensorType *output = get_backing_tensor(node.output(0)); + const DataLayout data_layout = node.output(0) != nullptr ? 
node.output(0)->desc().layout : DataLayout::UNKNOWN; + const size_t concat_axis = get_dimension_idx(data_layout, node.concatenation_axis()); // Create and configure function auto func = std::make_unique(); @@ -432,22 +403,16 @@ std::unique_ptr create_concatenate_layer(ConcatenateLaye // Log info const bool is_quantized = is_data_type_quantized_asymmetric(output->info()->data_type()); std::ostringstream qss; - if(is_quantized) + if (is_quantized) { qss << " Output QuantInfo: " << output->info()->quantization_info(); } - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << output->info()->data_type() - << " Shape: " << output->info()->tensor_shape() - << " Num Inputs: " << inputs.size() - << " Axis: " << concat_axis - << qss.str() - << std::endl); + ARM_COMPUTE_LOG_GRAPH_INFO( + "Instantiated " << node.name() << " Type: " << node.type() << " Target: " << TargetInfo::TargetType + << " Data Type: " << output->info()->data_type() << " Shape: " << output->info()->tensor_shape() + << " Num Inputs: " << inputs.size() << " Axis: " << concat_axis << qss.str() << std::endl); - return std::move(func); + return func; } /** Create a backend convolution layer function @@ -473,7 +438,7 @@ std::unique_ptr create_convolution_layer(ConvolutionLayerNode &node, const bool is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); - if(is_quantized) + if (is_quantized) { biases->info()->set_data_type(DataType::S32); } @@ -489,233 +454,51 @@ std::unique_ptr create_convolution_layer(ConvolutionLayerNode &node, std::unique_ptr func; std::string func_name; - if(conv_algorithm == ConvolutionMethod::Winograd) + if (conv_algorithm == ConvolutionMethod::Winograd) { ARM_COMPUTE_ERROR_ON_MSG(num_groups != 1, "WinogradConvolutionLayer does not support grouping!"); - std::tie(func, func_name) = create_named_memory_managed_function( - std::string("WinogradConvolutionLayer"), mm, - input, weights, biases, output, conv_info, fused_act, fast_math); + std::tie(func, func_name) = + create_named_memory_managed_function( + std::string("WinogradConvolutionLayer"), mm, input, weights, biases, output, conv_info, fused_act, + fast_math); } - else if(conv_algorithm == ConvolutionMethod::Direct) + else if (conv_algorithm == ConvolutionMethod::Direct) { ARM_COMPUTE_ERROR_ON_MSG(num_groups != 1, "DirectConvolutionLayer does not support grouping!"); std::tie(func, func_name) = create_named_function( - std::string("DirectConvolutionLayer"), - input, weights, biases, output, conv_info, fused_act); + std::string("DirectConvolutionLayer"), input, weights, biases, output, conv_info, fused_act); } - else if(conv_algorithm == ConvolutionMethod::GEMM) + else if (conv_algorithm == ConvolutionMethod::GEMM) { - std::tie(func, func_name) = create_named_memory_managed_function( - std::string("GEMMConvolutionLayer"), mm, - input, weights, biases, output, conv_info, - WeightsInfo(), Size2D(1U, 1U), fused_act, num_groups); + std::tie(func, func_name) = + create_named_memory_managed_function( + std::string("GEMMConvolutionLayer"), mm, input, weights, biases, output, conv_info, WeightsInfo(), + Size2D(1U, 1U), fused_act, num_groups); } else { - std::tie(func, func_name) = create_named_memory_managed_function( - std::string("GenericConvolutionLayer"), mm, - input, weights, biases, output, conv_info, - WeightsInfo(), Size2D(1U, 1U), fused_act, fast_math, num_groups); + std::tie(func, func_name) = + create_named_memory_managed_function( + 
std::string("GenericConvolutionLayer"), mm, input, weights, biases, output, conv_info, WeightsInfo(), + Size2D(1U, 1U), fused_act, fast_math, num_groups); } // Log info std::ostringstream qss; - if(is_quantized) + if (is_quantized) { qss << " Input QuantInfo: " << input->info()->quantization_info() << " Weights QuantInfo: " << weights->info()->quantization_info() << " Output QuantInfo: " << output->info()->quantization_info(); } ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << func_name - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() - << " Groups: " << num_groups - << " Input shape: " << input->info()->tensor_shape() - << " Weights shape: " << weights->info()->tensor_shape() - << " Output shape: " << output->info()->tensor_shape() - << qss.str() - << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "") - << std::endl); - return std::move(func); -} - -/** Create a backend convolution layer function with post operator - * - * @tparam ConvolutionLayerFunctions Backend convolution functions - * @tparam TargetInfo Target-specific information - * - * @param[in] node Node to create the backend function for - * @param[in] ctx Graph context - * - * @return Backend convolution layer function - */ -template -std::unique_ptr create_fused_convolution_with_post_op(FusedConvolutionWithPostOpNode &node, GraphContext &ctx) -{ - validate_node(node, 4 /* expected inputs */, 1 /* expected outputs */); - - // Extract IO and info - typename TargetInfo::TensorType *input = get_backing_tensor(node.input(0)); - typename TargetInfo::TensorType *weights = get_backing_tensor(node.input(1)); - typename TargetInfo::TensorType *biases = get_backing_tensor(node.input(2)); - typename TargetInfo::TensorType *output = get_backing_tensor(node.output(0)); - - const bool is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); - - if(is_quantized) - { - biases->info()->set_data_type(DataType::S32); - } - - const PadStrideInfo conv_info = node.convolution_info(); - const unsigned int num_groups = node.num_groups(); - const ActivationLayerInfo fused_act = node.fused_activation(); - - experimental::PostOpList post_ops; - - auto &post_op_info_list = node.post_op_info_list(); - for(const auto &post_op_info : post_op_info_list) - { - switch(post_op_info->type()) - { - case PostOpType::Activation: - { - const auto act_info = utils::cast::polymorphic_downcast(post_op_info.get()); - post_ops.template push_back_op>(act_info->_act); - break; - } - case PostOpType::Eltwise_Add: - { - typename TargetInfo::TensorType *add_input = get_backing_tensor(node.input(3)); - const auto eltwise_info = utils::cast::polymorphic_downcast(post_op_info.get()); - post_ops.template push_back_op>(add_input, eltwise_info->_prev_op_dst_pos, eltwise_info->_policy); - break; - } - default: - { - ARM_COMPUTE_ERROR("Unsupported PostOpType"); - } - } - } - - // Create and configure function (we assume that functions have been validated before creation) - std::shared_ptr mm = get_memory_manager(ctx, TargetInfo::TargetType); - std::unique_ptr func; - std::string func_name; - - // Fuse convolution with post ops is only supported for conv1x1, which is only implemented as gemmconv2d - std::tie(func, func_name) = create_named_memory_managed_function( - std::string("GEMMConvolutionLayer"), mm, - input, weights, biases, output, conv_info, - WeightsInfo(), Size2D(1U, 1U), fused_act, num_groups, post_ops); - - // Log info - std::ostringstream qss; - 
if(is_quantized) - { - qss << " Input QuantInfo: " << input->info()->quantization_info() - << " Weights QuantInfo: " << weights->info()->quantization_info() - << " Output QuantInfo: " << output->info()->quantization_info(); - } - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << func_name - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() - << " Groups: " << num_groups - << " Input shape: " << input->info()->tensor_shape() - << " Weights shape: " << weights->info()->tensor_shape() - << " Output shape: " << output->info()->tensor_shape() - << qss.str() - << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "") - << " Post ops" << post_ops - << std::endl); - return std::move(func); -} - -/** Create a backend convolution batch normalization layer function with post operator - * - * @tparam FusedLayerTypes Backend convolution functions - * @tparam TargetInfo Target-specific information - * - * @param[in] node Node to create the backend function for - * @param[in] ctx Graph context - * - * @return Backend fused convolution with batch normalization layer function - */ -template -std::unique_ptr create_fused_convolution_batch_normalization_with_post_op(FusedConvolutionBatchNormalizationWithPostOpsNode &node, GraphContext &ctx) -{ - validate_node(node, 8 /* expected inputs */, 1 /* expected outputs */); - - // Extract IO and info - typename TargetInfo::TensorType *input = get_backing_tensor(node.input(0)); - typename TargetInfo::TensorType *weights = get_backing_tensor(node.input(1)); - typename TargetInfo::TensorType *biases = get_backing_tensor(node.input(2)); - typename TargetInfo::TensorType *mean = get_backing_tensor(node.input(3)); - typename TargetInfo::TensorType *var = get_backing_tensor(node.input(4)); - typename TargetInfo::TensorType *beta = get_backing_tensor(node.input(5)); - typename TargetInfo::TensorType *gamma = get_backing_tensor(node.input(6)); - - typename TargetInfo::TensorType *output = get_backing_tensor(node.output(0)); - - const PadStrideInfo conv_info = node.convolution_info(); - const unsigned int num_groups = node.num_groups(); - const bool fast_math = node.fast_math_hint() == FastMathHint::Enabled; - const float epsilon = node.epsilon(); - - experimental::PostOpList post_ops; - - auto &post_op_info_list = node.post_op_info_list(); - for(const auto &post_op_info : post_op_info_list) - { - switch(post_op_info->type()) - { - case PostOpType::Activation: - { - const auto act_info = utils::cast::polymorphic_downcast(post_op_info.get()); - post_ops.template push_back_op>(act_info->_act); - break; - } - case PostOpType::Eltwise_Add: - { - typename TargetInfo::TensorType *add_input = get_backing_tensor(node.input(3)); - const auto eltwise_info = utils::cast::polymorphic_downcast(post_op_info.get()); - post_ops.template push_back_op>(add_input, eltwise_info->_prev_op_dst_pos, eltwise_info->_policy); - break; - } - default: - { - ARM_COMPUTE_ERROR("Unsupported PostOpType"); - } - } - } - - // Create and configure function (we assume that functions have been validated before creation) - std::shared_ptr mm = get_memory_manager(ctx, TargetInfo::TargetType); - std::unique_ptr func; - std::string func_name; - - using FType = FusedConvolutionBatchNormalizationWithPostOpsFunction; - - // Create and configure function - std::tie(func, func_name) = create_named_memory_managed_function( - std::string("FusedConvolutionBatchNormalizationLayerWithPostOpsLayer"), mm, input, weights, biases, output, mean, 
var, beta, gamma, epsilon, conv_info, num_groups, fast_math, post_ops); - - // Log info - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() + << node.name() << " Type: " << func_name << " Target: " << TargetInfo::TargetType + << " Data Type: " << input->info()->data_type() << " Groups: " << num_groups << " Input shape: " << input->info()->tensor_shape() << " Weights shape: " << weights->info()->tensor_shape() - << " Output shape: " << output->info()->tensor_shape() - << " Post Ops:" << post_ops - << std::endl); - return std::move(func); + << " Output shape: " << output->info()->tensor_shape() << qss.str() + << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "") << std::endl); + return func; } /** Create a backend deconvolution layer function @@ -746,19 +529,14 @@ std::unique_ptr create_deconvolution_layer(DeconvolutionLayerNode &no std::unique_ptr func; std::tie(func, std::ignore) = create_named_memory_managed_function( - std::string(), mm, - input, weights, biases, output, deconv_info); + std::string(), mm, input, weights, biases, output, deconv_info); // Log info - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() - << " Input shape: " << input->info()->tensor_shape() - << " Weights shape: " << weights->info()->tensor_shape() - << " Output shape: " << output->info()->tensor_shape() - << std::endl); + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: " + << TargetInfo::TargetType << " Data Type: " << input->info()->data_type() + << " Input shape: " << input->info()->tensor_shape() + << " Weights shape: " << weights->info()->tensor_shape() + << " Output shape: " << output->info()->tensor_shape() << std::endl); return func; } @@ -784,7 +562,7 @@ std::unique_ptr create_depthwise_convolution_layer(DepthwiseConvoluti const bool is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); - if(is_quantized) + if (is_quantized) { biases->info()->set_data_type(DataType::S32); } @@ -797,31 +575,26 @@ std::unique_ptr create_depthwise_convolution_layer(DepthwiseConvoluti std::unique_ptr func; std::string func_name; - std::tie(func, func_name) = create_named_function( - std::string("DepthwiseConvolutionLayer"), - input, weights, biases, output, conv_info, depth_multiplier, fused_act); + std::tie(func, func_name) = + create_named_function(std::string("DepthwiseConvolutionLayer"), input, weights, + biases, output, conv_info, depth_multiplier, fused_act); // Log info std::ostringstream qss; - if(is_quantized) + if (is_quantized) { qss << " Input QuantInfo: " << input->info()->quantization_info() << " Weights QuantInfo: " << weights->info()->quantization_info() << " Output QuantInfo: " << output->info()->quantization_info(); } ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << func_name - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() - << " Input shape: " << input->info()->tensor_shape() - << " Weights shape: " << weights->info()->tensor_shape() + << node.name() << " Type: " << func_name << " Target: " << TargetInfo::TargetType + << " Data Type: " << input->info()->data_type() << " Input shape: " + << input->info()->tensor_shape() << " Weights shape: " << weights->info()->tensor_shape() << " 
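// The quantized branches above retag the bias tensor as S32 before configuring the convolution
// or depthwise convolution: with QASYMM8 inputs and weights the multiply-accumulate runs in
// 32-bit integer arithmetic, so the bias must live in the accumulator domain. The sketch below
// shows only that arithmetic idea (it is not ACL kernel code, and the requantization back to
// 8 bit with scales/zero points is omitted):

#include <cstdint>
#include <cstdio>
#include <vector>

// One output value of a quantized convolution: int32 accumulation plus an int32 bias.
int32_t quantized_accumulate(const std::vector<uint8_t> &input,
                             const std::vector<uint8_t> &weights,
                             int32_t                     input_zero_point,
                             int32_t                     weights_zero_point,
                             int32_t                     bias_s32)
{
    int32_t acc = 0;
    for (size_t i = 0; i < input.size(); ++i)
    {
        acc += (static_cast<int32_t>(input[i]) - input_zero_point) *
               (static_cast<int32_t>(weights[i]) - weights_zero_point);
    }
    return acc + bias_s32; // bias is added in the int32 accumulator domain
}

int main()
{
    const std::vector<uint8_t> in{10, 20, 30};
    const std::vector<uint8_t> w{3, 2, 1};
    std::printf("%d\n", quantized_accumulate(in, w, 5, 2, 100));
}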
Output shape: " << output->info()->tensor_shape() - << " Depth multiplier: " << depth_multiplier - << qss.str() - << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "") - << std::endl); - return std::move(func); + << " Depth multiplier: " << depth_multiplier << qss.str() + << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "") << std::endl); + return func; } /** Create a backend depth to space layer function @@ -850,17 +623,13 @@ std::unique_ptr create_depth_to_space_layer(DepthToSpaceLayerNode &no func->configure(input, output, node.block_shape()); // Log info - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() - << " Input shape: " << input->info()->tensor_shape() - << " Block Size: " << node.block_shape() - << " Output shape: " << output->info()->tensor_shape() - << std::endl); + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: " + << TargetInfo::TargetType << " Data Type: " << input->info()->data_type() + << " Input shape: " << input->info()->tensor_shape() + << " Block Size: " << node.block_shape() + << " Output shape: " << output->info()->tensor_shape() << std::endl); - return std::move(func); + return func; } /** Create a backend dequantize layer function @@ -889,17 +658,13 @@ std::unique_ptr create_dequantization_layer(DequantizationLayerNode & func->configure(input, output); // Log info - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() - << " Input shape: " << input->info()->tensor_shape() - << " Input quantization info: " << output->info()->quantization_info() - << " Output shape: " << output->info()->tensor_shape() - << std::endl); + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: " + << TargetInfo::TargetType << " Data Type: " << input->info()->data_type() + << " Input shape: " << input->info()->tensor_shape() + << " Input quantization info: " << output->info()->quantization_info() + << " Output shape: " << output->info()->tensor_shape() << std::endl); - return std::move(func); + return func; } /** Create a backend detection output layer function * @@ -933,18 +698,14 @@ std::unique_ptr create_detection_output_layer(DetectionOutputLayerNod // Log info ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input0->info()->data_type() - << " Input0 shape: " << input0->info()->tensor_shape() - << " Input1 shape: " << input1->info()->tensor_shape() + << node.name() << " Type: " << node.type() << " Target: " << TargetInfo::TargetType + << " Data Type: " << input0->info()->data_type() << " Input0 shape: " + << input0->info()->tensor_shape() << " Input1 shape: " << input1->info()->tensor_shape() << " Input2 shape: " << input2->info()->tensor_shape() << " Output shape: " << output->info()->tensor_shape() - << " DetectionOutputLayer info: " << detect_info - << std::endl); + << " DetectionOutputLayer info: " << detect_info << std::endl); - return std::move(func); + return func; } /** Create a backend detection post process layer function @@ -985,21 +746,17 @@ std::unique_ptr create_detection_post_process_layer(DetectionPostProc // Log info ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << 
node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input0->info()->data_type() - << " Input0 shape: " << input0->info()->tensor_shape() - << " Input1 shape: " << input1->info()->tensor_shape() + << node.name() << " Type: " << node.type() << " Target: " << TargetInfo::TargetType + << " Data Type: " << input0->info()->data_type() << " Input0 shape: " + << input0->info()->tensor_shape() << " Input1 shape: " << input1->info()->tensor_shape() << " Input2 shape: " << input2->info()->tensor_shape() << " Output0 shape: " << output0->info()->tensor_shape() << " Output1 shape: " << output1->info()->tensor_shape() << " Output2 shape: " << output2->info()->tensor_shape() << " Output3 shape: " << output3->info()->tensor_shape() - << " DetectionPostProcessLayer info: " << detect_info - << std::endl); + << " DetectionPostProcessLayer info: " << detect_info << std::endl); - return std::move(func); + return func; } /** Create a backend element-wise operation layer function @@ -1029,35 +786,31 @@ std::unique_ptr create_eltwise_layer(EltwiseLayerNode &node) std::unique_ptr func = nullptr; std::string func_name; - if(eltwise_op == EltwiseOperation::Add) + if (eltwise_op == EltwiseOperation::Add) { std::tie(func, func_name) = create_named_function( - std::string("ArithmeticAddition"), - input1, input2, output, convert_policy, act_info); + std::string("ArithmeticAddition"), input1, input2, output, convert_policy, act_info); } - else if(eltwise_op == EltwiseOperation::Sub) + else if (eltwise_op == EltwiseOperation::Sub) { std::tie(func, func_name) = create_named_function( - std::string("ArithmeticSubtraction"), - input1, input2, output, convert_policy, act_info); + std::string("ArithmeticSubtraction"), input1, input2, output, convert_policy, act_info); } - else if(eltwise_op == EltwiseOperation::Mul) + else if (eltwise_op == EltwiseOperation::Mul) { std::tie(func, func_name) = create_named_function( - std::string("PixelWiseMultiplication"), - input1, input2, output, 1.f, convert_policy, node.rounding_policy(), act_info); + std::string("PixelWiseMultiplication"), input1, input2, output, 1.f, convert_policy, node.rounding_policy(), + act_info); } - else if(eltwise_op == EltwiseOperation::Max) + else if (eltwise_op == EltwiseOperation::Max) { std::tie(func, func_name) = create_named_function( - std::string("ElementwiseMaximum"), - input1, input2, output, act_info); + std::string("ElementwiseMaximum"), input1, input2, output, act_info); } - else if(eltwise_op == EltwiseOperation::Div) + else if (eltwise_op == EltwiseOperation::Div) { std::tie(func, func_name) = create_named_function( - std::string("ArithmeticDivision"), - input1, input2, output, act_info); + std::string("ArithmeticDivision"), input1, input2, output, act_info); } else { @@ -1065,16 +818,12 @@ std::unique_ptr create_eltwise_layer(EltwiseLayerNode &node) } // Log info - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Operation: " << func_name - << " Data Type: " << input1->info()->data_type() - << " Shape: " << input1->info()->tensor_shape() - << std::endl); + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() + << " Target: " << TargetInfo::TargetType << " Operation: " << func_name + << " Data Type: " << input1->info()->data_type() + << " Shape: " << input1->info()->tensor_shape() << std::endl); - return std::move(func); + return func; } /** Create a backend unary 
element-wise operation layer function @@ -1101,11 +850,10 @@ std::unique_ptr create_unary_eltwise_layer(UnaryEltwiseLayerNode &nod std::unique_ptr func = nullptr; std::string func_name; - if(eltwise_op == UnaryEltwiseOperation::Exp) + if (eltwise_op == UnaryEltwiseOperation::Exp) { - std::tie(func, func_name) = create_named_function( - std::string("Exp"), - input, output); + std::tie(func, func_name) = + create_named_function(std::string("Exp"), input, output); } else { @@ -1113,16 +861,12 @@ std::unique_ptr create_unary_eltwise_layer(UnaryEltwiseLayerNode &nod } // Log info - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Operation: " << func_name - << " Data Type: " << input->info()->data_type() - << " Shape: " << input->info()->tensor_shape() - << std::endl); + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() + << " Target: " << TargetInfo::TargetType << " Operation: " << func_name + << " Data Type: " << input->info()->data_type() + << " Shape: " << input->info()->tensor_shape() << std::endl); - return std::move(func); + return func; } /** Create a backend flatten layer function @@ -1151,16 +895,12 @@ std::unique_ptr create_flatten_layer(FlattenLayerNode &node) func->configure(input, output); // Log info - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() - << " Input shape: " << input->info()->tensor_shape() - << " Output shape: " << output->info()->tensor_shape() - << std::endl); + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: " + << TargetInfo::TargetType << " Data Type: " << input->info()->data_type() + << " Input shape: " << input->info()->tensor_shape() + << " Output shape: " << output->info()->tensor_shape() << std::endl); - return std::move(func); + return func; } /** Create a backend fully connected layer function @@ -1200,24 +940,19 @@ std::unique_ptr create_fully_connected_layer(FullyConnectedLayerNode // Log info std::ostringstream qss; - if(is_quantized) + if (is_quantized) { qss << " Input QuantInfo: " << input->info()->quantization_info() << " Weights QuantInfo: " << weights->info()->quantization_info() << " Output QuantInfo: " << output->info()->quantization_info(); } - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() - << qss.str() - << " Input shape: " << input->info()->tensor_shape() - << " Weights shape: " << weights->info()->tensor_shape() - << " Output shape: " << output->info()->tensor_shape() - << std::endl); + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: " + << TargetInfo::TargetType << " Data Type: " << input->info()->data_type() + << qss.str() << " Input shape: " << input->info()->tensor_shape() + << " Weights shape: " << weights->info()->tensor_shape() + << " Output shape: " << output->info()->tensor_shape() << std::endl); - return std::move(func); + return func; } /** Create a backend generate proposals layer function @@ -1255,16 +990,14 @@ std::unique_ptr create_generate_proposals_layer(GenerateProposalsLaye func->configure(scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info); // Log info - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << 
node.type() - << " Target " << TargetInfo::TargetType - << " Data Type: " << scores->info()->data_type() - << " Scores shape: " << scores->info()->tensor_shape() + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " + << node.type() << " Target " << TargetInfo::TargetType << " Data Type: " + << scores->info()->data_type() << " Scores shape: " << scores->info()->tensor_shape() << " Deltas shape: " << deltas->info()->tensor_shape() << " Anchors shape: " << anchors->info()->tensor_shape() << " Proposals shape: " << proposals->info()->tensor_shape() << " Num valid proposals shape: " << num_valid_proposals->info()->tensor_shape() - << " Scores Out shape: " << scores_out->info()->tensor_shape() - << std::endl); + << " Scores Out shape: " << scores_out->info()->tensor_shape() << std::endl); return std::move(func); } @@ -1299,18 +1032,13 @@ std::unique_ptr create_l2_normalize_layer(L2NormalizeLayerNode &node, func->configure(input, output, axis, epsilon); // Log info - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() - << " Input shape: " << input->info()->tensor_shape() - << " Output shape: " << output->info()->tensor_shape() - << " Axis: " << axis - << " Epsilon: " << epsilon - << std::endl); + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: " + << TargetInfo::TargetType << " Data Type: " << input->info()->data_type() + << " Input shape: " << input->info()->tensor_shape() + << " Output shape: " << output->info()->tensor_shape() + << " Axis: " << axis << " Epsilon: " << epsilon << std::endl); - return std::move(func); + return func; } /** Create a backend normalization layer function @@ -1342,15 +1070,11 @@ std::unique_ptr create_normalization_layer(NormalizationLayerNode &no func->configure(input, output, norm_info); // Log info - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() - << " Input shape: " << input->info()->tensor_shape() - << " Output shape: " << output->info()->tensor_shape() - << " Normalization info: " << norm_info.type() - << std::endl); + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: " + << TargetInfo::TargetType << " Data Type: " << input->info()->data_type() + << " Input shape: " << input->info()->tensor_shape() + << " Output shape: " << output->info()->tensor_shape() + << " Normalization info: " << norm_info.type() << std::endl); return std::move(func); } @@ -1384,13 +1108,9 @@ std::unique_ptr create_normalize_planar_yuv_layer(NormalizePlanarYUVL func->configure(input, output, mean, std); // Log info - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() - << " Shape: " << input->info()->tensor_shape() - << std::endl); + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: " + << TargetInfo::TargetType << " Data Type: " << input->info()->data_type() + << " Shape: " << input->info()->tensor_shape() << std::endl); return std::move(func); } @@ -1422,16 +1142,12 @@ std::unique_ptr create_pad_layer(PadLayerNode &node) func->configure(input, output, padding, pad_value); // Log info - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " 
Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() - << " Input shape: " << input->info()->tensor_shape() - << " Output shape: " << output->info()->tensor_shape() - << std::endl); + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: " + << TargetInfo::TargetType << " Data Type: " << input->info()->data_type() + << " Input shape: " << input->info()->tensor_shape() + << " Output shape: " << output->info()->tensor_shape() << std::endl); - return std::move(func); + return func; } /** Create a backend permute layer function @@ -1460,17 +1176,13 @@ std::unique_ptr create_permute_layer(PermuteLayerNode &node) func->configure(input, output, perm); // Log info - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() - << " Input shape: " << input->info()->tensor_shape() - << " Output shape: " << output->info()->tensor_shape() - << " Permutation vector: " << perm - << std::endl); + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: " + << TargetInfo::TargetType << " Data Type: " << input->info()->data_type() + << " Input shape: " << input->info()->tensor_shape() + << " Output shape: " << output->info()->tensor_shape() + << " Permutation vector: " << perm << std::endl); - return std::move(func); + return func; } /** Create a backend pooling layer function @@ -1499,17 +1211,13 @@ std::unique_ptr create_pooling_layer(PoolingLayerNode &node) func->configure(input, output, pool_info); // Log info - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() - << " Input shape: " << input->info()->tensor_shape() - << " Output shape: " << output->info()->tensor_shape() - << " Pooling info: " << pool_info.pool_type - << std::endl); + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: " + << TargetInfo::TargetType << " Data Type: " << input->info()->data_type() + << " Input shape: " << input->info()->tensor_shape() + << " Output shape: " << output->info()->tensor_shape() + << " Pooling info: " << pool_info.pool_type << std::endl); - return std::move(func); + return func; } /** Create a backend PRelu layer function @@ -1538,16 +1246,12 @@ std::unique_ptr create_prelu_layer(PReluLayerNode &node) func->configure(input, alpha, output); // Log info - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() - << " Input shape: " << input->info()->tensor_shape() - << " Output shape: " << output->info()->tensor_shape() - << std::endl); + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: " + << TargetInfo::TargetType << " Data Type: " << input->info()->data_type() + << " Input shape: " << input->info()->tensor_shape() + << " Output shape: " << output->info()->tensor_shape() << std::endl); - return std::move(func); + return func; } /** Create a backend print layer function @@ -1568,13 +1272,9 @@ std::unique_ptr create_print_layer(PrintLayerNode &node) ARM_COMPUTE_UNUSED(input); // Log info - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: 
" << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() - << " Input shape: " << input->info()->tensor_shape() - << std::endl); + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: " + << TargetInfo::TargetType << " Data Type: " << input->info()->data_type() + << " Input shape: " << input->info()->tensor_shape() << std::endl); return nullptr; } @@ -1608,17 +1308,13 @@ std::unique_ptr create_priorbox_layer(PriorBoxLayerNode &node) // Log info ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input0->info()->data_type() - << " Input0 shape: " << input0->info()->tensor_shape() - << " Input1 shape: " << input1->info()->tensor_shape() + << node.name() << " Type: " << node.type() << " Target: " << TargetInfo::TargetType + << " Data Type: " << input0->info()->data_type() << " Input0 shape: " + << input0->info()->tensor_shape() << " Input1 shape: " << input1->info()->tensor_shape() << " Output shape: " << output->info()->tensor_shape() - << " PriorBoxLayer info: " << prior_info - << std::endl); + << " PriorBoxLayer info: " << prior_info << std::endl); - return std::move(func); + return func; } /** Create a backend quantization layer function @@ -1646,16 +1342,12 @@ std::unique_ptr create_quantization_layer(QuantizationLayerNode &node func->configure(input, output); // Log info - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() - << " Input shape: " << input->info()->tensor_shape() - << " Output shape: " << output->info()->tensor_shape() - << std::endl); + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: " + << TargetInfo::TargetType << " Data Type: " << input->info()->data_type() + << " Input shape: " << input->info()->tensor_shape() + << " Output shape: " << output->info()->tensor_shape() << std::endl); - return std::move(func); + return func; } /** Create a backend reduction operation layer function @@ -1688,18 +1380,13 @@ std::unique_ptr create_reduction_operation_layer(ReductionLayerNode & // Log info ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType + << node.name() << " Type: " << node.type() << " Target: " << TargetInfo::TargetType << " Data Type: " << input->info()->data_type() << " Input shape: " << input->info()->tensor_shape() - << " Output shape: " << output->info()->tensor_shape() - << " Operation: " << op - << " Axis: " << axis - << " Keep dimensions:" << keep_dims - << std::endl); + << " Output shape: " << output->info()->tensor_shape() << " Operation: " << op + << " Axis: " << axis << " Keep dimensions:" << keep_dims << std::endl); - return std::move(func); + return func; } /** Create a backend reorg layer function @@ -1727,16 +1414,12 @@ std::unique_ptr create_reorg_layer(ReorgLayerNode &node) func->configure(input, output, node.stride()); // Log info - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() - << " Input shape: " << input->info()->tensor_shape() - << " Output shape: " << output->info()->tensor_shape() - << std::endl); + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() 
<< " Target: " + << TargetInfo::TargetType << " Data Type: " << input->info()->data_type() + << " Input shape: " << input->info()->tensor_shape() + << " Output shape: " << output->info()->tensor_shape() << std::endl); - return std::move(func); + return func; } /** Create a backend reshape layer function @@ -1764,16 +1447,12 @@ std::unique_ptr create_reshape_layer(ReshapeLayerNode &node) func->configure(input, output); // Log info - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() - << " Input shape: " << input->info()->tensor_shape() - << " Output shape: " << output->info()->tensor_shape() - << std::endl); + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: " + << TargetInfo::TargetType << " Data Type: " << input->info()->data_type() + << " Input shape: " << input->info()->tensor_shape() + << " Output shape: " << output->info()->tensor_shape() << std::endl); - return std::move(func); + return func; } /** Create a backend resize layer function @@ -1799,20 +1478,17 @@ std::unique_ptr create_resize_layer(ResizeLayerNode &node) // Create and configure function auto func = std::make_unique(); - func->configure(input, output, ScaleKernelInfo{ policy, BorderMode::CONSTANT, PixelValue(), SamplingPolicy::CENTER, false, false }); + func->configure(input, output, + ScaleKernelInfo{policy, BorderMode::CONSTANT, PixelValue(), SamplingPolicy::CENTER, false, false}); // Log info - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() - << " Input shape: " << input->info()->tensor_shape() - << " Output shape: " << output->info()->tensor_shape() - << " Interpolation: " << policy - << std::endl); + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: " + << TargetInfo::TargetType << " Data Type: " << input->info()->data_type() + << " Input shape: " << input->info()->tensor_shape() + << " Output shape: " << output->info()->tensor_shape() + << " Interpolation: " << policy << std::endl); - return std::move(func); + return func; } /** Create a backend ROI align layer function @@ -1845,17 +1521,13 @@ std::unique_ptr create_roi_align_layer(ROIAlignLayerNode &node) func->configure(input, rois, output, pool_info); // Log info - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() - << " Input shape: " << input->info()->tensor_shape() - << " Output shape: " << output->info()->tensor_shape() - << " ROIs shape: " << rois->info()->tensor_shape() - << " ROIPooling width: " << pool_info.pooled_width() - << " ROIPooling height: " << pool_info.pooled_height() - << std::endl); + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: " + << TargetInfo::TargetType << " Data Type: " << input->info()->data_type() + << " Input shape: " << input->info()->tensor_shape() + << " Output shape: " << output->info()->tensor_shape() + << " ROIs shape: " << rois->info()->tensor_shape() + << " ROIPooling width: " << pool_info.pooled_width() + << " ROIPooling height: " << pool_info.pooled_height() << std::endl); return std::move(func); } @@ -1885,16 +1557,12 @@ std::unique_ptr create_slice_layer(SliceLayerNode &node) 
func->configure(input, output, node.starts(), node.ends()); // Log info - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() - << " Input shape: " << input->info()->tensor_shape() - << " Output shape: " << output->info()->tensor_shape() - << std::endl); + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: " + << TargetInfo::TargetType << " Data Type: " << input->info()->data_type() + << " Input shape: " << input->info()->tensor_shape() + << " Output shape: " << output->info()->tensor_shape() << std::endl); - return std::move(func); + return func; } /** Create a backend softmax layer function @@ -1924,16 +1592,12 @@ std::unique_ptr create_softmax_layer(SoftmaxLayerNode &node, GraphCon func->configure(input, output, beta); // Log info - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() - << " Input shape: " << input->info()->tensor_shape() - << " Output shape: " << output->info()->tensor_shape() - << std::endl); + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: " + << TargetInfo::TargetType << " Data Type: " << input->info()->data_type() + << " Input shape: " << input->info()->tensor_shape() + << " Output shape: " << output->info()->tensor_shape() << std::endl); - return std::move(func); + return func; } /** Create a backend layer stack function @@ -1948,12 +1612,13 @@ std::unique_ptr create_softmax_layer(SoftmaxLayerNode &node, GraphCon template std::unique_ptr create_stack_layer(StackLayerNode &node) { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating Stack node with ID : " << node.id() << " and Name: " << node.name() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating Stack node with ID : " << node.id() << " and Name: " << node.name() + << std::endl); ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1); // Extract IO and info std::vector inputs; - for(unsigned int i = 0; i < node.num_inputs(); ++i) + for (unsigned int i = 0; i < node.num_inputs(); ++i) { inputs.push_back(get_backing_tensor(node.input(i))); } @@ -1965,18 +1630,14 @@ std::unique_ptr create_stack_layer(StackLayerNode &node) func->configure(inputs, axis, output); // Log info - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - << " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << output->info()->data_type() - << " Inputs shape: " << inputs[0]->info()->tensor_shape() - << " Output shape: " << output->info()->tensor_shape() - << " Num Inputs: " << inputs.size() - << " Axis: " << axis - << std::endl); + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() + << " Target: " << TargetInfo::TargetType + << " Data Type: " << output->info()->data_type() + << " Inputs shape: " << inputs[0]->info()->tensor_shape() + << " Output shape: " << output->info()->tensor_shape() + << " Num Inputs: " << inputs.size() << " Axis: " << axis << std::endl); - return std::move(func); + return func; } /** Create a backend slice layer function @@ -2009,20 +1670,16 @@ std::unique_ptr create_strided_slice_layer(StridedSliceLayerNode &nod func->configure(input, output, starts, ends, strides, info.begin_mask(), info.end_mask(), info.shrink_axis_mask()); // Log info - ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " - << node.name() - 
<< " Type: " << node.type() - << " Target: " << TargetInfo::TargetType - << " Data Type: " << input->info()->data_type() - << " Input shape: " << input->info()->tensor_shape() - << " Output shape: " << output->info()->tensor_shape() - << std::endl); + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: " + << TargetInfo::TargetType << " Data Type: " << input->info()->data_type() + << " Input shape: " << input->info()->tensor_shape() + << " Output shape: " << output->info()->tensor_shape() << std::endl); - return std::move(func); + return func; } } // namespace detail } // namespace backends } // namespace graph } // namespace arm_compute -#endif /* ARM_COMPUTE_GRAPH_BACKENDS_DETAIL_FUNCTION_HELPERS_H */ +#endif // ACL_ARM_COMPUTE_GRAPH_BACKENDS_FUNCTIONHELPERS_H diff --git a/arm_compute/graph/backends/FusedConvolutionBatchNormalizationFunction.h b/arm_compute/graph/backends/FusedConvolutionBatchNormalizationFunction.h index ec03bcc952248b276d16e9fdbf559d6052406276..27e21cbc7eb447bc7117f4470a33c98b5de18081 100644 --- a/arm_compute/graph/backends/FusedConvolutionBatchNormalizationFunction.h +++ b/arm_compute/graph/backends/FusedConvolutionBatchNormalizationFunction.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,11 +22,12 @@ * SOFTWARE. */ -#ifndef ARM_COMPUTE_GRAPH_BACKENDS_FUSED_CONVOLUTION_BATCH_NORMAZLIZATION_FUNCTION_H -#define ARM_COMPUTE_GRAPH_BACKENDS_FUSED_CONVOLUTION_BATCH_NORMAZLIZATION_FUNCTION_H +#ifndef ACL_ARM_COMPUTE_GRAPH_BACKENDS_FUSEDCONVOLUTIONBATCHNORMALIZATIONFUNCTION_H +#define ACL_ARM_COMPUTE_GRAPH_BACKENDS_FUSEDCONVOLUTIONBATCHNORMALIZATIONFUNCTION_H #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" namespace arm_compute { @@ -69,15 +70,19 @@ public: * @param[in] fused_act Activation layer information in case of a fused activation. * */ - void configure(TensorType *input, - TensorType *weights, - TensorType *bias, - TensorType *output, - const TensorType *mean, - const TensorType *var, - const TensorType *beta, - const TensorType *gamma, - float epsilon, const PadStrideInfo &conv_info, unsigned int num_groups, bool fast_math, ActivationLayerInfo const &fused_act) + void configure(TensorType *input, + TensorType *weights, + TensorType *bias, + TensorType *output, + const TensorType *mean, + const TensorType *var, + const TensorType *beta, + const TensorType *gamma, + float epsilon, + const PadStrideInfo &conv_info, + unsigned int num_groups, + bool fast_math, + ActivationLayerInfo const &fused_act) { // We don't run any validate, as we assume that the layers have been already validated const bool has_bias = (bias != nullptr); @@ -85,7 +90,7 @@ public: // We check if the layer has a bias. If yes, use it in-place. 
If not, we need to create one // as batch normalization might end up with a bias != 0 - if(has_bias) + if (has_bias) { _fused_batch_norm_layer.configure(weights, mean, var, nullptr, nullptr, bias, beta, gamma, epsilon); bias_to_use = bias; @@ -96,9 +101,10 @@ public: bias_to_use = &_fused_bias; } - _conv_layer.configure(input, weights, bias_to_use, output, conv_info, WeightsInfo(), Size2D(1U, 1U), fused_act, fast_math, num_groups); + _conv_layer.configure(input, weights, bias_to_use, output, conv_info, WeightsInfo(), Size2D(1U, 1U), fused_act, + fast_math, num_groups); - if(!has_bias) + if (!has_bias) { _fused_bias.allocator()->allocate(); } @@ -113,7 +119,7 @@ public: void prepare() { - if(!_is_prepared) + if (!_is_prepared) { _fused_batch_norm_layer.run(); _is_prepared = true; @@ -130,4 +136,4 @@ private: } // namespace graph } // namespace arm_compute -#endif /* ARM_COMPUTE_GRAPH_BACKENDS_FUSED_CONVOLUTION_BATCH_NORMAZLIZATION_FUNCTION_H */ +#endif // ACL_ARM_COMPUTE_GRAPH_BACKENDS_FUSEDCONVOLUTIONBATCHNORMALIZATIONFUNCTION_H diff --git a/arm_compute/graph/backends/FusedConvolutionBatchNormalizationWithPostOpsFunction.h b/arm_compute/graph/backends/FusedConvolutionBatchNormalizationWithPostOpsFunction.h deleted file mode 100644 index 10f2e5c25e4fdd1f51e96aeb140313785036c6a0..0000000000000000000000000000000000000000 --- a/arm_compute/graph/backends/FusedConvolutionBatchNormalizationWithPostOpsFunction.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef ARM_COMPUTE_GRAPH_BACKENDS_FUSED_CONVOLUTION_BATCH_NORMAZLIZATION_WITH_POST_OPS_FUNCTION_H -#define ARM_COMPUTE_GRAPH_BACKENDS_FUSED_CONVOLUTION_BATCH_NORMAZLIZATION_WITH_POST_OPS_FUNCTION_H - -#include "arm_compute/core/Types.h" -#include "arm_compute/core/experimental/IPostOp.h" -#include "arm_compute/runtime/IFunction.h" - -namespace arm_compute -{ -namespace graph -{ -namespace backends -{ -/** Wrapper function to first apply {NE, CL}BatchNormalizationLayer on the weights and then run {NE, CL}ConvolutionLayer with the modified weights */ -template -class FusedConvolutionBatchNormalizationWithPostOpsFunction : public IFunction -{ -public: - using TensorType = typename TargetInfo::TensorType; - using TensorConcreteType = typename TargetInfo::TensorConcreteType; - - FusedConvolutionBatchNormalizationWithPostOpsFunction(std::shared_ptr memory_manager = nullptr) - : _conv_layer(memory_manager), _fused_batch_norm_layer(), _fused_bias(), _is_prepared(false) - { - } - - /** Set the input and output tensors. - * - * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: QASYMM8/F16/F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input. - * @param[in] bias Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Should match @p input data type. - * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. - * Data types supported: Same as @p input. - * @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input - * @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input - * @param[in] beta Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for beta is 0. Data types supported: Same as @p input - * @param[in] gamma Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for gamma is 1. Data types supported: Same as @p input - * @param[in] epsilon Small value to avoid division with zero. Default value is 0.001f. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] num_groups Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout - * @param[in] fast_math Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation - * available which may introduce a drop of accuracy as well. Default is false - * @param[in] post_ops A sequence of post operations that are performed after the main operation. 
- * - */ - void configure(TensorType *input, - TensorType *weights, - TensorType *bias, - TensorType *output, - const TensorType *mean, - const TensorType *var, - const TensorType *beta, - const TensorType *gamma, - float epsilon, const PadStrideInfo &conv_info, unsigned int num_groups, bool fast_math, - const arm_compute::experimental::PostOpList &post_ops = experimental::PostOpList {}) - { - // We don't run any validate, as we assume that the layers have been already validated - const bool has_bias = (bias != nullptr); - const TensorType *bias_to_use; - - // We check if the layer has a bias. If yes, use it in-place. If not, we need to create one - // as batch normalization might end up with a bias != 0 - if(has_bias) - { - _fused_batch_norm_layer.configure(weights, mean, var, nullptr, nullptr, bias, beta, gamma, epsilon); - bias_to_use = bias; - } - else - { - _fused_batch_norm_layer.configure(weights, mean, var, nullptr, &_fused_bias, nullptr, beta, gamma, epsilon); - bias_to_use = &_fused_bias; - } - - ActivationLayerInfo fused_act = ActivationLayerInfo(); // Passing an empty ActivationLayerInfo. - _conv_layer.configure(input, weights, bias_to_use, output, conv_info, WeightsInfo(), Size2D(1U, 1U), fused_act, fast_math, num_groups, post_ops); - - if(!has_bias) - { - _fused_bias.allocator()->allocate(); - } - } - - // Inherited methods overridden: - void run() - { - prepare(); - _conv_layer.run(); - } - - void prepare() - { - if(!_is_prepared) - { - _fused_batch_norm_layer.run(); - _is_prepared = true; - } - } - -private: - typename FusedLayerTypes::ConvolutionLayer _conv_layer; - typename FusedLayerTypes::FuseBatchNormalization _fused_batch_norm_layer; - TensorConcreteType _fused_bias; - bool _is_prepared; -}; -} // namespace backends -} // namespace graph -} // namespace arm_compute - -#endif /* ARM_COMPUTE_GRAPH_BACKENDS_FUSED_CONVOLUTION_BATCH_NORMAZLIZATION_WITH_POST_OPS_FUNCTION_H */ diff --git a/arm_compute/graph/backends/FusedDepthwiseConvolutionBatchNormalizationFunction.h b/arm_compute/graph/backends/FusedDepthwiseConvolutionBatchNormalizationFunction.h index 4f8a8da1fbea3a985537f1fea1f5f21f4a1e1cef..07a2cdd8b8d422f7863da244b03d84dc2dac8039 100644 --- a/arm_compute/graph/backends/FusedDepthwiseConvolutionBatchNormalizationFunction.h +++ b/arm_compute/graph/backends/FusedDepthwiseConvolutionBatchNormalizationFunction.h @@ -67,15 +67,18 @@ public: * @param[in] fused_act Activation layer information in case of a fused activation. * */ - void configure(TensorType *input, - TensorType *weights, - TensorType *bias, - TensorType *output, - const TensorType *mean, - const TensorType *var, - const TensorType *beta, - const TensorType *gamma, - float epsilon, const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo const &fused_act) + void configure(TensorType *input, + TensorType *weights, + TensorType *bias, + TensorType *output, + const TensorType *mean, + const TensorType *var, + const TensorType *beta, + const TensorType *gamma, + float epsilon, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + ActivationLayerInfo const &fused_act) { // We don't run any validate, as we assume that the layers have been already validated const bool has_bias = (bias != nullptr); @@ -83,20 +86,23 @@ public: // We check if the layer has a bias. If yes, use it in-place. 
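// Both fused conv+batch-norm wrappers in this patch share the same execution idiom: configure()
// sets up a weight/bias folding step, prepare() runs that folding exactly once, and run() calls
// prepare() and then executes only the convolution. A stripped-down, runnable sketch of the
// idiom with placeholder members (not the ACL classes):

#include <iostream>

class FusedLayerSketch
{
public:
    void run()
    {
        prepare(); // lazily fold the batch-norm statistics into the weights on first execution
        std::cout << "conv.run() with fused weights\n";
    }

    void prepare()
    {
        if (!_is_prepared)
        {
            std::cout << "fuse_batch_norm.run()\n"; // one-off weight/bias rewrite
            _is_prepared = true;
        }
    }

private:
    bool _is_prepared = false;
};

int main()
{
    FusedLayerSketch layer;
    layer.run(); // prints both lines
    layer.run(); // prints only the conv line
}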
If not, we need to create one // as batch normalization might end up with a bias != 0 - if(has_bias) + if (has_bias) { - _fused_batch_norm_layer.configure(weights, mean, var, nullptr, nullptr, bias, beta, gamma, epsilon, FuseBatchNormalizationType::DEPTHWISECONVOLUTION); + _fused_batch_norm_layer.configure(weights, mean, var, nullptr, nullptr, bias, beta, gamma, epsilon, + FuseBatchNormalizationType::DEPTHWISECONVOLUTION); bias_to_use = bias; } else { - _fused_batch_norm_layer.configure(weights, mean, var, nullptr, &_fused_bias, nullptr, beta, gamma, epsilon, FuseBatchNormalizationType::DEPTHWISECONVOLUTION); + _fused_batch_norm_layer.configure(weights, mean, var, nullptr, &_fused_bias, nullptr, beta, gamma, epsilon, + FuseBatchNormalizationType::DEPTHWISECONVOLUTION); bias_to_use = &_fused_bias; } - _depth_conv_layer.configure(input, weights, bias_to_use, output, conv_info, depth_multiplier, fused_act.enabled() ? fused_act : ActivationLayerInfo()); + _depth_conv_layer.configure(input, weights, bias_to_use, output, conv_info, depth_multiplier, + fused_act.enabled() ? fused_act : ActivationLayerInfo()); - if(!has_bias) + if (!has_bias) { _fused_bias.allocator()->allocate(); } @@ -111,7 +117,7 @@ public: void prepare() { - if(!_is_prepared) + if (!_is_prepared) { _fused_batch_norm_layer.run(); _is_prepared = true; diff --git a/arm_compute/graph/backends/NEON/NEDeviceBackend.h b/arm_compute/graph/backends/NEON/NEDeviceBackend.h index 9cb37d45533a0a7953b9d5950a9faf0f82abb346..cd817a20d80dd49c8975639f29fabcf17077b17c 100644 --- a/arm_compute/graph/backends/NEON/NEDeviceBackend.h +++ b/arm_compute/graph/backends/NEON/NEDeviceBackend.h @@ -25,7 +25,6 @@ #define ARM_COMPUTE_GRAPH_NEDEVICEBACKEND_H #include "arm_compute/graph/IDeviceBackend.h" - #include "arm_compute/runtime/Allocator.h" namespace arm_compute @@ -41,16 +40,17 @@ public: NEDeviceBackend(); // Inherited overridden methods - void initialize_backend() override; - void setup_backend_context(GraphContext &ctx) override; - void release_backend_context(GraphContext &ctx) override; + void initialize_backend() override; + void setup_backend_context(GraphContext &ctx) override; + void release_backend_context(GraphContext &ctx) override; bool is_backend_supported() override; IAllocator *backend_allocator() override; std::unique_ptr create_tensor(const Tensor &tensor) override; - std::unique_ptr create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent) override; - std::unique_ptr configure_node(INode &node, GraphContext &ctx) override; - Status validate_node(INode &node) override; - std::shared_ptr create_memory_manager(MemoryManagerAffinity affinity) override; + std::unique_ptr + create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent) override; + std::unique_ptr configure_node(INode &node, GraphContext &ctx) override; + Status validate_node(INode &node) override; + std::shared_ptr create_memory_manager(MemoryManagerAffinity affinity) override; std::shared_ptr create_weights_manager() override; void sync() override; diff --git a/arm_compute/graph/backends/NEON/NESubTensorHandle.h b/arm_compute/graph/backends/NEON/NESubTensorHandle.h index a438b657355c2d4e1ebaf0f30c064dca60969d38..3619f4ed1b9cda15c38ee47ccdf1e0a46adc4ef8 100644 --- a/arm_compute/graph/backends/NEON/NESubTensorHandle.h +++ b/arm_compute/graph/backends/NEON/NESubTensorHandle.h @@ -25,7 +25,6 @@ #define ARM_COMPUTE_GRAPH_NESUBTENSORHANDLE_H #include "arm_compute/graph/ITensorHandle.h" - #include 
"arm_compute/runtime/SubTensor.h" namespace arm_compute @@ -45,7 +44,10 @@ public: * @param[in] coords Starting coordinates * @param[in] extend_parent Extends parent shape if true */ - NESubTensorHandle(ITensorHandle *parent_handle, const TensorShape &shape, const Coordinates &coords, bool extend_parent = false); + NESubTensorHandle(ITensorHandle *parent_handle, + const TensorShape &shape, + const Coordinates &coords, + bool extend_parent = false); /** Destructor: free the tensor's memory */ ~NESubTensorHandle() = default; /** Allow instances of this class to be move constructed */ @@ -58,10 +60,10 @@ public: NESubTensorHandle &operator=(const NESubTensorHandle &) = delete; // Inherited overridden methods - void allocate() override; - void free() override; - void manage(IMemoryGroup *mg) override; - void map(bool blocking) override; + void allocate() override; + void free() override; + void manage(IMemoryGroup *mg) override; + void map(bool blocking) override; void unmap() override; void release_if_unused() override; arm_compute::ITensor &tensor() override; diff --git a/arm_compute/graph/backends/NEON/NETensorHandle.h b/arm_compute/graph/backends/NEON/NETensorHandle.h index 99101a8fe90e75c1b2102e87de4488ad5d61191c..1df90822ba4f45dbc58f4f113f5ca92b5efd4e12 100644 --- a/arm_compute/graph/backends/NEON/NETensorHandle.h +++ b/arm_compute/graph/backends/NEON/NETensorHandle.h @@ -25,7 +25,6 @@ #define ARM_COMPUTE_GRAPH_NETENSORHANDLE_H #include "arm_compute/graph/ITensorHandle.h" - #include "arm_compute/runtime/Tensor.h" namespace arm_compute @@ -51,10 +50,10 @@ public: NETensorHandle &operator=(NETensorHandle &&) = default; // Inherited overridden methods - void allocate() override; - void free() override; - void manage(IMemoryGroup *mg) override; - void map(bool blocking) override; + void allocate() override; + void free() override; + void manage(IMemoryGroup *mg) override; + void map(bool blocking) override; void unmap() override; void release_if_unused() override; arm_compute::ITensor &tensor() override; diff --git a/arm_compute/graph/backends/Utils.h b/arm_compute/graph/backends/Utils.h index 774ce515b522175ac6f0cf7c191c3501602ac988..5f4e66c207c5e85037f8c2cdd74e6092bd8fa4a6 100644 --- a/arm_compute/graph/backends/Utils.h +++ b/arm_compute/graph/backends/Utils.h @@ -42,7 +42,8 @@ namespace backends * @return A configured backend function */ template -std::tuple, FunctionNameType> create_named_function(FunctionNameType name, ParameterType... args) +std::tuple, FunctionNameType> create_named_function(FunctionNameType name, + ParameterType... args) { auto f = std::make_unique(); f->configure(std::forward(args)...); @@ -58,9 +59,8 @@ std::tuple, FunctionNameType> create_nam * @return A configured backend function */ template -std::tuple, FunctionNameType> create_named_memory_managed_function(FunctionNameType name, - MemoryManagerType mm, - ParameterType... args) +std::tuple, FunctionNameType> +create_named_memory_managed_function(FunctionNameType name, MemoryManagerType mm, ParameterType... args) { auto f = std::make_unique(mm); f->configure(std::forward(args)...); diff --git a/arm_compute/graph/backends/ValidateHelpers.h b/arm_compute/graph/backends/ValidateHelpers.h index 89dccd88b759698dce9af9b19cd9d83bd74c3d35..0e102942a7e348f9fb2ade5a6731ca433faff826 100644 --- a/arm_compute/graph/backends/ValidateHelpers.h +++ b/arm_compute/graph/backends/ValidateHelpers.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -21,17 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_GRAPH_BACKENDS_DETAIL_VALIDATE_HELPERS_H -#define ARM_COMPUTE_GRAPH_BACKENDS_DETAIL_VALIDATE_HELPERS_H - -#include "arm_compute/graph/Logger.h" -#include "arm_compute/graph/Tensor.h" -#include "arm_compute/graph/Types.h" -#include "arm_compute/graph/nodes/Nodes.h" +#ifndef ACL_ARM_COMPUTE_GRAPH_BACKENDS_VALIDATEHELPERS_H +#define ACL_ARM_COMPUTE_GRAPH_BACKENDS_VALIDATEHELPERS_H #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/graph/Logger.h" +#include "arm_compute/graph/nodes/Nodes.h" +#include "arm_compute/graph/Tensor.h" +#include "arm_compute/graph/Types.h" namespace arm_compute { @@ -63,7 +62,8 @@ inline arm_compute::ITensorInfo *get_backing_tensor_info(arm_compute::graph::Ten template Status validate_arg_min_max_layer(ArgMinMaxLayerNode &node) { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating ArgMinMaxLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE( + "Validating ArgMinMaxLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1); ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1); @@ -86,7 +86,8 @@ Status validate_arg_min_max_layer(ArgMinMaxLayerNode &node) template Status validate_bounding_box_transform_layer(BoundingBoxTransformLayerNode &node) { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating BoundingBoxTransformLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating BoundingBoxTransformLayer node with ID : " << node.id() << " and Name: " + << node.name() << std::endl); ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 2); ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1); @@ -110,7 +111,8 @@ Status validate_bounding_box_transform_layer(BoundingBoxTransformLayerNode &node template Status validate_channel_shuffle_layer(ChannelShuffleLayerNode &node) { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating ChannelShuffle node with ID : " << node.id() << " and Name: " << node.name() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE( + "Validating ChannelShuffle node with ID : " << node.id() << " and Name: " << node.name() << std::endl); ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1); ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1); @@ -133,10 +135,14 @@ Status validate_channel_shuffle_layer(ChannelShuffleLayerNode &node) * * @return Status */ -template +template Status validate_convolution_layer(ConvolutionLayerNode &node) { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating ConvolutionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE( + "Validating ConvolutionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 3); ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1); @@ -146,7 +152,7 @@ Status validate_convolution_layer(ConvolutionLayerNode &node) arm_compute::ITensorInfo *biases = get_backing_tensor_info(node.input(2)); arm_compute::ITensorInfo *output = get_backing_tensor_info(node.output(0)); - if(is_data_type_quantized_asymmetric(input->data_type())) + if (is_data_type_quantized_asymmetric(input->data_type())) { biases->set_data_type(DataType::S32); } @@ -158,23 +164,24 @@ Status 
validate_convolution_layer(ConvolutionLayerNode &node) // Validate function Status status{}; - switch(conv_algorithm) + switch (conv_algorithm) { case ConvolutionMethod::Direct: ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups != 1, "DirectConvolutionLayer does not support grouping!"); status = DirectConvolutionLayer::validate(input, weights, biases, output, conv_info); break; case ConvolutionMethod::GEMM: - status = GEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, - WeightsInfo(), Size2D(1, 1), ActivationLayerInfo(), num_groups); + status = GEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, WeightsInfo(), + Size2D(1, 1), ActivationLayerInfo(), num_groups); break; case ConvolutionMethod::Winograd: ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups != 1, "WinogradConvolutionLayer does not support grouping!"); - status = WinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, ActivationLayerInfo(), fast_math); + status = WinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, + ActivationLayerInfo(), fast_math); break; case ConvolutionMethod::Default: - status = ConvolutionLayer::validate(input, weights, biases, output, conv_info, - WeightsInfo(), Size2D(1, 1), ActivationLayerInfo(), fast_math, num_groups); + status = ConvolutionLayer::validate(input, weights, biases, output, conv_info, WeightsInfo(), Size2D(1, 1), + ActivationLayerInfo(), fast_math, num_groups); break; default: ARM_COMPUTE_RETURN_ERROR_MSG("Unsupported convolution method"); @@ -183,42 +190,6 @@ Status validate_convolution_layer(ConvolutionLayerNode &node) return status; } -/** Validates a Convolution layer node - * - * @tparam GEMMConvolutionLayer GEMM Convolution layer function type - * - * @param[in] node Node to validate - * - * @return Status - */ -template -Status validate_fused_convolution_with_post_op(FusedConvolutionWithPostOpNode &node) -{ - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating fused ConvolutionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); - ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 4); - ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1); - - // Extract IO and info - arm_compute::ITensorInfo *input = get_backing_tensor_info(node.input(0)); - arm_compute::ITensorInfo *weights = get_backing_tensor_info(node.input(1)); - arm_compute::ITensorInfo *biases = get_backing_tensor_info(node.input(2)); - arm_compute::ITensorInfo *output = get_backing_tensor_info(node.output(0)); - - if(is_data_type_quantized_asymmetric(input->data_type())) - { - biases->set_data_type(DataType::S32); - } - - const PadStrideInfo conv_info = node.convolution_info(); - //const ConvolutionMethod conv_algorithm = node.convolution_method(); - //const bool fast_math = node.fast_math_hint() == FastMathHint::Enabled; - const unsigned int num_groups = node.num_groups(); - - // Validate function - return GEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, - WeightsInfo(), Size2D(1, 1), ActivationLayerInfo(), num_groups); -} - /** Validates a Depthwise Convolution layer node * * @tparam DepthwiseConvolutionLayer Default Depthwise Convolution layer type @@ -230,7 +201,8 @@ Status validate_fused_convolution_with_post_op(FusedConvolutionWithPostOpNode &n template Status validate_depthwise_convolution_layer(DepthwiseConvolutionLayerNode &node) { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating DepthwiseConvolutionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); + 
ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating DepthwiseConvolutionLayer node with ID : " << node.id() << " and Name: " + << node.name() << std::endl); ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 3); ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1); @@ -246,7 +218,7 @@ Status validate_depthwise_convolution_layer(DepthwiseConvolutionLayerNode &node) // Validate function Status status{}; - switch(dwc_algorithm) + switch (dwc_algorithm) { case DepthwiseConvolutionMethod::Default: case DepthwiseConvolutionMethod::Optimized3x3: @@ -269,7 +241,8 @@ Status validate_depthwise_convolution_layer(DepthwiseConvolutionLayerNode &node) template Status validate_depth_to_space_layer(DepthToSpaceLayerNode &node) { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating DetectionOutputLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE( + "Validating DetectionOutputLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1); ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1); @@ -290,7 +263,8 @@ Status validate_depth_to_space_layer(DepthToSpaceLayerNode &node) template Status validate_dequantization_layer(DequantizationLayerNode &node) { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating DetectionOutputLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE( + "Validating DetectionOutputLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1); ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1); @@ -311,7 +285,8 @@ Status validate_dequantization_layer(DequantizationLayerNode &node) template Status validate_detection_output_layer(DetectionOutputLayerNode &node) { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating DetectionOutputLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE( + "Validating DetectionOutputLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 3); ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1); @@ -335,7 +310,8 @@ Status validate_detection_output_layer(DetectionOutputLayerNode &node) template Status validate_detection_post_process_layer(DetectionPostProcessLayerNode &node) { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating DetectionPostProcessLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating DetectionPostProcessLayer node with ID : " << node.id() << " and Name: " + << node.name() << std::endl); ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 3); ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 4); @@ -363,7 +339,8 @@ Status validate_detection_post_process_layer(DetectionPostProcessLayerNode &node template Status validate_generate_proposals_layer(GenerateProposalsLayerNode &node) { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating GenerateProposalsLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE( + "Validating GenerateProposalsLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 3); ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 3); @@ -390,7 +367,8 @@ Status validate_generate_proposals_layer(GenerateProposalsLayerNode &node) template Status 
validate_l2_normalize_layer(L2NormalizeLayerNode &node) { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating L2NormalizeLayerNode node with ID : " << node.id() << " and Name: " << node.name() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE( + "Validating L2NormalizeLayerNode node with ID : " << node.id() << " and Name: " << node.name() << std::endl); ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1); ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1); @@ -415,7 +393,8 @@ Status validate_l2_normalize_layer(L2NormalizeLayerNode &node) template Status validate_normalize_planar_yuv_layer(NormalizePlanarYUVLayerNode &node) { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating NormalizePlanarYUVLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE( + "Validating NormalizePlanarYUVLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 3); ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1); @@ -440,7 +419,8 @@ Status validate_normalize_planar_yuv_layer(NormalizePlanarYUVLayerNode &node) template Status validate_pad_layer(PadLayerNode &node) { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating PadLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating PadLayer node with ID : " << node.id() << " and Name: " << node.name() + << std::endl); ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1); ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1); @@ -463,14 +443,15 @@ Status validate_pad_layer(PadLayerNode &node) template Status validate_permute_layer(PermuteLayerNode &node) { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating PermuteLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating PermuteLayer node with ID : " << node.id() << " and Name: " << node.name() + << std::endl); ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1); ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1); // Extract IO and info arm_compute::ITensorInfo *input = get_backing_tensor_info(node.input(0)); arm_compute::ITensorInfo *output = get_backing_tensor_info(node.output(0)); - const PermutationVector &perm = node.permutation_vector(); + const PermutationVector &perm = node.permutation_vector(); return PermuteLayer::validate(input, output, perm); } @@ -486,7 +467,8 @@ Status validate_permute_layer(PermuteLayerNode &node) template Status validate_prelu_layer(PReluLayerNode &node) { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating PRelu node with ID : " << node.id() << " and Name: " << node.name() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating PRelu node with ID : " << node.id() << " and Name: " << node.name() + << std::endl); ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 2); ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1); @@ -509,7 +491,8 @@ Status validate_prelu_layer(PReluLayerNode &node) template Status validate_priorbox_layer(PriorBoxLayerNode &node) { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating PriorBoxLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE( + "Validating PriorBoxLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 2); ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1); @@ -533,7 +516,8 @@ Status validate_priorbox_layer(PriorBoxLayerNode &node) template Status 
validate_quantization_layer(QuantizationLayerNode &node) { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating QuantizationLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE( + "Validating QuantizationLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1); ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1); @@ -556,7 +540,8 @@ Status validate_quantization_layer(QuantizationLayerNode &node) template Status validate_reduction_operation_layer(ReductionLayerNode &node) { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating ReductionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE( + "Validating ReductionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1); ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1); @@ -580,7 +565,8 @@ Status validate_reduction_operation_layer(ReductionLayerNode &node) template Status validate_reorg_layer(ReorgLayerNode &node) { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating ReorgLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating ReorgLayer node with ID : " << node.id() << " and Name: " << node.name() + << std::endl); ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1); ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1); @@ -603,7 +589,8 @@ Status validate_reorg_layer(ReorgLayerNode &node) template Status validate_reshape_layer(ReshapeLayerNode &node) { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating ReshapeLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating ReshapeLayer node with ID : " << node.id() << " and Name: " << node.name() + << std::endl); ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1); ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1); @@ -626,14 +613,15 @@ Status validate_reshape_layer(ReshapeLayerNode &node) template Status validate_roi_align_layer(ROIAlignLayerNode &node) { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating ROIAlignLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE( + "Validating ROIAlignLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 2); ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1); // Extract input and output - arm_compute::ITensorInfo *input = detail::get_backing_tensor_info(node.input(0)); - arm_compute::ITensorInfo *rois = detail::get_backing_tensor_info(node.input(1)); - arm_compute::ITensorInfo *output = detail::get_backing_tensor_info(node.output(0)); + arm_compute::ITensorInfo *input = detail::get_backing_tensor_info(node.input(0)); + arm_compute::ITensorInfo *rois = detail::get_backing_tensor_info(node.input(1)); + arm_compute::ITensorInfo *output = detail::get_backing_tensor_info(node.output(0)); const ROIPoolingLayerInfo &pool_info = node.pooling_info(); // Validate function @@ -651,7 +639,8 @@ Status validate_roi_align_layer(ROIAlignLayerNode &node) template Status validate_slice_layer(SliceLayerNode &node) { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating Slice node with ID : " << node.id() << " and Name: " << node.name() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating Slice node with ID : " << node.id() << " and Name: " << node.name() + << 
std::endl); ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1); ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1); @@ -675,7 +664,8 @@ Status validate_slice_layer(SliceLayerNode &node) template Status validate_strided_slice_layer(StridedSliceLayerNode &node) { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating StridedSlice node with ID : " << node.id() << " and Name: " << node.name() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating StridedSlice node with ID : " << node.id() << " and Name: " << node.name() + << std::endl); ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1); ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1); @@ -687,7 +677,8 @@ Status validate_strided_slice_layer(StridedSliceLayerNode &node) const BiStrides strides = node.strides(); const StridedSliceLayerInfo info = node.strided_slice_info(); - return StridedSliceLayer::validate(input, output, starts, ends, strides, info.begin_mask(), info.end_mask(), info.shrink_axis_mask()); + return StridedSliceLayer::validate(input, output, starts, ends, strides, info.begin_mask(), info.end_mask(), + info.shrink_axis_mask()); } /** Validates a element-wise layer node @@ -699,7 +690,8 @@ Status validate_strided_slice_layer(StridedSliceLayerNode &node) template Status validate_eltwise_Layer(EltwiseLayerNode &node) { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating EltwiseLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating EltwiseLayer node with ID : " << node.id() << " and Name: " << node.name() + << std::endl); ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 2); ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1); @@ -714,23 +706,24 @@ Status validate_eltwise_Layer(EltwiseLayerNode &node) const QuantizationInfo quant_info = node.output_quant_info(); // Validate function - if(eltwise_op == EltwiseOperation::Add) + if (eltwise_op == EltwiseOperation::Add) { return EltwiseLayerFunctions::ArithmeticAddition::validate(input1, input2, output, convert_policy, act_info); } - else if(eltwise_op == EltwiseOperation::Sub) + else if (eltwise_op == EltwiseOperation::Sub) { return EltwiseLayerFunctions::ArithmeticSubtraction::validate(input1, input2, output, convert_policy, act_info); } - else if(eltwise_op == EltwiseOperation::Mul) + else if (eltwise_op == EltwiseOperation::Mul) { - return EltwiseLayerFunctions::PixelWiseMultiplication::validate(input1, input2, output, 1.0f, convert_policy, round_policy, act_info); + return EltwiseLayerFunctions::PixelWiseMultiplication::validate(input1, input2, output, 1.0f, convert_policy, + round_policy, act_info); } - else if(eltwise_op == EltwiseOperation::Max) + else if (eltwise_op == EltwiseOperation::Max) { return EltwiseLayerFunctions::ElementwiseMax::validate(input1, input2, output, act_info); } - else if(eltwise_op == EltwiseOperation::Div) + else if (eltwise_op == EltwiseOperation::Div) { return EltwiseLayerFunctions::ArithmeticDivision::validate(input1, input2, output, act_info); } @@ -749,7 +742,8 @@ Status validate_eltwise_Layer(EltwiseLayerNode &node) template Status validate_unary_eltwise_layer(UnaryEltwiseLayerNode &node) { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating EltwiseLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating EltwiseLayer node with ID : " << node.id() << " and Name: " << node.name() + << std::endl); ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1); ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1); @@ -759,7 +753,7 @@ 
Status validate_unary_eltwise_layer(UnaryEltwiseLayerNode &node) const UnaryEltwiseOperation eltwise_op = node.eltwise_descriptor().op; // Validate function - if(eltwise_op == UnaryEltwiseOperation::Exp) + if (eltwise_op == UnaryEltwiseOperation::Exp) { return UnaryEltwiseLayerFunctions::ExpLayer::validate(input, output); } @@ -775,4 +769,4 @@ Status validate_unary_eltwise_layer(UnaryEltwiseLayerNode &node) } // namespace graph } // namespace arm_compute -#endif /* ARM_COMPUTE_GRAPH_BACKENDS_DETAIL_VALIDATE_HELPERS_H */ +#endif // ACL_ARM_COMPUTE_GRAPH_BACKENDS_VALIDATEHELPERS_H diff --git a/arm_compute/graph/frontend/IStream.h b/arm_compute/graph/frontend/IStream.h index f69d5437c19bbd5722ed898ce7fa009de56cced1..1831ac0be3adba97e9cde035c0baf3c3825eb3a8 100644 --- a/arm_compute/graph/frontend/IStream.h +++ b/arm_compute/graph/frontend/IStream.h @@ -84,8 +84,8 @@ public: } protected: - StreamHints _hints = {}; /**< Execution and algorithmic hints */ - NodeID _tail_node = { EmptyNodeID }; /**< NodeID pointing to the last(tail) node of the graph */ + StreamHints _hints = {}; /**< Execution and algorithmic hints */ + NodeID _tail_node = {EmptyNodeID}; /**< NodeID pointing to the last(tail) node of the graph */ }; } // namespace frontend } // namespace graph diff --git a/arm_compute/graph/frontend/Layers.h b/arm_compute/graph/frontend/Layers.h index fe0539bac5a503454340b06764194e4808ae3f8a..bd321e6f1abece59d6f6a59da7d9498a11a2a3fb 100644 --- a/arm_compute/graph/frontend/Layers.h +++ b/arm_compute/graph/frontend/Layers.h @@ -24,13 +24,12 @@ #ifndef ARM_COMPUTE_GRAPH_LAYERS_H #define ARM_COMPUTE_GRAPH_LAYERS_H -#include "arm_compute/graph/GraphBuilder.h" -#include "arm_compute/graph/Types.h" +#include "arm_compute/core/utils/misc/Utility.h" #include "arm_compute/graph/frontend/ILayer.h" #include "arm_compute/graph/frontend/IStream.h" #include "arm_compute/graph/frontend/SubStream.h" - -#include "arm_compute/core/utils/misc/Utility.h" +#include "arm_compute/graph/GraphBuilder.h" +#include "arm_compute/graph/Types.h" #include #include @@ -50,14 +49,13 @@ public: * @param[in] desc Description of input tensor. * @param[in] accessor Accessor to get input tensor data from. */ - InputLayer(TensorDescriptor desc, ITensorAccessorUPtr accessor) - : _desc(desc), _accessor(std::move(accessor)) + InputLayer(TensorDescriptor desc, ITensorAccessorUPtr accessor) : _desc(desc), _accessor(std::move(accessor)) { } NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; + NodeParams common_params = {name(), s.hints().target_hint}; return GraphBuilder::add_input_node(s.graph(), common_params, _desc, std::move(_accessor)); } @@ -75,14 +73,13 @@ public: * @param[in] desc Description of input tensor. * @param[in] accessor Accessor to get input tensor data from. 
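Every layer class in this file follows the create_layer() recipe first visible in InputLayer above: build NodeParams from the layer name and the stream's target hint, wrap the stream's tail node in a NodeIdxPair, and delegate to a GraphBuilder::add_*_node() helper. A sketch of a hypothetical layer written against that pattern, assuming the surrounding Layers.h context; IdentityLayer and its reuse of add_reshape_node are illustrative only:

class IdentityLayer final : public ILayer
{
public:
    explicit IdentityLayer(TensorShape shape) : _shape(shape)
    {
    }

    NodeID create_layer(IStream &s) override
    {
        NodeParams  common_params = {name(), s.hints().target_hint};
        NodeIdxPair input         = {s.tail_node(), 0}; // first output of the stream's tail node
        // A real layer calls the GraphBuilder helper that matches it; reshaping to a
        // caller-provided shape stands in for that here.
        return GraphBuilder::add_reshape_node(s.graph(), common_params, input, _shape);
    }

private:
    TensorShape _shape;
};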
*/ - ConstantLayer(TensorDescriptor desc, ITensorAccessorUPtr accessor) - : _desc(desc), _accessor(std::move(accessor)) + ConstantLayer(TensorDescriptor desc, ITensorAccessorUPtr accessor) : _desc(desc), _accessor(std::move(accessor)) { } NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; + NodeParams common_params = {name(), s.hints().target_hint}; return GraphBuilder::add_const_node(s.graph(), common_params, _desc, std::move(_accessor)); } @@ -107,8 +104,8 @@ public: NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input = { s.tail_node(), _connection_idx }; + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input = {s.tail_node(), _connection_idx}; return GraphBuilder::add_output_node(s.graph(), common_params, input, std::move(_accessor)); } @@ -126,18 +123,17 @@ public: * @param[in] act_info Activation information * @param[in] out_quant_info (Optional) Output quantization info */ - ActivationLayer(ActivationLayerInfo act_info, - const QuantizationInfo out_quant_info = QuantizationInfo()) - : _act_info(act_info), - _out_quant_info(std::move(out_quant_info)) + ActivationLayer(ActivationLayerInfo act_info, const QuantizationInfo out_quant_info = QuantizationInfo()) + : _act_info(act_info), _out_quant_info(std::move(out_quant_info)) { } NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input = { s.tail_node(), 0 }; - return GraphBuilder::add_activation_node(s.graph(), common_params, input, _act_info, std::move(_out_quant_info)); + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input = {s.tail_node(), 0}; + return GraphBuilder::add_activation_node(s.graph(), common_params, input, _act_info, + std::move(_out_quant_info)); } private: @@ -160,10 +156,7 @@ public: unsigned int axis, DataType out_data_type = DataType::UNKNOWN, const QuantizationInfo out_quant_info = QuantizationInfo()) - : _op(op), - _axis(axis), - _out_data_type(out_data_type), - _out_quant_info(std::move(out_quant_info)) + : _op(op), _axis(axis), _out_data_type(out_data_type), _out_quant_info(std::move(out_quant_info)) { } @@ -175,9 +168,10 @@ public: */ NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input = { s.tail_node(), 0 }; - return GraphBuilder::add_arg_min_max_node(s.graph(), common_params, input, _op, _axis, _out_data_type, std::move(_out_quant_info)); + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input = {s.tail_node(), 0}; + return GraphBuilder::add_arg_min_max_node(s.graph(), common_params, input, _op, _axis, _out_data_type, + std::move(_out_quant_info)); } private: @@ -204,7 +198,11 @@ public: ITensorAccessorUPtr gamma = nullptr, ITensorAccessorUPtr beta = nullptr, float epsilon = 0.001f) - : _mean(std::move(mean)), _var(std::move(var)), _gamma(std::move(gamma)), _beta(std::move(beta)), _epsilon(epsilon) + : _mean(std::move(mean)), + _var(std::move(var)), + _gamma(std::move(gamma)), + _beta(std::move(beta)), + _epsilon(epsilon) { } @@ -213,10 +211,10 @@ public: ARM_COMPUTE_ERROR_ON(_mean == nullptr); ARM_COMPUTE_ERROR_ON(_var == nullptr); - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input = { s.tail_node(), 0 }; - return GraphBuilder::add_batch_normalization_node(s.graph(), common_params, input, _epsilon, - std::move(_mean), 
std::move(_var), std::move(_beta), std::move(_gamma)); + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input = {s.tail_node(), 0}; + return GraphBuilder::add_batch_normalization_node(s.graph(), common_params, input, _epsilon, std::move(_mean), + std::move(_var), std::move(_beta), std::move(_gamma)); } private: @@ -237,7 +235,9 @@ public: * @param[in] sub_stream_deltas Graph sub-stream for the deltas * @param[in] info Contains BoundingBox operation information described in @ref BoundingBoxTransformInfo. */ - BoundingBoxTransformLayer(SubStream &&sub_stream_input, SubStream &&sub_stream_deltas, BoundingBoxTransformInfo info) + BoundingBoxTransformLayer(SubStream &&sub_stream_input, + SubStream &&sub_stream_deltas, + BoundingBoxTransformInfo info) : _ss_input(sub_stream_input), _ss_deltas(sub_stream_deltas), _bbox_info(info) { } @@ -250,9 +250,9 @@ public: */ NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input = { _ss_input.tail_node(), 0 }; - NodeIdxPair deltas = { _ss_deltas.tail_node(), 0 }; + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input = {_ss_input.tail_node(), 0}; + NodeIdxPair deltas = {_ss_deltas.tail_node(), 0}; return GraphBuilder::add_bounding_box_transform_node(s.graph(), common_params, input, deltas, _bbox_info); } @@ -270,15 +270,14 @@ public: * * @param[in] num_groups Number of groups */ - ChannelShuffleLayer(unsigned int num_groups) - : _num_groups(num_groups) + ChannelShuffleLayer(unsigned int num_groups) : _num_groups(num_groups) { } NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input = { s.tail_node(), 0 }; + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input = {s.tail_node(), 0}; return GraphBuilder::add_channel_shuffle_node(s.graph(), common_params, input, _num_groups); } @@ -297,17 +296,15 @@ public: * @param[in] rest_sub_streams Rest sub-graph branches */ template - ConcatLayer(SubStream &&sub_stream1, SubStream &&sub_stream2, Ts &&... rest_sub_streams) + ConcatLayer(SubStream &&sub_stream1, SubStream &&sub_stream2, Ts &&...rest_sub_streams) : _sub_streams(), _concat_descriptor(DataLayoutDimension::CHANNEL) { _sub_streams.push_back(std::make_unique(std::move(sub_stream1))); _sub_streams.push_back(std::make_unique(std::move(sub_stream2))); - utility::for_each([&](SubStream && sub_stream) - { - _sub_streams.push_back(std::make_unique(std::move(sub_stream))); - }, - std::move(rest_sub_streams)...); + utility::for_each([&](SubStream &&sub_stream) + { _sub_streams.push_back(std::make_unique(std::move(sub_stream))); }, + std::move(rest_sub_streams)...); } /** Construct a concatenation layer * @@ -317,33 +314,33 @@ public: * @param[in] rest_sub_streams Rest sub-graph branches */ template - ConcatLayer(descriptors::ConcatLayerDescriptor concat_descriptor, SubStream &&sub_stream1, SubStream &&sub_stream2, Ts &&... 
rest_sub_streams) + ConcatLayer(descriptors::ConcatLayerDescriptor concat_descriptor, + SubStream &&sub_stream1, + SubStream &&sub_stream2, + Ts &&...rest_sub_streams) : _sub_streams(), _concat_descriptor(concat_descriptor) { _sub_streams.push_back(std::make_unique(std::move(sub_stream1))); _sub_streams.push_back(std::make_unique(std::move(sub_stream2))); - utility::for_each([&](SubStream && sub_stream) - { - _sub_streams.push_back(std::make_unique(std::move(sub_stream))); - }, - std::move(rest_sub_streams)...); + utility::for_each([&](SubStream &&sub_stream) + { _sub_streams.push_back(std::make_unique(std::move(sub_stream))); }, + std::move(rest_sub_streams)...); } /** Construct a concat layer * * @param[in] sub_stream Sub-stream */ template - ConcatLayer(SubStream &&sub_stream) - : _sub_streams(), _concat_descriptor(DataLayoutDimension::CHANNEL) + ConcatLayer(SubStream &&sub_stream) : _sub_streams(), _concat_descriptor(DataLayoutDimension::CHANNEL) { _sub_streams.push_back(std::make_unique(std::move(sub_stream))); } NodeID create_layer(IStream &s) override { NodeID nid = EmptyNodeID; - NodeParams common_params = { name(), s.hints().target_hint }; - if(_sub_streams.size() == 1 && _sub_streams.at(0) != nullptr) + NodeParams common_params = {name(), s.hints().target_hint}; + if (_sub_streams.size() == 1 && _sub_streams.at(0) != nullptr) { nid = _sub_streams[0]->tail_node(); } @@ -351,14 +348,14 @@ public: { // Collect tail nodes and concatenate std::vector nodes; - for(auto &ss : _sub_streams) + for (auto &ss : _sub_streams) { - if(ss && (ss->tail_node() != EmptyNodeID)) + if (ss && (ss->tail_node() != EmptyNodeID)) { const auto tail_node = s.graph().node(ss->tail_node()); - if(tail_node != nullptr && tail_node->type() != NodeType::Output) + if (tail_node != nullptr && tail_node->type() != NodeType::Output) { - nodes.push_back({ ss->tail_node(), 0 }); + nodes.push_back({ss->tail_node(), 0}); } } } @@ -411,12 +408,12 @@ public: NodeID create_layer(IStream &s) override { - NodeIdxPair input = { s.tail_node(), 0 }; - NodeParams common_params = { name(), s.hints().target_hint }; - return GraphBuilder::add_convolution_node(s.graph(), common_params, input, - Size2D(_conv_width, _conv_height), _ofm, _conv_info, _num_groups, - s.hints().convolution_method_hint, s.hints().fast_math_hint, - std::move(_weights), std::move(_bias), std::move(_weights_quant_info), std::move(_out_quant_info)); + NodeIdxPair input = {s.tail_node(), 0}; + NodeParams common_params = {name(), s.hints().target_hint}; + return GraphBuilder::add_convolution_node(s.graph(), common_params, input, Size2D(_conv_width, _conv_height), + _ofm, _conv_info, _num_groups, s.hints().convolution_method_hint, + s.hints().fast_math_hint, std::move(_weights), std::move(_bias), + std::move(_weights_quant_info), std::move(_out_quant_info)); } private: @@ -461,11 +458,10 @@ public: NodeID create_layer(IStream &s) override { - NodeIdxPair input = { s.tail_node(), 0 }; - NodeParams common_params = { name(), s.hints().target_hint }; - return GraphBuilder::add_deconvolution_node(s.graph(), common_params, input, - Size2D(_conv_width, _conv_height), _ofm, _deconv_info, - std::move(_weights), std::move(_bias)); + NodeIdxPair input = {s.tail_node(), 0}; + NodeParams common_params = {name(), s.hints().target_hint}; + return GraphBuilder::add_deconvolution_node(s.graph(), common_params, input, Size2D(_conv_width, _conv_height), + _ofm, _deconv_info, std::move(_weights), std::move(_bias)); } private: @@ -513,12 +509,12 @@ public: NodeID create_layer(IStream 
&s) override { - NodeIdxPair input = { s.tail_node(), 0 }; - NodeParams common_params = { name(), s.hints().target_hint }; - return GraphBuilder::add_depthwise_convolution_node(s.graph(), common_params, - input, Size2D(_conv_width, _conv_height), _conv_info, _depth_multiplier, - s.hints().depthwise_convolution_method_hint, - std::move(_weights), std::move(_bias), std::move(_weights_quant_info), std::move(_out_quant_info)); + NodeIdxPair input = {s.tail_node(), 0}; + NodeParams common_params = {name(), s.hints().target_hint}; + return GraphBuilder::add_depthwise_convolution_node( + s.graph(), common_params, input, Size2D(_conv_width, _conv_height), _conv_info, _depth_multiplier, + s.hints().depthwise_convolution_method_hint, std::move(_weights), std::move(_bias), + std::move(_weights_quant_info), std::move(_out_quant_info)); } private: @@ -540,15 +536,14 @@ public: * * @param[in] block_shape Block size to rearranged */ - DepthToSpaceLayer(int32_t block_shape) - : _block_shape(block_shape) + DepthToSpaceLayer(int32_t block_shape) : _block_shape(block_shape) { } NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input = { s.tail_node(), 0 }; + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input = {s.tail_node(), 0}; return GraphBuilder::add_depth_to_space_node(s.graph(), common_params, input, _block_shape); } @@ -569,8 +564,8 @@ public: NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input = { s.tail_node(), 0 }; + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input = {s.tail_node(), 0}; return GraphBuilder::add_dequantization_node(s.graph(), common_params, input); } }; @@ -585,18 +580,21 @@ public: * @param[in] sub_stream_prior PriorBox graph sub-stream. * @param[in] detect_info DetectionOutput parameters. */ - DetectionOutputLayer(SubStream &&sub_stream_conf, SubStream &&sub_stream_prior, const DetectionOutputLayerInfo &detect_info) + DetectionOutputLayer(SubStream &&sub_stream_conf, + SubStream &&sub_stream_prior, + const DetectionOutputLayerInfo &detect_info) : _ss_conf(std::move(sub_stream_conf)), _ss_prior(std::move(sub_stream_prior)), _detect_info(detect_info) { } NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input_loc = { s.tail_node(), 0 }; - NodeIdxPair input_conf = { _ss_conf.tail_node(), 0 }; - NodeIdxPair input_priorbox = { _ss_prior.tail_node(), 0 }; - return GraphBuilder::add_detection_output_node(s.graph(), common_params, input_loc, input_conf, input_priorbox, _detect_info); + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input_loc = {s.tail_node(), 0}; + NodeIdxPair input_conf = {_ss_conf.tail_node(), 0}; + NodeIdxPair input_priorbox = {_ss_prior.tail_node(), 0}; + return GraphBuilder::add_detection_output_node(s.graph(), common_params, input_loc, input_conf, input_priorbox, + _detect_info); } private: @@ -615,9 +613,14 @@ public: * @param[in] anchors Accessor to get anchors tensor data from. 
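As a usage sketch for the variadic ConcatLayer a little further up (assumptions: graph is an already-populated frontend Stream, and the activation choices are arbitrary):

// Build two branches off the current tail of the stream.
SubStream branch_a(graph);
branch_a << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));

SubStream branch_b(graph);
branch_b << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));

// Join the two branch tails; with this constructor the concatenation axis defaults to the
// channel dimension, as the DataLayoutDimension::CHANNEL descriptor above indicates.
graph << ConcatLayer(std::move(branch_a), std::move(branch_b));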
* @param[in] out_quant_info (Optional) Output quantization info */ - DetectionPostProcessLayer(SubStream &&sub_stream_class_prediction, DetectionPostProcessLayerInfo detect_info, ITensorAccessorUPtr anchors, - const QuantizationInfo out_quant_info = QuantizationInfo()) - : _sub_stream_class_prediction(std::move(sub_stream_class_prediction)), _detect_info(detect_info), _anchors(std::move(anchors)), _out_quant_info(std::move(out_quant_info)) + DetectionPostProcessLayer(SubStream &&sub_stream_class_prediction, + DetectionPostProcessLayerInfo detect_info, + ITensorAccessorUPtr anchors, + const QuantizationInfo out_quant_info = QuantizationInfo()) + : _sub_stream_class_prediction(std::move(sub_stream_class_prediction)), + _detect_info(detect_info), + _anchors(std::move(anchors)), + _out_quant_info(std::move(out_quant_info)) { } @@ -625,10 +628,12 @@ public: { ARM_COMPUTE_ERROR_ON(_anchors == nullptr); - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input_box_encoding = { s.tail_node(), 0 }; - NodeIdxPair input_class_prediction = { _sub_stream_class_prediction.tail_node(), 0 }; - return GraphBuilder::add_detection_post_process_node(s.graph(), common_params, input_box_encoding, input_class_prediction, _detect_info, std::move(_anchors), std::move(_out_quant_info)); + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input_box_encoding = {s.tail_node(), 0}; + NodeIdxPair input_class_prediction = {_sub_stream_class_prediction.tail_node(), 0}; + return GraphBuilder::add_detection_post_process_node(s.graph(), common_params, input_box_encoding, + input_class_prediction, _detect_info, std::move(_anchors), + std::move(_out_quant_info)); } private: @@ -645,15 +650,14 @@ public: * * @param[in] shape Output shape */ - DummyLayer(TensorShape shape) - : _shape(shape) + DummyLayer(TensorShape shape) : _shape(shape) { } NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input = { s.tail_node(), 0 }; + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input = {s.tail_node(), 0}; return GraphBuilder::add_dummy_node(s.graph(), common_params, input, _shape); } @@ -677,9 +681,9 @@ public: NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input0 = { _ss0.tail_node(), 0 }; - NodeIdxPair input1 = { _ss1.tail_node(), 0 }; + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input0 = {_ss0.tail_node(), 0}; + NodeIdxPair input1 = {_ss1.tail_node(), 0}; return GraphBuilder::add_elementwise_node(s.graph(), common_params, input0, input1, _op); } @@ -700,8 +704,8 @@ public: NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input = { s.tail_node(), 0 }; + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input = {s.tail_node(), 0}; return GraphBuilder::add_flatten_node(s.graph(), common_params, input); } }; @@ -770,13 +774,13 @@ public: */ NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input = { s.tail_node(), 0 }; - if(_weights != nullptr) + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input = {s.tail_node(), 0}; + if (_weights != nullptr) { - return GraphBuilder::add_fully_connected_layer(s.graph(), common_params, input, _num_outputs, - std::move(_weights), std::move(_bias), _fc_info, - 
std::move(_weights_quant_info), std::move(_out_quant_info), s.hints().fast_math_hint); + return GraphBuilder::add_fully_connected_layer( + s.graph(), common_params, input, _num_outputs, std::move(_weights), std::move(_bias), _fc_info, + std::move(_weights_quant_info), std::move(_out_quant_info), s.hints().fast_math_hint); } else { @@ -811,8 +815,14 @@ public: * @param[in] ss_anchors Graph sub-stream for the anchors. * @param[in] info Generate Proposals operation information. */ - GenerateProposalsLayer(SubStream &&ss_scores, SubStream &&ss_deltas, SubStream &&ss_anchors, GenerateProposalsInfo info) - : _ss_scores(std::move(ss_scores)), _ss_deltas(std::move(ss_deltas)), _ss_anchors(std::move(ss_anchors)), _info(info) + GenerateProposalsLayer(SubStream &&ss_scores, + SubStream &&ss_deltas, + SubStream &&ss_anchors, + GenerateProposalsInfo info) + : _ss_scores(std::move(ss_scores)), + _ss_deltas(std::move(ss_deltas)), + _ss_anchors(std::move(ss_anchors)), + _info(info) { } @@ -824,10 +834,10 @@ public: */ NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair scores = { _ss_scores.tail_node(), 0 }; - NodeIdxPair deltas = { _ss_deltas.tail_node(), 0 }; - NodeIdxPair anchors = { _ss_anchors.tail_node(), 0 }; + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair scores = {_ss_scores.tail_node(), 0}; + NodeIdxPair deltas = {_ss_deltas.tail_node(), 0}; + NodeIdxPair anchors = {_ss_anchors.tail_node(), 0}; return GraphBuilder::add_generate_proposals_node(s.graph(), common_params, scores, deltas, anchors, _info); } @@ -847,15 +857,14 @@ public: * @param[in] axis Axis to perform normalization on * @param[in] epsilon Lower bound value for the normalization */ - L2NormalizeLayer(int axis, float epsilon) - : _axis(axis), _epsilon(epsilon) + L2NormalizeLayer(int axis, float epsilon) : _axis(axis), _epsilon(epsilon) { } NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input = { s.tail_node(), 0 }; + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input = {s.tail_node(), 0}; return GraphBuilder::add_l2_normalize_node(s.graph(), common_params, input, _axis, _epsilon); } @@ -872,15 +881,14 @@ public: * * @param[in] norm_info Normalization information. */ - NormalizationLayer(NormalizationLayerInfo norm_info) - : _norm_info(norm_info) + NormalizationLayer(NormalizationLayerInfo norm_info) : _norm_info(norm_info) { } NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input = { s.tail_node(), 0 }; + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input = {s.tail_node(), 0}; return GraphBuilder::add_normalization_node(s.graph(), common_params, input, _norm_info); } @@ -897,8 +905,7 @@ public: * @param[in] mean Accessor to get mean tensor data from. * @param[in] std Accessor to get std tensor data from. 
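The mean and std accessors taken by NormalizePlanarYUVLayer here feed a per-channel subtract-then-divide; the element-wise form below is stated as an assumption for illustration, not taken from the patch:

// out(x, y, c) = (in(x, y, c) - mean(c)) / std(c), applied independently per channel c.
float normalize_planar_yuv(float in, float channel_mean, float channel_std)
{
    return (in - channel_mean) / channel_std;
}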
*/ - NormalizePlanarYUVLayer(ITensorAccessorUPtr mean, - ITensorAccessorUPtr std) + NormalizePlanarYUVLayer(ITensorAccessorUPtr mean, ITensorAccessorUPtr std) : _mean(std::move(mean)), _std(std::move(std)) { } @@ -908,10 +915,10 @@ public: ARM_COMPUTE_ERROR_ON(_mean == nullptr); ARM_COMPUTE_ERROR_ON(_std == nullptr); - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input = { s.tail_node(), 0 }; - return GraphBuilder::add_normalize_planar_yuv_node(s.graph(), common_params, input, - std::move(_mean), std::move(_std)); + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input = {s.tail_node(), 0}; + return GraphBuilder::add_normalize_planar_yuv_node(s.graph(), common_params, input, std::move(_mean), + std::move(_std)); } private: @@ -929,15 +936,14 @@ public: * specifies the front and the end padding in the i-th dimension. * @param[in] pad_value Padding value to use. Defaults to 0. */ - PadLayer(PaddingList padding, PixelValue pad_value = PixelValue()) - : _padding(padding), _pad_value(pad_value) + PadLayer(PaddingList padding, PixelValue pad_value = PixelValue()) : _padding(padding), _pad_value(pad_value) { } NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input = { s.tail_node(), 0 }; + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input = {s.tail_node(), 0}; return GraphBuilder::add_pad_node(s.graph(), common_params, input, _padding, _pad_value); } @@ -956,15 +962,14 @@ public: * @param[in] layout (Optional) Data layout to assign to permuted tensor. * If UNKNOWN then the input's layout will be used. */ - PermuteLayer(PermutationVector perm, DataLayout layout = DataLayout::UNKNOWN) - : _perm(perm), _layout(layout) + PermuteLayer(PermutationVector perm, DataLayout layout = DataLayout::UNKNOWN) : _perm(perm), _layout(layout) { } NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input = { s.tail_node(), 0 }; + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input = {s.tail_node(), 0}; return GraphBuilder::add_permute_node(s.graph(), common_params, input, _perm, _layout); } @@ -981,15 +986,14 @@ public: * * @param[in] pool_info Pooling information. */ - PoolingLayer(PoolingLayerInfo pool_info) - : _pool_info(pool_info) + PoolingLayer(PoolingLayerInfo pool_info) : _pool_info(pool_info) { } NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input = { s.tail_node(), 0 }; + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input = {s.tail_node(), 0}; return GraphBuilder::add_pooling_node(s.graph(), common_params, input, _pool_info); } @@ -1013,9 +1017,9 @@ public: NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input = { _ss0.tail_node(), 0 }; - NodeIdxPair alpha = { _ss1.tail_node(), 0 }; + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input = {_ss0.tail_node(), 0}; + NodeIdxPair alpha = {_ss1.tail_node(), 0}; return GraphBuilder::add_prelu_node(s.graph(), common_params, input, alpha); } @@ -1064,15 +1068,17 @@ public: * @param[in] format_info (Optional) Format info. * @param[in] transform (Optional) Input transform function. 
*/ - PrintLayer(std::ostream &stream, const IOFormatInfo &format_info = IOFormatInfo(), const std::function transform = nullptr) + PrintLayer(std::ostream &stream, + const IOFormatInfo &format_info = IOFormatInfo(), + const std::function transform = nullptr) : _stream(stream), _format_info(format_info), _transform(transform) { } NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input = { s.tail_node(), 0 }; + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input = {s.tail_node(), 0}; return GraphBuilder::add_print_node(s.graph(), common_params, input, _stream, _format_info, _transform); } @@ -1098,9 +1104,9 @@ public: NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input0 = { s.tail_node(), 0 }; - NodeIdxPair input1 = { _ss.tail_node(), 0 }; + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input0 = {s.tail_node(), 0}; + NodeIdxPair input1 = {_ss.tail_node(), 0}; return GraphBuilder::add_priorbox_node(s.graph(), common_params, input0, input1, _prior_info); } @@ -1117,15 +1123,14 @@ public: * * @param[in] out_quant_info Output tensor quantization info */ - QuantizationLayer(QuantizationInfo out_quant_info) - : _out_quant_info(out_quant_info) + QuantizationLayer(QuantizationInfo out_quant_info) : _out_quant_info(out_quant_info) { } NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input = { s.tail_node(), 0 }; + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input = {s.tail_node(), 0}; return GraphBuilder::add_quantization_node(s.graph(), common_params, input, _out_quant_info); } @@ -1150,8 +1155,8 @@ public: NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input = { s.tail_node(), 0 }; + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input = {s.tail_node(), 0}; return GraphBuilder::add_reduction_operation_node(s.graph(), common_params, input, _op, _axis, _keep_dims); } @@ -1170,15 +1175,14 @@ public: * @param[in] stride Stride value to use for reorganizing the values in the output tensor. * It defines the spatial distance between 2 consecutive pixels in the x and y direction */ - ReorgLayer(int stride) - : _stride(stride) + ReorgLayer(int stride) : _stride(stride) { } NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input = { s.tail_node(), 0 }; + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input = {s.tail_node(), 0}; return GraphBuilder::add_reorg_node(s.graph(), common_params, input, _stride); } @@ -1194,15 +1198,14 @@ public: * * @param[in] shape Target shape. 
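Several layers in this stretch of the file wrap a single parameter and nothing else; a short illustrative snippet of how a few of them read at the call site, with graph again assumed to be an existing frontend Stream and all values arbitrary:

// Appending simple one-parameter layers to the stream's tail.
graph << QuantizationLayer(QuantizationInfo(0.5f, 10))   // requantize the tail tensor
      << ReorgLayer(2)                                   // stride-2 spatial reorganization
      << ReshapeLayer(TensorShape(7U, 7U, 256U));        // reshape to an arbitrary target shape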
*/ - ReshapeLayer(TensorShape shape) - : _shape(shape) + ReshapeLayer(TensorShape shape) : _shape(shape) { } NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input = { s.tail_node(), 0 }; + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input = {s.tail_node(), 0}; return GraphBuilder::add_reshape_node(s.graph(), common_params, input, _shape); } @@ -1221,8 +1224,8 @@ public: NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input = { s.tail_node(), 0 }; + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input = {s.tail_node(), 0}; return GraphBuilder::add_resize_node(s.graph(), common_params, input, _policy, _width_scale, _height_scale); } @@ -1254,9 +1257,9 @@ public: NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input = { _ss_input.tail_node(), 0 }; - NodeIdxPair rois = { _ss_rois.tail_node(), 0 }; + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input = {_ss_input.tail_node(), 0}; + NodeIdxPair rois = {_ss_rois.tail_node(), 0}; return GraphBuilder::add_roi_align_node(s.graph(), common_params, input, rois, _pool_info); } @@ -1275,16 +1278,15 @@ public: * @param[in] mul_w Accessor to get mul weight from. * @param[in] add_w Accessor to get add weight from. */ - ScaleLayer(ITensorAccessorUPtr mul_w, - ITensorAccessorUPtr add_w) + ScaleLayer(ITensorAccessorUPtr mul_w, ITensorAccessorUPtr add_w) : _mul_w(std::move(mul_w)), _add_w(std::move(add_w)) { } NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input = { s.tail_node(), 0 }; + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input = {s.tail_node(), 0}; return GraphBuilder::add_scale_layer(s.graph(), common_params, input, std::move(_mul_w), std::move(_add_w)); } @@ -1302,15 +1304,14 @@ public: * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input). * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input). */ - SliceLayer(Coordinates &starts, Coordinates &ends) - : _starts(starts), _ends(ends) + SliceLayer(Coordinates &starts, Coordinates &ends) : _starts(starts), _ends(ends) { } NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input = { s.tail_node(), 0 }; + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input = {s.tail_node(), 0}; return GraphBuilder::add_slice_node(s.graph(), common_params, input, _starts, _ends); } @@ -1327,15 +1328,14 @@ public: * * @param[in] beta (Optional) Beta value. Default 1.0. */ - SoftmaxLayer(float beta = 1.0f) - : _beta(beta) + SoftmaxLayer(float beta = 1.0f) : _beta(beta) { } NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input = { s.tail_node(), 0 }; + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input = {s.tail_node(), 0}; return GraphBuilder::add_softmax_node(s.graph(), common_params, input, _beta); } @@ -1354,17 +1354,14 @@ public: * @param[in] rest_sub_streams Rest sub-graph branches */ template - StackLayer(SubStream &&sub_stream1, SubStream &&sub_stream2, Ts &&... 
rest_sub_streams) - : _sub_streams(), _axis(0) + StackLayer(SubStream &&sub_stream1, SubStream &&sub_stream2, Ts &&...rest_sub_streams) : _sub_streams(), _axis(0) { _sub_streams.push_back(std::make_unique(std::move(sub_stream1))); _sub_streams.push_back(std::make_unique(std::move(sub_stream2))); - utility::for_each([&](SubStream && sub_stream) - { - _sub_streams.push_back(std::make_unique(std::move(sub_stream))); - }, - std::move(rest_sub_streams)...); + utility::for_each([&](SubStream &&sub_stream) + { _sub_streams.push_back(std::make_unique(std::move(sub_stream))); }, + std::move(rest_sub_streams)...); } /** Construct a concatenation layer * @@ -1374,33 +1371,30 @@ public: * @param[in] rest_sub_streams Rest sub-graph branches */ template - StackLayer(int axis, SubStream &&sub_stream1, SubStream &&sub_stream2, Ts &&... rest_sub_streams) + StackLayer(int axis, SubStream &&sub_stream1, SubStream &&sub_stream2, Ts &&...rest_sub_streams) : _sub_streams(), _axis(axis) { _sub_streams.push_back(std::make_unique(std::move(sub_stream1))); _sub_streams.push_back(std::make_unique(std::move(sub_stream2))); - utility::for_each([&](SubStream && sub_stream) - { - _sub_streams.push_back(std::make_unique(std::move(sub_stream))); - }, - std::move(rest_sub_streams)...); + utility::for_each([&](SubStream &&sub_stream) + { _sub_streams.push_back(std::make_unique(std::move(sub_stream))); }, + std::move(rest_sub_streams)...); } /** Construct a concat layer * * @param[in] sub_stream Sub-stream */ template - StackLayer(SubStream &&sub_stream) - : _sub_streams(), _axis(0) + StackLayer(SubStream &&sub_stream) : _sub_streams(), _axis(0) { _sub_streams.push_back(std::make_unique(std::move(sub_stream))); } NodeID create_layer(IStream &s) override { NodeID nid = EmptyNodeID; - NodeParams common_params = { name(), s.hints().target_hint }; - if(_sub_streams.size() == 1 && _sub_streams.at(0) != nullptr) + NodeParams common_params = {name(), s.hints().target_hint}; + if (_sub_streams.size() == 1 && _sub_streams.at(0) != nullptr) { nid = _sub_streams[0]->tail_node(); } @@ -1408,14 +1402,14 @@ public: { // Collect tail nodes and stack std::vector nodes; - for(auto &ss : _sub_streams) + for (auto &ss : _sub_streams) { - if(ss && (ss->tail_node() != EmptyNodeID)) + if (ss && (ss->tail_node() != EmptyNodeID)) { const auto tail_node = s.graph().node(ss->tail_node()); - if(tail_node != nullptr && tail_node->type() != NodeType::Output) + if (tail_node != nullptr && tail_node->type() != NodeType::Output) { - nodes.push_back({ ss->tail_node(), 0 }); + nodes.push_back({ss->tail_node(), 0}); } } } @@ -1440,15 +1434,18 @@ public: * @param[in] strides The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input). 
* @param[in] strided_slice_info Contains masks for the starts, ends and strides */ - StridedSliceLayer(Coordinates &starts, Coordinates &ends, BiStrides &strides, StridedSliceLayerInfo strided_slice_info) + StridedSliceLayer(Coordinates &starts, + Coordinates &ends, + BiStrides &strides, + StridedSliceLayerInfo strided_slice_info) : _starts(starts), _ends(ends), _strides(strides), _info(strided_slice_info) { } NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input = { s.tail_node(), 0 }; + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input = {s.tail_node(), 0}; return GraphBuilder::add_strided_slice_node(s.graph(), common_params, input, _starts, _ends, _strides, _info); } @@ -1467,15 +1464,14 @@ public: * * @param[in] act_info Activation info */ - YOLOLayer(ActivationLayerInfo act_info) - : _act_info(act_info) + YOLOLayer(ActivationLayerInfo act_info) : _act_info(act_info) { } NodeID create_layer(IStream &s) override { - NodeParams common_params = { name(), s.hints().target_hint }; - NodeIdxPair input = { s.tail_node(), 0 }; + NodeParams common_params = {name(), s.hints().target_hint}; + NodeIdxPair input = {s.tail_node(), 0}; return GraphBuilder::add_yolo_node(s.graph(), common_params, input, _act_info); } diff --git a/arm_compute/graph/frontend/Stream.h b/arm_compute/graph/frontend/Stream.h index db22f6d91b4b78379efc033a7a9ead0086606f8c..7e760b63734edc2aef1a17136dc7453e89bf35e4 100644 --- a/arm_compute/graph/frontend/Stream.h +++ b/arm_compute/graph/frontend/Stream.h @@ -27,7 +27,6 @@ #include "arm_compute/graph/frontend/IStream.h" #include "arm_compute/graph/frontend/IStreamOperators.h" #include "arm_compute/graph/frontend/Types.h" - #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/GraphContext.h" #include "arm_compute/graph/GraphManager.h" @@ -65,7 +64,7 @@ public: void run(); // Inherited overridden methods - void add_layer(ILayer &layer) override; + void add_layer(ILayer &layer) override; Graph &graph() override; const Graph &graph() const override; diff --git a/arm_compute/graph/frontend/SubStream.h b/arm_compute/graph/frontend/SubStream.h index 2283cfeebebf5acce6b7b83cb908e48416c4fad3..c54317c52b84b4e6d0b1ab1026c002fe571726f6 100644 --- a/arm_compute/graph/frontend/SubStream.h +++ b/arm_compute/graph/frontend/SubStream.h @@ -54,7 +54,7 @@ public: SubStream(IStream &s); // Inherited overridden methods - void add_layer(ILayer &layer) override; + void add_layer(ILayer &layer) override; Graph &graph() override; const Graph &graph() const override; diff --git a/arm_compute/graph/frontend/Types.h b/arm_compute/graph/frontend/Types.h index bc4fe7ae0d5fa78bba712d1742098f73216880b8..42b28b3cd2ba0e9d23462efc70c0901a18591462 100644 --- a/arm_compute/graph/frontend/Types.h +++ b/arm_compute/graph/frontend/Types.h @@ -33,39 +33,40 @@ namespace graph namespace frontend { // Import types for graph -using graph::DataType; using graph::DataLayout; using graph::DataLayoutDimension; -using graph::TensorShape; +using graph::DataType; using graph::PermutationVector; +using graph::TensorShape; using graph::ActivationLayerInfo; +using graph::ConvolutionMethod; +using graph::DepthwiseConvolutionMethod; +using graph::DimensionRoundingType; using graph::EltwiseOperation; +using graph::FastMathHint; using graph::FullyConnectedLayerInfo; +using graph::GraphConfig; +using graph::InterpolationPolicy; using graph::NormalizationLayerInfo; using graph::NormType; using graph::PadStrideInfo; using 
graph::PoolingLayerInfo; using graph::PoolingType; +using graph::Size2D; using graph::Target; -using graph::ConvolutionMethod; -using graph::FastMathHint; -using graph::DepthwiseConvolutionMethod; using graph::TensorDescriptor; -using graph::DimensionRoundingType; -using graph::GraphConfig; -using graph::InterpolationPolicy; -using graph::Size2D; /** Hints that can be passed to the stream to expose parameterization */ struct StreamHints { - Target target_hint = { Target::UNSPECIFIED }; /**< Target execution hint */ - ConvolutionMethod convolution_method_hint = { ConvolutionMethod::Default }; /**< Convolution method hint */ - DepthwiseConvolutionMethod depthwise_convolution_method_hint = { DepthwiseConvolutionMethod::Default }; /**< Depthwise Convolution method hint */ - FastMathHint fast_math_hint = { FastMathHint::Disabled }; /**< Fast math hint */ + Target target_hint = {Target::UNSPECIFIED}; /**< Target execution hint */ + ConvolutionMethod convolution_method_hint = {ConvolutionMethod::Default}; /**< Convolution method hint */ + DepthwiseConvolutionMethod depthwise_convolution_method_hint = { + DepthwiseConvolutionMethod::Default}; /**< Depthwise Convolution method hint */ + FastMathHint fast_math_hint = {FastMathHint::Disabled}; /**< Fast math hint */ }; } // namespace frontend } // namespace graph } // namespace arm_compute -#endif /* ARM_COMPUTE_GRAPH_STREAM_TYPES_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_GRAPH_STREAM_TYPES_H */ diff --git a/arm_compute/graph/mutators/DepthConcatSubTensorMutator.h b/arm_compute/graph/mutators/DepthConcatSubTensorMutator.h index cb1f079a2e90632ffcf0f441eb46848e33ae1a4e..61d8854a6137e43dd7c8de83a33773d2230eb4c3 100644 --- a/arm_compute/graph/mutators/DepthConcatSubTensorMutator.h +++ b/arm_compute/graph/mutators/DepthConcatSubTensorMutator.h @@ -40,7 +40,7 @@ public: // Inherited methods overridden virtual void mutate(Graph &g) override; MutationType type() const override; - const char *name() override; + const char *name() override; }; } // namespace graph } // namespace arm_compute diff --git a/arm_compute/graph/mutators/GroupedConvolutionMutator.h b/arm_compute/graph/mutators/GroupedConvolutionMutator.h index e68c7030d0d82867d910e566f4c885bfd44d6b9e..3ed8d786fc06d9d08f914a9caaf620c20342e389 100644 --- a/arm_compute/graph/mutators/GroupedConvolutionMutator.h +++ b/arm_compute/graph/mutators/GroupedConvolutionMutator.h @@ -40,7 +40,7 @@ public: // Inherited methods overridden virtual void mutate(Graph &g) override; MutationType type() const override; - const char *name() override; + const char *name() override; }; } // namespace graph } // namespace arm_compute diff --git a/arm_compute/graph/mutators/InPlaceOperationMutator.h b/arm_compute/graph/mutators/InPlaceOperationMutator.h index 6248d86a0aab7d82eb0214e420153ef575197f11..86f62f1994bea0778bedf76433fe4a2aec9cb92d 100644 --- a/arm_compute/graph/mutators/InPlaceOperationMutator.h +++ b/arm_compute/graph/mutators/InPlaceOperationMutator.h @@ -37,7 +37,7 @@ public: // Inherited methods overridden virtual void mutate(Graph &g) override; MutationType type() const override; - const char *name() override; + const char *name() override; }; } // namespace graph } // namespace arm_compute diff --git a/arm_compute/graph/mutators/NodeExecutionMethodMutator.h b/arm_compute/graph/mutators/NodeExecutionMethodMutator.h index 07c8ffad97bb48d1fb8b9a1876f94817399f26ef..505d4ab3007f43969e476cc146c47c1a1b5f5e5d 100644 --- a/arm_compute/graph/mutators/NodeExecutionMethodMutator.h +++ 
b/arm_compute/graph/mutators/NodeExecutionMethodMutator.h @@ -42,7 +42,7 @@ public: // Inherited methods overridden virtual void mutate(Graph &g) override; MutationType type() const override; - const char *name() override; + const char *name() override; }; } // namespace graph } // namespace arm_compute diff --git a/arm_compute/graph/mutators/NodeFusionMutator.h b/arm_compute/graph/mutators/NodeFusionMutator.h index f3e3eaa1909c702e72662eb4a78c5ee903184562..9d2d44f4360a02b7d4c2fd1e1959d4aa7e162875 100644 --- a/arm_compute/graph/mutators/NodeFusionMutator.h +++ b/arm_compute/graph/mutators/NodeFusionMutator.h @@ -38,7 +38,7 @@ public: // Inherited methods overridden virtual void mutate(Graph &g) override; MutationType type() const override; - const char *name() override; + const char *name() override; }; } // namespace graph } // namespace arm_compute diff --git a/arm_compute/graph/mutators/SplitLayerSubTensorMutator.h b/arm_compute/graph/mutators/SplitLayerSubTensorMutator.h index b14ef5953292060e2091e35859b96374651fd875..ab9746a29bdd6f3f4b7e30d098e68d1b5744c79d 100644 --- a/arm_compute/graph/mutators/SplitLayerSubTensorMutator.h +++ b/arm_compute/graph/mutators/SplitLayerSubTensorMutator.h @@ -40,7 +40,7 @@ public: // Inherited methods overridden virtual void mutate(Graph &g) override; MutationType type() const override; - const char *name() override; + const char *name() override; }; } // namespace graph } // namespace arm_compute diff --git a/arm_compute/graph/mutators/SyntheticDataTypeMutator.h b/arm_compute/graph/mutators/SyntheticDataTypeMutator.h index 2292e520861c2456b3f1334b5946780ef071ecfb..ce8af0a1d73c813ad33a50f5f2eb832a9aab31dc 100644 --- a/arm_compute/graph/mutators/SyntheticDataTypeMutator.h +++ b/arm_compute/graph/mutators/SyntheticDataTypeMutator.h @@ -40,7 +40,7 @@ public: // Inherited methods overridden virtual void mutate(Graph &g) override; MutationType type() const override; - const char *name() override; + const char *name() override; private: DataType _mutate_type; diff --git a/arm_compute/graph/nodes/ActivationLayerNode.h b/arm_compute/graph/nodes/ActivationLayerNode.h index 4a98ee248f1d975ec6fc31a230fc422b395ef028..fe5f273db5e863596c6becc5ab90c67c743310a1 100644 --- a/arm_compute/graph/nodes/ActivationLayerNode.h +++ b/arm_compute/graph/nodes/ActivationLayerNode.h @@ -39,8 +39,7 @@ public: * @param[in] info Activation Layer information * @param[in] out_quant_info (Optional) Output quantization info */ - ActivationLayerNode(ActivationLayerInfo info, - QuantizationInfo out_quant_info = QuantizationInfo()); + ActivationLayerNode(ActivationLayerInfo info, QuantizationInfo out_quant_info = QuantizationInfo()); /** Activation metadata accessor * * @return The activation info of the layer @@ -51,7 +50,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; public: static constexpr NodeType node_type = NodeType::ActivationLayer; diff --git a/arm_compute/graph/nodes/ArgMinMaxLayerNode.h b/arm_compute/graph/nodes/ArgMinMaxLayerNode.h index 69191add995894babf0102aee7b31e50e486efe2..65fbc36db637513c58872a631b8a4227a28510e1 100644 --- a/arm_compute/graph/nodes/ArgMinMaxLayerNode.h +++ b/arm_compute/graph/nodes/ArgMinMaxLayerNode.h @@ -65,7 +65,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void 
accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; public: static constexpr NodeType node_type = NodeType::ArgMinMaxLayer; diff --git a/arm_compute/graph/nodes/BatchNormalizationLayerNode.h b/arm_compute/graph/nodes/BatchNormalizationLayerNode.h index e7f4049df8b46a5056871c74a25404cb6296b065..8583ed87ebb0ded27320bc3d40901a0a575120ca 100644 --- a/arm_compute/graph/nodes/BatchNormalizationLayerNode.h +++ b/arm_compute/graph/nodes/BatchNormalizationLayerNode.h @@ -60,7 +60,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; public: static constexpr NodeType node_type = NodeType::BatchNormalizationLayer; diff --git a/arm_compute/graph/nodes/BoundingBoxTransformLayerNode.h b/arm_compute/graph/nodes/BoundingBoxTransformLayerNode.h index 57175eba2e0b591ed8ed2c7232ceeab4728b6114..96c2544065fe1a45a18a42f3ec9d594ec57f2d0a 100644 --- a/arm_compute/graph/nodes/BoundingBoxTransformLayerNode.h +++ b/arm_compute/graph/nodes/BoundingBoxTransformLayerNode.h @@ -50,7 +50,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; private: BoundingBoxTransformInfo _bbox_info; diff --git a/arm_compute/graph/nodes/ChannelShuffleLayerNode.h b/arm_compute/graph/nodes/ChannelShuffleLayerNode.h index 0696fe56fc39f33e29c39bb2f268303213cd4124..d296a2dcc3ca85df9914bbf237659eccff1c3b32 100644 --- a/arm_compute/graph/nodes/ChannelShuffleLayerNode.h +++ b/arm_compute/graph/nodes/ChannelShuffleLayerNode.h @@ -49,7 +49,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; private: unsigned int _num_groups; diff --git a/arm_compute/graph/nodes/ConcatenateLayerNode.h b/arm_compute/graph/nodes/ConcatenateLayerNode.h index 8582403355483865ad3ed0c23f9266896728987d..13398b1a6151875f375cc3630ccda61629f70243 100644 --- a/arm_compute/graph/nodes/ConcatenateLayerNode.h +++ b/arm_compute/graph/nodes/ConcatenateLayerNode.h @@ -47,7 +47,8 @@ public: * * @return Expected output descriptor */ - static TensorDescriptor compute_output_descriptor(const std::vector &input_descriptors, DataLayoutDimension axis); + static TensorDescriptor compute_output_descriptor(const std::vector &input_descriptors, + DataLayoutDimension axis); /** Disables or not the depth concatenate node * * @warning This is used when concatenate is performed using sub-tensors, where this node is used as a placeholder. 
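// Illustrative sketch of the double dispatch behind the `accept(INodeVisitor &v)` overrides being
// re-aligned throughout these node headers. DefaultNodeVisitor and default_visit() are assumed from
// arm_compute/graph/INodeVisitor.h; this block is an explanatory example, not code added by the patch.
#include "arm_compute/graph/INodeVisitor.h"
#include "arm_compute/graph/nodes/Nodes.h"

class ActivationCounter final : public arm_compute::graph::DefaultNodeVisitor
{
public:
    // Specialised handler: selected at runtime when node.accept(*this) is called on an ActivationLayerNode.
    void visit(arm_compute::graph::ActivationLayerNode &) override
    {
        ++_count;
    }
    // Every other node type falls back to the default handler.
    void default_visit(arm_compute::graph::INode &) override
    {
    }
    unsigned int count() const
    {
        return _count;
    }

private:
    unsigned int _count{0};
};
// Assumed usage: for each INode *n in a Graph, call n->accept(counter); the node's accept() override
// forwards to the matching visit() overload.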
@@ -78,7 +79,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; private: unsigned int _total_nodes; diff --git a/arm_compute/graph/nodes/ConstNode.h b/arm_compute/graph/nodes/ConstNode.h index b377c6020897bd28a8e03725752eeb0f1637dc8b..400b9b4d9fde7e760d2538196bf057930c3a9205 100644 --- a/arm_compute/graph/nodes/ConstNode.h +++ b/arm_compute/graph/nodes/ConstNode.h @@ -44,7 +44,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; private: TensorDescriptor _desc; diff --git a/arm_compute/graph/nodes/ConvolutionLayerNode.h b/arm_compute/graph/nodes/ConvolutionLayerNode.h index 99effa07dc3f9f51773385bfa7978a2a833eb61d..8a77b89f27954258cacc967aa3a68f65e32615dd 100644 --- a/arm_compute/graph/nodes/ConvolutionLayerNode.h +++ b/arm_compute/graph/nodes/ConvolutionLayerNode.h @@ -111,7 +111,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; public: static constexpr NodeType node_type = NodeType::ConvolutionLayer; diff --git a/arm_compute/graph/nodes/DeconvolutionLayerNode.h b/arm_compute/graph/nodes/DeconvolutionLayerNode.h index e74adb17aa7c3e6f874229f13f5f6662b0868ab4..553d05985c5e60ece1d819321732c3d6950ad6af 100644 --- a/arm_compute/graph/nodes/DeconvolutionLayerNode.h +++ b/arm_compute/graph/nodes/DeconvolutionLayerNode.h @@ -61,7 +61,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; private: descriptors::DeconvolutionLayerDescriptor descriptor; diff --git a/arm_compute/graph/nodes/DepthToSpaceLayerNode.h b/arm_compute/graph/nodes/DepthToSpaceLayerNode.h index 25e30e2c67596e1add850c8d834fd0d7519d74a5..5fbcc670ff612980e7a5b2ee06601f3e2b8206ea 100644 --- a/arm_compute/graph/nodes/DepthToSpaceLayerNode.h +++ b/arm_compute/graph/nodes/DepthToSpaceLayerNode.h @@ -56,7 +56,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; private: int _block_shape; diff --git a/arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h b/arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h index 5df86983f07bc0118f8773e80afe7ee3b6079ed6..441d68d2b8ffffb3dc18ec435b3da7f4ac6f41d8 100644 --- a/arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h +++ b/arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h @@ -101,7 +101,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; public: static constexpr NodeType node_type = NodeType::DepthwiseConvolutionLayer; diff --git a/arm_compute/graph/nodes/DequantizationLayerNode.h b/arm_compute/graph/nodes/DequantizationLayerNode.h index 4910938d471219602bf4fd94d6bdf5ea2cdb92c9..1cce71373f3bc2f19f29451dc652756319fa3d6b 100644 --- 
a/arm_compute/graph/nodes/DequantizationLayerNode.h +++ b/arm_compute/graph/nodes/DequantizationLayerNode.h @@ -46,8 +46,8 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; }; } // namespace graph } // namespace arm_compute -#endif /* ARM_COMPUTE_GRAPH_DEQUANTIZATION_NODE_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_GRAPH_DEQUANTIZATION_NODE_H */ diff --git a/arm_compute/graph/nodes/DetectionOutputLayerNode.h b/arm_compute/graph/nodes/DetectionOutputLayerNode.h index b4b910c40e98c4d701817afae71847af6c2c42c6..c3e067e430561d30284cf9c553e26678fe60a1da 100644 --- a/arm_compute/graph/nodes/DetectionOutputLayerNode.h +++ b/arm_compute/graph/nodes/DetectionOutputLayerNode.h @@ -51,13 +51,14 @@ public: * * @return Output descriptor */ - static TensorDescriptor compute_output_descriptor(const TensorDescriptor &input_descriptor, const DetectionOutputLayerInfo &info); + static TensorDescriptor compute_output_descriptor(const TensorDescriptor &input_descriptor, + const DetectionOutputLayerInfo &info); // Inherited overridden methods: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; private: DetectionOutputLayerInfo _info; diff --git a/arm_compute/graph/nodes/DetectionPostProcessLayerNode.h b/arm_compute/graph/nodes/DetectionPostProcessLayerNode.h index 6ff78aee07bbf9bb70b63437b4faa6b837cb8ae2..a53aaf2b9cfb24eec2a1c0294be71bef78af0fc4 100644 --- a/arm_compute/graph/nodes/DetectionPostProcessLayerNode.h +++ b/arm_compute/graph/nodes/DetectionPostProcessLayerNode.h @@ -49,7 +49,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; private: DetectionPostProcessLayerInfo _info; @@ -59,4 +59,4 @@ private: }; } // namespace graph } // namespace arm_compute -#endif /* ARM_COMPUTE_GRAPH_DETECTION_POST_PROCESS_LAYER_NODE_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_GRAPH_DETECTION_POST_PROCESS_LAYER_NODE_H */ diff --git a/arm_compute/graph/nodes/DummyNode.h b/arm_compute/graph/nodes/DummyNode.h index 645f1b325d0933313f79645868bc86b31ee6af27..2263525a72b142803ca27dc41cb04a69fae7966a 100644 --- a/arm_compute/graph/nodes/DummyNode.h +++ b/arm_compute/graph/nodes/DummyNode.h @@ -51,11 +51,11 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; private: TensorShape _shape; }; } // namespace graph } // namespace arm_compute -#endif /* ARM_COMPUTE_GRAPH_DUMMY_NODE_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_GRAPH_DUMMY_NODE_H */ diff --git a/arm_compute/graph/nodes/EltwiseLayerNode.h b/arm_compute/graph/nodes/EltwiseLayerNode.h index 7a6d8e8303f9f96d3ba1dda883601ddc99c1fe71..258298259f208c22666b60174e2b67b073cedae4 100644 --- a/arm_compute/graph/nodes/EltwiseLayerNode.h +++ b/arm_compute/graph/nodes/EltwiseLayerNode.h @@ -79,7 +79,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + 
void accept(INodeVisitor &v) override; static constexpr NodeType node_type = NodeType::EltwiseLayer; @@ -112,7 +112,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; static constexpr NodeType node_type = NodeType::UnaryEltwiseLayer; diff --git a/arm_compute/graph/nodes/FlattenLayerNode.h b/arm_compute/graph/nodes/FlattenLayerNode.h index 046114c2914fe54bd641921355d552739d5a5400..af104707a1bf4729741e89d4bc9ed1f070ec8285 100644 --- a/arm_compute/graph/nodes/FlattenLayerNode.h +++ b/arm_compute/graph/nodes/FlattenLayerNode.h @@ -41,7 +41,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; }; } // namespace graph } // namespace arm_compute diff --git a/arm_compute/graph/nodes/FullyConnectedLayerNode.h b/arm_compute/graph/nodes/FullyConnectedLayerNode.h index 9ade62bf4a4de27e71c0b310846db37fa7dfec7c..3bcf386d645fc921e1562cecce30def69fc32840 100644 --- a/arm_compute/graph/nodes/FullyConnectedLayerNode.h +++ b/arm_compute/graph/nodes/FullyConnectedLayerNode.h @@ -73,7 +73,7 @@ public: */ static TensorDescriptor compute_weights_descriptor(const TensorDescriptor &input_descriptor, unsigned int num_outputs, - FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), const QuantizationInfo &weights_quant_info = QuantizationInfo()); /** Computes fully connected layer output descriptor * @@ -98,7 +98,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; static constexpr NodeType node_type = NodeType::FullyConnectedLayer; diff --git a/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h b/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h index b0051b138505fd63fdaba4168761e207efa068b3..d891ea49eb437f3d9eb67aa8fc072cee92f41798 100644 --- a/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h +++ b/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h @@ -43,7 +43,8 @@ public: * @param[in] fast_math_hint (Optional) Fast math hint * @param[in] fused_activation (Optional) Fused activation layer. 
Disabled if not specified */ - FusedConvolutionBatchNormalizationNode(float epsilon, PadStrideInfo info, + FusedConvolutionBatchNormalizationNode(float epsilon, + PadStrideInfo info, unsigned int num_groups = 1, ConvolutionMethod method = ConvolutionMethod::Default, FastMathHint fast_math_hint = FastMathHint::Disabled, @@ -122,7 +123,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; public: static constexpr NodeType node_type = NodeType::FusedConvolutionBatchNormalizationLayer; diff --git a/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.h b/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.h deleted file mode 100644 index a42e06d889721f5c06956456d2c19748f14b28cc..0000000000000000000000000000000000000000 --- a/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.h +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_GRAPH_FUSED_CONVOLUTION_BATCH_NORMALIZATION_WITH_POST_OPS_NODE_H -#define ARM_COMPUTE_GRAPH_FUSED_CONVOLUTION_BATCH_NORMALIZATION_WITH_POST_OPS_NODE_H - -#include "arm_compute/graph/INode.h" - -namespace arm_compute -{ -namespace graph -{ -/** Batch Normalization node */ -class FusedConvolutionBatchNormalizationWithPostOpsNode final : public INode -{ -public: - /** Constructor - * - * @param[in] epsilon Epsilon parameter. - * @param[in] info Convolution layer attributes. 
- * @param[in] num_groups (Optional) Number of groups (Defaults to 1) - * @param[in] method (Optional) Convolution method to use - * @param[in] fast_math_hint (Optional) Fast math hint - */ - FusedConvolutionBatchNormalizationWithPostOpsNode(float epsilon, PadStrideInfo info, - unsigned int num_groups = 1, - ConvolutionMethod method = ConvolutionMethod::Default, - FastMathHint fast_math_hint = FastMathHint::Disabled); - - /** Epsilon parameter accessor - * - * @return Epsilon parameter - */ - float epsilon() const; - - /** Computes convolution output descriptor - * - * @param[in] input_descriptor Input descriptor - * @param[in] weights_descriptor Weights descriptor - * @param[in] info Convolution operation attributes - * - * @return Output descriptor - */ - static TensorDescriptor compute_output_descriptor(const TensorDescriptor &input_descriptor, - const TensorDescriptor &weights_descriptor, - const PadStrideInfo &info); - - /** Sets the convolution layer method to use - * - * @param[in] method Method to use for convolution - */ - void set_convolution_method(ConvolutionMethod method); - - /** Number of groups in convolution accessor - * - * @return Number of groups in convolution - */ - unsigned int num_groups() const; - - /** Convolution layer method accessor - * - * @note This is an indication on which convolution layer implementation to use, - * if it fails to be created the library's heuristic approach will be used - * - * @return Convolution layer method to be used by the node - */ - ConvolutionMethod convolution_method() const; - - /** Sets the fast math hint - * - * @param[in] hint Hint to use for convolution - */ - void set_fast_math_hint(FastMathHint hint); - - /** Fast math hint accessor - * - * @return Fast math hint to be used by the node - */ - FastMathHint fast_math_hint() const; - - /** Convolution metadata accessor - * - * @return Convolution information - */ - PadStrideInfo convolution_info() const; - - // Inherited overridden methods: - NodeType type() const override; - bool forward_descriptors() override; - TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; - -public: - static constexpr NodeType node_type = NodeType::FusedConvolutionBatchNormalizationLayerWithPostOpsLayer; - -private: - float _epsilon; - - PadStrideInfo _info; - unsigned int _num_groups; - ConvolutionMethod _method; - FastMathHint _fast_math_hint; -}; - -} // namespace graph -} // namespace arm_compute -#endif /* ARM_COMPUTE_GRAPH_BATCH_NORMALIZATION_LAYER_NODE_H */ diff --git a/arm_compute/graph/nodes/FusedConvolutionWithPostOpNode.h b/arm_compute/graph/nodes/FusedConvolutionWithPostOpNode.h deleted file mode 100644 index 6048994b02538f11a488b0217a09c841cb23890c..0000000000000000000000000000000000000000 --- a/arm_compute/graph/nodes/FusedConvolutionWithPostOpNode.h +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_GRAPH_FUSED_CONVOLUTION_WITH_POST_OP_NODE_H -#define ARM_COMPUTE_GRAPH_FUSED_CONVOLUTION_WITH_POST_OP_NODE_H - -#include "arm_compute/graph/INode.h" - -#include - -namespace arm_compute -{ -namespace graph -{ -/** Convolution node */ -class FusedConvolutionWithPostOpNode final : public INode -{ -public: - /** Constructor - * - * @param[in] info Convolution layer attributes - * @param[in] num_groups (Optional) Number of groups (Defaults to 1) - * @param[in] method (Optional) Convolution method to use - * @param[in] fast_math_hint (Optional) Fast math hint - * @param[in] out_quant_info (Optional) Output quantization info - */ - FusedConvolutionWithPostOpNode(PadStrideInfo info, - unsigned int num_groups = 1, - ConvolutionMethod method = ConvolutionMethod::Default, - FastMathHint fast_math_hint = FastMathHint::Disabled, - QuantizationInfo out_quant_info = QuantizationInfo()); - /** Sets the convolution layer method to use - * - * @param[in] method Method to use for convolution - */ - void set_convolution_method(ConvolutionMethod method); - /** Convolution layer method accessor - * - * @note This is an indication on which convolution layer implementation to use, - * if it fails to be created the library's heuristic approach will be used - * - * @return Convolution layer method to be used by the node - */ - ConvolutionMethod convolution_method() const; - /** Sets the fast math fast hint - * - * @param[in] hint Hint to use for convolution - */ - void set_fast_math_hint(FastMathHint hint); - /** Fast math hint accessor - * - * @return Fast math hint to be used by the node - */ - FastMathHint fast_math_hint() const; - /** Convolution metadata accessor - * - * @return Convolution information - */ - PadStrideInfo convolution_info() const; - /** Number of groups in convolution accessor - * - * @return Number of groups in convolution - */ - unsigned int num_groups() const; - /** Returns fused activation - * - * @return Fused activation - */ - ActivationLayerInfo fused_activation() const; - /** Sets fused activation - * - * @param[in] fused_activation Fused activation to set - */ - void set_fused_activation(ActivationLayerInfo fused_activation); - /** Sets convolution info - * - * @param[in] info Convolution info to set - */ - void set_convolution_info(PadStrideInfo info); - /** Computes convolution output descriptor - * - * @param[in] input_descriptor Input descriptor 
- * @param[in] weights_descriptor Weights descriptor - * @param[in] info Convolution operation attributes - * - * @return Output descriptor - */ - static TensorDescriptor compute_output_descriptor(const TensorDescriptor &input_descriptor, - const TensorDescriptor &weights_descriptor, - const PadStrideInfo &info); - - // Inherited overridden methods: - NodeType type() const override; - bool forward_descriptors() override; - TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; - -public: - static constexpr NodeType node_type = NodeType::FusedConvolutionWithPostOp; - -private: - PadStrideInfo _info; - unsigned int _num_groups; - ConvolutionMethod _method; - FastMathHint _fast_math_hint; - QuantizationInfo _out_quant_info; - ActivationLayerInfo _fused_activation; -}; -} // namespace graph -} // namespace arm_compute - -#endif /* ARM_COMPUTE_GRAPH_FUSED_CONVOLUTION_WITH_POST_OP_NODE_H */ diff --git a/arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h b/arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h index a01cb9dc426941bcd15bfb9c10a3668b096182b4..a61b155151f1e1b095a3a08a004539228a95eb58 100644 --- a/arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h +++ b/arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h @@ -46,7 +46,7 @@ public: PadStrideInfo info, unsigned int depth_multiplier, DepthwiseConvolutionMethod method, - ActivationLayerInfo fused_activation = ActivationLayerInfo()); + ActivationLayerInfo fused_activation = ActivationLayerInfo()); /** Sets the depthwise convolution layer method to use * @@ -117,7 +117,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; public: static constexpr NodeType node_type = NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer; diff --git a/arm_compute/graph/nodes/GenerateProposalsLayerNode.h b/arm_compute/graph/nodes/GenerateProposalsLayerNode.h index 6f8edc87582964e9ce8f6904a16ec083743bf492..b5e4b9781cecb7c86a14393391b5a8954b7d2e7c 100644 --- a/arm_compute/graph/nodes/GenerateProposalsLayerNode.h +++ b/arm_compute/graph/nodes/GenerateProposalsLayerNode.h @@ -50,7 +50,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; private: GenerateProposalsInfo _info; diff --git a/arm_compute/graph/nodes/InputNode.h b/arm_compute/graph/nodes/InputNode.h index 07091af64fbfb397ea66c93b78422334baf578c0..0983d25a599eb303405304de4f35733d2d99cb55 100644 --- a/arm_compute/graph/nodes/InputNode.h +++ b/arm_compute/graph/nodes/InputNode.h @@ -44,7 +44,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; private: TensorDescriptor _desc; diff --git a/arm_compute/graph/nodes/L2NormalizeLayerNode.h b/arm_compute/graph/nodes/L2NormalizeLayerNode.h index 8edc5b0bf3254a8fa1f95d4a2cf1b0100a9ebea5..ed11412b70842900349341eb7b5a782ad456ce28 100644 --- a/arm_compute/graph/nodes/L2NormalizeLayerNode.h +++ b/arm_compute/graph/nodes/L2NormalizeLayerNode.h @@ -68,7 +68,7 @@ public: NodeType type() const override; bool 
forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; private: int _axis; diff --git a/arm_compute/graph/nodes/Nodes.h b/arm_compute/graph/nodes/Nodes.h index 3887eaeac6e1b3281d100ca925097f2c1f4557c6..d4ad32b6f0ab5084e21e59032ef954936746f611 100644 --- a/arm_compute/graph/nodes/Nodes.h +++ b/arm_compute/graph/nodes/Nodes.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_GRAPH_NODES_H -#define ARM_COMPUTE_GRAPH_NODES_H +#ifndef ACL_ARM_COMPUTE_GRAPH_NODES_NODES_H +#define ACL_ARM_COMPUTE_GRAPH_NODES_NODES_H #include "arm_compute/graph/nodes/ActivationLayerNode.h" #include "arm_compute/graph/nodes/ArgMinMaxLayerNode.h" @@ -43,8 +43,6 @@ #include "arm_compute/graph/nodes/FlattenLayerNode.h" #include "arm_compute/graph/nodes/FullyConnectedLayerNode.h" #include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h" -#include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.h" -#include "arm_compute/graph/nodes/FusedConvolutionWithPostOpNode.h" #include "arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h" #include "arm_compute/graph/nodes/GenerateProposalsLayerNode.h" #include "arm_compute/graph/nodes/InputNode.h" @@ -52,22 +50,22 @@ #include "arm_compute/graph/nodes/NormalizationLayerNode.h" #include "arm_compute/graph/nodes/NormalizePlanarYUVLayerNode.h" #include "arm_compute/graph/nodes/OutputNode.h" -#include "arm_compute/graph/nodes/PReluLayerNode.h" #include "arm_compute/graph/nodes/PadLayerNode.h" #include "arm_compute/graph/nodes/PermuteLayerNode.h" #include "arm_compute/graph/nodes/PoolingLayerNode.h" +#include "arm_compute/graph/nodes/PReluLayerNode.h" #include "arm_compute/graph/nodes/PrintLayerNode.h" #include "arm_compute/graph/nodes/PriorBoxLayerNode.h" #include "arm_compute/graph/nodes/QuantizationLayerNode.h" -#include "arm_compute/graph/nodes/ROIAlignLayerNode.h" #include "arm_compute/graph/nodes/ReductionLayerNode.h" #include "arm_compute/graph/nodes/ReorgLayerNode.h" #include "arm_compute/graph/nodes/ReshapeLayerNode.h" #include "arm_compute/graph/nodes/ResizeLayerNode.h" +#include "arm_compute/graph/nodes/ROIAlignLayerNode.h" #include "arm_compute/graph/nodes/SliceLayerNode.h" #include "arm_compute/graph/nodes/SoftmaxLayerNode.h" #include "arm_compute/graph/nodes/SplitLayerNode.h" #include "arm_compute/graph/nodes/StackLayerNode.h" #include "arm_compute/graph/nodes/StridedSliceLayerNode.h" -#endif /* ARM_COMPUTE_GRAPH_NODES_H */ +#endif // ACL_ARM_COMPUTE_GRAPH_NODES_NODES_H diff --git a/arm_compute/graph/nodes/NodesFwd.h b/arm_compute/graph/nodes/NodesFwd.h index f1576d6336f2bd17979cbf94217ffefc768e90e0..580f339468a648ba30f8fa2c8821a3a405bf0174 100644 --- a/arm_compute/graph/nodes/NodesFwd.h +++ b/arm_compute/graph/nodes/NodesFwd.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
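// The guard renames above all follow one convention: the macro is derived from the header's full path,
// prefixed with ACL_, and the closing #endif carries a line comment naming the guard instead of a block
// comment. Shown here for arm_compute/graph/nodes/Nodes.h purely as an illustration of the pattern.
#ifndef ACL_ARM_COMPUTE_GRAPH_NODES_NODES_H
#define ACL_ARM_COMPUTE_GRAPH_NODES_NODES_H
// ... includes of the individual node headers ...
#endif // ACL_ARM_COMPUTE_GRAPH_NODES_NODES_H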
*/ -#ifndef ARM_COMPUTE_GRAPH_NODES_FWD_H -#define ARM_COMPUTE_GRAPH_NODES_FWD_H +#ifndef ACL_ARM_COMPUTE_GRAPH_NODES_NODESFWD_H +#define ACL_ARM_COMPUTE_GRAPH_NODES_NODESFWD_H namespace arm_compute { @@ -49,9 +49,7 @@ class EltwiseLayerNode; class FlattenLayerNode; class FullyConnectedLayerNode; class FusedConvolutionBatchNormalizationNode; -class FusedConvolutionWithPostOpNode; class FusedDepthwiseConvolutionBatchNormalizationNode; -class FusedConvolutionBatchNormalizationWithPostOpsNode; class GenerateProposalsLayerNode; class InputNode; class L2NormalizeLayerNode; @@ -77,4 +75,4 @@ class StackLayerNode; class StridedSliceLayerNode; } // namespace graph } // namespace arm_compute -#endif /* ARM_COMPUTE_GRAPH_NODES_FWD_H */ +#endif // ACL_ARM_COMPUTE_GRAPH_NODES_NODESFWD_H diff --git a/arm_compute/graph/nodes/NormalizationLayerNode.h b/arm_compute/graph/nodes/NormalizationLayerNode.h index 503b859e5317aed44e4e0452a5949584fc41eecf..86f2fb9dba2ff2b4d5ab0b5991eb4ceab361f4fe 100644 --- a/arm_compute/graph/nodes/NormalizationLayerNode.h +++ b/arm_compute/graph/nodes/NormalizationLayerNode.h @@ -49,7 +49,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; private: NormalizationLayerInfo _info; diff --git a/arm_compute/graph/nodes/NormalizePlanarYUVLayerNode.h b/arm_compute/graph/nodes/NormalizePlanarYUVLayerNode.h index 4d84c20de071cd4e9c1d2a993ccdfd1e6ad884f8..158acc4c2399a95a7a40701d9d359f3d29c55646 100644 --- a/arm_compute/graph/nodes/NormalizePlanarYUVLayerNode.h +++ b/arm_compute/graph/nodes/NormalizePlanarYUVLayerNode.h @@ -41,7 +41,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; }; } // namespace graph } // namespace arm_compute diff --git a/arm_compute/graph/nodes/OutputNode.h b/arm_compute/graph/nodes/OutputNode.h index c91bc6b6993403f7ee5b4511b86711df708a42c1..75484ab32864fb4f05c3dc7b32cb318e6f311c7e 100644 --- a/arm_compute/graph/nodes/OutputNode.h +++ b/arm_compute/graph/nodes/OutputNode.h @@ -41,7 +41,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; }; } // namespace graph } // namespace arm_compute diff --git a/arm_compute/graph/nodes/PReluLayerNode.h b/arm_compute/graph/nodes/PReluLayerNode.h index b8e6c1ae7fc78ffb4d0be0c15d0f40fb643be90e..532fdccb3a2e3e8953ba15b6767dff8744823a77 100644 --- a/arm_compute/graph/nodes/PReluLayerNode.h +++ b/arm_compute/graph/nodes/PReluLayerNode.h @@ -41,7 +41,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; }; } // namespace graph } // namespace arm_compute diff --git a/arm_compute/graph/nodes/PadLayerNode.h b/arm_compute/graph/nodes/PadLayerNode.h index d6ff3553da67139171e7f19ac0adc0d04ec7d94d..dcb5ea595b17778b9ed407a981029380d89fd4a1 100644 --- a/arm_compute/graph/nodes/PadLayerNode.h +++ b/arm_compute/graph/nodes/PadLayerNode.h @@ -56,7 +56,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor 
configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; public: static constexpr NodeType node_type = NodeType::PadLayer; diff --git a/arm_compute/graph/nodes/PermuteLayerNode.h b/arm_compute/graph/nodes/PermuteLayerNode.h index 0b2380b51c249b0a32886114f18433d6f810d635..62654e777c338ef2f6f5df31ebcf8ca1e391bf3f 100644 --- a/arm_compute/graph/nodes/PermuteLayerNode.h +++ b/arm_compute/graph/nodes/PermuteLayerNode.h @@ -51,7 +51,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; private: PermutationVector _perm; diff --git a/arm_compute/graph/nodes/PoolingLayerNode.h b/arm_compute/graph/nodes/PoolingLayerNode.h index b336bb906f7b27eb721a177d2539c13243ba8e74..c81f3f98dc9260681586cbb2b5abcd44dc4ddaa6 100644 --- a/arm_compute/graph/nodes/PoolingLayerNode.h +++ b/arm_compute/graph/nodes/PoolingLayerNode.h @@ -57,7 +57,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; private: PoolingLayerInfo _info; diff --git a/arm_compute/graph/nodes/PrintLayerNode.h b/arm_compute/graph/nodes/PrintLayerNode.h index b57ac1f6d40f02602be19fe8e57bd1939dc11fd7..e7accc8015a999498fac1bd26fd9dee744fe6c32 100644 --- a/arm_compute/graph/nodes/PrintLayerNode.h +++ b/arm_compute/graph/nodes/PrintLayerNode.h @@ -43,7 +43,9 @@ public: * @param[in] format_info (Optional) Format info. * @param[in] transform (Optional) Input transform function. */ - PrintLayerNode(std::ostream &stream, const IOFormatInfo &format_info = IOFormatInfo(), const std::function transform = nullptr); + PrintLayerNode(std::ostream &stream, + const IOFormatInfo &format_info = IOFormatInfo(), + const std::function transform = nullptr); /** Stream metadata accessor * @@ -67,7 +69,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; private: std::ostream &_stream; diff --git a/arm_compute/graph/nodes/PriorBoxLayerNode.h b/arm_compute/graph/nodes/PriorBoxLayerNode.h index c7eadd1fe50461607c6e0c3b4ca7c53419fbc87a..db36bfb1e09cb824c732f729296fbfacf0d07cd2 100644 --- a/arm_compute/graph/nodes/PriorBoxLayerNode.h +++ b/arm_compute/graph/nodes/PriorBoxLayerNode.h @@ -51,13 +51,14 @@ public: * * @return Output descriptor */ - static TensorDescriptor compute_output_descriptor(const TensorDescriptor &input_descriptor, const PriorBoxLayerInfo &info); + static TensorDescriptor compute_output_descriptor(const TensorDescriptor &input_descriptor, + const PriorBoxLayerInfo &info); // Inherited overridden methods: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; private: PriorBoxLayerInfo _info; diff --git a/arm_compute/graph/nodes/QuantizationLayerNode.h b/arm_compute/graph/nodes/QuantizationLayerNode.h index e5d81afa0eee5228ecf102848c00fde9527c5aa0..b8e4c7d27b0eeaaa5545d6edaa8174b72088a844 100644 --- a/arm_compute/graph/nodes/QuantizationLayerNode.h +++ b/arm_compute/graph/nodes/QuantizationLayerNode.h @@ -51,7 +51,7 @@ 
public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; static constexpr NodeType node_type = NodeType::QuantizationLayer; diff --git a/arm_compute/graph/nodes/ROIAlignLayerNode.h b/arm_compute/graph/nodes/ROIAlignLayerNode.h index 5abd0659b5f5797a69c4490560dbd6d60780d709..70309a551c7d035df7e72fe0376a7f05b8011024 100644 --- a/arm_compute/graph/nodes/ROIAlignLayerNode.h +++ b/arm_compute/graph/nodes/ROIAlignLayerNode.h @@ -56,7 +56,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; private: ROIPoolingLayerInfo _pool_info; diff --git a/arm_compute/graph/nodes/ReductionLayerNode.h b/arm_compute/graph/nodes/ReductionLayerNode.h index b8d295945ccc1b1fa3fbad30aa18d123d2a66e65..ff99466c8fe595dc5d87e1fff5fcffd367de1ca4 100644 --- a/arm_compute/graph/nodes/ReductionLayerNode.h +++ b/arm_compute/graph/nodes/ReductionLayerNode.h @@ -56,7 +56,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; private: ReductionOperation _op; diff --git a/arm_compute/graph/nodes/ReorgLayerNode.h b/arm_compute/graph/nodes/ReorgLayerNode.h index 986692ed28a4e910da332c81af1b9c055371bff2..a3bbcdb00fc3c0d5346e4de1dffe032de8557ab9 100644 --- a/arm_compute/graph/nodes/ReorgLayerNode.h +++ b/arm_compute/graph/nodes/ReorgLayerNode.h @@ -57,7 +57,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; private: int _stride; diff --git a/arm_compute/graph/nodes/ReshapeLayerNode.h b/arm_compute/graph/nodes/ReshapeLayerNode.h index 727d253ce5e57ff953eed6b52586d52cef601c24..992275c2b16120d0896f17c24b56914766e1d5ac 100644 --- a/arm_compute/graph/nodes/ReshapeLayerNode.h +++ b/arm_compute/graph/nodes/ReshapeLayerNode.h @@ -44,7 +44,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; private: TensorShape _shape; diff --git a/arm_compute/graph/nodes/ResizeLayerNode.h b/arm_compute/graph/nodes/ResizeLayerNode.h index 79f8889f9c6fcc77b09c4cacc648252f9d7e8fd4..480d6e517fad019f088902ef4dcfaa1dc5aaafc6 100644 --- a/arm_compute/graph/nodes/ResizeLayerNode.h +++ b/arm_compute/graph/nodes/ResizeLayerNode.h @@ -51,7 +51,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; private: InterpolationPolicy _policy; diff --git a/arm_compute/graph/nodes/SliceLayerNode.h b/arm_compute/graph/nodes/SliceLayerNode.h index 08d3794e26869a2a7d13d35612ef5c4a67c2d55c..63f266b21731c046142b0c71ea86cd8d31f2561a 100644 --- a/arm_compute/graph/nodes/SliceLayerNode.h +++ b/arm_compute/graph/nodes/SliceLayerNode.h @@ -51,7 +51,8 @@ public: * @return Output descriptor */ static TensorDescriptor compute_output_descriptor(const TensorDescriptor 
&input_descriptor, - const Coordinates &starts, const Coordinates &ends); + const Coordinates &starts, + const Coordinates &ends); /** Start coordinates accessor * * @return Start coordinates of the dimensions @@ -67,7 +68,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; private: Coordinates _starts; diff --git a/arm_compute/graph/nodes/SoftmaxLayerNode.h b/arm_compute/graph/nodes/SoftmaxLayerNode.h index 0868c6ff16dca5cbcdcc9503baf44122d452315b..2cb1ac2cf499d18051fe1a23a3d88fc0db7f3177 100644 --- a/arm_compute/graph/nodes/SoftmaxLayerNode.h +++ b/arm_compute/graph/nodes/SoftmaxLayerNode.h @@ -49,7 +49,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; public: static constexpr NodeType node_type = NodeType::SoftmaxLayer; diff --git a/arm_compute/graph/nodes/SplitLayerNode.h b/arm_compute/graph/nodes/SplitLayerNode.h index 13cccdd447c97fb3b6c827fd87e7030acb31f406..5e6df53c0f681626e4e6efa69c79f2aeee09d1c3 100644 --- a/arm_compute/graph/nodes/SplitLayerNode.h +++ b/arm_compute/graph/nodes/SplitLayerNode.h @@ -55,7 +55,9 @@ public: * @return A pair with the descriptor of the split and the starting coordinates */ std::pair compute_output_descriptor(const TensorDescriptor &input_descriptor, - unsigned int num_splits, int axis, unsigned int idx); + unsigned int num_splits, + int axis, + unsigned int idx); /** Number of splits accessor * * @return Number of splits @@ -72,7 +74,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; private: unsigned int _num_splits; diff --git a/arm_compute/graph/nodes/StackLayerNode.h b/arm_compute/graph/nodes/StackLayerNode.h index 2990895c2b4beace17f338906598ef7a2f9e0862..9f0767c9f2718ce42e5793885a3b358659355fa5 100644 --- a/arm_compute/graph/nodes/StackLayerNode.h +++ b/arm_compute/graph/nodes/StackLayerNode.h @@ -58,7 +58,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; private: unsigned int _total_nodes; diff --git a/arm_compute/graph/nodes/StridedSliceLayerNode.h b/arm_compute/graph/nodes/StridedSliceLayerNode.h index 6039f312b3d47170bd708fb073d71ef8f82816f5..f521feb7807b14db210ce33870ca9d6c2c6209a1 100644 --- a/arm_compute/graph/nodes/StridedSliceLayerNode.h +++ b/arm_compute/graph/nodes/StridedSliceLayerNode.h @@ -84,7 +84,7 @@ public: NodeType type() const override; bool forward_descriptors() override; TensorDescriptor configure_output(size_t idx) const override; - void accept(INodeVisitor &v) override; + void accept(INodeVisitor &v) override; private: Coordinates _starts; diff --git a/arm_compute/graph/printers/DotGraphPrinter.h b/arm_compute/graph/printers/DotGraphPrinter.h index 63b89272f486580179d2151162f8e3a0d677a60e..66380330441b43db4c2f5c9675ab35c5263115a6 100644 --- a/arm_compute/graph/printers/DotGraphPrinter.h +++ b/arm_compute/graph/printers/DotGraphPrinter.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019,2021 Arm Limited. 
+ * Copyright (c) 2018-2019,2021,2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,11 +21,10 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_GRAPH_DOTGRAPHPRINTER_H -#define ARM_COMPUTE_GRAPH_DOTGRAPHPRINTER_H +#ifndef ACL_ARM_COMPUTE_GRAPH_PRINTERS_DOTGRAPHPRINTER_H +#define ACL_ARM_COMPUTE_GRAPH_PRINTERS_DOTGRAPHPRINTER_H #include "arm_compute/graph/IGraphPrinter.h" - #include "arm_compute/graph/INodeVisitor.h" #include @@ -57,8 +56,6 @@ public: void visit(DepthwiseConvolutionLayerNode &n) override; void visit(EltwiseLayerNode &n) override; void visit(FusedConvolutionBatchNormalizationNode &n) override; - void visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n) override; - void visit(FusedConvolutionWithPostOpNode &n) override; void visit(FusedDepthwiseConvolutionBatchNormalizationNode &n) override; void visit(NormalizationLayerNode &n) override; void visit(PoolingLayerNode &n) override; @@ -106,4 +103,4 @@ private: }; } // namespace graph } // namespace arm_compute -#endif /* ARM_COMPUTE_GRAPH_DOTGRAPHPRINTER_H */ +#endif // ACL_ARM_COMPUTE_GRAPH_PRINTERS_DOTGRAPHPRINTER_H diff --git a/arm_compute/runtime/Allocator.h b/arm_compute/runtime/Allocator.h index 83f072ab6b6afadfa4262339fc98b07087c65a6c..e99ddb3daca42ca39565742c00d2d59411b2b4db 100644 --- a/arm_compute/runtime/Allocator.h +++ b/arm_compute/runtime/Allocator.h @@ -25,7 +25,6 @@ #define ARM_COMPUTE_ALLOCATOR_H #include "arm_compute/runtime/IAllocator.h" - #include "arm_compute/runtime/IMemoryRegion.h" #include @@ -40,9 +39,9 @@ public: Allocator() = default; // Inherited methods overridden: - void *allocate(size_t size, size_t alignment) override; - void free(void *ptr) override; + void *allocate(size_t size, size_t alignment) override; + void free(void *ptr) override; std::unique_ptr make_region(size_t size, size_t alignment) override; }; -} // arm_compute +} // namespace arm_compute #endif /*ARM_COMPUTE_ALLOCATOR_H */ diff --git a/arm_compute/runtime/Array.h b/arm_compute/runtime/Array.h index 21d9c25c87bbe65959534696b2d9336cb04f9780..928327331715108443495bafbcc86dfdcac974ac 100644 --- a/arm_compute/runtime/Array.h +++ b/arm_compute/runtime/Array.h @@ -37,16 +37,14 @@ class Array : public IArray { public: /** Default constructor: empty array */ - Array() - : IArray(0), _values(nullptr) + Array() : IArray(0), _values(nullptr) { } /** Constructor: initializes an array which can contain up to max_num_points values * * @param[in] max_num_values Maximum number of values the array will be able to stored */ - Array(size_t max_num_values) - : IArray(max_num_values), _values(std::make_unique(max_num_values)) + Array(size_t max_num_values) : IArray(max_num_values), _values(std::make_unique(max_num_values)) { } @@ -72,5 +70,5 @@ using Int16Array = Array; using Int32Array = Array; /** Array of floats. 
*/ using FloatArray = Array; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_ARRAY_H */ diff --git a/arm_compute/runtime/BlobLifetimeManager.h b/arm_compute/runtime/BlobLifetimeManager.h index 0d69f2e7c52731ae11aa55d02932236b2c0c1628..18ffe96ee5861f49b21ba071d0891bdba0ee6d89 100644 --- a/arm_compute/runtime/BlobLifetimeManager.h +++ b/arm_compute/runtime/BlobLifetimeManager.h @@ -25,7 +25,6 @@ #define ARM_COMPUTE_BLOBLIFETIMEMANAGER_H #include "arm_compute/runtime/ISimpleLifetimeManager.h" - #include "arm_compute/runtime/Types.h" #include @@ -62,7 +61,7 @@ public: // Inherited methods overridden: std::unique_ptr create_pool(IAllocator *allocator) override; - MappingType mapping_type() const override; + MappingType mapping_type() const override; private: // Inherited methods overridden: diff --git a/arm_compute/runtime/BlobMemoryPool.h b/arm_compute/runtime/BlobMemoryPool.h index 8481fa20f9951dd308b16eafa2b92087049016dd..b25efc3821897dcc76f8ba3bba1f8e7a083c9978 100644 --- a/arm_compute/runtime/BlobMemoryPool.h +++ b/arm_compute/runtime/BlobMemoryPool.h @@ -25,7 +25,6 @@ #define ARM_COMPUTE_BLOBMEMORYPOOL_H #include "arm_compute/runtime/IMemoryPool.h" - #include "arm_compute/runtime/IMemoryRegion.h" #include "arm_compute/runtime/Types.h" @@ -62,8 +61,8 @@ public: BlobMemoryPool &operator=(BlobMemoryPool &&) = default; // Inherited methods overridden: - void acquire(MemoryMappings &handles) override; - void release(MemoryMappings &handles) override; + void acquire(MemoryMappings &handles) override; + void release(MemoryMappings &handles) override; MappingType mapping_type() const override; std::unique_ptr duplicate() override; diff --git a/arm_compute/runtime/CL/CLArray.h b/arm_compute/runtime/CL/CLArray.h index 7efe208b9f1ce174014a736be2074ba48e9f71b2..6e81a46a29a4f6bacf12bcad2d3d7cf5f99112aa 100644 --- a/arm_compute/runtime/CL/CLArray.h +++ b/arm_compute/runtime/CL/CLArray.h @@ -38,8 +38,7 @@ class CLArray : public ICLArray { public: /** Default constructor: empty array */ - CLArray() - : ICLArray(0), _buffer() + CLArray() : ICLArray(0), _buffer() { } /** Prevent instances of this class from being copied (As this class contains pointers) */ @@ -55,7 +54,8 @@ public: * @param[in] max_num_values Maximum number of values the array will be able to stored */ CLArray(size_t max_num_values) - : ICLArray(max_num_values), _buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, max_num_values * sizeof(T)) + : ICLArray(max_num_values), + _buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, max_num_values * sizeof(T)) { } /** Enqueue a map operation of the allocated buffer. @@ -91,7 +91,8 @@ protected: uint8_t *do_map(cl::CommandQueue &q, bool blocking) override { ARM_COMPUTE_ERROR_ON(nullptr == _buffer.get()); - return static_cast(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, this->max_num_values() * sizeof(T))); + return static_cast(q.enqueueMapBuffer( + _buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, this->max_num_values() * sizeof(T))); } void do_unmap(cl::CommandQueue &q, uint8_t *mapping) override { @@ -114,5 +115,5 @@ using CLInt16Array = CLArray; using CLInt32Array = CLArray; /** OpenCL Array of floats. 
*/ using CLFloatArray = CLArray; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_CLARRAY_H */ diff --git a/arm_compute/runtime/CL/CLBufferAllocator.h b/arm_compute/runtime/CL/CLBufferAllocator.h index 7467e9d1c6dcc6c5a6b744fdf591ea2a8d9e6294..00ff017012d798341b2e00cc3b6313b52a9d8f98 100644 --- a/arm_compute/runtime/CL/CLBufferAllocator.h +++ b/arm_compute/runtime/CL/CLBufferAllocator.h @@ -35,9 +35,9 @@ class CLBufferAllocator final : public IAllocator { public: // Inherited methods overridden: - void *allocate(size_t size, size_t alignment) override; - void free(void *ptr) override; + void *allocate(size_t size, size_t alignment) override; + void free(void *ptr) override; std::unique_ptr make_region(size_t size, size_t alignment) override; }; -} // arm_compute +} // namespace arm_compute #endif /*ARM_COMPUTE_CLBUFFERALLOCATOR_H */ diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h index 26e459680c1487ba89275803b63354e74d06fe87..cf757239cbfe18b279fcebf90400d4a1a54f18fe 100644 --- a/arm_compute/runtime/CL/CLFunctions.h +++ b/arm_compute/runtime/CL/CLFunctions.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CLFUNCTIONS_H -#define ARM_COMPUTE_CLFUNCTIONS_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_CLFUNCTIONS_H +#define ACL_ARM_COMPUTE_RUNTIME_CL_CLFUNCTIONS_H /* Header regrouping all the CL functions */ #include "arm_compute/runtime/CL/functions/CLActivationLayer.h" @@ -62,44 +62,44 @@ #include "arm_compute/runtime/CL/functions/CLFloor.h" #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h" #include "arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h" +#include "arm_compute/runtime/CL/functions/CLGather.h" #include "arm_compute/runtime/CL/functions/CLGEMM.h" #include "arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h" #include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h" #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h" -#include "arm_compute/runtime/CL/functions/CLGather.h" #include "arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h" #include "arm_compute/runtime/CL/functions/CLIndirectConvolutionLayer.h" #include "arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h" #include "arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h" -#include "arm_compute/runtime/CL/functions/CLLSTMLayer.h" -#include "arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h" #include "arm_compute/runtime/CL/functions/CLLogicalAnd.h" #include "arm_compute/runtime/CL/functions/CLLogicalNot.h" #include "arm_compute/runtime/CL/functions/CLLogicalOr.h" +#include "arm_compute/runtime/CL/functions/CLLSTMLayer.h" +#include "arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h" #include "arm_compute/runtime/CL/functions/CLMatMul.h" #include "arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h" #include "arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h" #include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h" #include "arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h" -#include "arm_compute/runtime/CL/functions/CLPReluLayer.h" #include "arm_compute/runtime/CL/functions/CLPadLayer.h" #include "arm_compute/runtime/CL/functions/CLPermute.h" #include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h" #include "arm_compute/runtime/CL/functions/CLPooling3dLayer.h" #include 
"arm_compute/runtime/CL/functions/CLPoolingLayer.h" +#include "arm_compute/runtime/CL/functions/CLPReluLayer.h" #include "arm_compute/runtime/CL/functions/CLPriorBoxLayer.h" #include "arm_compute/runtime/CL/functions/CLQLSTMLayer.h" #include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h" -#include "arm_compute/runtime/CL/functions/CLRNNLayer.h" -#include "arm_compute/runtime/CL/functions/CLROIAlignLayer.h" -#include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h" #include "arm_compute/runtime/CL/functions/CLRange.h" #include "arm_compute/runtime/CL/functions/CLReduceMean.h" #include "arm_compute/runtime/CL/functions/CLReductionOperation.h" #include "arm_compute/runtime/CL/functions/CLReorgLayer.h" #include "arm_compute/runtime/CL/functions/CLReshapeLayer.h" #include "arm_compute/runtime/CL/functions/CLReverse.h" +#include "arm_compute/runtime/CL/functions/CLRNNLayer.h" +#include "arm_compute/runtime/CL/functions/CLROIAlignLayer.h" +#include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h" #include "arm_compute/runtime/CL/functions/CLScale.h" #include "arm_compute/runtime/CL/functions/CLSelect.h" #include "arm_compute/runtime/CL/functions/CLSlice.h" @@ -114,4 +114,4 @@ #include "arm_compute/runtime/CL/functions/CLUnstack.h" #include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h" -#endif /* ARM_COMPUTE_CLFUNCTIONS_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_CL_CLFUNCTIONS_H diff --git a/arm_compute/runtime/CL/CLMemory.h b/arm_compute/runtime/CL/CLMemory.h index 7adee66c7382fea358674a5a1661565748a629a0..5abe86bd5338b100b0c74959fb9c0d0993bdbb9f 100644 --- a/arm_compute/runtime/CL/CLMemory.h +++ b/arm_compute/runtime/CL/CLMemory.h @@ -24,10 +24,9 @@ #ifndef ARM_COMPUTE_RUNTIME_CL_CLMEMORY_H #define ARM_COMPUTE_RUNTIME_CL_CLMEMORY_H -#include "arm_compute/runtime/IMemory.h" - #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/runtime/CL/CLMemoryRegion.h" +#include "arm_compute/runtime/IMemory.h" #include #include @@ -75,8 +74,8 @@ public: // Inherited methods overridden: IMemoryRegion *region() final; IMemoryRegion *region() const final; - void set_region(IMemoryRegion *region) final; - void set_owned_region(std::unique_ptr region) final; + void set_region(IMemoryRegion *region) final; + void set_owned_region(std::unique_ptr region) final; private: ICLMemoryRegion *_region; diff --git a/arm_compute/runtime/CL/CLMemoryRegion.h b/arm_compute/runtime/CL/CLMemoryRegion.h index 66a30fa56bb96f8c8de54cac617483f755184c62..365973a9e6a0ab2e456edce118025632b0d7aefa 100644 --- a/arm_compute/runtime/CL/CLMemoryRegion.h +++ b/arm_compute/runtime/CL/CLMemoryRegion.h @@ -110,7 +110,7 @@ public: // Inherited methods overridden : void *ptr() final; void *map(cl::CommandQueue &q, bool blocking) final; - void unmap(cl::CommandQueue &q) final; + void unmap(cl::CommandQueue &q) final; }; /** OpenCL SVM memory region interface */ @@ -156,7 +156,7 @@ public: // Inherited methods overridden : void *map(cl::CommandQueue &q, bool blocking) final; - void unmap(cl::CommandQueue &q) final; + void unmap(cl::CommandQueue &q) final; }; /** OpenCL fine-grain SVM memory region implementation */ @@ -173,7 +173,7 @@ public: // Inherited methods overridden : void *map(cl::CommandQueue &q, bool blocking) final; - void unmap(cl::CommandQueue &q) final; + void unmap(cl::CommandQueue &q) final; }; } // namespace arm_compute #endif /* ARM_COMPUTE_RUNTIME_CL_CL_MEMORY_REGION_H */ diff --git a/arm_compute/runtime/CL/CLRuntimeContext.h b/arm_compute/runtime/CL/CLRuntimeContext.h index 
dd17645fa705c8369d835f7b484434680bf6780a..2ed4b747968aa6e728223881b1182436d55bea33 100644 --- a/arm_compute/runtime/CL/CLRuntimeContext.h +++ b/arm_compute/runtime/CL/CLRuntimeContext.h @@ -54,11 +54,11 @@ public: CLKernelLibrary &kernel_library(); private: - std::unique_ptr _gpu_owned_scheduler{ nullptr }; - CLScheduler *_gpu_scheduler{ nullptr }; - CLTuner _tuner{ false }; + std::unique_ptr _gpu_owned_scheduler{nullptr}; + CLScheduler *_gpu_scheduler{nullptr}; + CLTuner _tuner{false}; CLSymbols _symbols{}; - CLBackendType _backend_type{ CLBackendType::Native }; + CLBackendType _backend_type{CLBackendType::Native}; }; } // namespace arm_compute #endif /*ARM_COMPUTE_CLRUNTIME_CONTEXT_H */ diff --git a/arm_compute/runtime/CL/CLScheduler.h b/arm_compute/runtime/CL/CLScheduler.h index 3030239270be122bed9fde6128b2bc4b3de91d8e..b74fcb74efe8946a8929a8767ff6217b98a9b34b 100644 --- a/arm_compute/runtime/CL/CLScheduler.h +++ b/arm_compute/runtime/CL/CLScheduler.h @@ -28,8 +28,8 @@ #include "arm_compute/core/CL/CLTypes.h" #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLGEMMHeuristicsHandle.h" #include "arm_compute/runtime/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLTypes.h" @@ -63,7 +63,9 @@ public: * @param[in] gemm_h (Optional) Pointer to CLGEMMHeuristicsHandle (default = nullptr) * @param[in] cl_backend_type (Optional) Type of backend to use (default = CLBackendType::Native) */ - void default_init(ICLTuner *cl_tuner = nullptr, CLGEMMHeuristicsHandle *gemm_h = nullptr, CLBackendType cl_backend_type = CLBackendType::Native); + void default_init(ICLTuner *cl_tuner = nullptr, + CLGEMMHeuristicsHandle *gemm_h = nullptr, + CLBackendType cl_backend_type = CLBackendType::Native); /** Initialises the scheduler with context and device provided by the user * * @param[in] device OpenCL device to be used @@ -71,7 +73,10 @@ public: * @param[in] cl_tuner (Optional) Pointer to ICLTuner (default=nullptr) * @param[in] gemm_h (Optional) Pointer to CLGEMMHeuristicsHandle (default = nullptr) */ - void default_init_with_context(cl::Device &device, cl::Context &ctx, ICLTuner *cl_tuner = nullptr, CLGEMMHeuristicsHandle *gemm_h = nullptr); + void default_init_with_context(cl::Device &device, + cl::Context &ctx, + ICLTuner *cl_tuner = nullptr, + CLGEMMHeuristicsHandle *gemm_h = nullptr); /** Re-initializes the context and command queue used by the scheduler to default values * and sets a default device and kernel path for the @ref CLKernelLibrary. @@ -80,7 +85,9 @@ public: * @param[in] gemm_h (Optional) Pointer to CLGEMMHeuristicsHandle (default = nullptr) * @param[in] cl_backend_type (Optional) Type of backend to use (default = CLBackendType::Native) */ - void default_reinit(ICLTuner *cl_tuner = nullptr, CLGEMMHeuristicsHandle *gemm_h = nullptr, CLBackendType cl_backend_type = CLBackendType::Native); + void default_reinit(ICLTuner *cl_tuner = nullptr, + CLGEMMHeuristicsHandle *gemm_h = nullptr, + CLBackendType cl_backend_type = CLBackendType::Native); /** Schedule the execution of the passed kernel if possible. 
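The CLScheduler hunks around this point only re-wrap default_init(), default_init_with_context(), default_reinit() and (just below) init() onto one parameter per line; arguments and defaults are unchanged. A minimal usage sketch, not taken from this patch, assuming a working OpenCL device and relying on the defaulted gemm_h and cl_backend_type parameters:

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTuner.h"

    using namespace arm_compute;

    int main()
    {
        // Assumption: kernels without cached tuning parameters should be tuned on first run;
        // gemm_h and cl_backend_type keep their defaults (nullptr, CLBackendType::Native).
        CLTuner tuner(/*tune_new_kernels=*/true);
        CLScheduler::get().default_init(&tuner);

        // ... configure and run CL functions here ...

        CLScheduler::get().sync(); // wait for the command queue to drain
        return 0;
    }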
* @@ -105,8 +112,12 @@ public: * @param[in] gemm_h (Optional) Pointer to CLGEMMHeuristicsHandle (default = nullptr) * @param[in] cl_backend_type (Optional) Type of backend to use (default = CLBackendType::Native) */ - void init(cl::Context context, cl::CommandQueue queue, const cl::Device &device, ICLTuner *cl_tuner = nullptr, CLGEMMHeuristicsHandle *gemm_h = nullptr, - CLBackendType cl_backend_type = CLBackendType::Native); + void init(cl::Context context, + cl::CommandQueue queue, + const cl::Device &device, + ICLTuner *cl_tuner = nullptr, + CLGEMMHeuristicsHandle *gemm_h = nullptr, + CLBackendType cl_backend_type = CLBackendType::Native); /** Accessor for the associated CL context. * diff --git a/arm_compute/runtime/CL/CLSubTensor.h b/arm_compute/runtime/CL/CLSubTensor.h index 0a7f5f89b25534e16dfa1c7102ac72bd81ff53ec..c18df8086a0deca43b58e7b2f09925437918eb02 100644 --- a/arm_compute/runtime/CL/CLSubTensor.h +++ b/arm_compute/runtime/CL/CLSubTensor.h @@ -46,7 +46,10 @@ public: * @param[in] coords Coordinates of the first subtensor element inside the parent tensor. * @param[in] extend_parent (Optional) Extend parent with subtensor shape if subtensor indexes out of bounds */ - CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords, bool extend_parent = false); + CLSubTensor(ICLTensor *parent, + const TensorShape &tensor_shape, + const Coordinates &coords, + bool extend_parent = false); /** Destructor: free the tensor's memory */ ~CLSubTensor() = default; /** Restrict instances of this class to be copy constructed */ @@ -93,11 +96,11 @@ public: protected: // Inherited methods overridden: uint8_t *do_map(cl::CommandQueue &q, bool blocking) override; - void do_unmap(cl::CommandQueue &q) override; + void do_unmap(cl::CommandQueue &q) override; private: ICLTensor *_parent; mutable SubTensorInfo _info; }; -} +} // namespace arm_compute #endif /*ARM_COMPUTE_CLSUBTENSOR_H */ diff --git a/arm_compute/runtime/CL/CLTensor.h b/arm_compute/runtime/CL/CLTensor.h index ae73351f275f77ea5b145b86307ab721622d0c72..0729935e9e35b7e91ca6435431ba6e90852c3b35 100644 --- a/arm_compute/runtime/CL/CLTensor.h +++ b/arm_compute/runtime/CL/CLTensor.h @@ -87,17 +87,17 @@ public: TensorInfo *info() override; const cl::Buffer &cl_buffer() const override; CLQuantization quantization() const override; - void associate_memory_group(IMemoryGroup *memory_group) override; + void associate_memory_group(IMemoryGroup *memory_group) override; CLRuntimeContext *context(); protected: // Inherited methods overridden: uint8_t *do_map(cl::CommandQueue &q, bool blocking) override; - void do_unmap(cl::CommandQueue &q) override; + void do_unmap(cl::CommandQueue &q) override; private: mutable CLTensorAllocator _allocator; /**< Instance of the OpenCL tensor allocator */ - CLRuntimeContext *_ctx{ nullptr }; + CLRuntimeContext *_ctx{nullptr}; }; /** OpenCL Image */ diff --git a/arm_compute/runtime/CL/CLTensorAllocator.h b/arm_compute/runtime/CL/CLTensorAllocator.h index 1b061ee1d68481c0837cd159743fd56a7391f313..fde8e9c43a2ec108f021fcb455764e3ab71d5076 100644 --- a/arm_compute/runtime/CL/CLTensorAllocator.h +++ b/arm_compute/runtime/CL/CLTensorAllocator.h @@ -24,15 +24,14 @@ #ifndef ARM_COMPUTE_CLTENSORALLOCATOR_H #define ARM_COMPUTE_CLTENSORALLOCATOR_H +#include "arm_compute/core/CL/CLTypes.h" +#include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/runtime/CL/CLArray.h" #include "arm_compute/runtime/CL/CLMemory.h" #include "arm_compute/runtime/IAllocator.h" #include 
"arm_compute/runtime/ITensorAllocator.h" #include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/core/CL/CLTypes.h" -#include "arm_compute/core/CL/OpenCL.h" - #include namespace arm_compute @@ -148,7 +147,7 @@ private: static const cl::Buffer _empty_buffer; private: - CLRuntimeContext *_ctx; + CLRuntimeContext *_ctx; IMemoryManageable *_owner; /**< Memory manageable object that owns the allocator */ IMemoryGroup *_associated_memory_group; /**< Registered memory manager */ CLMemory _memory; /**< OpenCL memory */ diff --git a/arm_compute/runtime/CL/CLTuner.h b/arm_compute/runtime/CL/CLTuner.h index 93aa45adc19ac835914c09238c438271af4d7aae..cf293d3d27aaba3062c9849c10f418a9076ffadc 100644 --- a/arm_compute/runtime/CL/CLTuner.h +++ b/arm_compute/runtime/CL/CLTuner.h @@ -153,9 +153,9 @@ private: std::unordered_map _tuning_params_table; std::unordered_map _lws_table; - cl::Event _kernel_event; - bool _tune_new_kernels; - CLTuningInfo _tuning_info; + cl::Event _kernel_event; + bool _tune_new_kernels; + CLTuningInfo _tuning_info; }; } // namespace arm_compute #endif /*ARM_COMPUTE_CLTUNER_H */ diff --git a/arm_compute/runtime/CL/CLTunerTypes.h b/arm_compute/runtime/CL/CLTunerTypes.h index 508cafac9549df4860642115d089111d2fcb1fe4..d9b914676a68ed03dda61c5beb09fb978aae8dfb 100644 --- a/arm_compute/runtime/CL/CLTunerTypes.h +++ b/arm_compute/runtime/CL/CLTunerTypes.h @@ -43,7 +43,7 @@ enum class CLTunerMode struct CLTuningInfo { CLTunerMode tuner_mode = CLTunerMode::NORMAL; /**< Parameter to select the level (granularity) of the tuning */ - bool tune_wbsm = false; /**< Flag to tune the batches of work groups distributed to compute units. + bool tune_wbsm = false; /**< Flag to tune the batches of work groups distributed to compute units. Internally, the library will check if this feature is available on the target platform. 
This OpenCL tuner extension is still in experimental phase */ }; @@ -56,11 +56,10 @@ struct CLTuningInfo */ inline CLTunerMode tuner_mode_from_name(const std::string &name) { - static const std::map tuner_modes = - { - { "exhaustive", CLTunerMode::EXHAUSTIVE }, - { "normal", CLTunerMode::NORMAL }, - { "rapid", CLTunerMode::RAPID }, + static const std::map tuner_modes = { + {"exhaustive", CLTunerMode::EXHAUSTIVE}, + {"normal", CLTunerMode::NORMAL}, + {"rapid", CLTunerMode::RAPID}, }; #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED @@ -71,7 +70,7 @@ inline CLTunerMode tuner_mode_from_name(const std::string &name) #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED } - catch(const std::out_of_range &) + catch (const std::out_of_range &) { throw std::invalid_argument(name); } diff --git a/arm_compute/runtime/CL/CLTuningParams.h b/arm_compute/runtime/CL/CLTuningParams.h index 1e5ab25c26e7cd60dc8711d52f42b7b64218c270..a876fad112dbb0d213abede760421a2b09886c6c 100644 --- a/arm_compute/runtime/CL/CLTuningParams.h +++ b/arm_compute/runtime/CL/CLTuningParams.h @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/runtime/CL/CLTunerTypes.h" + #include "support/StringSupport.h" #include @@ -36,8 +37,7 @@ namespace arm_compute class CLTuningParams { public: - CLTuningParams(const CLTuningParams &tuning_params) - : _lws(tuning_params._lws), _wbsm(tuning_params._wbsm) + CLTuningParams(const CLTuningParams &tuning_params) : _lws(tuning_params._lws), _wbsm(tuning_params._wbsm) { } @@ -45,18 +45,16 @@ public: : _lws(lws_x, lws_y, lws_z), _wbsm(wbsm) { } - CLTuningParams(cl::NDRange lws, cl_int wbsm = 0) - : _lws(lws), _wbsm(wbsm) + CLTuningParams(cl::NDRange lws, cl_int wbsm = 0) : _lws(lws), _wbsm(wbsm) { } - CLTuningParams(cl_int wbsm) - : CLTuningParams(cl::NullRange, wbsm) + CLTuningParams(cl_int wbsm) : CLTuningParams(cl::NullRange, wbsm) { } - CLTuningParams& operator=(const CLTuningParams &other) + CLTuningParams &operator=(const CLTuningParams &other) { - _lws = other._lws; + _lws = other._lws; _wbsm = other._wbsm; return *this; } @@ -84,8 +82,9 @@ public: std::string to_string(CLTuningInfo tuning_info) { std::string tuning_params_string = ""; - tuning_params_string += ";" + support::cpp11::to_string(_lws[0]) + ";" + support::cpp11::to_string(_lws[1]) + ";" + support::cpp11::to_string(_lws[2]); - if(tuning_info.tune_wbsm) + tuning_params_string += ";" + support::cpp11::to_string(_lws[0]) + ";" + support::cpp11::to_string(_lws[1]) + + ";" + support::cpp11::to_string(_lws[2]); + if (tuning_info.tune_wbsm) { tuning_params_string += ";" + support::cpp11::to_string(_wbsm); } @@ -98,19 +97,19 @@ public: std::vector array; std::stringstream ss(tuning_params_string); std::string temp; - while(ss >> temp) + while (ss >> temp) { array.push_back(temp); } // Read 3 values for lws - if(array.size() < 3) + if (array.size() < 3) { return false; } const unsigned int lws_0 = support::cpp11::stoi(array[0]); const unsigned int lws_1 = support::cpp11::stoi(array[1]); const unsigned int lws_2 = support::cpp11::stoi(array[2]); - if(lws_0 == 0 && lws_1 == 0 && lws_2 == 0) + if (lws_0 == 0 && lws_1 == 0 && lws_2 == 0) { // If lws values are 0, cl::NullRange has to be used // otherwise the lws object will be badly created @@ -121,9 +120,9 @@ public: _lws = cl::NDRange(lws_0, lws_1, lws_2); } array.erase(array.begin(), array.begin() + 3); - if(tuning_info.tune_wbsm) + if (tuning_info.tune_wbsm) { - if(array.size() < 1) + if (array.size() < 1) { return false; } diff --git a/arm_compute/runtime/CL/CLTypes.h 
b/arm_compute/runtime/CL/CLTypes.h index d298ecd61425376e16926273f270f7bc389d678d..931740c47f258f8da17a29603619069e56989cb7 100644 --- a/arm_compute/runtime/CL/CLTypes.h +++ b/arm_compute/runtime/CL/CLTypes.h @@ -43,12 +43,12 @@ enum class CLGEMMKernelType /** OpenCL GEMM kernel selection parameters. These information are retrieved to select the GEMM kernel on OpenCL */ struct CLGEMMKernelSelectionParams { - unsigned int m{ 0 }; /**< Number of rows for the lhs matrix. Lhs matrix NOT transposed */ - unsigned int n{ 0 }; /**< Number of columns for the rhs matrix. Rhs matrix NOT transposed */ - unsigned int k{ 0 }; /**< Number of rows for the rhs matrix. Rhs matrix NOT transposed */ - unsigned int b{ 0 }; /**< Batch size */ - bool is_rhs_constant{ false }; /**< True if the content of the rhs matrix is constant */ - DataType data_type{ DataType::UNKNOWN }; /**< Data type */ + unsigned int m{0}; /**< Number of rows for the lhs matrix. Lhs matrix NOT transposed */ + unsigned int n{0}; /**< Number of columns for the rhs matrix. Rhs matrix NOT transposed */ + unsigned int k{0}; /**< Number of rows for the rhs matrix. Rhs matrix NOT transposed */ + unsigned int b{0}; /**< Batch size */ + bool is_rhs_constant{false}; /**< True if the content of the rhs matrix is constant */ + DataType data_type{DataType::UNKNOWN}; /**< Data type */ }; /** List the possible OpenCL backends */ diff --git a/arm_compute/runtime/CL/ICLGEMMKernelSelection.h b/arm_compute/runtime/CL/ICLGEMMKernelSelection.h index 7be939338869a00b319d5407bc7f4144d12e6921..5a71a61203c1f6fa868666120d63672a9ed0e9bf 100644 --- a/arm_compute/runtime/CL/ICLGEMMKernelSelection.h +++ b/arm_compute/runtime/CL/ICLGEMMKernelSelection.h @@ -40,8 +40,7 @@ public: * * @param[in] arch GPU target */ - ICLGEMMKernelSelection(GPUTarget arch) - : _target(arch) + ICLGEMMKernelSelection(GPUTarget arch) : _target(arch) { } /** Default Move Constructor. 
*/ @@ -59,7 +58,8 @@ public: virtual CLGEMMKernelType select_kernel(const CLGEMMKernelSelectionParams ¶ms) = 0; protected: - GPUTarget _target; /**< GPU target could be used to call a dedicated heuristic for each GPU IP for a given GPU architecture */ + GPUTarget + _target; /**< GPU target could be used to call a dedicated heuristic for each GPU IP for a given GPU architecture */ }; } // namespace cl_gemm } // namespace arm_compute diff --git a/arm_compute/runtime/CL/ICLOperator.h b/arm_compute/runtime/CL/ICLOperator.h index 38bcaf32f2d5a6d0e0485523b7dc4e5131f737ab..c0826e7733d8f22e2614353b12485fd0efce24fb 100644 --- a/arm_compute/runtime/CL/ICLOperator.h +++ b/arm_compute/runtime/CL/ICLOperator.h @@ -25,7 +25,6 @@ #define ARM_COMPUTE_ICLOPERATOR_H #include "arm_compute/core/Types.h" - #include "arm_compute/runtime/IOperator.h" #include "arm_compute/runtime/IRuntimeContext.h" #include "arm_compute/runtime/Types.h" @@ -56,8 +55,8 @@ public: ICLOperator &operator=(ICLOperator &&) = default; // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; MemoryRequirements workspace() const override; protected: diff --git a/arm_compute/runtime/CL/functions/CLActivationLayer.h b/arm_compute/runtime/CL/functions/CLActivationLayer.h index 4a718ab4b6c2162fdedbc0e74a9baf3bd59483ca..e158efa0937a7333da904aeefc46c4f99c7d6478 100644 --- a/arm_compute/runtime/CL/functions/CLActivationLayer.h +++ b/arm_compute/runtime/CL/functions/CLActivationLayer.h @@ -24,11 +24,10 @@ #ifndef ARM_COMPUTE_CLACTIVATIONLAYER_H #define ARM_COMPUTE_CLACTIVATIONLAYER_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" #include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/CL/CLRuntimeContext.h" +#include "arm_compute/runtime/IFunction.h" namespace arm_compute { @@ -91,7 +90,10 @@ public: * @param[out] output Destination tensor. Data type supported: same as @p input * @param[in] act_info Activation layer parameters. */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + ActivationLayerInfo act_info); /** Static function to check if given info will lead to a valid configuration of @ref CLActivationLayer * * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result diff --git a/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h b/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h index ce5bee8d9528893741289433af77d97fd5b75788..d340d20a1fc1b57531a4f92ba9547b8d63096afb 100644 --- a/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h +++ b/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h @@ -91,7 +91,11 @@ public: * @param[out] output Output source tensor. Data types supported: U32/S32. * @param[in] op Reduction operation to perform. 
Operations supported: ARG_IDX_MAX, ARG_IDX_MIN */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + int axis, + ICLTensor *output, + const ReductionOperation &op); /** Static function to check if given info will lead to a valid configuration of @ref CLArgMinMaxLayer * * @param[in] input Input source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32. diff --git a/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h index 37a0680709984012f4672f0028572c67fb0c3118..f57bc8fe8b170b636290e35ac7fe65cdd503a1c5 100644 --- a/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h +++ b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h @@ -24,10 +24,9 @@ #ifndef ARM_COMPUTE_CLBATCHNORMALIZATIONLAYER_H #define ARM_COMPUTE_CLBATCHNORMALIZATIONLAYER_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" #include "arm_compute/function_info/ActivationLayerInfo.h" +#include "arm_compute/runtime/IFunction.h" #include @@ -84,7 +83,13 @@ public: * @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. */ - void configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr, const ICLTensor *gamma = nullptr, float epsilon = 0.001f, + void configure(ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor *beta = nullptr, + const ICLTensor *gamma = nullptr, + float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo()); /** Set the input and output tensors. * @@ -102,9 +107,15 @@ public: * @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr, - const ICLTensor *gamma = nullptr, - float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor *beta = nullptr, + const ICLTensor *gamma = nullptr, + float epsilon = 0.001f, + ActivationLayerInfo act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLBatchNormalizationLayer * * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result. 
@@ -120,10 +131,14 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr, - float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta = nullptr, + const ITensorInfo *gamma = nullptr, + float epsilon = 0.001f, + ActivationLayerInfo act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h b/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h index 861330b9d4a6507fe43b5e0da739aaa9dfa313fb..20b9fdafede585dfc261569377443ff482f84d3f 100644 --- a/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h +++ b/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_CLBATCHTOSPACELAYER_H #define ARM_COMPUTE_CLBATCHTOSPACELAYER_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include @@ -82,7 +81,10 @@ public: * @deprecated This method for dynamic block shape is not fully mature and will be removed in 23.08 release */ ARM_COMPUTE_DEPRECATED_REL(23.05) - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + ICLTensor *output); /** Set the input and output tensors. (Static block shape). * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -91,7 +93,11 @@ public: * @param[out] output Tensor output. Data types supported: same as @p input * @param[in] crop_info Information about how the output shape is cropped after batch to space is performed */ - void configure(const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info = CropInfo{}); + void configure(const ICLTensor *input, + int32_t block_shape_x, + int32_t block_shape_y, + ICLTensor *output, + const CropInfo &crop_info = CropInfo{}); /** Set the input and output tensors. (Static block shape). * * @param[in] compile_context The compile context to be used. @@ -101,7 +107,12 @@ public: * @param[out] output Tensor output. Data types supported: same as @p input * @param[in] crop_info Information about how the output shape is cropped after batch to space is performed */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info = CropInfo{}); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + int32_t block_shape_x, + int32_t block_shape_y, + ICLTensor *output, + const CropInfo &crop_info = CropInfo{}); /** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayer * * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All. 
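CLBatchToSpaceLayer above keeps both block-shape paths (the deprecated dynamic one and the static one with its defaulted CropInfo); only include order and the parameter layout of configure()/validate() change. A minimal sketch of the static-block-shape overload, not from this patch; the shapes and data type below are assumptions chosen so that the batch is divisible by the block area:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        // Assumed shapes, NCHW [W, H, C, N]: batch 8 is divisible by
        // block_shape_x * block_shape_y = 4, so a 2x2 block maps batch 8 to batch 2.
        CLTensor input, output;
        input.allocator()->init(TensorInfo(TensorShape(4U, 4U, 1U, 8U), 1, DataType::F32));
        output.allocator()->init(TensorInfo(TensorShape(8U, 8U, 1U, 2U), 1, DataType::F32));

        CLBatchToSpaceLayer b2s;
        b2s.configure(&input, /*block_shape_x=*/2, /*block_shape_y=*/2, &output); // CropInfo{} by default

        input.allocator()->allocate();
        output.allocator()->allocate();

        b2s.run();
        CLScheduler::get().sync();
        return 0;
    }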
@@ -124,7 +135,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info = CropInfo{}); + static Status validate(const ITensorInfo *input, + int32_t block_shape_x, + int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info = CropInfo{}); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLBitwiseAnd.h b/arm_compute/runtime/CL/functions/CLBitwiseAnd.h index b30be9b24f8ccd18e662bbc7b6ec69858a371298..f82af3af9b83832c2440f76df9d2802042a50db4 100644 --- a/arm_compute/runtime/CL/functions/CLBitwiseAnd.h +++ b/arm_compute/runtime/CL/functions/CLBitwiseAnd.h @@ -61,7 +61,10 @@ public: * @param[in] input2 Input tensor. Data types supported: U8. * @param[out] output Output tensor. Data types supported: U8. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output); }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_CLBITWISEAND_H */ diff --git a/arm_compute/runtime/CL/functions/CLBitwiseNot.h b/arm_compute/runtime/CL/functions/CLBitwiseNot.h index 1456ebe57e77827cf69378683082c723a2d0e4ab..31f8e8680285ed7c4f82c3091bc65fa0983ad09e 100644 --- a/arm_compute/runtime/CL/functions/CLBitwiseNot.h +++ b/arm_compute/runtime/CL/functions/CLBitwiseNot.h @@ -60,5 +60,5 @@ public: */ void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output); }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_CLBITWISENOT_H */ diff --git a/arm_compute/runtime/CL/functions/CLBitwiseOr.h b/arm_compute/runtime/CL/functions/CLBitwiseOr.h index ff0a1f0d73e6aa2f0d83f869026e7d7f3b036753..9a25a2099e87caab217266cb6abe9176a8dbee9a 100644 --- a/arm_compute/runtime/CL/functions/CLBitwiseOr.h +++ b/arm_compute/runtime/CL/functions/CLBitwiseOr.h @@ -61,7 +61,10 @@ public: * @param[in] input2 Input tensor. Data types supported: U8. * @param[out] output Output tensor. Data types supported: U8. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output); }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_CLBITWISEOR_H */ diff --git a/arm_compute/runtime/CL/functions/CLBitwiseXor.h b/arm_compute/runtime/CL/functions/CLBitwiseXor.h index 0cd9d073b4942910581027d7e7b2377b6cbebdc9..9e288ef7b6f777a793bc6f2f9177116268cbbe05 100644 --- a/arm_compute/runtime/CL/functions/CLBitwiseXor.h +++ b/arm_compute/runtime/CL/functions/CLBitwiseXor.h @@ -61,7 +61,10 @@ public: * @param[in] input2 Input tensor. Data types supported: U8. * @param[out] output Output tensor. Data types supported: U8. 
*/ - void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output); }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_CLBITWISEXOR_H */ diff --git a/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h b/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h index d3499c3949a6c946a654b800bc58c03d5678b34f..dba5497f5d00b4947ca677becf7684c20711d019 100644 --- a/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h +++ b/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h @@ -64,7 +64,10 @@ public: * * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct. */ - void configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info); + void configure(const ICLTensor *boxes, + ICLTensor *pred_boxes, + const ICLTensor *deltas, + const BoundingBoxTransformInfo &info); /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. @@ -76,7 +79,11 @@ public: * * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *boxes, + ICLTensor *pred_boxes, + const ICLTensor *deltas, + const BoundingBoxTransformInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLBoundingBoxTransform * @@ -90,7 +97,10 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info); + static Status validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info); }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLBOUNDINGBOXTRANSFORM_H */ diff --git a/arm_compute/runtime/CL/functions/CLCast.h b/arm_compute/runtime/CL/functions/CLCast.h index 650cd11b9b9db2abbd9a020f474a0f22e11fb06a..9433f08fac86ec0c305f765e9f701788c7c4edd5 100644 --- a/arm_compute/runtime/CL/functions/CLCast.h +++ b/arm_compute/runtime/CL/functions/CLCast.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_CLCAST_H #define ARM_COMPUTE_CLCAST_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include @@ -79,7 +78,8 @@ public: */ void configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy); // Initialize the function's source, destination - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy); + void + configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy); /** Static function to check if given info will lead to a valid configuration of @ref CLCast * * @param[in] input Source tensor info. Data types supported: U8/S8/U16/S16/U32/S32/U64/S64/F16/F32. 
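The CLCast hunk above is formatting only: two includes swap order and the compile-context configure() is re-wrapped with its return type on its own line; the argument list is unchanged. A minimal sketch of the plain configure(input, output, policy) overload shown in the same hunk, not from this patch; the U8-to-F32 conversion and the 16x16 shape are assumed for illustration:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLCast.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        // The destination data type selects the cast target, so dst is initialised explicitly.
        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::U8));
        dst.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));

        CLCast cast;
        cast.configure(&src, &dst, ConvertPolicy::SATURATE);

        src.allocator()->allocate();
        dst.allocator()->allocate();

        cast.run();
        CLScheduler::get().sync();
        return 0;
    }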
diff --git a/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h b/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h index 3dc62595d290b49fc57bf4ac1b8ab700b4880cdc..8ca848a0206f72db97a762d94f27e8eb49f22629 100644 --- a/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h +++ b/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h @@ -65,7 +65,10 @@ public: * @param[out] output Output tensor. Data type supported: Same as @p input * @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + unsigned int num_groups); /** Static function to check if given info will lead to a valid configuration of @ref CLChannelShuffleLayerKernel * * @param[in] input Input tensor info. Data types supported: All. diff --git a/arm_compute/runtime/CL/functions/CLComparison.h b/arm_compute/runtime/CL/functions/CLComparison.h index 3f984900ee74f7c537545daa8983b89fa411147d..fca4b168b02379372b3eaf92beb36b29f9c61e2a 100644 --- a/arm_compute/runtime/CL/functions/CLComparison.h +++ b/arm_compute/runtime/CL/functions/CLComparison.h @@ -66,7 +66,11 @@ public: * @param[out] output Destination tensor. Data types supported: U8. * @param[out] operation Comparison operation to be used. */ - void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ComparisonOperation operation); + void configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + ComparisonOperation operation); /** Static function to check if given info will lead to a valid configuration of @ref CLComparison * * @param[in] input1 Source tensor. Data types supported: All. @@ -76,7 +80,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ComparisonOperation operation); }; /** Basic function to run @ref CLComparisonKernel */ diff --git a/arm_compute/runtime/CL/functions/CLConcatenateLayer.h b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h index 71e84e21b5c3cf98dab3579bfae0ef55f95a4227..88c4bed595f196698430f4c357ccebe1f955fb32 100644 --- a/arm_compute/runtime/CL/functions/CLConcatenateLayer.h +++ b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_CLCONCATENATELAYER_H #define ARM_COMPUTE_CLCONCATENATELAYER_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include #include @@ -95,7 +94,10 @@ public: * @param[out] output Output tensor. Data types supported: Same as @p input. * @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3. 
*/ - void configure(const CLCompileContext &compile_context, std::vector &inputs_vector, ICLTensor *output, size_t axis); + void configure(const CLCompileContext &compile_context, + std::vector &inputs_vector, + ICLTensor *output, + size_t axis); /** Static function to check if given info will lead to a valid configuration of @ref CLConcatenateLayer * * @note Input and output tensor dimensions preconditions defer depending on the concatenation axis. @@ -108,7 +110,8 @@ public: * * @return a status */ - static Status validate(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis); + static Status + validate(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLConv3D.h b/arm_compute/runtime/CL/functions/CLConv3D.h index 5728fe79d82dfa0efe17f1f501d5aae7e2154d52..aabaf01ab7d99a003bd23f56d8663059bf614d99 100644 --- a/arm_compute/runtime/CL/functions/CLConv3D.h +++ b/arm_compute/runtime/CL/functions/CLConv3D.h @@ -77,20 +77,33 @@ public: * @param[in] conv3d_info Contains strides, padding, rounding, activation, dilation and fast math information. Activation and fast math are currently unused. * */ - void configure(const CLCompileContext &compile_context, const ICLTensor *src, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *dst, const Conv3dInfo &conv3d_info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *src, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *dst, + const Conv3dInfo &conv3d_info); /** Set the src and dst tensors. * * Similar to CLConv3D::configure() but using the default compile context * */ - void configure(const ICLTensor *src, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *dst, const Conv3dInfo &conv3d_info); + void configure(const ICLTensor *src, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *dst, + const Conv3dInfo &conv3d_info); /** Static function to check if given info will lead to a valid configuration of @ref CLConv3D * * Similar to CLConv3D::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv3dInfo &conv3d_info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv3dInfo &conv3d_info); // Inherited methods overridden: void run() override; @@ -99,5 +112,5 @@ private: struct Impl; std::unique_ptr _impl; }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_CLCONVOLUTION3DLAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h b/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h index 0a634b548280878653817bb57e79502b750baf83..409430d59500f3ccfadc8b76939ac8e14b9bc812 100644 --- a/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h +++ b/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h @@ -69,7 +69,10 @@ public: * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer). * @param[in] data_layout The data layout the weights have been trained in. 
*/ - void configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, DataLayout data_layout); + void configure(const ICLTensor *input, + ICLTensor *output, + const TensorShape &original_input_shape, + DataLayout data_layout); /** Initialize the function. * * @param[in] compile_context The compile context to be used. @@ -78,7 +81,11 @@ public: * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer). * @param[in] data_layout The data layout the weights have been trained in. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, DataLayout data_layout); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const TensorShape &original_input_shape, + DataLayout data_layout); /** Static function to check if given info will lead to a valid configuration of @ref CLConvertFullyConnectedWeights * * @param[in] input Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All. @@ -86,7 +93,10 @@ public: * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer). * @param[in] data_layout The data layout the weights have been trained in. */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, DataLayout data_layout); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const TensorShape &original_input_shape, + DataLayout data_layout); // Inherited methods overridden: void run() override; @@ -144,7 +154,10 @@ public: * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer). * @param[in] data_layout The data layout the weights have been trained in. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const TensorShape &original_input_shape, DataLayout data_layout) + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const TensorShape &original_input_shape, + DataLayout data_layout) { _func.configure(compile_context, input, &_output, original_input_shape, data_layout); } diff --git a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h index 8c9e45d753dbbf508a7d127f156e5d0f398d4188..8487be71c37675ef97ea8607ccdaeb7841b92311 100644 --- a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h @@ -21,12 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CLCONVOLUTIONLAYER_H -#define ARM_COMPUTE_CLCONVOLUTIONLAYER_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLCONVOLUTIONLAYER_H +#define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLCONVOLUTIONLAYER_H #include "arm_compute/core/CL/CLCompileContext.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/experimental/IPostOp.h" #include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/IFunction.h" @@ -120,11 +119,17 @@ public: * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation * available which may introduce a drop of accuracy as well. 
Default is false * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout - * @param[in] post_ops (Optional) A sequence of post operations that are performed after the main operation. */ - void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(), - const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1, - const experimental::PostOpList &post_ops = experimental::PostOpList {}); + void configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. @@ -144,11 +149,18 @@ public: * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation * available which may introduce a drop of accuracy as well. Default is false * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout - * @param[in] post_ops (Optional) A sequence of post operations that are performed after the main operation. */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, - unsigned int num_groups = 1, const experimental::PostOpList &post_ops = experimental::PostOpList {}); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration of @ref CLConvolutionLayer * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], @@ -167,13 +179,19 @@ public: * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation * available which may introduce a drop of accuracy as well. Default is false * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout - * @param[in] post_ops (Optional) A sequence of post operations that are performed after the main operation. 
* * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, - unsigned int num_groups = 1, const experimental::PostOpList &post_ops = experimental::PostOpList {}); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); /** Static function to check if given info will return the convolution called by @ref CLConvolutionLayer * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], @@ -193,8 +211,15 @@ public: * * @return the Convolution Method Hint */ - static ConvolutionMethod get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation = Size2D(1U, 1U), bool enable_fast_math = false); + static ConvolutionMethod get_convolution_method(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const ActivationLayerInfo &act_info, + const GPUTarget gpu_target, + const Size2D &dilation = Size2D(1U, 1U), + bool enable_fast_math = false); // Inherited methods overridden: void run() override; void prepare() override; @@ -203,5 +228,5 @@ private: struct Impl; std::unique_ptr _impl; }; -} -#endif /* ARM_COMPUTE_CLCONVOLUTIONLAYER_H */ +} // namespace arm_compute +#endif // ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLCONVOLUTIONLAYER_H diff --git a/arm_compute/runtime/CL/functions/CLCopy.h b/arm_compute/runtime/CL/functions/CLCopy.h index 4fc4183d3e274d410bc2d2e4d0d5ba2e5bde8cf5..fd40b7b9de6b400649c945d7f2b3660eedecc487 100644 --- a/arm_compute/runtime/CL/functions/CLCopy.h +++ b/arm_compute/runtime/CL/functions/CLCopy.h @@ -27,6 +27,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Window.h" #include "arm_compute/runtime/IFunction.h" + #include namespace arm_compute @@ -74,7 +75,10 @@ public: * @param[in] dst_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr. * */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, Window *dst_window = nullptr); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + Window *dst_window = nullptr); /** Static function to check if given info will lead to a valid configuration of @ref CLCopy * * @param[in] input Source tensor. Data types supported: All. 
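A few hunks above, CLConvolutionLayer is the one header in this stretch with a real interface change: the experimental PostOpList parameter (and the IPostOp.h include) disappears from configure()/validate(), and the header guard is renamed; the rest is re-wrapping. A minimal sketch of the trimmed configure() overload, where every argument after conv_info keeps its default; not from this patch, and the F32 NCHW shapes below are assumed for illustration:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        // Assumed shapes, [W, H, C, N] / [W, H, IFM, OFM]: 32x32x3 input, 16 3x3 kernels.
        CLTensor src, weights, biases, dst;
        src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 3U, 1U), 1, DataType::F32));
        weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 3U, 16U), 1, DataType::F32));
        biases.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U, 1U), 1, DataType::F32));

        CLConvolutionLayer conv;
        // Stride 1, pad 1 keeps the 32x32 spatial size; no PostOpList argument exists any more.
        conv.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1));

        src.allocator()->allocate();
        weights.allocator()->allocate();
        biases.allocator()->allocate();
        dst.allocator()->allocate();

        conv.run();
        CLScheduler::get().sync();
        return 0;
    }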
diff --git a/arm_compute/runtime/CL/functions/CLCrop.h b/arm_compute/runtime/CL/functions/CLCrop.h index a474215190df074547edb06cec663f3f2d1ba050..2942e9362a2ea5565e2d2172e70a15a8e3f1d015 100644 --- a/arm_compute/runtime/CL/functions/CLCrop.h +++ b/arm_compute/runtime/CL/functions/CLCrop.h @@ -27,6 +27,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Window.h" #include "arm_compute/runtime/IFunction.h" + #include namespace arm_compute @@ -71,7 +72,13 @@ public: * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0. * @param[in] output_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr. */ - void configure(const ICLTensor *input, ICLTensor *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, Window *output_window = nullptr); + void configure(const ICLTensor *input, + ICLTensor *output, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value = 0, + Window *output_window = nullptr); /** Configure function * * @note Supported tensor rank: up to 4 @@ -85,8 +92,14 @@ public: * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0. * @param[in] output_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, - Window *output_window = nullptr); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value = 0, + Window *output_window = nullptr); /** Static function to check if given info will lead to a valid configuration of @ref CLStridedSliceKernel * @@ -100,8 +113,13 @@ public: * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0. * @param[in] output_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr. */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, - Window *output_window = nullptr); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value = 0, + Window *output_window = nullptr); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLCropResize.h b/arm_compute/runtime/CL/functions/CLCropResize.h index 5c60c2879cc4b9578e87558e2658526ee8c40d03..6fb055e8939511a654b0f2b423d966bf296f9be2 100644 --- a/arm_compute/runtime/CL/functions/CLCropResize.h +++ b/arm_compute/runtime/CL/functions/CLCropResize.h @@ -25,7 +25,6 @@ #define ARM_COMPUTE_CL_CROP_RESIZE_H #include "arm_compute/core/CL/ICLTensor.h" - #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/functions/CLCopy.h" #include "arm_compute/runtime/CL/functions/CLCrop.h" @@ -82,8 +81,13 @@ public: * @param[in] method The policy to be used when resizing image. Default is bilinear. * @param[in] extrapolation_value Value to be used for values outside of the image for cropping and resizing. Default is 0. 
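
The CLCrop entry points reflowed above are otherwise unchanged. A small sketch of cropping one image out of a batched tensor; the coordinates and batch index are illustrative values only:

    CLCrop crop;
    const Coordinates2D start{0, 0};   // top-left corner of the crop, in (x, y)
    const Coordinates2D end{15, 15};   // bottom-right corner of the crop, in (x, y)

    crop.configure(&src, &dst, start, end, /* batch_index */ 0U, /* extrapolation_value */ 0.f);
    crop.run();
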
*/ - void configure(const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size, - InterpolationPolicy method = InterpolationPolicy::BILINEAR, float extrapolation_value = 0); + void configure(const ICLTensor *input, + ICLTensor *boxes, + ICLTensor *box_ind, + ICLTensor *output, + Coordinates2D crop_size, + InterpolationPolicy method = InterpolationPolicy::BILINEAR, + float extrapolation_value = 0); /** Configure kernel * * @note Supported tensor rank: up to 4 @@ -100,8 +104,14 @@ public: * @param[in] method The policy to be used when resizing image. Default is bilinear. * @param[in] extrapolation_value Value to be used for values outside of the image for cropping and resizing. Default is 0. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size, - InterpolationPolicy method = InterpolationPolicy::BILINEAR, float extrapolation_value = 0); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *boxes, + ICLTensor *box_ind, + ICLTensor *output, + Coordinates2D crop_size, + InterpolationPolicy method = InterpolationPolicy::BILINEAR, + float extrapolation_value = 0); /** Static function to check if given info will lead to a valid configuration of @ref NESlice * @@ -121,8 +131,13 @@ public: * * @return A status */ - static Status validate(const ITensorInfo *input, ITensorInfo *boxes, ITensorInfo *box_ind, const ITensorInfo *output, - Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value); + static Status validate(const ITensorInfo *input, + ITensorInfo *boxes, + ITensorInfo *box_ind, + const ITensorInfo *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value); void run() override; diff --git a/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h index 0c59e2c86dfad5a3ff3f888fbc4321ea0e9df652..92f87ee461b7865ae85bb1f337b6ff76ded39d33 100644 --- a/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h @@ -70,7 +70,12 @@ public: * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref opencl::kernels::ClWeightsReshapeKernel. * */ - void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info, const WeightsInfo &weights_info = WeightsInfo()); + void configure(ICLTensor *input, + ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info = WeightsInfo()); /** Set the input, weights, biases and output tensors. * * @param[in] compile_context The compile context to be used. @@ -82,8 +87,13 @@ public: * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref opencl::kernels::ClWeightsReshapeKernel. 
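
A sketch against the CLCropResize::configure() signature above. The tensor names and the 14x14 crop size are assumptions; boxes and box_ind are the per-crop box coordinates and image indices described in the class documentation:

    CLCropResize crop_resize;
    crop_resize.configure(&image, &boxes, &box_ind, &output,
                          Coordinates2D{14, 14},            // crop_size: width x height of each output crop
                          InterpolationPolicy::BILINEAR,
                          /* extrapolation_value */ 0.f);
    crop_resize.run();
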
* */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info, - const WeightsInfo &weights_info = WeightsInfo()); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayer * * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. @@ -95,11 +105,19 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info, - const WeightsInfo &weights_info = WeightsInfo()); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info = WeightsInfo()); - static DeconvolutionMethod get_deconvolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info, - const WeightsInfo &weights_info); + static DeconvolutionMethod get_deconvolution_method(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info); // Inherited methods overridden: void run() override; void prepare() override; diff --git a/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h b/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h index 344ebd0afbe4489fe87138fb7990020ce46c0686..5a2abafe797232712a352b9251c8350f90304519 100644 --- a/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h +++ b/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h @@ -82,7 +82,8 @@ public: * @param[out] output Destination tensor. Data type supported: same as @p input. * @param[in] info Contains padding and policies to be used in the deconvolution. */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PadStrideInfo &info); + void + configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PadStrideInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayerUpsample * * @param[in] input Source tensor info. Data type supported: All. diff --git a/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h b/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h index 58deb7ec40ae27bd2ad961d251d290e20a66666c..3e7ca8830b356727fd5cd0d2f353e94fa243c11b 100644 --- a/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h +++ b/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_CLDEPTHCONVERT_H #define ARM_COMPUTE_CLDEPTHCONVERT_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include @@ -96,7 +95,11 @@ public: * @param[in] policy Conversion policy. * @param[in] shift Value for down/up conversions. Must be 0 <= shift < 8. 
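
The CLDeconvolutionLayer entry points above keep their semantics; only the formatting changed. A sketch of validating and then configuring a stride-2 deconvolution, where the stride and padding values are assumptions:

    const PadStrideInfo deconv_info(2, 2, 0, 0);

    if (CLDeconvolutionLayer::validate(src.info(), weights.info(), bias.info(), dst.info(),
                                       deconv_info).error_code() == ErrorCode::OK)
    {
        CLDeconvolutionLayer deconv;
        deconv.configure(&src, &weights, &bias, &dst, deconv_info);
        deconv.run();
    }
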
*/ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + ConvertPolicy policy, + uint32_t shift); /** Static function to check if given info will lead to a valid configuration of @ref CLDepthConvertLayer * * @param[in] input Source tensor info. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. diff --git a/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h b/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h index 0026cc2b67c514a9adc43aaa2298b3d7c5a4175b..14d0a7ec7cf12806a81fcd2da75009e2b791ee07 100644 --- a/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h +++ b/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h @@ -60,7 +60,8 @@ public: * @param[out] output Tensor output. Data types supported: same as @p input * @param[in] block_shape Block shape value. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape); + void + configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape); /** Static function to check if given info will lead to a valid configuration of @ref CLDepthToSpaceLayer. * * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All. @@ -71,5 +72,5 @@ public: */ static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_CLDEPTHTOSPACELAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h index 27984491005d044cdd43dd6b8fa6eade8aa235c5..2c0fa7aa22e5a77da95bbf81eb57adcfd80ba39d 100644 --- a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h @@ -88,15 +88,28 @@ public: * * @note: For in-place support, please check @ref CLDepthwiseConvolutionLayerNativeKernel */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, + ActivationLayerInfo act_info = ActivationLayerInfo(), + const Size2D &dilation = Size2D(1U, 1U)); /** Initialize the function's source, destination, weights and convolution information. 
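
For the CLDepthConvertLayer overload reflowed above, a sketch of widening a U8 tensor to F32. This uses the overload without a compile context, which the class also provides; the tensor names are assumptions and shift is simply left at 0 here:

    CLDepthConvertLayer convert;
    convert.configure(&src_u8, &dst_f32, ConvertPolicy::SATURATE, /* shift */ 0);
    convert.run();
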
* * Similar to @ref CLDepthwiseConvolutionLayer::configure() */ - void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); + void configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, + ActivationLayerInfo act_info = ActivationLayerInfo(), + const Size2D &dilation = Size2D(1U, 1U)); /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer * @@ -104,8 +117,14 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, + ActivationLayerInfo act_info = ActivationLayerInfo(), + const Size2D &dilation = Size2D(1U, 1U)); // Inherited methods overriden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h index 462a3ac07e6f6418fa67ac15274508d4dddd4742..84900b03a3ec2eb3c5608bc0d83f9f6d733cd5cb 100644 --- a/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h @@ -79,7 +79,12 @@ public: * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. @@ -94,7 +99,12 @@ public: * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
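
A sketch against the CLDepthwiseConvolutionLayer::configure() signature above, with an assumed 3x3 depthwise filter, a depth multiplier of 1 and a fused ReLU:

    CLDepthwiseConvolutionLayer dwc;
    dwc.configure(&src, &weights, &biases, &dst,
                  PadStrideInfo(1, 1, 1, 1),
                  /* depth_multiplier */ 1,
                  ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
                  Size2D(1U, 1U)); // dilation
    dwc.run();
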
*/ - void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLDirectConvolutionLayer * @@ -111,7 +121,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: @@ -121,5 +135,5 @@ private: struct Impl; std::unique_ptr _impl; }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h index d0a61cdd365619099075c328beef139a18a187ce..14384a09b554e2eaedaeeacc110ea1c794b3fc7f 100644 --- a/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h @@ -24,12 +24,11 @@ #ifndef ARM_COMPUTE_CLDIRECTDECONVOLUTIONLAYER_H #define ARM_COMPUTE_CLDIRECTDECONVOLUTIONLAYER_H +#include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h" #include "arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h" #include "arm_compute/runtime/CL/functions/CLReverse.h" #include "arm_compute/runtime/CL/functions/CLTranspose.h" - -#include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" @@ -111,7 +110,12 @@ public: * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref opencl::kernels::ClWeightsReshapeKernel. * */ - void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info, const WeightsInfo &weights_info = WeightsInfo()); + void configure(ICLTensor *input, + ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &info, + const WeightsInfo &weights_info = WeightsInfo()); /** Set the input, weights, biases and output tensors. * * @param[in] compile_context The compile context to be used. @@ -125,8 +129,13 @@ public: * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref opencl::kernels::ClWeightsReshapeKernel. 
* */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info, - const WeightsInfo &weights_info = WeightsInfo()); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &info, + const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLDirectDeconvolutionLayer * * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. @@ -140,8 +149,12 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info, - const WeightsInfo &weights_info = WeightsInfo()); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *output, + const PadStrideInfo &info, + const WeightsInfo &weights_info = WeightsInfo()); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLElementwiseOperations.h b/arm_compute/runtime/CL/functions/CLElementwiseOperations.h index 9de362d2b2673bd5a28378bb5b3a459c549e38d2..13844c98a136ff7eea7db1562e79af426ad790c3 100644 --- a/arm_compute/runtime/CL/functions/CLElementwiseOperations.h +++ b/arm_compute/runtime/CL/functions/CLElementwiseOperations.h @@ -82,7 +82,11 @@ public: * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Initialise the kernel's inputs, output and conversion policy. * * Valid configurations (Input1,Input2) -> Output : @@ -108,7 +112,11 @@ public: * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, + void configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClSaturatedArithmeticKernel for addition * @@ -134,7 +142,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; @@ -192,7 +204,11 @@ public: * @param[in] policy Policy to use to handle overflow. 
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Initialise the kernel's inputs, output and conversion policy. * * Valid configurations (Input1,Input2) -> Output : @@ -218,7 +234,11 @@ public: * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, + void configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClSaturatedArithmeticKernel for subtraction * @@ -244,7 +264,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; @@ -292,7 +316,10 @@ public: * @param[out] output Output tensor. Data types supported: Same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Initialise the kernel's inputs, output. * * @param[in] compile_context The compile context to be used. @@ -303,7 +330,11 @@ public: * @param[out] output Output tensor. Data types supported: Same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticDivision * * @param[in] input1 First tensor input info. Data types supported: F16/F32. 
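
The element-wise arithmetic functions touched above (addition, subtraction, division) only have their parameter lists reflowed. A sketch of a saturating addition with a fused ReLU, assuming two already-initialised input tensors of matching shape and type:

    CLArithmeticAddition add;
    add.configure(&input1, &input2, &output, ConvertPolicy::SATURATE,
                  ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
    add.run();
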
@@ -313,7 +344,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; @@ -368,7 +402,10 @@ public: * @param[out] output Output tensor. Data types supported: same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Initialise the kernel's inputs, output and conversion policy. * * @param[in] compile_context The compile context to be used. @@ -379,7 +416,11 @@ public: * @param[out] output Output tensor. Data types supported: same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClArithmeticKernel for max * * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32. @@ -389,7 +430,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; @@ -444,7 +488,10 @@ public: * @param[out] output Output tensor. Data types supported: same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Initialise the kernel's inputs, output and conversion policy. * * @param[in] compile_context The compile context to be used. @@ -455,7 +502,11 @@ public: * @param[out] output Output tensor. Data types supported: same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
*/ - void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClArithmeticKernel for min * * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32. @@ -465,7 +516,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; @@ -518,7 +572,10 @@ public: * @param[out] output Output tensor. Data types supported: same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Initialise the kernel's inputs, output and conversion policy. * * @param[in] compile_context The compile context to be used. @@ -529,7 +586,11 @@ public: * @param[out] output Output tensor. Data types supported: same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClArithmeticKernel for squared difference * * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. @@ -539,7 +600,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; @@ -587,7 +651,10 @@ public: * @param[out] output Output tensor. Data types supported:F16/F32. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Initialise the kernel's inputs, output and conversion policy. * * @param[in] compile_context The compile context to be used. 
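
The same pattern applies to the remaining binary element-wise functions reflowed above (max, min, squared difference, power), which take no ConvertPolicy. For instance, assuming two already-configured tensors:

    CLElementwiseMax max_op;
    max_op.configure(&input1, &input2, &output, ActivationLayerInfo()); // no fused activation
    max_op.run();
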
@@ -598,7 +665,11 @@ public: * @param[out] output Output tensor. Data types supported:F16/F32. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClArithmeticKernel for power * * @param[in] input1 First tensor input info. Data types supported: F16/F32. @@ -608,7 +679,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h b/arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h index 594ee4cfdcfbb1df2fcf8d77c23ce23f9d1b336d..d186b70d937ede4de2cae7905d75d3fa4230d0d0 100644 --- a/arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h +++ b/arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_CLELEMENTWISEUNARYLAYER_H #define ARM_COMPUTE_CLELEMENTWISEUNARYLAYER_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include diff --git a/arm_compute/runtime/CL/functions/CLFFT1D.h b/arm_compute/runtime/CL/functions/CLFFT1D.h index c7112dc7373f81eaef5389312998044e164351c2..49ecf3c2607d4ca5d51589ec74ab1edaeeda7bdd 100644 --- a/arm_compute/runtime/CL/functions/CLFFT1D.h +++ b/arm_compute/runtime/CL/functions/CLFFT1D.h @@ -24,10 +24,9 @@ #ifndef ARM_COMPUTE_CLFFT1D_H #define ARM_COMPUTE_CLFFT1D_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/FunctionDescriptors.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/MemoryGroup.h" namespace arm_compute @@ -82,7 +81,10 @@ public: * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input. * @param[in] config FFT related configuration */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const FFT1DInfo &config); /** Static function to check if given info will lead to a valid configuration of @ref CLFFT1D. * * @param[in] input Source tensor info. Data types supported: F16/F32. 
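
A sketch of the CLFFT1D configuration shown above, using the overload without an explicit compile context; the axis and direction are set through FFT1DInfo and are chosen arbitrarily here:

    CLFFT1D   fft;
    FFT1DInfo fft_info;
    fft_info.axis      = 0;                      // transform along the innermost dimension
    fft_info.direction = FFTDirection::Forward;

    fft.configure(&src, &dst, fft_info);
    fft.run();
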
diff --git a/arm_compute/runtime/CL/functions/CLFFT2D.h b/arm_compute/runtime/CL/functions/CLFFT2D.h index 3d20327bf1b98f37e962c8f55745c61bcfb021dc..b7d15f1602d90ab9d9843ad7c218ed24031f0ff8 100644 --- a/arm_compute/runtime/CL/functions/CLFFT2D.h +++ b/arm_compute/runtime/CL/functions/CLFFT2D.h @@ -24,11 +24,10 @@ #ifndef ARM_COMPUTE_CLFFT2D_H #define ARM_COMPUTE_CLFFT2D_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/functions/CLFFT1D.h" #include "arm_compute/runtime/FunctionDescriptors.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/MemoryGroup.h" namespace arm_compute @@ -79,7 +78,10 @@ public: * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input. * @param[in] config FFT related configuration */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const FFT2DInfo &config); /** Static function to check if given info will lead to a valid configuration of @ref CLFFT2D. * * @param[in] input Source tensor info. Data types supported: F16/F32. diff --git a/arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h index f873cb0b86a2982a154600a8b8bc13a7ffec9c47..ed78bbb7a73628d108cb4d64341e749b828b784a 100644 --- a/arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h @@ -24,8 +24,6 @@ #ifndef ARM_COMPUTE_CLFFTCONVOLUTIONLAYER_H #define ARM_COMPUTE_CLFFTCONVOLUTIONLAYER_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/functions/CLActivationLayer.h" #include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h" @@ -37,6 +35,7 @@ #include "arm_compute/runtime/CL/functions/CLReshapeLayer.h" #include "arm_compute/runtime/CL/functions/CLReverse.h" #include "arm_compute/runtime/CL/functions/CLSlice.h" +#include "arm_compute/runtime/IFunction.h" namespace arm_compute { @@ -94,8 +93,13 @@ public: * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation * available which may introduce a drop of accuracy as well. Default is false */ - void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + void configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); /** Set the input and output tensors. * * @note: This function only works with any square kernel size and unit strides for both NCHW and NHWC data layout @@ -113,8 +117,14 @@ public: * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation * available which may introduce a drop of accuracy as well. 
Default is false */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); /** Static function to check if given info will lead to a valid configuration of @ref CLFFTConvolutionLayer * * @note: This function only works with any square kernel size and unit strides for both NCHW and NHWC data layout @@ -133,8 +143,13 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLFill.h b/arm_compute/runtime/CL/functions/CLFill.h index 341d93a9f6b58ac57286ca5ed734e13d84a52ceb..be1059761a6a17e3ecaa274bcccd8bf3a0717239 100644 --- a/arm_compute/runtime/CL/functions/CLFill.h +++ b/arm_compute/runtime/CL/functions/CLFill.h @@ -28,6 +28,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Window.h" #include "arm_compute/runtime/IFunction.h" + #include namespace arm_compute @@ -73,7 +74,10 @@ public: * @param[in] constant_value The value used to fill the planes of the tensor * @param[in] window Window to be used in case setting only part of a tensor. Default is nullptr. */ - void configure(const CLCompileContext &compile_context, ICLTensor *tensor, const PixelValue &constant_value, Window *window = nullptr); + void configure(const CLCompileContext &compile_context, + ICLTensor *tensor, + const PixelValue &constant_value, + Window *window = nullptr); /** Static function to check if given info will lead to a valid configuration of @ref CLFill * * @param[in] tensor Source tensor info. Data types supported: All. 
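
CLFill likewise keeps its behaviour. A one-line sketch of filling a tensor with zeros, using the overload without a compile context that the class also provides; the PixelValue is assumed to match the tensor's data type:

    CLFill fill;
    fill.configure(&tensor, PixelValue(0.f)); // window defaults to nullptr, i.e. fill the whole tensor
    fill.run();
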
diff --git a/arm_compute/runtime/CL/functions/CLFloor.h b/arm_compute/runtime/CL/functions/CLFloor.h index 87cd5b44c7ed42c3c1aba9e44e7220338a2a5da5..4d3d704857cd9a0b40f14b0d42758dc8573fe7f0 100644 --- a/arm_compute/runtime/CL/functions/CLFloor.h +++ b/arm_compute/runtime/CL/functions/CLFloor.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_CLFLOOR_H #define ARM_COMPUTE_CLFLOOR_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include diff --git a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h index b784226a2f83e55c3602c9f3f1017f9302d5cdfa..9fd0b4aaef9925a648e9b47dc600be29f7d4bbca 100644 --- a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h +++ b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h @@ -25,9 +25,8 @@ #define ARM_COMPUTE_CLFULLYCONNECTEDLAYER_H #include "arm_compute/function_info/FullyConnectedLayerInfo.h" -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IWeightsManager.h" #include "arm_compute/runtime/MemoryGroup.h" @@ -45,7 +44,8 @@ class CLFullyConnectedLayer : public IFunction { public: /** Constructor */ - CLFullyConnectedLayer(std::shared_ptr memory_manager = nullptr, IWeightsManager *weights_manager = nullptr); + CLFullyConnectedLayer(std::shared_ptr memory_manager = nullptr, + IWeightsManager *weights_manager = nullptr); /** Default destructor */ ~CLFullyConnectedLayer(); /** Prevent instances of this class from being copied (As this class contains pointers) */ @@ -83,13 +83,20 @@ public: * Data type supported: Same as @p input. * @param[in] fc_info (Optional) Fully connected layer additional info */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); /** Set the input and output tensors. * * Similar to @ref CLFullyConnectedLayer */ - void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, + void configure(const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLFullyConnectedLayer * @@ -97,7 +104,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); //Inherited methods override diff --git a/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h b/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h index cd752703927f05dbe9bde3b64a7e07954c00caca..2e777273cd17254ddfad36821e0a2e0c50620cdf 100644 --- a/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h +++ b/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h @@ -78,9 +78,16 @@ public: * @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. 
Defaults to 0.001f. * @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to Convolution. */ - void configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias, - const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr, - float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); + void configure(const ICLTensor *input_weights, + const ICLTensor *bn_mean, + const ICLTensor *bn_var, + ICLTensor *fused_weights, + ICLTensor *fused_bias, + const ICLTensor *input_bias = nullptr, + const ICLTensor *bn_beta = nullptr, + const ICLTensor *bn_gamma = nullptr, + float epsilon = 0.001f, + FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. @@ -97,9 +104,17 @@ public: * @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f. * @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to Convolution. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias, - const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr, - float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input_weights, + const ICLTensor *bn_mean, + const ICLTensor *bn_var, + ICLTensor *fused_weights, + ICLTensor *fused_bias, + const ICLTensor *input_bias = nullptr, + const ICLTensor *bn_beta = nullptr, + const ICLTensor *bn_gamma = nullptr, + float epsilon = 0.001f, + FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); /** Static function to check if given info will lead to a valid configuration of @ref CLFuseBatchNormalization * * @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. 
Data layout supported: NCHW, NHWC @@ -117,10 +132,16 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr, - float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); + static Status validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias = nullptr, + const ITensorInfo *bn_beta = nullptr, + const ITensorInfo *bn_gamma = nullptr, + float epsilon = 0.001f, + FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLGEMM.h b/arm_compute/runtime/CL/functions/CLGEMM.h index 3a39aca6924b33738499e411ef0d5128af559110..f5e6aa1237f81022e9c46a8d6e8c13894cb2e6da 100644 --- a/arm_compute/runtime/CL/functions/CLGEMM.h +++ b/arm_compute/runtime/CL/functions/CLGEMM.h @@ -92,13 +92,26 @@ public: * if the reshape of matrix B should happen only for the first run. GEMMInfo also contains information about the reshaping * in case matrix A and matrix B have been already transformed. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo()); + void configure(const CLCompileContext &compile_context, + const ICLTensor *a, + const ICLTensor *b, + const ICLTensor *c, + ICLTensor *output, + float alpha, + float beta, + const GEMMInfo &gemm_info = GEMMInfo()); /** Initialise the kernel's inputs and output * * Similar to @ref CLGEMM::configure() */ - void configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo()); + void configure(const ICLTensor *a, + const ICLTensor *b, + const ICLTensor *c, + ICLTensor *output, + float alpha, + float beta, + const GEMMInfo &gemm_info = GEMMInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMM. * @@ -106,7 +119,13 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo()); + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info = GEMMInfo()); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h index 98273403828c63c26d16b713cc7e3bc63ec58701..70ceb1513bb0efe77c43990299c7112b24c3dbf8 100644 --- a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h @@ -21,10 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
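
A sketch of the reflowed CLGEMM::configure() above, computing dst = alpha * a * b + beta * c. Passing nullptr for c with beta = 0 is the common no-bias case; the alpha and beta values are assumptions:

    CLGEMM gemm;
    const float alpha = 1.0f;
    const float beta  = 0.0f;

    gemm.configure(&a, &b, /* c */ nullptr, &dst, alpha, beta, GEMMInfo());
    gemm.run();
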
*/ -#ifndef ARM_COMPUTE_CLGEMMCONVOLUTIONLAYER_H -#define ARM_COMPUTE_CLGEMMCONVOLUTIONLAYER_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLGEMMCONVOLUTIONLAYER_H +#define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLGEMMCONVOLUTIONLAYER_H -#include "arm_compute/core/experimental/IPostOp.h" #include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTypes.h" @@ -53,7 +52,8 @@ public: * @param[in] memory_manager (Optional) Memory manager. * @param[in] weights_manager (Optional) Weights manager. */ - CLGEMMConvolutionLayer(std::shared_ptr memory_manager = nullptr, IWeightsManager *weights_manager = nullptr); + CLGEMMConvolutionLayer(std::shared_ptr memory_manager = nullptr, + IWeightsManager *weights_manager = nullptr); /** Prevent instances of this class from being copied (As this class contains pointers) */ CLGEMMConvolutionLayer(const CLGEMMConvolutionLayer &) = delete; /** Default move constructor */ @@ -95,11 +95,16 @@ public: * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout - * @param[in] post_ops (Optional) A sequence of post operations that are performed after the main operation. */ - void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(), - const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1, - const experimental::PostOpList &post_ops = experimental::PostOpList {}); + void configure(const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + unsigned int num_groups = 1); /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. @@ -118,12 +123,17 @@ public: * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout - * @param[in] post_ops (Optional) A sequence of post operations that are performed after the main operation. 
*/ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), - const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1, - const experimental::PostOpList &post_ops = experimental::PostOpList {}); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMConvolutionLayer. * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], @@ -141,13 +151,18 @@ public: * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout - * @param[in] post_ops (Optional) A sequence of post operations that are performed after the main operation. * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1, - const experimental::PostOpList &post_ops = experimental::PostOpList {}); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + unsigned int num_groups = 1); // Inherited methods overridden: void run() override; @@ -158,4 +173,4 @@ private: std::unique_ptr _impl; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_CLGEMMCONVOLUTIONLAYER_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLGEMMCONVOLUTIONLAYER_H diff --git a/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h index c985738a9ca1a610057eb6fed02f3a7e4eb6cfd2..3e8929c5ad8f856269850ad4032f36b14b683efe 100644 --- a/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h @@ -113,7 +113,11 @@ public: * @param[out] output Output tensor. The output has the same number of dimensions as the @p input. Data layout supported: same as @p input. * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This function supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported. 
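
Like CLConvolutionLayer, the CLGEMMConvolutionLayer::configure()/validate() overloads above drop the experimental post_ops argument, leaving act_info as the way to fuse an activation. A sketch with assumed tensors and a fused ReLU:

    CLGEMMConvolutionLayer gemm_conv;
    gemm_conv.configure(&src, &weights, &biases, &dst,
                        PadStrideInfo(1, 1, 1, 1),
                        WeightsInfo(),
                        Size2D(1U, 1U), // dilation
                        ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
                        /* num_groups */ 1);
    gemm_conv.run();
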
*/ - void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info); + void configure(const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info); /** Set the input, weights, biases and output tensors. * * @param[in] compile_context The compile context to be used. @@ -124,7 +128,12 @@ public: * @param[out] output Output tensor. The output has the same number of dimensions as the @p input. Data layout supported: same as @p input. * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This function supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info); /** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayer * * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. @@ -136,7 +145,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &deconv_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *output, + const PadStrideInfo &deconv_info); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h index 8b8d9f235f6482e621702b4b9ce815fa0f867747..1b8e5dcc1d5a9f8e7cf6acafd4e7690c6f52c96f 100644 --- a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h +++ b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h @@ -91,7 +91,11 @@ public: * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and * if the reshape of matrix B should be executed only for the first run */ - void configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info = GEMMInfo()); + void configure(const ICLTensor *a, + const ICLTensor *b, + const ICLTensor *c, + ICLTensor *output, + const GEMMInfo &gemm_info = GEMMInfo()); /** Initialise the kernel's inputs, output * * @note GEMMLowp: low precision GEMM kernel. 
[A * B + C] @@ -110,7 +114,12 @@ public: * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and * if the reshape of matrix B should be executed only for the first run */ - void configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info = GEMMInfo()); + void configure(const CLCompileContext &compile_context, + const ICLTensor *a, + const ICLTensor *b, + const ICLTensor *c, + ICLTensor *output, + const GEMMInfo &gemm_info = GEMMInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyCore * * @param[in] a First input tensor info (Matrix A). Data type supported: QASYMM8. @@ -122,7 +131,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo()); + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info = GEMMInfo()); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h index 6ec7b71f7d8c6d21d5c67c8a0968b69f7c11046b..ff9c872896c9909aa0e1c2143ffda2e9354ae33a 100644 --- a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h +++ b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h @@ -85,7 +85,8 @@ public: * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16 * @param[in] info GEMMLowp output stage metadata. */ - void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info); + void + configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info); /** Initialise the kernel's inputs, output * * @param[in] compile_context The compile context to be used. @@ -95,7 +96,11 @@ public: * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED * @param[in] info GEMMLowp output stage metadata. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *bias, + ICLTensor *output, + const GEMMLowpOutputStageInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel * * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. 
Data type supported: S32 @@ -106,7 +111,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const GEMMLowpOutputStageInfo &info); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLGather.h b/arm_compute/runtime/CL/functions/CLGather.h index 0f1ccbad0849cbe9a86c39de43ebe0ad506ca038..360c8757b627ea18ab2cc6a6a6289d15f05a9c11 100644 --- a/arm_compute/runtime/CL/functions/CLGather.h +++ b/arm_compute/runtime/CL/functions/CLGather.h @@ -62,7 +62,11 @@ public: * @param[out] output Destination tensor. Data type supported: Same as @p input * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0 */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *indices, + ICLTensor *output, + int axis = 0); /** Static function to check if given info will lead to a valid configuration of @ref CLGatherKernel * @@ -73,7 +77,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0); + static Status + validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0); }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLGATHER_H */ diff --git a/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h b/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h index aec5cdf1a85b3ddd653c73d5c65128b61a483178..3a201e79b0243a8790eac566e1e77bf33c677d0b 100644 --- a/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h +++ b/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h @@ -100,7 +100,12 @@ public: * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the @ref GenerateProposalsInfo struct. * @note Proposals contains all the proposals. Of those, only the first num_valid_proposals are valid. */ - void configure(const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals, ICLTensor *scores_out, ICLTensor *num_valid_proposals, + void configure(const ICLTensor *scores, + const ICLTensor *deltas, + const ICLTensor *anchors, + ICLTensor *proposals, + ICLTensor *scores_out, + ICLTensor *num_valid_proposals, const GenerateProposalsInfo &info); /** Set the input and output tensors. * @@ -118,8 +123,14 @@ public: * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the @ref GenerateProposalsInfo struct. * @note Proposals contains all the proposals. Of those, only the first num_valid_proposals are valid. 
*/ - void configure(const CLCompileContext &compile_context, const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals, ICLTensor *scores_out, - ICLTensor *num_valid_proposals, const GenerateProposalsInfo &info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *scores, + const ICLTensor *deltas, + const ICLTensor *anchors, + ICLTensor *proposals, + ICLTensor *scores_out, + ICLTensor *num_valid_proposals, + const GenerateProposalsInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLGenerateProposalsLayer * @@ -135,7 +146,11 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out, + static Status validate(const ITensorInfo *scores, + const ITensorInfo *deltas, + const ITensorInfo *anchors, + const ITensorInfo *proposals, + const ITensorInfo *scores_out, const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info); diff --git a/arm_compute/runtime/CL/functions/CLIndirectConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLIndirectConvolutionLayer.h index 12b83ea25b0c4bdb7e21d831b1e9ddb68e74c427..91952af5dc070c08fb61f8ff9ff18641c4e469b2 100644 --- a/arm_compute/runtime/CL/functions/CLIndirectConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLIndirectConvolutionLayer.h @@ -75,7 +75,12 @@ public: * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. @@ -90,7 +95,12 @@ public: * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
*/ - void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLIndirectConvolutionLayer * @@ -107,7 +117,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: @@ -117,5 +131,5 @@ private: struct Impl; std::unique_ptr _impl; }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_CLINDIRECTCONVOLUTIONLAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h index 985a6a75f7e185f4ad6ee2c54ad5714698c8a9aa..98d215dd4bf0b789bf2205d4d09635b1bcdb7ed9 100644 --- a/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h +++ b/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h @@ -83,7 +83,12 @@ public: * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12 * @param[in] use_mixed_precision (Optional) Use mixed precision in case of FP16 execution */ - void configure(ICLTensor *input, ICLTensor *output, float gamma = 1.0f, float beta = 0.0f, float epsilon = 1e-12f, bool use_mixed_precision = true); + void configure(ICLTensor *input, + ICLTensor *output, + float gamma = 1.0f, + float beta = 0.0f, + float epsilon = 1e-12f, + bool use_mixed_precision = true); /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. @@ -95,7 +100,13 @@ public: * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12 * @param[in] use_mixed_precision (Optional) Use mixed precision in case of FP16 execution */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float gamma = 1.0f, float beta = 0.0f, float epsilon = 1e-12f, bool use_mixed_precision = true); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + float gamma = 1.0f, + float beta = 0.0f, + float epsilon = 1e-12f, + bool use_mixed_precision = true); /** Static function to check if given info will lead to a valid configuration of @ref CLInstanceNormalizationLayer. 
* @@ -108,8 +119,13 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, float gamma = 1.0f, float beta = 0.0f, float epsilon = 1e-12f, bool use_mixed_precision = true); - void run() override; + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + float gamma = 1.0f, + float beta = 0.0f, + float epsilon = 1e-12f, + bool use_mixed_precision = true); + void run() override; private: std::unique_ptr _inst_norm_kernel; /**< Kernel to run */ diff --git a/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h b/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h index 4dc5c778d2d15548fe758b375918580e9cdbd4c6..a8b356a708cd32e3d99fa9d819615b5ebc68a304 100644 --- a/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h +++ b/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h @@ -26,8 +26,8 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" #include "arm_compute/runtime/CL/functions/CLReductionOperation.h" +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" @@ -89,7 +89,8 @@ public: * @param[in] axis Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis : 2 * @param[in] epsilon (Optional) Lower bound value for the normalization. */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon = 1e-12f); + void configure( + const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon = 1e-12f); /** Static function to check if given info will lead to a valid configuration of @ref CLL2NormalizeLayer. * @@ -111,5 +112,5 @@ private: std::unique_ptr _normalize_kernel; CLTensor _sumsq; }; -} +} // namespace arm_compute #endif /*ARM_COMPUTE_CLL2NORMALIZELAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLLSTMLayer.h b/arm_compute/runtime/CL/functions/CLLSTMLayer.h index d26b4c5595b23ef646aafdd12dba18d4ea11ea66..fe494991afd3af2e136923075c0469bb34048e8e 100644 --- a/arm_compute/runtime/CL/functions/CLLSTMLayer.h +++ b/arm_compute/runtime/CL/functions/CLLSTMLayer.h @@ -24,8 +24,6 @@ #ifndef ARM_COMPUTE_CLLSTMLAYER_H #define ARM_COMPUTE_CLLSTMLAYER_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/functions/CLActivationLayer.h" @@ -37,9 +35,10 @@ #include "arm_compute/runtime/CL/functions/CLGEMM.h" #include "arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h" #include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h" +#include "arm_compute/runtime/common/LSTMParams.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/common/LSTMParams.h" #include @@ -53,7 +52,7 @@ namespace kernels { class ClTransposeKernel; } -} +} // namespace opencl /** This function performs a single time step in a Long Short-Term Memory (LSTM) layer. * @@ -120,13 +119,26 @@ public: * @param[in] projection_threshold (Optional) The clipping threshold for the output from the projection layer, such that values are bound within [-proj_clip, proj_clip]. * If set to 0.0f then clipping is disabled. 
*/ - void configure(const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - const ICLTensor *output_state_in, ICLTensor *cell_state_in, - ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output, - const LSTMParams &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold = 0.f, float projection_threshold = 0.f); + void configure(const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + const ICLTensor *output_state_in, + ICLTensor *cell_state_in, + ICLTensor *scratch_buffer, + ICLTensor *output_state_out, + ICLTensor *cell_state_out, + ICLTensor *output, + const LSTMParams &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold = 0.f, + float projection_threshold = 0.f); /** Initialize function's tensors. * * @param[in] compile_context The compile context to be used. @@ -166,13 +178,27 @@ public: * @param[in] projection_threshold (Optional) The clipping threshold for the output from the projection layer, such that values are bound within [-proj_clip, proj_clip]. * If set to 0.0f then clipping is disabled. 
*/ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - const ICLTensor *output_state_in, ICLTensor *cell_state_in, - ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output, - const LSTMParams &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold = 0.f, float projection_threshold = 0.f); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + const ICLTensor *output_state_in, + ICLTensor *cell_state_in, + ICLTensor *scratch_buffer, + ICLTensor *output_state_out, + ICLTensor *cell_state_out, + ICLTensor *output, + const LSTMParams &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold = 0.f, + float projection_threshold = 0.f); /** Static function to check if given info will lead to a valid configuration of @ref CLLSTMLayer * @@ -214,13 +240,26 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, - const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in, - const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output, - const LSTMParams &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold = 0.f, float projection_threshold = 0.f); + static Status validate(const ITensorInfo *input, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_in, + const ITensorInfo *scratch_buffer, + const ITensorInfo *output_state_out, + const ITensorInfo *cell_state_out, + const ITensorInfo *output, + const LSTMParams &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold = 0.f, + float projection_threshold = 0.f); // Inherited methods overridden: void run() override; @@ -311,7 +350,7 @@ private: bool _perform_projection_clipping; bool _is_prepared; bool _is_layer_norm_lstm; - const ICLTensor *_recurrent_to_cell_weights{ nullptr }; + const ICLTensor *_recurrent_to_cell_weights{nullptr}; 
}; } // namespace arm_compute #endif /* ARM_COMPUTE_CLLSTMLAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h b/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h index 9c004b85d0ddcdb73120524ab13d7ad34efbd747..8c116b14826d6c999fe99d7332319c4a4736dcb5 100644 --- a/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h +++ b/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h @@ -35,7 +35,6 @@ #include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h" #include "arm_compute/runtime/CL/functions/CLSlice.h" #include "arm_compute/runtime/CL/functions/CLTranspose.h" - #include "arm_compute/runtime/common/LSTMParams.h" namespace arm_compute @@ -100,11 +99,22 @@ public: * @param[out] output_state_out Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size].Data types supported: Same as @p input. */ void configure(const ICLTensor *input, - const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - ICLTensor *cell_state_in, const ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out); + const ICLTensor *input_to_input_weights, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_input_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *input_gate_bias, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, + const ICLTensor *output_state_in, + ICLTensor *cell_state_out, + ICLTensor *output_state_out); /** Initialize function's tensors. * * @param[in] compile_context The compile context to be used. @@ -126,12 +136,24 @@ public: * @param[out] cell_state_out Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size]. Data type supported: QSYMM16. * @param[out] output_state_out Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size].Data types supported: Same as @p input. 
*/ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, - const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - ICLTensor *cell_state_in, const ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *input_to_input_weights, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_input_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *input_gate_bias, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, + const ICLTensor *output_state_in, + ICLTensor *cell_state_out, + ICLTensor *output_state_out); /** Static function to check if given info will lead to a valid configuration of @ref CLLSTMLayerQuantized * @@ -156,11 +178,22 @@ public: * @return a status */ static Status validate(const ITensorInfo *input, - const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out); + const ITensorInfo *input_to_input_weights, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_input_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *input_gate_bias, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLLogicalAnd.h b/arm_compute/runtime/CL/functions/CLLogicalAnd.h index e3061e1dc30d1054f1ca976e3ee1417240f25af7..4ff488782afbe992dd198cd6b184ffc42a18b97d 100644 --- a/arm_compute/runtime/CL/functions/CLLogicalAnd.h +++ b/arm_compute/runtime/CL/functions/CLLogicalAnd.h @@ -111,7 +111,8 @@ public: * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. * @param[out] output Output tensor. Data types supported: same as @p input1. 
*/ - void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output); + void + configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output); /** Static function to check if given info will lead to a valid configuration of @ref arm_compute::opencl::kernels::ClLogicalBinaryKernel * * @param[in] input1 First tensor input info. Data types supported: U8. diff --git a/arm_compute/runtime/CL/functions/CLLogicalNot.h b/arm_compute/runtime/CL/functions/CLLogicalNot.h index 27fd0f9c9f4e461e5b341c0990da695b73cc8a49..c7d9db93d789a1812066281b803ab7b261f3c604 100644 --- a/arm_compute/runtime/CL/functions/CLLogicalNot.h +++ b/arm_compute/runtime/CL/functions/CLLogicalNot.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_CLLOGICALNOT_H #define ARM_COMPUTE_CLLOGICALNOT_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include @@ -85,7 +84,7 @@ public: * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *output); - void run() override; + void run() override; private: struct Impl; diff --git a/arm_compute/runtime/CL/functions/CLLogicalOr.h b/arm_compute/runtime/CL/functions/CLLogicalOr.h index 893c22f721b34a212074cca97e851bd7fda88d2e..64b6d83177fffb9eaed0aa491e975d4f89df3450 100644 --- a/arm_compute/runtime/CL/functions/CLLogicalOr.h +++ b/arm_compute/runtime/CL/functions/CLLogicalOr.h @@ -111,7 +111,8 @@ public: * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. * @param[out] output Output tensor. Data types supported: same as @p input1. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output); + void + configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output); /** Static function to check if given info will lead to a valid configuration of @ref arm_compute::opencl::kernels::ClLogicalBinaryKernel * * @param[in] input1 First tensor input info. Data types supported: U8. diff --git a/arm_compute/runtime/CL/functions/CLMatMul.h b/arm_compute/runtime/CL/functions/CLMatMul.h index 9d54bab8683d44f7afd865634e9b822488d7f835..9c9939b9d0455f365b58dbdefc0d2cb584f864f1 100644 --- a/arm_compute/runtime/CL/functions/CLMatMul.h +++ b/arm_compute/runtime/CL/functions/CLMatMul.h @@ -27,6 +27,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/IFunction.h" + #include namespace arm_compute @@ -88,14 +89,23 @@ public: * @param[in] settings Contains flags for function level settings * @param[in] act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions. 
*/ - void configure(const CLCompileContext &compile_context, ICLTensor *rhs, ICLTensor *lhs, ICLTensor *dst, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings = GpuMatMulSettings{}, const - ActivationLayerInfo &act_info = ActivationLayerInfo{}); + void configure(const CLCompileContext &compile_context, + ICLTensor *rhs, + ICLTensor *lhs, + ICLTensor *dst, + const MatMulInfo &matmul_info, + const GpuMatMulSettings &settings = GpuMatMulSettings{}, + const ActivationLayerInfo &act_info = ActivationLayerInfo{}); /** Initialise the kernel's inputs and output * * Similar to @ref CLMatMul::configure() */ - void configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *dst, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings = GpuMatMulSettings{}, const ActivationLayerInfo &act_info = - ActivationLayerInfo{}); + void configure(ICLTensor *lhs, + ICLTensor *rhs, + ICLTensor *dst, + const MatMulInfo &matmul_info, + const GpuMatMulSettings &settings = GpuMatMulSettings{}, + const ActivationLayerInfo &act_info = ActivationLayerInfo{}); /** Static function to check if given info will lead to a valid configuration of @ref CLMatMul. * * @@ -107,7 +117,11 @@ public: * @param[in] matmul_info Contains MatMul operation information described in @ref MatMulInfo. * @param[in] act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions. */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info = ActivationLayerInfo{}); + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *output, + const MatMulInfo &matmul_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo{}); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h b/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h index f7ff1234f647ae71261710d8a31d470f1eb876cf..2d2f064b4c40150826b3610fcdfa15d531639d96 100644 --- a/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h +++ b/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h @@ -92,7 +92,11 @@ public: * @param[out] output Destination tensor. Data types supported: Same as @p input. * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *indices, + ICLTensor *output, + const PoolingLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration of @ref CLMaxUnpoolingLayer * * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. 
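Aside (not part of the patch): a minimal sketch of how the CLMatMul interface reflowed in the hunks above is typically driven. The helper name, tensor shapes and FP32 data type are illustrative assumptions; the configure()/validate() calls follow the signatures shown in this file, and the include paths assume the library's usual header layout.

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/function_info/MatMulInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLMatMul.h"

    using namespace arm_compute;

    void matmul_sketch()
    {
        CLScheduler::get().default_init(); // set up the CL context/queue once per process

        // lhs is MxK, rhs is KxN, dst is MxN; TensorShape lists the innermost
        // (width) dimension first, so M=8, K=16, N=4 here.
        CLTensor lhs, rhs, dst;
        lhs.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32));
        rhs.allocator()->init(TensorInfo(TensorShape(4U, 16U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(4U, 8U), 1, DataType::F32));

        const MatMulInfo matmul_info{}; // neither operand transposed

        // validate() mirrors configure() but takes ITensorInfo pointers, so it can
        // run before any OpenCL buffers exist.
        ARM_COMPUTE_ERROR_THROW_ON(CLMatMul::validate(lhs.info(), rhs.info(), dst.info(), matmul_info));

        CLMatMul matmul;
        matmul.configure(&lhs, &rhs, &dst, matmul_info); // settings/act_info left at their defaults

        lhs.allocator()->allocate();
        rhs.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill lhs/rhs with data ...

        matmul.run();
    }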
@@ -105,7 +109,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *indices, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info); // Inherited methods overridden: void run() override; @@ -114,5 +121,5 @@ private: CLFill _fill; std::unique_ptr _unpooling_layer_kernel; }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_CLMAXUNPOOLINGLAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h index 68a7df24e6a6d92cb38b6975bfe80594165b10da..951db3e419ee2d1e50b89948c2ca2c54779f20ea 100644 --- a/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h +++ b/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h @@ -65,7 +65,10 @@ public: * @param[out] output (Optional) Destination tensor. It can be nullptr in case of in-place computation. Data type supported: same as @p input * @param[in] epsilon (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8. */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output = nullptr, float epsilon = 1e-8f); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output = nullptr, + float epsilon = 1e-8f); /** Static function to check if given info will lead to a valid configuration of @ref CLMeanStdDevNormalizationKernel * * @param[in] input Source tensor info with 2 dimensions. In case of @p output tensor info = nullptr, diff --git a/arm_compute/runtime/CL/functions/CLNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLNormalizationLayer.h index 15406f77287e28a850ab7641a9e6683a1cbbf43f..10fd8ed4c68e35d43cf1187d1dce476401a3df7f 100644 --- a/arm_compute/runtime/CL/functions/CLNormalizationLayer.h +++ b/arm_compute/runtime/CL/functions/CLNormalizationLayer.h @@ -89,7 +89,10 @@ public: * Data types supported: same as @p input. Data layouts supported: same as @p input. * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters. */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const NormalizationLayerInfo &norm_info); /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizationLayer * * @param[in] input Source tensor. 
3 lower dims represent a single input with dimensions [width, height, IFM], @@ -100,7 +103,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h b/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h index de5155c65a3a1b039e04409d8779bed1a48d5d8a..cdccc16a51b7b9c7e628b4ff1f42d40a39ee59e8 100644 --- a/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h +++ b/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CLNORMALIZEPLANARYUVLAYER_H -#define ARM_COMPUTE_CLNORMALIZEPLANARYUVLAYER_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLNORMALIZEPLANARYUVLAYER_H +#define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLNORMALIZEPLANARYUVLAYER_H #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" @@ -43,6 +43,18 @@ class CLNormalizePlanarYUVLayer : public ICLSimpleFunction { public: /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |F32 |F32 | + * |F16 |F16 | + * |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED | * * @param[in] input Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, channels]. * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. @@ -62,7 +74,11 @@ public: * @param[in] std Standard deviation values tensor. 1 dimension with size equal to the number of input channels. * Data types supported: Same as @p input */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *std); /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizePlanarYUVLayer * * @param[in] input Source tensor info. 3 lower dimensions represent a single input with dimensions [width, height, channels]. 
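Aside (not part of the patch): the CLNormalizePlanarYUVLayer usage implied by the data-type table and parameter docs above, sketched with hypothetical shapes. mean/std carry one value per input channel; the plain configure() overload, assumed to mirror the compile-context one shown above, is used here.

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h"

    using namespace arm_compute;

    void normalize_planar_yuv_sketch()
    {
        CLScheduler::get().default_init();

        // NCHW source of 32x32 pixels with 3 channels; mean/std have 3 values each.
        CLTensor src, dst, mean, std_dev;
        src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 3U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 3U), 1, DataType::F32));
        mean.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::F32));
        std_dev.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::F32));

        ARM_COMPUTE_ERROR_THROW_ON(
            CLNormalizePlanarYUVLayer::validate(src.info(), dst.info(), mean.info(), std_dev.info()));

        CLNormalizePlanarYUVLayer norm;
        norm.configure(&src, &dst, &mean, &std_dev);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        mean.allocator()->allocate();
        std_dev.allocator()->allocate();
        // ... map/fill src, mean and std_dev ...

        norm.run();
    }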
@@ -74,7 +90,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std); }; } // namespace arm_compute -#endif /* ARM_COMPUTE_CLNORMALIZEPLANARYUVLAYER_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLNORMALIZEPLANARYUVLAYER_H diff --git a/arm_compute/runtime/CL/functions/CLPadLayer.h b/arm_compute/runtime/CL/functions/CLPadLayer.h index 7f950bcfb3c8b4ae5dc9f581b881bdb59d4d0d51..89e693bd922c1081b94455fbf423f772ac6cf399 100644 --- a/arm_compute/runtime/CL/functions/CLPadLayer.h +++ b/arm_compute/runtime/CL/functions/CLPadLayer.h @@ -76,7 +76,11 @@ public: * @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT, * or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT). */ - void configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT); + void configure(ICLTensor *input, + ICLTensor *output, + const PaddingList &padding, + PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); /** Initialize the function * * @param[in] compile_context The compile context to be used. @@ -88,8 +92,12 @@ public: * @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT, * or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT). */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), - PaddingMode mode = PaddingMode::CONSTANT); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const PaddingList &padding, + PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); /** Static function to check if given info will lead to a valid configuration of @ref CLPadLayer. * @@ -101,7 +109,11 @@ public: * @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT, * or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT). */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLPermute.h b/arm_compute/runtime/CL/functions/CLPermute.h index 8e15da2287a8509b832205a94680ea29fea9a778..7ac0bf6b9c610d343fbe9e5647d73dca28463e43 100644 --- a/arm_compute/runtime/CL/functions/CLPermute.h +++ b/arm_compute/runtime/CL/functions/CLPermute.h @@ -78,7 +78,10 @@ public: * @param[in] output The output tensor. 
Data types supported: Same as @p input * @param[in] perm Permutation vector */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PermutationVector &perm); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const PermutationVector &perm); /** Static function to check if given info will lead to a valid configuration of @ref CLPermute. * * @note Arbitrary permutation vectors are supported with rank not greater than 4 diff --git a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h index 62b6d96ad530a6bd8cd9660d91a6ad83b953aa4e..f3e5cf9bd32367a9697fc95d5350f568bf83091c 100644 --- a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h +++ b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h @@ -84,8 +84,13 @@ public: * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Initialise the kernel's inputs, output and convertion policy. * * @param[in] compile_context The compile context to be used. @@ -100,8 +105,14 @@ public: * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLPixelWiseMultiplication * * @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. @@ -115,8 +126,13 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; @@ -151,7 +167,10 @@ public: * @param[out] output The output tensor, Data types supported: same as @p input1. Number of channels supported: same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
*/ - void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Initialise the kernel's inputs, output. * * @param[in] compile_context The compile context to be used. @@ -162,7 +181,11 @@ public: * @param[out] output The output tensor, Data types supported: same as @p input1. Number of channels supported: same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLComplexPixelWiseMultiplication * * @param[in] input1 An input tensor info. Data types supported: F16/F32. Number of channels supported: 2. @@ -170,7 +193,10 @@ public: * @param[in] output The output tensor info, Data types supported: same as @p input1. Number of channels supported: same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLPooling3dLayer.h b/arm_compute/runtime/CL/functions/CLPooling3dLayer.h index 91c46770da0b8f3e3c1b1e81e5adcc41b2fae2fb..1c691487711301de149a07882a3dec48816b5167 100644 --- a/arm_compute/runtime/CL/functions/CLPooling3dLayer.h +++ b/arm_compute/runtime/CL/functions/CLPooling3dLayer.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_CLPOOLING3DLAYER_H #define ARM_COMPUTE_CLPOOLING3DLAYER_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include @@ -82,7 +81,10 @@ public: * @param[out] output Destination tensor. Data types supported: Same as @p input. * @param[in] pool_info Contains 3d pooling operation information described in @ref Pooling3dLayerInfo. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Pooling3dLayerInfo &pool_info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Pooling3dLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration of @ref CLPooling3dLayer * * @param[in] input Source tensor info. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED. 
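Aside (not part of the patch): a short sketch exercising the CLPixelWiseMultiplication parameters documented above. The shapes and scale are arbitrary examples, and TO_ZERO is used as one of the two rounding modes the comment lists; act_info keeps its default, i.e. no fused activation.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"

    using namespace arm_compute;

    void pixelwise_mul_sketch()
    {
        CLScheduler::get().default_init();

        const TensorInfo info(TensorShape(64U), 1, DataType::F32);
        CLTensor a, b, out;
        a.allocator()->init(info);
        b.allocator()->init(info);
        out.allocator()->init(info);

        CLPixelWiseMultiplication mul;
        mul.configure(&a, &b, &out, /* scale */ 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

        a.allocator()->allocate();
        b.allocator()->allocate();
        out.allocator()->allocate();
        // ... fill a and b ...

        mul.run();
    }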
diff --git a/arm_compute/runtime/CL/functions/CLPoolingLayer.h b/arm_compute/runtime/CL/functions/CLPoolingLayer.h index 2163c168014166790acb1b1dd952b06bb655eb7e..3dbdf8aeea3ac02e60ca7ecf9fc2b8ce44f4fa4d 100644 --- a/arm_compute/runtime/CL/functions/CLPoolingLayer.h +++ b/arm_compute/runtime/CL/functions/CLPoolingLayer.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_CLPOOLINGLAYER_H #define ARM_COMPUTE_CLPOOLINGLAYER_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include @@ -74,7 +73,8 @@ public: * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. */ - void configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices = nullptr); + void + configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices = nullptr); /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. @@ -83,7 +83,11 @@ public: * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices = nullptr); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const PoolingLayerInfo &pool_info, + ICLTensor *indices = nullptr); /** Static function to check if given info will lead to a valid configuration of @ref CLPoolingLayer * * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. @@ -93,7 +97,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices = nullptr); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h b/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h index 9b36c9e4332c813f23235bed6ab0268d4f770fcc..4ede906baadf963ee61ce35997ff533b643a81f0 100644 --- a/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h +++ b/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h @@ -66,7 +66,11 @@ public: * @param[out] output Destination tensor. Output dimensions are [W * H * num_priors * 4, 2]. Data types and layouts supported: same as @p input1 * @param[in] info Prior box layer info. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const PriorBoxLayerInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLPriorBoxLayer * * @param[in] input1 First source tensor info. Data types supported: F32. Data layouts supported: NCHW/NHWC. 
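Aside (not part of the patch): how the CLPoolingLayer signatures reflowed above would typically be called for a plain 2x2 max pool. The PoolingLayerInfo constructor arguments and the NCHW shapes are assumptions for illustration, and the optional indices output is left at its nullptr default.

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"

    using namespace arm_compute;

    void pooling_sketch()
    {
        CLScheduler::get().default_init();

        // 32x32 NCHW source with 8 feature maps; 2x2 max pooling, stride 2, no padding.
        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 8U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));

        const PoolingLayerInfo pool_info(PoolingType::MAX, 2, DataLayout::NCHW, PadStrideInfo(2, 2, 0, 0));

        ARM_COMPUTE_ERROR_THROW_ON(CLPoolingLayer::validate(src.info(), dst.info(), pool_info));

        CLPoolingLayer pool;
        pool.configure(&src, &dst, pool_info); // indices output omitted

        src.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill src ...

        pool.run();
    }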
@@ -76,12 +80,15 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info); private: cl::Buffer _min; cl::Buffer _max; cl::Buffer _aspect_ratios; }; -} // arm_compute +} // namespace arm_compute #endif /* ARM_COMPUTE_CLPRIORBOXLAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h index 1b0b759d74da70f1848d51231f607462ffcc0280..3e76da086fc73326494b82cd99a09c3b0ee14247 100644 --- a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h +++ b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h @@ -32,7 +32,6 @@ #include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h" #include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h" #include "arm_compute/runtime/CL/functions/CLTranspose.h" - #include "arm_compute/runtime/common/LSTMParams.h" namespace arm_compute @@ -127,12 +126,21 @@ public: * projection_threshold (Optional) The clipping threshold for the output from the projection layer, such that values are bound within * [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled. */ - void configure(const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - ICLTensor *cell_state_in, ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output, + void configure(const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, + ICLTensor *output_state_in, + ICLTensor *cell_state_out, + ICLTensor *output_state_out, + ICLTensor *output, const LSTMParams &lstm_params); /** Initialize function's tensors. @@ -177,12 +185,22 @@ public: * projection_threshold (Optional) The clipping threshold for the output from the projection layer, such that values are bound within * [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled. 
*/ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - ICLTensor *cell_state_in, ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output, + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, + ICLTensor *output_state_in, + ICLTensor *cell_state_out, + ICLTensor *output_state_out, + ICLTensor *output, const LSTMParams &lstm_params); /** Static function to check if given info will lead to a valid configuration of @ref CLQLSTMLayer @@ -227,12 +245,21 @@ public: * [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled. * @return a status */ - static Status validate(const ITensorInfo *input, - const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output, + static Status validate(const ITensorInfo *input, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out, + const ITensorInfo *output, const LSTMParams &lstm_params); // Inherited methods overridden: @@ -266,10 +293,18 @@ private: * @param[in] mm_res_info Tensor info to be used to initialize output stage result tensor. 
* */ - void configure_mm(const CLCompileContext &compile_context, CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info, - const ICLTensor *mm_input, const ICLTensor *mm_weights, const ICLTensor *bias, CLTensor *mm_res, - CLTensor *outstage_res, float gemmlowp_scale, - const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info); + void configure_mm(const CLCompileContext &compile_context, + CLGEMMLowpMatrixMultiplyCore &mm, + CLGEMMLowpOutputStage &outstage, + GEMMLowpOutputStageInfo &gemmlowp_info, + const ICLTensor *mm_input, + const ICLTensor *mm_weights, + const ICLTensor *bias, + CLTensor *mm_res, + CLTensor *outstage_res, + float gemmlowp_scale, + const TensorInfo &mm_res_info, + const TensorInfo &outstage_tensor_info); MemoryGroup _memory_group{}; @@ -278,8 +313,8 @@ private: { static constexpr uint32_t max_dimension_supported = 2; - ICLTensor *_src{ nullptr }; - ICLTensor *_dst{ nullptr }; + ICLTensor *_src{nullptr}; + ICLTensor *_dst{nullptr}; size_t _row_size{}; Window _window{}; @@ -368,7 +403,7 @@ private: CLArithmeticAddition _accumulate_projection{}; CLActivationLayer _projection_clip{}; std::array, _layer_norm_count> _layer_norms; - CLCopy _copy_output; + CLCopy _copy_output; TensorCopyKernel _projection_bias_copy{}; TensorCopyKernel _projection_output_to_accumulate_copy{}; @@ -376,21 +411,18 @@ private: TensorCopyKernel _hidden_to_output_copy{}; // Tensor pointers - const ICLTensor *_input_to_input_weights - { - nullptr - }; - const ICLTensor *_recurrent_to_input_weights{ nullptr }; - const ICLTensor *_projection_bias{ nullptr }; - const ICLTensor *_input_to_forget_weights{ nullptr }; - const ICLTensor *_input_to_cell_weights{ nullptr }; - const ICLTensor *_input_to_output_weights{ nullptr }; - const ICLTensor *_recurrent_to_forget_weights{ nullptr }; - const ICLTensor *_recurrent_to_cell_weights{ nullptr }; - const ICLTensor *_recurrent_to_output_weights{ nullptr }; - const ICLTensor *_projection_weights{ nullptr }; - std::array _layer_norm_weights{ {} }; - std::array _layer_norm_bias{ {} }; + const ICLTensor *_input_to_input_weights{nullptr}; + const ICLTensor *_recurrent_to_input_weights{nullptr}; + const ICLTensor *_projection_bias{nullptr}; + const ICLTensor *_input_to_forget_weights{nullptr}; + const ICLTensor *_input_to_cell_weights{nullptr}; + const ICLTensor *_input_to_output_weights{nullptr}; + const ICLTensor *_recurrent_to_forget_weights{nullptr}; + const ICLTensor *_recurrent_to_cell_weights{nullptr}; + const ICLTensor *_recurrent_to_output_weights{nullptr}; + const ICLTensor *_projection_weights{nullptr}; + std::array _layer_norm_weights{{}}; + std::array _layer_norm_bias{{}}; using LayerNormIndexType = typename std::underlying_type::type; inline LayerNormIndexType getGateIndex(LayerNormGate g) @@ -423,78 +455,78 @@ private: return *_layer_norms[getGateIndex(g)]; } - inline void configure_layer_norm(LayerNormGate g, const ICLTensor *in); + inline void configure_layer_norm(LayerNormGate g, const ICLTensor *in); inline static Status validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias); // Temporary tensors - CLTensor _input_to_forget_weights_transposed{ nullptr }; - CLTensor _input_to_cell_weights_transposed{ nullptr }; - CLTensor _input_to_output_weights_transposed{ nullptr }; - CLTensor _input_to_input_weights_transposed{ nullptr }; - CLTensor _recurrent_to_forget_weights_transposed{ nullptr }; - CLTensor _recurrent_to_cell_weights_transposed{ nullptr }; - 
CLTensor _recurrent_to_output_weights_transposed{ nullptr }; - CLTensor _recurrent_to_input_weights_transposed{ nullptr }; - CLTensor _projection_weights_transposed{ nullptr }; - CLTensor _input_to_input_eff_bias{ nullptr }; - CLTensor _recurrent_to_input_eff_bias{ nullptr }; - CLTensor _input_to_forget_eff_bias{ nullptr }; - CLTensor _recurrent_to_forget_eff_bias{ nullptr }; - CLTensor _input_to_cell_eff_bias{ nullptr }; - CLTensor _recurrent_to_cell_eff_bias{ nullptr }; - CLTensor _input_to_output_eff_bias{ nullptr }; - CLTensor _recurrent_to_output_eff_bias{ nullptr }; - CLTensor _projection_reduction_res{ nullptr }; - CLTensor _projection_eff_bias{ nullptr }; - CLTensor _mm_input_to_forget_res{ nullptr }; - CLTensor _mm_recurrent_to_forget_res{ nullptr }; - CLTensor _mul_cell_to_forget_res{ nullptr }; - CLTensor _input_to_forget_outstage_res{ nullptr }; - CLTensor _cell_to_forget_outstage_res{ nullptr }; - CLTensor _recurrent_to_forget_outstage_res{ nullptr }; - CLTensor _forget_gate{ nullptr }; - CLTensor _mm_input_to_cell_res{ nullptr }; - CLTensor _input_to_cell_outstage_res{ nullptr }; - CLTensor _mm_recurrent_to_cell_res{ nullptr }; - CLTensor _recurrent_to_cell_outstage_res{ nullptr }; - CLTensor _cell_gate{ nullptr }; - CLTensor _mul_input_cell_res{ nullptr }; - CLTensor _mm_input_to_input_res{ nullptr }; - CLTensor _input_to_input_outstage_res{ nullptr }; - CLTensor _mm_recurrent_to_input_res{ nullptr }; - CLTensor _mul_cell_to_input_res{ nullptr }; - CLTensor _cell_to_input_outstage_res{ nullptr }; - CLTensor _recurrent_to_input_outstage_res{ nullptr }; - CLTensor _input_gate{ nullptr }; - CLTensor _mm_input_to_output_res{ nullptr }; - CLTensor _input_to_output_outstage_res{ nullptr }; - CLTensor _mm_recurrent_to_output_res{ nullptr }; - CLTensor _mul_cell_to_output_res{ nullptr }; - CLTensor _cell_to_output_outstage_res{ nullptr }; - CLTensor _recurrent_to_output_outstage_res{ nullptr }; - CLTensor _output_gate{ nullptr }; - CLTensor _hidden_mul_res{ nullptr }; - CLTensor _hidden_gate{ nullptr }; - CLTensor _mm_projection_res{ nullptr }; - CLTensor _projection_outstage_res{ nullptr }; - CLTensor _projection_out_res{ nullptr }; - CLTensor _projection_accumulate_res{ nullptr }; - CLTensor _ones{ nullptr }; - std::array _layer_norm_output{ {} }; + CLTensor _input_to_forget_weights_transposed{nullptr}; + CLTensor _input_to_cell_weights_transposed{nullptr}; + CLTensor _input_to_output_weights_transposed{nullptr}; + CLTensor _input_to_input_weights_transposed{nullptr}; + CLTensor _recurrent_to_forget_weights_transposed{nullptr}; + CLTensor _recurrent_to_cell_weights_transposed{nullptr}; + CLTensor _recurrent_to_output_weights_transposed{nullptr}; + CLTensor _recurrent_to_input_weights_transposed{nullptr}; + CLTensor _projection_weights_transposed{nullptr}; + CLTensor _input_to_input_eff_bias{nullptr}; + CLTensor _recurrent_to_input_eff_bias{nullptr}; + CLTensor _input_to_forget_eff_bias{nullptr}; + CLTensor _recurrent_to_forget_eff_bias{nullptr}; + CLTensor _input_to_cell_eff_bias{nullptr}; + CLTensor _recurrent_to_cell_eff_bias{nullptr}; + CLTensor _input_to_output_eff_bias{nullptr}; + CLTensor _recurrent_to_output_eff_bias{nullptr}; + CLTensor _projection_reduction_res{nullptr}; + CLTensor _projection_eff_bias{nullptr}; + CLTensor _mm_input_to_forget_res{nullptr}; + CLTensor _mm_recurrent_to_forget_res{nullptr}; + CLTensor _mul_cell_to_forget_res{nullptr}; + CLTensor _input_to_forget_outstage_res{nullptr}; + CLTensor _cell_to_forget_outstage_res{nullptr}; + CLTensor 
_recurrent_to_forget_outstage_res{nullptr}; + CLTensor _forget_gate{nullptr}; + CLTensor _mm_input_to_cell_res{nullptr}; + CLTensor _input_to_cell_outstage_res{nullptr}; + CLTensor _mm_recurrent_to_cell_res{nullptr}; + CLTensor _recurrent_to_cell_outstage_res{nullptr}; + CLTensor _cell_gate{nullptr}; + CLTensor _mul_input_cell_res{nullptr}; + CLTensor _mm_input_to_input_res{nullptr}; + CLTensor _input_to_input_outstage_res{nullptr}; + CLTensor _mm_recurrent_to_input_res{nullptr}; + CLTensor _mul_cell_to_input_res{nullptr}; + CLTensor _cell_to_input_outstage_res{nullptr}; + CLTensor _recurrent_to_input_outstage_res{nullptr}; + CLTensor _input_gate{nullptr}; + CLTensor _mm_input_to_output_res{nullptr}; + CLTensor _input_to_output_outstage_res{nullptr}; + CLTensor _mm_recurrent_to_output_res{nullptr}; + CLTensor _mul_cell_to_output_res{nullptr}; + CLTensor _cell_to_output_outstage_res{nullptr}; + CLTensor _recurrent_to_output_outstage_res{nullptr}; + CLTensor _output_gate{nullptr}; + CLTensor _hidden_mul_res{nullptr}; + CLTensor _hidden_gate{nullptr}; + CLTensor _mm_projection_res{nullptr}; + CLTensor _projection_outstage_res{nullptr}; + CLTensor _projection_out_res{nullptr}; + CLTensor _projection_accumulate_res{nullptr}; + CLTensor _ones{nullptr}; + std::array _layer_norm_output{{}}; inline CLTensor &get_layer_norm_output(LayerNormGate g) { return _layer_norm_output[getGateIndex(g)]; } - bool _is_prepared{ false }; - bool _has_cifg{ false }; - bool _has_cell_clipping{ false }; - bool _has_projection{ false }; - bool _has_projection_clipping{ false }; - bool _has_peephole{ false }; - bool _has_layer_norm{ false }; - bool _projection_tensor_copy_required{ false }; + bool _is_prepared{false}; + bool _has_cifg{false}; + bool _has_cell_clipping{false}; + bool _has_projection{false}; + bool _has_projection_clipping{false}; + bool _has_peephole{false}; + bool _has_layer_norm{false}; + bool _projection_tensor_copy_required{false}; }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLQLSTMLAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLRNNLayer.h b/arm_compute/runtime/CL/functions/CLRNNLayer.h index 2b3b35e37d9de22b9963b8f35f9a5fb32e47ecab..a8d835d04d472a686865741749ff3d9d6391404b 100644 --- a/arm_compute/runtime/CL/functions/CLRNNLayer.h +++ b/arm_compute/runtime/CL/functions/CLRNNLayer.h @@ -24,12 +24,12 @@ #ifndef ARM_COMPUTE_CLRNN_LAYER_H #define ARM_COMPUTE_CLRNN_LAYER_H -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" #include "arm_compute/runtime/CL/functions/CLActivationLayer.h" #include "arm_compute/runtime/CL/functions/CLCopy.h" #include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h" #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h" #include "arm_compute/runtime/CL/functions/CLGEMM.h" +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" #include @@ -69,7 +69,13 @@ public: * @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types supported: Same as @p input * @param[in] info Activation layer parameter. */ - void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, ICLTensor *output, ActivationLayerInfo &info); + void configure(const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *recurrent_weights, + const ICLTensor *bias, + ICLTensor *hidden_state, + ICLTensor *output, + ActivationLayerInfo &info); /** Initialize the function * * @param[in] compile_context The compile context to be used. 
@@ -81,8 +87,14 @@ public: * @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types supported: Same as @p input * @param[in] info Activation layer parameter. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, - ICLTensor *output, ActivationLayerInfo &info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *recurrent_weights, + const ICLTensor *bias, + ICLTensor *hidden_state, + ICLTensor *output, + ActivationLayerInfo &info); /** Initialize the function * * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data types supported: F16/F32 @@ -95,7 +107,12 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state, const ITensorInfo *output, + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *recurrent_weights, + const ITensorInfo *bias, + const ITensorInfo *hidden_state, + const ITensorInfo *output, const ActivationLayerInfo &info); // Inherited methods overridden: @@ -114,5 +131,5 @@ private: CLTensor _add_output; bool _is_prepared; }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_CLRNN_LAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLROIAlignLayer.h b/arm_compute/runtime/CL/functions/CLROIAlignLayer.h index 1eaea1b2979181c095d948d6de52bf59c1a9208a..14d34767113cf916cf427a3c478456127e020e6f 100644 --- a/arm_compute/runtime/CL/functions/CLROIAlignLayer.h +++ b/arm_compute/runtime/CL/functions/CLROIAlignLayer.h @@ -68,7 +68,8 @@ public: * @note The z dimensions of @p output tensor and @p input tensor must be the same. * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. */ - void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info); + void + configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info); /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. @@ -84,7 +85,11 @@ public: * @note The z dimensions of @p output tensor and @p input tensor must be the same. * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration of @ref CLROIAlignLayer * * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. 
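
For reference, a minimal usage sketch of the CLRNNLayer::configure overload documented above. The helper name, the shapes (input_size 32, num_units 16, batch_size 4) and the TANH activation are illustrative assumptions, not taken from the patch; the call sequence follows the usual ACL configure/allocate/run pattern.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLRNNLayer.h"

    using namespace arm_compute;

    void rnn_layer_sketch() // hypothetical helper
    {
        CLScheduler::get().default_init();

        CLTensor input, weights, recurrent_weights, bias, hidden_state, output;
        input.allocator()->init(TensorInfo(TensorShape(32U, 4U), 1, DataType::F32));              // [input_size, batch_size]
        weights.allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::F32));           // [input_size, num_units]
        recurrent_weights.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32)); // [num_units, num_units]
        bias.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));                   // [num_units]
        hidden_state.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));       // [num_units, batch_size]
        output.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));             // [num_units, batch_size]

        ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::TANH); // configure() takes a non-const reference

        CLRNNLayer rnn;
        rnn.configure(&input, &weights, &recurrent_weights, &bias, &hidden_state, &output, act);

        for (auto *t : {&input, &weights, &recurrent_weights, &bias, &hidden_state, &output})
        {
            t->allocator()->allocate();
        }

        rnn.run();                 // hidden_state is updated in place as well as copied to output
        CLScheduler::get().sync();
    }
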
@@ -100,7 +105,10 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info); }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLROIALIGNLAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h b/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h index 151586a1f6cae3ac93ff963b17da0e1558b8ba6f..86294596d24f621f8737b7cbd5d3cb4449ddbdb3 100644 --- a/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h +++ b/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h @@ -66,7 +66,8 @@ public: * @note The z dimensions of @p output tensor and @p input tensor must be the same. * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. */ - void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info); + void + configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info); /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. @@ -81,7 +82,11 @@ public: * @note The z dimensions of @p output tensor and @p input tensor must be the same. * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, const ICLTensor *output, const ROIPoolingLayerInfo &pool_info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *rois, + const ICLTensor *output, + const ROIPoolingLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration of @ref CLROIPoolingLayer * @@ -97,7 +102,10 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info); }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLROIPOOLINGLAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLRange.h b/arm_compute/runtime/CL/functions/CLRange.h index fbce05162cb7369ffa91e88940ada52bc7ef8c96..ed665bc39856b2d11110b4c956b18ba3dd0d83d6 100644 --- a/arm_compute/runtime/CL/functions/CLRange.h +++ b/arm_compute/runtime/CL/functions/CLRange.h @@ -73,7 +73,8 @@ public: * @param[in] end The ending (not including) value of the sequence. * @param[in] step The gap between each pair of values in the sequence. Default is 1. */ - void configure(const CLCompileContext &compile_context, ICLTensor *output, float start, float end, float step = 1.f); + void + configure(const CLCompileContext &compile_context, ICLTensor *output, float start, float end, float step = 1.f); /** Static function to check if given info will lead to a valid configuration of @ref CLRange * * @param[in] output Output tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32. 
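
A small usage sketch of the CLRange API documented above, under assumed values: start 0, end 10, step 2, which yields ceil((end - start) / step) = 5 elements. The helper name is illustrative.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLRange.h"

    using namespace arm_compute;

    void range_sketch() // hypothetical helper
    {
        CLScheduler::get().default_init();

        CLTensor out;
        // ceil((10 - 0) / 2) = 5 elements: 0, 2, 4, 6, 8
        out.allocator()->init(TensorInfo(TensorShape(5U), 1, DataType::F32));

        CLRange range;
        range.configure(&out, /*start=*/0.f, /*end=*/10.f, /*step=*/2.f);

        out.allocator()->allocate();
        range.run();
        CLScheduler::get().sync();
    }
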
diff --git a/arm_compute/runtime/CL/functions/CLReduceMean.h b/arm_compute/runtime/CL/functions/CLReduceMean.h index 1ce088b2ce8356df4aa905db956a38ca2853dd31..640fe7cf1b85771d600ff202cf277daedf52f92b 100644 --- a/arm_compute/runtime/CL/functions/CLReduceMean.h +++ b/arm_compute/runtime/CL/functions/CLReduceMean.h @@ -24,12 +24,12 @@ #ifndef ARM_COMPUTE_CL_REDUCE_MEAN_H #define ARM_COMPUTE_CL_REDUCE_MEAN_H -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" #include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h" #include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h" #include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h" #include "arm_compute/runtime/CL/functions/CLReductionOperation.h" #include "arm_compute/runtime/CL/functions/CLReshapeLayer.h" +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" #include "arm_compute/runtime/IMemoryManager.h" namespace arm_compute @@ -74,7 +74,11 @@ public: * @param[in] keep_dims If positive, retains reduced dimensions with length 1. * @param[out] output Destination tensor. Data type supported: Same as @p input */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + const Coordinates &reduction_axis, + bool keep_dims, + ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLReduceMean * @@ -85,7 +89,8 @@ public: * * @return A status */ - static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output); + static Status + validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLReductionOperation.h b/arm_compute/runtime/CL/functions/CLReductionOperation.h index 2245735b6203cbbaba95d0589159355a1eb3f811..80068ac35c6eb8eb830cea1159df0ac320c27123 100644 --- a/arm_compute/runtime/CL/functions/CLReductionOperation.h +++ b/arm_compute/runtime/CL/functions/CLReductionOperation.h @@ -80,7 +80,8 @@ public: * @param[in] op Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX * @param[in] keep_dims (Optional) Whether to keep the reduced dimension after the operation. Defaults to true. */ - void configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims = true); + void + configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims = true); /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. @@ -90,7 +91,12 @@ public: * @param[in] op Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX * @param[in] keep_dims (Optional) Whether to keep the reduced dimension after the operation. Defaults to true. */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims = true); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op, + bool keep_dims = true); /** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperation. 
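
A usage sketch for the CLReduceMean interface above; the 4x5x6 input, the reduction over axes 0 and 1, and keep_dims = false are assumptions chosen to show how the output shape collapses. The non-compile-context configure overload is assumed to match the validate signature shown.

    #include "arm_compute/core/Coordinates.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLReduceMean.h"

    using namespace arm_compute;

    void reduce_mean_sketch() // hypothetical helper; assumes CLScheduler::get().default_init() was called
    {
        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(4U, 5U, 6U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(6U), 1, DataType::F32)); // axes 0 and 1 reduced away

        const Coordinates reduction_axis(0, 1);

        CLReduceMean reduce_mean;
        reduce_mean.configure(&src, reduction_axis, /*keep_dims=*/false, &dst);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        reduce_mean.run();
    }
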
* @@ -102,7 +108,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims = true); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + unsigned int axis, + ReductionOperation op, + bool keep_dims = true); // Inherited methods overridden: void run() override; @@ -118,4 +128,4 @@ private: bool _is_reshape_required; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_CLREDUCTIONOPERATION_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_CLREDUCTIONOPERATION_H */ diff --git a/arm_compute/runtime/CL/functions/CLReshapeLayer.h b/arm_compute/runtime/CL/functions/CLReshapeLayer.h index 7346b65e9ba67d017a82ca39104695d93c3a5d1e..dad90e6ba96e22d6f61dda1fd24cb02c2badf719 100644 --- a/arm_compute/runtime/CL/functions/CLReshapeLayer.h +++ b/arm_compute/runtime/CL/functions/CLReshapeLayer.h @@ -26,6 +26,7 @@ #include "arm_compute/runtime/CL/ICLOperator.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" + #include namespace arm_compute diff --git a/arm_compute/runtime/CL/functions/CLReverse.h b/arm_compute/runtime/CL/functions/CLReverse.h index 94c63ca92d52982b2de9c5e3e92d78d3f00c9264..46229540b481ed0a975c17c5e6bb21ab0a727c6f 100644 --- a/arm_compute/runtime/CL/functions/CLReverse.h +++ b/arm_compute/runtime/CL/functions/CLReverse.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CLREVERSE_H -#define ARM_COMPUTE_CLREVERSE_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLREVERSE_H +#define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLREVERSE_H #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" @@ -45,30 +45,38 @@ public: * Valid data type configurations: * |src0 |src1 |dst | * |:--------------|:--------------|:--------------| - * |All |U32 |All | + * |All |U32, S32 |All | * - * @param[in] input Input tensor. Data types supported: All. - * @param[out] output Output tensor. Data type supported: Same as @p input - * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32 + * @param[in] input Input tensor. Data types supported: All. + * @param[out] output Output tensor. Data type supported: Same as @p input + * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32/S32 + * @param[in] use_inverted_axis Reverse ACL axis indices convention, if true, (inverted)axis = (tensor_rank - 1) - axis */ - void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis); + void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis, bool use_inverted_axis); /** Initialize the function * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor. Data types supported: All. - * @param[out] output Output tensor. Data type supported: Same as @p input - * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32 + * @param[in] compile_context The compile context to be used. + * @param[in] input Input tensor. Data types supported: All. + * @param[out] output Output tensor. Data type supported: Same as @p input + * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. 
Data type supported: U32/S32 + * @param[in] use_inverted_axis Reverse ACL axis indices convention, if true, (inverted)axis = (tensor_rank - 1) - axis */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *axis, + bool use_inverted_axis); /** Static function to check if given info will lead to a valid configuration of @ref CLReverseKernel * - * @param[in] input Input tensor info. Data types supported: All. - * @param[in] output Output tensor info. Data type supported: Same as @p input - * @param[in] axis Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32 + * @param[in] input Input tensor info. Data types supported: All. + * @param[in] output Output tensor info. Data type supported: Same as @p input + * @param[in] axis Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32/S32 + * @param[in] use_inverted_axis Reverse ACL axis indices convention, if true, (inverted)axis = (tensor_rank - 1) - axis * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis); }; } // namespace arm_compute -#endif /* ARM_COMPUTE_CLREVERSE_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLREVERSE_H diff --git a/arm_compute/runtime/CL/functions/CLScale.h b/arm_compute/runtime/CL/functions/CLScale.h index ddb4a2353134c9404ea1f851dd22ad9959f6a435..5c3824eb58aad8a99c16119b00429c1061f68826 100644 --- a/arm_compute/runtime/CL/functions/CLScale.h +++ b/arm_compute/runtime/CL/functions/CLScale.h @@ -83,7 +83,10 @@ public: * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. * @param[in] info @ref ScaleKernelInfo descriptor to be used to configure */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const ScaleKernelInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLScale * diff --git a/arm_compute/runtime/CL/functions/CLSelect.h b/arm_compute/runtime/CL/functions/CLSelect.h index 8b1e6b201944acee50b27d3a6ec030389f8e5a51..effcb58313f2d0cfefbaa0408a7471cca7ace033 100644 --- a/arm_compute/runtime/CL/functions/CLSelect.h +++ b/arm_compute/runtime/CL/functions/CLSelect.h @@ -62,7 +62,11 @@ public: * @param[in] y Second input tensor. Data types supported: Same as @p x * @param[out] output Output tensor. Data types supported: Same as @p x. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output); + void configure(const CLCompileContext &compile_context, + const ICLTensor *c, + const ICLTensor *x, + const ICLTensor *y, + ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLSelect * * @param[in] c Condition input tensor. Data types supported: U8. 
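
To make the new use_inverted_axis flag on CLReverse above concrete, a small sketch under assumed shapes: with the flag set, an axis index of 0 stored in the axis tensor is remapped to (tensor_rank - 1) - 0, i.e. the outermost dimension in framework-style (TF/NNEF) numbering.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLReverse.h"

    using namespace arm_compute;

    void reverse_sketch() // hypothetical helper; assumes CLScheduler::get().default_init() was called
    {
        CLTensor src, axis, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 4U, 2U), 1, DataType::F32));
        axis.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::S32)); // one index per dimension to reverse
        dst.allocator()->init(TensorInfo(TensorShape(8U, 4U, 2U), 1, DataType::F32));

        CLReverse reverse;
        // With use_inverted_axis == true, an axis value of 0 (filled in at run time)
        // is interpreted as (rank - 1) - 0 = 2, i.e. the slowest-moving dimension.
        reverse.configure(&src, &dst, &axis, /*use_inverted_axis=*/true);
    }
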
diff --git a/arm_compute/runtime/CL/functions/CLSlice.h b/arm_compute/runtime/CL/functions/CLSlice.h index 297bcd86fed8563f73a00d04afa9a3191fb41e28..7a274ded7212d66bb23f1344d892de47ab6775bf 100644 --- a/arm_compute/runtime/CL/functions/CLSlice.h +++ b/arm_compute/runtime/CL/functions/CLSlice.h @@ -84,7 +84,11 @@ public: * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input). * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input). */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Coordinates &starts, + const Coordinates &ends); /** Static function to check if given info will lead to a valid configuration of @ref CLSlice * @@ -100,7 +104,8 @@ public: * * @return A status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends); // Inherited methods overridden: void run() override; @@ -129,7 +134,11 @@ public: * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input). * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input). */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const Coordinates &starts, const Coordinates &ends); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends); /** Static function to check if given info will lead to a valid configuration of @ref CLSlice * @@ -145,7 +154,8 @@ public: * * @return A status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends); }; } // namespace experimental } // namespace arm_compute diff --git a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h index 687f8ff6d87e0dc43afe4f009ef58560798ed739..68541e35c5b46975ea4dbe4b8a4de0b56f8163ce 100644 --- a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h +++ b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
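
A sketch of the CLSlice contract above, with assumed shapes: starts and ends carry one entry per input dimension and ends follow the usual exclusive-end convention. The plain (non-compile-context) configure overload is assumed to mirror the one shown.

    #include "arm_compute/core/Coordinates.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLSlice.h"

    using namespace arm_compute;

    void slice_sketch() // hypothetical helper; assumes CLScheduler::get().default_init() was called
    {
        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(6U, 4U), 1, DataType::F32));

        const Coordinates starts(1, 0); // one entry per input dimension
        const Coordinates ends(5, 4);   // exclusive ends -> a 4x4 region

        CLSlice slice;
        slice.configure(&src, &dst, starts, ends); // dst info is auto-initialised by configure()
    }
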
*/ -#ifndef ARM_COMPUTE_CLSOFTMAXLAYER_H -#define ARM_COMPUTE_CLSOFTMAXLAYER_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLSOFTMAXLAYER_H +#define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLSOFTMAXLAYER_H #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" @@ -43,12 +43,6 @@ class CLCompileContext; * * Log Softmax is calculated by : * @f[ out = (x - max(x) * beta) - log(\sum{e^{x - max(x) * beta}}) @f] - * - * This function runs the following operators/kernels: - * -# If axis is not 0: - * -# @ref opencl::ClPermute - * -# @ref opencl::kernels::ClLogits1DNormKernel - * -# @ref opencl::kernels::ClLogits1DMaxShiftExpSumKernel */ template class CLSoftmaxLayerGeneric : public IFunction @@ -87,7 +81,11 @@ public: * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and * axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0 */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta = 1.0f, int32_t axis = 0); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + float beta = 1.0f, + int32_t axis = 0); /** Static function to check if given info will lead to a valid configuration of @ref CLSoftmaxLayer * * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 for Softmax and F16/F32 for Log Softmax @@ -111,4 +109,4 @@ private: using CLSoftmaxLayer = CLSoftmaxLayerGeneric; using CLLogSoftmaxLayer = CLSoftmaxLayerGeneric; } // namespace arm_compute -#endif /* ARM_COMPUTE_CLSOFTMAXLAYER_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLSOFTMAXLAYER_H diff --git a/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h b/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h index 304a74137e1671d89a0946aedf1388a6a6803c2b..191f4863d5159c81bdedbd41813e56cf2beb64df 100644 --- a/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h +++ b/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h @@ -83,7 +83,11 @@ public: * @param[in] paddings 2-D tensor with shape [2, M] (First dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32 * @param[out] output Tensor output. Data types supported: same as @p input */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + const ICLTensor *paddings, + ICLTensor *output); /** Set the input and output tensors. (Static block shape and paddings) * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -93,7 +97,12 @@ public: * @param[in] padding_right The padding at the end of every dimension of the output tensor. * @param[out] output Tensor output. Data types supported: same as @p input */ - void configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output); + void configure(const ICLTensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output); /** Set the input and output tensors. (Static block shape and paddings) * * @param[in] compile_context The compile context to be used. 
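
The axis parameter documented for CLSoftmaxLayer above is easiest to see with the example from the comment itself: for a 4x5x6 input and axis = 1, the function normalises 4x6 = 24 vectors of length 5. A sketch under those assumptions (the plain configure overload with default beta is assumed to match the one shown):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"

    using namespace arm_compute;

    void softmax_sketch() // hypothetical helper; assumes CLScheduler::get().default_init() was called
    {
        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(4U, 5U, 6U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(4U, 5U, 6U), 1, DataType::F32));

        CLSoftmaxLayer softmax;
        softmax.configure(&src, &dst, /*beta=*/1.0f, /*axis=*/1); // normalise along dimension 1 only

        // CLLogSoftmaxLayer exposes the same interface for the log-softmax variant.
    }
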
@@ -104,8 +113,13 @@ public: * @param[in] padding_right The padding at the end of every dimension of the output tensor. * @param[out] output Tensor output. Data types supported: same as @p input */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, - ICLTensor *output); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayer * * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All. @@ -115,7 +129,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output); /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayer (Static block shape and paddings) * * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All. @@ -127,7 +144,12 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + const ITensorInfo *output); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h b/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h index 8a47e95f9d491306dd28e2cf655c46f520de6d38..1b0dfc2b7407872bb29331145fec77cfe24f2744 100644 --- a/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h +++ b/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h @@ -75,7 +75,8 @@ public: * @param[out] output Tensor output. Data types supported: same as @p input * @param[in] block_shape Block shape value. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape); + void + configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape); /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToDepthLayer. * * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All. 
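
A sketch of the static block-shape CLSpaceToBatchLayer overload above. The 8x8x3x1 NCHW input, the 2x2 block shape and the zero padding are assumptions; with them the spatial dimensions shrink by the block shape and the batch grows by block_shape_x * block_shape_y.

    #include "arm_compute/core/Size2D.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h"

    using namespace arm_compute;

    void space_to_batch_sketch() // hypothetical helper; assumes CLScheduler::get().default_init() was called
    {
        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U, 1U), 1, DataType::F32)); // W, H, C, N
        dst.allocator()->init(TensorInfo(TensorShape(4U, 4U, 3U, 4U), 1, DataType::F32)); // W/2, H/2, C, N*2*2

        CLSpaceToBatchLayer space_to_batch;
        space_to_batch.configure(&src, /*block_shape_x=*/2, /*block_shape_y=*/2,
                                 /*padding_left=*/Size2D(0, 0), /*padding_right=*/Size2D(0, 0), &dst);
    }
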
diff --git a/arm_compute/runtime/CL/functions/CLSplit.h b/arm_compute/runtime/CL/functions/CLSplit.h index 86c7bdde7ddf97b4deac56dc2f7fe7285407874a..8d1375521268de9dbb87d2f9fe7afbab069e05ba 100644 --- a/arm_compute/runtime/CL/functions/CLSplit.h +++ b/arm_compute/runtime/CL/functions/CLSplit.h @@ -26,7 +26,6 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" - #include "arm_compute/runtime/CL/functions/CLSlice.h" #include "arm_compute/runtime/CPP/functions/CPPSplit.h" #include "arm_compute/runtime/IFunction.h" diff --git a/arm_compute/runtime/CL/functions/CLStackLayer.h b/arm_compute/runtime/CL/functions/CLStackLayer.h index 54c903a706ac546a44243ada0bff8a39c496fd17..18745c8a4ff09b841a48a66f210abf83dc00ea3e 100644 --- a/arm_compute/runtime/CL/functions/CLStackLayer.h +++ b/arm_compute/runtime/CL/functions/CLStackLayer.h @@ -85,7 +85,10 @@ public: * Negative values wrap around * @param[out] output Output tensor. Data types supported: Same as @p input. */ - void configure(const CLCompileContext &compile_context, const std::vector &input, int axis, ICLTensor *output); + void configure(const CLCompileContext &compile_context, + const std::vector &input, + int axis, + ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLStackLayerKernel * * @note Supported input tensor rank: up to 4 diff --git a/arm_compute/runtime/CL/functions/CLStridedSlice.h b/arm_compute/runtime/CL/functions/CLStridedSlice.h index 6fab0c01866b22bea491dd3a5464486180e039e0..b1edc2481cdbb610cceddc8d2e1cd72e555e030e 100644 --- a/arm_compute/runtime/CL/functions/CLStridedSlice.h +++ b/arm_compute/runtime/CL/functions/CLStridedSlice.h @@ -74,9 +74,14 @@ public: * @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved. */ - void configure(const ICLTensor *input, ICLTensor *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0); + void configure(const ICLTensor *input, + ICLTensor *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask = 0, + int32_t end_mask = 0, + int32_t shrink_axis_mask = 0); /** Configure kernel * * @note Supported tensor rank: up to 4 @@ -92,9 +97,15 @@ public: * @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved. 
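
A sketch of the CLStackLayer interface touched above: stacking three equally shaped tensors along a new leading dimension. The 4x4 inputs and axis 0 are illustrative assumptions.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLStackLayer.h"

    #include <vector>

    using namespace arm_compute;

    void stack_sketch() // hypothetical helper; assumes CLScheduler::get().default_init() was called
    {
        CLTensor a, b, c, stacked;
        for (auto *t : {&a, &b, &c})
        {
            t->allocator()->init(TensorInfo(TensorShape(4U, 4U), 1, DataType::F32));
        }
        stacked.allocator()->init(TensorInfo(TensorShape(3U, 4U, 4U), 1, DataType::F32)); // new dimension of size 3 at axis 0

        std::vector<ICLTensor *> inputs = {&a, &b, &c};

        CLStackLayer stack;
        stack.configure(inputs, /*axis=*/0, &stacked);
    }
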
*/ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask = 0, + int32_t end_mask = 0, + int32_t shrink_axis_mask = 0); /** Static function to check if given info will lead to a valid configuration of @ref CLStridedSlice * @@ -110,9 +121,14 @@ public: * @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved. */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask = 0, + int32_t end_mask = 0, + int32_t shrink_axis_mask = 0); // Inherited methods overridden: void run() override; @@ -143,9 +159,15 @@ public: * @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask = 0, + int32_t end_mask = 0, + int32_t shrink_axis_mask = 0); /** Static function to check if given info will lead to a valid configuration of @ref CLStridedSlice * @@ -161,9 +183,14 @@ public: * @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved. 
*/ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask = 0, + int32_t end_mask = 0, + int32_t shrink_axis_mask = 0); }; } // namespace experimental } // namespace arm_compute diff --git a/arm_compute/runtime/CL/functions/CLTile.h b/arm_compute/runtime/CL/functions/CLTile.h index c266adbbd49bc64b81381ef602183547723b5a49..4c414670a57e1b65b2ddcc9cbf397e5d24e78d51 100644 --- a/arm_compute/runtime/CL/functions/CLTile.h +++ b/arm_compute/runtime/CL/functions/CLTile.h @@ -59,7 +59,10 @@ public: * @param[in] multiples Contains the number of times the input tensor should be replicated on the given dimension. * @param[out] output Destination tensor. Same as @p input */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Multiples &multiples); /** Static function to check if given info will lead to a valid configuration of @ref CLTile * * @param[in] input Source tensor info. Data type supported: All. diff --git a/arm_compute/runtime/CL/functions/CLTranspose.h b/arm_compute/runtime/CL/functions/CLTranspose.h index a866aeabaa866ded6c62f0ade4b904fef0e48749..9dc977fbeb60b3aa6a48649b0358bd90572b1e20 100644 --- a/arm_compute/runtime/CL/functions/CLTranspose.h +++ b/arm_compute/runtime/CL/functions/CLTranspose.h @@ -88,6 +88,6 @@ private: struct Impl; std::unique_ptr _impl; }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_CLTRANSPOSE_H */ diff --git a/arm_compute/runtime/CL/functions/CLUnstack.h b/arm_compute/runtime/CL/functions/CLUnstack.h index 32ad439b702a0d819845dd820606cc31df545937..a6eee4317701c23a8feed20aa8d314b895089eb1 100644 --- a/arm_compute/runtime/CL/functions/CLUnstack.h +++ b/arm_compute/runtime/CL/functions/CLUnstack.h @@ -26,9 +26,8 @@ #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/runtime/CL/functions/CLStridedSlice.h" +#include "arm_compute/runtime/IFunction.h" #include @@ -72,7 +71,10 @@ public: * @param[in] axis The axis to unstack along. Valid values are [-R,R) where R is the input's rank. Negative values wrap around. * */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const std::vector &output_vector, int axis); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const std::vector &output_vector, + int axis); /** Static function to check if given info will lead to a valid configuration of @ref CLUnstack * * @param[in] input Input tensor info. Data type supported: All. diff --git a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h index adf5f18626d8e4327c050930a70f7c2415fd8ee1..efea9a155069d545d16acd2ae6d644a401643745 100644 --- a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h @@ -84,8 +84,13 @@ public: * @param[in] enable_fast_math (Optional) Enable fast math computation. 
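
The begin/end/shrink masks on CLStridedSlice above follow the usual bit-per-dimension convention. A sketch with assumed shapes, where bit 1 of begin_mask and end_mask tells the function to ignore starts[1]/ends[1] and take the full extent of dimension 1:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLStridedSlice.h"

    using namespace arm_compute;

    void strided_slice_sketch() // hypothetical helper; assumes CLScheduler::get().default_init() was called
    {
        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(10U, 5U), 1, DataType::F32));

        const Coordinates starts(0, 0);
        const Coordinates ends(10, 5);
        const BiStrides   strides(2, 1); // every other element along dimension 0

        CLStridedSlice strided_slice;
        strided_slice.configure(&src, &dst, starts, ends, strides,
                                /*begin_mask=*/0x2, /*end_mask=*/0x2, /*shrink_axis_mask=*/0);
        // dst info is auto-initialised to a 5x5 tensor.
    }
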
In case this flag were set, the function could dispatch the fastest implementation * available which may introduce a drop of accuracy as well. Default is false */ - void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + void configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); /** Set the input and output tensors. * * @note: This function only works with 3x3,3x1,1x3,5x5,5x1,1x5,7x1 and 1x7 kernels along with unit strides for both NCHW and NHWC data layout @@ -104,8 +109,14 @@ public: * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation * available which may introduce a drop of accuracy as well. Default is false */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); /** Static function to check if given info will lead to a valid configuration of @ref CLWinogradConvolutionLayer * * @note: This function only works with 3x3,3x1,1x3,5x5,5x1 and 1x5 kernels along with unit strides for both NCHW and NHWC data layout @@ -125,8 +136,13 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/tuners/CLTuningParametersList.h b/arm_compute/runtime/CL/tuners/CLTuningParametersList.h index 69572c98d2b234dd5edbeff2eedcd9b7e7f7fd98..5f6d12b4a7931fb6802ad59dc606562488261171 100644 --- a/arm_compute/runtime/CL/tuners/CLTuningParametersList.h +++ b/arm_compute/runtime/CL/tuners/CLTuningParametersList.h @@ -29,6 +29,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/runtime/CL/CLTunerTypes.h" #include "arm_compute/runtime/CL/CLTuningParams.h" + #include "support/ToolchainSupport.h" #include diff --git a/arm_compute/runtime/CPP/CPPScheduler.h b/arm_compute/runtime/CPP/CPPScheduler.h index a5932d630131a13a7193563fada4680cb130a778..7f70b5fa1f4f68f42e9c429129437a9d82eecafc 100644 --- a/arm_compute/runtime/CPP/CPPScheduler.h +++ b/arm_compute/runtime/CPP/CPPScheduler.h @@ -55,10 +55,10 @@ public: static CPPScheduler &get(); // Inherited functions overridden - void set_num_threads(unsigned int num_threads) override; - void set_num_threads_with_affinity(unsigned int 
num_threads, BindFunc func) override; + void set_num_threads(unsigned int num_threads) override; + void set_num_threads_with_affinity(unsigned int num_threads, BindFunc func) override; unsigned int num_threads() const override; - void schedule(ICPPKernel *kernel, const Hints &hints) override; + void schedule(ICPPKernel *kernel, const Hints &hints) override; void schedule_op(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) override; protected: diff --git a/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h b/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h index 58b4bf25cc3f4d7be70775c896ea002a1e98c6cd..9af4ed6208394830c78b74d9af92ff6ec139863a 100644 --- a/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h +++ b/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h @@ -61,8 +61,16 @@ public: * @param[in] keeps_size (Optional) Number of filtered indices per class tensor of size [num_classes]. Data types supported: U32. * @param[in] info (Optional) BoxNMSLimitInfo information. */ - void configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes, - ITensor *batch_splits_out = nullptr, ITensor *keeps = nullptr, ITensor *keeps_size = nullptr, const BoxNMSLimitInfo info = BoxNMSLimitInfo()); + void configure(const ITensor *scores_in, + const ITensor *boxes_in, + const ITensor *batch_splits_in, + ITensor *scores_out, + ITensor *boxes_out, + ITensor *classes, + ITensor *batch_splits_out = nullptr, + ITensor *keeps = nullptr, + ITensor *keeps_size = nullptr, + const BoxNMSLimitInfo info = BoxNMSLimitInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CPPDetectionOutputLayer * * @param[in] scores_in The scores input tensor of size [count, num_classes]. 
Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 @@ -81,9 +89,16 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *scores_in, const ITensorInfo *boxes_in, const ITensorInfo *batch_splits_in, const ITensorInfo *scores_out, const ITensorInfo *boxes_out, - const ITensorInfo *classes, - const ITensorInfo *batch_splits_out = nullptr, const ITensorInfo *keeps = nullptr, const ITensorInfo *keeps_size = nullptr, const BoxNMSLimitInfo info = BoxNMSLimitInfo()); + static Status validate(const ITensorInfo *scores_in, + const ITensorInfo *boxes_in, + const ITensorInfo *batch_splits_in, + const ITensorInfo *scores_out, + const ITensorInfo *boxes_out, + const ITensorInfo *classes, + const ITensorInfo *batch_splits_out = nullptr, + const ITensorInfo *keeps = nullptr, + const ITensorInfo *keeps_size = nullptr, + const BoxNMSLimitInfo info = BoxNMSLimitInfo()); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h b/arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h index f2c7ccccc5711188af284490c615ba976adfb696..dc8c8e76baa7a8208f99dafc4b362b4e85ce3f85 100644 --- a/arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h +++ b/arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_CPP_DETECTION_OUTPUT_LAYER_H #define ARM_COMPUTE_CPP_DETECTION_OUTPUT_LAYER_H -#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h" namespace arm_compute { @@ -52,7 +51,11 @@ public: * * @note Output contains all the detections. Of those, only the ones selected by the valid region are valid. */ - void configure(const ITensor *input_loc, const ITensor *input_conf, const ITensor *input_priorbox, ITensor *output, DetectionOutputLayerInfo info = DetectionOutputLayerInfo()); + void configure(const ITensor *input_loc, + const ITensor *input_conf, + const ITensor *input_priorbox, + ITensor *output, + DetectionOutputLayerInfo info = DetectionOutputLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CPPDetectionOutputLayer * * @param[in] input_loc The mbox location input tensor info. Data types supported: F32. 
@@ -63,7 +66,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, + static Status validate(const ITensorInfo *input_loc, + const ITensorInfo *input_conf, + const ITensorInfo *input_priorbox, + const ITensorInfo *output, DetectionOutputLayerInfo info = DetectionOutputLayerInfo()); // Inherited methods overridden: void run() override; @@ -82,12 +88,12 @@ private: int _num_priors; int _num; - std::vector _all_location_predictions; + std::vector _all_location_predictions; std::vector>> _all_confidence_scores; - std::vector _all_prior_bboxes; - std::vector> _all_prior_variances; - std::vector _all_decode_bboxes; - std::vector>> _all_indices; + std::vector _all_prior_bboxes; + std::vector> _all_prior_variances; + std::vector _all_decode_bboxes; + std::vector>> _all_indices; }; } // namespace arm_compute #endif /* ARM_COMPUTE_CPP_DETECTION_OUTPUT_LAYER_H */ diff --git a/arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h b/arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h index 94248ff314a597f3410fa2bea385e80aeeb45d01..a40e4f9ecb2bbdebe25700f551849150c81ef111 100644 --- a/arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h +++ b/arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h @@ -24,10 +24,9 @@ #ifndef ARM_COMPUTE_CPP_DETECTION_POSTPROCESS_H #define ARM_COMPUTE_CPP_DETECTION_POSTPROCESS_H -#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h" - #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h" +#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" @@ -65,8 +64,14 @@ public: * * @note Output contains all the detections. Of those, only the ones selected by the valid region are valid. */ - void configure(const ITensor *input_box_encoding, const ITensor *input_score, const ITensor *input_anchors, - ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo()); + void configure(const ITensor *input_box_encoding, + const ITensor *input_score, + const ITensor *input_anchors, + ITensor *output_boxes, + ITensor *output_classes, + ITensor *output_scores, + ITensor *num_detection, + DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CPPDetectionPostProcessLayer * * @param[in] input_box_encoding The bounding box input tensor info. Data types supported: F32/QASYMM8/QASYMM8_SIGNED. 
@@ -80,8 +85,13 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors, - ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, + static Status validate(const ITensorInfo *input_box_encoding, + const ITensorInfo *input_class_score, + const ITensorInfo *input_anchors, + ITensorInfo *output_boxes, + ITensorInfo *output_classes, + ITensorInfo *output_scores, + ITensorInfo *num_detection, DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo()); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h b/arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h index 71c44a8bd1f21bcdb4db9a353518e419a5e5657e..af6afc60293158f95dec6890de835142961f055a 100644 --- a/arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h +++ b/arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_CPP_NONMAXIMUMSUPPRESSION_LAYER_H #define ARM_COMPUTE_CPP_NONMAXIMUMSUPPRESSION_LAYER_H -#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h" namespace arm_compute { @@ -48,7 +47,12 @@ public: * @param[in] nms_threshold The threshold used in non maximum suppression. * */ - void configure(const ITensor *bboxes, const ITensor *scores, ITensor *indices, unsigned int max_output_size, const float score_threshold, const float nms_threshold); + void configure(const ITensor *bboxes, + const ITensor *scores, + ITensor *indices, + unsigned int max_output_size, + const float score_threshold, + const float nms_threshold); /** Static function to check if given arguments will lead to a valid configuration of @ref CPPNonMaximumSuppression * @@ -60,8 +64,12 @@ public: * @param[in] nms_threshold The threshold used in non maximum suppression. 
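
A sketch of the CPPNonMaximumSuppression configure call reformatted above. The tensor shapes (4 coordinates per box, 100 candidate boxes, room for 10 kept indices), the S32 index type and the thresholds are assumptions made for illustration only.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void nms_sketch() // hypothetical helper
    {
        Tensor bboxes, scores, indices;
        bboxes.allocator()->init(TensorInfo(TensorShape(4U, 100U), 1, DataType::F32)); // assumed [4, num_boxes] layout
        scores.allocator()->init(TensorInfo(TensorShape(100U), 1, DataType::F32));
        indices.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::S32));     // assumed S32 output indices

        CPPNonMaximumSuppression nms;
        nms.configure(&bboxes, &scores, &indices,
                      /*max_output_size=*/10, /*score_threshold=*/0.05f, /*nms_threshold=*/0.5f);
    }
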
* */ - static Status validate(const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *indices, unsigned int max_output_size, - const float score_threshold, const float nms_threshold); + static Status validate(const ITensorInfo *bboxes, + const ITensorInfo *scores, + const ITensorInfo *indices, + unsigned int max_output_size, + const float score_threshold, + const float nms_threshold); }; } // namespace arm_compute #endif /* ARM_COMPUTE_CPP_NONMAXIMUMSUPPRESSION_LAYER_H */ diff --git a/arm_compute/runtime/CPP/functions/CPPPermute.h b/arm_compute/runtime/CPP/functions/CPPPermute.h index 85c1502324728e854503ef8c1498b290d37277f0..232da41b8e330094cdcfdca9479cc4d90e988f5a 100644 --- a/arm_compute/runtime/CPP/functions/CPPPermute.h +++ b/arm_compute/runtime/CPP/functions/CPPPermute.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_CPPPERMUTE_H #define ARM_COMPUTE_CPPPERMUTE_H -#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h" namespace arm_compute { @@ -53,5 +52,5 @@ public: */ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm); }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_CPPPERMUTE_H */ diff --git a/arm_compute/runtime/CPP/functions/CPPSplit.h b/arm_compute/runtime/CPP/functions/CPPSplit.h index 56aad2db4b1792a40fb0ce4dfa7e60a9a57b98cb..9be081f5bb57fd595404951839854651cce63f99 100644 --- a/arm_compute/runtime/CPP/functions/CPPSplit.h +++ b/arm_compute/runtime/CPP/functions/CPPSplit.h @@ -29,7 +29,6 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" - #include "arm_compute/runtime/IFunction.h" namespace arm_compute @@ -39,8 +38,7 @@ template class CPPSplit : public IFunction { public: - CPPSplit() - : _outputs_vector(), _slice_functions(), _num_outputs(0) + CPPSplit() : _outputs_vector(), _slice_functions(), _num_outputs(0) { } /** Static function to check if given info will lead to a valid configuration of @ref CPPSplit @@ -64,14 +62,16 @@ public: unsigned int total_output_shape_size = 0; // Sum the output sizes and fall back to evenly-sized splits if any are zero - const bool using_split_shapes = std::none_of(outputs.begin(), outputs.end(), [&total_output_shape_size](ITensorInfo * info) - { - unsigned int output_shape_size = info->tensor_shape().total_size(); - total_output_shape_size += output_shape_size; - return output_shape_size == 0; - }); - - if(using_split_shapes) + const bool using_split_shapes = std::none_of(outputs.begin(), outputs.end(), + [&total_output_shape_size](ITensorInfo *info) + { + unsigned int output_shape_size = + info->tensor_shape().total_size(); + total_output_shape_size += output_shape_size; + return output_shape_size == 0; + }); + + if (using_split_shapes) { ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().total_size() != total_output_shape_size); } @@ -83,10 +83,10 @@ public: // Validate output tensors unsigned int axis_offset = 0; - for(const auto &output : outputs) + for (const auto &output : outputs) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - if(using_split_shapes) + if (using_split_shapes) { output_shape = output->tensor_shape(); ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() == 0); @@ -97,14 +97,14 @@ public: // Start/End coordinates Coordinates start_coords; Coordinates end_coords; - for(unsigned int d = 0; d < output_shape.num_dimensions(); ++d) + for (unsigned int d = 0; d < 
output_shape.num_dimensions(); ++d) { end_coords.set(d, -1); } // Output auto inizialitation if not yet initialized TensorInfo tmp_output_info = *output->clone(); - if(tmp_output_info.tensor_shape().total_size() == 0) + if (tmp_output_info.tensor_shape().total_size() == 0) { tmp_output_info = input->clone()->set_is_resizable(true).set_tensor_shape(output_shape); } @@ -128,7 +128,8 @@ public: * from the split dimension. * @param[in] axis Axis on which to split the input. */ - void configure(const TensorInterfaceType *input, const std::vector &outputs, unsigned int axis) + void + configure(const TensorInterfaceType *input, const std::vector &outputs, unsigned int axis) { // Create Slice functions _num_outputs = outputs.size(); @@ -136,17 +137,16 @@ public: // Extract output tensor info std::vector outputs_info; - for(auto &output : outputs) + for (auto &output : outputs) { ARM_COMPUTE_ERROR_ON_NULLPTR(output); outputs_info.emplace_back(output->info()); } // If any of the outputs have a zero size, fall-back to using evenly-sized output splits - const bool outputs_have_sizes = std::none_of(outputs_info.begin(), outputs_info.end(), [](ITensorInfo * info) - { - return info->tensor_shape().total_size() == 0; - }); + const bool outputs_have_sizes = + std::none_of(outputs_info.begin(), outputs_info.end(), + [](ITensorInfo *info) { return info->tensor_shape().total_size() == 0; }); // Validate ARM_COMPUTE_ERROR_THROW_ON(CPPSplit::validate(input->info(), outputs_info, axis)); @@ -154,12 +154,13 @@ public: unsigned int axis_offset = 0; unsigned int i = 0; - for(const auto &output_info : outputs_info) + for (const auto &output_info : outputs_info) { // Get output shape - TensorShape output_shape = (outputs_have_sizes ? - output_info->tensor_shape() : - arm_compute::misc::shape_calculator::compute_split_shape(input->info(), axis, _num_outputs)); + TensorShape output_shape = + (outputs_have_sizes + ? 
output_info->tensor_shape() + : arm_compute::misc::shape_calculator::compute_split_shape(input->info(), axis, _num_outputs)); const size_t axis_split_step = output_shape[axis]; @@ -167,7 +168,7 @@ public: Coordinates start_coords; Coordinates end_coords; - for(unsigned int d = 0; d < output_shape.num_dimensions(); ++d) + for (unsigned int d = 0; d < output_shape.num_dimensions(); ++d) { end_coords.set(d, -1); } diff --git a/arm_compute/runtime/CPP/functions/CPPTopKV.h b/arm_compute/runtime/CPP/functions/CPPTopKV.h index 2f63084056ade3ee9d7f0790735bd70fdfd5d9e5..232cbb3067d72e08792d4eae755ba775de04de30 100644 --- a/arm_compute/runtime/CPP/functions/CPPTopKV.h +++ b/arm_compute/runtime/CPP/functions/CPPTopKV.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_CPPTOPKV_H #define ARM_COMPUTE_CPPTOPKV_H -#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h" namespace arm_compute { @@ -54,7 +53,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k); + static Status + validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k); }; } // namespace arm_compute #endif /* ARM_COMPUTE_CPPTOPKV_H */ diff --git a/arm_compute/runtime/CPP/functions/CPPUpsample.h b/arm_compute/runtime/CPP/functions/CPPUpsample.h index b97d4d1cc18ffc395cfdf26cc1882b6cff1799cb..3b0f997b1743880a4a5163df9b42a90ba8eef955 100644 --- a/arm_compute/runtime/CPP/functions/CPPUpsample.h +++ b/arm_compute/runtime/CPP/functions/CPPUpsample.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_CPPUPSAMPLE_H #define ARM_COMPUTE_CPPUPSAMPLE_H -#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h" namespace arm_compute { @@ -44,5 +43,5 @@ public: */ void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info); }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_CPPUPSAMPLE_H */ diff --git a/arm_compute/runtime/FunctionDescriptors.h b/arm_compute/runtime/FunctionDescriptors.h index 630f53324403c41686b72ab4e681b9835b2f60e6..c3af17d6f2d28ddc796929da6c1524c70511a9c0 100644 --- a/arm_compute/runtime/FunctionDescriptors.h +++ b/arm_compute/runtime/FunctionDescriptors.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_RUNTIME_FUNCTION_DESCRIPTORS_H -#define ARM_COMPUTE_RUNTIME_FUNCTION_DESCRIPTORS_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_FUNCTIONDESCRIPTORS_H +#define ACL_ARM_COMPUTE_RUNTIME_FUNCTIONDESCRIPTORS_H #include "arm_compute/core/Types.h" #include "arm_compute/function_info/ActivationLayerInfo.h" @@ -41,16 +41,16 @@ enum class FFTDirection /** Descriptor used by the FFT1D function */ struct FFT1DInfo { - unsigned int axis{ 0 }; /**< Axis to run the FFT on. */ - FFTDirection direction{ FFTDirection::Forward }; /**< Direction of the FFT. */ + unsigned int axis{0}; /**< Axis to run the FFT on. */ + FFTDirection direction{FFTDirection::Forward}; /**< Direction of the FFT. */ }; /** Descriptor used by the FFT2D function */ struct FFT2DInfo { - unsigned int axis0{ 0 }; /**< Axis to run first pass on. If same, multiple transforms are performed on single axis*/ - unsigned int axis1{ 1 }; /**< Axis to run second pass on. 
If same, multiple transforms are performed on single axis*/ - FFTDirection direction{ FFTDirection::Forward }; /**< Direction of the FFT. */ + unsigned int axis0{0}; /**< Axis to run first pass on. If same, multiple transforms are performed on single axis*/ + unsigned int axis1{1}; /**< Axis to run second pass on. If same, multiple transforms are performed on single axis*/ + FFTDirection direction{FFTDirection::Forward}; /**< Direction of the FFT. */ }; /** Descriptor used by the 2d Convolution function */ @@ -58,24 +58,27 @@ struct Conv2dInfo { Conv2dInfo() = default; - Conv2dInfo(const PadStrideInfo &conv_info, - const Size2D &dilation, - const ActivationLayerInfo &act_info, - bool enable_fast_math, - unsigned int num_groups, - const experimental::PostOpList &post_ops = experimental::PostOpList {}, - const WeightsInfo &weights_info = WeightsInfo()) - : conv_info(conv_info), dilation(dilation), act_info(act_info), enable_fast_math(enable_fast_math), num_groups(num_groups), post_ops(post_ops), weights_info(weights_info) + Conv2dInfo(const PadStrideInfo &conv_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups, + const WeightsInfo &weights_info = WeightsInfo()) + : conv_info(conv_info), + dilation(dilation), + act_info(act_info), + enable_fast_math(enable_fast_math), + num_groups(num_groups), + weights_info(weights_info) { } - PadStrideInfo conv_info{}; - Size2D dilation{ 1U, 1U }; - ActivationLayerInfo act_info{}; - bool enable_fast_math{ false }; - unsigned int num_groups{ 1 }; - experimental::PostOpList post_ops{}; - WeightsInfo weights_info{}; + PadStrideInfo conv_info{}; + Size2D dilation{1U, 1U}; + ActivationLayerInfo act_info{}; + bool enable_fast_math{false}; + unsigned int num_groups{1}; + WeightsInfo weights_info{}; }; /** Descriptor used by the 3d Convolution function */ @@ -89,17 +92,22 @@ struct Conv3dInfo const Size3D &dilation, const DimensionRoundingType &round_type, bool enable_fast_math) - : stride(stride), padding(padding), act_info(act_info), dilation(dilation), round_type(round_type), enable_fast_math(enable_fast_math) + : stride(stride), + padding(padding), + act_info(act_info), + dilation(dilation), + round_type(round_type), + enable_fast_math(enable_fast_math) { } - Size3D stride{ 1U, 1U, 1U }; + Size3D stride{1U, 1U, 1U}; Padding3D padding{}; ActivationLayerInfo act_info{}; - Size3D dilation{ 1U, 1U, 1U }; + Size3D dilation{1U, 1U, 1U}; DimensionRoundingType round_type{}; - bool enable_fast_math{ false }; + bool enable_fast_math{false}; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_RUNTIME_FUNCTION_DESCRIPTORS_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_FUNCTIONDESCRIPTORS_H diff --git a/arm_compute/runtime/IAllocator.h b/arm_compute/runtime/IAllocator.h index 5c28b24fea1e9c38bf17dbd5cb2dacf849cc33be..f8446db811902cc58c75ccb9387dc6c7576d4e96 100644 --- a/arm_compute/runtime/IAllocator.h +++ b/arm_compute/runtime/IAllocator.h @@ -56,5 +56,5 @@ public: */ virtual std::unique_ptr make_region(size_t size, size_t alignment) = 0; }; -} // arm_compute +} // namespace arm_compute #endif /*ARM_COMPUTE_IALLOCATOR_H */ diff --git a/arm_compute/runtime/IFunction.h b/arm_compute/runtime/IFunction.h index b7b28f999d8c40ca4778d15c06d7bb576de2faa3..fb68dbbecfd2224cb0b335bac595b286a82975b0 100644 --- a/arm_compute/runtime/IFunction.h +++ b/arm_compute/runtime/IFunction.h @@ -58,5 +58,5 @@ public: { } }; -} +} // namespace arm_compute #endif /*ARM_COMPUTE_IFUNCTION_H */ diff --git 
a/arm_compute/runtime/IMemoryGroup.h b/arm_compute/runtime/IMemoryGroup.h index a977a4a3c3352f3c7af66dd67718c9479de0059c..77198dd29d9aedd2336dc6fded529910c9e7c239 100644 --- a/arm_compute/runtime/IMemoryGroup.h +++ b/arm_compute/runtime/IMemoryGroup.h @@ -86,8 +86,7 @@ public: * * @param[in] memory_group Memory group to handle */ - explicit MemoryGroupResourceScope(IMemoryGroup &memory_group) - : _memory_group(memory_group) + explicit MemoryGroupResourceScope(IMemoryGroup &memory_group) : _memory_group(memory_group) { _memory_group.acquire(); } @@ -100,5 +99,5 @@ public: private: IMemoryGroup &_memory_group; }; -} // arm_compute +} // namespace arm_compute #endif /*ARM_COMPUTE_IMEMORYGROUP_H */ diff --git a/arm_compute/runtime/IMemoryManager.h b/arm_compute/runtime/IMemoryManager.h index 4d7d8cd9c901df36073113ca1cd769d2b5d5c826..42910edfdaf07aea6f85e10e34e50d31379a4ec1 100644 --- a/arm_compute/runtime/IMemoryManager.h +++ b/arm_compute/runtime/IMemoryManager.h @@ -65,5 +65,5 @@ public: */ virtual void clear() = 0; }; -} // arm_compute +} // namespace arm_compute #endif /*ARM_COMPUTE_IMEMORYMANAGER_H */ diff --git a/arm_compute/runtime/IMemoryPool.h b/arm_compute/runtime/IMemoryPool.h index b8d36c362dbbe6f128b290342b111814e9edc966..0c112c8f35de837ad04a8f80715403ed1ac72edb 100644 --- a/arm_compute/runtime/IMemoryPool.h +++ b/arm_compute/runtime/IMemoryPool.h @@ -60,5 +60,5 @@ public: */ virtual std::unique_ptr duplicate() = 0; }; -} // arm_compute +} // namespace arm_compute #endif /* ARM_COMPUTE_IMEMORYPOOL_H */ diff --git a/arm_compute/runtime/IMemoryRegion.h b/arm_compute/runtime/IMemoryRegion.h index 914aa57fbe63b3df1d475ec4eae25e471ebc3490..9431663e4e2eb5fca3b5db20f71f94f6eed30014 100644 --- a/arm_compute/runtime/IMemoryRegion.h +++ b/arm_compute/runtime/IMemoryRegion.h @@ -37,8 +37,7 @@ public: * * @param[in] size Region size */ - explicit IMemoryRegion(size_t size) - : _size(size) + explicit IMemoryRegion(size_t size) : _size(size) { } /** Virtual Destructor */ diff --git a/arm_compute/runtime/IPoolManager.h b/arm_compute/runtime/IPoolManager.h index 481bde5fb6c5f22788775478d27d6cadc34fe39a..5f6d4ffbe5c5838370386f2a2d6392093340cdf9 100644 --- a/arm_compute/runtime/IPoolManager.h +++ b/arm_compute/runtime/IPoolManager.h @@ -69,5 +69,5 @@ public: */ virtual size_t num_pools() const = 0; }; -} // arm_compute +} // namespace arm_compute #endif /*ARM_COMPUTE_IPOOLMANAGER_H */ diff --git a/arm_compute/runtime/IScheduler.h b/arm_compute/runtime/IScheduler.h index df5a44001f36b2c1706bb5ddf72b6c4cc276eb89..ae204c85609ce34ee76155fd10251b308fe65546 100644 --- a/arm_compute/runtime/IScheduler.h +++ b/arm_compute/runtime/IScheduler.h @@ -25,8 +25,8 @@ #define ARM_COMPUTE_ISCHEDULER_H #include "arm_compute/core/CPP/CPPTypes.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/Types.h" #include #include @@ -226,7 +226,11 @@ protected: * * @return Adjusted number of windows */ - std::size_t adjust_num_of_windows(const Window &window, std::size_t split_dimension, std::size_t init_num_windows, const ICPPKernel &kernel, const CPUInfo &cpu_info); + std::size_t adjust_num_of_windows(const Window &window, + std::size_t split_dimension, + std::size_t init_num_windows, + const ICPPKernel &kernel, + const CPUInfo &cpu_info); private: unsigned int _num_threads_hint = {}; diff --git a/arm_compute/runtime/ISimpleLifetimeManager.h b/arm_compute/runtime/ISimpleLifetimeManager.h index 
b2d17c6fea8cf9fe3bc5a4797767f113b8c3c57b..9e481bb563102be5a6d336c8b816c779f8f81f80 100644 --- a/arm_compute/runtime/ISimpleLifetimeManager.h +++ b/arm_compute/runtime/ISimpleLifetimeManager.h @@ -25,7 +25,6 @@ #define ARM_COMPUTE_ISIMPLELIFETIMEMANAGER_H #include "arm_compute/runtime/ILifetimeManager.h" - #include "arm_compute/runtime/IMemoryPool.h" #include "arm_compute/runtime/Types.h" @@ -70,7 +69,11 @@ protected: /** Element struct */ struct Element { - Element(void *id_ = nullptr, IMemory *handle_ = nullptr, size_t size_ = 0, size_t alignment_ = 0, bool status_ = false) + Element(void *id_ = nullptr, + IMemory *handle_ = nullptr, + size_t size_ = 0, + size_t alignment_ = 0, + bool status_ = false) : id(id_), handle(handle_), size(size_), alignment(alignment_), status(status_) { } @@ -90,11 +93,12 @@ protected: std::set bound_elements; }; - IMemoryGroup *_active_group; /**< Active group */ - std::map _active_elements; /**< A map that contains the active elements */ - std::list _free_blobs; /**< Free blobs */ - std::list _occupied_blobs; /**< Occupied blobs */ - std::map> _finalized_groups; /**< A map that contains the finalized groups */ + IMemoryGroup *_active_group; /**< Active group */ + std::map _active_elements; /**< A map that contains the active elements */ + std::list _free_blobs; /**< Free blobs */ + std::list _occupied_blobs; /**< Occupied blobs */ + std::map> + _finalized_groups; /**< A map that contains the finalized groups */ }; } // namespace arm_compute #endif /* ARM_COMPUTE_ISIMPLELIFETIMEMANAGER_H */ diff --git a/arm_compute/runtime/ITensorAllocator.h b/arm_compute/runtime/ITensorAllocator.h index 17e581b40e9e87d9df75d945ed6b086bc55dfde1..e2d35361693647d5714433580ff0ca06a7043d21 100644 --- a/arm_compute/runtime/ITensorAllocator.h +++ b/arm_compute/runtime/ITensorAllocator.h @@ -101,9 +101,9 @@ protected: virtual void unlock() = 0; private: - TensorInfo _info_owned{}; /**< Tensor's metadata. */ - TensorInfo *_info_external{ nullptr }; /**< External Tensor's metadata */ - size_t _alignment{}; /**< Tensor's alignment in bytes */ + TensorInfo _info_owned{}; /**< Tensor's metadata. 
*/ + TensorInfo *_info_external{nullptr}; /**< External Tensor's metadata */ + size_t _alignment{}; /**< Tensor's alignment in bytes */ }; } // namespace arm_compute #endif /*ARM_COMPUTE_ITENSORALLOCATOR_H */ diff --git a/arm_compute/runtime/ITransformWeights.h b/arm_compute/runtime/ITransformWeights.h index f85b7966c5b879849928e81ad42a749fd04dbdb7..08671bbe3cf5657a421fbb332060d2196974136c 100644 --- a/arm_compute/runtime/ITransformWeights.h +++ b/arm_compute/runtime/ITransformWeights.h @@ -72,7 +72,7 @@ public: /** Allow instances of this class to be moved */ ITransformWeights &operator=(ITransformWeights &&other) { - if(this != &other) + if (this != &other) { _num_refcount = other._num_refcount.load(); _reshape_run = other._reshape_run; @@ -119,9 +119,9 @@ public: } protected: - std::atomic _num_refcount{ 0 }; - bool _reshape_run{ false }; + std::atomic _num_refcount{0}; + bool _reshape_run{false}; }; -} // arm_compute +} // namespace arm_compute #endif /*ARM_COMPUTE_ITRANSFORMWEIGHTS_H */ diff --git a/arm_compute/runtime/IWeightsManager.h b/arm_compute/runtime/IWeightsManager.h index 3b97d696bb45d8525b2af9b2d6f79dc958b38704..de8a92faa3a7c0509409a588dc451de77f4f9069 100644 --- a/arm_compute/runtime/IWeightsManager.h +++ b/arm_compute/runtime/IWeightsManager.h @@ -90,8 +90,8 @@ public: private: struct CounterElement { - bool is_unused{ false }; - std::atomic counter{ 1 }; + bool is_unused{false}; + std::atomic counter{1}; }; private: @@ -99,5 +99,5 @@ private: std::map _managed_counter; std::map _managed_weights_parents; }; -} // arm_compute -#endif /*ARM_COMPUTE_IWEIGHTSMANAGER_H */ \ No newline at end of file +} // namespace arm_compute +#endif /*ARM_COMPUTE_IWEIGHTSMANAGER_H */ diff --git a/arm_compute/runtime/Memory.h b/arm_compute/runtime/Memory.h index 1eab605d504863d43069db64b1a771723ae6bcaa..63514c409b1bf713f022f0ca661531ad1b01dd0b 100644 --- a/arm_compute/runtime/Memory.h +++ b/arm_compute/runtime/Memory.h @@ -25,7 +25,6 @@ #define ARM_COMPUTE_MEMORY_H #include "arm_compute/runtime/IMemory.h" - #include "arm_compute/runtime/IMemoryRegion.h" #include @@ -64,8 +63,8 @@ public: // Inherited methods overridden: IMemoryRegion *region() final; IMemoryRegion *region() const final; - void set_region(IMemoryRegion *region) final; - void set_owned_region(std::unique_ptr region) final; + void set_region(IMemoryRegion *region) final; + void set_owned_region(std::unique_ptr region) final; private: IMemoryRegion *_region; diff --git a/arm_compute/runtime/MemoryGroup.h b/arm_compute/runtime/MemoryGroup.h index 9fd2b9fa72f26100496ed760ca8e0d2a717cccb2..93ea3d2c72e8979101870dd7e1f79d3fbee64ce0 100644 --- a/arm_compute/runtime/MemoryGroup.h +++ b/arm_compute/runtime/MemoryGroup.h @@ -24,10 +24,9 @@ #ifndef ARM_COMPUTE_MEMORYGROUP_H #define ARM_COMPUTE_MEMORYGROUP_H -#include "arm_compute/runtime/IMemoryGroup.h" - #include "arm_compute/core/Error.h" #include "arm_compute/core/utils/misc/Macros.h" +#include "arm_compute/runtime/IMemoryGroup.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/IMemoryPool.h" @@ -59,8 +58,8 @@ public: // Inherited methods overridden: void manage(IMemoryManageable *obj) override; void finalize_memory(IMemoryManageable *obj, IMemory &obj_memory, size_t size, size_t alignment) override; - void acquire() override; - void release() override; + void acquire() override; + void release() override; MemoryMappings &mappings() override; private: @@ -70,15 +69,13 @@ private: }; inline MemoryGroup::MemoryGroup(std::shared_ptr memory_manager) noexcept - 
: _memory_manager(memory_manager), - _pool(nullptr), - _mappings() + : _memory_manager(memory_manager), _pool(nullptr), _mappings() { } inline void MemoryGroup::manage(IMemoryManageable *obj) { - if(_memory_manager && (obj != nullptr)) + if (_memory_manager && (obj != nullptr)) { ARM_COMPUTE_ERROR_ON(!_memory_manager->lifetime_manager()); @@ -95,7 +92,7 @@ inline void MemoryGroup::manage(IMemoryManageable *obj) inline void MemoryGroup::finalize_memory(IMemoryManageable *obj, IMemory &obj_memory, size_t size, size_t alignment) { - if(_memory_manager) + if (_memory_manager) { ARM_COMPUTE_ERROR_ON(!_memory_manager->lifetime_manager()); _memory_manager->lifetime_manager()->end_lifetime(obj, obj_memory, size, alignment); @@ -104,7 +101,7 @@ inline void MemoryGroup::finalize_memory(IMemoryManageable *obj, IMemory &obj_me inline void MemoryGroup::acquire() { - if(!_mappings.empty()) + if (!_mappings.empty()) { ARM_COMPUTE_ERROR_ON(!_memory_manager->pool_manager()); _pool = _memory_manager->pool_manager()->lock_pool(); @@ -114,7 +111,7 @@ inline void MemoryGroup::acquire() inline void MemoryGroup::release() { - if(_pool != nullptr) + if (_pool != nullptr) { ARM_COMPUTE_ERROR_ON(!_memory_manager->pool_manager()); ARM_COMPUTE_ERROR_ON(_mappings.empty()); @@ -128,5 +125,5 @@ inline MemoryMappings &MemoryGroup::mappings() { return _mappings; } -} // arm_compute +} // namespace arm_compute #endif /*ARM_COMPUTE_MEMORYGROUP_H */ diff --git a/arm_compute/runtime/MemoryManagerOnDemand.h b/arm_compute/runtime/MemoryManagerOnDemand.h index 50547ac38edf151b8894da153065dfdfcad315f5..7c31fe7f5a8f316f18f58577a7ca09e394c37a78 100644 --- a/arm_compute/runtime/MemoryManagerOnDemand.h +++ b/arm_compute/runtime/MemoryManagerOnDemand.h @@ -24,10 +24,9 @@ #ifndef ARM_COMPUTE_MEMORY_MANAGER_ON_DEMAND_H #define ARM_COMPUTE_MEMORY_MANAGER_ON_DEMAND_H -#include "arm_compute/runtime/IMemoryManager.h" - #include "arm_compute/runtime/ILifetimeManager.h" #include "arm_compute/runtime/IMemoryGroup.h" +#include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/IPoolManager.h" #include @@ -39,7 +38,8 @@ class MemoryManagerOnDemand : public IMemoryManager { public: /** Default Constructor */ - MemoryManagerOnDemand(std::shared_ptr lifetime_manager, std::shared_ptr pool_manager); + MemoryManagerOnDemand(std::shared_ptr lifetime_manager, + std::shared_ptr pool_manager); /** Prevent instances of this class to be copy constructed */ MemoryManagerOnDemand(const MemoryManagerOnDemand &) = delete; /** Prevent instances of this class to be copied */ @@ -52,12 +52,12 @@ public: // Inherited methods overridden: ILifetimeManager *lifetime_manager() override; IPoolManager *pool_manager() override; - void populate(IAllocator &allocator, size_t num_pools) override; - void clear() override; + void populate(IAllocator &allocator, size_t num_pools) override; + void clear() override; private: std::shared_ptr _lifetime_mgr; /**< Lifetime manager */ std::shared_ptr _pool_mgr; /**< Memory pool manager */ }; -} // arm_compute +} // namespace arm_compute #endif /*ARM_COMPUTE_MEMORY_MANAGER_ON_DEMAND_H */ diff --git a/arm_compute/runtime/MemoryRegion.h b/arm_compute/runtime/MemoryRegion.h index 6408deceaa06ea9781998eb23b6e0e6573a08ae6..f8a4898281e8e09411c4bf0784b3355d3f1b847d 100644 --- a/arm_compute/runtime/MemoryRegion.h +++ b/arm_compute/runtime/MemoryRegion.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_RUNTIME_MEMORY_REGION_H #define ARM_COMPUTE_RUNTIME_MEMORY_REGION_H -#include "arm_compute/runtime/IMemoryRegion.h" - #include 
"arm_compute/core/Error.h" +#include "arm_compute/runtime/IMemoryRegion.h" #include @@ -41,21 +40,17 @@ public: * @param[in] size Region size * @param[in] alignment Alignment in bytes of the base pointer. Defaults to 0 */ - MemoryRegion(size_t size, size_t alignment = 0) - : IMemoryRegion(size), _mem(nullptr), _ptr(nullptr) + MemoryRegion(size_t size, size_t alignment = 0) : IMemoryRegion(size), _mem(nullptr), _ptr(nullptr) { - if(size != 0) + if (size != 0) { // Allocate backing memory size_t space = size + alignment; - _mem = std::shared_ptr(new uint8_t[space](), [](uint8_t *ptr) - { - delete[] ptr; - }); - _ptr = _mem.get(); + _mem = std::shared_ptr(new uint8_t[space](), [](uint8_t *ptr) { delete[] ptr; }); + _ptr = _mem.get(); // Calculate alignment offset - if(alignment != 0) + if (alignment != 0) { void *aligned_ptr = _mem.get(); std::align(alignment, size, aligned_ptr, space); @@ -63,10 +58,9 @@ public: } } } - MemoryRegion(void *ptr, size_t size) - : IMemoryRegion(size), _mem(nullptr), _ptr(nullptr) + MemoryRegion(void *ptr, size_t size) : IMemoryRegion(size), _mem(nullptr), _ptr(nullptr) { - if(size != 0) + if (size != 0) { _ptr = ptr; } @@ -91,7 +85,7 @@ public: } std::unique_ptr extract_subregion(size_t offset, size_t size) final { - if(_ptr != nullptr && (offset < _size) && (_size - offset >= size)) + if (_ptr != nullptr && (offset < _size) && (_size - offset >= size)) { return std::make_unique(static_cast(_ptr) + offset, size); } diff --git a/arm_compute/runtime/NEON/INEOperator.h b/arm_compute/runtime/NEON/INEOperator.h index 5637d831a3ebd3b00c59ea34344205b46bc18371..7971168d2470f01f5dad8f12f0762d0c9d9601c6 100644 --- a/arm_compute/runtime/NEON/INEOperator.h +++ b/arm_compute/runtime/NEON/INEOperator.h @@ -24,11 +24,11 @@ #ifndef ARM_COMPUTE_INEOPERATOR_H #define ARM_COMPUTE_INEOPERATOR_H -#include "../../core/ITensor.h" #include "arm_compute/runtime/IOperator.h" #include "arm_compute/runtime/IRuntimeContext.h" #include "arm_compute/runtime/Types.h" +#include "../../core/ITensor.h" #include namespace arm_compute @@ -60,8 +60,8 @@ public: ~INEOperator(); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; MemoryRequirements workspace() const override; protected: diff --git a/arm_compute/runtime/NEON/INESimpleFunction.h b/arm_compute/runtime/NEON/INESimpleFunction.h index 7512759bd068a95275765910c28e56be2cf19e76..f783a836ee73da71ff185672b778d5e7a7087456 100644 --- a/arm_compute/runtime/NEON/INESimpleFunction.h +++ b/arm_compute/runtime/NEON/INESimpleFunction.h @@ -57,5 +57,5 @@ protected: std::unique_ptr _kernel; /**< Kernel to run */ std::unique_ptr _border_handler; /**< Kernel to handle image borders */ }; -} +} // namespace arm_compute #endif /*ARM_COMPUTE_INESIMPLEFUNCTION_H */ diff --git a/arm_compute/runtime/NEON/NEFunctions.h b/arm_compute/runtime/NEON/NEFunctions.h index 3a10310452d69670cc28ef0cc8d39a0a846ed2fa..cc4d3032024e276bd87b6f14b085de9afffbd02a 100644 --- a/arm_compute/runtime/NEON/NEFunctions.h +++ b/arm_compute/runtime/NEON/NEFunctions.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_NEFUNCTIONS_H -#define ARM_COMPUTE_NEFUNCTIONS_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_NEFUNCTIONS_H +#define ACL_ARM_COMPUTE_RUNTIME_NEON_NEFUNCTIONS_H #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/NEON/functions/NEAddMulAdd.h" @@ -62,34 +62,31 @@ #include "arm_compute/runtime/NEON/functions/NEFloor.h" #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" #include "arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h" +#include "arm_compute/runtime/NEON/functions/NEGather.h" #include "arm_compute/runtime/NEON/functions/NEGEMM.h" #include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h" #include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" -#include "arm_compute/runtime/NEON/functions/NEGather.h" #include "arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h" #include "arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h" #include "arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h" +#include "arm_compute/runtime/NEON/functions/NELogical.h" #include "arm_compute/runtime/NEON/functions/NELSTMLayer.h" #include "arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h" -#include "arm_compute/runtime/NEON/functions/NELogical.h" #include "arm_compute/runtime/NEON/functions/NEMatMul.h" #include "arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h" #include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h" #include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h" -#include "arm_compute/runtime/NEON/functions/NEPReluLayer.h" #include "arm_compute/runtime/NEON/functions/NEPadLayer.h" #include "arm_compute/runtime/NEON/functions/NEPermute.h" #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h" #include "arm_compute/runtime/NEON/functions/NEPooling3dLayer.h" #include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h" +#include "arm_compute/runtime/NEON/functions/NEPReluLayer.h" #include "arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h" #include "arm_compute/runtime/NEON/functions/NEQLSTMLayer.h" #include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h" -#include "arm_compute/runtime/NEON/functions/NERNNLayer.h" -#include "arm_compute/runtime/NEON/functions/NEROIAlignLayer.h" -#include "arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h" #include "arm_compute/runtime/NEON/functions/NERange.h" #include "arm_compute/runtime/NEON/functions/NEReduceMean.h" #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" @@ -97,6 +94,9 @@ #include "arm_compute/runtime/NEON/functions/NEReorgLayer.h" #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" #include "arm_compute/runtime/NEON/functions/NEReverse.h" +#include "arm_compute/runtime/NEON/functions/NERNNLayer.h" +#include "arm_compute/runtime/NEON/functions/NEROIAlignLayer.h" +#include "arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h" #include "arm_compute/runtime/NEON/functions/NEScale.h" #include "arm_compute/runtime/NEON/functions/NESelect.h" #include "arm_compute/runtime/NEON/functions/NESlice.h" @@ -111,4 +111,4 @@ #include "arm_compute/runtime/NEON/functions/NEUnstack.h" #include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h" -#endif /* ARM_COMPUTE_NEFUNCTIONS_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_NEFUNCTIONS_H diff --git 
a/arm_compute/runtime/NEON/NEScheduler.h b/arm_compute/runtime/NEON/NEScheduler.h index a3082d00f68068db4215354d6a6e58bb859154ea..613f44cc52caa010133e083977aadb53b29c9cbd 100644 --- a/arm_compute/runtime/NEON/NEScheduler.h +++ b/arm_compute/runtime/NEON/NEScheduler.h @@ -30,5 +30,5 @@ namespace arm_compute { /** CPU Scheduler */ using NEScheduler = Scheduler; -} +} // namespace arm_compute #endif /*ARM_COMPUTE_NESCHEDULER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEActivationLayer.h b/arm_compute/runtime/NEON/functions/NEActivationLayer.h index 9992de2af80deee16754c401a41007ad90eb216d..5584fdc783665c5df84ba4ebbe28a11d39ffcc0e 100644 --- a/arm_compute/runtime/NEON/functions/NEActivationLayer.h +++ b/arm_compute/runtime/NEON/functions/NEActivationLayer.h @@ -24,10 +24,9 @@ #ifndef ARM_COMPUTE_NEACTIVATIONLAYER_H #define ARM_COMPUTE_NEACTIVATIONLAYER_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" #include "arm_compute/function_info/ActivationLayerInfo.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IRuntimeContext.h" #include @@ -102,5 +101,5 @@ private: struct Impl; std::unique_ptr _impl; }; -} // namespace arm_computes +} // namespace arm_compute #endif /* ARM_COMPUTE_NEACTIVATIONLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEAddMulAdd.h b/arm_compute/runtime/NEON/functions/NEAddMulAdd.h index e5e85542f8526ba45058c25c7874ef862f85399f..6c65c055dd1ad6513fb9b85ba0fd130568dc03a7 100644 --- a/arm_compute/runtime/NEON/functions/NEAddMulAdd.h +++ b/arm_compute/runtime/NEON/functions/NEAddMulAdd.h @@ -81,19 +81,28 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * */ - void configure(ITensor *input1, ITensor *input2, ITensor *bn_mul, ITensor *bn_add, - ITensor *add_output, ITensor *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info); + void configure(ITensor *input1, + ITensor *input2, + ITensor *bn_mul, + ITensor *bn_add, + ITensor *add_output, + ITensor *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info); /** Static function to check if given info will lead to a valid configuration of @ref NEAddMulAdd * * Similar to @ref NEAddMulAdd::configure() except the arguments are @ref ITensorInfo * instead of @ref ITensor * * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const ITensorInfo *bn_add, - const ITensorInfo *add_output, const ITensorInfo *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h index 4392de7b28890549abd5aeb6d246ff209dab7730..3bb50a0f904ec9264eb6774a040a0ff38bae20df 100644 --- a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h +++ b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,8 +24,6 @@ #ifndef ARM_COMPUTE_NEARGMINMAXLAYER_H #define ARM_COMPUTE_NEARGMINMAXLAYER_H -#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" - #include "arm_compute/core/Types.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/INESimpleFunction.h" @@ -33,7 +31,6 @@ namespace arm_compute { class ITensor; - /** Function to calculate the index of the minimum or maximum values in a * tensor based on an axis. * @@ -68,13 +65,13 @@ public: * - All * * Valid data type configurations: - * |src |dst | - * |:--------------|:----------| - * |QASYMM8 |U32, S32 | - * |QASYMM8_SIGNED |U32, S32 | - * |S32 |U32, S32 | - * |F16 |U32, S32 | - * |F32 |U32, S32 | + * |src |dst | + * |:--------------|:-------------| + * |QASYMM8 |U32, S32 | + * |QASYMM8_SIGNED |U32, S32 | + * |S32 |U32, S32, S64 | + * |F16 |U32, S32 | + * |F32 |U32, S32 | * * @param[in] input Input source tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/S32/F16/F32. * @param[in] axis Axis to find max/min index. @@ -86,7 +83,7 @@ public: * * @param[in] input Input source tensor info. Data types supported: QASYMM8_SIGNED/QASYMM8/S32/F16/F32. * @param[in] axis Axis to find max/min index. - * @param[in] output Output source tensor info. Data types supported: U32/S32. + * @param[in] output Output source tensor info. Data types supported: U32/S32/S64. * @param[in] op Operation to perform: min or max * * @return a status @@ -97,7 +94,8 @@ public: void run() override; private: - std::unique_ptr _reduction_function; + struct Impl; + std::unique_ptr _impl; }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEARGMINMAXLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h index b0d710d51702aa82e264bd7628e4284403284e59..73a43dbc44d62d88b39472da0048db7c3ea86862 100644 --- a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h +++ b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h @@ -27,6 +27,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/IFunction.h" + #include namespace arm_compute @@ -74,7 +75,11 @@ public: * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticAddition * * @param[in] input1 First tensor input info. 
Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 @@ -85,7 +90,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h index 6fbe9ad45053e57a37d15d3676588de4a51b834b..3e4f6356c59f73f43f84e534f2559db2d4bb6d5d 100644 --- a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h +++ b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h @@ -80,7 +80,11 @@ public: * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtraction * * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32 @@ -91,7 +95,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h index ec00fbdbf284583aabfb4c946317430a9f5ad714..99e2dcadbb47454997b0ed9ff9ffa61174ac7c4c 100644 --- a/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h +++ b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h @@ -81,7 +81,13 @@ public: * @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. 
*/ - void configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta = nullptr, const ITensor *gamma = nullptr, float epsilon = 0.001f, + void configure(ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta = nullptr, + const ITensor *gamma = nullptr, + float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEBatchNormalizationLayer * @@ -98,10 +104,14 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr, - float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta = nullptr, + const ITensorInfo *gamma = nullptr, + float epsilon = 0.001f, + ActivationLayerInfo act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; @@ -109,5 +119,5 @@ public: private: std::unique_ptr _norm_kernel; /**< Batch normalization layer kernel */ }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_NEBATCHNORMALIZATIONLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h b/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h index b33ba435a8cd3cc7d5f7fee4fb0353a93030b504..ebed0bea297572d3ee233bbd2e5126f29581d518 100644 --- a/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h +++ b/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NEBATCHTOSPACELAYER_H #define ARM_COMPUTE_NEBATCHTOSPACELAYER_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" namespace arm_compute @@ -77,7 +76,11 @@ public: * @param[out] output Tensor output. Data types supported: same as @p input * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed */ - void configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info = CropInfo{}); + void configure(const ITensor *input, + int32_t block_shape_x, + int32_t block_shape_y, + ITensor *output, + const CropInfo &crop_info = CropInfo{}); /** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayer * * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All. 
@@ -99,7 +102,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info = CropInfo{}); + static Status validate(const ITensorInfo *input, + int32_t block_shape_x, + int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info = CropInfo{}); }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEBATCHTOSPACELAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h b/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h index 2a196a2de527b7685ac08f2f8395b433c7f4d097..aa41fc0df2ef786599f870159887912e6dce821b 100644 --- a/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h +++ b/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h @@ -57,7 +57,8 @@ public: * * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct. */ - void configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info); + void + configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref NEBoundingBoxTransform * @@ -71,7 +72,10 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info); + static Status validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info); }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEBOUNDINGBOXTRANSFORM_H */ diff --git a/arm_compute/runtime/NEON/functions/NECast.h b/arm_compute/runtime/NEON/functions/NECast.h index 821249c1425c4e7e97836b74e5f1567c756f8bd8..43cae777f6dfc0ae4864564f269cbb367eaa5384 100644 --- a/arm_compute/runtime/NEON/functions/NECast.h +++ b/arm_compute/runtime/NEON/functions/NECast.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NECAST_H #define ARM_COMPUTE_NECAST_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include diff --git a/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h b/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h index dd1c709d76d59fe597f89dff34aaca2a875316e4..1600f85488de20ce5e3cfcd693e6a4db1199167a 100644 --- a/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h +++ b/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NECONCATENATELAYER_H #define ARM_COMPUTE_NECONCATENATELAYER_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include @@ -87,7 +86,8 @@ public: * * @return a status */ - static Status validate(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis); + static Status + validate(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEConv3D.h b/arm_compute/runtime/NEON/functions/NEConv3D.h index 2a3c5351b06d2c4c48ce7fbe0fcac345720b80c7..525f37f3e729a6e9f478edfd83aa3adfe340770e 100644 --- a/arm_compute/runtime/NEON/functions/NEConv3D.h +++ b/arm_compute/runtime/NEON/functions/NEConv3D.h @@ -24,11 +24,10 @@ #ifndef 
ARM_COMPUTE_NECONV3D_H #define ARM_COMPUTE_NECONV3D_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/FunctionDescriptors.h" +#include "arm_compute/runtime/IFunction.h" #include @@ -76,14 +75,19 @@ public: * @param[out] output Destination tensor. 4 lower dimensions represent a single output [OFM, width, height, depth], while the rest represent batch of outputs. * @param[in] conv_info Contains padding, stride, acitvation information described in @ref Conv3dInfo. */ - void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv3dInfo &conv_info); + void configure( + ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv3dInfo &conv_info); /** Static function to check if given info will lead to a valid configuration * * Similar to NEConv3D::configure() * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv3dInfo &conv_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const Conv3dInfo &conv_info); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h b/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h index a892d3036bdaf0b96137f1d4ada9aab3b79d072c..dc6b22d717842c450d5fbd47e84cc5878642a21c 100644 --- a/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h +++ b/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTS_H #define ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTS_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" namespace arm_compute { @@ -66,7 +65,8 @@ public: * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer). * @param[in] data_layout The data layout the weights have been trained in. */ - void configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape, DataLayout data_layout); + void + configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape, DataLayout data_layout); /** Static function to check if given info will lead to a valid configuration of @ref NEConvertFullyConnectedWeights * * @param[in] input Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All. 
@@ -76,7 +76,10 @@ public: * * @return A Status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, DataLayout data_layout); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const TensorShape &original_input_shape, + DataLayout data_layout); // Inherited methods overriden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h index 4dd76d082bb6af20c4f16c2af6f710d497424319..2d07980adefe6dc2a6c7f30f8d8ab6f9e1a42567 100644 --- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h @@ -21,14 +21,13 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_NECONVOLUTIONLAYER_H -#define ARM_COMPUTE_NECONVOLUTIONLAYER_H - -#include "arm_compute/runtime/IFunction.h" +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NECONVOLUTIONLAYER_H +#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NECONVOLUTIONLAYER_H #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/function_info/ActivationLayerInfo.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/MemoryGroup.h" #include @@ -39,7 +38,7 @@ namespace arm_compute class ITensor; /** Basic function to simulate a convolution layer. This function calls one of the following functions: - * -# @ref cpu::CpuGemm (executed only in case GEMM is required for the operation) + * -# @ref cpu::CpuGemmConv2d (executed only in case GEMM is required for the operation) * -# @ref cpu::CpuWinogradConv2d (executed only in case Winograd is required for the operation) * -# @ref cpu::CpuDirectConv2d (executed only in case Direct Convolution is required for the operation) * -# @ref NEFFTConvolutionLayer (executed only in case FFT is required for the operation) @@ -119,8 +118,16 @@ public: * available which may introduce a drop of accuracy as well. Default is false * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported */ - void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(), - const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1); + void configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration of @ref NEConvolutionLayer * * @param[in] input Source tensor. 
3 lower dimensions represent a single input [width, height, IFM], @@ -143,9 +150,16 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, - unsigned int num_groups = 1); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); /** Static function to check if given info will return the convolution called by @ref NEConvolutionLayer * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], @@ -165,8 +179,14 @@ public: * * @return the Convolution Method Hint */ - static ConvolutionMethod get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + static ConvolutionMethod get_convolution_method(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); // Inherited methods overridden: void run() override; void prepare() override; @@ -176,4 +196,4 @@ private: std::unique_ptr _impl; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_NECONVOLUTIONLAYER_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NECONVOLUTIONLAYER_H diff --git a/arm_compute/runtime/NEON/functions/NECopy.h b/arm_compute/runtime/NEON/functions/NECopy.h index ee02c259f4e1c02000846af5c4c8845ea2645ec8..840c03e96853842c48204c583b6b04509b31d8f6 100644 --- a/arm_compute/runtime/NEON/functions/NECopy.h +++ b/arm_compute/runtime/NEON/functions/NECopy.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NECOPY_H #define ARM_COMPUTE_NECOPY_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include diff --git a/arm_compute/runtime/NEON/functions/NECropResize.h b/arm_compute/runtime/NEON/functions/NECropResize.h index 143bbbc6f1e8015a87cffa70d0cec4ff2b7168b7..f806762158c75797d0cb0ee2c3945042ee826441 100644 --- a/arm_compute/runtime/NEON/functions/NECropResize.h +++ b/arm_compute/runtime/NEON/functions/NECropResize.h @@ -75,8 +75,13 @@ public: * @param[in] method The policy to be used when resizing image. Default is bilinear. * @param[in] extrapolation_value Value to be used for values outside of the image for cropping and resizing. Default is 0. 
*/ - void configure(const ITensor *input, const ITensor *boxes, const ITensor *box_ind, ITensor *output, Coordinates2D crop_size, - InterpolationPolicy method = InterpolationPolicy::BILINEAR, float extrapolation_value = 0); + void configure(const ITensor *input, + const ITensor *boxes, + const ITensor *box_ind, + ITensor *output, + Coordinates2D crop_size, + InterpolationPolicy method = InterpolationPolicy::BILINEAR, + float extrapolation_value = 0); /** Static function to check if given info will lead to a valid configuration of @ref NESlice * @@ -96,8 +101,13 @@ public: * * @return A status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *boxes, const ITensorInfo *box_ind, const ITensorInfo *output, - Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value); + static Status validate(const ITensorInfo *input, + const ITensorInfo *boxes, + const ITensorInfo *box_ind, + const ITensorInfo *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value); void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h index cdc3a636b07dc3ce3733b4788e8b3a63cb40591e..aabe42f9289353e2291e7be2d19a999ac93b2c24 100644 --- a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h @@ -24,15 +24,14 @@ #ifndef ARM_COMPUTE_NEDECONVOLUTIONLAYER_H #define ARM_COMPUTE_NEDECONVOLUTIONLAYER_H -#include "arm_compute/runtime/CPP/functions/CPPUpsample.h" -#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h" -#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h" -#include "arm_compute/runtime/NEON/functions/NEReverse.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CPP/functions/CPPUpsample.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h" +#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h" +#include "arm_compute/runtime/NEON/functions/NEReverse.h" #include "arm_compute/runtime/Tensor.h" #include @@ -117,7 +116,13 @@ public: * the GEMM convolution. * */ - void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info, bool enable_fast_math = false, const WeightsInfo &weights_info = WeightsInfo()); + void configure(ITensor *input, + const ITensor *weights, + const ITensor *bias, + ITensor *output, + const PadStrideInfo &info, + bool enable_fast_math = false, + const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEDeconvolutionLayer * * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. 
@@ -134,8 +139,13 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &info, - bool enable_fast_math = false, const WeightsInfo &weights_info = WeightsInfo()); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *output, + const PadStrideInfo &info, + bool enable_fast_math = false, + const WeightsInfo &weights_info = WeightsInfo()); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h b/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h index eb0724ae124be1499a30e075e4703f329beb2640..7bfdfbd13d8e1573aa018b09e09936e14418725f 100644 --- a/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NEDEPTHCONVERT_H #define ARM_COMPUTE_NEDEPTHCONVERT_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include @@ -84,7 +83,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift = 0); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift = 0); // Inherited methods overridden void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h b/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h index b9bdcd1f11a7467b70459903d94f8105d97e1cfa..c7df29a70428dab41edb51cee2718bf1ae806fa2 100644 --- a/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NEDEPTHTOSPACELAYER_H #define ARM_COMPUTE_NEDEPTHTOSPACELAYER_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" namespace arm_compute diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h index 6f2ec8cddbe6a1faaf11001a4d25d09577afbf72..6ad5aa7bfa025d65a0a2c89d0599e89a4a61447f 100644 --- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h @@ -28,6 +28,7 @@ #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/NEON/functions/NEPermute.h" + #include namespace arm_compute @@ -80,8 +81,14 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). 
*/ - void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); + void configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + const Size2D &dilation = Size2D(1U, 1U)); /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayer * @@ -98,8 +105,14 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + const Size2D &dilation = Size2D(1U, 1U)); // Inherited methods overriden: void run() override; @@ -127,9 +140,11 @@ private: /** Default move constructor */ NEDepthwiseConvolutionLayerOptimizedInternal(NEDepthwiseConvolutionLayerOptimizedInternal &&) = default; /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEDepthwiseConvolutionLayerOptimizedInternal &operator=(const NEDepthwiseConvolutionLayerOptimizedInternal &) = delete; + NEDepthwiseConvolutionLayerOptimizedInternal & + operator=(const NEDepthwiseConvolutionLayerOptimizedInternal &) = delete; /** Default move assignment operator */ - NEDepthwiseConvolutionLayerOptimizedInternal &operator=(NEDepthwiseConvolutionLayerOptimizedInternal &&) = default; + NEDepthwiseConvolutionLayerOptimizedInternal & + operator=(NEDepthwiseConvolutionLayerOptimizedInternal &&) = default; /** Default destructor */ ~NEDepthwiseConvolutionLayerOptimizedInternal() = default; /** Initialize the function's source, destination, kernels and border_size. @@ -144,8 +159,14 @@ private: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). 
*/ - void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); + void configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + const Size2D &dilation = Size2D(1U, 1U)); /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayer3x3 * @@ -161,8 +182,14 @@ private: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + const Size2D &dilation = Size2D(1U, 1U)); // Inherited methods overriden: void run() override; @@ -207,8 +234,14 @@ private: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). */ - void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); + void configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + const Size2D &dilation = Size2D(1U, 1U)); /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayerGeneric * @@ -225,8 +258,14 @@ private: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + const Size2D &dilation = Size2D(1U, 1U)); // Inherited methods overriden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h b/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h index 2affa8d49ecd2dc2985f433522baa210295aa1ca..7a94833d10432f54dbc7294f88322b033f1bc99a 100644 --- a/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h @@ -24,13 +24,12 @@ #ifndef ARM_COMPUTE_NE_DETECTION_POSTPROCESS_H #define ARM_COMPUTE_NE_DETECTION_POSTPROCESS_H -#include 
"arm_compute/runtime/NEON/INESimpleFunction.h" - #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h" +#include "arm_compute/runtime/NEON/INESimpleFunction.h" #include "arm_compute/runtime/Tensor.h" #include @@ -78,8 +77,14 @@ public: * * @note Output contains all the detections. Of those, only the ones selected by the valid region are valid. */ - void configure(const ITensor *input_box_encoding, const ITensor *input_score, const ITensor *input_anchors, - ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo()); + void configure(const ITensor *input_box_encoding, + const ITensor *input_score, + const ITensor *input_anchors, + ITensor *output_boxes, + ITensor *output_classes, + ITensor *output_scores, + ITensor *num_detection, + DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEDetectionPostProcessLayer * * @param[in] input_box_encoding The bounding box input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F32. @@ -93,8 +98,13 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors, - ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, + static Status validate(const ITensorInfo *input_box_encoding, + const ITensorInfo *input_class_score, + const ITensorInfo *input_anchors, + ITensorInfo *output_boxes, + ITensorInfo *output_classes, + ITensorInfo *output_scores, + ITensorInfo *num_detection, DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo()); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h index 8db7e6596b818d991f478e0b31520a2ec4db64e7..3ae3b2a15c1ef3aca9736963e990123892f04eb2 100644 --- a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h @@ -85,7 +85,12 @@ public: * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
*/ - void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensor *input, + const ITensor *weights, + const ITensor *bias, + ITensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayer * * @note: DirectConvolution only works in the following configurations: @@ -106,7 +111,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info, + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *output, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: diff --git a/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h b/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h index bfcd221e17fc388b4053e382405b4717d1e51771..ebf2277d1f6303be2db72ae24ee08e6785379ed9 100644 --- a/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h +++ b/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h @@ -73,7 +73,10 @@ public: * @param[out] output Output tensor. Data types supported: Same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for max * * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. @@ -83,7 +86,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; @@ -133,7 +139,10 @@ public: * @param[out] output Output tensor. Data types supported: Same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for min * * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. 
@@ -143,7 +152,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; @@ -193,7 +205,10 @@ public: * @param[out] output Output tensor. Data types supported: Same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for squared difference * * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. @@ -203,7 +218,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; @@ -249,7 +267,10 @@ public: * @param[out] output Output tensor. Data types supported: Same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for division * * @param[in] input1 First tensor input info. Data types supported: F16/F32. @@ -259,7 +280,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; @@ -306,7 +330,10 @@ public: * @param[out] output Output tensor. Data types supported: Same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for power * * @param[in] input1 First tensor input info. Data types supported: F16/F32. 
@@ -316,7 +343,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; @@ -377,7 +407,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op); + static Status + validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEFFT1D.h b/arm_compute/runtime/NEON/functions/NEFFT1D.h index 9654b1e6046a4c6f32d58e4a32a0135105730dff..99c6fd4eb416dbedd95e1b5741602a5a5c980f12 100644 --- a/arm_compute/runtime/NEON/functions/NEFFT1D.h +++ b/arm_compute/runtime/NEON/functions/NEFFT1D.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NEFFT1D_H #define ARM_COMPUTE_NEFFT1D_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/runtime/FunctionDescriptors.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" diff --git a/arm_compute/runtime/NEON/functions/NEFFT2D.h b/arm_compute/runtime/NEON/functions/NEFFT2D.h index 57f38d1942fbb4ce7681e46eecd17065dde0c442..cefd3df17a2748e27aba7d00aa6aa333b12af950 100644 --- a/arm_compute/runtime/NEON/functions/NEFFT2D.h +++ b/arm_compute/runtime/NEON/functions/NEFFT2D.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NEFFT2D_H #define ARM_COMPUTE_NEFFT2D_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/runtime/FunctionDescriptors.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEFFT1D.h" #include "arm_compute/runtime/Tensor.h" diff --git a/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h index c5f4d45b6b8d8f511210ef227dfa3b29271159be..84bfe6b02f800764f2f4eddee90dc9f11c47a676 100644 --- a/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NEFFTCONVOLUTIONLAYER_H #define ARM_COMPUTE_NEFFTCONVOLUTIONLAYER_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/runtime/NEON/functions/NEFFT2D.h" @@ -94,8 +93,13 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * @param[in] enable_fast_math (Optional) Enable fast math computation. Unused for CPU backend. 
*/ - void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + void configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); /** Static function to check if given info will lead to a valid configuration of @ref NEFFTConvolutionLayer * * @note: This function only works with any square kernel size and unit strides for both NCHW and NHWC data layout @@ -113,8 +117,13 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEFill.h b/arm_compute/runtime/NEON/functions/NEFill.h index e923ce33e1ec4f8a2c5ef090d7a69b606e574a55..1829c71fefe255fa5362338eeba346e60ada2534 100644 --- a/arm_compute/runtime/NEON/functions/NEFill.h +++ b/arm_compute/runtime/NEON/functions/NEFill.h @@ -24,10 +24,9 @@ #ifndef ARM_COMPUTE_NEFILL_H #define ARM_COMPUTE_NEFILL_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include diff --git a/arm_compute/runtime/NEON/functions/NEFillBorder.h b/arm_compute/runtime/NEON/functions/NEFillBorder.h index ab77c288390d996845436a3224b76ba4f9d9d237..44b1d4a62b8e53abcd62e195f55f8c9fdd5a045f 100644 --- a/arm_compute/runtime/NEON/functions/NEFillBorder.h +++ b/arm_compute/runtime/NEON/functions/NEFillBorder.h @@ -27,6 +27,7 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" + #include namespace arm_compute @@ -57,7 +58,10 @@ public: * @param[in] border_mode Strategy to use for borders. * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. 
*/ - void configure(ITensor *input, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); + void configure(ITensor *input, + unsigned int border_width, + BorderMode border_mode, + const PixelValue &constant_border_value = PixelValue()); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEFloor.h b/arm_compute/runtime/NEON/functions/NEFloor.h index 4d47b068dbbd4148bfc4a1b8a60f2a0b9753ffc4..77ac484bab8c7b399ee5dc0fedc7a3f1d11e6828 100644 --- a/arm_compute/runtime/NEON/functions/NEFloor.h +++ b/arm_compute/runtime/NEON/functions/NEFloor.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NEFLOOR_H #define ARM_COMPUTE_NEFLOOR_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include diff --git a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h index 05b7ce3735452ccec71d4ee82013f29c9fec43dc..885f8430cf4f9ae00993c705ad98bd5cc3548114 100644 --- a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h +++ b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h @@ -28,7 +28,6 @@ #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/IWeightsManager.h" - #include "arm_compute/runtime/NEON/functions/NETranspose.h" #include "arm_compute/runtime/Tensor.h" @@ -88,7 +87,8 @@ class NEFullyConnectedLayer : public IFunction { public: /** Constructor */ - NEFullyConnectedLayer(std::shared_ptr memory_manager = nullptr, IWeightsManager *weights_manager = nullptr); + NEFullyConnectedLayer(std::shared_ptr memory_manager = nullptr, + IWeightsManager *weights_manager = nullptr); /** Prevent instances of this class from being copied (As this class contains pointers) */ NEFullyConnectedLayer(const NEFullyConnectedLayer &) = delete; /** Prevent instances of this class from being moved (As this class contains pointers) */ @@ -126,16 +126,24 @@ public: * @param[in] fc_info (Optional) Fully connected layer additional info * @param[in] weights_info (Optional) Stores neccessary compute information when weights are already reshaped */ - void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, - FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), const WeightsInfo &weights_info = WeightsInfo()); + void configure(const ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), + const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEFullyConnectedLayer * * Similar to @ref NEFullyConnectedLayer::configure() * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), const WeightsInfo &weights_info = WeightsInfo()); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), + const WeightsInfo &weights_info = WeightsInfo()); /** Static function that queries whether fixed-format kernel exists for a given problem description * @@ -149,8 +157,13 @@ public: * * @return a status */ - static Status 
has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *biases, const ITensorInfo *output, const FullyConnectedLayerInfo &fc_info, const WeightsInfo &weights_info); + static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const FullyConnectedLayerInfo &fc_info, + const WeightsInfo &weights_info); //Inherited methods override void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h b/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h index 3dd7f490442ed49c25fc7488c5fc43d444ea8373..f53b3de7f67e343b45cd9298e8e7e268be635242 100644 --- a/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h +++ b/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h @@ -75,9 +75,16 @@ public: * @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f. * @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to Convolution. */ - void configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *input_bias = nullptr, const ITensor *bn_beta = nullptr, const ITensor *bn_gamma = nullptr, - float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); + void configure(const ITensor *input_weights, + const ITensor *bn_mean, + const ITensor *bn_var, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *input_bias = nullptr, + const ITensor *bn_beta = nullptr, + const ITensor *bn_gamma = nullptr, + float epsilon = 0.001f, + FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); /** Static function to check if given info will lead to a valid configuration of @ref NEFuseBatchNormalization * * @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. 
Data layout supported: NCHW, NHWC @@ -95,10 +102,16 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr, - float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); + static Status validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias = nullptr, + const ITensorInfo *bn_beta = nullptr, + const ITensorInfo *bn_gamma = nullptr, + float epsilon = 0.001f, + FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h index c6ff2dfb92b4671ceb5fc6db3eb6fb93bfcf6546..29650a5ecab3beaa059ddc3e1d438e426bb64b31 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMM.h +++ b/arm_compute/runtime/NEON/functions/NEGEMM.h @@ -78,14 +78,26 @@ public: * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and * if the reshape of matrix B should happen only for the first run */ - void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo()); + void configure(const ITensor *a, + const ITensor *b, + const ITensor *c, + ITensor *d, + float alpha, + float beta, + const GEMMInfo &gemm_info = GEMMInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMM. * * Similar to @ref NEGEMM::configure() * * @return a status */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo()); + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info = GEMMInfo()); /** Static function that queries whether there exists fixed-format kernel and if it exists it will return in the first argument in what format * weights are expected to be reshaped as defined by WeightFormat class. 
Apart from the first argument the rest of the arguments are the same @@ -93,8 +105,14 @@ public: * * @return a status */ - static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, - float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo()); + static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info = GEMMInfo()); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h index 53ceb6d978202fad4dfbe92ccc81243ea883009a..d1c5a1c9b33d0710fe9287b3ae2ef7ef97e7e8a9 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h @@ -86,7 +86,8 @@ public: * Data types supported: Same as @p input. * @param[in] info Convolution layer descriptor */ - void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info); + void + configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConv2d * * @param[in] input Source tensor info. 3 lower dimensions represent a single input [width, height, IFM], @@ -102,7 +103,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const Conv2dInfo &info); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h index 72309e464ef18703789da07ad269efa1b2c3d793..3e84c3e2cf3b40d6c604e0a2bce9f8a768a093fb 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h @@ -24,8 +24,6 @@ #ifndef ARM_COMPUTE_NEGEMMCONVOLUTIONLAYER_H #define ARM_COMPUTE_NEGEMMCONVOLUTIONLAYER_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" #include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/IFunction.h" @@ -49,7 +47,8 @@ class NEGEMMConvolutionLayer : public IFunction { public: /** Constructor */ - NEGEMMConvolutionLayer(const std::shared_ptr &memory_manager = nullptr, IWeightsManager *weights_manager = nullptr); + NEGEMMConvolutionLayer(const std::shared_ptr &memory_manager = nullptr, + IWeightsManager *weights_manager = nullptr); /** Prevent instances of this class from being copied (As this class contains pointers) */ NEGEMMConvolutionLayer(const NEGEMMConvolutionLayer &) = delete; /** Prevent instances of this class from being moved (As this class contains non movable objects) */ @@ -95,8 +94,16 @@ public: * available which may introduce a drop of accuracy as well. Default is false * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. 
num_groups != 1 is not supported */ - void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(), - const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1); + void configure(const ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer * * @param[in] input Source tensor info. 3 lower dimensions represent a single input [width, height, IFM], @@ -119,9 +126,16 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), - bool enable_fast_math = false, unsigned int num_groups = 1); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); /** Static function to check if there is an optimized version of * GEMM available for the input parameters. 
@@ -178,10 +192,16 @@ public: * * @return a Status */ - static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), - bool enable_fast_math = false); + static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); // Inherited methods overridden: void run() override; void prepare() override; diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h index addb13cdfa6a267b02bbaed2fc45d14433962802..824c4443add07608ad832adf4f6d124096fc8db7 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h @@ -47,7 +47,8 @@ class NEGEMMLowpMatrixMultiplyCore : public IFunction { public: /** Constructor */ - NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr); + NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr, + IWeightsManager *weights_manager = nullptr); /** Prevent instances of this class from being copied (As this class contains pointers) */ NEGEMMLowpMatrixMultiplyCore(const NEGEMMLowpMatrixMultiplyCore &) = delete; /** Default move constructor */ @@ -96,14 +97,19 @@ public: * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and * if the reshape of matrix B should be executed only for the first run */ - void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info = GEMMInfo()); + void configure( + const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info = GEMMInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyCore * * Similar to @ref NEGEMMLowpMatrixMultiplyCore::configure() * * @return a status */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo()); + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info = GEMMInfo()); // Inherited methods overridden void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h index 232344e5c23dbd16f016fe3707cd9398db7a214a..0d932bb4af9f2d3b9a54a2a75edd12a89e1a7c33 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h @@ -89,7 +89,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *bias, +
const ITensorInfo *output, + const GEMMLowpOutputStageInfo &info); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h b/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h index 3b683382ec6a67d141c436c0c217fb27070c8b80..0f294fde22825fba7e6227814dbce9211fae361a 100644 --- a/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h +++ b/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h @@ -95,7 +95,12 @@ public: * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the @ref GenerateProposalsInfo struct. * @note Proposals contains all the proposals. Of those, only the first num_valid_proposals are valid. */ - void configure(const ITensor *scores, const ITensor *deltas, const ITensor *anchors, ITensor *proposals, ITensor *scores_out, ITensor *num_valid_proposals, + void configure(const ITensor *scores, + const ITensor *deltas, + const ITensor *anchors, + ITensor *proposals, + ITensor *scores_out, + ITensor *num_valid_proposals, const GenerateProposalsInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref NEGenerateProposalsLayer @@ -112,7 +117,11 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out, + static Status validate(const ITensorInfo *scores, + const ITensorInfo *deltas, + const ITensorInfo *anchors, + const ITensorInfo *proposals, + const ITensorInfo *scores_out, const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info); diff --git a/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h index bb0697072b19b5cf0c46abd74aa1fdd8a7a631cd..0bc57be09ef78e12ac699d3bb67040aa7af8b055 100644 --- a/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h +++ b/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h @@ -89,7 +89,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, float gamma = 1.0f, float beta = 0.0f, float epsilon = 1e-12f); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + float gamma = 1.0f, + float beta = 0.0f, + float epsilon = 1e-12f); // Inherited methods overridden: void run() override; @@ -103,5 +107,5 @@ private: Tensor _permuted_input; Tensor _permuted_output; }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h b/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h index 7f1a5e785e7db06111b61f076d4c03808bedbb26..8502cee5d202c975b1c4e3fcf787e8167d019829 100644 --- a/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h +++ b/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h @@ -97,5 +97,5 @@ private: std::unique_ptr _normalize_kernel; Tensor _sumsq; }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_NEL2NORMALIZELAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayer.h b/arm_compute/runtime/NEON/functions/NELSTMLayer.h index 4272215486725eab4325988c88726c8f9b6e901b..629c5d10a085fb59639e10ce7b1cda478539c727 100644 --- a/arm_compute/runtime/NEON/functions/NELSTMLayer.h +++ b/arm_compute/runtime/NEON/functions/NELSTMLayer.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NELSTMLAYER_H 
#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/common/LSTMParams.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" @@ -35,7 +36,6 @@ #include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h" #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h" #include "arm_compute/runtime/NEON/functions/NETranspose.h" -#include "arm_compute/runtime/common/LSTMParams.h" namespace arm_compute { @@ -104,13 +104,26 @@ public: * @param[in] projection_threshold The clipping threshold for the output from the projection layer, such that values are bound within [-proj_clip, proj_clip]. * If set to 0.0 then clipping is disabled. */ - void configure(const ITensor *input, - const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights, - const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights, - const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias, - const ITensor *output_state_in, const ITensor *cell_state_in, - ITensor *scratch_buffer, ITensor *output_state_out, ITensor *cell_state_out, ITensor *output, - const LSTMParams &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold = 0.f, float projection_threshold = 0.f); + void configure(const ITensor *input, + const ITensor *input_to_forget_weights, + const ITensor *input_to_cell_weights, + const ITensor *input_to_output_weights, + const ITensor *recurrent_to_forget_weights, + const ITensor *recurrent_to_cell_weights, + const ITensor *recurrent_to_output_weights, + const ITensor *forget_gate_bias, + const ITensor *cell_bias, + const ITensor *output_gate_bias, + const ITensor *output_state_in, + const ITensor *cell_state_in, + ITensor *scratch_buffer, + ITensor *output_state_out, + ITensor *cell_state_out, + ITensor *output, + const LSTMParams &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold = 0.f, + float projection_threshold = 0.f); /** Static function to check if given info will lead to a valid configuration of @ref NELSTMLayer * @@ -151,13 +164,26 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, - const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in, - const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output, - const LSTMParams &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold = 0.f, float projection_threshold = 0.f); + static Status validate(const ITensorInfo *input, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *forget_gate_bias, + const 
ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_in, + const ITensorInfo *scratch_buffer, + const ITensorInfo *output_state_out, + const ITensorInfo *cell_state_out, + const ITensorInfo *output, + const LSTMParams &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold = 0.f, + float projection_threshold = 0.f); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h index bcb89d997d78406a30b4f928e772a50d0d6e6328..ae951669b37c479869c5d76151e9019c67434ae7 100644 --- a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h +++ b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NELSTMLAYERQUANTIZED_H #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/common/LSTMParams.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h" @@ -38,8 +39,6 @@ #include "arm_compute/runtime/NEON/functions/NESlice.h" #include "arm_compute/runtime/NEON/functions/NETranspose.h" -#include "arm_compute/runtime/common/LSTMParams.h" - namespace arm_compute { // Forward declarations @@ -104,11 +103,22 @@ public: * @param[out] output_state_out Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size].Data types supported: Same as @p input. */ void configure(const ITensor *input, - const ITensor *input_to_input_weights, const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights, - const ITensor *recurrent_to_input_weights, const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights, - const ITensor *input_gate_bias, const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias, - ITensor *cell_state_in, const ITensor *output_state_in, - ITensor *cell_state_out, ITensor *output_state_out); + const ITensor *input_to_input_weights, + const ITensor *input_to_forget_weights, + const ITensor *input_to_cell_weights, + const ITensor *input_to_output_weights, + const ITensor *recurrent_to_input_weights, + const ITensor *recurrent_to_forget_weights, + const ITensor *recurrent_to_cell_weights, + const ITensor *recurrent_to_output_weights, + const ITensor *input_gate_bias, + const ITensor *forget_gate_bias, + const ITensor *cell_bias, + const ITensor *output_gate_bias, + ITensor *cell_state_in, + const ITensor *output_state_in, + ITensor *cell_state_out, + ITensor *output_state_out); /** Static function to check if given info will lead to a valid configuration of @ref NELSTMLayer * @@ -133,11 +143,22 @@ public: * @return a status */ static Status validate(const ITensorInfo *input, - const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo 
*cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out); + const ITensorInfo *input_to_input_weights, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_input_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *input_gate_bias, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEMatMul.h b/arm_compute/runtime/NEON/functions/NEMatMul.h index e961f860c1c55c12d8d9cc23cfd11485e357980d..414fc2f3fdd89ec7ad602d338d742c5c75a5c795 100644 --- a/arm_compute/runtime/NEON/functions/NEMatMul.h +++ b/arm_compute/runtime/NEON/functions/NEMatMul.h @@ -27,6 +27,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/IFunction.h" + #include namespace arm_compute @@ -48,7 +49,7 @@ public: }; private: - bool _fast_math{ false }; + bool _fast_math{false}; }; // Forward declarations @@ -96,7 +97,12 @@ public: * @param[in] settings Contains flags for function level settings i.e fast math * @param[in] act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions. */ - void configure(ITensor *lhs, ITensor *rhs, ITensor *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensor *lhs, + ITensor *rhs, + ITensor *dst, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEMatMul * * @param[in] lhs Left-hand side tensor info. Data types supported: F16/F32/QASYMM8_SIGNED/QASYMM8. 
@@ -108,7 +114,11 @@ public: * * @return Status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulInfo &info, + const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden @@ -118,5 +128,5 @@ private: struct Impl; std::unique_ptr<Impl> _impl; }; -} +} // namespace arm_compute #endif /* ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL */ diff --git a/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h b/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h index 2f77540e1ee795388c0987f9098e3ae0b72b3223..e00fc4544fc5ddb3e2a150f45549c911323166b2 100644 --- a/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h +++ b/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h @@ -26,6 +26,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" + #include <memory> namespace arm_compute @@ -86,7 +87,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *indices, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info); // Inherited methods overridden: void run() override; @@ -96,5 +100,5 @@ private: struct Impl; std::unique_ptr<Impl> _impl; }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_NEMAXUNPOOLINGLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h index fbe000445cbef9bf2cc48db784071d676c0b184a..27e3fa674e5fa77c4baa7575dc374304ab61ebca 100644 --- a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h +++ b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NENORMALIZATIONLAYER_H #define ARM_COMPUTE_NENORMALIZATIONLAYER_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h" @@ -88,16 +87,17 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info); // Inherited methods overridden: void run() override; private: - MemoryGroup _memory_group; /**< Function memory group */ - std::unique_ptr<NENormalizationLayerKernel> _norm_kernel; /**< Normalization layer kernel */ - NEPixelWiseMultiplication _multiply_f; /**< Pixel multiplication function */ - Tensor _input_squared; /**< The intermediate buffer which stores results of squaring input */ + MemoryGroup _memory_group; /**< Function memory group */ + std::unique_ptr<NENormalizationLayerKernel> _norm_kernel; /**< Normalization layer kernel */ + NEPixelWiseMultiplication _multiply_f; /**< Pixel multiplication function */ + Tensor _input_squared; /**< The intermediate buffer which stores results of squaring input */ }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_NENORMALIZATIONLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEPadLayer.h b/arm_compute/runtime/NEON/functions/NEPadLayer.h
index 4aa6725496234eabe67037a08dff5747b0614cb1..494b1c0641426c9ee3f3e83a8dc8efcfc6e2626f 100644 --- a/arm_compute/runtime/NEON/functions/NEPadLayer.h +++ b/arm_compute/runtime/NEON/functions/NEPadLayer.h @@ -24,14 +24,14 @@ #ifndef ARM_COMPUTE_NEPADLAYER_H #define ARM_COMPUTE_NEPADLAYER_H +#include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h" #include "arm_compute/runtime/NEON/functions/NECopy.h" #include "arm_compute/runtime/NEON/functions/NEStridedSlice.h" #include "arm_compute/runtime/SubTensor.h" - -#include "arm_compute/core/Types.h" #include "arm_compute/runtime/Tensor.h" + #include namespace arm_compute @@ -82,7 +82,11 @@ public: * @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT, * or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT). */ - void configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT); + void configure(ITensor *input, + ITensor *output, + const PaddingList &padding, + const PixelValue constant_value = PixelValue(), + const PaddingMode mode = PaddingMode::CONSTANT); /** Static function to check if given info will lead to a valid configuration of @ref NEPadLayer. * * @param[in] input Source tensor info. Data types supported: All. @@ -95,7 +99,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + const PixelValue constant_value = PixelValue(), + const PaddingMode mode = PaddingMode::CONSTANT); // Inherited methods overridden: void run() override; @@ -109,7 +117,10 @@ private: * specifies the front and the end padding in the i-th dimension. * @param[in] constant_value Constant value to be used for the padding */ - void configure_constant_mode(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value); + void configure_constant_mode(ITensor *input, + ITensor *output, + const PaddingList &padding, + const PixelValue constant_value); /** Configure functions for when reflect or symmetric padding is used. * * @param[in] input Source tensor. Data types supported: All. diff --git a/arm_compute/runtime/NEON/functions/NEPermute.h b/arm_compute/runtime/NEON/functions/NEPermute.h index c863fde0ac29f85ec501ce1e132d94a8f56e76e8..2cef64764d29aa71a8569bb5422e0655dbcf90bf 100644 --- a/arm_compute/runtime/NEON/functions/NEPermute.h +++ b/arm_compute/runtime/NEON/functions/NEPermute.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NEPERMUTE_H #define ARM_COMPUTE_NEPERMUTE_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include diff --git a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h index 634e8e0c39ebd5abb782e65a507acfa5154b1f6b..3d81bf6087f3cd49f3cf354058ad55d1da177b58 100644 --- a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h +++ b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h @@ -95,7 +95,12 @@ public: * @param[in] rounding_policy Rounding policy. 
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - void configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, + void configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplication * @@ -122,7 +127,12 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: @@ -158,7 +168,10 @@ public: * @param[out] output The output tensor. Data types supported: same as @p input1. Number of channels: same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEComplexPixelWiseMultiplication * * @param[in] input1 An input tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor). @@ -166,7 +179,10 @@ public: * @param[in] output The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. 
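A hedged usage sketch for the reflowed NEPixelWiseMultiplication::configure() above: the tensors are assumed to be float tensors prepared elsewhere, and the scale/policy combination shown is a typical float setup, not something mandated by this patch.

#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void multiply_tensors(Tensor &input1, Tensor &input2, Tensor &output)
{
    NEPixelWiseMultiplication mul;
    // scale = 1, saturate on overflow, truncate when rounding; act_info keeps its default.
    mul.configure(&input1, &input2, &output, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
    mul.run();
}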
*/ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h b/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h index 4c5eb58e05ced72509f27b84aa9e9ead1e93bf9d..09251f2a5f273acabce8f77d83dec001719ffc73 100644 --- a/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h +++ b/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h @@ -92,5 +92,5 @@ private: struct Impl; std::unique_ptr _impl; }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_NEPOOLING3DLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h index 9147ad968790fe42e0fba4844e9211800aa1bcb8..768ad0d818ffbd231356118ec9636d80e8ee3c30 100644 --- a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h +++ b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h @@ -91,7 +91,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices = nullptr); // Inherited methods overridden: void run() override; @@ -100,5 +103,5 @@ private: struct Impl; std::unique_ptr _impl; }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_NEPOOLINGLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h b/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h index 38e0c9f3ad96b696da5d59894fb2d804e961dcf7..858e3299afa4db2d8c4367b646a0c263317433b5 100644 --- a/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h +++ b/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h @@ -62,7 +62,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info); }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEPRIORBOXLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h index 185d821ec013f7bcd0f3a94f3435eef4f849112a..009a4e09118345469e0d004bd3de24dc5503c660 100644 --- a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h +++ b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NEQLSTMLAYER_H #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/common/LSTMParams.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" @@ -35,7 +36,6 @@ #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h" #include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h" #include "arm_compute/runtime/NEON/functions/NETranspose.h" -#include "arm_compute/runtime/common/LSTMParams.h" #include @@ -130,12 +130,21 @@ public: * projection_threshold (Optional) The clipping threshold for the 
output from the projection layer, such that values are bound within * [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled. */ - void configure(const ITensor *input, - const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights, - const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights, - const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias, - const ITensor *cell_state_in, ITensor *output_state_in, - ITensor *cell_state_out, ITensor *output_state_out, ITensor *output, + void configure(const ITensor *input, + const ITensor *input_to_forget_weights, + const ITensor *input_to_cell_weights, + const ITensor *input_to_output_weights, + const ITensor *recurrent_to_forget_weights, + const ITensor *recurrent_to_cell_weights, + const ITensor *recurrent_to_output_weights, + const ITensor *forget_gate_bias, + const ITensor *cell_bias, + const ITensor *output_gate_bias, + const ITensor *cell_state_in, + ITensor *output_state_in, + ITensor *cell_state_out, + ITensor *output_state_out, + ITensor *output, const LSTMParams &lstm_params); /** Static function to check if given info will lead to a valid configuration of @ref NEQLSTMLayer @@ -180,12 +189,21 @@ public: * [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled. * @return a status */ - static Status validate(const ITensorInfo *input, - const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output, + static Status validate(const ITensorInfo *input, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out, + const ITensorInfo *output, const LSTMParams &lstm_params); // Inherited methods overridden: @@ -218,10 +236,17 @@ private: * @param[in] mm_res_info Tensor info to be used to initialize output stage result tensor. 
* */ - void configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, NEGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info, - const ITensor *mm_input, const ITensor *mm_weights, const ITensor *bias, Tensor *mm_res, - Tensor *outstage_res, float gemmlowp_scale, - const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info); + void configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, + NEGEMMLowpOutputStage &outstage, + GEMMLowpOutputStageInfo &gemmlowp_info, + const ITensor *mm_input, + const ITensor *mm_weights, + const ITensor *bias, + Tensor *mm_res, + Tensor *outstage_res, + float gemmlowp_scale, + const TensorInfo &mm_res_info, + const TensorInfo &outstage_tensor_info); MemoryGroup _memory_group; @@ -230,8 +255,8 @@ private: { static constexpr uint32_t max_dimension_supported = 2; - ITensor *_src{ nullptr }; - ITensor *_dst{ nullptr }; + ITensor *_src{nullptr}; + ITensor *_dst{nullptr}; size_t _row_size{}; Window _window{}; @@ -335,19 +360,16 @@ private: NECopy _copy_output; // Tensor pointers - const ITensor *_input_to_input_weights - { - nullptr - }; - const ITensor *_recurrent_to_input_weights{ nullptr }; - const ITensor *_projection_bias{ nullptr }; - const ITensor *_input_to_forget_weights{ nullptr }; - const ITensor *_input_to_cell_weights{ nullptr }; - const ITensor *_input_to_output_weights{ nullptr }; - const ITensor *_recurrent_to_forget_weights{ nullptr }; - const ITensor *_recurrent_to_cell_weights{ nullptr }; - const ITensor *_recurrent_to_output_weights{ nullptr }; - const ITensor *_projection_weights{ nullptr }; + const ITensor *_input_to_input_weights{nullptr}; + const ITensor *_recurrent_to_input_weights{nullptr}; + const ITensor *_projection_bias{nullptr}; + const ITensor *_input_to_forget_weights{nullptr}; + const ITensor *_input_to_cell_weights{nullptr}; + const ITensor *_input_to_output_weights{nullptr}; + const ITensor *_recurrent_to_forget_weights{nullptr}; + const ITensor *_recurrent_to_cell_weights{nullptr}; + const ITensor *_recurrent_to_output_weights{nullptr}; + const ITensor *_projection_weights{nullptr}; std::array _layer_norm_weights{}; std::array _layer_norm_bias{}; @@ -382,66 +404,66 @@ private: return _layer_norms[getGateIndex(g)]; } - void configure_layer_norm(LayerNormGate g, const ITensor *in); + void configure_layer_norm(LayerNormGate g, const ITensor *in); static Status validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias); // Temporary tensors - Tensor _input_to_forget_weights_f32{ nullptr }; - Tensor _input_to_forget_weights_symm8{ nullptr }; + Tensor _input_to_forget_weights_f32{nullptr}; + Tensor _input_to_forget_weights_symm8{nullptr}; - Tensor _input_to_forget_weights_transposed{ nullptr }; - Tensor _input_to_cell_weights_transposed{ nullptr }; - Tensor _input_to_output_weights_transposed{ nullptr }; - Tensor _input_to_input_weights_transposed{ nullptr }; - Tensor _recurrent_to_forget_weights_transposed{ nullptr }; - Tensor _recurrent_to_cell_weights_transposed{ nullptr }; - Tensor _recurrent_to_output_weights_transposed{ nullptr }; - Tensor _recurrent_to_input_weights_transposed{ nullptr }; - Tensor _projection_weights_transposed{ nullptr }; - Tensor _input_to_input_eff_bias{ nullptr }; - Tensor _recurrent_to_input_eff_bias{ nullptr }; - Tensor _input_to_forget_eff_bias{ nullptr }; - Tensor _recurrent_to_forget_eff_bias{ nullptr }; - Tensor _input_to_cell_eff_bias{ nullptr }; - Tensor _recurrent_to_cell_eff_bias{ nullptr }; - Tensor _input_to_output_eff_bias{ nullptr }; - Tensor 
_recurrent_to_output_eff_bias{ nullptr }; - Tensor _projection_reduction_res{ nullptr }; - Tensor _projection_eff_bias{ nullptr }; - Tensor _mm_input_to_forget_res{ nullptr }; - Tensor _mm_recurrent_to_forget_res{ nullptr }; - Tensor _mul_cell_to_forget_res{ nullptr }; - Tensor _input_to_forget_outstage_res{ nullptr }; - Tensor _cell_to_forget_outstage_res{ nullptr }; - Tensor _recurrent_to_forget_outstage_res{ nullptr }; - Tensor _forget_gate{ nullptr }; - Tensor _mm_input_to_cell_res{ nullptr }; - Tensor _input_to_cell_outstage_res{ nullptr }; - Tensor _mm_recurrent_to_cell_res{ nullptr }; - Tensor _recurrent_to_cell_outstage_res{ nullptr }; - Tensor _cell_gate{ nullptr }; - Tensor _mul_input_cell_res{ nullptr }; - Tensor _mm_input_to_input_res{ nullptr }; - Tensor _input_to_input_outstage_res{ nullptr }; - Tensor _mm_recurrent_to_input_res{ nullptr }; - Tensor _mul_cell_to_input_res{ nullptr }; - Tensor _cell_to_input_outstage_res{ nullptr }; - Tensor _recurrent_to_input_outstage_res{ nullptr }; - Tensor _input_gate{ nullptr }; - Tensor _mm_input_to_output_res{ nullptr }; - Tensor _input_to_output_outstage_res{ nullptr }; - Tensor _mm_recurrent_to_output_res{ nullptr }; - Tensor _mul_cell_to_output_res{ nullptr }; - Tensor _cell_to_output_outstage_res{ nullptr }; - Tensor _recurrent_to_output_outstage_res{ nullptr }; - Tensor _output_gate{ nullptr }; - Tensor _hidden_mul_res{ nullptr }; - Tensor _hidden_gate{ nullptr }; - Tensor _mm_projection_res{ nullptr }; - Tensor _projection_outstage_res{ nullptr }; - Tensor _projection_out_res{ nullptr }; - Tensor _projection_accumulate_res{ nullptr }; - Tensor _ones{ nullptr }; + Tensor _input_to_forget_weights_transposed{nullptr}; + Tensor _input_to_cell_weights_transposed{nullptr}; + Tensor _input_to_output_weights_transposed{nullptr}; + Tensor _input_to_input_weights_transposed{nullptr}; + Tensor _recurrent_to_forget_weights_transposed{nullptr}; + Tensor _recurrent_to_cell_weights_transposed{nullptr}; + Tensor _recurrent_to_output_weights_transposed{nullptr}; + Tensor _recurrent_to_input_weights_transposed{nullptr}; + Tensor _projection_weights_transposed{nullptr}; + Tensor _input_to_input_eff_bias{nullptr}; + Tensor _recurrent_to_input_eff_bias{nullptr}; + Tensor _input_to_forget_eff_bias{nullptr}; + Tensor _recurrent_to_forget_eff_bias{nullptr}; + Tensor _input_to_cell_eff_bias{nullptr}; + Tensor _recurrent_to_cell_eff_bias{nullptr}; + Tensor _input_to_output_eff_bias{nullptr}; + Tensor _recurrent_to_output_eff_bias{nullptr}; + Tensor _projection_reduction_res{nullptr}; + Tensor _projection_eff_bias{nullptr}; + Tensor _mm_input_to_forget_res{nullptr}; + Tensor _mm_recurrent_to_forget_res{nullptr}; + Tensor _mul_cell_to_forget_res{nullptr}; + Tensor _input_to_forget_outstage_res{nullptr}; + Tensor _cell_to_forget_outstage_res{nullptr}; + Tensor _recurrent_to_forget_outstage_res{nullptr}; + Tensor _forget_gate{nullptr}; + Tensor _mm_input_to_cell_res{nullptr}; + Tensor _input_to_cell_outstage_res{nullptr}; + Tensor _mm_recurrent_to_cell_res{nullptr}; + Tensor _recurrent_to_cell_outstage_res{nullptr}; + Tensor _cell_gate{nullptr}; + Tensor _mul_input_cell_res{nullptr}; + Tensor _mm_input_to_input_res{nullptr}; + Tensor _input_to_input_outstage_res{nullptr}; + Tensor _mm_recurrent_to_input_res{nullptr}; + Tensor _mul_cell_to_input_res{nullptr}; + Tensor _cell_to_input_outstage_res{nullptr}; + Tensor _recurrent_to_input_outstage_res{nullptr}; + Tensor _input_gate{nullptr}; + Tensor _mm_input_to_output_res{nullptr}; + Tensor 
_input_to_output_outstage_res{nullptr}; + Tensor _mm_recurrent_to_output_res{nullptr}; + Tensor _mul_cell_to_output_res{nullptr}; + Tensor _cell_to_output_outstage_res{nullptr}; + Tensor _recurrent_to_output_outstage_res{nullptr}; + Tensor _output_gate{nullptr}; + Tensor _hidden_mul_res{nullptr}; + Tensor _hidden_gate{nullptr}; + Tensor _mm_projection_res{nullptr}; + Tensor _projection_outstage_res{nullptr}; + Tensor _projection_out_res{nullptr}; + Tensor _projection_accumulate_res{nullptr}; + Tensor _ones{nullptr}; std::array _layer_norm_output{}; inline Tensor &get_layer_norm_output(LayerNormGate g) @@ -449,15 +471,15 @@ private: return _layer_norm_output[getGateIndex(g)]; } - bool _is_prepared{ false }; - bool _has_cifg{ false }; - bool _has_cell_clipping{ false }; - bool _has_projection{ false }; - bool _has_projection_clipping{ false }; - bool _has_peephole{ false }; - bool _has_layer_norm{ false }; - bool _projection_tensor_copy_required{ false }; - bool _convert_input_to_forget_weights_to_qsymm8{ false }; + bool _is_prepared{false}; + bool _has_cifg{false}; + bool _has_cell_clipping{false}; + bool _has_projection{false}; + bool _has_projection_clipping{false}; + bool _has_peephole{false}; + bool _has_layer_norm{false}; + bool _projection_tensor_copy_required{false}; + bool _convert_input_to_forget_weights_to_qsymm8{false}; }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEQLSTMLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NERNNLayer.h b/arm_compute/runtime/NEON/functions/NERNNLayer.h index 667d3144ac79272413b25b6e2d213933256a8351..af7f464ac968c846087b0dd9ddd32edcbcbed82f 100644 --- a/arm_compute/runtime/NEON/functions/NERNNLayer.h +++ b/arm_compute/runtime/NEON/functions/NERNNLayer.h @@ -72,7 +72,13 @@ public: * @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types supported: Same as @p input * @param[in] info Activation layer parameter. */ - void configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights, const ITensor *bias, ITensor *hidden_state, ITensor *output, ActivationLayerInfo &info); + void configure(const ITensor *input, + const ITensor *weights, + const ITensor *recurrent_weights, + const ITensor *bias, + ITensor *hidden_state, + ITensor *output, + ActivationLayerInfo &info); /** Initialize the function * * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. 
Data types supported: F16/F32 @@ -85,7 +91,12 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state, const ITensorInfo *output, + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *recurrent_weights, + const ITensorInfo *bias, + const ITensorInfo *hidden_state, + const ITensorInfo *output, const ActivationLayerInfo &info); // Inherited methods overridden: diff --git a/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h b/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h index ea1af4daea77398e714b7f559e15ad8b9edc9885..b06ebe899dfbe440b9a966043013cc793ad560c0 100644 --- a/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h +++ b/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h @@ -77,7 +77,10 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info); }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEROIALIGNLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h index 2992b3eb9595c4db4e61a06f90ddea433187686c..929111ad4b9d5f71c380f60adb5771b394c7715c 100644 --- a/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h +++ b/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h @@ -26,6 +26,7 @@ #include "arm_compute/core/IArray.h" #include "arm_compute/runtime/IFunction.h" + #include namespace arm_compute @@ -73,7 +74,8 @@ public: * @note The z dimensions of @p output tensor and @p input tensor must be the same. * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. */ - void configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info); + void + configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info); // Inherited methods overridden: void run() override; @@ -91,7 +93,10 @@ public: * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. 
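The NEROIAlignLayer::validate() signature reflowed in this hunk can be driven as in the sketch below. The feature-map shape, number of ROIs, pooled size and spatial scale are illustrative assumptions only.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEROIAlignLayer.h"

using namespace arm_compute;

void check_roi_align_support()
{
    // 16x16 feature map with 64 channels; 10 ROIs with 5 values each (batch index + 4 coordinates).
    const TensorInfo input(TensorShape(16U, 16U, 64U), 1, DataType::F32);
    const TensorInfo rois(TensorShape(5U, 10U), 1, DataType::F32);
    TensorInfo       output(TensorShape(7U, 7U, 64U, 10U), 1, DataType::F32);

    const ROIPoolingLayerInfo pool_info(7U, 7U, 0.0625f /* spatial scale */);
    const Status              status = NEROIAlignLayer::validate(&input, &rois, &output, pool_info);
    if (status.error_code() != ErrorCode::OK)
    {
        // The configuration is not supported for this data type / shape combination.
    }
}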
* @return a Status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *rois, + const ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info); private: std::unique_ptr _roi_kernel; diff --git a/arm_compute/runtime/NEON/functions/NERange.h b/arm_compute/runtime/NEON/functions/NERange.h index cb14c8fdde3ad88e18e05dd287292a0f0a56af83..609456a4ef7708d0bf771538d2fce9a15d6e38ca 100644 --- a/arm_compute/runtime/NEON/functions/NERange.h +++ b/arm_compute/runtime/NEON/functions/NERange.h @@ -26,6 +26,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" + #include namespace arm_compute diff --git a/arm_compute/runtime/NEON/functions/NEReduceMean.h b/arm_compute/runtime/NEON/functions/NEReduceMean.h index caaee8284a6808b7d277742ee32478f89e697d72..5b8d8cdf2bb5e51f1bf7a866c2f7598ac43108b4 100644 --- a/arm_compute/runtime/NEON/functions/NEReduceMean.h +++ b/arm_compute/runtime/NEON/functions/NEReduceMean.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NEON_REDUCE_MEAN_H #define ARM_COMPUTE_NEON_REDUCE_MEAN_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" @@ -81,7 +80,8 @@ public: * * @return A status */ - static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output); + static Status + validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEReductionOperation.h b/arm_compute/runtime/NEON/functions/NEReductionOperation.h index 533c10adcf3eebb9b7d1da3ba62e407fb3c906fb..f5391a6d0edac0fc0e6e4fc0e4b25657606e67fe 100644 --- a/arm_compute/runtime/NEON/functions/NEReductionOperation.h +++ b/arm_compute/runtime/NEON/functions/NEReductionOperation.h @@ -25,9 +25,9 @@ #define ARM_COMPUTE_NEREDUCTIONOPERATION_H #include "arm_compute/runtime/IFunction.h" - #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" #include "arm_compute/runtime/Tensor.h" + #include namespace arm_compute @@ -88,7 +88,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims = true); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + unsigned int axis, + ReductionOperation op, + bool keep_dims = true); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEReorderLayer.h b/arm_compute/runtime/NEON/functions/NEReorderLayer.h index eb777f19252735c699fb93c87785cdf8f077b00b..e3fa7b9c16ae0f8027a5b3776302e07fd975c133 100644 --- a/arm_compute/runtime/NEON/functions/NEReorderLayer.h +++ b/arm_compute/runtime/NEON/functions/NEReorderLayer.h @@ -66,7 +66,10 @@ public: * @param[in] input_wf WeightFormat of input. * @param[in] output_wf WeightFormat of output. 
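The NEReductionOperation::validate() reflow in this hunk pairs with a configure() of the same shape. A minimal sketch, assuming the source and destination tensors have been set up elsewhere and using axis 0 purely as an example:

#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void reduce_sum_innermost(Tensor &src, Tensor &dst)
{
    NEReductionOperation reduce;
    // Sum along axis 0 and keep it as a dimension of size 1 in the output.
    reduce.configure(&src, &dst, 0U /* axis */, ReductionOperation::SUM, true /* keep_dims */);
    reduce.run();
}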
*/ - void configure(const ITensor *input, ITensor *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf); + void configure(const ITensor *input, + ITensor *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf); /** Static function to check if given info will lead to a valid configuration of @ref NEReorderLayer * @@ -74,7 +77,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf); // Inherited methods overridden: void run() override; @@ -85,4 +91,4 @@ private: } // namespace arm_compute #endif /* ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREORDERLAYER */ -#endif // defined(__aarch64__) \ No newline at end of file +#endif // defined(__aarch64__) diff --git a/arm_compute/runtime/NEON/functions/NEReverse.h b/arm_compute/runtime/NEON/functions/NEReverse.h index c02fff54a5f4643bb4178167366135607bfc6454..e03e4150688977ba72c155f32e6d03e1cf5f07b6 100644 --- a/arm_compute/runtime/NEON/functions/NEReverse.h +++ b/arm_compute/runtime/NEON/functions/NEReverse.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,12 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_NEREVERSE_H -#define ARM_COMPUTE_NEREVERSE_H - -#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREVERSE_H +#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREVERSE_H #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" namespace arm_compute { @@ -45,22 +44,33 @@ public: * Valid data type configurations: * |src0 |src1 |dst | * |:--------------|:--------------|:--------------| - * |All |U32 |All | + * |All |U32, S32 |All | + * + * @param[in] input Input tensor. Data types supported: All + * @param[out] output Output tensor. Data type supported: Same as @p input + * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32/S32 + * @param[in] use_inverted_axis Reverse ACL axis indices convention, if true, (inverted)axis = (tensor_rank - 1) - axis + * + * @note The value of each axis should be between [-rank, rank) + * @note If there are duplicate values in the tensor, the subsequent axis values are ignored. e.g. an array of [2, 2] has the same effects as [2]. + * + * @deprecated Support for U32 in axis tensor will be removed in 24.02 release * - * @param[in] input Input tensor. Data types supported: All - * @param[out] output Output tensor. Data type supported: Same as @p input - * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32 */ - void configure(const ITensor *input, ITensor *output, const ITensor *axis); + void configure(const ITensor *input, ITensor *output, const ITensor *axis, const bool use_inverted_axis = false); /** Static function to check if given info will lead to a valid configuration of @ref NEReverseKernel * - * @param[in] input Input tensor info. Data types supported: All - * @param[in] output Output tensor info. Data type supported: Same as @p input - * @param[in] axis Axis tensor info. 
Contains the indices of the dimensions to reverse. Data type supported: U32 + * @param[in] input Input tensor info. Data types supported: All + * @param[in] output Output tensor info. Data type supported: Same as @p input + * @param[in] axis Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32/S32 + * @param[in] use_inverted_axis Reverse ACL axis indices convention, if true, (inverted)axis = (tensor_rank - 1) - axis * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *axis, + const bool use_inverted_axis = false); }; } // namespace arm_compute -#endif /* ARM_COMPUTE_NEREVERSE_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREVERSE_H diff --git a/arm_compute/runtime/NEON/functions/NEScale.h b/arm_compute/runtime/NEON/functions/NEScale.h index 0920ff380229ebf6164cabc735732951f3bd951b..72dfa3bda4bbcddf42a9b9e36699fe28b1dfae91 100644 --- a/arm_compute/runtime/NEON/functions/NEScale.h +++ b/arm_compute/runtime/NEON/functions/NEScale.h @@ -24,10 +24,9 @@ #ifndef ARM_COMPUTE_NESCALEIMAGE_H #define ARM_COMPUTE_NESCALEIMAGE_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include diff --git a/arm_compute/runtime/NEON/functions/NESlice.h b/arm_compute/runtime/NEON/functions/NESlice.h index ac79a5c6335500b8f70acfd1bff8153c174e1785..70a688d3b0d15056e0623359d8a80be759768e27 100644 --- a/arm_compute/runtime/NEON/functions/NESlice.h +++ b/arm_compute/runtime/NEON/functions/NESlice.h @@ -85,7 +85,8 @@ public: * * @return A status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends); // Inherited methods overridden: void run() override; @@ -129,7 +130,8 @@ public: * * @return A status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends); }; } // namespace experimental } // namespace arm_compute diff --git a/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h b/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h index ad8c1467d096bf1b9fec387377d2eccf99da1a4c..5dee61a4a838c4308e12fac101c801b37968fb31 100644 --- a/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h +++ b/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h @@ -24,9 +24,9 @@ #ifndef ARM_COMPUTE_NESPACETOBATCHLAYER_H #define ARM_COMPUTE_NESPACETOBATCHLAYER_H +#include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/Types.h" #include namespace arm_compute @@ -82,7 +82,12 @@ public: * @param[in] padding_right The padding at the end of every dimension of the output tensor. * @param[out] output Tensor output. 
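The NEReverse hunk above adds S32 axis values (with negative indices allowed in [-rank, rank)), deprecates U32, and introduces the use_inverted_axis flag. A hedged sketch follows; src and dst are assumed to be already-allocated tensors of matching shape.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEReverse.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void reverse_innermost_dim(Tensor &src, Tensor &dst)
{
    // One S32 axis value; S32 is preferred since U32 axis support is being deprecated.
    Tensor axis;
    axis.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::S32));
    axis.allocator()->allocate();
    *reinterpret_cast<int32_t *>(axis.buffer()) = 0; // reverse dimension 0

    NEReverse reverse;
    // use_inverted_axis = true re-maps the index as (tensor_rank - 1) - axis,
    // matching frameworks that count dimensions from the outermost one.
    reverse.configure(&src, &dst, &axis, /* use_inverted_axis */ true);
    reverse.run();
}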
Data types supported: same as @p input */ - void configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output); + void configure(const ITensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayer * * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All. @@ -92,7 +97,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output); /** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayer (Static block shape and paddings) * * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All. @@ -104,7 +112,12 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + const ITensorInfo *output); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NESplit.h b/arm_compute/runtime/NEON/functions/NESplit.h index 206f299c06c5119860146f7415e8fa8e8f9b30d9..36358a7094c1c41c522aff21264cb5ae23bab864 100644 --- a/arm_compute/runtime/NEON/functions/NESplit.h +++ b/arm_compute/runtime/NEON/functions/NESplit.h @@ -26,7 +26,6 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" - #include "arm_compute/runtime/CPP/functions/CPPSplit.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/NEON/functions/NESlice.h" diff --git a/arm_compute/runtime/NEON/functions/NEStackLayer.h b/arm_compute/runtime/NEON/functions/NEStackLayer.h index ae4e468f218d5bdc7d5e0f7f8cac43c47b20befd..98dacde0c1079088bf759e724cb7ffc447cb4d30 100644 --- a/arm_compute/runtime/NEON/functions/NEStackLayer.h +++ b/arm_compute/runtime/NEON/functions/NEStackLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_NESTACKLAYER_H -#define ARM_COMPUTE_NESTACKLAYER_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NESTACKLAYER_H +#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NESTACKLAYER_H #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" @@ -91,9 +91,8 @@ public: void run() override; private: - std::vector _input; - std::vector> _stack_kernels; - unsigned int _num_inputs; + std::unique_ptr _stack_kernel; + bool _is_prepared; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_NESTACKLAYER_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NESTACKLAYER_H diff --git a/arm_compute/runtime/NEON/functions/NEStridedSlice.h b/arm_compute/runtime/NEON/functions/NEStridedSlice.h index 4b14d946f6231f9e6de6a1c42a297c2fda38bb4a..fa1113ffec5f2c2e98f85d51de8a7067728f4553 100644 --- a/arm_compute/runtime/NEON/functions/NEStridedSlice.h +++ b/arm_compute/runtime/NEON/functions/NEStridedSlice.h @@ -71,9 +71,14 @@ public: * @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved. */ - void configure(const ITensor *input, ITensor *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0); + void configure(const ITensor *input, + ITensor *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask = 0, + int32_t end_mask = 0, + int32_t shrink_axis_mask = 0); /** Static function to check if given info will lead to a valid configuration of @ref NEStridedSlice * @@ -89,9 +94,14 @@ public: * @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved. */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask = 0, + int32_t end_mask = 0, + int32_t shrink_axis_mask = 0); // Inherited methods overridden: void run() override; @@ -121,9 +131,14 @@ public: * @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved. 
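The reflowed NEStridedSlice::configure() above takes starts/ends/strides plus the bit masks documented in this hunk. A sketch with illustrative coordinates (src and dst assumed prepared elsewhere):

#include "arm_compute/runtime/NEON/functions/NEStridedSlice.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void slice_every_second_element(Tensor &src, Tensor &dst)
{
    NEStridedSlice slice;
    // Dimension 0: elements [0, 8) with stride 2. Dimension 1: end_mask bit 1 (value 2) is set,
    // so ends[1] is ignored and the slice runs to the end of that dimension.
    slice.configure(&src, &dst,
                    Coordinates(0, 0), // starts
                    Coordinates(8, 0), // ends
                    BiStrides(2, 1),   // strides
                    0 /* begin_mask */, 2 /* end_mask */, 0 /* shrink_axis_mask */);
    slice.run();
}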
*/ - void configure(const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0); + void configure(const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask = 0, + int32_t end_mask = 0, + int32_t shrink_axis_mask = 0); /** Static function to check if given info will lead to a valid configuration of @ref NEStridedSlice * @@ -139,9 +154,14 @@ public: * @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved. */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask = 0, + int32_t end_mask = 0, + int32_t shrink_axis_mask = 0); }; } // namespace experimental } // namespace arm_compute diff --git a/arm_compute/runtime/NEON/functions/NETile.h b/arm_compute/runtime/NEON/functions/NETile.h index 915e5aa1da0b0ee71b61377b9be96ab9c9547666..001a0a4128ce61a745ab7fb88108825198862d0f 100644 --- a/arm_compute/runtime/NEON/functions/NETile.h +++ b/arm_compute/runtime/NEON/functions/NETile.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NETILE_H #define ARM_COMPUTE_NETILE_H -#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" namespace arm_compute { diff --git a/arm_compute/runtime/NEON/functions/NETranspose.h b/arm_compute/runtime/NEON/functions/NETranspose.h index 581fe7430941c16c51a4f1e9a30c60fd31a5d86c..5d2d1f1b01f54f9f6234c1e8a1e2dcac95b65e1d 100644 --- a/arm_compute/runtime/NEON/functions/NETranspose.h +++ b/arm_compute/runtime/NEON/functions/NETranspose.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NETRANSPOSE_H #define ARM_COMPUTE_NETRANSPOSE_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include @@ -83,4 +82,4 @@ private: std::unique_ptr _impl; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_NETRANSPOSE_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_NETRANSPOSE_H */ diff --git a/arm_compute/runtime/NEON/functions/NEUnstack.h b/arm_compute/runtime/NEON/functions/NEUnstack.h index 079fee5b9ec0e9b07f4331bf5f807cea3b588ab2..e1af96d08dff1ccbc52aed3151ea46692ce1f243 100644 --- a/arm_compute/runtime/NEON/functions/NEUnstack.h +++ b/arm_compute/runtime/NEON/functions/NEUnstack.h @@ -26,7 +26,6 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" - #include "arm_compute/runtime/NEON/functions/NEStridedSlice.h" #include diff --git a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h index f6f0185e7de9e22c16374881ba8f8ef24335b405..6caa2aeb594f088c7ca1027da3bd84fa12c2cbc6 100644 --- a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h @@ -21,13 +21,12 @@ * OUT 
OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_NEWINOGRADCONVOLUTIONLAYER_H -#define ARM_COMPUTE_NEWINOGRADCONVOLUTIONLAYER_H - -#include "arm_compute/runtime/IFunction.h" +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEWINOGRADCONVOLUTIONLAYER_H +#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEWINOGRADCONVOLUTIONLAYER_H #include "arm_compute/core/Types.h" #include "arm_compute/function_info/ActivationLayerInfo.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/Tensor.h" #include @@ -78,7 +77,8 @@ public: * while every optional dimension from 4 and above represent a batch of inputs. * Data types supported: F16/F32. * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input. - * Currently only 3x3 and 5x5 kernels are supported. + * Supported kernel sizes: (height, width) -> 3x3, 1x3, 3x1, 5x5, 1x5, 5x1 for Fp32 + * -> 3x3 for Fp16 * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights. * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. * Data types supported: Same as @p input. @@ -87,8 +87,13 @@ public: * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation * available which may introduce a drop of accuracy as well. Default is false */ - void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo(), - bool enable_fast_math = false); + void configure(const ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); // Inherited methods overridden: void run() override; @@ -100,12 +105,17 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); private: struct Impl; std::unique_ptr _impl; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_NEWINOGRADCONVOLUTIONLAYER_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEWINOGRADCONVOLUTIONLAYER_H diff --git a/arm_compute/runtime/OffsetLifetimeManager.h b/arm_compute/runtime/OffsetLifetimeManager.h index 2eef61a236ee5fb69427ca6766b316e603f53f50..13ebb9fbe3b9f2784a17072e6778f2d6a59a4b6b 100644 --- a/arm_compute/runtime/OffsetLifetimeManager.h +++ b/arm_compute/runtime/OffsetLifetimeManager.h @@ -25,7 +25,6 @@ #define ARM_COMPUTE_OFFSETLIFETIMEMANAGER_H #include "arm_compute/runtime/ISimpleLifetimeManager.h" - #include "arm_compute/runtime/Types.h" #include @@ -62,7 +61,7 @@ public: // Inherited methods overridden: std::unique_ptr create_pool(IAllocator *allocator) override; - MappingType mapping_type() 
const override; + MappingType mapping_type() const override; private: // Inherited methods overridden: diff --git a/arm_compute/runtime/OffsetMemoryPool.h b/arm_compute/runtime/OffsetMemoryPool.h index a5c363d866889ea18217f5d351a9e9164345f1a5..7250194f8528b3dc0edfdc8444e280fc3654716a 100644 --- a/arm_compute/runtime/OffsetMemoryPool.h +++ b/arm_compute/runtime/OffsetMemoryPool.h @@ -25,7 +25,6 @@ #define ARM_COMPUTE_OFFSETMEMORYPOOL_H #include "arm_compute/runtime/IMemoryPool.h" - #include "arm_compute/runtime/IMemoryRegion.h" #include "arm_compute/runtime/Types.h" @@ -65,8 +64,8 @@ public: const BlobInfo &info() const; // Inherited methods overridden: - void acquire(MemoryMappings &handles) override; - void release(MemoryMappings &handles) override; + void acquire(MemoryMappings &handles) override; + void release(MemoryMappings &handles) override; MappingType mapping_type() const override; std::unique_ptr duplicate() override; diff --git a/arm_compute/runtime/OperatorList.h b/arm_compute/runtime/OperatorList.h index 92b5079e7ed89bb25ff9fa60927309c7c8d09bce..8bcdf5d3bff104e05a692d30fd8eb6895208bdb5 100644 --- a/arm_compute/runtime/OperatorList.h +++ b/arm_compute/runtime/OperatorList.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_OPERATOR_LIST_H -#define ARM_COMPUTE_OPERATOR_LIST_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_OPERATORLIST_H +#define ACL_ARM_COMPUTE_RUNTIME_OPERATORLIST_H /** ActivationLayer * @@ -40,6 +40,16 @@ * */ +/** AddMulAdd + * + * Description: + * Performs a fused Add + Mul + Add [+ Relu-based-Activation] operation. + * + * Equivalent Android NNAPI Op: + * n/a + * + */ + /** ArgMinMaxLayer * * Description: @@ -647,6 +657,16 @@ * */ +/** MatMul + * + * Description: + * Computes a matrix multiplication in batches. + * + * Equivalent Android NNAPI Op: + * ANEURALNETWORKS_BATCH_MATMUL + * + */ + /** MaxUnpoolingLayer * * Description: @@ -677,6 +697,16 @@ * */ +/** NormalizePlanarYUVLayer + * + * Description: + * Function to compute normalization planar YUV layer. + * + * Equivalent Android NNAPI Op: + * n/a + * + */ + /** PadLayer * * Description: @@ -814,6 +844,16 @@ * */ +/** ReorderLayer + * + * Description: + * Reorders a tensor to a different weights format. 
+ * + * Equivalent Android NNAPI Op: + * n/a + * + */ + /** ReorgLayer * * Description: @@ -1009,4 +1049,4 @@ * */ -#endif /* ARM_COMPUTE_OPERATOR_LIST_H */ \ No newline at end of file +#endif // ACL_ARM_COMPUTE_RUNTIME_OPERATORLIST_H diff --git a/arm_compute/runtime/OperatorTensor.h b/arm_compute/runtime/OperatorTensor.h index 92ae01934b29e47143b1138b6ab7ea5b52ee47d4..237585bec20c1374babade6a4147acd393608f49 100644 --- a/arm_compute/runtime/OperatorTensor.h +++ b/arm_compute/runtime/OperatorTensor.h @@ -26,8 +26,8 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/runtime/Types.h" #include "arm_compute/runtime/experimental/Types.h" +#include "arm_compute/runtime/Types.h" #include diff --git a/arm_compute/runtime/PoolManager.h b/arm_compute/runtime/PoolManager.h index cc50fc04a46ab9d9f3a06d173d9eb690d97ae593..6aa6aef6e21152b406b73b2750e947af2a87d87f 100644 --- a/arm_compute/runtime/PoolManager.h +++ b/arm_compute/runtime/PoolManager.h @@ -24,10 +24,10 @@ #ifndef ARM_COMPUTE_POOLMANAGER_H #define ARM_COMPUTE_POOLMANAGER_H -#include "arm_compute/runtime/IPoolManager.h" - #include "arm_compute/core/Error.h" #include "arm_compute/runtime/IMemoryPool.h" +#include "arm_compute/runtime/IPoolManager.h" + #include "support/Mutex.h" #include "support/Semaphore.h" @@ -53,9 +53,9 @@ public: PoolManager &operator=(PoolManager &&) = delete; // Inherited methods overridden: - IMemoryPool *lock_pool() override; - void unlock_pool(IMemoryPool *pool) override; - void register_pool(std::unique_ptr pool) override; + IMemoryPool *lock_pool() override; + void unlock_pool(IMemoryPool *pool) override; + void register_pool(std::unique_ptr pool) override; std::unique_ptr release_pool() override; void clear_pools() override; size_t num_pools() const override; @@ -66,5 +66,5 @@ private: std::unique_ptr _sem; /**< Semaphore to control the queues */ mutable arm_compute::Mutex _mtx; /**< Mutex to control access to the queues */ }; -} // arm_compute +} // namespace arm_compute #endif /*ARM_COMPUTE_POOLMANAGER_H */ diff --git a/arm_compute/runtime/RuntimeContext.h b/arm_compute/runtime/RuntimeContext.h index 23bd2673752264170b0bbcea6ea080089a405c0f..d64e609196e8d66dd8d24241aa02cda092fe937b 100644 --- a/arm_compute/runtime/RuntimeContext.h +++ b/arm_compute/runtime/RuntimeContext.h @@ -54,8 +54,8 @@ public: IAssetManager *asset_manager() override; private: - std::unique_ptr _owned_scheduler{ nullptr }; - IScheduler *_scheduler{ nullptr }; + std::unique_ptr _owned_scheduler{nullptr}; + IScheduler *_scheduler{nullptr}; }; } // namespace arm_compute #endif /*ARM_COMPUTE_RUNTIME_CONTEXT_H */ diff --git a/arm_compute/runtime/Scheduler.h b/arm_compute/runtime/Scheduler.h index 9e8add1f95a7aa2cf33b36a3bd65b5030d539183..bd29cbb31f5eb463b74137a0ebe4a9d7e3d54b66 100644 --- a/arm_compute/runtime/Scheduler.h +++ b/arm_compute/runtime/Scheduler.h @@ -74,8 +74,8 @@ public: static bool is_available(Type t); private: - static Type _scheduler_type; - static std::shared_ptr _custom_scheduler; + static Type _scheduler_type; + static std::shared_ptr _custom_scheduler; static std::map> _schedulers; Scheduler(); diff --git a/arm_compute/runtime/SubTensor.h b/arm_compute/runtime/SubTensor.h index 3ca066e1c8ef5fe2ac1691bc12b4886e0e9dba63..2badb31b2642278219eb911c04874674baa2615d 100644 --- a/arm_compute/runtime/SubTensor.h +++ b/arm_compute/runtime/SubTensor.h @@ -72,5 +72,5 @@ private: ITensor *_parent; mutable SubTensorInfo _info; }; -} +} // namespace arm_compute #endif 
/*ARM_COMPUTE_SUBTENSOR_H */ diff --git a/arm_compute/runtime/Tensor.h b/arm_compute/runtime/Tensor.h index 172c8963f0bd212127ea9b68c525977cbc3e0de1..e71fbd4a9647cf38cb2ebea6bfd8fc0a9a65921e 100644 --- a/arm_compute/runtime/Tensor.h +++ b/arm_compute/runtime/Tensor.h @@ -59,7 +59,7 @@ public: ITensorInfo *info() const override; ITensorInfo *info() override; uint8_t *buffer() const override; - void associate_memory_group(IMemoryGroup *memory_group) override; + void associate_memory_group(IMemoryGroup *memory_group) override; private: mutable TensorAllocator _allocator; /**< Instance of the basic CPU allocator.*/ diff --git a/arm_compute/runtime/TensorAllocator.h b/arm_compute/runtime/TensorAllocator.h index a5e16c4d902f758811d671e237ea747805a4cf8a..d8199314156de88924b7eba5708e2606d6768be0 100644 --- a/arm_compute/runtime/TensorAllocator.h +++ b/arm_compute/runtime/TensorAllocator.h @@ -24,7 +24,6 @@ #ifndef ARM_COMPUTE_TENSORALLOCATOR_H #define ARM_COMPUTE_TENSORALLOCATOR_H #include "arm_compute/runtime/ITensorAllocator.h" - #include "arm_compute/runtime/Memory.h" #include "arm_compute/runtime/MemoryGroup.h" diff --git a/arm_compute/runtime/common/LSTMParams.h b/arm_compute/runtime/common/LSTMParams.h index aedb9c0d46d96f6ea0b77dfc245f4fb20b6ef6eb..6800faf87f2df501b340c99c36411a4c3cb29b40 100644 --- a/arm_compute/runtime/common/LSTMParams.h +++ b/arm_compute/runtime/common/LSTMParams.h @@ -79,7 +79,10 @@ public: * * @return Reference to this LSTMParams object */ - LSTMParams &set_cifg_params(const T *input_to_input_weights, const T *recurrent_to_input_weights, T *cell_to_input_weights, const T *input_gate_bias) + LSTMParams &set_cifg_params(const T *input_to_input_weights, + const T *recurrent_to_input_weights, + T *cell_to_input_weights, + const T *input_gate_bias) { _input_to_input_weights = input_to_input_weights; _recurrent_to_input_weights = recurrent_to_input_weights; @@ -125,8 +128,10 @@ public: * * @return Reference to this LSTMParams object */ - LSTMParams &set_layer_normalization_params(T *input_layer_norm_weights, T *forget_layer_norm_weights, - T *cell_layer_norm_weights, T *output_layer_norm_weights) + LSTMParams &set_layer_normalization_params(T *input_layer_norm_weights, + T *forget_layer_norm_weights, + T *cell_layer_norm_weights, + T *output_layer_norm_weights) { _input_layer_norm_weights = input_layer_norm_weights; _forget_layer_norm_weights = forget_layer_norm_weights; @@ -169,7 +174,10 @@ public: * * @return Reference to this LSTMParams object */ - LSTMParams &set_matmul_scale_params(float input_intermediate_scale, float forget_intermediate_scale, float cell_intermediate_scale, float output_intermediate_scale) + LSTMParams &set_matmul_scale_params(float input_intermediate_scale, + float forget_intermediate_scale, + float cell_intermediate_scale, + float output_intermediate_scale) { _input_intermediate_scale = input_intermediate_scale; _forget_intermediate_scale = forget_intermediate_scale; @@ -338,5 +346,5 @@ private: bool _has_cifg_opt; bool _use_layer_norm; }; -} +} // namespace arm_compute #endif /*ARM_COMPUTE_LSTMPARAMS_H */ diff --git a/compute_kernel_writer/CMakeLists.txt b/compute_kernel_writer/CMakeLists.txt index 9a975630254bc96fc7eace671e1f7c17c30c0958..69a4cdd51f374b0b88f7b5dbfd0e619eee518206 100644 --- a/compute_kernel_writer/CMakeLists.txt +++ b/compute_kernel_writer/CMakeLists.txt @@ -118,16 +118,22 @@ target_compile_definitions(ckw PUBLIC ) target_sources(ckw PRIVATE + src/types/ConstantData.cpp + src/types/DataTypeHelpers.cpp src/Error.cpp src/Helpers.cpp 
+ src/ITile.cpp src/Kernel.cpp + src/KernelArgument.cpp src/KernelWriter.cpp + src/Tensor3dMapper.cpp src/TensorInfo.cpp src/TensorOperand.cpp src/TensorSampler.cpp src/TensorUtils.cpp src/TileInfo.cpp src/TileOperand.cpp + src/TileView.cpp ) if(CKW_ENABLE_OPENCL) @@ -137,6 +143,8 @@ if(CKW_ENABLE_OPENCL) src/cl/CLHelpers.cpp src/cl/CLTile.cpp src/cl/CLKernelWriter.cpp + src/cl/helpers/CLMemoryOpBufferHelper.cpp + src/cl/helpers/CLMemoryOpImage2dHelper.cpp ) endif() diff --git a/compute_kernel_writer/include/ckw/Error.h b/compute_kernel_writer/include/ckw/Error.h index eaf3f10c05ea1b51202e5f44661ff7ff7d70558f..6b807789574a14117b051bd42fb1478f43e6f74f 100644 --- a/compute_kernel_writer/include/ckw/Error.h +++ b/compute_kernel_writer/include/ckw/Error.h @@ -53,7 +53,7 @@ create_error_msg(const std::string &file, const std::string &func, const std::st const std::string arg2(std::to_string(__LINE__)); \ const std::string arg3(msg); \ std::runtime_error(create_error_msg(arg0, arg1, arg2, arg3)); \ - } while(false) + } while (false) /** Mark the variables as unused. * @@ -74,16 +74,16 @@ inline void ignore_unused(T &&...) * * @param[in] msg The error message. */ -#define CKW_THROW_MSG(msg) \ - do \ - { \ - const std::string file(__FILE__); \ - const std::string func(__func__); \ - const std::string line(std::to_string(__LINE__)); \ - const std::string message(msg); \ - \ +#define CKW_THROW_MSG(msg) \ + do \ + { \ + const std::string file(__FILE__); \ + const std::string func(__func__); \ + const std::string line(std::to_string(__LINE__)); \ + const std::string message(msg); \ + \ throw std::runtime_error(ckw::create_error_msg(file, func, line, message)); \ - } while(false) + } while (false) #ifdef COMPUTE_KERNEL_WRITER_ASSERTS_ENABLED @@ -95,11 +95,11 @@ inline void ignore_unused(T &&...) #define CKW_ASSERT_MSG(cond, msg) \ do \ { \ - if(!(cond)) \ + if (!(cond)) \ { \ CKW_THROW_MSG(msg); \ } \ - } while(false) + } while (false) #else // COMPUTE_KERNEL_WRITER_ASSERTS_ENABLED @@ -113,6 +113,13 @@ inline void ignore_unused(T &&...) */ #define CKW_ASSERT(cond) CKW_ASSERT_MSG(cond, #cond) +/** If the precondition is met but the condition is not met, throw an std::runtime_error if assertion is enabled. + * + * @param[in] precond The precondition that triggers the check. + * @param[in] cond The condition that is expected to be true if precondition is true. + */ +#define CKW_ASSERT_IF(precond, cond) CKW_ASSERT(!(precond) || (cond)) + /** Throw an std::runtime_error with the specified message if assertion is enabled. * * @param[in] msg The error message when the condition is not met. diff --git a/compute_kernel_writer/include/ckw/Kernel.h b/compute_kernel_writer/include/ckw/Kernel.h index d93ed6f1d365fd55cfd8dad144abfbc7c82e46c6..f9b7bbb82e5424df3481de8d42420a75dfdcc124 100644 --- a/compute_kernel_writer/include/ckw/Kernel.h +++ b/compute_kernel_writer/include/ckw/Kernel.h @@ -25,7 +25,10 @@ #ifndef CKW_INCLUDE_CKW_KERNEL_H #define CKW_INCLUDE_CKW_KERNEL_H +#include "ckw/KernelArgument.h" + #include +#include namespace ckw { @@ -48,22 +51,24 @@ public: /** Initialize a new instance of @ref Kernel class with all emitted kernel information. * * @param[in] language The target language of the kernel. + * @param[in] arguments The list of kernel arguments. * @param[in] source_code The source code of the kernel. */ - Kernel(TargetLanguage language, const std::string &source_code); + Kernel(TargetLanguage language, const std::vector &arguments, const std::string &source_code); /** Get the target language. 
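The Error.h hunk above reformats the CKW macros and adds CKW_ASSERT_IF(precond, cond), which expands to CKW_ASSERT(!(precond) || (cond)). A minimal sketch with hypothetical names, assuming a build with COMPUTE_KERNEL_WRITER_ASSERTS_ENABLED defined:

#include "ckw/Error.h"

#include <cstdint>

// Hypothetical guard: only when a tile is flagged as scalar do we insist on a 1x1 shape.
// The check throws std::runtime_error on violation and only takes effect when asserts are enabled.
void check_scalar_tile(bool is_scalar, int32_t width, int32_t height)
{
    CKW_ASSERT_IF(is_scalar, width == 1 && height == 1);
}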
*/ TargetLanguage target_language() const; + /** Get the list of arguments. */ + const std::vector &arguments() const; + /** Get the source code. */ const std::string &source_code() const; - /** Add a tile operand */ - virtual TileOperand &add_operand(const std::string &name, const TileInfo &tile_info) = 0; - private: - TargetLanguage _language; - std::string _source_code; + TargetLanguage _language; + std::vector _arguments; + std::string _source_code; }; } // namespace ckw diff --git a/compute_kernel_writer/include/ckw/KernelArgument.h b/compute_kernel_writer/include/ckw/KernelArgument.h new file mode 100644 index 0000000000000000000000000000000000000000..7e9bcbf1ee558994fb9ec7c50332ea878def01d6 --- /dev/null +++ b/compute_kernel_writer/include/ckw/KernelArgument.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_INCLUDE_CKW_KERNELARGUMENT_H +#define CKW_INCLUDE_CKW_KERNELARGUMENT_H + +#include "ckw/types/TensorComponentType.h" +#include "ckw/types/TensorStorageType.h" + +#include + +namespace ckw +{ + +/** A kernel argument which can be either a tensor storage or a tensor component. */ +class KernelArgument +{ +public: + /** The type of kernel argument. */ + enum class Type : int32_t + { + /** The argument that provides the read and/or write access to the tensor data. + * + * See @ref ckw::TensorStorageType to see the list of supported storage type. + */ + TensorStorage, + + /** The argument that provides extra information about the tensor. + * + * See @ref ckw::TensorComponentType to see the list of supported component. + */ + TensorComponent, + }; + + /** Initialize a new instance of kernel argument class for a tensor storage argument. */ + KernelArgument(int32_t tensor_id, TensorStorageType storage_type); + + /** Initialize a new instance of kernel argument class for a tensor component argument. */ + KernelArgument(int32_t tensor_id, TensorComponentType component_type); + + /** Get the type of kernel argument. */ + Type type() const; + + /** Get the argument ID. + * + * This method can be used to get the tensor info ID of both tensor storage and tensor component arguments. + */ + int32_t id() const; + + /** Get the type of tensor storage. + * + * This method can only be used for tensor storage argument. + */ + TensorStorageType tensor_storage_type() const; + + /** Get the tensor component type. 
+ * + * This method can only be used for tensor component argument. + */ + TensorComponentType tensor_component_type() const; + +private: + Type _type; + int32_t _id; + + union SubId + { + int32_t unknown; + TensorStorageType tensor_storage_type; + TensorComponentType tensor_component_type; + }; + + SubId _sub_id{0}; +}; + +} // namespace ckw + +#endif // CKW_INCLUDE_CKW_KERNELARGUMENT_H diff --git a/compute_kernel_writer/include/ckw/KernelWriter.h b/compute_kernel_writer/include/ckw/KernelWriter.h index 2a347e9ae0e5877ada58d2a751c0a19b73b42301..0d739e859ab20b8c84bf9a05274d4dd68f171a04 100644 --- a/compute_kernel_writer/include/ckw/KernelWriter.h +++ b/compute_kernel_writer/include/ckw/KernelWriter.h @@ -27,18 +27,26 @@ #include "ckw/TensorOperand.h" #include "ckw/TileOperand.h" +#include "ckw/types/ConstantData.h" +#include "ckw/types/ConvertPolicy.h" +#include "ckw/types/Operators.h" +#include #include #include +#include namespace ckw { -class Kernel; - /** Forward Declerations */ +class Kernel; class TensorInfo; +class TensorSampler; +class TileArea; class TileInfo; + +enum class DataType; enum class TargetArchitecture; enum class TargetLanguage; @@ -73,10 +81,119 @@ public: /** Destructor */ virtual ~KernelWriter(); + // ============================================================================================= + // Data processing + // ============================================================================================= + + /** Write assignment statement: ` = ;`. + * + * @param[in] dst The destination tile. + * @param[in] src The source tile. + */ + virtual void op_assign(const TileOperand &dst, const TileOperand &src) = 0; + + /** Write the cast statement: ` = convert_();`. + * + * @param[in] dst The destination tile. + * @param[in] src The source tile. + * @param[in] policy The policy governing the behavior of the cast. + */ + virtual void op_cast(const TileOperand &dst, const TileOperand &src, ConvertPolicy policy) = 0; + + /** Write the unary expression statement: ` = ;`. + * + * @param[in] dst The destination tile. + * @param[in] op The unary operator. + * @param[in] src The source tile. + */ + virtual void op_unary(const TileOperand &dst, UnaryOp op, const TileOperand &src) = 0; + + /** Write the binary expression statement: ` = (, );`. + * + * @param[in] dst The destination tile. + * @param[in] op The binary operator. + * @param[in] first The first source tile. + * @param[in] second The second source tile. + */ + virtual void + op_binary(const TileOperand &dst, BinaryOp op, const TileOperand &first, const TileOperand &second) = 0; + + /** Write ternary expression statement: ` = (, , );`. + * + * @param[in] dst The destination tile. + * @param[in] op The ternary operator. + * @param[in] first The first source tile. + * @param[in] second The second source tile. + * @param[in] third The third source tile. + */ + virtual void op_ternary(const TileOperand &dst, + TernaryOp op, + const TileOperand &first, + const TileOperand &second, + const TileOperand &third) = 0; + + // ============================================================================================= + // Flow control + // ============================================================================================= + + /** Write if block: `if( ) { }`. + * + * @param[in] lhs The LHS tile of the condition. + * @param[in] op The relational binary operator. + * @param[in] rhs The RHS tile of the condition. + * @param[in] body The function that writes the body of the if block. 
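A minimal sketch of how the data-processing entry points above compose, written against the abstract interface; the concrete writer instance, the tile shapes and the TileInfo(data_type, height, width) constructor are assumptions, since neither the concrete writer nor the TileInfo constructors appear in this hunk.

#include "ckw/KernelWriter.h"
#include "ckw/TileInfo.h"
#include "ckw/TileOperand.h"
#include "ckw/types/ConvertPolicy.h"
#include "ckw/types/DataType.h"
#include "ckw/types/Operators.h"

// Emits: acc = exp(a + b); out = convert(acc) with saturation, on 4x4 tiles.
inline void write_add_exp_cast(ckw::KernelWriter &writer)
{
    auto a   = writer.declare_tile("a", ckw::TileInfo(ckw::DataType::Fp32, 4, 4));   // ctor assumed
    auto b   = writer.declare_tile("b", ckw::TileInfo(ckw::DataType::Fp32, 4, 4));
    auto acc = writer.declare_tile("acc", ckw::TileInfo(ckw::DataType::Fp32, 4, 4));
    auto out = writer.declare_tile("out", ckw::TileInfo(ckw::DataType::Fp16, 4, 4)); // Fp16 assumed available

    writer.op_binary(acc, ckw::BinaryOp::Add, a, b);         // acc = a + b
    writer.op_unary(acc, ckw::UnaryOp::Exp, acc);            // acc = exp(acc)
    writer.op_cast(out, acc, ckw::ConvertPolicy::Saturate);  // out = convert(acc), saturating
}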
+ */ + virtual void + op_if(const TileOperand &lhs, BinaryOp op, const TileOperand &rhs, const std::function &body) = 0; + + /** Write else-if block: `else if( ) { }`. + * + * @param[in] lhs The LHS tile of the condition. + * @param[in] op The relational binary operator. + * @param[in] rhs The RHS tile of the condition. + * @param[in] body The function that writes the body of the else-if block. + */ + virtual void + op_else_if(const TileOperand &lhs, BinaryOp op, const TileOperand &rhs, const std::function &body) = 0; + + /** Write an else block: `else { }`. + * + * @param[in] body The function that writes the body of the else block. + */ + virtual void op_else(const std::function &body) = 0; + + /** Write for-loop block: `for(; ; ) { body }`. + * + * @param[in] var The scalar tile used in loop condition. + * @param[in] cond_op The relational binary operator used in loop condition. + * @param[in] cond_value The value which the variable is compared against. + * @param[in] update_var The scalar tile which is updated each iteration. + * @param[in] update_op The assignment operator used for updating the update value. + * @param[in] update_value The value which is updated at every iteration. + * @param[in] body The function that writes the body of the for-loop block. + */ + virtual void op_for_loop(const TileOperand &var, + BinaryOp cond_op, + const TileOperand &cond_value, + const TileOperand &update_var, + AssignmentOp update_op, + const TileOperand &update_value, + const std::function &body) = 0; + + /** Write the return statement. */ + virtual void op_return() = 0; + // ============================================================================================= // Misc // ============================================================================================= + /** Write the statement to get the global ID of the specified dimension. + * + * @param[in] dst The tile to write the global ID into. + * @param[in] dim The dimension. + */ + virtual void op_get_global_id(const TileOperand &dst, int32_t dim) = 0; + /** Write the line comment in debug build. * * This function does not take effect on release build. @@ -85,7 +202,33 @@ public: * * @param[in] text The comment to be written. */ - virtual void comment(const std::string &text) = 0; + virtual void op_comment(const std::string &text) = 0; + + /** Write the statement to print out the value of all the specified tiles. + * + * The printing statement is constructed so that the prefix and each of the operand are printed in separate lines. + * The format for each operand varies depending on whether it is a 2D tile, a vector or a scalar value. + * + * Example output of the printing statement when it is executed: + * + * prefix + * scalar_name = scalar_value + * vector_name = [vector_value_0, vector_value_1, vector_value_2] + * tile_name = [[tile_value_00, tile_value_01], [tile_value_10, tile_value_11]] + * + * @param[in] prefix The first string to be printed out before the list of operands. + * @param[in] operands The list of tiles to be included in the printing statement. + */ + virtual void op_print(const std::string &prefix, const std::vector &operands) = 0; + + /** Write the given raw code to kernel source code + * It's used to address the cases where the user needs to + * explicitly add a code where it's not (yet) supported by + * the kernel writer utility calls. 
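A hedged sketch of the flow-control API above, again against the abstract interface; the writer reference, the Int32 data type and the 1x1 scalar tiles are assumptions made for illustration.

#include "ckw/KernelWriter.h"
#include "ckw/TileInfo.h"
#include "ckw/TileOperand.h"
#include "ckw/types/DataType.h"
#include "ckw/types/Operators.h"

// Emits: i = get_global_id(0); for(; i < limit; i += step) { acc = acc + acc; }
// followed by an early return when acc > limit.
inline void write_loop(ckw::KernelWriter      &writer,
                       const ckw::TileOperand &limit,
                       const ckw::TileOperand &step)
{
    auto i   = writer.declare_tile("i", ckw::TileInfo(ckw::DataType::Int32, 1, 1));   // scalar tile, ctor assumed
    auto acc = writer.declare_tile("acc", ckw::TileInfo(ckw::DataType::Int32, 1, 1));

    writer.op_get_global_id(i, 0);

    writer.op_for_loop(i, ckw::BinaryOp::Less, limit,          // condition: i < limit
                       i, ckw::AssignmentOp::Increment, step,  // update:    i += step
                       [&]() { writer.op_binary(acc, ckw::BinaryOp::Add, acc, acc); });

    writer.op_if(acc, ckw::BinaryOp::Greater, limit, [&]() { writer.op_return(); });
    writer.op_else([&]() { writer.op_comment("nothing left to do"); });
}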
+ * + * @param[in] raw_code raw code to write as string + */ + virtual void op_write_raw_code(const std::string &raw_code) = 0; // ============================================================================================= // Code generation @@ -115,30 +258,140 @@ public: * @param[in] name Name of the tile * @param[in] tile_info Shape and data type of the tile * - * @returns The created tile operand + * @return The created tile operand */ virtual TileOperand declare_tile(const std::string &name, const TileInfo &tile_info) = 0; - /** Write the given raw code to kernel source code - * It's used to address the cases where the user needs to - * explicitly add a code where it's not (yet) supported by - * the kernel writer utility calls. + /** Declare a constant tile given a @ref:ConstantData object * - * @param[in] raw_code raw code to write as string - */ - virtual void op_write_raw_code(const std::string &raw_code) = 0; + * @param[in] data a @ref ckw::ConstantData object that has the values and the + * underlying data type of the constant tile + * + * @return The created constant tile operand + */ + virtual TileOperand declare_constant_tile(const ConstantData &data) = 0; + + /** Load the data from the tensor memory to the tile using the sampling information. + * + * @param[in] tile_op The tile to be loaded. + * @param[in] tensor_op The tensor to be read. + * @param[in] sampler The tensor sampling information. + * @param[in] x x-coordinate + * @param[in] y y-coordinate + * @param[in] z z-coordinate + * @param[in] batch batch + */ + virtual void op_load(const TileOperand &tile_op, + const TensorOperand &tensor_op, + TensorSampler &sampler, + const TileOperand &x, + const TileOperand &y, + const TileOperand &z, + const TileOperand &batch) = 0; + + /** Load the data from the tensor memory to the tile in a dilated way using the sampling information. + * + * Similar to @ref KernelWriter::op_load() and + * + * @param[in] dilation_x Dilation while reading in x-dimension + * @param[in] dilation_y Dilation while reading in y-dimension + */ + virtual void op_load_dilated(const TileOperand &tile_op, + const TensorOperand &tensor_op, + TensorSampler &sampler, + const TileOperand &x, + const TileOperand &y, + const TileOperand &z, + const TileOperand &batch, + const TileOperand &dilation_x, + const TileOperand &dilation_y) = 0; + + /** Store the data to the tensor memory from the tile using the sampling information. + * + * Similar to @ref KernelWriter::op_load() + */ + virtual void op_store(const TensorOperand &tensor_op, + const TileOperand &tile_op, + TensorSampler &sampler, + const TileOperand &x, + const TileOperand &y, + const TileOperand &z, + const TileOperand &batch) = 0; + + /** Store the data to the tensor memory from the tile in a dilated way using the sampling information. + * + * Similar to @ref KernelWriter::op_load_dilated() + */ + virtual void op_store_dilated(const TensorOperand &tensor_op, + const TileOperand &tile_op, + TensorSampler &sampler, + const TileOperand &x, + const TileOperand &y, + const TileOperand &z, + const TileOperand &batch, + const TileOperand &dilation_x, + const TileOperand &dilation_y) = 0; + + /** Load the data from the tensor memory to the tile using the indirect buffer approach and respecting the sampling information. + * + * @param[in] tile_op The tile to be loaded. + * @param[in] tensor_op The tensor to be read. + * @param[in] sampler The tensor sampling information. 
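A sketch of pairing op_load()/op_store() with a TensorSampler; the coordinate tiles are passed in by the caller and the sampler settings are illustrative choices, not requirements.

#include "ckw/KernelWriter.h"
#include "ckw/TensorOperand.h"
#include "ckw/TensorSampler.h"
#include "ckw/TileOperand.h"
#include "ckw/types/TensorSamplerTypes.h"
#include "ckw/types/TensorStorageType.h"

// Copies one tile from src to dst at coordinates (x, y, z, batch).
inline void write_copy_tile(ckw::KernelWriter        &writer,
                            const ckw::TensorOperand &src,
                            const ckw::TensorOperand &dst,
                            const ckw::TileOperand   &tile,
                            const ckw::TileOperand &x, const ckw::TileOperand &y,
                            const ckw::TileOperand &z, const ckw::TileOperand &batch)
{
    // Buffer storage, dims 1 and 2 collapsed onto y, rows skipped when y < 0.
    ckw::TensorSampler sampler(ckw::TensorStorageType::BufferUint8Ptr,
                               ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1,
                               ckw::TensorSamplerAddressModeX::None,
                               ckw::TensorSamplerAddressModeY::SkipLessThanZero,
                               ckw::TensorSamplerAddressModeZ::None);

    writer.op_load(tile, src, sampler, x, y, z, batch);
    writer.op_store(dst, tile, sampler, x, y, z, batch);
}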
+ * @param[in] x x-coordinate + * @param[in] y y-coordinate + * @param[in] z z-coordinate + * @param[in] batch batch + */ + virtual void op_load_indirect(const TileOperand &tile_op, + const TensorOperand &tensor_op, + TensorSampler &sampler, + const TileOperand &x, + const TileOperand &y, + const TileOperand &z, + const TileOperand &batch_op) = 0; protected: + // ============================================================================================= + // ID space management + // ============================================================================================= + + /** Create the new unique ID space and return the value. + * + * This function changes the ID space to a new number which hasn't been used since the creation + * of this kernel writer object. + * + * @return The new ID space value. + */ + int32_t new_id_space(); + + /** Get the current ID space. */ int32_t id_space() const; + /** Set the current ID space. + * + * @param[in] value The ID space to be used. + */ + KernelWriter &id_space(int32_t value); + + /** Write the body code using the specified function. + * + * This function makes sure that a new ID space is created before and then is used solely + * by the specified body writing function. + * The ID space will not be reused after that. + * + * @param[in] body The function that writes the body code. + */ + void write_body(const std::function &body); + +protected: /** Generate full variable name by prefixing it with id space */ std::string generate_full_name(const std::string &name) const; /** Create a new tile operand referring to the specified tile object. */ static TileOperand create_tile_operand(ITile &tile); - /** Get the reference to tile object from the tile operand. */ - static ITile &get_tile(const TileOperand &operand); + /** Get the reference to the tile object and the active area from the tile operand. */ + static std::tuple get_tile(const TileOperand &operand); /** Create a new tensor operand from a tensor object. */ static TensorOperand create_tensor_operand(ITensor &tensor); @@ -146,8 +399,15 @@ protected: /** Get the reference to tensor object from the tensor operand. */ static ITensor &get_tensor(const TensorOperand &operand); + /** Get the values of a constant data object. */ + static const std::vector> &get_values(const ConstantData &data); + + /** Get the data type of a constant data object. 
*/ + static DataType get_data_type(const ConstantData &data); + private: - int32_t _id_space{ 0 }; + int32_t _id_space{0}; + int32_t _last_created_id_space{0}; }; } // namespace ckw diff --git a/compute_kernel_writer/include/ckw/TensorInfo.h b/compute_kernel_writer/include/ckw/TensorInfo.h index 87cf7c142653e991f4a928078d60ea4fce08fe09..5c87cb5b120c30fac9bd944ac00deb54af5391aa 100644 --- a/compute_kernel_writer/include/ckw/TensorInfo.h +++ b/compute_kernel_writer/include/ckw/TensorInfo.h @@ -27,6 +27,7 @@ #include "ckw/types/DataType.h" #include "ckw/types/TensorDataLayout.h" + #include #include @@ -85,10 +86,10 @@ public: int32_t id() const; private: - TensorShape _shape{ { 0 } }; - DataType _dt{ DataType::Unknown }; - TensorDataLayout _dl{ TensorDataLayout::Unknown }; - int32_t _id{ -1 }; + TensorShape _shape{{0}}; + DataType _dt{DataType::Unknown}; + TensorDataLayout _dl{TensorDataLayout::Unknown}; + int32_t _id{-1}; }; } // namespace ckw diff --git a/compute_kernel_writer/include/ckw/TensorSampler.h b/compute_kernel_writer/include/ckw/TensorSampler.h index 1b51636edbb4c7f15bdfa45ec0efa29ccc1e6183..117e8de2cf983c8b02ef49509010ce21f4bee765 100644 --- a/compute_kernel_writer/include/ckw/TensorSampler.h +++ b/compute_kernel_writer/include/ckw/TensorSampler.h @@ -25,8 +25,8 @@ #ifndef CKW_INCLUDE_CKW_TENSORSAMPLER_H #define CKW_INCLUDE_CKW_TENSORSAMPLER_H -#include "ckw/types/TensorStorageType.h" #include "ckw/types/TensorSamplerTypes.h" +#include "ckw/types/TensorStorageType.h" namespace ckw { @@ -53,12 +53,11 @@ public: * @param[in] address_mode_y The address mode of the y dimension. * @param[in] address_mode_z The address mode of the z dimension. */ - TensorSampler( - TensorStorageType storage, - TensorSamplerFormat format, - TensorSamplerAddressModeX address_mode_x, - TensorSamplerAddressModeY address_mode_y, - TensorSamplerAddressModeZ address_mode_z); + TensorSampler(TensorStorageType storage, + TensorSamplerFormat format, + TensorSamplerAddressModeX address_mode_x, + TensorSamplerAddressModeY address_mode_y, + TensorSamplerAddressModeZ address_mode_z); /** Get the storage for the tensor */ TensorStorageType storage() const; @@ -91,11 +90,11 @@ public: TensorSampler &address_mode_z(TensorSamplerAddressModeZ address_mode_z); private: - TensorStorageType _storage { TensorStorageType::BufferUint8Ptr }; - TensorSamplerFormat _format { TensorSamplerFormat::Unknown }; - TensorSamplerAddressModeX _address_mode_x { TensorSamplerAddressModeX::Unknown }; - TensorSamplerAddressModeY _address_mode_y { TensorSamplerAddressModeY::Unknown }; - TensorSamplerAddressModeZ _address_mode_z { TensorSamplerAddressModeZ::Unknown }; + TensorStorageType _storage{TensorStorageType::BufferUint8Ptr}; + TensorSamplerFormat _format{TensorSamplerFormat::Unknown}; + TensorSamplerAddressModeX _address_mode_x{TensorSamplerAddressModeX::Unknown}; + TensorSamplerAddressModeY _address_mode_y{TensorSamplerAddressModeY::Unknown}; + TensorSamplerAddressModeZ _address_mode_z{TensorSamplerAddressModeZ::Unknown}; }; } // namespace ckw diff --git a/compute_kernel_writer/include/ckw/TileInfo.h b/compute_kernel_writer/include/ckw/TileInfo.h index b8094f79bffffefb9a6f5dbbe3268bb6f3ec70fd..678bb7aaf6b2a22c3a486b1bce4d4ac081fc8683 100644 --- a/compute_kernel_writer/include/ckw/TileInfo.h +++ b/compute_kernel_writer/include/ckw/TileInfo.h @@ -83,7 +83,7 @@ public: DataType data_type() const; private: - DataType _dt{ DataType::Unknown }; + DataType _dt{DataType::Unknown}; TileShape _shape{}; }; diff --git 
a/compute_kernel_writer/include/ckw/TileOperand.h b/compute_kernel_writer/include/ckw/TileOperand.h index 873a9825f3631e2855dbce26208ff41a346fd2d1..56dc5e7b2b5b5217e78d4a8f89a10bceacb7b4c0 100644 --- a/compute_kernel_writer/include/ckw/TileOperand.h +++ b/compute_kernel_writer/include/ckw/TileOperand.h @@ -25,6 +25,8 @@ #ifndef CKW_INCLUDE_CKW_TILEOPERAND_H #define CKW_INCLUDE_CKW_TILEOPERAND_H +#include + namespace ckw { @@ -41,13 +43,55 @@ public: friend class KernelWriter; friend class TensorOperand; + /** Get a row vector of the current tile operand. + * + * @param[in] row The index of the row to be accessed in the current tile operand. + * + * @return A new tile operand referring to a row of the current tile operand. + */ + TileOperand row(int32_t row) const; + + /** Get a scalar element of the current tile operand. + * + * @param[in] row The index of the row to be accessed in the current tile operand. + * @param[in] col The index of the column to be accessed in the current tile operand. + * + * @return A new tile operand referring to a scalar element of the current tile operand. + */ + TileOperand scalar(int32_t row, int32_t col) const; + private: // These are hidden from the public API to avoid any misuse. /** Initialize a new instance of @ref TileOperand class for the given tile. */ TileOperand(ITile &tile); - ITile &_tile; + /** Initialize a new instance of @ref TileOperand class that is the sub-tile of the given tile. */ + TileOperand(const TileOperand &operand, int32_t row_start, int32_t row_end, int32_t col_start, int32_t col_end); + + /** Get a sub-tile of the current tile operand. + * + * The range of rows and columns is defined by pairs of start and end indices, inclusive lower and exclusive upper. + * In other words, any row and column indices satisfying the following conditions will be part of the sub-tile: + * + * row_start <= row_index < row_end + * col_start <= col_index < col_end + * + * @param[in] row_start The start index of the row range. + * @param[in] row_end The end index of the row range. + * @param[in] col_start The start index of the column range. + * @param[in] col_end The end index of the column range. + * + * @return A new tile operand refering to the same tile but with the new active area. + */ + TileOperand tile(int32_t row_start, int32_t row_end, int32_t col_start, int32_t col_end) const; + + ITile *_tile; + + int32_t _row_start; + int32_t _row_end; + int32_t _col_start; + int32_t _col_end; }; } // namespace ckw diff --git a/compute_kernel_writer/include/ckw/types/ConstantData.h b/compute_kernel_writer/include/ckw/types/ConstantData.h new file mode 100644 index 0000000000000000000000000000000000000000..7708818ca881af7ff8dadd460c671468752cf338 --- /dev/null +++ b/compute_kernel_writer/include/ckw/types/ConstantData.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_INCLUDE_CKW_TYPES_CONSTANTDATA_H +#define CKW_INCLUDE_CKW_TYPES_CONSTANTDATA_H + +#include "ckw/Error.h" +#include "ckw/types/DataType.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ckw +{ +// Forward Declarations +class KernelWriter; + +class ConstantData +{ + using String = std::string; + using StringVector = std::vector; + +public: + /** Templated constructor */ + template + ConstantData(std::initializer_list> values, DataType data_type); + +private: + /** Validate the given data type and the template type + * + * @param[in] data_type data type + * + * @return true if user provided data type and the template type are conformant + */ + template + bool validate(DataType data_type); + + /** Get the constant data as a 2d vector of string values + * + * @return a 2d vector of strings that has the string-converted values + */ + const std::vector &values() const; + + /** Get the underlying data type of the constant values + * + * @return a @ref ckw::DataType object that represents the underlying data type + */ + DataType data_type() const; + + // Friends + friend class KernelWriter; + +private: + // Data members + std::vector _values{}; + DataType _data_type{}; +}; + +} // namespace ckw + +#endif // CKW_INCLUDE_CKW_TYPES_CONSTANTDATA_H diff --git a/compute_kernel_writer/include/ckw/types/ConvertPolicy.h b/compute_kernel_writer/include/ckw/types/ConvertPolicy.h new file mode 100644 index 0000000000000000000000000000000000000000..43a37ff118fe4db901eb0b1d6f0c7bd9379c9661 --- /dev/null +++ b/compute_kernel_writer/include/ckw/types/ConvertPolicy.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_INCLUDE_CKW_TYPES_CONVERTPOLICY_H +#define CKW_INCLUDE_CKW_TYPES_CONVERTPOLICY_H + +#include + +namespace ckw +{ + +enum class ConvertPolicy : int32_t +{ + None = 0, // No policy specified. 
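A sketch of building a constant tile from ConstantData and consuming it; the nested initializer-list shape of the constructor (one inner list per row) and the TileInfo constructor are inferred rather than fully visible in this hunk, so treat the exact signatures as assumptions.

#include "ckw/KernelWriter.h"
#include "ckw/TileInfo.h"
#include "ckw/TileOperand.h"
#include "ckw/types/ConstantData.h"
#include "ckw/types/DataType.h"
#include "ckw/types/Operators.h"

inline void write_constant_use(ckw::KernelWriter &writer)
{
    // 2x1 fp32 constant tile built from a nested initializer list (one inner list per row, assumed).
    const ckw::ConstantData bias_values({{0.5f}, {-0.5f}}, ckw::DataType::Fp32);
    auto bias = writer.declare_constant_tile(bias_values);

    // A matching 2x1 tile: acc = bias; acc = acc + bias;
    auto acc = writer.declare_tile("acc", ckw::TileInfo(ckw::DataType::Fp32, 2, 1)); // ctor assumed
    writer.op_assign(acc, bias);
    writer.op_binary(acc, ckw::BinaryOp::Add, acc, bias);

    // row()/scalar() return operands over part of a tile, e.g. the element at (1, 0).
    auto last = writer.declare_tile("last", ckw::TileInfo(ckw::DataType::Fp32, 1, 1));
    writer.op_assign(last, bias.scalar(1, 0));
}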
+ Saturate = 1, // Saturated. +}; + +} // namespace ckw + +#endif // CKW_INCLUDE_CKW_TYPES_CONVERTPOLICY_H diff --git a/compute_kernel_writer/include/ckw/types/MemoryOperation.h b/compute_kernel_writer/include/ckw/types/MemoryOperation.h new file mode 100644 index 0000000000000000000000000000000000000000..f93f60c51a8915d2e9a2641b5eae8d17697f6228 --- /dev/null +++ b/compute_kernel_writer/include/ckw/types/MemoryOperation.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_INCLUDE_CKW_TYPES_MEMORYOPERATION +#define CKW_INCLUDE_CKW_TYPES_MEMORYOPERATION + +namespace ckw +{ +enum class MemoryOperation +{ + Load = 1, + Store = 2 +}; +} // namespace ckw + +#endif /* CKW_INCLUDE_CKW_TYPES_MEMORYOPERATION */ diff --git a/compute_kernel_writer/include/ckw/types/Operators.h b/compute_kernel_writer/include/ckw/types/Operators.h new file mode 100644 index 0000000000000000000000000000000000000000..1e5f9bd54201da88485af37f999cb92be43edafb --- /dev/null +++ b/compute_kernel_writer/include/ckw/types/Operators.h @@ -0,0 +1,100 @@ +/* +* Copyright (c) 2023 Arm Limited. +* +* SPDX-License-Identifier: MIT +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to +* deal in the Software without restriction, including without limitation the +* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +* sell copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in all +* copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ + +#ifndef CKW_INCLUDE_CKW_TYPES_OPERATORS_H +#define CKW_INCLUDE_CKW_TYPES_OPERATORS_H + +#include + +namespace ckw +{ + +/** Unary operators and functions. 
*/ +enum class UnaryOp : int32_t +{ + LogicalNot = 0x0000, // ! + BitwiseNot = 0x0001, // ~ + + Exp = 0x0010, + Tanh = 0x0011, + Sqrt = 0x0012, + Erf = 0x0013, + Fabs = 0x0014, + Log = 0x0015, + Round = 0x0016, +}; + +/** Assignment operators. */ +enum class AssignmentOp : int32_t +{ + Increment = 0x0000, // += + Decrement = 0x0001, // -= +}; + +/** Binary operators. */ +enum class BinaryOp : int32_t +{ + // Elementwise + Add = 0x0000, // + + Sub = 0x0001, // - + Mul = 0x0002, // * + Div = 0x0003, // / + Mod = 0x0004, // % + + // Relational + Equal = 0x1000, // == + Less = 0x1001, // < + LessEqual = 0x1002, // <= + Greater = 0x1003, // > + GreaterEqual = 0x1004, // >= + + // Algebra + MatMul_Nt_Nt = 0x2000, // X + MatMul_Nt_T = 0x2001, // X + MatMul_T_Nt = 0x2002, // X + MatMul_T_T = 0x2003, // X + Dot = 0x2004, // . + + // Logical + LogicalAnd = 0x3000, // && + LogicalOr = 0x3001, // || + + // Bitwise + BitwiseXOR = 0x4000, // ^ + + // Functions + Min = 0x8000, + Max = 0x8001, +}; + +/** Ternary operators. */ +enum class TernaryOp : int32_t +{ + Select = 0x0000, + Clamp = 0x0001, +}; + +} // namespace ckw + +#endif // CKW_INCLUDE_CKW_TYPES_OPERATORS_H diff --git a/compute_kernel_writer/include/ckw/types/TensorSamplerTypes.h b/compute_kernel_writer/include/ckw/types/TensorSamplerTypes.h index 347536512e8f61f17534aad8c7714a923d622301..512d0b45019cec2ff176d811fcf6ad0060746749 100644 --- a/compute_kernel_writer/include/ckw/types/TensorSamplerTypes.h +++ b/compute_kernel_writer/include/ckw/types/TensorSamplerTypes.h @@ -22,8 +22,8 @@ * SOFTWARE. */ -#ifndef CKW_INCLUDE_CKW_TENSORSAMPLERTYPES_H -#define CKW_INCLUDE_CKW_TENSORSAMPLERTYPES_H +#ifndef CKW_INCLUDE_CKW_TYPES_TENSORSAMPLERTYPES_H +#define CKW_INCLUDE_CKW_TYPES_TENSORSAMPLERTYPES_H #include @@ -33,9 +33,9 @@ namespace ckw // This enum class defines how the dimensions of a 3d tensor is mapped into x,y and z coordianates. enum class TensorSamplerFormat : int32_t { - Unknown = 0, - D0_D1xD2_1 = 1, // Original dimensions 1 and 2 are collapsed onto y-axis - D0_D1_D2 = 2 // Original dimensions stays as they're defined. No collapsing. + Unknown = 0, + Dim0_Dim1xDim2_1 = 1, // Original dimensions 1 and 2 are collapsed onto y-axis + Dim0_Dim1_Dim2 = 2 // Original dimensions stays as they're defined. No collapsing. }; /** Tensor sampler address mode enum class for X dimension @@ -47,15 +47,15 @@ enum class TensorSamplerFormat : int32_t * Leftover elements can be handled using overlapping. This involves processing some of the elements in the array twice. * ClampToBorderMaxOnly : Clamp to max value allowed in the corresponding dimension, and construct an if/else guard to prevent out of bound access, * e.g. if( y < size-of-dimension-y ){ } + * SkipLessThanZero : Skip loading/storing if the index is less than 0 * * Individual dimensions choose which adddress mode to implement in their respective enum classes. */ enum class TensorSamplerAddressModeX : int32_t { Unknown = 0, - None = 1, // The user guarantees that the coordinate is always in-bound - OverlappingMin = 2 // (FIXED shapes only) Reduce the load/store length when x == 0 (MIN). The load length will be width % original length - // Leftover elements can be handled using overlapping. This involves processing some of the elements in the array twice. 
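The ternary operators above are consumed through op_ternary(); a minimal hedged sketch follows, where the operand order for TernaryOp::Clamp (value, min, max) is an assumption.

#include "ckw/KernelWriter.h"
#include "ckw/TileOperand.h"
#include "ckw/types/Operators.h"

// dst = clamp(value, lo, hi) -- operand order assumed, see note above.
inline void write_clamp(ckw::KernelWriter      &writer,
                        const ckw::TileOperand &dst,
                        const ckw::TileOperand &value,
                        const ckw::TileOperand &lo,
                        const ckw::TileOperand &hi)
{
    writer.op_ternary(dst, ckw::TernaryOp::Clamp, value, lo, hi);
}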
+ None = 1, + OverlappingMin = 2 }; /** @@ -66,7 +66,8 @@ enum class TensorSamplerAddressModeY : int32_t Unknown = 0, None = 1, OverlappingMin = 2, - ClampToBorderMaxOnly = 3 + ClampToBorderMaxOnly = 3, + SkipLessThanZero = 4 }; /** @@ -74,10 +75,10 @@ enum class TensorSamplerAddressModeY : int32_t */ enum class TensorSamplerAddressModeZ : int32_t { - Unknown = 0, - None = 1, + Unknown = 0, + None = 1, }; } // namespace ckw -#endif //CKW_INCLUDE_CKW_TENSORSAMPLERTYPES_H +#endif // CKW_INCLUDE_CKW_TYPES_TENSORSAMPLERTYPES_H diff --git a/compute_kernel_writer/prototype/CMakeLists.txt b/compute_kernel_writer/prototype/CMakeLists.txt index 13d1ae8fc44eb83caece3a1c7b95484ef063ca7a..439dcd3b7e89486b8b69ed9408187b4d88650594 100644 --- a/compute_kernel_writer/prototype/CMakeLists.txt +++ b/compute_kernel_writer/prototype/CMakeLists.txt @@ -42,7 +42,7 @@ target_compile_options(ckw_prototype ${CKW_CXX_FLAGS} "$<$:${GNU_WARNINGS}>" "$<$:${CKW_ASSERTS_OPTS}>" - "$<$:${CKW_ASSERTS_OPTS}>" + "$<$:${CKW_ASSERTS_OPTS}>" ${CMAKE_CXX_FLAGS} PRIVATE $<$:-Os> @@ -51,7 +51,7 @@ target_compile_options(ckw_prototype target_compile_definitions(ckw_prototype PUBLIC $<$:COMPUTE_KERNEL_WRITER_DEBUG_ENABLED> $<$:COMPUTE_KERNEL_WRITER_ASSERTS_ENABLED> - $<$:COMPUTE_KERNEL_WRITER_ASSERTS_ENABLED> + $<$:COMPUTE_KERNEL_WRITER_ASSERTS_ENABLED> $<$:COMPUTE_KERNEL_WRITER_OPENCL_ENABLED> ) diff --git a/compute_kernel_writer/prototype/examples/add_exp_store.cpp b/compute_kernel_writer/prototype/examples/add_exp_store.cpp index 6a9884543c5f3ed499aab1365c50468183a88ac9..2b640ca01bf7a14475b56ed2a26f456295b20002 100644 --- a/compute_kernel_writer/prototype/examples/add_exp_store.cpp +++ b/compute_kernel_writer/prototype/examples/add_exp_store.cpp @@ -32,7 +32,6 @@ #include "common/ExampleComponentArgument.h" #include "common/ExampleKernelWriter.h" #include "common/ExampleScopedKernelWriter.h" - #include #include @@ -78,14 +77,14 @@ void op_binary_elementwise(ExampleScopedKernelWriter writer, std::vectorhas_tile() && !rhs->has_tile()) + if (!lhs->has_tile() && !rhs->has_tile()) { const auto sampler = create_simple_sampler(writer); writer->op_load_once(lhs, sampler); writer->op_load_once(rhs, sampler); } - else if(lhs->has_tile()) + else if (lhs->has_tile()) { const auto &sampler = lhs->tile_sampler(); writer->op_load_once(rhs, sampler); @@ -101,7 +100,7 @@ void op_binary_elementwise(ExampleScopedKernelWriter writer, std::vectortile_sampler(); // Prepare the output tile. - if(!dst->has_tile()) + if (!dst->has_tile()) { auto &tile = writer->declare_tile("dst_tile", lhs_tile.tile_info()); dst->init_virtual_tensor(tile, sampler); @@ -119,7 +118,7 @@ void op_exp(ExampleScopedKernelWriter writer, std::vectorhas_tile()) + if (!src->has_tile()) { const auto sampler = create_simple_sampler(writer); writer->op_load_once(src, sampler); @@ -129,7 +128,7 @@ void op_exp(ExampleScopedKernelWriter writer, std::vectortile_sampler(); // Prepare the output tile. 
- if(!dst->has_tile()) + if (!dst->has_tile()) { auto &tile = writer->declare_tile("dst_tile", src_tile.tile_info()); dst->init_virtual_tensor(tile, sampler); @@ -160,34 +159,38 @@ int main() ExampleScopedKernelWriter writer(&root_writer); - const TensorInfo src0_info(DataType::Fp32, TensorShape({ 3, 10, 20, 1, 1 }), TensorDataLayout::Nhwc, 0); - const TensorInfo src1_info(DataType::Fp32, TensorShape({ 3, 10, 20, 1, 1 }), TensorDataLayout::Nhwc, 1); - const TensorInfo dst_info(DataType::Fp32, TensorShape({ 3, 10, 20, 1, 1 }), TensorDataLayout::Nhwc, 2); + const TensorInfo src0_info(DataType::Fp32, TensorShape({3, 10, 20, 1, 1}), TensorDataLayout::Nhwc, 0); + const TensorInfo src1_info(DataType::Fp32, TensorShape({3, 10, 20, 1, 1}), TensorDataLayout::Nhwc, 1); + const TensorInfo dst_info(DataType::Fp32, TensorShape({3, 10, 20, 1, 1}), TensorDataLayout::Nhwc, 2); - ExampleComponentArgument src0(writer->declare_tensor_argument("src0", src0_info, TensorStorageType::BufferUint8Ptr)); - ExampleComponentArgument src1(writer->declare_tensor_argument("src1", src1_info, TensorStorageType::BufferUint8Ptr)); + ExampleComponentArgument src0( + writer->declare_tensor_argument("src0", src0_info, TensorStorageType::BufferUint8Ptr)); + ExampleComponentArgument src1( + writer->declare_tensor_argument("src1", src1_info, TensorStorageType::BufferUint8Ptr)); ExampleComponentArgument dst(writer->declare_tensor_argument("dst", dst_info, TensorStorageType::BufferUint8Ptr)); ExampleComponentArgument ans; - op_binary_elementwise(writer, { &src0, &src1, &ans }); - op_exp(writer, { &ans, &ans }); - op_store(writer, { &ans, &dst }); + op_binary_elementwise(writer, {&src0, &src1, &ans}); + op_exp(writer, {&ans, &ans}); + op_store(writer, {&ans, &dst}); const auto arguments = kernel.arguments(); std::cout << "\n====================\nArguments:\n====================\n"; - for(auto &arg : arguments) + for (auto &arg : arguments) { - switch(arg.type()) + switch (arg.type()) { case ckw::KernelArgument::Type::TensorStorage: - std::cout << "* Tensor storage: ID = " << arg.id() << ", type = " << std::hex << "0x" << static_cast(arg.tensor_storage_type()) << std::dec << "\n"; + std::cout << "* Tensor storage: ID = " << arg.id() << ", type = " << std::hex << "0x" + << static_cast(arg.tensor_storage_type()) << std::dec << "\n"; break; case ckw::KernelArgument::Type::TensorComponent: - std::cout << "* Tensor component: ID = " << arg.id() << ", type = " << std::hex << "0x" << static_cast(arg.tensor_component_type()) << std::dec << "\n"; + std::cout << "* Tensor component: ID = " << arg.id() << ", type = " << std::hex << "0x" + << static_cast(arg.tensor_component_type()) << std::dec << "\n"; break; default: diff --git a/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.cpp b/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.cpp index 5a2ec526cc75d006cb44346189c5981e2244194d..55223dae0efe32dc5c7bc39e2d02cf3a2272e47a 100644 --- a/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.cpp +++ b/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.cpp @@ -23,19 +23,19 @@ */ #include "ExampleComponentArgument.h" + #include "ckw/Error.h" ExampleComponentArgument::ExampleComponentArgument() { } -ExampleComponentArgument::ExampleComponentArgument(ckw::TensorOperand &tensor) - : _tensor(&tensor) +ExampleComponentArgument::ExampleComponentArgument(ckw::TensorOperand &tensor) : _tensor(&tensor) { } -ExampleComponentArgument & 
-ExampleComponentArgument::init_virtual_tensor(ckw::TileOperand &tile, const ckw::TensorTileSampler &tile_sampler) +ExampleComponentArgument &ExampleComponentArgument::init_virtual_tensor(ckw::TileOperand &tile, + const ckw::TensorTileSampler &tile_sampler) { CKW_ASSERT(_tile == nullptr); diff --git a/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.h b/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.h index 9fdc50ba086ab82f870b51e6ce928ec97d2b2bbe..0e029b115700e61ddefa5313878d966744914151 100644 --- a/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.h +++ b/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.h @@ -104,8 +104,8 @@ public: const ckw::TensorTileSampler &tile_sampler() const; private: - ckw::TensorOperand *_tensor{ nullptr }; - ckw::TileOperand *_tile{ nullptr }; + ckw::TensorOperand *_tensor{nullptr}; + ckw::TileOperand *_tile{nullptr}; ckw::TensorTileSampler _tile_sampler{}; }; diff --git a/compute_kernel_writer/prototype/examples/common/ExampleKernelWriter.cpp b/compute_kernel_writer/prototype/examples/common/ExampleKernelWriter.cpp index 6b9f244735d2677f3f6af089ddfdba818447e4c6..1734ce8823fd3a62ba35fb4bde5053358a0b74f4 100644 --- a/compute_kernel_writer/prototype/examples/common/ExampleKernelWriter.cpp +++ b/compute_kernel_writer/prototype/examples/common/ExampleKernelWriter.cpp @@ -23,26 +23,27 @@ */ #include "ExampleKernelWriter.h" -#include "ExampleComponentArgument.h" + #include "ckw/Error.h" #include "ckw/TileInfo.h" -ExampleKernelWriter::ExampleKernelWriter(ckw::Kernel &kernel) - : KernelWriter(kernel) +#include "ExampleComponentArgument.h" + +ExampleKernelWriter::ExampleKernelWriter(ckw::Kernel &kernel) : KernelWriter(kernel) { } void ExampleKernelWriter::op_load_once(ExampleComponentArgument *tensor_or_tile, const ckw::TensorTileSampler &sampler) { - if(!tensor_or_tile->has_tile()) + if (!tensor_or_tile->has_tile()) { CKW_ASSERT(tensor_or_tile->has_tensor()); auto &tensor = tensor_or_tile->tensor(); const auto tile_name = tensor.name() + "_tile"; - auto &tile = declare_tile(tile_name.c_str(), - ckw::TileInfo(tensor.data_type(), sampler.height(), sampler.width())); + auto &tile = + declare_tile(tile_name.c_str(), ckw::TileInfo(tensor.data_type(), sampler.height(), sampler.width())); op_load(tile, tensor, sampler); diff --git a/compute_kernel_writer/prototype/examples/common/ExampleScopedKernelWriter.cpp b/compute_kernel_writer/prototype/examples/common/ExampleScopedKernelWriter.cpp index 7c44fa8749c50f44c9cb48fd3d1366accd72da4a..784d5ffb964167ffa8ca7a58cc679d844f1d8e78 100644 --- a/compute_kernel_writer/prototype/examples/common/ExampleScopedKernelWriter.cpp +++ b/compute_kernel_writer/prototype/examples/common/ExampleScopedKernelWriter.cpp @@ -23,6 +23,7 @@ */ #include "ExampleScopedKernelWriter.h" + #include "ExampleKernelWriter.h" ExampleScopedKernelWriter::ExampleScopedKernelWriter(ExampleKernelWriter *writer) diff --git a/compute_kernel_writer/prototype/examples/writer_helper.cpp b/compute_kernel_writer/prototype/examples/writer_helper.cpp index ccef92dcdfc717a003bbaa49bbe9526014f73889..8623afbf50a8ea813da87725360e9fdc11fdf81a 100644 --- a/compute_kernel_writer/prototype/examples/writer_helper.cpp +++ b/compute_kernel_writer/prototype/examples/writer_helper.cpp @@ -23,14 +23,14 @@ */ #include "ckw/KernelWriter.h" -#include "../include/ckw/KernelWriterHelper.h" #include "ckw/TensorTileSampler.h" +#include "../include/ckw/KernelWriterHelper.h" #include using namespace 
ckw; -TensorTileSampler create_simple_sampler(KernelWriter& writer) +TensorTileSampler create_simple_sampler(KernelWriter &writer) { TensorTileSampler sampler; @@ -65,11 +65,11 @@ TensorTileSampler create_simple_sampler(KernelWriter& writer) int main() { - Kernel kernel("test", GpuTargetLanguage::OpenCL); + Kernel kernel("test", GpuTargetLanguage::OpenCL); KernelWriterHelper writer(kernel); - const TensorInfo src_info(DataType::Fp32, TensorShape({ 1, 1, 1, 1, 1 }), TensorDataLayout::Nhwc, 0); - const TensorInfo dst_info(DataType::Fp32, TensorShape({ 1, 1, 1, 1, 1 }), TensorDataLayout::Nhwc, 1); + const TensorInfo src_info(DataType::Fp32, TensorShape({1, 1, 1, 1, 1}), TensorDataLayout::Nhwc, 0); + const TensorInfo dst_info(DataType::Fp32, TensorShape({1, 1, 1, 1, 1}), TensorDataLayout::Nhwc, 1); auto &src_tensor = writer.declare_tensor_argument("src", src_info); auto &dst_tensor = writer.declare_tensor_argument("dst", dst_info); @@ -77,27 +77,24 @@ int main() const auto sampler = create_simple_sampler(writer); auto &src = writer.declare_tile("src_tile", TileInfo(src_tensor.data_type(), sampler.height(), sampler.width())); - auto &other = writer.declare_tile("other_tile", TileInfo(src_tensor.data_type(), sampler.height(), sampler.width())); + auto &other = + writer.declare_tile("other_tile", TileInfo(src_tensor.data_type(), sampler.height(), sampler.width())); auto &dst = writer.declare_tile("dst_tile", TileInfo(src_tensor.data_type(), sampler.height(), sampler.width())); writer.op_load(src, src_tensor, sampler); writer.op_load(other, src_tensor, sampler); writer.op_load(dst, dst_tensor, sampler); - auto test = dst ^ src ^ other; + auto test = dst ^ src ^ other; auto other_test = logical_and(dst, src, other); writer.op_assign(dst, logical_and(dst, src, other)); writer.op_assign(dst, test); writer.op_assign(dst, other_test); writer.op_assign(dst, operator^(operator^(dst, src), other)); - writer.op_if(exp(src) == dst, [&]{ - writer.op_binary_expression(dst, src, BinaryOp::Add, src); - }).op_else_if(exp(src) > dst, [&]{ - writer.op_binary_expression(dst, src, BinaryOp::Add, src); - }).op_else([&] { - writer.op_assign(dst, src); - }); + writer.op_if(exp(src) == dst, [&] { writer.op_binary_expression(dst, src, BinaryOp::Add, src); }) + .op_else_if(exp(src) > dst, [&] { writer.op_binary_expression(dst, src, BinaryOp::Add, src); }) + .op_else([&] { writer.op_assign(dst, src); }); writer.op_assign(dst, src + src * src); writer.op_assign(dst, src * max(src, dst) + src); @@ -106,13 +103,11 @@ int main() writer.op_assign(dst, src ^ dst); writer.op_assign(dst, ~src); - writer.op_for_loop(dst < src, dst += src, [&]{ - writer.op_assign(dst, src + dst); - }); + writer.op_for_loop(dst < src, dst += src, [&] { writer.op_assign(dst, src + dst); }); writer.op_assign(dst += src); writer.op_assign(dst += exp(src)); std::cout << "======== KERNEL ========" << std::endl; std::cout << writer.generate_code() << std::endl; -} \ No newline at end of file +} diff --git a/compute_kernel_writer/prototype/include/ckw/Error.h b/compute_kernel_writer/prototype/include/ckw/Error.h index b18944eac5ccd6e49eda7db1a1245d06dcdc4ad0..aab713c817a5dcf236fa010012adbc9c0bb40203 100644 --- a/compute_kernel_writer/prototype/include/ckw/Error.h +++ b/compute_kernel_writer/prototype/include/ckw/Error.h @@ -39,11 +39,11 @@ namespace ckw #define CKW_ASSERT_MSG(cond, msg) \ do \ { \ - if(!(cond)) \ + if (!(cond)) \ { \ throw ::std::runtime_error(msg); \ } \ - } while(false) + } while (false) /** If the condition is not met, throw an 
std::runtime_error. * @@ -56,8 +56,7 @@ namespace ckw * @param[in] precond The condition if is met requires the consequence must also be met. * @param[in] cond The condition that is expected to be true if the precondition is true. */ -#define CKW_ASSERT_IF(precond, cond) \ - CKW_ASSERT_MSG(!(precond) || ((precond) && (cond)), #precond " |-> " #cond) +#define CKW_ASSERT_IF(precond, cond) CKW_ASSERT_MSG(!(precond) || ((precond) && (cond)), #precond " |-> " #cond) /** Mark the variables as unused. * diff --git a/compute_kernel_writer/prototype/include/ckw/KernelArgument.h b/compute_kernel_writer/prototype/include/ckw/KernelArgument.h index af8bcde6340b62382293c283bba5818e1a309d35..3384a20aef5062b1f7cb33e5df5f06004b27c05a 100644 --- a/compute_kernel_writer/prototype/include/ckw/KernelArgument.h +++ b/compute_kernel_writer/prototype/include/ckw/KernelArgument.h @@ -26,6 +26,7 @@ #define CKW_PROTOTYPE_INCLUDE_CKW_KERNELARGUMENT_H #include "ckw/TensorInfo.h" + #include namespace ckw @@ -98,7 +99,7 @@ private: TensorComponentType tensor_component_type; }; - SubId _sub_id{ 0 }; + SubId _sub_id{0}; }; } // namespace ckw diff --git a/compute_kernel_writer/prototype/include/ckw/KernelWriter.h b/compute_kernel_writer/prototype/include/ckw/KernelWriter.h index 72f85c78aa234b42d5949ade092ce5a121d69b51..f9e0066f91be19b0ce5403dda0750e1f6efe7518 100644 --- a/compute_kernel_writer/prototype/include/ckw/KernelWriter.h +++ b/compute_kernel_writer/prototype/include/ckw/KernelWriter.h @@ -94,7 +94,9 @@ public: * * @return The @ref TensorOperand object. */ - TensorOperand &declare_tensor_argument(const std::string &name, const TensorInfo &info, TensorStorageType storage_type = TensorStorageType::BufferUint8Ptr); + TensorOperand &declare_tensor_argument(const std::string &name, + const TensorInfo &info, + TensorStorageType storage_type = TensorStorageType::BufferUint8Ptr); /** Declare a compile-time constant scalar argument. * @@ -134,7 +136,10 @@ public: * @param[in] sampler The tensor sampling information. * @param[in] dilation_y Dilation in the Y dimension. */ - void op_load(TileOperand &tile, const TensorOperand &tensor, const TensorTileSampler &sampler, const TileOperand &dilation_y = TileOperand("dil_y", 1)); + void op_load(TileOperand &tile, + const TensorOperand &tensor, + const TensorTileSampler &sampler, + const TileOperand &dilation_y = TileOperand("dil_y", 1)); /** Load the data from the tensor memory to the tile using the indirect buffer approach and respective of the sampling information. * @@ -221,7 +226,10 @@ public: * @param[in] first The first argument tile. * @param[in] second The second argument tile. */ - void op_binary_elementwise_function(const TileOperand &dst, BinaryFunction func, const TileOperand &first, const TileOperand &second); + void op_binary_elementwise_function(const TileOperand &dst, + BinaryFunction func, + const TileOperand &first, + const TileOperand &second); /** Write function applied to scalar value: ` = (, , );`. * @@ -231,7 +239,11 @@ public: * @param[in] second The second argument tile. * @param[in] third The third argument tile. */ - void op_ternary_elementwise_function(const TileOperand &dst, TernaryFunction func, const TileOperand &first, const TileOperand &second, const TileOperand &third); + void op_ternary_elementwise_function(const TileOperand &dst, + TernaryFunction func, + const TileOperand &first, + const TileOperand &second, + const TileOperand &third); /** Write if-statement: `if( ) { }`. 
* @@ -267,7 +279,13 @@ public: * @param[in, out] update_value The value which is updated at every iteration. * @param[in] body The body of the for-loop. */ - void op_for_loop(const TileOperand &var_name, BinaryOp cond_op, const TileOperand &cond_value_name, const TileOperand &update_var_name, AssignmentOp update_op, const TileOperand &update_value_name, const std::function &body); + void op_for_loop(const TileOperand &var_name, + BinaryOp cond_op, + const TileOperand &cond_value_name, + const TileOperand &update_var_name, + AssignmentOp update_op, + const TileOperand &update_value_name, + const std::function &body); /** Write the return statement: `return;` */ @@ -282,7 +300,7 @@ public: * @param[out] dst The tile to be written to. * @param[in] dim The global ID dimension. */ - void op_get_global_id(TileOperand &dst, int32_t dim); + void op_get_global_id(const TileOperand &dst, int32_t dim); // ============================================================================================= // Code generation @@ -311,8 +329,8 @@ private: ::std::unique_ptr _impl_attr; ::std::unique_ptr _impl; - int32_t _id_space{ 0 }; - int32_t _max_id_space{ 0 }; + int32_t _id_space{0}; + int32_t _max_id_space{0}; }; } // namespace ckw diff --git a/compute_kernel_writer/prototype/include/ckw/KernelWriterHelper.h b/compute_kernel_writer/prototype/include/ckw/KernelWriterHelper.h index a8be859680ca03885c1c428cda1df1515678a65b..3ba079bbc21bbf59e7ebcc7df15b6e285e347f2f 100644 --- a/compute_kernel_writer/prototype/include/ckw/KernelWriterHelper.h +++ b/compute_kernel_writer/prototype/include/ckw/KernelWriterHelper.h @@ -32,8 +32,6 @@ #include #include -#include - /* * By including this header file you will be able to supplement the default * Compute Kernel Writer API with additional syntax to help ease the use of CKW. @@ -154,7 +152,9 @@ struct can_be_assigned : ::std::true_type * @tparam TLeft The type of the destination of the assignment. * @tparam TRight The type of the source assigned to the destination. */ -template ::value && can_be_assigned::value>> +template ::value && can_be_assigned::value>> struct Assignment { TLeft lhs; @@ -173,7 +173,7 @@ struct Assignment template inline Assignment operator+=(TLeft &&lhs, TRight &&rhs) { - return Assignment{ std::forward(lhs), std::forward(rhs), AssignmentOp::Increment }; + return Assignment{std::forward(lhs), std::forward(rhs), AssignmentOp::Increment}; } /** Represents the expression: `\p lhs -= \p rhs`. @@ -187,7 +187,7 @@ inline Assignment operator+=(TLeft &&lhs, TRight &&rhs) template inline Assignment operator-=(TLeft &&lhs, TRight &&rhs) { - return Assignment{ std::forward(lhs), std::forward(rhs), AssignmentOp::Decrement }; + return Assignment{std::forward(lhs), std::forward(rhs), AssignmentOp::Decrement}; } // ================================================== @@ -221,7 +221,7 @@ struct can_be_operand> : ::std::true_type template inline UnaryExpression operator!(TSrc &&src) { - return UnaryExpression{ std::forward(src), UnaryOp::LogicalNot }; + return UnaryExpression{std::forward(src), UnaryOp::LogicalNot}; } /** Represents the expression: `~\p src`. 
@@ -233,7 +233,7 @@ inline UnaryExpression operator!(TSrc &&src) template inline UnaryExpression operator~(TSrc &&src) { - return UnaryExpression{ std::forward(src), UnaryOp::BitwiseNot }; + return UnaryExpression{std::forward(src), UnaryOp::BitwiseNot}; } // ================================================== @@ -247,7 +247,9 @@ inline UnaryExpression operator~(TSrc &&src) * @tparam TLeft The type of the left argument of the expression. * @tparam TRight The type of the right argument of the expression. */ -template ::value && can_be_operand::value>> +template ::value && can_be_operand::value>> struct BinaryExpression { TLeft lhs; @@ -271,7 +273,7 @@ struct can_be_operand> : ::std::true_type template inline BinaryExpression operator+(TLeft &&lhs, TRight &&rhs) { - return BinaryExpression{ std::forward(lhs), std::forward(rhs), BinaryOp::Add }; + return BinaryExpression{std::forward(lhs), std::forward(rhs), BinaryOp::Add}; } /** Represents the expression: `\p lhs - \p rhs`. @@ -285,7 +287,7 @@ inline BinaryExpression operator+(TLeft &&lhs, TRight &&rhs) template inline BinaryExpression operator-(TLeft &&lhs, TRight &&rhs) { - return BinaryExpression{ std::forward(lhs), std::forward(rhs), BinaryOp::Sub }; + return BinaryExpression{std::forward(lhs), std::forward(rhs), BinaryOp::Sub}; } /** Represents the expression: `\p lhs * \p rhs`. @@ -299,7 +301,7 @@ inline BinaryExpression operator-(TLeft &&lhs, TRight &&rhs) template inline BinaryExpression operator*(TLeft &&lhs, TRight &&rhs) { - return BinaryExpression{ std::forward(lhs), std::forward(rhs), BinaryOp::Mul }; + return BinaryExpression{std::forward(lhs), std::forward(rhs), BinaryOp::Mul}; } /** Represents the expression: `\p lhs / \p rhs`. @@ -313,7 +315,7 @@ inline BinaryExpression operator*(TLeft &&lhs, TRight &&rhs) template inline BinaryExpression operator/(TLeft &&lhs, TRight &&rhs) { - return BinaryExpression{ std::forward(lhs), std::forward(rhs), BinaryOp::Div }; + return BinaryExpression{std::forward(lhs), std::forward(rhs), BinaryOp::Div}; } /** Represents the expression: `\p lhs % \p rhs`. @@ -327,7 +329,7 @@ inline BinaryExpression operator/(TLeft &&lhs, TRight &&rhs) template inline BinaryExpression operator%(TLeft &&lhs, TRight &&rhs) { - return BinaryExpression{ std::forward(lhs), std::forward(rhs), BinaryOp::Mod }; + return BinaryExpression{std::forward(lhs), std::forward(rhs), BinaryOp::Mod}; } /** Represents the expression: `\p lhs == \p rhs`. @@ -341,7 +343,7 @@ inline BinaryExpression operator%(TLeft &&lhs, TRight &&rhs) template inline BinaryExpression operator==(TLeft &&lhs, TRight &&rhs) { - return BinaryExpression{ std::forward(lhs), std::forward(rhs), BinaryOp::Equal }; + return BinaryExpression{std::forward(lhs), std::forward(rhs), BinaryOp::Equal}; } /** Represents the expression: `\p lhs < \p rhs`. @@ -355,7 +357,7 @@ inline BinaryExpression operator==(TLeft &&lhs, TRight &&rhs) template inline BinaryExpression operator<(TLeft &&lhs, TRight &&rhs) { - return BinaryExpression{ std::forward(lhs), std::forward(rhs), BinaryOp::Less }; + return BinaryExpression{std::forward(lhs), std::forward(rhs), BinaryOp::Less}; } /** Represents the expression: `\p lhs <= \p rhs`. 
@@ -369,7 +371,7 @@ inline BinaryExpression operator<(TLeft &&lhs, TRight &&rhs) template inline BinaryExpression operator<=(TLeft &&lhs, TRight &&rhs) { - return BinaryExpression{ std::forward(lhs), std::forward(rhs), BinaryOp::LessEqual }; + return BinaryExpression{std::forward(lhs), std::forward(rhs), BinaryOp::LessEqual}; } /** Represents the expression: `\p lhs > \p rhs`. @@ -383,7 +385,7 @@ inline BinaryExpression operator<=(TLeft &&lhs, TRight &&rhs) template inline BinaryExpression operator>(TLeft &&lhs, TRight &&rhs) { - return BinaryExpression{ std::forward(lhs), std::forward(rhs), BinaryOp::Greater }; + return BinaryExpression{std::forward(lhs), std::forward(rhs), BinaryOp::Greater}; } /** Represents the expression: `\p lhs >= \p rhs`. @@ -397,7 +399,7 @@ inline BinaryExpression operator>(TLeft &&lhs, TRight &&rhs) template inline BinaryExpression operator>=(TLeft &&lhs, TRight &&rhs) { - return BinaryExpression{ std::forward(lhs), std::forward(rhs), BinaryOp::GreaterEqual }; + return BinaryExpression{std::forward(lhs), std::forward(rhs), BinaryOp::GreaterEqual}; } /** Represents the expression: `\p lhs ^ \p rhs`. @@ -411,7 +413,7 @@ inline BinaryExpression operator>=(TLeft &&lhs, TRight &&rhs) template inline BinaryExpression operator^(TLeft &&lhs, TRight &&rhs) { - return BinaryExpression{ std::forward(lhs), std::forward(rhs), BinaryOp::BitwiseXOR }; + return BinaryExpression{std::forward(lhs), std::forward(rhs), BinaryOp::BitwiseXOR}; } /** Represents the expression: `\p lhs && \p rhs`. @@ -425,7 +427,7 @@ inline BinaryExpression operator^(TLeft &&lhs, TRight &&rhs) template inline BinaryExpression logical_and(TLeft &&lhs, TRight &&rhs) { - return BinaryExpression{ std::forward(lhs), std::forward(rhs), BinaryOp::LogicalAnd }; + return BinaryExpression{std::forward(lhs), std::forward(rhs), BinaryOp::LogicalAnd}; } /** Represents the expression: `\p lhs && \p rhs`. @@ -440,7 +442,7 @@ template inline BinaryExpression, TOps...> logical_and(TLeft &&lhs, TRight &&rhs, TOps &&...ops) { return logical_and( - BinaryExpression{ std::forward(lhs), std::forward(rhs), BinaryOp::LogicalAnd }, + BinaryExpression{std::forward(lhs), std::forward(rhs), BinaryOp::LogicalAnd}, std::forward(ops)...); } @@ -455,7 +457,7 @@ inline BinaryExpression, TOps...> logical_and(TL template inline BinaryExpression logical_or(TLeft &&lhs, TRight &&rhs) { - return BinaryExpression{ std::forward(lhs), std::forward(rhs), BinaryOp::LogicalOr }; + return BinaryExpression{std::forward(lhs), std::forward(rhs), BinaryOp::LogicalOr}; } /** Represents the expression: `\p lhs || \p rhs`. @@ -470,7 +472,7 @@ template inline BinaryExpression, TOps...> logical_or(TLeft &&lhs, TRight &&rhs, TOps &&...ops) { return logical_or( - BinaryExpression{ std::forward(lhs), std::forward(rhs), BinaryOp::LogicalOr }, + BinaryExpression{std::forward(lhs), std::forward(rhs), BinaryOp::LogicalOr}, std::forward(ops)...); } @@ -505,7 +507,7 @@ struct can_be_operand> : ::std::true_type template UnaryElementwiseFunction exp(TSrc &&src) { - return UnaryElementwiseFunction{ std::forward(src), UnaryFunction::Exp }; + return UnaryElementwiseFunction{std::forward(src), UnaryFunction::Exp}; } /** Represents the expression: `tanh(\p src)`. @@ -517,7 +519,7 @@ UnaryElementwiseFunction exp(TSrc &&src) template UnaryElementwiseFunction tanh(TSrc &&src) { - return UnaryElementwiseFunction{ std::forward(src), UnaryFunction::Tanh }; + return UnaryElementwiseFunction{std::forward(src), UnaryFunction::Tanh}; } /** Represents the expression: `sqrt(\p src)`. 
@@ -529,7 +531,7 @@ UnaryElementwiseFunction tanh(TSrc &&src) template UnaryElementwiseFunction sqrt(TSrc &&src) { - return UnaryElementwiseFunction{ std::forward(src), UnaryFunction::Sqrt }; + return UnaryElementwiseFunction{std::forward(src), UnaryFunction::Sqrt}; } /** Represents the expression: `erf(\p src)`. @@ -541,7 +543,7 @@ UnaryElementwiseFunction sqrt(TSrc &&src) template UnaryElementwiseFunction erf(TSrc &&src) { - return UnaryElementwiseFunction{ std::forward(src), UnaryFunction::Erf }; + return UnaryElementwiseFunction{std::forward(src), UnaryFunction::Erf}; } /** Represents the expression: `fabs(\p src)`. @@ -553,7 +555,7 @@ UnaryElementwiseFunction erf(TSrc &&src) template UnaryElementwiseFunction fabs(TSrc &&src) { - return UnaryElementwiseFunction{ std::forward(src), UnaryFunction::Fabs }; + return UnaryElementwiseFunction{std::forward(src), UnaryFunction::Fabs}; } /** Represents the expression: `log(\p src)`. @@ -565,7 +567,7 @@ UnaryElementwiseFunction fabs(TSrc &&src) template UnaryElementwiseFunction log(TSrc &&src) { - return UnaryElementwiseFunction{ std::forward(src), UnaryFunction::Log }; + return UnaryElementwiseFunction{std::forward(src), UnaryFunction::Log}; } /** Represents the expression: `round(\p src)`. @@ -577,7 +579,7 @@ UnaryElementwiseFunction log(TSrc &&src) template UnaryElementwiseFunction round(TSrc &&src) { - return UnaryElementwiseFunction{ std::forward(src), UnaryFunction::Round }; + return UnaryElementwiseFunction{std::forward(src), UnaryFunction::Round}; } /** Represents the expression: `sizeof(\p src)`. @@ -589,7 +591,7 @@ UnaryElementwiseFunction round(TSrc &&src) template UnaryElementwiseFunction sizeOf(TSrc &&src) { - return UnaryElementwiseFunction{ std::forward(src), UnaryFunction::SizeOf }; + return UnaryElementwiseFunction{std::forward(src), UnaryFunction::SizeOf}; } // ================================================== @@ -603,7 +605,9 @@ UnaryElementwiseFunction sizeOf(TSrc &&src) * @tparam TFirst The type of the left argument of the function. * @tparam TSecond The type of the right argument of the function. */ -template ::value && can_be_operand::value>> +template ::value && can_be_operand::value>> struct BinaryElementwiseFunction { TFirst first; @@ -627,7 +631,8 @@ struct can_be_operand> : ::std::true_ template BinaryElementwiseFunction max(TFirst &&first, TSecond &&second) { - return BinaryElementwiseFunction{ std::forward(first), std::forward(second), BinaryFunction::Max }; + return BinaryElementwiseFunction{std::forward(first), std::forward(second), + BinaryFunction::Max}; } /** Represents the function call: `min(\p first, \p second)`. @@ -641,7 +646,8 @@ BinaryElementwiseFunction max(TFirst &&first, TSecond &&second) template BinaryElementwiseFunction min(TFirst &&first, TSecond &&second) { - return BinaryElementwiseFunction{ std::forward(first), std::forward(second), BinaryFunction::Min }; + return BinaryElementwiseFunction{std::forward(first), std::forward(second), + BinaryFunction::Min}; } // ================================================== @@ -656,7 +662,11 @@ BinaryElementwiseFunction min(TFirst &&first, TSecond &&second) * @tparam TSecond The type of the second argument to the function. * @tparam TThird The type of the third argument to the function. 
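The BinaryElementwiseFunction hunk above re-wraps a long enable_if constraint: max() and min() are only meant to accept argument types registered through the can_be_operand trait. A compilable sketch of that SFINAE guard, with simplified types that stand in for the library's own:

#include <type_traits>
#include <utility>

// Only types explicitly registered as operands may appear in an elementwise call.
template <typename T>
struct can_be_operand : std::false_type
{
};

struct Tile
{
};

template <>
struct can_be_operand<Tile> : std::true_type
{
};

enum class BinaryFunction
{
    Min,
    Max
};

// The trailing enable_if parameter rejects unregistered operand types at compile time.
template <typename TFirst,
          typename TSecond,
          typename = std::enable_if_t<can_be_operand<std::decay_t<TFirst>>::value &&
                                      can_be_operand<std::decay_t<TSecond>>::value>>
struct BinaryElementwiseFunction
{
    TFirst         first;
    TSecond        second;
    BinaryFunction opcode;
};

template <typename TFirst, typename TSecond>
BinaryElementwiseFunction<TFirst, TSecond> max(TFirst &&first, TSecond &&second)
{
    return BinaryElementwiseFunction<TFirst, TSecond>{std::forward<TFirst>(first), std::forward<TSecond>(second),
                                                      BinaryFunction::Max};
}

int main()
{
    Tile a, b;
    auto call = max(a, b); // records the Max opcode and both operands for later code generation
    (void)call;
    // max(1, 2) would fail to compile: int is not registered via can_be_operand.
}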
*/ -template ::value && can_be_operand::value && can_be_operand::value>> +template ::value && can_be_operand::value && + can_be_operand::value>> struct TernaryElementwiseFunction { TFirst first; @@ -683,7 +693,9 @@ struct can_be_operand> : ::s template TernaryElementwiseFunction select(TFirst &&first, TSecond &&second, TThird &&third) { - return TernaryElementwiseFunction{ std::forward(first), std::forward(second), std::forward(third), TernaryFunction::Select }; + return TernaryElementwiseFunction{std::forward(first), + std::forward(second), + std::forward(third), TernaryFunction::Select}; } /** Helper class used to extend a KernelWriter with additional functionality @@ -715,7 +727,8 @@ public: * @param[in] cond The BinaryExpression representing the condition. * @param[in] body The body of the if-statement. */ - KernelWriterHelper &op_if(const BinaryExpression &cond, const std::function &body) + KernelWriterHelper &op_if(const BinaryExpression &cond, + const std::function &body) { TWriter::op_if(cond.lhs, cond.opcode, cond.rhs, body); return *this; @@ -730,7 +743,8 @@ public: * @param[in] body The body of the if-statement. */ template - KernelWriterHelper &op_if(const BinaryExpression &cond, const std::function &body) + KernelWriterHelper &op_if(const BinaryExpression &cond, + const std::function &body) { auto &tmp1 = declare_temp_tile(cond.lhs.tile_info()); op_assign(tmp1, cond.rhs); @@ -747,7 +761,8 @@ public: * @param[in] body The body of the if-statement. */ template - KernelWriterHelper &op_if(const BinaryExpression &cond, const std::function &body) + KernelWriterHelper &op_if(const BinaryExpression &cond, + const std::function &body) { auto &tmp1 = declare_temp_tile(cond.rhs.tile_info()); op_assign(tmp1, cond.lhs); @@ -766,7 +781,8 @@ public: * @param[in] cond The BinaryExpression representing the condition. * @param[in] body The body of the else-if-statement. */ - KernelWriterHelper &op_else_if(const BinaryExpression &cond, const std::function &body) + KernelWriterHelper &op_else_if(const BinaryExpression &cond, + const std::function &body) { TWriter::op_else_if(cond.lhs, cond.opcode, cond.rhs, body); return *this; @@ -781,7 +797,8 @@ public: * @param[in] body The body of the else-if-statement. */ template - KernelWriterHelper &op_else_if(const BinaryExpression &cond, const std::function &body) + KernelWriterHelper &op_else_if(const BinaryExpression &cond, + const std::function &body) { auto &tmp1 = declare_temp_tile(cond.lhs.tile_info()); op_assign(tmp1, cond.rhs); @@ -798,7 +815,8 @@ public: * @param[in] body The body of the else-if-statement. */ template - KernelWriterHelper &op_else_if(const BinaryExpression &cond, const std::function &body) + KernelWriterHelper &op_else_if(const BinaryExpression &cond, + const std::function &body) { auto &tmp1 = declare_temp_tile(cond.rhs.tile_info()); op_assign(tmp1, cond.lhs); @@ -823,7 +841,9 @@ public: * @param[in] updater The Assignment representing the updater. * @param[in] body The body of the for-loop. */ - void op_for_loop(const BinaryExpression &cond, const Assignment &updater, const std::function &body) + void op_for_loop(const BinaryExpression &cond, + const Assignment &updater, + const std::function &body) { TWriter::op_for_loop(cond.lhs, cond.opcode, cond.rhs, updater.lhs, updater.opcode, updater.rhs, body); } @@ -1029,7 +1049,8 @@ public: * @param[in] dst The tile which is assigned to. * @param[in] exp The TernaryElementwiseFunction representing the expression to be evaluated and assigned. 
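The op_if/op_else_if/op_for_loop hunks only re-flow parameter lists, but the API shape is worth noting: the condition arrives as a captured (lhs, opcode, rhs) triple and the body as a std::function<void()> invoked between the emitted braces. A stripped-down sketch, printing text instead of driving a real writer and using plain strings in place of tiles:

#include <functional>
#include <iostream>
#include <string>

enum class BinaryOp
{
    Less,
    LessEqual
};

// Captured condition: left operand, opcode, right operand.
struct Cond
{
    std::string lhs;
    BinaryOp    opcode;
    std::string rhs;
};

// Emits "if (...) { ... }", calling the body callback between the braces.
void op_if(const Cond &cond, const std::function<void()> &body)
{
    const char *op = (cond.opcode == BinaryOp::Less) ? "<" : "<=";
    std::cout << "if (" << cond.lhs << ' ' << op << ' ' << cond.rhs << ")\n{\n";
    body();
    std::cout << "}\n";
}

int main()
{
    op_if(Cond{"gid_x", BinaryOp::Less, "n0"}, [] { std::cout << "    dst += src;\n"; });
}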
*/ - void op_assign(const TileOperand &dst, const TernaryElementwiseFunction &exp) + void op_assign(const TileOperand &dst, + const TernaryElementwiseFunction &exp) { TWriter::op_ternary_elementwise_function(dst, exp.opcode, exp.first, exp.second, exp.third); } @@ -1169,11 +1190,11 @@ public: */ void op_assign(const Assignment &exp) { - if(exp.opcode == AssignmentOp::Increment) + if (exp.opcode == AssignmentOp::Increment) { TWriter::op_binary_expression(exp.lhs, exp.lhs, BinaryOp::Add, exp.rhs); } - else if(exp.opcode == AssignmentOp::Decrement) + else if (exp.opcode == AssignmentOp::Decrement) { TWriter::op_binary_expression(exp.lhs, exp.lhs, BinaryOp::Sub, exp.rhs); } @@ -1192,7 +1213,7 @@ public: { auto &tmp1 = declare_temp_tile(exp.lhs.tile_info()); op_assign(tmp1, exp.rhs); - op_assign(Assignment{ exp.lhs, tmp1, exp.opcode }); + op_assign(Assignment{exp.lhs, tmp1, exp.opcode}); } private: @@ -1241,11 +1262,8 @@ private: template ::value>> TileInfo get_largest_size(const TileInfo &first, const TileInfo &second, const TOps &...ops) { - TileInfo largest = { - first.data_type(), - std::max(first.width(), second.width()), - std::max(first.height(), second.height()) - }; + TileInfo largest = {first.data_type(), std::max(first.width(), second.width()), + std::max(first.height(), second.height())}; return get_largest_size(largest, ops...); } diff --git a/compute_kernel_writer/prototype/include/ckw/OperandBase.h b/compute_kernel_writer/prototype/include/ckw/OperandBase.h index 06d9f82756d430e74ac387adfc64048c901a9904..984212733914c144f3084b330b202516a084bbdd 100644 --- a/compute_kernel_writer/prototype/include/ckw/OperandBase.h +++ b/compute_kernel_writer/prototype/include/ckw/OperandBase.h @@ -26,6 +26,7 @@ #define CKW_PROTOTYPE_INCLUDE_CKW_OPERANDBASE_H #include "ckw/types/DataType.h" + #include namespace ckw diff --git a/compute_kernel_writer/prototype/include/ckw/ScalarValue.h b/compute_kernel_writer/prototype/include/ckw/ScalarValue.h index 16c3f6d441dc8676bafd2940380b3009f59d58e8..2a9c42acc82b3d4198f0b65c7bef32f0bdbe429d 100644 --- a/compute_kernel_writer/prototype/include/ckw/ScalarValue.h +++ b/compute_kernel_writer/prototype/include/ckw/ScalarValue.h @@ -59,9 +59,9 @@ public: _size = sizeof(T); - if(::std::is_integral::value) + if (::std::is_integral::value) { - if(::std::is_signed::value) + if (::std::is_signed::value) { _type = Type::INT; _value.i64 = value; @@ -90,9 +90,9 @@ public: CKW_ASSERT(::std::is_integral::value || ::std::is_floating_point::value); CKW_ASSERT(sizeof(T) >= _size); - if(::std::is_integral::value) + if (::std::is_integral::value) { - if(::std::is_signed::value) + if (::std::is_signed::value) { CKW_ASSERT(_type == Type::INT || _type == Type::UINT); CKW_ASSERT_IF(_type == Type::UINT, sizeof(T) > _size); diff --git a/compute_kernel_writer/prototype/include/ckw/TensorInfo.h b/compute_kernel_writer/prototype/include/ckw/TensorInfo.h index 55f8101a53c68d09703c848fdc4214ea3ea7b4fd..24da7dc8ab4f2bd89b9b89a65f118a495d4117e4 100644 --- a/compute_kernel_writer/prototype/include/ckw/TensorInfo.h +++ b/compute_kernel_writer/prototype/include/ckw/TensorInfo.h @@ -143,10 +143,10 @@ public: int32_t id() const; private: - TensorShape _shape{ { 0 } }; - DataType _dt{ DataType::Unknown }; - TensorDataLayout _dl{ TensorDataLayout::Unknown }; - int32_t _id{ -1 }; + TensorShape _shape{{0}}; + DataType _dt{DataType::Unknown}; + TensorDataLayout _dl{TensorDataLayout::Unknown}; + int32_t _id{-1}; }; } // namespace ckw diff --git 
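Beyond the formatting change, the op_assign(const Assignment &) hunk above shows how compound assignments are lowered: Increment becomes a binary Add with the destination repeated as the left source, and Decrement becomes a Sub. The same rewrite in stand-alone form, emitting strings instead of calling op_binary_expression:

#include <iostream>
#include <string>

enum class AssignmentOp
{
    Increment, // +=
    Decrement  // -=
};

// a += b is emitted as a = a + b; a -= b as a = a - b.
void op_assign(const std::string &lhs, AssignmentOp op, const std::string &rhs)
{
    const char *binop = (op == AssignmentOp::Increment) ? "+" : "-";
    std::cout << lhs << " = " << lhs << ' ' << binop << ' ' << rhs << ";\n";
}

int main()
{
    op_assign("acc", AssignmentOp::Increment, "val"); // acc = acc + val;
    op_assign("i", AssignmentOp::Decrement, "step");  // i = i - step;
}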
a/compute_kernel_writer/prototype/include/ckw/TensorOperand.h b/compute_kernel_writer/prototype/include/ckw/TensorOperand.h index 6d88932c6635cd7e0211ab0002ef75ab52922c42..c221b449fa28aea94a3c612186ec0ad1df2ab58b 100644 --- a/compute_kernel_writer/prototype/include/ckw/TensorOperand.h +++ b/compute_kernel_writer/prototype/include/ckw/TensorOperand.h @@ -139,21 +139,21 @@ private: TensorInfo _info; TensorStorageType _storage_type; - TileOperand *_tile{ nullptr }; + TileOperand *_tile{nullptr}; TensorTileSampler _tile_sampler{}; - ::std::unique_ptr _stride1{ nullptr }; - ::std::unique_ptr _stride2{ nullptr }; - ::std::unique_ptr _stride3{ nullptr }; - ::std::unique_ptr _stride4{ nullptr }; - ::std::unique_ptr _dim0{ nullptr }; - ::std::unique_ptr _dim1{ nullptr }; - ::std::unique_ptr _dim2{ nullptr }; - ::std::unique_ptr _dim3{ nullptr }; - ::std::unique_ptr _dim4{ nullptr }; - ::std::unique_ptr _dim1_dim2{ nullptr }; - ::std::unique_ptr _dim1_dim2_dim3{ nullptr }; - ::std::unique_ptr _offset_first_element_in_bytes{ nullptr }; + ::std::unique_ptr _stride1{nullptr}; + ::std::unique_ptr _stride2{nullptr}; + ::std::unique_ptr _stride3{nullptr}; + ::std::unique_ptr _stride4{nullptr}; + ::std::unique_ptr _dim0{nullptr}; + ::std::unique_ptr _dim1{nullptr}; + ::std::unique_ptr _dim2{nullptr}; + ::std::unique_ptr _dim3{nullptr}; + ::std::unique_ptr _dim4{nullptr}; + ::std::unique_ptr _dim1_dim2{nullptr}; + ::std::unique_ptr _dim1_dim2_dim3{nullptr}; + ::std::unique_ptr _offset_first_element_in_bytes{nullptr}; }; // ================================================================================================= diff --git a/compute_kernel_writer/prototype/include/ckw/TensorTileSampler.h b/compute_kernel_writer/prototype/include/ckw/TensorTileSampler.h index e1bf0c52b8639e0fe2ba6820f6e5c8568a071d4a..606dec35353be920f612d5f4929209489d4d80c1 100644 --- a/compute_kernel_writer/prototype/include/ckw/TensorTileSampler.h +++ b/compute_kernel_writer/prototype/include/ckw/TensorTileSampler.h @@ -26,6 +26,7 @@ #define CKW_PROTOTYPE_INCLUDE_CKW_TENSORTILESAMPLER_H #include "ckw/types/TensorSamplerTypes.h" + #include namespace ckw @@ -55,12 +56,14 @@ public: * @param[in] address_mode_y The address mode of the y dimension. * @param[in] address_mode_z The address mode of the z dimension. */ - TensorTileSampler( - TileOperand &x, TileOperand &y, TileOperand &z, TileOperand &b, - TensorSamplerFormat format, - TensorSamplerAddressModeX address_mode_x, - TensorSamplerAddressModeY address_mode_y, - TensorSamplerAddressModeZ address_mode_z); + TensorTileSampler(TileOperand &x, + TileOperand &y, + TileOperand &z, + TileOperand &b, + TensorSamplerFormat format, + TensorSamplerAddressModeX address_mode_x, + TensorSamplerAddressModeY address_mode_y, + TensorSamplerAddressModeZ address_mode_z); /** Initialize a new instance of @ref TensorSampler class. * @@ -75,13 +78,16 @@ public: * @param[in] address_mode_y The address mode of the y dimension. * @param[in] address_mode_z The address mode of the z dimension. 
*/ - TensorTileSampler( - TileOperand &x, TileOperand &y, TileOperand &z, TileOperand &b, - int32_t height, int32_t width, - TensorSamplerFormat format, - TensorSamplerAddressModeX address_mode_x, - TensorSamplerAddressModeY address_mode_y, - TensorSamplerAddressModeZ address_mode_z); + TensorTileSampler(TileOperand &x, + TileOperand &y, + TileOperand &z, + TileOperand &b, + int32_t height, + int32_t width, + TensorSamplerFormat format, + TensorSamplerAddressModeX address_mode_x, + TensorSamplerAddressModeY address_mode_y, + TensorSamplerAddressModeZ address_mode_z); /** Get the coordinate in the x dimension. */ const TileOperand &x() const; @@ -144,18 +150,18 @@ public: TensorTileSampler &address_mode_z(TensorSamplerAddressModeZ address_mode_z); private: - TileOperand *_x{ nullptr }; - TileOperand *_y{ nullptr }; - TileOperand *_z{ nullptr }; - TileOperand *_b{ nullptr }; - - int32_t _height{ 0 }; - int32_t _width{ 0 }; - - TensorSamplerFormat _format{ TensorSamplerFormat::Unknown }; - TensorSamplerAddressModeX _address_mode_x{ TensorSamplerAddressModeX::Unknown }; - TensorSamplerAddressModeY _address_mode_y{ TensorSamplerAddressModeY::Unknown }; - TensorSamplerAddressModeZ _address_mode_z{ TensorSamplerAddressModeZ::Unknown }; + TileOperand *_x{nullptr}; + TileOperand *_y{nullptr}; + TileOperand *_z{nullptr}; + TileOperand *_b{nullptr}; + + int32_t _height{0}; + int32_t _width{0}; + + TensorSamplerFormat _format{TensorSamplerFormat::Unknown}; + TensorSamplerAddressModeX _address_mode_x{TensorSamplerAddressModeX::Unknown}; + TensorSamplerAddressModeY _address_mode_y{TensorSamplerAddressModeY::Unknown}; + TensorSamplerAddressModeZ _address_mode_z{TensorSamplerAddressModeZ::Unknown}; }; } // namespace ckw diff --git a/compute_kernel_writer/prototype/include/ckw/TileInfo.h b/compute_kernel_writer/prototype/include/ckw/TileInfo.h index de9e47af2b1d3b0571f720ebbd92455460ea4345..e0d064169ed068b082168dbee6f78ffcaef4437c 100644 --- a/compute_kernel_writer/prototype/include/ckw/TileInfo.h +++ b/compute_kernel_writer/prototype/include/ckw/TileInfo.h @@ -83,7 +83,7 @@ public: DataType data_type() const; private: - DataType _dt{ DataType::Unknown }; + DataType _dt{DataType::Unknown}; TileShape _shape{}; }; diff --git a/compute_kernel_writer/prototype/include/ckw/types/Functions.h b/compute_kernel_writer/prototype/include/ckw/types/Functions.h index 2dd5ed0b3d3cee1e6e0a4df6b78418c18b322126..c6afaa0ac8d8bbe80e1013f0ddaf358797904f39 100644 --- a/compute_kernel_writer/prototype/include/ckw/types/Functions.h +++ b/compute_kernel_writer/prototype/include/ckw/types/Functions.h @@ -32,27 +32,29 @@ namespace ckw enum class UnaryFunction : int32_t { - Exp = 0x0000, - Tanh = 0x0001, - Sqrt = 0x0002, - Erf = 0x0003, - Fabs = 0x0004, - Log = 0x0006, - Round = 0x0007, + Exp = 0x0000, + Tanh = 0x0001, + Sqrt = 0x0002, + Erf = 0x0003, + Fabs = 0x0004, + Log = 0x0006, + Round = 0x0007, + Floor = 0x0008, // Misc - SizeOf = 0x0008, + SizeOf = 0x0009, }; enum class BinaryFunction : int32_t { - Min = 0x0000, - Max = 0x0001, + Min = 0x0000, + Max = 0x0001, }; enum class TernaryFunction : int32_t { Select = 0x0000, + Clamp = 0x0001, }; } // namespace ckw diff --git a/compute_kernel_writer/prototype/include/ckw/types/Operators.h b/compute_kernel_writer/prototype/include/ckw/types/Operators.h index 14a88c91b48f1ee976deec9a30a81d9000e3cfac..b560996837614618ec69b15a214c13cdb7fe1f9a 100644 --- a/compute_kernel_writer/prototype/include/ckw/types/Operators.h +++ b/compute_kernel_writer/prototype/include/ckw/types/Operators.h @@ 
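Note that the Functions.h hunk above is not purely cosmetic: UnaryFunction gains Floor in the 0x0008 slot (SizeOf moves to 0x0009) and TernaryFunction gains Clamp. Any switch over these enums needs to cover the new enumerators; a hypothetical mapping helper, not part of this patch, might look like:

#include <cstdint>
#include <iostream>

// Enumerator values as they appear after this patch.
enum class UnaryFunction : int32_t
{
    Exp    = 0x0000,
    Tanh   = 0x0001,
    Sqrt   = 0x0002,
    Erf    = 0x0003,
    Fabs   = 0x0004,
    Log    = 0x0006,
    Round  = 0x0007,
    Floor  = 0x0008,
    SizeOf = 0x0009,
};

// Hypothetical helper: maps a few enumerators to OpenCL built-in names.
const char *to_cl_builtin(UnaryFunction f)
{
    switch (f)
    {
        case UnaryFunction::Exp:
            return "exp";
        case UnaryFunction::Sqrt:
            return "sqrt";
        case UnaryFunction::Floor:
            return "floor"; // new enumerator introduced by this patch
        default:
            return "<unhandled>";
    }
}

int main()
{
    std::cout << to_cl_builtin(UnaryFunction::Floor) << '\n';
}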
-34,6 +34,7 @@ enum class UnaryOp : int32_t { LogicalNot = 0x0000, // ! BitwiseNot = 0x0001, // ~ + Negate = 0x0002, // - }; /* Binary operations @@ -68,8 +69,8 @@ enum class BinaryOp : int32_t enum class AssignmentOp : int32_t { // Unary - Increment = 0x0000, // += - Decrement = 0x0001, // -= + Increment = 0x0000, // += + Decrement = 0x0001, // -= }; } // namespace ckw diff --git a/compute_kernel_writer/prototype/include/ckw/types/TensorSamplerTypes.h b/compute_kernel_writer/prototype/include/ckw/types/TensorSamplerTypes.h index 836bd13c95b17e69854713a75eadfe7f29278785..63405a0764e0bae9027fe18acfa97e16a60d5594 100644 --- a/compute_kernel_writer/prototype/include/ckw/types/TensorSamplerTypes.h +++ b/compute_kernel_writer/prototype/include/ckw/types/TensorSamplerTypes.h @@ -39,34 +39,38 @@ enum class TensorSamplerFormat : int32_t enum class TensorSamplerAddressModeX : int32_t { - Unknown = 0, - None = 1, // The user guarantees that the X coordinate is always in-bound - OverlappingMin = 2 // (FIXED shapes only) Reduce the load/store length when x == 0 (MIN). The load length will be width % original length - // Leftover elements can be handled using overlapping. This involves processing some of the elements in the array twice. + Unknown = 0, + None = 1, // The user guarantees that the X coordinate is always in-bound + OverlappingMin = + 2 // (FIXED shapes only) Reduce the load/store length when x == 0 (MIN). The load length will be width % original length + // Leftover elements can be handled using overlapping. This involves processing some of the elements in the array twice. }; enum class TensorSamplerAddressModeY : int32_t { - Unknown = 0, - None = 1, // The user guarantees that the Y coordinate is always in-bound - OverlappingMin = 2, // (FIXED shapes only) Reduce the load/store length when x == 0 (MIN). The load length will be width % original length - Skip = 3, // Skip the read/write - SkipMinEdgeOnly = 4, // Skip greater than or equal to max only. The user guarantees that the Y coordinate is always >= 0 - SkipMaxEdgeOnly = 5, // Skip less than 0 only - ClampToNearest = 6, // Clamp the coordinate to nearest edge (0 or max value allowed on Y) - ClampToMinEdgeOnly = 7, // Clamp the negative coordinate to 0 only. Therefore, we expect Y to be always < MAX - ClampToMaxEdgeOnly = 8, // Clamp the coordinate to the max value allowed on Y only. We expect Y to be always >= 0 - ClampToBorder = 9, // Clamp to border which always has 0 value + Unknown = 0, + None = 1, // The user guarantees that the Y coordinate is always in-bound + OverlappingMin = + 2, // (FIXED shapes only) Reduce the load/store length when x == 0 (MIN). The load length will be width % original length + Skip = 3, // Skip the read/write + SkipMinEdgeOnly = + 4, // Skip greater than or equal to max only. The user guarantees that the Y coordinate is always >= 0 + SkipMaxEdgeOnly = 5, // Skip less than 0 only + ClampToNearest = 6, // Clamp the coordinate to nearest edge (0 or max value allowed on Y) + ClampToMinEdgeOnly = 7, // Clamp the negative coordinate to 0 only. Therefore, we expect Y to be always < MAX + ClampToMaxEdgeOnly = 8, // Clamp the coordinate to the max value allowed on Y only. 
We expect Y to be always >= 0 + ClampToBorder = 9, // Clamp to border which always has 0 value ClampToBorderMinEdgeOnly = 10, ClampToBorderMaxEdgeOnly = 11 }; enum class TensorSamplerAddressModeZ : int32_t { - Unknown = 0, - None = 1, // The user guarantees that the Y coordinate is always in-bound - Skip = 3, // Skip the read/write - SkipMinEdgeOnly = 4, // Skip greater than or equal to max only. The user guarantees that the Y coordinate is always >= 0 + Unknown = 0, + None = 1, // The user guarantees that the Y coordinate is always in-bound + Skip = 3, // Skip the read/write + SkipMinEdgeOnly = + 4, // Skip greater than or equal to max only. The user guarantees that the Y coordinate is always >= 0 SkipMaxEdgeOnly = 5, // Skip less than 0 only ClampToNearest = 6, // Clamp the coordinate to nearest edge (0 or max value allowed on Y) ClampToMinEdgeOnly = 7, // Clamp the negative coordinate to 0 only. Therefore, we expect Y to be always < MAX diff --git a/compute_kernel_writer/prototype/src/Kernel.cpp b/compute_kernel_writer/prototype/src/Kernel.cpp index 095ac879f18a8eec2831688b00b9e207e7fb9fd9..6228ed17d093e6e7cc7bf58427ac291a1103c764 100644 --- a/compute_kernel_writer/prototype/src/Kernel.cpp +++ b/compute_kernel_writer/prototype/src/Kernel.cpp @@ -23,24 +23,27 @@ */ #include "ckw/Kernel.h" + #include "ckw/TensorOperand.h" #include "ckw/types/GpuTargetLanguage.h" + #include "src/Prototype.h" namespace ckw { -Kernel::Kernel(GpuTargetLanguage language) - : Kernel{"unnamed", language} +Kernel::Kernel(GpuTargetLanguage language) : Kernel{"unnamed", language} { } Kernel::Kernel(const char *name, GpuTargetLanguage language) - : _name(name), _kernel(std::make_unique(language)), _operands{}, _tensor_id_operands{} + : _name(name), + _kernel(std::make_unique(language)), + _operands{}, + _tensor_id_operands{} { } - Kernel::~Kernel() { } @@ -50,7 +53,7 @@ const std::string &Kernel::name() const return _name; } -void Kernel::name(const std::string& name) +void Kernel::name(const std::string &name) { _name = name; } @@ -60,14 +63,14 @@ std::vector Kernel::arguments() const const auto impl_args = _kernel->arguments.tensor_argument_declarations(); - for(auto tensor_arg : impl_args) + for (auto tensor_arg : impl_args) { auto tensor = _tensor_id_operands.at(tensor_arg->format().id); arguments.push_back(*tensor); - for(auto component_arg : tensor_arg->component_declarations()) + for (auto component_arg : tensor_arg->component_declarations()) { - switch(component_arg) + switch (component_arg) { case TensorComponentType::OffsetFirstElement: arguments.push_back(tensor->offset_first_element_in_bytes()); diff --git a/compute_kernel_writer/prototype/src/KernelArgument.cpp b/compute_kernel_writer/prototype/src/KernelArgument.cpp index 2b4d7c8cee5f93c587e0e48f35c777899abd0759..24ace28eb3642a777504686569e562fd9a0b2b5e 100644 --- a/compute_kernel_writer/prototype/src/KernelArgument.cpp +++ b/compute_kernel_writer/prototype/src/KernelArgument.cpp @@ -23,14 +23,14 @@ */ #include "ckw/KernelArgument.h" + #include "ckw/Error.h" #include "ckw/TensorOperand.h" namespace ckw { -KernelArgument::KernelArgument(TensorOperand &tensor) - : _type(Type::TensorStorage), _id(tensor.info().id()) +KernelArgument::KernelArgument(TensorOperand &tensor) : _type(Type::TensorStorage), _id(tensor.info().id()) { _sub_id.tensor_storage_type = tensor.storage_type(); } diff --git a/compute_kernel_writer/prototype/src/KernelWriter.cpp b/compute_kernel_writer/prototype/src/KernelWriter.cpp index 
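The Kernel::arguments() hunk above keeps its structure: for every tensor argument declared by the generated kernel, the tensor operand is appended first and then one entry per component (offset, strides, dimensions) that the generated code actually referenced. A simplified sketch of that flattening, using strings in place of the real KernelArgument type:

#include <iostream>
#include <string>
#include <vector>

struct TensorArg
{
    std::string              name;
    std::vector<std::string> components; // e.g. "offset_first_element", "stride1"
};

std::vector<std::string> flatten_arguments(const std::vector<TensorArg> &tensors)
{
    std::vector<std::string> arguments;
    for (const auto &tensor : tensors)
    {
        arguments.push_back(tensor.name); // the storage pointer comes first
        for (const auto &component : tensor.components)
        {
            arguments.push_back(tensor.name + "_" + component); // then each referenced component
        }
    }
    return arguments;
}

int main()
{
    const std::vector<TensorArg> tensors = {{"src", {"offset_first_element", "stride1"}}, {"dst", {"stride1"}}};
    for (const auto &arg : flatten_arguments(tensors))
    {
        std::cout << arg << '\n';
    }
}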
f29cf1280215487bf0afa0a2e52c16d9ee95f3e1..9f58d9fefa86b30dc65c777272a971acf3c7de3b 100644 --- a/compute_kernel_writer/prototype/src/KernelWriter.cpp +++ b/compute_kernel_writer/prototype/src/KernelWriter.cpp @@ -23,9 +23,11 @@ */ #include "ckw/KernelWriter.h" + #include "ckw/Error.h" #include "ckw/TensorInfo.h" #include "ckw/TensorOperand.h" + #include "src/Prototype.h" #include @@ -38,7 +40,7 @@ namespace inline prototype::TensorInfo create_impl_tensor_info(const TensorInfo &info) { - return prototype::TensorInfo{ info.shape(), info.data_type(), info.data_layout(), info.id() }; + return prototype::TensorInfo{info.shape(), info.data_type(), info.data_layout(), info.id()}; } } // namespace @@ -86,7 +88,8 @@ int32_t KernelWriter::next_id_space() // Tensor and tile declaration // ================================================================================================= -TensorOperand &KernelWriter::declare_tensor_argument(const std::string &name, const TensorInfo &info, TensorStorageType storage_type) +TensorOperand & +KernelWriter::declare_tensor_argument(const std::string &name, const TensorInfo &info, TensorStorageType storage_type) { const auto var_name = generate_variable_name(name); @@ -120,13 +123,11 @@ TileOperand &KernelWriter::declare_tile_operand(std::unique_ptr ope auto &operand = _kernel->register_operand(std::move(operand_ptr)); const auto &name = operand.name(); - if(!operand.is_constant()) + if (!operand.is_constant()) { const auto &info = operand.tile_info(); - _impl->declare_tile( - name, - prototype::TileInfo(info.data_type(), info.width(), info.height())); + _impl->declare_tile(name, prototype::TileInfo(info.data_type(), info.width(), info.height())); } else { @@ -140,16 +141,15 @@ TileOperand &KernelWriter::declare_tile_operand(std::unique_ptr ope // Load and store // ================================================================================================= -void KernelWriter::op_load(TileOperand &tile, const TensorOperand &tensor, const TensorTileSampler &sampler, const TileOperand &dilation_y) +void KernelWriter::op_load(TileOperand &tile, + const TensorOperand &tensor, + const TensorTileSampler &sampler, + const TileOperand &dilation_y) { prototype::TensorOperand impl_tensor( tensor.name(), - prototype::GpuSampler{ - sampler.format(), - prototype::to_gpu_tensor_storage(tensor.storage_type()), - sampler.address_mode_x(), - sampler.address_mode_y(), - sampler.address_mode_z() }); + prototype::GpuSampler{sampler.format(), prototype::to_gpu_tensor_storage(tensor.storage_type()), + sampler.address_mode_x(), sampler.address_mode_y(), sampler.address_mode_z()}); auto impl_x = sampler.x().create_impl_operand(_impl.get()); auto impl_y = sampler.y().create_impl_operand(_impl.get()); @@ -167,12 +167,8 @@ void KernelWriter::op_load_indirect(TileOperand &tile, const TensorOperand &tens { prototype::TensorOperand impl_tensor( tensor.name(), - prototype::GpuSampler{ - sampler.format(), - prototype::to_gpu_tensor_storage(tensor.storage_type()), - sampler.address_mode_x(), - sampler.address_mode_y(), - sampler.address_mode_z() }); + prototype::GpuSampler{sampler.format(), prototype::to_gpu_tensor_storage(tensor.storage_type()), + sampler.address_mode_x(), sampler.address_mode_y(), sampler.address_mode_z()}); auto impl_x = sampler.x().create_impl_operand(_impl.get()); auto impl_y = sampler.y().create_impl_operand(_impl.get()); @@ -194,12 +190,8 @@ void KernelWriter::util_get_indirect_buffer(TileOperand &tile, { prototype::TensorOperand impl_tensor( tensor.name(), - 
prototype::GpuSampler{ - sampler.format(), - prototype::to_gpu_tensor_storage(tensor.storage_type()), - sampler.address_mode_x(), - sampler.address_mode_y(), - sampler.address_mode_z() }); + prototype::GpuSampler{sampler.format(), prototype::to_gpu_tensor_storage(tensor.storage_type()), + sampler.address_mode_x(), sampler.address_mode_y(), sampler.address_mode_z()}); auto impl_x = x.create_impl_operand(_impl.get()); auto impl_y = y.create_impl_operand(_impl.get()); @@ -215,12 +207,8 @@ void KernelWriter::op_store(TensorOperand &tensor, const TileOperand &tile, cons { prototype::TensorOperand impl_tensor( tensor.name(), - prototype::GpuSampler{ - sampler.format(), - prototype::to_gpu_tensor_storage(tensor.storage_type()), - sampler.address_mode_x(), - sampler.address_mode_y(), - sampler.address_mode_z() }); + prototype::GpuSampler{sampler.format(), prototype::to_gpu_tensor_storage(tensor.storage_type()), + sampler.address_mode_x(), sampler.address_mode_y(), sampler.address_mode_z()}); auto impl_src = tile.create_impl_operand(_impl.get()); auto impl_x = sampler.x().create_impl_operand(_impl.get()); auto impl_y = sampler.y().create_impl_operand(_impl.get()); @@ -250,7 +238,10 @@ void KernelWriter::op_cast_expression(const TileOperand &dst, const TileOperand _impl->op_cast_expression(impl_dst, impl_src, policy); } -void KernelWriter::op_binary_expression(const TileOperand &dst, const TileOperand &lhs, BinaryOp op, const TileOperand &rhs) +void KernelWriter::op_binary_expression(const TileOperand &dst, + const TileOperand &lhs, + BinaryOp op, + const TileOperand &rhs) { auto impl_lhs = lhs.create_impl_operand(_impl.get()); auto impl_rhs = rhs.create_impl_operand(_impl.get()); @@ -275,7 +266,10 @@ void KernelWriter::op_unary_elementwise_function(const TileOperand &dst, UnaryFu _impl->op_unary_elementwise_function(impl_dst, opcode, impl_src); } -void KernelWriter::op_binary_elementwise_function(const TileOperand &dst, BinaryFunction opcode, const TileOperand &first, const TileOperand &second) +void KernelWriter::op_binary_elementwise_function(const TileOperand &dst, + BinaryFunction opcode, + const TileOperand &first, + const TileOperand &second) { auto impl_dst = dst.create_impl_operand(_impl.get()); auto impl_first = first.create_impl_operand(_impl.get()); @@ -284,7 +278,11 @@ void KernelWriter::op_binary_elementwise_function(const TileOperand &dst, Binary _impl->op_binary_elementwise_function(impl_dst, opcode, impl_first, impl_second); } -void KernelWriter::op_ternary_elementwise_function(const TileOperand &dst, TernaryFunction opcode, const TileOperand &first, const TileOperand &second, const TileOperand &third) +void KernelWriter::op_ternary_elementwise_function(const TileOperand &dst, + TernaryFunction opcode, + const TileOperand &first, + const TileOperand &second, + const TileOperand &third) { auto impl_dst = dst.create_impl_operand(_impl.get()); auto impl_first = first.create_impl_operand(_impl.get()); @@ -305,7 +303,10 @@ void KernelWriter::op_if(const TileOperand &lhs, BinaryOp op, const TileOperand _impl->compound_statement_end(); } -void KernelWriter::op_else_if(const TileOperand &lhs, BinaryOp op, const TileOperand &rhs, const std::function &body) +void KernelWriter::op_else_if(const TileOperand &lhs, + BinaryOp op, + const TileOperand &rhs, + const std::function &body) { auto impl_lhs = lhs.create_impl_operand(_impl.get()); auto impl_rhs = rhs.create_impl_operand(_impl.get()); @@ -324,14 +325,21 @@ void KernelWriter::op_else(const std::function &body) 
_impl->compound_statement_end(); } -void KernelWriter::op_for_loop(const TileOperand &var_name, BinaryOp cond_op, const TileOperand &cond_value_name, const TileOperand &update_var_name, AssignmentOp update_op, const TileOperand &update_value_name, const std::function &body) +void KernelWriter::op_for_loop(const TileOperand &var_name, + BinaryOp cond_op, + const TileOperand &cond_value_name, + const TileOperand &update_var_name, + AssignmentOp update_op, + const TileOperand &update_value_name, + const std::function &body) { auto impl_var_name = var_name.create_impl_operand(_impl.get()); auto impl_cond_value_name = cond_value_name.create_impl_operand(_impl.get()); auto impl_update_var_name = update_var_name.create_impl_operand(_impl.get()); auto impl_update_value_name = update_value_name.create_impl_operand(_impl.get()); - _impl->op_for_loop_header(impl_var_name, cond_op, impl_cond_value_name, impl_update_var_name, update_op, impl_update_value_name); + _impl->op_for_loop_header(impl_var_name, cond_op, impl_cond_value_name, impl_update_var_name, update_op, + impl_update_value_name); _impl->compound_statement_begin(); body(); _impl->compound_statement_end(); @@ -341,7 +349,7 @@ void KernelWriter::op_for_loop(const TileOperand &var_name, BinaryOp cond_op, co // Misc // ================================================================================================= -void KernelWriter::op_get_global_id(TileOperand &dst, int32_t dim) +void KernelWriter::op_get_global_id(const TileOperand &dst, int32_t dim) { _impl->op_get_global_id(prototype::Operand(dst.name()), dim); } diff --git a/compute_kernel_writer/prototype/src/OperandBase.cpp b/compute_kernel_writer/prototype/src/OperandBase.cpp index 59cf846cc70247b779ac0eaa02d07fdd77425eb0..e0617fdc06770560dbaa2e590be88eb4e0e43551 100644 --- a/compute_kernel_writer/prototype/src/OperandBase.cpp +++ b/compute_kernel_writer/prototype/src/OperandBase.cpp @@ -27,8 +27,7 @@ namespace ckw { -OperandBase::OperandBase(const std::string &name) - : _name(name) +OperandBase::OperandBase(const std::string &name) : _name(name) { } diff --git a/compute_kernel_writer/prototype/src/Prototype.h b/compute_kernel_writer/prototype/src/Prototype.h index 2b519471ac6d519d29d6f0ccf256dd7cd6f4bcc4..b392fe265128bd6d09fb4966cc64e8a6611b2178 100644 --- a/compute_kernel_writer/prototype/src/Prototype.h +++ b/compute_kernel_writer/prototype/src/Prototype.h @@ -25,12 +25,21 @@ #ifndef CKW_PROTOTYPE_SRC_PROTOTYPE_H #define CKW_PROTOTYPE_SRC_PROTOTYPE_H +#include "ckw/Error.h" +#include "ckw/TensorInfo.h" +#include "ckw/types/ConvertPolicy.h" +#include "ckw/types/DataType.h" +#include "ckw/types/Functions.h" +#include "ckw/types/GpuTargetLanguage.h" +#include "ckw/types/Operators.h" +#include "ckw/types/TensorSamplerTypes.h" + #include #include #include // assert (to be removed) #include #include -#include // int32_t +#include // int32_t #include #include // cout (to be removed) #include @@ -40,15 +49,6 @@ #include #include -#include "ckw/Error.h" -#include "ckw/TensorInfo.h" -#include "ckw/types/ConvertPolicy.h" -#include "ckw/types/DataType.h" -#include "ckw/types/Functions.h" -#include "ckw/types/GpuTargetLanguage.h" -#include "ckw/types/Operators.h" -#include "ckw/types/TensorSamplerTypes.h" - namespace ckw { namespace prototype @@ -83,21 +83,21 @@ enum class GpuExtensions struct TensorInfo { - TensorShape shape{ { 0 } }; - DataType data_type{ DataType::Unknown }; - TensorDataLayout data_layout{ TensorDataLayout::Nhwc }; - int32_t id{ -1 }; + TensorShape shape{{0}}; + DataType 
data_type{DataType::Unknown}; + TensorDataLayout data_layout{TensorDataLayout::Nhwc}; + int32_t id{-1}; }; struct ComponentAttribute { - GpuCompilationSpeed compilation_speed{ GpuCompilationSpeed::Fast }; - bool overwrite_tile{ true }; + GpuCompilationSpeed compilation_speed{GpuCompilationSpeed::Fast}; + bool overwrite_tile{true}; }; inline std::string data_type_to_cl_type(DataType dt) { - switch(dt) + switch (dt) { case DataType::Fp32: return "float"; @@ -125,7 +125,7 @@ inline std::string data_type_to_cl_type(DataType dt) inline int32_t width_to_cl_vector_size(int32_t width) { - switch(width) + switch (width) { case 1: return 1; @@ -160,7 +160,7 @@ inline std::string get_cl_data_type(DataType dt, int32_t width) std::string data_type; int32_t w = width_to_cl_vector_size(width); data_type += data_type_to_cl_type(dt); - if(w != 1) + if (w != 1) { data_type += std::to_string(w); } @@ -169,7 +169,7 @@ inline std::string get_cl_data_type(DataType dt, int32_t width) inline std::string to_opencl_store(int32_t vector_length) { - if(vector_length != 1) + if (vector_length != 1) { return "vstore" + std::to_string(vector_length) + "("; } @@ -185,24 +185,21 @@ struct TileInfo { } - TileInfo(DataType dt) - : dt(dt), w(1), h(1) + TileInfo(DataType dt) : dt(dt), w(1), h(1) { } - TileInfo(DataType dt, int32_t width) - : dt(dt), w(width), h(1) + TileInfo(DataType dt, int32_t width) : dt(dt), w(width), h(1) { } - TileInfo(DataType dt, int32_t width, int32_t height) - : dt(dt), w(width), h(height) + TileInfo(DataType dt, int32_t width, int32_t height) : dt(dt), w(width), h(height) { } - DataType dt{ DataType::Unknown }; // Data type of the tile - int32_t w{ 0 }; // Width (i.e. c0 - portion of the channels) - int32_t h{ 0 }; // Height (i.e. s0 - portion of the spatial dimensions) + DataType dt{DataType::Unknown}; // Data type of the tile + int32_t w{0}; // Width (i.e. c0 - portion of the channels) + int32_t h{0}; // Height (i.e. s0 - portion of the spatial dimensions) }; inline std::ostream &operator<<(std::ostream &o, const TileInfo &a) @@ -213,14 +210,14 @@ inline std::ostream &operator<<(std::ostream &o, const TileInfo &a) struct DataTypeAsString { - std::string str{ "" }; - DataType dt{ DataType::Unknown }; - int32_t size{ 1 }; + std::string str{""}; + DataType dt{DataType::Unknown}; + int32_t size{1}; }; struct ValueAsString { - std::string str{ "" }; + std::string str{""}; DataTypeAsString type{}; }; @@ -276,8 +273,8 @@ public: virtual bool need_declaration() const = 0; protected: - TileInfo _format{}; // Tile format - std::string _basename{ "" }; // Tile name + TileInfo _format{}; // Tile format + std::string _basename{""}; // Tile name }; // A tile is a collection of variables used to express a 2D data. The variables are vectors in the GPU context. 
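Two helpers reformatted in this hunk show how OpenCL type and store names are assembled: get_cl_data_type() appends the vector width to the scalar type only when the width is not 1, and to_opencl_store() selects the matching vstoreN call. A simplified, stand-alone copy of that logic (taking the scalar type as a string rather than a DataType):

#include <cstdint>
#include <iostream>
#include <string>

// "float" with width 4 becomes "float4"; width 1 stays "float".
std::string get_cl_data_type(const std::string &scalar_type, int32_t width)
{
    std::string data_type = scalar_type;
    if (width != 1)
    {
        data_type += std::to_string(width);
    }
    return data_type;
}

// Stores wider than one element go through the vstoreN built-ins.
std::string to_opencl_store(int32_t vector_length)
{
    if (vector_length != 1)
    {
        return "vstore" + std::to_string(vector_length) + "(";
    }
    return "";
}

int main()
{
    std::cout << get_cl_data_type("float", 4) << '\n'; // float4
    std::cout << to_opencl_store(4) << '\n';           // vstore4(
}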
@@ -329,7 +326,7 @@ public: t.type.size = 1; // Check required because if the width has only one element, we cannot use .s0 - if(_format.w != 1) + if (_format.w != 1) { // Automatic broadcasting t.str += ".s" + std::to_string(x); @@ -360,10 +357,10 @@ public: t.type.dt = _format.dt; t.type.size = width; - if(_format.w != 1) + if (_format.w != 1) { t.str += ".s"; - for(int i = 0; i < width; ++i) + for (int i = 0; i < width; ++i) { t.str += to_scalar_hex(x_start + i); } @@ -374,7 +371,7 @@ public: std::vector underlying_source_variables() const override { std::vector vars; - for(int32_t y = 0; y < _format.h; ++y) + for (int32_t y = 0; y < _format.h; ++y) { ValueAsString t; t.str = build_variable_name(y); @@ -401,7 +398,7 @@ private: { std::string var_name = _basename; - if(_format.h == 1) + if (_format.h == 1) { return var_name; } @@ -416,7 +413,7 @@ private: std::string to_scalar_hex(int32_t x) const { - switch(x) + switch (x) { case 0: case 1: @@ -461,9 +458,9 @@ public: _data = std::vector>(_format.h, std::vector(_format.w)); - for(int32_t y = 0; y < _format.h; ++y) + for (int32_t y = 0; y < _format.h; ++y) { - for(int32_t x = 0; x < _format.w; ++x) + for (int32_t x = 0; x < _format.w; ++x) { _data[y][x] = in[y][x]; } @@ -501,20 +498,20 @@ public: t.type.dt = _format.dt; t.type.size = width; - if(width > 1) + if (width > 1) { t.str += "((" + get_cl_data_type(_format.dt, width) + ")("; } int32_t x = x_start; - for(; x < width - 1; ++x) + for (; x < width - 1; ++x) { t.str += scalar(x, y).str; t.str += ", "; } t.str += scalar(x, y).str; - if(width > 1) + if (width > 1) { t.str += "))"; } @@ -526,9 +523,9 @@ public: { std::vector vars; - for(int32_t y = 0; y < _format.h; ++y) + for (int32_t y = 0; y < _format.h; ++y) { - for(int32_t x = 0; x < _format.w; ++x) + for (int32_t x = 0; x < _format.w; ++x) { ValueAsString t; t.str = _data[y][x]; @@ -572,7 +569,7 @@ enum class TensorComponentGroup : int32_t inline std::string to_string(TensorComponentType x) { - switch(x) + switch (x) { case TensorComponentType::Unknown: return "Unknown"; @@ -672,7 +669,7 @@ enum class GpuTensorStorage : int32_t inline GpuTensorStorage to_gpu_tensor_storage(TensorStorageType s) { - switch(s) + switch (s) { case TensorStorageType::Unknown: return GpuTensorStorage::Unknown; @@ -694,7 +691,7 @@ inline GpuTensorStorage to_gpu_tensor_storage(TensorStorageType s) inline TensorStorageType to_tensor_storage(GpuTensorStorage s) { - switch(s) + switch (s) { case GpuTensorStorage::Unknown: return TensorStorageType::Unknown; @@ -755,23 +752,23 @@ public: // Methods to override std::string component(TensorComponentType x) override { - if((static_cast(x) & static_cast(TensorComponentGroup::Constant))) + if ((static_cast(x) & static_cast(TensorComponentGroup::Constant))) { int32_t idx = static_cast(x) & static_cast(TensorComponentIndex::IndexMask); return std::to_string(idx - 1); } - if(_return_by_value_when_possible) + if (_return_by_value_when_possible) { - if((static_cast(x) & static_cast(TensorComponentGroup::Dimension))) + if ((static_cast(x) & static_cast(TensorComponentGroup::Dimension))) { int32_t idx = static_cast(x) & static_cast(TensorComponentIndex::IndexMask); return std::to_string(_format.shape[idx]); } - if((static_cast(x) & static_cast(TensorComponentGroup::FoldedDimension))) + if ((static_cast(x) & static_cast(TensorComponentGroup::FoldedDimension))) { - switch(x) + switch (x) { case TensorComponentType::Dim1xDim2: return std::to_string(_format.shape[1] * _format.shape[2]); @@ -784,7 +781,7 @@ public: } } - 
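The scalar-access hunk above encodes one OpenCL detail: a tile row is a vector variable, and an individual element is addressed with an .sN suffix only when the row is wider than one element, since a plain scalar has no .s0 selector. A simplified sketch of that rule (the real code additionally maps indices of 10 and above to hexadecimal digits via to_scalar_hex):

#include <cstdint>
#include <iostream>
#include <string>

// Build the OpenCL expression that reads element x of a tile row.
std::string scalar_access(const std::string &row_variable, int32_t row_width, int32_t x)
{
    std::string str = row_variable;
    if (row_width != 1)
    {
        str += ".s" + std::to_string(x); // component selector only valid on vector types
    }
    return str;
}

int main()
{
    std::cout << scalar_access("tile_0", 4, 2) << '\n'; // tile_0.s2
    std::cout << scalar_access("tile_0", 1, 0) << '\n'; // tile_0
}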
if(std::find(_components_required.begin(), _components_required.end(), x) == _components_required.end()) + if (std::find(_components_required.begin(), _components_required.end(), x) == _components_required.end()) { _components_required.push_back(x); } @@ -804,7 +801,7 @@ public: std::string storage(GpuTensorStorage x) override { - if(std::find(_storage_required.begin(), _storage_required.end(), x) == _storage_required.end()) + if (std::find(_storage_required.begin(), _storage_required.end(), x) == _storage_required.end()) { _storage_required.push_back(x); } @@ -814,7 +811,7 @@ public: std::string storage_type_declaration(GpuTensorStorage x) const override { - switch(x) + switch (x) { case GpuTensorStorage::BufferUint8Ptr: return "__global uchar*"; @@ -848,7 +845,7 @@ private: { std::string var_name = _basename; - switch(x) + switch (x) { case GpuTensorStorage::BufferUint8Ptr: return var_name + "_ptr"; @@ -870,7 +867,7 @@ private: { std::string var_name = _basename; - switch(x) + switch (x) { case TensorComponentType::OffsetFirstElement: return var_name + "_offset_first_element"; @@ -900,9 +897,9 @@ private: return var_name; } - bool _return_by_value_when_possible{ false }; - std::vector _storage_required{}; - std::vector _components_required{}; + bool _return_by_value_when_possible{false}; + std::vector _storage_required{}; + std::vector _components_required{}; }; /** @@ -930,16 +927,16 @@ public: struct RegistryTileTableEntry { - RegistryLevel registry_level{ 0 }; - std::unique_ptr tile_object{ nullptr }; + RegistryLevel registry_level{0}; + std::unique_ptr tile_object{nullptr}; }; struct RegistryTileTypeTableEntry { - RegistryTileType tile_type{ RegistryTileType::Tile }; + RegistryTileType tile_type{RegistryTileType::Tile}; RegistryTileName tile_name{}; - RegistryIdSpace registry_idspace{ 0 }; - RegistryLevel registry_level{ 0 }; + RegistryIdSpace registry_idspace{0}; + RegistryLevel registry_level{0}; }; using RegistryTileTable = std::map>; @@ -1002,7 +999,7 @@ public: auto it = _frags.begin(); - while(it != _frags.end()) + while (it != _frags.end()) { x.push_back(it->first); @@ -1026,7 +1023,7 @@ public: // First check whether a tile with the same name exists IVectorTile *result = (*this)[key_var_name]; assert(result == nullptr); - if(result == nullptr) + if (result == nullptr) { std::unique_ptr tile = std::make_unique(var_name, format); @@ -1058,7 +1055,7 @@ public: // First check whether a tile with the same name exists IVectorTile *result = (*this)[key_var_name]; assert(result == nullptr); - if(result == nullptr) + if (result == nullptr) { std::unique_ptr tile = std::make_unique(var_name, format); _frags[key_IdSpace][key_var_name].tile_object = std::move(tile); @@ -1090,7 +1087,7 @@ public: // First check whether a tile with the same name exists IVectorTile *result = (*this)[key_var_name]; assert(result == nullptr); - if(result == nullptr) + if (result == nullptr) { std::unique_ptr tile = std::make_unique(in, dt); _frags[key_IdSpace][key_var_name].tile_object = std::move(tile); @@ -1123,7 +1120,7 @@ public: // First check whether a tile with the same name exists IVectorTile *result = (*this)[key_var_name]; assert(result == nullptr); - if(result == nullptr) + if (result == nullptr) { std::unique_ptr tile = std::make_unique(in, dt); _frags[key_IdSpace][key_var_name].tile_object = std::move(tile); @@ -1153,10 +1150,10 @@ public: IVectorTile *result = nullptr; auto search_IdSpace = _frags.find(key_IdSpace); - if(search_IdSpace != _frags.end()) + if (search_IdSpace != _frags.end()) { 
auto search_tile = _frags[key_IdSpace].find(key_var_name); - if(search_tile != _frags[key_IdSpace].end()) + if (search_tile != _frags[key_IdSpace].end()) { result = search_tile->second.tile_object.get(); assert(result != nullptr); @@ -1224,7 +1221,7 @@ public: std::map::iterator it = _frag_types[IdSpace].begin(); - while(it != _frag_types[IdSpace].end()) + while (it != _frag_types[IdSpace].end()) { // The following line should be enabled. However, we cannot at this stage // because it used to retrieve the output tile produced by each component. @@ -1259,9 +1256,9 @@ public: // Remove all variables in the local scope std::map::iterator it = _frags[_IdSpace].begin(); - while(it != _frags[_IdSpace].end()) + while (it != _frags[_IdSpace].end()) { - if(it->second.registry_level == _registry_level) + if (it->second.registry_level == _registry_level) { it = _frags[_IdSpace].erase(it); } @@ -1273,9 +1270,9 @@ public: std::map::iterator it_type = _frag_types[_IdSpace].begin(); - while(it_type != _frag_types[_IdSpace].end()) + while (it_type != _frag_types[_IdSpace].end()) { - if(it_type->second.registry_level == _registry_level) + if (it_type->second.registry_level == _registry_level) { it_type = _frag_types[_IdSpace].erase(it_type); } @@ -1302,7 +1299,7 @@ private: std::string generate_tile_name(const std::string &name) { assert(_IdSpace >= 0); - if(_registry_level == 0) + if (_registry_level == 0) { return "_G" + std::to_string(_IdSpace) + "_" + name; } @@ -1314,10 +1311,10 @@ private: RegistryTileTable _frags{}; RegistryTileTypeTable _frag_types{}; - RegistryLevel _registry_level{ 0 }; - RegistryIdSpace _IdSpace{ -1 }; - int32_t _anonymous_frag_count{ 0 }; // Counter used to create the anonymous tiles - GpuTargetLanguage _language{ GpuTargetLanguage::Unknown }; // Gpu programming language + RegistryLevel _registry_level{0}; + RegistryIdSpace _IdSpace{-1}; + int32_t _anonymous_frag_count{0}; // Counter used to create the anonymous tiles + GpuTargetLanguage _language{GpuTargetLanguage::Unknown}; // Gpu programming language }; using TensorEntry = std::unique_ptr; @@ -1388,7 +1385,7 @@ public: auto it = _refs.begin(); - while(it != _refs.end()) + while (it != _refs.end()) { x.push_back(it->first); @@ -1420,12 +1417,12 @@ public: // Check whether a tensor with that tensorID exists auto result = _tensor_arguments.find(tensor_id); - if(result == _tensor_arguments.end()) + if (result == _tensor_arguments.end()) { // It means that we haven't added a tensor with that tensor_id yet. 
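The registry lookups reformatted here follow a two-level map: tiles are stored per id-space and then by name, and a miss at either level returns nullptr rather than creating an entry. A small stand-alone sketch of that lookup with a simplified value type:

#include <cstdint>
#include <iostream>
#include <map>
#include <memory>
#include <string>

struct Tile
{
    std::string name;
};

// Outer key: id-space; inner key: tile name.
using Registry = std::map<int32_t, std::map<std::string, std::unique_ptr<Tile>>>;

Tile *lookup(Registry &frags, int32_t id_space, const std::string &name)
{
    auto search_id_space = frags.find(id_space);
    if (search_id_space != frags.end())
    {
        auto search_tile = search_id_space->second.find(name);
        if (search_tile != search_id_space->second.end())
        {
            return search_tile->second.get();
        }
    }
    return nullptr; // not registered; caller decides whether to create it
}

int main()
{
    Registry frags;
    frags[0]["acc"] = std::make_unique<Tile>(Tile{"acc"});
    std::cout << (lookup(frags, 0, "acc") != nullptr) << '\n'; // 1
    std::cout << (lookup(frags, 1, "acc") != nullptr) << '\n'; // 0
}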
Create a IGpuTensorArgument before creating the reference - std::unique_ptr arg = std::make_unique(var_name, x, - return_by_value_when_possible); - _tensor_arguments[tensor_id] = std::move(arg); + std::unique_ptr arg = + std::make_unique(var_name, x, return_by_value_when_possible); + _tensor_arguments[tensor_id] = std::move(arg); } _refs[key_IdSpace][key_var_name] = tensor_id; @@ -1445,15 +1442,15 @@ public: IGpuTensorArgument *result = nullptr; auto search_IdSpace = _refs.find(key_IdSpace); - if(search_IdSpace != _refs.end()) + if (search_IdSpace != _refs.end()) { auto search_tensor_id = _refs[key_IdSpace].find(key_var_name); - if(search_tensor_id != _refs[key_IdSpace].end()) + if (search_tensor_id != _refs[key_IdSpace].end()) { const int32_t tensor_id = search_tensor_id->second; auto search_tensor_argument = _tensor_arguments.find(tensor_id); - if(search_tensor_argument != _tensor_arguments.end()) + if (search_tensor_argument != _tensor_arguments.end()) { result = search_tensor_argument->second.get(); } @@ -1475,7 +1472,7 @@ public: auto it = _tensor_arguments.begin(); - while(it != _tensor_arguments.end()) + while (it != _tensor_arguments.end()) { args.push_back(it->second.get()); it++; @@ -1499,7 +1496,7 @@ public: auto search_IdSpace = _refs.find(key_IdSpace); - if(search_IdSpace != _refs.end()) + if (search_IdSpace != _refs.end()) { auto search_tensor_id = _refs[key_IdSpace].find(key_var_name); @@ -1527,7 +1524,7 @@ public: auto search_IdSpace = _refs.find(key_IdSpace); - if(search_IdSpace != _refs.end()) + if (search_IdSpace != _refs.end()) { auto search_tensor_id = _refs[key_IdSpace].find(key_var_name); @@ -1550,8 +1547,8 @@ private: std::map _tensor_arguments{}; std::map> _refs{}; - int32_t _IdSpace{ -1 }; - GpuTargetLanguage _language{ GpuTargetLanguage::Unknown }; // Gpu programming language + int32_t _IdSpace{-1}; + GpuTargetLanguage _language{GpuTargetLanguage::Unknown}; // Gpu programming language }; enum class OpType : int32_t @@ -1563,7 +1560,7 @@ enum class OpType : int32_t inline std::string to_string(AssignmentOp op) { - switch(op) + switch (op) { case AssignmentOp::Decrement: return "-="; @@ -1577,12 +1574,14 @@ inline std::string to_string(AssignmentOp op) inline std::string to_string(UnaryOp op) { - switch(op) + switch (op) { case UnaryOp::LogicalNot: return "!"; case UnaryOp::BitwiseNot: return "~"; + case UnaryOp::Negate: + return "-"; default: assert(false); return ""; @@ -1591,7 +1590,7 @@ inline std::string to_string(UnaryOp op) inline std::string to_string(BinaryOp op) { - switch(op) + switch (op) { case BinaryOp::Add: return "+"; @@ -1627,7 +1626,7 @@ inline std::string to_string(BinaryOp op) inline std::string binary_op_string(BinaryOp op) { - switch(op) + switch (op) { case BinaryOp::Add: return "add"; @@ -1696,13 +1695,12 @@ struct ScalarTileCoord { } - ScalarTileCoord(int32_t x0, int32_t y0) - : x(x0), y(y0) + ScalarTileCoord(int32_t x0, int32_t y0) : x(x0), y(y0) { } - int32_t x{ -1 }; - int32_t y{ -1 }; + int32_t x{-1}; + int32_t y{-1}; }; /** @@ -1766,7 +1764,7 @@ public: private: std::string _str{}; - OperandType _type{ OperandType::Unknown }; + OperandType _type{OperandType::Unknown}; ScalarTileCoord _coord{}; }; @@ -1776,16 +1774,15 @@ struct GpuSampler { GpuSampler() = default; - TensorSamplerFormat format{ TensorSamplerFormat::Unknown }; - GpuSamplerTensorStorage storage{ GpuSamplerTensorStorage::Unknown }; - TensorSamplerAddressModeX address_mode_x{ TensorSamplerAddressModeX::Unknown }; - TensorSamplerAddressModeY address_mode_y{ 
TensorSamplerAddressModeY::Unknown }; - TensorSamplerAddressModeZ address_mode_z{ TensorSamplerAddressModeZ::Unknown }; + TensorSamplerFormat format{TensorSamplerFormat::Unknown}; + GpuSamplerTensorStorage storage{GpuSamplerTensorStorage::Unknown}; + TensorSamplerAddressModeX address_mode_x{TensorSamplerAddressModeX::Unknown}; + TensorSamplerAddressModeY address_mode_y{TensorSamplerAddressModeY::Unknown}; + TensorSamplerAddressModeZ address_mode_z{TensorSamplerAddressModeZ::Unknown}; }; -inline GpuSampler -create_simple_sampler(const TensorInfo *tensor_info_id, GpuSampler sampler, int32_t step_x, int32_t step_y, - int32_t step_z) +inline GpuSampler create_simple_sampler( + const TensorInfo *tensor_info_id, GpuSampler sampler, int32_t step_x, int32_t step_y, int32_t step_z) { CKW_UNUSED(step_x, step_y, step_z); @@ -1802,7 +1799,7 @@ create_simple_sampler(const TensorInfo *tensor_info_id, GpuSampler sampler, int3 int32_t dim_y = 0; int32_t dim_z = 0; - switch(sampler.format) + switch (sampler.format) { case TensorSamplerFormat::C_W_H: dim_x = tensor[0]; @@ -1820,19 +1817,19 @@ create_simple_sampler(const TensorInfo *tensor_info_id, GpuSampler sampler, int3 break; } - if(dim_x == 1) + if (dim_x == 1) { assert(step_x == 1); dst_sampler.address_mode_x = TensorSamplerAddressModeX::None; } - if(dim_y == 1) + if (dim_y == 1) { assert(step_y == 1); dst_sampler.address_mode_y = TensorSamplerAddressModeY::None; } - if(dim_z == 1) + if (dim_z == 1) { assert(step_z == 1); dst_sampler.address_mode_z = TensorSamplerAddressModeZ::None; @@ -1856,8 +1853,12 @@ public: * @param[in] step_y Increment step in the Y direction. Not necessarily it is the same of m0 of tile! * @param[in] step_z Increment step in the Z direction. Not necessarily it is the same of d0 of tile! 
*/ - void initialize(const TensorInfo *tensor_info_id, GpuSamplerTensorStorage tensor_storage, - TensorSamplerFormat tensor_format, int32_t step_x, int32_t step_y, int32_t step_z) + void initialize(const TensorInfo *tensor_info_id, + GpuSamplerTensorStorage tensor_storage, + TensorSamplerFormat tensor_format, + int32_t step_x, + int32_t step_y, + int32_t step_z) { assert(_is_initialized == false); @@ -1906,13 +1907,13 @@ private: sampler.address_mode_z = TensorSamplerAddressModeZ::None; // In the case of texture, we do not need any special checks at the border - if(tensor_storage == GpuSamplerTensorStorage::BufferUint8Ptr) + if (tensor_storage == GpuSamplerTensorStorage::BufferUint8Ptr) { int32_t dim_x = 0; int32_t dim_y = 0; int32_t dim_z = 0; - switch(tensor_format) + switch (tensor_format) { case TensorSamplerFormat::C_W_H: dim_x = tensor[0]; @@ -1930,17 +1931,17 @@ private: break; } - if((dim_x % _step_x) != 0 && dim_x != 1) + if ((dim_x % _step_x) != 0 && dim_x != 1) { sampler.address_mode_x = TensorSamplerAddressModeX::OverlappingMin; } - if((dim_y % _step_y) != 0 && dim_y != 1) + if ((dim_y % _step_y) != 0 && dim_y != 1) { sampler.address_mode_y = TensorSamplerAddressModeY::ClampToMaxEdgeOnly; } - if((dim_z % _step_z) != 0 && dim_z != 1) + if ((dim_z % _step_z) != 0 && dim_z != 1) { sampler.address_mode_z = TensorSamplerAddressModeZ::ClampToMaxEdgeOnly; } @@ -1950,11 +1951,11 @@ private: } GpuSampler _sampler{}; // GpuSampler - int32_t _step_x{ 1 }; - int32_t _step_y{ 1 }; - int32_t _step_z{ 1 }; - const TensorInfo *_tensor_info_id{ nullptr }; - bool _is_initialized{ false }; + int32_t _step_x{1}; + int32_t _step_y{1}; + int32_t _step_z{1}; + const TensorInfo *_tensor_info_id{nullptr}; + bool _is_initialized{false}; }; /** @@ -1963,8 +1964,7 @@ private: class TensorOperand { public: - TensorOperand(const std::string &val, GpuSampler sampler) - : _str(val), _sampler(sampler) + TensorOperand(const std::string &val, GpuSampler sampler) : _str(val), _sampler(sampler) { } @@ -2048,9 +2048,9 @@ private: struct LWS { - int32_t x{ 1 }; - int32_t y{ 1 }; - int32_t z{ 1 }; + int32_t x{1}; + int32_t y{1}; + int32_t z{1}; }; /** @@ -2060,8 +2060,7 @@ struct LWS class OperandUnpacker { public: - OperandUnpacker(GpuTileRegistry &tiles, GpuTensorArgumentRegistry &arguments) - : _tiles(tiles), _arguments(arguments) + OperandUnpacker(GpuTileRegistry &tiles, GpuTensorArgumentRegistry &arguments) : _tiles(tiles), _arguments(arguments) { // Increase the level of the stack to allocate possible temporary tiles _tiles.increment_registry_level(); @@ -2076,26 +2075,26 @@ public: IVectorTile *unpack(const Operand &src) { // Get the tile - if(src.type() == OperandType::Tile) + if (src.type() == OperandType::Tile) { assert(_tiles.has_tile(src.value())); return _tiles[src.value()]; } // Create an anonymous tile with a constant - else if(static_cast(src.type()) & 0x00001000) + else if (static_cast(src.type()) & 0x00001000) { - if(src.type() == OperandType::ScalarTile) + if (src.type() == OperandType::ScalarTile) { ScalarTileCoord coord = src.scalar_tile_coordinate(); assert(_tiles.has_tile(src.value())); assert(coord.x >= 0); assert(coord.y >= 0); auto val = _tiles[src.value()]->scalar(coord.x, coord.y); - return _tiles.insert({ { { val.str } } }, val.type.dt); + return _tiles.insert({{{val.str}}}, val.type.dt); } else { - return _tiles.insert({ { { src.value() } } }, to_tile_data_type(src.type())); + return _tiles.insert({{{src.value()}}}, to_tile_data_type(src.type())); } } // Create an anonymous tile with the 
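The sampler hunks above keep the boundary-handling rule intact: address modes default to None, and a dimension only needs special treatment when it is neither 1 nor an exact multiple of the processing step, in which case X falls back to OverlappingMin while Y and Z clamp at the max edge. A stand-alone sketch of the X-dimension decision, mirroring the condition in the diff:

#include <cstdint>
#include <iostream>

enum class AddressModeX
{
    None,
    OverlappingMin
};

AddressModeX pick_address_mode_x(int32_t dim_x, int32_t step_x)
{
    if ((dim_x % step_x) != 0 && dim_x != 1)
    {
        return AddressModeX::OverlappingMin; // leftover elements handled by overlapping loads
    }
    return AddressModeX::None; // exact multiple (or size 1): no out-of-bounds handling needed
}

int main()
{
    std::cout << (pick_address_mode_x(17, 4) == AddressModeX::OverlappingMin) << '\n'; // 1: 17 % 4 != 0
    std::cout << (pick_address_mode_x(16, 4) == AddressModeX::None) << '\n';           // 1: exact multiple
}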
tensor component @@ -2105,7 +2104,7 @@ public: auto x = _arguments[src.value()]; const std::string val = x->component(to_tensor_component(src.type())); const DataType dt = x->component_data_type(); - return _tiles.insert({ { { val } } }, dt); + return _tiles.insert({{{val}}}, dt); } } @@ -2117,7 +2116,7 @@ private: TensorComponentType to_tensor_component(OperandType x) { - switch(x) + switch (x) { case OperandType::TensorDim0: return TensorComponentType::Dim0; @@ -2161,8 +2160,7 @@ private: class TensorOperandUnpacker { public: - TensorOperandUnpacker(GpuTensorArgumentRegistry &arguments) - : _arguments(arguments){}; + TensorOperandUnpacker(GpuTensorArgumentRegistry &arguments) : _arguments(arguments){}; IGpuTensorArgument *unpack(const TensorOperand &src) { @@ -2189,9 +2187,11 @@ struct GpuKernel std::string config_id{}; // Unique id, required for the tuning stage std::vector list_lws{}; // LWS to test, required for the tuning stage // Dispatch stage - GpuOutputSampler output_sampler{}; // GpuOutputSampler, required for the dispatch stage - std::vector> list_tensor_storages; // List of tensor storages, required for the dispatch stage - std::vector> list_tensor_components; // List of tensor components (width, stride,..), required for the dispatch stage) + GpuOutputSampler output_sampler{}; // GpuOutputSampler, required for the dispatch stage + std::vector> + list_tensor_storages; // List of tensor storages, required for the dispatch stage + std::vector> + list_tensor_components; // List of tensor components (width, stride,..), required for the dispatch stage) }; // Generate all extension pragmas (hardcoded for now) @@ -2232,13 +2232,13 @@ inline std::string generate_code(GpuKernelWriterDataHolder &in, const std::strin auto tensor_args = in.arguments.tensor_argument_declarations(); - for(auto &i : tensor_args) + for (auto &i : tensor_args) { // For each tensor used, get the storage and tensor components auto storages = i->storage_declarations(); auto components = i->component_declarations(); - for(auto &y : storages) + for (auto &y : storages) { std::string str; str += i->storage_type_declaration(y); @@ -2247,7 +2247,7 @@ inline std::string generate_code(GpuKernelWriterDataHolder &in, const std::strin arg_str.push_back(str); } - for(auto &y : components) + for (auto &y : components) { std::string str; str += i->component_type_declaration(); @@ -2257,10 +2257,10 @@ inline std::string generate_code(GpuKernelWriterDataHolder &in, const std::strin } } - for(size_t i = 0; i < arg_str.size(); ++i) + for (size_t i = 0; i < arg_str.size(); ++i) { code += arg_str[i]; - if(i + 1 < arg_str.size()) + if (i + 1 < arg_str.size()) { code += ",\n"; } @@ -2282,13 +2282,12 @@ inline std::string generate_code(GpuKernelWriterDataHolder &in, const std::strin class GpuTensor3dMapper { public: - GpuTensor3dMapper(IGpuTensorArgument *tensor, GpuSampler sampler) - : _sampler(sampler), _tensor(tensor){}; + GpuTensor3dMapper(IGpuTensorArgument *tensor, GpuSampler sampler) : _sampler(sampler), _tensor(tensor){}; std::string tensor_component_x() const { const auto format = _sampler.format; - switch(format) + switch (format) { case TensorSamplerFormat::C_WH_1: case TensorSamplerFormat::C_W_H: @@ -2303,7 +2302,7 @@ public: std::string tensor_component_y() const { const auto format = _sampler.format; - switch(format) + switch (format) { case TensorSamplerFormat::C_WH_1: return _tensor->component(TensorComponentType::Dim1xDim2); @@ -2319,7 +2318,7 @@ public: std::string tensor_component_z() const { const auto format = 
_sampler.format; - switch(format) + switch (format) { case TensorSamplerFormat::C_WH_1: return "1"; @@ -2335,7 +2334,7 @@ public: std::string tensor_component_stride_y() const { const auto format = _sampler.format; - switch(format) + switch (format) { case TensorSamplerFormat::C_WH_1: case TensorSamplerFormat::C_W_H: @@ -2350,7 +2349,7 @@ public: std::string tensor_component_stride_z() const { const auto format = _sampler.format; - switch(format) + switch (format) { case TensorSamplerFormat::C_WH_1: return "0"; @@ -2366,7 +2365,7 @@ public: std::string tensor_component_stride_batch() const { const auto format = _sampler.format; - switch(format) + switch (format) { case TensorSamplerFormat::C_WH_1: case TensorSamplerFormat::C_W_H: @@ -2382,7 +2381,7 @@ public: { auto t = _tensor->format(); const auto format = _sampler.format; - switch(format) + switch (format) { case TensorSamplerFormat::C_WH_1: case TensorSamplerFormat::C_W_H: @@ -2398,7 +2397,7 @@ public: { auto t = _tensor->format(); const auto format = _sampler.format; - switch(format) + switch (format) { case TensorSamplerFormat::C_WH_1: return (t.shape[1] * t.shape[2]) == 1; @@ -2415,7 +2414,7 @@ public: { auto t = _tensor->format(); const auto format = _sampler.format; - switch(format) + switch (format) { case TensorSamplerFormat::C_WH_1: return true; @@ -2432,7 +2431,7 @@ public: { auto t = _tensor->format(); const auto format = _sampler.format; - switch(format) + switch (format) { case TensorSamplerFormat::C_WH_1: case TensorSamplerFormat::C_W_H: @@ -2461,7 +2460,7 @@ private: struct GpuKernelWriterAttribute { - bool return_tensor_component_by_value{ false }; + bool return_tensor_component_by_value{false}; }; enum class RoundingMode @@ -2487,7 +2486,8 @@ public: virtual void declare_tile(const std::string &name, const TileInfo &info) = 0; - virtual void declare_const_tile(const std::string &name, const std::vector> &in, DataType dt) = 0; + virtual void + declare_const_tile(const std::string &name, const std::vector> &in, DataType dt) = 0; virtual void write_text(const std::string &x) = 0; @@ -2496,48 +2496,82 @@ public: virtual void compound_statement_end() = 0; // Operations - virtual void op_get_global_id(const Operand &dst_var, int32_t dim) = 0; + virtual void op_get_global_id(const Operand &dst_var, int32_t dim) = 0; - virtual void op_get_global_coord(const Operand &dst, const Operand &step, const TensorOperand &tensor, int32_t dim) = 0; + virtual void + op_get_global_coord(const Operand &dst, const Operand &step, const TensorOperand &tensor, int32_t dim) = 0; - virtual void op_get_global_batch(const Operand &dst, const TensorOperand &tensor) = 0; + virtual void op_get_global_batch(const Operand &dst, const TensorOperand &tensor) = 0; - virtual void op_get_global_size(const Operand &dst_var, int32_t dim) = 0; + virtual void op_get_global_size(const Operand &dst_var, int32_t dim) = 0; - virtual void op_unary_expression(const Operand &dst, UnaryOp op, const Operand &src) = 0; + virtual void op_unary_expression(const Operand &dst, UnaryOp op, const Operand &src) = 0; - virtual void op_binary_expression(const Operand &dst, const Operand &lhs, BinaryOp op, const Operand &rhs) = 0; + virtual void op_binary_expression(const Operand &dst, const Operand &lhs, BinaryOp op, const Operand &rhs) = 0; - virtual void op_assign(const Operand &dst_name, const Operand &src_name) = 0; + virtual void op_assign(const Operand &dst_name, const Operand &src_name) = 0; - virtual void op_unary_elementwise_function(const Operand &dst_name, UnaryFunction 
func, const Operand &src_name) = 0; + virtual void + op_unary_elementwise_function(const Operand &dst_name, UnaryFunction func, const Operand &src_name) = 0; - virtual void op_binary_elementwise_function(const Operand &dst_name, BinaryFunction func, const Operand &first_name, const Operand &second_name) = 0; + virtual void op_binary_elementwise_function(const Operand &dst_name, + BinaryFunction func, + const Operand &first_name, + const Operand &second_name) = 0; - virtual void op_ternary_elementwise_function(const Operand &dst_name, TernaryFunction func, const Operand &first_name, const Operand &second_name, const Operand &third_name) = 0; + virtual void op_ternary_elementwise_function(const Operand &dst_name, + TernaryFunction func, + const Operand &first_name, + const Operand &second_name, + const Operand &third_name) = 0; - virtual void op_if_header(const Operand &lhs, BinaryOp op, const Operand &rhs) = 0; + virtual void op_if_header(const Operand &lhs, BinaryOp op, const Operand &rhs) = 0; - virtual void op_else_if_header(const Operand &lhs, BinaryOp op, const Operand &rhs) = 0; + virtual void op_else_if_header(const Operand &lhs, BinaryOp op, const Operand &rhs) = 0; - virtual void op_else_header() = 0; + virtual void op_else_header() = 0; - virtual void op_for_loop_header(const Operand &var_name, BinaryOp cond_op, const Operand &cond_value, const Operand &update_var, AssignmentOp update_op, const Operand &update_value) = 0; + virtual void op_for_loop_header(const Operand &var_name, + BinaryOp cond_op, + const Operand &cond_value, + const Operand &update_var, + AssignmentOp update_op, + const Operand &update_value) = 0; - virtual void op_load_indirect(const TensorOperand &tensor, const Operand &dst, const Operand &x, const Operand &y_indirect, const Operand &z, const Operand &b = Operand("0", OperandType::ScalarInt32)) = 0; + virtual void op_load_indirect(const TensorOperand &tensor, + const Operand &dst, + const Operand &x, + const Operand &y_indirect, + const Operand &z, + const Operand &b = Operand("0", OperandType::ScalarInt32)) = 0; - virtual void op_load_immediate(const TensorOperand &tensor, const Operand &dst, const Operand &x, const Operand &y, const Operand &z, const Operand &b = Operand("0", OperandType::ScalarInt32), const Operand &dilation_y = Operand("1", OperandType::ScalarInt32)) = 0; + virtual void op_load_immediate(const TensorOperand &tensor, + const Operand &dst, + const Operand &x, + const Operand &y, + const Operand &z, + const Operand &b = Operand("0", OperandType::ScalarInt32), + const Operand &dilation_y = Operand("1", OperandType::ScalarInt32)) = 0; - virtual void op_store_immediate(const TensorOperand &tensor, const Operand &src, const Operand &x, const Operand &y, const Operand &z, const Operand &b = Operand("0", OperandType::ScalarInt32)) = 0; + virtual void op_store_immediate(const TensorOperand &tensor, + const Operand &src, + const Operand &x, + const Operand &y, + const Operand &z, + const Operand &b = Operand("0", OperandType::ScalarInt32)) = 0; - virtual void op_cast_expression(const Operand &dst, const Operand &src, ConvertPolicy policy) = 0; + virtual void op_cast_expression(const Operand &dst, const Operand &src, ConvertPolicy policy) = 0; - virtual void op_return() = 0; + virtual void op_return() = 0; // Utils // It is the process of converting - virtual void util_get_indirect_buffer(const Operand &dst, const TensorOperand &tensor, const Operand &x, - const Operand &y, const Operand &x_off, const Operand &y_off) = 0; + virtual void 
util_get_indirect_buffer(const Operand &dst, + const TensorOperand &tensor, + const Operand &x, + const Operand &y, + const Operand &x_off, + const Operand &y_off) = 0; }; enum class GpuLoadStoreType @@ -2584,12 +2618,11 @@ public: ClLoadStoreBufferHelperWriter &operator=(const ClLoadStoreBufferHelperWriter &) = default; - static bool - validate(IGpuKernelWriter *x, GpuTensor3dMapper mapper, GpuLoadStoreType type, IVectorTile *dst) + static bool validate(IGpuKernelWriter *x, GpuTensor3dMapper mapper, GpuLoadStoreType type, IVectorTile *dst) { CKW_UNUSED(x, type, dst); - if(mapper.gpu_sampler().storage != GpuSamplerTensorStorage::BufferUint8Ptr) + if (mapper.gpu_sampler().storage != GpuSamplerTensorStorage::BufferUint8Ptr) { return false; } @@ -2673,10 +2706,10 @@ public: out_of_bound_finalize_y(dst); // The left over load/store will be written in the finalize stage - if(_ls_width_part.size() != 0) + if (_ls_width_part.size() != 0) { int32_t w = 0; - for(auto &p : _ls_width_part) + for (auto &p : _ls_width_part) { const std::string dst0 = _dst->vector(w, p, idx_y).str; const std::string coord_x = _coord_x + " + " + std::to_string(w); @@ -2696,8 +2729,8 @@ public: } private: - IVectorTile *_dst{ nullptr }; - int32_t _ls_width_full{ 0 }; + IVectorTile *_dst{nullptr}; + int32_t _ls_width_full{0}; std::vector _ls_width_part{}; std::vector, std::string>> _leftovers_x{}; std::string _coord_x{}; @@ -2707,13 +2740,13 @@ private: void out_of_bound_initialize_x(std::string &coord) { - if(_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin) + if (_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin) { auto tensor_format = _mapper.tensor_argument()->format(); auto shape = tensor_format.shape; _ls_width_part = decompose_leftover_ls_vector_width(shape[0] % _ls_width_full); - if(_ls_width_part.size() != 0) + if (_ls_width_part.size() != 0) { _writer->write_text("if(" + coord + " > 0)\n"); _writer->compound_statement_begin(); @@ -2723,16 +2756,16 @@ private: void out_of_bound_finalize_x() { - if(_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin) + if (_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin) { - if(_ls_width_part.size() != 0) + if (_ls_width_part.size() != 0) { _writer->compound_statement_end(); _writer->write_text("else\n"); _writer->compound_statement_begin(); out_of_bound_initialize_z(_coord_orig_z); - for(auto &i : _leftovers_x) + for (auto &i : _leftovers_x) { out_of_bound_initialize_y(i.first.second); _writer->write_text(i.second); @@ -2751,7 +2784,7 @@ private: const auto address_mode_y = _mapper.gpu_sampler().address_mode_y; - switch(address_mode_y) + switch (address_mode_y) { case TensorSamplerAddressModeY::Skip: case TensorSamplerAddressModeY::ClampToBorder: @@ -2797,7 +2830,7 @@ private: { const auto address_mode_y = _mapper.gpu_sampler().address_mode_y; - switch(address_mode_y) + switch (address_mode_y) { case TensorSamplerAddressModeY::ClampToBorder: case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly: @@ -2814,7 +2847,7 @@ private: assert(false); } - switch(address_mode_y) + switch (address_mode_y) { case TensorSamplerAddressModeY::ClampToBorder: case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly: @@ -2839,7 +2872,7 @@ private: const auto address_mode_z = _mapper.gpu_sampler().address_mode_z; - switch(address_mode_z) + switch (address_mode_z) { case TensorSamplerAddressModeZ::Skip: max = _mapper.tensor_component_z(); @@ -2878,7 +2911,7 @@ private: { 
const auto address_mode_z = _mapper.gpu_sampler().address_mode_z; - switch(address_mode_z) + switch (address_mode_z) { case TensorSamplerAddressModeZ::Skip: case TensorSamplerAddressModeZ::SkipMinEdgeOnly: @@ -2897,7 +2930,7 @@ private: { std::vector x; - switch(ls_leftover_vector_width) + switch (ls_leftover_vector_width) { case 0: break; @@ -2959,13 +2992,13 @@ private: return x; } - std::string to_ls_buffer(GpuLoadStoreType type, int32_t vector_width, const std::string &data, - const std::string &address) + std::string + to_ls_buffer(GpuLoadStoreType type, int32_t vector_width, const std::string &data, const std::string &address) { - switch(type) + switch (type) { case GpuLoadStoreType::Load: - if(vector_width != 1) + if (vector_width != 1) { return data + " = vload" + std::to_string(vector_width) + "(0, " + address + ")"; } @@ -2975,7 +3008,7 @@ private: } break; case GpuLoadStoreType::Store: - if(vector_width != 1) + if (vector_width != 1) { return "vstore" + std::to_string(vector_width) + "(" + data + ", 0, " + address + ")"; } @@ -2991,25 +3024,25 @@ private: } } - std::string to_ls_buffer_address(const std::string &x, const std::string &y, const std::string &z, - const std::string &b) const + std::string + to_ls_buffer_address(const std::string &x, const std::string &y, const std::string &z, const std::string &b) const { - auto tensor_storage = static_cast(_mapper.gpu_sampler().storage); + auto tensor_storage = static_cast(_mapper.gpu_sampler().storage); assert(tensor_storage == GpuTensorStorage::BufferUint8Ptr); - const std::string ptr_buf = _mapper.tensor_argument()->storage(tensor_storage); - const std::string dst_type = get_cl_data_type(_dst->format().dt, 1); + const std::string ptr_buf = _mapper.tensor_argument()->storage(tensor_storage); + const std::string dst_type = get_cl_data_type(_dst->format().dt, 1); std::string address; address += "(__global "; address += dst_type; address += "*)("; address += ptr_buf; - if(x != "0" && (_mapper.is_one_component_x() != true)) + if (x != "0" && (_mapper.is_one_component_x() != true)) { address += " + ("; address += x + ") * sizeof(" + dst_type + ")"; } - if(y != "0") + if (y != "0") { const std::string stride_y = _mapper.tensor_component_stride_y(); address += " + ("; @@ -3017,7 +3050,7 @@ private: address += " * "; address += stride_y; } - if(z != "0" && (_mapper.is_one_component_z() != true)) + if (z != "0") { const std::string stride_z = _mapper.tensor_component_stride_z(); address += " + ("; @@ -3025,7 +3058,7 @@ private: address += " * "; address += stride_z; } - if(b != "0" && (_mapper.is_one_component_batch() != true)) + if (b != "0" && (_mapper.is_one_component_batch() != true)) { const std::string stride_b = _mapper.tensor_component_stride_batch(); address += " + ("; @@ -3041,32 +3074,32 @@ private: class ClLoadStoreImage2dHelperWriter : public IGpuLoadStoreHelperWriter { public: - static bool - validate(IGpuKernelWriter *x, const GpuTensor3dMapper &mapper, GpuLoadStoreType type, IVectorTile *dst) + static bool validate(IGpuKernelWriter *x, const GpuTensor3dMapper &mapper, GpuLoadStoreType type, IVectorTile *dst) { CKW_UNUSED(x); - if(dst->format().w != 4) + if (dst->format().w != 4) { return false; } - if(mapper.gpu_sampler().address_mode_x != TensorSamplerAddressModeX::None) + if (mapper.gpu_sampler().address_mode_x != TensorSamplerAddressModeX::None) { return false; } - if(mapper.gpu_sampler().address_mode_z != TensorSamplerAddressModeZ::None) + if (mapper.gpu_sampler().address_mode_z != TensorSamplerAddressModeZ::None) { 
return false; } - if(mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dReadOnly && type == GpuLoadStoreType::Load) + if (mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dReadOnly && type == GpuLoadStoreType::Load) { return false; } - if(mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dWriteOnly && type == GpuLoadStoreType::Store) + if (mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dWriteOnly && + type == GpuLoadStoreType::Store) { return false; } - if((dst->format().dt != DataType::Fp32) && (dst->format().dt != DataType::Fp16)) + if ((dst->format().dt != DataType::Fp32) && (dst->format().dt != DataType::Fp16)) { return false; } @@ -3132,8 +3165,8 @@ public: } private: - IVectorTile *_dst{ nullptr }; - int32_t _ls_width_full{ 0 }; + IVectorTile *_dst{nullptr}; + int32_t _ls_width_full{0}; std::string _coord_x{}; std::string _coord_z{}; std::string _coord_b{}; @@ -3144,7 +3177,7 @@ private: const auto address_mode_y = _mapper.gpu_sampler().address_mode_y; - switch(address_mode_y) + switch (address_mode_y) { case TensorSamplerAddressModeY::Skip: max = _mapper.tensor_component_y(); @@ -3180,7 +3213,7 @@ private: const auto address_mode_y = _mapper.gpu_sampler().address_mode_y; - switch(address_mode_y) + switch (address_mode_y) { case TensorSamplerAddressModeY::Skip: case TensorSamplerAddressModeY::SkipMinEdgeOnly: @@ -3193,16 +3226,19 @@ private: } }; - std::string to_ls_image2d(GpuLoadStoreType type, int32_t vector_width, const std::string &data, - const std::string &sampler, const std::string &coord) + std::string to_ls_image2d(GpuLoadStoreType type, + int32_t vector_width, + const std::string &data, + const std::string &sampler, + const std::string &coord) { CKW_UNUSED(vector_width); auto tensor_storage = static_cast(_mapper.gpu_sampler().storage); const std::string image2d_obj = _mapper.tensor_argument()->storage(tensor_storage); - const std::string post_fix = _dst->format().dt == DataType::Fp32 ? "f" : "h"; + const std::string post_fix = _dst->format().dt == DataType::Fp32 ? 
"f" : "h"; - switch(type) + switch (type) { case GpuLoadStoreType::Load: return data + " = read_image" + post_fix + "(" + image2d_obj + ", " + sampler + ", " + coord + ")"; @@ -3221,7 +3257,7 @@ private: { const auto address_mode_y = _mapper.gpu_sampler().address_mode_y; - switch(address_mode_y) + switch (address_mode_y) { case TensorSamplerAddressModeY::None: return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST"; @@ -3243,17 +3279,17 @@ private: } } - std::string to_ls_image2d_coord(const std::string &x, const std::string &y, const std::string &z, - const std::string &b) const + std::string + to_ls_image2d_coord(const std::string &x, const std::string &y, const std::string &z, const std::string &b) const { std::string coord_x = "(" + x + ") >> 2"; std::string coord_y = "("; - if(y != "0") + if (y != "0") { coord_y += y; } - if(z != "0" && (_mapper.is_one_component_z() != true)) + if (z != "0" && (_mapper.is_one_component_z() != true)) { const std::string dim = _mapper.tensor_component_y(); coord_y += " + ("; @@ -3261,7 +3297,7 @@ private: coord_y += " * "; coord_y += dim; } - if(b != "0" && (_mapper.is_one_component_batch() != true)) + if (b != "0" && (_mapper.is_one_component_batch() != true)) { const std::string dim0 = _mapper.tensor_component_y(); const std::string dim1 = _mapper.tensor_component_z(); @@ -3290,7 +3326,7 @@ public: create(IGpuKernelWriter *x, const GpuTensor3dMapper &mapper, GpuLoadStoreType type) { const auto tensor_storage = mapper.gpu_sampler().storage; - switch(tensor_storage) + switch (tensor_storage) { case GpuSamplerTensorStorage::BufferUint8Ptr: return std::make_unique(x, mapper, type); @@ -3350,14 +3386,14 @@ public: IVectorTile *x = _data->tiles[name]; - for(auto &t : x->underlying_source_variables()) + for (auto &t : x->underlying_source_variables()) { _data->code += t.type.str + " " + t.str + ";\n"; } } - void declare_const_tile(const std::string &name, const std::vector> &in, - DataType dt) override + void + declare_const_tile(const std::string &name, const std::vector> &in, DataType dt) override { assert(_data->tiles[name] == nullptr); _data->tiles.insert(name, in, dt); @@ -3385,7 +3421,8 @@ public: { assert(dst_var.type() == OperandType::Tile); assert(_data->tiles.has_tile(dst_var.value())); - assert(_data->tiles[dst_var.value()]->format().w == 1 && _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable + assert(_data->tiles[dst_var.value()]->format().w == 1 && + _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable auto var = _data->tiles[dst_var.value()]; @@ -3395,8 +3432,10 @@ public: _data->code += ");\n"; }; - void op_get_global_coord(const Operand &o_dst, const Operand &o_step, const TensorOperand &o_tensor, - int32_t dim) override + void op_get_global_coord(const Operand &o_dst, + const Operand &o_step, + const TensorOperand &o_tensor, + int32_t dim) override { OperandUnpacker operands(_data->tiles, _data->arguments); auto dst = operands.unpack(o_dst); @@ -3410,17 +3449,17 @@ public: GpuTensor3dMapper mapper(tensor, gpu_sampler); - switch(dim) + switch (dim) { case 0: - if(mapper.is_one_component_x()) + if (mapper.is_one_component_x()) { _data->code += dst->scalar(0, 0).str; _data->code += " = 0;\n"; } else { - if(mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin) + if (mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin) { // Validation: Check: fixed tensor shape // TO BE CHANGED @@ -3439,14 +3478,14 @@ public: } 
break; case 1: - if(mapper.is_one_component_y()) + if (mapper.is_one_component_y()) { _data->code += dst->scalar(0, 0).str; _data->code += " = 0;\n"; } else { - if(mapper.gpu_sampler().address_mode_y == TensorSamplerAddressModeY::OverlappingMin) + if (mapper.gpu_sampler().address_mode_y == TensorSamplerAddressModeY::OverlappingMin) { } else @@ -3459,7 +3498,7 @@ public: } break; case 2: - if(mapper.is_one_component_z()) + if (mapper.is_one_component_z()) { _data->code += dst->scalar(0, 0).str; _data->code += " = 0;\n"; @@ -3488,7 +3527,7 @@ public: GpuTensor3dMapper mapper(tensor, gpu_sampler); - if(mapper.is_one_component_batch()) + if (mapper.is_one_component_batch()) { _data->code += dst->scalar(0, 0).str; _data->code += " = 0;\n"; @@ -3504,7 +3543,8 @@ public: { assert(dst_var.type() == OperandType::Tile); assert(_data->tiles.has_tile(dst_var.value())); - assert(_data->tiles[dst_var.value()]->format().w == 1 && _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable + assert(_data->tiles[dst_var.value()]->format().w == 1 && + _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable auto var = _data->tiles[dst_var.value()]; @@ -3530,7 +3570,7 @@ public: const std::string src_prefix = broadcast_src_x ? "(" + dt + ")" : ""; // Broadcasting on Y is automatic - for(int32_t y = 0; y < dst_h; ++y) + for (int32_t y = 0; y < dst_h; ++y) { _data->code += dst->vector(y).str; _data->code += " = "; @@ -3540,7 +3580,9 @@ public: } } - void op_binary_expression(const Operand &dst_name, const Operand &lhs_name, BinaryOp op, + void op_binary_expression(const Operand &dst_name, + const Operand &lhs_name, + BinaryOp op, const Operand &rhs_name) override { OperandUnpacker operands(_data->tiles, _data->arguments); @@ -3554,14 +3596,14 @@ public: const int32_t lhs_w = lhs->format().w; const int32_t rhs_w = rhs->format().w; - if(op == BinaryOp::MatMul_Nt_T) + if (op == BinaryOp::MatMul_Nt_T) { assert((dst->format().dt == DataType::Fp32) || (dst->format().dt == DataType::Fp16)); - for(int32_t y = 0; y < dst_h; ++y) + for (int32_t y = 0; y < dst_h; ++y) { - for(int32_t x = 0; x < dst_w; ++x) + for (int32_t x = 0; x < dst_w; ++x) { - for(int32_t k = 0; k < lhs_w; ++k) + for (int32_t k = 0; k < lhs_w; ++k) { _data->code += dst->scalar(x, y).str; _data->code += " = fma("; @@ -3581,12 +3623,14 @@ public: const bool broadcast_lhs_x = dst_w != 1 && lhs_w == 1; const bool broadcast_rhs_x = dst_w != 1 && rhs_w == 1; - const std::string lhs_prefix = broadcast_lhs_x ? "(" + dst->underlying_source_variables()[0].type.str + ")" : ""; - const std::string rhs_prefix = broadcast_rhs_x ? "(" + dst->underlying_source_variables()[0].type.str + ")" : ""; - const std::string op_str = to_string(op); + const std::string lhs_prefix = + broadcast_lhs_x ? "(" + dst->underlying_source_variables()[0].type.str + ")" : ""; + const std::string rhs_prefix = + broadcast_rhs_x ? 
"(" + dst->underlying_source_variables()[0].type.str + ")" : ""; + const std::string op_str = to_string(op); // Broadcasting on Y is automatic - for(int32_t y = 0; y < dst_h; ++y) + for (int32_t y = 0; y < dst_h; ++y) { _data->code += dst->vector(y).str; _data->code += " = "; @@ -3605,13 +3649,13 @@ public: const IVectorTile *src = operands.unpack(o_src); const IVectorTile *dst = operands.unpack(o_dst); // const int32_t dst_w = dst->format().w; - const int32_t dst_h = dst->format().h; - const std::string dt = dst->underlying_source_variables()[0].type.str; - const bool is_float = (dst->format().dt == DataType::Fp32) || (dst->format().dt == DataType::Fp16); - const std::string sat = ((policy == ConvertPolicy::Saturate && !is_float) ? "_sat" : ""); + const int32_t dst_h = dst->format().h; + const std::string dt = dst->underlying_source_variables()[0].type.str; + const bool is_float = (dst->format().dt == DataType::Fp32) || (dst->format().dt == DataType::Fp16); + const std::string sat = ((policy == ConvertPolicy::Saturate && !is_float) ? "_sat" : ""); // Broadcasting on Y is automatic - for(int32_t y = 0; y < dst_h; ++y) + for (int32_t y = 0; y < dst_h; ++y) { _data->code += dst->vector(y).str; _data->code += " = convert_" + dt + sat + "("; @@ -3636,7 +3680,7 @@ public: const std::string src_prefix = broadcast_src_x ? "(" + dt + ")" : ""; // Broadcasting on Y is automatic - for(int32_t y = 0; y < dst_h; ++y) + for (int32_t y = 0; y < dst_h; ++y) { _data->code += dst->vector(y).str; _data->code += " = "; @@ -3645,8 +3689,7 @@ public: } } - void - op_unary_elementwise_function(const Operand &dst_name, UnaryFunction func, const Operand &src_name) override + void op_unary_elementwise_function(const Operand &dst_name, UnaryFunction func, const Operand &src_name) override { OperandUnpacker operands(_data->tiles, _data->arguments); const IVectorTile *src = operands.unpack(src_name); @@ -3663,12 +3706,12 @@ public: const std::string src_prefix = "(" + dt + ")"; // Broadcasting on Y is automatic - for(int32_t y = 0; y < dst_h; ++y) + for (int32_t y = 0; y < dst_h; ++y) { _data->code += dst->vector(y).str; _data->code += " = "; - switch(func) + switch (func) { case UnaryFunction::Exp: _data->code += "exp("; @@ -3694,6 +3737,9 @@ public: case UnaryFunction::Round: _data->code += "round("; break; + case UnaryFunction::Floor: + _data->code += "floor("; + break; default: CKW_ASSERT_MSG(false, "Unexpected UnaryFunction used."); } @@ -3703,7 +3749,10 @@ public: } } - void op_binary_elementwise_function(const Operand &dst_name, BinaryFunction func, const Operand &first_name, const Operand &second_name) override + void op_binary_elementwise_function(const Operand &dst_name, + BinaryFunction func, + const Operand &first_name, + const Operand &second_name) override { OperandUnpacker operands(_data->tiles, _data->arguments); const IVectorTile *first = operands.unpack(first_name); @@ -3721,12 +3770,12 @@ public: const bool is_float = (datatype.dt == DataType::Fp32 || datatype.dt == DataType::Fp16); // Broadcasting on Y is automatic - for(int32_t y = 0; y < dst_h; ++y) + for (int32_t y = 0; y < dst_h; ++y) { _data->code += dst->vector(y).str; _data->code += " = "; - switch(func) + switch (func) { case BinaryFunction::Min: _data->code += is_float ? 
"fmin(" : "min("; @@ -3745,7 +3794,11 @@ public: } } - void op_ternary_elementwise_function(const Operand &dst_name, TernaryFunction func, const Operand &first_name, const Operand &second_name, const Operand &third_name) override + void op_ternary_elementwise_function(const Operand &dst_name, + TernaryFunction func, + const Operand &first_name, + const Operand &second_name, + const Operand &third_name) override { OperandUnpacker operands(_data->tiles, _data->arguments); const IVectorTile *first = operands.unpack(first_name); @@ -3753,8 +3806,8 @@ public: const IVectorTile *third = operands.unpack(third_name); const IVectorTile *dst = operands.unpack(dst_name); - const int32_t dst_h = dst->format().h; - const std::string dt = dst->underlying_source_variables()[0].type.str; + const int32_t dst_h = dst->format().h; + const std::string dt = dst->underlying_source_variables()[0].type.str; // Always perform an explicit cast. See similar comments in op_unary_elementwise_function const std::string first_prefix = "(" + dt + ")"; @@ -3762,16 +3815,19 @@ public: const std::string third_prefix = "(" + dt + ")"; // Broadcasting on Y is automatic - for(int32_t y = 0; y < dst_h; ++y) + for (int32_t y = 0; y < dst_h; ++y) { _data->code += dst->vector(y).str; _data->code += " = "; - switch(func) + switch (func) { case TernaryFunction::Select: _data->code += "select("; break; + case TernaryFunction::Clamp: + _data->code += "clamp("; + break; default: CKW_ASSERT_MSG(false, "Unexpected TernaryFunction used."); } @@ -3814,7 +3870,12 @@ public: _data->code += "else\n"; } - void op_for_loop_header(const Operand& var_name, BinaryOp cond_op, const Operand& cond_value_name, const Operand &update_var_name, AssignmentOp update_op, const Operand& update_value_name) override + void op_for_loop_header(const Operand &var_name, + BinaryOp cond_op, + const Operand &cond_value_name, + const Operand &update_var_name, + AssignmentOp update_op, + const Operand &update_value_name) override { OperandUnpacker operands(_data->tiles, _data->arguments); const IVectorTile *var = operands.unpack(var_name); @@ -3842,9 +3903,13 @@ public: _data->code += "\n"; } - void op_load_immediate(const TensorOperand &o_tensor, const Operand &o_dst, const Operand &o_x, - const Operand &o_y, const Operand &o_z, const Operand &o_batch_idx, - const Operand &dilation_y) override + void op_load_immediate(const TensorOperand &o_tensor, + const Operand &o_dst, + const Operand &o_x, + const Operand &o_y, + const Operand &o_z, + const Operand &o_batch_idx, + const Operand &dilation_y) override { OperandUnpacker operands(_data->tiles, _data->arguments); @@ -3867,10 +3932,10 @@ public: // Initialize the constant part load_writer->initialize(dst, x, z, b); - for(int i = 0; i < dst->format().h; ++i) + for (int i = 0; i < dst->format().h; ++i) { std::string coord_y = y->scalar(0, 0).str + " + " + std::to_string(i); - if(dil_y->scalar(0, 0).str != "1") + if (dil_y->scalar(0, 0).str != "1") { coord_y += " * " + dil_y->scalar(0, 0).str; } @@ -3880,9 +3945,12 @@ public: load_writer->finalize(); } - void op_load_indirect(const TensorOperand &o_tensor, const Operand &o_dst, const Operand &o_x, - const Operand &o_indirect_h, const Operand &o_z, - const Operand &o_batch_idx) override + void op_load_indirect(const TensorOperand &o_tensor, + const Operand &o_dst, + const Operand &o_x, + const Operand &o_indirect_h, + const Operand &o_z, + const Operand &o_batch_idx) override { OperandUnpacker operands(_data->tiles, _data->arguments); @@ -3904,7 +3972,7 @@ public: // 
Initialize the constant part load_writer->initialize(dst, x, z, b); - for(int i = 0; i < dst->format().h; ++i) + for (int i = 0; i < dst->format().h; ++i) { load_writer->write(std::make_pair(i, y_ind->scalar(0, i).str)); } @@ -3912,9 +3980,12 @@ public: load_writer->finalize(); } - void op_store_immediate(const TensorOperand &tensor_name, const Operand &src_name, const Operand &x_name, - const Operand &y_name, const Operand &z_name, - const Operand &batch_index_name) override + void op_store_immediate(const TensorOperand &tensor_name, + const Operand &src_name, + const Operand &x_name, + const Operand &y_name, + const Operand &z_name, + const Operand &batch_index_name) override { OperandUnpacker operands(_data->tiles, _data->arguments); @@ -3938,7 +4009,7 @@ public: int32_t tile_h = src->format().h; - for(int m0 = tile_h - 1; m0 >= 0; m0--) + for (int m0 = tile_h - 1; m0 >= 0; m0--) { store_writer->write(std::make_pair(m0, y->scalar(0, 0).str + " + " + std::to_string(m0))); } @@ -3951,8 +4022,12 @@ public: _data->code += "return;\n"; } - void util_get_indirect_buffer(const Operand &o_dst, const TensorOperand &o_tensor, const Operand &o_x, - const Operand &o_y, const Operand &o_x_off, const Operand &o_y_off) override + void util_get_indirect_buffer(const Operand &o_dst, + const TensorOperand &o_tensor, + const Operand &o_x, + const Operand &o_y, + const Operand &o_x_off, + const Operand &o_y_off) override { OperandUnpacker operands(_data->tiles, _data->arguments); const IVectorTile *dst = operands.unpack(o_dst); @@ -3994,7 +4069,7 @@ public: declare_tile("_y_s", TileInfo(DataType::Int32)); auto x_s = operands.unpack(Operand("_x_s")); auto y_s = operands.unpack(Operand("_y_s")); - for(int i = 0; i < dst->format().h; ++i) + for (int i = 0; i < dst->format().h; ++i) { // x_s = (xi_0 + x_k); // y_s = (yi_0 + y_k); @@ -4052,8 +4127,8 @@ public: } private: - GpuKernelWriterDataHolder *_data{ nullptr }; - GpuKernelWriterAttribute *_attr{ nullptr }; + GpuKernelWriterDataHolder *_data{nullptr}; + GpuKernelWriterAttribute *_attr{nullptr}; }; /** IGpuKernelWriter factory class */ @@ -4066,10 +4141,9 @@ public: * * @return IGpuKernelWriter */ - static std::unique_ptr - create(GpuKernelWriterAttribute *attr, GpuKernelWriterDataHolder *x) + static std::unique_ptr create(GpuKernelWriterAttribute *attr, GpuKernelWriterDataHolder *x) { - switch(x->programming_language()) + switch (x->programming_language()) { case GpuTargetLanguage::OpenCL: return std::make_unique(attr, x); @@ -4086,9 +4160,9 @@ adjust_step(TensorSamplerFormat tensor_format, int32_t step, const TensorInfo *t { auto tensor = tensor_info_id->shape; - int32_t dim[3] = { 0 }; + int32_t dim[3] = {0}; - switch(tensor_format) + switch (tensor_format) { case TensorSamplerFormat::C_W_H: dim[0] = tensor[0]; diff --git a/compute_kernel_writer/prototype/src/TensorOperand.cpp b/compute_kernel_writer/prototype/src/TensorOperand.cpp index c6725d3b26c2f5c4d398fa2cfc7e16785a3bd107..d1aefbbb718ca7e94865e7cca884a1fa8de4730c 100644 --- a/compute_kernel_writer/prototype/src/TensorOperand.cpp +++ b/compute_kernel_writer/prototype/src/TensorOperand.cpp @@ -23,10 +23,12 @@ */ #include "ckw/TensorOperand.h" + #include "ckw/Error.h" #include "ckw/Kernel.h" #include "ckw/TensorInfo.h" #include "ckw/TileOperand.h" + #include "src/Prototype.h" namespace ckw @@ -35,9 +37,11 @@ namespace ckw namespace { -TensorComponentOperand &get_or_create_component(TensorOperand &tensor, std::unique_ptr &ptr, TensorComponentType component) +TensorComponentOperand 
&get_or_create_component(TensorOperand &tensor, + std::unique_ptr &ptr, + TensorComponentType component) { - if(ptr == nullptr) + if (ptr == nullptr) { ptr = std::make_unique(tensor, component); } @@ -59,7 +63,7 @@ TensorOperand::TensorOperand(const std::string &name, const TensorInfo &info, Te prototype::Operand TensorOperand::create_impl_operand(prototype::IGpuKernelWriter *writer) const { CKW_UNUSED(writer); - return { name() }; + return {name()}; } const TensorInfo &TensorOperand::info() const @@ -206,9 +210,9 @@ TensorComponentType TensorComponentOperand::component_type() const prototype::Operand TensorComponentOperand::create_impl_operand(prototype::IGpuKernelWriter *writer) const { CKW_UNUSED(writer); - prototype::OperandType type{ prototype::OperandType::Unknown }; + prototype::OperandType type{prototype::OperandType::Unknown}; - switch(_component) + switch (_component) { case TensorComponentType::OffsetFirstElement: type = prototype::OperandType::TensorDataOffset; diff --git a/compute_kernel_writer/prototype/src/TensorTileSampler.cpp b/compute_kernel_writer/prototype/src/TensorTileSampler.cpp index 28e54df3a570a1bec16e23179ef0b6da24b7b6d1..bf9f946ce8cf9db444b3cbc8b969e990bcbf98e4 100644 --- a/compute_kernel_writer/prototype/src/TensorTileSampler.cpp +++ b/compute_kernel_writer/prototype/src/TensorTileSampler.cpp @@ -23,6 +23,7 @@ */ #include "ckw/TensorTileSampler.h" + #include "ckw/TileOperand.h" #include "ckw/types/TensorSamplerTypes.h" @@ -33,24 +34,47 @@ TensorTileSampler::TensorTileSampler() { } -TensorTileSampler::TensorTileSampler( - TileOperand &x, TileOperand &y, TileOperand &z, TileOperand &b, - TensorSamplerFormat format, - TensorSamplerAddressModeX address_mode_x, - TensorSamplerAddressModeY address_mode_y, - TensorSamplerAddressModeZ address_mode_z) - : _x(&x), _y(&y), _z(&z), _b(&b), _height(0), _width(0), _format(format), _address_mode_x(address_mode_x), _address_mode_y(address_mode_y), _address_mode_z(address_mode_z) -{ -} - -TensorTileSampler::TensorTileSampler( - TileOperand &x, TileOperand &y, TileOperand &z, TileOperand &b, - int32_t height, int32_t width, - TensorSamplerFormat format, - TensorSamplerAddressModeX address_mode_x, - TensorSamplerAddressModeY address_mode_y, - TensorSamplerAddressModeZ address_mode_z) - : _x(&x), _y(&y), _z(&z), _b(&b), _height(height), _width(width), _format(format), _address_mode_x(address_mode_x), _address_mode_y(address_mode_y), _address_mode_z(address_mode_z) +TensorTileSampler::TensorTileSampler(TileOperand &x, + TileOperand &y, + TileOperand &z, + TileOperand &b, + TensorSamplerFormat format, + TensorSamplerAddressModeX address_mode_x, + TensorSamplerAddressModeY address_mode_y, + TensorSamplerAddressModeZ address_mode_z) + : _x(&x), + _y(&y), + _z(&z), + _b(&b), + _height(0), + _width(0), + _format(format), + _address_mode_x(address_mode_x), + _address_mode_y(address_mode_y), + _address_mode_z(address_mode_z) +{ +} + +TensorTileSampler::TensorTileSampler(TileOperand &x, + TileOperand &y, + TileOperand &z, + TileOperand &b, + int32_t height, + int32_t width, + TensorSamplerFormat format, + TensorSamplerAddressModeX address_mode_x, + TensorSamplerAddressModeY address_mode_y, + TensorSamplerAddressModeZ address_mode_z) + : _x(&x), + _y(&y), + _z(&z), + _b(&b), + _height(height), + _width(width), + _format(format), + _address_mode_x(address_mode_x), + _address_mode_y(address_mode_y), + _address_mode_z(address_mode_z) { } diff --git a/compute_kernel_writer/prototype/src/TileInfo.cpp 
b/compute_kernel_writer/prototype/src/TileInfo.cpp index 66d8cb1620732f26914cc812c58da5eea4e3e0f9..273266eedcfac85eb07b463531d87bdbacceb352 100644 --- a/compute_kernel_writer/prototype/src/TileInfo.cpp +++ b/compute_kernel_writer/prototype/src/TileInfo.cpp @@ -26,18 +26,15 @@ namespace ckw { -TileInfo::TileInfo(DataType dt) - : _dt(dt), _shape({ { 1, 1 } }) +TileInfo::TileInfo(DataType dt) : _dt(dt), _shape({{1, 1}}) { } -TileInfo::TileInfo(DataType dt, int32_t w) - : _dt(dt), _shape({ { w, 1 } }) +TileInfo::TileInfo(DataType dt, int32_t w) : _dt(dt), _shape({{w, 1}}) { } -TileInfo::TileInfo(DataType dt, int32_t h, int32_t w) - : _dt(dt), _shape({ { w, h } }) +TileInfo::TileInfo(DataType dt, int32_t h, int32_t w) : _dt(dt), _shape({{w, h}}) { } diff --git a/compute_kernel_writer/prototype/src/TileOperand.cpp b/compute_kernel_writer/prototype/src/TileOperand.cpp index bf6a15b9df69e13482046f47e8a6724c041755ae..e09c833d960466964a8b1ab4eee7aec7164ab81f 100644 --- a/compute_kernel_writer/prototype/src/TileOperand.cpp +++ b/compute_kernel_writer/prototype/src/TileOperand.cpp @@ -23,47 +23,43 @@ */ #include "ckw/TileOperand.h" + #include "ckw/Error.h" + #include "src/Prototype.h" namespace ckw { TileOperand::TileOperand(const std::string &name, const TileInfo &info) - : OperandBase(name), - _info(info), - _value{ std::vector{ "0" } }, - _constant(false) + : OperandBase(name), _info(info), _value{std::vector{"0"}}, _constant(false) { } TileOperand::TileOperand(const std::string &name, DataType data_type) - : OperandBase(name), - _info(TileInfo{ data_type }), - _value{ std::vector{ "0" } }, - _constant(false) + : OperandBase(name), _info(TileInfo{data_type}), _value{std::vector{"0"}}, _constant(false) { } TileOperand::TileOperand(const std::string &name, int32_t value) : OperandBase(name), - _info(TileInfo{ DataType::Int32 }), - _value{ std::vector{ std::to_string(value) } }, + _info(TileInfo{DataType::Int32}), + _value{std::vector{std::to_string(value)}}, _constant(true) { } TileOperand::TileOperand(const std::string &name, float value) : OperandBase(name), - _info(TileInfo{ DataType::Fp32 }), - _value{ std::vector{ std::to_string(value) } }, + _info(TileInfo{DataType::Fp32}), + _value{std::vector{std::to_string(value)}}, _constant(true) { } TileOperand::TileOperand(const std::string &name, const TileContainer &vals, DataType dt) : OperandBase(name), - _info(TileInfo{ dt, static_cast(vals.size()), static_cast(vals[0].size()) }), + _info(TileInfo{dt, static_cast(vals.size()), static_cast(vals[0].size())}), _value(vals), _constant(true) { @@ -73,11 +69,11 @@ prototype::Operand TileOperand::create_impl_operand(prototype::IGpuKernelWriter { CKW_UNUSED(writer); - if(_constant) + if (_constant) { - if(is_scalar()) + if (is_scalar()) { - switch(_info.data_type()) + switch (_info.data_type()) { case DataType::Int32: return prototype::Operand(_value[0][0], prototype::OperandType::ScalarInt32); @@ -85,6 +81,9 @@ prototype::Operand TileOperand::create_impl_operand(prototype::IGpuKernelWriter case DataType::Fp32: return prototype::Operand(_value[0][0], prototype::OperandType::ScalarFp32); + case DataType::Fp16: + return prototype::Operand(_value[0][0], prototype::OperandType::ScalarFp16); + default: CKW_ASSERT(false); } diff --git a/compute_kernel_writer/src/Error.cpp b/compute_kernel_writer/src/Error.cpp index c5dae2eb75768ebcc02c09999497b74351c1eea5..e1e4bffcecdc4181e3e9cb2ab4d66a6f4806fd59 100644 --- a/compute_kernel_writer/src/Error.cpp +++ b/compute_kernel_writer/src/Error.cpp @@ -28,8 +28,8 @@ namespace 
ckw { -std::string create_error_msg(const std::string &file, const std::string &func, const std::string &line, - const std::string &msg) +std::string +create_error_msg(const std::string &file, const std::string &func, const std::string &line, const std::string &msg) { std::string err; err += "[COMPUTE_KERNEL_WRITER][ERROR]:"; @@ -38,4 +38,4 @@ std::string create_error_msg(const std::string &file, const std::string &func, c err += " " + msg; return err; } -} // namespace ckw \ No newline at end of file +} // namespace ckw diff --git a/compute_kernel_writer/src/Helpers.cpp b/compute_kernel_writer/src/Helpers.cpp index 799f79a18712ab40cc3d14ccdadaad75bef8d302..82d4c4e917cfe8af0a3abc2c6560c7b73070b3b8 100644 --- a/compute_kernel_writer/src/Helpers.cpp +++ b/compute_kernel_writer/src/Helpers.cpp @@ -22,15 +22,15 @@ * SOFTWARE. */ -#include "ckw/Error.h" - #include "src/Helpers.h" +#include "ckw/Error.h" + namespace ckw { std::string dec_to_hex_as_string(int32_t dec) { - switch(dec) + switch (dec) { case 0: case 1: diff --git a/compute_kernel_writer/src/ITensorArgument.h b/compute_kernel_writer/src/ITensorArgument.h index 838bd40f85b5d812df07072e4d659f8ee9da7342..ece45a4dc41af0bd1bfd754842740cc64f8d3ea7 100644 --- a/compute_kernel_writer/src/ITensorArgument.h +++ b/compute_kernel_writer/src/ITensorArgument.h @@ -28,6 +28,7 @@ #include "ckw/TensorInfo.h" #include "ckw/types/TensorComponentType.h" #include "ckw/types/TensorStorageType.h" + #include "src/ITile.h" #include @@ -41,8 +42,8 @@ class ITensorComponent; /** Tensor storage variable */ struct TensorStorageVariable { - std::string val{ "" }; /** Tensor storage as a string */ - TensorStorageType type{ TensorStorageType::Unknown }; /** Tensor storage type */ + std::string val{""}; /** Tensor storage as a string */ + TensorStorageType type{TensorStorageType::Unknown}; /** Tensor storage type */ }; /** Tensor argument base class. @@ -83,8 +84,8 @@ public: } protected: - TensorInfo _info{}; // Tensor info - std::string _basename{ "" }; // Tensor name + TensorInfo _info{}; // Tensor info + std::string _basename{""}; // Tensor name }; /** Tensor component argument base class */ diff --git a/compute_kernel_writer/src/ITensorComponent.h b/compute_kernel_writer/src/ITensorComponent.h index e2775b62b0dc494f9581eb3e43e1ab3e6157fd26..f9c9d8fd81cbf019cff2e42d5da90946f7644ed3 100644 --- a/compute_kernel_writer/src/ITensorComponent.h +++ b/compute_kernel_writer/src/ITensorComponent.h @@ -26,6 +26,7 @@ #define CKW_SRC_ITENSORCOMPONENT_H #include "ckw/types/TensorComponentType.h" + #include "src/ITile.h" namespace ckw diff --git a/compute_kernel_writer/src/ITile.cpp b/compute_kernel_writer/src/ITile.cpp new file mode 100644 index 0000000000000000000000000000000000000000..eeb7816068ab898d17ff25ec89a35a8f22d7afe2 --- /dev/null +++ b/compute_kernel_writer/src/ITile.cpp @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/ITile.h" + +namespace ckw +{ + +bool ITile::is_scalar() const +{ + return info().width() == 1 && info().height() == 1; +} + +} // namespace ckw diff --git a/compute_kernel_writer/src/ITile.h b/compute_kernel_writer/src/ITile.h index a54fd9b7943bed9acee08b597ceff62c8761db4d..8eaac5ac1202ef16f4022bc483377bfa8d8e60f9 100644 --- a/compute_kernel_writer/src/ITile.h +++ b/compute_kernel_writer/src/ITile.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef COMPUTE_KERNEL_WRITER_SRC_ITILE -#define COMPUTE_KERNEL_WRITER_SRC_ITILE +#ifndef CKW_SRC_ITILE_H +#define CKW_SRC_ITILE_H #include "ckw/TileInfo.h" @@ -37,49 +37,15 @@ using TileContainer = std::vector>; /** Tile descriptor which reports the underlying datatype and vector length */ struct TileVariableDescriptor { - DataType dt{ DataType::Unknown }; /** Data type */ - int32_t len{ 1 }; /** Number of elements in a single variable. For example, 1 for scalar */ + DataType dt{DataType::Unknown}; /** Data type */ + int32_t len{1}; /** Number of elements in a single variable. For example, 1 for scalar */ }; /** Tile variable */ struct TileVariable { - std::string str{ "" }; /** Tile variable as a string */ - TileVariableDescriptor desc{}; /** Tile value descriptor which reports the datatype and vector length */ -}; - -/** Tile base class. - * A Tile is a collection of variables (either program variables or constants) used to express a 2D data. - */ -class ITile -{ -public: - virtual ~ITile() = default; - - /** Method to get all TileVariable objects - * - * @return a vector containing all @ref TileVariable objects - */ - virtual std::vector all() const = 0; - - /** Method to get the name of the tile. - * - * @return the name of the tile - */ - virtual const std::string &name() const = 0; - - /** Method to get the tile info - * - * @return the @ref TileInfo - */ - virtual const TileInfo &info() const = 0; - - /** Method to know whether the tile is assignable or not. - * For example, a constant tile is not assignable. - * - * @return true if the tile is assignable - */ - virtual bool is_assignable() const = 0; + std::string str{""}; /** Tile variable as a string */ + TileVariableDescriptor desc{}; /** Tile value descriptor which reports the datatype and vector length */ }; /** Interface to provide support for scalar access for a Tile. @@ -130,6 +96,46 @@ public: */ virtual std::vector supported_vector_lengths() const = 0; }; + +/** Tile base class. + * A Tile is a collection of variables (either program variables or constants) used to express a 2D data. + */ +class ITile : public IScalarAccess +{ +public: + virtual ~ITile() = default; + + /** Method to get all TileVariable objects + * + * @return a vector containing all @ref TileVariable objects + */ + virtual std::vector all() const = 0; + + /** Method to get the name of the tile. 
+ * + * @return the name of the tile + */ + virtual const std::string &name() const = 0; + + /** Method to get the tile info + * + * @return the @ref TileInfo + */ + virtual const TileInfo &info() const = 0; + + /** Method to know whether the tile is assignable or not. + * For example, a constant tile is not assignable. + * + * @return true if the tile is assignable + */ + virtual bool is_assignable() const = 0; + + /** Get whether the tile is scalar, i.e. the width and height are both 1. + * + * @return true if the tile is scalar. + */ + bool is_scalar() const; +}; } // namespace ckw -#endif /* COMPUTE_KERNEL_WRITER_SRC_ITILE */ +#endif // CKW_SRC_ITILE_H diff --git a/compute_kernel_writer/src/Kernel.cpp b/compute_kernel_writer/src/Kernel.cpp index 5eea1aa5486c9357dbca282383eec7c7d01783e4..12389b3816e5ef70401311498132fe58e9ac8b2f 100644 --- a/compute_kernel_writer/src/Kernel.cpp +++ b/compute_kernel_writer/src/Kernel.cpp @@ -23,6 +23,7 @@ */ #include "ckw/Kernel.h" + #include "ckw/types/TargetLanguage.h" namespace ckw @@ -30,8 +31,8 @@ namespace ckw Kernel::~Kernel() = default; -Kernel::Kernel(TargetLanguage language, const std::string &source_code) - : _language(language), _source_code(source_code) +Kernel::Kernel(TargetLanguage language, const std::vector &arguments, const std::string &source_code) + : _language(language), _arguments(arguments), _source_code(source_code) { } @@ -40,6 +41,11 @@ TargetLanguage Kernel::target_language() const return _language; } +const std::vector &Kernel::arguments() const +{ + return _arguments; +} + const std::string &Kernel::source_code() const { return _source_code; diff --git a/compute_kernel_writer/src/KernelArgument.cpp b/compute_kernel_writer/src/KernelArgument.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a640d365074bde57f42e98896e3e13191997182a --- /dev/null +++ b/compute_kernel_writer/src/KernelArgument.cpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "ckw/KernelArgument.h" + +#include "ckw/Error.h" + +namespace ckw +{ + +KernelArgument::KernelArgument(int32_t tensor_id, TensorStorageType storage_type) + : _type(Type::TensorStorage), _id(tensor_id) +{ + _sub_id.tensor_storage_type = storage_type; +} + +KernelArgument::KernelArgument(int32_t tensor_id, TensorComponentType component_type) + : _type(Type::TensorComponent), _id(tensor_id) +{ + _sub_id.tensor_component_type = component_type; +} + +KernelArgument::Type KernelArgument::type() const +{ + return _type; +} + +int32_t KernelArgument::id() const +{ + return _id; +} + +TensorStorageType KernelArgument::tensor_storage_type() const +{ + CKW_ASSERT(_type == Type::TensorStorage); + + return _sub_id.tensor_storage_type; +} + +TensorComponentType KernelArgument::tensor_component_type() const +{ + CKW_ASSERT(_type == Type::TensorComponent); + + return _sub_id.tensor_component_type; +} + +} // namespace ckw diff --git a/compute_kernel_writer/src/KernelWriter.cpp b/compute_kernel_writer/src/KernelWriter.cpp index ce34a1c2d66803f9887d8c90331c6da4386a3211..a478231c09119fffc43bc6ffc8d9e3792bc57cc7 100644 --- a/compute_kernel_writer/src/KernelWriter.cpp +++ b/compute_kernel_writer/src/KernelWriter.cpp @@ -23,13 +23,18 @@ */ #include "ckw/KernelWriter.h" + #include "ckw/Error.h" #include "ckw/TileOperand.h" #include "ckw/types/TargetArchitecture.h" #include "ckw/types/TargetLanguage.h" + #include "src/cl/CLKernelWriter.h" #include "src/cl/CLTensorArgument.h" #include "src/cl/CLTile.h" +#include "src/TileView.h" + +#include namespace ckw { @@ -39,7 +44,7 @@ KernelWriter::~KernelWriter() = default; std::unique_ptr KernelWriter::create_instance(TargetArchitecture architecture, TargetLanguage language) { CKW_UNUSED(architecture); - switch(language) + switch (language) { case TargetLanguage::OpenCL: // Currently this is the oldest and the only supported GPU architecture. 
@@ -51,11 +56,35 @@ std::unique_ptr KernelWriter::create_instance(TargetArchitecture a } } +int32_t KernelWriter::new_id_space() +{ + _id_space = ++_last_created_id_space; + + return _id_space; +} + int32_t KernelWriter::id_space() const { return _id_space; } +KernelWriter &KernelWriter::id_space(int32_t value) +{ + CKW_ASSERT(value <= _last_created_id_space); + + _id_space = value; + + return *this; +} + +void KernelWriter::write_body(const std::function &body) +{ + const auto curr_id_space = id_space(); + new_id_space(); + body(); + id_space(curr_id_space); +} + std::string KernelWriter::generate_full_name(const std::string &name) const { return "G" + std::to_string(id_space()) + "__" + name; @@ -66,9 +95,9 @@ TileOperand KernelWriter::create_tile_operand(ITile &tile) return TileOperand(tile); } -ITile &KernelWriter::get_tile(const TileOperand &operand) +std::tuple KernelWriter::get_tile(const TileOperand &operand) { - return operand._tile; + return {*operand._tile, {operand._row_start, operand._row_end, operand._col_start, operand._col_end}}; } TensorOperand KernelWriter::create_tensor_operand(ITensor &tensor) @@ -81,4 +110,14 @@ ITensor &KernelWriter::get_tensor(const TensorOperand &operand) return operand._tensor; } +const std::vector> &KernelWriter::get_values(const ConstantData &data) +{ + return data.values(); +} + +DataType KernelWriter::get_data_type(const ConstantData &data) +{ + return data.data_type(); +} + } // namespace ckw diff --git a/compute_kernel_writer/src/Tensor3dMapper.cpp b/compute_kernel_writer/src/Tensor3dMapper.cpp new file mode 100644 index 0000000000000000000000000000000000000000..acef6412a4595350c9b21a79fb909676fc433621 --- /dev/null +++ b/compute_kernel_writer/src/Tensor3dMapper.cpp @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "Tensor3dMapper.h" + +#include "ckw/Error.h" +#include "ckw/types/TensorSamplerTypes.h" + +#include "src/ITensor.h" +#include "src/ITile.h" + +namespace ckw +{ +Tensor3dMapper::Tensor3dMapper(ITensor *tensor, TensorSamplerFormat format) : _tensor(tensor), _format(format) +{ +} + +TileVariable Tensor3dMapper::dim_x() const +{ + switch (_format) + { + case TensorSamplerFormat::Dim0_Dim1xDim2_1: + case TensorSamplerFormat::Dim0_Dim1_Dim2: + return _tensor->component(TensorComponentType::Dim0).scalar(0, 0); + default: + CKW_THROW_MSG("Unsupported tensor format"); + return _tensor->component(TensorComponentType::Unknown).scalar(0, 0); + } +} + +TileVariable Tensor3dMapper::dim_y() const +{ + switch (_format) + { + case TensorSamplerFormat::Dim0_Dim1xDim2_1: + return _tensor->component(TensorComponentType::Dim1xDim2).scalar(0, 0); + case TensorSamplerFormat::Dim0_Dim1_Dim2: + return _tensor->component(TensorComponentType::Dim1).scalar(0, 0); + default: + CKW_THROW_MSG("Unsupported tensor format"); + return _tensor->component(TensorComponentType::Unknown).scalar(0, 0); + } +} + +TileVariable Tensor3dMapper::dim_z() const +{ + TileVariable dim_one; + + switch (_format) + { + case TensorSamplerFormat::Dim0_Dim1xDim2_1: + dim_one = _tensor->component(TensorComponentType::Dim3).scalar(0, 0); + dim_one.str = "1"; + return dim_one; + case TensorSamplerFormat::Dim0_Dim1_Dim2: + return _tensor->component(TensorComponentType::Dim2).scalar(0, 0); + default: + CKW_THROW_MSG("Unsupported tensor format"); + return _tensor->component(TensorComponentType::Unknown).scalar(0, 0); + } +} + +TileVariable Tensor3dMapper::dim_batch() const +{ + TileVariable dim_one; + + switch (_format) + { + case TensorSamplerFormat::Dim0_Dim1xDim2_1: + case TensorSamplerFormat::Dim0_Dim1_Dim2: + return _tensor->component(TensorComponentType::Dim3).scalar(0, 0); + default: + CKW_THROW_MSG("Unsupported tensor format"); + return _tensor->component(TensorComponentType::Unknown).scalar(0, 0); + } +} + +TileVariable Tensor3dMapper::stride_x() const +{ + switch (_format) + { + case TensorSamplerFormat::Dim0_Dim1xDim2_1: + case TensorSamplerFormat::Dim0_Dim1_Dim2: + return _tensor->component(TensorComponentType::Stride0).scalar(0, 0); + default: + CKW_THROW_MSG("Unsupported tensor format"); + return _tensor->component(TensorComponentType::Unknown).scalar(0, 0); + } +} + +TileVariable Tensor3dMapper::stride_y() const +{ + switch (_format) + { + case TensorSamplerFormat::Dim0_Dim1xDim2_1: + case TensorSamplerFormat::Dim0_Dim1_Dim2: + return _tensor->component(TensorComponentType::Stride1).scalar(0, 0); + default: + CKW_THROW_MSG("Unsupported tensor format"); + return _tensor->component(TensorComponentType::Unknown).scalar(0, 0); + } +} + +TileVariable Tensor3dMapper::stride_z() const +{ + TileVariable stride_zero; + + switch (_format) + { + case TensorSamplerFormat::Dim0_Dim1xDim2_1: + stride_zero = _tensor->component(TensorComponentType::Stride3).scalar(0, 0); + stride_zero.str = "0"; + return stride_zero; + case TensorSamplerFormat::Dim0_Dim1_Dim2: + return _tensor->component(TensorComponentType::Stride2).scalar(0, 0); + default: + CKW_THROW_MSG("Unsupported tensor format"); + return _tensor->component(TensorComponentType::Unknown).scalar(0, 0); + } +} + +TileVariable Tensor3dMapper::stride_batch() const +{ + switch (_format) + { + case TensorSamplerFormat::Dim0_Dim1xDim2_1: + case TensorSamplerFormat::Dim0_Dim1_Dim2: + return _tensor->component(TensorComponentType::Stride3).scalar(0, 0); + default: + 
CKW_THROW_MSG("Unsupported tensor format"); + return _tensor->component(TensorComponentType::Unknown).scalar(0, 0); + } +} +} // namespace ckw diff --git a/compute_kernel_writer/src/Tensor3dMapper.h b/compute_kernel_writer/src/Tensor3dMapper.h new file mode 100644 index 0000000000000000000000000000000000000000..e94b59519372b48d5c9c948637c1d3567e9e5e4d --- /dev/null +++ b/compute_kernel_writer/src/Tensor3dMapper.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_SRC_TENSOR3DMAPPER_H +#define CKW_SRC_TENSOR3DMAPPER_H + +#include + +namespace ckw +{ +// Forward declarations +class ITensor; +enum class TensorSamplerFormat; +struct TileVariable; + +/** This internal-only class is responsible to map an Nd tensor spatial dimensions to a 3d tensor spatial dimensions with the + * help of TensorSamplerFormat. + * Attention: The batch is not considered as a spatial dimension and it is treated as an offset + * + * The aim of the dimensionality reduction is primarily to reduce + * the address calculation to: + * x + y * stride_y + z * stride_z + offset, where offset is determined by the batch (for example, b * stride_batch). 
+ * + */ +class Tensor3dMapper +{ +public: + /** Constructor */ + Tensor3dMapper(ITensor *tensor, TensorSamplerFormat format); + + /** Get dimension x as string */ + TileVariable dim_x() const; + + /** Get dimension y as string */ + TileVariable dim_y() const; + + /** Get dimension z as string */ + TileVariable dim_z() const; + + /** Get batch dimension as string */ + TileVariable dim_batch() const; + + /** Get stride for dimension x as string */ + TileVariable stride_x() const; + + /** Get stride for dimension y as string */ + TileVariable stride_y() const; + + /** Get stride for dimension z as string */ + TileVariable stride_z() const; + + /** Get stride for batch dimension as string */ + TileVariable stride_batch() const; + +private: + ITensor *_tensor; + TensorSamplerFormat _format; +}; +} // namespace ckw + +#endif /* CKW_SRC_TENSOR3DMAPPER_H */ diff --git a/compute_kernel_writer/src/TensorOperand.cpp b/compute_kernel_writer/src/TensorOperand.cpp index 5ad24c62768af8ad1444737c7548d412a63596ed..bf11d0d3329d1f30404fc388da3c3db592bc9626 100644 --- a/compute_kernel_writer/src/TensorOperand.cpp +++ b/compute_kernel_writer/src/TensorOperand.cpp @@ -23,13 +23,13 @@ */ #include "ckw/TensorOperand.h" + #include "src/ITensor.h" namespace ckw { -TensorOperand::TensorOperand(ITensor &tensor) - : _tensor(tensor) +TensorOperand::TensorOperand(ITensor &tensor) : _tensor(tensor) { } @@ -108,4 +108,4 @@ TileOperand TensorOperand::offset_first_element_in_bytes() return TileOperand(_tensor.component(TensorComponentType::OffsetFirstElement)); } -} // namespace ckw \ No newline at end of file +} // namespace ckw diff --git a/compute_kernel_writer/src/TensorSampler.cpp b/compute_kernel_writer/src/TensorSampler.cpp index 2ee8df4bca33b860c68e9252737c5d29af18ab26..91d5af2fd036d54625260fe4e31da23804eca171 100644 --- a/compute_kernel_writer/src/TensorSampler.cpp +++ b/compute_kernel_writer/src/TensorSampler.cpp @@ -32,7 +32,11 @@ TensorSampler::TensorSampler(TensorStorageType storage, TensorSamplerAddressModeX address_mode_x, TensorSamplerAddressModeY address_mode_y, TensorSamplerAddressModeZ address_mode_z) - : _storage(storage), _format(format), _address_mode_x(address_mode_x), _address_mode_y(address_mode_y), _address_mode_z(address_mode_z) + : _storage(storage), + _format(format), + _address_mode_x(address_mode_x), + _address_mode_y(address_mode_y), + _address_mode_z(address_mode_z) { } diff --git a/compute_kernel_writer/src/TensorUtils.cpp b/compute_kernel_writer/src/TensorUtils.cpp index 24836092d4b65ff7f2afd94e5695fe4a6026000b..17fc9547ae728c99fc176fc79403e2b98e43a49f 100644 --- a/compute_kernel_writer/src/TensorUtils.cpp +++ b/compute_kernel_writer/src/TensorUtils.cpp @@ -23,6 +23,7 @@ */ #include "src/TensorUtils.h" + #include "ckw/Error.h" #include "ckw/TensorInfo.h" #include "ckw/types/TensorComponentType.h" @@ -31,10 +32,10 @@ namespace ckw { TensorComponentType get_tensor_dimension(TensorDataLayout layout, TensorDataLayoutComponent component) { - switch(layout) + switch (layout) { case TensorDataLayout::Nhwc: - switch(component) + switch (component) { case TensorDataLayoutComponent::C: return TensorComponentType::Dim0; @@ -49,7 +50,7 @@ TensorComponentType get_tensor_dimension(TensorDataLayout layout, TensorDataLayo return TensorComponentType::Unknown; } case TensorDataLayout::Ndhwc: - switch(component) + switch (component) { case TensorDataLayoutComponent::C: return TensorComponentType::Dim0; @@ -73,10 +74,10 @@ TensorComponentType get_tensor_dimension(TensorDataLayout layout, TensorDataLayo 
TensorComponentType get_tensor_stride(TensorDataLayout layout, TensorDataLayoutComponent component) { - switch(layout) + switch (layout) { case TensorDataLayout::Nhwc: - switch(component) + switch (component) { case TensorDataLayoutComponent::C: return TensorComponentType::Stride0; @@ -91,7 +92,7 @@ TensorComponentType get_tensor_stride(TensorDataLayout layout, TensorDataLayoutC return TensorComponentType::Unknown; } case TensorDataLayout::Ndhwc: - switch(component) + switch (component) { case TensorDataLayoutComponent::C: return TensorComponentType::Stride0; diff --git a/compute_kernel_writer/src/TileInfo.cpp b/compute_kernel_writer/src/TileInfo.cpp index 66d8cb1620732f26914cc812c58da5eea4e3e0f9..273266eedcfac85eb07b463531d87bdbacceb352 100644 --- a/compute_kernel_writer/src/TileInfo.cpp +++ b/compute_kernel_writer/src/TileInfo.cpp @@ -26,18 +26,15 @@ namespace ckw { -TileInfo::TileInfo(DataType dt) - : _dt(dt), _shape({ { 1, 1 } }) +TileInfo::TileInfo(DataType dt) : _dt(dt), _shape({{1, 1}}) { } -TileInfo::TileInfo(DataType dt, int32_t w) - : _dt(dt), _shape({ { w, 1 } }) +TileInfo::TileInfo(DataType dt, int32_t w) : _dt(dt), _shape({{w, 1}}) { } -TileInfo::TileInfo(DataType dt, int32_t h, int32_t w) - : _dt(dt), _shape({ { w, h } }) +TileInfo::TileInfo(DataType dt, int32_t h, int32_t w) : _dt(dt), _shape({{w, h}}) { } diff --git a/compute_kernel_writer/src/TileOperand.cpp b/compute_kernel_writer/src/TileOperand.cpp index 7d180feec826bf47859455de5150bfed2d43d22c..865ef85a13e4e8678a91e1cfa756c87551dee132 100644 --- a/compute_kernel_writer/src/TileOperand.cpp +++ b/compute_kernel_writer/src/TileOperand.cpp @@ -24,12 +24,52 @@ #include "ckw/TileOperand.h" +#include "ckw/Error.h" + +#include "src/ITile.h" + namespace ckw { TileOperand::TileOperand(ITile &tile) - : _tile(tile) + : _tile(&tile), _row_start(0), _row_end(tile.info().height()), _col_start(0), _col_end(tile.info().width()) +{ +} + +TileOperand::TileOperand( + const TileOperand &operand, int32_t row_start, int32_t row_end, int32_t col_start, int32_t col_end) + : _tile(operand._tile), _row_start(row_start), _row_end(row_end), _col_start(col_start), _col_end(col_end) { + CKW_ASSERT(row_start >= 0 && row_start < _tile->info().height()); + CKW_ASSERT(row_end > row_start && row_end <= _tile->info().height()); + CKW_ASSERT(col_start >= 0 && col_start < _tile->info().width()); + CKW_ASSERT(col_end > col_start && col_end <= _tile->info().width()); +} + +TileOperand TileOperand::tile(int32_t row_start, int32_t row_end, int32_t col_start, int32_t col_end) const +{ + CKW_ASSERT(row_start >= 0 && _row_start + row_start < _row_end); + CKW_ASSERT(row_end > row_start && _row_start + row_end <= _row_end); + CKW_ASSERT(col_start >= 0 && _col_start + col_start < _col_end); + CKW_ASSERT(col_end > col_start && _col_start + col_end <= _col_end); + + return TileOperand(*this, _row_start + row_start, _row_start + row_end, _col_start + col_start, + _col_start + col_end); +} + +TileOperand TileOperand::row(int32_t row) const +{ + CKW_ASSERT(row >= 0 && _row_start + row < _row_end); + + return tile(_row_start + row, _row_start + row + 1, _col_start, _col_end); +} + +TileOperand TileOperand::scalar(int32_t row, int32_t col) const +{ + CKW_ASSERT(row >= 0 && _row_start + row < _row_end); + CKW_ASSERT(col >= 0 && _col_start + col < _col_end); + + return tile(_row_start + row, _row_start + row + 1, _col_start + col, _col_start + col + 1); } } // namespace ckw diff --git a/compute_kernel_writer/src/TileView.cpp b/compute_kernel_writer/src/TileView.cpp new file 
mode 100644 index 0000000000000000000000000000000000000000..ea803f92f4ea3748cd6c11a06a2c6354f51e8953 --- /dev/null +++ b/compute_kernel_writer/src/TileView.cpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/TileView.h" + +#include + +namespace ckw +{ + +TileArea::TileArea(int32_t row_start, int32_t row_end, int32_t col_start, int32_t col_end) + : _row_start(row_start), _row_end(row_end), _col_start(col_start), _col_end(col_end) +{ +} + +int32_t TileArea::row_start() const +{ + return _row_start; +} + +int32_t TileArea::row_end() const +{ + return _row_end; +} + +int32_t TileArea::col_start() const +{ + return _col_start; +} + +int32_t TileArea::col_end() const +{ + return _col_end; +} + +} // namespace ckw diff --git a/compute_kernel_writer/src/TileView.h b/compute_kernel_writer/src/TileView.h new file mode 100644 index 0000000000000000000000000000000000000000..50ae66b389b1c1b4f2370f17ce13f8d41d0e8723 --- /dev/null +++ b/compute_kernel_writer/src/TileView.h @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_SRC_TILEVIEW_H +#define CKW_SRC_TILEVIEW_H + +#include "ckw/Error.h" +#include "ckw/types/DataType.h" + +#include "src/ITile.h" + +#include + +namespace ckw +{ + +/** A rectangular active area of a tile. 
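The row/column ranges of the active area are half-open (start inclusive, end exclusive), so height and width fall out as plain differences. A minimal stand-in illustrating that convention; Area and its accessors are invented for the example, and in the real API height()/width() live on TileView rather than TileArea:

#include <cstdint>
#include <iostream>

struct Area
{
    int32_t row_start, row_end, col_start, col_end;
    int32_t height() const { return row_end - row_start; }
    int32_t width() const { return col_end - col_start; }
};

int main()
{
    // Rows [1, 3) and columns [0, 4) of a larger tile: a 2x4 active area.
    const Area area{1, 3, 0, 4};
    std::cout << area.height() << "x" << area.width() << "\n"; // 2x4
    // Row index 3 and column index 4 are outside the area (exclusive upper bounds).
}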
*/ +class TileArea +{ +public: + /** Create a new tile rectangular active area. + * + * The range of rows and columns is defined by pairs of start and end indices, inclusive lower and exclusive upper. + * In other word, any row and column indices satisfied the following conditions will be part of the active area: + * + * row_start <= row_index < row_end + * col_start <= col_index < col_end + * + * @param[in] row_start The start index of the row range. + * @param[in] row_end The end index of the row range. + * @param[in] col_start The start index of the column range. + * @param[in] col_end The end index of the column range. + */ + TileArea(int32_t row_start, int32_t row_end, int32_t col_start, int32_t col_end); + + /** Get the start row index. */ + int32_t row_start() const; + + /** Get the end row (exclusive) index. */ + int32_t row_end() const; + + /** Get the start column index. */ + int32_t col_start() const; + + /** Get the end column (exclusive) index. */ + int32_t col_end() const; + +private: + int32_t _row_start; + int32_t _row_end; + int32_t _col_start; + int32_t _col_end; +}; + +/** A rectangular view of a tile. */ +template +class TileView +{ +public: + /** Create a tile view that refers to the whole tile. + * + * @param[in] tile The tile object. + */ + TileView(const T &tile) : _tile(&tile), _area(0, tile.info().height(), 0, tile.info().width()) + { + } + + /** Create a new rectangular view of the given tile. + * + * @param[in] tile The tile object. + * @param[in] area The rectangular active area. + */ + TileView(const T &tile, const TileArea &area) : _tile(&tile), _area(area) + { + } + + /** Get the tile object. + * + * The caller must guarantee that the tile view refers to the whole tile. + */ + const T &full_tile() const + { + CKW_ASSERT(is_full_tile()); + + return *_tile; + } + + /** Get the data type of the tile. */ + DataType data_type() const + { + return _tile->info().data_type(); + } + + /** Get the start row index. */ + int32_t row_start() const + { + return _area.row_start(); + } + + /** Get the end row index. */ + int32_t row_end() const + { + return _area.row_end(); + } + + /** Get the start column index. */ + int32_t col_start() const + { + return _area.col_start(); + } + + /** Get the end column index. */ + int32_t col_end() const + { + return _area.col_end(); + } + + /** Get the height of the tile view. */ + int32_t height() const + { + return _area.row_end() - _area.row_start(); + } + + /** Get the width of the tile view. */ + int32_t width() const + { + return _area.col_end() - _area.col_start(); + } + + /** See @ref IVectorAccess::vector. */ + TileVariable vector(int32_t row) const + { + return _tile->vector(row_start() + row, col_start(), width()); + } + + /** See @ref IScalarAccess::scalar. */ + TileVariable scalar(int32_t row, int32_t col) const + { + return _tile->scalar(row_start() + row, col_start() + col); + } + + /** Get the name of the tile. */ + const std::string &name() const + { + return _tile->name(); + } + + /** Get whether the tile view is a scalar element. */ + bool is_scalar() const + { + return height() == 1 && width() == 1; + } + + /** Get whether the tile view refers to the whole tile. 
*/ + bool is_full_tile() const + { + return row_start() == 0 && row_end() == _tile->info().height() && col_start() == 0 && + col_end() == _tile->info().width(); + } + +private: + const T *_tile; + TileArea _area; +}; + +} // namespace ckw + +#endif // CKW_SRC_TILEVIEW_H diff --git a/compute_kernel_writer/src/cl/CLHelpers.cpp b/compute_kernel_writer/src/cl/CLHelpers.cpp index 08108e383fe87997aa3bd62b8483289137d547a2..8e4a932764832c18da4f3ad49d36778b970a221c 100644 --- a/compute_kernel_writer/src/cl/CLHelpers.cpp +++ b/compute_kernel_writer/src/cl/CLHelpers.cpp @@ -21,17 +21,22 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ + #include "src/cl/CLHelpers.h" + #include "ckw/Error.h" #include "ckw/types/DataType.h" +#include "ckw/types/Operators.h" #include "ckw/types/TensorStorageType.h" +#include "src/types/DataTypeHelpers.h" + namespace ckw { bool cl_validate_vector_length(int32_t len) { bool valid_vector_length = true; - if(len < 1 || len > 16 || (len > 4 && len < 8) || (len > 8 && len < 16)) + if (len < 1 || len > 16 || (len > 4 && len < 8) || (len > 8 && len < 16)) { valid_vector_length = false; } @@ -40,14 +45,14 @@ bool cl_validate_vector_length(int32_t len) std::string cl_get_variable_datatype_as_string(DataType dt, int32_t len) { - if(cl_validate_vector_length(len) == false) + if (cl_validate_vector_length(len) == false) { CKW_THROW_MSG("Unsupported vector length"); return ""; } std::string res; - switch(dt) + switch (dt) { case DataType::Fp32: res += "float"; @@ -81,7 +86,7 @@ std::string cl_get_variable_datatype_as_string(DataType dt, int32_t len) return ""; } - if(len > 1) + if (len > 1) { res += std::to_string(len); } @@ -91,7 +96,7 @@ std::string cl_get_variable_datatype_as_string(DataType dt, int32_t len) int32_t cl_round_up_to_nearest_valid_vector_width(int32_t width) { - switch(width) + switch (width) { case 1: return 1; @@ -124,7 +129,7 @@ int32_t cl_round_up_to_nearest_valid_vector_width(int32_t width) std::string cl_get_variable_storagetype_as_string(TensorStorageType storage) { std::string res; - switch(storage) + switch (storage) { case TensorStorageType::BufferUint8Ptr: res += "__global uchar*"; @@ -142,12 +147,134 @@ std::string cl_get_variable_storagetype_as_string(TensorStorageType storage) return res; } +std::string cl_get_assignment_op_as_string(AssignmentOp op) +{ + switch (op) + { + case AssignmentOp::Increment: + return "+="; + + case AssignmentOp::Decrement: + return "-="; + + default: + CKW_THROW_MSG("Unsupported assignment operator!"); + } +} + +std::tuple cl_get_unary_op(UnaryOp op) +{ + switch (op) + { + case UnaryOp::LogicalNot: + return {false, "!"}; + + case UnaryOp::BitwiseNot: + return {false, "~"}; + + case UnaryOp::Exp: + return {true, "exp"}; + + case UnaryOp::Tanh: + return {true, "tanh"}; + + case UnaryOp::Sqrt: + return {true, "sqrt"}; + + case UnaryOp::Erf: + return {true, "erf"}; + + case UnaryOp::Fabs: + return {true, "fabs"}; + + case UnaryOp::Log: + return {true, "log"}; + + case UnaryOp::Round: + return {true, "round"}; + + default: + CKW_THROW_MSG("Unsupported unary operation!"); + } +} + +std::tuple cl_get_binary_op(BinaryOp op, DataType data_type) +{ + const auto is_float = is_data_type_float(data_type); + + switch (op) + { + case BinaryOp::Add: + return {false, "+"}; + + case BinaryOp::Sub: + return {false, "-"}; + + case BinaryOp::Mul: + return {false, "*"}; + + case BinaryOp::Div: + return {false, "/"}; + + case BinaryOp::Mod: + return {false, "%"}; + + case BinaryOp::Equal: + return 
{false, "=="}; + + case BinaryOp::Less: + return {false, "<"}; + + case BinaryOp::LessEqual: + return {false, "<="}; + + case BinaryOp::Greater: + return {false, ">"}; + + case BinaryOp::GreaterEqual: + return {false, ">="}; + + case BinaryOp::LogicalAnd: + return {false, "&&"}; + + case BinaryOp::LogicalOr: + return {false, "||"}; + + case BinaryOp::BitwiseXOR: + return {false, "^"}; + + case BinaryOp::Min: + return {true, is_float ? "fmin" : "min"}; + + case BinaryOp::Max: + return {true, is_float ? "fmax" : "max"}; + + default: + CKW_THROW_MSG("Unsupported binary operator/function!"); + } +} + +std::tuple cl_get_ternary_op(TernaryOp op) +{ + switch (op) + { + case TernaryOp::Select: + return {true, "select"}; + + case TernaryOp::Clamp: + return {true, "clamp"}; + + default: + CKW_THROW_MSG("Unsupported ternary function!"); + } +} + std::string cl_data_type_rounded_up_to_valid_vector_width(DataType dt, int32_t width) { - std::string data_type; - const int32_t w = cl_round_up_to_nearest_valid_vector_width(width); + std::string data_type; + const int32_t w = cl_round_up_to_nearest_valid_vector_width(width); data_type += cl_get_variable_datatype_as_string(dt, 1); - if(w != 1) + if (w != 1) { data_type += std::to_string(w); } @@ -158,7 +285,7 @@ std::vector cl_decompose_vector_width(int32_t vector_width) { std::vector x; - switch(vector_width) + switch (vector_width) { case 0: break; diff --git a/compute_kernel_writer/src/cl/CLHelpers.h b/compute_kernel_writer/src/cl/CLHelpers.h index 669424088effc97fb618e0cfdb52d7828f6b2675..370ffc700c7938bd1eea41a64c272b9ff5548531 100644 --- a/compute_kernel_writer/src/cl/CLHelpers.h +++ b/compute_kernel_writer/src/cl/CLHelpers.h @@ -24,8 +24,11 @@ #ifndef CKW_SRC_CL_CLHELPERS_H #define CKW_SRC_CL_CLHELPERS_H +#include "ckw/types/Operators.h" + #include #include +#include #include /** OpenCL specific helper functions */ @@ -52,6 +55,51 @@ bool cl_validate_vector_length(int32_t len); */ std::string cl_get_variable_datatype_as_string(DataType dt, int32_t len); +/** Return the assignment operator in OpenCL language. + * + * @param[in] op The assignment operator. + * + * @return The operator in OpenCL language as a string. + */ +std::string cl_get_assignment_op_as_string(AssignmentOp op); + +/** Return the information about the unary operation. + * + * The result contains: + * - is_func: true if it's a function and false if it's an unary operator in OpenCL language. + * - str: the function name or the operator in OpenCL language. + * + * @param[in] op The unary operator. + * + * @return The information about the unary operation. + */ +std::tuple cl_get_unary_op(UnaryOp op); + +/** Return the information about the binary operation. + * + * The result contains: + * - is_func: true if it's a function and false if it's an binary operator in OpenCL language. + * - str: the function name or the operator in OpenCL language. + * + * @param[in] op The binary operator. + * @param[in] data_type The input data type. + * + * @return The information about the binary operation. + */ +std::tuple cl_get_binary_op(BinaryOp op, DataType data_type); + +/** Return the information about the ternary operation. + * + * The result contains: + * - is_func: true if it's a function and false if it's a ternary operator in OpenCL language. + * - str: the function name or the operator in OpenCL language. + * + * @param[in] op The ternary operator. + * + * @return The information about the ternary operation. 
+ */ +std::tuple cl_get_ternary_op(TernaryOp op); + /** Helper function to return the OpenCL vector size that accommodate the the desired width * * @param[in] width The desired width diff --git a/compute_kernel_writer/src/cl/CLKernelWriter.cpp b/compute_kernel_writer/src/cl/CLKernelWriter.cpp index 93630769019962af19fef06073d509ffd251827d..62e6853a7adf7e85a9a201ec3c14efbe0c9984c5 100644 --- a/compute_kernel_writer/src/cl/CLKernelWriter.cpp +++ b/compute_kernel_writer/src/cl/CLKernelWriter.cpp @@ -23,13 +23,29 @@ */ #include "src/cl/CLKernelWriter.h" + #include "ckw/Error.h" #include "ckw/Kernel.h" +#include "ckw/TensorSampler.h" #include "ckw/TileOperand.h" +#include "ckw/types/DataType.h" +#include "ckw/types/MemoryOperation.h" +#include "ckw/types/TargetLanguage.h" + #include "src/cl/CLHelpers.h" #include "src/cl/CLTensorArgument.h" #include "src/cl/CLTile.h" +#include "src/cl/helpers/CLMemoryOpBufferHelper.h" +#include "src/cl/helpers/CLMemoryOpImage2dHelper.h" +#include "src/cl/helpers/ICLMemoryOpHelper.h" +#include "src/ITensorComponent.h" +#include "src/TileView.h" +#include "src/types/DataTypeHelpers.h" + +#include #include +#include +#include namespace ckw { @@ -39,11 +55,461 @@ CLKernelWriter::~CLKernelWriter() = default; std::unique_ptr CLKernelWriter::emit_kernel(const std::string &name) { - CKW_UNUSED(name); - CKW_THROW_MSG("Not implemented!"); + std::string code; + + code += "__kernel void "; + code += name; + code += "\n(\n"; + + // Create the list of arguments. + std::vector arguments; + + for (const auto &tensor : _tensors) + { + const auto tensor_id = tensor->info().id(); + + const auto storages = tensor->storages(); + const auto components = tensor->components(); + + for (const auto &storage : storages) + { + code += cl_get_variable_storagetype_as_string(storage.type); + code += " "; + code += storage.val; + code += ",\n"; + + arguments.emplace_back(tensor_id, storage.type); + } + + for (const auto &component : components) + { + const auto &tile = component->tile(); + const auto &tile_info = tile.info(); + + CKW_ASSERT(tile.is_scalar()); + + code += cl_get_variable_datatype_as_string(tile_info.data_type(), 1); + code += " "; + code += tile.name(); + code += ",\n"; + + arguments.emplace_back(tensor_id, component->component_type()); + } + } + + if (code.size() >= 2 && code[code.size() - 2] == ',' && code[code.size() - 1] == '\n') + { + // Remove the last comma in the argument list. + code.pop_back(); + code[code.size() - 1] = '\n'; + } + + code += ")\n{\n"; + + code += _body_source_code; + + code += "}\n"; + + return std::make_unique(TargetLanguage::OpenCL, arguments, code); +} + +void CLKernelWriter::op_assign(const TileOperand &dst, const TileOperand &src) +{ + const auto dst_view = to_cl_tile_view(dst); + const auto src_view = to_cl_tile_view(src); + + const auto dst_w = dst_view.width(); + const auto dst_h = dst_view.height(); + const auto src_w = src_view.width(); + + const auto data_type_str = cl_get_variable_datatype_as_string(dst_view.data_type(), dst_w); + + const auto broadcast_src_x = dst_w != 1 && src_w == 1; + const std::string src_prefix = broadcast_src_x ? 
"(" + data_type_str + ")" : ""; + + CKW_ASSERT_MSG(src_view.data_type() == dst_view.data_type(), "Source and destination type must match."); + CKW_ASSERT_MSG(src_view.height() == dst_h || src_view.height() == 1, + "Tile height must match or source is broadcasting in y dimension."); + CKW_ASSERT_MSG(src_w == dst_w || src_w == 1, "Tile width must match or source is broadcasting in x dimension."); + + // Broadcasting on y dimension is automatic (see CLTile::vector). + for (int32_t y = 0; y < dst_h; ++y) + { + append_code(dst_view.vector(y).str, " = ", src_prefix, src_view.vector(y).str, ";\n"); + } +} + +void CLKernelWriter::op_cast(const TileOperand &dst, const TileOperand &src, ConvertPolicy policy) +{ + const auto dst_view = to_cl_tile_view(dst); + const auto src_view = to_cl_tile_view(src); + + const auto dst_w = dst_view.width(); + const auto dst_h = dst_view.height(); + const auto src_w = src_view.width(); + + const auto dst_type = dst_view.data_type(); + + const auto convert_type_str = cl_get_variable_datatype_as_string(dst_type, src_w); + const auto dst_type_str = cl_get_variable_datatype_as_string(dst_type, dst_w); + + const std::string sat = policy == ConvertPolicy::Saturate ? "_sat" : ""; + CKW_ASSERT_IF(policy == ConvertPolicy::Saturate, !is_data_type_float(dst_type)); + + const auto broadcast_x = dst_w != 1 && src_w == 1; + const std::string prefix = broadcast_x ? "(" + dst_type_str + ")" : ""; + + CKW_ASSERT_MSG(src_view.data_type() != dst_view.data_type(), "Source and destination type must be different."); + CKW_ASSERT_MSG(src_view.height() == dst_h || src_view.height() == 1, + "Tile height must match or source is broadcasting in y dimension."); + CKW_ASSERT_MSG(src_w == dst_w || src_w == 1, "Tile width must match or source is broadcasting in x dimension."); + + // Broadcasting on y dimension is automatic (see CLTile::vector). + for (int32_t y = 0; y < dst_h; ++y) + { + append_code(dst_view.vector(y).str, " = ", prefix, "convert_", convert_type_str, sat, "(", + src_view.vector(y).str, ");\n"); + } +} + +void CLKernelWriter::op_unary(const TileOperand &dst, UnaryOp op, const TileOperand &src) +{ + const auto dst_view = to_cl_tile_view(dst); + const auto src_view = to_cl_tile_view(src); + + const auto dst_w = dst_view.width(); + const auto dst_h = dst_view.height(); + const auto src_w = src_view.width(); + + const auto data_type_str = cl_get_variable_datatype_as_string(dst_view.data_type(), dst_w); + const auto broadcast_src_x = dst_w != 1 && src_w == 1; + + const std::string src_prefix = broadcast_src_x ? "(" + data_type_str + ")" : ""; + + const auto op_info = cl_get_unary_op(op); + const auto op_is_func = std::get<0>(op_info); + const auto &op_name = std::get<1>(op_info); + const auto op_prefix = op_is_func ? op_name + "(" : op_name; + const auto op_suffix = op_is_func ? ")" : ""; + + CKW_ASSERT_MSG(src_view.data_type() == dst_view.data_type(), "Source and destination type must match."); + CKW_ASSERT_MSG(src_view.height() == dst_h || src_view.height() == 1, + "Tile height must match or source is broadcasting in y dimension."); + CKW_ASSERT_MSG(src_w == dst_w || src_w == 1, "Tile width must match or source is broadcasting in x dimension."); + + // Broadcasting on y dimension is automatic (see CLTile::vector). 
+ for (int32_t y = 0; y < dst_h; ++y) + { + append_code(dst_view.vector(y).str, " = ", src_prefix, op_prefix, src_view.vector(y).str, op_suffix, ";\n"); + } +} + +void CLKernelWriter::op_binary(const TileOperand &dst, BinaryOp op, const TileOperand &first, const TileOperand &second) +{ + const auto dst_view = to_cl_tile_view(dst); + const auto lhs_view = to_cl_tile_view(first); + const auto rhs_view = to_cl_tile_view(second); + + const auto dst_w = dst_view.width(); + const auto dst_h = dst_view.height(); + const auto lhs_w = lhs_view.width(); + const auto rhs_w = rhs_view.width(); + + const auto data_type = lhs_view.data_type(); + + CKW_ASSERT_MSG(lhs_view.data_type() == rhs_view.data_type(), "LHS and RHS type must match."); + + CKW_ASSERT_MSG(lhs_view.height() == dst_h || lhs_view.height() == 1, + "LHS tile height must match or source is broadcasting in y dimension."); + CKW_ASSERT_MSG(rhs_view.height() == dst_h || rhs_view.height() == 1, + "RHS tile height must match or source is broadcasting in y dimension."); + + CKW_ASSERT_MSG(lhs_w == dst_w || lhs_w == 1, + "LHS tile width must match destination or LHS is broadcasting in x dimension."); + CKW_ASSERT_MSG(rhs_w == dst_w || rhs_w == 1, + "RHS tile width must match destination or RHS is broadcasting in x dimension."); + + if (op == BinaryOp::MatMul_Nt_T) + { + CKW_ASSERT(is_data_type_float(data_type)); + + for (int32_t y = 0; y < dst_h; ++y) + { + for (int32_t x = 0; x < dst_w; ++x) + { + for (int32_t k = 0; k < lhs_w; ++k) + { + append_code(dst_view.scalar(x, y).str, " = fma(", lhs_view.scalar(k, y).str, ", ", + rhs_view.scalar(k, x).str, ", ", dst_view.scalar(x, y).str, ");\n"); + } + } + } + } + else + { + const auto op_info = cl_get_binary_op(op, data_type); + const auto op_is_func = std::get<0>(op_info); + const auto &op_name = std::get<1>(op_info); + + const auto data_type_str = cl_get_variable_datatype_as_string(data_type, dst_w); + + const auto broadcast_lhs_x = dst_w != 1 && lhs_w == 1; + const auto broadcast_rhs_x = dst_w != 1 && rhs_w == 1; + + const std::string lhs_prefix = broadcast_lhs_x ? "(" + data_type_str + ")" : ""; + const std::string rhs_prefix = broadcast_rhs_x ? "(" + data_type_str + ")" : ""; + + const std::string op_prefix = op_is_func ? " = " + op_name + "(" : " = "; + const std::string op_separator = op_is_func ? ", " : " " + op_name + " "; + const std::string op_suffix = op_is_func ? ");\n" : ";\n"; + + // Broadcasting on y dimension is automatic (see CLTile::vector). 
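The FMA loop in the MatMul_Nt_T branch above implements, in effect, dst(y, x) += sum over k of lhs(y, k) * rhs(x, k), i.e. a non-transposed LHS against a transposed RHS. A plain host-side reference of those semantics, not the generated OpenCL:

#include <cstddef>
#include <iostream>
#include <vector>

void matmul_nt_t(std::vector<std::vector<float>>       &dst,
                 const std::vector<std::vector<float>> &lhs,
                 const std::vector<std::vector<float>> &rhs)
{
    for (size_t y = 0; y < dst.size(); ++y)
        for (size_t x = 0; x < dst[y].size(); ++x)
            for (size_t k = 0; k < lhs[y].size(); ++k)
                dst[y][x] += lhs[y][k] * rhs[x][k]; // RHS indexed as transposed
}

int main()
{
    std::vector<std::vector<float>> dst(1, std::vector<float>(1, 0.0f));
    matmul_nt_t(dst, {{1.0f, 2.0f}}, {{3.0f, 4.0f}});
    std::cout << dst[0][0] << "\n"; // 1*3 + 2*4 = 11
}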
+ for (int32_t y = 0; y < dst_h; ++y) + { + append_code(dst_view.vector(y).str, op_prefix, lhs_prefix, lhs_view.vector(y).str, op_separator, rhs_prefix, + rhs_view.vector(y).str, op_suffix); + } + } +} + +void CLKernelWriter::op_ternary( + const TileOperand &dst, TernaryOp op, const TileOperand &first, const TileOperand &second, const TileOperand &third) +{ + const auto dst_view = to_cl_tile_view(dst); + const auto first_view = to_cl_tile_view(first); + const auto second_view = to_cl_tile_view(second); + const auto third_view = to_cl_tile_view(third); + + const auto dst_w = dst_view.width(); + const auto dst_h = dst_view.height(); + const auto first_w = first_view.width(); + const auto second_w = second_view.width(); + const auto third_w = third_view.width(); + + const auto data_type = dst_view.data_type(); + const auto data_type_str = cl_get_variable_datatype_as_string(data_type, dst_w); + + const auto op_info = cl_get_ternary_op(op); + const auto op_is_func = std::get<0>(op_info); + const auto &op_name = std::get<1>(op_info); + + const auto broadcast_first_x = dst_w != 1 && first_w == 1; + const auto broadcast_second_x = dst_w != 1 && second_w == 1; + const auto broadcast_third_x = dst_w != 1 && third_w == 1; + + const std::string first_prefix = broadcast_first_x ? "(" + data_type_str + ")" : ""; + const std::string second_prefix = broadcast_second_x ? "(" + data_type_str + ")" : ""; + const std::string third_prefix = broadcast_third_x ? "(" + data_type_str + ")" : ""; + + CKW_ASSERT_MSG(op_is_func, "The only supported ternary operator is function."); + CKW_ASSERT_MSG(second_view.data_type() == dst_view.data_type(), "2nd source and destination type must match."); + CKW_ASSERT_MSG(third_view.data_type() == dst_view.data_type(), "3rd source and destination type must match."); + + CKW_ASSERT_MSG(first_view.height() == dst_h || first_view.height() == 1, + "1st tile height must match or source is broadcasting in y dimension."); + CKW_ASSERT_MSG(second_view.height() == dst_h || second_view.height() == 1, + "2nd tile height must match or source is broadcasting in y dimension."); + CKW_ASSERT_MSG(third_view.height() == dst_h || third_view.height() == 1, + "3rd tile height must match or source is broadcasting in y dimension."); + + CKW_ASSERT_MSG(first_w == dst_w || first_w == 1, + "1st tile width must match or source is broadcasting in x dimension."); + CKW_ASSERT_MSG(second_w == dst_w || second_w == 1, + "2nd tile width must match or source is broadcasting in x dimension."); + CKW_ASSERT_MSG(third_w == dst_w || third_w == 1, + "3rd tile width must match or source is broadcasting in x dimension."); + + // Broadcasting on y dimension is automatic (see CLTile::vector). 
+ for (int32_t y = 0; y < dst_h; ++y) + { + append_code(dst_view.vector(y).str, " = ", op_name, "(", first_prefix, first_view.vector(y).str, ", ", + second_prefix, second_view.vector(y).str, ", ", third_prefix, third_view.vector(y).str, ");\n"); + } +} + +void CLKernelWriter::op_if_generic( + const TileOperand &lhs, BinaryOp op, const TileOperand &rhs, const std::function &body, bool is_else_if) +{ + const auto lhs_view = to_cl_tile_view(lhs); + const auto rhs_view = to_cl_tile_view(rhs); + + const auto op_name = std::get<1>(cl_get_binary_op(op, lhs_view.data_type())); + CKW_ASSERT(op == BinaryOp::Less || op == BinaryOp::LessEqual || op == BinaryOp::Equal || + op == BinaryOp::GreaterEqual || op == BinaryOp::Greater); + + CKW_ASSERT(lhs_view.is_scalar()); + CKW_ASSERT(rhs_view.is_scalar()); + + if (is_else_if) + { + append_code("else "); + } + + append_code("if (", lhs_view.scalar(0, 0).str, " ", op_name, " ", rhs_view.scalar(0, 0).str, ")\n{\n"); + write_body(body); + append_code("}\n"); +} + +void CLKernelWriter::op_if(const TileOperand &lhs, + BinaryOp op, + const TileOperand &rhs, + const std::function &body) +{ + op_if_generic(lhs, op, rhs, body, false /* is_else_if */); +} + +void CLKernelWriter::op_else_if(const TileOperand &lhs, + BinaryOp op, + const TileOperand &rhs, + const std::function &body) +{ + op_if_generic(lhs, op, rhs, body, true /* is_else_if */); +} + +void CLKernelWriter::op_else(const std::function &body) +{ + append_code("else\n{\n"); + write_body(body); + append_code("}\n"); +} + +void CLKernelWriter::op_for_loop(const TileOperand &var, + BinaryOp cond_op, + const TileOperand &cond_value, + const TileOperand &update_var, + AssignmentOp update_op, + const TileOperand &update_value, + const std::function &body) +{ + const auto var_view = to_cl_tile_view(var); + const auto cond_value_view = to_cl_tile_view(cond_value); + const auto update_var_view = to_cl_tile_view(update_var); + const auto update_value_view = to_cl_tile_view(update_value); + + CKW_ASSERT(var_view.is_scalar()); + CKW_ASSERT(cond_value_view.is_scalar()); + CKW_ASSERT(update_var_view.is_scalar()); + CKW_ASSERT(update_value_view.is_scalar()); + + CKW_ASSERT(var_view.data_type() == cond_value_view.data_type()); + CKW_ASSERT(update_var_view.data_type() == update_value_view.data_type()); + + const auto cond_op_name = std::get<1>(cl_get_binary_op(cond_op, var_view.data_type())); + CKW_ASSERT(cond_op == BinaryOp::Less || cond_op == BinaryOp::LessEqual || cond_op == BinaryOp::Equal || + cond_op == BinaryOp::GreaterEqual || cond_op == BinaryOp::Greater); + + append_code("for (; ", var_view.scalar(0, 0).str, " ", cond_op_name, " ", cond_value_view.scalar(0, 0).str, "; ", + update_var_view.scalar(0, 0).str, " ", cl_get_assignment_op_as_string(update_op), " ", + update_value_view.scalar(0, 0).str, ")\n{\n"); + write_body(body); + append_code("}\n"); +} + +void CLKernelWriter::op_return() +{ + append_code("return;\n"); +} + +void CLKernelWriter::op_get_global_id(const TileOperand &dst, int32_t dim) +{ + const auto tile_view = to_cl_tile_view(dst); + + CKW_ASSERT(tile_view.is_scalar()); + CKW_ASSERT(tile_view.data_type() == DataType::Int32 || tile_view.data_type() == DataType::Uint32); + + CKW_ASSERT(dim >= 0 && dim <= 2); + + append_code(tile_view.scalar(0, 0).str, " = get_global_id(", std::to_string(dim), ");\n"); +} + +void CLKernelWriter::op_print(const std::string &prefix, const std::vector &operands) +{ + std::string format_code; + std::string args_code; + + for (auto &op : operands) + { + const auto tile_view 
= to_cl_tile_view(op); + + const auto name = tile_view.name(); + const auto width = tile_view.width(); + const auto height = tile_view.height(); + const auto data_type = tile_view.data_type(); + + // Construct the format specifier to print out one row of the tile. + std::string row_format("%"); + + if (width > 1) + { + row_format += "v" + std::to_string(width); + } + + switch (data_type) + { + case DataType::Fp32: + row_format += "hlg"; + break; + case DataType::Fp16: + row_format += "hg"; + break; + case DataType::Int32: + case DataType::Bool: + row_format += (width > 1) ? "hli" : "i"; + break; + case DataType::Int16: + row_format += "hi"; + break; + case DataType::Int8: + row_format += "hhi"; + break; + case DataType::Uint32: + row_format += (width > 1) ? "hlu" : "u"; + break; + case DataType::Uint16: + row_format += "hu"; + break; + case DataType::Uint8: + row_format += "hhu"; + break; + default: + CKW_THROW_MSG("Unsupported data type!"); + } + + if (width > 1) + { + row_format = "[" + row_format + "]"; + } + + // Construct the format specifier for the printf statement. + format_code += name + " = "; + + if (height == 1) + { + format_code += row_format; + } + else + { + format_code += "[" + row_format; + for (int32_t row = 1; row < height; ++row) + { + format_code += ", " + row_format; + } + format_code += "]"; + } + + format_code += "\\n"; + + // Construct the variable arguments for the printf statement. + for (int32_t row = 0; row < height; ++row) + { + args_code += ", " + tile_view.vector(row).str; + } + } + + append_code("printf(\"", prefix, "\\n", format_code, "\"", args_code, ");\n"); } -void CLKernelWriter::comment(const std::string &text) +void CLKernelWriter::op_comment(const std::string &text) { #ifdef COMPUTE_KERNEL_WRITER_DEBUG_ENABLED @@ -84,13 +550,19 @@ TileOperand CLKernelWriter::declare_tile(const std::string &name, const TileInfo const int32_t width = tile_info.width(); const DataType data_type = tile_info.data_type(); - for(int32_t row = 0; row < height; ++row) + CKW_ASSERT_MSG(std::find_if(_tiles.begin(), _tiles.end(), + [=](const std::unique_ptr &e) + { return e->name() == fullname; }) == _tiles.end(), + "There is already a tile with name: " + fullname); + + auto tile = std::make_unique(fullname, tile_info); + + for (int32_t row = 0; row < height; ++row) { const std::string cl_type = cl_get_variable_datatype_as_string(data_type, width); - append_code(cl_type, " ", fullname, std::to_string(row), ";\n"); + append_code(cl_type, " ", tile->vector(row).str, ";\n"); } - auto tile = std::make_unique(name, tile_info); const auto operand = create_tile_operand(*tile); _tiles.insert(std::move(tile)); @@ -98,9 +570,231 @@ TileOperand CLKernelWriter::declare_tile(const std::string &name, const TileInfo return operand; } +TileOperand CLKernelWriter::declare_constant_tile(const ConstantData &data) +{ + auto tile = std::make_unique(get_values(data), get_data_type(data)); + const TileOperand operand = create_tile_operand(*tile); + _constant_tiles.insert(std::move(tile)); + + return operand; +} + void CLKernelWriter::op_write_raw_code(const std::string &raw_code) { append_code(raw_code); } +TileView CLKernelWriter::to_cl_tile_view(const TileOperand &operand) const +{ + const auto tile_and_area = get_tile(operand); + ITile &tile = std::get<0>(tile_and_area); + const TileArea area = std::get<1>(tile_and_area); + +#ifdef COMPUTE_KERNEL_WRITER_ASSERTS_ENABLED + // Check if the tile is a CLTile created by this kernel writer. 
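As a worked example of the op_print() format construction above, a 2x4 Fp32 tile yields the row format %v4hlg, wrapped per row in brackets, giving a format chunk such as G0__acc = [[%v4hlg], [%v4hlg]]\n (the tile name here is an assumption). The sketch below rebuilds just the Fp32 branch of that construction:

#include <iostream>
#include <string>

// Rebuilds the printf format op_print() produces for a float tile
// (illustration only; only the Fp32 branch is reproduced).
std::string print_format(const std::string &name, int height, int width)
{
    std::string row = "%";
    if (width > 1)
        row += "v" + std::to_string(width); // OpenCL vector conversion, e.g. %v4hlg
    row += "hlg";                           // Fp32 length modifier + conversion
    if (width > 1)
        row = "[" + row + "]";

    std::string fmt = name + " = ";
    if (height == 1)
    {
        fmt += row;
    }
    else
    {
        fmt += "[" + row;
        for (int r = 1; r < height; ++r)
            fmt += ", " + row;
        fmt += "]";
    }
    return fmt + "\\n";
}

int main()
{
    std::cout << print_format("G0__acc", 2, 4) << "\n"; // G0__acc = [[%v4hlg], [%v4hlg]]\n
}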
+ + { + bool found = false; + + for (const auto &t : _tiles) + { + if (&tile == t.get()) + { + found = true; + break; + } + } + + for (const auto &t : _constant_tiles) + { + if (&tile == t.get()) + { + found = true; + break; + } + } + + if (!found) + { + for (const auto &t : _tensors) + { + const auto components = t->components(); + + for (const auto component : components) + { + if (&tile == &component->tile()) + { + found = true; + break; + } + } + + if (found) + { + break; + } + } + } + + CKW_ASSERT_MSG(found, "The tile is not found!"); + } +#endif // COMPUTE_KERNEL_WRITER_ASSERTS_ENABLED + + return {static_cast(tile), area}; +} + +void CLKernelWriter::op_load(const TileOperand &tile_op, + const TensorOperand &tensor_op, + TensorSampler &sampler, + const TileOperand &x, + const TileOperand &y, + const TileOperand &z, + const TileOperand &batch) +{ + const CLTile dilation_x({{"1"}}, DataType::Int32); + const CLTile dilation_y({{"1"}}, DataType::Int32); + + op_load_store(MemoryOperation::Load, tile_op, tensor_op, sampler, x, y, z, batch, dilation_x, dilation_y, + false /* indirect buffer */); +} + +void CLKernelWriter::op_load_dilated(const TileOperand &tile_op, + const TensorOperand &tensor_op, + TensorSampler &sampler, + const TileOperand &x, + const TileOperand &y, + const TileOperand &z, + const TileOperand &batch, + const TileOperand &dilation_x, + const TileOperand &dilation_y) +{ + const auto dil_x_view = to_cl_tile_view(dilation_x); + const auto dil_y_view = to_cl_tile_view(dilation_y); + + op_load_store(MemoryOperation::Load, tile_op, tensor_op, sampler, x, y, z, batch, dil_x_view, dil_y_view, + false /* indirect buffer */); +} + +void CLKernelWriter::op_store(const TensorOperand &tensor_op, + const TileOperand &tile_op, + TensorSampler &sampler, + const TileOperand &x, + const TileOperand &y, + const TileOperand &z, + const TileOperand &batch) +{ + const CLTile dilation_x({{"1"}}, DataType::Int32); + const CLTile dilation_y({{"1"}}, DataType::Int32); + + op_load_store(MemoryOperation::Store, tile_op, tensor_op, sampler, x, y, z, batch, dilation_x, dilation_y, + false /* indirect buffer */); +} + +void CLKernelWriter::op_store_dilated(const TensorOperand &tensor_op, + const TileOperand &tile_op, + TensorSampler &sampler, + const TileOperand &x, + const TileOperand &y, + const TileOperand &z, + const TileOperand &batch, + const TileOperand &dilation_x, + const TileOperand &dilation_y) +{ + const auto dil_x_view = to_cl_tile_view(dilation_x); + const auto dil_y_view = to_cl_tile_view(dilation_y); + + op_load_store(MemoryOperation::Store, tile_op, tensor_op, sampler, x, y, z, batch, dil_x_view, dil_y_view, + false /* indirect buffer */); +} + +void CLKernelWriter::op_load_indirect(const TileOperand &tile_op, + const TensorOperand &tensor_op, + TensorSampler &sampler, + const TileOperand &x, + const TileOperand &y, + const TileOperand &z, + const TileOperand &batch) +{ + const CLTile dilation_x({{"1"}}, DataType::Int32); + const CLTile dilation_y({{"1"}}, DataType::Int32); + + op_load_store(MemoryOperation::Load, tile_op, tensor_op, sampler, x, y, z, batch, dilation_x, dilation_y, + true /* indirect buffer */); +} + +void CLKernelWriter::op_load_store(MemoryOperation op, + const TileOperand &tile_op, + const TensorOperand &tensor_op, + TensorSampler &sampler, + const TileOperand &x, + const TileOperand &y, + const TileOperand &z, + const TileOperand &batch, + const TileView &dilation_x, + const TileView &dilation_y, + bool indirect_buffer) +{ + CKW_UNUSED(dilation_x); + 
CKW_ASSERT(dilation_x.is_scalar()); + CKW_ASSERT(dilation_y.is_scalar()); + CKW_ASSERT(dilation_x.scalar(0, 0).str == "((int)(1))"); // Dilation in x dimension is not implemented yet + + if (indirect_buffer) + { + CKW_ASSERT(dilation_y.scalar(0, 0).str == "((int)(1))" && dilation_x.scalar(0, 0).str == "((int)(1))"); + } + + ITensor &tensor = get_tensor(tensor_op); + + std::unique_ptr helper; + switch (sampler.storage()) + { + case TensorStorageType::BufferUint8Ptr: + helper = std::make_unique(this, &tensor, &sampler, op); + break; + case TensorStorageType::Texture2dReadOnly: + case TensorStorageType::Texture2dWriteOnly: + helper = std::make_unique(this, &tensor, &sampler, op); + break; + default: + CKW_THROW_MSG("Unsupported tensor storage"); + } + + // Load/store op doesn't support sub-tile access. + const auto tile = to_cl_tile_view(tile_op).full_tile(); + const auto x_tile = to_cl_tile_view(x).full_tile(); + const auto y_tile = to_cl_tile_view(y).full_tile(); + const auto z_tile = to_cl_tile_view(z).full_tile(); + const auto batch_tile = to_cl_tile_view(batch).full_tile(); + + CKW_ASSERT(x_tile.is_scalar()); + CKW_ASSERT(z_tile.is_scalar()); + CKW_ASSERT_IF(indirect_buffer, y_tile.info().width() == 1); + CKW_ASSERT_IF(!indirect_buffer, y_tile.is_scalar()); + CKW_ASSERT(batch_tile.is_scalar()); + + helper->initialize(&tile, &x_tile, &z_tile, &batch_tile); + + for (int row = 0; row < tile.info().height(); ++row) + { + if (!indirect_buffer) + { + std::string coord_y = y_tile.scalar(0, 0).str + " + " + std::to_string(row); + + if (dilation_y.scalar(0, 0).str != "((int)(1))") + { + coord_y += " * " + dilation_y.scalar(0, 0).str; + } + + helper->write_row(row, coord_y); + } + else + { + helper->write_row(row, y_tile.scalar(row, 0).str); + } + } + + helper->finalize(); +} + } // namespace ckw diff --git a/compute_kernel_writer/src/cl/CLKernelWriter.h b/compute_kernel_writer/src/cl/CLKernelWriter.h index 5df148da7b48aa70f5371a5b95aea7b73eb1d2bd..6485bae512a9ace1882971791912a6f14f63bff1 100644 --- a/compute_kernel_writer/src/cl/CLKernelWriter.h +++ b/compute_kernel_writer/src/cl/CLKernelWriter.h @@ -27,15 +27,26 @@ #include "ckw/KernelWriter.h" +#include "src/TileView.h" + #include #include +#include #include namespace ckw { +// Forward Declarations class CLTile; class CLTensorArgument; +class ConstantData; +class TensorOperand; +class TensorSampler; +class TileOperand; + +enum class DataType; +enum class MemoryOperation; /** OpenCL kernel writer. 
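In the non-indirect path of op_load_store() above, each row's y coordinate is the scalar y plus the row index, scaled by the dilation only when it differs from the literal ((int)(1)). A sketch of that string construction; row_coord_y and g_y are invented names:

#include <iostream>
#include <string>

std::string row_coord_y(const std::string &y, int row, const std::string &dilation_y)
{
    std::string coord = y + " + " + std::to_string(row);
    if (dilation_y != "((int)(1))")
        coord += " * " + dilation_y; // row index scaled by the dilation
    return coord;
}

int main()
{
    std::cout << row_coord_y("g_y", 2, "((int)(1))") << "\n"; // g_y + 2
    std::cout << row_coord_y("g_y", 2, "((int)(3))") << "\n"; // g_y + 2 * ((int)(3))
}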
*/ class CLKernelWriter : public KernelWriter @@ -51,16 +62,57 @@ public: /** Destructor */ ~CLKernelWriter(); + // ============================================================================================= + // Data processing + // ============================================================================================= + + void op_assign(const TileOperand &dst, const TileOperand &src) override; + + void op_cast(const TileOperand &dst, const TileOperand &src, ConvertPolicy policy) override; + + void op_unary(const TileOperand &dst, UnaryOp op, const TileOperand &src) override; + + void op_binary(const TileOperand &dst, BinaryOp op, const TileOperand &first, const TileOperand &second) override; + + void op_ternary(const TileOperand &dst, + TernaryOp op, + const TileOperand &first, + const TileOperand &second, + const TileOperand &third) override; + + // ============================================================================================= + // Flow control + // ============================================================================================= + + void op_if(const TileOperand &lhs, BinaryOp op, const TileOperand &rhs, const std::function &body) override; + + void + op_else_if(const TileOperand &lhs, BinaryOp op, const TileOperand &rhs, const std::function &body) override; + + void op_else(const std::function &body) override; + + void op_for_loop(const TileOperand &var, + BinaryOp cond_op, + const TileOperand &cond_value, + const TileOperand &update_var, + AssignmentOp update_op, + const TileOperand &update_value, + const std::function &body) override; + + void op_return() override; + // ============================================================================================= // Misc // ============================================================================================= - /** Similar to @ref KernelWriter::comment() */ - void comment(const std::string &text) override; + void op_get_global_id(const TileOperand &dst, int32_t dim) override; + + void op_comment(const std::string &text) override; - /** Similar to @ref KernelWriter::op_write_raw_code() */ void op_write_raw_code(const std::string &raw_code) override; + void op_print(const std::string &prefix, const std::vector &operands) override; + // ============================================================================================= // Code generation // ============================================================================================= @@ -76,10 +128,70 @@ public: /** Declare a tile given name and tile information * * Similar to @ref KernelWriter::declare_tile() - */ + */ TileOperand declare_tile(const std::string &name, const TileInfo &tile_info) override; + /** Declare a constant tile given a @ref:ConstantData object + * + * Similar to @ref KernelWriter::declare_constant_tile() + */ + TileOperand declare_constant_tile(const ConstantData &data) override; + + // ============================================================================================= + // Memory Operations + // ============================================================================================= + + void op_load(const TileOperand &tile_op, + const TensorOperand &tensor_op, + TensorSampler &sampler, + const TileOperand &x, + const TileOperand &y, + const TileOperand &z, + const TileOperand &batch) override; + + void op_load_dilated(const TileOperand &tile_op, + const TensorOperand &tensor_op, + TensorSampler &sampler, + const TileOperand &x, + const TileOperand &y, + const TileOperand &z, + const TileOperand 
&batch, + const TileOperand &dilation_x, + const TileOperand &dilation_y) override; + + void op_store(const TensorOperand &tensor_op, + const TileOperand &tile_op, + TensorSampler &sampler, + const TileOperand &x, + const TileOperand &y, + const TileOperand &z, + const TileOperand &batch) override; + + void op_store_dilated(const TensorOperand &tensor_op, + const TileOperand &tile_op, + TensorSampler &sampler, + const TileOperand &x, + const TileOperand &y, + const TileOperand &z, + const TileOperand &batch, + const TileOperand &dilation_x, + const TileOperand &dilation_y) override; + + void op_load_indirect(const TileOperand &tile_op, + const TensorOperand &tensor_op, + TensorSampler &sampler, + const TileOperand &x, + const TileOperand &y, + const TileOperand &z, + const TileOperand &batch) override; + protected: + /** Return a tile view containing a reference to @ref CLTile object and the active area. + * + * This function performs appropriate check before doing type casting. + */ + TileView to_cl_tile_view(const TileOperand &operand) const; + /** Append the specified code to the kernel body source code. */ template void append_code(T &&code, TArgs &&...args) @@ -98,6 +210,38 @@ protected: /** Get the current kernel body source code. */ const std::string &body_source_code() const; + // For helper functions +private: + /** Helper method to consolidate all load/store logic in this class */ + void op_load_store(MemoryOperation op, + const TileOperand &tile_op, + const TensorOperand &tensor_op, + TensorSampler &sampler, + const TileOperand &x, + const TileOperand &y, + const TileOperand &z, + const TileOperand &batch, + const TileView &dilation_x, + const TileView &dilation_y, + bool indirect_buffer); + + /** This function is the generic function to write both `if` and `else if` blocks. + * + * It is used for both @ref CLKernelWriter::op_if and @ref CLKernelWriter::op_else_if. + * + * @param[in] lhs The LHS tile of the condition. + * @param[in] op The relational binary operator. + * @param[in] rhs The RHS tile of the condition. + * @param[in] body The function that writes the body of the else-if block. + * @param[in] is_else_if True if this is an `else if` block, otherwise this is an `if` block. + */ + void op_if_generic(const TileOperand &lhs, + BinaryOp op, + const TileOperand &rhs, + const std::function &body, + bool is_else_if); + + // For attributes private: /** This string contains the kernel body source code, not the full CL source code. * The full source code will only be generated when the user calls @ref KernelWriter::emit_kernel. @@ -109,6 +253,7 @@ private: std::set> _tensors{}; std::set> _tiles{}; + std::set> _constant_tiles{}; }; } // namespace ckw diff --git a/compute_kernel_writer/src/cl/CLTensorArgument.cpp b/compute_kernel_writer/src/cl/CLTensorArgument.cpp index 7d4dc958df6d82477e696458addc1d19f1bd13ff..e53de2830d2ff5931b3b1c600a7cb6311050f8f0 100644 --- a/compute_kernel_writer/src/cl/CLTensorArgument.cpp +++ b/compute_kernel_writer/src/cl/CLTensorArgument.cpp @@ -23,11 +23,13 @@ */ #include "src/cl/CLTensorArgument.h" + #include "ckw/Error.h" -#include "src/ITensorArgument.h" -#include "src/ITensorComponent.h" + #include "src/cl/CLHelpers.h" #include "src/cl/CLTensorComponent.h" +#include "src/ITensorArgument.h" +#include "src/ITensorComponent.h" #include "src/types/TensorComponentType.h" #include @@ -48,25 +50,23 @@ CLTensorComponent &CLTensorArgument::cl_component(TensorComponentType x) { // Return the component if it has already been created. 
{ - const auto it = std::find_if( - _components_used.begin(), _components_used.end(), - [=](const std::unique_ptr &item) - { - return item->component_type() == x; - }); + const auto it = + std::find_if(_components_used.begin(), _components_used.end(), + [=](const std::unique_ptr &item) { return item->component_type() == x; }); - if(it != _components_used.end()) + if (it != _components_used.end()) { return **it; } } - if(_return_dims_by_value) + if (_return_dims_by_value) { uint32_t component_type = static_cast(x); - const bool is_dimension = (component_type & static_cast(TensorComponentBitmask::Dimension)) != 0; - const bool is_folded_dimensions = (component_type & static_cast(TensorComponentBitmask::FoldedDimensions)) != 0; + const bool is_dimension = (component_type & static_cast(TensorComponentBitmask::Dimension)) != 0; + const bool is_folded_dimensions = + (component_type & static_cast(TensorComponentBitmask::FoldedDimensions)) != 0; constexpr auto bitmask_all = static_cast(TensorComponentIndexBitmask::All); constexpr auto bitmask_index_0 = static_cast(TensorComponentIndexBitmask::Index0); @@ -83,16 +83,16 @@ CLTensorComponent &CLTensorArgument::cl_component(TensorComponentType x) CKW_ASSERT(bitmask_index_2 == bitmask_index_3 >> 4); // If we have a dimension or folded dimensions, we can return the corresponding value if it is not dynamic (not equal to -1) - if(is_dimension == true || is_folded_dimensions == true) + if (is_dimension == true || is_folded_dimensions == true) { component_type = component_type & bitmask_all; int32_t idx = 1; - for(int32_t i = 0; i < tensor_component_index_max_count; ++i) + for (int32_t i = 0; i < tensor_component_index_max_count; ++i) { uint32_t dim_idx = component_type & bitmask_index_0; - if(dim_idx == 0) + if (dim_idx == 0) { // Stop at the first nibble containing 0 break; @@ -104,7 +104,7 @@ CLTensorComponent &CLTensorArgument::cl_component(TensorComponentType x) // Get the dimension value const int32_t dim_val = _info.shape()[dim_idx]; - if(dim_val == kDynamicTensorDimensionValue) + if (dim_val == kDynamicTensorDimensionValue) { // We cannot return the dimension by value if it is dynamic. // Therefore, force the idx variable to kDynamicTensorDimensionValue and break the loop. @@ -118,7 +118,7 @@ CLTensorComponent &CLTensorArgument::cl_component(TensorComponentType x) component_type >>= 4; } - if(idx != kDynamicTensorDimensionValue) + if (idx != kDynamicTensorDimensionValue) { _components_used.emplace_back(std::make_unique(*this, x, idx)); @@ -141,14 +141,10 @@ TensorStorageVariable &CLTensorArgument::storage(TensorStorageType x) { // Return the storage if it has already been created. 
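The folded-dimension path above decodes dimension indices packed four bits at a time (stored as index + 1, with 0 terminating the list) and multiplies the corresponding shape values, giving up on the by-value shortcut as soon as a dynamic dimension is hit. A standalone sketch of that decoding; folded_value and the 0x32 encoding are invented for the example and are not the real TensorComponentType values:

#include <cstdint>
#include <iostream>
#include <vector>

int32_t folded_value(uint32_t packed_indices, const std::vector<int32_t> &shape)
{
    int32_t value = 1;
    while ((packed_indices & 0xF) != 0)
    {
        const uint32_t dim_idx = (packed_indices & 0xF) - 1; // nibble stores index + 1
        if (shape[dim_idx] < 0)
            return -1; // dynamic dimension: cannot be returned by value
        value *= shape[dim_idx];
        packed_indices >>= 4;
    }
    return value;
}

int main()
{
    // Fold dimensions 1 and 2 of shape [8, 16, 4, 2]: 16 * 4 = 64.
    std::cout << folded_value(0x32u, {8, 16, 4, 2}) << "\n";
}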
{ - const auto it = std::find_if( - _storages_used.begin(), _storages_used.end(), - [=](const TensorStorageVariable &item) - { - return item.type == x; - }); + const auto it = std::find_if(_storages_used.begin(), _storages_used.end(), + [=](const TensorStorageVariable &item) { return item.type == x; }); - if(it != _storages_used.end()) + if (it != _storages_used.end()) { return *it; } @@ -167,7 +163,7 @@ std::string CLTensorArgument::create_storage_name(TensorStorageType x) const { std::string var_name = _basename; - switch(x) + switch (x) { case TensorStorageType::BufferUint8Ptr: var_name += "_ptr"; @@ -198,9 +194,9 @@ std::vector CLTensorArgument::components() const { std::vector components; - for(const auto &component : _components_used) + for (const auto &component : _components_used) { - if(component->is_assignable()) + if (component->is_assignable()) { components.push_back(component.get()); } diff --git a/compute_kernel_writer/src/cl/CLTensorArgument.h b/compute_kernel_writer/src/cl/CLTensorArgument.h index 4cbbee21ee107d3ad902fe2aede0b27f213e7997..35df51422ecf4963808884e0b92e0db538da3359 100644 --- a/compute_kernel_writer/src/cl/CLTensorArgument.h +++ b/compute_kernel_writer/src/cl/CLTensorArgument.h @@ -26,7 +26,9 @@ #include "ckw/types/TensorComponentType.h" #include "ckw/types/TensorStorageType.h" + #include "src/ITensor.h" + #include #include #include @@ -67,7 +69,7 @@ public: * unlike @ref CLTensorComponent::component which is for the public API and only returns * a reference to a generic @ref ITile object. */ - CLTensorComponent& cl_component(TensorComponentType component_type); + CLTensorComponent &cl_component(TensorComponentType component_type); // Inherited method overridden TensorStorageVariable &storage(TensorStorageType x) override; @@ -78,7 +80,7 @@ public: private: std::string create_storage_name(TensorStorageType x) const; - bool _return_dims_by_value{ false }; + bool _return_dims_by_value{false}; std::vector _storages_used{}; std::vector> _components_used{}; }; diff --git a/compute_kernel_writer/src/cl/CLTensorComponent.cpp b/compute_kernel_writer/src/cl/CLTensorComponent.cpp index c29b307748e3c2554bef71416b4d486b60198687..dbe2036768a8e5b68a79668261efa220bbe55120 100644 --- a/compute_kernel_writer/src/cl/CLTensorComponent.cpp +++ b/compute_kernel_writer/src/cl/CLTensorComponent.cpp @@ -23,8 +23,10 @@ */ #include "src/cl/CLTensorComponent.h" + #include "ckw/Error.h" #include "ckw/types/TensorComponentType.h" + #include "src/cl/CLTensorArgument.h" #include "src/cl/CLTile.h" @@ -38,7 +40,7 @@ std::string create_component_name(const std::string &name, TensorComponentType x { std::string var_name(name); - switch(x) + switch (x) { case TensorComponentType::OffsetFirstElement: var_name += "_offset_first_element"; @@ -93,12 +95,13 @@ std::string create_component_name(const std::string &name, TensorComponentType x } // namespace CLTensorComponent::CLTensorComponent(const CLTensorArgument &tensor, TensorComponentType component_type) - : CLTile(create_component_name(tensor.name(), component_type), TileInfo(DataType::Int32)), _component_type(component_type) + : CLTile(create_component_name(tensor.name(), component_type), TileInfo(DataType::Int32)), + _component_type(component_type) { } CLTensorComponent::CLTensorComponent(const CLTensorArgument &tensor, TensorComponentType component_type, int32_t value) - : CLTile({ { std::to_string(value) } }, DataType::Int32), _component_type(component_type) + : CLTile({{std::to_string(value)}}, DataType::Int32), 
_component_type(component_type) { CKW_UNUSED(tensor); } diff --git a/compute_kernel_writer/src/cl/CLTensorComponent.h b/compute_kernel_writer/src/cl/CLTensorComponent.h index 42a42666dc37666628c4b1ee5523b1b96af4f219..731597ebbf4d59f6ed91cfe1c0fec34e7f9effd9 100644 --- a/compute_kernel_writer/src/cl/CLTensorComponent.h +++ b/compute_kernel_writer/src/cl/CLTensorComponent.h @@ -26,8 +26,9 @@ #define CKW_SRC_CL_CLTENSORCOMPONENT_H #include "ckw/types/TensorComponentType.h" -#include "src/ITensorComponent.h" + #include "src/cl/CLTile.h" +#include "src/ITensorComponent.h" namespace ckw { @@ -72,7 +73,7 @@ public: TensorComponentType component_type() const override; private: - TensorComponentType _component_type{ TensorComponentType::Unknown }; + TensorComponentType _component_type{TensorComponentType::Unknown}; }; } // namespace ckw diff --git a/compute_kernel_writer/src/cl/CLTile.cpp b/compute_kernel_writer/src/cl/CLTile.cpp index 013ac4c2768b940cf20a8f864fc7808fea9cd5c7..f6e271e8139b14a569faec7ab8caa8d6f59388da 100644 --- a/compute_kernel_writer/src/cl/CLTile.cpp +++ b/compute_kernel_writer/src/cl/CLTile.cpp @@ -21,20 +21,20 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#include "src/cl/CLTile.h" + #include "ckw/Error.h" #include "ckw/TileInfo.h" -#include "src/Helpers.h" #include "src/cl/CLHelpers.h" -#include "src/cl/CLTile.h" +#include "src/Helpers.h" #include #include namespace ckw { -CLTile::CLTile(const std::string &name, const TileInfo &info) - : _is_constant(false) +CLTile::CLTile(const std::string &name, const TileInfo &info) : _is_constant(false) { validate_tile_info(info); @@ -42,8 +42,7 @@ CLTile::CLTile(const std::string &name, const TileInfo &info) _info = info; } -CLTile::CLTile(const TileContainer &vals, DataType dt) - : _is_constant(true) +CLTile::CLTile(const TileContainer &vals, DataType dt) : _is_constant(true) { const int32_t w = vals[0].size(); const int32_t h = vals.size(); @@ -56,9 +55,9 @@ CLTile::CLTile(const TileContainer &vals, DataType dt) _vals = TileContainer(h, std::vector(w)); - for(int32_t y = 0; y < h; ++y) + for (int32_t y = 0; y < h; ++y) { - for(int32_t x = 0; x < w; ++x) + for (int32_t x = 0; x < w; ++x) { _vals[y][x] = vals[y][x]; } @@ -81,7 +80,7 @@ TileVariable CLTile::scalar(int32_t row, int32_t col) const col = clamp(col, static_cast(0), _info.width() - 1); row = clamp(row, static_cast(0), _info.height() - 1); - if(_is_constant) + if (_is_constant) { // We can use the vector method to retrieve the scalar variable stored in the constant tile return vector(row, col, 1); @@ -94,7 +93,7 @@ TileVariable CLTile::scalar(int32_t row, int32_t col) const t.desc.len = 1; // This check is required because if the width has only one element, we cannot use .s0 - if(_info.width() != 1) + if (_info.width() != 1) { // Automatic broadcasting t.str += ".s" + dec_to_hex_as_string(col); @@ -109,7 +108,7 @@ TileVariable CLTile::vector(int32_t row) const // Clamp to nearest valid edge row = clamp(row, static_cast(0), _info.height() - 1); - if(_is_constant) + if (_is_constant) { return vector(row, 0, _info.width()); } @@ -125,6 +124,9 @@ TileVariable CLTile::vector(int32_t row) const TileVariable CLTile::vector(int32_t row, int32_t col_start, int32_t width) const { + CKW_ASSERT(col_start >= 0 && col_start < _info.width()); + CKW_ASSERT(col_start + width <= _info.width()); + // Validate the new vector length cl_validate_vector_length(width); @@ -135,14 +137,14 @@ TileVariable CLTile::vector(int32_t row, int32_t col_start, 
int32_t width) const t.desc.dt = _info.data_type(); t.desc.len = width; - if(_is_constant) + if (_is_constant) { // The vector has the following form: ((data_typeN)(val0, val1,..., ValN-1)) t.str = "((" + cl_get_variable_datatype_as_string(t.desc.dt, width) + ")"; t.str += "("; int32_t col = col_start; - for(; col < width - 1; ++col) + for (; col < width - 1; ++col) { t.str += _vals[row][col]; t.str += ", "; @@ -154,10 +156,10 @@ TileVariable CLTile::vector(int32_t row, int32_t col_start, int32_t width) const { t.str = create_var_name(row); - if(_info.width() != 1) + if (_info.width() != 1 && _info.width() != width) { t.str += ".s"; - for(int i = 0; i < width; ++i) + for (int i = 0; i < width; ++i) { t.str += dec_to_hex_as_string(col_start + i); } @@ -171,11 +173,11 @@ std::vector CLTile::all() const { std::vector vars; - if(_is_constant) + if (_is_constant) { - for(int32_t y = 0; y < _info.height(); ++y) + for (int32_t y = 0; y < _info.height(); ++y) { - for(int32_t x = 0; x < _info.width(); ++x) + for (int32_t x = 0; x < _info.width(); ++x) { // We can use the vector method to retrieve all the scalar variables stored in the constant tile TileVariable t = vector(y, x, 1); @@ -185,7 +187,7 @@ std::vector CLTile::all() const } else { - for(int32_t y = 0; y < _info.height(); ++y) + for (int32_t y = 0; y < _info.height(); ++y) { TileVariable t; t.str = create_var_name(y); @@ -208,9 +210,9 @@ std::string CLTile::create_var_name(int32_t row) const std::string var_name = _basename; // If a scalar variable, we do not append the row index - if(_info.height() > 1) + if (_info.height() > 1) { - var_name += "_"; + var_name += "__"; var_name += std::to_string(row); } @@ -219,7 +221,7 @@ std::string CLTile::create_var_name(int32_t row) const std::vector CLTile::supported_vector_lengths() const { - return std::vector{ 1, 2, 3, 4, 8, 16 }; + return std::vector{1, 2, 3, 4, 8, 16}; } void CLTile::validate_tile_info(const TileInfo &info) const @@ -229,4 +231,4 @@ void CLTile::validate_tile_info(const TileInfo &info) const CKW_ASSERT_MSG(info.data_type() != DataType::Unknown, "DataType::Unknown is not supported"); } -} // namespace ckw \ No newline at end of file +} // namespace ckw diff --git a/compute_kernel_writer/src/cl/CLTile.h b/compute_kernel_writer/src/cl/CLTile.h index 46af4de364979f70303f1a20d24a1a3105c3168a..498cf51034a568f84d7bffd8fdc8047cdc696a97 100644 --- a/compute_kernel_writer/src/cl/CLTile.h +++ b/compute_kernel_writer/src/cl/CLTile.h @@ -25,6 +25,7 @@ #define COMPUTE_KERNEL_WRITER_SRC_CL_CLTILE_H #include "src/ITile.h" + #include namespace ckw @@ -33,7 +34,7 @@ namespace ckw class TileInfo; /** OpenCL specific tile */ -class CLTile : public ITile, public IVectorAccess, public IScalarAccess +class CLTile : public ITile, public IVectorAccess { public: /** Initialize a new instance of @ref CLTile class for variable tile. 
@@ -75,9 +76,9 @@ private: std::string create_var_name(int32_t row) const; - TileInfo _info{ DataType::Unknown }; - std::string _basename{ "" }; - bool _is_constant{ false }; + TileInfo _info{DataType::Unknown}; + std::string _basename{""}; + bool _is_constant{false}; TileContainer _vals{}; }; } // namespace ckw diff --git a/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.cpp b/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a98ebed8faeb9f907de63563748d08503d4c946f --- /dev/null +++ b/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.cpp @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cl/helpers/CLMemoryOpBufferHelper.h" + +#include "ckw/Error.h" +#include "ckw/TensorSampler.h" +#include "ckw/types/MemoryOperation.h" +#include "ckw/types/TensorStorageType.h" + +#include "src/cl/CLHelpers.h" +#include "src/cl/CLKernelWriter.h" +#include "src/cl/CLTensorArgument.h" +#include "src/cl/CLTile.h" +#include "src/ITensor.h" +#include "src/Tensor3dMapper.h" + +namespace ckw +{ +bool CLMemoryOpBufferHelper::validate(const CLKernelWriter *writer, + const ITensor *tensor, + const TensorSampler *sampler, + const Tensor3dMapper *mapper, + MemoryOperation op, + const CLTile *dst) +{ + CKW_UNUSED(writer, tensor, mapper, op, dst); + + if (sampler->storage() != TensorStorageType::BufferUint8Ptr) + { + return false; + } + return true; +} + +/** Initialization and Finalizing Logic + * + * The meanings of if/elses in different dimensions and how they're constructed: + * - x: partial load/store + * - y: no load/store operation + * - z: no load/store operation + * if(x) + * { + * if(z) + * { + * if(y) + * { + * // full load/store width + * } + * else + * { + * // no load/store + * } + * } + * else + * { + * // no load/store + * } + * } + * else + * { + * if(z) + * { + * if(y) + * { + * // partial load/store width + * } + * else + * { + * // no load/store + * } + * } + * else + * { + * // no load/store + * } + * } + * + * In general, initialize() writes if conditions, and finalize() writes else conditions. + * The outermost block is x, then z and then y. This is why, if/else's covering for y are initialized + * at each row write. In some addressing modes, such as None, no if/else conditions are written. 
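+ *
+ * As an illustrative sketch only (the tile, pointer and stride names below are placeholders,
+ * and the exact partial widths are whatever cl_decompose_vector_width() returns for the
+ * leftover), a 4-wide Fp32 Load with TensorSamplerAddressModeX::OverlappingMin and
+ * TensorSamplerAddressModeY::None could be emitted with a shape similar to:
+ *
+ * if(x > 0)
+ * {
+ *     dst__0 = vload4(0, (__global float*)(ptr + (x) * sizeof(float) + (y) * stride_y));
+ * }
+ * else
+ * {
+ *     dst__0.s012 = vload3(0, (__global float*)(ptr + (x + 0) * sizeof(float) + (y) * stride_y));
+ * }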
+ */ +void CLMemoryOpBufferHelper::initialize(const CLTile *dst, const CLTile *x, const CLTile *z, const CLTile *b) +{ + _dst = dst; + + CKW_ASSERT(validate(_writer, _tensor, _sampler, _mapper.get(), _op, _dst)); + + _ls_width_full = dst->info().width(); + _coord_x = x->scalar(0, 0).str; + _coord_z = z->scalar(0, 0).str; + _coord_b = b->scalar(0, 0).str; + _coord_orig_z = _coord_z; + + out_of_bound_initialize_x(_coord_x); + out_of_bound_initialize_z(_coord_z); +} + +void CLMemoryOpBufferHelper::write_row(int32_t row_id, const std::string &coord_y) +{ + // The only check required is on Y. + out_of_bound_initialize_y(coord_y); + + const std::string dst = _dst->vector(row_id).str; + const std::string address = to_buffer_address(_coord_x, coord_y, _coord_z, _coord_b); + const std::string ls_buf = to_statement(_op, _ls_width_full, dst, address); + + _writer->op_write_raw_code(ls_buf); + _writer->op_write_raw_code(";\n"); + + out_of_bound_finalize_y(dst); + + // The leftover load/store will be written in the finalize stage + if (_ls_width_part.size() != 0) + { + int32_t col_start = 0; + for (int32_t partial_width : _ls_width_part) + { + const std::string dst = _dst->vector(row_id, col_start, partial_width).str; + const std::string coord_x = _coord_x + " + " + std::to_string(col_start); + const std::string address = to_buffer_address(coord_x, coord_y, _coord_z, _coord_b); + const std::string statement = to_statement(_op, partial_width, dst, address); + _leftovers_x.emplace_back(dst, coord_y, statement); + + col_start += partial_width; + } + } +} + +void CLMemoryOpBufferHelper::finalize() +{ + out_of_bound_finalize_z(); + out_of_bound_finalize_x(); +} + +void CLMemoryOpBufferHelper::out_of_bound_initialize_x(const std::string &coord) +{ + if (_sampler->address_mode_x() == TensorSamplerAddressModeX::OverlappingMin) + { + TensorInfo tensor_info = _tensor->info(); + TensorShape shape = tensor_info.shape(); + + _ls_width_part = cl_decompose_vector_width(shape[0] % _ls_width_full); + if (_ls_width_part.size() != 0) + { + _writer->op_write_raw_code("if(" + coord + " > 0)\n{\n"); + } + } +} + +void CLMemoryOpBufferHelper::out_of_bound_finalize_x() +{ + if (_sampler->address_mode_x() == TensorSamplerAddressModeX::OverlappingMin) + { + if (_ls_width_part.size() != 0) + { + _writer->op_write_raw_code("}\nelse\n{\n"); + + out_of_bound_initialize_z(_coord_orig_z); + for (LeftoverDescriptor leftover_desc : _leftovers_x) + { + out_of_bound_initialize_y(leftover_desc.coord); + _writer->op_write_raw_code(leftover_desc.statement); + _writer->op_write_raw_code(";\n"); + out_of_bound_finalize_y(leftover_desc.dst); + } + out_of_bound_finalize_z(); + _writer->op_write_raw_code("}\n"); + } + } +} + +void CLMemoryOpBufferHelper::out_of_bound_initialize_y(const std::string &coord) +{ + std::string max = ""; + + const TensorSamplerAddressModeY address_mode_y = _sampler->address_mode_y(); + + switch (address_mode_y) + { + case TensorSamplerAddressModeY::ClampToBorderMaxOnly: + // Not to be moved outside the case because it marks the relevant tensor component as used even if we don't use the variable + max = _mapper->dim_y().str; + _writer->op_write_raw_code("if(" + coord + " < " + max + ")\n{\n"); + break; + case TensorSamplerAddressModeY::SkipLessThanZero: + _writer->op_write_raw_code("if(" + coord + " >= 0)\n{\n"); + break; + case TensorSamplerAddressModeY::None: + break; + default: + CKW_THROW_MSG("Unsupported address mode for Y dimension"); + } +} + +void CLMemoryOpBufferHelper::out_of_bound_finalize_y(const
std::string &dst) +{ + const TensorSamplerAddressModeY address_mode_y = _sampler->address_mode_y(); + + switch (address_mode_y) + { + case TensorSamplerAddressModeY::ClampToBorderMaxOnly: + _writer->op_write_raw_code("}\nelse\n{\n"); + _writer->op_write_raw_code(dst); + _writer->op_write_raw_code(" = 0.0f;\n}\n"); + break; + case TensorSamplerAddressModeY::SkipLessThanZero: + _writer->op_write_raw_code("}\n"); + break; + case TensorSamplerAddressModeY::None: + break; + default: + CKW_THROW_MSG("Unsupported address mode for Y dimension"); + } +} + +void CLMemoryOpBufferHelper::out_of_bound_initialize_z(const std::string &coord) +{ + CKW_UNUSED(coord); + + const TensorSamplerAddressModeZ address_mode_z = _sampler->address_mode_z(); + switch (address_mode_z) + { + case TensorSamplerAddressModeZ::None: + break; + default: + CKW_THROW_MSG("Unsupported address mode for Z dimension"); + } +} + +void CLMemoryOpBufferHelper::out_of_bound_finalize_z() +{ + const TensorSamplerAddressModeZ address_mode_z = _sampler->address_mode_z(); + + switch (address_mode_z) + { + case TensorSamplerAddressModeZ::None: + break; + default: + CKW_THROW_MSG("Unsupported address mode for Z dimension"); + } +} + +std::string CLMemoryOpBufferHelper::to_statement(MemoryOperation op, + int32_t vector_width, + const std::string &data, + const std::string &address) const +{ + switch (op) + { + case MemoryOperation::Load: + if (vector_width != 1) + { + return data + " = vload" + std::to_string(vector_width) + "(0, " + address + ")"; + } + else + { + return data + " = *(" + address + ")"; + } + break; + case MemoryOperation::Store: + if (vector_width != 1) + { + return "vstore" + std::to_string(vector_width) + "(" + data + ", 0, " + address + ")"; + } + else + { + return "*(" + address + ") = " + data; + } + break; + default: + CKW_THROW_MSG("Unsupported MemoryOperation"); + } + + return ""; +} + +std::string CLMemoryOpBufferHelper::to_buffer_address(const std::string &x, + const std::string &y, + const std::string &z, + const std::string &b) const +{ + TensorStorageType tensor_storage = _sampler->storage(); + CKW_ASSERT(tensor_storage == TensorStorageType::BufferUint8Ptr); + + const std::string ptr_buf = _tensor->storage(tensor_storage).val; + const std::string dst_type = cl_data_type_rounded_up_to_valid_vector_width(_dst->info().data_type(), 1); + + std::string address; + address += "(__global "; + address += dst_type; + address += "*)("; + address += ptr_buf; + if (x != "0" && (_mapper->dim_x().str != "1")) + { + address += " + ("; + address += x + ") * sizeof(" + dst_type + ")"; + } + if (y != "0") + { + const std::string stride_y = _mapper->stride_y().str; + address += " + ("; + address += y + ")"; + address += " * "; + address += stride_y; + } + if (z != "0" && (_mapper->dim_z().str != "1")) + { + const std::string stride_z = _mapper->stride_z().str; + address += " + ("; + address += z + ")"; + address += " * "; + address += stride_z; + } + if (b != "0" && (_mapper->dim_batch().str != "1")) + { + const std::string stride_b = _mapper->stride_batch().str; + address += " + ("; + address += b + ")"; + address += " * "; + address += stride_b; + } + address += ")"; + return address; +} +} // namespace ckw diff --git a/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.h b/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.h new file mode 100644 index 0000000000000000000000000000000000000000..4e1a842fe1ee866c116fd0c9c15c75789eb08e6e --- /dev/null +++ 
b/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_SRC_CL_CLMEMORYOPBUFFERHELPER_H +#define CKW_SRC_CL_CLMEMORYOPBUFFERHELPER_H + +#include "src/cl/helpers/ICLMemoryOpHelper.h" + +#include +#include +#include + +namespace ckw +{ + +// Forward Declarations +class CLKernelWriter; +class CLTile; +enum class MemoryOperation; + +/** Helper class to write memory operations (like load/store) in OpenCL + */ +class CLMemoryOpBufferHelper : public ICLMemoryOpHelper +{ +public: + /** Constructor similar to @ref ICLMemoryOpHelper() */ + CLMemoryOpBufferHelper(CLKernelWriter *writer, ITensor *tensor, TensorSampler *sampler, MemoryOperation op) + : ICLMemoryOpHelper(writer, tensor, sampler, op) + { + } + + /** Copy constructor */ + CLMemoryOpBufferHelper(const CLMemoryOpBufferHelper &) = default; + + /** Assignment operator overload */ + CLMemoryOpBufferHelper &operator=(const CLMemoryOpBufferHelper &) = default; + + // Methods overridden + void initialize(const CLTile *dst, const CLTile *x, const CLTile *z, const CLTile *b) override; + void write_row(int32_t row_id, const std::string &coord_y) override; + void finalize() override; + +private: + struct LeftoverDescriptor + { + LeftoverDescriptor(const std::string &dst, const std::string &coord, const std::string &statement) + : dst(dst), coord(coord), statement(statement) + { + } + + std::string dst{}; // Describes the destination tile or part of it + std::string coord{}; // Describes the coordinate to be used in boundary checks + std::string statement{}; // Describes the memory operation statement + }; + + std::vector _ls_width_part{}; + std::vector _leftovers_x{}; + std::string _coord_orig_z{}; + + static bool validate(const CLKernelWriter *writer, + const ITensor *tensor, + const TensorSampler *sampler, + const Tensor3dMapper *mapper, + MemoryOperation op, + const CLTile *dst); + + void out_of_bound_initialize_x(const std::string &coord); + void out_of_bound_finalize_x(); + void out_of_bound_initialize_y(const std::string &coord); + void out_of_bound_finalize_y(const std::string &dst); + void out_of_bound_initialize_z(const std::string &coord); + void out_of_bound_finalize_z(); + + std::string + to_statement(MemoryOperation op, int32_t vector_width, const std::string &data, const std::string &address) const; + std::string + to_buffer_address(const 
std::string &x, const std::string &y, const std::string &z, const std::string &b) const; +}; +} // namespace ckw + +#endif /* CKW_SRC_CL_CLMEMORYOPBUFFERHELPER_H */ diff --git a/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.cpp b/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b7d146bdee005096a45ecb744fbd029967bdfe08 --- /dev/null +++ b/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.cpp @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cl/helpers/CLMemoryOpImage2dHelper.h" + +#include "ckw/Error.h" +#include "ckw/TensorSampler.h" +#include "ckw/types/MemoryOperation.h" +#include "ckw/types/TensorStorageType.h" + +#include "src/cl/CLKernelWriter.h" +#include "src/cl/CLTensorArgument.h" +#include "src/cl/CLTile.h" +#include "src/ITensor.h" +#include "src/Tensor3dMapper.h" + +namespace ckw +{ +void CLMemoryOpImage2dHelper::initialize(const CLTile *dst, const CLTile *x, const CLTile *z, const CLTile *b) +{ + CKW_ASSERT(validate(_writer, _tensor, _sampler, _mapper.get(), _op, dst)); + + _dst = dst; + _ls_width_full = dst->info().width(); + _coord_x = x->scalar(0, 0).str; + _coord_z = z->scalar(0, 0).str; + _coord_b = b->scalar(0, 0).str; +} + +void CLMemoryOpImage2dHelper::write_row(int32_t row_id, const std::string &coord_y) +{ + // The only check required is on Y. 
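+    // Illustrative sketch of what a single row expands to (the tensor and coordinate names are placeholders, not emitted verbatim):
+    //   G0__dst__0 = read_imagef(tensor_img2d, CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST, (int2)((x) >> 2, (y)));
+    // i.e. one read_image*/write_image* call per tile row, with the x-coordinate divided by 4 because each
+    // texel packs four elements, and y folded together with z and the batch offset when they are present.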
+ out_of_bound_initialize_y(coord_y); + + const std::string dst = _dst->vector(row_id).str; + const std::string sampler = to_ls_image2d_sampler(); + const std::string coord = to_ls_image2d_address(_coord_x, coord_y, _coord_z, _coord_b); + const std::string ls_buf = to_ls_image2d(_op, _ls_width_full, dst, sampler, coord); + + _writer->op_write_raw_code(ls_buf + ";\n"); + + out_of_bound_finalize_y(); +} + +void CLMemoryOpImage2dHelper::finalize() +{ +} + +bool CLMemoryOpImage2dHelper::validate(const CLKernelWriter *writer, + const ITensor *tensor, + const TensorSampler *sampler, + const Tensor3dMapper *mapper, + MemoryOperation op, + const CLTile *dst) +{ + CKW_UNUSED(writer, tensor, mapper); + + if (dst->info().width() != 4) + { + return false; + } + if (sampler->address_mode_x() != TensorSamplerAddressModeX::None) + { + return false; + } + if (sampler->address_mode_z() != TensorSamplerAddressModeZ::None) + { + return false; + } + if (sampler->storage() != TensorStorageType::Texture2dReadOnly && op == MemoryOperation::Load) + { + return false; + } + if (sampler->storage() != TensorStorageType::Texture2dWriteOnly && op == MemoryOperation::Store) + { + return false; + } + if ((dst->info().data_type() != DataType::Fp32) && (dst->info().data_type() != DataType::Fp16)) + { + return false; + } + return true; +} + +void CLMemoryOpImage2dHelper::out_of_bound_initialize_y(const std::string &coord) +{ + CKW_UNUSED(coord); + + const TensorSamplerAddressModeY address_mode_y = _sampler->address_mode_y(); + switch (address_mode_y) + { + case TensorSamplerAddressModeY::SkipLessThanZero: + _writer->op_write_raw_code("if(" + coord + " >= 0)\n{\n"); + break; + case TensorSamplerAddressModeY::ClampToBorderMaxOnly: + case TensorSamplerAddressModeY::None: + break; + default: + CKW_THROW_MSG("Unsupported address mode for Y dimension"); + } +} + +void CLMemoryOpImage2dHelper::out_of_bound_finalize_y() +{ + const TensorSamplerAddressModeY address_mode_y = _sampler->address_mode_y(); + switch (address_mode_y) + { + case TensorSamplerAddressModeY::SkipLessThanZero: + _writer->op_write_raw_code("}\n"); + break; + case TensorSamplerAddressModeY::ClampToBorderMaxOnly: + case TensorSamplerAddressModeY::None: + break; + default: + CKW_THROW_MSG("Unsupported address mode for Y dimension"); + } +} + +std::string CLMemoryOpImage2dHelper::to_ls_image2d(MemoryOperation op, + int32_t vector_width, + const std::string &data, + const std::string &sampler, + const std::string &address) const +{ + CKW_UNUSED(vector_width); + + const TensorStorageType tensor_storage = _sampler->storage(); + const std::string image2d_obj = _tensor->storage(tensor_storage).val; + const std::string post_fix = _dst->info().data_type() == DataType::Fp32 ? 
"f" : "h"; + + switch (op) + { + case MemoryOperation::Load: + return data + " = read_image" + post_fix + "(" + image2d_obj + ", " + sampler + ", " + address + ")"; + break; + case MemoryOperation::Store: + return "write_image" + post_fix + "(" + image2d_obj + ", " + address + ", " + data + ")"; + default: + CKW_THROW_MSG("Unsupported MemoryOperation"); + } +} + +std::string CLMemoryOpImage2dHelper::to_ls_image2d_sampler() const +{ + const auto address_mode_y = _sampler->address_mode_y(); + + switch (address_mode_y) + { + case TensorSamplerAddressModeY::None: + return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST"; + case TensorSamplerAddressModeY::SkipLessThanZero: + case TensorSamplerAddressModeY::ClampToBorderMaxOnly: + return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST"; + default: + CKW_THROW_MSG("Unsupported address_mode_coord"); + } +} + +std::string CLMemoryOpImage2dHelper::to_ls_image2d_address(const std::string &x, + const std::string &y, + const std::string &z, + const std::string &b) const +{ + std::string coord_x = "(" + x + ") >> 2"; + std::string coord_y = "("; + + if (y != "0") + { + coord_y += y; + } + if (z != "0" && (_mapper->dim_z().str != "1")) + { + const std::string dim = _mapper->dim_y().str; + coord_y += " + ("; + coord_y += z + ")"; + coord_y += " * "; + coord_y += dim; + } + if (b != "0" && (_mapper->dim_batch().str != "1")) + { + const std::string dim0 = _mapper->dim_y().str; + const std::string dim1 = _mapper->dim_z().str; + coord_y += " + ("; + coord_y += b + ")"; + coord_y += " * "; + coord_y += dim0; + coord_y += " * "; + coord_y += dim1; + } + coord_y += ")"; + return "(int2)(" + coord_x + ", " + coord_y + ")"; +} + +} // namespace ckw diff --git a/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.h b/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.h new file mode 100644 index 0000000000000000000000000000000000000000..fd9b097a24f9502b0789c4e689f517717cec0ad8 --- /dev/null +++ b/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef CKW_SRC_CL_HELPERS_CLMEMORYOPIMAGE2DHELPER_H +#define CKW_SRC_CL_HELPERS_CLMEMORYOPIMAGE2DHELPER_H + +#include "src/cl/helpers/ICLMemoryOpHelper.h" + +#include + +namespace ckw +{ + +// Forward Declarations +class CLKernelWriter; +class CLTile; +enum class MemoryOperation; + +/** Helper class to write memory operations (like load/store) in OpenCL for Image2d type */ +class CLMemoryOpImage2dHelper : public ICLMemoryOpHelper +{ +public: + /** Constructor similar to @ref ICLMemoryOpHelper() */ + CLMemoryOpImage2dHelper(CLKernelWriter *writer, ITensor *tensor, TensorSampler *sampler, MemoryOperation op) + : ICLMemoryOpHelper(writer, tensor, sampler, op) + { + } + + /** Copy constructor */ + CLMemoryOpImage2dHelper(const CLMemoryOpImage2dHelper &) = default; + + /** Assignment operator overload */ + CLMemoryOpImage2dHelper &operator=(const CLMemoryOpImage2dHelper &) = default; + + // Methods overridden + void initialize(const CLTile *dst, const CLTile *x, const CLTile *z, const CLTile *b) override; + void write_row(int32_t row_id, const std::string &coord_y) override; + void finalize() override; + +private: + static bool validate(const CLKernelWriter *writer, + const ITensor *tensor, + const TensorSampler *sampler, + const Tensor3dMapper *mapper, + MemoryOperation op, + const CLTile *dst); + + void out_of_bound_initialize_y(const std::string &coord); + void out_of_bound_finalize_y(); + + std::string to_ls_image2d(MemoryOperation op, + int32_t vector_width, + const std::string &data, + const std::string &sampler, + const std::string &address) const; + std::string to_ls_image2d_sampler() const; + std::string + to_ls_image2d_address(const std::string &x, const std::string &y, const std::string &z, const std::string &b) const; +}; +} // namespace ckw + +#endif // CKW_SRC_CL_HELPERS_CLMEMORYOPIMAGE2DHELPER_H diff --git a/compute_kernel_writer/src/cl/helpers/ICLMemoryOpHelper.h b/compute_kernel_writer/src/cl/helpers/ICLMemoryOpHelper.h new file mode 100644 index 0000000000000000000000000000000000000000..f46fee9750c49ef604f2545bfeb79f59e1e23eb7 --- /dev/null +++ b/compute_kernel_writer/src/cl/helpers/ICLMemoryOpHelper.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef CKW_SRC_CL_HELPERS_ICLMEMORYOPHELPER_H +#define CKW_SRC_CL_HELPERS_ICLMEMORYOPHELPER_H + +#include "ckw/TensorSampler.h" + +#include "src/Tensor3dMapper.h" + +#include <cstdint> +#include <memory> +#include <string> + +namespace ckw +{ + +// Forward Declarations +class CLTile; +class CLKernelWriter; +class ITensor; +class TensorSampler; +enum class MemoryOperation; + +/** Base class for OpenCL memory operation helper classes + * that help write code for memory operations such as load/store. + */ +class ICLMemoryOpHelper +{ +public: + /** Constructor + * + * @param[in] writer @ref ckw::CLKernelWriter object to write the code + * @param[in] tensor @ref ckw::ITensor object to perform the memory operation on + * @param[in] sampler @ref ckw::TensorSampler object that tells how to sample a tensor + * @param[in] op The memory operation to be done (e.g. Load/Store) + */ + ICLMemoryOpHelper(CLKernelWriter *writer, ITensor *tensor, TensorSampler *sampler, MemoryOperation op) + : _writer(writer), _tensor(tensor), _sampler(sampler), _op(op) + { + _mapper = std::make_unique<Tensor3dMapper>(tensor, sampler->format()); + } + + /** Copy constructor */ + ICLMemoryOpHelper(const ICLMemoryOpHelper &) = default; + + /** Assignment operator overload */ + ICLMemoryOpHelper &operator=(const ICLMemoryOpHelper &) = default; + + /** Destructor */ + virtual ~ICLMemoryOpHelper() = default; + + /** Initialization method that takes a 3D tensor's x, z dimensions and + * the batch offset as tile objects, and initializes the code inside + * the writer object. + * + * @param[in] dst tile object to perform the memory operation on + * @param[in] x tile object that describes the x-coordinate of the tensor involved + * @param[in] z tile object that describes the z-coordinate of the tensor involved + * @param[in] b tile object that describes the batch offset of the tensor involved + */ + virtual void initialize(const CLTile *dst, const CLTile *x, const CLTile *z, const CLTile *b) = 0; + + /** Method that writes the actual code to the writer that performs the mentioned memory + * operation on the tile initialized. It writes the code for a specific row given in the + * arguments. + * + * @param[in] row_id row id + * @param[in] coord_y y-coordinate as string + */ + virtual void write_row(int32_t row_id, const std::string &coord_y) = 0; + + /** Method that finalizes the code in the writer object. This part is usually for taking + * care of finalizing anything that's been initialized inside @ref IMemoryHelper::initialize() + * such as matching compound statements, checking certain boundary conditions etc. No inputs + * and/or outputs, only the writer object is affected. + */ + virtual void finalize() = 0; + +protected: + CLKernelWriter *_writer{nullptr}; + ITensor *_tensor{nullptr}; + TensorSampler *_sampler{nullptr}; + MemoryOperation _op; + std::unique_ptr<Tensor3dMapper> _mapper{nullptr}; + const CLTile *_dst{nullptr}; + int32_t _ls_width_full{0}; + std::string _coord_x{}; + std::string _coord_z{}; + std::string _coord_b{}; +}; +} // namespace ckw + +#endif /* CKW_SRC_CL_HELPERS_ICLMEMORYOPHELPER_H */ diff --git a/compute_kernel_writer/src/types/ConstantData.cpp b/compute_kernel_writer/src/types/ConstantData.cpp new file mode 100644 index 0000000000000000000000000000000000000000..67b1103860fcb923f996b2fb1d105b9cdf6844c6 --- /dev/null +++ b/compute_kernel_writer/src/types/ConstantData.cpp @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2023 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ckw/types/ConstantData.h" + +#include + +namespace ckw +{ +namespace +{ +template +inline typename std::enable_if::value, std::string>::type to_str(T value) +{ + std::stringstream ss; + ss << std::scientific << std::setprecision(std::numeric_limits::max_digits10) << value; + return ss.str(); +} + +template +inline typename std::enable_if::value && !std::is_same::value, std::string>::type +to_str(T value) +{ + return std::to_string(value); +} + +template +inline typename std::enable_if::value, std::string>::type to_str(T value) +{ + return std::to_string((int)value); +} +} // namespace + +template +ConstantData::ConstantData(std::initializer_list> values, DataType data_type) + : _data_type(data_type) +{ + CKW_ASSERT(validate(data_type)); + CKW_ASSERT(values.size() > 0); + + for (auto value_arr : values) + { + // Each row must have the same number of elements + CKW_ASSERT(value_arr.size() == (*values.begin()).size()); + + StringVector vec; + std::transform(value_arr.begin(), value_arr.end(), std::back_inserter(vec), [](T val) { return to_str(val); }); + + _values.push_back(std::move(vec)); + } +} + +template +bool ConstantData::validate(DataType data_type) +{ + switch (data_type) + { + case DataType::Fp32: + case DataType::Fp16: + return std::is_same::value; + case DataType::Bool: + return std::is_same::value; + case DataType::Int32: + case DataType::Int16: + case DataType::Int8: + return std::is_same::value; + case DataType::Uint32: + case DataType::Uint16: + case DataType::Uint8: + return std::is_same::value; + default: + CKW_THROW_MSG("Unknown data type!"); + break; + } +} + +// Necessary instantiations for compiler to recognize +template ConstantData::ConstantData(std::initializer_list>, DataType); +template ConstantData::ConstantData(std::initializer_list>, DataType); +template ConstantData::ConstantData(std::initializer_list>, DataType); +template ConstantData::ConstantData(std::initializer_list>, DataType); + +template bool ConstantData::validate(DataType); +template bool ConstantData::validate(DataType); +template bool ConstantData::validate(DataType); +template bool ConstantData::validate(DataType); + +const std::vector> &ConstantData::values() const +{ + return _values; +} + +DataType ConstantData::data_type() const +{ + return _data_type; +} + +} // namespace ckw diff --git a/compute_kernel_writer/src/types/DataTypeHelpers.cpp 
b/compute_kernel_writer/src/types/DataTypeHelpers.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7f0c33fb72222892540d62d5f327815c966f99db --- /dev/null +++ b/compute_kernel_writer/src/types/DataTypeHelpers.cpp @@ -0,0 +1,35 @@ +/* +* Copyright (c) 2023 Arm Limited. +* +* SPDX-License-Identifier: MIT +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to +* deal in the Software without restriction, including without limitation the +* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +* sell copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in all +* copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ + +#include "src/types/DataTypeHelpers.h" + +namespace ckw +{ + +bool is_data_type_float(DataType data_type) +{ + return (data_type == DataType::Fp32 || data_type == DataType::Fp16); +} + +} // namespace ckw diff --git a/compute_kernel_writer/src/types/DataTypeHelpers.h b/compute_kernel_writer/src/types/DataTypeHelpers.h new file mode 100644 index 0000000000000000000000000000000000000000..b6ec6ccd1925f8ac01c8ead9a399c6de72b09c2e --- /dev/null +++ b/compute_kernel_writer/src/types/DataTypeHelpers.h @@ -0,0 +1,43 @@ +/* +* Copyright (c) 2023 Arm Limited. +* +* SPDX-License-Identifier: MIT +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to +* deal in the Software without restriction, including without limitation the +* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +* sell copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in all +* copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ + +#ifndef CKW_SRC_TYPES_DATATYPEHELPERS_H +#define CKW_SRC_TYPES_DATATYPEHELPERS_H + +#include "ckw/types/DataType.h" + +namespace ckw +{ + +/** Return a value indicating whether the data type is floating-point. + * + * @param[in] data_type The data type to check. + * + * @return Whether the data type is floating-point. 
+ */ +bool is_data_type_float(DataType data_type); + +} // namespace ckw + +#endif // CKW_SRC_TYPES_DATATYPEHELPERS_H diff --git a/compute_kernel_writer/validation/Validation.cpp b/compute_kernel_writer/validation/Validation.cpp index 5d53a16effbd643231d53fcab0e91f19e857e8cc..4fbd1eacdabfe02ba2df4b6e5dcfa5339fa7af27 100644 --- a/compute_kernel_writer/validation/Validation.cpp +++ b/compute_kernel_writer/validation/Validation.cpp @@ -22,14 +22,30 @@ * SOFTWARE. */ -#include "tests/CLConstantTileTest.hpp" -#include "tests/CLKernelWriterCommentTest.h" -#include "tests/CLKernelWriterDeclareTileTest.h" -#include "tests/CLTensorArgumentTest.h" -#include "tests/CLTileTest.hpp" -#include "tests/TensorBitMaskTest.h" -#include "tests/UtilsTest.h" +#include "validation/tests/CLConstantTileTest.hpp" +#include "validation/tests/CLKernelWriterAssignTest.h" +#include "validation/tests/CLKernelWriterBinaryOpTest.h" +#include "validation/tests/CLKernelWriterCastTest.h" +#include "validation/tests/CLKernelWriterCommentTest.h" +#include "validation/tests/CLKernelWriterDeclareConstantTileTest.h" +#include "validation/tests/CLKernelWriterDeclareTensorTest.h" +#include "validation/tests/CLKernelWriterDeclareTileTest.h" +#include "validation/tests/CLKernelWriterForTest.h" +#include "validation/tests/CLKernelWriterGetGlobalIdTest.h" +#include "validation/tests/CLKernelWriterIfTest.h" +#include "validation/tests/CLKernelWriterOpLoadIndirectTest.h" +#include "validation/tests/CLKernelWriterOpLoadStoreTest.h" +#include "validation/tests/CLKernelWriterPrintTest.h" +#include "validation/tests/CLKernelWriterReturnTest.h" +#include "validation/tests/CLKernelWriterSubTileTest.h" +#include "validation/tests/CLKernelWriterTernaryOpTest.h" +#include "validation/tests/CLKernelWriterUnaryExpressionTest.h" +#include "validation/tests/CLTensorArgumentTest.h" +#include "validation/tests/CLTileTest.hpp" +#include "validation/tests/TensorBitMaskTest.h" +#include "validation/tests/UtilsTest.h" +#include #include #include @@ -73,6 +89,21 @@ int32_t main() const auto test21 = std::make_unique(); const auto test22 = std::make_unique(); const auto test23 = std::make_unique(); + const auto test24 = std::make_unique(); + const auto test25 = std::make_unique(); + const auto test26 = std::make_unique(); + const auto test27 = std::make_unique(); + const auto test28 = std::make_unique(); + const auto test29 = std::make_unique(); + const auto test30 = std::make_unique(); + const auto test31 = std::make_unique(); + const auto test32 = std::make_unique(); + const auto test33 = std::make_unique(); + const auto test34 = std::make_unique(); + const auto test35 = std::make_unique(); + const auto test36 = std::make_unique(); + const auto test37 = std::make_unique(); + const auto test38 = std::make_unique(); tests.push_back(test3.get()); tests.push_back(test4.get()); @@ -97,6 +128,21 @@ int32_t main() tests.push_back(test21.get()); tests.push_back(test22.get()); tests.push_back(test23.get()); + tests.push_back(test24.get()); + tests.push_back(test25.get()); + tests.push_back(test26.get()); + tests.push_back(test27.get()); + tests.push_back(test28.get()); + tests.push_back(test29.get()); + tests.push_back(test30.get()); + tests.push_back(test31.get()); + tests.push_back(test32.get()); + tests.push_back(test33.get()); + tests.push_back(test34.get()); + tests.push_back(test35.get()); + tests.push_back(test36.get()); + tests.push_back(test37.get()); + tests.push_back(test38.get()); #endif /* COMPUTE_KERNEL_WRITER_OPENCL_ENABLED */ bool all_test_passed = 
true; diff --git a/compute_kernel_writer/validation/tests/CLKernelWriterAssignTest.h b/compute_kernel_writer/validation/tests/CLKernelWriterAssignTest.h new file mode 100644 index 0000000000000000000000000000000000000000..f32f797a01a06eec660c9f3a36dd996caf9c5fec --- /dev/null +++ b/compute_kernel_writer/validation/tests/CLKernelWriterAssignTest.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_VALIDATION_TESTS_CLKERNELWRITERASSIGNTEST_H +#define CKW_VALIDATION_TESTS_CLKERNELWRITERASSIGNTEST_H + +#include "ckw/TileInfo.h" +#include "ckw/types/DataType.h" +#include "src/cl/CLKernelWriter.h" +#include "validation/tests/common/Common.h" +#include "validation/tests/common/KernelWriterInterceptor.h" + +#include +#include + +namespace ckw +{ + +class CLKernelWriterAssignTest : public ITest +{ +public: + CLKernelWriterAssignTest() + { + _tests.push_back({ 1, 1, 1, 1, DataType::Fp32, "G0__dst = G0__src;\n" }); // Scalar. + + _tests.push_back({ 1, 3, 1, 3, DataType::Fp16, "G0__dst = G0__src;\n" }); // Whole vector. + + _tests.push_back({ 2, 4, 2, 4, DataType::Int8, "G0__dst__0 = G0__src__0;\nG0__dst__1 = G0__src__1;\n" }); // Whole tile. + + _tests.push_back({ 2, 3, 1, 3, DataType::Uint8, "G0__dst__0 = G0__src;\nG0__dst__1 = G0__src;\n" }); // Y-dimension broadcast. + + _tests.push_back({ 2, 4, 2, 1, DataType::Fp32, "G0__dst__0 = (float4)G0__src__0;\nG0__dst__1 = (float4)G0__src__1;\n" }); // X-dimension broadcast. + + _tests.push_back({ 2, 3, 1, 1, DataType::Fp16, "G0__dst__0 = (half3)G0__src;\nG0__dst__1 = (half3)G0__src;\n" }); // X and y dimension broadcast. 
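+
+        // Note on the expected strings above: the "G0__" prefix is what the writer adds when the tiles are
+        // declared via declare_tile(), and the "__0"/"__1" suffixes index the tile rows (matching the "__"
+        // separator now used by CLTile::create_var_name()).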
+ } + + bool run() override + { + int32_t test_no = 0; + bool all_tests_passed = true; + + for(const auto &test : _tests) + { + KernelWriterInterceptor writer; + + auto dst = writer.declare_tile("dst", TileInfo(test.data_type, test.dst_height, test.dst_width)); + auto src = writer.declare_tile("src", TileInfo(test.data_type, test.src_height, test.src_width)); + + writer.start_capture_code(); + + writer.op_assign(dst, src); + + VALIDATE_TEST(writer.check_added_code(test.expected_code), all_tests_passed, test_no++); + } + + return all_tests_passed; + } + + std::string name() override + { + return "CLKernelWriterAssignTest"; + } + +private: + struct TestInfo + { + int32_t dst_height; + int32_t dst_width; + int32_t src_height; + int32_t src_width; + DataType data_type; + std::string expected_code; + }; + + std::vector _tests{}; +}; + +} // namespace ckw + +#endif // CKW_VALIDATION_TESTS_CLKERNELWRITERASSIGNTEST_H diff --git a/compute_kernel_writer/validation/tests/CLKernelWriterBinaryOpTest.h b/compute_kernel_writer/validation/tests/CLKernelWriterBinaryOpTest.h new file mode 100644 index 0000000000000000000000000000000000000000..bfa67240086a50894bca3c4d716efa426c2587af --- /dev/null +++ b/compute_kernel_writer/validation/tests/CLKernelWriterBinaryOpTest.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_VALIDATION_TESTS_CLKERNELWRITERBINARYOPTEST_H +#define CKW_VALIDATION_TESTS_CLKERNELWRITERBINARYOPTEST_H + +#include "ckw/TileInfo.h" +#include "ckw/types/DataType.h" +#include "src/cl/CLKernelWriter.h" +#include "validation/tests/common/Common.h" +#include "validation/tests/common/KernelWriterInterceptor.h" + +#include +#include + +namespace ckw +{ + +class CLKernelWriterBinaryOpTest : public ITest +{ +public: + CLKernelWriterBinaryOpTest() + { + // dst_height, dst_width, dst_data_type, lhs_height, lhs_width, rhs_height, rhs_width, src_data_type, op, expected_code + _tests.push_back({ 1, 1, DataType::Fp32, 1, 1, 1, 1, DataType::Fp32, BinaryOp::Add, "G0__dst = G0__lhs + G0__rhs;\n" }); // Scalar. + + _tests.push_back({ 1, 3, DataType::Bool, 1, 3, 1, 3, DataType::Fp16, BinaryOp::Equal, "G0__dst = G0__lhs == G0__rhs;\n" }); // Whole vector. + + _tests.push_back({ 2, 4, DataType::Int8, 2, 4, 2, 4, DataType::Int8, BinaryOp::Min, "G0__dst__0 = min(G0__lhs__0, G0__rhs__0);\nG0__dst__1 = min(G0__lhs__1, G0__rhs__1);\n" }); // Whole tile. 
+ + _tests.push_back({ 2, 3, DataType::Uint8, 1, 3, 2, 3, DataType::Uint8, BinaryOp::BitwiseXOR, "G0__dst__0 = G0__lhs ^ G0__rhs__0;\nG0__dst__1 = G0__lhs ^ G0__rhs__1;\n" }); // LHS y-dimension broadcast. + + _tests.push_back({ 2, 3, DataType::Bool, 2, 3, 1, 3, DataType::Fp32, BinaryOp::Less, "G0__dst__0 = G0__lhs__0 < G0__rhs;\nG0__dst__1 = G0__lhs__1 < G0__rhs;\n" }); // RHS y-dimension broadcast. + + _tests.push_back({ 2, 3, DataType::Fp16, 1, 3, 1, 3, DataType::Fp16, BinaryOp::Max, "G0__dst__0 = fmax(G0__lhs, G0__rhs);\nG0__dst__1 = fmax(G0__lhs, G0__rhs);\n" }); // LHS and RHS y-dimension broadcast. + + _tests.push_back({ 2, 4, DataType::Fp32, 2, 1, 2, 4, DataType::Fp32, BinaryOp::Div, "G0__dst__0 = (float4)G0__lhs__0 / G0__rhs__0;\nG0__dst__1 = (float4)G0__lhs__1 / G0__rhs__1;\n" }); // LHS x-dimension broadcast. + + _tests.push_back({ 2, 4, DataType::Fp16, 2, 4, 2, 1, DataType::Fp16, BinaryOp::Mod, "G0__dst__0 = G0__lhs__0 % (half4)G0__rhs__0;\nG0__dst__1 = G0__lhs__1 % (half4)G0__rhs__1;\n" }); // RHS x-dimension broadcast. + + _tests.push_back({ 2, 4, DataType::Bool, 2, 1, 2, 1, DataType::Fp32, BinaryOp::GreaterEqual, "G0__dst__0 = (float4)G0__lhs__0 >= (float4)G0__rhs__0;\nG0__dst__1 = (float4)G0__lhs__1 >= (float4)G0__rhs__1;\n" }); // LHS and RHS x-dimension broadcast. + + _tests.push_back({ 2, 3, DataType::Fp32, 2, 3, 2, 3, DataType::Fp32, BinaryOp::MatMul_Nt_T, + "G0__dst__0.s0 = fma(G0__lhs__0.s0, G0__rhs__0.s0, G0__dst__0.s0);\n" + "G0__dst__0.s0 = fma(G0__lhs__1.s0, G0__rhs__1.s0, G0__dst__0.s0);\n" + "G0__dst__0.s0 = fma(G0__lhs__1.s0, G0__rhs__1.s0, G0__dst__0.s0);\n" + "G0__dst__1.s0 = fma(G0__lhs__0.s0, G0__rhs__0.s1, G0__dst__1.s0);\n" + "G0__dst__1.s0 = fma(G0__lhs__1.s0, G0__rhs__1.s1, G0__dst__1.s0);\n" + "G0__dst__1.s0 = fma(G0__lhs__1.s0, G0__rhs__1.s1, G0__dst__1.s0);\n" + "G0__dst__1.s0 = fma(G0__lhs__0.s0, G0__rhs__0.s2, G0__dst__1.s0);\n" + "G0__dst__1.s0 = fma(G0__lhs__1.s0, G0__rhs__1.s2, G0__dst__1.s0);\n" + "G0__dst__1.s0 = fma(G0__lhs__1.s0, G0__rhs__1.s2, G0__dst__1.s0);\n" + "G0__dst__0.s1 = fma(G0__lhs__0.s1, G0__rhs__0.s0, G0__dst__0.s1);\n" + "G0__dst__0.s1 = fma(G0__lhs__1.s1, G0__rhs__1.s0, G0__dst__0.s1);\n" + "G0__dst__0.s1 = fma(G0__lhs__1.s1, G0__rhs__1.s0, G0__dst__0.s1);\n" + "G0__dst__1.s1 = fma(G0__lhs__0.s1, G0__rhs__0.s1, G0__dst__1.s1);\n" + "G0__dst__1.s1 = fma(G0__lhs__1.s1, G0__rhs__1.s1, G0__dst__1.s1);\n" + "G0__dst__1.s1 = fma(G0__lhs__1.s1, G0__rhs__1.s1, G0__dst__1.s1);\n" + "G0__dst__1.s1 = fma(G0__lhs__0.s1, G0__rhs__0.s2, G0__dst__1.s1);\n" + "G0__dst__1.s1 = fma(G0__lhs__1.s1, G0__rhs__1.s2, G0__dst__1.s1);\n" + "G0__dst__1.s1 = fma(G0__lhs__1.s1, G0__rhs__1.s2, G0__dst__1.s1);\n" }); + } + + bool run() override + { + int32_t test_no = 0; + bool all_tests_passed = true; + + for(const auto &test : _tests) + { + KernelWriterInterceptor writer; + + auto dst = writer.declare_tile("dst", TileInfo(test.dst_data_type, test.dst_height, test.dst_width)); + auto lhs = writer.declare_tile("lhs", TileInfo(test.src_data_type, test.lhs_height, test.lhs_width)); + auto rhs = writer.declare_tile("rhs", TileInfo(test.src_data_type, test.rhs_height, test.rhs_width)); + + writer.start_capture_code(); + + writer.op_binary(dst, test.op, lhs, rhs); + + VALIDATE_TEST(writer.check_added_code(test.expected_code), all_tests_passed, test_no++); + } + + return all_tests_passed; + } + + std::string name() override + { + return "CLKernelWriterBinaryOpTest"; + } + +private: + struct TestInfo + { + int32_t dst_height; + int32_t dst_width; + DataType 
dst_data_type; + int32_t lhs_height; + int32_t lhs_width; + int32_t rhs_height; + int32_t rhs_width; + DataType src_data_type; + BinaryOp op; + std::string expected_code; + }; + + std::vector _tests{}; +}; + +} // namespace ckw + +#endif // CKW_VALIDATION_TESTS_CLKERNELWRITERBINARYOPTEST_H diff --git a/compute_kernel_writer/validation/tests/CLKernelWriterCastTest.h b/compute_kernel_writer/validation/tests/CLKernelWriterCastTest.h new file mode 100644 index 0000000000000000000000000000000000000000..a185cce5450b0967c8e0d3eba9d01d3dcb30e8fa --- /dev/null +++ b/compute_kernel_writer/validation/tests/CLKernelWriterCastTest.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_VALIDATION_TESTS_CLKERNELWRITERCASTTEST_H +#define CKW_VALIDATION_TESTS_CLKERNELWRITERCASTTEST_H + +#include "ckw/TileInfo.h" +#include "ckw/types/ConvertPolicy.h" +#include "ckw/types/DataType.h" +#include "src/cl/CLKernelWriter.h" +#include "validation/tests/common/Common.h" +#include "validation/tests/common/KernelWriterInterceptor.h" + +#include +#include + +namespace ckw +{ + +class CLKernelWriterCastTest : public ITest +{ +public: + CLKernelWriterCastTest() + { + _tests.push_back({ 1, 1, DataType::Fp16, 1, 1, DataType::Fp32, ConvertPolicy::None, "G0__dst = convert_half(G0__src);\n" }); // Scalar. + + _tests.push_back({ 1, 3, DataType::Int32, 1, 3, DataType::Fp16, ConvertPolicy::Saturate, "G0__dst = convert_int3_sat(G0__src);\n" }); // Whole vector. + + _tests.push_back({ 2, 4, DataType::Uint16, 2, 4, DataType::Int8, ConvertPolicy::Saturate, "G0__dst__0 = convert_ushort4_sat(G0__src__0);\nG0__dst__1 = convert_ushort4_sat(G0__src__1);\n" }); // Whole tile. + + _tests.push_back({ 2, 3, DataType::Int8, 1, 3, DataType::Uint8, ConvertPolicy::None, "G0__dst__0 = convert_char3(G0__src);\nG0__dst__1 = convert_char3(G0__src);\n" }); // Y-dimension broadcast. + + _tests.push_back({ 2, 4, DataType::Fp16, 2, 1, DataType::Fp32, ConvertPolicy::None, "G0__dst__0 = (half4)convert_half(G0__src__0);\nG0__dst__1 = (half4)convert_half(G0__src__1);\n" }); // X-dimension broadcast. + + _tests.push_back({ 2, 3, DataType::Fp32, 1, 1, DataType::Fp16, ConvertPolicy::None, "G0__dst__0 = (float3)convert_float(G0__src);\nG0__dst__1 = (float3)convert_float(G0__src);\n" }); // X and y dimension broadcast. 
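The expected strings above follow the OpenCL convert_<type><width>[_sat] naming. A small illustrative helper, not the CLKernelWriter implementation, that reproduces the call names these cases assume:

// Illustrative only: builds names such as convert_half, convert_int3_sat, convert_ushort4_sat.
std::string convert_call(const std::string &dst_scalar_type, int32_t width, ConvertPolicy policy)
{
    std::string name = "convert_" + dst_scalar_type;
    if(width > 1)
    {
        name += std::to_string(width);
    }
    if(policy == ConvertPolicy::Saturate)
    {
        name += "_sat";
    }
    return name;
}
// convert_call("int", 3, ConvertPolicy::Saturate) == "convert_int3_sat"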
+ } + + bool run() override + { + int32_t test_no = 0; + bool all_tests_passed = true; + + for(const auto &test : _tests) + { + KernelWriterInterceptor writer; + + auto dst = writer.declare_tile("dst", TileInfo(test.dst_data_type, test.dst_height, test.dst_width)); + auto src = writer.declare_tile("src", TileInfo(test.src_data_type, test.src_height, test.src_width)); + + writer.start_capture_code(); + + writer.op_cast(dst, src, test.policy); + + VALIDATE_TEST(writer.check_added_code(test.expected_code), all_tests_passed, test_no++); + } + + return all_tests_passed; + } + + std::string name() override + { + return "CLKernelWriterCastTest"; + } + +private: + struct TestInfo + { + int32_t dst_height; + int32_t dst_width; + DataType dst_data_type; + int32_t src_height; + int32_t src_width; + DataType src_data_type; + ConvertPolicy policy; + std::string expected_code; + }; + + std::vector _tests{}; +}; + +} // namespace ckw + +#endif // CKW_VALIDATION_TESTS_CLKERNELWRITERCASTTEST_H diff --git a/compute_kernel_writer/validation/tests/CLKernelWriterCommentTest.h b/compute_kernel_writer/validation/tests/CLKernelWriterCommentTest.h index ff09ea80737e4461c93b907c6142402b9a4e912b..b36c3905ecdc1a0a75919195dd8f3ab7435dcd95 100644 --- a/compute_kernel_writer/validation/tests/CLKernelWriterCommentTest.h +++ b/compute_kernel_writer/validation/tests/CLKernelWriterCommentTest.h @@ -22,8 +22,8 @@ * SOFTWARE. */ -#ifndef CKW_VALIDATION_TESTS_CLKERNELTEST_H -#define CKW_VALIDATION_TESTS_CLKERNELTEST_H +#ifndef CKW_VALIDATION_TESTS_CLKERNELWRITERCOMMENTTEST_H +#define CKW_VALIDATION_TESTS_CLKERNELWRITERCOMMENTTEST_H #include "src/cl/CLKernelWriter.h" #include "validation/tests/common/Common.h" @@ -45,14 +45,18 @@ public: KernelWriterInterceptor writer; - writer.comment("previous code"); + writer.op_comment("previous code"); writer.start_capture_code(); - writer.comment("code under test 0"); - writer.comment("code under test 1"); + writer.op_comment("code under test 0"); + writer.op_comment("code under test 1"); +#ifdef COMPUTE_KERNEL_WRITER_DEBUG_ENABLED constexpr auto expected_code = "// code under test 0\n// code under test 1\n"; +#else // COMPUTE_KERNEL_WRITER_DEBUG_ENABLED + constexpr auto expected_code = ""; +#endif // COMPUTE_KERNEL_WRITER_DEBUG_ENABLED VALIDATE_TEST(writer.check_added_code(expected_code), all_tests_passed, 0); @@ -67,4 +71,4 @@ public: } // namespace ckw -#endif // CKW_VALIDATION_TESTS_CLKERNELTEST_H +#endif // CKW_VALIDATION_TESTS_CLKERNELWRITERCOMMENTTEST_H diff --git a/compute_kernel_writer/validation/tests/CLKernelWriterDeclareConstantTileTest.h b/compute_kernel_writer/validation/tests/CLKernelWriterDeclareConstantTileTest.h new file mode 100644 index 0000000000000000000000000000000000000000..661a8328e8b2ce8993185854a7fb48823852cf67 --- /dev/null +++ b/compute_kernel_writer/validation/tests/CLKernelWriterDeclareConstantTileTest.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_VALIDATION_TESTS_CLKERNELWRITERDECLARECONSTANTTILETEST_H +#define CKW_VALIDATION_TESTS_CLKERNELWRITERDECLARECONSTANTTILETEST_H + +#include "ckw/TileInfo.h" +#include "ckw/types/ConstantData.h" +#include "ckw/types/DataType.h" +#include "src/cl/CLKernelWriter.h" +#include "validation/tests/common/KernelWriterInterceptor.h" +#include "validation/tests/common/Common.h" + +#include +#include +#include + +namespace ckw +{ +class CLKernelWriterDeclareConstantTileTest : public ITest +{ + using TestConfig = std::tuple; +public: + CLKernelWriterDeclareConstantTileTest() + { + _configs = { + // ConstantData, DataType, Height, Width + {ConstantData({{1}}, DataType::Int32), DataType::Int32, 1, 1, + "G0__tile = ((int)(1));\n"}, + {ConstantData({{1U}}, DataType::Uint32), DataType::Uint32, 1, 1, + "G0__tile = ((uint)(1));\n"}, + {ConstantData({{1, 2}}, DataType::Int8), DataType::Int8, 1, 2, + "G0__tile = ((char2)(1, 2));\n"}, + {ConstantData({{1, -2}, {-3, 4}}, DataType::Int32), DataType::Int32, 2, 2, + "G0__tile__0 = ((int2)(1, -2));\nG0__tile__1 = ((int2)(-3, 4));\n"}, + {ConstantData({{1.0f, -2.0f}}, DataType::Fp16), DataType::Fp16, 1, 2, + "G0__tile = ((half2)(1.000000000e+00, -2.000000000e+00));\n"}, + {ConstantData({{/* FLT_MAX */ 340282346638528859811704183484516925440.0f, -2.0f, 3.0f}}, DataType::Fp32), DataType::Fp32, 1, 3, + "G0__tile = ((float3)(3.402823466e+38, -2.000000000e+00, 3.000000000e+00));\n"}, + {ConstantData({{1.0f, -1e-20f, 2e-20f, /* FLT_EPS */ 1.1920928955078125e-7f}}, DataType::Fp32), DataType::Fp32, 1, 4, + "G0__tile = ((float4)(1.000000000e+00, -9.999999683e-21, 1.999999937e-20, 1.192092896e-07));\n"}, + {ConstantData({{0.5f, 2.1e-30f, /* FLT_MIN */ 1.175494350822287507969e-38f}}, DataType::Fp32), DataType::Fp32, 1, 3, + "G0__tile = ((float3)(5.000000000e-01, 2.099999969e-30, 1.175494351e-38));\n"}, + {ConstantData({{true}, {false}, {false}}, DataType::Bool), DataType::Bool, 3, 1, + "G0__tile__0 = ((bool)(1));\nG0__tile__1 = ((bool)(0));\nG0__tile__2 = ((bool)(0));\n"} + }; + } + + bool run() override + { + bool all_tests_passed = true; + int test_idx = 0; + + for(TestConfig _config: _configs) + { + KernelWriterInterceptor writer; + const ConstantData const_data = std::get<0>(_config); + const DataType data_type = std::get<1>(_config); + const size_t height = std::get<2>(_config); + const size_t width = std::get<3>(_config); + const std::string 
expected_code = std::get<4>(_config); + + TileOperand tile = writer.declare_tile("tile", TileInfo(data_type, height, width)); + writer.start_capture_code(); + TileOperand const_tile = writer.declare_constant_tile(const_data); + writer.op_assign(tile, const_tile); + + VALIDATE_TEST(writer.check_added_code(expected_code), all_tests_passed, test_idx++); + } + + return all_tests_passed; + } + + std::string name() override + { + return "CLKernelWriterDeclareConstantTileTest"; + } + +private: + std::vector _configs {}; +}; + +} // namespace ckw + +#endif // CKW_VALIDATION_TESTS_CLKERNELWRITERDECLARECONSTANTTILETEST_H diff --git a/compute_kernel_writer/validation/tests/CLKernelWriterDeclareTensorTest.h b/compute_kernel_writer/validation/tests/CLKernelWriterDeclareTensorTest.h new file mode 100644 index 0000000000000000000000000000000000000000..3e1056972e644fb442f2bf9955aef8bc542909f0 --- /dev/null +++ b/compute_kernel_writer/validation/tests/CLKernelWriterDeclareTensorTest.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
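The floating-point expectations above assume constants are printed in scientific notation with nine digits after the decimal point. A hypothetical sketch of that formatting, not the library's own code:

#include <cstdio>
#include <string>

std::string format_fp32(float value)
{
    char buf[64];
    std::snprintf(buf, sizeof(buf), "%.9e", value);
    return buf;
}
// format_fp32(1.0f)                    == "1.000000000e+00"
// format_fp32(1.1920928955078125e-7f)  == "1.192092896e-07"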
+ */ + +#ifndef CKW_VALIDATION_TESTS_CLKERNELWRITERDECLARETENSORTEST_H +#define CKW_VALIDATION_TESTS_CLKERNELWRITERDECLARETENSORTEST_H + +#include "ckw/Error.h" +#include "ckw/Kernel.h" +#include "ckw/KernelArgument.h" +#include "ckw/TensorInfo.h" +#include "ckw/types/TensorComponentType.h" +#include "ckw/types/TensorDataLayout.h" +#include "src/cl/CLKernelWriter.h" +#include "validation/tests/common/Common.h" + +namespace ckw +{ + +class CLKernelWriterDeclareTensorTest : public ITest +{ +public: + CLKernelWriterDeclareTensorTest() + { + } + + std::string name() override + { + return "CLKernelWriterDeclareTensorTest"; + } + + bool run() override + { + auto all_tests_passed = true; + + CLKernelWriter writer; + + auto src = writer.declare_tensor_argument("src", TensorInfo(DataType::Fp32, TensorShape{ 2, 3, 4, 5 }, TensorDataLayout::Nhwc, 0)); + auto dst = writer.declare_tensor_argument("dst", TensorInfo(DataType::Fp32, TensorShape{ 6, 7, 8, 9 }, TensorDataLayout::Nhwc, 1)); + + auto src_dim0 = src.dim0(); + auto src_stride2 = src.stride2(); + auto src_offset_element = src.offset_first_element_in_bytes(); + + auto dst_dim1 = dst.dim0(); + + auto src_dim0_again = src.dim0(); + + CKW_UNUSED(src_dim0, src_stride2, src_offset_element, dst_dim1, src_dim0_again); + + const auto kernel = writer.emit_kernel("test_kernel"); + + const std::string expected_code = + "__kernel void test_kernel\n" + "(\n" + "int G0__src_dim0,\n" + "int G0__src_stride2,\n" + "int G0__src_offset_first_element,\n" + "int G0__dst_dim0\n" + ")\n" + "{\n" + "}\n"; + + const auto &actual_code = kernel->source_code(); + + int test_id = 0; + VALIDATE_TEST(kernel->arguments().size() == 4, all_tests_passed, test_id++); + test_tensor_component_argument(kernel->arguments()[0], 0, TensorComponentType::Dim0, all_tests_passed, test_id); + test_tensor_component_argument(kernel->arguments()[1], 0, TensorComponentType::Stride2, all_tests_passed, test_id); + test_tensor_component_argument(kernel->arguments()[2], 0, TensorComponentType::OffsetFirstElement, all_tests_passed, test_id); + test_tensor_component_argument(kernel->arguments()[3], 1, TensorComponentType::Dim0, all_tests_passed, test_id); + VALIDATE_TEST(actual_code == expected_code, all_tests_passed, test_id++); + + return all_tests_passed; + } + + void test_tensor_component_argument(const KernelArgument &arg, int32_t tensor_id, TensorComponentType component_type, bool &all_tests_passed, int &test_id) + { + VALIDATE_TEST(arg.type() == KernelArgument::Type::TensorComponent, all_tests_passed, test_id++); + VALIDATE_TEST(arg.id() == tensor_id, all_tests_passed, test_id++); + VALIDATE_TEST(arg.tensor_component_type() == component_type, all_tests_passed, test_id++); + } +}; + +} // namespace ckw + +#endif // CKW_VALIDATION_TESTS_CLKERNELWRITERDECLARETENSORTEST_H diff --git a/compute_kernel_writer/validation/tests/CLKernelWriterDeclareTileTest.h b/compute_kernel_writer/validation/tests/CLKernelWriterDeclareTileTest.h index 5e00084aaac55c36e1d929a479307692b060bf92..4f728bc1bfca4f8318a13101e47bad964bae37ef 100644 --- a/compute_kernel_writer/validation/tests/CLKernelWriterDeclareTileTest.h +++ b/compute_kernel_writer/validation/tests/CLKernelWriterDeclareTileTest.h @@ -73,7 +73,7 @@ public: std::string expected_code = ""; for(int32_t row = 0; row < height; ++row) { - expected_code += prefix + std::to_string(row) + ";\n"; + expected_code += prefix + ((height > 1) ? 
std::string("__") + std::to_string(row) : "") + ";\n"; } TileInfo tile_info(data_type, height, width); diff --git a/compute_kernel_writer/validation/tests/CLKernelWriterForTest.h b/compute_kernel_writer/validation/tests/CLKernelWriterForTest.h new file mode 100644 index 0000000000000000000000000000000000000000..beb39966b26b5876e05faaacde9bb824a680c6b1 --- /dev/null +++ b/compute_kernel_writer/validation/tests/CLKernelWriterForTest.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_VALIDATION_TESTS_CLKERNELWRITERFORTEST_H +#define CKW_VALIDATION_TESTS_CLKERNELWRITERFORTEST_H + +#include "ckw/Error.h" +#include "ckw/TileInfo.h" +#include "ckw/types/Operators.h" +#include "src/cl/CLKernelWriter.h" +#include "validation/tests/common/Common.h" +#include "validation/tests/common/KernelWriterInterceptor.h" + +namespace ckw +{ + +class CLKernelWriterForTest : public ITest +{ +public: + CLKernelWriterForTest() + { + } + + bool run() override + { + bool all_tests_passed = true; + + KernelWriterInterceptor writer; + + auto idx = writer.declare_tile("idx", TileInfo(DataType::Int32, 1, 1)); + auto len = writer.declare_tile("len", TileInfo(DataType::Int32, 1, 1)); + auto addr = writer.declare_tile("addr", TileInfo(DataType::Int32, 1, 1)); + auto esize = writer.declare_tile("esize", TileInfo(DataType::Int32, 1, 1)); + + writer.start_capture_code(); + + writer.op_for_loop( + idx, BinaryOp::Less, len, addr, AssignmentOp::Increment, esize, + [&]() + { + auto tile = writer.declare_tile("tile", TileInfo(DataType::Fp32, 1, 3)); + CKW_UNUSED(tile); + }); + + constexpr auto expected_code = + "for (; G0__idx < G0__len; G0__addr += G0__esize)\n" + "{\n" + "float3 G1__tile;\n" + "}\n"; + + VALIDATE_TEST(writer.check_added_code(expected_code), all_tests_passed, 0); + + return all_tests_passed; + } + + std::string name() override + { + return "CLKernelWriterForTest"; + } +}; + +} // namespace ckw + +#endif // CKW_VALIDATION_TESTS_CLKERNELWRITERFORTEST_H diff --git a/compute_kernel_writer/validation/tests/CLKernelWriterGetGlobalIdTest.h b/compute_kernel_writer/validation/tests/CLKernelWriterGetGlobalIdTest.h new file mode 100644 index 0000000000000000000000000000000000000000..fa34b3f5df7b5e1ab4badf11d2f7cd1086274146 --- /dev/null +++ b/compute_kernel_writer/validation/tests/CLKernelWriterGetGlobalIdTest.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_VALIDATION_TESTS_CLKERNELWRITERGETGLOBALIDTEST_H +#define CKW_VALIDATION_TESTS_CLKERNELWRITERGETGLOBALIDTEST_H + +#include "ckw/TileInfo.h" +#include "src/cl/CLKernelWriter.h" +#include "validation/tests/common/Common.h" +#include "validation/tests/common/KernelWriterInterceptor.h" + +namespace ckw +{ + +class CLKernelWriterGetGlobalIdTest : public ITest +{ +public: + CLKernelWriterGetGlobalIdTest() + { + } + + bool run() override + { + bool all_tests_passed = true; + + KernelWriterInterceptor writer; + + auto gid = writer.declare_tile("gid", TileInfo(DataType::Int32)); + + writer.start_capture_code(); + + writer.op_get_global_id(gid, 0); + writer.op_get_global_id(gid, 1); + writer.op_get_global_id(gid, 2); + + constexpr auto expected_code = "G0__gid = get_global_id(0);\nG0__gid = get_global_id(1);\nG0__gid = get_global_id(2);\n"; + + VALIDATE_TEST(writer.check_added_code(expected_code), all_tests_passed, 0); + + return all_tests_passed; + } + + std::string name() override + { + return "CLKernelWriterGetGlobalIdTest"; + } +}; + +} // namespace ckw + +#endif // CKW_VALIDATION_TESTS_CLKERNELWRITERGETGLOBALIDTEST_H diff --git a/compute_kernel_writer/validation/tests/CLKernelWriterIfTest.h b/compute_kernel_writer/validation/tests/CLKernelWriterIfTest.h new file mode 100644 index 0000000000000000000000000000000000000000..3964bd76d4755ef771c361db72210d1834fa1a72 --- /dev/null +++ b/compute_kernel_writer/validation/tests/CLKernelWriterIfTest.h @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_VALIDATION_TESTS_CLKERNELWRITERIFTEST_H +#define CKW_VALIDATION_TESTS_CLKERNELWRITERIFTEST_H + +#include "ckw/Error.h" +#include "ckw/TileInfo.h" +#include "src/cl/CLKernelWriter.h" +#include "validation/tests/common/Common.h" +#include "validation/tests/common/KernelWriterInterceptor.h" + +#include + +namespace ckw +{ + +class CLKernelWriterIfTest : public ITest +{ +public: + CLKernelWriterIfTest() + { + } + + bool run() override + { + int32_t test_no = 0; + bool all_tests_passed = true; + + KernelWriterInterceptor writer; + + auto lhs = writer.declare_tile("lhs", TileInfo(DataType::Fp32, 1, 1)); + auto rhs = writer.declare_tile("rhs", TileInfo(DataType::Fp32, 1, 1)); + + // The first if block. + { + writer.start_capture_code(); + + writer.op_if( + lhs, BinaryOp::Equal, rhs, + [&]() + { + auto tile = writer.declare_tile("tile", TileInfo(DataType::Fp16, 2, 3)); + CKW_UNUSED(tile); + }); + + constexpr auto expected_code = + "if (G0__lhs == G0__rhs)\n" + "{\n" + "half3 G1__tile__0;\n" + "half3 G1__tile__1;\n" + "}\n"; + + VALIDATE_TEST(writer.check_added_code(expected_code), all_tests_passed, test_no++); + } + + // The second if block - The ID space inside the if block should change. + { + writer.start_capture_code(); + + writer.op_if( + lhs, BinaryOp::Equal, rhs, + [&]() + { + auto tile = writer.declare_tile("tile", TileInfo(DataType::Fp16, 2, 3)); + CKW_UNUSED(tile); + }); + + constexpr auto expected_code = + "if (G0__lhs == G0__rhs)\n" + "{\n" + "half3 G2__tile__0;\n" + "half3 G2__tile__1;\n" + "}\n"; + + VALIDATE_TEST(writer.check_added_code(expected_code), all_tests_passed, test_no++); + } + + // The if-else block - The ID space in each block should change. + { + writer.start_capture_code(); + + writer.op_if( + lhs, BinaryOp::Equal, rhs, + [&]() + { + auto tile = writer.declare_tile("tile", TileInfo(DataType::Fp16, 2, 3)); + CKW_UNUSED(tile); + }); + writer.op_else( + [&]() + { + auto tile = writer.declare_tile("tile", TileInfo(DataType::Uint8, 1, 4)); + CKW_UNUSED(tile); + }); + + constexpr auto expected_code = + "if (G0__lhs == G0__rhs)\n" + "{\n" + "half3 G3__tile__0;\n" + "half3 G3__tile__1;\n" + "}\n" + "else\n" + "{\n" + "uchar4 G4__tile;\n" + "}\n"; + + VALIDATE_TEST(writer.check_added_code(expected_code), all_tests_passed, test_no++); + } + + // If-else if block. 
+ { + writer.start_capture_code(); + + writer.op_if( + lhs, BinaryOp::Equal, rhs, + [&]() + { + auto tile = writer.declare_tile("tile", TileInfo(DataType::Fp32, 1, 3)); + CKW_UNUSED(tile); + }); + writer.op_else_if( + lhs, BinaryOp::Less, rhs, + [&]() + { + auto tile = writer.declare_tile("tile", TileInfo(DataType::Int8, 1, 4)); + CKW_UNUSED(tile); + }); + + constexpr auto expected_code = + "if (G0__lhs == G0__rhs)\n" + "{\n" + "float3 G5__tile;\n" + "}\n" + "else if (G0__lhs < G0__rhs)\n" + "{\n" + "char4 G6__tile;\n" + "}\n"; + + VALIDATE_TEST(writer.check_added_code(expected_code), all_tests_passed, test_no++); + } + + return all_tests_passed; + } + + std::string name() override + { + return "CLKernelWriterIfTest"; + } +}; + +} // namespace ckw + +#endif // CKW_VALIDATION_TESTS_CLKERNELWRITERIFTEST_H diff --git a/compute_kernel_writer/validation/tests/CLKernelWriterOpLoadIndirectTest.h b/compute_kernel_writer/validation/tests/CLKernelWriterOpLoadIndirectTest.h new file mode 100644 index 0000000000000000000000000000000000000000..dacf3cd4350d6f5b47518b0c1f51f01caa97d91a --- /dev/null +++ b/compute_kernel_writer/validation/tests/CLKernelWriterOpLoadIndirectTest.h @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
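Regarding the CLKernelWriterIfTest expectations above: each body lambda passed to op_if, op_else_if or op_else opens a fresh ID space, so tiles declared inside successive blocks are prefixed G1__, G2__, G3__, ... while kernel-scope tiles keep the G0__ prefix. Sketch of the if/else case, with the resulting names as comments:

writer.op_if(lhs, BinaryOp::Equal, rhs, [&]()
{
    auto tile = writer.declare_tile("tile", TileInfo(DataType::Fp16, 2, 3)); // emitted as half3 G3__tile__0 / G3__tile__1
    CKW_UNUSED(tile);
});
writer.op_else([&]()
{
    auto tile = writer.declare_tile("tile", TileInfo(DataType::Uint8, 1, 4)); // emitted as uchar4 G4__tile
    CKW_UNUSED(tile);
});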
+ */ + +#ifndef CKW_VALIDATION_TESTS_CLKERNELWRITEROPLOADINDIRECTTEST_H +#define CKW_VALIDATION_TESTS_CLKERNELWRITEROPLOADINDIRECTTEST_H + +#include "ckw/TileInfo.h" +#include "ckw/types/DataType.h" +#include "ckw/TensorSampler.h" +#include "ckw/types/MemoryOperation.h" +#include "ckw/types/TensorSamplerTypes.h" +#include "src/cl/CLKernelWriter.h" +#include "validation/tests/common/KernelWriterInterceptor.h" +#include "validation/tests/common/Common.h" + +#include + +namespace ckw +{ + +class CLKernelWriterOpLoadIndirectTest : public ITest +{ +private: + using AddressModeX = TensorSamplerAddressModeX; + using AddressModeY = TensorSamplerAddressModeY; + using AddressModeZ = TensorSamplerAddressModeZ; + using Format = TensorSamplerFormat; + using Storage = TensorStorageType; + + struct Coordinates + { + Coordinates(std::string x, std::string y, std::string z, std::string batch) + : x(x), y(y), z(z), batch(batch) + { + } + + std::string x; + std::string y; + std::string z; + std::string batch; + }; + + struct SamplerData + { + SamplerData(Format format, AddressModeX mode_x, AddressModeY mode_y, AddressModeZ mode_z) + : format(format), mode_x(mode_x), mode_y(mode_y), mode_z(mode_z) + { + } + + Format format; + AddressModeX mode_x; + AddressModeY mode_y; + AddressModeZ mode_z; + }; + + using CLKernelWriterOpLoadIndirectConfig = std::tuple; + +public: + CLKernelWriterOpLoadIndirectTest() + { + const std::string fp_2x3_tile = R"_( +G0__tile__0 = vload3(0, (__global float*)(G0__tensor_ptr + (G0__x) * sizeof(float) + (G0__indirect_addr__0) * G0__tensor_stride1 + (G0__z) * G0__tensor_stride2 + (G0__b) * G0__tensor_stride3)); +G0__tile__1 = vload3(0, (__global float*)(G0__tensor_ptr + (G0__x) * sizeof(float) + (G0__indirect_addr__1) * G0__tensor_stride1 + (G0__z) * G0__tensor_stride2 + (G0__b) * G0__tensor_stride3)); +)_"; + + const std::string half_2x4_yz_collapsed_y_clamped_to_border_max_only_image = R"_( +G0__tile__0 = read_imageh(G0__tensor_img2d, CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST, (int2)((G0__x) >> 2, (G0__indirect_addr__0 + (G0__b) * G0__tensor_dim1xdim2 * 1))); +G0__tile__1 = read_imageh(G0__tensor_img2d, CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST, (int2)((G0__x) >> 2, (G0__indirect_addr__1 + (G0__b) * G0__tensor_dim1xdim2 * 1))); +)_"; + + const std::string int_2x4_y_skip_less_than_zero = R"_( +if(G0__indirect_addr__0 >= 0) +{ +G0__tile__0 = vload4(0, (__global int*)(G0__tensor_ptr + (G0__x) * sizeof(int) + (G0__indirect_addr__0) * G0__tensor_stride1 + (G0__z) * G0__tensor_stride2 + (G0__b) * G0__tensor_stride3)); +} +if(G0__indirect_addr__1 >= 0) +{ +G0__tile__1 = vload4(0, (__global int*)(G0__tensor_ptr + (G0__x) * sizeof(int) + (G0__indirect_addr__1) * G0__tensor_stride1 + (G0__z) * G0__tensor_stride2 + (G0__b) * G0__tensor_stride3)); +} +)_"; + + // tensor shape in x-dim is 10 (thus the 8, 2 vloads in if, else blocks respectively) + const std::string uint16_3x8_yz_collapsed_b_eq_0_x_overlapping_min_y_skip_less_than_zero = R"_( +if(G0__x > 0) +{ +if(G0__indirect_addr__0 >= 0) +{ +G0__tile__0 = vload8(0, (__global ushort*)(G0__tensor_ptr + (G0__x) * sizeof(ushort) + (G0__indirect_addr__0) * G0__tensor_stride1 + (G0__0) * G0__tensor_stride3)); +} +if(G0__indirect_addr__1 >= 0) +{ +G0__tile__1 = vload8(0, (__global ushort*)(G0__tensor_ptr + (G0__x) * sizeof(ushort) + (G0__indirect_addr__1) * G0__tensor_stride1 + (G0__0) * G0__tensor_stride3)); +} +if(G0__indirect_addr__2 >= 0) +{ +G0__tile__2 = vload8(0, (__global 
ushort*)(G0__tensor_ptr + (G0__x) * sizeof(ushort) + (G0__indirect_addr__2) * G0__tensor_stride1 + (G0__0) * G0__tensor_stride3)); +} +} +else +{ +if(G0__indirect_addr__0 >= 0) +{ +G0__tile__0.s01 = vload2(0, (__global ushort*)(G0__tensor_ptr + (G0__x + 0) * sizeof(ushort) + (G0__indirect_addr__0) * G0__tensor_stride1 + (G0__0) * G0__tensor_stride3)); +} +if(G0__indirect_addr__1 >= 0) +{ +G0__tile__1.s01 = vload2(0, (__global ushort*)(G0__tensor_ptr + (G0__x + 0) * sizeof(ushort) + (G0__indirect_addr__1) * G0__tensor_stride1 + (G0__0) * G0__tensor_stride3)); +} +if(G0__indirect_addr__2 >= 0) +{ +G0__tile__2.s01 = vload2(0, (__global ushort*)(G0__tensor_ptr + (G0__x + 0) * sizeof(ushort) + (G0__indirect_addr__2) * G0__tensor_stride1 + (G0__0) * G0__tensor_stride3)); +} +} +)_"; + + // Configs Bundled + _configs = { + { + TileInfo(DataType::Fp32, 2, 3), + TensorStorageType::BufferUint8Ptr, + SamplerData(Format::Dim0_Dim1_Dim2, AddressModeX::None, AddressModeY::None, AddressModeZ::None), + Coordinates("x", "y", "z", "b"), + fp_2x3_tile + }, + { + TileInfo(DataType::Fp16, 2, 4), + TensorStorageType::Texture2dReadOnly, + SamplerData(Format::Dim0_Dim1xDim2_1, AddressModeX::None, AddressModeY::ClampToBorderMaxOnly, AddressModeZ::None), + Coordinates("x", "y", "z", "b"), + half_2x4_yz_collapsed_y_clamped_to_border_max_only_image + }, + { + TileInfo(DataType::Int32, 2, 4), + TensorStorageType::BufferUint8Ptr, + SamplerData(Format::Dim0_Dim1_Dim2, AddressModeX::None, AddressModeY::SkipLessThanZero, AddressModeZ::None), + Coordinates("x", "y", "z", "b"), + int_2x4_y_skip_less_than_zero + }, + { + TileInfo(DataType::Uint16, 3, 8), + TensorStorageType::BufferUint8Ptr, + SamplerData(Format::Dim0_Dim1xDim2_1, AddressModeX::OverlappingMin, AddressModeY::SkipLessThanZero, AddressModeZ::None), + Coordinates("x", "y", "z", "0"), + uint16_3x8_yz_collapsed_b_eq_0_x_overlapping_min_y_skip_less_than_zero + } + }; + } + + bool run() override + { + bool all_tests_passed = true; + int32_t test_idx = 0; + + for(auto _config: _configs) + { + KernelWriterInterceptor writer; + + const TileInfo tile_info = std::get<0>(_config); + const Storage storage = std::get<1>(_config); + const SamplerData sampler_data = std::get<2>(_config); + const Coordinates coord = std::get<3>(_config); + const std::string expected_code = std::get<4>(_config).substr(1); // ignore initial newline, which was added for convenience + + TileOperand tile_op = writer.declare_tile("tile", TileInfo(tile_info.data_type(), tile_info.height(), tile_info.width())); + TileOperand indirect_addr_op = writer.declare_tile("indirect_addr", TileInfo(DataType::Int32, tile_info.height(), 1)); // (M0, 1) + TileOperand x_op = writer.declare_tile(coord.x, TileInfo(DataType::Int32)); + TileOperand z_op = writer.declare_tile(coord.z, TileInfo(DataType::Int32)); + TileOperand batch_op = writer.declare_tile(coord.batch, TileInfo(DataType::Int32)); + + TensorShape tensor_shape {10, 10, 10, 10}; + TensorInfo tensor_info(tile_info.data_type(), tensor_shape, TensorDataLayout::Nhwc, 0 /* id */); + TensorOperand tensor_op = writer.declare_tensor_argument("tensor", tensor_info); + TensorSampler sampler(storage, sampler_data.format, sampler_data.mode_x, sampler_data.mode_y, sampler_data.mode_z); + + writer.start_capture_code(); + writer.op_load_indirect(tile_op, tensor_op, sampler, x_op, indirect_addr_op, z_op, batch_op); + + VALIDATE_TEST(writer.check_added_code(expected_code), all_tests_passed, test_idx++); + } + + return all_tests_passed; + } + + std::string name() override + 
{ + return "CLKernelWriterOpLoadIndirectTest"; + } + +private: + std::vector _configs {}; +}; + +} // namespace ckw + +#endif // CKW_VALIDATION_TESTS_CLKERNELWRITEROPLOADINDIRECTTEST_H diff --git a/compute_kernel_writer/validation/tests/CLKernelWriterOpLoadStoreTest.h b/compute_kernel_writer/validation/tests/CLKernelWriterOpLoadStoreTest.h new file mode 100644 index 0000000000000000000000000000000000000000..870e80ee9a52b31b61d05f2ef5f1c8b82a2fe9f8 --- /dev/null +++ b/compute_kernel_writer/validation/tests/CLKernelWriterOpLoadStoreTest.h @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_VALIDATION_TESTS_CLKERNELWRITEROPLOADSTORETEST_H +#define CKW_VALIDATION_TESTS_CLKERNELWRITEROPLOADSTORETEST_H + +#include "ckw/TileInfo.h" +#include "ckw/types/DataType.h" +#include "src/cl/CLKernelWriter.h" +#include "validation/tests/common/KernelWriterInterceptor.h" +#include "validation/tests/common/Common.h" + +#include "ckw/TensorSampler.h" +#include "ckw/types/MemoryOperation.h" +#include "ckw/types/TensorSamplerTypes.h" + +#include + +namespace ckw +{ + +class CLKernelWriterOpLoadStoreTest : public ITest +{ +private: + using AddressModeX = TensorSamplerAddressModeX; + using AddressModeY = TensorSamplerAddressModeY; + using AddressModeZ = TensorSamplerAddressModeZ; + using Format = TensorSamplerFormat; + using Storage = TensorStorageType; + + struct Coordinates + { + Coordinates(std::string x, std::string y, std::string z, std::string batch) + : x(x), y(y), z(z), batch(batch) + { + } + + std::string x; + std::string y; + std::string z; + std::string batch; + }; + + struct SamplerData + { + SamplerData(Format format, AddressModeX mode_x, AddressModeY mode_y, AddressModeZ mode_z) + : format(format), mode_x(mode_x), mode_y(mode_y), mode_z(mode_z) + { + } + + Format format; + AddressModeX mode_x; + AddressModeY mode_y; + AddressModeZ mode_z; + }; + + struct Dilations + { + Dilations(std::string dilation_x, std::string dilation_y) + : dilation_x(dilation_x), dilation_y(dilation_y) + { + } + + std::string dilation_x; + std::string dilation_y; + }; + + using CLKernelWriterOpLoadStoreConfig = std::tuple; + +public: + CLKernelWriterOpLoadStoreTest() + { + // Cases + const std::string load_fp_2x3_tile = R"_( +G0__tile__0 = vload3(0, (__global float*)(G0__tensor_ptr + (G0__x) * sizeof(float) + (G0__y + 0) * G0__tensor_stride1 + (G0__z) * G0__tensor_stride2 + (G0__b) * 
G0__tensor_stride3)); +G0__tile__1 = vload3(0, (__global float*)(G0__tensor_ptr + (G0__x) * sizeof(float) + (G0__y + 1) * G0__tensor_stride1 + (G0__z) * G0__tensor_stride2 + (G0__b) * G0__tensor_stride3)); +)_"; + const std::string load_half_2x4_tile_image_clamp_y = R"_( +G0__tile__0 = read_imageh(G0__tensor_img2d, CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST, (int2)((G0__x) >> 2, (G0__y + 0 + (G0__z) * G0__tensor_dim1 + (G0__b) * G0__tensor_dim1 * G0__tensor_dim2))); +G0__tile__1 = read_imageh(G0__tensor_img2d, CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST, (int2)((G0__x) >> 2, (G0__y + 1 + (G0__z) * G0__tensor_dim1 + (G0__b) * G0__tensor_dim1 * G0__tensor_dim2))); +)_"; + const std::string store_fp_2x3_tile = R"_( +vstore3(G0__tile__0, 0, (__global float*)(G0__tensor_ptr + (G0__x) * sizeof(float) + (G0__y + 0) * G0__tensor_stride1 + (G0__b) * G0__tensor_stride3)); +vstore3(G0__tile__1, 0, (__global float*)(G0__tensor_ptr + (G0__x) * sizeof(float) + (G0__y + 1) * G0__tensor_stride1 + (G0__b) * G0__tensor_stride3)); +)_"; + const std::string store_int8_4x4_y_dilation_batch_eq_0 = R"_( +vstore4(G0__tile__0, 0, (__global char*)(G0__tensor_ptr + (((int)(1))) * sizeof(char) + (G0__y + 0 * G0__y_dilation) * G0__tensor_stride1 + (G0__z) * G0__tensor_stride2 + (((int)(0))) * G0__tensor_stride3)); +vstore4(G0__tile__1, 0, (__global char*)(G0__tensor_ptr + (((int)(1))) * sizeof(char) + (G0__y + 1 * G0__y_dilation) * G0__tensor_stride1 + (G0__z) * G0__tensor_stride2 + (((int)(0))) * G0__tensor_stride3)); +vstore4(G0__tile__2, 0, (__global char*)(G0__tensor_ptr + (((int)(1))) * sizeof(char) + (G0__y + 2 * G0__y_dilation) * G0__tensor_stride1 + (G0__z) * G0__tensor_stride2 + (((int)(0))) * G0__tensor_stride3)); +vstore4(G0__tile__3, 0, (__global char*)(G0__tensor_ptr + (((int)(1))) * sizeof(char) + (G0__y + 3 * G0__y_dilation) * G0__tensor_stride1 + (G0__z) * G0__tensor_stride2 + (((int)(0))) * G0__tensor_stride3)); +)_"; + // tensor dimension is 10 + const std::string load_fp_2x3_tile_x_overlapping_min_y_eq_0_batch_eq_1 = R"_( +if(G0__x > 0) +{ +G0__tile__0 = vload3(0, (__global float*)(G0__tensor_ptr + (G0__x) * sizeof(float) + (((int)(0)) + 0) * G0__tensor_stride1 + (G0__z) * G0__tensor_stride2 + (((int)(1))) * G0__tensor_stride3)); +G0__tile__1 = vload3(0, (__global float*)(G0__tensor_ptr + (G0__x) * sizeof(float) + (((int)(0)) + 1) * G0__tensor_stride1 + (G0__z) * G0__tensor_stride2 + (((int)(1))) * G0__tensor_stride3)); +} +else +{ +G0__tile__0.s0 = *((__global float*)(G0__tensor_ptr + (G0__x + 0) * sizeof(float) + (((int)(0)) + 0) * G0__tensor_stride1 + (G0__z) * G0__tensor_stride2 + (((int)(1))) * G0__tensor_stride3)); +G0__tile__1.s0 = *((__global float*)(G0__tensor_ptr + (G0__x + 0) * sizeof(float) + (((int)(0)) + 1) * G0__tensor_stride1 + (G0__z) * G0__tensor_stride2 + (((int)(1))) * G0__tensor_stride3)); +} +)_"; + const std::string store_fp_2x3_tile_x_overlapping_min_y_clamp_to_border_max_only = R"_( +if(G0__x > 0) +{ +if(G0__y + 0 < G0__tensor_dim1) +{ +vstore3(G0__tile__0, 0, (__global float*)(G0__tensor_ptr + (G0__x) * sizeof(float) + (G0__y + 0) * G0__tensor_stride1 + (G0__z) * G0__tensor_stride2 + (G0__b) * G0__tensor_stride3)); +} +else +{ +G0__tile__0 = 0.0f; +} +if(G0__y + 1 < G0__tensor_dim1) +{ +vstore3(G0__tile__1, 0, (__global float*)(G0__tensor_ptr + (G0__x) * sizeof(float) + (G0__y + 1) * G0__tensor_stride1 + (G0__z) * G0__tensor_stride2 + (G0__b) * G0__tensor_stride3)); +} +else +{ +G0__tile__1 = 0.0f; +} +} +else +{ +if(G0__y 
+ 0 < G0__tensor_dim1) +{ +*((__global float*)(G0__tensor_ptr + (G0__x + 0) * sizeof(float) + (G0__y + 0) * G0__tensor_stride1 + (G0__z) * G0__tensor_stride2 + (G0__b) * G0__tensor_stride3)) = G0__tile__0.s0; +} +else +{ +G0__tile__0.s0 = 0.0f; +} +if(G0__y + 1 < G0__tensor_dim1) +{ +*((__global float*)(G0__tensor_ptr + (G0__x + 0) * sizeof(float) + (G0__y + 1) * G0__tensor_stride1 + (G0__z) * G0__tensor_stride2 + (G0__b) * G0__tensor_stride3)) = G0__tile__1.s0; +} +else +{ +G0__tile__1.s0 = 0.0f; +} +} +)_"; + const std::string store_half_2x4_tile_x_image_y_dilation = R"_( +write_imageh(G0__tensor_img2d, (int2)((G0__x) >> 2, (((int)(0)) + 0 * G0__y_dilation + (G0__z) * G0__tensor_dim1 + (((int)(1))) * G0__tensor_dim1 * G0__tensor_dim2)), G0__tile__0); +write_imageh(G0__tensor_img2d, (int2)((G0__x) >> 2, (((int)(0)) + 1 * G0__y_dilation + (G0__z) * G0__tensor_dim1 + (((int)(1))) * G0__tensor_dim1 * G0__tensor_dim2)), G0__tile__1); +)_"; + + // Configs Bundled + _configs = { + // op, tile, storage, sampler, coordinates, dilation, expected + { + MemoryOperation::Load, + TileInfo(DataType::Fp32, 2, 3), + TensorStorageType::BufferUint8Ptr, + SamplerData(Format::Dim0_Dim1_Dim2, AddressModeX::None, AddressModeY::None, AddressModeZ::None), + Coordinates("x", "y", "z", "b"), + Dilations("1", "1"), + load_fp_2x3_tile + }, + { + MemoryOperation::Load, + TileInfo(DataType::Fp16, 2, 4), + TensorStorageType::Texture2dReadOnly, + SamplerData(Format::Dim0_Dim1_Dim2, AddressModeX::None, AddressModeY::ClampToBorderMaxOnly, AddressModeZ::None), + Coordinates("x", "y", "z", "b"), + Dilations("1", "1"), + load_half_2x4_tile_image_clamp_y + }, + { + MemoryOperation::Store, + TileInfo(DataType::Fp32, 2, 3), + TensorStorageType::BufferUint8Ptr, + SamplerData(Format::Dim0_Dim1xDim2_1,AddressModeX::None, AddressModeY::None, AddressModeZ::None), + Coordinates("x", "y", "z", "b"), + Dilations("1", "1"), + store_fp_2x3_tile + }, + { + MemoryOperation::Store, + TileInfo(DataType::Int8, 4, 4), + TensorStorageType::BufferUint8Ptr, + SamplerData(Format::Dim0_Dim1_Dim2, AddressModeX::None, AddressModeY::None, AddressModeZ::None), + Coordinates("1", "y", "z", "0"), + Dilations("1", "y_dilation"), + store_int8_4x4_y_dilation_batch_eq_0 + }, + { + MemoryOperation::Load, + TileInfo(DataType::Fp32, 2, 3), + TensorStorageType::BufferUint8Ptr, + SamplerData(Format::Dim0_Dim1_Dim2, AddressModeX::OverlappingMin, AddressModeY::None, AddressModeZ::None), + Coordinates("x", "0", "z", "1"), + Dilations("1", "1"), + load_fp_2x3_tile_x_overlapping_min_y_eq_0_batch_eq_1 + }, + { + MemoryOperation::Store, + TileInfo(DataType::Fp32, 2, 3), + TensorStorageType::BufferUint8Ptr, + SamplerData(Format::Dim0_Dim1_Dim2, AddressModeX::OverlappingMin, AddressModeY::ClampToBorderMaxOnly, AddressModeZ::None), + Coordinates("x", "y", "z", "b"), + Dilations("1", "1"), + store_fp_2x3_tile_x_overlapping_min_y_clamp_to_border_max_only + }, + { + MemoryOperation::Store, + TileInfo(DataType::Fp16, 2, 4), + TensorStorageType::Texture2dWriteOnly, + SamplerData(Format::Dim0_Dim1_Dim2, AddressModeX::None, AddressModeY::None, AddressModeZ::None), + Coordinates("x", "0", "z", "1"), + Dilations("1", "y_dilation"), + store_half_2x4_tile_x_image_y_dilation + } + }; + } + + TileOperand declare_tile_helper(KernelWriter &writer, std::string tile) + { + if(tile == "0" || tile == "1") + { + return writer.declare_constant_tile(ConstantData({{std::stoi(tile)}}, DataType::Int32)); + } + else + { + return writer.declare_tile(tile, TileInfo(DataType::Int32)); + } + } + + bool 
run() override + { + bool all_tests_passed = true; + int32_t test_idx = 0; + + for(auto _config: _configs) + { + KernelWriterInterceptor writer; + + const MemoryOperation op = std::get<0>(_config); + const TileInfo tile_info = std::get<1>(_config); + const Storage storage = std::get<2>(_config); + const SamplerData sampler_data = std::get<3>(_config); + const Coordinates coord = std::get<4>(_config); + const Dilations dilations = std::get<5>(_config); + const std::string expected_code = std::get<6>(_config).substr(1); // ignore initial newline, which was added for convenience + + TileOperand tile_op = writer.declare_tile("tile", tile_info); + TileOperand x_op = declare_tile_helper(writer, coord.x); + TileOperand y_op = declare_tile_helper(writer, coord.y); + TileOperand z_op = declare_tile_helper(writer, coord.z); + TileOperand batch_op = declare_tile_helper(writer, coord.batch); + TileOperand dil_x_op = declare_tile_helper(writer, dilations.dilation_x); + TileOperand dil_y_op = declare_tile_helper(writer, dilations.dilation_y); + + TensorShape tensor_shape {10, 10, 10, 10}; + TensorInfo tensor_info(tile_info.data_type(), tensor_shape, TensorDataLayout::Nhwc, 0 /* id */); + TensorOperand tensor_op = writer.declare_tensor_argument("tensor", tensor_info); + TensorSampler sampler(storage, sampler_data.format, sampler_data.mode_x, sampler_data.mode_y, sampler_data.mode_z); + + const bool no_dilation = (dilations.dilation_x == "1" && dilations.dilation_y == "1"); + + writer.start_capture_code(); + if(op == MemoryOperation::Load) + { + if(no_dilation) + { + writer.op_load(tile_op, tensor_op, sampler, x_op, y_op, z_op, batch_op); + } + else + { + writer.op_load_dilated(tile_op, tensor_op, sampler, x_op, y_op, z_op, batch_op, dil_x_op, dil_y_op); + } + } + else + { + if(no_dilation) + { + writer.op_store(tensor_op, tile_op, sampler, x_op, y_op, z_op, batch_op); + } + else + { + writer.op_store_dilated(tensor_op, tile_op, sampler, x_op, y_op, z_op, batch_op, dil_x_op, dil_y_op); + } + } + + VALIDATE_TEST(writer.check_added_code(expected_code), all_tests_passed, test_idx++); + } + + return all_tests_passed; + } + + std::string name() override + { + return "CLKernelWriterOpLoadStoreTest"; + } + +private: + std::vector _configs {}; +}; + +} // namespace ckw + +#endif // CKW_VALIDATION_TESTS_CLKERNELWRITEROPLOADSTORETEST_H diff --git a/compute_kernel_writer/validation/tests/CLKernelWriterPrintTest.h b/compute_kernel_writer/validation/tests/CLKernelWriterPrintTest.h new file mode 100644 index 0000000000000000000000000000000000000000..6229dfb8c07b585f1851157a55aa9e59f1a31d18 --- /dev/null +++ b/compute_kernel_writer/validation/tests/CLKernelWriterPrintTest.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_VALIDATION_TESTS_CLKERNELWRITERPRINT_H +#define CKW_VALIDATION_TESTS_CLKERNELWRITERPRINT_H + +#include "ckw/TileInfo.h" +#include "src/cl/CLKernelWriter.h" +#include "validation/tests/common/Common.h" +#include "validation/tests/common/KernelWriterInterceptor.h" + +namespace ckw +{ + +class CLKernelWriterPrintTest : public ITest +{ +public: + CLKernelWriterPrintTest() + { + } + + bool run() override + { + bool all_tests_passed = true; + + KernelWriterInterceptor writer; + + const auto tile2x3f16 = writer.declare_tile("tile2x3f16", TileInfo(DataType::Fp16, 2, 3)); + const auto tile1x2i32 = writer.declare_tile("tile1x2i32", TileInfo(DataType::Int32, 1, 2)); + const auto tile2x1s32 = writer.declare_tile("tile2x1s32", TileInfo(DataType::Int32, 2, 1)); + const auto tile1x1u32 = writer.declare_tile("tile1x1u32", TileInfo(DataType::Uint32, 1, 1)); + + writer.start_capture_code(); + + writer.op_print("debug_log", { tile2x3f16, tile1x2i32, tile2x1s32, tile1x1u32 }); + + constexpr auto expected_code = + "printf(\"debug_log\\nG0__tile2x3f16 = [[%v3hg], [%v3hg]]\\nG0__tile1x2i32 = [%v2hli]\\nG0__tile2x1s32 = [%i, %i]\\nG0__tile1x1u32 = %u\\n\", " + "G0__tile2x3f16__0, G0__tile2x3f16__1, G0__tile1x2i32, G0__tile2x1s32__0, G0__tile2x1s32__1, G0__tile1x1u32);\n"; + + VALIDATE_TEST(writer.check_added_code(expected_code), all_tests_passed, 0); + + return all_tests_passed; + } + + std::string name() override + { + return "CLKernelWriterPrintTest"; + } +}; + +} // namespace ckw + +#endif // CKW_VALIDATION_TESTS_CLKERNELWRITERPRINT_H diff --git a/compute_kernel_writer/validation/tests/CLKernelWriterReturnTest.h b/compute_kernel_writer/validation/tests/CLKernelWriterReturnTest.h new file mode 100644 index 0000000000000000000000000000000000000000..fc3f2a639b60a6864a883a6d51439992064ebd78 --- /dev/null +++ b/compute_kernel_writer/validation/tests/CLKernelWriterReturnTest.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_VALIDATION_TESTS_CLKERNELWRITERRETURNTEST_H +#define CKW_VALIDATION_TESTS_CLKERNELWRITERRETURNTEST_H + +#include "src/cl/CLKernelWriter.h" +#include "validation/tests/common/Common.h" +#include "validation/tests/common/KernelWriterInterceptor.h" + +namespace ckw +{ + +class CLKernelWriterReturnTest : public ITest +{ +public: + CLKernelWriterReturnTest() + { + } + + bool run() override + { + bool all_tests_passed = true; + + KernelWriterInterceptor writer; + + writer.start_capture_code(); + + writer.op_return(); + + constexpr auto expected_code = "return;\n"; + + VALIDATE_TEST(writer.check_added_code(expected_code), all_tests_passed, 0); + + return all_tests_passed; + } + + std::string name() override + { + return "CLKernelWriterReturnTest"; + } +}; + +} // namespace ckw + +#endif // CKW_VALIDATION_TESTS_CLKERNELWRITERRETURNTEST_H diff --git a/compute_kernel_writer/validation/tests/CLKernelWriterSubTileTest.h b/compute_kernel_writer/validation/tests/CLKernelWriterSubTileTest.h new file mode 100644 index 0000000000000000000000000000000000000000..ea360b289e42c5c0e793d76e9837baf9554bb2e8 --- /dev/null +++ b/compute_kernel_writer/validation/tests/CLKernelWriterSubTileTest.h @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_VALIDATION_SRC_TESTS_CLKERNELWRITERSUBTILETEST_H +#define CKW_VALIDATION_SRC_TESTS_CLKERNELWRITERSUBTILETEST_H + +#include "ckw/TileInfo.h" +#include "ckw/types/DataType.h" +#include "ckw/types/Operators.h" +#include "src/cl/CLKernelWriter.h" +#include "validation/tests/common/Common.h" +#include "validation/tests/common/KernelWriterInterceptor.h" + +#include +#include + +namespace ckw +{ + +class CLKernelWriterSubTileTest : public ITest +{ +public: + CLKernelWriterSubTileTest() + { + // These are the definitions of the tiles involving in the writing actions. + // + // Structure: + // * List of tiles: + // - Tile full height. + // - Tile full width. + // - Tile view access type (full tile, vector, scalar). + // - Tile view start row. + // - Tile view start column. + // - The tile name. + + // Vector access. 
+ _tests.push_back( + { { { 1, 4, AccessType::Vector, 0, 0, "{tile_name}" }, // + { 4, 4, AccessType::Vector, 2, 0, "{tile_name}__2" }, + { 1, 4, AccessType::Full, 0, 0, "{tile_name}" }, + { 4, 4, AccessType::Vector, 3, 0, "{tile_name}__3" } } }); + + // Scalar access. + _tests.push_back( + { { { 1, 1, AccessType::Full, 0, 0, "{tile_name}" }, // + { 4, 8, AccessType::Scalar, 2, 4, "{tile_name}__2.s4" }, + { 1, 16, AccessType::ScalarOfVector, 0, 10, "{tile_name}.sA" }, + { 1, 1, AccessType::Scalar, 0, 0, "{tile_name}" } } }); + + // These are the definitions of the writing actions. + // + // Structure: + // * Writing function. + // * Whether this function only works with scalar value. + // * Expected code format. + + _actions.push_back( + { [](CLKernelWriter &writer, const std::vector &args) + { + writer.op_assign(args.at(0), args.at(1)); + }, + false, + "{op0} = {op1};\n" }); + + _actions.push_back( + { [](CLKernelWriter &writer, const std::vector &args) + { + writer.op_unary(args.at(0), UnaryOp::Sqrt, args.at(1)); + }, + false, + "{op0} = sqrt({op1});\n" }); + + _actions.push_back( + { [](CLKernelWriter &writer, const std::vector &args) + { + writer.op_binary(args.at(0), BinaryOp::Add, args.at(1), args.at(2)); + }, + false, + "{op0} = {op1} + {op2};\n" }); + + _actions.push_back( + { [](CLKernelWriter &writer, const std::vector &args) + { + writer.op_ternary(args.at(0), TernaryOp::Clamp, args.at(1), args.at(2), args.at(3)); + }, + false, + "{op0} = clamp({op1}, {op2}, {op3});\n" }); + + _actions.push_back( + { [](CLKernelWriter &writer, const std::vector &args) + { + writer.op_if(args.at(0), BinaryOp::Greater, args.at(1), [] {}); + }, + true, + "if ({op0} > {op1})\n{\n}\n" }); + } + + bool run() override + { + bool all_tests_passed = true; + int32_t test_idx = 0; + + KernelWriterInterceptor writer; + + for(size_t test_no = 0; test_no < _tests.size(); ++test_no) + { + const TestInfo &test = _tests.at(test_no); + + // Declare all the tiles and get the full name of those tile operand. + std::vector tiles; + std::vector expected_tiles_name; + + for(size_t operand_no = 0; operand_no < test.operands.size(); ++operand_no) + { + const TestOperand &operand = test.operands.at(operand_no); + std::string name = "test" + std::to_string(test_no) + "_op" + std::to_string(operand_no); + + const TileOperand full_tile = writer.declare_tile(name, TileInfo(DataType::Fp32, operand.height, operand.width)); + + switch(operand.access_type) + { + case AccessType::Full: + tiles.emplace_back(full_tile); + break; + + case AccessType::Vector: + tiles.emplace_back(full_tile.row(operand.start_row)); + break; + + case AccessType::Scalar: + tiles.emplace_back(full_tile.scalar(operand.start_row, operand.start_col)); + break; + + case AccessType::ScalarOfVector: + tiles.emplace_back(full_tile.row(operand.start_row).scalar(0, operand.start_col)); + break; + + default: + CKW_THROW_MSG("Unsupported access type!"); + } + + expected_tiles_name.push_back("G0__" + name); + } + + // Try each writing action using the newly declared tiles. + for(const TestAction &action : _actions) + { + if(action.scalar_only && // + (test.operands.at(0).access_type != AccessType::Scalar && // + (test.operands.at(0).height != 1 || test.operands.at(0).width != 1))) + { + continue; + } + + writer.start_capture_code(); + + action.write(writer, tiles); + + // The expected code is constructed from the format strings. 
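Worked example of the placeholder substitution described in the comment above, taking operand 1 of the second test case (a scalar view of a 4x8 tile) together with the sqrt action:

std::string expected = "{op0} = sqrt({op1});\n";
expected = search_and_replace(expected, "{op0}", "G0__test1_op0");
expected = search_and_replace(expected, "{op1}", "G0__test1_op1__2.s4");
// expected == "G0__test1_op0 = sqrt(G0__test1_op1__2.s4);\n"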
+ std::string expected_code = action.expected_code; + + for(size_t operand_no = 0; operand_no < test.operands.size(); ++operand_no) + { + const TestOperand &operand = test.operands.at(operand_no); + + const std::string op_name = search_and_replace(operand.name, "{tile_name}", expected_tiles_name.at(operand_no)); + expected_code = search_and_replace(expected_code, "{op" + std::to_string(operand_no) + "}", op_name); + } + + VALIDATE_TEST(writer.check_added_code(expected_code), all_tests_passed, test_idx++); + } + } + + return all_tests_passed; + } + + std::string search_and_replace(const std::string &src, const std::string &search, const std::string &replace) + { + std::string result = src; + + size_t idx = 0; + + while(true) + { + idx = result.find(search, idx); + + if(idx == std::string::npos) + { + break; + } + + result = result.replace(idx, search.size(), replace); + } + + return result; + } + + std::string name() override + { + return "CLKernelWriterSubTileTest"; + } + +private: + enum class AccessType + { + Full, + Vector, + Scalar, + ScalarOfVector, + }; + + struct TestOperand + { + int32_t height; + int32_t width; + + AccessType access_type; + int32_t start_row; + int32_t start_col; + + std::string name; + }; + + struct TestInfo + { + std::vector operands; + }; + + struct TestAction + { + std::function &)> write; + + bool scalar_only; + std::string expected_code; + }; + + std::vector _tests{}; + std::vector _actions{}; +}; + +} // namespace ckw + +#endif // CKW_VALIDATION_SRC_TESTS_CLKERNELWRITERSUBTILETEST_H diff --git a/compute_kernel_writer/validation/tests/CLKernelWriterTernaryOpTest.h b/compute_kernel_writer/validation/tests/CLKernelWriterTernaryOpTest.h new file mode 100644 index 0000000000000000000000000000000000000000..d25d3e2958ea432e1362f85ab11011f6b0c2eb94 --- /dev/null +++ b/compute_kernel_writer/validation/tests/CLKernelWriterTernaryOpTest.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef CKW_VALIDATION_TESTS_CLKERNELWRITERTERNARYOPTEST_H +#define CKW_VALIDATION_TESTS_CLKERNELWRITERTERNARYOPTEST_H + +#include "ckw/TileInfo.h" +#include "ckw/types/DataType.h" +#include "ckw/types/Operators.h" +#include "src/cl/CLKernelWriter.h" +#include "validation/tests/common/Common.h" +#include "validation/tests/common/KernelWriterInterceptor.h" + +#include +#include + +namespace ckw +{ + +class CLKernelWriterTernaryOpTest : public ITest +{ +public: + CLKernelWriterTernaryOpTest() + { + // dst_height, dst_width, op0_height, op0_width, op1_height, op1_width, op2_height, op2_width, data_type, op, expected_code + + _tests.push_back({ 1, 1, 1, 1, 1, 1, 1, 1, DataType::Fp32, TernaryOp::Select, "G0__dst = select(G0__op0, G0__op1, G0__op2);\n" }); // Scalar. + + _tests.push_back({ 1, 3, 1, 3, 1, 3, 1, 3, DataType::Fp16, TernaryOp::Clamp, "G0__dst = clamp(G0__op0, G0__op1, G0__op2);\n" }); // Whole vector. + + _tests.push_back({ 2, 4, 2, 4, 2, 4, 2, 4, DataType::Int8, TernaryOp::Select, "G0__dst__0 = select(G0__op0__0, G0__op1__0, G0__op2__0);\nG0__dst__1 = select(G0__op0__1, G0__op1__1, G0__op2__1);\n" }); // Whole tile. + + _tests.push_back({ 2, 3, 1, 3, 2, 3, 2, 3, DataType::Uint8, TernaryOp::Clamp, "G0__dst__0 = clamp(G0__op0, G0__op1__0, G0__op2__0);\nG0__dst__1 = clamp(G0__op0, G0__op1__1, G0__op2__1);\n" }); // 1st operand y-dimension broadcast. + + _tests.push_back({ 2, 3, 2, 3, 2, 1, 2, 3, DataType::Fp32, TernaryOp::Select, "G0__dst__0 = select(G0__op0__0, (float3)G0__op1__0, G0__op2__0);\nG0__dst__1 = select(G0__op0__1, (float3)G0__op1__1, G0__op2__1);\n" }); // 2nd operand x-dimension broadcast. + + _tests.push_back({ 2, 3, 1, 3, 2, 1, 1, 1, DataType::Fp16, TernaryOp::Clamp, "G0__dst__0 = clamp(G0__op0, (half3)G0__op1__0, (half3)G0__op2);\nG0__dst__1 = clamp(G0__op0, (half3)G0__op1__1, (half3)G0__op2);\n" }); // 1st operand y-, 2nd operand x-, 3rd operand x- and y-dimension broadcast. 
+ } + + bool run() override + { + int32_t test_no = 0; + bool all_tests_passed = true; + + for(const auto &test : _tests) + { + KernelWriterInterceptor writer; + + auto dst = writer.declare_tile("dst", TileInfo(test.data_type, test.dst_height, test.dst_width)); + auto op0 = writer.declare_tile("op0", TileInfo(DataType::Bool, test.op0_height, test.op0_width)); + auto op1 = writer.declare_tile("op1", TileInfo(test.data_type, test.op1_height, test.op1_width)); + auto op2 = writer.declare_tile("op2", TileInfo(test.data_type, test.op2_height, test.op2_width)); + + writer.start_capture_code(); + + writer.op_ternary(dst, test.op, op0, op1, op2); + + VALIDATE_TEST(writer.check_added_code(test.expected_code), all_tests_passed, test_no++); + } + + return all_tests_passed; + } + + std::string name() override + { + return "CLKernelWriterTernaryOpTest"; + } + +private: + struct TestInfo + { + int32_t dst_height; + int32_t dst_width; + int32_t op0_height; + int32_t op0_width; + int32_t op1_height; + int32_t op1_width; + int32_t op2_height; + int32_t op2_width; + DataType data_type; + TernaryOp op; + std::string expected_code; + }; + + std::vector _tests{}; +}; + +} // namespace ckw + +#endif // CKW_VALIDATION_TESTS_CLKERNELWRITERTERNARYOPTEST_H diff --git a/compute_kernel_writer/validation/tests/CLKernelWriterUnaryExpressionTest.h b/compute_kernel_writer/validation/tests/CLKernelWriterUnaryExpressionTest.h new file mode 100644 index 0000000000000000000000000000000000000000..395a2fe817c2f6452496c3d8ca2f91bcc984ae3e --- /dev/null +++ b/compute_kernel_writer/validation/tests/CLKernelWriterUnaryExpressionTest.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_VALIDATION_TESTS_CLKERNELWRITERUNARYEXPRESSIONTEST_H +#define CKW_VALIDATION_TESTS_CLKERNELWRITERUNARYEXPRESSIONTEST_H + +#include "ckw/TileInfo.h" +#include "ckw/types/DataType.h" +#include "ckw/types/Operators.h" +#include "src/cl/CLKernelWriter.h" +#include "validation/tests/common/Common.h" +#include "validation/tests/common/KernelWriterInterceptor.h" + +#include +#include + +namespace ckw +{ + +class CLKernelWriterUnaryExpressionTest : public ITest +{ +public: + CLKernelWriterUnaryExpressionTest() + { + // dst_height, dst_width, src_height, src_width, data_type, op, expected_code + + _tests.push_back({ 1, 1, 1, 1, DataType::Uint32, UnaryOp::BitwiseNot, "G0__dst = ~G0__src;\n" }); // Scalar. 
+ + _tests.push_back({ 1, 3, 1, 3, DataType::Int16, UnaryOp::LogicalNot, "G0__dst = !G0__src;\n" }); // Whole vector. + + _tests.push_back({ 2, 4, 2, 4, DataType::Int8, UnaryOp::Exp, "G0__dst__0 = exp(G0__src__0);\nG0__dst__1 = exp(G0__src__1);\n" }); // Whole tile. + + _tests.push_back({ 2, 3, 1, 3, DataType::Uint8, UnaryOp::Log, "G0__dst__0 = log(G0__src);\nG0__dst__1 = log(G0__src);\n" }); // Y-dimension broadcast. + + _tests.push_back({ 2, 4, 2, 1, DataType::Uint16, UnaryOp::Sqrt, "G0__dst__0 = (ushort4)sqrt(G0__src__0);\nG0__dst__1 = (ushort4)sqrt(G0__src__1);\n" }); // X-dimension broadcast. + + _tests.push_back({ 2, 3, 1, 1, DataType::Int32, UnaryOp::Round, "G0__dst__0 = (int3)round(G0__src);\nG0__dst__1 = (int3)round(G0__src);\n" }); // X and y dimension broadcast. + } + + bool run() override + { + int32_t test_no = 0; + bool all_tests_passed = true; + + for(const auto &test : _tests) + { + KernelWriterInterceptor writer; + + auto dst = writer.declare_tile("dst", TileInfo(test.data_type, test.dst_height, test.dst_width)); + auto src = writer.declare_tile("src", TileInfo(test.data_type, test.src_height, test.src_width)); + + writer.start_capture_code(); + + writer.op_unary(dst, test.op, src); + + VALIDATE_TEST(writer.check_added_code(test.expected_code), all_tests_passed, test_no++); + } + + return all_tests_passed; + } + + std::string name() override + { + return "CLKernelWriterUnaryExpressionTest"; + } + +private: + struct TestInfo + { + int32_t dst_height; + int32_t dst_width; + int32_t src_height; + int32_t src_width; + DataType data_type; + UnaryOp op; + std::string expected_code; + }; + + std::vector _tests{}; +}; + +} // namespace ckw + +#endif // CKW_VALIDATION_TESTS_CLKERNELWRITERUNARYEXPRESSIONTEST_H diff --git a/compute_kernel_writer/validation/tests/CLTileTest.hpp b/compute_kernel_writer/validation/tests/CLTileTest.hpp index ecfe811267a069eec73563cf2ea135f6afe316e9..a95a11ace7fb541dabf902251463c0a88a7956d1 100644 --- a/compute_kernel_writer/validation/tests/CLTileTest.hpp +++ b/compute_kernel_writer/validation/tests/CLTileTest.hpp @@ -22,8 +22,8 @@ * SOFTWARE. 
*/ -#ifndef COMPUTE_KERNEL_WRITER_TESTS_CLTILETEST_HPP -#define COMPUTE_KERNEL_WRITER_TESTS_CLTILETEST_HPP +#ifndef CKW_VALIDATION_TESTS_CLTILETEST_HPP +#define CKW_VALIDATION_TESTS_CLTILETEST_HPP #include "common/Common.h" #include "src/Helpers.h" @@ -63,7 +63,7 @@ public: for(int32_t y = 0; y < height; ++y) { - const std::string expected_var_name = tile_name + "_" + std::to_string(y); + const std::string expected_var_name = tile_name + "__" + std::to_string(y); const std::string actual_var_name = vars[y].str; VALIDATE_TEST(actual_var_name.compare(expected_var_name) == 0, all_tests_passed, test_idx++); } @@ -172,7 +172,7 @@ public: const std::string actual_var_name = var.str; std::string expected_var_name = tile_name; - expected_var_name += "_" + std::to_string(y_coord); + expected_var_name += "__" + std::to_string(y_coord); expected_var_name += ".s" + dec_to_hex_as_string(x_coord); VALIDATE_TEST(actual_var_name.compare(expected_var_name) == 0, all_tests_passed, test_idx++); @@ -238,7 +238,7 @@ public: const std::string actual_var_name = var.str; std::string expected_var_name = tile_name; - expected_var_name += "_" + std::to_string(y_coord); + expected_var_name += "__" + std::to_string(y_coord); if(width != 1) { expected_var_name += ".s" + dec_to_hex_as_string(x_coord_clamped); @@ -310,7 +310,7 @@ public: std::string expected_var_name = tile_name; if(height != 1) { - expected_var_name += "_" + std::to_string(y_coord_clamped); + expected_var_name += "__" + std::to_string(y_coord_clamped); } if(width != 1) @@ -367,7 +367,7 @@ public: std::string expected_var_name = tile_name; if(height != 1) { - expected_var_name += "_" + std::to_string(row); + expected_var_name += "__" + std::to_string(row); } VALIDATE_TEST(actual_var_name.compare(expected_var_name) == 0, all_tests_passed, test_idx++); @@ -423,7 +423,7 @@ public: std::string expected_var_name = tile_name; if(height != 1) { - expected_var_name += "_" + std::to_string(row); + expected_var_name += "__" + std::to_string(row); } if(width != 1) @@ -464,4 +464,4 @@ private: }; } // namespace ckw -#endif /* COMPUTE_KERNEL_WRITER_TESTS_CLTILETEST_HPP */ +#endif // CKW_VALIDATION_TESTS_CLTILETEST_HPP diff --git a/docs/Doxyfile b/docs/Doxyfile index 1654d16fc410940d7381ea99305a7ee3e954967c..0b2f32ad1a158e5a16138f559d94888d6615202a 100644 --- a/docs/Doxyfile +++ b/docs/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "Compute Library" # could be handy for archiving the generated documentation or if some version # control system is used. 
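The CLTileTest.hpp change above moves the generated tile-variable separator from "_" to "__", and the new CLKernelWriter tests rely on the same convention: a declared tile "t" becomes "G0__t", row r becomes "G0__t__r", and a single element additionally gets a ".s<hex column>" selector. A minimal sketch of that naming, using only the helpers that appear in these tests (KernelWriterInterceptor instantiated for CLKernelWriter, declare_tile, op_assign); the 4x4 FP32 shape is an arbitrary illustration:

@code{.cpp}
#include "ckw/TileInfo.h"
#include "ckw/types/DataType.h"
#include "src/cl/CLKernelWriter.h"
#include "validation/tests/common/KernelWriterInterceptor.h"

using namespace ckw;

void tile_naming_sketch()
{
    KernelWriterInterceptor<CLKernelWriter> writer;

    // A 4x4 FP32 tile named "t" is emitted as four row variables: G0__t__0 .. G0__t__3.
    const auto t = writer.declare_tile("t", TileInfo(DataType::Fp32, 4, 4));

    writer.start_capture_code();

    // Row views reference the per-row variables directly: G0__t__0 = G0__t__1;
    writer.op_assign(t.row(0), t.row(1));

    // Scalar views append the column as a hex ".s" selector: G0__t__2.s3 = G0__t__3.s3;
    writer.op_assign(t.scalar(2, 3), t.scalar(3, 3));
}
@endcode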
-PROJECT_NUMBER = 23.08 +PROJECT_NUMBER = 23.11 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/docs/DoxygenLayout.xml b/docs/DoxygenLayout.xml new file mode 100644 index 0000000000000000000000000000000000000000..4e09e20e3d92eef14d986bd395fdae1c68c88110 --- /dev/null +++ b/docs/DoxygenLayout.xml @@ -0,0 +1,211 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/contributor_guide/contribution_guidelines.dox b/docs/contributor_guide/contribution_guidelines.dox index 30ee41af0fe1754fd746c3d4296ee6a71ffc62c0..cbaa50263531c5dd65da44ebb324e33872dee17b 100644 --- a/docs/contributor_guide/contribution_guidelines.dox +++ b/docs/contributor_guide/contribution_guidelines.dox @@ -335,6 +335,26 @@ void foo(const Goo &g); // OK astyle (http://astyle.sourceforge.net/) and clang-format (https://clang.llvm.org/docs/ClangFormat.html) can check and help you apply some of these rules. +We have also provided the python scripts we use in our precommit pipeline inside scripts directory. + - format_code.py: checks Android.bp, bad style, end of file, formats doxygen, runs astyle and clang-format (assuming necessary binaries are in the path). Example invocations: +@code{.sh} + python format_code.py + python format_code.py --error-on-diff + python format_code.py --files=git-diff (Default behavior in pre-commit configuration, where it checks the staged files) +@endcode + - generate_build_files.py: generates build files required for CMake and Bazel builds. Example invocations: +@code{.sh} + python generate_build_files.py --cmake + python generate_build_files.py --bazel +@endcode + +Another way of running the checks is using `pre-commit` (https://pre-commit.com/) framework, which has also nice features like checking trailing spaces, and large committed files etc. +`pre-commit` can be installed via `pip`. After installing, run the following command in the root directory of the repository: + + pre-commit install + +This will create the hooks that perform the formatting checks mentioned above and will automatically run just before committing to flag issues. + @subsection S5_1_3_library_size_guidelines Library size: best practices and guidelines @subsubsection S5_1_3_1_template_suggestions Template suggestions diff --git a/docs/user_guide/errata.dox b/docs/user_guide/errata.dox index 525ad3e396a61252e70c99438c6e19744a6a9f68..056e45a432ee08692d5680c6923254b594bcea1d 100644 --- a/docs/user_guide/errata.dox +++ b/docs/user_guide/errata.dox @@ -30,6 +30,17 @@ namespace arm_compute @section S7_1_errata Errata +- (COMPMID-6493) Crash when running Arm Compute Library compiled for SVE2 on a computer that support SVE only. + - Versions: >= v21.02 && <=v23.08 + - OSs: Linux, Android. + - Conditions: + - Compile the latest Arm Compute Library for SVE2 (arch=armv8.6-a-sve2). + - multi_isa = 0 + - Device with SVE but without SVE2 support. + - Result: + - Crash due to illegal instruction. + - To run SVE only, build with arch="armv8.2-a-sve", arch="armv8.6-a-sve", or with multi_isa=1. 
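The SVE2 errata above is resolved at build time (multi_isa=1 or an SVE-only arch); where rebuilding is not an option, an application can at least fail gracefully instead of hitting the illegal instruction. A rough sketch of such a guard, assuming arm_compute::CPUInfo exposes has_sve2() and is reachable through the scheduler (both are assumptions about the runtime API and may differ between releases):

@code{.cpp}
#include "arm_compute/core/CPP/CPPTypes.h"
#include "arm_compute/runtime/Scheduler.h"

#include <cstdlib>
#include <iostream>

int main()
{
    // Assumption: the scheduler's CPUInfo reports SVE2 availability on this device.
    const arm_compute::CPUInfo &ci = arm_compute::Scheduler::get().cpu_info();

    if (!ci.has_sve2())
    {
        std::cerr << "Built for arch=armv8.6-a-sve2 but this CPU only supports SVE.\n"
                     "Rebuild with multi_isa=1 or an SVE-only arch to avoid the crash described above.\n";
        return EXIT_FAILURE;
    }

    // ... safe to construct and run library functions from here on ...
    return EXIT_SUCCESS;
}
@endcode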
+ - (COMPMID-6404) Under certain conditions, CLTile may produce incorrect result. - Versions: >= v19.02 && < v23.08 - OSs: Linux, Android. diff --git a/docs/user_guide/how_to_build_and_run_examples.dox b/docs/user_guide/how_to_build_and_run_examples.dox index 75b0a5df54cb2ab3d18a0a66b19e729f9c93679f..4da26d31bcd1b4943eb83286525d678d889f956e 100644 --- a/docs/user_guide/how_to_build_and_run_examples.dox +++ b/docs/user_guide/how_to_build_and_run_examples.dox @@ -178,6 +178,35 @@ An example build command with SME2 is: scons arch=armv8.6-a-sve2-sme2 os=linux build_dir=arm64 -j55 standalone=0 opencl=0 openmp=0 validation_tests=1 neon=1 cppthreads=1 toolchain_prefix=aarch64-none-linux-gnu- +@subsection S1_2_5_clang_build_linux Building with LLVM+Clang Natively on Linux + +The library can be built with LLVM+Clang by specifying CC and CXX environment variables appropriately as below. The **minimum** supported clang version is 11, as LLVM 11 introduces SVE/SVE2 VLA intrinsics: https://developer.arm.com/Tools%20and%20Software/LLVM%20Toolchain#Supported-Devices. + + CC=clang CXX=clang++ + +Or, if the environment has multiple clang versions: + + CC=clang-16 CXX=clang++-16 + +Examples for different build tools look like below. + +(experimental) CMake: + + mkdir build + cd build + CC=clang CXX=clang++ cmake .. -DCMAKE_BUILD_TYPE=Release -DARM_COMPUTE_OPENMP=1 -DARM_COMPUTE_WERROR=0 -DARM_COMPUTE_BUILD_EXAMPLES=1 -DARM_COMPUTE_BUILD_TESTING=1 -DCMAKE_INSTALL_LIBDIR=. + CC=clang CXX=clang++ cmake --build . -j32 + +(experimental) Bazel: + + CC=clang CXX=clang++ bazel build //... + +Scons: + + CC=clang CXX=clang++ scons -j32 Werror=1 debug=0 neon=1 openmp=1 cppthreads=1 os=linux arch=armv8a multi_isa=1 build=native validation_tests=1 + +Configurations supported are limited to the configurations supported by our CMake, Bazel and Multi ISA Scons builds. For more details on CMake and Bazel builds, please see @ref S1_8_experimental_builds + @section S1_3_android Building for Android For Android, the library was successfully built and tested using Google's standalone toolchains: diff --git a/docs/user_guide/operator_list.dox b/docs/user_guide/operator_list.dox index 66b8988d29528c669acdb00cd9fd2520eecf8b7d..25c856da10054a3347654055c580ac0f93bf1228 100644 --- a/docs/user_guide/operator_list.dox +++ b/docs/user_guide/operator_list.dox @@ -1,5 +1,5 @@ /// -/// Copyright (c) 2021-2022 Arm Limited. +/// Copyright (c) 2021-2023 Arm Limited. /// /// SPDX-License-Identifier: MIT /// @@ -108,6 +108,26 @@ where N = batches, C = channels, H = height, W = width, D = depth F16F16 F32F32 + + AddMulAdd + Performs a fused Add + Mul + Add [+ Relu-based-Activation] operation. + +
+      (Equivalent Android NNAPI Op: n/a)
+      NEAddMulAdd
+      (Supported data layouts: Any)
+      input1          input2          bn_mul          bn_add          add_output      final_output
+      QASYMM8         QASYMM8         QASYMM8         QASYMM8         QASYMM8         QASYMM8
+      QASYMM8_SIGNED  QASYMM8_SIGNED  QASYMM8_SIGNED  QASYMM8_SIGNED  QASYMM8_SIGNED  QASYMM8_SIGNED
+      F16             F16             F16             F16             F16             F16
+      F32             F32             F32             F32             F32             F32
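A rough usage sketch for the new NEAddMulAdd entry on F32 data. The tensor shapes and the trailing ConvertPolicy/ActivationLayerInfo arguments of configure() are assumptions (bn_mul/bn_add are treated here as per-feature vectors), so take this as an illustration of the six-tensor interface rather than the exact signature:

@code{.cpp}
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEAddMulAdd.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void add_mul_add_sketch()
{
    // All six tensors use the same data type, as required by the table above.
    Tensor input1, input2, bn_mul, bn_add, add_output, final_output;

    const TensorInfo io_info(TensorShape(16U, 8U), 1, DataType::F32); // 16 features, 8 rows (illustrative)
    const TensorInfo bn_info(TensorShape(16U), 1, DataType::F32);     // assumed per-feature vectors

    input1.allocator()->init(io_info);
    input2.allocator()->init(io_info);
    bn_mul.allocator()->init(bn_info);
    bn_add.allocator()->init(bn_info);
    add_output.allocator()->init(io_info);
    final_output.allocator()->init(io_info);

    NEAddMulAdd fused;
    // Assumed trailing arguments: saturating conversion, no fused activation.
    fused.configure(&input1, &input2, &bn_mul, &bn_add, &add_output, &final_output,
                    ConvertPolicy::SATURATE, ActivationLayerInfo());

    for (Tensor *t : {&input1, &input2, &bn_mul, &bn_add, &add_output, &final_output})
    {
        t->allocator()->allocate();
    }

    fused.run();
}
@endcode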
ArgMinMaxLayer Function to calculate the index of the minimum or maximum values in a tensor based on an axis. @@ -126,7 +146,7 @@ where N = batches, C = channels, H = height, W = width, D = depth srcdst QASYMM8U32, S32 QASYMM8_SIGNEDU32, S32 - S32U32, S32 + S32U32, S32, S64 F16U32, S32 F32U32, S32 @@ -2054,6 +2074,40 @@ where N = batches, C = channels, H = height, W = width, D = depth src0 - src8src9 - src12src13src14dst0dst1 QASYMM8S32QSYMM16QASYMM8QSYMM16QASYMM8 + + MatMul + Computes a matrix multiplication in batches. + +
+      (Equivalent Android NNAPI Op: ANEURALNETWORKS_BATCH_MATMUL)
+      NEMatMul
+      (Supported data layouts: Any)
+      lhs             rhs             dst
+      F32             F32             F32
+      F16             F16             F16
+      QASYMM8_SIGNED  QASYMM8_SIGNED  QASYMM8_SIGNED
+      QASYMM8         QASYMM8         QASYMM8
+      CLMatMul
+      (Supported data layouts: All)
+      lhs             rhs             dst
+      F32             F32             F32
+      F16             F16             F16
+      QASYMM8_SIGNED  QASYMM8_SIGNED  QASYMM8_SIGNED
+      QASYMM8         QASYMM8         QASYMM8
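For the MatMul entry, a rough NEMatMul sketch multiplying batched F32 matrices. The MatMulInfo, CpuMatMulSettings and ActivationLayerInfo arguments of configure() are written from the 23.x API as an assumption and may need adjusting:

@code{.cpp}
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEMatMul.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void matmul_sketch()
{
    // lhs: M x K, rhs: K x N, dst: M x N, batch of 2.
    // Shapes are (width, height, batches), so the K dimension comes first for lhs.
    Tensor lhs, rhs, dst;
    lhs.allocator()->init(TensorInfo(TensorShape(5U, 7U, 2U), 1, DataType::F32)); // K = 5, M = 7
    rhs.allocator()->init(TensorInfo(TensorShape(3U, 5U, 2U), 1, DataType::F32)); // N = 3, K = 5
    dst.allocator()->init(TensorInfo(TensorShape(3U, 7U, 2U), 1, DataType::F32)); // N = 3, M = 7

    NEMatMul matmul;
    // Assumed arguments: no transposition of either operand, default CPU settings, no activation.
    matmul.configure(&lhs, &rhs, &dst, MatMulInfo(), CpuMatMulSettings(), ActivationLayerInfo());

    lhs.allocator()->allocate();
    rhs.allocator()->allocate();
    dst.allocator()->allocate();

    matmul.run();
}
@endcode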
MaxUnpoolingLayer Function to perform MaxUnpooling. @@ -2154,6 +2208,27 @@ where N = batches, C = channels, H = height, W = width, D = depth F32F32 F16F16 + + NormalizePlanarYUVLayer + Function to compute normalization planar YUV layer. + +
+      (Equivalent Android NNAPI Op: n/a)
+      CLNormalizePlanarYUVLayer
+      (Supported data layouts: NHWC, NCHW)
+      src             dst
+      F32             F32
+      F16             F16
+      QASYMM8         QASYMM8
+      QASYMM8_SIGNED  QASYMM8_SIGNED
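NormalizePlanarYUVLayer only has a CL backend; a rough CLNormalizePlanarYUVLayer sketch on an NHWC F32 tensor, assuming the usual configure(input, output, mean, std) form with per-channel mean/std vectors (the 3x224x224 shape is illustrative):

@code{.cpp}
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h"

using namespace arm_compute;

void normalize_planar_yuv_sketch()
{
    CLScheduler::get().default_init();

    // NHWC layout: shape is (C, W, H, N); mean and std carry one value per channel.
    TensorInfo nhwc_info(TensorShape(3U, 224U, 224U, 1U), 1, DataType::F32);
    nhwc_info.set_data_layout(DataLayout::NHWC);

    CLTensor src, dst, mean, std_dev;
    src.allocator()->init(nhwc_info);
    dst.allocator()->init(nhwc_info);
    mean.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::F32));
    std_dev.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::F32));

    CLNormalizePlanarYUVLayer norm;
    norm.configure(&src, &dst, &mean, &std_dev);

    for (CLTensor *t : {&src, &dst, &mean, &std_dev})
    {
        t->allocator()->allocate();
    }

    norm.run();

    // Make sure all the OpenCL jobs are done executing, as in the examples above.
    CLScheduler::get().sync();
}
@endcode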
PadLayer Function to pad a tensor. @@ -2581,6 +2656,23 @@ where N = batches, C = channels, H = height, W = width, D = depth F32F32 S32S32 + + ReorderLayer + Reorders a tensor to a different weights format. + +
+      (Equivalent Android NNAPI Op: n/a)
+      NEReorderLayer
+      (Supported data layouts: NCHW)
+      src    dst
+      F32    F32
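ReorderLayer rewrites a weights tensor into a blocked fixed weight format and only has a NEON backend. A rough sketch, assuming configure() takes the source and destination arm_compute::WeightFormat values (the OHWI/OHWIo4 names come from the fixed-format GEMM support and, like the destination shape handling, are assumptions):

@code{.cpp}
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEReorderLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void reorder_sketch()
{
    // F32 weights in NCHW, as listed in the table above.
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U, 32U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U, 32U), 1, DataType::F32)); // may need padding to the block size

    NEReorderLayer reorder;
    // Assumed signature: reorder plain OHWI weights into a 4-way interleaved blocked format.
    reorder.configure(&src, &dst, WeightFormat::OHWI, WeightFormat::OHWIo4);

    src.allocator()->allocate();
    dst.allocator()->allocate();

    reorder.run();
}
@endcode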
ReorgLayer Performs a reorganization layer of input tensor to the output tensor. @@ -2655,7 +2747,7 @@ where N = batches, C = channels, H = height, W = width, D = depth
      src0   src1       dst
-     All    U32        All
+     All    U32, S32   All
      CLReverse
 @@ -2666,7 +2758,7 @@ where N = batches, C = channels, H = height, W = width, D = depth
      src0   src1       dst
-     All    U32        All
+     All    U32, S32   All
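The Reverse change above allows the axis tensor to be S32 as well as U32, which is what enables the negative and inverted axis values mentioned in this release's changelog. A rough NEReverse sketch with an S32 axis; the trailing use_inverted_axis flag of configure() is an assumption and may not be the exact parameter:

@code{.cpp}
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEReverse.h"
#include "arm_compute/runtime/Tensor.h"

#include <cstdint>

using namespace arm_compute;

void reverse_sketch()
{
    Tensor src, dst, axis;
    src.allocator()->init(TensorInfo(TensorShape(8U, 4U, 2U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(8U, 4U, 2U), 1, DataType::F32));

    // S32 axis tensor: per the changelog, negative values can now address dimensions from the end.
    axis.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::S32));

    NEReverse reverse;
    reverse.configure(&src, &dst, &axis, /* use_inverted_axis (assumed) */ false);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    axis.allocator()->allocate();

    // Fill the axis tensor before running.
    *reinterpret_cast<int32_t *>(axis.buffer()) = -1;

    reverse.run();
}
@endcode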
RNNLayer diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox index 348fc1778c599ccd2620ec57a0b13eab3c8a782b..11731e5a339dda43bcffc22e428955424dd3e226 100644 --- a/docs/user_guide/release_version_and_change_log.dox +++ b/docs/user_guide/release_version_and_change_log.dox @@ -41,6 +41,33 @@ If there is more than one release in a month then an extra sequential number is @section S2_2_changelog Changelog +v23.11 Public major release + - New features + - Add support for input data type U64/S64 in CLCast and NECast. + - Add support for output data type S64 in NEArgMinMaxLayer and CLArgMinMaxLayer + - Port the following kernels in the experimental Dynamic Fusion interface to use the new Compute Kernel Writer interface: + - @ref experimental::dynamic_fusion::GpuCkwResize + - @ref experimental::dynamic_fusion::GpuCkwPool2d + - @ref experimental::dynamic_fusion::GpuCkwDepthwiseConv2d + - @ref experimental::dynamic_fusion::GpuCkwMatMul + - Add support for OpenCL™ comand buffer with mutable dispatch extension. + - Add support for Arm® Cortex®-A520 and Arm® Cortex®-R82. + - Add support for negative axis values and inverted axis values in @ref arm_compute::NEReverse and @ref arm_compute::CLReverse. + - Add new OpenCL™ kernels: + - @ref opencl::kernels::ClMatMulLowpNativeMMULKernel support for QASYMM8 and QASYMM8_SIGNED, with batch support + - Performance optimizations: + - Optimize @ref cpu::CpuReshape + - Optimize @ref opencl::ClTranspose + - Optimize @ref NEStackLayer + - Optimize @ref CLReductionOperation. + - Optimize @ref CLSoftmaxLayer. + - Optimize start-up time of @ref NEConvolutionLayer for some input configurations where GeMM is selected as the convolution algorithm + - Reduce CPU Overhead by optimal flushing of CL kernels. + - Deprecate support for Bfloat16 in @ref cpu::CpuCast. + - Support for U32 axis in @ref arm_compute::NEReverse and @ref arm_compute::CLReverse will be deprecated in 24.02. + - Remove legacy PostOps interface. PostOps was the experimental interface for kernel fusion and is replaced by the new Dynamic Fusion interface. + - Update OpenCL™ API headers to v2023.04.17 + v23.08 Public major release - Deprecate the legacy 'libarm_compute_core' library. This library is an artifact of Compute Library's legacy library architecture and no longer serves any purpose. Users must no longer link their applications to this library and instead link only to the main `libarm_compute` library for core functionality. diff --git a/examples/SConscript b/examples/SConscript index fd6b59189195d7c90de5214cf99c5f5ed7787861..bfac9deb2b387e87fdd2b1686ea8b9325221557f 100644 --- a/examples/SConscript +++ b/examples/SConscript @@ -1,7 +1,7 @@ #!/usr/bin/python # -*- coding: utf-8 -*- -# Copyright (c) 2017-2022 Arm Limited. +# Copyright (c) 2017-2023 Arm Limited. 
# # SPDX-License-Identifier: MIT # @@ -59,7 +59,7 @@ load_whole_archive = '-Wl,--whole-archive' noload_whole_archive = '-Wl,--no-whole-archive' if 'macos' in examples_env['os']: load_whole_archive = '-Wl,-force_load' - noload_whole_archive = '-Wl,-noall_load' + noload_whole_archive = '' # Build graph examples graph_utils = examples_env.Object("../utils/GraphUtils.cpp") diff --git a/examples/cl_cache.cpp b/examples/cl_cache.cpp index 6de62f7c5d8c131fc71f9bbe6bcc627f3bf52a29..9da5b9176da781837f27d213e62f04253efffe72 100644 --- a/examples/cl_cache.cpp +++ b/examples/cl_cache.cpp @@ -25,8 +25,9 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/CL/Utils.h" #include "arm_compute/runtime/CL/functions/CLPermute.h" +#include "arm_compute/runtime/CL/Utils.h" + #include "utils/Utils.h" using namespace arm_compute; @@ -43,14 +44,15 @@ public: bool do_setup(int argc, char **argv) override { - std::cout << "Once the program has run and created the file cache.bin, rerun with --restore_cache." << std::endl; + std::cout << "Once the program has run and created the file cache.bin, rerun with --restore_cache." + << std::endl; CLScheduler::get().default_init(); - if(argc > 1) + if (argc > 1) { std::string argv1 = argv[1]; std::transform(argv1.begin(), argv1.end(), argv1.begin(), ::tolower); - if(argv1 == "--restore_cache") + if (argv1 == "--restore_cache") { // Load the precompiled kernels from a file into the kernel library, in this way the next time they are needed // compilation won't be required. @@ -110,11 +112,13 @@ private: window.use_tensor_dimensions(reference.info()->tensor_shape()); Iterator it_ref(&reference, window); Iterator it_res(&result, window); - execute_window_loop(window, [&](const Coordinates &) - { - assert(*reinterpret_cast(it_ref.ptr()) == *reinterpret_cast(it_res.ptr())); - }, - it_ref, it_res); + execute_window_loop( + window, + [&](const Coordinates &) { + assert(*reinterpret_cast(it_ref.ptr()) == + *reinterpret_cast(it_res.ptr())); + }, + it_ref, it_res); reference.unmap(); result.unmap(); } @@ -126,11 +130,9 @@ private: window.use_tensor_dimensions(tensor.info()->tensor_shape()); Iterator it_tensor(&tensor, window); unsigned char val(0); - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast(it_tensor.ptr()) = val++; - }, - it_tensor); + execute_window_loop( + window, [&](const Coordinates &) { *reinterpret_cast(it_tensor.ptr()) = val++; }, + it_tensor); tensor.unmap(); } void init_tensor(const TensorShape shape, CLTensor &tensor, DataType type, DataLayout layout) diff --git a/examples/cl_sgemm.cpp b/examples/cl_sgemm.cpp index 27af22895472886097c8c57d5c04388f2948804c..68955c52f7fc7a329536ea413ce4a44700a7f58a 100644 --- a/examples/cl_sgemm.cpp +++ b/examples/cl_sgemm.cpp @@ -29,6 +29,7 @@ #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTuner.h" #include "arm_compute/runtime/CL/functions/CLGEMM.h" + #include "utils/Utils.h" #include @@ -50,15 +51,16 @@ public: CLScheduler::get().default_init(&tuner); std::ifstream stream; - if(argc > 1) + if (argc > 1) { stream.open(argv[1], std::fstream::in); } - if(argc < 3 || (argc < 4 && stream.bad())) + if (argc < 3 || (argc < 4 && stream.bad())) { // Print help - std::cout << "Usage: 1) ./build/cl_sgemm input_matrix_1.npy input_matrix_2.npy [input_matrix_3.npy] [alpha = 1] [beta = 0]\n"; + std::cout << "Usage: 1) ./build/cl_sgemm input_matrix_1.npy input_matrix_2.npy 
[input_matrix_3.npy] [alpha " + "= 1] [beta = 0]\n"; std::cout << " 2) ./build/cl_sgemm M N K [alpha = 1.0f] [beta = 0.0f]\n\n"; std::cout << "Too few or no input_matrices provided. Using M=7, N=3, K=5, alpha=1.0f and beta=0.0f\n\n"; @@ -68,29 +70,29 @@ public: } else { - if(stream.good()) /* case file1.npy file2.npy [file3.npy] [alpha = 1.0f] [beta = 0.0f] */ + if (stream.good()) /* case file1.npy file2.npy [file3.npy] [alpha = 1.0f] [beta = 0.0f] */ { npy0.open(argv[1]); npy0.init_tensor(src0, DataType::F32); npy1.open(argv[2]); npy1.init_tensor(src1, DataType::F32); - if(argc > 3) + if (argc > 3) { stream.close(); stream.clear(); stream.open(argv[3], std::fstream::in); - if(stream.good()) /* case with third file */ + if (stream.good()) /* case with third file */ { npy2.open(argv[3]); npy2.init_tensor(src2, DataType::F32); - if(argc > 4) + if (argc > 4) { // Convert string to float alpha = strtof(argv[4], nullptr); - if(argc > 5) + if (argc > 5) { // Convert string to float beta = strtof(argv[5], nullptr); @@ -101,7 +103,7 @@ public: { alpha = strtof(argv[3], nullptr); - if(argc > 4) + if (argc > 4) { beta = strtof(argv[4], nullptr); } @@ -118,11 +120,11 @@ public: src1.allocator()->init(TensorInfo(TensorShape(N, K), 1, DataType::F32)); src2.allocator()->init(TensorInfo(TensorShape(N, M), 1, DataType::F32)); - if(argc > 4) + if (argc > 4) { alpha = strtof(argv[4], nullptr); - if(argc > 5) + if (argc > 5) { beta = strtof(argv[5], nullptr); } @@ -141,7 +143,7 @@ public: dst.allocator()->allocate(); // Fill the input images with either the data provided or random data - if(npy0.is_open()) + if (npy0.is_open()) { npy0.fill_tensor(src0); npy1.fill_tensor(src1); @@ -149,7 +151,7 @@ public: output_filename = "sgemm_out.npy"; is_fortran = npy0.is_fortran(); - if(npy2.is_open()) + if (npy2.is_open()) { src2.allocator()->allocate(); npy2.fill_tensor(src2); @@ -179,7 +181,7 @@ public: } void do_teardown() override { - if(!output_filename.empty()) /* Save to .npy file */ + if (!output_filename.empty()) /* Save to .npy file */ { save_to_npy(dst, output_filename, is_fortran); } diff --git a/examples/gemm_tuner/CommonGemmExampleOptions.cpp b/examples/gemm_tuner/CommonGemmExampleOptions.cpp index bee202b99e6f3685d22de056b92a5b3f8e938a63..c2a465604a79e2e867d88084862e9ccb21d4be1b 100644 --- a/examples/gemm_tuner/CommonGemmExampleOptions.cpp +++ b/examples/gemm_tuner/CommonGemmExampleOptions.cpp @@ -39,7 +39,8 @@ using namespace utils; return os; } -CommonGemmExampleOptions::CommonGemmExampleOptions(arm_compute::utils::CommandLineParser &parser, arm_compute::DataType default_data_type) +CommonGemmExampleOptions::CommonGemmExampleOptions(arm_compute::utils::CommandLineParser &parser, + arm_compute::DataType default_data_type) : help(parser.add_option("help")), M(parser.add_positional_option>("M", 100)), N(parser.add_positional_option>("N", 100)), @@ -48,21 +49,16 @@ CommonGemmExampleOptions::CommonGemmExampleOptions(arm_compute::utils::CommandLi data_type(), tuner_mode() { - const std::set supported_data_types - { + const std::set supported_data_types{ DataType::F16, DataType::F32, DataType::QASYMM8, }; - const std::set supported_tuner_modes - { - CLTunerMode::EXHAUSTIVE, - CLTunerMode::NORMAL, - CLTunerMode::RAPID - }; + const std::set supported_tuner_modes{CLTunerMode::EXHAUSTIVE, CLTunerMode::NORMAL, CLTunerMode::RAPID}; - ARM_COMPUTE_ERROR_ON_MSG(supported_data_types.find(default_data_type) == supported_data_types.end(), "Default data type unsupported"); + 
ARM_COMPUTE_ERROR_ON_MSG(supported_data_types.find(default_data_type) == supported_data_types.end(), + "Default data type unsupported"); data_type = parser.add_option>("type", supported_data_types, default_data_type); tuner_mode = parser.add_option>("tuner-mode", supported_tuner_modes, CLTunerMode::RAPID); diff --git a/examples/gemm_tuner/CommonGemmExampleOptions.h b/examples/gemm_tuner/CommonGemmExampleOptions.h index f7447e3db39a63bc29d86b841420488f5ac02d4e..38178bcef8063412cdb455c5b3958261454b11be 100644 --- a/examples/gemm_tuner/CommonGemmExampleOptions.h +++ b/examples/gemm_tuner/CommonGemmExampleOptions.h @@ -27,21 +27,22 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/runtime/CL/CLTuner.h" -#include "utils/TypePrinter.h" + #include "utils/command_line/CommandLineOptions.h" #include "utils/command_line/CommandLineParser.h" +#include "utils/TypePrinter.h" namespace gemm_tuner { /** Structure holding all the common gemm example parameters */ struct CommonGemmExampleParams { - size_t M{ 100 }; /**< Number of lhs matrix rows */ - size_t N{ 100 }; /**< Number of rhs matrix columns */ - size_t K{ 50 }; /**< Number of lhs matrix columns/rhs matrix rows */ - size_t B{ 1 }; /**< Batch size */ - arm_compute::DataType data_type{ arm_compute::DataType::F32 }; /**< Data type */ - arm_compute::CLTunerMode tuner_mode{ arm_compute::CLTunerMode::RAPID }; /**< OpenCL tuner mode */ + size_t M{100}; /**< Number of lhs matrix rows */ + size_t N{100}; /**< Number of rhs matrix columns */ + size_t K{50}; /**< Number of lhs matrix columns/rhs matrix rows */ + size_t B{1}; /**< Batch size */ + arm_compute::DataType data_type{arm_compute::DataType::F32}; /**< Data type */ + arm_compute::CLTunerMode tuner_mode{arm_compute::CLTunerMode::RAPID}; /**< OpenCL tuner mode */ }; /** Formatted output of the CommonGemmExampleParams type @@ -70,7 +71,8 @@ public: * @param[in,out] parser A parser on which "parse()" hasn't been called yet. * @param[in] default_data_type Default data type if unspecified. 
*/ - CommonGemmExampleOptions(arm_compute::utils::CommandLineParser &parser, arm_compute::DataType default_data_type = arm_compute::DataType::F32); + CommonGemmExampleOptions(arm_compute::utils::CommandLineParser &parser, + arm_compute::DataType default_data_type = arm_compute::DataType::F32); /** Prevent instances of this class from being copied (As this class contains pointers) */ CommonGemmExampleOptions(const CommonGemmExampleOptions &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ @@ -82,11 +84,11 @@ public: /** Default destructor */ ~CommonGemmExampleOptions() = default; - arm_compute::utils::ToggleOption *help; /**< Show help option */ - arm_compute::utils::SimpleOption *M; /**< Number of lhs matrix rows option */ - arm_compute::utils::SimpleOption *N; /**< Number of rhs matrix columns option */ - arm_compute::utils::SimpleOption *K; /**< Number of lhs matrix columns/rhs matrix rows option */ - arm_compute::utils::SimpleOption *B; /**< Batch size option */ + arm_compute::utils::ToggleOption *help; /**< Show help option */ + arm_compute::utils::SimpleOption *M; /**< Number of lhs matrix rows option */ + arm_compute::utils::SimpleOption *N; /**< Number of rhs matrix columns option */ + arm_compute::utils::SimpleOption *K; /**< Number of lhs matrix columns/rhs matrix rows option */ + arm_compute::utils::SimpleOption *B; /**< Batch size option */ arm_compute::utils::EnumOption *data_type; /**< Data type */ arm_compute::utils::EnumOption *tuner_mode; /**< OpenCL tuner mode */ }; diff --git a/examples/gemm_tuner/GemmTunerHelpers.h b/examples/gemm_tuner/GemmTunerHelpers.h index ae5cfbb19e86e591345d3463be60e02009d8c64b..dbff9e2dfff1042576cd9722cdd4190449202aed 100644 --- a/examples/gemm_tuner/GemmTunerHelpers.h +++ b/examples/gemm_tuner/GemmTunerHelpers.h @@ -36,9 +36,9 @@ bool update_padding_for_cl_image(arm_compute::ITensorInfo *tensor) constexpr unsigned int num_floats_per_pixel = 4; const unsigned int stride_y_in_elements = tensor->strides_in_bytes()[1] / tensor->element_size(); - const unsigned int pixel_aligment = arm_compute::get_cl_image_pitch_alignment( - arm_compute::CLKernelLibrary::get().get_device()); - if(pixel_aligment == 0) + const unsigned int pixel_aligment = + arm_compute::get_cl_image_pitch_alignment(arm_compute::CLKernelLibrary::get().get_device()); + if (pixel_aligment == 0) { return false; } diff --git a/examples/gemm_tuner/cl_gemm_native.cpp b/examples/gemm_tuner/cl_gemm_native.cpp index dd038739217532e5141b2bcbd86fa74e6092a6f2..7daa0b07d3c000e567e3a03cc0f6d09397b93acd 100644 --- a/examples/gemm_tuner/cl_gemm_native.cpp +++ b/examples/gemm_tuner/cl_gemm_native.cpp @@ -25,19 +25,20 @@ #error "This example needs to be built with -DARM_COMPUTE_CL" #endif /* ARM_COMPUTE_CL */ -#include "CommonGemmExampleOptions.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTuner.h" + #include "src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h" #include "tests/CL/Helper.h" -#include "utils/Utils.h" #include "utils/command_line/CommandLineOptions.h" #include "utils/command_line/CommandLineParser.h" +#include "utils/Utils.h" +#include "CommonGemmExampleOptions.h" #include using namespace arm_compute; @@ -51,9 +52,9 @@ namespace /** Structure holding all tunable gemm configs specific to this example/strategy */ struct 
GemmConfigs { - size_t m0{ 4 }; /**< Number of rows processed by the matrix multiplication */ - size_t n0{ 4 }; /**< Number of columns processed by the matrix multiplication */ - size_t k0{ 4 }; /**< Number of partial accumulations performed by the matrix multiplication */ + size_t m0{4}; /**< Number of rows processed by the matrix multiplication */ + size_t n0{4}; /**< Number of columns processed by the matrix multiplication */ + size_t k0{4}; /**< Number of partial accumulations performed by the matrix multiplication */ }; /** Formatted output of the GemmConfigs type @@ -145,13 +146,13 @@ public: // Parse command line options parser.parse(argc, argv); - if(param_options.help->is_set() && param_options.help->value()) + if (param_options.help->is_set() && param_options.help->value()) { // Print help message parser.print_help(argv[0]); return false; } - if(!parser.validate()) + if (!parser.validate()) { // Invalid arguments. Use default parameters and configs std::cerr << "Invalid arguments." << std::endl; @@ -198,8 +199,9 @@ public: // Validate argments Status status{}; - status = gemm.validate(lhs.info(), rhs.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info); - if(!status) + status = gemm.validate(lhs.info(), rhs.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, + kernel_info); + if (!status) { // Unsupported arguments std::cerr << "Unsupported arguments." << std::endl; @@ -221,11 +223,7 @@ public: void do_run() override { // Execute the function - ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, - { ACL_SRC_1, &rhs }, - { ACL_SRC_2, &bias }, - { ACL_DST, &dst } - }); + ITensorPack gemm_pack({{ACL_SRC_0, &lhs}, {ACL_SRC_1, &rhs}, {ACL_SRC_2, &bias}, {ACL_DST, &dst}}); gemm.run(gemm_pack); // Make sure all the OpenCL jobs are done executing: diff --git a/examples/gemm_tuner/cl_gemm_reshaped.cpp b/examples/gemm_tuner/cl_gemm_reshaped.cpp index 59044477bf483a3dc033805346bf8dcf55397b6e..75f3539cb969d433c14a18300f34b6ed210ff57c 100644 --- a/examples/gemm_tuner/cl_gemm_reshaped.cpp +++ b/examples/gemm_tuner/cl_gemm_reshaped.cpp @@ -31,14 +31,15 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTuner.h" + #include "examples/gemm_tuner/CommonGemmExampleOptions.h" #include "examples/gemm_tuner/GemmTunerHelpers.h" #include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h" #include "src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h" #include "tests/CL/Helper.h" -#include "utils/Utils.h" #include "utils/command_line/CommandLineOptions.h" #include "utils/command_line/CommandLineParser.h" +#include "utils/Utils.h" #include @@ -53,16 +54,16 @@ namespace /** Structure holding all tunable gemm configs specific to this example/strategy */ struct GemmConfigs { - size_t m0{ 4 }; /**< Number of rows processed by the matrix multiplication */ - size_t n0{ 4 }; /**< Number of columns processed by the matrix multiplication */ - size_t k0{ 4 }; /**< Number of partial accumulations performed by the matrix multiplication */ - size_t v0{ 1 }; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */ - size_t h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ - bool interleave_lhs{ true }; /**< Interleave lhs matrix */ - bool transpose_lhs{ true }; /**< Transpose lhs matrix. */ - bool interleave_rhs{ true }; /**< Interleave rhs matrix */ - bool transpose_rhs{ true }; /**< Transpose rhs matrix. 
*/ - bool export_to_cl_image_rhs{ true }; /**< Export rhs matrix to cl_image. */ + size_t m0{4}; /**< Number of rows processed by the matrix multiplication */ + size_t n0{4}; /**< Number of columns processed by the matrix multiplication */ + size_t k0{4}; /**< Number of partial accumulations performed by the matrix multiplication */ + size_t v0{1}; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */ + size_t h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ + bool interleave_lhs{true}; /**< Interleave lhs matrix */ + bool transpose_lhs{true}; /**< Transpose lhs matrix. */ + bool interleave_rhs{true}; /**< Interleave rhs matrix */ + bool transpose_rhs{true}; /**< Transpose rhs matrix. */ + bool export_to_cl_image_rhs{true}; /**< Export rhs matrix to cl_image. */ }; /** Formatted output of the GemmConfigs type @@ -119,8 +120,10 @@ public: // FIXME: Currently we only support 2 variants of the gemm reshaped kernels in which transpose_lhs and // transpose_rhs are the opposites of each other. In the future we may extend the kernels to include the other // 2 variants (both transposed and none transposed) - transpose_rhs->set_help("Transpose rhs matrix but not lhs matrix (1) / Do not transpose rhs matrix but do transpose lhs matrix (0)"); - export_to_cl_image_rhs->set_help("Export rhs matrix to cl_image (1) / Do not export rhs matrix to cl_image (0)"); + transpose_rhs->set_help("Transpose rhs matrix but not lhs matrix (1) / Do not transpose rhs matrix but do " + "transpose lhs matrix (0)"); + export_to_cl_image_rhs->set_help( + "Export rhs matrix to cl_image (1) / Do not export rhs matrix to cl_image (0)"); } /** Prevent instances of this class from being copied (As this class contains pointers) */ GemmConfigOptions(const GemmConfigOptions &) = delete; @@ -133,17 +136,18 @@ public: /** Default destructor */ ~GemmConfigOptions() = default; - SimpleOption *m0; /**< Number of rows processed by the matrix multiplication option */ - SimpleOption *n0; /**< Number of columns processed by the matrix multiplication option */ - SimpleOption *k0; /**< Number of partial accumulations performed by the matrix multiplication option */ - SimpleOption *v0; /**< Number of vertical blocks of size (m0xk0) stored on the same output row option */ - SimpleOption *h0; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */ + SimpleOption *m0; /**< Number of rows processed by the matrix multiplication option */ + SimpleOption *n0; /**< Number of columns processed by the matrix multiplication option */ + SimpleOption *k0; /**< Number of partial accumulations performed by the matrix multiplication option */ + SimpleOption *v0; /**< Number of vertical blocks of size (m0xk0) stored on the same output row option */ + SimpleOption *h0; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */ SimpleOption *interleave_lhs; /**< Interleave lhs matrix option (1 enable; 0 disable) */ SimpleOption *interleave_rhs; /**< Interleave rhs matrix option (1 enable; 0 disable) */ // FIXME: Currently we only support 2 variants of the gemm reshaped kernels in which transpose_lhs and // transpose_rhs are the opposites of each other. In the future we may extend the kernels to include the other // 2 variants (both transposed and none transposed) - SimpleOption *transpose_rhs; /**< Transpose rhs matrix option (1 enable; 0 disable). Also set the lhs matrix transpose option to the opposite. 
*/ + SimpleOption * + transpose_rhs; /**< Transpose rhs matrix option (1 enable; 0 disable). Also set the lhs matrix transpose option to the opposite. */ SimpleOption *export_to_cl_image_rhs; /**< Export rhs matrix to cl_image.*/ }; @@ -198,13 +202,13 @@ public: // Parse command line options parser.parse(argc, argv); - if(param_options.help->is_set() && param_options.help->value()) + if (param_options.help->is_set() && param_options.help->value()) { // Print help message parser.print_help(argv[0]); return false; } - if(!parser.validate()) + if (!parser.validate()) { // Invalid arguments. Use default parameters and configs std::cerr << "Invalid arguments." << std::endl; @@ -256,20 +260,22 @@ public: kernel_info.broadcast_bias = true; kernel_info.activation_info = act_info; - if(rhs_info.h0 == 0) + if (rhs_info.h0 == 0) { rhs_info.h0 = std::max(kernel_info.n / rhs_info.n0, 1U); } // Initialise lhs_reshaped tensor info - lhs_reshaped.allocator()->init(TensorInfo(compute_lhs_reshaped_shape(*lhs.info(), lhs_info), 1, params.data_type)); + lhs_reshaped.allocator()->init( + TensorInfo(compute_lhs_reshaped_shape(*lhs.info(), lhs_info), 1, params.data_type)); // Initialise rhs_reshaped tensor info - rhs_reshaped.allocator()->init(TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type)); + rhs_reshaped.allocator()->init( + TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type)); - if(rhs_info.export_to_cl_image) + if (rhs_info.export_to_cl_image) { - if(!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info())) + if (!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info())) { std::cerr << "cl_image is not supported on the device, disable export_to_cl_image" << std::endl; return false; @@ -279,7 +285,7 @@ public: // Validate argments Status status{}; status = reshape_lhs.validate(lhs.info(), lhs_reshaped.info(), lhs_info, kernel_info.reinterpret_input_as_3d); - if(!status) + if (!status) { // Unsupported arguments std::cerr << "Unsupported arguments." << std::endl; @@ -287,8 +293,9 @@ public: return false; } - status = gemm.validate(lhs_reshaped.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info); - if(!status) + status = gemm.validate(lhs_reshaped.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, + rhs_info, kernel_info); + if (!status) { // Unsupported arguments std::cerr << "Unsupported arguments." 
<< std::endl; @@ -300,7 +307,8 @@ public: reshape_lhs.configure(lhs.info(), lhs_reshaped.info(), lhs_info); // Configure function - gemm.configure(lhs_reshaped.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info); + gemm.configure(lhs_reshaped.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, + rhs_info, kernel_info); // Allocate tensors lhs.allocator()->allocate(); @@ -315,14 +323,11 @@ public: void do_run() override { // Execute the functions - ITensorPack reshape_lsh_pack({ { ACL_SRC, &lhs }, { ACL_DST, &lhs_reshaped } }); + ITensorPack reshape_lsh_pack({{ACL_SRC, &lhs}, {ACL_DST, &lhs_reshaped}}); reshape_lhs.run(reshape_lsh_pack); - ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped }, - { ACL_SRC_1, &rhs_reshaped }, - { ACL_SRC_2, &bias }, - { ACL_DST, &dst } - }); + ITensorPack gemm_pack( + {{ACL_SRC_0, &lhs_reshaped}, {ACL_SRC_1, &rhs_reshaped}, {ACL_SRC_2, &bias}, {ACL_DST, &dst}}); gemm.run(gemm_pack); // Make sure all the OpenCL jobs are done executing: diff --git a/examples/gemm_tuner/cl_gemm_reshaped_rhs_only.cpp b/examples/gemm_tuner/cl_gemm_reshaped_rhs_only.cpp index 0ad2a65dc2a649f70cb816ba8f7f86762cf6c050..cfea2c9bac6f8f32d30aaeaccce2d2e195662372 100644 --- a/examples/gemm_tuner/cl_gemm_reshaped_rhs_only.cpp +++ b/examples/gemm_tuner/cl_gemm_reshaped_rhs_only.cpp @@ -25,20 +25,21 @@ #error "This example needs to be built with -DARM_COMPUTE_CL" #endif /* ARM_COMPUTE_CL */ -#include "CommonGemmExampleOptions.h" -#include "GemmTunerHelpers.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTuner.h" + #include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h" #include "tests/CL/Helper.h" -#include "utils/Utils.h" #include "utils/command_line/CommandLineOptions.h" #include "utils/command_line/CommandLineParser.h" +#include "utils/Utils.h" +#include "CommonGemmExampleOptions.h" +#include "GemmTunerHelpers.h" #include using namespace arm_compute; @@ -52,13 +53,13 @@ namespace /** Structure holding all tunable gemm configs specific to this example/strategy */ struct GemmConfigs { - size_t m0{ 4 }; /**< Number of rows processed by the matrix multiplication */ - size_t n0{ 4 }; /**< Number of columns processed by the matrix multiplication */ - size_t k0{ 4 }; /**< Number of partial accumulations performed by the matrix multiplication */ - size_t h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ - bool interleave_rhs{ true }; /**< Interleave rhs matrix */ - bool transpose_rhs{ true }; /**< Transpose rhs matrix */ - bool export_to_cl_image_rhs{ true }; /**< Export rhs matrix to cl_image.*/ + size_t m0{4}; /**< Number of rows processed by the matrix multiplication */ + size_t n0{4}; /**< Number of columns processed by the matrix multiplication */ + size_t k0{4}; /**< Number of partial accumulations performed by the matrix multiplication */ + size_t h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ + bool interleave_rhs{true}; /**< Interleave rhs matrix */ + bool transpose_rhs{true}; /**< Transpose rhs matrix */ + bool export_to_cl_image_rhs{true}; /**< Export rhs matrix to cl_image.*/ }; /** Formatted output of the GemmConfigs type @@ -106,7 +107,8 @@ public: h0->set_help("Number of horizontal blocks of size 
(k0xn0) stored on the same output row"); interleave_rhs->set_help("Interleave rhs matrix (1) / Do not interleave rhs matrix (0)"); transpose_rhs->set_help("Transpose rhs matrix (1) / Do not transpose rhs matrix (0)"); - export_to_cl_image_rhs->set_help("Export rhs matrix to cl_image (1) / Do not export rhs matrix to cl_image (0)"); + export_to_cl_image_rhs->set_help( + "Export rhs matrix to cl_image (1) / Do not export rhs matrix to cl_image (0)"); } /** Prevent instances of this class from being copied (As this class contains pointers) */ GemmConfigOptions(const GemmConfigOptions &) = delete; @@ -119,10 +121,10 @@ public: /** Default destructor */ ~GemmConfigOptions() = default; - SimpleOption *m0; /**< Number of rows processed by the matrix multiplication option */ - SimpleOption *n0; /**< Number of columns processed by the matrix multiplication option */ - SimpleOption *k0; /**< Number of partial accumulations performed by the matrix multiplication option */ - SimpleOption *h0; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */ + SimpleOption *m0; /**< Number of rows processed by the matrix multiplication option */ + SimpleOption *n0; /**< Number of columns processed by the matrix multiplication option */ + SimpleOption *k0; /**< Number of partial accumulations performed by the matrix multiplication option */ + SimpleOption *h0; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */ SimpleOption *interleave_rhs; /**< Interleave rhs matrix option (1 enable; 0 disable) */ SimpleOption *transpose_rhs; /**< Transpose rhs matrix option (1 enable; 0 disable) */ SimpleOption *export_to_cl_image_rhs; /**< Export rhs matrix to cl_image.*/ @@ -170,13 +172,13 @@ public: // Parse command line options parser.parse(argc, argv); - if(param_options.help->is_set() && param_options.help->value()) + if (param_options.help->is_set() && param_options.help->value()) { // Print help message parser.print_help(argv[0]); return false; } - if(!parser.validate()) + if (!parser.validate()) { // Invalid arguments. Use default parameters and configs std::cerr << "Invalid arguments." << std::endl; @@ -225,17 +227,18 @@ public: kernel_info.broadcast_bias = true; kernel_info.activation_info = act_info; - if(rhs_info.h0 == 0) + if (rhs_info.h0 == 0) { rhs_info.h0 = std::max(kernel_info.n / rhs_info.n0, 1U); } // Initialise rhs_reshaped tensor info - rhs_reshaped.allocator()->init(TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type)); + rhs_reshaped.allocator()->init( + TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type)); - if(rhs_info.export_to_cl_image) + if (rhs_info.export_to_cl_image) { - if(!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info())) + if (!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info())) { std::cerr << "cl_image is not supported on the device, disable export_to_cl_image" << std::endl; return false; @@ -244,8 +247,9 @@ public: // Validate argments Status status{}; - status = gemm.validate(lhs.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info); - if(!status) + status = gemm.validate(lhs.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, + rhs_info, kernel_info); + if (!status) { // Unsupported arguments std::cerr << "Unsupported arguments." 
<< std::endl; @@ -254,7 +258,8 @@ public: } // Configure function - gemm.configure(lhs.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info); + gemm.configure(lhs.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, + kernel_info); // Allocate tensors lhs.allocator()->allocate(); @@ -268,11 +273,7 @@ public: void do_run() override { // Execute the function - ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, - { ACL_SRC_1, &rhs_reshaped }, - { ACL_SRC_2, &bias }, - { ACL_DST, &dst } - }); + ITensorPack gemm_pack({{ACL_SRC_0, &lhs}, {ACL_SRC_1, &rhs_reshaped}, {ACL_SRC_2, &bias}, {ACL_DST, &dst}}); gemm.run(gemm_pack); // Make sure all the OpenCL jobs are done executing: diff --git a/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp b/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp index 9cf9c9fed04560df76d162db30201a7c1d8d7750..3808b98b7dc2dccdd52eb5a2228703694852b6e2 100644 --- a/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp +++ b/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp @@ -31,14 +31,15 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTuner.h" + #include "examples/gemm_tuner/CommonGemmExampleOptions.h" #include "examples/gemm_tuner/GemmTunerHelpers.h" #include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h" #include "src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h" #include "tests/CL/Helper.h" -#include "utils/Utils.h" #include "utils/command_line/CommandLineOptions.h" #include "utils/command_line/CommandLineParser.h" +#include "utils/Utils.h" #include @@ -53,15 +54,15 @@ namespace /** Structure holding all tunable gemm configs specific to this example/strategy */ struct GemmConfigs { - size_t m0{ 4 }; /**< Number of rows processed by the matrix multiplication */ - size_t n0{ 4 }; /**< Number of columns processed by the matrix multiplication */ - size_t k0{ 4 }; /**< Number of partial accumulations performed by the matrix multiplication */ - size_t v0{ 1 }; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */ - size_t h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ - bool interleave_lhs{ true }; /**< Interleave lhs matrix */ - bool transpose_lhs{ true }; /**< Transpose lhs matrix. */ - bool interleave_rhs{ true }; /**< Interleave rhs matrix */ - bool transpose_rhs{ true }; /**< Transpose rhs matrix. */ + size_t m0{4}; /**< Number of rows processed by the matrix multiplication */ + size_t n0{4}; /**< Number of columns processed by the matrix multiplication */ + size_t k0{4}; /**< Number of partial accumulations performed by the matrix multiplication */ + size_t v0{1}; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */ + size_t h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ + bool interleave_lhs{true}; /**< Interleave lhs matrix */ + bool transpose_lhs{true}; /**< Transpose lhs matrix. */ + bool interleave_rhs{true}; /**< Interleave rhs matrix */ + bool transpose_rhs{true}; /**< Transpose rhs matrix. */ }; /** Formatted output of the GemmConfigs type @@ -116,7 +117,8 @@ public: // FIXME: Currently we only support 2 variants of the gemm reshaped kernels in which transpose_lhs and // transpose_rhs are the opposites of each other. 
In the future we may extend the kernels to include the other // 2 variants (both transposed and none transposed) - transpose_rhs->set_help("Transpose rhs matrix but not lhs matrix (1) / Do not transpose rhs matrix but do transpose lhs matrix (0)"); + transpose_rhs->set_help("Transpose rhs matrix but not lhs matrix (1) / Do not transpose rhs matrix but do " + "transpose lhs matrix (0)"); } /** Prevent instances of this class from being copied (As this class contains pointers) */ GemmConfigOptions(const GemmConfigOptions &) = delete; @@ -129,17 +131,18 @@ public: /** Default destructor */ ~GemmConfigOptions() = default; - SimpleOption *m0; /**< Number of rows processed by the matrix multiplication option */ - SimpleOption *n0; /**< Number of columns processed by the matrix multiplication option */ - SimpleOption *k0; /**< Number of partial accumulations performed by the matrix multiplication option */ - SimpleOption *v0; /**< Number of vertical blocks of size (m0xk0) stored on the same output row option */ - SimpleOption *h0; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */ + SimpleOption *m0; /**< Number of rows processed by the matrix multiplication option */ + SimpleOption *n0; /**< Number of columns processed by the matrix multiplication option */ + SimpleOption *k0; /**< Number of partial accumulations performed by the matrix multiplication option */ + SimpleOption *v0; /**< Number of vertical blocks of size (m0xk0) stored on the same output row option */ + SimpleOption *h0; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */ SimpleOption *interleave_lhs; /**< Interleave lhs matrix option (1 enable; 0 disable) */ SimpleOption *interleave_rhs; /**< Interleave rhs matrix option (1 enable; 0 disable) */ // FIXME: Currently we only support 2 variants of the gemm reshaped kernels in which transpose_lhs and // transpose_rhs are the opposites of each other. In the future we may extend the kernels to include the other // 2 variants (both transposed and none transposed) - SimpleOption *transpose_rhs; /**< Transpose rhs matrix option (1 enable; 0 disable). Also set the lhs matrix transpose option to the opposite. */ + SimpleOption * + transpose_rhs; /**< Transpose rhs matrix option (1 enable; 0 disable). Also set the lhs matrix transpose option to the opposite. */ }; /** Consumes the gemm configuration options and creates a structure containing all information @@ -186,12 +189,12 @@ public: GemmConfigOptions config_options(parser); parser.parse(argc, argv); - if(param_options.help->is_set() && param_options.help->value()) + if (param_options.help->is_set() && param_options.help->value()) { parser.print_help(argv[0]); return false; } - if(!parser.validate()) + if (!parser.validate()) { // Invalid arguments. Use default parameters and configs std::cerr << "Invalid arguments." 
<< std::endl; @@ -217,10 +220,7 @@ public: rhs.allocator()->init(TensorInfo(TensorShape(params.N, params.K, params.B), 1, params.data_type)); // Set arbitrary quantization information - const QuantizationInfo q_info - { - 0.012, 3 - }; + const QuantizationInfo q_info{0.012, 3}; lhs.info()->set_quantization_info(q_info); rhs.info()->set_quantization_info(q_info); dst.info()->set_quantization_info(q_info); @@ -240,45 +240,44 @@ public: rhs_info.transpose = configs.transpose_rhs; rhs_info.export_to_cl_image = false; // CL image not supported for quantized cases yet - if(rhs_info.h0 == 0) + if (rhs_info.h0 == 0) { rhs_info.h0 = std::max(static_cast(params.N) / rhs_info.n0, 1U); } - lhs_reshaped.allocator()->init(TensorInfo(compute_lhs_reshaped_shape(*lhs.info(), lhs_info), 1, params.data_type)); - rhs_reshaped.allocator()->init(TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type)); + lhs_reshaped.allocator()->init( + TensorInfo(compute_lhs_reshaped_shape(*lhs.info(), lhs_info), 1, params.data_type)); + rhs_reshaped.allocator()->init( + TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type)); lhs_reshaped.info()->set_quantization_info(q_info); rhs_reshaped.info()->set_quantization_info(q_info); - if(rhs_info.export_to_cl_image) + if (rhs_info.export_to_cl_image) { - if(!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info())) + if (!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info())) { std::cerr << "cl_image is not supported on the device, disable export_to_cl_image" << std::endl; return false; } } - GEMMReshapeInfo gemm_info - { - static_cast(params.M), - static_cast(params.N), - static_cast(params.K), - static_cast(configs.h0), - static_cast(configs.v0), - 0, - false, - true - }; + GEMMReshapeInfo gemm_info{static_cast(params.M), + static_cast(params.N), + static_cast(params.K), + static_cast(configs.h0), + static_cast(configs.v0), + 0, + false, + true}; // Validate argments - if(!reshape_lhs.validate(lhs.info(), lhs_reshaped.info(), lhs_info, gemm_info.reinterpret_input_as_3d())) + if (!reshape_lhs.validate(lhs.info(), lhs_reshaped.info(), lhs_info, gemm_info.reinterpret_input_as_3d())) { std::cerr << "Invalid arguments for ClGemmReshapeLHSMatrixKernel." << std::endl; return false; } - if(!gemm.validate(lhs_reshaped.info(), rhs_reshaped.info(), dst.info(), lhs_info, rhs_info, gemm_info)) + if (!gemm.validate(lhs_reshaped.info(), rhs_reshaped.info(), dst.info(), lhs_info, rhs_info, gemm_info)) { std::cerr << "Invalid arguments for ClGemmLowpMatrixMultiplyReshapedKernel." 
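// Illustrative sketch (not part of the patch): how the tuner options are expected to map onto the
// GEMMRHSMatrixInfo used in this example, based on the assignments visible in this hunk. The cast argument
// <unsigned int> is an assumption (the template argument is not shown here); configs, params and q_info mirror
// the example's locals.
GEMMRHSMatrixInfo rhs_info;
rhs_info.n0                 = configs.n0;            // columns of the output block processed per work-item
rhs_info.k0                 = configs.k0;            // accumulations performed per iteration
rhs_info.h0                 = configs.h0;            // horizontal (k0 x n0) blocks stored on the same reshaped row, 0 = pick a default
rhs_info.interleave         = configs.interleave_rhs;
rhs_info.transpose          = configs.transpose_rhs;
rhs_info.export_to_cl_image = false;                 // CL image not supported for quantized cases yet

if (rhs_info.h0 == 0)
{
    rhs_info.h0 = std::max(static_cast<unsigned int>(params.N) / rhs_info.n0, 1U);
}

rhs_reshaped.allocator()->init(
    TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type));
rhs_reshaped.info()->set_quantization_info(q_info);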
<< std::endl; return false; @@ -300,10 +299,10 @@ public: } void do_run() override { - ITensorPack reshape_lsh_pack({ { ACL_SRC, &lhs }, { ACL_DST, &lhs_reshaped } }); + ITensorPack reshape_lsh_pack({{ACL_SRC, &lhs}, {ACL_DST, &lhs_reshaped}}); reshape_lhs.run(reshape_lsh_pack); - ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped }, { ACL_SRC_1, &rhs_reshaped }, { ACL_DST, &dst } }); + ITensorPack gemm_pack({{ACL_SRC_0, &lhs_reshaped}, {ACL_SRC_1, &rhs_reshaped}, {ACL_DST, &dst}}); gemm.run(gemm_pack); // Make sure all the OpenCL jobs are done executing: diff --git a/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp b/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp index 94f3c93166b00cec7201a389f0c83fb6419f4d6e..4acb316a3c5494414c9c76fc85defe695e47134a 100644 --- a/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp +++ b/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp @@ -25,23 +25,23 @@ #error "This example needs to be built with -DARM_COMPUTE_CL" #endif /* ARM_COMPUTE_CL */ -#include "CommonGemmExampleOptions.h" -#include "GemmTunerHelpers.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTuner.h" + #include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h" #include "src/gpu/cl/kernels/ClGemmLowpReductionKernel.h" #include "tests/CL/Helper.h" -#include "utils/Utils.h" #include "utils/command_line/CommandLineOptions.h" #include "utils/command_line/CommandLineParser.h" +#include "utils/Utils.h" +#include "CommonGemmExampleOptions.h" +#include "GemmTunerHelpers.h" #include #include @@ -56,12 +56,12 @@ namespace /** Structure holding all tunable gemm configs specific to this example/strategy */ struct GemmConfigs { - size_t m0{ 4 }; /**< Number of rows processed by the matrix multiplication */ - size_t n0{ 4 }; /**< Number of columns processed by the matrix multiplication */ - size_t k0{ 4 }; /**< Number of partial accumulations performed by the matrix multiplication */ - size_t h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ - bool interleave_rhs{ true }; /**< Interleave rhs matrix */ - bool transpose_rhs{ true }; /**< Transpose rhs matrix */ + size_t m0{4}; /**< Number of rows processed by the matrix multiplication */ + size_t n0{4}; /**< Number of columns processed by the matrix multiplication */ + size_t k0{4}; /**< Number of partial accumulations performed by the matrix multiplication */ + size_t h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ + bool interleave_rhs{true}; /**< Interleave rhs matrix */ + bool transpose_rhs{true}; /**< Transpose rhs matrix */ }; /** Formatted output of the GemmConfigs type @@ -119,10 +119,10 @@ public: /** Default destructor */ ~GemmConfigOptions() = default; - SimpleOption *m0; /**< Number of rows processed by the matrix multiplication option */ - SimpleOption *n0; /**< Number of columns processed by the matrix multiplication option */ - SimpleOption *k0; /**< Number of partial accumulations performed by the matrix multiplication option */ - SimpleOption *h0; /**< Number of horizontal blocks of 
size (k0xn0) stored on the same output row option */ + SimpleOption *m0; /**< Number of rows processed by the matrix multiplication option */ + SimpleOption *n0; /**< Number of columns processed by the matrix multiplication option */ + SimpleOption *k0; /**< Number of partial accumulations performed by the matrix multiplication option */ + SimpleOption *h0; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */ SimpleOption *interleave_rhs; /**< Interleave rhs matrix option (1 enable; 0 disable) */ SimpleOption *transpose_rhs; /**< Transpose rhs matrix option (1 enable; 0 disable) */ }; @@ -147,8 +147,9 @@ GemmConfigs consume_gemm_configs(const GemmConfigOptions &options) } // namespace -using ClGemmLowpMatrixMultiplyReshapedOnlyRhs = test::CLSynthetizeOperator; -using ClGemmLowpMatrixAReduction = test::CLSynthetizeOperator; +using ClGemmLowpMatrixMultiplyReshapedOnlyRhs = + test::CLSynthetizeOperator; +using ClGemmLowpMatrixAReduction = test::CLSynthetizeOperator; class CLGEMMLowpMatrixMultiplyReshapedOnlyRHSFusedOutputStageFixedpointExample : public Example { @@ -165,12 +166,12 @@ public: GemmConfigOptions config_options(parser); parser.parse(argc, argv); - if(param_options.help->is_set() && param_options.help->value()) + if (param_options.help->is_set() && param_options.help->value()) { parser.print_help(argv[0]); return false; } - if(!parser.validate()) + if (!parser.validate()) { // Invalid arguments. Use default parameters and configs std::cerr << "Invalid arguments." << std::endl; @@ -199,10 +200,7 @@ public: // Set arbitrary quantization information (non-zero offset to ensure offset contribution stage is included) // Could be extended in the future to include a user-controlled option for offset == 0 - const QuantizationInfo q_info - { - 0.012, 3 - }; + const QuantizationInfo q_info{0.012, 3}; lhs.info()->set_quantization_info(q_info); rhs.info()->set_quantization_info(q_info); bias.info()->set_quantization_info(q_info); @@ -220,16 +218,17 @@ public: rhs_info.transpose = configs.transpose_rhs; rhs_info.export_to_cl_image = false; // CL image not supported for quantized cases yet - if(rhs_info.h0 == 0) + if (rhs_info.h0 == 0) { rhs_info.h0 = std::max(static_cast(params.N) / rhs_info.n0, 1U); } - rhs_reshaped.allocator()->init(TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type)); + rhs_reshaped.allocator()->init( + TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type)); rhs_reshaped.info()->set_quantization_info(q_info); - if(rhs_info.export_to_cl_image) + if (rhs_info.export_to_cl_image) { - if(!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info())) + if (!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info())) { std::cerr << "cl_image is not supported on the device, disable export_to_cl_image" << std::endl; return false; @@ -251,9 +250,7 @@ public: gemmlowp_output_stage.gemmlowp_multipliers.resize(num_filters); gemmlowp_output_stage.gemmlowp_shifts.resize(num_filters); - quantization::compute_quantized_multipliers_and_shifts(lhs.info(), - rhs.info(), - dst.info(), + quantization::compute_quantized_multipliers_and_shifts(lhs.info(), rhs.info(), dst.info(), gemmlowp_output_stage.gemmlowp_multipliers.data(), gemmlowp_output_stage.gemmlowp_shifts.data()); gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0]; @@ -290,14 +287,14 @@ public: gemm_info.output_stage = gemmlowp_output_stage; // Initialize Matrix A 
reduction kernel only if _b_offset is not equal to 0 - if(gemm_info.b_offset != 0) + if (gemm_info.b_offset != 0) { const TensorInfo info_vector_sum_row(compute_reductionB_shape(*lhs.info()), 1, DataType::S32); vector_sum_row.allocator()->init(info_vector_sum_row); mtx_a_reduction = std::make_unique(); - if(!mtx_a_reduction->validate(lhs.info(), vector_sum_row.info(), GEMMLowpReductionKernelInfo{})) + if (!mtx_a_reduction->validate(lhs.info(), vector_sum_row.info(), GEMMLowpReductionKernelInfo{})) { std::cerr << "Invalid arguments for CLGEMMLowpMatrixAReductionKernel." << std::endl; return false; @@ -306,7 +303,7 @@ public: mtx_a_reduction->configure(lhs.info(), vector_sum_row.info(), GEMMLowpReductionKernelInfo{}); } // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 - if(gemm_info.a_offset != 0) + if (gemm_info.a_offset != 0) { const TensorInfo info_vector_sum_col(compute_reductionA_shape(*rhs.info()), 1, DataType::S32); vector_sum_col.allocator()->init(info_vector_sum_col); @@ -314,8 +311,10 @@ public: } // Validate argments - if(!gemm.validate(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info, gemm_info.a_offset == 0 ? nullptr : vector_sum_col.info(), - gemm_info.b_offset == 0 ? nullptr : vector_sum_row.info(), bias.info(), dst_multipliers.info(), dst_shifts.info())) + if (!gemm.validate(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info, + gemm_info.a_offset == 0 ? nullptr : vector_sum_col.info(), + gemm_info.b_offset == 0 ? nullptr : vector_sum_row.info(), bias.info(), + dst_multipliers.info(), dst_shifts.info())) { std::cerr << "Invalid arguments for ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel." << std::endl; return false; @@ -323,8 +322,9 @@ public: // Configure function gemm.configure(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info, - gemm_info.a_offset == 0 ? nullptr : vector_sum_col.info(), gemm_info.b_offset == 0 ? nullptr : vector_sum_row.info(), - bias.info(), dst_multipliers.info(), dst_shifts.info()); + gemm_info.a_offset == 0 ? nullptr : vector_sum_col.info(), + gemm_info.b_offset == 0 ? 
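// Illustrative note (not part of the patch): the reduction kernels set up in this example provide the standard
// GEMMLowp offset-contribution terms. For quantized A (MxK) and B (KxN) with zero points a_offset and b_offset,
//   sum_k (A[i,k] - a_offset) * (B[k,j] - b_offset)
//     = sum_k A[i,k] * B[k,j] - b_offset * row_sum_A[i] - a_offset * col_sum_B[j] + K * a_offset * b_offset
// so the per-row sums of A (vector_sum_row) are only needed when b_offset != 0, and the per-column sums of B
// (vector_sum_col) only when a_offset != 0, which is exactly the gating shown here. Condensed view, mirroring
// the example's names (the make_unique template argument is an assumption; it is not shown in this hunk):
if (gemm_info.b_offset != 0)
{
    vector_sum_row.allocator()->init(TensorInfo(compute_reductionB_shape(*lhs.info()), 1, DataType::S32));
    mtx_a_reduction = std::make_unique<ClGemmLowpMatrixAReduction>();
    mtx_a_reduction->configure(lhs.info(), vector_sum_row.info(), GEMMLowpReductionKernelInfo{});
}
if (gemm_info.a_offset != 0)
{
    vector_sum_col.allocator()->init(TensorInfo(compute_reductionA_shape(*rhs.info()), 1, DataType::S32));
}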
nullptr : vector_sum_row.info(), bias.info(), dst_multipliers.info(), + dst_shifts.info()); // Allocate tensors lhs.allocator()->allocate(); @@ -341,13 +341,20 @@ public: } void do_run() override { - if(mtx_a_reduction != nullptr) + if (mtx_a_reduction != nullptr) { - ITensorPack red_pack({ { ACL_SRC, &lhs }, { ACL_DST, &dst } }); + ITensorPack red_pack({{ACL_SRC, &lhs}, {ACL_DST, &dst}}); mtx_a_reduction->run(red_pack); } - ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs }, { ACL_BIAS, &bias }, { ACL_VEC_COL_SUM, &vector_sum_col }, { ACL_VEC_ROW_SUM, &vector_sum_row }, { ACL_SHIFTS, &dst_shifts }, { ACL_MULTIPLIERS, &dst_multipliers }, { ACL_DST, &dst } }); + ITensorPack gemm_pack({{ACL_SRC_0, &lhs}, + {ACL_SRC_1, &rhs}, + {ACL_BIAS, &bias}, + {ACL_VEC_COL_SUM, &vector_sum_col}, + {ACL_VEC_ROW_SUM, &vector_sum_row}, + {ACL_SHIFTS, &dst_shifts}, + {ACL_MULTIPLIERS, &dst_multipliers}, + {ACL_DST, &dst}}); gemm.run(gemm_pack); // Make sure all the OpenCL jobs are done executing: @@ -370,7 +377,7 @@ private: CLTensor dst_shifts{}; CLTuner tuner{}; ClGemmLowpMatrixMultiplyReshapedOnlyRhs gemm{}; - std::unique_ptr mtx_a_reduction{ nullptr }; + std::unique_ptr mtx_a_reduction{nullptr}; }; /** Main test program for gemmlowp reshaped rhs only with fused output stage fixedpoint diff --git a/examples/graph_alexnet.cpp b/examples/graph_alexnet.cpp index 53a4547e04f934a7d3ebb4748c20ad793ae097ea..be0b8a7d8a9a61af550ccf02ea26c390f5cd047a 100644 --- a/examples/graph_alexnet.cpp +++ b/examples/graph_alexnet.cpp @@ -39,8 +39,7 @@ using namespace arm_compute::graph_utils; class GraphAlexnetExample : public Example { public: - GraphAlexnetExample() - : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "AlexNet") + GraphAlexnetExample() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "AlexNet") { } bool do_setup(int argc, char **argv) override @@ -53,14 +52,15 @@ public: common_params = consume_common_graph_parameters(common_opts); // Return when help menu is requested - if(common_params.help) + if (common_params.help) { cmd_parser.print_help(argv[0]); return false; } // Checks - ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), "QASYMM8 not supported for this graph"); + ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), + "QASYMM8 not supported for this graph"); // Print parameter values std::cout << common_params << std::endl; @@ -69,88 +69,80 @@ public: std::string data_path = common_params.data_path; // Create a preprocessor object - const std::array mean_rgb{ { 122.68f, 116.67f, 104.01f } }; + const std::array mean_rgb{{122.68f, 116.67f, 104.01f}}; std::unique_ptr preprocessor = std::make_unique(mean_rgb); // Create input descriptor const auto operation_layout = common_params.data_layout; - const TensorShape tensor_shape = permute_shape(TensorShape(227U, 227U, 3U, common_params.batches), DataLayout::NCHW, operation_layout); - TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout); + const TensorShape tensor_shape = + permute_shape(TensorShape(227U, 227U, 3U, common_params.batches), DataLayout::NCHW, operation_layout); + TensorDescriptor input_descriptor = + TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout); // Set weights trained layout const DataLayout weights_layout = DataLayout::NCHW; - graph << common_params.target - << common_params.fast_math_hint - << 
InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor))) - // Layer 1 - << ConvolutionLayer( - 11U, 11U, 96U, - get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv1_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv1_b.npy"), - PadStrideInfo(4, 4, 0, 0)) - .set_name("conv1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu1") - << NormalizationLayer(NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f)).set_name("norm1") - << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0))).set_name("pool1") - // Layer 2 - << ConvolutionLayer( - 5U, 5U, 256U, - get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv2_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv2_b.npy"), - PadStrideInfo(1, 1, 2, 2), 2) - .set_name("conv2") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu2") - << NormalizationLayer(NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f)).set_name("norm2") - << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0))).set_name("pool2") - // Layer 3 - << ConvolutionLayer( - 3U, 3U, 384U, - get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv3_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv3_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv3") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu3") - // Layer 4 - << ConvolutionLayer( - 3U, 3U, 384U, - get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv4_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv4_b.npy"), - PadStrideInfo(1, 1, 1, 1), 2) - .set_name("conv4") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu4") - // Layer 5 - << ConvolutionLayer( - 3U, 3U, 256U, - get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv5_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv5_b.npy"), - PadStrideInfo(1, 1, 1, 1), 2) - .set_name("conv5") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu5") - << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0))).set_name("pool5") - // Layer 6 - << FullyConnectedLayer( - 4096U, - get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc6_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc6_b.npy")) - .set_name("fc6") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu6") - // Layer 7 - << FullyConnectedLayer( - 4096U, - get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc7_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc7_b.npy")) - .set_name("fc7") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu7") - // Layer 8 - << FullyConnectedLayer( - 1000U, - get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc8_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc8_b.npy")) - .set_name("fc8") - // Softmax - << SoftmaxLayer().set_name("prob") - << OutputLayer(get_output_accessor(common_params, 5)); + graph + << 
common_params.target << common_params.fast_math_hint + << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor))) + // Layer 1 + << ConvolutionLayer(11U, 11U, 96U, + get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv1_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv1_b.npy"), + PadStrideInfo(4, 4, 0, 0)) + .set_name("conv1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu1") + << NormalizationLayer(NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f)).set_name("norm1") + << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0))) + .set_name("pool1") + // Layer 2 + << ConvolutionLayer( + 5U, 5U, 256U, get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv2_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv2_b.npy"), PadStrideInfo(1, 1, 2, 2), 2) + .set_name("conv2") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu2") + << NormalizationLayer(NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f)).set_name("norm2") + << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0))) + .set_name("pool2") + // Layer 3 + << ConvolutionLayer( + 3U, 3U, 384U, get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv3_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv3_b.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv3") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu3") + // Layer 4 + << ConvolutionLayer( + 3U, 3U, 384U, get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv4_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv4_b.npy"), PadStrideInfo(1, 1, 1, 1), 2) + .set_name("conv4") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu4") + // Layer 5 + << ConvolutionLayer( + 3U, 3U, 256U, get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv5_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv5_b.npy"), PadStrideInfo(1, 1, 1, 1), 2) + .set_name("conv5") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu5") + << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0))) + .set_name("pool5") + // Layer 6 + << FullyConnectedLayer(4096U, + get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc6_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc6_b.npy")) + .set_name("fc6") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu6") + // Layer 7 + << FullyConnectedLayer(4096U, + get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc7_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc7_b.npy")) + .set_name("fc7") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu7") + // Layer 8 + << FullyConnectedLayer(1000U, + get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc8_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc8_b.npy")) + .set_name("fc8") + // Softmax + << SoftmaxLayer().set_name("prob") << 
OutputLayer(get_output_accessor(common_params, 5)); // Finalize graph GraphConfig config; @@ -163,7 +155,7 @@ public: // Load the precompiled kernels from a file into the kernel library, in this way the next time they are needed // compilation won't be required. - if(common_params.enable_cl_cache) + if (common_params.enable_cl_cache) { #ifdef ARM_COMPUTE_CL restore_program_cache_from_file(); @@ -173,7 +165,7 @@ public: graph.finalize(common_params.target, config); // Save the opencl kernels to a file - if(common_opts.enable_cl_cache) + if (common_opts.enable_cl_cache) { #ifdef ARM_COMPUTE_CL save_program_cache_to_file(); diff --git a/examples/graph_deepspeech_v0_4_1.cpp b/examples/graph_deepspeech_v0_4_1.cpp index da163b649359d6b1ef764eeb0c4ea64dd8236bde..08cd4a47b1f856a92597417cd6f9e0c752160563 100644 --- a/examples/graph_deepspeech_v0_4_1.cpp +++ b/examples/graph_deepspeech_v0_4_1.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/graph.h" #include "arm_compute/graph/Types.h" + #include "support/ToolchainSupport.h" #include "utils/CommonGraphOptions.h" #include "utils/GraphUtils.h" @@ -37,8 +38,7 @@ using namespace arm_compute::graph_utils; class GraphDeepSpeechExample : public Example { public: - GraphDeepSpeechExample() - : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "DeepSpeech v0.4.1") + GraphDeepSpeechExample() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "DeepSpeech v0.4.1") { } bool do_setup(int argc, char **argv) override @@ -51,7 +51,7 @@ public: common_params = consume_common_graph_parameters(common_opts); // Return when help menu is requested - if(common_params.help) + if (common_params.help) { cmd_parser.print_help(argv[0]); return false; @@ -64,7 +64,7 @@ public: std::string data_path = common_params.data_path; const std::string model_path = "/cnn_data/deepspeech_model/"; - if(!data_path.empty()) + if (!data_path.empty()) { data_path += model_path; } @@ -77,131 +77,131 @@ public: const float cell_clip = 20.f; // Create input descriptor - const TensorShape tensor_shape = permute_shape(TensorShape(26U, 19U, n_steps, 1U), DataLayout::NHWC, common_params.data_layout); - TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout); + const TensorShape tensor_shape = + permute_shape(TensorShape(26U, 19U, n_steps, 1U), DataLayout::NHWC, common_params.data_layout); + TensorDescriptor input_descriptor = + TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout); // Set weights trained layout const DataLayout weights_layout = DataLayout::NHWC; - graph << common_params.target - << common_params.fast_math_hint + graph << common_params.target << common_params.fast_math_hint << InputLayer(input_descriptor, - get_weights_accessor(data_path, "input_values_x" + std::to_string(n_steps) + ".npy", weights_layout)) - .set_name("input_node"); + get_weights_accessor(data_path, "input_values_x" + std::to_string(n_steps) + ".npy", + weights_layout)) + .set_name("input_node"); - if(common_params.data_layout == DataLayout::NCHW) + if (common_params.data_layout == DataLayout::NCHW) { graph << PermuteLayer(PermutationVector(2U, 0U, 1U), common_params.data_layout).set_name("permute_to_nhwc"); } graph << ReshapeLayer(TensorShape(494U, n_steps)).set_name("Reshape_input") // Layer 1 - << FullyConnectedLayer( - 2048U, - get_weights_accessor(data_path, "h1_transpose.npy", weights_layout), - get_weights_accessor(data_path, "MatMul_bias.npy")) - .set_name("fc0") + << 
FullyConnectedLayer(2048U, get_weights_accessor(data_path, "h1_transpose.npy", weights_layout), + get_weights_accessor(data_path, "MatMul_bias.npy")) + .set_name("fc0") << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, cell_clip)) - .set_name("Relu") + .set_name("Relu") // Layer 2 - << FullyConnectedLayer( - 2048U, - get_weights_accessor(data_path, "h2_transpose.npy", weights_layout), - get_weights_accessor(data_path, "MatMul_1_bias.npy")) - .set_name("fc1") + << FullyConnectedLayer(2048U, get_weights_accessor(data_path, "h2_transpose.npy", weights_layout), + get_weights_accessor(data_path, "MatMul_1_bias.npy")) + .set_name("fc1") << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, cell_clip)) - .set_name("Relu_1") + .set_name("Relu_1") // Layer 3 - << FullyConnectedLayer( - 2048U, - get_weights_accessor(data_path, "h3_transpose.npy", weights_layout), - get_weights_accessor(data_path, "MatMul_2_bias.npy")) - .set_name("fc2") + << FullyConnectedLayer(2048U, get_weights_accessor(data_path, "h3_transpose.npy", weights_layout), + get_weights_accessor(data_path, "MatMul_2_bias.npy")) + .set_name("fc2") << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, cell_clip)) - .set_name("Relu_2") + .set_name("Relu_2") // Layer 4 << ReshapeLayer(TensorShape(2048U, 1U, n_steps)).set_name("Reshape_1"); // Unstack Layer (using SplitLayerNode) - NodeParams unstack_params = { "unstack", graph.hints().target_hint }; - NodeID unstack_nid = GraphBuilder::add_split_node(graph.graph(), unstack_params, { graph.tail_node(), 0 }, n_steps, 2); + NodeParams unstack_params = {"unstack", graph.hints().target_hint}; + NodeID unstack_nid = + GraphBuilder::add_split_node(graph.graph(), unstack_params, {graph.tail_node(), 0}, n_steps, 2); // Create input state descriptor - TensorDescriptor state_descriptor = TensorDescriptor(TensorShape(2048U), common_params.data_type).set_layout(common_params.data_layout); - SubStream previous_state(graph); - SubStream add_y(graph); + TensorDescriptor state_descriptor = + TensorDescriptor(TensorShape(2048U), common_params.data_type).set_layout(common_params.data_layout); + SubStream previous_state(graph); + SubStream add_y(graph); // Initial state for LSTM is all zeroes for both state_h and state_c, therefore only one input is created - previous_state << InputLayer(state_descriptor, - get_weights_accessor(data_path, "zeros.npy")) - .set_name("previous_state_c_h"); - add_y << InputLayer(state_descriptor, - get_weights_accessor(data_path, "ones.npy")) - .set_name("add_y"); + previous_state << InputLayer(state_descriptor, get_weights_accessor(data_path, "zeros.npy")) + .set_name("previous_state_c_h"); + add_y << InputLayer(state_descriptor, get_weights_accessor(data_path, "ones.npy")).set_name("add_y"); // Create LSTM Fully Connected weights and bias descriptors - TensorDescriptor lstm_weights_descriptor = TensorDescriptor(TensorShape(4096U, 8192U), common_params.data_type).set_layout(common_params.data_layout); - TensorDescriptor lstm_bias_descriptor = TensorDescriptor(TensorShape(8192U), common_params.data_type).set_layout(common_params.data_layout); - SubStream lstm_fc_weights(graph); - SubStream lstm_fc_bias(graph); - lstm_fc_weights << ConstantLayer(lstm_weights_descriptor, - get_weights_accessor(data_path, "rnn_lstm_cell_kernel_transpose.npy", weights_layout)) - .set_name("h5/transpose"); + TensorDescriptor lstm_weights_descriptor = + 
TensorDescriptor(TensorShape(4096U, 8192U), common_params.data_type).set_layout(common_params.data_layout); + TensorDescriptor lstm_bias_descriptor = + TensorDescriptor(TensorShape(8192U), common_params.data_type).set_layout(common_params.data_layout); + SubStream lstm_fc_weights(graph); + SubStream lstm_fc_bias(graph); + lstm_fc_weights << ConstantLayer( + lstm_weights_descriptor, + get_weights_accessor(data_path, "rnn_lstm_cell_kernel_transpose.npy", weights_layout)) + .set_name("h5/transpose"); lstm_fc_bias << ConstantLayer(lstm_bias_descriptor, get_weights_accessor(data_path, "rnn_lstm_cell_MatMul_bias.npy")) - .set_name("MatMul_3_bias"); + .set_name("MatMul_3_bias"); // LSTM Block - std::pair new_state_1 = add_lstm_cell(unstack_nid, 0, previous_state, previous_state, add_y, lstm_fc_weights, lstm_fc_bias); - std::pair new_state_2 = add_lstm_cell(unstack_nid, 1, new_state_1.first, new_state_1.second, add_y, lstm_fc_weights, lstm_fc_bias); - std::pair new_state_3 = add_lstm_cell(unstack_nid, 2, new_state_2.first, new_state_2.second, add_y, lstm_fc_weights, lstm_fc_bias); - std::pair new_state_4 = add_lstm_cell(unstack_nid, 3, new_state_3.first, new_state_3.second, add_y, lstm_fc_weights, lstm_fc_bias); - std::pair new_state_5 = add_lstm_cell(unstack_nid, 4, new_state_4.first, new_state_4.second, add_y, lstm_fc_weights, lstm_fc_bias); - std::pair new_state_6 = add_lstm_cell(unstack_nid, 5, new_state_5.first, new_state_5.second, add_y, lstm_fc_weights, lstm_fc_bias); - std::pair new_state_7 = add_lstm_cell(unstack_nid, 6, new_state_6.first, new_state_6.second, add_y, lstm_fc_weights, lstm_fc_bias); - std::pair new_state_8 = add_lstm_cell(unstack_nid, 7, new_state_7.first, new_state_7.second, add_y, lstm_fc_weights, lstm_fc_bias); - std::pair new_state_9 = add_lstm_cell(unstack_nid, 8, new_state_8.first, new_state_8.second, add_y, lstm_fc_weights, lstm_fc_bias); - std::pair new_state_10 = add_lstm_cell(unstack_nid, 9, new_state_9.first, new_state_9.second, add_y, lstm_fc_weights, lstm_fc_bias); - std::pair new_state_11 = add_lstm_cell(unstack_nid, 10, new_state_10.first, new_state_10.second, add_y, lstm_fc_weights, lstm_fc_bias); - std::pair new_state_12 = add_lstm_cell(unstack_nid, 11, new_state_11.first, new_state_11.second, add_y, lstm_fc_weights, lstm_fc_bias); - std::pair new_state_13 = add_lstm_cell(unstack_nid, 12, new_state_12.first, new_state_12.second, add_y, lstm_fc_weights, lstm_fc_bias); - std::pair new_state_14 = add_lstm_cell(unstack_nid, 13, new_state_13.first, new_state_13.second, add_y, lstm_fc_weights, lstm_fc_bias); - std::pair new_state_15 = add_lstm_cell(unstack_nid, 14, new_state_14.first, new_state_14.second, add_y, lstm_fc_weights, lstm_fc_bias); - std::pair new_state_16 = add_lstm_cell(unstack_nid, 15, new_state_15.first, new_state_15.second, add_y, lstm_fc_weights, lstm_fc_bias); + std::pair new_state_1 = + add_lstm_cell(unstack_nid, 0, previous_state, previous_state, add_y, lstm_fc_weights, lstm_fc_bias); + std::pair new_state_2 = + add_lstm_cell(unstack_nid, 1, new_state_1.first, new_state_1.second, add_y, lstm_fc_weights, lstm_fc_bias); + std::pair new_state_3 = + add_lstm_cell(unstack_nid, 2, new_state_2.first, new_state_2.second, add_y, lstm_fc_weights, lstm_fc_bias); + std::pair new_state_4 = + add_lstm_cell(unstack_nid, 3, new_state_3.first, new_state_3.second, add_y, lstm_fc_weights, lstm_fc_bias); + std::pair new_state_5 = + add_lstm_cell(unstack_nid, 4, new_state_4.first, new_state_4.second, add_y, lstm_fc_weights, lstm_fc_bias); + std::pair new_state_6 
= + add_lstm_cell(unstack_nid, 5, new_state_5.first, new_state_5.second, add_y, lstm_fc_weights, lstm_fc_bias); + std::pair new_state_7 = + add_lstm_cell(unstack_nid, 6, new_state_6.first, new_state_6.second, add_y, lstm_fc_weights, lstm_fc_bias); + std::pair new_state_8 = + add_lstm_cell(unstack_nid, 7, new_state_7.first, new_state_7.second, add_y, lstm_fc_weights, lstm_fc_bias); + std::pair new_state_9 = + add_lstm_cell(unstack_nid, 8, new_state_8.first, new_state_8.second, add_y, lstm_fc_weights, lstm_fc_bias); + std::pair new_state_10 = + add_lstm_cell(unstack_nid, 9, new_state_9.first, new_state_9.second, add_y, lstm_fc_weights, lstm_fc_bias); + std::pair new_state_11 = add_lstm_cell( + unstack_nid, 10, new_state_10.first, new_state_10.second, add_y, lstm_fc_weights, lstm_fc_bias); + std::pair new_state_12 = add_lstm_cell( + unstack_nid, 11, new_state_11.first, new_state_11.second, add_y, lstm_fc_weights, lstm_fc_bias); + std::pair new_state_13 = add_lstm_cell( + unstack_nid, 12, new_state_12.first, new_state_12.second, add_y, lstm_fc_weights, lstm_fc_bias); + std::pair new_state_14 = add_lstm_cell( + unstack_nid, 13, new_state_13.first, new_state_13.second, add_y, lstm_fc_weights, lstm_fc_bias); + std::pair new_state_15 = add_lstm_cell( + unstack_nid, 14, new_state_14.first, new_state_14.second, add_y, lstm_fc_weights, lstm_fc_bias); + std::pair new_state_16 = add_lstm_cell( + unstack_nid, 15, new_state_15.first, new_state_15.second, add_y, lstm_fc_weights, lstm_fc_bias); // Concatenate new states on height const int axis = 1; - graph << StackLayer(axis, - std::move(new_state_1.second), - std::move(new_state_2.second), - std::move(new_state_3.second), - std::move(new_state_4.second), - std::move(new_state_5.second), - std::move(new_state_6.second), - std::move(new_state_7.second), - std::move(new_state_8.second), - std::move(new_state_9.second), - std::move(new_state_10.second), - std::move(new_state_11.second), - std::move(new_state_12.second), - std::move(new_state_13.second), - std::move(new_state_14.second), - std::move(new_state_15.second), - std::move(new_state_16.second)) - .set_name("concat"); - - graph << FullyConnectedLayer( - 2048U, - get_weights_accessor(data_path, "h5_transpose.npy", weights_layout), - get_weights_accessor(data_path, "MatMul_3_bias.npy")) - .set_name("fc3") + graph << StackLayer(axis, std::move(new_state_1.second), std::move(new_state_2.second), + std::move(new_state_3.second), std::move(new_state_4.second), std::move(new_state_5.second), + std::move(new_state_6.second), std::move(new_state_7.second), std::move(new_state_8.second), + std::move(new_state_9.second), std::move(new_state_10.second), + std::move(new_state_11.second), std::move(new_state_12.second), + std::move(new_state_13.second), std::move(new_state_14.second), + std::move(new_state_15.second), std::move(new_state_16.second)) + .set_name("concat"); + + graph << FullyConnectedLayer(2048U, get_weights_accessor(data_path, "h5_transpose.npy", weights_layout), + get_weights_accessor(data_path, "MatMul_3_bias.npy")) + .set_name("fc3") << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, cell_clip)) - .set_name("Relu3") - << FullyConnectedLayer( - 29U, - get_weights_accessor(data_path, "h6_transpose.npy", weights_layout), - get_weights_accessor(data_path, "MatMul_4_bias.npy")) - .set_name("fc3") + .set_name("Relu3") + << FullyConnectedLayer(29U, get_weights_accessor(data_path, "h6_transpose.npy", weights_layout), + get_weights_accessor(data_path, 
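// Illustrative sketch (not part of the patch): the sixteen new_state_N declarations above unroll one LSTM cell
// per time step. The same wiring could be expressed as a loop; std::pair<SubStream, SubStream> is an assumption
// for the pair type whose template arguments are not visible in this hunk, and n_steps is the example's step
// count. Each states[i].second then plays the role of new_state_<i+1>.second in the StackLayer concatenation.
std::vector<std::pair<SubStream, SubStream>> states;
states.reserve(n_steps);
// Initial call: state_c and state_h both come from the shared zero-initialised input
states.push_back(add_lstm_cell(unstack_nid, 0, previous_state, previous_state, add_y, lstm_fc_weights, lstm_fc_bias));
for (unsigned int step = 1; step < n_steps; ++step)
{
    states.push_back(add_lstm_cell(unstack_nid, step, states.back().first, states.back().second, add_y,
                                   lstm_fc_weights, lstm_fc_bias));
}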
"MatMul_4_bias.npy")) + .set_name("fc3") << SoftmaxLayer().set_name("logits"); graph << OutputLayer(get_output_accessor(common_params, 5)); @@ -241,7 +241,7 @@ private: return Status{}; } - std::pair add_lstm_cell(NodeID unstack_nid, + std::pair add_lstm_cell(NodeID unstack_nid, unsigned int unstack_idx, SubStream previous_state_c, SubStream previous_state_h, @@ -250,41 +250,41 @@ private: SubStream lstm_fc_bias) { const std::string cell_name("rnn/lstm_cell_" + std::to_string(unstack_idx)); - const DataLayoutDimension concat_dim = (common_params.data_layout == DataLayout::NHWC) ? DataLayoutDimension::CHANNEL : DataLayoutDimension::WIDTH; + const DataLayoutDimension concat_dim = + (common_params.data_layout == DataLayout::NHWC) ? DataLayoutDimension::CHANNEL : DataLayoutDimension::WIDTH; // Concatenate result of Unstack with previous_state_h - NodeParams concat_params = { cell_name + "/concat", graph.hints().target_hint }; + NodeParams concat_params = {cell_name + "/concat", graph.hints().target_hint}; NodeID concat_nid = graph.graph().add_node(2, concat_dim); graph.graph().add_connection(unstack_nid, unstack_idx, concat_nid, 0); graph.graph().add_connection(previous_state_h.tail_node(), 0, concat_nid, 1); set_node_params(graph.graph(), concat_nid, concat_params); graph.forward_tail(concat_nid); - graph << FullyConnectedLayer( - 8192U, - lstm_fc_weights, - lstm_fc_bias) - .set_name(cell_name + "/BiasAdd"); + graph << FullyConnectedLayer(8192U, lstm_fc_weights, lstm_fc_bias).set_name(cell_name + "/BiasAdd"); // Split Layer const unsigned int num_splits = 4; const unsigned int split_axis = 0; - NodeParams split_params = { cell_name + "/split", graph.hints().target_hint }; - NodeID split_nid = GraphBuilder::add_split_node(graph.graph(), split_params, { graph.tail_node(), 0 }, num_splits, split_axis); + NodeParams split_params = {cell_name + "/split", graph.hints().target_hint}; + NodeID split_nid = + GraphBuilder::add_split_node(graph.graph(), split_params, {graph.tail_node(), 0}, num_splits, split_axis); - NodeParams sigmoid_1_params = { cell_name + "/Sigmoid_1", graph.hints().target_hint }; - NodeParams add_params = { cell_name + "/add", graph.hints().target_hint }; - NodeParams sigmoid_2_params = { cell_name + "/Sigmoid_2", graph.hints().target_hint }; - NodeParams tanh_params = { cell_name + "/Tanh", graph.hints().target_hint }; + NodeParams sigmoid_1_params = {cell_name + "/Sigmoid_1", graph.hints().target_hint}; + NodeParams add_params = {cell_name + "/add", graph.hints().target_hint}; + NodeParams sigmoid_2_params = {cell_name + "/Sigmoid_2", graph.hints().target_hint}; + NodeParams tanh_params = {cell_name + "/Tanh", graph.hints().target_hint}; // Sigmoid 1 (first split) - NodeID sigmoid_1_nid = graph.graph().add_node(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + NodeID sigmoid_1_nid = graph.graph().add_node( + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); graph.graph().add_connection(split_nid, 0, sigmoid_1_nid, 0); set_node_params(graph.graph(), sigmoid_1_nid, sigmoid_1_params); // Tanh (second split) - NodeID tanh_nid = graph.graph().add_node(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); + NodeID tanh_nid = graph.graph().add_node( + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); graph.graph().add_connection(split_nid, 1, tanh_nid, 0); set_node_params(graph.graph(), tanh_nid, tanh_params); @@ -292,13 +292,15 @@ private: tanh_ss.forward_tail(tanh_nid); // Add (third 
split) - NodeID add_nid = graph.graph().add_node(descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add }); + NodeID add_nid = + graph.graph().add_node(descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add}); graph.graph().add_connection(split_nid, 2, add_nid, 0); graph.graph().add_connection(add_y.tail_node(), 0, add_nid, 1); set_node_params(graph.graph(), add_nid, add_params); // Sigmoid 2 (fourth split) - NodeID sigmoid_2_nid = graph.graph().add_node(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + NodeID sigmoid_2_nid = graph.graph().add_node( + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); graph.graph().add_connection(split_nid, 3, sigmoid_2_nid, 0); set_node_params(graph.graph(), sigmoid_2_nid, sigmoid_2_params); @@ -306,28 +308,28 @@ private: sigmoid_1_ss.forward_tail(sigmoid_1_nid); SubStream mul_1_ss(sigmoid_1_ss); mul_1_ss << EltwiseLayer(std::move(sigmoid_1_ss), std::move(tanh_ss), EltwiseOperation::Mul) - .set_name(cell_name + "/mul_1"); + .set_name(cell_name + "/mul_1"); SubStream tanh_1_ss_tmp(graph); tanh_1_ss_tmp.forward_tail(add_nid); tanh_1_ss_tmp << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)) - .set_name(cell_name + "/Sigmoid"); + .set_name(cell_name + "/Sigmoid"); SubStream tanh_1_ss_tmp2(tanh_1_ss_tmp); tanh_1_ss_tmp2 << EltwiseLayer(std::move(tanh_1_ss_tmp), std::move(previous_state_c), EltwiseOperation::Mul) - .set_name(cell_name + "/mul"); + .set_name(cell_name + "/mul"); SubStream tanh_1_ss(tanh_1_ss_tmp2); tanh_1_ss << EltwiseLayer(std::move(tanh_1_ss_tmp2), std::move(mul_1_ss), EltwiseOperation::Add) - .set_name(cell_name + "/new_state_c"); + .set_name(cell_name + "/new_state_c"); SubStream new_state_c(tanh_1_ss); tanh_1_ss << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)) - .set_name(cell_name + "/Tanh_1"); + .set_name(cell_name + "/Tanh_1"); SubStream sigmoid_2_ss(graph); sigmoid_2_ss.forward_tail(sigmoid_2_nid); graph << EltwiseLayer(std::move(sigmoid_2_ss), std::move(tanh_1_ss), EltwiseOperation::Mul) - .set_name(cell_name + "/new_state_h"); + .set_name(cell_name + "/new_state_h"); SubStream new_state_h(graph); return std::pair(new_state_c, new_state_h); diff --git a/examples/graph_edsr.cpp b/examples/graph_edsr.cpp index 0e41f1215561730c88f2b4353dd48bee05bd684b..b4f2fadf4aa67eb79b5967dfe624464ad3d5878c 100644 --- a/examples/graph_edsr.cpp +++ b/examples/graph_edsr.cpp @@ -22,28 +22,28 @@ * SOFTWARE. 
*/ +#include "graph_edsr.h" + #include "arm_compute/graph/Utils.h" #include "support/ToolchainSupport.h" #include "utils/CommonGraphOptions.h" #include "utils/Utils.h" -#include "graph_edsr.h" - using namespace arm_compute::graph; using namespace arm_compute::utils; class GraphEdsrExample : public Example { public: - GraphEdsrExample() - : cmd_parser(), common_opts(cmd_parser), common_params() + GraphEdsrExample() : cmd_parser(), common_opts(cmd_parser), common_params() { expected_output_filename = cmd_parser.add_option>("expected-output-filename", ""); - expected_output_filename->set_help("Name of npy file containing the expected output to validate the graph output."); + expected_output_filename->set_help( + "Name of npy file containing the expected output to validate the graph output."); } - GraphEdsrExample(const GraphEdsrExample &) = delete; + GraphEdsrExample(const GraphEdsrExample &) = delete; GraphEdsrExample &operator=(const GraphEdsrExample &) = delete; ~GraphEdsrExample() override = default; @@ -57,13 +57,14 @@ public: common_params = consume_common_graph_parameters(common_opts); // Return when help menu is requested - if(common_params.help) + if (common_params.help) { cmd_parser.print_help(argv[0]); return false; } - ARM_COMPUTE_EXIT_ON_MSG(common_params.data_type != DataType::QASYMM8, "Only QASYMM8 is supported for this graph example"); + ARM_COMPUTE_EXIT_ON_MSG(common_params.data_type != DataType::QASYMM8, + "Only QASYMM8 is supported for this graph example"); // Print parameter values std::cout << common_params << std::endl; @@ -98,7 +99,7 @@ private: GraphContext context{}; GraphManager manager{}; - SimpleOption *expected_output_filename{ nullptr }; + SimpleOption *expected_output_filename{nullptr}; GraphEdsr model{}; }; diff --git a/examples/graph_edsr.h b/examples/graph_edsr.h index 72012afdcb3cc44a5037319e9843bcf34082181f..1161e4ba3822fec676b4d6779988752aa0c6c904 100644 --- a/examples/graph_edsr.h +++ b/examples/graph_edsr.h @@ -32,12 +32,12 @@ class GraphEdsr { public: - GraphEdsr() - : _graph(0, "EDSR") + GraphEdsr() : _graph(0, "EDSR") { } - bool setup(const arm_compute::utils::CommonGraphParams &common_params, const arm_compute::utils::SimpleOption &expected_output_filename) + bool setup(const arm_compute::utils::CommonGraphParams &common_params, + const arm_compute::utils::SimpleOption &expected_output_filename) { using namespace arm_compute; using namespace arm_compute::graph; @@ -47,1221 +47,879 @@ public: const auto &data_path = common_params.data_path; const auto &target = common_params.target; - NodeID id_upscale_net_FakeQuantWithMinMaxVars_transposed = _graph.add_node( - TensorDescriptor - { - TensorShape{ 12, 2, 2, 3 }, - DataType::QASYMM8, - QuantizationInfo(0.00393533194437623, 1), - DataLayout::NHWC }); - INode *node_upscale_net_FakeQuantWithMinMaxVars_transposed = _graph.node(id_upscale_net_FakeQuantWithMinMaxVars_transposed); - node_upscale_net_FakeQuantWithMinMaxVars_transposed->set_common_node_parameters(NodeParams{ "upscale_net_FakeQuantWithMinMaxVars_transposed", target }); - node_upscale_net_FakeQuantWithMinMaxVars_transposed->output(0)->set_accessor(get_weights_accessor(data_path, - "/cnn_data/edsr_model/upscale_net_FakeQuantWithMinMaxVars_transposed.npy", DataLayout::NHWC)); - - NodeID id_pre_upscale_Conv2D_bias = _graph.add_node( - TensorDescriptor - { - TensorShape{ 12 }, - DataType::S32, - QuantizationInfo(2.9644968435604824e-06), - DataLayout::NHWC }); + NodeID id_upscale_net_FakeQuantWithMinMaxVars_transposed = 
_graph.add_node(TensorDescriptor{ + TensorShape{12, 2, 2, 3}, DataType::QASYMM8, QuantizationInfo(0.00393533194437623, 1), DataLayout::NHWC}); + INode *node_upscale_net_FakeQuantWithMinMaxVars_transposed = + _graph.node(id_upscale_net_FakeQuantWithMinMaxVars_transposed); + node_upscale_net_FakeQuantWithMinMaxVars_transposed->set_common_node_parameters( + NodeParams{"upscale_net_FakeQuantWithMinMaxVars_transposed", target}); + node_upscale_net_FakeQuantWithMinMaxVars_transposed->output(0)->set_accessor(get_weights_accessor( + data_path, "/cnn_data/edsr_model/upscale_net_FakeQuantWithMinMaxVars_transposed.npy", DataLayout::NHWC)); + + NodeID id_pre_upscale_Conv2D_bias = _graph.add_node(TensorDescriptor{ + TensorShape{12}, DataType::S32, QuantizationInfo(2.9644968435604824e-06), DataLayout::NHWC}); INode *node_pre_upscale_Conv2D_bias = _graph.node(id_pre_upscale_Conv2D_bias); - node_pre_upscale_Conv2D_bias->set_common_node_parameters(NodeParams{ "pre_upscale_Conv2D_bias", target }); - node_pre_upscale_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/pre_upscale_Conv2D_bias.npy", DataLayout::NHWC)); - - NodeID id_pre_upscale_FakeQuantWithMinMaxVars = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256, 3, 3, 12 }, - DataType::QASYMM8, - QuantizationInfo(0.000455576169770211, 128), - DataLayout::NHWC }); + node_pre_upscale_Conv2D_bias->set_common_node_parameters(NodeParams{"pre_upscale_Conv2D_bias", target}); + node_pre_upscale_Conv2D_bias->output(0)->set_accessor( + get_weights_accessor(data_path, "/cnn_data/edsr_model/pre_upscale_Conv2D_bias.npy", DataLayout::NHWC)); + + NodeID id_pre_upscale_FakeQuantWithMinMaxVars = + _graph.add_node(TensorDescriptor{TensorShape{256, 3, 3, 12}, DataType::QASYMM8, + QuantizationInfo(0.000455576169770211, 128), DataLayout::NHWC}); INode *node_pre_upscale_FakeQuantWithMinMaxVars = _graph.node(id_pre_upscale_FakeQuantWithMinMaxVars); - node_pre_upscale_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "pre_upscale_FakeQuantWithMinMaxVars", target }); - node_pre_upscale_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/pre_upscale_FakeQuantWithMinMaxVars.npy", - DataLayout::NHWC)); - - NodeID id_post_residual_Conv2D_bias = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256 }, - DataType::S32, - QuantizationInfo(1.2760000345224398e-06), - DataLayout::NHWC }); + node_pre_upscale_FakeQuantWithMinMaxVars->set_common_node_parameters( + NodeParams{"pre_upscale_FakeQuantWithMinMaxVars", target}); + node_pre_upscale_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor( + data_path, "/cnn_data/edsr_model/pre_upscale_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC)); + + NodeID id_post_residual_Conv2D_bias = _graph.add_node(TensorDescriptor{ + TensorShape{256}, DataType::S32, QuantizationInfo(1.2760000345224398e-06), DataLayout::NHWC}); INode *node_post_residual_Conv2D_bias = _graph.node(id_post_residual_Conv2D_bias); - node_post_residual_Conv2D_bias->set_common_node_parameters(NodeParams{ "post_residual_Conv2D_bias", target }); - node_post_residual_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/post_residual_Conv2D_bias.npy", DataLayout::NHWC)); + node_post_residual_Conv2D_bias->set_common_node_parameters(NodeParams{"post_residual_Conv2D_bias", target}); + node_post_residual_Conv2D_bias->output(0)->set_accessor( + get_weights_accessor(data_path, 
"/cnn_data/edsr_model/post_residual_Conv2D_bias.npy", DataLayout::NHWC)); NodeID id_post_residual_FakeQuantWithMinMaxVars = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256, 3, 3, 256 }, - DataType::QASYMM8, - QuantizationInfo(0.00036424631252884865, 129), - DataLayout::NHWC }); + TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8, + QuantizationInfo(0.00036424631252884865, 129), DataLayout::NHWC}); INode *node_post_residual_FakeQuantWithMinMaxVars = _graph.node(id_post_residual_FakeQuantWithMinMaxVars); - node_post_residual_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "post_residual_FakeQuantWithMinMaxVars", target }); - node_post_residual_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/post_residual_FakeQuantWithMinMaxVars.npy", - DataLayout::NHWC)); - - NodeID id_mul_15_y = _graph.add_node( - TensorDescriptor - { - TensorShape{ 1 }, - DataType::QASYMM8, - QuantizationInfo(0.0003921568568330258), - DataLayout::NHWC }); + node_post_residual_FakeQuantWithMinMaxVars->set_common_node_parameters( + NodeParams{"post_residual_FakeQuantWithMinMaxVars", target}); + node_post_residual_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor( + data_path, "/cnn_data/edsr_model/post_residual_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC)); + + NodeID id_mul_15_y = _graph.add_node(TensorDescriptor{ + TensorShape{1}, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC}); INode *node_mul_15_y = _graph.node(id_mul_15_y); - node_mul_15_y->set_common_node_parameters(NodeParams{ "mul_15_y", target }); - node_mul_15_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_15_y.npy", DataLayout::NHWC)); - - NodeID id_block_15_1_Conv2D_bias = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256 }, - DataType::S32, - QuantizationInfo(1.2441644230420934e-06), - DataLayout::NHWC }); + node_mul_15_y->set_common_node_parameters(NodeParams{"mul_15_y", target}); + node_mul_15_y->output(0)->set_accessor( + get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_15_y.npy", DataLayout::NHWC)); + + NodeID id_block_15_1_Conv2D_bias = _graph.add_node(TensorDescriptor{ + TensorShape{256}, DataType::S32, QuantizationInfo(1.2441644230420934e-06), DataLayout::NHWC}); INode *node_block_15_1_Conv2D_bias = _graph.node(id_block_15_1_Conv2D_bias); - node_block_15_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_15_1_Conv2D_bias", target }); - node_block_15_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_15_1_Conv2D_bias.npy", DataLayout::NHWC)); + node_block_15_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_15_1_Conv2D_bias", target}); + node_block_15_1_Conv2D_bias->output(0)->set_accessor( + get_weights_accessor(data_path, "/cnn_data/edsr_model/block_15_1_Conv2D_bias.npy", DataLayout::NHWC)); NodeID id_block_15_1_FakeQuantWithMinMaxVars = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256, 3, 3, 256 }, - DataType::QASYMM8, - QuantizationInfo(0.00037038681330159307, 125), - DataLayout::NHWC }); + TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8, + QuantizationInfo(0.00037038681330159307, 125), DataLayout::NHWC}); INode *node_block_15_1_FakeQuantWithMinMaxVars = _graph.node(id_block_15_1_FakeQuantWithMinMaxVars); - node_block_15_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "block_15_1_FakeQuantWithMinMaxVars", target }); - 
node_block_15_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_15_1_FakeQuantWithMinMaxVars.npy", - DataLayout::NHWC)); - - NodeID id_mul_14_y = _graph.add_node( - TensorDescriptor - { - TensorShape{ 1 }, - DataType::QASYMM8, - QuantizationInfo(0.0003921568568330258), - DataLayout::NHWC }); + node_block_15_1_FakeQuantWithMinMaxVars->set_common_node_parameters( + NodeParams{"block_15_1_FakeQuantWithMinMaxVars", target}); + node_block_15_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor( + data_path, "/cnn_data/edsr_model/block_15_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC)); + + NodeID id_mul_14_y = _graph.add_node(TensorDescriptor{ + TensorShape{1}, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC}); INode *node_mul_14_y = _graph.node(id_mul_14_y); - node_mul_14_y->set_common_node_parameters(NodeParams{ "mul_14_y", target }); - node_mul_14_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_14_y.npy", DataLayout::NHWC)); - - NodeID id_block_14_1_Conv2D_bias = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256 }, - DataType::S32, - QuantizationInfo(1.3417260333881131e-06), - DataLayout::NHWC }); + node_mul_14_y->set_common_node_parameters(NodeParams{"mul_14_y", target}); + node_mul_14_y->output(0)->set_accessor( + get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_14_y.npy", DataLayout::NHWC)); + + NodeID id_block_14_1_Conv2D_bias = _graph.add_node(TensorDescriptor{ + TensorShape{256}, DataType::S32, QuantizationInfo(1.3417260333881131e-06), DataLayout::NHWC}); INode *node_block_14_1_Conv2D_bias = _graph.node(id_block_14_1_Conv2D_bias); - node_block_14_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_14_1_Conv2D_bias", target }); - node_block_14_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_14_1_Conv2D_bias.npy", DataLayout::NHWC)); + node_block_14_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_14_1_Conv2D_bias", target}); + node_block_14_1_Conv2D_bias->output(0)->set_accessor( + get_weights_accessor(data_path, "/cnn_data/edsr_model/block_14_1_Conv2D_bias.npy", DataLayout::NHWC)); NodeID id_block_14_1_FakeQuantWithMinMaxVars = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256, 3, 3, 256 }, - DataType::QASYMM8, - QuantizationInfo(0.00040307495510205626, 127), - DataLayout::NHWC }); + TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8, + QuantizationInfo(0.00040307495510205626, 127), DataLayout::NHWC}); INode *node_block_14_1_FakeQuantWithMinMaxVars = _graph.node(id_block_14_1_FakeQuantWithMinMaxVars); - node_block_14_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "block_14_1_FakeQuantWithMinMaxVars", target }); - node_block_14_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_14_1_FakeQuantWithMinMaxVars.npy", - DataLayout::NHWC)); - - NodeID id_mul_13_y = _graph.add_node( - TensorDescriptor - { - TensorShape{ 1 }, - DataType::QASYMM8, - QuantizationInfo(0.0003921568568330258), - DataLayout::NHWC }); + node_block_14_1_FakeQuantWithMinMaxVars->set_common_node_parameters( + NodeParams{"block_14_1_FakeQuantWithMinMaxVars", target}); + node_block_14_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor( + data_path, "/cnn_data/edsr_model/block_14_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC)); + + NodeID id_mul_13_y = 
_graph.add_node(TensorDescriptor{ + TensorShape{1}, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC}); INode *node_mul_13_y = _graph.node(id_mul_13_y); - node_mul_13_y->set_common_node_parameters(NodeParams{ "mul_13_y", target }); - node_mul_13_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_13_y.npy", DataLayout::NHWC)); - - NodeID id_block_13_1_Conv2D_bias = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256 }, - DataType::S32, - QuantizationInfo(1.2636977544389083e-06), - DataLayout::NHWC }); + node_mul_13_y->set_common_node_parameters(NodeParams{"mul_13_y", target}); + node_mul_13_y->output(0)->set_accessor( + get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_13_y.npy", DataLayout::NHWC)); + + NodeID id_block_13_1_Conv2D_bias = _graph.add_node(TensorDescriptor{ + TensorShape{256}, DataType::S32, QuantizationInfo(1.2636977544389083e-06), DataLayout::NHWC}); INode *node_block_13_1_Conv2D_bias = _graph.node(id_block_13_1_Conv2D_bias); - node_block_13_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_13_1_Conv2D_bias", target }); - node_block_13_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_13_1_Conv2D_bias.npy", DataLayout::NHWC)); + node_block_13_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_13_1_Conv2D_bias", target}); + node_block_13_1_Conv2D_bias->output(0)->set_accessor( + get_weights_accessor(data_path, "/cnn_data/edsr_model/block_13_1_Conv2D_bias.npy", DataLayout::NHWC)); NodeID id_block_13_1_FakeQuantWithMinMaxVars = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256, 3, 3, 256 }, - DataType::QASYMM8, - QuantizationInfo(0.0003858553245663643, 131), - DataLayout::NHWC }); + TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8, + QuantizationInfo(0.0003858553245663643, 131), DataLayout::NHWC}); INode *node_block_13_1_FakeQuantWithMinMaxVars = _graph.node(id_block_13_1_FakeQuantWithMinMaxVars); - node_block_13_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "block_13_1_FakeQuantWithMinMaxVars", target }); - node_block_13_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_13_1_FakeQuantWithMinMaxVars.npy", - DataLayout::NHWC)); - - NodeID id_mul_12_y = _graph.add_node( - TensorDescriptor - { - TensorShape{ 1 }, - DataType::QASYMM8, - QuantizationInfo(0.0003921568568330258), - DataLayout::NHWC }); + node_block_13_1_FakeQuantWithMinMaxVars->set_common_node_parameters( + NodeParams{"block_13_1_FakeQuantWithMinMaxVars", target}); + node_block_13_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor( + data_path, "/cnn_data/edsr_model/block_13_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC)); + + NodeID id_mul_12_y = _graph.add_node(TensorDescriptor{ + TensorShape{1}, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC}); INode *node_mul_12_y = _graph.node(id_mul_12_y); - node_mul_12_y->set_common_node_parameters(NodeParams{ "mul_12_y", target }); - node_mul_12_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_12_y.npy", DataLayout::NHWC)); - - NodeID id_block_12_1_Conv2D_bias = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256 }, - DataType::S32, - QuantizationInfo(1.3479783547154511e-06), - DataLayout::NHWC }); + node_mul_12_y->set_common_node_parameters(NodeParams{"mul_12_y", target}); + node_mul_12_y->output(0)->set_accessor( + 
get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_12_y.npy", DataLayout::NHWC)); + + NodeID id_block_12_1_Conv2D_bias = _graph.add_node(TensorDescriptor{ + TensorShape{256}, DataType::S32, QuantizationInfo(1.3479783547154511e-06), DataLayout::NHWC}); INode *node_block_12_1_Conv2D_bias = _graph.node(id_block_12_1_Conv2D_bias); - node_block_12_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_12_1_Conv2D_bias", target }); - node_block_12_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_12_1_Conv2D_bias.npy", DataLayout::NHWC)); + node_block_12_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_12_1_Conv2D_bias", target}); + node_block_12_1_Conv2D_bias->output(0)->set_accessor( + get_weights_accessor(data_path, "/cnn_data/edsr_model/block_12_1_Conv2D_bias.npy", DataLayout::NHWC)); NodeID id_block_12_1_FakeQuantWithMinMaxVars = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256, 3, 3, 256 }, - DataType::QASYMM8, - QuantizationInfo(0.00041212860378436744, 130), - DataLayout::NHWC }); + TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8, + QuantizationInfo(0.00041212860378436744, 130), DataLayout::NHWC}); INode *node_block_12_1_FakeQuantWithMinMaxVars = _graph.node(id_block_12_1_FakeQuantWithMinMaxVars); - node_block_12_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "block_12_1_FakeQuantWithMinMaxVars", target }); - node_block_12_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_12_1_FakeQuantWithMinMaxVars.npy", - DataLayout::NHWC)); - - NodeID id_mul_11_y = _graph.add_node( - TensorDescriptor - { - TensorShape{ 1 }, - DataType::QASYMM8, - QuantizationInfo(0.0003921568568330258), - DataLayout::NHWC }); + node_block_12_1_FakeQuantWithMinMaxVars->set_common_node_parameters( + NodeParams{"block_12_1_FakeQuantWithMinMaxVars", target}); + node_block_12_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor( + data_path, "/cnn_data/edsr_model/block_12_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC)); + + NodeID id_mul_11_y = _graph.add_node(TensorDescriptor{ + TensorShape{1}, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC}); INode *node_mul_11_y = _graph.node(id_mul_11_y); - node_mul_11_y->set_common_node_parameters(NodeParams{ "mul_11_y", target }); - node_mul_11_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_11_y.npy", DataLayout::NHWC)); - - NodeID id_block_11_1_Conv2D_bias = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256 }, - DataType::S32, - QuantizationInfo(1.2847248171965475e-06), - DataLayout::NHWC }); + node_mul_11_y->set_common_node_parameters(NodeParams{"mul_11_y", target}); + node_mul_11_y->output(0)->set_accessor( + get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_11_y.npy", DataLayout::NHWC)); + + NodeID id_block_11_1_Conv2D_bias = _graph.add_node(TensorDescriptor{ + TensorShape{256}, DataType::S32, QuantizationInfo(1.2847248171965475e-06), DataLayout::NHWC}); INode *node_block_11_1_Conv2D_bias = _graph.node(id_block_11_1_Conv2D_bias); - node_block_11_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_11_1_Conv2D_bias", target }); - node_block_11_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_11_1_Conv2D_bias.npy", DataLayout::NHWC)); + node_block_11_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_11_1_Conv2D_bias", target}); + 
node_block_11_1_Conv2D_bias->output(0)->set_accessor( + get_weights_accessor(data_path, "/cnn_data/edsr_model/block_11_1_Conv2D_bias.npy", DataLayout::NHWC)); NodeID id_block_11_1_FakeQuantWithMinMaxVars = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256, 3, 3, 256 }, - DataType::QASYMM8, - QuantizationInfo(0.00040296532097272575, 131), - DataLayout::NHWC }); + TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8, + QuantizationInfo(0.00040296532097272575, 131), DataLayout::NHWC}); INode *node_block_11_1_FakeQuantWithMinMaxVars = _graph.node(id_block_11_1_FakeQuantWithMinMaxVars); - node_block_11_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "block_11_1_FakeQuantWithMinMaxVars", target }); - node_block_11_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_11_1_FakeQuantWithMinMaxVars.npy", - DataLayout::NHWC)); - - NodeID id_mul_10_y = _graph.add_node( - TensorDescriptor - { - TensorShape{ 1 }, - DataType::QASYMM8, - QuantizationInfo(0.0003921568568330258), - DataLayout::NHWC }); + node_block_11_1_FakeQuantWithMinMaxVars->set_common_node_parameters( + NodeParams{"block_11_1_FakeQuantWithMinMaxVars", target}); + node_block_11_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor( + data_path, "/cnn_data/edsr_model/block_11_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC)); + + NodeID id_mul_10_y = _graph.add_node(TensorDescriptor{ + TensorShape{1}, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC}); INode *node_mul_10_y = _graph.node(id_mul_10_y); - node_mul_10_y->set_common_node_parameters(NodeParams{ "mul_10_y", target }); - node_mul_10_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_10_y.npy", DataLayout::NHWC)); - - NodeID id_block_10_1_Conv2D_bias = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256 }, - DataType::S32, - QuantizationInfo(1.1997129831797793e-06), - DataLayout::NHWC }); + node_mul_10_y->set_common_node_parameters(NodeParams{"mul_10_y", target}); + node_mul_10_y->output(0)->set_accessor( + get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_10_y.npy", DataLayout::NHWC)); + + NodeID id_block_10_1_Conv2D_bias = _graph.add_node(TensorDescriptor{ + TensorShape{256}, DataType::S32, QuantizationInfo(1.1997129831797793e-06), DataLayout::NHWC}); INode *node_block_10_1_Conv2D_bias = _graph.node(id_block_10_1_Conv2D_bias); - node_block_10_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_10_1_Conv2D_bias", target }); - node_block_10_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_10_1_Conv2D_bias.npy", DataLayout::NHWC)); + node_block_10_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_10_1_Conv2D_bias", target}); + node_block_10_1_Conv2D_bias->output(0)->set_accessor( + get_weights_accessor(data_path, "/cnn_data/edsr_model/block_10_1_Conv2D_bias.npy", DataLayout::NHWC)); NodeID id_block_10_1_FakeQuantWithMinMaxVars = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256, 3, 3, 256 }, - DataType::QASYMM8, - QuantizationInfo(0.00036640543839894235, 129), - DataLayout::NHWC }); + TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8, + QuantizationInfo(0.00036640543839894235, 129), DataLayout::NHWC}); INode *node_block_10_1_FakeQuantWithMinMaxVars = _graph.node(id_block_10_1_FakeQuantWithMinMaxVars); - node_block_10_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ 
"block_10_1_FakeQuantWithMinMaxVars", target }); - node_block_10_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_10_1_FakeQuantWithMinMaxVars.npy", - DataLayout::NHWC)); - - NodeID id_mul_9_y = _graph.add_node( - TensorDescriptor - { - TensorShape{ 1 }, - DataType::QASYMM8, - QuantizationInfo(0.0003921568568330258), - DataLayout::NHWC }); + node_block_10_1_FakeQuantWithMinMaxVars->set_common_node_parameters( + NodeParams{"block_10_1_FakeQuantWithMinMaxVars", target}); + node_block_10_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor( + data_path, "/cnn_data/edsr_model/block_10_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC)); + + NodeID id_mul_9_y = _graph.add_node(TensorDescriptor{ + TensorShape{1}, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC}); INode *node_mul_9_y = _graph.node(id_mul_9_y); - node_mul_9_y->set_common_node_parameters(NodeParams{ "mul_9_y", target }); - node_mul_9_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_9_y.npy", DataLayout::NHWC)); - - NodeID id_block_9_1_Conv2D_bias = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256 }, - DataType::S32, - QuantizationInfo(1.1920226370421005e-06), - DataLayout::NHWC }); + node_mul_9_y->set_common_node_parameters(NodeParams{"mul_9_y", target}); + node_mul_9_y->output(0)->set_accessor( + get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_9_y.npy", DataLayout::NHWC)); + + NodeID id_block_9_1_Conv2D_bias = _graph.add_node(TensorDescriptor{ + TensorShape{256}, DataType::S32, QuantizationInfo(1.1920226370421005e-06), DataLayout::NHWC}); INode *node_block_9_1_Conv2D_bias = _graph.node(id_block_9_1_Conv2D_bias); - node_block_9_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_9_1_Conv2D_bias", target }); - node_block_9_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_9_1_Conv2D_bias.npy", DataLayout::NHWC)); + node_block_9_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_9_1_Conv2D_bias", target}); + node_block_9_1_Conv2D_bias->output(0)->set_accessor( + get_weights_accessor(data_path, "/cnn_data/edsr_model/block_9_1_Conv2D_bias.npy", DataLayout::NHWC)); NodeID id_block_9_1_FakeQuantWithMinMaxVars = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256, 3, 3, 256 }, - DataType::QASYMM8, - QuantizationInfo(0.0003706997958943248, 129), - DataLayout::NHWC }); + TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8, + QuantizationInfo(0.0003706997958943248, 129), DataLayout::NHWC}); INode *node_block_9_1_FakeQuantWithMinMaxVars = _graph.node(id_block_9_1_FakeQuantWithMinMaxVars); - node_block_9_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "block_9_1_FakeQuantWithMinMaxVars", target }); - node_block_9_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_9_1_FakeQuantWithMinMaxVars.npy", - DataLayout::NHWC)); - - NodeID id_mul_8_y = _graph.add_node( - TensorDescriptor - { - TensorShape{ 1 }, - DataType::QASYMM8, - QuantizationInfo(0.0003921568568330258), - DataLayout::NHWC }); + node_block_9_1_FakeQuantWithMinMaxVars->set_common_node_parameters( + NodeParams{"block_9_1_FakeQuantWithMinMaxVars", target}); + node_block_9_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor( + data_path, "/cnn_data/edsr_model/block_9_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC)); + + NodeID id_mul_8_y = 
_graph.add_node(TensorDescriptor{ + TensorShape{1}, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC}); INode *node_mul_8_y = _graph.node(id_mul_8_y); - node_mul_8_y->set_common_node_parameters(NodeParams{ "mul_8_y", target }); - node_mul_8_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_8_y.npy", DataLayout::NHWC)); - - NodeID id_block_8_1_Conv2D_bias = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256 }, - DataType::S32, - QuantizationInfo(1.218903321387188e-06), - DataLayout::NHWC }); + node_mul_8_y->set_common_node_parameters(NodeParams{"mul_8_y", target}); + node_mul_8_y->output(0)->set_accessor( + get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_8_y.npy", DataLayout::NHWC)); + + NodeID id_block_8_1_Conv2D_bias = _graph.add_node(TensorDescriptor{ + TensorShape{256}, DataType::S32, QuantizationInfo(1.218903321387188e-06), DataLayout::NHWC}); INode *node_block_8_1_Conv2D_bias = _graph.node(id_block_8_1_Conv2D_bias); - node_block_8_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_8_1_Conv2D_bias", target }); - node_block_8_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_8_1_Conv2D_bias.npy", DataLayout::NHWC)); + node_block_8_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_8_1_Conv2D_bias", target}); + node_block_8_1_Conv2D_bias->output(0)->set_accessor( + get_weights_accessor(data_path, "/cnn_data/edsr_model/block_8_1_Conv2D_bias.npy", DataLayout::NHWC)); NodeID id_block_8_1_FakeQuantWithMinMaxVars = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256, 3, 3, 256 }, - DataType::QASYMM8, - QuantizationInfo(0.00038377835880964994, 127), - DataLayout::NHWC }); + TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8, + QuantizationInfo(0.00038377835880964994, 127), DataLayout::NHWC}); INode *node_block_8_1_FakeQuantWithMinMaxVars = _graph.node(id_block_8_1_FakeQuantWithMinMaxVars); - node_block_8_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "block_8_1_FakeQuantWithMinMaxVars", target }); - node_block_8_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_8_1_FakeQuantWithMinMaxVars.npy", - DataLayout::NHWC)); - - NodeID id_mul_7_y = _graph.add_node( - TensorDescriptor - { - TensorShape{ 1 }, - DataType::QASYMM8, - QuantizationInfo(0.0003921568568330258), - DataLayout::NHWC }); + node_block_8_1_FakeQuantWithMinMaxVars->set_common_node_parameters( + NodeParams{"block_8_1_FakeQuantWithMinMaxVars", target}); + node_block_8_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor( + data_path, "/cnn_data/edsr_model/block_8_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC)); + + NodeID id_mul_7_y = _graph.add_node(TensorDescriptor{ + TensorShape{1}, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC}); INode *node_mul_7_y = _graph.node(id_mul_7_y); - node_mul_7_y->set_common_node_parameters(NodeParams{ "mul_7_y", target }); - node_mul_7_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_7_y.npy", DataLayout::NHWC)); - - NodeID id_block_7_1_Conv2D_bias = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256 }, - DataType::S32, - QuantizationInfo(1.257252392861119e-06), - DataLayout::NHWC }); + node_mul_7_y->set_common_node_parameters(NodeParams{"mul_7_y", target}); + node_mul_7_y->output(0)->set_accessor( + get_weights_accessor(data_path, 
"/cnn_data/edsr_model/mul_7_y.npy", DataLayout::NHWC)); + + NodeID id_block_7_1_Conv2D_bias = _graph.add_node(TensorDescriptor{ + TensorShape{256}, DataType::S32, QuantizationInfo(1.257252392861119e-06), DataLayout::NHWC}); INode *node_block_7_1_Conv2D_bias = _graph.node(id_block_7_1_Conv2D_bias); - node_block_7_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_7_1_Conv2D_bias", target }); - node_block_7_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_7_1_Conv2D_bias.npy", DataLayout::NHWC)); + node_block_7_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_7_1_Conv2D_bias", target}); + node_block_7_1_Conv2D_bias->output(0)->set_accessor( + get_weights_accessor(data_path, "/cnn_data/edsr_model/block_7_1_Conv2D_bias.npy", DataLayout::NHWC)); NodeID id_block_7_1_FakeQuantWithMinMaxVars = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256, 3, 3, 256 }, - DataType::QASYMM8, - QuantizationInfo(0.00039844686398282647, 129), - DataLayout::NHWC }); + TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8, + QuantizationInfo(0.00039844686398282647, 129), DataLayout::NHWC}); INode *node_block_7_1_FakeQuantWithMinMaxVars = _graph.node(id_block_7_1_FakeQuantWithMinMaxVars); - node_block_7_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "block_7_1_FakeQuantWithMinMaxVars", target }); - node_block_7_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_7_1_FakeQuantWithMinMaxVars.npy", - DataLayout::NHWC)); - - NodeID id_mul_6_y = _graph.add_node( - TensorDescriptor - { - TensorShape{ 1 }, - DataType::QASYMM8, - QuantizationInfo(0.0003921568568330258), - DataLayout::NHWC }); + node_block_7_1_FakeQuantWithMinMaxVars->set_common_node_parameters( + NodeParams{"block_7_1_FakeQuantWithMinMaxVars", target}); + node_block_7_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor( + data_path, "/cnn_data/edsr_model/block_7_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC)); + + NodeID id_mul_6_y = _graph.add_node(TensorDescriptor{ + TensorShape{1}, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC}); INode *node_mul_6_y = _graph.node(id_mul_6_y); - node_mul_6_y->set_common_node_parameters(NodeParams{ "mul_6_y", target }); - node_mul_6_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_6_y.npy", DataLayout::NHWC)); - - NodeID id_block_6_1_Conv2D_bias = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256 }, - DataType::S32, - QuantizationInfo(1.244850636794581e-06), - DataLayout::NHWC }); + node_mul_6_y->set_common_node_parameters(NodeParams{"mul_6_y", target}); + node_mul_6_y->output(0)->set_accessor( + get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_6_y.npy", DataLayout::NHWC)); + + NodeID id_block_6_1_Conv2D_bias = _graph.add_node(TensorDescriptor{ + TensorShape{256}, DataType::S32, QuantizationInfo(1.244850636794581e-06), DataLayout::NHWC}); INode *node_block_6_1_Conv2D_bias = _graph.node(id_block_6_1_Conv2D_bias); - node_block_6_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_6_1_Conv2D_bias", target }); - node_block_6_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_6_1_Conv2D_bias.npy", DataLayout::NHWC)); + node_block_6_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_6_1_Conv2D_bias", target}); + node_block_6_1_Conv2D_bias->output(0)->set_accessor( + 
get_weights_accessor(data_path, "/cnn_data/edsr_model/block_6_1_Conv2D_bias.npy", DataLayout::NHWC)); NodeID id_block_6_1_FakeQuantWithMinMaxVars = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256, 3, 3, 256 }, - DataType::QASYMM8, - QuantizationInfo(0.00040187727427110076, 132), - DataLayout::NHWC }); + TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8, + QuantizationInfo(0.00040187727427110076, 132), DataLayout::NHWC}); INode *node_block_6_1_FakeQuantWithMinMaxVars = _graph.node(id_block_6_1_FakeQuantWithMinMaxVars); - node_block_6_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "block_6_1_FakeQuantWithMinMaxVars", target }); - node_block_6_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_6_1_FakeQuantWithMinMaxVars.npy", - DataLayout::NHWC)); - - NodeID id_mul_5_y = _graph.add_node( - TensorDescriptor - { - TensorShape{ 1 }, - DataType::QASYMM8, - QuantizationInfo(0.0003921568568330258), - DataLayout::NHWC }); + node_block_6_1_FakeQuantWithMinMaxVars->set_common_node_parameters( + NodeParams{"block_6_1_FakeQuantWithMinMaxVars", target}); + node_block_6_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor( + data_path, "/cnn_data/edsr_model/block_6_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC)); + + NodeID id_mul_5_y = _graph.add_node(TensorDescriptor{ + TensorShape{1}, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC}); INode *node_mul_5_y = _graph.node(id_mul_5_y); - node_mul_5_y->set_common_node_parameters(NodeParams{ "mul_5_y", target }); - node_mul_5_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_5_y.npy", DataLayout::NHWC)); - - NodeID id_block_5_1_Conv2D_bias = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256 }, - DataType::S32, - QuantizationInfo(1.241092718373693e-06), - DataLayout::NHWC }); + node_mul_5_y->set_common_node_parameters(NodeParams{"mul_5_y", target}); + node_mul_5_y->output(0)->set_accessor( + get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_5_y.npy", DataLayout::NHWC)); + + NodeID id_block_5_1_Conv2D_bias = _graph.add_node(TensorDescriptor{ + TensorShape{256}, DataType::S32, QuantizationInfo(1.241092718373693e-06), DataLayout::NHWC}); INode *node_block_5_1_Conv2D_bias = _graph.node(id_block_5_1_Conv2D_bias); - node_block_5_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_5_1_Conv2D_bias", target }); - node_block_5_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_5_1_Conv2D_bias.npy", DataLayout::NHWC)); + node_block_5_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_5_1_Conv2D_bias", target}); + node_block_5_1_Conv2D_bias->output(0)->set_accessor( + get_weights_accessor(data_path, "/cnn_data/edsr_model/block_5_1_Conv2D_bias.npy", DataLayout::NHWC)); NodeID id_block_5_1_FakeQuantWithMinMaxVars = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256, 3, 3, 256 }, - DataType::QASYMM8, - QuantizationInfo(0.0003938926674891263, 129), - DataLayout::NHWC }); + TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8, + QuantizationInfo(0.0003938926674891263, 129), DataLayout::NHWC}); INode *node_block_5_1_FakeQuantWithMinMaxVars = _graph.node(id_block_5_1_FakeQuantWithMinMaxVars); - node_block_5_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "block_5_1_FakeQuantWithMinMaxVars", target }); - 
node_block_5_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_5_1_FakeQuantWithMinMaxVars.npy", - DataLayout::NHWC)); - - NodeID id_mul_4_y = _graph.add_node( - TensorDescriptor - { - TensorShape{ 1 }, - DataType::QASYMM8, - QuantizationInfo(0.0003921568568330258), - DataLayout::NHWC }); + node_block_5_1_FakeQuantWithMinMaxVars->set_common_node_parameters( + NodeParams{"block_5_1_FakeQuantWithMinMaxVars", target}); + node_block_5_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor( + data_path, "/cnn_data/edsr_model/block_5_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC)); + + NodeID id_mul_4_y = _graph.add_node(TensorDescriptor{ + TensorShape{1}, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC}); INode *node_mul_4_y = _graph.node(id_mul_4_y); - node_mul_4_y->set_common_node_parameters(NodeParams{ "mul_4_y", target }); - node_mul_4_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_4_y.npy", DataLayout::NHWC)); - - NodeID id_block_4_1_Conv2D_bias = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256 }, - DataType::S32, - QuantizationInfo(1.1748390988941537e-06), - DataLayout::NHWC }); + node_mul_4_y->set_common_node_parameters(NodeParams{"mul_4_y", target}); + node_mul_4_y->output(0)->set_accessor( + get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_4_y.npy", DataLayout::NHWC)); + + NodeID id_block_4_1_Conv2D_bias = _graph.add_node(TensorDescriptor{ + TensorShape{256}, DataType::S32, QuantizationInfo(1.1748390988941537e-06), DataLayout::NHWC}); INode *node_block_4_1_Conv2D_bias = _graph.node(id_block_4_1_Conv2D_bias); - node_block_4_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_4_1_Conv2D_bias", target }); - node_block_4_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_4_1_Conv2D_bias.npy", DataLayout::NHWC)); + node_block_4_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_4_1_Conv2D_bias", target}); + node_block_4_1_Conv2D_bias->output(0)->set_accessor( + get_weights_accessor(data_path, "/cnn_data/edsr_model/block_4_1_Conv2D_bias.npy", DataLayout::NHWC)); NodeID id_block_4_1_FakeQuantWithMinMaxVars = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256, 3, 3, 256 }, - DataType::QASYMM8, - QuantizationInfo(0.0003788181929849088, 129), - DataLayout::NHWC }); + TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8, + QuantizationInfo(0.0003788181929849088, 129), DataLayout::NHWC}); INode *node_block_4_1_FakeQuantWithMinMaxVars = _graph.node(id_block_4_1_FakeQuantWithMinMaxVars); - node_block_4_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "block_4_1_FakeQuantWithMinMaxVars", target }); - node_block_4_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_4_1_FakeQuantWithMinMaxVars.npy", - DataLayout::NHWC)); - - NodeID id_mul_3_y = _graph.add_node( - TensorDescriptor - { - TensorShape{ 1 }, - DataType::QASYMM8, - QuantizationInfo(0.0003921568568330258), - DataLayout::NHWC }); + node_block_4_1_FakeQuantWithMinMaxVars->set_common_node_parameters( + NodeParams{"block_4_1_FakeQuantWithMinMaxVars", target}); + node_block_4_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor( + data_path, "/cnn_data/edsr_model/block_4_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC)); + + NodeID id_mul_3_y = _graph.add_node(TensorDescriptor{ + TensorShape{1}, 
DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC}); INode *node_mul_3_y = _graph.node(id_mul_3_y); - node_mul_3_y->set_common_node_parameters(NodeParams{ "mul_3_y", target }); - node_mul_3_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_3_y.npy", DataLayout::NHWC)); - - NodeID id_block_3_1_Conv2D_bias = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256 }, - DataType::S32, - QuantizationInfo(1.1937011095142225e-06), - DataLayout::NHWC }); + node_mul_3_y->set_common_node_parameters(NodeParams{"mul_3_y", target}); + node_mul_3_y->output(0)->set_accessor( + get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_3_y.npy", DataLayout::NHWC)); + + NodeID id_block_3_1_Conv2D_bias = _graph.add_node(TensorDescriptor{ + TensorShape{256}, DataType::S32, QuantizationInfo(1.1937011095142225e-06), DataLayout::NHWC}); INode *node_block_3_1_Conv2D_bias = _graph.node(id_block_3_1_Conv2D_bias); - node_block_3_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_3_1_Conv2D_bias", target }); - node_block_3_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_3_1_Conv2D_bias.npy", DataLayout::NHWC)); + node_block_3_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_3_1_Conv2D_bias", target}); + node_block_3_1_Conv2D_bias->output(0)->set_accessor( + get_weights_accessor(data_path, "/cnn_data/edsr_model/block_3_1_Conv2D_bias.npy", DataLayout::NHWC)); NodeID id_block_3_1_FakeQuantWithMinMaxVars = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256, 3, 3, 256 }, - DataType::QASYMM8, - QuantizationInfo(0.0003944312920793891, 129), - DataLayout::NHWC }); + TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8, + QuantizationInfo(0.0003944312920793891, 129), DataLayout::NHWC}); INode *node_block_3_1_FakeQuantWithMinMaxVars = _graph.node(id_block_3_1_FakeQuantWithMinMaxVars); - node_block_3_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "block_3_1_FakeQuantWithMinMaxVars", target }); - node_block_3_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_3_1_FakeQuantWithMinMaxVars.npy", - DataLayout::NHWC)); - - NodeID id_mul_2_y = _graph.add_node( - TensorDescriptor - { - TensorShape{ 1 }, - DataType::QASYMM8, - QuantizationInfo(0.0003921568568330258), - DataLayout::NHWC }); + node_block_3_1_FakeQuantWithMinMaxVars->set_common_node_parameters( + NodeParams{"block_3_1_FakeQuantWithMinMaxVars", target}); + node_block_3_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor( + data_path, "/cnn_data/edsr_model/block_3_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC)); + + NodeID id_mul_2_y = _graph.add_node(TensorDescriptor{ + TensorShape{1}, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC}); INode *node_mul_2_y = _graph.node(id_mul_2_y); - node_mul_2_y->set_common_node_parameters(NodeParams{ "mul_2_y", target }); - node_mul_2_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_2_y.npy", DataLayout::NHWC)); - - NodeID id_block_2_1_Conv2D_bias = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256 }, - DataType::S32, - QuantizationInfo(1.1634580232566805e-06), - DataLayout::NHWC }); + node_mul_2_y->set_common_node_parameters(NodeParams{"mul_2_y", target}); + node_mul_2_y->output(0)->set_accessor( + get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_2_y.npy", DataLayout::NHWC)); + + NodeID 
id_block_2_1_Conv2D_bias = _graph.add_node(TensorDescriptor{ + TensorShape{256}, DataType::S32, QuantizationInfo(1.1634580232566805e-06), DataLayout::NHWC}); INode *node_block_2_1_Conv2D_bias = _graph.node(id_block_2_1_Conv2D_bias); - node_block_2_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_2_1_Conv2D_bias", target }); - node_block_2_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_2_1_Conv2D_bias.npy", DataLayout::NHWC)); + node_block_2_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_2_1_Conv2D_bias", target}); + node_block_2_1_Conv2D_bias->output(0)->set_accessor( + get_weights_accessor(data_path, "/cnn_data/edsr_model/block_2_1_Conv2D_bias.npy", DataLayout::NHWC)); NodeID id_block_2_1_FakeQuantWithMinMaxVars = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256, 3, 3, 256 }, - DataType::QASYMM8, - QuantizationInfo(0.0003789655165746808, 132), - DataLayout::NHWC }); + TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8, + QuantizationInfo(0.0003789655165746808, 132), DataLayout::NHWC}); INode *node_block_2_1_FakeQuantWithMinMaxVars = _graph.node(id_block_2_1_FakeQuantWithMinMaxVars); - node_block_2_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "block_2_1_FakeQuantWithMinMaxVars", target }); - node_block_2_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_2_1_FakeQuantWithMinMaxVars.npy", - DataLayout::NHWC)); - - NodeID id_mul_1_y = _graph.add_node( - TensorDescriptor - { - TensorShape{ 1 }, - DataType::QASYMM8, - QuantizationInfo(0.0003921568568330258), - DataLayout::NHWC }); + node_block_2_1_FakeQuantWithMinMaxVars->set_common_node_parameters( + NodeParams{"block_2_1_FakeQuantWithMinMaxVars", target}); + node_block_2_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor( + data_path, "/cnn_data/edsr_model/block_2_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC)); + + NodeID id_mul_1_y = _graph.add_node(TensorDescriptor{ + TensorShape{1}, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC}); INode *node_mul_1_y = _graph.node(id_mul_1_y); - node_mul_1_y->set_common_node_parameters(NodeParams{ "mul_1_y", target }); - node_mul_1_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_1_y.npy", DataLayout::NHWC)); - - NodeID id_block_1_1_Conv2D_bias = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256 }, - DataType::S32, - QuantizationInfo(1.197920255435747e-06), - DataLayout::NHWC }); + node_mul_1_y->set_common_node_parameters(NodeParams{"mul_1_y", target}); + node_mul_1_y->output(0)->set_accessor( + get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_1_y.npy", DataLayout::NHWC)); + + NodeID id_block_1_1_Conv2D_bias = _graph.add_node(TensorDescriptor{ + TensorShape{256}, DataType::S32, QuantizationInfo(1.197920255435747e-06), DataLayout::NHWC}); INode *node_block_1_1_Conv2D_bias = _graph.node(id_block_1_1_Conv2D_bias); - node_block_1_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_1_1_Conv2D_bias", target }); - node_block_1_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_1_1_Conv2D_bias.npy", DataLayout::NHWC)); + node_block_1_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_1_1_Conv2D_bias", target}); + node_block_1_1_Conv2D_bias->output(0)->set_accessor( + get_weights_accessor(data_path, "/cnn_data/edsr_model/block_1_1_Conv2D_bias.npy", 
DataLayout::NHWC)); NodeID id_block_1_1_FakeQuantWithMinMaxVars = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256, 3, 3, 256 }, - DataType::QASYMM8, - QuantizationInfo(0.00038527738070115447, 132), - DataLayout::NHWC }); + TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8, + QuantizationInfo(0.00038527738070115447, 132), DataLayout::NHWC}); INode *node_block_1_1_FakeQuantWithMinMaxVars = _graph.node(id_block_1_1_FakeQuantWithMinMaxVars); - node_block_1_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "block_1_1_FakeQuantWithMinMaxVars", target }); - node_block_1_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_1_1_FakeQuantWithMinMaxVars.npy", - DataLayout::NHWC)); - - NodeID id_mul_y = _graph.add_node( - TensorDescriptor - { - TensorShape{ 1 }, - DataType::QASYMM8, - QuantizationInfo(0.0003921568568330258), - DataLayout::NHWC }); + node_block_1_1_FakeQuantWithMinMaxVars->set_common_node_parameters( + NodeParams{"block_1_1_FakeQuantWithMinMaxVars", target}); + node_block_1_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor( + data_path, "/cnn_data/edsr_model/block_1_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC)); + + NodeID id_mul_y = _graph.add_node(TensorDescriptor{ + TensorShape{1}, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC}); INode *node_mul_y = _graph.node(id_mul_y); - node_mul_y->set_common_node_parameters(NodeParams{ "mul_y", target }); - node_mul_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_y.npy", DataLayout::NHWC)); - - NodeID id_block_0_1_Conv2D_bias = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256 }, - DataType::S32, - QuantizationInfo(1.315485519626236e-06), - DataLayout::NHWC }); + node_mul_y->set_common_node_parameters(NodeParams{"mul_y", target}); + node_mul_y->output(0)->set_accessor( + get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_y.npy", DataLayout::NHWC)); + + NodeID id_block_0_1_Conv2D_bias = _graph.add_node(TensorDescriptor{ + TensorShape{256}, DataType::S32, QuantizationInfo(1.315485519626236e-06), DataLayout::NHWC}); INode *node_block_0_1_Conv2D_bias = _graph.node(id_block_0_1_Conv2D_bias); - node_block_0_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_0_1_Conv2D_bias", target }); - node_block_0_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_0_1_Conv2D_bias.npy", DataLayout::NHWC)); + node_block_0_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_0_1_Conv2D_bias", target}); + node_block_0_1_Conv2D_bias->output(0)->set_accessor( + get_weights_accessor(data_path, "/cnn_data/edsr_model/block_0_1_Conv2D_bias.npy", DataLayout::NHWC)); NodeID id_block_0_1_FakeQuantWithMinMaxVars = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256, 3, 3, 256 }, - DataType::QASYMM8, - QuantizationInfo(0.00039420535904355347, 129), - DataLayout::NHWC }); + TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8, + QuantizationInfo(0.00039420535904355347, 129), DataLayout::NHWC}); INode *node_block_0_1_FakeQuantWithMinMaxVars = _graph.node(id_block_0_1_FakeQuantWithMinMaxVars); - node_block_0_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "block_0_1_FakeQuantWithMinMaxVars", target }); - node_block_0_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_0_1_FakeQuantWithMinMaxVars.npy", - 
DataLayout::NHWC)); - - NodeID id_pre_residual_Conv2D_bias = _graph.add_node( - TensorDescriptor - { - TensorShape{ 256 }, - DataType::S32, - QuantizationInfo(1.7214160834555514e-06), - DataLayout::NHWC }); + node_block_0_1_FakeQuantWithMinMaxVars->set_common_node_parameters( + NodeParams{"block_0_1_FakeQuantWithMinMaxVars", target}); + node_block_0_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor( + data_path, "/cnn_data/edsr_model/block_0_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC)); + + NodeID id_pre_residual_Conv2D_bias = _graph.add_node(TensorDescriptor{ + TensorShape{256}, DataType::S32, QuantizationInfo(1.7214160834555514e-06), DataLayout::NHWC}); INode *node_pre_residual_Conv2D_bias = _graph.node(id_pre_residual_Conv2D_bias); - node_pre_residual_Conv2D_bias->set_common_node_parameters(NodeParams{ "pre_residual_Conv2D_bias", target }); - node_pre_residual_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/pre_residual_Conv2D_bias.npy", DataLayout::NHWC)); + node_pre_residual_Conv2D_bias->set_common_node_parameters(NodeParams{"pre_residual_Conv2D_bias", target}); + node_pre_residual_Conv2D_bias->output(0)->set_accessor( + get_weights_accessor(data_path, "/cnn_data/edsr_model/pre_residual_Conv2D_bias.npy", DataLayout::NHWC)); NodeID id_pre_residual_FakeQuantWithMinMaxVars = _graph.add_node( - TensorDescriptor - { - TensorShape{ 3, 3, 3, 256 }, - DataType::QASYMM8, - QuantizationInfo(0.0004389610840007663, 127), - DataLayout::NHWC }); + TensorDescriptor{TensorShape{3, 3, 3, 256}, DataType::QASYMM8, QuantizationInfo(0.0004389610840007663, 127), + DataLayout::NHWC}); INode *node_pre_residual_FakeQuantWithMinMaxVars = _graph.node(id_pre_residual_FakeQuantWithMinMaxVars); - node_pre_residual_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "pre_residual_FakeQuantWithMinMaxVars", target }); - node_pre_residual_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/pre_residual_FakeQuantWithMinMaxVars.npy", - DataLayout::NHWC)); + node_pre_residual_FakeQuantWithMinMaxVars->set_common_node_parameters( + NodeParams{"pre_residual_FakeQuantWithMinMaxVars", target}); + node_pre_residual_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor( + data_path, "/cnn_data/edsr_model/pre_residual_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC)); TensorShape input_shape{}; input_shape.set(0, 3, false).set(1, 360, false).set(2, 640, false).set(3, 1, false); NodeID id_input = _graph.add_node( - TensorDescriptor - { - input_shape, - DataType::QASYMM8, - QuantizationInfo(0.003921568859368563), - DataLayout::NHWC }); + TensorDescriptor{input_shape, DataType::QASYMM8, QuantizationInfo(0.003921568859368563), DataLayout::NHWC}); INode *node_input = _graph.node(id_input); - node_input->set_common_node_parameters(NodeParams{ "input", target }); + node_input->set_common_node_parameters(NodeParams{"input", target}); node_input->output(0)->set_accessor(get_input_accessor(common_params)); - NodeID id_pre_residual_BiasAdd = _graph.add_node( - PadStrideInfo - { - 1, 1, - 1, 1, - 1, 1, - DimensionRoundingType::FLOOR }, - 1, - arm_compute::graph::ConvolutionMethod::Default, - FastMathHint::Disabled, - QuantizationInfo(0.0033370566088706255, 96)); + NodeID id_pre_residual_BiasAdd = + _graph.add_node(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1, + arm_compute::graph::ConvolutionMethod::Default, + FastMathHint::Disabled, QuantizationInfo(0.0033370566088706255, 
96)); INode *node_pre_residual_BiasAdd = _graph.node(id_pre_residual_BiasAdd); - node_pre_residual_BiasAdd->set_common_node_parameters(NodeParams{ "pre_residual_BiasAdd", target }); + node_pre_residual_BiasAdd->set_common_node_parameters(NodeParams{"pre_residual_BiasAdd", target}); _graph.add_connection(id_input, 0, id_pre_residual_BiasAdd, 0); _graph.add_connection(id_pre_residual_FakeQuantWithMinMaxVars, 0, id_pre_residual_BiasAdd, 1); _graph.add_connection(id_pre_residual_Conv2D_bias, 0, id_pre_residual_BiasAdd, 2); - NodeID id_block_0_1_BiasAdd = _graph.add_node( - PadStrideInfo - { - 1, 1, - 1, 1, - 1, 1, - DimensionRoundingType::FLOOR }, - 1, - arm_compute::graph::ConvolutionMethod::Default, - FastMathHint::Disabled, - QuantizationInfo(0.007344874087721109, 185)); + NodeID id_block_0_1_BiasAdd = + _graph.add_node(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1, + arm_compute::graph::ConvolutionMethod::Default, + FastMathHint::Disabled, QuantizationInfo(0.007344874087721109, 185)); INode *node_block_0_1_BiasAdd = _graph.node(id_block_0_1_BiasAdd); - node_block_0_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_0_1_BiasAdd", target }); + node_block_0_1_BiasAdd->set_common_node_parameters(NodeParams{"block_0_1_BiasAdd", target}); _graph.add_connection(id_pre_residual_BiasAdd, 0, id_block_0_1_BiasAdd, 0); _graph.add_connection(id_block_0_1_FakeQuantWithMinMaxVars, 0, id_block_0_1_BiasAdd, 1); _graph.add_connection(id_block_0_1_Conv2D_bias, 0, id_block_0_1_BiasAdd, 2); NodeID id_mul = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.0006341293919831514, 174 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, QuantizationInfo{0.0006341293919831514, 174}}); INode *node_mul = _graph.node(id_mul); - node_mul->set_common_node_parameters(NodeParams{ "mul", target }); + node_mul->set_common_node_parameters(NodeParams{"mul", target}); _graph.add_connection(id_block_0_1_BiasAdd, 0, id_mul, 0); _graph.add_connection(id_mul_y, 0, id_mul, 1); NodeID id_add = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.0031092411372810602, 95 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.0031092411372810602, 95}}); INode *node_add = _graph.node(id_add); - node_add->set_common_node_parameters(NodeParams{ "add", target }); + node_add->set_common_node_parameters(NodeParams{"add", target}); _graph.add_connection(id_pre_residual_BiasAdd, 0, id_add, 0); _graph.add_connection(id_mul, 0, id_add, 1); - NodeID id_block_1_1_BiasAdd = _graph.add_node( - PadStrideInfo - { - 1, 1, - 1, 1, - 1, 1, - DimensionRoundingType::FLOOR }, - 1, - arm_compute::graph::ConvolutionMethod::Default, - FastMathHint::Disabled, - QuantizationInfo(0.005333727691322565, 117)); + NodeID id_block_1_1_BiasAdd = + _graph.add_node(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1, + arm_compute::graph::ConvolutionMethod::Default, + FastMathHint::Disabled, QuantizationInfo(0.005333727691322565, 117)); INode *node_block_1_1_BiasAdd = _graph.node(id_block_1_1_BiasAdd); - node_block_1_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_1_1_BiasAdd", target }); + node_block_1_1_BiasAdd->set_common_node_parameters(NodeParams{"block_1_1_BiasAdd", target}); _graph.add_connection(id_add, 0, id_block_1_1_BiasAdd, 0); _graph.add_connection(id_block_1_1_FakeQuantWithMinMaxVars, 0, id_block_1_1_BiasAdd, 1); _graph.add_connection(id_block_1_1_Conv2D_bias, 0, 
id_block_1_1_BiasAdd, 2); NodeID id_mul_1 = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.0004965941770933568, 122 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, QuantizationInfo{0.0004965941770933568, 122}}); INode *node_mul_1 = _graph.node(id_mul_1); - node_mul_1->set_common_node_parameters(NodeParams{ "mul_1", target }); + node_mul_1->set_common_node_parameters(NodeParams{"mul_1", target}); _graph.add_connection(id_block_1_1_BiasAdd, 0, id_mul_1, 0); _graph.add_connection(id_mul_1_y, 0, id_mul_1, 1); NodeID id_add_1 = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.0030700892675668, 96 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.0030700892675668, 96}}); INode *node_add_1 = _graph.node(id_add_1); - node_add_1->set_common_node_parameters(NodeParams{ "add_1", target }); + node_add_1->set_common_node_parameters(NodeParams{"add_1", target}); _graph.add_connection(id_add, 0, id_add_1, 0); _graph.add_connection(id_mul_1, 0, id_add_1, 1); - NodeID id_block_2_1_BiasAdd = _graph.add_node( - PadStrideInfo - { - 1, 1, - 1, 1, - 1, 1, - DimensionRoundingType::FLOOR }, - 1, - arm_compute::graph::ConvolutionMethod::Default, - FastMathHint::Disabled, - QuantizationInfo(0.004199742339551449, 132)); + NodeID id_block_2_1_BiasAdd = + _graph.add_node(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1, + arm_compute::graph::ConvolutionMethod::Default, + FastMathHint::Disabled, QuantizationInfo(0.004199742339551449, 132)); INode *node_block_2_1_BiasAdd = _graph.node(id_block_2_1_BiasAdd); - node_block_2_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_2_1_BiasAdd", target }); + node_block_2_1_BiasAdd->set_common_node_parameters(NodeParams{"block_2_1_BiasAdd", target}); _graph.add_connection(id_add_1, 0, id_block_2_1_BiasAdd, 0); _graph.add_connection(id_block_2_1_FakeQuantWithMinMaxVars, 0, id_block_2_1_BiasAdd, 1); _graph.add_connection(id_block_2_1_Conv2D_bias, 0, id_block_2_1_BiasAdd, 2); NodeID id_mul_2 = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.0004133903712499887, 130 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, QuantizationInfo{0.0004133903712499887, 130}}); INode *node_mul_2 = _graph.node(id_mul_2); - node_mul_2->set_common_node_parameters(NodeParams{ "mul_2", target }); + node_mul_2->set_common_node_parameters(NodeParams{"mul_2", target}); _graph.add_connection(id_block_2_1_BiasAdd, 0, id_mul_2, 0); _graph.add_connection(id_mul_2_y, 0, id_mul_2, 1); NodeID id_add_2 = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.003026385325938463, 94 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.003026385325938463, 94}}); INode *node_add_2 = _graph.node(id_add_2); - node_add_2->set_common_node_parameters(NodeParams{ "add_2", target }); + node_add_2->set_common_node_parameters(NodeParams{"add_2", target}); _graph.add_connection(id_add_1, 0, id_add_2, 0); _graph.add_connection(id_mul_2, 0, id_add_2, 1); - NodeID id_block_3_1_BiasAdd = _graph.add_node( - PadStrideInfo - { - 1, 1, - 1, 1, - 1, 1, - DimensionRoundingType::FLOOR }, - 1, - arm_compute::graph::ConvolutionMethod::Default, - FastMathHint::Disabled, - QuantizationInfo(0.003977528307586908, 142)); + NodeID id_block_3_1_BiasAdd = + _graph.add_node(PadStrideInfo{1, 1, 1, 1, 1, 1, 
DimensionRoundingType::FLOOR}, 1, + arm_compute::graph::ConvolutionMethod::Default, + FastMathHint::Disabled, QuantizationInfo(0.003977528307586908, 142)); INode *node_block_3_1_BiasAdd = _graph.node(id_block_3_1_BiasAdd); - node_block_3_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_3_1_BiasAdd", target }); + node_block_3_1_BiasAdd->set_common_node_parameters(NodeParams{"block_3_1_BiasAdd", target}); _graph.add_connection(id_add_2, 0, id_block_3_1_BiasAdd, 0); _graph.add_connection(id_block_3_1_FakeQuantWithMinMaxVars, 0, id_block_3_1_BiasAdd, 1); _graph.add_connection(id_block_3_1_Conv2D_bias, 0, id_block_3_1_BiasAdd, 2); NodeID id_mul_3 = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.0003943995980080217, 141 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, QuantizationInfo{0.0003943995980080217, 141}}); INode *node_mul_3 = _graph.node(id_mul_3); - node_mul_3->set_common_node_parameters(NodeParams{ "mul_3", target }); + node_mul_3->set_common_node_parameters(NodeParams{"mul_3", target}); _graph.add_connection(id_block_3_1_BiasAdd, 0, id_mul_3, 0); _graph.add_connection(id_mul_3_y, 0, id_mul_3, 1); NodeID id_add_3 = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.003101327223703265, 98 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.003101327223703265, 98}}); INode *node_add_3 = _graph.node(id_add_3); - node_add_3->set_common_node_parameters(NodeParams{ "add_3", target }); + node_add_3->set_common_node_parameters(NodeParams{"add_3", target}); _graph.add_connection(id_add_2, 0, id_add_3, 0); _graph.add_connection(id_mul_3, 0, id_add_3, 1); - NodeID id_block_4_1_BiasAdd = _graph.add_node( - PadStrideInfo - { - 1, 1, - 1, 1, - 1, 1, - DimensionRoundingType::FLOOR }, - 1, - arm_compute::graph::ConvolutionMethod::Default, - FastMathHint::Disabled, - QuantizationInfo(0.0045388080179691315, 146)); + NodeID id_block_4_1_BiasAdd = + _graph.add_node(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1, + arm_compute::graph::ConvolutionMethod::Default, + FastMathHint::Disabled, QuantizationInfo(0.0045388080179691315, 146)); INode *node_block_4_1_BiasAdd = _graph.node(id_block_4_1_BiasAdd); - node_block_4_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_4_1_BiasAdd", target }); + node_block_4_1_BiasAdd->set_common_node_parameters(NodeParams{"block_4_1_BiasAdd", target}); _graph.add_connection(id_add_3, 0, id_block_4_1_BiasAdd, 0); _graph.add_connection(id_block_4_1_FakeQuantWithMinMaxVars, 0, id_block_4_1_BiasAdd, 1); _graph.add_connection(id_block_4_1_Conv2D_bias, 0, id_block_4_1_BiasAdd, 2); NodeID id_mul_4 = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.00044342130422592163, 143 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, QuantizationInfo{0.00044342130422592163, 143}}); INode *node_mul_4 = _graph.node(id_mul_4); - node_mul_4->set_common_node_parameters(NodeParams{ "mul_4", target }); + node_mul_4->set_common_node_parameters(NodeParams{"mul_4", target}); _graph.add_connection(id_block_4_1_BiasAdd, 0, id_mul_4, 0); _graph.add_connection(id_mul_4_y, 0, id_mul_4, 1); NodeID id_add_4 = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.003150839824229479, 98 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.003150839824229479, 98}}); INode *node_add_4 = 
_graph.node(id_add_4); - node_add_4->set_common_node_parameters(NodeParams{ "add_4", target }); + node_add_4->set_common_node_parameters(NodeParams{"add_4", target}); _graph.add_connection(id_add_3, 0, id_add_4, 0); _graph.add_connection(id_mul_4, 0, id_add_4, 1); - NodeID id_block_5_1_BiasAdd = _graph.add_node( - PadStrideInfo - { - 1, 1, - 1, 1, - 1, 1, - DimensionRoundingType::FLOOR }, - 1, - arm_compute::graph::ConvolutionMethod::Default, - FastMathHint::Disabled, - QuantizationInfo(0.00402890844270587, 132)); + NodeID id_block_5_1_BiasAdd = + _graph.add_node(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1, + arm_compute::graph::ConvolutionMethod::Default, + FastMathHint::Disabled, QuantizationInfo(0.00402890844270587, 132)); INode *node_block_5_1_BiasAdd = _graph.node(id_block_5_1_BiasAdd); - node_block_5_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_5_1_BiasAdd", target }); + node_block_5_1_BiasAdd->set_common_node_parameters(NodeParams{"block_5_1_BiasAdd", target}); _graph.add_connection(id_add_4, 0, id_block_5_1_BiasAdd, 0); _graph.add_connection(id_block_5_1_FakeQuantWithMinMaxVars, 0, id_block_5_1_BiasAdd, 1); _graph.add_connection(id_block_5_1_Conv2D_bias, 0, id_block_5_1_BiasAdd, 2); NodeID id_mul_5 = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.0004023382789455354, 132 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, QuantizationInfo{0.0004023382789455354, 132}}); INode *node_mul_5 = _graph.node(id_mul_5); - node_mul_5->set_common_node_parameters(NodeParams{ "mul_5", target }); + node_mul_5->set_common_node_parameters(NodeParams{"mul_5", target}); _graph.add_connection(id_block_5_1_BiasAdd, 0, id_mul_5, 0); _graph.add_connection(id_mul_5_y, 0, id_mul_5, 1); NodeID id_add_5 = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.0030975888948887587, 94 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.0030975888948887587, 94}}); INode *node_add_5 = _graph.node(id_add_5); - node_add_5->set_common_node_parameters(NodeParams{ "add_5", target }); + node_add_5->set_common_node_parameters(NodeParams{"add_5", target}); _graph.add_connection(id_add_4, 0, id_add_5, 0); _graph.add_connection(id_mul_5, 0, id_add_5, 1); - NodeID id_block_6_1_BiasAdd = _graph.add_node( - PadStrideInfo - { - 1, 1, - 1, 1, - 1, 1, - DimensionRoundingType::FLOOR }, - 1, - arm_compute::graph::ConvolutionMethod::Default, - FastMathHint::Disabled, - QuantizationInfo(0.00421866774559021, 125)); + NodeID id_block_6_1_BiasAdd = + _graph.add_node(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1, + arm_compute::graph::ConvolutionMethod::Default, + FastMathHint::Disabled, QuantizationInfo(0.00421866774559021, 125)); INode *node_block_6_1_BiasAdd = _graph.node(id_block_6_1_BiasAdd); - node_block_6_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_6_1_BiasAdd", target }); + node_block_6_1_BiasAdd->set_common_node_parameters(NodeParams{"block_6_1_BiasAdd", target}); _graph.add_connection(id_add_5, 0, id_block_6_1_BiasAdd, 0); _graph.add_connection(id_block_6_1_FakeQuantWithMinMaxVars, 0, id_block_6_1_BiasAdd, 1); _graph.add_connection(id_block_6_1_Conv2D_bias, 0, id_block_6_1_BiasAdd, 2); NodeID id_mul_6 = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.00041950203012675047, 125 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, 
QuantizationInfo{0.00041950203012675047, 125}}); INode *node_mul_6 = _graph.node(id_mul_6); - node_mul_6->set_common_node_parameters(NodeParams{ "mul_6", target }); + node_mul_6->set_common_node_parameters(NodeParams{"mul_6", target}); _graph.add_connection(id_block_6_1_BiasAdd, 0, id_mul_6, 0); _graph.add_connection(id_mul_6_y, 0, id_mul_6, 1); NodeID id_add_6 = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.003155382815748453, 92 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.003155382815748453, 92}}); INode *node_add_6 = _graph.node(id_add_6); - node_add_6->set_common_node_parameters(NodeParams{ "add_6", target }); + node_add_6->set_common_node_parameters(NodeParams{"add_6", target}); _graph.add_connection(id_add_5, 0, id_add_6, 0); _graph.add_connection(id_mul_6, 0, id_add_6, 1); - NodeID id_block_7_1_BiasAdd = _graph.add_node( - PadStrideInfo - { - 1, 1, - 1, 1, - 1, 1, - DimensionRoundingType::FLOOR }, - 1, - arm_compute::graph::ConvolutionMethod::Default, - FastMathHint::Disabled, - QuantizationInfo(0.004250136204063892, 143)); + NodeID id_block_7_1_BiasAdd = + _graph.add_node(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1, + arm_compute::graph::ConvolutionMethod::Default, + FastMathHint::Disabled, QuantizationInfo(0.004250136204063892, 143)); INode *node_block_7_1_BiasAdd = _graph.node(id_block_7_1_BiasAdd); - node_block_7_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_7_1_BiasAdd", target }); + node_block_7_1_BiasAdd->set_common_node_parameters(NodeParams{"block_7_1_BiasAdd", target}); _graph.add_connection(id_add_6, 0, id_block_7_1_BiasAdd, 0); _graph.add_connection(id_block_7_1_FakeQuantWithMinMaxVars, 0, id_block_7_1_BiasAdd, 1); _graph.add_connection(id_block_7_1_Conv2D_bias, 0, id_block_7_1_BiasAdd, 2); NodeID id_mul_7 = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.00042401350219734013, 142 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, QuantizationInfo{0.00042401350219734013, 142}}); INode *node_mul_7 = _graph.node(id_mul_7); - node_mul_7->set_common_node_parameters(NodeParams{ "mul_7", target }); + node_mul_7->set_common_node_parameters(NodeParams{"mul_7", target}); _graph.add_connection(id_block_7_1_BiasAdd, 0, id_mul_7, 0); _graph.add_connection(id_mul_7_y, 0, id_mul_7, 1); NodeID id_add_7 = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.0031760605052113533, 86 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.0031760605052113533, 86}}); INode *node_add_7 = _graph.node(id_add_7); - node_add_7->set_common_node_parameters(NodeParams{ "add_7", target }); + node_add_7->set_common_node_parameters(NodeParams{"add_7", target}); _graph.add_connection(id_add_6, 0, id_add_7, 0); _graph.add_connection(id_mul_7, 0, id_add_7, 1); - NodeID id_block_8_1_BiasAdd = _graph.add_node( - PadStrideInfo - { - 1, 1, - 1, 1, - 1, 1, - DimensionRoundingType::FLOOR }, - 1, - arm_compute::graph::ConvolutionMethod::Default, - FastMathHint::Disabled, - QuantizationInfo(0.004277155734598637, 123)); + NodeID id_block_8_1_BiasAdd = + _graph.add_node(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1, + arm_compute::graph::ConvolutionMethod::Default, + FastMathHint::Disabled, QuantizationInfo(0.004277155734598637, 123)); INode *node_block_8_1_BiasAdd = _graph.node(id_block_8_1_BiasAdd); - 
node_block_8_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_8_1_BiasAdd", target }); + node_block_8_1_BiasAdd->set_common_node_parameters(NodeParams{"block_8_1_BiasAdd", target}); _graph.add_connection(id_add_7, 0, id_block_8_1_BiasAdd, 0); _graph.add_connection(id_block_8_1_FakeQuantWithMinMaxVars, 0, id_block_8_1_BiasAdd, 1); _graph.add_connection(id_block_8_1_Conv2D_bias, 0, id_block_8_1_BiasAdd, 2); NodeID id_mul_8 = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.00042673019925132394, 123 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, QuantizationInfo{0.00042673019925132394, 123}}); INode *node_mul_8 = _graph.node(id_mul_8); - node_mul_8->set_common_node_parameters(NodeParams{ "mul_8", target }); + node_mul_8->set_common_node_parameters(NodeParams{"mul_8", target}); _graph.add_connection(id_block_8_1_BiasAdd, 0, id_mul_8, 0); _graph.add_connection(id_mul_8_y, 0, id_mul_8, 1); NodeID id_add_8 = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.0032156009692698717, 86 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.0032156009692698717, 86}}); INode *node_add_8 = _graph.node(id_add_8); - node_add_8->set_common_node_parameters(NodeParams{ "add_8", target }); + node_add_8->set_common_node_parameters(NodeParams{"add_8", target}); _graph.add_connection(id_add_7, 0, id_add_8, 0); _graph.add_connection(id_mul_8, 0, id_add_8, 1); - NodeID id_block_9_1_BiasAdd = _graph.add_node( - PadStrideInfo - { - 1, 1, - 1, 1, - 1, 1, - DimensionRoundingType::FLOOR }, - 1, - arm_compute::graph::ConvolutionMethod::Default, - FastMathHint::Disabled, - QuantizationInfo(0.00445037754252553, 129)); + NodeID id_block_9_1_BiasAdd = + _graph.add_node(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1, + arm_compute::graph::ConvolutionMethod::Default, + FastMathHint::Disabled, QuantizationInfo(0.00445037754252553, 129)); INode *node_block_9_1_BiasAdd = _graph.node(id_block_9_1_BiasAdd); - node_block_9_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_9_1_BiasAdd", target }); + node_block_9_1_BiasAdd->set_common_node_parameters(NodeParams{"block_9_1_BiasAdd", target}); _graph.add_connection(id_add_8, 0, id_block_9_1_BiasAdd, 0); _graph.add_connection(id_block_9_1_FakeQuantWithMinMaxVars, 0, id_block_9_1_BiasAdd, 1); _graph.add_connection(id_block_9_1_Conv2D_bias, 0, id_block_9_1_BiasAdd, 2); NodeID id_mul_9 = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.0004448975087143481, 129 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, QuantizationInfo{0.0004448975087143481, 129}}); INode *node_mul_9 = _graph.node(id_mul_9); - node_mul_9->set_common_node_parameters(NodeParams{ "mul_9", target }); + node_mul_9->set_common_node_parameters(NodeParams{"mul_9", target}); _graph.add_connection(id_block_9_1_BiasAdd, 0, id_mul_9, 0); _graph.add_connection(id_mul_9_y, 0, id_mul_9, 1); NodeID id_add_9 = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.0032742770854383707, 80 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.0032742770854383707, 80}}); INode *node_add_9 = _graph.node(id_add_9); - node_add_9->set_common_node_parameters(NodeParams{ "add_9", target }); + node_add_9->set_common_node_parameters(NodeParams{"add_9", target}); _graph.add_connection(id_add_8, 0, id_add_9, 0); 
_graph.add_connection(id_mul_9, 0, id_add_9, 1); - NodeID id_block_10_1_BiasAdd = _graph.add_node( - PadStrideInfo - { - 1, 1, - 1, 1, - 1, 1, - DimensionRoundingType::FLOOR }, - 1, - arm_compute::graph::ConvolutionMethod::Default, - FastMathHint::Disabled, - QuantizationInfo(0.003614710411056876, 131)); + NodeID id_block_10_1_BiasAdd = + _graph.add_node(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1, + arm_compute::graph::ConvolutionMethod::Default, + FastMathHint::Disabled, QuantizationInfo(0.003614710411056876, 131)); INode *node_block_10_1_BiasAdd = _graph.node(id_block_10_1_BiasAdd); - node_block_10_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_10_1_BiasAdd", target }); + node_block_10_1_BiasAdd->set_common_node_parameters(NodeParams{"block_10_1_BiasAdd", target}); _graph.add_connection(id_add_9, 0, id_block_10_1_BiasAdd, 0); _graph.add_connection(id_block_10_1_FakeQuantWithMinMaxVars, 0, id_block_10_1_BiasAdd, 1); _graph.add_connection(id_block_10_1_Conv2D_bias, 0, id_block_10_1_BiasAdd, 2); NodeID id_mul_10 = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.00036083892337046564, 130 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, QuantizationInfo{0.00036083892337046564, 130}}); INode *node_mul_10 = _graph.node(id_mul_10); - node_mul_10->set_common_node_parameters(NodeParams{ "mul_10", target }); + node_mul_10->set_common_node_parameters(NodeParams{"mul_10", target}); _graph.add_connection(id_block_10_1_BiasAdd, 0, id_mul_10, 0); _graph.add_connection(id_mul_10_y, 0, id_mul_10, 1); NodeID id_add_10 = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.0031881770119071007, 81 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.0031881770119071007, 81}}); INode *node_add_10 = _graph.node(id_add_10); - node_add_10->set_common_node_parameters(NodeParams{ "add_10", target }); + node_add_10->set_common_node_parameters(NodeParams{"add_10", target}); _graph.add_connection(id_add_9, 0, id_add_10, 0); _graph.add_connection(id_mul_10, 0, id_add_10, 1); - NodeID id_block_11_1_BiasAdd = _graph.add_node( - PadStrideInfo - { - 1, 1, - 1, 1, - 1, 1, - DimensionRoundingType::FLOOR }, - 1, - arm_compute::graph::ConvolutionMethod::Default, - FastMathHint::Disabled, - QuantizationInfo(0.003969002980738878, 133)); + NodeID id_block_11_1_BiasAdd = + _graph.add_node(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1, + arm_compute::graph::ConvolutionMethod::Default, + FastMathHint::Disabled, QuantizationInfo(0.003969002980738878, 133)); INode *node_block_11_1_BiasAdd = _graph.node(id_block_11_1_BiasAdd); - node_block_11_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_11_1_BiasAdd", target }); + node_block_11_1_BiasAdd->set_common_node_parameters(NodeParams{"block_11_1_BiasAdd", target}); _graph.add_connection(id_add_10, 0, id_block_11_1_BiasAdd, 0); _graph.add_connection(id_block_11_1_FakeQuantWithMinMaxVars, 0, id_block_11_1_BiasAdd, 1); _graph.add_connection(id_block_11_1_Conv2D_bias, 0, id_block_11_1_BiasAdd, 2); NodeID id_mul_11 = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.0003968806122429669, 133 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, QuantizationInfo{0.0003968806122429669, 133}}); INode *node_mul_11 = _graph.node(id_mul_11); - node_mul_11->set_common_node_parameters(NodeParams{ "mul_11", target }); + 
node_mul_11->set_common_node_parameters(NodeParams{"mul_11", target}); _graph.add_connection(id_block_11_1_BiasAdd, 0, id_mul_11, 0); _graph.add_connection(id_mul_11_y, 0, id_mul_11, 1); NodeID id_add_11 = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.0032707711216062307, 80 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.0032707711216062307, 80}}); INode *node_add_11 = _graph.node(id_add_11); - node_add_11->set_common_node_parameters(NodeParams{ "add_11", target }); + node_add_11->set_common_node_parameters(NodeParams{"add_11", target}); _graph.add_connection(id_add_10, 0, id_add_11, 0); _graph.add_connection(id_mul_11, 0, id_add_11, 1); - NodeID id_block_12_1_BiasAdd = _graph.add_node( - PadStrideInfo - { - 1, 1, - 1, 1, - 1, 1, - DimensionRoundingType::FLOOR }, - 1, - arm_compute::graph::ConvolutionMethod::Default, - FastMathHint::Disabled, - QuantizationInfo(0.004366801120340824, 110)); + NodeID id_block_12_1_BiasAdd = + _graph.add_node(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1, + arm_compute::graph::ConvolutionMethod::Default, + FastMathHint::Disabled, QuantizationInfo(0.004366801120340824, 110)); INode *node_block_12_1_BiasAdd = _graph.node(id_block_12_1_BiasAdd); - node_block_12_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_12_1_BiasAdd", target }); + node_block_12_1_BiasAdd->set_common_node_parameters(NodeParams{"block_12_1_BiasAdd", target}); _graph.add_connection(id_add_11, 0, id_block_12_1_BiasAdd, 0); _graph.add_connection(id_block_12_1_FakeQuantWithMinMaxVars, 0, id_block_12_1_BiasAdd, 1); _graph.add_connection(id_block_12_1_Conv2D_bias, 0, id_block_12_1_BiasAdd, 2); NodeID id_mul_12 = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.0004365936329122633, 110 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, QuantizationInfo{0.0004365936329122633, 110}}); INode *node_mul_12 = _graph.node(id_mul_12); - node_mul_12->set_common_node_parameters(NodeParams{ "mul_12", target }); + node_mul_12->set_common_node_parameters(NodeParams{"mul_12", target}); _graph.add_connection(id_block_12_1_BiasAdd, 0, id_mul_12, 0); _graph.add_connection(id_mul_12_y, 0, id_mul_12, 1); NodeID id_add_12 = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.003275055903941393, 79 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.003275055903941393, 79}}); INode *node_add_12 = _graph.node(id_add_12); - node_add_12->set_common_node_parameters(NodeParams{ "add_12", target }); + node_add_12->set_common_node_parameters(NodeParams{"add_12", target}); _graph.add_connection(id_add_11, 0, id_add_12, 0); _graph.add_connection(id_mul_12, 0, id_add_12, 1); - NodeID id_block_13_1_BiasAdd = _graph.add_node( - PadStrideInfo - { - 1, 1, - 1, 1, - 1, 1, - DimensionRoundingType::FLOOR }, - 1, - arm_compute::graph::ConvolutionMethod::Default, - FastMathHint::Disabled, - QuantizationInfo(0.004386766813695431, 139)); + NodeID id_block_13_1_BiasAdd = + _graph.add_node(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1, + arm_compute::graph::ConvolutionMethod::Default, + FastMathHint::Disabled, QuantizationInfo(0.004386766813695431, 139)); INode *node_block_13_1_BiasAdd = _graph.node(id_block_13_1_BiasAdd); - node_block_13_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_13_1_BiasAdd", target }); + 
node_block_13_1_BiasAdd->set_common_node_parameters(NodeParams{"block_13_1_BiasAdd", target}); _graph.add_connection(id_add_12, 0, id_block_13_1_BiasAdd, 0); _graph.add_connection(id_block_13_1_FakeQuantWithMinMaxVars, 0, id_block_13_1_BiasAdd, 1); _graph.add_connection(id_block_13_1_Conv2D_bias, 0, id_block_13_1_BiasAdd, 2); NodeID id_mul_13 = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.0004385628562886268, 139 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, QuantizationInfo{0.0004385628562886268, 139}}); INode *node_mul_13 = _graph.node(id_mul_13); - node_mul_13->set_common_node_parameters(NodeParams{ "mul_13", target }); + node_mul_13->set_common_node_parameters(NodeParams{"mul_13", target}); _graph.add_connection(id_block_13_1_BiasAdd, 0, id_mul_13, 0); _graph.add_connection(id_mul_13_y, 0, id_mul_13, 1); NodeID id_add_13 = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.0033287261612713337, 78 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.0033287261612713337, 78}}); INode *node_add_13 = _graph.node(id_add_13); - node_add_13->set_common_node_parameters(NodeParams{ "add_13", target }); + node_add_13->set_common_node_parameters(NodeParams{"add_13", target}); _graph.add_connection(id_add_12, 0, id_add_13, 0); _graph.add_connection(id_mul_13, 0, id_add_13, 1); - NodeID id_block_14_1_BiasAdd = _graph.add_node( - PadStrideInfo - { - 1, 1, - 1, 1, - 1, 1, - DimensionRoundingType::FLOOR }, - 1, - arm_compute::graph::ConvolutionMethod::Default, - FastMathHint::Disabled, - QuantizationInfo(0.0038069337606430054, 130)); + NodeID id_block_14_1_BiasAdd = + _graph.add_node(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1, + arm_compute::graph::ConvolutionMethod::Default, + FastMathHint::Disabled, QuantizationInfo(0.0038069337606430054, 130)); INode *node_block_14_1_BiasAdd = _graph.node(id_block_14_1_BiasAdd); - node_block_14_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_14_1_BiasAdd", target }); + node_block_14_1_BiasAdd->set_common_node_parameters(NodeParams{"block_14_1_BiasAdd", target}); _graph.add_connection(id_add_13, 0, id_block_14_1_BiasAdd, 0); _graph.add_connection(id_block_14_1_FakeQuantWithMinMaxVars, 0, id_block_14_1_BiasAdd, 1); _graph.add_connection(id_block_14_1_Conv2D_bias, 0, id_block_14_1_BiasAdd, 2); NodeID id_mul_14 = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.00037829321809113026, 130 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, QuantizationInfo{0.00037829321809113026, 130}}); INode *node_mul_14 = _graph.node(id_mul_14); - node_mul_14->set_common_node_parameters(NodeParams{ "mul_14", target }); + node_mul_14->set_common_node_parameters(NodeParams{"mul_14", target}); _graph.add_connection(id_block_14_1_BiasAdd, 0, id_mul_14, 0); _graph.add_connection(id_mul_14_y, 0, id_mul_14, 1); NodeID id_add_14 = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.0033590947277843952, 77 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.0033590947277843952, 77}}); INode *node_add_14 = _graph.node(id_add_14); - node_add_14->set_common_node_parameters(NodeParams{ "add_14", target }); + node_add_14->set_common_node_parameters(NodeParams{"add_14", target}); _graph.add_connection(id_add_13, 0, id_add_14, 0); _graph.add_connection(id_mul_14, 0, 
id_add_14, 1); - NodeID id_block_15_1_BiasAdd = _graph.add_node( - PadStrideInfo - { - 1, 1, - 1, 1, - 1, 1, - DimensionRoundingType::FLOOR }, - 1, - arm_compute::graph::ConvolutionMethod::Default, - FastMathHint::Disabled, - QuantizationInfo(0.004009159281849861, 130)); + NodeID id_block_15_1_BiasAdd = + _graph.add_node(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1, + arm_compute::graph::ConvolutionMethod::Default, + FastMathHint::Disabled, QuantizationInfo(0.004009159281849861, 130)); INode *node_block_15_1_BiasAdd = _graph.node(id_block_15_1_BiasAdd); - node_block_15_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_15_1_BiasAdd", target }); + node_block_15_1_BiasAdd->set_common_node_parameters(NodeParams{"block_15_1_BiasAdd", target}); _graph.add_connection(id_add_14, 0, id_block_15_1_BiasAdd, 0); _graph.add_connection(id_block_15_1_FakeQuantWithMinMaxVars, 0, id_block_15_1_BiasAdd, 1); _graph.add_connection(id_block_15_1_Conv2D_bias, 0, id_block_15_1_BiasAdd, 2); NodeID id_mul_15 = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.0004008286341559142, 130 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, QuantizationInfo{0.0004008286341559142, 130}}); INode *node_mul_15 = _graph.node(id_mul_15); - node_mul_15->set_common_node_parameters(NodeParams{ "mul_15", target }); + node_mul_15->set_common_node_parameters(NodeParams{"mul_15", target}); _graph.add_connection(id_block_15_1_BiasAdd, 0, id_mul_15, 0); _graph.add_connection(id_mul_15_y, 0, id_mul_15, 1); NodeID id_add_15 = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.0035031239967793226, 78 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.0035031239967793226, 78}}); INode *node_add_15 = _graph.node(id_add_15); - node_add_15->set_common_node_parameters(NodeParams{ "add_15", target }); + node_add_15->set_common_node_parameters(NodeParams{"add_15", target}); _graph.add_connection(id_add_14, 0, id_add_15, 0); _graph.add_connection(id_mul_15, 0, id_add_15, 1); - NodeID id_post_residual_BiasAdd = _graph.add_node( - PadStrideInfo - { - 1, 1, - 1, 1, - 1, 1, - DimensionRoundingType::FLOOR }, - 1, - arm_compute::graph::ConvolutionMethod::Default, - FastMathHint::Disabled, - QuantizationInfo(0.005167999770492315, 112)); + NodeID id_post_residual_BiasAdd = + _graph.add_node(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1, + arm_compute::graph::ConvolutionMethod::Default, + FastMathHint::Disabled, QuantizationInfo(0.005167999770492315, 112)); INode *node_post_residual_BiasAdd = _graph.node(id_post_residual_BiasAdd); - node_post_residual_BiasAdd->set_common_node_parameters(NodeParams{ "post_residual_BiasAdd", target }); + node_post_residual_BiasAdd->set_common_node_parameters(NodeParams{"post_residual_BiasAdd", target}); _graph.add_connection(id_add_15, 0, id_post_residual_BiasAdd, 0); _graph.add_connection(id_post_residual_FakeQuantWithMinMaxVars, 0, id_post_residual_BiasAdd, 1); _graph.add_connection(id_post_residual_Conv2D_bias, 0, id_post_residual_BiasAdd, 2); NodeID id_add_16 = _graph.add_node( - descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.0065071373246610165, 89 } }); + descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.0065071373246610165, 89}}); INode *node_add_16 = _graph.node(id_add_16); - node_add_16->set_common_node_parameters(NodeParams{ "add_16", target }); + 
node_add_16->set_common_node_parameters(NodeParams{"add_16", target}); _graph.add_connection(id_post_residual_BiasAdd, 0, id_add_16, 0); _graph.add_connection(id_pre_residual_BiasAdd, 0, id_add_16, 1); - NodeID id_pre_upscale_BiasAdd = _graph.add_node( - PadStrideInfo - { - 1, 1, - 1, 1, - 1, 1, - DimensionRoundingType::FLOOR }, - 1, - arm_compute::graph::ConvolutionMethod::Default, - FastMathHint::Disabled, - QuantizationInfo(0.005013593938201666, 26)); + NodeID id_pre_upscale_BiasAdd = + _graph.add_node(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1, + arm_compute::graph::ConvolutionMethod::Default, + FastMathHint::Disabled, QuantizationInfo(0.005013593938201666, 26)); INode *node_pre_upscale_BiasAdd = _graph.node(id_pre_upscale_BiasAdd); - node_pre_upscale_BiasAdd->set_common_node_parameters(NodeParams{ "pre_upscale_BiasAdd", target }); + node_pre_upscale_BiasAdd->set_common_node_parameters(NodeParams{"pre_upscale_BiasAdd", target}); _graph.add_connection(id_add_16, 0, id_pre_upscale_BiasAdd, 0); _graph.add_connection(id_pre_upscale_FakeQuantWithMinMaxVars, 0, id_pre_upscale_BiasAdd, 1); _graph.add_connection(id_pre_upscale_Conv2D_bias, 0, id_pre_upscale_BiasAdd, 2); NodeID id_upscale_net_FakeQuantWithMinMaxVars_1 = _graph.add_node( - descriptors::DeconvolutionLayerDescriptor - { - PadStrideInfo{ - 2, 2, - 0, 0, - 0, 0, - DimensionRoundingType::FLOOR }, - QuantizationInfo{ 0.004990961868315935, 26 } }); + descriptors::DeconvolutionLayerDescriptor{PadStrideInfo{2, 2, 0, 0, 0, 0, DimensionRoundingType::FLOOR}, + QuantizationInfo{0.004990961868315935, 26}}); INode *node_upscale_net_FakeQuantWithMinMaxVars_1 = _graph.node(id_upscale_net_FakeQuantWithMinMaxVars_1); - node_upscale_net_FakeQuantWithMinMaxVars_1->set_common_node_parameters(NodeParams{ "upscale_net_FakeQuantWithMinMaxVars_1", target }); + node_upscale_net_FakeQuantWithMinMaxVars_1->set_common_node_parameters( + NodeParams{"upscale_net_FakeQuantWithMinMaxVars_1", target}); _graph.add_connection(id_pre_upscale_BiasAdd, 0, id_upscale_net_FakeQuantWithMinMaxVars_1, 0); - _graph.add_connection(id_upscale_net_FakeQuantWithMinMaxVars_transposed, 0, id_upscale_net_FakeQuantWithMinMaxVars_1, 1); + _graph.add_connection(id_upscale_net_FakeQuantWithMinMaxVars_transposed, 0, + id_upscale_net_FakeQuantWithMinMaxVars_1, 1); TensorShape output_shape; output_shape.set(0, 3, false).set(1, 720, false).set(2, 1280, false).set(3, 1, false); NodeID id_output_140211982446376 = _graph.add_node(); INode *node_output_140211982446376 = _graph.node(id_output_140211982446376); - node_output_140211982446376->set_common_node_parameters(NodeParams{ "output_140211982446376", target }); + node_output_140211982446376->set_common_node_parameters(NodeParams{"output_140211982446376", target}); _graph.add_connection(id_upscale_net_FakeQuantWithMinMaxVars_1, 0, id_output_140211982446376, 0); - node_output_140211982446376->input(0)->set_accessor(get_npy_output_accessor(expected_output_filename.value(), output_shape, common_params.data_type, - common_params.data_layout)); + node_output_140211982446376->input(0)->set_accessor(get_npy_output_accessor( + expected_output_filename.value(), output_shape, common_params.data_type, common_params.data_layout)); return true; } diff --git a/examples/graph_googlenet.cpp b/examples/graph_googlenet.cpp index 683205b3b58bc890814ab69dbf8d436f1874528e..f431fc412bc3a3ce2291b83dbfee02d6aab86295 100644 --- a/examples/graph_googlenet.cpp +++ b/examples/graph_googlenet.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/graph.h" + #include "support/ToolchainSupport.h" #include "utils/CommonGraphOptions.h" #include "utils/GraphUtils.h" @@ -35,8 +36,7 @@ using namespace arm_compute::graph_utils; class GraphGooglenetExample : public Example { public: - GraphGooglenetExample() - : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "GoogleNet") + GraphGooglenetExample() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "GoogleNet") { } bool do_setup(int argc, char **argv) override @@ -49,14 +49,15 @@ public: common_params = consume_common_graph_parameters(common_opts); // Return when help menu is requested - if(common_params.help) + if (common_params.help) { cmd_parser.print_help(argv[0]); return false; } // Checks - ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), "QASYMM8 not supported for this graph"); + ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), + "QASYMM8 not supported for this graph"); // Print parameter values std::cout << common_params << std::endl; @@ -65,64 +66,99 @@ public: std::string data_path = common_params.data_path; // Create a preprocessor object - const std::array mean_rgb{ { 122.68f, 116.67f, 104.01f } }; + const std::array mean_rgb{{122.68f, 116.67f, 104.01f}}; std::unique_ptr preprocessor = std::make_unique(mean_rgb); // Create input descriptor const auto operation_layout = common_params.data_layout; - const TensorShape tensor_shape = permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout); - TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout); + const TensorShape tensor_shape = + permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout); + TensorDescriptor input_descriptor = + TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout); // Set weights trained layout const DataLayout weights_layout = DataLayout::NCHW; - graph << common_params.target - << common_params.fast_math_hint + graph << common_params.target << common_params.fast_math_hint << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor))) + << ConvolutionLayer(7U, 7U, 64U, + get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv1/conv1_7x7_s2_w.npy", + weights_layout), + get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv1/conv1_7x7_s2_b.npy"), + PadStrideInfo(2, 2, 3, 3)) + .set_name("conv1/7x7_s2") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv1/relu_7x7") + << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, + PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))) + .set_name("pool1/3x3_s2") + << NormalizationLayer(NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f)) + .set_name("pool1/norm1") << ConvolutionLayer( - 7U, 7U, 64U, - get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv1/conv1_7x7_s2_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv1/conv1_7x7_s2_b.npy"), - PadStrideInfo(2, 2, 3, 3)) - .set_name("conv1/7x7_s2") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv1/relu_7x7") - << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))).set_name("pool1/3x3_s2") 
- << NormalizationLayer(NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f)).set_name("pool1/norm1") - << ConvolutionLayer( - 1U, 1U, 64U, - get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv2/conv2_3x3_reduce_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv2/conv2_3x3_reduce_b.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name("conv2/3x3_reduce") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv2/relu_3x3_reduce") + 1U, 1U, 64U, + get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv2/conv2_3x3_reduce_w.npy", + weights_layout), + get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv2/conv2_3x3_reduce_b.npy"), + PadStrideInfo(1, 1, 0, 0)) + .set_name("conv2/3x3_reduce") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv2/relu_3x3_reduce") << ConvolutionLayer( - 3U, 3U, 192U, - get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv2/conv2_3x3_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv2/conv2_3x3_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv2/3x3") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv2/relu_3x3") - << NormalizationLayer(NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f)).set_name("conv2/norm2") - << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))).set_name("pool2/3x3_s2"); - graph << get_inception_node(data_path, "inception_3a", weights_layout, 64, std::make_tuple(96U, 128U), std::make_tuple(16U, 32U), 32U).set_name("inception_3a/concat"); - graph << get_inception_node(data_path, "inception_3b", weights_layout, 128, std::make_tuple(128U, 192U), std::make_tuple(32U, 96U), 64U).set_name("inception_3b/concat"); - graph << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))).set_name("pool3/3x3_s2"); - graph << get_inception_node(data_path, "inception_4a", weights_layout, 192, std::make_tuple(96U, 208U), std::make_tuple(16U, 48U), 64U).set_name("inception_4a/concat"); - graph << get_inception_node(data_path, "inception_4b", weights_layout, 160, std::make_tuple(112U, 224U), std::make_tuple(24U, 64U), 64U).set_name("inception_4b/concat"); - graph << get_inception_node(data_path, "inception_4c", weights_layout, 128, std::make_tuple(128U, 256U), std::make_tuple(24U, 64U), 64U).set_name("inception_4c/concat"); - graph << get_inception_node(data_path, "inception_4d", weights_layout, 112, std::make_tuple(144U, 288U), std::make_tuple(32U, 64U), 64U).set_name("inception_4d/concat"); - graph << get_inception_node(data_path, "inception_4e", weights_layout, 256, std::make_tuple(160U, 320U), std::make_tuple(32U, 128U), 128U).set_name("inception_4e/concat"); - graph << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))).set_name("pool4/3x3_s2"); - graph << get_inception_node(data_path, "inception_5a", weights_layout, 256, std::make_tuple(160U, 320U), std::make_tuple(32U, 128U), 128U).set_name("inception_5a/concat"); - graph << get_inception_node(data_path, "inception_5b", weights_layout, 384, std::make_tuple(192U, 384U), std::make_tuple(48U, 128U), 128U).set_name("inception_5b/concat"); - graph << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 7, 
operation_layout, PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::CEIL))).set_name("pool5/7x7_s1") + 3U, 3U, 192U, + get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv2/conv2_3x3_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv2/conv2_3x3_b.npy"), + PadStrideInfo(1, 1, 1, 1)) + .set_name("conv2/3x3") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv2/relu_3x3") + << NormalizationLayer(NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f)) + .set_name("conv2/norm2") + << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, + PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))) + .set_name("pool2/3x3_s2"); + graph << get_inception_node(data_path, "inception_3a", weights_layout, 64, std::make_tuple(96U, 128U), + std::make_tuple(16U, 32U), 32U) + .set_name("inception_3a/concat"); + graph << get_inception_node(data_path, "inception_3b", weights_layout, 128, std::make_tuple(128U, 192U), + std::make_tuple(32U, 96U), 64U) + .set_name("inception_3b/concat"); + graph << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, + PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))) + .set_name("pool3/3x3_s2"); + graph << get_inception_node(data_path, "inception_4a", weights_layout, 192, std::make_tuple(96U, 208U), + std::make_tuple(16U, 48U), 64U) + .set_name("inception_4a/concat"); + graph << get_inception_node(data_path, "inception_4b", weights_layout, 160, std::make_tuple(112U, 224U), + std::make_tuple(24U, 64U), 64U) + .set_name("inception_4b/concat"); + graph << get_inception_node(data_path, "inception_4c", weights_layout, 128, std::make_tuple(128U, 256U), + std::make_tuple(24U, 64U), 64U) + .set_name("inception_4c/concat"); + graph << get_inception_node(data_path, "inception_4d", weights_layout, 112, std::make_tuple(144U, 288U), + std::make_tuple(32U, 64U), 64U) + .set_name("inception_4d/concat"); + graph << get_inception_node(data_path, "inception_4e", weights_layout, 256, std::make_tuple(160U, 320U), + std::make_tuple(32U, 128U), 128U) + .set_name("inception_4e/concat"); + graph << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, + PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))) + .set_name("pool4/3x3_s2"); + graph << get_inception_node(data_path, "inception_5a", weights_layout, 256, std::make_tuple(160U, 320U), + std::make_tuple(32U, 128U), 128U) + .set_name("inception_5a/concat"); + graph << get_inception_node(data_path, "inception_5b", weights_layout, 384, std::make_tuple(192U, 384U), + std::make_tuple(48U, 128U), 128U) + .set_name("inception_5b/concat"); + graph << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 7, operation_layout, + PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::CEIL))) + .set_name("pool5/7x7_s1") << FullyConnectedLayer( - 1000U, - get_weights_accessor(data_path, "/cnn_data/googlenet_model/loss3/loss3_classifier_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/googlenet_model/loss3/loss3_classifier_b.npy")) - .set_name("loss3/classifier") - << SoftmaxLayer().set_name("prob") - << OutputLayer(get_output_accessor(common_params, 5)); + 1000U, + get_weights_accessor(data_path, "/cnn_data/googlenet_model/loss3/loss3_classifier_w.npy", + weights_layout), + get_weights_accessor(data_path, "/cnn_data/googlenet_model/loss3/loss3_classifier_b.npy")) + .set_name("loss3/classifier") + << SoftmaxLayer().set_name("prob") << OutputLayer(get_output_accessor(common_params, 5)); // 
Finalize graph GraphConfig config; @@ -148,63 +184,63 @@ private: CommonGraphParams common_params; Stream graph; - ConcatLayer get_inception_node(const std::string &data_path, std::string &¶m_path, DataLayout weights_layout, - unsigned int a_filt, + ConcatLayer get_inception_node(const std::string &data_path, + std::string &¶m_path, + DataLayout weights_layout, + unsigned int a_filt, std::tuple b_filters, std::tuple c_filters, - unsigned int d_filt) + unsigned int d_filt) { std::string total_path = "/cnn_data/googlenet_model/" + param_path + "/" + param_path + "_"; SubStream i_a(graph); - i_a << ConvolutionLayer( - 1U, 1U, a_filt, - get_weights_accessor(data_path, total_path + "1x1_w.npy", weights_layout), - get_weights_accessor(data_path, total_path + "1x1_b.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/1x1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/relu_1x1"); + i_a << ConvolutionLayer(1U, 1U, a_filt, + get_weights_accessor(data_path, total_path + "1x1_w.npy", weights_layout), + get_weights_accessor(data_path, total_path + "1x1_b.npy"), PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/1x1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/relu_1x1"); SubStream i_b(graph); - i_b << ConvolutionLayer( - 1U, 1U, std::get<0>(b_filters), - get_weights_accessor(data_path, total_path + "3x3_reduce_w.npy", weights_layout), - get_weights_accessor(data_path, total_path + "3x3_reduce_b.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/3x3_reduce") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/relu_3x3_reduce") - << ConvolutionLayer( - 3U, 3U, std::get<1>(b_filters), - get_weights_accessor(data_path, total_path + "3x3_w.npy", weights_layout), - get_weights_accessor(data_path, total_path + "3x3_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name(param_path + "/3x3") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/relu_3x3"); + i_b << ConvolutionLayer(1U, 1U, std::get<0>(b_filters), + get_weights_accessor(data_path, total_path + "3x3_reduce_w.npy", weights_layout), + get_weights_accessor(data_path, total_path + "3x3_reduce_b.npy"), + PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/3x3_reduce") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/relu_3x3_reduce") + << ConvolutionLayer(3U, 3U, std::get<1>(b_filters), + get_weights_accessor(data_path, total_path + "3x3_w.npy", weights_layout), + get_weights_accessor(data_path, total_path + "3x3_b.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name(param_path + "/3x3") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/relu_3x3"); SubStream i_c(graph); - i_c << ConvolutionLayer( - 1U, 1U, std::get<0>(c_filters), - get_weights_accessor(data_path, total_path + "5x5_reduce_w.npy", weights_layout), - get_weights_accessor(data_path, total_path + "5x5_reduce_b.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/5x5_reduce") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/relu_5x5_reduce") - << ConvolutionLayer( - 5U, 5U, std::get<1>(c_filters), - get_weights_accessor(data_path, total_path + "5x5_w.npy", weights_layout), - get_weights_accessor(data_path, 
total_path + "5x5_b.npy"), - PadStrideInfo(1, 1, 2, 2)) - .set_name(param_path + "/5x5") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/relu_5x5"); + i_c << ConvolutionLayer(1U, 1U, std::get<0>(c_filters), + get_weights_accessor(data_path, total_path + "5x5_reduce_w.npy", weights_layout), + get_weights_accessor(data_path, total_path + "5x5_reduce_b.npy"), + PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/5x5_reduce") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/relu_5x5_reduce") + << ConvolutionLayer(5U, 5U, std::get<1>(c_filters), + get_weights_accessor(data_path, total_path + "5x5_w.npy", weights_layout), + get_weights_accessor(data_path, total_path + "5x5_b.npy"), PadStrideInfo(1, 1, 2, 2)) + .set_name(param_path + "/5x5") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/relu_5x5"); SubStream i_d(graph); - i_d << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL))).set_name(param_path + "/pool") + i_d << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, + PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL))) + .set_name(param_path + "/pool") << ConvolutionLayer( - 1U, 1U, d_filt, - get_weights_accessor(data_path, total_path + "pool_proj_w.npy", weights_layout), - get_weights_accessor(data_path, total_path + "pool_proj_b.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/pool_proj") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/relu_pool_proj"); + 1U, 1U, d_filt, get_weights_accessor(data_path, total_path + "pool_proj_w.npy", weights_layout), + get_weights_accessor(data_path, total_path + "pool_proj_b.npy"), PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/pool_proj") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/relu_pool_proj"); return ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d)); } diff --git a/examples/graph_inception_resnet_v1.cpp b/examples/graph_inception_resnet_v1.cpp index d789d7f6e7bd120069c8a41a62cea273d1c48d68..a54a0f78063b1b6873b8543f80c6066ed1e39c13 100644 --- a/examples/graph_inception_resnet_v1.cpp +++ b/examples/graph_inception_resnet_v1.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/graph.h" + #include "support/ToolchainSupport.h" #include "utils/CommonGraphOptions.h" #include "utils/GraphUtils.h" @@ -38,7 +39,12 @@ class InceptionResNetV1Example final : public Example { public: InceptionResNetV1Example() - : cmd_parser(), common_opts(cmd_parser), common_params(), model_input_width(nullptr), model_input_height(nullptr), graph(0, "InceptionResNetV1") + : cmd_parser(), + common_opts(cmd_parser), + common_params(), + model_input_width(nullptr), + model_input_height(nullptr), + graph(0, "InceptionResNetV1") { model_input_width = cmd_parser.add_option>("image-width", 512); model_input_height = cmd_parser.add_option>("image-height", 512); @@ -47,7 +53,7 @@ public: model_input_width->set_help("Input image width."); model_input_height->set_help("Input image height."); } - InceptionResNetV1Example(const InceptionResNetV1Example &) = delete; + InceptionResNetV1Example(const InceptionResNetV1Example &) = delete; InceptionResNetV1Example &operator=(const InceptionResNetV1Example &) = delete; ~InceptionResNetV1Example() override = default; bool do_setup(int argc, char **argv) override @@ -60,7 +66,7 @@ public: common_params = consume_common_graph_parameters(common_opts); // Return when help menu is requested - if(common_params.help) + if (common_params.help) { cmd_parser.print_help(argv[0]); return false; @@ -70,13 +76,14 @@ public: const unsigned int image_height = model_input_height->value(); // Set default layout if needed - if(!common_opts.data_layout->is_set() && common_params.target == Target::NEON) + if (!common_opts.data_layout->is_set() && common_params.target == Target::NEON) { common_params.data_layout = DataLayout::NCHW; } // Checks - ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), "QASYMM8 not supported for this graph"); + ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), + "QASYMM8 not supported for this graph"); // Print parameter values std::cout << common_params << std::endl; @@ -86,7 +93,7 @@ public: // Create model path std::string data_path = common_params.data_path; std::string model_path = "/cnn_data/inception_resnet_v1_model/"; - if(!data_path.empty()) + if (!data_path.empty()) { data_path += model_path; } @@ -96,95 +103,98 @@ public: // Create input descriptor const auto operation_layout = common_params.data_layout; - const TensorShape tensor_shape = permute_shape(TensorShape(image_width, image_height, 3U, common_params.batches), DataLayout::NCHW, operation_layout); - TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout); + const TensorShape tensor_shape = permute_shape( + TensorShape(image_width, image_height, 3U, common_params.batches), DataLayout::NCHW, operation_layout); + TensorDescriptor input_descriptor = + TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout); // Set weights trained layout const DataLayout weights_layout = DataLayout::NCHW; - graph << common_params.target - << common_params.fast_math_hint + graph << common_params.target << common_params.fast_math_hint << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), false)) // Conv2d_1a_3x3 - << ConvolutionLayer(3U, 3U, 32U, - get_weights_accessor(data_path, "Conv2d_1a_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(2, 2, 0, 0)) - .set_name("Conv2d_1a_3x3/convolution") + << ConvolutionLayer( + 3U, 3U, 32U, 
get_weights_accessor(data_path, "Conv2d_1a_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) + .set_name("Conv2d_1a_3x3/convolution") << BatchNormalizationLayer(get_weights_accessor(data_path, "Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, "Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), get_random_accessor(1.f, 1.f), get_weights_accessor(data_path, "Conv2d_1a_3x3_BatchNorm_beta.npy"), batch_norm_epsilon) - .set_name("Conv2d_1a_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_1a_3x3/Relu") + .set_name("Conv2d_1a_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Conv2d_1a_3x3/Relu") // Conv2d_2a_3x3 - << ConvolutionLayer(3U, 3U, 32U, - get_weights_accessor(data_path, "Conv2d_2a_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("Conv2d_2a_3x3/convolution") + << ConvolutionLayer( + 3U, 3U, 32U, get_weights_accessor(data_path, "Conv2d_2a_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("Conv2d_2a_3x3/convolution") << BatchNormalizationLayer(get_weights_accessor(data_path, "Conv2d_2a_3x3_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, "Conv2d_2a_3x3_BatchNorm_moving_variance.npy"), get_random_accessor(1.f, 1.f), get_weights_accessor(data_path, "Conv2d_2a_3x3_BatchNorm_beta.npy"), batch_norm_epsilon) - .set_name("Conv2d_2a_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_2a_3x3/Relu") + .set_name("Conv2d_2a_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Conv2d_2a_3x3/Relu") // Conv2d_2b_3x3 - << ConvolutionLayer(3U, 3U, 64U, - get_weights_accessor(data_path, "Conv2d_2b_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 1, 1)) - .set_name("Conv2d_2b_3x3/convolution") + << ConvolutionLayer( + 3U, 3U, 64U, get_weights_accessor(data_path, "Conv2d_2b_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) + .set_name("Conv2d_2b_3x3/convolution") << BatchNormalizationLayer(get_weights_accessor(data_path, "Conv2d_2b_3x3_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, "Conv2d_2b_3x3_BatchNorm_moving_variance.npy"), get_random_accessor(1.f, 1.f), get_weights_accessor(data_path, "Conv2d_2b_3x3_BatchNorm_beta.npy"), batch_norm_epsilon) - .set_name("Conv2d_2b_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_2b_3x3/Relu") + .set_name("Conv2d_2b_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Conv2d_2b_3x3/Relu") // MaxPool_3a_3x3 - << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), true)).set_name("MaxPool_3a_3x3/MaxPool") + << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, + PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), true)) + .set_name("MaxPool_3a_3x3/MaxPool") // Conv2d_3b_1x1 - << ConvolutionLayer(1U, 1U, 80U, - get_weights_accessor(data_path, "Conv2d_3b_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("Conv2d_3b_1x1/convolution") + << ConvolutionLayer( + 
1U, 1U, 80U, get_weights_accessor(data_path, "Conv2d_3b_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("Conv2d_3b_1x1/convolution") << BatchNormalizationLayer(get_weights_accessor(data_path, "Conv2d_3b_1x1_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, "Conv2d_3b_1x1_BatchNorm_moving_variance.npy"), get_random_accessor(1.f, 1.f), get_weights_accessor(data_path, "Conv2d_3b_1x1_BatchNorm_beta.npy"), batch_norm_epsilon) - .set_name("Conv2d_3b_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_3b_1x1/Relu") + .set_name("Conv2d_3b_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Conv2d_3b_1x1/Relu") // Conv2d_4a_3x3 - << ConvolutionLayer(3U, 3U, 192U, - get_weights_accessor(data_path, "Conv2d_4a_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("Conv2d_4a_3x3/convolution") + << ConvolutionLayer( + 3U, 3U, 192U, get_weights_accessor(data_path, "Conv2d_4a_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("Conv2d_4a_3x3/convolution") << BatchNormalizationLayer(get_weights_accessor(data_path, "Conv2d_4a_3x3_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, "Conv2d_4a_3x3_BatchNorm_moving_variance.npy"), get_random_accessor(1.f, 1.f), get_weights_accessor(data_path, "Conv2d_4a_3x3_BatchNorm_beta.npy"), batch_norm_epsilon) - .set_name("Conv2d_4a_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_4a_3x3/Relu") + .set_name("Conv2d_4a_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Conv2d_4a_3x3/Relu") // Conv2d_4b_3x3 - << ConvolutionLayer(3U, 3U, 256U, - get_weights_accessor(data_path, "Conv2d_4b_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(2, 2, 0, 0)) - .set_name("Conv2d_4a_3x3/convolution") + << ConvolutionLayer( + 3U, 3U, 256U, get_weights_accessor(data_path, "Conv2d_4b_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) + .set_name("Conv2d_4a_3x3/convolution") << BatchNormalizationLayer(get_weights_accessor(data_path, "Conv2d_4b_3x3_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, "Conv2d_4b_3x3_BatchNorm_moving_variance.npy"), get_random_accessor(1.f, 1.f), get_weights_accessor(data_path, "Conv2d_4b_3x3_BatchNorm_beta.npy"), batch_norm_epsilon) - .set_name("Conv2d_4b_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_4b_3x3/Relu"); + .set_name("Conv2d_4b_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Conv2d_4b_3x3/Relu"); // 5 x Inception-resnet-A block35_repeat(data_path, weights_layout, 5); @@ -202,11 +212,9 @@ public: // Logits tail graph << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, operation_layout)).set_name("Logits/AvgPool_1a_8x8") << FlattenLayer().set_name("Logits/Flatten") - << FullyConnectedLayer( - 128U, - get_weights_accessor(data_path, "Logits_Logits_weights.npy", weights_layout), - get_weights_accessor(data_path, "Logits_Logits_biases.npy")) - .set_name("Logits/Logits") + << FullyConnectedLayer(128U, get_weights_accessor(data_path, "Logits_Logits_weights.npy", weights_layout), + 
get_weights_accessor(data_path, "Logits_Logits_biases.npy")) + .set_name("Logits/Logits") << OutputLayer(std::make_unique(0)); // Finalize graph @@ -231,14 +239,14 @@ private: CommandLineParser cmd_parser; CommonGraphOptions common_opts; CommonGraphParams common_params; - SimpleOption *model_input_width{ nullptr }; - SimpleOption *model_input_height{ nullptr }; + SimpleOption *model_input_width{nullptr}; + SimpleOption *model_input_height{nullptr}; Stream graph; private: void block35_repeat(const std::string &data_path, DataLayout weights_layout, unsigned int num_blocks) { - for(unsigned int i = 0; i < num_blocks; ++i) + for (unsigned int i = 0; i < num_blocks; ++i) { std::stringstream unit_path_ss; unit_path_ss << "Repeat_block35_" << (i + 1) << "_"; @@ -254,102 +262,128 @@ private: // Branch 0 SubStream i_la(i_l); - i_la << ConvolutionLayer(1U, 1U, 32U, - get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name(unit_name + "Branch_0/Conv2d_1x1/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_beta.npy"), - batch_norm_epsilon) - .set_name(unit_name + "Branch_0/Conv2d_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_0/Conv2d_1x1/Relu"); + i_la << ConvolutionLayer( + 1U, 1U, 32U, + get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(unit_name + "Branch_0/Conv2d_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_beta.npy"), + batch_norm_epsilon) + .set_name(unit_name + "Branch_0/Conv2d_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Branch_0/Conv2d_1x1/Relu"); // Branch 1 SubStream i_lb(i_l); i_lb << ConvolutionLayer(1U, 1U, 32U, - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_weights.npy", + weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) - .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), - batch_norm_epsilon) - .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0a_1x1/Relu") + .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/convolution") + << BatchNormalizationLayer( + 
get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), + batch_norm_epsilon) + .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/Relu") << ConvolutionLayer(3U, 3U, 32U, - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_3x3_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_3x3_weights.npy", + weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) - .set_name(unit_name + "Branch_1/Conv2d_0b_3x3/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_beta.npy"), - batch_norm_epsilon) - .set_name(unit_name + "Branch_1/Conv2d_0b_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0b_3x3/Relu"); + .set_name(unit_name + "Branch_1/Conv2d_0b_3x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + unit_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_beta.npy"), + batch_norm_epsilon) + .set_name(unit_name + "Branch_1/Conv2d_0b_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Branch_1/Conv2d_0b_3x3/Relu"); // Branch 2 SubStream i_lc(i_l); i_lc << ConvolutionLayer(1U, 1U, 32U, - get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0a_1x1_weights.npy", + weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) - .set_name(unit_name + "Branch_2/Conv2d_0a_1x1/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"), - batch_norm_epsilon) - .set_name(unit_name + "Branch_2/Conv2d_0a_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_2/Conv2d_0a_1x1/Relu") + .set_name(unit_name + "Branch_2/Conv2d_0a_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + unit_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"), + batch_norm_epsilon) + 
.set_name(unit_name + "Branch_2/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Branch_2/Conv2d_0a_1x1/Relu") << ConvolutionLayer(3U, 3U, 32U, - get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0b_3x3_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0b_3x3_weights.npy", + weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) - .set_name(unit_name + "Branch_2/Conv2d_0b_3x3/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"), - batch_norm_epsilon) - .set_name(unit_name + "Branch_2/Conv2d_0b_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_2/Conv2d_0b_3x3/Relu") + .set_name(unit_name + "Branch_2/Conv2d_0b_3x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + unit_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"), + batch_norm_epsilon) + .set_name(unit_name + "Branch_2/Conv2d_0b_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Branch_2/Conv2d_0b_3x3/Relu") << ConvolutionLayer(3U, 3U, 32U, - get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0c_3x3_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0c_3x3_weights.npy", + weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) - .set_name(unit_name + "Branch_2/Conv2d_0c_3x3/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_beta.npy"), - batch_norm_epsilon) - .set_name(unit_name + "Branch_2/Conv2d_0c_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_2/Conv2d_0c_3x3/Relu"); + .set_name(unit_name + "Branch_2/Conv2d_0c_3x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + unit_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_beta.npy"), + batch_norm_epsilon) + .set_name(unit_name + "Branch_2/Conv2d_0c_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Branch_2/Conv2d_0c_3x3/Relu"); // Concatenate i_l << ConcatLayer(std::move(i_la), std::move(i_lb), std::move(i_lc)).set_name(unit_name + "concat") - << ConvolutionLayer(1U, 1U, 256U, - get_weights_accessor(data_path, unit_path + 
"Conv2d_1x1_weights.npy", weights_layout), - get_weights_accessor(data_path, unit_path + "Conv2d_1x1_biases.npy", weights_layout), - PadStrideInfo(1, 1, 0, 0)) - .set_name(unit_name + "Conv2d_1x1/convolution") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 0.17f, 0.f)).set_name(unit_name + "mul"); + << ConvolutionLayer( + 1U, 1U, 256U, + get_weights_accessor(data_path, unit_path + "Conv2d_1x1_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "Conv2d_1x1_biases.npy", weights_layout), + PadStrideInfo(1, 1, 0, 0)) + .set_name(unit_name + "Conv2d_1x1/convolution") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 0.17f, 0.f)) + .set_name(unit_name + "mul"); graph << EltwiseLayer(std::move(i_l), std::move(i_r), EltwiseOperation::Add).set_name(unit_name + "add") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Relu"); + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Relu"); } } void block17_repeat(const std::string &data_path, DataLayout weights_layout, unsigned int num_blocks) { - for(unsigned int i = 0; i < num_blocks; ++i) + for (unsigned int i = 0; i < num_blocks; ++i) { std::stringstream unit_path_ss; unit_path_ss << "Repeat_1_block17_" << (i + 1) << "_"; @@ -365,79 +399,101 @@ private: // Branch 0 SubStream i_la(i_l); - i_la << ConvolutionLayer(1U, 1U, 128U, - get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name(unit_name + "Branch_0/Conv2d_1x1/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_beta.npy"), - batch_norm_epsilon) - .set_name(unit_name + "Branch_0/Conv2d_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_0/Conv2d_1x1/Relu"); + i_la << ConvolutionLayer( + 1U, 1U, 128U, + get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(unit_name + "Branch_0/Conv2d_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_beta.npy"), + batch_norm_epsilon) + .set_name(unit_name + "Branch_0/Conv2d_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Branch_0/Conv2d_1x1/Relu"); // Branch 1 SubStream i_lb(i_l); i_lb << ConvolutionLayer(1U, 1U, 128U, - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_weights.npy", + weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) - .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/convolution") - << 
BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), - batch_norm_epsilon) - .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0a_1x1/Relu") + .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), + batch_norm_epsilon) + .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/Relu") << ConvolutionLayer(7U, 1U, 128U, - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x7_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x7_weights.npy", + weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 3, 0)) - .set_name(unit_name + "Branch_1/Conv2d_0b_1x7/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_beta.npy"), - batch_norm_epsilon) - .set_name(unit_name + "Branch_1/Conv2d_0b_1x7/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0b_1x7/Relu") + .set_name(unit_name + "Branch_1/Conv2d_0b_1x7/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + unit_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_beta.npy"), + batch_norm_epsilon) + .set_name(unit_name + "Branch_1/Conv2d_0b_1x7/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Branch_1/Conv2d_0b_1x7/Relu") << ConvolutionLayer(1U, 7U, 128U, - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_7x1_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_7x1_weights.npy", + weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 3)) - .set_name(unit_name + "Branch_1/Conv2d_0c_7x1/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_beta.npy"), - batch_norm_epsilon) - .set_name(unit_name + 
"Branch_1/Conv2d_0c_7x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0c_7x1/Relu"); + .set_name(unit_name + "Branch_1/Conv2d_0c_7x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + unit_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_beta.npy"), + batch_norm_epsilon) + .set_name(unit_name + "Branch_1/Conv2d_0c_7x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Branch_1/Conv2d_0c_7x1/Relu"); // Concatenate i_l << ConcatLayer(std::move(i_la), std::move(i_lb)).set_name(unit_name + "concat") - << ConvolutionLayer(1U, 1U, 896U, - get_weights_accessor(data_path, unit_path + "Conv2d_1x1_weights.npy", weights_layout), - get_weights_accessor(data_path, unit_path + "Conv2d_1x1_biases.npy", weights_layout), - PadStrideInfo(1, 1, 0, 0)) - .set_name(unit_name + "Conv2d_1x1/convolution") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 0.10f, 0.f)).set_name(unit_name + "mul"); + << ConvolutionLayer( + 1U, 1U, 896U, + get_weights_accessor(data_path, unit_path + "Conv2d_1x1_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "Conv2d_1x1_biases.npy", weights_layout), + PadStrideInfo(1, 1, 0, 0)) + .set_name(unit_name + "Conv2d_1x1/convolution") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 0.10f, 0.f)) + .set_name(unit_name + "mul"); graph << EltwiseLayer(std::move(i_l), std::move(i_r), EltwiseOperation::Add).set_name(unit_name + "add") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Relu"); + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Relu"); } } - void block8_repeat(const std::string &data_path, DataLayout weights_layout, unsigned int num_blocks, float scale, bool has_activation) + void block8_repeat(const std::string &data_path, + DataLayout weights_layout, + unsigned int num_blocks, + float scale, + bool has_activation) { - for(unsigned int i = 0; i < num_blocks; ++i) + for (unsigned int i = 0; i < num_blocks; ++i) { std::stringstream unit_path_ss; std::stringstream unit_name_ss; - if(num_blocks != 1) + if (num_blocks != 1) { unit_path_ss << "Repeat_2_block8_" << (i + 1) << "_"; unit_name_ss << "Repeat_2/block8_" << (i + 1) << "/"; @@ -457,79 +513,97 @@ private: // Branch 0 SubStream i_la(i_l); - i_la << ConvolutionLayer(1U, 1U, 192U, - get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name(unit_name + "Branch_0/Conv2d_1x1/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_beta.npy"), - batch_norm_epsilon) - .set_name(unit_name + "Branch_0/Conv2d_1x1/BatchNorm") - << 
ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_0/Conv2d_1x1/Relu"); + i_la << ConvolutionLayer( + 1U, 1U, 192U, + get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(unit_name + "Branch_0/Conv2d_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_beta.npy"), + batch_norm_epsilon) + .set_name(unit_name + "Branch_0/Conv2d_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Branch_0/Conv2d_1x1/Relu"); // Branch 1 SubStream i_lb(i_l); i_lb << ConvolutionLayer(1U, 1U, 192U, - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_weights.npy", + weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) - .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), - batch_norm_epsilon) - .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0a_1x1/Relu") + .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), + batch_norm_epsilon) + .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/Relu") << ConvolutionLayer(3U, 1U, 192U, - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x3_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x3_weights.npy", + weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 0)) - .set_name(unit_name + "Branch_1/Conv2d_0b_1x3/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_beta.npy"), - batch_norm_epsilon) - .set_name(unit_name + "Branch_1/Conv2d_0b_1x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0b_1x3/Relu") + .set_name(unit_name + 
"Branch_1/Conv2d_0b_1x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + unit_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_beta.npy"), + batch_norm_epsilon) + .set_name(unit_name + "Branch_1/Conv2d_0b_1x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Branch_1/Conv2d_0b_1x3/Relu") << ConvolutionLayer(1U, 3U, 192U, - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_3x1_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_3x1_weights.npy", + weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 1)) - .set_name(unit_name + "Branch_1/Conv2d_0c_3x1/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_beta.npy"), - batch_norm_epsilon) - .set_name(unit_name + "Branch_1/Conv2d_0c_3x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0c_3x1/Relu"); + .set_name(unit_name + "Branch_1/Conv2d_0c_3x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + unit_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_beta.npy"), + batch_norm_epsilon) + .set_name(unit_name + "Branch_1/Conv2d_0c_3x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Branch_1/Conv2d_0c_3x1/Relu"); // Concatenate i_l << ConcatLayer(std::move(i_la), std::move(i_lb)).set_name(unit_name + "concat") - << ConvolutionLayer(1U, 1U, 1792U, - get_weights_accessor(data_path, unit_path + "Conv2d_1x1_weights.npy", weights_layout), - get_weights_accessor(data_path, unit_path + "Conv2d_1x1_biases.npy", weights_layout), - PadStrideInfo(1, 1, 0, 0)) - .set_name(unit_name + "Conv2d_1x1/convolution"); + << ConvolutionLayer( + 1U, 1U, 1792U, + get_weights_accessor(data_path, unit_path + "Conv2d_1x1_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "Conv2d_1x1_biases.npy", weights_layout), + PadStrideInfo(1, 1, 0, 0)) + .set_name(unit_name + "Conv2d_1x1/convolution"); // Scale result - if(scale != 1.f) + if (scale != 1.f) { - i_l << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, scale, 0.f)).set_name(unit_name + "mul"); + i_l << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, scale, 0.f)) + .set_name(unit_name + "mul"); } // Residual add graph << EltwiseLayer(std::move(i_l), std::move(i_r), EltwiseOperation::Add).set_name(unit_name + "add"); // Apply activation if needed - if(has_activation) + if (has_activation) { - graph << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Relu"); + 
graph << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Relu"); } } } @@ -538,61 +612,71 @@ private: { // Branch 0 SubStream i_a(graph); - i_a << ConvolutionLayer(3U, 3U, 384U, - get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(2, 2, 0, 0)) - .set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"), - batch_norm_epsilon) - .set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/Relu"); + i_a << ConvolutionLayer( + 3U, 3U, 384U, + get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) + .set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"), + batch_norm_epsilon) + .set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/Relu"); // Branch 1 SubStream i_b(graph); - i_b << ConvolutionLayer(1U, 1U, 192U, - get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), - batch_norm_epsilon) - .set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/Relu") - << ConvolutionLayer(3U, 3U, 192U, - get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 1, 1)) - .set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_BatchNorm_beta.npy"), - batch_norm_epsilon) - .set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/Relu") - << ConvolutionLayer(3U, 3U, 256U, - 
get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(2, 2, 0, 0)) - .set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"), - batch_norm_epsilon) - .set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/Relu"); + i_b << ConvolutionLayer( + 1U, 1U, 192U, + get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), + batch_norm_epsilon) + .set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/Relu") + << ConvolutionLayer( + 3U, 3U, 192U, + get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) + .set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_BatchNorm_beta.npy"), + batch_norm_epsilon) + .set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/Relu") + << ConvolutionLayer( + 3U, 3U, 256U, + get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) + .set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"), + batch_norm_epsilon) + .set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/Relu"); // Branch 2 SubStream i_c(graph); - i_c << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, PadStrideInfo(2, 2, 0, 0), true)).set_name("Mixed_6a/Branch_2/MaxPool_1a_3x3"); + i_c << PoolingLayer( + PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, PadStrideInfo(2, 2, 0, 0), 
true)) + .set_name("Mixed_6a/Branch_2/MaxPool_1a_3x3"); // Concatenate graph << ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c)).set_name("Mixed_6a/concat"); @@ -602,103 +686,120 @@ private: { // Branch 0 SubStream i_a(graph); - i_a << ConvolutionLayer(1U, 1U, 256U, - get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("Mixed_7a/Branch_0/Conv2d_0a_1x1/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"), - batch_norm_epsilon) - .set_name("Mixed_7a/Branch_0/Conv2d_0a_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_0/Conv2d_0a_1x1/Relu") - << ConvolutionLayer(3U, 3U, 384U, - get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(2, 2, 0, 0)) - .set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"), - batch_norm_epsilon) - .set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/Relu"); + i_a << ConvolutionLayer( + 1U, 1U, 256U, + get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("Mixed_7a/Branch_0/Conv2d_0a_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"), + batch_norm_epsilon) + .set_name("Mixed_7a/Branch_0/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_7a/Branch_0/Conv2d_0a_1x1/Relu") + << ConvolutionLayer( + 3U, 3U, 384U, + get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) + .set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"), + batch_norm_epsilon) + .set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/Relu"); // Branch 1 SubStream i_b(graph); - i_b << 
ConvolutionLayer(1U, 1U, 256U, - get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), - batch_norm_epsilon) - .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/Relu") - << ConvolutionLayer(3U, 3U, 256U, - get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(2, 2, 0, 0)) - .set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"), - batch_norm_epsilon) - .set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/Relu"); + i_b << ConvolutionLayer( + 1U, 1U, 256U, + get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), + batch_norm_epsilon) + .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/Relu") + << ConvolutionLayer( + 3U, 3U, 256U, + get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) + .set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"), + batch_norm_epsilon) + .set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/Relu"); // Branch 2 SubStream i_c(graph); - i_c << ConvolutionLayer(1U, 1U, 256U, - get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("Mixed_7a/Branch_2/Conv2d_0a_1x1/convolution") - 
<< BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"), - batch_norm_epsilon) - .set_name("Mixed_7a/Branch_2/Conv2d_0a_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_2/Conv2d_0a_1x1/Relu") - << ConvolutionLayer(3U, 3U, 256U, - get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0b_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 1, 1)) - .set_name("Mixed_7a/Branch_2/Conv2d_0b_3x3/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"), - batch_norm_epsilon) - .set_name("Mixed_7a/Branch_2/Conv2d_0b_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_2/Conv2d_0b_3x3/Relu") - << ConvolutionLayer(3U, 3U, 256U, - get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_1a_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(2, 2, 0, 0)) - .set_name("Mixed_7a/Branch_2/Conv2d_1a_3x3/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_1a_3x3_BatchNorm_beta.npy"), - batch_norm_epsilon) - .set_name("Mixed_7a/Branch_2/Conv2d_1a_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_2/Conv2d_1a_3x3/Relu"); + i_c << ConvolutionLayer( + 1U, 1U, 256U, + get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("Mixed_7a/Branch_2/Conv2d_0a_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"), + batch_norm_epsilon) + .set_name("Mixed_7a/Branch_2/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_7a/Branch_2/Conv2d_0a_1x1/Relu") + << ConvolutionLayer( + 3U, 3U, 256U, + get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0b_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) + .set_name("Mixed_7a/Branch_2/Conv2d_0b_3x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, 
"Mixed_7a_Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"), + batch_norm_epsilon) + .set_name("Mixed_7a/Branch_2/Conv2d_0b_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_7a/Branch_2/Conv2d_0b_3x3/Relu") + << ConvolutionLayer( + 3U, 3U, 256U, + get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_1a_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) + .set_name("Mixed_7a/Branch_2/Conv2d_1a_3x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_1a_3x3_BatchNorm_beta.npy"), + batch_norm_epsilon) + .set_name("Mixed_7a/Branch_2/Conv2d_1a_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_7a/Branch_2/Conv2d_1a_3x3/Relu"); // Branch 3 SubStream i_d(graph); - i_d << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, PadStrideInfo(2, 2, 0, 0), true)).set_name("Mixed_7a/Branch_3/MaxPool_1a_3x3"); + i_d << PoolingLayer( + PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, PadStrideInfo(2, 2, 0, 0), true)) + .set_name("Mixed_7a/Branch_3/MaxPool_1a_3x3"); // Concatenate - graph << ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d)).set_name("Mixed_7a/concat"); + graph + << ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d)).set_name("Mixed_7a/concat"); } }; diff --git a/examples/graph_inception_resnet_v2.cpp b/examples/graph_inception_resnet_v2.cpp index 1d0c51e9ad2fe1fec75ceb89d735462177f23d3f..43e31ee14bae98c553dd8a9c359472c35acba973 100644 --- a/examples/graph_inception_resnet_v2.cpp +++ b/examples/graph_inception_resnet_v2.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/graph.h" + #include "support/ToolchainSupport.h" #include "utils/CommonGraphOptions.h" #include "utils/GraphUtils.h" @@ -35,8 +36,7 @@ using namespace arm_compute::graph_utils; class InceptionResNetV2Example final : public Example { public: - InceptionResNetV2Example() - : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "InceptionResNetV2") + InceptionResNetV2Example() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "InceptionResNetV2") { } bool do_setup(int argc, char **argv) override @@ -49,20 +49,21 @@ public: common_params = consume_common_graph_parameters(common_opts); // Return when help menu is requested - if(common_params.help) + if (common_params.help) { cmd_parser.print_help(argv[0]); return false; } // Set default layout if needed - if(!common_opts.data_layout->is_set() && common_params.target == Target::NEON) + if (!common_opts.data_layout->is_set() && common_params.target == Target::NEON) { common_params.data_layout = DataLayout::NCHW; } // Checks - ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), "QASYMM8 not supported for this graph"); + ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), + "QASYMM8 not supported for this graph"); // Print parameter values std::cout << common_params << std::endl; @@ -70,7 +71,7 @@ public: // Create model path std::string data_path = common_params.data_path; std::string model_path = "/cnn_data/inception_resnet_v2_model/"; - if(!data_path.empty()) + if (!data_path.empty()) { data_path += model_path; } @@ -80,84 +81,88 @@ public: // Create input descriptor const auto operation_layout = common_params.data_layout; - const TensorShape tensor_shape = permute_shape(TensorShape(299U, 299U, 3U, common_params.batches), DataLayout::NCHW, operation_layout); - TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout); + const TensorShape tensor_shape = + permute_shape(TensorShape(299U, 299U, 3U, common_params.batches), DataLayout::NCHW, operation_layout); + TensorDescriptor input_descriptor = + TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout); // Set weights trained layout const DataLayout weights_layout = DataLayout::NCHW; - graph << common_params.target - << common_params.fast_math_hint + graph << common_params.target << common_params.fast_math_hint << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), false)) // Conv2d_1a_3x3 - << ConvolutionLayer(3U, 3U, 32U, - get_weights_accessor(data_path, "Conv2d_1a_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(2, 2, 0, 0)) - .set_name("Conv2d_1a_3x3/convolution") + << ConvolutionLayer( + 3U, 3U, 32U, get_weights_accessor(data_path, "Conv2d_1a_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) + .set_name("Conv2d_1a_3x3/convolution") << BatchNormalizationLayer(get_weights_accessor(data_path, "Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, "Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), get_random_accessor(1.f, 1.f), get_weights_accessor(data_path, "Conv2d_1a_3x3_BatchNorm_beta.npy"), 0.0010000000474974513f) - .set_name("Conv2d_1a_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_1a_3x3/Relu") + .set_name("Conv2d_1a_3x3/BatchNorm") + << 
ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Conv2d_1a_3x3/Relu") // Conv2d_2a_3x3 - << ConvolutionLayer(3U, 3U, 32U, - get_weights_accessor(data_path, "Conv2d_2a_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("Conv2d_2a_3x3/convolution") + << ConvolutionLayer( + 3U, 3U, 32U, get_weights_accessor(data_path, "Conv2d_2a_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("Conv2d_2a_3x3/convolution") << BatchNormalizationLayer(get_weights_accessor(data_path, "Conv2d_2a_3x3_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, "Conv2d_2a_3x3_BatchNorm_moving_variance.npy"), get_random_accessor(1.f, 1.f), get_weights_accessor(data_path, "Conv2d_2a_3x3_BatchNorm_beta.npy"), 0.0010000000474974513f) - .set_name("Conv2d_2a_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_2a_3x3/Relu") + .set_name("Conv2d_2a_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Conv2d_2a_3x3/Relu") // Conv2d_2b_3x3 - << ConvolutionLayer(3U, 3U, 64U, - get_weights_accessor(data_path, "Conv2d_2b_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 1, 1)) - .set_name("Conv2d_2b_3x3/convolution") + << ConvolutionLayer( + 3U, 3U, 64U, get_weights_accessor(data_path, "Conv2d_2b_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) + .set_name("Conv2d_2b_3x3/convolution") << BatchNormalizationLayer(get_weights_accessor(data_path, "Conv2d_2b_3x3_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, "Conv2d_2b_3x3_BatchNorm_moving_variance.npy"), get_random_accessor(1.f, 1.f), get_weights_accessor(data_path, "Conv2d_2b_3x3_BatchNorm_beta.npy"), 0.0010000000474974513f) - .set_name("Conv2d_2b_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_2b_3x3/Relu") + .set_name("Conv2d_2b_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Conv2d_2b_3x3/Relu") // MaxPool_3a_3x3 - << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), true)).set_name("MaxPool_3a_3x3/MaxPool") + << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, + PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), true)) + .set_name("MaxPool_3a_3x3/MaxPool") // Conv2d_3b_1x1 - << ConvolutionLayer(1U, 1U, 80U, - get_weights_accessor(data_path, "Conv2d_3b_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("Conv2d_3b_1x1/convolution") + << ConvolutionLayer( + 1U, 1U, 80U, get_weights_accessor(data_path, "Conv2d_3b_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("Conv2d_3b_1x1/convolution") << BatchNormalizationLayer(get_weights_accessor(data_path, "Conv2d_3b_1x1_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, "Conv2d_3b_1x1_BatchNorm_moving_variance.npy"), get_random_accessor(1.f, 1.f), get_weights_accessor(data_path, "Conv2d_3b_1x1_BatchNorm_beta.npy"), 0.0010000000474974513f) - .set_name("Conv2d_3b_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_3b_1x1/Relu") + 
.set_name("Conv2d_3b_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Conv2d_3b_1x1/Relu") // Conv2d_4a_3x3 - << ConvolutionLayer(3U, 3U, 192U, - get_weights_accessor(data_path, "Conv2d_4a_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("Conv2d_4a_3x3/convolution") + << ConvolutionLayer( + 3U, 3U, 192U, get_weights_accessor(data_path, "Conv2d_4a_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("Conv2d_4a_3x3/convolution") << BatchNormalizationLayer(get_weights_accessor(data_path, "Conv2d_4a_3x3_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, "Conv2d_4a_3x3_BatchNorm_moving_variance.npy"), get_random_accessor(1.f, 1.f), get_weights_accessor(data_path, "Conv2d_4a_3x3_BatchNorm_beta.npy"), 0.0010000000474974513f) - .set_name("Conv2d_4a_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_4a_3x3/Relu") + .set_name("Conv2d_4a_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Conv2d_4a_3x3/Relu") // MaxPool_5a_3x3 - << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0), true)).set_name("MaxPool_5a_3x3/MaxPool"); + << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0), true)) + .set_name("MaxPool_5a_3x3/MaxPool"); block_mixed_5b(data_path, weights_layout); block35_repeat(data_path, weights_layout, 10); @@ -168,27 +173,25 @@ public: block8_repeat(data_path, weights_layout, 1, 1.f, false); // Conv2d_7b_1x1 - graph << ConvolutionLayer(1U, 1U, 1536U, - get_weights_accessor(data_path, "Conv2d_7b_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("Conv2d_7b_1x1/convolution") + graph << ConvolutionLayer( + 1U, 1U, 1536U, get_weights_accessor(data_path, "Conv2d_7b_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("Conv2d_7b_1x1/convolution") << BatchNormalizationLayer(get_weights_accessor(data_path, "Conv2d_7b_1x1_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, "Conv2d_7b_1x1_BatchNorm_moving_variance.npy"), get_random_accessor(1.f, 1.f), get_weights_accessor(data_path, "Conv2d_7b_1x1_BatchNorm_beta.npy"), 0.0010000000474974513f) - .set_name("Conv2d_7b_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_7b_1x1/Relu") + .set_name("Conv2d_7b_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Conv2d_7b_1x1/Relu") << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, operation_layout)).set_name("Logits/AvgPool_1a_8x8") << FlattenLayer().set_name("Logits/Flatten") - << FullyConnectedLayer( - 1001U, - get_weights_accessor(data_path, "Logits_Logits_weights.npy", weights_layout), - get_weights_accessor(data_path, "Logits_Logits_biases.npy")) - .set_name("Logits/Logits") - << SoftmaxLayer().set_name("Logits/Predictions") - << OutputLayer(get_output_accessor(common_params, 5)); + << FullyConnectedLayer(1001U, + get_weights_accessor(data_path, "Logits_Logits_weights.npy", weights_layout), + get_weights_accessor(data_path, "Logits_Logits_biases.npy")) + .set_name("Logits/Logits") + << SoftmaxLayer().set_name("Logits/Predictions") << 
OutputLayer(get_output_accessor(common_params, 5)); // Finalize graph GraphConfig config; @@ -219,164 +222,191 @@ private: { // Branch 0 SubStream i_a(graph); - i_a << ConvolutionLayer(1U, 1U, 96U, - get_weights_accessor(data_path, "Mixed_5b_Branch_0_Conv2d_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("Mixed_5b/Branch_0/Conv2d_1x1/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_5b_Branch_0_Conv2d_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "Mixed_5b_Branch_0_Conv2d_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, "Mixed_5b_Branch_0_Conv2d_1x1_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name("Mixed_5b/Branch_0/Conv2d_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_5b/Branch_0/Conv2d_1x1/Relu"); + i_a << ConvolutionLayer( + 1U, 1U, 96U, + get_weights_accessor(data_path, "Mixed_5b_Branch_0_Conv2d_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("Mixed_5b/Branch_0/Conv2d_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "Mixed_5b_Branch_0_Conv2d_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, "Mixed_5b_Branch_0_Conv2d_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, "Mixed_5b_Branch_0_Conv2d_1x1_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name("Mixed_5b/Branch_0/Conv2d_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_5b/Branch_0/Conv2d_1x1/Relu"); // Branch 1 SubStream i_b(graph); - i_b << ConvolutionLayer(1U, 1U, 48U, - get_weights_accessor(data_path, "Mixed_5b_Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("Mixed_5b/Branch_1/Conv2d_0a_1x1/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_5b_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "Mixed_5b_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, "Mixed_5b_Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name("Mixed_5b/Branch_1/Conv2d_0a_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_5b/Branch_1/Conv2d_0a_1x1/Relu") - << ConvolutionLayer(5U, 5U, 64U, - get_weights_accessor(data_path, "Mixed_5b_Branch_1_Conv2d_0b_5x5_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 2, 2)) - .set_name("Mixed_5b/Branch_1/Conv2d_0b_5x5/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_5b_Branch_1_Conv2d_0b_5x5_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "Mixed_5b_Branch_1_Conv2d_0b_5x5_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, "Mixed_5b_Branch_1_Conv2d_0b_5x5_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name("Mixed_5b/Branch_1/Conv2d_0b_5x5/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_5b/Branch_1/Conv2d_0b_5x5/Relu"); + i_b << ConvolutionLayer( + 1U, 1U, 48U, + get_weights_accessor(data_path, 
"Mixed_5b_Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("Mixed_5b/Branch_1/Conv2d_0a_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "Mixed_5b_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, "Mixed_5b_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, "Mixed_5b_Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name("Mixed_5b/Branch_1/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_5b/Branch_1/Conv2d_0a_1x1/Relu") + << ConvolutionLayer( + 5U, 5U, 64U, + get_weights_accessor(data_path, "Mixed_5b_Branch_1_Conv2d_0b_5x5_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 2, 2)) + .set_name("Mixed_5b/Branch_1/Conv2d_0b_5x5/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "Mixed_5b_Branch_1_Conv2d_0b_5x5_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, "Mixed_5b_Branch_1_Conv2d_0b_5x5_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, "Mixed_5b_Branch_1_Conv2d_0b_5x5_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name("Mixed_5b/Branch_1/Conv2d_0b_5x5/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_5b/Branch_1/Conv2d_0b_5x5/Relu"); // Branch 2 SubStream i_c(graph); - i_c << ConvolutionLayer(1U, 1U, 64U, - get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("Mixed_5b/Branch_2/Conv2d_0a_1x1/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name("Mixed_5b/Branch_2/Conv2d_0a_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_5b/Branch_2/Conv2d_0a_1x1/Relu") - << ConvolutionLayer(3U, 3U, 96U, - get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0b_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 1, 1)) - .set_name("Mixed_5b/Branch_2/Conv2d_0b_3x3/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name("Mixed_5b/Branch_2/Conv2d_0b_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_5b/Branch_2/Conv2d_0b_3x3/Relu") - << ConvolutionLayer(3U, 3U, 96U, - get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0c_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 1, 1)) - .set_name("Mixed_5b/Branch_2/Conv2d_0c_3x3/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, 
"Mixed_5b_Branch_2_Conv2d_0c_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0c_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0c_3x3_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name("Mixed_5b/Branch_2/Conv2d_0c_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_5b/Branch_2/Conv2d_0c_3x3/Relu"); + i_c << ConvolutionLayer( + 1U, 1U, 64U, + get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("Mixed_5b/Branch_2/Conv2d_0a_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name("Mixed_5b/Branch_2/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_5b/Branch_2/Conv2d_0a_1x1/Relu") + << ConvolutionLayer( + 3U, 3U, 96U, + get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0b_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) + .set_name("Mixed_5b/Branch_2/Conv2d_0b_3x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name("Mixed_5b/Branch_2/Conv2d_0b_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_5b/Branch_2/Conv2d_0b_3x3/Relu") + << ConvolutionLayer( + 3U, 3U, 96U, + get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0c_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) + .set_name("Mixed_5b/Branch_2/Conv2d_0c_3x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0c_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0c_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0c_3x3_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name("Mixed_5b/Branch_2/Conv2d_0c_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_5b/Branch_2/Conv2d_0c_3x3/Relu"); // Branch 3 SubStream i_d(graph); - i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), true)).set_name("Mixed_5b/Branch_3/AvgPool_0a_3x3") - << ConvolutionLayer(1U, 1U, 64U, - get_weights_accessor(data_path, "Mixed_5b_Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("Mixed_5b/Branch_3/Conv2d_0b_1x1/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, 
"Mixed_5b_Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "Mixed_5b_Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, "Mixed_5b_Branch_3_Conv2d_0b_1x1_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name("Mixed_5b/Branch_3/Conv2d_0b_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_5b/Branch_3/Conv2d_0b_1x1/Relu"); + i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout, + PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), true)) + .set_name("Mixed_5b/Branch_3/AvgPool_0a_3x3") + << ConvolutionLayer( + 1U, 1U, 64U, + get_weights_accessor(data_path, "Mixed_5b_Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("Mixed_5b/Branch_3/Conv2d_0b_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "Mixed_5b_Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, "Mixed_5b_Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, "Mixed_5b_Branch_3_Conv2d_0b_1x1_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name("Mixed_5b/Branch_3/Conv2d_0b_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_5b/Branch_3/Conv2d_0b_1x1/Relu"); // Concatenate - graph << ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d)).set_name("Mixed_5a/concat"); + graph + << ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d)).set_name("Mixed_5a/concat"); } void block_mixed_6a(const std::string &data_path, DataLayout weights_layout) { // Branch 0 SubStream i_a(graph); - i_a << ConvolutionLayer(3U, 3U, 384U, - get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(2, 2, 0, 0)) - .set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/Relu"); + i_a << ConvolutionLayer( + 3U, 3U, 384U, + get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) + .set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/Relu"); // Branch 1 
SubStream i_b(graph); - i_b << ConvolutionLayer(1U, 1U, 256U, - get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/Relu") - << ConvolutionLayer(3U, 3U, 256U, - get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 1, 1)) - .set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/Relu") - << ConvolutionLayer(3U, 3U, 384U, - get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(2, 2, 0, 0)) - .set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/Relu"); + i_b << ConvolutionLayer( + 1U, 1U, 256U, + get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/Relu") + << ConvolutionLayer( + 3U, 3U, 256U, + get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) + .set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/convolution") + 
<< BatchNormalizationLayer( + get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/Relu") + << ConvolutionLayer( + 3U, 3U, 384U, + get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) + .set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/Relu"); // Branch 2 SubStream i_c(graph); - i_c << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, PadStrideInfo(2, 2, 0, 0), true)).set_name("Mixed_6a/Branch_2/MaxPool_1a_3x3"); + i_c << PoolingLayer( + PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, PadStrideInfo(2, 2, 0, 0), true)) + .set_name("Mixed_6a/Branch_2/MaxPool_1a_3x3"); // Concatenate graph << ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c)).set_name("Mixed_6a/concat"); @@ -386,108 +416,125 @@ private: { // Branch 0 SubStream i_a(graph); - i_a << ConvolutionLayer(1U, 1U, 256U, - get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("Mixed_7a/Branch_0/Conv2d_0a_1x1/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name("Mixed_7a/Branch_0/Conv2d_0a_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_0/Conv2d_0a_1x1/Relu") - << ConvolutionLayer(3U, 3U, 384U, - get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(2, 2, 0, 0)) - .set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/BatchNorm") - << 
ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/Relu"); + i_a << ConvolutionLayer( + 1U, 1U, 256U, + get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("Mixed_7a/Branch_0/Conv2d_0a_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name("Mixed_7a/Branch_0/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_7a/Branch_0/Conv2d_0a_1x1/Relu") + << ConvolutionLayer( + 3U, 3U, 384U, + get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) + .set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/Relu"); // Branch 1 SubStream i_b(graph); - i_b << ConvolutionLayer(1U, 1U, 256U, - get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/Relu") - << ConvolutionLayer(3U, 3U, 288U, - get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(2, 2, 0, 0)) - .set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/Relu"); + i_b << ConvolutionLayer( + 1U, 1U, 256U, + get_weights_accessor(data_path, 
"Mixed_7a_Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/Relu") + << ConvolutionLayer( + 3U, 3U, 288U, + get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) + .set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/Relu"); // Branch 2 SubStream i_c(graph); - i_c << ConvolutionLayer(1U, 1U, 256U, - get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("Mixed_7a/Branch_2/Conv2d_0a_1x1/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name("Mixed_7a/Branch_2/Conv2d_0a_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_2/Conv2d_0a_1x1/Relu") - << ConvolutionLayer(3U, 3U, 288U, - get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0b_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 1, 1)) - .set_name("Mixed_7a/Branch_2/Conv2d_0b_3x3/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name("Mixed_7a/Branch_2/Conv2d_0b_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_2/Conv2d_0b_3x3/Relu") - << ConvolutionLayer(3U, 3U, 320U, - get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_1a_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(2, 2, 0, 0)) - .set_name("Mixed_7a/Branch_2/Conv2d_1a_3x3/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, 
"Mixed_7a_Branch_2_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_1a_3x3_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name("Mixed_7a/Branch_2/Conv2d_1a_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_2/Conv2d_1a_3x3/Relu"); + i_c << ConvolutionLayer( + 1U, 1U, 256U, + get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("Mixed_7a/Branch_2/Conv2d_0a_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name("Mixed_7a/Branch_2/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_7a/Branch_2/Conv2d_0a_1x1/Relu") + << ConvolutionLayer( + 3U, 3U, 288U, + get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0b_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) + .set_name("Mixed_7a/Branch_2/Conv2d_0b_3x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name("Mixed_7a/Branch_2/Conv2d_0b_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_7a/Branch_2/Conv2d_0b_3x3/Relu") + << ConvolutionLayer( + 3U, 3U, 320U, + get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_1a_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) + .set_name("Mixed_7a/Branch_2/Conv2d_1a_3x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_1a_3x3_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name("Mixed_7a/Branch_2/Conv2d_1a_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_7a/Branch_2/Conv2d_1a_3x3/Relu"); // Branch 3 SubStream i_d(graph); - i_d << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), true)).set_name("Mixed_7a/Branch_3/MaxPool_1a_3x3"); + i_d << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, + PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), true)) + .set_name("Mixed_7a/Branch_3/MaxPool_1a_3x3"); // Concatenate - graph << ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d)).set_name("Mixed_7a/concat"); + graph + << 
ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d)).set_name("Mixed_7a/concat"); } void block35_repeat(const std::string &data_path, DataLayout weights_layout, unsigned int num_blocks) { - for(unsigned int i = 0; i < num_blocks; ++i) + for (unsigned int i = 0; i < num_blocks; ++i) { std::stringstream unit_path_ss; unit_path_ss << "Repeat_block35_" << (i + 1) << "_"; @@ -503,102 +550,128 @@ private: // Branch 0 SubStream i_la(i_l); - i_la << ConvolutionLayer(1U, 1U, 32U, - get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name(unit_name + "Branch_0/Conv2d_1x1/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name(unit_name + "Branch_0/Conv2d_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_0/Conv2d_1x1/Relu"); + i_la << ConvolutionLayer( + 1U, 1U, 32U, + get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(unit_name + "Branch_0/Conv2d_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name(unit_name + "Branch_0/Conv2d_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Branch_0/Conv2d_1x1/Relu"); // Branch 1 SubStream i_lb(i_l); i_lb << ConvolutionLayer(1U, 1U, 32U, - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_weights.npy", + weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) - .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0a_1x1/Relu") + .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name(unit_name + 
"Branch_1/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/Relu") << ConvolutionLayer(3U, 3U, 32U, - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_3x3_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_3x3_weights.npy", + weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) - .set_name(unit_name + "Branch_1/Conv2d_0b_3x3/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name(unit_name + "Branch_1/Conv2d_0b_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0b_3x3/Relu"); + .set_name(unit_name + "Branch_1/Conv2d_0b_3x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + unit_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name(unit_name + "Branch_1/Conv2d_0b_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Branch_1/Conv2d_0b_3x3/Relu"); // Branch 2 SubStream i_lc(i_l); i_lc << ConvolutionLayer(1U, 1U, 32U, - get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0a_1x1_weights.npy", + weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) - .set_name(unit_name + "Branch_2/Conv2d_0a_1x1/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name(unit_name + "Branch_2/Conv2d_0a_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_2/Conv2d_0a_1x1/Relu") + .set_name(unit_name + "Branch_2/Conv2d_0a_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + unit_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name(unit_name + "Branch_2/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Branch_2/Conv2d_0a_1x1/Relu") << ConvolutionLayer(3U, 3U, 48U, - get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0b_3x3_weights.npy", weights_layout), + 
get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0b_3x3_weights.npy", + weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) - .set_name(unit_name + "Branch_2/Conv2d_0b_3x3/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name(unit_name + "Branch_2/Conv2d_0b_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_2/Conv2d_0b_3x3/Relu") + .set_name(unit_name + "Branch_2/Conv2d_0b_3x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + unit_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name(unit_name + "Branch_2/Conv2d_0b_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Branch_2/Conv2d_0b_3x3/Relu") << ConvolutionLayer(3U, 3U, 64U, - get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0c_3x3_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0c_3x3_weights.npy", + weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) - .set_name(unit_name + "Branch_2/Conv2d_0c_3x3/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name(unit_name + "Branch_2/Conv2d_0c_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_2/Conv2d_0c_3x3/Relu"); + .set_name(unit_name + "Branch_2/Conv2d_0c_3x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + unit_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name(unit_name + "Branch_2/Conv2d_0c_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Branch_2/Conv2d_0c_3x3/Relu"); // Concatenate i_l << ConcatLayer(std::move(i_la), std::move(i_lb), std::move(i_lc)).set_name(unit_name + "concat") - << ConvolutionLayer(1U, 1U, 320U, - get_weights_accessor(data_path, unit_path + "Conv2d_1x1_weights.npy", weights_layout), - get_weights_accessor(data_path, unit_path + "Conv2d_1x1_biases.npy", weights_layout), - PadStrideInfo(1, 1, 0, 0)) - .set_name(unit_name + "Conv2d_1x1/convolution") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 0.17f, 
0.f)).set_name(unit_name + "mul"); + << ConvolutionLayer( + 1U, 1U, 320U, + get_weights_accessor(data_path, unit_path + "Conv2d_1x1_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "Conv2d_1x1_biases.npy", weights_layout), + PadStrideInfo(1, 1, 0, 0)) + .set_name(unit_name + "Conv2d_1x1/convolution") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 0.17f, 0.f)) + .set_name(unit_name + "mul"); graph << EltwiseLayer(std::move(i_l), std::move(i_r), EltwiseOperation::Add).set_name(unit_name + "add") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Relu"); + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Relu"); } } void block17_repeat(const std::string &data_path, DataLayout weights_layout, unsigned int num_blocks) { - for(unsigned int i = 0; i < num_blocks; ++i) + for (unsigned int i = 0; i < num_blocks; ++i) { std::stringstream unit_path_ss; unit_path_ss << "Repeat_1_block17_" << (i + 1) << "_"; @@ -614,79 +687,101 @@ private: // Branch 0 SubStream i_la(i_l); - i_la << ConvolutionLayer(1U, 1U, 192U, - get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name(unit_name + "Branch_0/Conv2d_1x1/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name(unit_name + "Branch_0/Conv2d_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_0/Conv2d_1x1/Relu"); + i_la << ConvolutionLayer( + 1U, 1U, 192U, + get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(unit_name + "Branch_0/Conv2d_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name(unit_name + "Branch_0/Conv2d_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Branch_0/Conv2d_1x1/Relu"); // Branch 1 SubStream i_lb(i_l); i_lb << ConvolutionLayer(1U, 1U, 128U, - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_weights.npy", + weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) - .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, unit_path + 
"Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0a_1x1/Relu") + .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/Relu") << ConvolutionLayer(7U, 1U, 160U, - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x7_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x7_weights.npy", + weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 3, 0)) - .set_name(unit_name + "Branch_1/Conv2d_0b_1x7/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name(unit_name + "Branch_1/Conv2d_0b_1x7/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0b_1x7/Relu") + .set_name(unit_name + "Branch_1/Conv2d_0b_1x7/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + unit_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name(unit_name + "Branch_1/Conv2d_0b_1x7/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Branch_1/Conv2d_0b_1x7/Relu") << ConvolutionLayer(1U, 7U, 192U, - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_7x1_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_7x1_weights.npy", + weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 3)) - .set_name(unit_name + "Branch_1/Conv2d_0c_7x1/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name(unit_name + "Branch_1/Conv2d_0c_7x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0c_7x1/Relu"); + .set_name(unit_name + "Branch_1/Conv2d_0c_7x1/convolution") + << BatchNormalizationLayer( + 
get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + unit_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name(unit_name + "Branch_1/Conv2d_0c_7x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Branch_1/Conv2d_0c_7x1/Relu"); // Concatenate i_l << ConcatLayer(std::move(i_la), std::move(i_lb)).set_name(unit_name + "concat") - << ConvolutionLayer(1U, 1U, 1088U, - get_weights_accessor(data_path, unit_path + "Conv2d_1x1_weights.npy", weights_layout), - get_weights_accessor(data_path, unit_path + "Conv2d_1x1_biases.npy", weights_layout), - PadStrideInfo(1, 1, 0, 0)) - .set_name(unit_name + "Conv2d_1x1/convolution") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 0.10f, 0.f)).set_name(unit_name + "mul"); + << ConvolutionLayer( + 1U, 1U, 1088U, + get_weights_accessor(data_path, unit_path + "Conv2d_1x1_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "Conv2d_1x1_biases.npy", weights_layout), + PadStrideInfo(1, 1, 0, 0)) + .set_name(unit_name + "Conv2d_1x1/convolution") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 0.10f, 0.f)) + .set_name(unit_name + "mul"); graph << EltwiseLayer(std::move(i_l), std::move(i_r), EltwiseOperation::Add).set_name(unit_name + "add") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Relu"); + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Relu"); } } - void block8_repeat(const std::string &data_path, DataLayout weights_layout, unsigned int num_blocks, float scale, bool has_activation) + void block8_repeat(const std::string &data_path, + DataLayout weights_layout, + unsigned int num_blocks, + float scale, + bool has_activation) { - for(unsigned int i = 0; i < num_blocks; ++i) + for (unsigned int i = 0; i < num_blocks; ++i) { std::stringstream unit_path_ss; std::stringstream unit_name_ss; - if(num_blocks != 1) + if (num_blocks != 1) { unit_path_ss << "Repeat_2_block8_" << (i + 1) << "_"; unit_name_ss << "Repeat_2/block8_" << (i + 1) << "/"; @@ -706,79 +801,97 @@ private: // Branch 0 SubStream i_la(i_l); - i_la << ConvolutionLayer(1U, 1U, 192U, - get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name(unit_name + "Branch_0/Conv2d_1x1/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name(unit_name + "Branch_0/Conv2d_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_0/Conv2d_1x1/Relu"); + i_la << ConvolutionLayer( + 1U, 1U, 192U, + get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + 
.set_name(unit_name + "Branch_0/Conv2d_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name(unit_name + "Branch_0/Conv2d_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Branch_0/Conv2d_1x1/Relu"); // Branch 1 SubStream i_lb(i_l); i_lb << ConvolutionLayer(1U, 1U, 192U, - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_weights.npy", + weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) - .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0a_1x1/Relu") + .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/Relu") << ConvolutionLayer(3U, 1U, 224U, - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x3_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x3_weights.npy", + weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 0)) - .set_name(unit_name + "Branch_1/Conv2d_0b_1x3/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name(unit_name + "Branch_1/Conv2d_0b_1x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0b_1x3/Relu") + .set_name(unit_name + "Branch_1/Conv2d_0b_1x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + unit_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, 
unit_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name(unit_name + "Branch_1/Conv2d_0b_1x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Branch_1/Conv2d_0b_1x3/Relu") << ConvolutionLayer(1U, 3U, 256U, - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_3x1_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_3x1_weights.npy", + weights_layout), std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 1)) - .set_name(unit_name + "Branch_1/Conv2d_0c_3x1/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name(unit_name + "Branch_1/Conv2d_0c_3x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0c_3x1/Relu"); + .set_name(unit_name + "Branch_1/Conv2d_0c_3x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + unit_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name(unit_name + "Branch_1/Conv2d_0c_3x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Branch_1/Conv2d_0c_3x1/Relu"); // Concatenate i_l << ConcatLayer(std::move(i_la), std::move(i_lb)).set_name(unit_name + "concat") - << ConvolutionLayer(1U, 1U, 2080U, - get_weights_accessor(data_path, unit_path + "Conv2d_1x1_weights.npy", weights_layout), - get_weights_accessor(data_path, unit_path + "Conv2d_1x1_biases.npy", weights_layout), - PadStrideInfo(1, 1, 0, 0)) - .set_name(unit_name + "Conv2d_1x1/convolution"); + << ConvolutionLayer( + 1U, 1U, 2080U, + get_weights_accessor(data_path, unit_path + "Conv2d_1x1_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "Conv2d_1x1_biases.npy", weights_layout), + PadStrideInfo(1, 1, 0, 0)) + .set_name(unit_name + "Conv2d_1x1/convolution"); // Scale result - if(scale != 1.f) + if (scale != 1.f) { - i_l << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, scale, 0.f)).set_name(unit_name + "mul"); + i_l << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, scale, 0.f)) + .set_name(unit_name + "mul"); } // Residual add graph << EltwiseLayer(std::move(i_l), std::move(i_r), EltwiseOperation::Add).set_name(unit_name + "add"); // Apply activation if needed - if(has_activation) + if (has_activation) { - graph << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Relu"); + graph << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Relu"); } } } diff --git a/examples/graph_inception_v3.cpp b/examples/graph_inception_v3.cpp index 160e7f04f4b8f896d0267bd656fbe8b0fdfa4d5e..75e03fb6b3479033992a6ee6f3c0b4a8d4545829 100644 --- 
a/examples/graph_inception_v3.cpp +++ b/examples/graph_inception_v3.cpp @@ -21,9 +21,10 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/graph.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/graph.h" + #include "support/ToolchainSupport.h" #include "utils/CommonGraphOptions.h" #include "utils/GraphUtils.h" @@ -37,8 +38,7 @@ using namespace arm_compute::graph_utils; class InceptionV3Example : public Example { public: - InceptionV3Example() - : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "InceptionV3") + InceptionV3Example() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "InceptionV3") { } bool do_setup(int argc, char **argv) override @@ -51,7 +51,7 @@ public: common_params = consume_common_graph_parameters(common_opts); // Return when help menu is requested - if(common_params.help) + if (common_params.help) { cmd_parser.print_help(argv[0]); return false; @@ -68,134 +68,163 @@ public: // Create input descriptor const auto operation_layout = common_params.data_layout; - const TensorShape tensor_shape = permute_shape(TensorShape(299U, 299U, 3U, common_params.batches), DataLayout::NCHW, operation_layout); - TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout); + const TensorShape tensor_shape = + permute_shape(TensorShape(299U, 299U, 3U, common_params.batches), DataLayout::NCHW, operation_layout); + TensorDescriptor input_descriptor = + TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout); // Set weights trained layout const DataLayout weights_layout = DataLayout::NCHW; - graph << common_params.target - << common_params.fast_math_hint - << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), false)) - << ConvolutionLayer(3U, 3U, 32U, - get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_1a_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) - .set_name("Conv2d_1a_3x3/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, - "/cnn_data/inceptionv3_model/Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, - "/cnn_data/inceptionv3_model/Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), - nullptr, get_weights_accessor(data_path, - "/cnn_data/inceptionv3_model/Conv2d_1a_3x3_BatchNorm_beta.npy"), - 0.001f) - .set_name("Conv2d_1a_3x3/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_1a_3x3/Relu") - << ConvolutionLayer(3U, 3U, 32U, - get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_2a_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) - .set_name("Conv2d_2a_3x3/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, - "/cnn_data/inceptionv3_model/Conv2d_2a_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, - "/cnn_data/inceptionv3_model/Conv2d_2a_3x3_BatchNorm_moving_variance.npy"), - nullptr, get_weights_accessor(data_path, - "/cnn_data/inceptionv3_model/Conv2d_2a_3x3_BatchNorm_beta.npy"), - 0.001f) - .set_name("Conv2d_2a_3x3/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_2a_3x3/Relu") - - << ConvolutionLayer(3U, 3U, 64U, - get_weights_accessor(data_path, 
"/cnn_data/inceptionv3_model/Conv2d_2b_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) - .set_name("Conv2d_2b_3x3/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, - "/cnn_data/inceptionv3_model/Conv2d_2b_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, - "/cnn_data/inceptionv3_model/Conv2d_2b_3x3_BatchNorm_moving_variance.npy"), - nullptr, get_weights_accessor(data_path, - "/cnn_data/inceptionv3_model/Conv2d_2b_3x3_BatchNorm_beta.npy"), - 0.001f) - .set_name("Conv2d_2b_3x3/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_2b_3x3/Relu") - - << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))).set_name("MaxPool_3a_3x3/MaxPool") - - << ConvolutionLayer(1U, 1U, 80U, - get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_3b_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) - .set_name("Conv2d_3b_1x1/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, - "/cnn_data/inceptionv3_model/Conv2d_3b_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, - "/cnn_data/inceptionv3_model/Conv2d_3b_1x1_BatchNorm_moving_variance.npy"), - nullptr, get_weights_accessor(data_path, - "/cnn_data/inceptionv3_model/Conv2d_3b_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name("Conv2d_3b_1x1/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_3b_1x1/Relu") - - << ConvolutionLayer(3U, 3U, 192U, - get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_4a_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) - .set_name("Conv2d_4a_3x3/convolution") - << BatchNormalizationLayer(get_weights_accessor(data_path, - "/cnn_data/inceptionv3_model/Conv2d_4a_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, - "/cnn_data/inceptionv3_model/Conv2d_4a_3x3_BatchNorm_moving_variance.npy"), - nullptr, get_weights_accessor(data_path, - "/cnn_data/inceptionv3_model/Conv2d_4a_3x3_BatchNorm_beta.npy"), - 0.001f) - .set_name("Conv2d_4a_3x3/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_4a_3x3/Relu") - - << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))).set_name("MaxPool_5a_3x3/MaxPool"); - - graph << get_inception_node_A(data_path, "Mixed_5b", weights_layout, 64U, std::make_tuple(48U, 64U), std::make_tuple(64U, 96U, 96U), - 32U) - .set_name("Mixed_5b/concat"); - graph << get_inception_node_A(data_path, "Mixed_5c", weights_layout, 64U, std::make_tuple(48U, 64U), std::make_tuple(64U, 96U, 96U), - 64U, true) - .set_name("Mixed_5c/concat"); - graph << get_inception_node_A(data_path, "Mixed_5d", weights_layout, 64U, std::make_tuple(48U, 64U), std::make_tuple(64U, 96U, 96U), - 64U) - .set_name("Mixed_5d/concat"); - - graph << get_inception_node_B(data_path, "Mixed_6a", weights_layout, 384U, std::make_tuple(64U, 96U, 96U)).set_name("Mixed_6a/concat"); + graph + << common_params.target << common_params.fast_math_hint + << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), false)) + << ConvolutionLayer(3U, 3U, 32U, + get_weights_accessor(data_path, 
"/cnn_data/inceptionv3_model/Conv2d_1a_3x3_weights.npy", + weights_layout), + std::unique_ptr(nullptr), + PadStrideInfo(2, 2, 0, 0)) + .set_name("Conv2d_1a_3x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, + "/cnn_data/inceptionv3_model/Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + "/cnn_data/inceptionv3_model/Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), + nullptr, + get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_1a_3x3_BatchNorm_beta.npy"), + 0.001f) + .set_name("Conv2d_1a_3x3/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Conv2d_1a_3x3/Relu") + << ConvolutionLayer(3U, 3U, 32U, + get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_2a_3x3_weights.npy", + weights_layout), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 0, 0)) + .set_name("Conv2d_2a_3x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, + "/cnn_data/inceptionv3_model/Conv2d_2a_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + "/cnn_data/inceptionv3_model/Conv2d_2a_3x3_BatchNorm_moving_variance.npy"), + nullptr, + get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_2a_3x3_BatchNorm_beta.npy"), + 0.001f) + .set_name("Conv2d_2a_3x3/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Conv2d_2a_3x3/Relu") + + << ConvolutionLayer(3U, 3U, 64U, + get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_2b_3x3_weights.npy", + weights_layout), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 1, 1)) + .set_name("Conv2d_2b_3x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, + "/cnn_data/inceptionv3_model/Conv2d_2b_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + "/cnn_data/inceptionv3_model/Conv2d_2b_3x3_BatchNorm_moving_variance.npy"), + nullptr, + get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_2b_3x3_BatchNorm_beta.npy"), + 0.001f) + .set_name("Conv2d_2b_3x3/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Conv2d_2b_3x3/Relu") + + << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, + PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))) + .set_name("MaxPool_3a_3x3/MaxPool") + + << ConvolutionLayer(1U, 1U, 80U, + get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_3b_1x1_weights.npy", + weights_layout), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 0, 0)) + .set_name("Conv2d_3b_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, + "/cnn_data/inceptionv3_model/Conv2d_3b_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + "/cnn_data/inceptionv3_model/Conv2d_3b_1x1_BatchNorm_moving_variance.npy"), + nullptr, + get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_3b_1x1_BatchNorm_beta.npy"), + 0.001f) + .set_name("Conv2d_3b_1x1/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Conv2d_3b_1x1/Relu") + + << ConvolutionLayer(3U, 3U, 192U, + get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_4a_3x3_weights.npy", + weights_layout), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 0, 0)) + .set_name("Conv2d_4a_3x3/convolution") + << BatchNormalizationLayer( + 
get_weights_accessor(data_path, + "/cnn_data/inceptionv3_model/Conv2d_4a_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + "/cnn_data/inceptionv3_model/Conv2d_4a_3x3_BatchNorm_moving_variance.npy"), + nullptr, + get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_4a_3x3_BatchNorm_beta.npy"), + 0.001f) + .set_name("Conv2d_4a_3x3/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Conv2d_4a_3x3/Relu") + + << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, + PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))) + .set_name("MaxPool_5a_3x3/MaxPool"); + + graph << get_inception_node_A(data_path, "Mixed_5b", weights_layout, 64U, std::make_tuple(48U, 64U), + std::make_tuple(64U, 96U, 96U), 32U) + .set_name("Mixed_5b/concat"); + graph << get_inception_node_A(data_path, "Mixed_5c", weights_layout, 64U, std::make_tuple(48U, 64U), + std::make_tuple(64U, 96U, 96U), 64U, true) + .set_name("Mixed_5c/concat"); + graph << get_inception_node_A(data_path, "Mixed_5d", weights_layout, 64U, std::make_tuple(48U, 64U), + std::make_tuple(64U, 96U, 96U), 64U) + .set_name("Mixed_5d/concat"); + + graph << get_inception_node_B(data_path, "Mixed_6a", weights_layout, 384U, std::make_tuple(64U, 96U, 96U)) + .set_name("Mixed_6a/concat"); graph << get_inception_node_C(data_path, "Mixed_6b", weights_layout, 192U, std::make_tuple(128U, 128U, 192U), std::make_tuple(128U, 128U, 128U, 128U, 192U), 192U) - .set_name("Mixed_6b/concat"); + .set_name("Mixed_6b/concat"); graph << get_inception_node_C(data_path, "Mixed_6c", weights_layout, 192U, std::make_tuple(160U, 160U, 192U), std::make_tuple(160U, 160U, 160U, 160U, 192U), 192U) - .set_name("Mixed_6c/concat"); + .set_name("Mixed_6c/concat"); graph << get_inception_node_C(data_path, "Mixed_6d", weights_layout, 192U, std::make_tuple(160U, 160U, 192U), std::make_tuple(160U, 160U, 160U, 160U, 192U), 192U) - .set_name("Mixed_6d/concat"); + .set_name("Mixed_6d/concat"); graph << get_inception_node_C(data_path, "Mixed_6e", weights_layout, 192U, std::make_tuple(192U, 192U, 192U), std::make_tuple(192U, 192U, 192U, 192U, 192U), 192U) - .set_name("Mixed_6e/concat"); + .set_name("Mixed_6e/concat"); graph << get_inception_node_D(data_path, "Mixed_7a", weights_layout, std::make_tuple(192U, 320U), std::make_tuple(192U, 192U, 192U, 192U)) - .set_name("Mixed_7a/concat"); + .set_name("Mixed_7a/concat"); graph << get_inception_node_E(data_path, "Mixed_7b", weights_layout, 320U, std::make_tuple(384U, 384U, 384U), std::make_tuple(448U, 384U, 384U, 384U), 192U) - .set_name("Mixed_7b/concat"); + .set_name("Mixed_7b/concat"); graph << get_inception_node_E(data_path, "Mixed_7c", weights_layout, 320U, std::make_tuple(384U, 384U, 384U), std::make_tuple(448U, 384U, 384U, 384U), 192U, true) - .set_name("Mixed_7c/concat"); - - graph << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 8, operation_layout, PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::CEIL))).set_name("Logits/AvgPool_1a_8x8/AvgPool") - << ConvolutionLayer(1U, 1U, 1001U, get_weights_accessor(data_path, - "/cnn_data/inceptionv3_model/Logits_Conv2d_1c_1x1_weights.npy", weights_layout), - get_weights_accessor(data_path, - "/cnn_data/inceptionv3_model/Logits_Conv2d_1c_1x1_biases.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name("Logits/Conv2d_1c_1x1/convolution") + .set_name("Mixed_7c/concat"); + + graph << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 8, operation_layout, + PadStrideInfo(1, 1, 0, 0, 
DimensionRoundingType::CEIL))) + .set_name("Logits/AvgPool_1a_8x8/AvgPool") + << ConvolutionLayer( + 1U, 1U, 1001U, + get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Logits_Conv2d_1c_1x1_weights.npy", + weights_layout), + get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Logits_Conv2d_1c_1x1_biases.npy"), + PadStrideInfo(1, 1, 0, 0)) + .set_name("Logits/Conv2d_1c_1x1/convolution") << ReshapeLayer(TensorShape(1001U)).set_name("Predictions/Reshape") - << SoftmaxLayer().set_name("Predictions/Softmax") - << OutputLayer(get_output_accessor(common_params, 5)); + << SoftmaxLayer().set_name("Predictions/Softmax") << OutputLayer(get_output_accessor(common_params, 5)); // Finalize graph GraphConfig config; @@ -223,19 +252,21 @@ private: Stream graph; private: - ConcatLayer get_inception_node_A(const std::string &data_path, std::string &&param_path, DataLayout weights_layout, - unsigned int a_filt, - std::tuple b_filters, + ConcatLayer get_inception_node_A(const std::string &data_path, + std::string &&param_path, + DataLayout weights_layout, + unsigned int a_filt, + std::tuple b_filters, std::tuple c_filters, - unsigned int d_filt, - bool is_name_different = false) + unsigned int d_filt, + bool is_name_different = false) { std::string total_path = "/cnn_data/inceptionv3_model/" + param_path + "_"; // This is due to a naming issue in the tf model std::string conv_id0 = "_0a_"; std::string conv_id1 = "2d_0b_"; - if(is_name_different) + if (is_name_different) { conv_id0 = "_0b_"; conv_id1 = "_1_0c_"; @@ -243,457 +274,451 @@ private: SubStream i_a(graph); i_a << ConvolutionLayer( - 1U, 1U, a_filt, - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Relu"); + 1U, 1U, a_filt, + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + nullptr, get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Relu"); SubStream i_b(graph); i_b << ConvolutionLayer( - 1U, 1U, std::get<0>(b_filters), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id0 + "1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/Branch_1/Conv2d" + conv_id0 + "1x1/convolution") - <<
BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id0 + "1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id0 + "1x1_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id0 + "1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_1/Conv2d" + conv_id0 + "1x1/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d" + conv_id0 + "1x1/Relu") + 1U, 1U, std::get<0>(b_filters), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id0 + "1x1_weights.npy", + weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/Branch_1/Conv2d" + conv_id0 + "1x1/convolution") + << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id0 + + "1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id0 + + "1x1_BatchNorm_moving_variance.npy"), + nullptr, + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id0 + + "1x1_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_1/Conv2d" + conv_id0 + "1x1/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_1/Conv2d" + conv_id0 + "1x1/Relu") << ConvolutionLayer( - 5U, 5U, std::get<1>(b_filters), - get_weights_accessor(data_path, total_path + "Branch_1_Conv" + conv_id1 + "5x5_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 2, 2)) - .set_name(param_path + "/Branch_1/Conv2d" + conv_id1 + "5x5/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_1_Conv" + conv_id1 + "5x5_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv" + conv_id1 + "5x5_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_1_Conv" + conv_id1 + "5x5_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_1/Conv2d" + conv_id1 + "5x5/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d" + conv_id1 + "5x5/Relu"); + 5U, 5U, std::get<1>(b_filters), + get_weights_accessor(data_path, total_path + "Branch_1_Conv" + conv_id1 + "5x5_weights.npy", + weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 2, 2)) + .set_name(param_path + "/Branch_1/Conv2d" + conv_id1 + "5x5/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, + total_path + "Branch_1_Conv" + conv_id1 + "5x5_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + total_path + "Branch_1_Conv" + conv_id1 + "5x5_BatchNorm_moving_variance.npy"), + nullptr, + get_weights_accessor(data_path, total_path + "Branch_1_Conv" + conv_id1 + "5x5_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_1/Conv2d" + conv_id1 + "5x5/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_1/Conv2d" + conv_id1 + "5x5/Relu"); SubStream i_c(graph); i_c << ConvolutionLayer( - 1U, 1U, std::get<0>(c_filters), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout), - 
std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Relu") + 1U, 1U, std::get<0>(c_filters), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + nullptr, get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Relu") << ConvolutionLayer( - 3U, 3U, std::get<1>(c_filters), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 1, 1)) - .set_name(param_path + "/Branch_2/Conv2d_0b_3x3/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0b_3x3/Relu") + 3U, 3U, std::get<1>(c_filters), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) + .set_name(param_path + "/Branch_2/Conv2d_0b_3x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), + nullptr, get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_2/Conv2d_0b_3x3/Relu") << ConvolutionLayer( - 3U, 3U, std::get<2>(c_filters), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 1, 1)) - .set_name(param_path + "/Branch_2/Conv2d_0c_3x3/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + 
"Branch_2_Conv2d_0c_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_2/Conv2d_0c_3x3/BatchNorm/batcnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0c_3x3/Relu"); + 3U, 3U, std::get<2>(c_filters), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) + .set_name(param_path + "/Branch_2/Conv2d_0c_3x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_variance.npy"), + nullptr, get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_2/Conv2d_0c_3x3/BatchNorm/batcnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_2/Conv2d_0c_3x3/Relu"); SubStream i_d(graph); - i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), - true)) - .set_name(param_path + "/Branch_3/AvgPool_0a_3x3/AvgPool") + i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout, + PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), true)) + .set_name(param_path + "/Branch_3/AvgPool_0a_3x3/AvgPool") << ConvolutionLayer( - 1U, 1U, d_filt, - get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Relu"); + 1U, 1U, d_filt, + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"), + nullptr, get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Relu"); return ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d)); } - ConcatLayer get_inception_node_B(const std::string &data_path, std::string 
&&param_path, DataLayout weights_layout, - unsigned int a_filt, + ConcatLayer get_inception_node_B(const std::string &data_path, + std::string &&param_path, + DataLayout weights_layout, + unsigned int a_filt, std::tuple b_filters) { std::string total_path = "/cnn_data/inceptionv3_model/" + param_path + "_"; SubStream i_a(graph); i_a << ConvolutionLayer( - 3U, 3U, a_filt, - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(2, 2, 0, 0)) - .set_name(param_path + "/Branch_0/Conv2d_1a_1x1/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_1x1_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_0/Conv2d_1a_1x1/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_0/Conv2d_1a_1x1/Relu"); + 3U, 3U, a_filt, + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) + .set_name(param_path + "/Branch_0/Conv2d_1a_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_1x1_BatchNorm_moving_variance.npy"), + nullptr, get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_1x1_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_0/Conv2d_1a_1x1/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_0/Conv2d_1a_1x1/Relu"); SubStream i_b(graph); i_b << ConvolutionLayer( - 1U, 1U, std::get<0>(b_filters), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Relu") + 1U, 1U, std::get<0>(b_filters), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + nullptr, get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm") + <<
ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Relu") << ConvolutionLayer( - 3U, 3U, std::get<1>(b_filters), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 1, 1)) - .set_name(param_path + "/Branch_1/Conv2d_0b_3x3/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0b_3x3/Relu") + 3U, 3U, std::get<1>(b_filters), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) + .set_name(param_path + "/Branch_1/Conv2d_0b_3x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), + nullptr, get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_1/Conv2d_0b_3x3/Relu") << ConvolutionLayer( - 3U, 3U, std::get<2>(b_filters), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(2, 2, 0, 0)) - .set_name(param_path + "/Branch_1/Conv2d_1a_1x1/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_1x1_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_1/Conv2d_1a_1x1/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_1a_1x1/Relu"); + 3U, 3U, std::get<2>(b_filters), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) + .set_name(param_path + "/Branch_1/Conv2d_1a_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_1x1_BatchNorm_moving_variance.npy"), + nullptr, get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_1x1_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_1/Conv2d_1a_1x1/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_1/Conv2d_1a_1x1/Relu"); SubStream i_c(graph); - i_c << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, 
common_params.data_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))).set_name(param_path + "/Branch_2/MaxPool_1a_3x3/MaxPool"); + i_c << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, + PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))) + .set_name(param_path + "/Branch_2/MaxPool_1a_3x3/MaxPool"); return ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c)); } - ConcatLayer get_inception_node_C(const std::string &data_path, std::string &&param_path, DataLayout weights_layout, - unsigned int a_filt, - std::tuple b_filters, - std::tuple c_filters, - unsigned int d_filt) + ConcatLayer + get_inception_node_C(const std::string &data_path, + std::string &&param_path, + DataLayout weights_layout, + unsigned int a_filt, + std::tuple b_filters, + std::tuple c_filters, + unsigned int d_filt) { std::string total_path = "/cnn_data/inceptionv3_model/" + param_path + "_"; SubStream i_a(graph); i_a << ConvolutionLayer( - 1U, 1U, a_filt, - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Relu"); + 1U, 1U, a_filt, + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + nullptr, get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Relu"); SubStream i_b(graph); i_b << ConvolutionLayer( - 1U, 1U, std::get<0>(b_filters), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Relu") + 1U, 1U, std::get<0>(b_filters), + get_weights_accessor(data_path, total_path +
"Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + nullptr, get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Relu") << ConvolutionLayer( - 7U, 1U, std::get<1>(b_filters), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 3, 0)) - .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0b_1x7/Relu") + 7U, 1U, std::get<1>(b_filters), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 3, 0)) + .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"), + nullptr, get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/Relu") << ConvolutionLayer( - 1U, 7U, std::get<2>(b_filters), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 3)) - .set_name(param_path + "/Branch_1/Conv2d_0c_7x1/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_0/Conv2d_0c_7x1/Relu"); + 1U, 7U, std::get<2>(b_filters), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 3)) + .set_name(param_path + "/Branch_1/Conv2d_0c_7x1/convolution") + << BatchNormalizationLayer( + 
get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"), + nullptr, get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_0/Conv2d_0c_7x1/Relu"); SubStream i_c(graph); i_c << ConvolutionLayer( - 1U, 1U, std::get<0>(c_filters), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Relu") + 1U, 1U, std::get<0>(c_filters), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + nullptr, get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Relu") << ConvolutionLayer( - 1U, 7U, std::get<1>(c_filters), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 3)) - .set_name(param_path + "/Branch_2/Conv2d_0b_7x1/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0b_7x1/Relu") + 1U, 7U, std::get<1>(c_filters), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 3)) + .set_name(param_path + "/Branch_2/Conv2d_0b_7x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_BatchNorm_moving_variance.npy"), + 
nullptr, get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_2/Conv2d_0b_7x1/Relu") << ConvolutionLayer( - 7U, 1U, std::get<2>(c_filters), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 3, 0)) - .set_name(param_path + "/Branch_2/Conv2d_0c_1x7/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0c_1x7/Relu") + 7U, 1U, std::get<2>(c_filters), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 3, 0)) + .set_name(param_path + "/Branch_2/Conv2d_0c_1x7/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_BatchNorm_moving_variance.npy"), + nullptr, get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_2/Conv2d_0c_1x7/Relu") << ConvolutionLayer( - 1U, 7U, std::get<3>(c_filters), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 3)) - .set_name(param_path + "/Branch_2/Conv2d_0d_7x1/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0d_7x1/Relu") + 1U, 7U, std::get<3>(c_filters), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 3)) + .set_name(param_path + "/Branch_2/Conv2d_0d_7x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_BatchNorm_moving_variance.npy"), + nullptr, get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm") + << 
ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_2/Conv2d_0d_7x1/Relu") << ConvolutionLayer( - 7U, 1U, std::get<4>(c_filters), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 3, 0)) - .set_name(param_path + "/Branch_2/Conv2d_0e_1x7/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0e_1x7/Relu"); + 7U, 1U, std::get<4>(c_filters), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 3, 0)) + .set_name(param_path + "/Branch_2/Conv2d_0e_1x7/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_BatchNorm_moving_variance.npy"), + nullptr, get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_2/Conv2d_0e_1x7/Relu"); SubStream i_d(graph); - i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), - true)) - .set_name(param_path + "/Branch_3/AvgPool_0a_3x3/AvgPool") + i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout, + PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), true)) + .set_name(param_path + "/Branch_3/AvgPool_0a_3x3/AvgPool") << ConvolutionLayer( - 1U, 1U, d_filt, - get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Relu"); + 1U, 1U, d_filt, + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"), + 
nullptr, get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Relu"); return ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d)); } - ConcatLayer get_inception_node_D(const std::string &data_path, std::string &&param_path, DataLayout weights_layout, - std::tuple a_filters, + ConcatLayer get_inception_node_D(const std::string &data_path, + std::string &&param_path, + DataLayout weights_layout, + std::tuple a_filters, std::tuple b_filters) { std::string total_path = "/cnn_data/inceptionv3_model/" + param_path + "_"; SubStream i_a(graph); i_a << ConvolutionLayer( - 1U, 1U, std::get<0>(a_filters), - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Relu") + 1U, 1U, std::get<0>(a_filters), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + nullptr, get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Relu") + << ConvolutionLayer( - 3U, 3U, std::get<1>(a_filters), - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(2, 2, 0, 0)) - .set_name(param_path + "/Branch_0/Conv2d_1a_3x3/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_0/Conv2d_1a_3x3/Relu"); + 3U, 3U, std::get<1>(a_filters), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) + .set_name(param_path +
"/Branch_0/Conv2d_1a_3x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), + nullptr, get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_0/Conv2d_1a_3x3/Relu"); SubStream i_b(graph); i_b << ConvolutionLayer( - 1U, 1U, std::get<0>(b_filters), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Relu") + 1U, 1U, std::get<0>(b_filters), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + nullptr, get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Relu") << ConvolutionLayer( - 7U, 1U, std::get<1>(b_filters), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 3, 0)) - .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0b_1x7/Relu") + 7U, 1U, std::get<1>(b_filters), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 3, 0)) + .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, 
total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"), + nullptr, get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/Relu") << ConvolutionLayer( - 1U, 7U, std::get<2>(b_filters), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 3)) - .set_name(param_path + "/Branch_1/Conv2d_0c_7x1/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0c_7x1/Relu") + 1U, 7U, std::get<2>(b_filters), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 3)) + .set_name(param_path + "/Branch_1/Conv2d_0c_7x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"), + nullptr, get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_1/Conv2d_0c_7x1/Relu") << ConvolutionLayer( - 3U, 3U, std::get<3>(b_filters), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(2, 2, 0, 0)) - .set_name(param_path + "/Branch_1/Conv2d_1a_3x3/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_1a_3x3/Relu"); + 3U, 3U, std::get<3>(b_filters), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) + .set_name(param_path + "/Branch_1/Conv2d_1a_3x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), + nullptr, get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + 
"/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_1/Conv2d_1a_3x3/Relu"); SubStream i_c(graph); - i_c << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))).set_name(param_path + "/Branch_2/MaxPool_1a_3x3/MaxPool"); + i_c << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, + PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))) + .set_name(param_path + "/Branch_2/MaxPool_1a_3x3/MaxPool"); return ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c)); } - ConcatLayer get_inception_node_E(const std::string &data_path, std::string &¶m_path, DataLayout weights_layout, - unsigned int a_filt, - std::tuple b_filters, + ConcatLayer get_inception_node_E(const std::string &data_path, + std::string &¶m_path, + DataLayout weights_layout, + unsigned int a_filt, + std::tuple b_filters, std::tuple c_filters, - unsigned int d_filt, - bool is_name_different = false) + unsigned int d_filt, + bool is_name_different = false) { // This is due to a naming issue in the tf model std::string conv_id = "_0b_"; - if(is_name_different) + if (is_name_different) { conv_id = "_0c_"; } @@ -701,154 +726,152 @@ private: std::string total_path = "/cnn_data/inceptionv3_model/" + param_path + "_"; SubStream i_a(graph); i_a << ConvolutionLayer( - 1U, 1U, a_filt, - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Relu"); + 1U, 1U, a_filt, + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + nullptr, get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Relu"); SubStream i_b(graph); i_b << ConvolutionLayer( - 1U, 1U, std::get<0>(b_filters), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path 
+ "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Relu"); + 1U, 1U, std::get<0>(b_filters), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + nullptr, get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Relu"); SubStream i_b1(i_b); i_b1 << ConvolutionLayer( - 3U, 1U, std::get<1>(b_filters), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 1, 0)) - .set_name(param_path + "/Branch_1/Conv2d_0b_1x3/convolution") + 3U, 1U, std::get<1>(b_filters), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 0)) + .set_name(param_path + "/Branch_1/Conv2d_0b_1x3/convolution") << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_1/Conv2d_0b_1x3/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0b_1x3/Relu"); + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + total_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_variance.npy"), + nullptr, get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_1/Conv2d_0b_1x3/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_1/Conv2d_0b_1x3/Relu"); SubStream i_b2(i_b); i_b2 << ConvolutionLayer( - 1U, 3U, std::get<2>(b_filters), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id + "3x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 1)) - .set_name(param_path + "/Branch_1/Conv2d" + conv_id + "3x1/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id + "3x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id + "3x1_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id + 
"3x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_1/Conv2d" + conv_id + "3x1/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d" + conv_id + "3x1/Relu"); + 1U, 3U, std::get<2>(b_filters), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id + "3x1_weights.npy", + weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 1)) + .set_name(param_path + "/Branch_1/Conv2d" + conv_id + "3x1/convolution") + << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id + + "3x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id + + "3x1_BatchNorm_moving_variance.npy"), + nullptr, + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id + + "3x1_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_1/Conv2d" + conv_id + "3x1/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_1/Conv2d" + conv_id + "3x1/Relu"); // Merge b1 and b2 i_b << ConcatLayer(std::move(i_b1), std::move(i_b2)).set_name(param_path + "/Branch_1/concat"); SubStream i_c(graph); i_c << ConvolutionLayer( - 1U, 1U, std::get<0>(c_filters), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Relu") + 1U, 1U, std::get<0>(c_filters), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + nullptr, get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Relu") << ConvolutionLayer( - 3U, 3U, std::get<1>(c_filters), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 1, 1)) - .set_name(param_path + "/Branch_2/Conv2d_0b_3x3/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + 
"Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0b_3x3/Relu"); + 3U, 3U, std::get<1>(c_filters), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) + .set_name(param_path + "/Branch_2/Conv2d_0b_3x3/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), + nullptr, get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_2/Conv2d_0b_3x3/Relu"); SubStream i_c1(i_c); i_c1 << ConvolutionLayer( - 3U, 1U, std::get<2>(c_filters), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 1, 0)) - .set_name(param_path + "/Branch_2/Conv2d_0c_1x3/convolution") + 3U, 1U, std::get<2>(c_filters), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 0)) + .set_name(param_path + "/Branch_2/Conv2d_0c_1x3/convolution") << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_2/Conv2d_0c_1x3/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0c_1x3/Relu"); + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + total_path + "Branch_2_Conv2d_0c_1x3_BatchNorm_moving_variance.npy"), + nullptr, get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_2/Conv2d_0c_1x3/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_2/Conv2d_0c_1x3/Relu"); SubStream i_c2(i_c); i_c2 << ConvolutionLayer( - 1U, 3U, std::get<3>(c_filters), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_3x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 1)) - .set_name(param_path + "/Branch_2/Conv2d_0d_3x1/convolution") + 1U, 3U, std::get<3>(c_filters), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_3x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 1)) + .set_name(param_path + "/Branch_2/Conv2d_0d_3x1/convolution") << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_3x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_3x1_BatchNorm_moving_variance.npy"), - nullptr, - 
get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_3x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_2/Conv2d_0d_3x1/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0d_3x1/Relu"); + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_3x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + total_path + "Branch_2_Conv2d_0d_3x1_BatchNorm_moving_variance.npy"), + nullptr, get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_3x1_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_2/Conv2d_0d_3x1/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_2/Conv2d_0d_3x1/Relu"); // Merge i_c1 and i_c2 i_c << ConcatLayer(std::move(i_c1), std::move(i_c2)).set_name(param_path + "/Branch_2/concat"); SubStream i_d(graph); - i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), - true)) - .set_name(param_path + "/Branch_3/AvgPool_0a_3x3/AvgPool") + i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout, + PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), true)) + .set_name(param_path + "/Branch_3/AvgPool_0a_3x3/AvgPool") << ConvolutionLayer( - 1U, 1U, d_filt, - get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"), - nullptr, - get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Relu"); + 1U, 1U, d_filt, + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"), + nullptr, get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_beta.npy"), + 0.001f) + .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Relu"); return ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d)); } diff --git a/examples/graph_inception_v4.cpp b/examples/graph_inception_v4.cpp index 6d8fab4141e9bb309133b63089383aa8e50f4124..052498ad38e9f13ddcb931a9982e7168fa65a8f4 100644 --- a/examples/graph_inception_v4.cpp +++ b/examples/graph_inception_v4.cpp @@ -39,8 +39,7 @@ using namespace arm_compute::graph_utils; class InceptionV4Example final : public Example { public: - InceptionV4Example() - : cmd_parser(), 
common_opts(cmd_parser), common_params(), graph(0, "InceptionV4") + InceptionV4Example() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "InceptionV4") { } bool do_setup(int argc, char **argv) override @@ -53,7 +52,7 @@ public: common_params = consume_common_graph_parameters(common_opts); // Return when help menu is requested - if(common_params.help) + if (common_params.help) { cmd_parser.print_help(argv[0]); return false; @@ -70,51 +69,70 @@ public: // Create input descriptor const auto operation_layout = common_params.data_layout; - const TensorShape tensor_shape = permute_shape(TensorShape(299U, 299U, 3U, common_params.batches), DataLayout::NCHW, operation_layout); - TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout); + const TensorShape tensor_shape = + permute_shape(TensorShape(299U, 299U, 3U, common_params.batches), DataLayout::NCHW, operation_layout); + TensorDescriptor input_descriptor = + TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout); // Set weights trained layout const DataLayout weights_layout = DataLayout::NCHW; - graph << common_params.target - << common_params.fast_math_hint + graph << common_params.target << common_params.fast_math_hint << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), false)) // Conv2d_1a_3x3 - << ConvolutionLayer(3U, 3U, 32U, - get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_1a_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) - .set_name("Conv2d_1a_3x3/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_1a_3x3_BatchNorm_beta.npy"), - 0.001f) - .set_name("Conv2d_1a_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_1a_3x3/Relu") + << ConvolutionLayer( + 3U, 3U, 32U, + get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_1a_3x3_weights.npy", + weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) + .set_name("Conv2d_1a_3x3/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, + "/cnn_data/inceptionv4_model/Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + "/cnn_data/inceptionv4_model/Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_1a_3x3_BatchNorm_beta.npy"), + 0.001f) + .set_name("Conv2d_1a_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Conv2d_1a_3x3/Relu") // Conv2d_2a_3x3 - << ConvolutionLayer(3U, 3U, 32U, - get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2a_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) - .set_name("Conv2d_2a_3x3/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2a_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2a_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, 
"/cnn_data/inceptionv4_model/Conv2d_2a_3x3_BatchNorm_beta.npy"), - 0.001f) - .set_name("Conv2d_2a_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_2a_3x3/Relu") + << ConvolutionLayer( + 3U, 3U, 32U, + get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2a_3x3_weights.npy", + weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("Conv2d_2a_3x3/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, + "/cnn_data/inceptionv4_model/Conv2d_2a_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + "/cnn_data/inceptionv4_model/Conv2d_2a_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2a_3x3_BatchNorm_beta.npy"), + 0.001f) + .set_name("Conv2d_2a_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Conv2d_2a_3x3/Relu") // Conv2d_2b_3x3 - << ConvolutionLayer(3U, 3U, 64U, - get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2b_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) - .set_name("Conv2d_2b_3x3/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2b_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2b_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2b_3x3_BatchNorm_beta.npy"), - 0.001f) - .set_name("Conv2d_2b_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_2b_3x3/Relu"); + << ConvolutionLayer( + 3U, 3U, 64U, + get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2b_3x3_weights.npy", + weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) + .set_name("Conv2d_2b_3x3/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, + "/cnn_data/inceptionv4_model/Conv2d_2b_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + "/cnn_data/inceptionv4_model/Conv2d_2b_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2b_3x3_BatchNorm_beta.npy"), + 0.001f) + .set_name("Conv2d_2b_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Conv2d_2b_3x3/Relu"); graph << get_mixed_3a(data_path, weights_layout).set_name("Mixed_3a/concat"); graph << get_mixed_4a(data_path, weights_layout).set_name("Mixed_4a/concat"); @@ -140,15 +158,16 @@ public: graph << get_inceptionC_block(data_path, weights_layout, "Mixed_7b").set_name("Mixed_7b/concat"); graph << get_inceptionC_block(data_path, weights_layout, "Mixed_7c").set_name("Mixed_7c/concat"); graph << get_inceptionC_block(data_path, weights_layout, "Mixed_7d").set_name("Mixed_7d/concat"); - graph << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, operation_layout)).set_name("Logits/AvgPool_1a/AvgPool") - << FlattenLayer().set_name("Logits/Flatten") - << FullyConnectedLayer( - 1001U, - get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Logits_Logits_weights.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Logits_Logits_biases.npy")) - .set_name("Logits/MatMul") - << 
SoftmaxLayer().set_name("Logits/Predictions") - << OutputLayer(get_output_accessor(common_params, 5)); + graph + << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, operation_layout)).set_name("Logits/AvgPool_1a/AvgPool") + << FlattenLayer().set_name("Logits/Flatten") + << FullyConnectedLayer( + 1001U, + get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Logits_Logits_weights.npy", + weights_layout), + get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Logits_Logits_biases.npy")) + .set_name("Logits/MatMul") + << SoftmaxLayer().set_name("Logits/Predictions") << OutputLayer(get_output_accessor(common_params, 5)); // Finalize graph GraphConfig config; @@ -162,7 +181,7 @@ public: // Load the precompiled kernels from a file into the kernel library, in this way the next time they are needed // compilation won't be required. - if(common_params.enable_cl_cache) + if (common_params.enable_cl_cache) { #ifdef ARM_COMPUTE_CL restore_program_cache_from_file(); @@ -172,7 +191,7 @@ public: graph.finalize(common_params.target, config); // Save the opencl kernels to a file - if(common_opts.enable_cl_cache) + if (common_opts.enable_cl_cache) { #ifdef ARM_COMPUTE_CL save_program_cache_to_file(); @@ -199,22 +218,24 @@ private: std::string total_path = "/cnn_data/inceptionv4_model/Mixed_3a_"; SubStream i_a(graph); - i_a << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), - true)) - .set_name("Mixed_3a/Branch_0/MaxPool_0a_3x3/MaxPool"); + i_a << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, + PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), true)) + .set_name("Mixed_3a/Branch_0/MaxPool_0a_3x3/MaxPool"); SubStream i_b(graph); - i_b << ConvolutionLayer(3U, 3U, 96U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) - .set_name("Mixed_3a/Branch_1/Conv2d_0a_3x3/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_3x3_BatchNorm_beta.npy"), - 0.001f) - .set_name("Mixed_3a/Branch_1/Conv2d_0a_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_3a/Branch_1/Conv2d_0a_3x3/Relu"); + i_b << ConvolutionLayer( + 3U, 3U, 96U, + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) + .set_name("Mixed_3a/Branch_1/Conv2d_0a_3x3/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_3x3_BatchNorm_beta.npy"), 0.001f) + .set_name("Mixed_3a/Branch_1/Conv2d_0a_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_3a/Branch_1/Conv2d_0a_3x3/Relu"); return ConcatLayer(std::move(i_a), std::move(i_b)); } @@ -224,74 +245,86 @@ private: std::string total_path = 
"/cnn_data/inceptionv4_model/Mixed_4a_"; SubStream i_a(graph); - i_a << ConvolutionLayer(1U, 1U, 64U, - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) - .set_name("Mixed_4a/Branch_0/Conv2d_0a_1x1/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name("Mixed_4a/Branch_0/Conv2d_0a_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_4a/Branch_0/Conv2d_0a_1x1/Relu") - << ConvolutionLayer(3U, 3U, 96U, - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) - .set_name("Mixed_4a/Branch_0/Conv2d_1a_3x3/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"), - 0.001f) - .set_name("Mixed_4a/Branch_0/Conv2d_1a_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_4a/Branch_0/Conv2d_1a_3x3/Relu"); + i_a << ConvolutionLayer( + 1U, 1U, 64U, + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("Mixed_4a/Branch_0/Conv2d_0a_1x1/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"), 0.001f) + .set_name("Mixed_4a/Branch_0/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_4a/Branch_0/Conv2d_0a_1x1/Relu") + << ConvolutionLayer( + 3U, 3U, 96U, + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("Mixed_4a/Branch_0/Conv2d_1a_3x3/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"), 0.001f) + .set_name("Mixed_4a/Branch_0/Conv2d_1a_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_4a/Branch_0/Conv2d_1a_3x3/Relu"); SubStream i_b(graph); - i_b << ConvolutionLayer(1U, 1U, 64U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) - 
.set_name("Mixed_4a/Branch_1/Conv2d_0a_1x1/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name("Mixed_4a/Branch_1/Conv2d_0a_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_4a/Branch_1/Conv2d_0a_1x1/Relu") - << ConvolutionLayer(7U, 1U, 64U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 3, 0)) - .set_name("Mixed_4a/Branch_1/Conv2d_0b_1x7/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_beta.npy"), - 0.001f) - .set_name("Mixed_4a/Branch_1/Conv2d_0b_1x7/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_4a/Branch_1/Conv2d_0b_1x7/Relu") - << ConvolutionLayer(1U, 7U, 64U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 3)) - .set_name("Mixed_4a/Branch_1/Conv2d_0c_7x1/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_beta.npy"), - 0.001f) - .set_name("Mixed_4a/Branch_1/Conv2d_0c_7x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_4a/Branch_1/Conv2d_0c_7x1/Relu") - << ConvolutionLayer(3U, 3U, 96U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) - .set_name("Mixed_4a/Branch_1/Conv2d_1a_3x3/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"), - 0.001f) - .set_name("Mixed_4a/Branch_1/Conv2d_1a_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_4a/Branch_1/Conv2d_1a_3x3/Relu"); + i_b << ConvolutionLayer( + 1U, 1U, 64U, + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("Mixed_4a/Branch_1/Conv2d_0a_1x1/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 
1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), 0.001f) + .set_name("Mixed_4a/Branch_1/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_4a/Branch_1/Conv2d_0a_1x1/Relu") + << ConvolutionLayer( + 7U, 1U, 64U, + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 3, 0)) + .set_name("Mixed_4a/Branch_1/Conv2d_0b_1x7/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_beta.npy"), 0.001f) + .set_name("Mixed_4a/Branch_1/Conv2d_0b_1x7/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_4a/Branch_1/Conv2d_0b_1x7/Relu") + << ConvolutionLayer( + 1U, 7U, 64U, + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 3)) + .set_name("Mixed_4a/Branch_1/Conv2d_0c_7x1/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_beta.npy"), 0.001f) + .set_name("Mixed_4a/Branch_1/Conv2d_0c_7x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_4a/Branch_1/Conv2d_0c_7x1/Relu") + << ConvolutionLayer( + 3U, 3U, 96U, + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("Mixed_4a/Branch_1/Conv2d_1a_3x3/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"), 0.001f) + .set_name("Mixed_4a/Branch_1/Conv2d_1a_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_4a/Branch_1/Conv2d_1a_3x3/Relu"); return ConcatLayer(std::move(i_a), std::move(i_b)); } @@ -301,22 +334,24 @@ private: std::string total_path = "/cnn_data/inceptionv4_model/Mixed_5a_"; SubStream i_a(graph); - i_a << ConvolutionLayer(3U, 3U, 192U, - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) - .set_name("Mixed_5a/Branch_0/Conv2d_1a_3x3/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"), - 0.001f) - 
.set_name("Mixed_5a/Branch_0/Conv2d_1a_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_5a/Branch_0/Conv2d_1a_3x3/Relu"); + i_a << ConvolutionLayer( + 3U, 3U, 192U, + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) + .set_name("Mixed_5a/Branch_0/Conv2d_1a_3x3/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"), 0.001f) + .set_name("Mixed_5a/Branch_0/Conv2d_1a_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_5a/Branch_0/Conv2d_1a_3x3/Relu"); SubStream i_b(graph); - i_b << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), - true)) - .set_name("Mixed_5a/Branch_1/MaxPool_1a_3x3/MaxPool"); + i_b << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, + PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), true)) + .set_name("Mixed_5a/Branch_1/MaxPool_1a_3x3/MaxPool"); return ConcatLayer(std::move(i_a), std::move(i_b)); } @@ -326,92 +361,106 @@ private: std::string total_path = "/cnn_data/inceptionv4_model/" + param_path + "_"; SubStream i_a(graph); - i_a << ConvolutionLayer(1U, 1U, 96U, - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Relu"); + i_a << ConvolutionLayer( + 1U, 1U, 96U, + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"), 0.001f) + .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Relu"); SubStream i_b(graph); - i_b << ConvolutionLayer(1U, 1U, 64U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + 
"/Branch_1/Conv2d_0a_1x1/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Relu") - << ConvolutionLayer(3U, 3U, 96U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) - .set_name(param_path + "/Branch_1/Conv2d_0b_3x3/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_1/Conv2d_0b_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0b_3x3/Relu"); + i_b << ConvolutionLayer( + 1U, 1U, 64U, + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), 0.001f) + .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Relu") + << ConvolutionLayer( + 3U, 3U, 96U, + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) + .set_name(param_path + "/Branch_1/Conv2d_0b_3x3/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_beta.npy"), 0.001f) + .set_name(param_path + "/Branch_1/Conv2d_0b_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_1/Conv2d_0b_3x3/Relu"); SubStream i_c(graph); - i_c << ConvolutionLayer(1U, 1U, 64U, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + 
"Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Relu") - << ConvolutionLayer(3U, 3U, 96U, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) - .set_name(param_path + "/Branch_2/Conv2d_0b_3x3/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_2/Conv2d_0b_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0b_3x3/Relu") - << ConvolutionLayer(3U, 3U, 96U, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) - .set_name(param_path + "/Branch_2/Conv2d_0c_3x3/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_2/Conv2d_0c_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0c_3x3/Relu"); + i_c << ConvolutionLayer( + 1U, 1U, 64U, + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"), 0.001f) + .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Relu") + << ConvolutionLayer( + 3U, 3U, 96U, + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) + .set_name(param_path + "/Branch_2/Conv2d_0b_3x3/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"), 0.001f) + .set_name(param_path + 
"/Branch_2/Conv2d_0b_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_2/Conv2d_0b_3x3/Relu") + << ConvolutionLayer( + 3U, 3U, 96U, + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) + .set_name(param_path + "/Branch_2/Conv2d_0c_3x3/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_beta.npy"), 0.001f) + .set_name(param_path + "/Branch_2/Conv2d_0c_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_2/Conv2d_0c_3x3/Relu"); SubStream i_d(graph); - i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), - true)) - .set_name(param_path + "/Branch_3/AvgPool_0a_3x3/AvgPool") - << ConvolutionLayer(1U, 1U, 96U, - get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Relu"); + i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout, + PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), true)) + .set_name(param_path + "/Branch_3/AvgPool_0a_3x3/AvgPool") + << ConvolutionLayer( + 1U, 1U, 96U, + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_beta.npy"), 0.001f) + .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Relu"); return ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d)); } @@ -421,57 +470,65 @@ private: std::string total_path = "/cnn_data/inceptionv4_model/Mixed_6a_"; SubStream i_a(graph); - i_a << ConvolutionLayer(3U, 3U, 384U, - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) - 
.set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"), - 0.001f) - .set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/Relu"); + i_a << ConvolutionLayer( + 3U, 3U, 384U, + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) + .set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"), 0.001f) + .set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/Relu"); SubStream i_b(graph); - i_b << ConvolutionLayer(1U, 1U, 192U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) - .set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/Relu") - << ConvolutionLayer(3U, 3U, 224U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) - .set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_beta.npy"), - 0.001f) - .set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/Relu") - << ConvolutionLayer(3U, 3U, 256U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) - .set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + 
"Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"), - 0.001f) - .set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/Relu"); + i_b << ConvolutionLayer( + 1U, 1U, 192U, + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), 0.001f) + .set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/Relu") + << ConvolutionLayer( + 3U, 3U, 224U, + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) + .set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_beta.npy"), 0.001f) + .set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/Relu") + << ConvolutionLayer( + 3U, 3U, 256U, + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) + .set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"), 0.001f) + .set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/Relu"); SubStream i_c(graph); - i_c << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), - true)) - .set_name("Mixed_6a/Branch_2/MaxPool_1a_3x3/MaxPool"); + i_c << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, + PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), true)) + .set_name("Mixed_6a/Branch_2/MaxPool_1a_3x3/MaxPool"); return ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c)); } @@ -481,125 +538,145 @@ private: std::string total_path = "/cnn_data/inceptionv4_model/" + param_path + "_"; SubStream i_a(graph); - i_a << ConvolutionLayer(1U, 1U, 384U, - 
get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Relu"); + i_a << ConvolutionLayer( + 1U, 1U, 384U, + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"), 0.001f) + .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Relu"); SubStream i_b(graph); - i_b << ConvolutionLayer(1U, 1U, 192U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Relu") - << ConvolutionLayer(7U, 1U, 224U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 3, 0)) - .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0b_1x7/Relu") - << ConvolutionLayer(1U, 7U, 256U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 3)) - .set_name(param_path + "/Branch_1/Conv2d_0c_7x1/Conv2D") - << 
BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_1/Conv2d_0c_7x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0c_7x1/Relu"); + i_b << ConvolutionLayer( + 1U, 1U, 192U, + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), 0.001f) + .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Relu") + << ConvolutionLayer( + 7U, 1U, 224U, + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 3, 0)) + .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_beta.npy"), 0.001f) + .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/Relu") + << ConvolutionLayer( + 1U, 7U, 256U, + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 3)) + .set_name(param_path + "/Branch_1/Conv2d_0c_7x1/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_beta.npy"), 0.001f) + .set_name(param_path + "/Branch_1/Conv2d_0c_7x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_1/Conv2d_0c_7x1/Relu"); SubStream i_c(graph); - i_c << ConvolutionLayer(1U, 1U, 192U, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + 
"Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Relu") - << ConvolutionLayer(1U, 7U, 192U, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 3)) - .set_name(param_path + "/Branch_2/Conv2d_0b_7x1/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_2/Conv2d_0b_7x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0b_7x1/Relu") - << ConvolutionLayer(7U, 1U, 224U, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 3, 0)) - .set_name(param_path + "/Branch_2/Conv2d_0c_1x7/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_2/Conv2d_0c_1x7/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0c_1x7/Relu") - << ConvolutionLayer(1U, 7U, 224U, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 3)) - .set_name(param_path + "/Branch_2/Conv2d_0d_7x1/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_2/Conv2d_0d_7x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0d_7x1/Relu") - << ConvolutionLayer(7U, 1U, 256U, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 3, 0)) - .set_name(param_path + "/Branch_2/Conv2d_0e_1x7/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_2/Conv2d_0e_1x7/BatchNorm") - 
<< ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0e_1x7/Relu"); + i_c << ConvolutionLayer( + 1U, 1U, 192U, + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"), 0.001f) + .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Relu") + << ConvolutionLayer( + 1U, 7U, 192U, + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 3)) + .set_name(param_path + "/Branch_2/Conv2d_0b_7x1/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_BatchNorm_beta.npy"), 0.001f) + .set_name(param_path + "/Branch_2/Conv2d_0b_7x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_2/Conv2d_0b_7x1/Relu") + << ConvolutionLayer( + 7U, 1U, 224U, + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 3, 0)) + .set_name(param_path + "/Branch_2/Conv2d_0c_1x7/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_BatchNorm_beta.npy"), 0.001f) + .set_name(param_path + "/Branch_2/Conv2d_0c_1x7/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_2/Conv2d_0c_1x7/Relu") + << ConvolutionLayer( + 1U, 7U, 224U, + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 3)) + .set_name(param_path + "/Branch_2/Conv2d_0d_7x1/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_BatchNorm_beta.npy"), 0.001f) + .set_name(param_path + "/Branch_2/Conv2d_0d_7x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_2/Conv2d_0d_7x1/Relu") + << ConvolutionLayer( + 7U, 1U, 256U, + get_weights_accessor(data_path, total_path 
+ "Branch_2_Conv2d_0e_1x7_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 3, 0)) + .set_name(param_path + "/Branch_2/Conv2d_0e_1x7/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_BatchNorm_beta.npy"), 0.001f) + .set_name(param_path + "/Branch_2/Conv2d_0e_1x7/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_2/Conv2d_0e_1x7/Relu"); SubStream i_d(graph); - i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), - true)) - .set_name(param_path + "/Branch_3/AvgPool_0a_3x3/AvgPool") - << ConvolutionLayer(1U, 1U, 128U, - get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Relu"); + i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout, + PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), true)) + .set_name(param_path + "/Branch_3/AvgPool_0a_3x3/AvgPool") + << ConvolutionLayer( + 1U, 1U, 128U, + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_beta.npy"), 0.001f) + .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Relu"); return ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d)); } @@ -609,79 +686,91 @@ private: std::string total_path = "/cnn_data/inceptionv4_model/Mixed_7a_"; SubStream i_a(graph); - i_a << ConvolutionLayer(1U, 1U, 192U, - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) - .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - 
get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/Relu") - << ConvolutionLayer(3U, 3U, 192U, - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) - .set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"), - 0.001f) - .set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/Relu"); + i_a << ConvolutionLayer( + 1U, 1U, 192U, + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"), 0.001f) + .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/Relu") + << ConvolutionLayer( + 3U, 3U, 192U, + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) + .set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"), 0.001f) + .set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/Relu"); SubStream i_b(graph); - i_b << ConvolutionLayer(1U, 1U, 256U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) - .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/BatchNorm") - << 
ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/Relu") - << ConvolutionLayer(7U, 1U, 256U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 3, 0)) - .set_name("Mixed_7a/Branch_1/Conv2d_0b_1x7/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_beta.npy"), - 0.001f) - .set_name("Mixed_7a/Branch_1/Conv2d_0b_1x7/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_1/Conv2d_0b_1x7/Relu") - << ConvolutionLayer(1U, 7U, 320U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 3)) - .set_name("Mixed_7a/Branch_1/Conv2d_0c_7x1/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_beta.npy"), - 0.001f) - .set_name("Mixed_7a/Branch_1/Conv2d_0c_7x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_1/Conv2d_0c_7x1/Relu") - << ConvolutionLayer(3U, 3U, 320U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) - .set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"), - 0.001f) - .set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/Relu"); + i_b << ConvolutionLayer( + 1U, 1U, 256U, + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), 0.001f) + .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/Relu") + << ConvolutionLayer( + 7U, 1U, 256U, + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy", weights_layout), + std::unique_ptr(nullptr), 
PadStrideInfo(1, 1, 3, 0)) + .set_name("Mixed_7a/Branch_1/Conv2d_0b_1x7/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_beta.npy"), 0.001f) + .set_name("Mixed_7a/Branch_1/Conv2d_0b_1x7/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_7a/Branch_1/Conv2d_0b_1x7/Relu") + << ConvolutionLayer( + 1U, 7U, 320U, + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 3)) + .set_name("Mixed_7a/Branch_1/Conv2d_0c_7x1/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_beta.npy"), 0.001f) + .set_name("Mixed_7a/Branch_1/Conv2d_0c_7x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_7a/Branch_1/Conv2d_0c_7x1/Relu") + << ConvolutionLayer( + 3U, 3U, 320U, + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 0)) + .set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"), 0.001f) + .set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/Relu"); SubStream i_c(graph); - i_c << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), - true)) - .set_name("Mixed_7a/Branch_2/MaxPool_1a_3x3/MaxPool"); + i_c << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, + PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), true)) + .set_name("Mixed_7a/Branch_2/MaxPool_1a_3x3/MaxPool"); return ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c)); } @@ -691,163 +780,163 @@ private: std::string total_path = "/cnn_data/inceptionv4_model/" + param_path + "_"; SubStream i_a(graph); - i_a << ConvolutionLayer(1U, 1U, 256U, - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.001f) - 
.set_name(param_path + "/Branch_0/Conv2d_0a_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Relu"); + i_a << ConvolutionLayer( + 1U, 1U, 256U, + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"), 0.001f) + .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Relu"); SubStream i_b(graph); i_b << ConvolutionLayer( - 1U, 1U, 384U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Conv2D") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Relu"); + 1U, 1U, 384U, + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), 0.001f) + .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Relu"); SubStream i_b1(i_b); i_b1 << ConvolutionLayer( - 3U, 1U, 256U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 1, 0)) - .set_name(param_path + "/Branch_1/Conv2d_0b_1x3/Conv2D") + 3U, 1U, 256U, + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 0)) + .set_name(param_path + "/Branch_1/Conv2d_0b_1x3/Conv2D") << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + 
"Branch_1_Conv2d_0b_1x3_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_1/Conv2d_0b_1x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0b_1x3/Relu"); + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + total_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_beta.npy"), 0.001f) + .set_name(param_path + "/Branch_1/Conv2d_0b_1x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_1/Conv2d_0b_1x3/Relu"); SubStream i_b2(i_b); i_b2 << ConvolutionLayer( - 1U, 3U, 256U, - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_3x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 1)) - .set_name(param_path + "/Branch_1/Conv2d_0c_3x1/Conv2D") + 1U, 3U, 256U, + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_3x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 1)) + .set_name(param_path + "/Branch_1/Conv2d_0c_3x1/Conv2D") << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_1/Conv2d_0c_3x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0c_3x1/Relu"); + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + total_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_beta.npy"), 0.001f) + .set_name(param_path + "/Branch_1/Conv2d_0c_3x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_1/Conv2d_0c_3x1/Relu"); // Merge b1 and b2 i_b << ConcatLayer(std::move(i_b1), std::move(i_b2)).set_name(param_path + "/Branch_1/concat"); SubStream i_c(graph); i_c << ConvolutionLayer( - 1U, 1U, 384U, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Conv2D") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Relu") + 1U, 1U, 384U, + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout), + 
std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"), 0.001f) + .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Relu") << ConvolutionLayer( - 1U, 3U, 448U, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 1)) - .set_name(param_path + "/Branch_2/Conv2d_0b_3x1/Conv2D") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_2/Conv2d_0b_3x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0b_3x1/Relu") + 1U, 3U, 448U, + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 1)) + .set_name(param_path + "/Branch_2/Conv2d_0b_3x1/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x1_BatchNorm_beta.npy"), 0.001f) + .set_name(param_path + "/Branch_2/Conv2d_0b_3x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_2/Conv2d_0b_3x1/Relu") << ConvolutionLayer( - 3U, 1U, 512U, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 1, 0)) - .set_name(param_path + "/Branch_2/Conv2d_0c_1x3/Conv2D") - << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_2/Conv2d_0c_1x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0c_1x3/Relu"); + 3U, 1U, 512U, + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 0)) + .set_name(param_path + "/Branch_2/Conv2d_0c_1x3/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, 
total_path + "Branch_2_Conv2d_0c_1x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_BatchNorm_beta.npy"), 0.001f) + .set_name(param_path + "/Branch_2/Conv2d_0c_1x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_2/Conv2d_0c_1x3/Relu"); SubStream i_c1(i_c); i_c1 << ConvolutionLayer( - 3U, 1U, 256U, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_1x3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 1, 0)) - .set_name(param_path + "/Branch_2/Conv2d_0d_1x3/Conv2D") + 3U, 1U, 256U, + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_1x3_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 0)) + .set_name(param_path + "/Branch_2/Conv2d_0d_1x3/Conv2D") << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_1x3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_1x3_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_1x3_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_2/Conv2d_0d_1x3/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0d_1x3/Relu"); + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_1x3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + total_path + "Branch_2_Conv2d_0d_1x3_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_1x3_BatchNorm_beta.npy"), 0.001f) + .set_name(param_path + "/Branch_2/Conv2d_0d_1x3/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_2/Conv2d_0d_1x3/Relu"); SubStream i_c2(i_c); i_c2 << ConvolutionLayer( - 1U, 3U, 256U, - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_3x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 1)) - .set_name(param_path + "/Branch_2/Conv2d_0e_3x1/Conv2D") + 1U, 3U, 256U, + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_3x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 1)) + .set_name(param_path + "/Branch_2/Conv2d_0e_3x1/Conv2D") << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_3x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_3x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_3x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_2/Conv2d_0e_3x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0e_3x1/Relu"); + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_3x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, + total_path + "Branch_2_Conv2d_0e_3x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_3x1_BatchNorm_beta.npy"), 0.001f) + .set_name(param_path + "/Branch_2/Conv2d_0e_3x1/BatchNorm") + << 
ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_2/Conv2d_0e_3x1/Relu"); // Merge i_c1 and i_c2 i_c << ConcatLayer(std::move(i_c1), std::move(i_c2)).set_name(param_path + "/Branch_2/concat"); SubStream i_d(graph); - i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), - true)) - .set_name(param_path + "/Branch_3/AvgPool_0a_3x3/AvgPool") - << ConvolutionLayer(1U, 1U, 256U, - get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"), - get_random_accessor(1.f, 1.f), - get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_beta.npy"), - 0.001f) - .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Relu"); + i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout, + PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), true)) + .set_name(param_path + "/Branch_3/AvgPool_0a_3x3/AvgPool") + << ConvolutionLayer( + 1U, 1U, 256U, + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"), + get_random_accessor(1.f, 1.f), + get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_beta.npy"), 0.001f) + .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Relu"); return ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d)); } diff --git a/examples/graph_lenet.cpp b/examples/graph_lenet.cpp index 1bcd95fb5852ecc81cf7b6d12a8cff5049490090..7d6dce7b170c752025314dd2869171872619387f 100644 --- a/examples/graph_lenet.cpp +++ b/examples/graph_lenet.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/graph.h" + #include "support/ToolchainSupport.h" #include "utils/CommonGraphOptions.h" #include "utils/GraphUtils.h" @@ -35,8 +36,7 @@ using namespace arm_compute::graph_utils; class GraphLenetExample : public Example { public: - GraphLenetExample() - : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "LeNet") + GraphLenetExample() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "LeNet") { } bool do_setup(int argc, char **argv) override @@ -49,14 +49,15 @@ public: common_params = consume_common_graph_parameters(common_opts); // Return when help menu is requested - if(common_params.help) + if (common_params.help) { cmd_parser.print_help(argv[0]); return false; } // Checks - ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), "QASYMM8 not supported for this graph"); + ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), + "QASYMM8 not supported for this graph"); // Print parameter values std::cout << common_params << std::endl; @@ -67,43 +68,39 @@ public: // Create input descriptor const auto operation_layout = common_params.data_layout; - const TensorShape tensor_shape = permute_shape(TensorShape(28U, 28U, 1U, batches), DataLayout::NCHW, operation_layout); - TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout); + const TensorShape tensor_shape = + permute_shape(TensorShape(28U, 28U, 1U, batches), DataLayout::NCHW, operation_layout); + TensorDescriptor input_descriptor = + TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout); // Set weights trained layout const DataLayout weights_layout = DataLayout::NCHW; //conv1 << pool1 << conv2 << pool2 << fc1 << act1 << fc2 << smx - graph << common_params.target - << common_params.fast_math_hint + graph << common_params.target << common_params.fast_math_hint << InputLayer(input_descriptor, get_input_accessor(common_params)) << ConvolutionLayer( - 5U, 5U, 20U, - get_weights_accessor(data_path, "/cnn_data/lenet_model/conv1_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/lenet_model/conv1_b.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name("conv1") - << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))).set_name("pool1") + 5U, 5U, 20U, get_weights_accessor(data_path, "/cnn_data/lenet_model/conv1_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/lenet_model/conv1_b.npy"), PadStrideInfo(1, 1, 0, 0)) + .set_name("conv1") + << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))) + .set_name("pool1") << ConvolutionLayer( - 5U, 5U, 50U, - get_weights_accessor(data_path, "/cnn_data/lenet_model/conv2_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/lenet_model/conv2_b.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name("conv2") - << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))).set_name("pool2") - << FullyConnectedLayer( - 500U, - get_weights_accessor(data_path, "/cnn_data/lenet_model/ip1_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/lenet_model/ip1_b.npy")) - .set_name("ip1") + 5U, 5U, 50U, get_weights_accessor(data_path, "/cnn_data/lenet_model/conv2_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/lenet_model/conv2_b.npy"), PadStrideInfo(1, 1, 0, 0)) + .set_name("conv2") + << 
PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))) + .set_name("pool2") + << FullyConnectedLayer(500U, + get_weights_accessor(data_path, "/cnn_data/lenet_model/ip1_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/lenet_model/ip1_b.npy")) + .set_name("ip1") << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu") - << FullyConnectedLayer( - 10U, - get_weights_accessor(data_path, "/cnn_data/lenet_model/ip2_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/lenet_model/ip2_b.npy")) - .set_name("ip2") - << SoftmaxLayer().set_name("prob") - << OutputLayer(get_output_accessor(common_params)); + << FullyConnectedLayer(10U, + get_weights_accessor(data_path, "/cnn_data/lenet_model/ip2_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/lenet_model/ip2_b.npy")) + .set_name("ip2") + << SoftmaxLayer().set_name("prob") << OutputLayer(get_output_accessor(common_params)); // Finalize graph GraphConfig config; diff --git a/examples/graph_mobilenet.cpp b/examples/graph_mobilenet.cpp index 4630dc958a2c5e7fb64573bf1cf11c5523758b56..e3a6ef116dd0a54e9e9adf0da38c543a1a3300bb 100644 --- a/examples/graph_mobilenet.cpp +++ b/examples/graph_mobilenet.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/graph.h" + #include "support/ToolchainSupport.h" #include "utils/CommonGraphOptions.h" #include "utils/GraphUtils.h" @@ -36,14 +37,13 @@ using namespace arm_compute::graph_utils; class GraphMobilenetExample : public Example { public: - GraphMobilenetExample() - : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "MobileNetV1") + GraphMobilenetExample() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "MobileNetV1") { // Add model id option model_id_opt = cmd_parser.add_option>("model-id", 0); model_id_opt->set_help("Mobilenet model id (0: 1.0_224, else: 0.75_160"); } - GraphMobilenetExample(const GraphMobilenetExample &) = delete; + GraphMobilenetExample(const GraphMobilenetExample &) = delete; GraphMobilenetExample &operator=(const GraphMobilenetExample &) = delete; ~GraphMobilenetExample() override = default; bool do_setup(int argc, char **argv) override @@ -56,7 +56,7 @@ public: common_params = consume_common_graph_parameters(common_opts); // Return when help menu is requested - if(common_params.help) + if (common_params.help) { cmd_parser.print_help(argv[0]); return false; @@ -72,15 +72,17 @@ public: unsigned int spatial_size = (model_id == 0 || common_params.data_type == DataType::QASYMM8) ? 
224 : 160; // Create input descriptor - const TensorShape tensor_shape = permute_shape(TensorShape(spatial_size, spatial_size, 3U, common_params.batches), DataLayout::NCHW, common_params.data_layout); - TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout); + const TensorShape tensor_shape = + permute_shape(TensorShape(spatial_size, spatial_size, 3U, common_params.batches), DataLayout::NCHW, + common_params.data_layout); + TensorDescriptor input_descriptor = + TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout); // Set graph hints - graph << common_params.target - << common_params.fast_math_hint; + graph << common_params.target << common_params.fast_math_hint; // Create core graph - if(arm_compute::is_data_type_float(common_params.data_type)) + if (arm_compute::is_data_type_float(common_params.data_type)) { create_graph_float(input_descriptor, model_id); } @@ -90,8 +92,7 @@ public: } // Create common tail - graph << ReshapeLayer(TensorShape(1001U)).set_name("Reshape") - << SoftmaxLayer().set_name("Softmax") + graph << ReshapeLayer(TensorShape(1001U)).set_name("Reshape") << SoftmaxLayer().set_name("Softmax") << OutputLayer(get_output_accessor(common_params, 5)); // Finalize graph @@ -115,14 +116,15 @@ public: private: CommandLineParser cmd_parser; CommonGraphOptions common_opts; - SimpleOption *model_id_opt{ nullptr }; + SimpleOption *model_id_opt{nullptr}; CommonGraphParams common_params; Stream graph; void create_graph_float(TensorDescriptor &input_descriptor, int model_id) { float depth_scale = (model_id == 0) ? 1.f : 0.75; - std::string model_path = (model_id == 0) ? "/cnn_data/mobilenet_v1_1_224_model/" : "/cnn_data/mobilenet_v1_075_160_model/"; + std::string model_path = + (model_id == 0) ? 
"/cnn_data/mobilenet_v1_1_224_model/" : "/cnn_data/mobilenet_v1_075_160_model/"; // Create a preprocessor object std::unique_ptr preprocessor = std::make_unique(); @@ -131,47 +133,68 @@ private: std::string data_path = common_params.data_path; // Add model path to data path - if(!data_path.empty()) + if (!data_path.empty()) { data_path += model_path; } - graph << InputLayer(input_descriptor, - get_input_accessor(common_params, std::move(preprocessor), false)) - << ConvolutionLayer( - 3U, 3U, 32U * depth_scale, - get_weights_accessor(data_path, "Conv2d_0_weights.npy", DataLayout::NCHW), - std::unique_ptr(nullptr), - PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR)) - .set_name("Conv2d_0") - << BatchNormalizationLayer( - get_weights_accessor(data_path, "Conv2d_0_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "Conv2d_0_BatchNorm_moving_variance.npy"), - get_weights_accessor(data_path, "Conv2d_0_BatchNorm_gamma.npy"), - get_weights_accessor(data_path, "Conv2d_0_BatchNorm_beta.npy"), - 0.001f) - .set_name("Conv2d_0/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f)).set_name("Conv2d_0/Relu6"); - graph << get_dwsc_node_float(data_path, "Conv2d_1", 64 * depth_scale, PadStrideInfo(1, 1, 1, 1), PadStrideInfo(1, 1, 0, 0)); - graph << get_dwsc_node_float(data_path, "Conv2d_2", 128 * depth_scale, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), PadStrideInfo(1, 1, 0, 0)); - graph << get_dwsc_node_float(data_path, "Conv2d_3", 128 * depth_scale, PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL), PadStrideInfo(1, 1, 0, 0)); - graph << get_dwsc_node_float(data_path, "Conv2d_4", 256 * depth_scale, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), PadStrideInfo(1, 1, 0, 0)); - graph << get_dwsc_node_float(data_path, "Conv2d_5", 256 * depth_scale, PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL), PadStrideInfo(1, 1, 0, 0)); - graph << get_dwsc_node_float(data_path, "Conv2d_6", 512 * depth_scale, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), PadStrideInfo(1, 1, 0, 0)); - graph << get_dwsc_node_float(data_path, "Conv2d_7", 512 * depth_scale, PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL), PadStrideInfo(1, 1, 0, 0)); - graph << get_dwsc_node_float(data_path, "Conv2d_8", 512 * depth_scale, PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL), PadStrideInfo(1, 1, 0, 0)); - graph << get_dwsc_node_float(data_path, "Conv2d_9", 512 * depth_scale, PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL), PadStrideInfo(1, 1, 0, 0)); - graph << get_dwsc_node_float(data_path, "Conv2d_10", 512 * depth_scale, PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL), PadStrideInfo(1, 1, 0, 0)); - graph << get_dwsc_node_float(data_path, "Conv2d_11", 512 * depth_scale, PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL), PadStrideInfo(1, 1, 0, 0)); - graph << get_dwsc_node_float(data_path, "Conv2d_12", 1024 * depth_scale, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), PadStrideInfo(1, 1, 0, 0)); - graph << get_dwsc_node_float(data_path, "Conv2d_13", 1024 * depth_scale, PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL), PadStrideInfo(1, 1, 0, 0)); - graph << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, common_params.data_layout)).set_name("Logits/AvgPool_1a") - << ConvolutionLayer( - 1U, 1U, 1001U, - get_weights_accessor(data_path, "Logits_Conv2d_1c_1x1_weights.npy", DataLayout::NCHW), - 
get_weights_accessor(data_path, "Logits_Conv2d_1c_1x1_biases.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name("Logits/Conv2d_1c_1x1"); + graph << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), false)) + << ConvolutionLayer(3U, 3U, 32U * depth_scale, + get_weights_accessor(data_path, "Conv2d_0_weights.npy", DataLayout::NCHW), + std::unique_ptr(nullptr), + PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR)) + .set_name("Conv2d_0") + << BatchNormalizationLayer(get_weights_accessor(data_path, "Conv2d_0_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, "Conv2d_0_BatchNorm_moving_variance.npy"), + get_weights_accessor(data_path, "Conv2d_0_BatchNorm_gamma.npy"), + get_weights_accessor(data_path, "Conv2d_0_BatchNorm_beta.npy"), 0.001f) + .set_name("Conv2d_0/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f)) + .set_name("Conv2d_0/Relu6"); + graph << get_dwsc_node_float(data_path, "Conv2d_1", 64 * depth_scale, PadStrideInfo(1, 1, 1, 1), + PadStrideInfo(1, 1, 0, 0)); + graph << get_dwsc_node_float(data_path, "Conv2d_2", 128 * depth_scale, + PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), + PadStrideInfo(1, 1, 0, 0)); + graph << get_dwsc_node_float(data_path, "Conv2d_3", 128 * depth_scale, + PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL), + PadStrideInfo(1, 1, 0, 0)); + graph << get_dwsc_node_float(data_path, "Conv2d_4", 256 * depth_scale, + PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), + PadStrideInfo(1, 1, 0, 0)); + graph << get_dwsc_node_float(data_path, "Conv2d_5", 256 * depth_scale, + PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL), + PadStrideInfo(1, 1, 0, 0)); + graph << get_dwsc_node_float(data_path, "Conv2d_6", 512 * depth_scale, + PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), + PadStrideInfo(1, 1, 0, 0)); + graph << get_dwsc_node_float(data_path, "Conv2d_7", 512 * depth_scale, + PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL), + PadStrideInfo(1, 1, 0, 0)); + graph << get_dwsc_node_float(data_path, "Conv2d_8", 512 * depth_scale, + PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL), + PadStrideInfo(1, 1, 0, 0)); + graph << get_dwsc_node_float(data_path, "Conv2d_9", 512 * depth_scale, + PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL), + PadStrideInfo(1, 1, 0, 0)); + graph << get_dwsc_node_float(data_path, "Conv2d_10", 512 * depth_scale, + PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL), + PadStrideInfo(1, 1, 0, 0)); + graph << get_dwsc_node_float(data_path, "Conv2d_11", 512 * depth_scale, + PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL), + PadStrideInfo(1, 1, 0, 0)); + graph << get_dwsc_node_float(data_path, "Conv2d_12", 1024 * depth_scale, + PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), + PadStrideInfo(1, 1, 0, 0)); + graph << get_dwsc_node_float(data_path, "Conv2d_13", 1024 * depth_scale, + PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL), + PadStrideInfo(1, 1, 0, 0)); + graph + << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, common_params.data_layout)).set_name("Logits/AvgPool_1a") + << ConvolutionLayer( + 1U, 1U, 1001U, get_weights_accessor(data_path, "Logits_Conv2d_1c_1x1_weights.npy", DataLayout::NCHW), + get_weights_accessor(data_path, "Logits_Conv2d_1c_1x1_biases.npy"), PadStrideInfo(1, 1, 0, 0)) + .set_name("Logits/Conv2d_1c_1x1"); } void create_graph_qasymm(TensorDescriptor &input_descriptor) @@ 
-180,7 +203,7 @@ private: std::string data_path = common_params.data_path; // Add model path to data path - if(!data_path.empty()) + if (!data_path.empty()) { data_path += "/cnn_data/mobilenet_qasymm8_model/"; } @@ -188,19 +211,16 @@ private: // Quantization info taken from the AndroidNN QASYMM8 MobileNet example const QuantizationInfo in_quant_info = QuantizationInfo(0.0078125f, 128); - const std::vector conv_weights_quant_info = - { + const std::vector conv_weights_quant_info = { QuantizationInfo(0.02182667888700962f, 151), // conv0 QuantizationInfo(0.004986600950360298f, 74) // conv14 }; - const std::vector conv_out_quant_info = - { + const std::vector conv_out_quant_info = { QuantizationInfo(0.023528477177023888f, 0), // conv0 QuantizationInfo(0.16609922051429749f, 66) // conv14 }; - const std::vector depth_weights_quant_info = - { + const std::vector depth_weights_quant_info = { QuantizationInfo(0.29219913482666016f, 110), // dwsc1 QuantizationInfo(0.40277284383773804f, 130), // dwsc2 QuantizationInfo(0.06053730100393295f, 160), // dwsc3 @@ -216,8 +236,7 @@ private: QuantizationInfo(0.12616927921772003f, 211) // dwsc13 }; - const std::vector point_weights_quant_info = - { + const std::vector point_weights_quant_info = { QuantizationInfo(0.030420949682593346f, 121), // dwsc1 QuantizationInfo(0.015148180536925793f, 104), // dwsc2 QuantizationInfo(0.013755458407104015f, 94), // dwsc3 @@ -235,108 +254,121 @@ private: graph << InputLayer(input_descriptor.set_quantization_info(in_quant_info), get_input_accessor(common_params, nullptr, false)) - << ConvolutionLayer( - 3U, 3U, 32U, - get_weights_accessor(data_path, "Conv2d_0_weights.npy"), - get_weights_accessor(data_path, "Conv2d_0_bias.npy"), - PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), - 1, conv_weights_quant_info.at(0), conv_out_quant_info.at(0)) - .set_name("Conv2d_0") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)).set_name("Conv2d_0/Relu6"); - graph << get_dwsc_node_qasymm(data_path, "Conv2d_1", 64U, PadStrideInfo(1U, 1U, 1U, 1U), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(0), point_weights_quant_info.at(0)); - graph << get_dwsc_node_qasymm(data_path, "Conv2d_2", 128U, PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(1), - point_weights_quant_info.at(1)); - graph << get_dwsc_node_qasymm(data_path, "Conv2d_3", 128U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(2), - point_weights_quant_info.at(2)); - graph << get_dwsc_node_qasymm(data_path, "Conv2d_4", 256U, PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(3), - point_weights_quant_info.at(3)); - graph << get_dwsc_node_qasymm(data_path, "Conv2d_5", 256U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(4), - point_weights_quant_info.at(4)); - graph << get_dwsc_node_qasymm(data_path, "Conv2d_6", 512U, PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(5), - point_weights_quant_info.at(5)); - graph << get_dwsc_node_qasymm(data_path, "Conv2d_7", 512U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(6), - 
point_weights_quant_info.at(6)); - graph << get_dwsc_node_qasymm(data_path, "Conv2d_8", 512U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(7), - point_weights_quant_info.at(7)); - graph << get_dwsc_node_qasymm(data_path, "Conv2d_9", 512U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(8), - point_weights_quant_info.at(8)); - graph << get_dwsc_node_qasymm(data_path, "Conv2d_10", 512U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(9), - point_weights_quant_info.at(9)); - graph << get_dwsc_node_qasymm(data_path, "Conv2d_11", 512U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(10), - point_weights_quant_info.at(10)); - graph << get_dwsc_node_qasymm(data_path, "Conv2d_12", 1024U, PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(11), - point_weights_quant_info.at(11)); - graph << get_dwsc_node_qasymm(data_path, "Conv2d_13", 1024U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(12), - point_weights_quant_info.at(12)) - << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, common_params.data_layout)).set_name("Logits/AvgPool_1a") - << ConvolutionLayer( - 1U, 1U, 1001U, - get_weights_accessor(data_path, "Logits_Conv2d_1c_1x1_weights.npy"), - get_weights_accessor(data_path, "Logits_Conv2d_1c_1x1_bias.npy"), - PadStrideInfo(1U, 1U, 0U, 0U), 1, conv_weights_quant_info.at(1), conv_out_quant_info.at(1)) - .set_name("Logits/Conv2d_1c_1x1"); + << ConvolutionLayer(3U, 3U, 32U, get_weights_accessor(data_path, "Conv2d_0_weights.npy"), + get_weights_accessor(data_path, "Conv2d_0_bias.npy"), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), 1, + conv_weights_quant_info.at(0), conv_out_quant_info.at(0)) + .set_name("Conv2d_0") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)) + .set_name("Conv2d_0/Relu6"); + graph << get_dwsc_node_qasymm(data_path, "Conv2d_1", 64U, PadStrideInfo(1U, 1U, 1U, 1U), + PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(0), + point_weights_quant_info.at(0)); + graph << get_dwsc_node_qasymm( + data_path, "Conv2d_2", 128U, PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), + PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(1), point_weights_quant_info.at(1)); + graph << get_dwsc_node_qasymm( + data_path, "Conv2d_3", 128U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR), + PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(2), point_weights_quant_info.at(2)); + graph << get_dwsc_node_qasymm( + data_path, "Conv2d_4", 256U, PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), + PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(3), point_weights_quant_info.at(3)); + graph << get_dwsc_node_qasymm( + data_path, "Conv2d_5", 256U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR), + PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(4), point_weights_quant_info.at(4)); + graph << get_dwsc_node_qasymm( + data_path, "Conv2d_6", 512U, PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), + PadStrideInfo(1U, 1U, 0U, 
0U), depth_weights_quant_info.at(5), point_weights_quant_info.at(5)); + graph << get_dwsc_node_qasymm( + data_path, "Conv2d_7", 512U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR), + PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(6), point_weights_quant_info.at(6)); + graph << get_dwsc_node_qasymm( + data_path, "Conv2d_8", 512U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR), + PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(7), point_weights_quant_info.at(7)); + graph << get_dwsc_node_qasymm( + data_path, "Conv2d_9", 512U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR), + PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(8), point_weights_quant_info.at(8)); + graph << get_dwsc_node_qasymm( + data_path, "Conv2d_10", 512U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR), + PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(9), point_weights_quant_info.at(9)); + graph << get_dwsc_node_qasymm( + data_path, "Conv2d_11", 512U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR), + PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(10), point_weights_quant_info.at(10)); + graph << get_dwsc_node_qasymm( + data_path, "Conv2d_12", 1024U, PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), + PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(11), point_weights_quant_info.at(11)); + graph + << get_dwsc_node_qasymm( + data_path, "Conv2d_13", 1024U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR), + PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(12), point_weights_quant_info.at(12)) + << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, common_params.data_layout)).set_name("Logits/AvgPool_1a") + << ConvolutionLayer(1U, 1U, 1001U, get_weights_accessor(data_path, "Logits_Conv2d_1c_1x1_weights.npy"), + get_weights_accessor(data_path, "Logits_Conv2d_1c_1x1_bias.npy"), + PadStrideInfo(1U, 1U, 0U, 0U), 1, conv_weights_quant_info.at(1), + conv_out_quant_info.at(1)) + .set_name("Logits/Conv2d_1c_1x1"); } - ConcatLayer get_dwsc_node_float(const std::string &data_path, std::string &¶m_path, - unsigned int conv_filt, - PadStrideInfo dwc_pad_stride_info, PadStrideInfo conv_pad_stride_info) + ConcatLayer get_dwsc_node_float(const std::string &data_path, + std::string &¶m_path, + unsigned int conv_filt, + PadStrideInfo dwc_pad_stride_info, + PadStrideInfo conv_pad_stride_info) { std::string total_path = param_path + "_"; SubStream sg(graph); sg << DepthwiseConvolutionLayer( - 3U, 3U, - get_weights_accessor(data_path, total_path + "depthwise_depthwise_weights.npy", DataLayout::NCHW), - std::unique_ptr(nullptr), - dwc_pad_stride_info) - .set_name(total_path + "depthwise/depthwise") + 3U, 3U, + get_weights_accessor(data_path, total_path + "depthwise_depthwise_weights.npy", DataLayout::NCHW), + std::unique_ptr(nullptr), dwc_pad_stride_info) + .set_name(total_path + "depthwise/depthwise") << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_moving_variance.npy"), - get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_gamma.npy"), - get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_beta.npy"), - 0.001f) - .set_name(total_path + "depthwise/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 
6.f)).set_name(total_path + "depthwise/Relu6") - << ConvolutionLayer( - 1U, 1U, conv_filt, - get_weights_accessor(data_path, total_path + "pointwise_weights.npy", DataLayout::NCHW), - std::unique_ptr(nullptr), - conv_pad_stride_info) - .set_name(total_path + "pointwise/Conv2D") + get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_moving_variance.npy"), + get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_gamma.npy"), + get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_beta.npy"), 0.001f) + .set_name(total_path + "depthwise/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f)) + .set_name(total_path + "depthwise/Relu6") + << ConvolutionLayer(1U, 1U, conv_filt, + get_weights_accessor(data_path, total_path + "pointwise_weights.npy", DataLayout::NCHW), + std::unique_ptr(nullptr), conv_pad_stride_info) + .set_name(total_path + "pointwise/Conv2D") << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "pointwise_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "pointwise_BatchNorm_moving_variance.npy"), - get_weights_accessor(data_path, total_path + "pointwise_BatchNorm_gamma.npy"), - get_weights_accessor(data_path, total_path + "pointwise_BatchNorm_beta.npy"), - 0.001f) - .set_name(total_path + "pointwise/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f)).set_name(total_path + "pointwise/Relu6"); + get_weights_accessor(data_path, total_path + "pointwise_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "pointwise_BatchNorm_moving_variance.npy"), + get_weights_accessor(data_path, total_path + "pointwise_BatchNorm_gamma.npy"), + get_weights_accessor(data_path, total_path + "pointwise_BatchNorm_beta.npy"), 0.001f) + .set_name(total_path + "pointwise/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f)) + .set_name(total_path + "pointwise/Relu6"); return ConcatLayer(std::move(sg)); } - ConcatLayer get_dwsc_node_qasymm(const std::string &data_path, std::string &¶m_path, + ConcatLayer get_dwsc_node_qasymm(const std::string &data_path, + std::string &¶m_path, const unsigned int conv_filt, - PadStrideInfo dwc_pad_stride_info, PadStrideInfo conv_pad_stride_info, - QuantizationInfo depth_weights_quant_info, QuantizationInfo point_weights_quant_info) + PadStrideInfo dwc_pad_stride_info, + PadStrideInfo conv_pad_stride_info, + QuantizationInfo depth_weights_quant_info, + QuantizationInfo point_weights_quant_info) { std::string total_path = param_path + "_"; SubStream sg(graph); - sg << DepthwiseConvolutionLayer( - 3U, 3U, - get_weights_accessor(data_path, total_path + "depthwise_weights.npy"), - get_weights_accessor(data_path, total_path + "depthwise_bias.npy"), - dwc_pad_stride_info, 1, std::move(depth_weights_quant_info)) - .set_name(total_path + "depthwise/depthwise") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)).set_name(total_path + "depthwise/Relu6") - << ConvolutionLayer( - 1U, 1U, conv_filt, - get_weights_accessor(data_path, total_path + "pointwise_weights.npy"), - get_weights_accessor(data_path, total_path + "pointwise_bias.npy"), - conv_pad_stride_info, 1, std::move(point_weights_quant_info)) - .set_name(total_path + "pointwise/Conv2D") - << 
ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)).set_name(total_path + "pointwise/Relu6"); + sg << DepthwiseConvolutionLayer(3U, 3U, get_weights_accessor(data_path, total_path + "depthwise_weights.npy"), + get_weights_accessor(data_path, total_path + "depthwise_bias.npy"), + dwc_pad_stride_info, 1, std::move(depth_weights_quant_info)) + .set_name(total_path + "depthwise/depthwise") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)) + .set_name(total_path + "depthwise/Relu6") + << ConvolutionLayer(1U, 1U, conv_filt, get_weights_accessor(data_path, total_path + "pointwise_weights.npy"), + get_weights_accessor(data_path, total_path + "pointwise_bias.npy"), conv_pad_stride_info, + 1, std::move(point_weights_quant_info)) + .set_name(total_path + "pointwise/Conv2D") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)) + .set_name(total_path + "pointwise/Relu6"); return ConcatLayer(std::move(sg)); } diff --git a/examples/graph_mobilenet_v2.cpp b/examples/graph_mobilenet_v2.cpp index c027e6f13e41618cdbca0ffdce4a6f6ac281f10c..9bc21c42c54f7bef053f34e239a329dc8de3f3f5 100644 --- a/examples/graph_mobilenet_v2.cpp +++ b/examples/graph_mobilenet_v2.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/graph.h" + #include "support/ToolchainSupport.h" #include "utils/CommonGraphOptions.h" #include "utils/GraphUtils.h" @@ -36,11 +37,10 @@ using namespace arm_compute::graph_utils; class GraphMobilenetV2Example : public Example { public: - GraphMobilenetV2Example() - : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "MobileNetV2") + GraphMobilenetV2Example() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "MobileNetV2") { } - GraphMobilenetV2Example(const GraphMobilenetV2Example &) = delete; + GraphMobilenetV2Example(const GraphMobilenetV2Example &) = delete; GraphMobilenetV2Example &operator=(const GraphMobilenetV2Example &) = delete; ~GraphMobilenetV2Example() override = default; @@ -54,7 +54,7 @@ public: common_params = consume_common_graph_parameters(common_opts); // Return when help menu is requested - if(common_params.help) + if (common_params.help) { cmd_parser.print_help(argv[0]); return false; @@ -64,15 +64,16 @@ public: std::cout << common_params << std::endl; // Create input descriptor - const TensorShape tensor_shape = permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, common_params.data_layout); - TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout); + const TensorShape tensor_shape = permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), + DataLayout::NCHW, common_params.data_layout); + TensorDescriptor input_descriptor = + TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout); // Set graph hints - graph << common_params.target - << common_params.fast_math_hint; + graph << common_params.target << common_params.fast_math_hint; // Create core graph - if(arm_compute::is_data_type_float(common_params.data_type)) + if (arm_compute::is_data_type_float(common_params.data_type)) { create_graph_float(input_descriptor); } @@ -82,8 +83,7 @@ public: } // Create common tail graph << ReshapeLayer(TensorShape(1001U)).set_name("Predictions/Reshape") - << SoftmaxLayer().set_name("Predictions/Softmax") - << OutputLayer(get_output_accessor(common_params, 5)); + 
<< SoftmaxLayer().set_name("Predictions/Softmax") << OutputLayer(get_output_accessor(common_params, 5)); // Finalize graph GraphConfig config; @@ -136,123 +136,143 @@ private: std::string data_path = common_params.data_path; // Add model path to data path - if(!data_path.empty()) + if (!data_path.empty()) { data_path += model_path; } graph << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), false)) - << ConvolutionLayer(3U, 3U, 32U, - get_weights_accessor(data_path, "Conv_weights.npy", DataLayout::NCHW), + << ConvolutionLayer(3U, 3U, 32U, get_weights_accessor(data_path, "Conv_weights.npy", DataLayout::NCHW), std::unique_ptr(nullptr), PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL)) - .set_name("Conv") + .set_name("Conv") << BatchNormalizationLayer(get_weights_accessor(data_path, "Conv_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, "Conv_BatchNorm_moving_variance.npy"), get_weights_accessor(data_path, "Conv_BatchNorm_gamma.npy"), get_weights_accessor(data_path, "Conv_BatchNorm_beta.npy"), 0.0010000000474974513f) - .set_name("Conv/BatchNorm") + .set_name("Conv/BatchNorm") << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f)) - .set_name("Conv/Relu6"); + .set_name("Conv/Relu6"); get_expanded_conv_float(data_path, "expanded_conv", 32U, 16U, PadStrideInfo(1, 1, 1, 1)); - get_expanded_conv_float(data_path, "expanded_conv_1", 16U, 24U, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), HasExpand::Yes); - get_expanded_conv_float(data_path, "expanded_conv_2", 24U, 24U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes, IsResidual::Yes); - get_expanded_conv_float(data_path, "expanded_conv_3", 24U, 32U, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), HasExpand::Yes); - get_expanded_conv_float(data_path, "expanded_conv_4", 32U, 32U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes, IsResidual::Yes); - get_expanded_conv_float(data_path, "expanded_conv_5", 32U, 32U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes, IsResidual::Yes); - get_expanded_conv_float(data_path, "expanded_conv_6", 32U, 64U, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), HasExpand::Yes); - get_expanded_conv_float(data_path, "expanded_conv_7", 64U, 64U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes, IsResidual::Yes); - get_expanded_conv_float(data_path, "expanded_conv_8", 64U, 64U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes, IsResidual::Yes); - get_expanded_conv_float(data_path, "expanded_conv_9", 64U, 64U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes, IsResidual::Yes); + get_expanded_conv_float(data_path, "expanded_conv_1", 16U, 24U, + PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), HasExpand::Yes); + get_expanded_conv_float(data_path, "expanded_conv_2", 24U, 24U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes, + IsResidual::Yes); + get_expanded_conv_float(data_path, "expanded_conv_3", 24U, 32U, + PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), HasExpand::Yes); + get_expanded_conv_float(data_path, "expanded_conv_4", 32U, 32U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes, + IsResidual::Yes); + get_expanded_conv_float(data_path, "expanded_conv_5", 32U, 32U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes, + IsResidual::Yes); + get_expanded_conv_float(data_path, "expanded_conv_6", 32U, 64U, + PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), HasExpand::Yes); + get_expanded_conv_float(data_path, "expanded_conv_7", 64U, 64U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes, + IsResidual::Yes); + 
get_expanded_conv_float(data_path, "expanded_conv_8", 64U, 64U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes, + IsResidual::Yes); + get_expanded_conv_float(data_path, "expanded_conv_9", 64U, 64U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes, + IsResidual::Yes); get_expanded_conv_float(data_path, "expanded_conv_10", 64U, 96U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes); - get_expanded_conv_float(data_path, "expanded_conv_11", 96U, 96U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes, IsResidual::Yes); - get_expanded_conv_float(data_path, "expanded_conv_12", 96U, 96U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes, IsResidual::Yes); - get_expanded_conv_float(data_path, "expanded_conv_13", 96U, 160U, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), HasExpand::Yes); - get_expanded_conv_float(data_path, "expanded_conv_14", 160U, 160U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes, IsResidual::Yes); - get_expanded_conv_float(data_path, "expanded_conv_15", 160U, 160U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes, IsResidual::Yes); + get_expanded_conv_float(data_path, "expanded_conv_11", 96U, 96U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes, + IsResidual::Yes); + get_expanded_conv_float(data_path, "expanded_conv_12", 96U, 96U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes, + IsResidual::Yes); + get_expanded_conv_float(data_path, "expanded_conv_13", 96U, 160U, + PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), HasExpand::Yes); + get_expanded_conv_float(data_path, "expanded_conv_14", 160U, 160U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes, + IsResidual::Yes); + get_expanded_conv_float(data_path, "expanded_conv_15", 160U, 160U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes, + IsResidual::Yes); get_expanded_conv_float(data_path, "expanded_conv_16", 160U, 320U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes); - graph << ConvolutionLayer(1U, 1U, 1280U, - get_weights_accessor(data_path, "Conv_1_weights.npy", DataLayout::NCHW), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("Conv_1") + graph << ConvolutionLayer( + 1U, 1U, 1280U, get_weights_accessor(data_path, "Conv_1_weights.npy", DataLayout::NCHW), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("Conv_1") << BatchNormalizationLayer(get_weights_accessor(data_path, "Conv_1_BatchNorm_moving_mean.npy"), get_weights_accessor(data_path, "Conv_1_BatchNorm_moving_variance.npy"), get_weights_accessor(data_path, "Conv_1_BatchNorm_gamma.npy"), get_weights_accessor(data_path, "Conv_1_BatchNorm_beta.npy"), 0.0010000000474974513f) - .set_name("Conv_1/BatchNorm") + .set_name("Conv_1/BatchNorm") << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f)) - .set_name("Conv_1/Relu6") + .set_name("Conv_1/Relu6") << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, common_params.data_layout)).set_name("Logits/AvgPool") << ConvolutionLayer(1U, 1U, 1001U, get_weights_accessor(data_path, "Logits_Conv2d_1c_1x1_weights.npy", DataLayout::NCHW), get_weights_accessor(data_path, "Logits_Conv2d_1c_1x1_biases.npy"), PadStrideInfo(1, 1, 0, 0)) - .set_name("Logits/Conv2d_1c_1x1"); + .set_name("Logits/Conv2d_1c_1x1"); } - void get_expanded_conv_float(const std::string &data_path, std::string &¶m_path, - unsigned int input_channels, unsigned int output_channels, - PadStrideInfo dwc_pad_stride_info, - HasExpand has_expand = HasExpand::No, IsResidual is_residual = IsResidual::No, - unsigned int expansion_size = 6) + void get_expanded_conv_float(const std::string &data_path, + std::string &¶m_path, + unsigned int input_channels, 
+ unsigned int output_channels, + PadStrideInfo dwc_pad_stride_info, + HasExpand has_expand = HasExpand::No, + IsResidual is_residual = IsResidual::No, + unsigned int expansion_size = 6) { std::string total_path = param_path + "_"; SubStream left(graph); // Add expand node - if(has_expand == HasExpand::Yes) + if (has_expand == HasExpand::Yes) { - left << ConvolutionLayer(1U, 1U, input_channels * expansion_size, - get_weights_accessor(data_path, total_path + "expand_weights.npy", DataLayout::NCHW), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/expand/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "expand_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "expand_BatchNorm_moving_variance.npy"), - get_weights_accessor(data_path, total_path + "expand_BatchNorm_gamma.npy"), - get_weights_accessor(data_path, total_path + "expand_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name(param_path + "/expand/BatchNorm") + left << ConvolutionLayer( + 1U, 1U, input_channels * expansion_size, + get_weights_accessor(data_path, total_path + "expand_weights.npy", DataLayout::NCHW), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/expand/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "expand_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "expand_BatchNorm_moving_variance.npy"), + get_weights_accessor(data_path, total_path + "expand_BatchNorm_gamma.npy"), + get_weights_accessor(data_path, total_path + "expand_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name(param_path + "/expand/BatchNorm") << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f)) - .set_name(param_path + "/expand/Relu6"); + .set_name(param_path + "/expand/Relu6"); } // Add depthwise node - left << DepthwiseConvolutionLayer(3U, 3U, - get_weights_accessor(data_path, total_path + "depthwise_depthwise_weights.npy", DataLayout::NCHW), - std::unique_ptr(nullptr), - dwc_pad_stride_info) - .set_name(param_path + "/depthwise/depthwise") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_moving_variance.npy"), - get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_gamma.npy"), - get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_beta.npy"), - 0.0010000000474974513f) - .set_name(param_path + "/depthwise/BatchNorm") + left << DepthwiseConvolutionLayer( + 3U, 3U, + get_weights_accessor(data_path, total_path + "depthwise_depthwise_weights.npy", DataLayout::NCHW), + std::unique_ptr(nullptr), dwc_pad_stride_info) + .set_name(param_path + "/depthwise/depthwise") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_moving_variance.npy"), + get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_gamma.npy"), + get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_beta.npy"), + 0.0010000000474974513f) + .set_name(param_path + "/depthwise/BatchNorm") << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f)) - .set_name(param_path + "/depthwise/Relu6"); + .set_name(param_path + "/depthwise/Relu6"); // Add project node left << ConvolutionLayer(1U, 1U, 
output_channels, get_weights_accessor(data_path, total_path + "project_weights.npy", DataLayout::NCHW), - std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/project/Conv2D") - << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "project_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, total_path + "project_BatchNorm_moving_variance.npy"), - get_weights_accessor(data_path, total_path + "project_BatchNorm_gamma.npy"), - get_weights_accessor(data_path, total_path + "project_BatchNorm_beta.npy"), - 0.0010000000474974513) - .set_name(param_path + "/project/BatchNorm"); - - if(is_residual == IsResidual::Yes) + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/project/Conv2D") + << BatchNormalizationLayer( + get_weights_accessor(data_path, total_path + "project_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, total_path + "project_BatchNorm_moving_variance.npy"), + get_weights_accessor(data_path, total_path + "project_BatchNorm_gamma.npy"), + get_weights_accessor(data_path, total_path + "project_BatchNorm_beta.npy"), 0.0010000000474974513) + .set_name(param_path + "/project/BatchNorm"); + + if (is_residual == IsResidual::Yes) { // Add residual node SubStream right(graph); - graph << EltwiseLayer(std::move(left), std::move(right), EltwiseOperation::Add).set_name(param_path + "/add"); + graph + << EltwiseLayer(std::move(left), std::move(right), EltwiseOperation::Add).set_name(param_path + "/add"); } else { @@ -269,7 +289,7 @@ private: std::string data_path = common_params.data_path; // Add model path to data path - if(!data_path.empty()) + if (!data_path.empty()) { data_path += model_path; } @@ -277,16 +297,14 @@ private: const QuantizationInfo in_quant_info = QuantizationInfo(0.0078125f, 128); const QuantizationInfo mid_quant_info = QuantizationInfo(0.023528477177023888f, 128); - const std::vector conv_weights_quant_info = - { + const std::vector conv_weights_quant_info = { QuantizationInfo(0.03396892547607422f, 122), // Conv QuantizationInfo(0.005167067516595125f, 125), // Conv1 QuantizationInfo(0.0016910821432247758f, 113) // Conv2d_1c_1x1 }; // Pointwise expand convolution quantization info - const std::vector pwc_q = - { + const std::vector pwc_q = { QuantizationInfo(0.254282623529f, 129), // expand_0 (Dummy) QuantizationInfo(0.009758507832884789f, 127), // expand_1 QuantizationInfo(0.0036556976847350597f, 144), // expand_2 @@ -306,8 +324,7 @@ private: QuantizationInfo(0.002046825597062707f, 135) // expand_16 }; // Depthwise expand convolution quantization info - const std::vector dwc_q = - { + const std::vector dwc_q = { QuantizationInfo(0.3436955213546753f, 165), // expand_0 QuantizationInfo(0.020969120785593987f, 109), // expand_1 QuantizationInfo(0.16981913149356842f, 52), // expand_2 @@ -327,8 +344,7 @@ private: QuantizationInfo(0.16456253826618195, 201) // expand_16 }; // Project convolution quantization info - const std::vector prwc_q = - { + const std::vector prwc_q = { QuantizationInfo(0.03737175464630127f, 140), // expand_0 QuantizationInfo(0.0225360207259655f, 156), // expand_1 QuantizationInfo(0.02740888111293316f, 122), // expand_2 @@ -350,65 +366,84 @@ private: graph << InputLayer(input_descriptor.set_quantization_info(in_quant_info), get_weights_accessor(data_path, common_params.image)) - << ConvolutionLayer( - 3U, 3U, 32U, - get_weights_accessor(data_path, "Conv_weights.npy"), - get_weights_accessor(data_path, "Conv_bias.npy"), - PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, 
DimensionRoundingType::FLOOR), - 1, conv_weights_quant_info.at(0), mid_quant_info) - .set_name("Conv") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)).set_name("Conv/Relu6") - << DepthwiseConvolutionLayer(3U, 3U, - get_weights_accessor(data_path, "expanded_conv_depthwise_depthwise_weights.npy"), - get_weights_accessor(data_path, "expanded_conv_depthwise_depthwise_biases.npy"), - PadStrideInfo(1, 1, 1, 1), 1, dwc_q.at(0)) - .set_name("expanded_conv/depthwise/depthwise") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)).set_name("expanded_conv/depthwise/Relu6") - << ConvolutionLayer(1U, 1U, 16U, - get_weights_accessor(data_path, "expanded_conv_project_weights.npy"), + << ConvolutionLayer(3U, 3U, 32U, get_weights_accessor(data_path, "Conv_weights.npy"), + get_weights_accessor(data_path, "Conv_bias.npy"), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), 1, + conv_weights_quant_info.at(0), mid_quant_info) + .set_name("Conv") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)) + .set_name("Conv/Relu6") + << DepthwiseConvolutionLayer( + 3U, 3U, get_weights_accessor(data_path, "expanded_conv_depthwise_depthwise_weights.npy"), + get_weights_accessor(data_path, "expanded_conv_depthwise_depthwise_biases.npy"), + PadStrideInfo(1, 1, 1, 1), 1, dwc_q.at(0)) + .set_name("expanded_conv/depthwise/depthwise") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)) + .set_name("expanded_conv/depthwise/Relu6") + << ConvolutionLayer(1U, 1U, 16U, get_weights_accessor(data_path, "expanded_conv_project_weights.npy"), get_weights_accessor(data_path, "expanded_conv_project_biases.npy"), PadStrideInfo(1, 1, 0, 0), 1, prwc_q.at(0)) - .set_name("expanded_conv/project/Conv2D"); - - get_expanded_conv_qasymm8(data_path, "expanded_conv_1", IsResidual::No, 96U, 24U, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), - pwc_q.at(1), dwc_q.at(1), prwc_q.at(1)); - get_expanded_conv_qasymm8(data_path, "expanded_conv_2", IsResidual::Yes, 144U, 24U, PadStrideInfo(1, 1, 1, 1), pwc_q.at(2), dwc_q.at(2), prwc_q.at(2)); - get_expanded_conv_qasymm8(data_path, "expanded_conv_3", IsResidual::No, 144U, 32U, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), - pwc_q.at(3), dwc_q.at(3), prwc_q.at(3)); - get_expanded_conv_qasymm8(data_path, "expanded_conv_4", IsResidual::Yes, 192U, 32U, PadStrideInfo(1, 1, 1, 1), pwc_q.at(4), dwc_q.at(4), prwc_q.at(4)); - get_expanded_conv_qasymm8(data_path, "expanded_conv_5", IsResidual::Yes, 192U, 32U, PadStrideInfo(1, 1, 1, 1), pwc_q.at(5), dwc_q.at(5), prwc_q.at(5)); - get_expanded_conv_qasymm8(data_path, "expanded_conv_6", IsResidual::No, 192U, 64U, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), - pwc_q.at(6), dwc_q.at(6), prwc_q.at(6)); - get_expanded_conv_qasymm8(data_path, "expanded_conv_7", IsResidual::Yes, 384U, 64U, PadStrideInfo(1, 1, 1, 1), pwc_q.at(7), dwc_q.at(7), prwc_q.at(7)); - get_expanded_conv_qasymm8(data_path, "expanded_conv_8", IsResidual::Yes, 384U, 64U, PadStrideInfo(1, 1, 1, 1), pwc_q.at(8), dwc_q.at(8), prwc_q.at(8)); - get_expanded_conv_qasymm8(data_path, "expanded_conv_9", IsResidual::Yes, 384U, 64U, PadStrideInfo(1, 1, 1, 1), pwc_q.at(9), dwc_q.at(9), prwc_q.at(9)); - get_expanded_conv_qasymm8(data_path, "expanded_conv_10", IsResidual::No, 384U, 96U, PadStrideInfo(1, 1, 1, 1), pwc_q.at(10), dwc_q.at(10), 
prwc_q.at(10)); - get_expanded_conv_qasymm8(data_path, "expanded_conv_11", IsResidual::Yes, 576U, 96U, PadStrideInfo(1, 1, 1, 1), pwc_q.at(11), dwc_q.at(11), prwc_q.at(11)); - get_expanded_conv_qasymm8(data_path, "expanded_conv_12", IsResidual::Yes, 576U, 96U, PadStrideInfo(1, 1, 1, 1), pwc_q.at(12), dwc_q.at(12), prwc_q.at(12)); - get_expanded_conv_qasymm8(data_path, "expanded_conv_13", IsResidual::No, 576U, 160U, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), - pwc_q.at(13), dwc_q.at(13), prwc_q.at(13)); - get_expanded_conv_qasymm8(data_path, "expanded_conv_14", IsResidual::Yes, 960U, 160U, PadStrideInfo(1, 1, 1, 1), pwc_q.at(14), dwc_q.at(14), prwc_q.at(14)); - get_expanded_conv_qasymm8(data_path, "expanded_conv_15", IsResidual::Yes, 960U, 160U, PadStrideInfo(1, 1, 1, 1), pwc_q.at(15), dwc_q.at(15), prwc_q.at(15)); - get_expanded_conv_qasymm8(data_path, "expanded_conv_16", IsResidual::No, 960U, 320U, PadStrideInfo(1, 1, 1, 1), pwc_q.at(16), dwc_q.at(16), prwc_q.at(16)); - - graph << ConvolutionLayer(1U, 1U, 1280U, - get_weights_accessor(data_path, "Conv_1_weights.npy"), - get_weights_accessor(data_path, "Conv_1_biases.npy"), - PadStrideInfo(1, 1, 0, 0), 1, conv_weights_quant_info.at(1)) - .set_name("Conv_1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)).set_name("Conv_1/Relu6") + .set_name("expanded_conv/project/Conv2D"); + + get_expanded_conv_qasymm8(data_path, "expanded_conv_1", IsResidual::No, 96U, 24U, + PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), pwc_q.at(1), + dwc_q.at(1), prwc_q.at(1)); + get_expanded_conv_qasymm8(data_path, "expanded_conv_2", IsResidual::Yes, 144U, 24U, PadStrideInfo(1, 1, 1, 1), + pwc_q.at(2), dwc_q.at(2), prwc_q.at(2)); + get_expanded_conv_qasymm8(data_path, "expanded_conv_3", IsResidual::No, 144U, 32U, + PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), pwc_q.at(3), + dwc_q.at(3), prwc_q.at(3)); + get_expanded_conv_qasymm8(data_path, "expanded_conv_4", IsResidual::Yes, 192U, 32U, PadStrideInfo(1, 1, 1, 1), + pwc_q.at(4), dwc_q.at(4), prwc_q.at(4)); + get_expanded_conv_qasymm8(data_path, "expanded_conv_5", IsResidual::Yes, 192U, 32U, PadStrideInfo(1, 1, 1, 1), + pwc_q.at(5), dwc_q.at(5), prwc_q.at(5)); + get_expanded_conv_qasymm8(data_path, "expanded_conv_6", IsResidual::No, 192U, 64U, + PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), pwc_q.at(6), + dwc_q.at(6), prwc_q.at(6)); + get_expanded_conv_qasymm8(data_path, "expanded_conv_7", IsResidual::Yes, 384U, 64U, PadStrideInfo(1, 1, 1, 1), + pwc_q.at(7), dwc_q.at(7), prwc_q.at(7)); + get_expanded_conv_qasymm8(data_path, "expanded_conv_8", IsResidual::Yes, 384U, 64U, PadStrideInfo(1, 1, 1, 1), + pwc_q.at(8), dwc_q.at(8), prwc_q.at(8)); + get_expanded_conv_qasymm8(data_path, "expanded_conv_9", IsResidual::Yes, 384U, 64U, PadStrideInfo(1, 1, 1, 1), + pwc_q.at(9), dwc_q.at(9), prwc_q.at(9)); + get_expanded_conv_qasymm8(data_path, "expanded_conv_10", IsResidual::No, 384U, 96U, PadStrideInfo(1, 1, 1, 1), + pwc_q.at(10), dwc_q.at(10), prwc_q.at(10)); + get_expanded_conv_qasymm8(data_path, "expanded_conv_11", IsResidual::Yes, 576U, 96U, PadStrideInfo(1, 1, 1, 1), + pwc_q.at(11), dwc_q.at(11), prwc_q.at(11)); + get_expanded_conv_qasymm8(data_path, "expanded_conv_12", IsResidual::Yes, 576U, 96U, PadStrideInfo(1, 1, 1, 1), + pwc_q.at(12), dwc_q.at(12), prwc_q.at(12)); + get_expanded_conv_qasymm8(data_path, "expanded_conv_13", IsResidual::No, 576U, 160U, + PadStrideInfo(2, 2, 0, 1, 0, 1, 
DimensionRoundingType::CEIL), pwc_q.at(13), + dwc_q.at(13), prwc_q.at(13)); + get_expanded_conv_qasymm8(data_path, "expanded_conv_14", IsResidual::Yes, 960U, 160U, PadStrideInfo(1, 1, 1, 1), + pwc_q.at(14), dwc_q.at(14), prwc_q.at(14)); + get_expanded_conv_qasymm8(data_path, "expanded_conv_15", IsResidual::Yes, 960U, 160U, PadStrideInfo(1, 1, 1, 1), + pwc_q.at(15), dwc_q.at(15), prwc_q.at(15)); + get_expanded_conv_qasymm8(data_path, "expanded_conv_16", IsResidual::No, 960U, 320U, PadStrideInfo(1, 1, 1, 1), + pwc_q.at(16), dwc_q.at(16), prwc_q.at(16)); + + graph << ConvolutionLayer(1U, 1U, 1280U, get_weights_accessor(data_path, "Conv_1_weights.npy"), + get_weights_accessor(data_path, "Conv_1_biases.npy"), PadStrideInfo(1, 1, 0, 0), 1, + conv_weights_quant_info.at(1)) + .set_name("Conv_1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)) + .set_name("Conv_1/Relu6") << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, common_params.data_layout)).set_name("Logits/AvgPool") - << ConvolutionLayer(1U, 1U, 1001U, - get_weights_accessor(data_path, "Logits_Conv2d_1c_1x1_weights.npy"), + << ConvolutionLayer(1U, 1U, 1001U, get_weights_accessor(data_path, "Logits_Conv2d_1c_1x1_weights.npy"), get_weights_accessor(data_path, "Logits_Conv2d_1c_1x1_biases.npy"), PadStrideInfo(1, 1, 0, 0), 1, conv_weights_quant_info.at(2)) - .set_name("Logits/Conv2d_1c_1x1"); + .set_name("Logits/Conv2d_1c_1x1"); } - void get_expanded_conv_qasymm8(const std::string &data_path, std::string &¶m_path, IsResidual is_residual, - unsigned int input_channels, unsigned int output_channels, + void get_expanded_conv_qasymm8(const std::string &data_path, + std::string &¶m_path, + IsResidual is_residual, + unsigned int input_channels, + unsigned int output_channels, PadStrideInfo dwc_pad_stride_info, - const QuantizationInfo &pwi, const QuantizationInfo &dwi, const QuantizationInfo &pji) + const QuantizationInfo &pwi, + const QuantizationInfo &dwi, + const QuantizationInfo &pji) { std::string total_path = param_path + "_"; @@ -417,25 +452,28 @@ private: get_weights_accessor(data_path, total_path + "project_weights.npy"), get_weights_accessor(data_path, total_path + "project_biases.npy"), PadStrideInfo(1, 1, 0, 0), 1, pwi) - .set_name(param_path + "/Conv2D") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)).set_name(param_path + "/Conv2D/Relu6") - << DepthwiseConvolutionLayer(3U, 3U, - get_weights_accessor(data_path, total_path + "depthwise_depthwise_weights.npy"), - get_weights_accessor(data_path, total_path + "depthwise_depthwise_biases.npy"), - dwc_pad_stride_info, 1, dwi) - .set_name(param_path + "/depthwise/depthwise") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)).set_name(param_path + "/depthwise/Relu6") + .set_name(param_path + "/Conv2D") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)) + .set_name(param_path + "/Conv2D/Relu6") + << DepthwiseConvolutionLayer( + 3U, 3U, get_weights_accessor(data_path, total_path + "depthwise_depthwise_weights.npy"), + get_weights_accessor(data_path, total_path + "depthwise_depthwise_biases.npy"), dwc_pad_stride_info, + 1, dwi) + .set_name(param_path + "/depthwise/depthwise") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)) + .set_name(param_path + "/depthwise/Relu6") << ConvolutionLayer(1U, 1U, output_channels, 
get_weights_accessor(data_path, total_path + "project_weights.npy"), get_weights_accessor(data_path, total_path + "project_biases.npy"), PadStrideInfo(1, 1, 0, 0), 1, pji) - .set_name(param_path + "/project/Conv2D"); + .set_name(param_path + "/project/Conv2D"); - if(is_residual == IsResidual::Yes) + if (is_residual == IsResidual::Yes) { // Add residual node SubStream right(graph); - graph << EltwiseLayer(std::move(left), std::move(right), EltwiseOperation::Add).set_name(param_path + "/add"); + graph + << EltwiseLayer(std::move(left), std::move(right), EltwiseOperation::Add).set_name(param_path + "/add"); } else { diff --git a/examples/graph_resnet12.cpp b/examples/graph_resnet12.cpp index 48708ce29a00fe048c4f3b3ad5639ae25cdff813..80db826be531b7fc708573cb6e0b62af5575ef65 100644 --- a/examples/graph_resnet12.cpp +++ b/examples/graph_resnet12.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/graph.h" + #include "support/ToolchainSupport.h" #include "utils/CommonGraphOptions.h" #include "utils/GraphUtils.h" @@ -36,7 +37,12 @@ class GraphResNet12Example : public Example { public: GraphResNet12Example() - : cmd_parser(), common_opts(cmd_parser), model_input_width(nullptr), model_input_height(nullptr), common_params(), graph(0, "ResNet12") + : cmd_parser(), + common_opts(cmd_parser), + model_input_width(nullptr), + model_input_height(nullptr), + common_params(), + graph(0, "ResNet12") { model_input_width = cmd_parser.add_option>("image-width", 192); model_input_height = cmd_parser.add_option>("image-height", 128); @@ -45,7 +51,7 @@ public: model_input_width->set_help("Input image width."); model_input_height->set_help("Input image height."); } - GraphResNet12Example(const GraphResNet12Example &) = delete; + GraphResNet12Example(const GraphResNet12Example &) = delete; GraphResNet12Example &operator=(const GraphResNet12Example &) = delete; ~GraphResNet12Example() override = default; bool do_setup(int argc, char **argv) override @@ -58,7 +64,7 @@ public: common_params = consume_common_graph_parameters(common_opts); // Return when help menu is requested - if(common_params.help) + if (common_params.help) { cmd_parser.print_help(argv[0]); return false; @@ -69,7 +75,8 @@ public: const unsigned int image_height = model_input_height->value(); // Checks - ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), "QASYMM8 not supported for this graph"); + ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), + "QASYMM8 not supported for this graph"); // Print parameter values std::cout << common_params << std::endl; @@ -84,50 +91,47 @@ public: std::unique_ptr preprocessor = std::make_unique(); // Create input descriptor - const TensorShape tensor_shape = permute_shape(TensorShape(image_width, image_height, 3U, common_params.batches), DataLayout::NCHW, common_params.data_layout); - TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout); + const TensorShape tensor_shape = + permute_shape(TensorShape(image_width, image_height, 3U, common_params.batches), DataLayout::NCHW, + common_params.data_layout); + TensorDescriptor input_descriptor = + TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout); // Set weights trained layout const DataLayout weights_layout = DataLayout::NCHW; - graph << common_params.target - << common_params.fast_math_hint - << InputLayer(input_descriptor, get_input_accessor(common_params, 
std::move(preprocessor), false /* Do not convert to BGR */)) - << ConvolutionLayer( - 9U, 9U, 64U, - get_weights_accessor(data_path, "conv1_weights.npy", weights_layout), - get_weights_accessor(data_path, "conv1_biases.npy", weights_layout), - PadStrideInfo(1, 1, 4, 4)) - .set_name("conv1/convolution") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv1/Relu"); + graph << common_params.target << common_params.fast_math_hint + << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), + false /* Do not convert to BGR */)) + << ConvolutionLayer(9U, 9U, 64U, get_weights_accessor(data_path, "conv1_weights.npy", weights_layout), + get_weights_accessor(data_path, "conv1_biases.npy", weights_layout), + PadStrideInfo(1, 1, 4, 4)) + .set_name("conv1/convolution") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv1/Relu"); add_residual_block(data_path, "block1", weights_layout); add_residual_block(data_path, "block2", weights_layout); add_residual_block(data_path, "block3", weights_layout); add_residual_block(data_path, "block4", weights_layout); - graph << ConvolutionLayer( - 3U, 3U, 64U, - get_weights_accessor(data_path, "conv10_weights.npy", weights_layout), - get_weights_accessor(data_path, "conv10_biases.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv10/convolution") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv10/Relu") - << ConvolutionLayer( - 3U, 3U, 64U, - get_weights_accessor(data_path, "conv11_weights.npy", weights_layout), - get_weights_accessor(data_path, "conv11_biases.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv11/convolution") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv11/Relu") - << ConvolutionLayer( - 9U, 9U, 3U, - get_weights_accessor(data_path, "conv12_weights.npy", weights_layout), - get_weights_accessor(data_path, "conv12_biases.npy"), - PadStrideInfo(1, 1, 4, 4)) - .set_name("conv12/convolution") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH)).set_name("conv12/Tanh") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 0.58f, 0.5f)).set_name("conv12/Linear") + graph << ConvolutionLayer(3U, 3U, 64U, get_weights_accessor(data_path, "conv10_weights.npy", weights_layout), + get_weights_accessor(data_path, "conv10_biases.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv10/convolution") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv10/Relu") + << ConvolutionLayer(3U, 3U, 64U, get_weights_accessor(data_path, "conv11_weights.npy", weights_layout), + get_weights_accessor(data_path, "conv11_biases.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv11/convolution") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv11/Relu") + << ConvolutionLayer(9U, 9U, 3U, get_weights_accessor(data_path, "conv12_weights.npy", weights_layout), + get_weights_accessor(data_path, "conv12_biases.npy"), PadStrideInfo(1, 1, 4, 4)) + .set_name("conv12/convolution") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH)) + .set_name("conv12/Tanh") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 0.58f, 0.5f)) + .set_name("conv12/Linear") << OutputLayer(std::make_unique(0)); // 
Finalize graph @@ -152,8 +156,8 @@ public: private: CommandLineParser cmd_parser; CommonGraphOptions common_opts; - SimpleOption *model_input_width{ nullptr }; - SimpleOption *model_input_height{ nullptr }; + SimpleOption *model_input_width{nullptr}; + SimpleOption *model_input_height{nullptr}; CommonGraphParams common_params; Stream graph; @@ -170,35 +174,33 @@ private: SubStream left(graph); SubStream right(graph); - right << ConvolutionLayer( - 3U, 3U, 64U, - get_weights_accessor(data_path, unit_path + "conv1_weights.npy", weights_layout), - get_weights_accessor(data_path, unit_path + "conv1_biases.npy", weights_layout), - PadStrideInfo(1, 1, 1, 1)) - .set_name(unit_name + "conv1/convolution") + right << ConvolutionLayer(3U, 3U, 64U, + get_weights_accessor(data_path, unit_path + "conv1_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "conv1_biases.npy", weights_layout), + PadStrideInfo(1, 1, 1, 1)) + .set_name(unit_name + "conv1/convolution") << BatchNormalizationLayer( - get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_moving_variance.npy"), - get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_gamma.npy"), - get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_beta.npy"), - 0.0000100099996416f) - .set_name(unit_name + "conv1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "conv1/Relu") - - << ConvolutionLayer( - 3U, 3U, 64U, - get_weights_accessor(data_path, unit_path + "conv2_weights.npy", weights_layout), - get_weights_accessor(data_path, unit_path + "conv2_biases.npy", weights_layout), - PadStrideInfo(1, 1, 1, 1)) - .set_name(unit_name + "conv2/convolution") + get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_moving_variance.npy"), + get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_gamma.npy"), + get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_beta.npy"), 0.0000100099996416f) + .set_name(unit_name + "conv1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "conv1/Relu") + + << ConvolutionLayer(3U, 3U, 64U, + get_weights_accessor(data_path, unit_path + "conv2_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "conv2_biases.npy", weights_layout), + PadStrideInfo(1, 1, 1, 1)) + .set_name(unit_name + "conv2/convolution") << BatchNormalizationLayer( - get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_moving_variance.npy"), - get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_gamma.npy"), - get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_beta.npy"), - 0.0000100099996416f) - .set_name(unit_name + "conv2/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "conv2/Relu"); + get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_moving_variance.npy"), + get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_gamma.npy"), + get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_beta.npy"), 0.0000100099996416f) + .set_name(unit_name + "conv2/BatchNorm") + << 
ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "conv2/Relu"); graph << EltwiseLayer(std::move(left), std::move(right), EltwiseOperation::Add).set_name(unit_name + "add"); } diff --git a/examples/graph_resnet50.cpp b/examples/graph_resnet50.cpp index 0d3322c886ef67ac62fba3bf7c5b3b9fbbac25c9..ba0f0d5fb611f96aa8943bde351afeb7c2850a9d 100644 --- a/examples/graph_resnet50.cpp +++ b/examples/graph_resnet50.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/graph.h" + #include "support/ToolchainSupport.h" #include "utils/CommonGraphOptions.h" #include "utils/GraphUtils.h" @@ -35,8 +36,7 @@ using namespace arm_compute::graph_utils; class GraphResNetV1_50Example : public Example { public: - GraphResNetV1_50Example() - : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "ResNetV1_50") + GraphResNetV1_50Example() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "ResNetV1_50") { } bool do_setup(int argc, char **argv) override @@ -49,7 +49,7 @@ public: common_params = consume_common_graph_parameters(common_opts); // Return when help menu is requested - if(common_params.help) + if (common_params.help) { cmd_parser.print_help(argv[0]); return false; @@ -62,36 +62,40 @@ public: std::string data_path = common_params.data_path; // Create a preprocessor object - const std::array mean_rgb{ { 122.68f, 116.67f, 104.01f } }; - std::unique_ptr preprocessor = std::make_unique(mean_rgb, - false /* Do not convert to BGR */); + const std::array mean_rgb{{122.68f, 116.67f, 104.01f}}; + std::unique_ptr preprocessor = + std::make_unique(mean_rgb, false /* Do not convert to BGR */); // Create input descriptor const auto operation_layout = common_params.data_layout; - const TensorShape tensor_shape = permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout); - TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout); + const TensorShape tensor_shape = + permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout); + TensorDescriptor input_descriptor = + TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout); // Set weights trained layout const DataLayout weights_layout = DataLayout::NCHW; - graph << common_params.target - << common_params.fast_math_hint - << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), false /* Do not convert to BGR */)) + graph << common_params.target << common_params.fast_math_hint + << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), + false /* Do not convert to BGR */)) << ConvolutionLayer( - 7U, 7U, 64U, - get_weights_accessor(data_path, "/cnn_data/resnet50_model/conv1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(2, 2, 3, 3)) - .set_name("conv1/convolution") + 7U, 7U, 64U, + get_weights_accessor(data_path, "/cnn_data/resnet50_model/conv1_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(2, 2, 3, 3)) + .set_name("conv1/convolution") << BatchNormalizationLayer( - get_weights_accessor(data_path, "/cnn_data/resnet50_model/conv1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, "/cnn_data/resnet50_model/conv1_BatchNorm_moving_variance.npy"), - get_weights_accessor(data_path, "/cnn_data/resnet50_model/conv1_BatchNorm_gamma.npy"), - get_weights_accessor(data_path, 
"/cnn_data/resnet50_model/conv1_BatchNorm_beta.npy"), - 0.0000100099996416f) - .set_name("conv1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv1/Relu") - << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR))).set_name("pool1/MaxPool"); + get_weights_accessor(data_path, "/cnn_data/resnet50_model/conv1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, "/cnn_data/resnet50_model/conv1_BatchNorm_moving_variance.npy"), + get_weights_accessor(data_path, "/cnn_data/resnet50_model/conv1_BatchNorm_gamma.npy"), + get_weights_accessor(data_path, "/cnn_data/resnet50_model/conv1_BatchNorm_beta.npy"), + 0.0000100099996416f) + .set_name("conv1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv1/Relu") + << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, + PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR))) + .set_name("pool1/MaxPool"); add_residual_block(data_path, "block1", weights_layout, 64, 3, 2); add_residual_block(data_path, "block2", weights_layout, 128, 4, 2); @@ -100,13 +104,12 @@ public: graph << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, operation_layout)).set_name("pool5") << ConvolutionLayer( - 1U, 1U, 1000U, - get_weights_accessor(data_path, "/cnn_data/resnet50_model/logits_weights.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/resnet50_model/logits_biases.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name("logits/convolution") - << FlattenLayer().set_name("predictions/Reshape") - << SoftmaxLayer().set_name("predictions/Softmax") + 1U, 1U, 1000U, + get_weights_accessor(data_path, "/cnn_data/resnet50_model/logits_weights.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/resnet50_model/logits_biases.npy"), + PadStrideInfo(1, 1, 0, 0)) + .set_name("logits/convolution") + << FlattenLayer().set_name("predictions/Reshape") << SoftmaxLayer().set_name("predictions/Softmax") << OutputLayer(get_output_accessor(common_params, 5)); // Finalize graph @@ -136,10 +139,14 @@ private: CommonGraphParams common_params; Stream graph; - void add_residual_block(const std::string &data_path, const std::string &name, DataLayout weights_layout, - unsigned int base_depth, unsigned int num_units, unsigned int stride) + void add_residual_block(const std::string &data_path, + const std::string &name, + DataLayout weights_layout, + unsigned int base_depth, + unsigned int num_units, + unsigned int stride) { - for(unsigned int i = 0; i < num_units; ++i) + for (unsigned int i = 0; i < num_units; ++i) { std::stringstream unit_path_ss; unit_path_ss << "/cnn_data/resnet50_model/" << name << "_unit_" << (i + 1) << "_bottleneck_v1_"; @@ -151,89 +158,90 @@ private: unsigned int middle_stride = 1; - if(i == (num_units - 1)) + if (i == (num_units - 1)) { middle_stride = stride; } SubStream right(graph); - right << ConvolutionLayer( - 1U, 1U, base_depth, - get_weights_accessor(data_path, unit_path + "conv1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name(unit_name + "conv1/convolution") + right << ConvolutionLayer(1U, 1U, base_depth, + get_weights_accessor(data_path, unit_path + "conv1_weights.npy", weights_layout), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 0, 0)) + .set_name(unit_name + "conv1/convolution") << BatchNormalizationLayer( - get_weights_accessor(data_path, 
unit_path + "conv1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_moving_variance.npy"), - get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_gamma.npy"), - get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_beta.npy"), - 0.0000100099996416f) - .set_name(unit_name + "conv1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "conv1/Relu") + get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_moving_variance.npy"), + get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_gamma.npy"), + get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_beta.npy"), 0.0000100099996416f) + .set_name(unit_name + "conv1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "conv1/Relu") - << ConvolutionLayer( - 3U, 3U, base_depth, - get_weights_accessor(data_path, unit_path + "conv2_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(middle_stride, middle_stride, 1, 1)) - .set_name(unit_name + "conv2/convolution") + << ConvolutionLayer(3U, 3U, base_depth, + get_weights_accessor(data_path, unit_path + "conv2_weights.npy", weights_layout), + std::unique_ptr(nullptr), + PadStrideInfo(middle_stride, middle_stride, 1, 1)) + .set_name(unit_name + "conv2/convolution") << BatchNormalizationLayer( - get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_moving_variance.npy"), - get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_gamma.npy"), - get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_beta.npy"), - 0.0000100099996416f) - .set_name(unit_name + "conv2/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "conv1/Relu") + get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_moving_variance.npy"), + get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_gamma.npy"), + get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_beta.npy"), 0.0000100099996416f) + .set_name(unit_name + "conv2/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "conv1/Relu") - << ConvolutionLayer( - 1U, 1U, base_depth * 4, - get_weights_accessor(data_path, unit_path + "conv3_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name(unit_name + "conv3/convolution") + << ConvolutionLayer(1U, 1U, base_depth * 4, + get_weights_accessor(data_path, unit_path + "conv3_weights.npy", weights_layout), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 0, 0)) + .set_name(unit_name + "conv3/convolution") << BatchNormalizationLayer( - get_weights_accessor(data_path, unit_path + "conv3_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "conv3_BatchNorm_moving_variance.npy"), - get_weights_accessor(data_path, unit_path + "conv3_BatchNorm_gamma.npy"), - get_weights_accessor(data_path, unit_path + "conv3_BatchNorm_beta.npy"), - 0.0000100099996416f) - .set_name(unit_name + "conv2/BatchNorm"); + get_weights_accessor(data_path, unit_path + "conv3_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, unit_path + 
"conv3_BatchNorm_moving_variance.npy"), + get_weights_accessor(data_path, unit_path + "conv3_BatchNorm_gamma.npy"), + get_weights_accessor(data_path, unit_path + "conv3_BatchNorm_beta.npy"), 0.0000100099996416f) + .set_name(unit_name + "conv2/BatchNorm"); - if(i == 0) + if (i == 0) { SubStream left(graph); left << ConvolutionLayer( - 1U, 1U, base_depth * 4, - get_weights_accessor(data_path, unit_path + "shortcut_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name(unit_name + "shortcut/convolution") + 1U, 1U, base_depth * 4, + get_weights_accessor(data_path, unit_path + "shortcut_weights.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name(unit_name + "shortcut/convolution") << BatchNormalizationLayer( - get_weights_accessor(data_path, unit_path + "shortcut_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "shortcut_BatchNorm_moving_variance.npy"), - get_weights_accessor(data_path, unit_path + "shortcut_BatchNorm_gamma.npy"), - get_weights_accessor(data_path, unit_path + "shortcut_BatchNorm_beta.npy"), - 0.0000100099996416f) - .set_name(unit_name + "shortcut/BatchNorm"); + get_weights_accessor(data_path, unit_path + "shortcut_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, unit_path + "shortcut_BatchNorm_moving_variance.npy"), + get_weights_accessor(data_path, unit_path + "shortcut_BatchNorm_gamma.npy"), + get_weights_accessor(data_path, unit_path + "shortcut_BatchNorm_beta.npy"), + 0.0000100099996416f) + .set_name(unit_name + "shortcut/BatchNorm"); - graph << EltwiseLayer(std::move(left), std::move(right), EltwiseOperation::Add).set_name(unit_name + "add"); + graph << EltwiseLayer(std::move(left), std::move(right), EltwiseOperation::Add) + .set_name(unit_name + "add"); } - else if(middle_stride > 1) + else if (middle_stride > 1) { SubStream left(graph); - left << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 1, common_params.data_layout, PadStrideInfo(middle_stride, middle_stride, 0, 0), true)).set_name(unit_name + "shortcut/MaxPool"); + left << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 1, common_params.data_layout, + PadStrideInfo(middle_stride, middle_stride, 0, 0), true)) + .set_name(unit_name + "shortcut/MaxPool"); - graph << EltwiseLayer(std::move(left), std::move(right), EltwiseOperation::Add).set_name(unit_name + "add"); + graph << EltwiseLayer(std::move(left), std::move(right), EltwiseOperation::Add) + .set_name(unit_name + "add"); } else { SubStream left(graph); - graph << EltwiseLayer(std::move(left), std::move(right), EltwiseOperation::Add).set_name(unit_name + "add"); + graph << EltwiseLayer(std::move(left), std::move(right), EltwiseOperation::Add) + .set_name(unit_name + "add"); } - graph << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Relu"); + graph << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Relu"); } } }; diff --git a/examples/graph_resnet_v2_50.cpp b/examples/graph_resnet_v2_50.cpp index 6d5abb4f4b94eff3f103140b8c42914c4b508c8d..48cf9b0b3ca05ef64077858fd51a523c54d9ab52 100644 --- a/examples/graph_resnet_v2_50.cpp +++ b/examples/graph_resnet_v2_50.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/graph.h" + #include "support/ToolchainSupport.h" #include "utils/CommonGraphOptions.h" #include "utils/GraphUtils.h" @@ -35,8 +36,7 @@ using namespace arm_compute::graph_utils; class GraphResNetV2_50Example : public Example { public: - GraphResNetV2_50Example() - : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "ResNetV2_50") + GraphResNetV2_50Example() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "ResNetV2_50") { } bool do_setup(int argc, char **argv) override @@ -49,7 +49,7 @@ public: common_params = consume_common_graph_parameters(common_opts); // Return when help menu is requested - if(common_params.help) + if (common_params.help) { cmd_parser.print_help(argv[0]); return false; @@ -61,7 +61,7 @@ public: // Get trainable parameters data path std::string data_path = common_params.data_path; std::string model_path = "/cnn_data/resnet_v2_50_model/"; - if(!data_path.empty()) + if (!data_path.empty()) { data_path += model_path; } @@ -71,45 +71,42 @@ public: // Create input descriptor const auto operation_layout = common_params.data_layout; - const TensorShape tensor_shape = permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout); - TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout); + const TensorShape tensor_shape = + permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout); + TensorDescriptor input_descriptor = + TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout); // Set weights trained layout const DataLayout weights_layout = DataLayout::NCHW; - graph << common_params.target - << common_params.fast_math_hint - << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), false /* Do not convert to BGR */)) - << ConvolutionLayer( - 7U, 7U, 64U, - get_weights_accessor(data_path, "conv1_weights.npy", weights_layout), - get_weights_accessor(data_path, "conv1_biases.npy", weights_layout), - PadStrideInfo(2, 2, 3, 3)) - .set_name("conv1/convolution") - << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR))).set_name("pool1/MaxPool"); + graph << common_params.target << common_params.fast_math_hint + << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), + false /* Do not convert to BGR */)) + << ConvolutionLayer(7U, 7U, 64U, get_weights_accessor(data_path, "conv1_weights.npy", weights_layout), + get_weights_accessor(data_path, "conv1_biases.npy", weights_layout), + PadStrideInfo(2, 2, 3, 3)) + .set_name("conv1/convolution") + << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, + PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR))) + .set_name("pool1/MaxPool"); add_residual_block(data_path, "block1", weights_layout, 64, 3, 2); add_residual_block(data_path, "block2", weights_layout, 128, 4, 2); add_residual_block(data_path, "block3", weights_layout, 256, 6, 2); add_residual_block(data_path, "block4", weights_layout, 512, 3, 1); - graph << BatchNormalizationLayer( - get_weights_accessor(data_path, "postnorm_moving_mean.npy"), - get_weights_accessor(data_path, "postnorm_moving_variance.npy"), - get_weights_accessor(data_path, "postnorm_gamma.npy"), - get_weights_accessor(data_path, "postnorm_beta.npy"), - 0.000009999999747378752f) - .set_name("postnorm/BatchNorm") - << 
ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("postnorm/Relu") + graph << BatchNormalizationLayer(get_weights_accessor(data_path, "postnorm_moving_mean.npy"), + get_weights_accessor(data_path, "postnorm_moving_variance.npy"), + get_weights_accessor(data_path, "postnorm_gamma.npy"), + get_weights_accessor(data_path, "postnorm_beta.npy"), 0.000009999999747378752f) + .set_name("postnorm/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("postnorm/Relu") << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, operation_layout)).set_name("pool5") - << ConvolutionLayer( - 1U, 1U, 1001U, - get_weights_accessor(data_path, "logits_weights.npy", weights_layout), - get_weights_accessor(data_path, "logits_biases.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name("logits/convolution") - << FlattenLayer().set_name("predictions/Reshape") - << SoftmaxLayer().set_name("predictions/Softmax") + << ConvolutionLayer(1U, 1U, 1001U, get_weights_accessor(data_path, "logits_weights.npy", weights_layout), + get_weights_accessor(data_path, "logits_biases.npy"), PadStrideInfo(1, 1, 0, 0)) + .set_name("logits/convolution") + << FlattenLayer().set_name("predictions/Reshape") << SoftmaxLayer().set_name("predictions/Softmax") << OutputLayer(get_output_accessor(common_params, 5)); // Finalize graph @@ -139,10 +136,14 @@ private: CommonGraphParams common_params; Stream graph; - void add_residual_block(const std::string &data_path, const std::string &name, DataLayout weights_layout, - unsigned int base_depth, unsigned int num_units, unsigned int stride) + void add_residual_block(const std::string &data_path, + const std::string &name, + DataLayout weights_layout, + unsigned int base_depth, + unsigned int num_units, + unsigned int stride) { - for(unsigned int i = 0; i < num_units; ++i) + for (unsigned int i = 0; i < num_units; ++i) { // Generate unit names std::stringstream unit_path_ss; @@ -154,7 +155,8 @@ private: std::string unit_name = unit_name_ss.str(); const TensorShape last_shape = graph.graph().node(graph.tail_node())->output(0)->desc().shape; - unsigned int depth_in = last_shape[arm_compute::get_data_layout_dimension_index(common_params.data_layout, DataLayoutDimension::CHANNEL)]; + unsigned int depth_in = last_shape[arm_compute::get_data_layout_dimension_index( + common_params.data_layout, DataLayoutDimension::CHANNEL)]; unsigned int depth_out = base_depth * 4; // All units have stride 1 apart from last one @@ -162,73 +164,76 @@ private: // Preact SubStream preact(graph); - preact << BatchNormalizationLayer( - get_weights_accessor(data_path, unit_path + "preact_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "preact_moving_variance.npy"), - get_weights_accessor(data_path, unit_path + "preact_gamma.npy"), - get_weights_accessor(data_path, unit_path + "preact_beta.npy"), - 0.000009999999747378752f) - .set_name(unit_name + "preact/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "preact/Relu"); + preact << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "preact_moving_mean.npy"), + get_weights_accessor(data_path, unit_path + "preact_moving_variance.npy"), + get_weights_accessor(data_path, unit_path + "preact_gamma.npy"), + get_weights_accessor(data_path, unit_path + "preact_beta.npy"), + 0.000009999999747378752f) + .set_name(unit_name + "preact/BatchNorm") + << 
ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "preact/Relu"); // Create bottleneck path SubStream shortcut(graph); - if(depth_in == depth_out) + if (depth_in == depth_out) { - if(middle_stride != 1) + if (middle_stride != 1) { - shortcut << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 1, common_params.data_layout, PadStrideInfo(middle_stride, middle_stride, 0, 0), true)).set_name(unit_name + "shortcut/MaxPool"); + shortcut << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 1, common_params.data_layout, + PadStrideInfo(middle_stride, middle_stride, 0, 0), true)) + .set_name(unit_name + "shortcut/MaxPool"); } } else { shortcut.forward_tail(preact.tail_node()); shortcut << ConvolutionLayer( - 1U, 1U, depth_out, - get_weights_accessor(data_path, unit_path + "shortcut_weights.npy", weights_layout), - get_weights_accessor(data_path, unit_path + "shortcut_biases.npy", weights_layout), - PadStrideInfo(1, 1, 0, 0)) - .set_name(unit_name + "shortcut/convolution"); + 1U, 1U, depth_out, + get_weights_accessor(data_path, unit_path + "shortcut_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "shortcut_biases.npy", weights_layout), + PadStrideInfo(1, 1, 0, 0)) + .set_name(unit_name + "shortcut/convolution"); } // Create residual path SubStream residual(preact); - residual << ConvolutionLayer( - 1U, 1U, base_depth, - get_weights_accessor(data_path, unit_path + "conv1_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name(unit_name + "conv1/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_moving_variance.npy"), - get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_gamma.npy"), - get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_beta.npy"), - 0.000009999999747378752f) - .set_name(unit_name + "conv1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "conv1/Relu") - << ConvolutionLayer( - 3U, 3U, base_depth, - get_weights_accessor(data_path, unit_path + "conv2_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(middle_stride, middle_stride, 1, 1)) - .set_name(unit_name + "conv2/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_moving_mean.npy"), - get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_moving_variance.npy"), - get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_gamma.npy"), - get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_beta.npy"), - 0.000009999999747378752f) - .set_name(unit_name + "conv2/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "conv1/Relu") - << ConvolutionLayer( - 1U, 1U, depth_out, - get_weights_accessor(data_path, unit_path + "conv3_weights.npy", weights_layout), - get_weights_accessor(data_path, unit_path + "conv3_biases.npy", weights_layout), - PadStrideInfo(1, 1, 0, 0)) - .set_name(unit_name + "conv3/convolution"); - - graph << EltwiseLayer(std::move(shortcut), std::move(residual), EltwiseOperation::Add).set_name(unit_name + "add"); + residual + << ConvolutionLayer(1U, 1U, base_depth, + get_weights_accessor(data_path, unit_path + "conv1_weights.npy", weights_layout), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 0, 0)) + 
.set_name(unit_name + "conv1/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_moving_variance.npy"), + get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_gamma.npy"), + get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_beta.npy"), + 0.000009999999747378752f) + .set_name(unit_name + "conv1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "conv1/Relu") + << ConvolutionLayer(3U, 3U, base_depth, + get_weights_accessor(data_path, unit_path + "conv2_weights.npy", weights_layout), + std::unique_ptr(nullptr), + PadStrideInfo(middle_stride, middle_stride, 1, 1)) + .set_name(unit_name + "conv2/convolution") + << BatchNormalizationLayer( + get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_moving_mean.npy"), + get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_moving_variance.npy"), + get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_gamma.npy"), + get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_beta.npy"), + 0.000009999999747378752f) + .set_name(unit_name + "conv2/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "conv1/Relu") + << ConvolutionLayer(1U, 1U, depth_out, + get_weights_accessor(data_path, unit_path + "conv3_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "conv3_biases.npy", weights_layout), + PadStrideInfo(1, 1, 0, 0)) + .set_name(unit_name + "conv3/convolution"); + + graph << EltwiseLayer(std::move(shortcut), std::move(residual), EltwiseOperation::Add) + .set_name(unit_name + "add"); } } }; diff --git a/examples/graph_resnext50.cpp b/examples/graph_resnext50.cpp index 6378f6c74188a398342ffd4e714741b7ca5eaa28..12a1507c4c222c5b6cfff4c6ede595af33d1c7c8 100644 --- a/examples/graph_resnext50.cpp +++ b/examples/graph_resnext50.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/graph.h" + #include "support/ToolchainSupport.h" #include "utils/CommonGraphOptions.h" #include "utils/GraphUtils.h" @@ -35,8 +36,7 @@ using namespace arm_compute::graph_utils; class GraphResNeXt50Example : public Example { public: - GraphResNeXt50Example() - : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "ResNeXt50") + GraphResNeXt50Example() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "ResNeXt50") { } bool do_setup(int argc, char **argv) override @@ -49,14 +49,15 @@ public: common_params = consume_common_graph_parameters(common_opts); // Return when help menu is requested - if(common_params.help) + if (common_params.help) { cmd_parser.print_help(argv[0]); return false; } // Checks - ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), "QASYMM8 not supported for this graph"); + ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), + "QASYMM8 not supported for this graph"); // Print parameter values std::cout << common_params << std::endl; @@ -66,28 +67,33 @@ public: // Create input descriptor const auto operation_layout = common_params.data_layout; - const TensorShape tensor_shape = permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout); - TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout); + const TensorShape tensor_shape = + permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout); + TensorDescriptor input_descriptor = + TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout); // Set weights trained layout const DataLayout weights_layout = DataLayout::NCHW; - graph << common_params.target - << common_params.fast_math_hint + graph << common_params.target << common_params.fast_math_hint << InputLayer(input_descriptor, get_input_accessor(common_params)) << ScaleLayer(get_weights_accessor(data_path, "/cnn_data/resnext50_model/bn_data_mul.npy"), get_weights_accessor(data_path, "/cnn_data/resnext50_model/bn_data_add.npy")) - .set_name("bn_data/Scale") + .set_name("bn_data/Scale") << ConvolutionLayer( - 7U, 7U, 64U, - get_weights_accessor(data_path, "/cnn_data/resnext50_model/conv0_weights.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/resnext50_model/conv0_biases.npy"), - PadStrideInfo(2, 2, 2, 3, 2, 3, DimensionRoundingType::FLOOR)) - .set_name("conv0/Convolution") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv0/Relu") - << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR))).set_name("pool0"); - - add_residual_block(data_path, weights_layout, /*ofm*/ 256, /*stage*/ 1, /*num_unit*/ 3, /*stride_conv_unit1*/ 1); + 7U, 7U, 64U, + get_weights_accessor(data_path, "/cnn_data/resnext50_model/conv0_weights.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/resnext50_model/conv0_biases.npy"), + PadStrideInfo(2, 2, 2, 3, 2, 3, DimensionRoundingType::FLOOR)) + .set_name("conv0/Convolution") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv0/Relu") + << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, + PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR))) + .set_name("pool0"); + + 
add_residual_block(data_path, weights_layout, /*ofm*/ 256, /*stage*/ 1, /*num_unit*/ 3, + /*stride_conv_unit1*/ 1); add_residual_block(data_path, weights_layout, 512, 2, 4, 2); add_residual_block(data_path, weights_layout, 1024, 3, 6, 2); add_residual_block(data_path, weights_layout, 2048, 4, 3, 2); @@ -121,10 +127,14 @@ private: CommonGraphParams common_params; Stream graph; - void add_residual_block(const std::string &data_path, DataLayout weights_layout, - unsigned int base_depth, unsigned int stage, unsigned int num_units, unsigned int stride_conv_unit1) + void add_residual_block(const std::string &data_path, + DataLayout weights_layout, + unsigned int base_depth, + unsigned int stage, + unsigned int num_units, + unsigned int stride_conv_unit1) { - for(unsigned int i = 0; i < num_units; ++i) + for (unsigned int i = 0; i < num_units; ++i) { std::stringstream unit_path_ss; unit_path_ss << "/cnn_data/resnext50_model/stage" << stage << "_unit" << (i + 1) << "_"; @@ -135,54 +145,55 @@ private: std::string unit_name = unit_name_ss.str(); PadStrideInfo pad_grouped_conv(1, 1, 1, 1); - if(i == 0) + if (i == 0) { - pad_grouped_conv = (stage == 1) ? PadStrideInfo(stride_conv_unit1, stride_conv_unit1, 1, 1) : PadStrideInfo(stride_conv_unit1, stride_conv_unit1, 0, 1, 0, 1, DimensionRoundingType::FLOOR); + pad_grouped_conv = (stage == 1) ? PadStrideInfo(stride_conv_unit1, stride_conv_unit1, 1, 1) + : PadStrideInfo(stride_conv_unit1, stride_conv_unit1, 0, 1, 0, 1, + DimensionRoundingType::FLOOR); } SubStream right(graph); - right << ConvolutionLayer( - 1U, 1U, base_depth / 2, - get_weights_accessor(data_path, unit_path + "conv1_weights.npy", weights_layout), - get_weights_accessor(data_path, unit_path + "conv1_biases.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name(unit_name + "conv1/convolution") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "conv1/Relu") - - << ConvolutionLayer( - 3U, 3U, base_depth / 2, - get_weights_accessor(data_path, unit_path + "conv2_weights.npy", weights_layout), - std::unique_ptr(nullptr), - pad_grouped_conv, 32) - .set_name(unit_name + "conv2/convolution") + right << ConvolutionLayer(1U, 1U, base_depth / 2, + get_weights_accessor(data_path, unit_path + "conv1_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "conv1_biases.npy"), + PadStrideInfo(1, 1, 0, 0)) + .set_name(unit_name + "conv1/convolution") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "conv1/Relu") + + << ConvolutionLayer(3U, 3U, base_depth / 2, + get_weights_accessor(data_path, unit_path + "conv2_weights.npy", weights_layout), + std::unique_ptr(nullptr), pad_grouped_conv, + 32) + .set_name(unit_name + "conv2/convolution") << ScaleLayer(get_weights_accessor(data_path, unit_path + "bn2_mul.npy"), get_weights_accessor(data_path, unit_path + "bn2_add.npy")) - .set_name(unit_name + "conv1/Scale") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "conv2/Relu") + .set_name(unit_name + "conv1/Scale") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "conv2/Relu") - << ConvolutionLayer( - 1U, 1U, base_depth, - get_weights_accessor(data_path, unit_path + "conv3_weights.npy", weights_layout), - get_weights_accessor(data_path, unit_path + "conv3_biases.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name(unit_name + "conv3/convolution"); + << 
ConvolutionLayer(1U, 1U, base_depth, + get_weights_accessor(data_path, unit_path + "conv3_weights.npy", weights_layout), + get_weights_accessor(data_path, unit_path + "conv3_biases.npy"), + PadStrideInfo(1, 1, 0, 0)) + .set_name(unit_name + "conv3/convolution"); SubStream left(graph); - if(i == 0) + if (i == 0) { - left << ConvolutionLayer( - 1U, 1U, base_depth, - get_weights_accessor(data_path, unit_path + "sc_weights.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(stride_conv_unit1, stride_conv_unit1, 0, 0)) - .set_name(unit_name + "sc/convolution") + left << ConvolutionLayer(1U, 1U, base_depth, + get_weights_accessor(data_path, unit_path + "sc_weights.npy", weights_layout), + std::unique_ptr(nullptr), + PadStrideInfo(stride_conv_unit1, stride_conv_unit1, 0, 0)) + .set_name(unit_name + "sc/convolution") << ScaleLayer(get_weights_accessor(data_path, unit_path + "sc_bn_mul.npy"), get_weights_accessor(data_path, unit_path + "sc_bn_add.npy")) - .set_name(unit_name + "sc/scale"); + .set_name(unit_name + "sc/scale"); } graph << EltwiseLayer(std::move(left), std::move(right), EltwiseOperation::Add).set_name(unit_name + "add"); - graph << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Relu"); + graph << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "Relu"); } } }; diff --git a/examples/graph_shufflenet.cpp b/examples/graph_shufflenet.cpp index 6e13c5eeb419fc071c5b2d42f28ef7b8cd73931f..513d95884ea824e677d2340a34f170392f1703e4 100644 --- a/examples/graph_shufflenet.cpp +++ b/examples/graph_shufflenet.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/graph.h" + #include "support/ToolchainSupport.h" #include "utils/CommonGraphOptions.h" #include "utils/GraphUtils.h" @@ -35,8 +36,7 @@ using namespace arm_compute::graph_utils; class ShuffleNetExample : public Example { public: - ShuffleNetExample() - : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "ShuffleNet") + ShuffleNetExample() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "ShuffleNet") { } bool do_setup(int argc, char **argv) override @@ -49,20 +49,21 @@ public: common_params = consume_common_graph_parameters(common_opts); // Return when help menu is requested - if(common_params.help) + if (common_params.help) { cmd_parser.print_help(argv[0]); return false; } // Set default layout if needed (Single kernel grouped convolution not yet supported int NHWC) - if(!common_opts.data_layout->is_set()) + if (!common_opts.data_layout->is_set()) { common_params.data_layout = DataLayout::NHWC; } // Checks - ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), "QASYMM8 not supported for this graph"); + ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), + "QASYMM8 not supported for this graph"); // Print parameter values std::cout << common_params << std::endl; @@ -75,15 +76,17 @@ public: std::string data_path = common_params.data_path; // Add model path to data path - if(!data_path.empty()) + if (!data_path.empty()) { data_path += model_path; } // Create input descriptor const auto operation_layout = common_params.data_layout; - const TensorShape tensor_shape = permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout); - TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout); + 
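The ResNeXt50 unit reformatted above hinges on the grouped 3x3 convolution with cardinality 32 acting on base_depth / 2 channels; the per-group widths are worth spelling out since they follow directly from the stage depths passed to add_residual_block. A small arithmetic sketch using the depths wired up in this example (256, 512, 1024, 2048):

#include <cstdio>

int main()
{
    const unsigned int cardinality    = 32;                          // groups in the 3x3 conv
    const unsigned int stage_depths[] = {256U, 512U, 1024U, 2048U};  // base_depth per stage

    for (unsigned int base_depth : stage_depths)
    {
        const unsigned int mid_channels = base_depth / 2;             // conv1/conv2 width
        const unsigned int per_group    = mid_channels / cardinality; // channels each group sees
        std::printf("base_depth %4u -> grouped 3x3 over %4u channels, %2u per group\n",
                    base_depth, mid_channels, per_group);
    }
    return 0;
}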
const TensorShape tensor_shape = + permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout); + TensorDescriptor input_descriptor = + TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout); // Set weights trained layout const DataLayout weights_layout = DataLayout::NCHW; @@ -91,24 +94,22 @@ public: // Create preprocessor std::unique_ptr preprocessor = std::make_unique(0); - graph << common_params.target - << common_params.fast_math_hint - << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), false /* Do not convert to BGR */)) - << ConvolutionLayer( - 3U, 3U, 24U, - get_weights_accessor(data_path, "conv3_0_w_0.npy", weights_layout), - get_weights_accessor(data_path, "conv3_0_b_0.npy", weights_layout), - PadStrideInfo(2, 2, 1, 1)) - .set_name("Conv1/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, "conv3_0_bn_rm_0.npy"), - get_weights_accessor(data_path, "conv3_0_bn_riv_0.npy"), - get_weights_accessor(data_path, "conv3_0_bn_s_0.npy"), - get_weights_accessor(data_path, "conv3_0_bn_b_0.npy"), - 1e-5f) - .set_name("Conv1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv1/Relu") - << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 1, 1))).set_name("pool1/MaxPool"); + graph << common_params.target << common_params.fast_math_hint + << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), + false /* Do not convert to BGR */)) + << ConvolutionLayer(3U, 3U, 24U, get_weights_accessor(data_path, "conv3_0_w_0.npy", weights_layout), + get_weights_accessor(data_path, "conv3_0_b_0.npy", weights_layout), + PadStrideInfo(2, 2, 1, 1)) + .set_name("Conv1/convolution") + << BatchNormalizationLayer(get_weights_accessor(data_path, "conv3_0_bn_rm_0.npy"), + get_weights_accessor(data_path, "conv3_0_bn_riv_0.npy"), + get_weights_accessor(data_path, "conv3_0_bn_s_0.npy"), + get_weights_accessor(data_path, "conv3_0_bn_b_0.npy"), 1e-5f) + .set_name("Conv1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("Conv1/Relu") + << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 1, 1))) + .set_name("pool1/MaxPool"); // Stage 2 add_residual_block(data_path, DataLayout::NCHW, 0U /* unit */, 112U /* depth */, 2U /* stride */); @@ -134,13 +135,10 @@ public: graph << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, operation_layout)).set_name("predictions/AvgPool") << FlattenLayer().set_name("predictions/Reshape") - << FullyConnectedLayer( - 1000U, - get_weights_accessor(data_path, "pred_w_0.npy", weights_layout), - get_weights_accessor(data_path, "pred_b_0.npy")) - .set_name("predictions/FC") - << SoftmaxLayer().set_name("predictions/Softmax") - << OutputLayer(get_output_accessor(common_params, 5)); + << FullyConnectedLayer(1000U, get_weights_accessor(data_path, "pred_w_0.npy", weights_layout), + get_weights_accessor(data_path, "pred_b_0.npy")) + .set_name("predictions/FC") + << SoftmaxLayer().set_name("predictions/Softmax") << OutputLayer(get_output_accessor(common_params, 5)); // Finalize graph GraphConfig config; @@ -167,8 +165,11 @@ private: CommonGraphParams common_params; Stream graph; - void add_residual_block(const std::string &data_path, DataLayout weights_layout, - unsigned int unit, unsigned int depth, unsigned int stride) + void 
add_residual_block(const std::string &data_path, + DataLayout weights_layout, + unsigned int unit, + unsigned int depth, + unsigned int stride) { PadStrideInfo dwc_info = PadStrideInfo(1, 1, 1, 1); const unsigned int gconv_id = unit * 2; @@ -181,63 +182,61 @@ private: SubStream left_ss(graph); SubStream right_ss(graph); - if(stride == 2) + if (stride == 2) { - right_ss << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout, PadStrideInfo(2, 2, 1, 1))).set_name(unit_name + "/pool_1/AveragePool"); + right_ss << PoolingLayer( + PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout, PadStrideInfo(2, 2, 1, 1))) + .set_name(unit_name + "/pool_1/AveragePool"); dwc_info = PadStrideInfo(2, 2, 1, 1); } - left_ss << ConvolutionLayer( - 1U, 1U, depth, - get_weights_accessor(data_path, "gconv1_" + gconv_id_name + "_w_0.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0), num_groups) - .set_name(unit_name + "/gconv1_" + gconv_id_name + "/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, "gconv1_" + gconv_id_name + "_bn_rm_0.npy"), - get_weights_accessor(data_path, "gconv1_" + gconv_id_name + "_bn_riv_0.npy"), - get_weights_accessor(data_path, "gconv1_" + gconv_id_name + "_bn_s_0.npy"), - get_weights_accessor(data_path, "gconv1_" + gconv_id_name + "_bn_b_0.npy"), - 1e-5f) - .set_name(unit_name + "/gconv1_" + gconv_id_name + "/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "/gconv1_" + gconv_id_name + "/Relu") - << ChannelShuffleLayer(num_groups).set_name(unit_name + "/shuffle_0/ChannelShufle") - << DepthwiseConvolutionLayer( - 3U, 3U, - get_weights_accessor(data_path, "gconv3_" + unit_id_name + "_w_0.npy", weights_layout), - std::unique_ptr(nullptr), - dwc_info) - .set_name(unit_name + "/gconv3_" + unit_id_name + "/depthwise") - << BatchNormalizationLayer( - get_weights_accessor(data_path, "gconv3_" + unit_id_name + "_bn_rm_0.npy"), - get_weights_accessor(data_path, "gconv3_" + unit_id_name + "_bn_riv_0.npy"), - get_weights_accessor(data_path, "gconv3_" + unit_id_name + "_bn_s_0.npy"), - get_weights_accessor(data_path, "gconv3_" + unit_id_name + "_bn_b_0.npy"), - 1e-5f) - .set_name(unit_name + "/gconv3_" + unit_id_name + "/BatchNorm") - << ConvolutionLayer( - 1U, 1U, depth, - get_weights_accessor(data_path, "gconv1_" + gconv_id_1_name + "_w_0.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0), num_groups) - .set_name(unit_name + "/gconv1_" + gconv_id_1_name + "/convolution") - << BatchNormalizationLayer( - get_weights_accessor(data_path, "gconv1_" + gconv_id_1_name + "_bn_rm_0.npy"), - get_weights_accessor(data_path, "gconv1_" + gconv_id_1_name + "_bn_riv_0.npy"), - get_weights_accessor(data_path, "gconv1_" + gconv_id_1_name + "_bn_s_0.npy"), - get_weights_accessor(data_path, "gconv1_" + gconv_id_1_name + "_bn_b_0.npy"), - 1e-5f) - .set_name(unit_name + "/gconv1_" + gconv_id_1_name + "/BatchNorm"); + left_ss + << ConvolutionLayer(1U, 1U, depth, + get_weights_accessor(data_path, "gconv1_" + gconv_id_name + "_w_0.npy", weights_layout), + std::unique_ptr(nullptr), + PadStrideInfo(1, 1, 0, 0), num_groups) + .set_name(unit_name + "/gconv1_" + gconv_id_name + "/convolution") + << BatchNormalizationLayer(get_weights_accessor(data_path, "gconv1_" + gconv_id_name + "_bn_rm_0.npy"), + get_weights_accessor(data_path, "gconv1_" + gconv_id_name + "_bn_riv_0.npy"), + get_weights_accessor(data_path, "gconv1_" + gconv_id_name + 
"_bn_s_0.npy"), + get_weights_accessor(data_path, "gconv1_" + gconv_id_name + "_bn_b_0.npy"), + 1e-5f) + .set_name(unit_name + "/gconv1_" + gconv_id_name + "/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "/gconv1_" + gconv_id_name + "/Relu") + << ChannelShuffleLayer(num_groups).set_name(unit_name + "/shuffle_0/ChannelShufle") + << DepthwiseConvolutionLayer( + 3U, 3U, get_weights_accessor(data_path, "gconv3_" + unit_id_name + "_w_0.npy", weights_layout), + std::unique_ptr(nullptr), dwc_info) + .set_name(unit_name + "/gconv3_" + unit_id_name + "/depthwise") + << BatchNormalizationLayer(get_weights_accessor(data_path, "gconv3_" + unit_id_name + "_bn_rm_0.npy"), + get_weights_accessor(data_path, "gconv3_" + unit_id_name + "_bn_riv_0.npy"), + get_weights_accessor(data_path, "gconv3_" + unit_id_name + "_bn_s_0.npy"), + get_weights_accessor(data_path, "gconv3_" + unit_id_name + "_bn_b_0.npy"), 1e-5f) + .set_name(unit_name + "/gconv3_" + unit_id_name + "/BatchNorm") + << ConvolutionLayer( + 1U, 1U, depth, + get_weights_accessor(data_path, "gconv1_" + gconv_id_1_name + "_w_0.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0), num_groups) + .set_name(unit_name + "/gconv1_" + gconv_id_1_name + "/convolution") + << BatchNormalizationLayer(get_weights_accessor(data_path, "gconv1_" + gconv_id_1_name + "_bn_rm_0.npy"), + get_weights_accessor(data_path, "gconv1_" + gconv_id_1_name + "_bn_riv_0.npy"), + get_weights_accessor(data_path, "gconv1_" + gconv_id_1_name + "_bn_s_0.npy"), + get_weights_accessor(data_path, "gconv1_" + gconv_id_1_name + "_bn_b_0.npy"), + 1e-5f) + .set_name(unit_name + "/gconv1_" + gconv_id_1_name + "/BatchNorm"); - if(stride == 2) + if (stride == 2) { graph << ConcatLayer(std::move(left_ss), std::move(right_ss)).set_name(unit_name + "/Concat"); } else { - graph << EltwiseLayer(std::move(left_ss), std::move(right_ss), EltwiseOperation::Add).set_name(unit_name + "/Add"); + graph << EltwiseLayer(std::move(left_ss), std::move(right_ss), EltwiseOperation::Add) + .set_name(unit_name + "/Add"); } - graph << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "/Relu"); + graph << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(unit_name + "/Relu"); } }; diff --git a/examples/graph_squeezenet.cpp b/examples/graph_squeezenet.cpp index 3ea2fea38f1336177cae1d6aee6a53972a04f2ed..7d0528f805429a1ecf72605938a013d0b94ddc28 100644 --- a/examples/graph_squeezenet.cpp +++ b/examples/graph_squeezenet.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/graph.h" + #include "support/ToolchainSupport.h" #include "utils/CommonGraphOptions.h" #include "utils/GraphUtils.h" @@ -35,8 +36,7 @@ using namespace arm_compute::graph_utils; class GraphSqueezenetExample : public Example { public: - GraphSqueezenetExample() - : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "SqueezeNetV1") + GraphSqueezenetExample() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "SqueezeNetV1") { } bool do_setup(int argc, char **argv) override @@ -49,7 +49,7 @@ public: common_params = consume_common_graph_parameters(common_opts); // Return when help menu is requested - if(common_params.help) + if (common_params.help) { cmd_parser.print_help(argv[0]); return false; @@ -62,104 +62,128 @@ public: std::string data_path = common_params.data_path; // Create a preprocessor object - const std::array mean_rgb{ { 122.68f, 116.67f, 104.01f } }; + const std::array mean_rgb{{122.68f, 116.67f, 104.01f}}; std::unique_ptr preprocessor = std::make_unique(mean_rgb); // Create input descriptor const auto operation_layout = common_params.data_layout; - const TensorShape tensor_shape = permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout); - TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout); + const TensorShape tensor_shape = + permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout); + TensorDescriptor input_descriptor = + TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout); // Set weights trained layout const DataLayout weights_layout = DataLayout::NCHW; - graph << common_params.target - << common_params.fast_math_hint + graph << common_params.target << common_params.fast_math_hint << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor))) << ConvolutionLayer( - 7U, 7U, 96U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/conv1_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/conv1_b.npy"), - PadStrideInfo(2, 2, 0, 0)) - .set_name("conv1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu_conv1") - << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))).set_name("pool1") + 7U, 7U, 96U, + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/conv1_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/conv1_b.npy"), + PadStrideInfo(2, 2, 0, 0)) + .set_name("conv1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("relu_conv1") + << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, + PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))) + .set_name("pool1") << ConvolutionLayer( - 1U, 1U, 16U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire2_squeeze1x1_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire2_squeeze1x1_b.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name("fire2/squeeze1x1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire2/relu_squeeze1x1"); + 1U, 1U, 16U, + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire2_squeeze1x1_w.npy", + weights_layout), + 
get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire2_squeeze1x1_b.npy"), + PadStrideInfo(1, 1, 0, 0)) + .set_name("fire2/squeeze1x1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("fire2/relu_squeeze1x1"); graph << get_expand_fire_node(data_path, "fire2", weights_layout, 64U, 64U).set_name("fire2/concat"); graph << ConvolutionLayer( - 1U, 1U, 16U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire3_squeeze1x1_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire3_squeeze1x1_b.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name("fire3/squeeze1x1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire3/relu_squeeze1x1"); + 1U, 1U, 16U, + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire3_squeeze1x1_w.npy", + weights_layout), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire3_squeeze1x1_b.npy"), + PadStrideInfo(1, 1, 0, 0)) + .set_name("fire3/squeeze1x1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("fire3/relu_squeeze1x1"); graph << get_expand_fire_node(data_path, "fire3", weights_layout, 64U, 64U).set_name("fire3/concat"); graph << ConvolutionLayer( - 1U, 1U, 32U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire4_squeeze1x1_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire4_squeeze1x1_b.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name("fire4/squeeze1x1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire4/relu_squeeze1x1"); + 1U, 1U, 32U, + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire4_squeeze1x1_w.npy", + weights_layout), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire4_squeeze1x1_b.npy"), + PadStrideInfo(1, 1, 0, 0)) + .set_name("fire4/squeeze1x1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("fire4/relu_squeeze1x1"); graph << get_expand_fire_node(data_path, "fire4", weights_layout, 128U, 128U).set_name("fire4/concat"); - graph << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))).set_name("pool4") + graph << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, + PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))) + .set_name("pool4") << ConvolutionLayer( - 1U, 1U, 32U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire5_squeeze1x1_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire5_squeeze1x1_b.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name("fire5/squeeze1x1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire5/relu_squeeze1x1"); + 1U, 1U, 32U, + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire5_squeeze1x1_w.npy", + weights_layout), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire5_squeeze1x1_b.npy"), + PadStrideInfo(1, 1, 0, 0)) + .set_name("fire5/squeeze1x1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("fire5/relu_squeeze1x1"); graph << get_expand_fire_node(data_path, "fire5", weights_layout, 128U, 128U).set_name("fire5/concat"); graph << ConvolutionLayer( - 1U, 1U, 48U, - 
get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire6_squeeze1x1_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire6_squeeze1x1_b.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name("fire6/squeeze1x1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire6/relu_squeeze1x1"); + 1U, 1U, 48U, + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire6_squeeze1x1_w.npy", + weights_layout), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire6_squeeze1x1_b.npy"), + PadStrideInfo(1, 1, 0, 0)) + .set_name("fire6/squeeze1x1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("fire6/relu_squeeze1x1"); graph << get_expand_fire_node(data_path, "fire6", weights_layout, 192U, 192U).set_name("fire6/concat"); graph << ConvolutionLayer( - 1U, 1U, 48U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire7_squeeze1x1_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire7_squeeze1x1_b.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name("fire7/squeeze1x1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire7/relu_squeeze1x1"); + 1U, 1U, 48U, + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire7_squeeze1x1_w.npy", + weights_layout), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire7_squeeze1x1_b.npy"), + PadStrideInfo(1, 1, 0, 0)) + .set_name("fire7/squeeze1x1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("fire7/relu_squeeze1x1"); graph << get_expand_fire_node(data_path, "fire7", weights_layout, 192U, 192U).set_name("fire7/concat"); graph << ConvolutionLayer( - 1U, 1U, 64U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire8_squeeze1x1_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire8_squeeze1x1_b.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name("fire8/squeeze1x1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire8/relu_squeeze1x1"); + 1U, 1U, 64U, + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire8_squeeze1x1_w.npy", + weights_layout), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire8_squeeze1x1_b.npy"), + PadStrideInfo(1, 1, 0, 0)) + .set_name("fire8/squeeze1x1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("fire8/relu_squeeze1x1"); graph << get_expand_fire_node(data_path, "fire8", weights_layout, 256U, 256U).set_name("fire8/concat"); - graph << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))).set_name("pool8") + graph << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, + PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))) + .set_name("pool8") << ConvolutionLayer( - 1U, 1U, 64U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire9_squeeze1x1_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire9_squeeze1x1_b.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name("fire9/squeeze1x1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire9/relu_squeeze1x1"); + 1U, 1U, 64U, + 
get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire9_squeeze1x1_w.npy", + weights_layout), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire9_squeeze1x1_b.npy"), + PadStrideInfo(1, 1, 0, 0)) + .set_name("fire9/squeeze1x1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("fire9/relu_squeeze1x1"); graph << get_expand_fire_node(data_path, "fire9", weights_layout, 256U, 256U).set_name("fire9/concat"); graph << ConvolutionLayer( - 1U, 1U, 1000U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/conv10_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/conv10_b.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name("conv10") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu_conv10") + 1U, 1U, 1000U, + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/conv10_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/conv10_b.npy"), + PadStrideInfo(1, 1, 0, 0)) + .set_name("conv10") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("relu_conv10") << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, operation_layout)).set_name("pool10") - << FlattenLayer().set_name("flatten") - << SoftmaxLayer().set_name("prob") + << FlattenLayer().set_name("flatten") << SoftmaxLayer().set_name("prob") << OutputLayer(get_output_accessor(common_params, 5)); // Finalize graph @@ -188,27 +212,30 @@ private: CommonGraphParams common_params; Stream graph; - ConcatLayer get_expand_fire_node(const std::string &data_path, std::string &¶m_path, DataLayout weights_layout, - unsigned int expand1_filt, unsigned int expand3_filt) + ConcatLayer get_expand_fire_node(const std::string &data_path, + std::string &¶m_path, + DataLayout weights_layout, + unsigned int expand1_filt, + unsigned int expand3_filt) { std::string total_path = "/cnn_data/squeezenet_v1.0_model/" + param_path + "_"; SubStream i_a(graph); - i_a << ConvolutionLayer( - 1U, 1U, expand1_filt, - get_weights_accessor(data_path, total_path + "expand1x1_w.npy", weights_layout), - get_weights_accessor(data_path, total_path + "expand1x1_b.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/expand1x1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/relu_expand1x1"); + i_a << ConvolutionLayer(1U, 1U, expand1_filt, + get_weights_accessor(data_path, total_path + "expand1x1_w.npy", weights_layout), + get_weights_accessor(data_path, total_path + "expand1x1_b.npy"), + PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/expand1x1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/relu_expand1x1"); SubStream i_b(graph); - i_b << ConvolutionLayer( - 3U, 3U, expand3_filt, - get_weights_accessor(data_path, total_path + "expand3x3_w.npy", weights_layout), - get_weights_accessor(data_path, total_path + "expand3x3_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name(param_path + "/expand3x3") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/relu_expand3x3"); + i_b << ConvolutionLayer(3U, 3U, expand3_filt, + get_weights_accessor(data_path, total_path + "expand3x3_w.npy", weights_layout), + get_weights_accessor(data_path, total_path + "expand3x3_b.npy"), + PadStrideInfo(1, 1, 1, 1)) + 
.set_name(param_path + "/expand3x3") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/relu_expand3x3"); return ConcatLayer(std::move(i_a), std::move(i_b)); } diff --git a/examples/graph_squeezenet_v1_1.cpp b/examples/graph_squeezenet_v1_1.cpp index 9cc183fbbdeab62981d56f751c249810dd5d4877..ed0f692db2aefc7c9f7ba17b429909f79d50ffe0 100644 --- a/examples/graph_squeezenet_v1_1.cpp +++ b/examples/graph_squeezenet_v1_1.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/graph.h" + #include "support/ToolchainSupport.h" #include "utils/CommonGraphOptions.h" #include "utils/GraphUtils.h" @@ -35,8 +36,7 @@ using namespace arm_compute::graph_utils; class GraphSqueezenet_v1_1Example : public Example { public: - GraphSqueezenet_v1_1Example() - : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "SqueezeNetV1.1") + GraphSqueezenet_v1_1Example() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "SqueezeNetV1.1") { } bool do_setup(int argc, char **argv) override @@ -49,7 +49,7 @@ public: common_params = consume_common_graph_parameters(common_opts); // Return when help menu is requested - if(common_params.help) + if (common_params.help) { cmd_parser.print_help(argv[0]); return false; @@ -62,104 +62,128 @@ public: std::string data_path = common_params.data_path; // Create a preprocessor object - const std::array mean_rgb{ { 122.68f, 116.67f, 104.01f } }; + const std::array mean_rgb{{122.68f, 116.67f, 104.01f}}; std::unique_ptr preprocessor = std::make_unique(mean_rgb); // Create input descriptor const auto operation_layout = common_params.data_layout; - const TensorShape tensor_shape = permute_shape(TensorShape(227U, 227U, 3U, common_params.batches), DataLayout::NCHW, operation_layout); - TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout); + const TensorShape tensor_shape = + permute_shape(TensorShape(227U, 227U, 3U, common_params.batches), DataLayout::NCHW, operation_layout); + TensorDescriptor input_descriptor = + TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout); // Set weights trained layout const DataLayout weights_layout = DataLayout::NCHW; - graph << common_params.target - << common_params.fast_math_hint + graph << common_params.target << common_params.fast_math_hint << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor))) << ConvolutionLayer( - 3U, 3U, 64U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/conv1_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/conv1_b.npy"), - PadStrideInfo(2, 2, 0, 0)) - .set_name("conv1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu_conv1") - << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))).set_name("pool1") + 3U, 3U, 64U, + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/conv1_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/conv1_b.npy"), + PadStrideInfo(2, 2, 0, 0)) + .set_name("conv1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("relu_conv1") + << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, + PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))) + .set_name("pool1") << 
ConvolutionLayer( - 1U, 1U, 16U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire2_squeeze1x1_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire2_squeeze1x1_b.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name("fire2/squeeze1x1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire2/relu_squeeze1x1"); + 1U, 1U, 16U, + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire2_squeeze1x1_w.npy", + weights_layout), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire2_squeeze1x1_b.npy"), + PadStrideInfo(1, 1, 0, 0)) + .set_name("fire2/squeeze1x1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("fire2/relu_squeeze1x1"); graph << get_expand_fire_node(data_path, "fire2", weights_layout, 64U, 64U).set_name("fire2/concat"); graph << ConvolutionLayer( - 1U, 1U, 16U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire3_squeeze1x1_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire3_squeeze1x1_b.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name("fire3/squeeze1x1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire3/relu_squeeze1x1"); + 1U, 1U, 16U, + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire3_squeeze1x1_w.npy", + weights_layout), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire3_squeeze1x1_b.npy"), + PadStrideInfo(1, 1, 0, 0)) + .set_name("fire3/squeeze1x1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("fire3/relu_squeeze1x1"); graph << get_expand_fire_node(data_path, "fire3", weights_layout, 64U, 64U).set_name("fire3/concat"); - graph << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))).set_name("pool3") + graph << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, + PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))) + .set_name("pool3") << ConvolutionLayer( - 1U, 1U, 32U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire4_squeeze1x1_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire4_squeeze1x1_b.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name("fire4/squeeze1x1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire4/relu_squeeze1x1"); + 1U, 1U, 32U, + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire4_squeeze1x1_w.npy", + weights_layout), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire4_squeeze1x1_b.npy"), + PadStrideInfo(1, 1, 0, 0)) + .set_name("fire4/squeeze1x1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("fire4/relu_squeeze1x1"); graph << get_expand_fire_node(data_path, "fire4", weights_layout, 128U, 128U).set_name("fire4/concat"); graph << ConvolutionLayer( - 1U, 1U, 32U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire5_squeeze1x1_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire5_squeeze1x1_b.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name("fire5/squeeze1x1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire5/relu_squeeze1x1"); + 1U, 1U, 
32U, + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire5_squeeze1x1_w.npy", + weights_layout), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire5_squeeze1x1_b.npy"), + PadStrideInfo(1, 1, 0, 0)) + .set_name("fire5/squeeze1x1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("fire5/relu_squeeze1x1"); graph << get_expand_fire_node(data_path, "fire5", weights_layout, 128U, 128U).set_name("fire5/concat"); - graph << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))).set_name("pool5") + graph << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, + PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))) + .set_name("pool5") << ConvolutionLayer( - 1U, 1U, 48U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire6_squeeze1x1_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire6_squeeze1x1_b.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name("fire6/squeeze1x1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire6/relu_squeeze1x1"); + 1U, 1U, 48U, + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire6_squeeze1x1_w.npy", + weights_layout), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire6_squeeze1x1_b.npy"), + PadStrideInfo(1, 1, 0, 0)) + .set_name("fire6/squeeze1x1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("fire6/relu_squeeze1x1"); graph << get_expand_fire_node(data_path, "fire6", weights_layout, 192U, 192U).set_name("fire6/concat"); graph << ConvolutionLayer( - 1U, 1U, 48U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire7_squeeze1x1_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire7_squeeze1x1_b.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name("fire7/squeeze1x1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire7/relu_squeeze1x1"); + 1U, 1U, 48U, + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire7_squeeze1x1_w.npy", + weights_layout), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire7_squeeze1x1_b.npy"), + PadStrideInfo(1, 1, 0, 0)) + .set_name("fire7/squeeze1x1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("fire7/relu_squeeze1x1"); graph << get_expand_fire_node(data_path, "fire7", weights_layout, 192U, 192U).set_name("fire7/concat"); graph << ConvolutionLayer( - 1U, 1U, 64U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire8_squeeze1x1_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire8_squeeze1x1_b.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name("fire8/squeeze1x1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire8/relu_squeeze1x1"); + 1U, 1U, 64U, + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire8_squeeze1x1_w.npy", + weights_layout), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire8_squeeze1x1_b.npy"), + PadStrideInfo(1, 1, 0, 0)) + .set_name("fire8/squeeze1x1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("fire8/relu_squeeze1x1"); graph << 
get_expand_fire_node(data_path, "fire8", weights_layout, 256U, 256U).set_name("fire8/concat"); graph << ConvolutionLayer( - 1U, 1U, 64U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire9_squeeze1x1_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire9_squeeze1x1_b.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name("fire9/squeeze1x1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire9/relu_squeeze1x1"); + 1U, 1U, 64U, + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire9_squeeze1x1_w.npy", + weights_layout), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire9_squeeze1x1_b.npy"), + PadStrideInfo(1, 1, 0, 0)) + .set_name("fire9/squeeze1x1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("fire9/relu_squeeze1x1"); graph << get_expand_fire_node(data_path, "fire9", weights_layout, 256U, 256U).set_name("fire9/concat"); graph << ConvolutionLayer( - 1U, 1U, 1000U, - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/conv10_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/conv10_b.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name("conv10") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu_conv10") + 1U, 1U, 1000U, + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/conv10_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/conv10_b.npy"), + PadStrideInfo(1, 1, 0, 0)) + .set_name("conv10") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("relu_conv10") << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, operation_layout)).set_name("pool10") - << FlattenLayer().set_name("flatten") - << SoftmaxLayer().set_name("prob") + << FlattenLayer().set_name("flatten") << SoftmaxLayer().set_name("prob") << OutputLayer(get_output_accessor(common_params, 5)); // Finalize graph @@ -188,27 +212,30 @@ private: CommonGraphParams common_params; Stream graph; - ConcatLayer get_expand_fire_node(const std::string &data_path, std::string &¶m_path, DataLayout weights_layout, - unsigned int expand1_filt, unsigned int expand3_filt) + ConcatLayer get_expand_fire_node(const std::string &data_path, + std::string &¶m_path, + DataLayout weights_layout, + unsigned int expand1_filt, + unsigned int expand3_filt) { std::string total_path = "/cnn_data/squeezenet_v1_1_model/" + param_path + "_"; SubStream i_a(graph); - i_a << ConvolutionLayer( - 1U, 1U, expand1_filt, - get_weights_accessor(data_path, total_path + "expand1x1_w.npy", weights_layout), - get_weights_accessor(data_path, total_path + "expand1x1_b.npy"), - PadStrideInfo(1, 1, 0, 0)) - .set_name(param_path + "/expand1x1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/relu_expand1x1"); + i_a << ConvolutionLayer(1U, 1U, expand1_filt, + get_weights_accessor(data_path, total_path + "expand1x1_w.npy", weights_layout), + get_weights_accessor(data_path, total_path + "expand1x1_b.npy"), + PadStrideInfo(1, 1, 0, 0)) + .set_name(param_path + "/expand1x1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/relu_expand1x1"); SubStream i_b(graph); - i_b << ConvolutionLayer( - 3U, 3U, expand3_filt, - get_weights_accessor(data_path, total_path + 
"expand3x3_w.npy", weights_layout), - get_weights_accessor(data_path, total_path + "expand3x3_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name(param_path + "/expand3x3") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/relu_expand3x3"); + i_b << ConvolutionLayer(3U, 3U, expand3_filt, + get_weights_accessor(data_path, total_path + "expand3x3_w.npy", weights_layout), + get_weights_accessor(data_path, total_path + "expand3x3_b.npy"), + PadStrideInfo(1, 1, 1, 1)) + .set_name(param_path + "/expand3x3") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "/relu_expand3x3"); return ConcatLayer(std::move(i_a), std::move(i_b)); } diff --git a/examples/graph_srcnn955.cpp b/examples/graph_srcnn955.cpp index 855bbd848e74ec841264f7e32a78e26d2f53968e..15a8b5d8ec6ff040445d3426ea7aafe03185fa59 100644 --- a/examples/graph_srcnn955.cpp +++ b/examples/graph_srcnn955.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/graph.h" + #include "support/ToolchainSupport.h" #include "utils/CommonGraphOptions.h" #include "utils/GraphUtils.h" @@ -36,7 +37,12 @@ class GraphSRCNN955Example : public Example { public: GraphSRCNN955Example() - : cmd_parser(), common_opts(cmd_parser), model_input_width(nullptr), model_input_height(nullptr), common_params(), graph(0, "SRCNN955") + : cmd_parser(), + common_opts(cmd_parser), + model_input_width(nullptr), + model_input_height(nullptr), + common_params(), + graph(0, "SRCNN955") { model_input_width = cmd_parser.add_option>("image-width", 300); model_input_height = cmd_parser.add_option>("image-height", 300); @@ -45,7 +51,7 @@ public: model_input_width->set_help("Input image width."); model_input_height->set_help("Input image height."); } - GraphSRCNN955Example(const GraphSRCNN955Example &) = delete; + GraphSRCNN955Example(const GraphSRCNN955Example &) = delete; GraphSRCNN955Example &operator=(const GraphSRCNN955Example &) = delete; ~GraphSRCNN955Example() override = default; bool do_setup(int argc, char **argv) override @@ -58,7 +64,7 @@ public: common_params = consume_common_graph_parameters(common_opts); // Return when help menu is requested - if(common_params.help) + if (common_params.help) { cmd_parser.print_help(argv[0]); return false; @@ -81,36 +87,33 @@ public: std::unique_ptr preprocessor = std::make_unique(); // Create input descriptor - const TensorShape tensor_shape = permute_shape(TensorShape(image_width, image_height, 3U, common_params.batches), DataLayout::NCHW, common_params.data_layout); - TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout); + const TensorShape tensor_shape = + permute_shape(TensorShape(image_width, image_height, 3U, common_params.batches), DataLayout::NCHW, + common_params.data_layout); + TensorDescriptor input_descriptor = + TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout); // Set weights trained layout const DataLayout weights_layout = DataLayout::NCHW; - graph << common_params.target - << common_params.fast_math_hint - << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), false /* Do not convert to BGR */)) - << ConvolutionLayer( - 9U, 9U, 64U, - get_weights_accessor(data_path, "conv1_weights.npy", weights_layout), - get_weights_accessor(data_path, "conv1_biases.npy"), - PadStrideInfo(1, 1, 4, 4)) - .set_name("conv1/convolution") - << 
ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv1/Relu") - << ConvolutionLayer( - 5U, 5U, 32U, - get_weights_accessor(data_path, "conv2_weights.npy", weights_layout), - get_weights_accessor(data_path, "conv2_biases.npy"), - PadStrideInfo(1, 1, 2, 2)) - .set_name("conv2/convolution") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv2/Relu") - << ConvolutionLayer( - 5U, 5U, 3U, - get_weights_accessor(data_path, "conv3_weights.npy", weights_layout), - get_weights_accessor(data_path, "conv3_biases.npy"), - PadStrideInfo(1, 1, 2, 2)) - .set_name("conv3/convolution") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv3/Relu") + graph << common_params.target << common_params.fast_math_hint + << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), + false /* Do not convert to BGR */)) + << ConvolutionLayer(9U, 9U, 64U, get_weights_accessor(data_path, "conv1_weights.npy", weights_layout), + get_weights_accessor(data_path, "conv1_biases.npy"), PadStrideInfo(1, 1, 4, 4)) + .set_name("conv1/convolution") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv1/Relu") + << ConvolutionLayer(5U, 5U, 32U, get_weights_accessor(data_path, "conv2_weights.npy", weights_layout), + get_weights_accessor(data_path, "conv2_biases.npy"), PadStrideInfo(1, 1, 2, 2)) + .set_name("conv2/convolution") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv2/Relu") + << ConvolutionLayer(5U, 5U, 3U, get_weights_accessor(data_path, "conv3_weights.npy", weights_layout), + get_weights_accessor(data_path, "conv3_biases.npy"), PadStrideInfo(1, 1, 2, 2)) + .set_name("conv3/convolution") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv3/Relu") << OutputLayer(std::make_unique(0)); // Finalize graph @@ -137,8 +140,8 @@ public: private: CommandLineParser cmd_parser; CommonGraphOptions common_opts; - SimpleOption *model_input_width{ nullptr }; - SimpleOption *model_input_height{ nullptr }; + SimpleOption *model_input_width{nullptr}; + SimpleOption *model_input_height{nullptr}; CommonGraphParams common_params; Stream graph; }; diff --git a/examples/graph_ssd_mobilenet.cpp b/examples/graph_ssd_mobilenet.cpp index 9fe7f5b454a6138f8594b9f4ed10ff55bdf26791..5162fe6890f6549361a36f2cc359ef17e934c6fe 100644 --- a/examples/graph_ssd_mobilenet.cpp +++ b/examples/graph_ssd_mobilenet.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/graph.h" + #include "support/ToolchainSupport.h" #include "utils/CommonGraphOptions.h" #include "utils/GraphUtils.h" @@ -36,23 +37,26 @@ using namespace arm_compute::graph_utils; class GraphSSDMobilenetExample : public Example { public: - GraphSSDMobilenetExample() - : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "MobileNetSSD") + GraphSSDMobilenetExample() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "MobileNetSSD") { // Add topk option keep_topk_opt = cmd_parser.add_option>("topk", 100); keep_topk_opt->set_help("Top k detections results per image. Used for data type F32."); // Add output option detection_boxes_opt = cmd_parser.add_option>("detection_boxes_opt", ""); - detection_boxes_opt->set_help("Filename containing the reference values for the graph output detection_boxes. 
Used for data type QASYMM8."); + detection_boxes_opt->set_help("Filename containing the reference values for the graph output detection_boxes. " + "Used for data type QASYMM8."); detection_classes_opt = cmd_parser.add_option>("detection_classes_opt", ""); - detection_classes_opt->set_help("Filename containing the reference values for the output detection_classes. Used for data type QASYMM8."); + detection_classes_opt->set_help( + "Filename containing the reference values for the output detection_classes. Used for data type QASYMM8."); detection_scores_opt = cmd_parser.add_option>("detection_scores_opt", ""); - detection_scores_opt->set_help("Filename containing the reference values for the output detection_scores. Used for data type QASYMM8."); + detection_scores_opt->set_help( + "Filename containing the reference values for the output detection_scores. Used for data type QASYMM8."); num_detections_opt = cmd_parser.add_option>("num_detections_opt", ""); - num_detections_opt->set_help("Filename containing the reference values for the output num_detections. Used with datatype QASYMM8."); + num_detections_opt->set_help( + "Filename containing the reference values for the output num_detections. Used with datatype QASYMM8."); } - GraphSSDMobilenetExample(const GraphSSDMobilenetExample &) = delete; + GraphSSDMobilenetExample(const GraphSSDMobilenetExample &) = delete; GraphSSDMobilenetExample &operator=(const GraphSSDMobilenetExample &) = delete; ~GraphSSDMobilenetExample() override = default; bool do_setup(int argc, char **argv) override @@ -65,7 +69,7 @@ public: common_params = consume_common_graph_parameters(common_opts); // Return when help menu is requested - if(common_params.help) + if (common_params.help) { cmd_parser.print_help(argv[0]); return false; @@ -75,15 +79,16 @@ public: std::cout << common_params << std::endl; // Create input descriptor - const TensorShape tensor_shape = permute_shape(TensorShape(300, 300, 3U, 1U), DataLayout::NCHW, common_params.data_layout); - TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout); + const TensorShape tensor_shape = + permute_shape(TensorShape(300, 300, 3U, 1U), DataLayout::NCHW, common_params.data_layout); + TensorDescriptor input_descriptor = + TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout); // Set graph hints - graph << common_params.target - << common_params.fast_math_hint; + graph << common_params.target << common_params.fast_math_hint; // Create core graph - if(arm_compute::is_data_type_float(common_params.data_type)) + if (arm_compute::is_data_type_float(common_params.data_type)) { create_graph_float(input_descriptor); } @@ -112,99 +117,98 @@ public: private: CommandLineParser cmd_parser; CommonGraphOptions common_opts; - SimpleOption *keep_topk_opt{ nullptr }; + SimpleOption *keep_topk_opt{nullptr}; CommonGraphParams common_params; Stream graph; - SimpleOption *detection_boxes_opt{ nullptr }; - SimpleOption *detection_classes_opt{ nullptr }; - SimpleOption *detection_scores_opt{ nullptr }; - SimpleOption *num_detections_opt{ nullptr }; - - ConcatLayer get_node_A_float(IStream &main_graph, const std::string &data_path, std::string &¶m_path, - unsigned int conv_filt, - PadStrideInfo dwc_pad_stride_info, PadStrideInfo conv_pad_stride_info) + SimpleOption *detection_boxes_opt{nullptr}; + SimpleOption *detection_classes_opt{nullptr}; + SimpleOption *detection_scores_opt{nullptr}; + SimpleOption 
*num_detections_opt{nullptr}; + + ConcatLayer get_node_A_float(IStream &main_graph, + const std::string &data_path, + std::string &¶m_path, + unsigned int conv_filt, + PadStrideInfo dwc_pad_stride_info, + PadStrideInfo conv_pad_stride_info) { const std::string total_path = param_path + "_"; SubStream sg(main_graph); - sg << DepthwiseConvolutionLayer( - 3U, 3U, - get_weights_accessor(data_path, total_path + "dw_w.npy"), - std::unique_ptr(nullptr), - dwc_pad_stride_info) - .set_name(param_path + "/dw") + sg << DepthwiseConvolutionLayer(3U, 3U, get_weights_accessor(data_path, total_path + "dw_w.npy"), + std::unique_ptr(nullptr), + dwc_pad_stride_info) + .set_name(param_path + "/dw") << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "dw_bn_mean.npy"), get_weights_accessor(data_path, total_path + "dw_bn_var.npy"), get_weights_accessor(data_path, total_path + "dw_scale_w.npy"), get_weights_accessor(data_path, total_path + "dw_scale_b.npy"), 0.00001f) - .set_name(param_path + "/dw/bn") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "dw/relu") - - << ConvolutionLayer( - 1U, 1U, conv_filt, - get_weights_accessor(data_path, total_path + "w.npy"), - std::unique_ptr(nullptr), - conv_pad_stride_info) - .set_name(param_path + "/pw") + .set_name(param_path + "/dw/bn") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "dw/relu") + + << ConvolutionLayer(1U, 1U, conv_filt, get_weights_accessor(data_path, total_path + "w.npy"), + std::unique_ptr(nullptr), conv_pad_stride_info) + .set_name(param_path + "/pw") << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "bn_mean.npy"), get_weights_accessor(data_path, total_path + "bn_var.npy"), get_weights_accessor(data_path, total_path + "scale_w.npy"), get_weights_accessor(data_path, total_path + "scale_b.npy"), 0.00001f) - .set_name(param_path + "/pw/bn") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "pw/relu"); + .set_name(param_path + "/pw/bn") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(param_path + "pw/relu"); return ConcatLayer(std::move(sg)); } - ConcatLayer get_node_B_float(IStream &main_graph, const std::string &data_path, std::string &¶m_path, - unsigned int conv_filt, - PadStrideInfo conv_pad_stride_info_1, PadStrideInfo conv_pad_stride_info_2) + ConcatLayer get_node_B_float(IStream &main_graph, + const std::string &data_path, + std::string &¶m_path, + unsigned int conv_filt, + PadStrideInfo conv_pad_stride_info_1, + PadStrideInfo conv_pad_stride_info_2) { const std::string total_path = param_path + "_"; SubStream sg(main_graph); - sg << ConvolutionLayer( - 1, 1, conv_filt / 2, - get_weights_accessor(data_path, total_path + "1_w.npy"), - std::unique_ptr(nullptr), - conv_pad_stride_info_1) - .set_name(total_path + "1/conv") + sg << ConvolutionLayer(1, 1, conv_filt / 2, get_weights_accessor(data_path, total_path + "1_w.npy"), + std::unique_ptr(nullptr), conv_pad_stride_info_1) + .set_name(total_path + "1/conv") << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "1_bn_mean.npy"), get_weights_accessor(data_path, total_path + "1_bn_var.npy"), get_weights_accessor(data_path, total_path + "1_scale_w.npy"), get_weights_accessor(data_path, total_path + "1_scale_b.npy"), 0.00001f) - .set_name(total_path + "1/bn") - << 
ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(total_path + "1/relu"); - - sg << ConvolutionLayer( - 3, 3, conv_filt, - get_weights_accessor(data_path, total_path + "2_w.npy"), - std::unique_ptr(nullptr), - conv_pad_stride_info_2) - .set_name(total_path + "2/conv") + .set_name(total_path + "1/bn") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(total_path + "1/relu"); + + sg << ConvolutionLayer(3, 3, conv_filt, get_weights_accessor(data_path, total_path + "2_w.npy"), + std::unique_ptr(nullptr), conv_pad_stride_info_2) + .set_name(total_path + "2/conv") << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "2_bn_mean.npy"), get_weights_accessor(data_path, total_path + "2_bn_var.npy"), get_weights_accessor(data_path, total_path + "2_scale_w.npy"), get_weights_accessor(data_path, total_path + "2_scale_b.npy"), 0.00001f) - .set_name(total_path + "2/bn") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(total_path + "2/relu"); + .set_name(total_path + "2/bn") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(total_path + "2/relu"); return ConcatLayer(std::move(sg)); } - ConcatLayer get_node_C_float(IStream &main_graph, const std::string &data_path, std::string &¶m_path, - unsigned int conv_filt, PadStrideInfo conv_pad_stride_info) + ConcatLayer get_node_C_float(IStream &main_graph, + const std::string &data_path, + std::string &¶m_path, + unsigned int conv_filt, + PadStrideInfo conv_pad_stride_info) { const std::string total_path = param_path + "_"; SubStream sg(main_graph); - sg << ConvolutionLayer( - 1U, 1U, conv_filt, - get_weights_accessor(data_path, total_path + "w.npy"), - get_weights_accessor(data_path, total_path + "b.npy"), - conv_pad_stride_info) - .set_name(param_path + "/conv"); - if(common_params.data_layout == DataLayout::NCHW) + sg << ConvolutionLayer(1U, 1U, conv_filt, get_weights_accessor(data_path, total_path + "w.npy"), + get_weights_accessor(data_path, total_path + "b.npy"), conv_pad_stride_info) + .set_name(param_path + "/conv"); + if (common_params.data_layout == DataLayout::NCHW) { sg << PermuteLayer(PermutationVector(2U, 0U, 1U), DataLayout::NHWC).set_name(param_path + "/perm"); } @@ -216,62 +220,77 @@ private: void create_graph_float(TensorDescriptor &input_descriptor) { // Create a preprocessor object - const std::array mean_rgb{ { 127.5f, 127.5f, 127.5f } }; + const std::array mean_rgb{{127.5f, 127.5f, 127.5f}}; std::unique_ptr preprocessor = std::make_unique(mean_rgb, true, 0.007843f); // Get trainable parameters data path std::string data_path = common_params.data_path; // Add model path to data path - if(!data_path.empty()) + if (!data_path.empty()) { data_path += "/cnn_data/ssd_mobilenet_model/"; } - graph << InputLayer(input_descriptor, - get_input_accessor(common_params, std::move(preprocessor))); + graph << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor))); SubStream conv_11(graph); - conv_11 << ConvolutionLayer( - 3U, 3U, 32U, - get_weights_accessor(data_path, "conv0_w.npy"), - std::unique_ptr(nullptr), - PadStrideInfo(2, 2, 1, 1)) - .set_name("conv0"); + conv_11 << ConvolutionLayer(3U, 3U, 32U, get_weights_accessor(data_path, "conv0_w.npy"), + std::unique_ptr(nullptr), + PadStrideInfo(2, 2, 1, 1)) + .set_name("conv0"); conv_11 << BatchNormalizationLayer(get_weights_accessor(data_path, "conv0_bn_mean.npy"), 
get_weights_accessor(data_path, "conv0_bn_var.npy"), get_weights_accessor(data_path, "conv0_scale_w.npy"), get_weights_accessor(data_path, "conv0_scale_b.npy"), 0.00001f) - .set_name("conv0/bn") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv0/relu"); - - conv_11 << get_node_A_float(conv_11, data_path, "conv1", 64, PadStrideInfo(1, 1, 1, 1), PadStrideInfo(1, 1, 0, 0)); - conv_11 << get_node_A_float(conv_11, data_path, "conv2", 128, PadStrideInfo(2, 2, 1, 1), PadStrideInfo(1, 1, 0, 0)); - conv_11 << get_node_A_float(conv_11, data_path, "conv3", 128, PadStrideInfo(1, 1, 1, 1), PadStrideInfo(1, 1, 0, 0)); - conv_11 << get_node_A_float(conv_11, data_path, "conv4", 256, PadStrideInfo(2, 2, 1, 1), PadStrideInfo(1, 1, 0, 0)); - conv_11 << get_node_A_float(conv_11, data_path, "conv5", 256, PadStrideInfo(1, 1, 1, 1), PadStrideInfo(1, 1, 0, 0)); - conv_11 << get_node_A_float(conv_11, data_path, "conv6", 512, PadStrideInfo(2, 2, 1, 1), PadStrideInfo(1, 1, 0, 0)); - conv_11 << get_node_A_float(conv_11, data_path, "conv7", 512, PadStrideInfo(1, 1, 1, 1), PadStrideInfo(1, 1, 0, 0)); - conv_11 << get_node_A_float(conv_11, data_path, "conv8", 512, PadStrideInfo(1, 1, 1, 1), PadStrideInfo(1, 1, 0, 0)); - conv_11 << get_node_A_float(conv_11, data_path, "conv9", 512, PadStrideInfo(1, 1, 1, 1), PadStrideInfo(1, 1, 0, 0)); - conv_11 << get_node_A_float(conv_11, data_path, "conv10", 512, PadStrideInfo(1, 1, 1, 1), PadStrideInfo(1, 1, 0, 0)); - conv_11 << get_node_A_float(conv_11, data_path, "conv11", 512, PadStrideInfo(1, 1, 1, 1), PadStrideInfo(1, 1, 0, 0)); + .set_name("conv0/bn") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv0/relu"); + + conv_11 << get_node_A_float(conv_11, data_path, "conv1", 64, PadStrideInfo(1, 1, 1, 1), + PadStrideInfo(1, 1, 0, 0)); + conv_11 << get_node_A_float(conv_11, data_path, "conv2", 128, PadStrideInfo(2, 2, 1, 1), + PadStrideInfo(1, 1, 0, 0)); + conv_11 << get_node_A_float(conv_11, data_path, "conv3", 128, PadStrideInfo(1, 1, 1, 1), + PadStrideInfo(1, 1, 0, 0)); + conv_11 << get_node_A_float(conv_11, data_path, "conv4", 256, PadStrideInfo(2, 2, 1, 1), + PadStrideInfo(1, 1, 0, 0)); + conv_11 << get_node_A_float(conv_11, data_path, "conv5", 256, PadStrideInfo(1, 1, 1, 1), + PadStrideInfo(1, 1, 0, 0)); + conv_11 << get_node_A_float(conv_11, data_path, "conv6", 512, PadStrideInfo(2, 2, 1, 1), + PadStrideInfo(1, 1, 0, 0)); + conv_11 << get_node_A_float(conv_11, data_path, "conv7", 512, PadStrideInfo(1, 1, 1, 1), + PadStrideInfo(1, 1, 0, 0)); + conv_11 << get_node_A_float(conv_11, data_path, "conv8", 512, PadStrideInfo(1, 1, 1, 1), + PadStrideInfo(1, 1, 0, 0)); + conv_11 << get_node_A_float(conv_11, data_path, "conv9", 512, PadStrideInfo(1, 1, 1, 1), + PadStrideInfo(1, 1, 0, 0)); + conv_11 << get_node_A_float(conv_11, data_path, "conv10", 512, PadStrideInfo(1, 1, 1, 1), + PadStrideInfo(1, 1, 0, 0)); + conv_11 << get_node_A_float(conv_11, data_path, "conv11", 512, PadStrideInfo(1, 1, 1, 1), + PadStrideInfo(1, 1, 0, 0)); SubStream conv_13(conv_11); - conv_13 << get_node_A_float(conv_11, data_path, "conv12", 1024, PadStrideInfo(2, 2, 1, 1), PadStrideInfo(1, 1, 0, 0)); - conv_13 << get_node_A_float(conv_13, data_path, "conv13", 1024, PadStrideInfo(1, 1, 1, 1), PadStrideInfo(1, 1, 0, 0)); + conv_13 << get_node_A_float(conv_11, data_path, "conv12", 1024, PadStrideInfo(2, 2, 1, 1), + PadStrideInfo(1, 1, 0, 0)); + conv_13 << get_node_A_float(conv_13, data_path, "conv13", 1024, 
PadStrideInfo(1, 1, 1, 1), + PadStrideInfo(1, 1, 0, 0)); SubStream conv_14(conv_13); - conv_14 << get_node_B_float(conv_13, data_path, "conv14", 512, PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 2, 1, 1)); + conv_14 << get_node_B_float(conv_13, data_path, "conv14", 512, PadStrideInfo(1, 1, 0, 0), + PadStrideInfo(2, 2, 1, 1)); SubStream conv_15(conv_14); - conv_15 << get_node_B_float(conv_14, data_path, "conv15", 256, PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 2, 1, 1)); + conv_15 << get_node_B_float(conv_14, data_path, "conv15", 256, PadStrideInfo(1, 1, 0, 0), + PadStrideInfo(2, 2, 1, 1)); SubStream conv_16(conv_15); - conv_16 << get_node_B_float(conv_15, data_path, "conv16", 256, PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 2, 1, 1)); + conv_16 << get_node_B_float(conv_15, data_path, "conv16", 256, PadStrideInfo(1, 1, 0, 0), + PadStrideInfo(2, 2, 1, 1)); SubStream conv_17(conv_16); - conv_17 << get_node_B_float(conv_16, data_path, "conv17", 128, PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 2, 1, 1)); + conv_17 << get_node_B_float(conv_16, data_path, "conv17", 128, PadStrideInfo(1, 1, 0, 0), + PadStrideInfo(2, 2, 1, 1)); //mbox_loc SubStream conv_11_mbox_loc(conv_11); @@ -293,8 +312,9 @@ private: conv_17_2_mbox_loc << get_node_C_float(conv_17, data_path, "conv17_2_mbox_loc", 24, PadStrideInfo(1, 1, 0, 0)); SubStream mbox_loc(graph); - mbox_loc << ConcatLayer(std::move(conv_11_mbox_loc), std::move(conv_13_mbox_loc), conv_14_2_mbox_loc, std::move(conv_15_2_mbox_loc), - std::move(conv_16_2_mbox_loc), std::move(conv_17_2_mbox_loc)); + mbox_loc << ConcatLayer(std::move(conv_11_mbox_loc), std::move(conv_13_mbox_loc), conv_14_2_mbox_loc, + std::move(conv_15_2_mbox_loc), std::move(conv_16_2_mbox_loc), + std::move(conv_17_2_mbox_loc)); //mbox_conf SubStream conv_11_mbox_conf(conv_11); @@ -304,67 +324,79 @@ private: conv_13_mbox_conf << get_node_C_float(conv_13, data_path, "conv13_mbox_conf", 126, PadStrideInfo(1, 1, 0, 0)); SubStream conv_14_2_mbox_conf(conv_14); - conv_14_2_mbox_conf << get_node_C_float(conv_14, data_path, "conv14_2_mbox_conf", 126, PadStrideInfo(1, 1, 0, 0)); + conv_14_2_mbox_conf << get_node_C_float(conv_14, data_path, "conv14_2_mbox_conf", 126, + PadStrideInfo(1, 1, 0, 0)); SubStream conv_15_2_mbox_conf(conv_15); - conv_15_2_mbox_conf << get_node_C_float(conv_15, data_path, "conv15_2_mbox_conf", 126, PadStrideInfo(1, 1, 0, 0)); + conv_15_2_mbox_conf << get_node_C_float(conv_15, data_path, "conv15_2_mbox_conf", 126, + PadStrideInfo(1, 1, 0, 0)); SubStream conv_16_2_mbox_conf(conv_16); - conv_16_2_mbox_conf << get_node_C_float(conv_16, data_path, "conv16_2_mbox_conf", 126, PadStrideInfo(1, 1, 0, 0)); + conv_16_2_mbox_conf << get_node_C_float(conv_16, data_path, "conv16_2_mbox_conf", 126, + PadStrideInfo(1, 1, 0, 0)); SubStream conv_17_2_mbox_conf(conv_17); - conv_17_2_mbox_conf << get_node_C_float(conv_17, data_path, "conv17_2_mbox_conf", 126, PadStrideInfo(1, 1, 0, 0)); + conv_17_2_mbox_conf << get_node_C_float(conv_17, data_path, "conv17_2_mbox_conf", 126, + PadStrideInfo(1, 1, 0, 0)); SubStream mbox_conf(graph); - mbox_conf << ConcatLayer(std::move(conv_11_mbox_conf), std::move(conv_13_mbox_conf), std::move(conv_14_2_mbox_conf), - std::move(conv_15_2_mbox_conf), std::move(conv_16_2_mbox_conf), std::move(conv_17_2_mbox_conf)); + mbox_conf << ConcatLayer(std::move(conv_11_mbox_conf), std::move(conv_13_mbox_conf), + std::move(conv_14_2_mbox_conf), std::move(conv_15_2_mbox_conf), + std::move(conv_16_2_mbox_conf), std::move(conv_17_2_mbox_conf)); mbox_conf << 
ReshapeLayer(TensorShape(21U, 1917U)).set_name("mbox_conf/reshape"); mbox_conf << SoftmaxLayer().set_name("mbox_conf/softmax"); mbox_conf << FlattenLayer().set_name("mbox_conf/flat"); - const std::vector priorbox_variances = { 0.1f, 0.1f, 0.2f, 0.2f }; + const std::vector priorbox_variances = {0.1f, 0.1f, 0.2f, 0.2f}; const float priorbox_offset = 0.5f; - const std::vector priorbox_aspect_ratios = { 2.f, 3.f }; + const std::vector priorbox_aspect_ratios = {2.f, 3.f}; //mbox_priorbox branch SubStream conv_11_mbox_priorbox(conv_11); conv_11_mbox_priorbox << PriorBoxLayer(SubStream(graph), - PriorBoxLayerInfo({ 60.f }, priorbox_variances, priorbox_offset, true, false, {}, { 2.f })) - .set_name("conv11/priorbox"); + PriorBoxLayerInfo({60.f}, priorbox_variances, priorbox_offset, true, + false, {}, {2.f})) + .set_name("conv11/priorbox"); SubStream conv_13_mbox_priorbox(conv_13); conv_13_mbox_priorbox << PriorBoxLayer(SubStream(graph), - PriorBoxLayerInfo({ 105.f }, priorbox_variances, priorbox_offset, true, false, { 150.f }, priorbox_aspect_ratios)) - .set_name("conv13/priorbox"); + PriorBoxLayerInfo({105.f}, priorbox_variances, priorbox_offset, true, + false, {150.f}, priorbox_aspect_ratios)) + .set_name("conv13/priorbox"); SubStream conv_14_2_mbox_priorbox(conv_14); conv_14_2_mbox_priorbox << PriorBoxLayer(SubStream(graph), - PriorBoxLayerInfo({ 150.f }, priorbox_variances, priorbox_offset, true, false, { 195.f }, priorbox_aspect_ratios)) - .set_name("conv14/priorbox"); + PriorBoxLayerInfo({150.f}, priorbox_variances, priorbox_offset, true, + false, {195.f}, priorbox_aspect_ratios)) + .set_name("conv14/priorbox"); SubStream conv_15_2_mbox_priorbox(conv_15); conv_15_2_mbox_priorbox << PriorBoxLayer(SubStream(graph), - PriorBoxLayerInfo({ 195.f }, priorbox_variances, priorbox_offset, true, false, { 240.f }, priorbox_aspect_ratios)) - .set_name("conv15/priorbox"); + PriorBoxLayerInfo({195.f}, priorbox_variances, priorbox_offset, true, + false, {240.f}, priorbox_aspect_ratios)) + .set_name("conv15/priorbox"); SubStream conv_16_2_mbox_priorbox(conv_16); conv_16_2_mbox_priorbox << PriorBoxLayer(SubStream(graph), - PriorBoxLayerInfo({ 240.f }, priorbox_variances, priorbox_offset, true, false, { 285.f }, priorbox_aspect_ratios)) - .set_name("conv16/priorbox"); + PriorBoxLayerInfo({240.f}, priorbox_variances, priorbox_offset, true, + false, {285.f}, priorbox_aspect_ratios)) + .set_name("conv16/priorbox"); SubStream conv_17_2_mbox_priorbox(conv_17); conv_17_2_mbox_priorbox << PriorBoxLayer(SubStream(graph), - PriorBoxLayerInfo({ 285.f }, priorbox_variances, priorbox_offset, true, false, { 300.f }, priorbox_aspect_ratios)) - .set_name("conv17/priorbox"); + PriorBoxLayerInfo({285.f}, priorbox_variances, priorbox_offset, true, + false, {300.f}, priorbox_aspect_ratios)) + .set_name("conv17/priorbox"); SubStream mbox_priorbox(graph); mbox_priorbox << ConcatLayer( - (common_params.data_layout == DataLayout::NCHW) ? arm_compute::graph::descriptors::ConcatLayerDescriptor(DataLayoutDimension::WIDTH) : arm_compute::graph::descriptors::ConcatLayerDescriptor( - DataLayoutDimension::CHANNEL), - std::move(conv_11_mbox_priorbox), std::move(conv_13_mbox_priorbox), std::move(conv_14_2_mbox_priorbox), - std::move(conv_15_2_mbox_priorbox), std::move(conv_16_2_mbox_priorbox), std::move(conv_17_2_mbox_priorbox)); + (common_params.data_layout == DataLayout::NCHW) + ? 
arm_compute::graph::descriptors::ConcatLayerDescriptor(DataLayoutDimension::WIDTH) + : arm_compute::graph::descriptors::ConcatLayerDescriptor(DataLayoutDimension::CHANNEL), + std::move(conv_11_mbox_priorbox), std::move(conv_13_mbox_priorbox), std::move(conv_14_2_mbox_priorbox), + std::move(conv_15_2_mbox_priorbox), std::move(conv_16_2_mbox_priorbox), std::move(conv_17_2_mbox_priorbox)); const int num_classes = 21; const bool share_location = true; @@ -377,77 +409,85 @@ private: SubStream detection_ouput(mbox_loc); detection_ouput << DetectionOutputLayer(std::move(mbox_conf), std::move(mbox_priorbox), - DetectionOutputLayerInfo(num_classes, share_location, detection_type, keep_top_k, nms_threshold, top_k, label_id_background, conf_thrs)); - detection_ouput << OutputLayer(get_detection_output_accessor(common_params, { input_descriptor.shape })); + DetectionOutputLayerInfo(num_classes, share_location, detection_type, + keep_top_k, nms_threshold, top_k, + label_id_background, conf_thrs)); + detection_ouput << OutputLayer(get_detection_output_accessor(common_params, {input_descriptor.shape})); } - ConcatLayer get_node_A_qasymm(IStream &main_graph, const std::string &data_path, std::string &¶m_path, - unsigned int conv_filt, - PadStrideInfo dwc_pad_stride_info, PadStrideInfo conv_pad_stride_info, - std::pair depth_quant_info, std::pair point_quant_info) + ConcatLayer get_node_A_qasymm(IStream &main_graph, + const std::string &data_path, + std::string &¶m_path, + unsigned int conv_filt, + PadStrideInfo dwc_pad_stride_info, + PadStrideInfo conv_pad_stride_info, + std::pair depth_quant_info, + std::pair point_quant_info) { const std::string total_path = param_path + "_"; SubStream sg(main_graph); - sg << DepthwiseConvolutionLayer( - 3U, 3U, - get_weights_accessor(data_path, total_path + "dw_w.npy"), - get_weights_accessor(data_path, total_path + "dw_b.npy"), - dwc_pad_stride_info, 1, depth_quant_info.first, depth_quant_info.second) - .set_name(param_path + "/dw") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f)).set_name(param_path + "/dw/relu6"); - - sg << ConvolutionLayer( - 1U, 1U, conv_filt, - get_weights_accessor(data_path, total_path + "w.npy"), - get_weights_accessor(data_path, total_path + "b.npy"), - conv_pad_stride_info, 1, point_quant_info.first, point_quant_info.second) - .set_name(param_path + "/pw") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f)).set_name(param_path + "/pw/relu6"); + sg << DepthwiseConvolutionLayer(3U, 3U, get_weights_accessor(data_path, total_path + "dw_w.npy"), + get_weights_accessor(data_path, total_path + "dw_b.npy"), dwc_pad_stride_info, + 1, depth_quant_info.first, depth_quant_info.second) + .set_name(param_path + "/dw") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f)) + .set_name(param_path + "/dw/relu6"); + + sg << ConvolutionLayer(1U, 1U, conv_filt, get_weights_accessor(data_path, total_path + "w.npy"), + get_weights_accessor(data_path, total_path + "b.npy"), conv_pad_stride_info, 1, + point_quant_info.first, point_quant_info.second) + .set_name(param_path + "/pw") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f)) + .set_name(param_path + "/pw/relu6"); return ConcatLayer(std::move(sg)); } - ConcatLayer get_node_B_qasymm(IStream &main_graph, const std::string &data_path, std::string &¶m_path, - unsigned int conv_filt, - PadStrideInfo 
conv_pad_stride_info_1x1, PadStrideInfo conv_pad_stride_info_3x3, - const std::pair quant_info_1x1, const std::pair quant_info_3x3) + ConcatLayer get_node_B_qasymm(IStream &main_graph, + const std::string &data_path, + std::string &¶m_path, + unsigned int conv_filt, + PadStrideInfo conv_pad_stride_info_1x1, + PadStrideInfo conv_pad_stride_info_3x3, + const std::pair quant_info_1x1, + const std::pair quant_info_3x3) { const std::string total_path = param_path + "_"; SubStream sg(main_graph); - sg << ConvolutionLayer( - 1, 1, conv_filt / 2, - get_weights_accessor(data_path, total_path + "1x1_w.npy"), - get_weights_accessor(data_path, total_path + "1x1_b.npy"), - conv_pad_stride_info_1x1, 1, quant_info_1x1.first, quant_info_1x1.second) - .set_name(total_path + "1x1/conv") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f)).set_name(total_path + "1x1/conv/relu6"); - - sg << ConvolutionLayer( - 3, 3, conv_filt, - get_weights_accessor(data_path, total_path + "3x3_w.npy"), - get_weights_accessor(data_path, total_path + "3x3_b.npy"), - conv_pad_stride_info_3x3, 1, quant_info_3x3.first, quant_info_3x3.second) - .set_name(total_path + "3x3/conv") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f)).set_name(total_path + "3x3/conv/relu6"); + sg << ConvolutionLayer(1, 1, conv_filt / 2, get_weights_accessor(data_path, total_path + "1x1_w.npy"), + get_weights_accessor(data_path, total_path + "1x1_b.npy"), conv_pad_stride_info_1x1, 1, + quant_info_1x1.first, quant_info_1x1.second) + .set_name(total_path + "1x1/conv") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f)) + .set_name(total_path + "1x1/conv/relu6"); + + sg << ConvolutionLayer(3, 3, conv_filt, get_weights_accessor(data_path, total_path + "3x3_w.npy"), + get_weights_accessor(data_path, total_path + "3x3_b.npy"), conv_pad_stride_info_3x3, 1, + quant_info_3x3.first, quant_info_3x3.second) + .set_name(total_path + "3x3/conv") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f)) + .set_name(total_path + "3x3/conv/relu6"); return ConcatLayer(std::move(sg)); } - ConcatLayer get_node_C_qasymm(IStream &main_graph, const std::string &data_path, std::string &¶m_path, - unsigned int conv_filt, PadStrideInfo conv_pad_stride_info, - const std::pair quant_info, TensorShape reshape_shape) + ConcatLayer get_node_C_qasymm(IStream &main_graph, + const std::string &data_path, + std::string &¶m_path, + unsigned int conv_filt, + PadStrideInfo conv_pad_stride_info, + const std::pair quant_info, + TensorShape reshape_shape) { const std::string total_path = param_path + "_"; SubStream sg(main_graph); - sg << ConvolutionLayer( - 1U, 1U, conv_filt, - get_weights_accessor(data_path, total_path + "w.npy"), - get_weights_accessor(data_path, total_path + "b.npy"), - conv_pad_stride_info, 1, quant_info.first, quant_info.second) - .set_name(param_path + "/conv"); - if(common_params.data_layout == DataLayout::NCHW) + sg << ConvolutionLayer(1U, 1U, conv_filt, get_weights_accessor(data_path, total_path + "w.npy"), + get_weights_accessor(data_path, total_path + "b.npy"), conv_pad_stride_info, 1, + quant_info.first, quant_info.second) + .set_name(param_path + "/conv"); + if (common_params.data_layout == DataLayout::NCHW) { sg << PermuteLayer(PermutationVector(2U, 0U, 1U), DataLayout::NHWC); } @@ -462,57 +502,59 @@ private: std::string data_path = common_params.data_path; // Add model 
path to data path - if(!data_path.empty()) + if (!data_path.empty()) { data_path += "/cnn_data/ssd_mobilenet_qasymm8_model/"; } // Quantization info are saved as pair for each (pointwise/depthwise) convolution layer: - const std::vector> conv_quant_info = - { - { QuantizationInfo(0.03624850884079933f, 163), QuantizationInfo(0.22219789028167725f, 113) }, // conv0 - { QuantizationInfo(0.0028752065263688564f, 113), QuantizationInfo(0.05433657020330429f, 128) }, // conv13_2_1_1 - { QuantizationInfo(0.0014862528769299388f, 125), QuantizationInfo(0.05037643015384674f, 131) }, // conv13_2_3_3 - { QuantizationInfo(0.00233650766313076f, 113), QuantizationInfo(0.04468846693634987f, 126) }, // conv13_3_1_1 - { QuantizationInfo(0.002501056529581547f, 120), QuantizationInfo(0.06026708707213402f, 111) }, // conv13_3_3_3 - { QuantizationInfo(0.002896666992455721f, 121), QuantizationInfo(0.037775348871946335f, 117) }, // conv13_4_1_1 - { QuantizationInfo(0.0023875406477600336f, 122), QuantizationInfo(0.03881589323282242f, 108) }, // conv13_4_3_3 - { QuantizationInfo(0.0022081052884459496f, 77), QuantizationInfo(0.025450613349676132f, 125) }, // conv13_5_1_1 - { QuantizationInfo(0.00604657270014286f, 121), QuantizationInfo(0.033533502370119095f, 109) } // conv13_5_3_3 + const std::vector> conv_quant_info = { + {QuantizationInfo(0.03624850884079933f, 163), QuantizationInfo(0.22219789028167725f, 113)}, // conv0 + {QuantizationInfo(0.0028752065263688564f, 113), + QuantizationInfo(0.05433657020330429f, 128)}, // conv13_2_1_1 + {QuantizationInfo(0.0014862528769299388f, 125), + QuantizationInfo(0.05037643015384674f, 131)}, // conv13_2_3_3 + {QuantizationInfo(0.00233650766313076f, 113), QuantizationInfo(0.04468846693634987f, 126)}, // conv13_3_1_1 + {QuantizationInfo(0.002501056529581547f, 120), QuantizationInfo(0.06026708707213402f, 111)}, // conv13_3_3_3 + {QuantizationInfo(0.002896666992455721f, 121), + QuantizationInfo(0.037775348871946335f, 117)}, // conv13_4_1_1 + {QuantizationInfo(0.0023875406477600336f, 122), + QuantizationInfo(0.03881589323282242f, 108)}, // conv13_4_3_3 + {QuantizationInfo(0.0022081052884459496f, 77), + QuantizationInfo(0.025450613349676132f, 125)}, // conv13_5_1_1 + {QuantizationInfo(0.00604657270014286f, 121), QuantizationInfo(0.033533502370119095f, 109)} // conv13_5_3_3 }; - const std::vector> depth_quant_info = - { - { QuantizationInfo(0.03408717364072f, 131), QuantizationInfo(0.29286590218544006f, 108) }, // dwsc1 - { QuantizationInfo(0.027518004179000854f, 107), QuantizationInfo(0.20796941220760345, 117) }, // dwsc2 - { QuantizationInfo(0.052489638328552246f, 85), QuantizationInfo(0.4303881824016571f, 142) }, // dwsc3 - { QuantizationInfo(0.016570359468460083f, 79), QuantizationInfo(0.10512150079011917f, 116) }, // dwsc4 - { QuantizationInfo(0.060739465057849884f, 65), QuantizationInfo(0.15331414341926575f, 94) }, // dwsc5 - { QuantizationInfo(0.01324534136801958f, 124), QuantizationInfo(0.13010895252227783f, 153) }, // dwsc6 - { QuantizationInfo(0.032326459884643555f, 124), QuantizationInfo(0.11565316468477249, 156) }, // dwsc7 - { QuantizationInfo(0.029948478564620018f, 155), QuantizationInfo(0.11413891613483429f, 146) }, // dwsc8 - { QuantizationInfo(0.028054025024175644f, 129), QuantizationInfo(0.1142905130982399f, 140) }, // dwsc9 - { QuantizationInfo(0.025204822421073914f, 129), QuantizationInfo(0.14668069779872894f, 149) }, // dwsc10 - { QuantizationInfo(0.019332280382514f, 110), QuantizationInfo(0.1480235457420349f, 91) }, // dwsc11 - { 
QuantizationInfo(0.0319712869822979f, 88), QuantizationInfo(0.10424695909023285f, 117) }, // dwsc12 - { QuantizationInfo(0.04378943517804146f, 164), QuantizationInfo(0.23176774382591248f, 138) } // dwsc13 + const std::vector> depth_quant_info = { + {QuantizationInfo(0.03408717364072f, 131), QuantizationInfo(0.29286590218544006f, 108)}, // dwsc1 + {QuantizationInfo(0.027518004179000854f, 107), QuantizationInfo(0.20796941220760345, 117)}, // dwsc2 + {QuantizationInfo(0.052489638328552246f, 85), QuantizationInfo(0.4303881824016571f, 142)}, // dwsc3 + {QuantizationInfo(0.016570359468460083f, 79), QuantizationInfo(0.10512150079011917f, 116)}, // dwsc4 + {QuantizationInfo(0.060739465057849884f, 65), QuantizationInfo(0.15331414341926575f, 94)}, // dwsc5 + {QuantizationInfo(0.01324534136801958f, 124), QuantizationInfo(0.13010895252227783f, 153)}, // dwsc6 + {QuantizationInfo(0.032326459884643555f, 124), QuantizationInfo(0.11565316468477249, 156)}, // dwsc7 + {QuantizationInfo(0.029948478564620018f, 155), QuantizationInfo(0.11413891613483429f, 146)}, // dwsc8 + {QuantizationInfo(0.028054025024175644f, 129), QuantizationInfo(0.1142905130982399f, 140)}, // dwsc9 + {QuantizationInfo(0.025204822421073914f, 129), QuantizationInfo(0.14668069779872894f, 149)}, // dwsc10 + {QuantizationInfo(0.019332280382514f, 110), QuantizationInfo(0.1480235457420349f, 91)}, // dwsc11 + {QuantizationInfo(0.0319712869822979f, 88), QuantizationInfo(0.10424695909023285f, 117)}, // dwsc12 + {QuantizationInfo(0.04378943517804146f, 164), QuantizationInfo(0.23176774382591248f, 138)} // dwsc13 }; - const std::vector> point_quant_info = - { - { QuantizationInfo(0.028777318075299263f, 144), QuantizationInfo(0.2663874328136444f, 121) }, // pw1 - { QuantizationInfo(0.015796702355146408f, 127), QuantizationInfo(0.1739964485168457f, 111) }, // pw2 - { QuantizationInfo(0.009349990636110306f, 127), QuantizationInfo(0.1805974692106247f, 104) }, // pw3 - { QuantizationInfo(0.012920888140797615f, 106), QuantizationInfo(0.1205204650759697f, 100) }, // pw4 - { QuantizationInfo(0.008119508624076843f, 145), QuantizationInfo(0.12272439152002335f, 97) }, // pw5 - { QuantizationInfo(0.0070041813887655735f, 115), QuantizationInfo(0.0947074219584465f, 101) }, // pw6 - { QuantizationInfo(0.004827278666198254f, 115), QuantizationInfo(0.0842885747551918f, 110) }, // pw7 - { QuantizationInfo(0.004755120258778334f, 128), QuantizationInfo(0.08283159881830215f, 116) }, // pw8 - { QuantizationInfo(0.007527193054556847f, 142), QuantizationInfo(0.12555131316184998f, 137) }, // pw9 - { QuantizationInfo(0.006050156895071268f, 109), QuantizationInfo(0.10871313512325287f, 124) }, // pw10 - { QuantizationInfo(0.00490700313821435f, 127), QuantizationInfo(0.10364262014627457f, 140) }, // pw11 - { QuantizationInfo(0.006063731852918863, 124), QuantizationInfo(0.11241862177848816f, 125) }, // pw12 - { QuantizationInfo(0.007901716977357864f, 139), QuantizationInfo(0.49889302253723145f, 141) } // pw13 + const std::vector> point_quant_info = { + {QuantizationInfo(0.028777318075299263f, 144), QuantizationInfo(0.2663874328136444f, 121)}, // pw1 + {QuantizationInfo(0.015796702355146408f, 127), QuantizationInfo(0.1739964485168457f, 111)}, // pw2 + {QuantizationInfo(0.009349990636110306f, 127), QuantizationInfo(0.1805974692106247f, 104)}, // pw3 + {QuantizationInfo(0.012920888140797615f, 106), QuantizationInfo(0.1205204650759697f, 100)}, // pw4 + {QuantizationInfo(0.008119508624076843f, 145), QuantizationInfo(0.12272439152002335f, 97)}, // pw5 + 
{QuantizationInfo(0.0070041813887655735f, 115), QuantizationInfo(0.0947074219584465f, 101)}, // pw6 + {QuantizationInfo(0.004827278666198254f, 115), QuantizationInfo(0.0842885747551918f, 110)}, // pw7 + {QuantizationInfo(0.004755120258778334f, 128), QuantizationInfo(0.08283159881830215f, 116)}, // pw8 + {QuantizationInfo(0.007527193054556847f, 142), QuantizationInfo(0.12555131316184998f, 137)}, // pw9 + {QuantizationInfo(0.006050156895071268f, 109), QuantizationInfo(0.10871313512325287f, 124)}, // pw10 + {QuantizationInfo(0.00490700313821435f, 127), QuantizationInfo(0.10364262014627457f, 140)}, // pw11 + {QuantizationInfo(0.006063731852918863, 124), QuantizationInfo(0.11241862177848816f, 125)}, // pw12 + {QuantizationInfo(0.007901716977357864f, 139), QuantizationInfo(0.49889302253723145f, 141)} // pw13 }; // Quantization info taken from the TfLite SSD MobileNet example @@ -520,114 +562,154 @@ private: // Create core graph graph << InputLayer(input_descriptor.set_quantization_info(in_quant_info), get_weights_accessor(data_path, common_params.image, DataLayout::NHWC)); - graph << ConvolutionLayer( - 3U, 3U, 32U, - get_weights_accessor(data_path, "conv0_w.npy"), - get_weights_accessor(data_path, "conv0_b.npy"), - PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::CEIL), 1, conv_quant_info.at(0).first, conv_quant_info.at(0).second) - .set_name("conv0"); - graph << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f)).set_name("conv0/relu"); - graph << get_node_A_qasymm(graph, data_path, "conv1", 64U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(0), - point_quant_info.at(0)); - graph << get_node_A_qasymm(graph, data_path, "conv2", 128U, PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::CEIL), PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(1), - point_quant_info.at(1)); - graph << get_node_A_qasymm(graph, data_path, "conv3", 128U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(2), - point_quant_info.at(2)); - graph << get_node_A_qasymm(graph, data_path, "conv4", 256U, PadStrideInfo(2U, 2U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(3), - point_quant_info.at(3)); - graph << get_node_A_qasymm(graph, data_path, "conv5", 256U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(4), - point_quant_info.at(4)); - graph << get_node_A_qasymm(graph, data_path, "conv6", 512U, PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::CEIL), PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(5), - point_quant_info.at(5)); - graph << get_node_A_qasymm(graph, data_path, "conv7", 512U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(6), - point_quant_info.at(6)); - graph << get_node_A_qasymm(graph, data_path, "conv8", 512U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(7), - point_quant_info.at(7)); - graph << get_node_A_qasymm(graph, data_path, "conv9", 512U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(8), - point_quant_info.at(8)); - graph << get_node_A_qasymm(graph, data_path, "conv10", 512U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), 
PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(9), - point_quant_info.at(9)); - graph << get_node_A_qasymm(graph, data_path, "conv11", 512U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(10), - point_quant_info.at(10)); + graph << ConvolutionLayer(3U, 3U, 32U, get_weights_accessor(data_path, "conv0_w.npy"), + get_weights_accessor(data_path, "conv0_b.npy"), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::CEIL), 1, + conv_quant_info.at(0).first, conv_quant_info.at(0).second) + .set_name("conv0"); + graph << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f)) + .set_name("conv0/relu"); + graph << get_node_A_qasymm(graph, data_path, "conv1", 64U, + PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), + PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(0), point_quant_info.at(0)); + graph << get_node_A_qasymm(graph, data_path, "conv2", 128U, + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::CEIL), + PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(1), point_quant_info.at(1)); + graph << get_node_A_qasymm(graph, data_path, "conv3", 128U, + PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), + PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(2), point_quant_info.at(2)); + graph << get_node_A_qasymm(graph, data_path, "conv4", 256U, + PadStrideInfo(2U, 2U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), + PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(3), point_quant_info.at(3)); + graph << get_node_A_qasymm(graph, data_path, "conv5", 256U, + PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), + PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(4), point_quant_info.at(4)); + graph << get_node_A_qasymm(graph, data_path, "conv6", 512U, + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::CEIL), + PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(5), point_quant_info.at(5)); + graph << get_node_A_qasymm(graph, data_path, "conv7", 512U, + PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), + PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(6), point_quant_info.at(6)); + graph << get_node_A_qasymm(graph, data_path, "conv8", 512U, + PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), + PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(7), point_quant_info.at(7)); + graph << get_node_A_qasymm(graph, data_path, "conv9", 512U, + PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), + PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(8), point_quant_info.at(8)); + graph << get_node_A_qasymm(graph, data_path, "conv10", 512U, + PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), + PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(9), point_quant_info.at(9)); + graph << get_node_A_qasymm(graph, data_path, "conv11", 512U, + PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), + PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(10), point_quant_info.at(10)); SubStream conv_13(graph); - conv_13 << get_node_A_qasymm(graph, data_path, "conv12", 1024U, PadStrideInfo(2U, 2U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(11), - point_quant_info.at(11)); - conv_13 << get_node_A_qasymm(conv_13, data_path, "conv13", 1024U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(12), - point_quant_info.at(12)); + 
conv_13 << get_node_A_qasymm(graph, data_path, "conv12", 1024U, + PadStrideInfo(2U, 2U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), + PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(11), point_quant_info.at(11)); + conv_13 << get_node_A_qasymm(conv_13, data_path, "conv13", 1024U, + PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), + PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(12), point_quant_info.at(12)); SubStream conv_14(conv_13); - conv_14 << get_node_B_qasymm(conv_13, data_path, "conv13_2", 512U, PadStrideInfo(1U, 1U, 0U, 0U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::CEIL), conv_quant_info.at(1), - conv_quant_info.at(2)); + conv_14 << get_node_B_qasymm(conv_13, data_path, "conv13_2", 512U, PadStrideInfo(1U, 1U, 0U, 0U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::CEIL), + conv_quant_info.at(1), conv_quant_info.at(2)); SubStream conv_15(conv_14); - conv_15 << get_node_B_qasymm(conv_14, data_path, "conv13_3", 256U, PadStrideInfo(1U, 1U, 0U, 0U), PadStrideInfo(2U, 2U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), conv_quant_info.at(3), - conv_quant_info.at(4)); + conv_15 << get_node_B_qasymm(conv_14, data_path, "conv13_3", 256U, PadStrideInfo(1U, 1U, 0U, 0U), + PadStrideInfo(2U, 2U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), + conv_quant_info.at(3), conv_quant_info.at(4)); SubStream conv_16(conv_15); - conv_16 << get_node_B_qasymm(conv_15, data_path, "conv13_4", 256U, PadStrideInfo(1U, 1U, 0U, 0U), PadStrideInfo(2U, 2U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), conv_quant_info.at(5), - conv_quant_info.at(6)); + conv_16 << get_node_B_qasymm(conv_15, data_path, "conv13_4", 256U, PadStrideInfo(1U, 1U, 0U, 0U), + PadStrideInfo(2U, 2U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), + conv_quant_info.at(5), conv_quant_info.at(6)); SubStream conv_17(conv_16); - conv_17 << get_node_B_qasymm(conv_16, data_path, "conv13_5", 128U, PadStrideInfo(1U, 1U, 0U, 0U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::CEIL), conv_quant_info.at(7), - conv_quant_info.at(8)); + conv_17 << get_node_B_qasymm(conv_16, data_path, "conv13_5", 128U, PadStrideInfo(1U, 1U, 0U, 0U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::CEIL), + conv_quant_info.at(7), conv_quant_info.at(8)); // box_predictor - const std::vector> box_enc_pred_quant_info = - { - { QuantizationInfo(0.005202020984143019f, 136), QuantizationInfo(0.08655580133199692f, 183) }, // boxpredictor0_bep - { QuantizationInfo(0.003121797926723957f, 132), QuantizationInfo(0.03218776360154152f, 140) }, // boxpredictor1_bep - { QuantizationInfo(0.002995674265548587f, 130), QuantizationInfo(0.029072262346744537f, 125) }, // boxpredictor2_bep - { QuantizationInfo(0.0023131705820560455f, 130), QuantizationInfo(0.026488754898309708f, 127) }, // boxpredictor3_bep - { QuantizationInfo(0.0013905081432312727f, 132), QuantizationInfo(0.0199890099465847f, 137) }, // boxpredictor4_bep - { QuantizationInfo(0.00216794665902853f, 121), QuantizationInfo(0.019798893481492996f, 151) } // boxpredictor5_bep + const std::vector> box_enc_pred_quant_info = { + {QuantizationInfo(0.005202020984143019f, 136), + QuantizationInfo(0.08655580133199692f, 183)}, // boxpredictor0_bep + {QuantizationInfo(0.003121797926723957f, 132), + QuantizationInfo(0.03218776360154152f, 140)}, // boxpredictor1_bep + {QuantizationInfo(0.002995674265548587f, 130), + QuantizationInfo(0.029072262346744537f, 125)}, // boxpredictor2_bep + {QuantizationInfo(0.0023131705820560455f, 130), + 
QuantizationInfo(0.026488754898309708f, 127)}, // boxpredictor3_bep + {QuantizationInfo(0.0013905081432312727f, 132), + QuantizationInfo(0.0199890099465847f, 137)}, // boxpredictor4_bep + {QuantizationInfo(0.00216794665902853f, 121), + QuantizationInfo(0.019798893481492996f, 151)} // boxpredictor5_bep }; const std::vector box_reshape = // NHWC - { - TensorShape(4U, 1U, 1083U), // boxpredictor0_bep_reshape - TensorShape(4U, 1U, 600U), // boxpredictor1_bep_reshape - TensorShape(4U, 1U, 150U), // boxpredictor2_bep_reshape - TensorShape(4U, 1U, 54U), // boxpredictor3_bep_reshape - TensorShape(4U, 1U, 24U), // boxpredictor4_bep_reshape - TensorShape(4U, 1U, 6U) // boxpredictor5_bep_reshape - }; + { + TensorShape(4U, 1U, 1083U), // boxpredictor0_bep_reshape + TensorShape(4U, 1U, 600U), // boxpredictor1_bep_reshape + TensorShape(4U, 1U, 150U), // boxpredictor2_bep_reshape + TensorShape(4U, 1U, 54U), // boxpredictor3_bep_reshape + TensorShape(4U, 1U, 24U), // boxpredictor4_bep_reshape + TensorShape(4U, 1U, 6U) // boxpredictor5_bep_reshape + }; SubStream conv_11_box_enc_pre(graph); - conv_11_box_enc_pre << get_node_C_qasymm(graph, data_path, "BoxPredictor_0_BEP", 12U, PadStrideInfo(1U, 1U, 0U, 0U), box_enc_pred_quant_info.at(0), box_reshape.at(0)); + conv_11_box_enc_pre << get_node_C_qasymm(graph, data_path, "BoxPredictor_0_BEP", 12U, + PadStrideInfo(1U, 1U, 0U, 0U), box_enc_pred_quant_info.at(0), + box_reshape.at(0)); SubStream conv_13_box_enc_pre(conv_13); - conv_13_box_enc_pre << get_node_C_qasymm(conv_13, data_path, "BoxPredictor_1_BEP", 24U, PadStrideInfo(1U, 1U, 0U, 0U), box_enc_pred_quant_info.at(1), box_reshape.at(1)); + conv_13_box_enc_pre << get_node_C_qasymm(conv_13, data_path, "BoxPredictor_1_BEP", 24U, + PadStrideInfo(1U, 1U, 0U, 0U), box_enc_pred_quant_info.at(1), + box_reshape.at(1)); SubStream conv_14_2_box_enc_pre(conv_14); - conv_14_2_box_enc_pre << get_node_C_qasymm(conv_14, data_path, "BoxPredictor_2_BEP", 24U, PadStrideInfo(1U, 1U, 0U, 0U), box_enc_pred_quant_info.at(2), box_reshape.at(2)); + conv_14_2_box_enc_pre << get_node_C_qasymm(conv_14, data_path, "BoxPredictor_2_BEP", 24U, + PadStrideInfo(1U, 1U, 0U, 0U), box_enc_pred_quant_info.at(2), + box_reshape.at(2)); SubStream conv_15_2_box_enc_pre(conv_15); - conv_15_2_box_enc_pre << get_node_C_qasymm(conv_15, data_path, "BoxPredictor_3_BEP", 24U, PadStrideInfo(1U, 1U, 0U, 0U), box_enc_pred_quant_info.at(3), box_reshape.at(3)); + conv_15_2_box_enc_pre << get_node_C_qasymm(conv_15, data_path, "BoxPredictor_3_BEP", 24U, + PadStrideInfo(1U, 1U, 0U, 0U), box_enc_pred_quant_info.at(3), + box_reshape.at(3)); SubStream conv_16_2_box_enc_pre(conv_16); - conv_16_2_box_enc_pre << get_node_C_qasymm(conv_16, data_path, "BoxPredictor_4_BEP", 24U, PadStrideInfo(1U, 1U, 0U, 0U), box_enc_pred_quant_info.at(4), box_reshape.at(4)); + conv_16_2_box_enc_pre << get_node_C_qasymm(conv_16, data_path, "BoxPredictor_4_BEP", 24U, + PadStrideInfo(1U, 1U, 0U, 0U), box_enc_pred_quant_info.at(4), + box_reshape.at(4)); SubStream conv_17_2_box_enc_pre(conv_17); - conv_17_2_box_enc_pre << get_node_C_qasymm(conv_17, data_path, "BoxPredictor_5_BEP", 24U, PadStrideInfo(1U, 1U, 0U, 0U), box_enc_pred_quant_info.at(5), box_reshape.at(5)); + conv_17_2_box_enc_pre << get_node_C_qasymm(conv_17, data_path, "BoxPredictor_5_BEP", 24U, + PadStrideInfo(1U, 1U, 0U, 0U), box_enc_pred_quant_info.at(5), + box_reshape.at(5)); SubStream box_enc_pre(graph); const QuantizationInfo bep_concate_qinfo = QuantizationInfo(0.08655580133199692f, 183); - box_enc_pre << 
ConcatLayer(arm_compute::graph::descriptors::ConcatLayerDescriptor(DataLayoutDimension::HEIGHT, bep_concate_qinfo), - std::move(conv_11_box_enc_pre), std::move(conv_13_box_enc_pre), conv_14_2_box_enc_pre, std::move(conv_15_2_box_enc_pre), + box_enc_pre << ConcatLayer(arm_compute::graph::descriptors::ConcatLayerDescriptor(DataLayoutDimension::HEIGHT, + bep_concate_qinfo), + std::move(conv_11_box_enc_pre), std::move(conv_13_box_enc_pre), + conv_14_2_box_enc_pre, std::move(conv_15_2_box_enc_pre), std::move(conv_16_2_box_enc_pre), std::move(conv_17_2_box_enc_pre)) - .set_name("BoxPredictor/concat"); + .set_name("BoxPredictor/concat"); box_enc_pre << ReshapeLayer(TensorShape(4U, 1917U)).set_name("BoxPredictor/reshape"); // class_predictor - const std::vector> class_pred_quant_info = - { - { QuantizationInfo(0.002744135679677129f, 125), QuantizationInfo(0.05746262148022652f, 234) }, // boxpredictor0_cp - { QuantizationInfo(0.0024326108396053314f, 80), QuantizationInfo(0.03764628246426582f, 217) }, // boxpredictor1_cp - { QuantizationInfo(0.0013898586621508002f, 141), QuantizationInfo(0.034081317484378815f, 214) }, // boxpredictor2_cp - { QuantizationInfo(0.0014176908880472183f, 133), QuantizationInfo(0.033889178186655045f, 215) }, // boxpredictor3_cp - { QuantizationInfo(0.001090311910957098f, 125), QuantizationInfo(0.02646234817802906f, 230) }, // boxpredictor4_cp - { QuantizationInfo(0.001134163816459477f, 115), QuantizationInfo(0.026926767081022263f, 218) } // boxpredictor5_cp + const std::vector> class_pred_quant_info = { + {QuantizationInfo(0.002744135679677129f, 125), + QuantizationInfo(0.05746262148022652f, 234)}, // boxpredictor0_cp + {QuantizationInfo(0.0024326108396053314f, 80), + QuantizationInfo(0.03764628246426582f, 217)}, // boxpredictor1_cp + {QuantizationInfo(0.0013898586621508002f, 141), + QuantizationInfo(0.034081317484378815f, 214)}, // boxpredictor2_cp + {QuantizationInfo(0.0014176908880472183f, 133), + QuantizationInfo(0.033889178186655045f, 215)}, // boxpredictor3_cp + {QuantizationInfo(0.001090311910957098f, 125), + QuantizationInfo(0.02646234817802906f, 230)}, // boxpredictor4_cp + {QuantizationInfo(0.001134163816459477f, 115), + QuantizationInfo(0.026926767081022263f, 218)} // boxpredictor5_cp }; - const std::vector class_reshape = - { + const std::vector class_reshape = { TensorShape(91U, 1083U), // boxpredictor0_cp_reshape TensorShape(91U, 600U), // boxpredictor1_cp_reshape TensorShape(91U, 150U), // boxpredictor2_cp_reshape @@ -637,60 +719,80 @@ private: }; SubStream conv_11_class_pre(graph); - conv_11_class_pre << get_node_C_qasymm(graph, data_path, "BoxPredictor_0_CP", 273U, PadStrideInfo(1U, 1U, 0U, 0U), class_pred_quant_info.at(0), class_reshape.at(0)); + conv_11_class_pre << get_node_C_qasymm(graph, data_path, "BoxPredictor_0_CP", 273U, + PadStrideInfo(1U, 1U, 0U, 0U), class_pred_quant_info.at(0), + class_reshape.at(0)); SubStream conv_13_class_pre(conv_13); - conv_13_class_pre << get_node_C_qasymm(conv_13, data_path, "BoxPredictor_1_CP", 546U, PadStrideInfo(1U, 1U, 0U, 0U), class_pred_quant_info.at(1), class_reshape.at(1)); + conv_13_class_pre << get_node_C_qasymm(conv_13, data_path, "BoxPredictor_1_CP", 546U, + PadStrideInfo(1U, 1U, 0U, 0U), class_pred_quant_info.at(1), + class_reshape.at(1)); SubStream conv_14_2_class_pre(conv_14); - conv_14_2_class_pre << get_node_C_qasymm(conv_14, data_path, "BoxPredictor_2_CP", 546U, PadStrideInfo(1U, 1U, 0U, 0U), class_pred_quant_info.at(2), class_reshape.at(2)); + conv_14_2_class_pre << get_node_C_qasymm(conv_14, 
data_path, "BoxPredictor_2_CP", 546U, + PadStrideInfo(1U, 1U, 0U, 0U), class_pred_quant_info.at(2), + class_reshape.at(2)); SubStream conv_15_2_class_pre(conv_15); - conv_15_2_class_pre << get_node_C_qasymm(conv_15, data_path, "BoxPredictor_3_CP", 546U, PadStrideInfo(1U, 1U, 0U, 0U), class_pred_quant_info.at(3), class_reshape.at(3)); + conv_15_2_class_pre << get_node_C_qasymm(conv_15, data_path, "BoxPredictor_3_CP", 546U, + PadStrideInfo(1U, 1U, 0U, 0U), class_pred_quant_info.at(3), + class_reshape.at(3)); SubStream conv_16_2_class_pre(conv_16); - conv_16_2_class_pre << get_node_C_qasymm(conv_16, data_path, "BoxPredictor_4_CP", 546U, PadStrideInfo(1U, 1U, 0U, 0U), class_pred_quant_info.at(4), class_reshape.at(4)); + conv_16_2_class_pre << get_node_C_qasymm(conv_16, data_path, "BoxPredictor_4_CP", 546U, + PadStrideInfo(1U, 1U, 0U, 0U), class_pred_quant_info.at(4), + class_reshape.at(4)); SubStream conv_17_2_class_pre(conv_17); - conv_17_2_class_pre << get_node_C_qasymm(conv_17, data_path, "BoxPredictor_5_CP", 546U, PadStrideInfo(1U, 1U, 0U, 0U), class_pred_quant_info.at(5), class_reshape.at(5)); + conv_17_2_class_pre << get_node_C_qasymm(conv_17, data_path, "BoxPredictor_5_CP", 546U, + PadStrideInfo(1U, 1U, 0U, 0U), class_pred_quant_info.at(5), + class_reshape.at(5)); const QuantizationInfo cp_concate_qinfo = QuantizationInfo(0.0584389753639698f, 230); SubStream class_pred(graph); - class_pred << ConcatLayer( - arm_compute::graph::descriptors::ConcatLayerDescriptor(DataLayoutDimension::WIDTH, cp_concate_qinfo), - std::move(conv_11_class_pre), std::move(conv_13_class_pre), std::move(conv_14_2_class_pre), - std::move(conv_15_2_class_pre), std::move(conv_16_2_class_pre), std::move(conv_17_2_class_pre)) - .set_name("ClassPrediction/concat"); + class_pred << ConcatLayer(arm_compute::graph::descriptors::ConcatLayerDescriptor(DataLayoutDimension::WIDTH, + cp_concate_qinfo), + std::move(conv_11_class_pre), std::move(conv_13_class_pre), + std::move(conv_14_2_class_pre), std::move(conv_15_2_class_pre), + std::move(conv_16_2_class_pre), std::move(conv_17_2_class_pre)) + .set_name("ClassPrediction/concat"); const QuantizationInfo logistic_out_qinfo = QuantizationInfo(0.00390625f, 0); - class_pred << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC), logistic_out_qinfo).set_name("ClassPrediction/logistic"); - - const int max_detections = 10; - const int max_classes_per_detection = 1; - const float nms_score_threshold = 0.30000001192092896f; - const float nms_iou_threshold = 0.6000000238418579f; - const int num_classes = 90; - const float x_scale = 10.f; - const float y_scale = 10.f; - const float h_scale = 5.f; - const float w_scale = 5.f; - std::array scales = { y_scale, x_scale, w_scale, h_scale }; - const QuantizationInfo anchors_qinfo = QuantizationInfo(0.006453060545027256f, 0); + class_pred << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC), + logistic_out_qinfo) + .set_name("ClassPrediction/logistic"); + + const int max_detections = 10; + const int max_classes_per_detection = 1; + const float nms_score_threshold = 0.30000001192092896f; + const float nms_iou_threshold = 0.6000000238418579f; + const int num_classes = 90; + const float x_scale = 10.f; + const float y_scale = 10.f; + const float h_scale = 5.f; + const float w_scale = 5.f; + std::array scales = {y_scale, x_scale, w_scale, h_scale}; + const QuantizationInfo anchors_qinfo = QuantizationInfo(0.006453060545027256f, 0); SubStream detection_ouput(box_enc_pre); 
detection_ouput << DetectionPostProcessLayer(std::move(class_pred), - DetectionPostProcessLayerInfo(max_detections, max_classes_per_detection, nms_score_threshold, nms_iou_threshold, num_classes, scales), + DetectionPostProcessLayerInfo( + max_detections, max_classes_per_detection, nms_score_threshold, + nms_iou_threshold, num_classes, scales), get_weights_accessor(data_path, "anchors.npy"), anchors_qinfo) - .set_name("DetectionPostProcess"); + .set_name("DetectionPostProcess"); SubStream ouput_0(detection_ouput); - ouput_0 << OutputLayer(get_npy_output_accessor(detection_boxes_opt->value(), TensorShape(4U, 10U), DataType::F32), 0); + ouput_0 << OutputLayer( + get_npy_output_accessor(detection_boxes_opt->value(), TensorShape(4U, 10U), DataType::F32), 0); SubStream ouput_1(detection_ouput); - ouput_1 << OutputLayer(get_npy_output_accessor(detection_classes_opt->value(), TensorShape(10U), DataType::F32), 1); + ouput_1 << OutputLayer(get_npy_output_accessor(detection_classes_opt->value(), TensorShape(10U), DataType::F32), + 1); SubStream ouput_2(detection_ouput); - ouput_2 << OutputLayer(get_npy_output_accessor(detection_scores_opt->value(), TensorShape(10U), DataType::F32), 2); + ouput_2 << OutputLayer(get_npy_output_accessor(detection_scores_opt->value(), TensorShape(10U), DataType::F32), + 2); SubStream ouput_3(detection_ouput); ouput_3 << OutputLayer(get_npy_output_accessor(num_detections_opt->value(), TensorShape(1U), DataType::F32), 3); diff --git a/examples/graph_vgg16.cpp b/examples/graph_vgg16.cpp index fcfe6ef50d3b53b07fe549fae2a12b0c3f404094..72ac9694b1735450ac9e9e90a889558610f88796 100644 --- a/examples/graph_vgg16.cpp +++ b/examples/graph_vgg16.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/graph.h" + #include "support/ToolchainSupport.h" #include "utils/CommonGraphOptions.h" #include "utils/GraphUtils.h" @@ -35,8 +36,7 @@ using namespace arm_compute::graph_utils; class GraphVGG16Example : public Example { public: - GraphVGG16Example() - : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "VGG16") + GraphVGG16Example() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "VGG16") { } bool do_setup(int argc, char **argv) override @@ -49,7 +49,7 @@ public: common_params = consume_common_graph_parameters(common_opts); // Return when help menu is requested - if(common_params.help) + if (common_params.help) { cmd_parser.print_help(argv[0]); return false; @@ -62,153 +62,143 @@ public: std::string data_path = common_params.data_path; // Create a preprocessor object - const std::array mean_rgb{ { 123.68f, 116.779f, 103.939f } }; + const std::array mean_rgb{{123.68f, 116.779f, 103.939f}}; std::unique_ptr preprocessor = std::make_unique(mean_rgb); // Create input descriptor const auto operation_layout = common_params.data_layout; - const TensorShape tensor_shape = permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout); - TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout); + const TensorShape tensor_shape = + permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout); + TensorDescriptor input_descriptor = + TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout); // Set weights trained layout const DataLayout weights_layout = DataLayout::NCHW; // Create graph - graph << common_params.target - << common_params.fast_math_hint - << InputLayer(input_descriptor, 
get_input_accessor(common_params, std::move(preprocessor))) - // Layer 1 - << ConvolutionLayer( - 3U, 3U, 64U, - get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv1_1_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv1_1_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv1_1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv1_1/Relu") - // Layer 2 - << ConvolutionLayer( - 3U, 3U, 64U, - get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv1_2_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv1_2_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv1_2") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv1_2/Relu") - << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))).set_name("pool1") - // Layer 3 - << ConvolutionLayer( - 3U, 3U, 128U, - get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv2_1_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv2_1_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv2_1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv2_1/Relu") - // Layer 4 - << ConvolutionLayer( - 3U, 3U, 128U, - get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv2_2_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv2_2_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv2_2") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv2_2/Relu") - << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))).set_name("pool2") - // Layer 5 - << ConvolutionLayer( - 3U, 3U, 256U, - get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv3_1_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv3_1_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv3_1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv3_1/Relu") - // Layer 6 - << ConvolutionLayer( - 3U, 3U, 256U, - get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv3_2_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv3_2_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv3_2") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv3_2/Relu") - // Layer 7 - << ConvolutionLayer( - 3U, 3U, 256U, - get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv3_3_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv3_3_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv3_3") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv3_3/Relu") - << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))).set_name("pool3") - // Layer 8 - << ConvolutionLayer( - 3U, 3U, 512U, - get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv4_1_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv4_1_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv4_1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv4_1/Relu") - // Layer 9 - << ConvolutionLayer( - 3U, 3U, 512U, - 
get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv4_2_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv4_2_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv4_2") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv4_2/Relu") - // Layer 10 - << ConvolutionLayer( - 3U, 3U, 512U, - get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv4_3_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv4_3_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv4_3") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv4_3/Relu") - << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))).set_name("pool4") - // Layer 11 - << ConvolutionLayer( - 3U, 3U, 512U, - get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv5_1_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv5_1_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv5_1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv5_1/Relu") - // Layer 12 - << ConvolutionLayer( - 3U, 3U, 512U, - get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv5_2_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv5_2_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv5_2") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv5_2/Relu") - // Layer 13 - << ConvolutionLayer( - 3U, 3U, 512U, - get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv5_3_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv5_3_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv5_3") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv5_3/Relu") - << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))).set_name("pool5") - // Layer 14 - << FullyConnectedLayer( - 4096U, - get_weights_accessor(data_path, "/cnn_data/vgg16_model/fc6_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg16_model/fc6_b.npy")) - .set_name("fc6") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Relu") - // Layer 15 - << FullyConnectedLayer( - 4096U, - get_weights_accessor(data_path, "/cnn_data/vgg16_model/fc7_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg16_model/fc7_b.npy")) - .set_name("fc7") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Relu_1") - // Layer 16 - << FullyConnectedLayer( - 1000U, - get_weights_accessor(data_path, "/cnn_data/vgg16_model/fc8_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg16_model/fc8_b.npy")) - .set_name("fc8") - // Softmax - << SoftmaxLayer().set_name("prob") - << OutputLayer(get_output_accessor(common_params, 5)); + graph + << common_params.target << common_params.fast_math_hint + << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor))) + // Layer 1 + << ConvolutionLayer( + 3U, 3U, 64U, get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv1_1_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv1_1_b.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv1_1") + << 
ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv1_1/Relu") + // Layer 2 + << ConvolutionLayer( + 3U, 3U, 64U, get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv1_2_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv1_2_b.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv1_2") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv1_2/Relu") + << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))) + .set_name("pool1") + // Layer 3 + << ConvolutionLayer( + 3U, 3U, 128U, get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv2_1_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv2_1_b.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv2_1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv2_1/Relu") + // Layer 4 + << ConvolutionLayer( + 3U, 3U, 128U, get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv2_2_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv2_2_b.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv2_2") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv2_2/Relu") + << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))) + .set_name("pool2") + // Layer 5 + << ConvolutionLayer( + 3U, 3U, 256U, get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv3_1_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv3_1_b.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv3_1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv3_1/Relu") + // Layer 6 + << ConvolutionLayer( + 3U, 3U, 256U, get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv3_2_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv3_2_b.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv3_2") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv3_2/Relu") + // Layer 7 + << ConvolutionLayer( + 3U, 3U, 256U, get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv3_3_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv3_3_b.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv3_3") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv3_3/Relu") + << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))) + .set_name("pool3") + // Layer 8 + << ConvolutionLayer( + 3U, 3U, 512U, get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv4_1_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv4_1_b.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv4_1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv4_1/Relu") + // Layer 9 + << ConvolutionLayer( + 3U, 3U, 512U, get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv4_2_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv4_2_b.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv4_2") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv4_2/Relu") + // Layer 10 + << 
ConvolutionLayer( + 3U, 3U, 512U, get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv4_3_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv4_3_b.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv4_3") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv4_3/Relu") + << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))) + .set_name("pool4") + // Layer 11 + << ConvolutionLayer( + 3U, 3U, 512U, get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv5_1_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv5_1_b.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv5_1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv5_1/Relu") + // Layer 12 + << ConvolutionLayer( + 3U, 3U, 512U, get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv5_2_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv5_2_b.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv5_2") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv5_2/Relu") + // Layer 13 + << ConvolutionLayer( + 3U, 3U, 512U, get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv5_3_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv5_3_b.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv5_3") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv5_3/Relu") + << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))) + .set_name("pool5") + // Layer 14 + << FullyConnectedLayer(4096U, + get_weights_accessor(data_path, "/cnn_data/vgg16_model/fc6_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg16_model/fc6_b.npy")) + .set_name("fc6") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Relu") + // Layer 15 + << FullyConnectedLayer(4096U, + get_weights_accessor(data_path, "/cnn_data/vgg16_model/fc7_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg16_model/fc7_b.npy")) + .set_name("fc7") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Relu_1") + // Layer 16 + << FullyConnectedLayer(1000U, + get_weights_accessor(data_path, "/cnn_data/vgg16_model/fc8_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg16_model/fc8_b.npy")) + .set_name("fc8") + // Softmax + << SoftmaxLayer().set_name("prob") << OutputLayer(get_output_accessor(common_params, 5)); // Finalize graph GraphConfig config; diff --git a/examples/graph_vgg19.cpp b/examples/graph_vgg19.cpp index efc0bcce19003956108a6bc4fa7ddf755edeeb3e..929354465527a0976e2fc2a14c7fb150143265f9 100644 --- a/examples/graph_vgg19.cpp +++ b/examples/graph_vgg19.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/graph.h" + #include "support/ToolchainSupport.h" #include "utils/CommonGraphOptions.h" #include "utils/GraphUtils.h" @@ -34,8 +35,7 @@ using namespace arm_compute::graph_utils; class GraphVGG19Example : public Example { public: - GraphVGG19Example() - : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "VGG19") + GraphVGG19Example() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "VGG19") { } bool do_setup(int argc, char **argv) override @@ -48,7 +48,7 @@ public: common_params = consume_common_graph_parameters(common_opts); // Return when help menu is requested - if(common_params.help) + if (common_params.help) { cmd_parser.print_help(argv[0]); return false; @@ -61,165 +61,152 @@ public: std::string data_path = common_params.data_path; // Create a preprocessor object - const std::array mean_rgb{ { 123.68f, 116.779f, 103.939f } }; + const std::array mean_rgb{{123.68f, 116.779f, 103.939f}}; std::unique_ptr preprocessor = std::make_unique(mean_rgb); // Create input descriptor const auto operation_layout = common_params.data_layout; - const TensorShape tensor_shape = permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout); - TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout); + const TensorShape tensor_shape = + permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout); + TensorDescriptor input_descriptor = + TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout); // Set weights trained layout const DataLayout weights_layout = DataLayout::NCHW; - graph << common_params.target - << common_params.fast_math_hint - << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor))) - // Layer 1 - << ConvolutionLayer( - 3U, 3U, 64U, - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv1_1_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv1_1_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv1_1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv1_1/Relu") - << ConvolutionLayer( - 3U, 3U, 64U, - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv1_2_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv1_2_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv1_2") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv1_2/Relu") - << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))).set_name("pool1") - // Layer 2 - << ConvolutionLayer( - 3U, 3U, 128U, - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv2_1_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv2_1_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv2_1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv2_1/Relu") - << ConvolutionLayer( - 3U, 3U, 128U, - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv2_2_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv2_2_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv2_2") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv2_2/Relu") - << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, 
operation_layout, PadStrideInfo(2, 2, 0, 0))).set_name("pool2") - // Layer 3 - << ConvolutionLayer( - 3U, 3U, 256U, - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_1_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_1_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv3_1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv3_1/Relu") - << ConvolutionLayer( - 3U, 3U, 256U, - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_2_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_2_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv3_2") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv3_2/Relu") - << ConvolutionLayer( - 3U, 3U, 256U, - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_3_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_3_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv3_3") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv3_3/Relu") - << ConvolutionLayer( - 3U, 3U, 256U, - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_4_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_4_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv3_4") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv3_4/Relu") - << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))).set_name("pool3") - // Layer 4 - << ConvolutionLayer( - 3U, 3U, 512U, - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_1_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_1_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv4_1") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv4_1/Relu") - << ConvolutionLayer( - 3U, 3U, 512U, - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_2_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_2_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv4_2") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv4_2/Relu") - << ConvolutionLayer( - 3U, 3U, 512U, - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_3_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_3_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv4_3") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv4_3/Relu") - << ConvolutionLayer( - 3U, 3U, 512U, - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_4_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_4_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv4_4") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv4_4/Relu") - << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))).set_name("pool4") - // Layer 5 - << ConvolutionLayer( - 3U, 3U, 512U, - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_1_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_1_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv5_1") - << 
ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv5_1/Relu") - << ConvolutionLayer( - 3U, 3U, 512U, - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_2_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_2_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv5_2") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv5_2/Relu") - << ConvolutionLayer( - 3U, 3U, 512U, - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_3_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_3_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv5_3") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv5_3/Relu") - << ConvolutionLayer( - 3U, 3U, 512U, - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_4_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_4_b.npy"), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv5_4") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv5_4/Relu") - << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))).set_name("pool5") - // Layer 6 - << FullyConnectedLayer( - 4096U, - get_weights_accessor(data_path, "/cnn_data/vgg19_model/fc6_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg19_model/fc6_b.npy")) - .set_name("fc6") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Relu") - // Layer 7 - << FullyConnectedLayer( - 4096U, - get_weights_accessor(data_path, "/cnn_data/vgg19_model/fc7_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg19_model/fc7_b.npy")) - .set_name("fc7") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Relu_1") - // Layer 8 - << FullyConnectedLayer( - 1000U, - get_weights_accessor(data_path, "/cnn_data/vgg19_model/fc8_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/vgg19_model/fc8_b.npy")) - .set_name("fc8") - // Softmax - << SoftmaxLayer().set_name("prob") - << OutputLayer(get_output_accessor(common_params, 5)); + graph + << common_params.target << common_params.fast_math_hint + << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor))) + // Layer 1 + << ConvolutionLayer( + 3U, 3U, 64U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv1_1_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv1_1_b.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv1_1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv1_1/Relu") + << ConvolutionLayer( + 3U, 3U, 64U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv1_2_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv1_2_b.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv1_2") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv1_2/Relu") + << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))) + .set_name("pool1") + // Layer 2 + << ConvolutionLayer( + 3U, 3U, 128U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv2_1_w.npy", weights_layout), + get_weights_accessor(data_path, 
"/cnn_data/vgg19_model/conv2_1_b.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv2_1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv2_1/Relu") + << ConvolutionLayer( + 3U, 3U, 128U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv2_2_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv2_2_b.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv2_2") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv2_2/Relu") + << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))) + .set_name("pool2") + // Layer 3 + << ConvolutionLayer( + 3U, 3U, 256U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_1_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_1_b.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv3_1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv3_1/Relu") + << ConvolutionLayer( + 3U, 3U, 256U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_2_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_2_b.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv3_2") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv3_2/Relu") + << ConvolutionLayer( + 3U, 3U, 256U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_3_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_3_b.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv3_3") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv3_3/Relu") + << ConvolutionLayer( + 3U, 3U, 256U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_4_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_4_b.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv3_4") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv3_4/Relu") + << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))) + .set_name("pool3") + // Layer 4 + << ConvolutionLayer( + 3U, 3U, 512U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_1_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_1_b.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv4_1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv4_1/Relu") + << ConvolutionLayer( + 3U, 3U, 512U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_2_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_2_b.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv4_2") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv4_2/Relu") + << ConvolutionLayer( + 3U, 3U, 512U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_3_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_3_b.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv4_3") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv4_3/Relu") + << ConvolutionLayer( + 3U, 3U, 512U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_4_w.npy", weights_layout), + 
get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_4_b.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv4_4") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv4_4/Relu") + << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))) + .set_name("pool4") + // Layer 5 + << ConvolutionLayer( + 3U, 3U, 512U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_1_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_1_b.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv5_1") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv5_1/Relu") + << ConvolutionLayer( + 3U, 3U, 512U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_2_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_2_b.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv5_2") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv5_2/Relu") + << ConvolutionLayer( + 3U, 3U, 512U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_3_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_3_b.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv5_3") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv5_3/Relu") + << ConvolutionLayer( + 3U, 3U, 512U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_4_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_4_b.npy"), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv5_4") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv5_4/Relu") + << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))) + .set_name("pool5") + // Layer 6 + << FullyConnectedLayer(4096U, + get_weights_accessor(data_path, "/cnn_data/vgg19_model/fc6_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg19_model/fc6_b.npy")) + .set_name("fc6") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Relu") + // Layer 7 + << FullyConnectedLayer(4096U, + get_weights_accessor(data_path, "/cnn_data/vgg19_model/fc7_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg19_model/fc7_b.npy")) + .set_name("fc7") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Relu_1") + // Layer 8 + << FullyConnectedLayer(1000U, + get_weights_accessor(data_path, "/cnn_data/vgg19_model/fc8_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/vgg19_model/fc8_b.npy")) + .set_name("fc8") + // Softmax + << SoftmaxLayer().set_name("prob") << OutputLayer(get_output_accessor(common_params, 5)); // Finalize graph GraphConfig config; diff --git a/examples/graph_vgg_vdsr.cpp b/examples/graph_vgg_vdsr.cpp index 3fe28e0fed498914db8d2bb0857a7e9e292ba208..a6cd337f82ac7cb58b534971a58d784d58232951 100644 --- a/examples/graph_vgg_vdsr.cpp +++ b/examples/graph_vgg_vdsr.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/graph.h" + #include "support/ToolchainSupport.h" #include "utils/CommonGraphOptions.h" #include "utils/GraphUtils.h" @@ -36,8 +37,7 @@ using namespace arm_compute::graph_utils; class GraphVDSRExample : public Example { public: - GraphVDSRExample() - : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "VDSR") + GraphVDSRExample() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "VDSR") { model_input_width = cmd_parser.add_option>("image-width", 192); model_input_height = cmd_parser.add_option>("image-height", 192); @@ -46,7 +46,7 @@ public: model_input_width->set_help("Input image width."); model_input_height->set_help("Input image height."); } - GraphVDSRExample(const GraphVDSRExample &) = delete; + GraphVDSRExample(const GraphVDSRExample &) = delete; GraphVDSRExample &operator=(const GraphVDSRExample &) = delete; ~GraphVDSRExample() override = default; bool do_setup(int argc, char **argv) override @@ -59,7 +59,7 @@ public: common_params = consume_common_graph_parameters(common_opts); // Return when help menu is requested - if(common_params.help) + if (common_params.help) { cmd_parser.print_help(argv[0]); return false; @@ -82,15 +82,17 @@ public: std::unique_ptr preprocessor = std::make_unique(); // Create input descriptor - const TensorShape tensor_shape = permute_shape(TensorShape(image_width, image_height, 1U, common_params.batches), DataLayout::NCHW, common_params.data_layout); - TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout); + const TensorShape tensor_shape = + permute_shape(TensorShape(image_width, image_height, 1U, common_params.batches), DataLayout::NCHW, + common_params.data_layout); + TensorDescriptor input_descriptor = + TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout); // Set weights trained layout const DataLayout weights_layout = DataLayout::NCHW; // Note: Quantization info are random and used only for benchmarking purposes - graph << common_params.target - << common_params.fast_math_hint + graph << common_params.target << common_params.fast_math_hint << InputLayer(input_descriptor.set_quantization_info(QuantizationInfo(0.0078125f, 128)), get_input_accessor(common_params, std::move(preprocessor), false)); @@ -98,37 +100,34 @@ public: SubStream right(graph); // Layer 1 - right << ConvolutionLayer( - 3U, 3U, 64U, - get_weights_accessor(data_path, "conv0_w.npy", weights_layout), - get_weights_accessor(data_path, "conv0_b.npy"), - PadStrideInfo(1, 1, 1, 1), 1, QuantizationInfo(0.031778190285f, 156), QuantizationInfo(0.0784313753247f, 128)) - .set_name("conv0") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv0/Relu"); + right << ConvolutionLayer(3U, 3U, 64U, get_weights_accessor(data_path, "conv0_w.npy", weights_layout), + get_weights_accessor(data_path, "conv0_b.npy"), PadStrideInfo(1, 1, 1, 1), 1, + QuantizationInfo(0.031778190285f, 156), QuantizationInfo(0.0784313753247f, 128)) + .set_name("conv0") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv0/Relu"); // Rest 17 layers - for(unsigned int i = 1; i < 19; ++i) + for (unsigned int i = 1; i < 19; ++i) { const std::string conv_w_path = "conv" + arm_compute::support::cpp11::to_string(i) + "_w.npy"; const std::string conv_b_path = "conv" + arm_compute::support::cpp11::to_string(i) + "_b.npy"; const std::string conv_name = "conv" + 
arm_compute::support::cpp11::to_string(i); - right << ConvolutionLayer( - 3U, 3U, 64U, - get_weights_accessor(data_path, conv_w_path, weights_layout), - get_weights_accessor(data_path, conv_b_path), - PadStrideInfo(1, 1, 1, 1), 1, QuantizationInfo(0.015851572156f, 93)) - .set_name(conv_name) - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(conv_name + "/Relu"); + right << ConvolutionLayer(3U, 3U, 64U, get_weights_accessor(data_path, conv_w_path, weights_layout), + get_weights_accessor(data_path, conv_b_path), PadStrideInfo(1, 1, 1, 1), 1, + QuantizationInfo(0.015851572156f, 93)) + .set_name(conv_name) + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name(conv_name + "/Relu"); } // Final layer - right << ConvolutionLayer( - 3U, 3U, 1U, - get_weights_accessor(data_path, "conv20_w.npy", weights_layout), - get_weights_accessor(data_path, "conv20_b.npy"), - PadStrideInfo(1, 1, 1, 1), 1, QuantizationInfo(0.015851572156f, 93)) - .set_name("conv20") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv20/Relu"); + right << ConvolutionLayer(3U, 3U, 1U, get_weights_accessor(data_path, "conv20_w.npy", weights_layout), + get_weights_accessor(data_path, "conv20_b.npy"), PadStrideInfo(1, 1, 1, 1), 1, + QuantizationInfo(0.015851572156f, 93)) + .set_name("conv20") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) + .set_name("conv20/Relu"); // Add residual to input graph << EltwiseLayer(std::move(left), std::move(right), EltwiseOperation::Add).set_name("add") @@ -157,8 +156,8 @@ public: private: CommandLineParser cmd_parser; CommonGraphOptions common_opts; - SimpleOption *model_input_width{ nullptr }; - SimpleOption *model_input_height{ nullptr }; + SimpleOption *model_input_width{nullptr}; + SimpleOption *model_input_height{nullptr}; CommonGraphParams common_params; Stream graph; }; diff --git a/examples/graph_yolov3.cpp b/examples/graph_yolov3.cpp index 3c8ddbffd812b8d2b09ca8240aecbdb14d3513d5..5c8d3426ec78c3d4d9dc264d4240fb034e0ab2f9 100644 --- a/examples/graph_yolov3.cpp +++ b/examples/graph_yolov3.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/graph.h" + #include "support/ToolchainSupport.h" #include "utils/CommonGraphOptions.h" #include "utils/GraphUtils.h" @@ -35,8 +36,7 @@ using namespace arm_compute::graph_utils; class GraphYOLOv3Example : public Example { public: - GraphYOLOv3Example() - : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "YOLOv3") + GraphYOLOv3Example() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "YOLOv3") { } @@ -50,14 +50,15 @@ public: common_params = consume_common_graph_parameters(common_opts); // Return when help menu is requested - if(common_params.help) + if (common_params.help) { cmd_parser.print_help(argv[0]); return false; } // Checks - ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), "QASYMM8 not supported for this graph"); + ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), + "QASYMM8 not supported for this graph"); // Print parameter values std::cout << common_params << std::endl; @@ -69,331 +70,322 @@ public: std::unique_ptr preprocessor = std::make_unique(0.f); // Create input descriptor - const TensorShape tensor_shape = permute_shape(TensorShape(608U, 608U, 3U, 1U), DataLayout::NCHW, common_params.data_layout); - TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout); + const TensorShape tensor_shape = + permute_shape(TensorShape(608U, 608U, 3U, 1U), DataLayout::NCHW, common_params.data_layout); + TensorDescriptor input_descriptor = + TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout); // Set weights trained layout const DataLayout weights_layout = DataLayout::NCHW; - graph << common_params.target - << common_params.fast_math_hint + graph << common_params.target << common_params.fast_math_hint << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), false)); std::pair intermediate_layers = darknet53(data_path, weights_layout); - graph << ConvolutionLayer( - 1U, 1U, 512U, - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_53_w.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("conv2d_53") - << BatchNormalizationLayer( - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_53_mean.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_53_var.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_53_gamma.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_53_beta.npy"), - 0.000001f) - .set_name("conv2d_53/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_53/LeakyRelu") - << ConvolutionLayer( - 3U, 3U, 1024U, - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_54_w.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv2d_54") - << BatchNormalizationLayer( - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_54_mean.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_54_var.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_54_gamma.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_54_beta.npy"), - 0.000001f) - .set_name("conv2d_54/BatchNorm") - << 
ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_54/LeakyRelu") - << ConvolutionLayer( - 1U, 1U, 512U, - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_55_w.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("conv2d_55") - << BatchNormalizationLayer( - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_55_mean.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_55_var.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_55_gamma.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_55_beta.npy"), - 0.000001f) - .set_name("conv2d_55/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_55/LeakyRelu") - << ConvolutionLayer( - 3U, 3U, 1024U, - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_56_w.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv2d_56") - << BatchNormalizationLayer( - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_56_mean.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_56_var.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_56_gamma.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_56_beta.npy"), - 0.000001f) - .set_name("conv2d_56/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_56/LeakyRelu") - << ConvolutionLayer( - 1U, 1U, 512U, - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_57_w.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("conv2d_57") - << BatchNormalizationLayer( - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_57_mean.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_57_var.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_57_gamma.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_57_beta.npy"), - 0.000001f) - .set_name("conv2d_57/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_57/LeakyRelu"); + graph + << ConvolutionLayer( + 1U, 1U, 512U, + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_53_w.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("conv2d_53") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_53_mean.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_53_var.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_53_gamma.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_53_beta.npy"), 0.000001f) + .set_name("conv2d_53/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)) + .set_name("conv2d_53/LeakyRelu") + << ConvolutionLayer( + 3U, 3U, 1024U, + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_54_w.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv2d_54") + << 
BatchNormalizationLayer( + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_54_mean.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_54_var.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_54_gamma.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_54_beta.npy"), 0.000001f) + .set_name("conv2d_54/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)) + .set_name("conv2d_54/LeakyRelu") + << ConvolutionLayer( + 1U, 1U, 512U, + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_55_w.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("conv2d_55") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_55_mean.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_55_var.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_55_gamma.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_55_beta.npy"), 0.000001f) + .set_name("conv2d_55/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)) + .set_name("conv2d_55/LeakyRelu") + << ConvolutionLayer( + 3U, 3U, 1024U, + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_56_w.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv2d_56") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_56_mean.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_56_var.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_56_gamma.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_56_beta.npy"), 0.000001f) + .set_name("conv2d_56/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)) + .set_name("conv2d_56/LeakyRelu") + << ConvolutionLayer( + 1U, 1U, 512U, + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_57_w.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("conv2d_57") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_57_mean.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_57_var.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_57_gamma.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_57_beta.npy"), 0.000001f) + .set_name("conv2d_57/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)) + .set_name("conv2d_57/LeakyRelu"); SubStream route_1(graph); - graph << ConvolutionLayer( - 3U, 3U, 1024U, - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_58_w.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv2d_58") - << BatchNormalizationLayer( - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_58_mean.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_58_var.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_58_gamma.npy"), - 
get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_58_beta.npy"), - 0.000001f) - .set_name("conv2d_58/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_58/LeakyRelu") - << ConvolutionLayer( - 1U, 1U, 255U, - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_59_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_59_b.npy", weights_layout), - PadStrideInfo(1, 1, 0, 0)) - .set_name("conv2d_59") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 1.f)).set_name("conv2d_59/Linear") - << YOLOLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 0.1f)).set_name("Yolo1") - << OutputLayer(get_output_accessor(common_params, 5)); + graph + << ConvolutionLayer( + 3U, 3U, 1024U, + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_58_w.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv2d_58") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_58_mean.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_58_var.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_58_gamma.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_58_beta.npy"), 0.000001f) + .set_name("conv2d_58/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)) + .set_name("conv2d_58/LeakyRelu") + << ConvolutionLayer( + 1U, 1U, 255U, + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_59_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_59_b.npy", weights_layout), + PadStrideInfo(1, 1, 0, 0)) + .set_name("conv2d_59") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 1.f)) + .set_name("conv2d_59/Linear") + << YOLOLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 0.1f)).set_name("Yolo1") + << OutputLayer(get_output_accessor(common_params, 5)); route_1 << ConvolutionLayer( - 1U, 1U, 256U, - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_60_w.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("conv2d_60") + 1U, 1U, 256U, + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_60_w.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("conv2d_60") << BatchNormalizationLayer( - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_59_mean.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_59_var.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_59_gamma.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_59_beta.npy"), - 0.000001f) - .set_name("conv2d_59/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_60/LeakyRelu") + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_59_mean.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_59_var.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_59_gamma.npy"), + get_weights_accessor(data_path, 
"/cnn_data/yolov3_model/batch_normalization_59_beta.npy"), + 0.000001f) + .set_name("conv2d_59/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)) + .set_name("conv2d_60/LeakyRelu") << ResizeLayer(InterpolationPolicy::NEAREST_NEIGHBOR, 2, 2).set_name("Upsample_60"); SubStream concat_1(route_1); - concat_1 << ConcatLayer(std::move(route_1), std::move(intermediate_layers.second)).set_name("Route1") - << ConvolutionLayer( - 1U, 1U, 256U, - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_61_w.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("conv2d_61") - << BatchNormalizationLayer( - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_60_mean.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_60_var.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_60_gamma.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_60_beta.npy"), - 0.000001f) - .set_name("conv2d_60/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_61/LeakyRelu") - << ConvolutionLayer( - 3U, 3U, 512U, - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_62_w.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv2d_62") - << BatchNormalizationLayer( - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_61_mean.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_61_var.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_61_gamma.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_61_beta.npy"), - 0.000001f) - .set_name("conv2d_61/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_62/LeakyRelu") - << ConvolutionLayer( - 1U, 1U, 256U, - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_63_w.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("conv2d_63") - << BatchNormalizationLayer( - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_62_mean.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_62_var.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_62_gamma.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_62_beta.npy"), - 0.000001f) - .set_name("conv2d_62/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_63/LeakyRelu") - << ConvolutionLayer( - 3U, 3U, 512U, - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_64_w.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv2d_64") - << BatchNormalizationLayer( - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_63_mean.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_63_var.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_63_gamma.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_63_beta.npy"), - 0.000001f) - .set_name("conv2d_63/BatchNorm") - << 
ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_64/LeakyRelu") - << ConvolutionLayer( - 1U, 1U, 256U, - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_65_w.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("conv2d_65") - << BatchNormalizationLayer( - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_64_mean.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_64_var.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_64_gamma.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_64_beta.npy"), - 0.000001f) - .set_name("conv2d_65/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_65/LeakyRelu"); + concat_1 + << ConcatLayer(std::move(route_1), std::move(intermediate_layers.second)).set_name("Route1") + << ConvolutionLayer( + 1U, 1U, 256U, + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_61_w.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("conv2d_61") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_60_mean.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_60_var.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_60_gamma.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_60_beta.npy"), 0.000001f) + .set_name("conv2d_60/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)) + .set_name("conv2d_61/LeakyRelu") + << ConvolutionLayer( + 3U, 3U, 512U, + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_62_w.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv2d_62") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_61_mean.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_61_var.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_61_gamma.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_61_beta.npy"), 0.000001f) + .set_name("conv2d_61/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)) + .set_name("conv2d_62/LeakyRelu") + << ConvolutionLayer( + 1U, 1U, 256U, + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_63_w.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("conv2d_63") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_62_mean.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_62_var.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_62_gamma.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_62_beta.npy"), 0.000001f) + .set_name("conv2d_62/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)) + .set_name("conv2d_63/LeakyRelu") + << ConvolutionLayer( + 3U, 3U, 512U, + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_64_w.npy", weights_layout), + 
std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv2d_64") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_63_mean.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_63_var.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_63_gamma.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_63_beta.npy"), 0.000001f) + .set_name("conv2d_63/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)) + .set_name("conv2d_64/LeakyRelu") + << ConvolutionLayer( + 1U, 1U, 256U, + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_65_w.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("conv2d_65") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_64_mean.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_64_var.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_64_gamma.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_64_beta.npy"), 0.000001f) + .set_name("conv2d_65/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)) + .set_name("conv2d_65/LeakyRelu"); SubStream route_2(concat_1); - concat_1 << ConvolutionLayer( - 3U, 3U, 512U, - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_66_w.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv2d_66") - << BatchNormalizationLayer( - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_65_mean.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_65_var.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_65_gamma.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_65_beta.npy"), - 0.000001f) - .set_name("conv2d_65/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_66/LeakyRelu") - << ConvolutionLayer( - 1U, 1U, 255U, - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_67_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_67_b.npy", weights_layout), - PadStrideInfo(1, 1, 0, 0)) - .set_name("conv2d_67") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 1.f)).set_name("conv2d_67/Linear") - << YOLOLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 0.1f)).set_name("Yolo2") - << OutputLayer(get_output_accessor(common_params, 5)); + concat_1 + << ConvolutionLayer( + 3U, 3U, 512U, + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_66_w.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv2d_66") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_65_mean.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_65_var.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_65_gamma.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_65_beta.npy"), 0.000001f) + .set_name("conv2d_65/BatchNorm") + << 
ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)) + .set_name("conv2d_66/LeakyRelu") + << ConvolutionLayer( + 1U, 1U, 255U, + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_67_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_67_b.npy", weights_layout), + PadStrideInfo(1, 1, 0, 0)) + .set_name("conv2d_67") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 1.f)) + .set_name("conv2d_67/Linear") + << YOLOLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 0.1f)).set_name("Yolo2") + << OutputLayer(get_output_accessor(common_params, 5)); route_2 << ConvolutionLayer( - 1U, 1U, 128U, - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_68_w.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("conv2d_68") + 1U, 1U, 128U, + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_68_w.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("conv2d_68") << BatchNormalizationLayer( - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_66_mean.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_66_var.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_66_gamma.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_66_beta.npy"), - 0.000001f) - .set_name("conv2d_66/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_68/LeakyRelu") + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_66_mean.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_66_var.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_66_gamma.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_66_beta.npy"), + 0.000001f) + .set_name("conv2d_66/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)) + .set_name("conv2d_68/LeakyRelu") << ResizeLayer(InterpolationPolicy::NEAREST_NEIGHBOR, 2, 2).set_name("Upsample_68"); SubStream concat_2(route_2); - concat_2 << ConcatLayer(std::move(route_2), std::move(intermediate_layers.first)).set_name("Route2") - << ConvolutionLayer( - 1U, 1U, 128U, - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_69_w.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("conv2d_69") - << BatchNormalizationLayer( - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_67_mean.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_67_var.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_67_gamma.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_67_beta.npy"), - 0.000001f) - .set_name("conv2d_67/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_69/LeakyRelu") - << ConvolutionLayer( - 3U, 3U, 256U, - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_70_w.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv2d_70") - << BatchNormalizationLayer( - get_weights_accessor(data_path, 
"/cnn_data/yolov3_model/batch_normalization_68_mean.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_68_var.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_68_gamma.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_68_beta.npy"), - 0.000001f) - .set_name("conv2d_68/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_70/LeakyRelu") - << ConvolutionLayer( - 1U, 1U, 128U, - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_71_w.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("conv2d_71") - << BatchNormalizationLayer( - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_69_mean.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_69_var.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_69_gamma.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_69_beta.npy"), - 0.000001f) - .set_name("conv2d_69/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_71/LeakyRelu") - << ConvolutionLayer( - 3U, 3U, 256U, - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_72_w.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv2d_72") - << BatchNormalizationLayer( - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_70_mean.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_70_var.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_70_gamma.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_70_beta.npy"), - 0.000001f) - .set_name("conv2d_70/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_72/LeakyRelu") - << ConvolutionLayer( - 1U, 1U, 128U, - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_73_w.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("conv2d_73") - << BatchNormalizationLayer( - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_71_mean.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_71_var.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_71_gamma.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_71_beta.npy"), - 0.000001f) - .set_name("conv2d_71/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_73/LeakyRelu") - << ConvolutionLayer( - 3U, 3U, 256U, - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_74_w.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv2d_74") - << BatchNormalizationLayer( - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_72_mean.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_72_var.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_72_gamma.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_72_beta.npy"), - 0.000001f) - 
.set_name("conv2d_72/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_74/LeakyRelu") - << ConvolutionLayer( - 1U, 1U, 255U, - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_75_w.npy", weights_layout), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_75_b.npy", weights_layout), - PadStrideInfo(1, 1, 0, 0)) - .set_name("conv2d_75") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 1.f)).set_name("conv2d_75/Linear") - << YOLOLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 0.1f)).set_name("Yolo3") - << OutputLayer(get_output_accessor(common_params, 5)); + concat_2 + << ConcatLayer(std::move(route_2), std::move(intermediate_layers.first)).set_name("Route2") + << ConvolutionLayer( + 1U, 1U, 128U, + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_69_w.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("conv2d_69") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_67_mean.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_67_var.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_67_gamma.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_67_beta.npy"), 0.000001f) + .set_name("conv2d_67/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)) + .set_name("conv2d_69/LeakyRelu") + << ConvolutionLayer( + 3U, 3U, 256U, + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_70_w.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv2d_70") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_68_mean.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_68_var.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_68_gamma.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_68_beta.npy"), 0.000001f) + .set_name("conv2d_68/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)) + .set_name("conv2d_70/LeakyRelu") + << ConvolutionLayer( + 1U, 1U, 128U, + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_71_w.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("conv2d_71") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_69_mean.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_69_var.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_69_gamma.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_69_beta.npy"), 0.000001f) + .set_name("conv2d_69/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)) + .set_name("conv2d_71/LeakyRelu") + << ConvolutionLayer( + 3U, 3U, 256U, + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_72_w.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv2d_72") + << BatchNormalizationLayer( + get_weights_accessor(data_path, 
"/cnn_data/yolov3_model/batch_normalization_70_mean.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_70_var.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_70_gamma.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_70_beta.npy"), 0.000001f) + .set_name("conv2d_70/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)) + .set_name("conv2d_72/LeakyRelu") + << ConvolutionLayer( + 1U, 1U, 128U, + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_73_w.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("conv2d_73") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_71_mean.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_71_var.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_71_gamma.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_71_beta.npy"), 0.000001f) + .set_name("conv2d_71/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)) + .set_name("conv2d_73/LeakyRelu") + << ConvolutionLayer( + 3U, 3U, 256U, + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_74_w.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv2d_74") + << BatchNormalizationLayer( + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_72_mean.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_72_var.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_72_gamma.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_72_beta.npy"), 0.000001f) + .set_name("conv2d_72/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)) + .set_name("conv2d_74/LeakyRelu") + << ConvolutionLayer( + 1U, 1U, 255U, + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_75_w.npy", weights_layout), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_75_b.npy", weights_layout), + PadStrideInfo(1, 1, 0, 0)) + .set_name("conv2d_75") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 1.f)) + .set_name("conv2d_75/Linear") + << YOLOLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 0.1f)).set_name("Yolo3") + << OutputLayer(get_output_accessor(common_params, 5)); // Finalize graph GraphConfig config; @@ -422,64 +414,64 @@ private: std::pair darknet53(const std::string &data_path, DataLayout weights_layout) { graph << ConvolutionLayer( - 3U, 3U, 32U, - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_1_w.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv2d_1/Conv2D") + 3U, 3U, 32U, + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_1_w.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv2d_1/Conv2D") << BatchNormalizationLayer( - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_1_mean.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_1_var.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_1_gamma.npy"), 
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_1_beta.npy"), - 0.000001f) - .set_name("conv2d_1/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_1/LeakyRelu") + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_1_mean.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_1_var.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_1_gamma.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_1_beta.npy"), + 0.000001f) + .set_name("conv2d_1/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)) + .set_name("conv2d_1/LeakyRelu") << ConvolutionLayer( - 3U, 3U, 64U, - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_2_w.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(2, 2, 1, 1)) - .set_name("conv2d_2/Conv2D") + 3U, 3U, 64U, + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_2_w.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(2, 2, 1, 1)) + .set_name("conv2d_2/Conv2D") << BatchNormalizationLayer( - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_2_mean.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_2_var.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_2_gamma.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_2_beta.npy"), - 0.000001f) - .set_name("conv2d_2/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_2/LeakyRelu"); + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_2_mean.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_2_var.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_2_gamma.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_2_beta.npy"), + 0.000001f) + .set_name("conv2d_2/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)) + .set_name("conv2d_2/LeakyRelu"); darknet53_block(data_path, "3", weights_layout, 32U); graph << ConvolutionLayer( - 3U, 3U, 128U, - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_5_w.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(2, 2, 1, 1)) - .set_name("conv2d_5/Conv2D") + 3U, 3U, 128U, + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_5_w.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(2, 2, 1, 1)) + .set_name("conv2d_5/Conv2D") << BatchNormalizationLayer( - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_5_mean.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_5_var.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_5_gamma.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_5_beta.npy"), - 0.000001f) - .set_name("conv2d_5/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_5/LeakyRelu"); + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_5_mean.npy"), + get_weights_accessor(data_path, 
"/cnn_data/yolov3_model/batch_normalization_5_var.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_5_gamma.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_5_beta.npy"), + 0.000001f) + .set_name("conv2d_5/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)) + .set_name("conv2d_5/LeakyRelu"); darknet53_block(data_path, "6", weights_layout, 64U); darknet53_block(data_path, "8", weights_layout, 64U); graph << ConvolutionLayer( - 3U, 3U, 256U, - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_10_w.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(2, 2, 1, 1)) - .set_name("conv2d_10/Conv2D") + 3U, 3U, 256U, + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_10_w.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(2, 2, 1, 1)) + .set_name("conv2d_10/Conv2D") << BatchNormalizationLayer( - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_10_mean.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_10_var.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_10_gamma.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_10_beta.npy"), - 0.000001f) - .set_name("conv2d_10/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_10/LeakyRelu"); + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_10_mean.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_10_var.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_10_gamma.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_10_beta.npy"), + 0.000001f) + .set_name("conv2d_10/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)) + .set_name("conv2d_10/LeakyRelu"); darknet53_block(data_path, "11", weights_layout, 128U); darknet53_block(data_path, "13", weights_layout, 128U); darknet53_block(data_path, "15", weights_layout, 128U); @@ -490,19 +482,19 @@ private: darknet53_block(data_path, "25", weights_layout, 128U); SubStream layer_36(graph); graph << ConvolutionLayer( - 3U, 3U, 512U, - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_27_w.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(2, 2, 1, 1)) - .set_name("conv2d_27/Conv2D") + 3U, 3U, 512U, + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_27_w.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(2, 2, 1, 1)) + .set_name("conv2d_27/Conv2D") << BatchNormalizationLayer( - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_27_mean.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_27_var.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_27_gamma.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_27_beta.npy"), - 0.000001f) - .set_name("conv2d_27/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_27/LeakyRelu"); + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_27_mean.npy"), + get_weights_accessor(data_path, 
"/cnn_data/yolov3_model/batch_normalization_27_var.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_27_gamma.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_27_beta.npy"), + 0.000001f) + .set_name("conv2d_27/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)) + .set_name("conv2d_27/LeakyRelu"); darknet53_block(data_path, "28", weights_layout, 256U); darknet53_block(data_path, "30", weights_layout, 256U); darknet53_block(data_path, "32", weights_layout, 256U); @@ -513,19 +505,19 @@ private: darknet53_block(data_path, "42", weights_layout, 256U); SubStream layer_61(graph); graph << ConvolutionLayer( - 3U, 3U, 1024U, - get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_44_w.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(2, 2, 1, 1)) - .set_name("conv2d_44/Conv2D") + 3U, 3U, 1024U, + get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_44_w.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(2, 2, 1, 1)) + .set_name("conv2d_44/Conv2D") << BatchNormalizationLayer( - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_44_mean.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_44_var.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_44_gamma.npy"), - get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_44_beta.npy"), - 0.000001f) - .set_name("conv2d_44/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_44/LeakyRelu"); + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_44_mean.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_44_var.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_44_gamma.npy"), + get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_44_beta.npy"), + 0.000001f) + .set_name("conv2d_44/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)) + .set_name("conv2d_44/LeakyRelu"); darknet53_block(data_path, "45", weights_layout, 512U); darknet53_block(data_path, "47", weights_layout, 512U); darknet53_block(data_path, "49", weights_layout, 512U); @@ -534,43 +526,48 @@ private: return std::pair(layer_36, layer_61); } - void darknet53_block(const std::string &data_path, std::string &¶m_path, DataLayout weights_layout, - unsigned int filter_size) + void darknet53_block(const std::string &data_path, + std::string &¶m_path, + DataLayout weights_layout, + unsigned int filter_size) { - std::string total_path = "/cnn_data/yolov3_model/"; - std::string param_path2 = arm_compute::support::cpp11::to_string(arm_compute::support::cpp11::stoi(param_path) + 1); - SubStream i_a(graph); - SubStream i_b(graph); + std::string total_path = "/cnn_data/yolov3_model/"; + std::string param_path2 = + arm_compute::support::cpp11::to_string(arm_compute::support::cpp11::stoi(param_path) + 1); + SubStream i_a(graph); + SubStream i_b(graph); i_a << ConvolutionLayer( - 1U, 1U, filter_size, - get_weights_accessor(data_path, total_path + "conv2d_" + param_path + "_w.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 0, 0)) - .set_name("conv2d_" + param_path + "/Conv2D") + 1U, 1U, filter_size, + get_weights_accessor(data_path, total_path + "conv2d_" 
+ param_path + "_w.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 0, 0)) + .set_name("conv2d_" + param_path + "/Conv2D") << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path + "_mean.npy"), - get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path + "_var.npy"), - get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path + "_gamma.npy"), - get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path + "_beta.npy"), - 0.000001f) - .set_name("conv2d_" + param_path + "/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_" + param_path + "/LeakyRelu") + get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path + "_mean.npy"), + get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path + "_var.npy"), + get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path + "_gamma.npy"), + get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path + "_beta.npy"), + 0.000001f) + .set_name("conv2d_" + param_path + "/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)) + .set_name("conv2d_" + param_path + "/LeakyRelu") << ConvolutionLayer( - 3U, 3U, filter_size * 2, - get_weights_accessor(data_path, total_path + "conv2d_" + param_path2 + "_w.npy", weights_layout), - std::unique_ptr(nullptr), - PadStrideInfo(1, 1, 1, 1)) - .set_name("conv2d_" + param_path2 + "/Conv2D") + 3U, 3U, filter_size * 2, + get_weights_accessor(data_path, total_path + "conv2d_" + param_path2 + "_w.npy", weights_layout), + std::unique_ptr(nullptr), PadStrideInfo(1, 1, 1, 1)) + .set_name("conv2d_" + param_path2 + "/Conv2D") << BatchNormalizationLayer( - get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path2 + "_mean.npy"), - get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path2 + "_var.npy"), - get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path2 + "_gamma.npy"), - get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path2 + "_beta.npy"), - 0.000001f) - .set_name("conv2d_" + param_path2 + "/BatchNorm") - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_" + param_path2 + "/LeakyRelu"); + get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path2 + "_mean.npy"), + get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path2 + "_var.npy"), + get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path2 + "_gamma.npy"), + get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path2 + "_beta.npy"), + 0.000001f) + .set_name("conv2d_" + param_path2 + "/BatchNorm") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)) + .set_name("conv2d_" + param_path2 + "/LeakyRelu"); - graph << EltwiseLayer(std::move(i_a), std::move(i_b), EltwiseOperation::Add).set_name("").set_name("add_" + param_path + "_" + param_path2); + graph << EltwiseLayer(std::move(i_a), std::move(i_b), EltwiseOperation::Add) + .set_name("") + .set_name("add_" + param_path + "_" + param_path2); } }; diff --git a/examples/neon_cnn.cpp b/examples/neon_cnn.cpp index 
5ecf055e60bb6310ff4acd1332d238a716a9e0db..1f7a1ea6ca688ef6a38eb81f37391ed835e451e8 100644 --- a/examples/neon_cnn.cpp +++ b/examples/neon_cnn.cpp @@ -21,13 +21,13 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/runtime/NEON/NEFunctions.h" - #include "arm_compute/core/Types.h" #include "arm_compute/runtime/Allocator.h" #include "arm_compute/runtime/BlobLifetimeManager.h" #include "arm_compute/runtime/MemoryManagerOnDemand.h" +#include "arm_compute/runtime/NEON/NEFunctions.h" #include "arm_compute/runtime/PoolManager.h" + #include "utils/Utils.h" using namespace arm_compute; @@ -43,12 +43,13 @@ public: // Create memory manager components // We need 2 memory managers: 1 for handling the tensors within the functions (mm_layers) and 1 for handling the input and output tensors of the functions (mm_transitions)) - auto lifetime_mgr0 = std::make_shared<BlobLifetimeManager>(); // Create lifetime manager - auto lifetime_mgr1 = std::make_shared<BlobLifetimeManager>(); // Create lifetime manager - auto pool_mgr0 = std::make_shared<PoolManager>(); // Create pool manager - auto pool_mgr1 = std::make_shared<PoolManager>(); // Create pool manager - auto mm_layers = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr0, pool_mgr0); // Create the memory manager - auto mm_transitions = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr1, pool_mgr1); // Create the memory manager + auto lifetime_mgr0 = std::make_shared<BlobLifetimeManager>(); // Create lifetime manager + auto lifetime_mgr1 = std::make_shared<BlobLifetimeManager>(); // Create lifetime manager + auto pool_mgr0 = std::make_shared<PoolManager>(); // Create pool manager + auto pool_mgr1 = std::make_shared<PoolManager>(); // Create pool manager + auto mm_layers = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr0, pool_mgr0); // Create the memory manager + auto mm_transitions = + std::make_shared<MemoryManagerOnDemand>(lifetime_mgr1, pool_mgr1); // Create the memory manager // The weights and biases tensors should be initialized with the values inferred with the training @@ -116,7 +117,8 @@ public: // Initialize tensor of fc0 constexpr unsigned int num_labels = 128; - const TensorShape weights_shape_fc0(out_shape_pool1.x() * out_shape_pool1.y() * out_shape_pool1.z(), num_labels); + const TensorShape weights_shape_fc0(out_shape_pool1.x() * out_shape_pool1.y() * out_shape_pool1.z(), + num_labels); const TensorShape biases_shape_fc0(num_labels); const TensorShape out_shape_fc0(num_labels); @@ -138,22 +140,28 @@ public: /* [Configure functions] */ // in:32x32x1: 5x5 convolution, 8 output features maps (OFM) - conv0->configure(&src, &weights0, &biases0, &out_conv0, PadStrideInfo(1 /* stride_x */, 1 /* stride_y */, 2 /* pad_x */, 2 /* pad_y */)); + conv0->configure(&src, &weights0, &biases0, &out_conv0, + PadStrideInfo(1 /* stride_x */, 1 /* stride_y */, 2 /* pad_x */, 2 /* pad_y */)); // in:32x32x8, out:32x32x8, Activation function: relu act0.configure(&out_conv0, &out_act0, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); // in:32x32x8, out:16x16x8 (2x2 pooling), Pool type function: Max - pool0.configure(&out_act0, &out_pool0, PoolingLayerInfo(PoolingType::MAX, 2, data_layout, PadStrideInfo(2 /* stride_x */, 2 /* stride_y */))); + pool0.configure( + &out_act0, &out_pool0, + PoolingLayerInfo(PoolingType::MAX, 2, data_layout, PadStrideInfo(2 /* stride_x */, 2 /* stride_y */))); // in:16x16x8: 3x3 convolution, 16 output features maps (OFM) - conv1->configure(&out_pool0, &weights1, &biases1, &out_conv1, PadStrideInfo(1 /* stride_x */, 1 /* stride_y */, 1 /* pad_x */, 1 /* pad_y */)); + conv1->configure(&out_pool0, &weights1, &biases1, &out_conv1, + PadStrideInfo(1 /* stride_x */, 1 /* 
stride_y */, 1 /* pad_x */, 1 /* pad_y */)); // in:16x16x16, out:16x16x16, Activation function: relu act1.configure(&out_conv1, &out_act1, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); // in:16x16x16, out:8x8x16 (2x2 pooling), Pool type function: Average - pool1.configure(&out_act1, &out_pool1, PoolingLayerInfo(PoolingType::AVG, 2, data_layout, PadStrideInfo(2 /* stride_x */, 2 /* stride_y */))); + pool1.configure( + &out_act1, &out_pool1, + PoolingLayerInfo(PoolingType::AVG, 2, data_layout, PadStrideInfo(2 /* stride_x */, 2 /* stride_y */))); // in:8x8x16, out:128 fc0->configure(&out_pool1, &weights2, &biases2, &out_fc0); diff --git a/examples/neon_copy_objects.cpp b/examples/neon_copy_objects.cpp index b060b09759b36673f9e0c4890c03739918a11446..6e9ebcaad5f65644ba41f466808dc1e5a66e03de 100644 --- a/examples/neon_copy_objects.cpp +++ b/examples/neon_copy_objects.cpp @@ -22,9 +22,9 @@ * SOFTWARE. */ +#include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/NEFunctions.h" -#include "arm_compute/core/Types.h" #include "utils/Utils.h" #include @@ -50,11 +50,11 @@ public: dst_data = new float[width * height * batch]; // Fill src_data with pseudo(meaningless) values: - for(unsigned int b = 0; b < batch; b++) + for (unsigned int b = 0; b < batch; b++) { - for(unsigned int h = 0; h < height; h++) + for (unsigned int h = 0; h < height; h++) { - for(unsigned int w = 0; w < width; w++) + for (unsigned int w = 0; w < width; w++) { src_data[b * (width * height) + h * width + w] = static_cast(100 * b + 10 * h + w); } @@ -78,9 +78,12 @@ public: Window input_window; input_window.use_tensor_dimensions(input.info()->tensor_shape()); std::cout << " Dimensions of the input's iterator:\n"; - std::cout << " X = [start=" << input_window.x().start() << ", end=" << input_window.x().end() << ", step=" << input_window.x().step() << "]\n"; - std::cout << " Y = [start=" << input_window.y().start() << ", end=" << input_window.y().end() << ", step=" << input_window.y().step() << "]\n"; - std::cout << " Z = [start=" << input_window.z().start() << ", end=" << input_window.z().end() << ", step=" << input_window.z().step() << "]\n"; + std::cout << " X = [start=" << input_window.x().start() << ", end=" << input_window.x().end() + << ", step=" << input_window.x().step() << "]\n"; + std::cout << " Y = [start=" << input_window.y().start() << ", end=" << input_window.y().end() + << ", step=" << input_window.y().step() << "]\n"; + std::cout << " Z = [start=" << input_window.z().start() << ", end=" << input_window.z().end() + << ", step=" << input_window.z().step() << "]\n"; // Create an iterator: Iterator input_it(&input, input_window); @@ -98,20 +101,28 @@ public: // } // } // Except it works for an arbitrary number of dimensions - execute_window_loop(input_window, [&](const Coordinates & id) - { - std::cout << "Setting item [" << id.x() << "," << id.y() << "," << id.z() << "]\n"; - *reinterpret_cast(input_it.ptr()) = src_data[id.z() * (width * height) + id.y() * width + id.x()]; - }, - input_it); + execute_window_loop( + input_window, + [&](const Coordinates &id) + { + std::cout << "Setting item [" << id.x() << "," << id.y() << "," << id.z() << "]\n"; + *reinterpret_cast(input_it.ptr()) = + src_data[id.z() * (width * height) + id.y() * width + id.x()]; + }, + input_it); // More efficient way: create an iterator to iterate through each row (instead of each element) of the output tensor: Window output_window; - output_window.use_tensor_dimensions(output.info()->tensor_shape(), /* 
first_dimension =*/Window::DimY); // Iterate through the rows (not each element) + output_window.use_tensor_dimensions( + output.info()->tensor_shape(), + /* first_dimension =*/Window::DimY); // Iterate through the rows (not each element) std::cout << " Dimensions of the output's iterator:\n"; - std::cout << " X = [start=" << output_window.x().start() << ", end=" << output_window.x().end() << ", step=" << output_window.x().step() << "]\n"; - std::cout << " Y = [start=" << output_window.y().start() << ", end=" << output_window.y().end() << ", step=" << output_window.y().step() << "]\n"; - std::cout << " Z = [start=" << output_window.z().start() << ", end=" << output_window.z().end() << ", step=" << output_window.z().step() << "]\n"; + std::cout << " X = [start=" << output_window.x().start() << ", end=" << output_window.x().end() + << ", step=" << output_window.x().step() << "]\n"; + std::cout << " Y = [start=" << output_window.y().start() << ", end=" << output_window.y().end() + << ", step=" << output_window.y().step() << "]\n"; + std::cout << " Z = [start=" << output_window.z().start() << ", end=" << output_window.z().end() + << ", step=" << output_window.z().step() << "]\n"; // Create an iterator: Iterator output_it(&output, output_window); @@ -126,13 +137,15 @@ public: // } // } // Except it works for an arbitrary number of dimensions - execute_window_loop(output_window, [&](const Coordinates & id) - { - std::cout << "Copying one row starting from [" << id.x() << "," << id.y() << "," << id.z() << "]\n"; - // Copy one whole row: - memcpy(dst_data + id.z() * (width * height) + id.y() * width, output_it.ptr(), width * sizeof(float)); - }, - output_it); + execute_window_loop( + output_window, + [&](const Coordinates &id) + { + std::cout << "Copying one row starting from [" << id.x() << "," << id.y() << "," << id.z() << "]\n"; + // Copy one whole row: + memcpy(dst_data + id.z() * (width * height) + id.y() * width, output_it.ptr(), width * sizeof(float)); + }, + output_it); /** [Copy objects example] */ diff --git a/examples/neon_gemm_qasymm8.cpp b/examples/neon_gemm_qasymm8.cpp index f6f5dc102656a2927fcf3cfb04106f035f4185ed..3aaad02f8a0dcfc014aedbfe23d4c0b6a466a483 100644 --- a/examples/neon_gemm_qasymm8.cpp +++ b/examples/neon_gemm_qasymm8.cpp @@ -22,10 +22,11 @@ * SOFTWARE. 
*/ #include "arm_compute/core/Types.h" -#include "arm_compute/core/WindowIterator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/WindowIterator.h" #include "arm_compute/runtime/NEON/NEFunctions.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "support/ToolchainSupport.h" #include "utils/Utils.h" @@ -38,7 +39,7 @@ using namespace utils; void find_min_max(int size, const float *data, float *min, float *max) { *min = *max = data[0]; - for(int i = 0; i < size; i++) + for (int i = 0; i < size; i++) { const float val = data[i]; *min = std::min(*min, val); @@ -66,11 +67,11 @@ QuantizationInfo choose_quantization_params(float min, float max) // But we need to nudge the zero_point to an integer (exact quantized value) std::uint8_t zero_point_nudged = 0; - if(zero_point_real < qmin) + if (zero_point_real < qmin) { zero_point_nudged = qmin; } - else if(zero_point_real > qmax) + else if (zero_point_real > qmax) { zero_point_nudged = qmax; } @@ -85,7 +86,7 @@ QuantizationInfo choose_quantization_params(float min, float max) void quantize_values(int size, qasymm8_t *output, float *input, const QuantizationInfo qinfo) { - for(int i = 0; i < size; i++) + for (int i = 0; i < size; i++) { output[i] = quantize_qasymm8(input[i], qinfo); } @@ -108,7 +109,7 @@ int main(int argc, char **argv) bool default_input = true; // Parse args - if(argc < 3) /* case default matrix sizes */ + if (argc < 3) /* case default matrix sizes */ { // Print help std::cout << "Usage: ./build/neon_gemm_qasymm8 M N K\n"; @@ -144,23 +145,23 @@ int main(int argc, char **argv) // Fill in: one is the identity matrix, other is sequential values // src1: Identity matrix - for(size_t i = 0; i < M * K; i++) + for (size_t i = 0; i < M * K; i++) { src1_ptr[i] = 0; } - for(size_t i = 0; i < M; i++) + for (size_t i = 0; i < M; i++) { src1_ptr[i * K + i] = 1.0f; } // src2: Sequential values matrix - for(size_t i = 0; i < K * N; i++) + for (size_t i = 0; i < K * N; i++) { src2_ptr[i] = i * 1.123f; } // Otherwise if M, N, K is given, fill in with random values - if(!default_input) + if (!default_input) { fill_random_tensor(src1, 0.f, 1.f); fill_random_tensor(src2, 0.f, 1.f); @@ -223,7 +224,7 @@ int main(int argc, char **argv) NEGEMMLowpOutputStage gemmlowp_output_stage; int output_multiplier; int output_shift; - float multiplier = (src1_qinfo.uniform().scale * src2_qinfo.uniform().scale) / dst0_qinfo.uniform().scale; + float multiplier = (src1_qinfo.uniform().scale * src2_qinfo.uniform().scale) / dst0_qinfo.uniform().scale; quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift); std::cout << "(q_multiplier, q_shift) = (" << output_multiplier << ", " << output_shift << ")\n\n"; diff --git a/examples/neon_permute.cpp b/examples/neon_permute.cpp index 49848de4ea1eba1375aa114ebb712bd273a514a2..76ba0794306978a342c4720bbe5151e82ba35a7e 100644 --- a/examples/neon_permute.cpp +++ b/examples/neon_permute.cpp @@ -21,9 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ +#include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/NEFunctions.h" -#include "arm_compute/core/Types.h" #include "utils/Utils.h" using namespace arm_compute; @@ -85,11 +85,13 @@ private: window.use_tensor_dimensions(reference.info()->tensor_shape()); Iterator ref_it(&reference, window); Iterator res_it(&result, window); - execute_window_loop(window, [&](const Coordinates &) - { - assert(*reinterpret_cast(ref_it.ptr()) == *reinterpret_cast(res_it.ptr())); - }, - ref_it, res_it); + execute_window_loop( + window, + [&](const Coordinates &) { + assert(*reinterpret_cast(ref_it.ptr()) == + *reinterpret_cast(res_it.ptr())); + }, + ref_it, res_it); } void fill_tensor(Tensor &tensor) @@ -98,11 +100,9 @@ private: window.use_tensor_dimensions(tensor.info()->tensor_shape()); Iterator tensor_it(&tensor, window); unsigned char val(0); - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast(tensor_it.ptr()) = val++; - }, - tensor_it); + execute_window_loop( + window, [&](const Coordinates &) { *reinterpret_cast(tensor_it.ptr()) = val++; }, + tensor_it); } void init_tensor(const TensorShape shape, Tensor &tensor, DataType type, DataLayout layout) { diff --git a/examples/neon_scale.cpp b/examples/neon_scale.cpp index f120ea7f9678418c2c71bd90cd3c600d7ef6aa94..28590bd8616b270b08708aa95fa4b5e81ef465be 100644 --- a/examples/neon_scale.cpp +++ b/examples/neon_scale.cpp @@ -21,9 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/NEFunctions.h" -#include "arm_compute/core/Types.h" #include "utils/ImageLoader.h" #include "utils/Utils.h" @@ -37,7 +37,7 @@ public: { PPMLoader ppm; - if(argc < 2) + if (argc < 2) { // Print help std::cout << "Usage: ./build/neon_scale[input_image.ppm]\n\n"; @@ -60,20 +60,16 @@ public: dst.allocator()->init(dst_tensor_info); // Configure Scale function object: - scale.configure(&src, &dst, ScaleKernelInfo{ - InterpolationPolicy::NEAREST_NEIGHBOR, - BorderMode::UNDEFINED, - PixelValue(), - SamplingPolicy::CENTER, - false - }); + scale.configure(&src, &dst, + ScaleKernelInfo{InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED, PixelValue(), + SamplingPolicy::CENTER, false}); // Allocate all the images src.allocator()->allocate(); dst.allocator()->allocate(); // Fill the input image with the content of the PPM image if a filename was provided: - if(ppm.is_open()) + if (ppm.is_open()) { ppm.fill_image(src); output_filename = std::string(argv[1]) + "_out.ppm"; @@ -89,7 +85,7 @@ public: void do_teardown() override { // Save the result to file: - if(!output_filename.empty()) + if (!output_filename.empty()) { save_to_ppm(dst, output_filename); // save_to_ppm maps and unmaps the image to store as PPM } diff --git a/examples/neon_sgemm.cpp b/examples/neon_sgemm.cpp index 07696bd622779ebf340c38623943bf58a844d8cd..8cda65a40094457257d61bf2fa0554c91fc0a600 100644 --- a/examples/neon_sgemm.cpp +++ b/examples/neon_sgemm.cpp @@ -24,6 +24,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/NEFunctions.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "utils/Utils.h" #include @@ -43,15 +44,16 @@ public: beta = 0.0f; std::ifstream stream; - if(argc > 1) + if (argc > 1) { stream.open(argv[1], std::fstream::in); } - if(argc < 3 || (argc < 4 && stream.bad())) + if (argc < 3 || (argc < 4 && stream.bad())) { // Print help - std::cout << "Usage: 1) ./build/neon_sgemm input_matrix_1.npy 
input_matrix_2.npy [input_matrix_3.npy] [alpha = 1] [beta = 0]\n"; + std::cout << "Usage: 1) ./build/neon_sgemm input_matrix_1.npy input_matrix_2.npy [input_matrix_3.npy] " + "[alpha = 1] [beta = 0]\n"; std::cout << " 2) ./build/neon_sgemm M N K [alpha = 1.0f] [beta = 0.0f]\n\n"; std::cout << "Too few or no input_matrices provided. Using M=7, N=3, K=5, alpha=1.0f and beta=0.0f\n\n"; @@ -61,29 +63,29 @@ public: } else { - if(stream.good()) /* case file1.npy file2.npy [file3.npy] [alpha = 1.0f] [beta = 0.0f] */ + if (stream.good()) /* case file1.npy file2.npy [file3.npy] [alpha = 1.0f] [beta = 0.0f] */ { npy0.open(argv[1]); npy0.init_tensor(src0, DataType::F32); npy1.open(argv[2]); npy1.init_tensor(src1, DataType::F32); - if(argc > 3) + if (argc > 3) { stream.close(); stream.clear(); stream.open(argv[3], std::fstream::in); - if(stream.good()) /* case with third file */ + if (stream.good()) /* case with third file */ { npy2.open(argv[3]); npy2.init_tensor(src2, DataType::F32); - if(argc > 4) + if (argc > 4) { // Convert string to float alpha = strtof(argv[4], nullptr); - if(argc > 5) + if (argc > 5) { // Convert string to float beta = strtof(argv[5], nullptr); @@ -94,7 +96,7 @@ public: { alpha = strtof(argv[3], nullptr); - if(argc > 4) + if (argc > 4) { beta = strtof(argv[4], nullptr); } @@ -111,11 +113,11 @@ public: src1.allocator()->init(TensorInfo(TensorShape(N, K), 1, DataType::F32)); src2.allocator()->init(TensorInfo(TensorShape(N, M), 1, DataType::F32)); - if(argc > 4) + if (argc > 4) { alpha = strtof(argv[4], nullptr); - if(argc > 5) + if (argc > 5) { beta = strtof(argv[5], nullptr); } @@ -134,7 +136,7 @@ public: dst.allocator()->allocate(); // Fill the input images with either the data provided or random data - if(npy0.is_open()) + if (npy0.is_open()) { npy0.fill_tensor(src0); npy1.fill_tensor(src1); @@ -142,7 +144,7 @@ public: output_filename = "sgemm_out.npy"; is_fortran = npy0.is_fortran(); - if(npy2.is_open()) + if (npy2.is_open()) { src2.allocator()->allocate(); npy2.fill_tensor(src2); @@ -169,7 +171,7 @@ public: } void do_teardown() override { - if(!output_filename.empty()) /* Save to .npy file */ + if (!output_filename.empty()) /* Save to .npy file */ { save_to_npy(dst, output_filename, is_fortran); } diff --git a/filelist.json b/filelist.json index 953b81de5a1c32631b346466af7574390bcfa477..5bca2419d61c03bad9927c825abc79376fc6b2a8 100644 --- a/filelist.json +++ b/filelist.json @@ -26,6 +26,7 @@ "src/core/Validate.cpp", "src/core/Version.cpp", "src/core/helpers/SoftmaxHelpers.cpp", + "src/core/helpers/Utils.cpp", "src/core/helpers/WindowHelpers.cpp", "src/core/utils/ActivationFunctionUtils.cpp", "src/core/utils/AssemblyUtils.cpp", @@ -33,6 +34,7 @@ "src/core/utils/DataLayoutUtils.cpp", "src/core/utils/InterpolationPolicyUtils.cpp", "src/core/utils/FormatUtils.cpp", + "src/core/utils/Math.cpp", "src/core/utils/ScaleUtils.cpp", "src/core/utils/StringUtils.cpp", "src/core/utils/helpers/fft.cpp", @@ -117,7 +119,10 @@ ], "gpu": { "common": [ + "src/core/CL/CLCommandBuffer.cpp", + "src/core/CL/CLCompatCommandBuffer.cpp", "src/core/CL/CLCompileContext.cpp", + "src/core/CL/CLMutableCommandBuffer.cpp", "src/core/CL/DefaultLWSHeuristics.cpp", "src/core/CL/CLHelpers.cpp", "src/core/CL/CLKernelLibrary.cpp", @@ -520,8 +525,10 @@ "files": { "common": [ "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp", + "src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp", "src/gpu/cl/kernels/ClMatMulNativeKernel.cpp", "src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp", + 
"src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp", "src/gpu/cl/operators/ClMatMul.cpp", "src/runtime/CL/functions/CLMatMul.cpp", "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp", @@ -970,9 +977,7 @@ ], "neon": { "common": [ - "src/cpu/kernels/fuse_batch_normalization/generic/impl.cpp", - "src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp", - "src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.cpp" + "src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp" ], "fp16": [ "src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp", @@ -1050,8 +1055,7 @@ "common": [ "src/cpu/operators/CpuCast.cpp", "src/cpu/kernels/CpuCastKernel.cpp", - "src/runtime/NEON/functions/NECast.cpp", - "src/cpu/kernels/cast/generic/neon/bfloat16.cpp" + "src/runtime/NEON/functions/NECast.cpp" ], "neon":{ "fp16":["src/cpu/kernels/cast/generic/neon/fp16.cpp"] @@ -1146,10 +1150,16 @@ "src/core/NEON/kernels/convolution/winograd/weight_transforms/cpp_fp32_1x4_1x5.cpp", "src/core/NEON/kernels/convolution/winograd/weight_transforms/cpp_fp32_1x6_1x3.cpp", "src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp", + "src/cpu/kernels/directconv2d/nhwc/neon/qasymm8.cpp", "src/cpu/kernels/directconv2d/nchw/all.cpp" + ], "fp32": [ "src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp" + ], + "fp16": [ + "src/cpu/kernels/directconv2d/nchw/fp16.cpp", + "src/cpu/kernels/directconv2d/nhwc/neon/fp16.cpp" ] }, "sve": { @@ -1178,7 +1188,6 @@ "src/runtime/NEON/functions/NECropResize.cpp" ], "neon": { - "common": [ "src/cpu/kernels/crop/generic/neon/impl.cpp" ], "fp32": [ "src/cpu/kernels/crop/generic/neon/fp32.cpp" ], "fp16": [ "src/cpu/kernels/crop/generic/neon/fp16.cpp" ], "integer": [ "src/cpu/kernels/crop/generic/neon/integer.cpp" ] @@ -1445,7 +1454,6 @@ "src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp" ], "neon": { - "common":["src/cpu/kernels/elementwise_unary/generic/neon/impl.cpp"], "integer": ["src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp"], "fp32": ["src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp"], "fp16": ["src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp"], @@ -1677,6 +1685,7 @@ "common": [ "src/core/NEON/kernels/arm_gemm/interleave_indirect-sve.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp16fp32fp16_dot_16VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp", @@ -1791,7 +1800,6 @@ "src/runtime/NEON/functions/NEL2NormalizeLayer.cpp" ], "neon":{ - "common":["src/cpu/kernels/l2normlayer/generic/neon/impl.cpp"], "fp32":["src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp"], "fp16":["src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp"] } @@ -1855,8 +1863,7 @@ "fp32":["src/cpu/kernels/maxunpool/generic/neon/fp32.cpp"], "fp16":["src/cpu/kernels/maxunpool/generic/neon/fp16.cpp"], "qasymm8":["src/cpu/kernels/maxunpool/generic/neon/qasymm8.cpp"], - "qasymm8_signed":[ "src/cpu/kernels/maxunpool/generic/neon/qasymm8_signed.cpp"], - "common":["src/cpu/kernels/maxunpool/generic/neon/impl.cpp"] + "qasymm8_signed":[ "src/cpu/kernels/maxunpool/generic/neon/qasymm8_signed.cpp"] } } }, @@ -2017,7 +2024,6 @@ "src/runtime/NEON/functions/NEPooling3dLayer.cpp" ], "neon": { - "common": [ "src/cpu/kernels/pool3d/neon/impl.cpp" ], "fp16": [ "src/cpu/kernels/pool3d/neon/fp16.cpp" ], "fp32": [ 
"src/cpu/kernels/pool3d/neon/fp32.cpp" ], "qasymm8": [ "src/cpu/kernels/pool3d/neon/qasymm8.cpp" ], @@ -2057,7 +2063,6 @@ "src/runtime/NEON/functions/NERange.cpp" ], "neon": { - "common": [ "src/cpu/kernels/range/generic/neon/impl.cpp" ], "fp32": [ "src/cpu/kernels/range/generic/neon/fp32.cpp" ], "fp16": [ "src/cpu/kernels/range/generic/neon/fp16.cpp" ], "integer": [ "src/cpu/kernels/range/generic/neon/integer.cpp" ] @@ -2119,7 +2124,6 @@ "src/runtime/NEON/functions/NEROIAlignLayer.cpp" ], "neon":{ - "common":["src/cpu/kernels/roialign/generic/neon/impl.cpp"], "fp32":["src/cpu/kernels/roialign/generic/neon/fp32.cpp"], "fp16":["src/cpu/kernels/roialign/generic/neon/fp16.cpp"], "qasymm8":["src/cpu/kernels/roialign/generic/neon/qasymm8.cpp"], @@ -2164,7 +2168,6 @@ "src/runtime/NEON/functions/NESelect.cpp" ], "neon": { - "common": [ "src/cpu/kernels/select/generic/neon/impl.cpp" ], "fp32": [ "src/cpu/kernels/select/generic/neon/fp32.cpp" ], "fp16": [ "src/cpu/kernels/select/generic/neon/fp16.cpp" ], "integer": [ "src/cpu/kernels/select/generic/neon/integer.cpp" ] @@ -2253,13 +2256,17 @@ "common": [ "src/cpu/operators/CpuSub.cpp", "src/cpu/kernels/CpuSubKernel.cpp", - "src/runtime/NEON/functions/NEArithmeticSubtraction.cpp", - "src/cpu/kernels/sub/neon/qasymm8.cpp", - "src/cpu/kernels/sub/neon/qasymm8_signed.cpp", - "src/cpu/kernels/sub/neon/qsymm16.cpp" - ] + "src/runtime/NEON/functions/NEArithmeticSubtraction.cpp" + ], + "neon": { + "fp16":["src/cpu/kernels/sub/neon/fp16.cpp"], + "qasymm8": ["src/cpu/kernels/sub/neon/qasymm8.cpp"], + "qasymm8_signed": ["src/cpu/kernels/sub/neon/qasymm8_signed.cpp"], + "qsymm16": ["src/cpu/kernels/sub/neon/qsymm16.cpp"] + } } }, + "Tile": { "files": { "common": [ @@ -2294,6 +2301,7 @@ "src/dynamic_fusion/sketch/attributes/ClampAttributes.cpp", "src/dynamic_fusion/sketch/attributes/Conv2dAttributes.cpp", "src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp", + "src/dynamic_fusion/sketch/attributes/MatMulAttributes.cpp", "src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp", "src/dynamic_fusion/sketch/attributes/ResizeAttributes.cpp", "src/dynamic_fusion/sketch/attributes/SoftmaxAttributes.cpp", @@ -2310,6 +2318,7 @@ "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp", "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp", "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp", + "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp", "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp", "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp", "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp", @@ -2322,6 +2331,7 @@ "src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp", "src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp", "src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp", + "src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp", "src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp", "src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp", "src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp", @@ -2353,8 +2363,12 @@ "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp", "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp", "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp", + "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp", "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp", 
"src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp", + "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp", + "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp", + "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp", "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp", "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp", "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp", diff --git a/include/CL/cl.h b/include/CL/cl.h index 0018a0f420a31b0556c8129e24d54028877b5686..c2cdb4008d58fefc9a474aadddd5b8672a57bbd8 100644 --- a/include/CL/cl.h +++ b/include/CL/cl.h @@ -141,6 +141,10 @@ typedef struct _cl_image_desc { #pragma warning( push ) #pragma warning( disable : 4201 ) /* Prevents warning about nameless struct/union in /W4 builds */ #endif +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wc11-extensions" /* Prevents warning about nameless union being C11 extension*/ +#endif #if defined(_MSC_VER) && defined(__STDC__) /* Anonymous unions are not supported in /Za builds */ #else @@ -158,6 +162,9 @@ typedef struct _cl_image_desc { #if defined(_MSC_VER) && !defined(__STDC__) #pragma warning( pop ) #endif +#ifdef __clang__ +#pragma clang diagnostic pop +#endif #endif } cl_image_desc; @@ -942,6 +949,13 @@ typedef struct _cl_name_version { /********************************************************************************************************/ +/* CL_NO_PROTOTYPES implies CL_NO_CORE_PROTOTYPES: */ +#if defined(CL_NO_PROTOTYPES) && !defined(CL_NO_CORE_PROTOTYPES) +#define CL_NO_CORE_PROTOTYPES +#endif + +#if !defined(CL_NO_CORE_PROTOTYPES) + /* Platform API */ extern CL_API_ENTRY cl_int CL_API_CALL clGetPlatformIDs(cl_uint num_entries, @@ -1922,6 +1936,8 @@ clEnqueueTask(cl_command_queue command_queue, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_1_2_DEPRECATED; +#endif /* !defined(CL_NO_CORE_PROTOTYPES) */ + #ifdef __cplusplus } #endif diff --git a/include/CL/cl_d3d10.h b/include/CL/cl_d3d10.h index 6adedb061633cc378310a86f2cf552226cd1bd23..3d8fe131281cd2e9094719dce2497a57acce6607 100644 --- a/include/CL/cl_d3d10.h +++ b/include/CL/cl_d3d10.h @@ -1,5 +1,5 @@ /******************************************************************************* - * Copyright (c) 2008-2020 The Khronos Group Inc. + * Copyright (c) 2008-2023 The Khronos Group Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,13 +14,18 @@ * limitations under the License. ******************************************************************************/ -#ifndef __OPENCL_CL_D3D10_H -#define __OPENCL_CL_D3D10_H +#ifndef OPENCL_CL_D3D10_H_ +#define OPENCL_CL_D3D10_H_ + +/* +** This header is generated from the Khronos OpenCL XML API Registry. 
+*/ #if defined(_MSC_VER) #if _MSC_VER >=1500 #pragma warning( push ) #pragma warning( disable : 4201 ) +#pragma warning( disable : 5105 ) #endif #endif #include @@ -29,100 +34,208 @@ #pragma warning( pop ) #endif #endif + #include -#include + +/* CL_NO_PROTOTYPES implies CL_NO_EXTENSION_PROTOTYPES: */ +#if defined(CL_NO_PROTOTYPES) && !defined(CL_NO_EXTENSION_PROTOTYPES) +#define CL_NO_EXTENSION_PROTOTYPES +#endif + +/* CL_NO_EXTENSION_PROTOTYPES implies + CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES and + CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES: */ +#if defined(CL_NO_EXTENSION_PROTOTYPES) && \ + !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) +#define CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES +#endif +#if defined(CL_NO_EXTENSION_PROTOTYPES) && \ + !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) +#define CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES +#endif #ifdef __cplusplus extern "C" { #endif -/****************************************************************************** - * cl_khr_d3d10_sharing */ +/*************************************************************** +* cl_khr_d3d10_sharing +***************************************************************/ #define cl_khr_d3d10_sharing 1 +#define CL_KHR_D3D10_SHARING_EXTENSION_NAME \ + "cl_khr_d3d10_sharing" -typedef cl_uint cl_d3d10_device_source_khr; -typedef cl_uint cl_d3d10_device_set_khr; +typedef cl_uint cl_d3d10_device_source_khr; +typedef cl_uint cl_d3d10_device_set_khr; -/******************************************************************************/ +/* Error codes */ +#define CL_INVALID_D3D10_DEVICE_KHR -1002 +#define CL_INVALID_D3D10_RESOURCE_KHR -1003 +#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR -1004 +#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR -1005 -/* Error Codes */ -#define CL_INVALID_D3D10_DEVICE_KHR -1002 -#define CL_INVALID_D3D10_RESOURCE_KHR -1003 -#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR -1004 -#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR -1005 +/* cl_d3d10_device_source_khr */ +#define CL_D3D10_DEVICE_KHR 0x4010 +#define CL_D3D10_DXGI_ADAPTER_KHR 0x4011 -/* cl_d3d10_device_source_nv */ -#define CL_D3D10_DEVICE_KHR 0x4010 -#define CL_D3D10_DXGI_ADAPTER_KHR 0x4011 - -/* cl_d3d10_device_set_nv */ -#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR 0x4012 -#define CL_ALL_DEVICES_FOR_D3D10_KHR 0x4013 +/* cl_d3d10_device_set_khr */ +#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR 0x4012 +#define CL_ALL_DEVICES_FOR_D3D10_KHR 0x4013 /* cl_context_info */ -#define CL_CONTEXT_D3D10_DEVICE_KHR 0x4014 -#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C +#define CL_CONTEXT_D3D10_DEVICE_KHR 0x4014 +#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C /* cl_mem_info */ -#define CL_MEM_D3D10_RESOURCE_KHR 0x4015 +#define CL_MEM_D3D10_RESOURCE_KHR 0x4015 /* cl_image_info */ -#define CL_IMAGE_D3D10_SUBRESOURCE_KHR 0x4016 +#define CL_IMAGE_D3D10_SUBRESOURCE_KHR 0x4016 /* cl_command_type */ -#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR 0x4017 -#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR 0x4018 +#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR 0x4017 +#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR 0x4018 -/******************************************************************************/ -typedef cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)( - cl_platform_id platform, +typedef cl_int (CL_API_CALL * +clGetDeviceIDsFromD3D10KHR_fn)( + cl_platform_id platform, cl_d3d10_device_source_khr d3d_device_source, - void * d3d_object, - cl_d3d10_device_set_khr d3d_device_set, - cl_uint num_entries, - cl_device_id * devices, - cl_uint * 
num_devices) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)( - cl_context context, - cl_mem_flags flags, - ID3D10Buffer * resource, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)( - cl_context context, - cl_mem_flags flags, - ID3D10Texture2D * resource, - UINT subresource, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)( - cl_context context, - cl_mem_flags flags, - ID3D10Texture3D * resource, - UINT subresource, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)( + void* d3d_object, + cl_d3d10_device_set_khr d3d_device_set, + cl_uint num_entries, + cl_device_id* devices, + cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_mem (CL_API_CALL * +clCreateFromD3D10BufferKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D10Buffer* resource, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_mem (CL_API_CALL * +clCreateFromD3D10Texture2DKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D10Texture2D* resource, + UINT subresource, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_mem (CL_API_CALL * +clCreateFromD3D10Texture3DKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D10Texture3D* resource, + UINT subresource, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int (CL_API_CALL * +clEnqueueAcquireD3D10ObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int (CL_API_CALL * +clEnqueueReleaseD3D10ObjectsKHR_fn)( cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; + cl_uint num_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_0; + +#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) -typedef cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)( +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceIDsFromD3D10KHR( + cl_platform_id platform, + cl_d3d10_device_source_khr d3d_device_source, + void* d3d_object, + cl_d3d10_device_set_khr d3d_device_set, + cl_uint num_entries, + cl_device_id* devices, + cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromD3D10BufferKHR( + cl_context context, + cl_mem_flags flags, + ID3D10Buffer* resource, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromD3D10Texture2DKHR( + cl_context context, + cl_mem_flags flags, + ID3D10Texture2D* resource, + UINT subresource, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromD3D10Texture3DKHR( + cl_context context, + cl_mem_flags flags, + ID3D10Texture3D* resource, + UINT subresource, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireD3D10ObjectsKHR( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_0; + 
+extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseD3D10ObjectsKHR( cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; + cl_uint num_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_0; + +#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ + +/*************************************************************** +* cl_intel_sharing_format_query_d3d10 +***************************************************************/ +#define cl_intel_sharing_format_query_d3d10 1 +#define CL_INTEL_SHARING_FORMAT_QUERY_D3D10_EXTENSION_NAME \ + "cl_intel_sharing_format_query_d3d10" + +/* when cl_khr_d3d10_sharing is supported */ + +typedef cl_int (CL_API_CALL * +clGetSupportedD3D10TextureFormatsINTEL_fn)( + cl_context context, + cl_mem_flags flags, + cl_mem_object_type image_type, + cl_uint num_entries, + DXGI_FORMAT* d3d10_formats, + cl_uint* num_texture_formats) ; + +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSupportedD3D10TextureFormatsINTEL( + cl_context context, + cl_mem_flags flags, + cl_mem_object_type image_type, + cl_uint num_entries, + DXGI_FORMAT* d3d10_formats, + cl_uint* num_texture_formats) ; + +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ #ifdef __cplusplus } #endif -#endif /* __OPENCL_CL_D3D10_H */ - +#endif /* OPENCL_CL_D3D10_H_ */ diff --git a/include/CL/cl_d3d11.h b/include/CL/cl_d3d11.h index 50ed906b3bb23d49ba288a9510b3b4860bbd8f78..6a6af21dc78fd87c849a55a89dd945aaa966bf34 100644 --- a/include/CL/cl_d3d11.h +++ b/include/CL/cl_d3d11.h @@ -1,5 +1,5 @@ /******************************************************************************* - * Copyright (c) 2008-2020 The Khronos Group Inc. + * Copyright (c) 2008-2023 The Khronos Group Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,13 +14,18 @@ * limitations under the License. ******************************************************************************/ -#ifndef __OPENCL_CL_D3D11_H -#define __OPENCL_CL_D3D11_H +#ifndef OPENCL_CL_D3D11_H_ +#define OPENCL_CL_D3D11_H_ + +/* +** This header is generated from the Khronos OpenCL XML API Registry. 
+*/ #if defined(_MSC_VER) #if _MSC_VER >=1500 #pragma warning( push ) #pragma warning( disable : 4201 ) +#pragma warning( disable : 5105 ) #endif #endif #include @@ -29,100 +34,210 @@ #pragma warning( pop ) #endif #endif + #include -#include + +/* CL_NO_PROTOTYPES implies CL_NO_EXTENSION_PROTOTYPES: */ +#if defined(CL_NO_PROTOTYPES) && !defined(CL_NO_EXTENSION_PROTOTYPES) +#define CL_NO_EXTENSION_PROTOTYPES +#endif + +/* CL_NO_EXTENSION_PROTOTYPES implies + CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES and + CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES: */ +#if defined(CL_NO_EXTENSION_PROTOTYPES) && \ + !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) +#define CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES +#endif +#if defined(CL_NO_EXTENSION_PROTOTYPES) && \ + !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) +#define CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES +#endif #ifdef __cplusplus extern "C" { #endif -/****************************************************************************** - * cl_khr_d3d11_sharing */ +/*************************************************************** +* cl_khr_d3d11_sharing +***************************************************************/ #define cl_khr_d3d11_sharing 1 +#define CL_KHR_D3D11_SHARING_EXTENSION_NAME \ + "cl_khr_d3d11_sharing" -typedef cl_uint cl_d3d11_device_source_khr; -typedef cl_uint cl_d3d11_device_set_khr; +typedef cl_uint cl_d3d11_device_source_khr; +typedef cl_uint cl_d3d11_device_set_khr; -/******************************************************************************/ +/* Error codes */ +#define CL_INVALID_D3D11_DEVICE_KHR -1006 +#define CL_INVALID_D3D11_RESOURCE_KHR -1007 +#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR -1008 +#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR -1009 -/* Error Codes */ -#define CL_INVALID_D3D11_DEVICE_KHR -1006 -#define CL_INVALID_D3D11_RESOURCE_KHR -1007 -#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR -1008 -#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR -1009 +/* cl_d3d11_device_source_khr */ +#define CL_D3D11_DEVICE_KHR 0x4019 +#define CL_D3D11_DXGI_ADAPTER_KHR 0x401A -/* cl_d3d11_device_source */ -#define CL_D3D11_DEVICE_KHR 0x4019 -#define CL_D3D11_DXGI_ADAPTER_KHR 0x401A - -/* cl_d3d11_device_set */ -#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR 0x401B -#define CL_ALL_DEVICES_FOR_D3D11_KHR 0x401C +/* cl_d3d11_device_set_khr */ +#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR 0x401B +#define CL_ALL_DEVICES_FOR_D3D11_KHR 0x401C /* cl_context_info */ -#define CL_CONTEXT_D3D11_DEVICE_KHR 0x401D -#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D +#define CL_CONTEXT_D3D11_DEVICE_KHR 0x401D +#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D /* cl_mem_info */ -#define CL_MEM_D3D11_RESOURCE_KHR 0x401E +#define CL_MEM_D3D11_RESOURCE_KHR 0x401E /* cl_image_info */ -#define CL_IMAGE_D3D11_SUBRESOURCE_KHR 0x401F +#define CL_IMAGE_D3D11_SUBRESOURCE_KHR 0x401F /* cl_command_type */ -#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR 0x4020 -#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR 0x4021 +#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR 0x4020 +#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR 0x4021 -/******************************************************************************/ -typedef cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)( - cl_platform_id platform, +typedef cl_int (CL_API_CALL * +clGetDeviceIDsFromD3D11KHR_fn)( + cl_platform_id platform, cl_d3d11_device_source_khr d3d_device_source, - void * d3d_object, - cl_d3d11_device_set_khr d3d_device_set, - cl_uint num_entries, - cl_device_id * devices, - cl_uint * 
num_devices) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)( - cl_context context, - cl_mem_flags flags, - ID3D11Buffer * resource, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)( - cl_context context, - cl_mem_flags flags, - ID3D11Texture2D * resource, - UINT subresource, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)( - cl_context context, - cl_mem_flags flags, - ID3D11Texture3D * resource, - UINT subresource, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)( + void* d3d_object, + cl_d3d11_device_set_khr d3d_device_set, + cl_uint num_entries, + cl_device_id* devices, + cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_mem (CL_API_CALL * +clCreateFromD3D11BufferKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D11Buffer* resource, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_mem (CL_API_CALL * +clCreateFromD3D11Texture2DKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D11Texture2D* resource, + UINT subresource, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_mem (CL_API_CALL * +clCreateFromD3D11Texture3DKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D11Texture3D* resource, + UINT subresource, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_int (CL_API_CALL * +clEnqueueAcquireD3D11ObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_int (CL_API_CALL * +clEnqueueReleaseD3D11ObjectsKHR_fn)( cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; + cl_uint num_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; + +#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) -typedef cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)( +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceIDsFromD3D11KHR( + cl_platform_id platform, + cl_d3d11_device_source_khr d3d_device_source, + void* d3d_object, + cl_d3d11_device_set_khr d3d_device_set, + cl_uint num_entries, + cl_device_id* devices, + cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromD3D11BufferKHR( + cl_context context, + cl_mem_flags flags, + ID3D11Buffer* resource, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromD3D11Texture2DKHR( + cl_context context, + cl_mem_flags flags, + ID3D11Texture2D* resource, + UINT subresource, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromD3D11Texture3DKHR( + cl_context context, + cl_mem_flags flags, + ID3D11Texture3D* resource, + UINT subresource, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireD3D11ObjectsKHR( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; + 
+extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseD3D11ObjectsKHR( cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; + cl_uint num_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; + +#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ + +/*************************************************************** +* cl_intel_sharing_format_query_d3d11 +***************************************************************/ +#define cl_intel_sharing_format_query_d3d11 1 +#define CL_INTEL_SHARING_FORMAT_QUERY_D3D11_EXTENSION_NAME \ + "cl_intel_sharing_format_query_d3d11" + +/* when cl_khr_d3d11_sharing is supported */ + +typedef cl_int (CL_API_CALL * +clGetSupportedD3D11TextureFormatsINTEL_fn)( + cl_context context, + cl_mem_flags flags, + cl_mem_object_type image_type, + cl_uint plane, + cl_uint num_entries, + DXGI_FORMAT* d3d11_formats, + cl_uint* num_texture_formats) ; + +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSupportedD3D11TextureFormatsINTEL( + cl_context context, + cl_mem_flags flags, + cl_mem_object_type image_type, + cl_uint plane, + cl_uint num_entries, + DXGI_FORMAT* d3d11_formats, + cl_uint* num_texture_formats) ; + +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ #ifdef __cplusplus } #endif -#endif /* __OPENCL_CL_D3D11_H */ - +#endif /* OPENCL_CL_D3D11_H_ */ diff --git a/include/CL/cl_dx9_media_sharing.h b/include/CL/cl_dx9_media_sharing.h index b0d2b23401c970ffebb5ed008f2729321b44d172..e7f8c452e1551fbe4db398ccc9410269c8cfb4a2 100644 --- a/include/CL/cl_dx9_media_sharing.h +++ b/include/CL/cl_dx9_media_sharing.h @@ -1,5 +1,5 @@ /******************************************************************************* - * Copyright (c) 2008-2020 The Khronos Group Inc. + * Copyright (c) 2008-2023 The Khronos Group Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,216 +14,337 @@ * limitations under the License. ******************************************************************************/ -#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H -#define __OPENCL_CL_DX9_MEDIA_SHARING_H +#ifndef OPENCL_CL_DX9_MEDIA_SHARING_H_ +#define OPENCL_CL_DX9_MEDIA_SHARING_H_ + +/* +** This header is generated from the Khronos OpenCL XML API Registry. 
+*/ + +#if defined(_WIN32) +#if defined(_MSC_VER) +#if _MSC_VER >=1500 +#pragma warning( push ) +#pragma warning( disable : 4201 ) +#pragma warning( disable : 5105 ) +#endif +#endif +#include +#if defined(_MSC_VER) +#if _MSC_VER >=1500 +#pragma warning( pop ) +#endif +#endif +#endif #include -#include + +/* CL_NO_PROTOTYPES implies CL_NO_EXTENSION_PROTOTYPES: */ +#if defined(CL_NO_PROTOTYPES) && !defined(CL_NO_EXTENSION_PROTOTYPES) +#define CL_NO_EXTENSION_PROTOTYPES +#endif + +/* CL_NO_EXTENSION_PROTOTYPES implies + CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES and + CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES: */ +#if defined(CL_NO_EXTENSION_PROTOTYPES) && \ + !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) +#define CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES +#endif +#if defined(CL_NO_EXTENSION_PROTOTYPES) && \ + !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) +#define CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES +#endif #ifdef __cplusplus extern "C" { #endif -/******************************************************************************/ -/* cl_khr_dx9_media_sharing */ +/*************************************************************** +* cl_khr_dx9_media_sharing +***************************************************************/ #define cl_khr_dx9_media_sharing 1 +#define CL_KHR_DX9_MEDIA_SHARING_EXTENSION_NAME \ + "cl_khr_dx9_media_sharing" typedef cl_uint cl_dx9_media_adapter_type_khr; typedef cl_uint cl_dx9_media_adapter_set_khr; - + #if defined(_WIN32) -#include -typedef struct _cl_dx9_surface_info_khr -{ - IDirect3DSurface9 *resource; +typedef struct _cl_dx9_surface_info_khr { + IDirect3DSurface9* resource; HANDLE shared_handle; } cl_dx9_surface_info_khr; -#endif +#endif /* defined(_WIN32) */ -/******************************************************************************/ - -/* Error Codes */ -#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR -1010 -#define CL_INVALID_DX9_MEDIA_SURFACE_KHR -1011 -#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR -1012 -#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR -1013 +/* Error codes */ +#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR -1010 +#define CL_INVALID_DX9_MEDIA_SURFACE_KHR -1011 +#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR -1012 +#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR -1013 /* cl_media_adapter_type_khr */ -#define CL_ADAPTER_D3D9_KHR 0x2020 -#define CL_ADAPTER_D3D9EX_KHR 0x2021 -#define CL_ADAPTER_DXVA_KHR 0x2022 +#define CL_ADAPTER_D3D9_KHR 0x2020 +#define CL_ADAPTER_D3D9EX_KHR 0x2021 +#define CL_ADAPTER_DXVA_KHR 0x2022 /* cl_media_adapter_set_khr */ -#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2023 -#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2024 +#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2023 +#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2024 /* cl_context_info */ -#define CL_CONTEXT_ADAPTER_D3D9_KHR 0x2025 -#define CL_CONTEXT_ADAPTER_D3D9EX_KHR 0x2026 -#define CL_CONTEXT_ADAPTER_DXVA_KHR 0x2027 +#define CL_CONTEXT_ADAPTER_D3D9_KHR 0x2025 +#define CL_CONTEXT_ADAPTER_D3D9EX_KHR 0x2026 +#define CL_CONTEXT_ADAPTER_DXVA_KHR 0x2027 /* cl_mem_info */ -#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR 0x2028 -#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR 0x2029 +#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR 0x2028 +#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR 0x2029 /* cl_image_info */ -#define CL_IMAGE_DX9_MEDIA_PLANE_KHR 0x202A +#define CL_IMAGE_DX9_MEDIA_PLANE_KHR 0x202A /* cl_command_type */ -#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR 0x202B -#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR 0x202C - 
-/******************************************************************************/ - -typedef cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)( - cl_platform_id platform, - cl_uint num_media_adapters, - cl_dx9_media_adapter_type_khr * media_adapter_type, - void * media_adapters, - cl_dx9_media_adapter_set_khr media_adapter_set, - cl_uint num_entries, - cl_device_id * devices, - cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)( - cl_context context, - cl_mem_flags flags, +#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR 0x202B +#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR 0x202C + + +typedef cl_int (CL_API_CALL * +clGetDeviceIDsFromDX9MediaAdapterKHR_fn)( + cl_platform_id platform, + cl_uint num_media_adapters, + cl_dx9_media_adapter_type_khr* media_adapter_type, + void* media_adapters, + cl_dx9_media_adapter_set_khr media_adapter_set, + cl_uint num_entries, + cl_device_id* devices, + cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_mem (CL_API_CALL * +clCreateFromDX9MediaSurfaceKHR_fn)( + cl_context context, + cl_mem_flags flags, + cl_dx9_media_adapter_type_khr adapter_type, + void* surface_info, + cl_uint plane, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_int (CL_API_CALL * +clEnqueueAcquireDX9MediaSurfacesKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_int (CL_API_CALL * +clEnqueueReleaseDX9MediaSurfacesKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; + +#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceIDsFromDX9MediaAdapterKHR( + cl_platform_id platform, + cl_uint num_media_adapters, + cl_dx9_media_adapter_type_khr* media_adapter_type, + void* media_adapters, + cl_dx9_media_adapter_set_khr media_adapter_set, + cl_uint num_entries, + cl_device_id* devices, + cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromDX9MediaSurfaceKHR( + cl_context context, + cl_mem_flags flags, cl_dx9_media_adapter_type_khr adapter_type, - void * surface_info, - cl_uint plane, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + void* surface_info, + cl_uint plane, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; -typedef cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)( +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireDX9MediaSurfacesKHR( cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; + cl_uint num_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; -typedef cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)( +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseDX9MediaSurfacesKHR( cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; + cl_uint num_objects, + const cl_mem* mem_objects, + cl_uint 
num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; -/*************************************** -* cl_intel_dx9_media_sharing extension * -****************************************/ +#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ +/*************************************************************** +* cl_intel_dx9_media_sharing +***************************************************************/ #define cl_intel_dx9_media_sharing 1 +#define CL_INTEL_DX9_MEDIA_SHARING_EXTENSION_NAME \ + "cl_intel_dx9_media_sharing" -typedef cl_uint cl_dx9_device_source_intel; -typedef cl_uint cl_dx9_device_set_intel; +typedef cl_uint cl_dx9_device_source_intel; +typedef cl_uint cl_dx9_device_set_intel; -/* error codes */ -#define CL_INVALID_DX9_DEVICE_INTEL -1010 -#define CL_INVALID_DX9_RESOURCE_INTEL -1011 -#define CL_DX9_RESOURCE_ALREADY_ACQUIRED_INTEL -1012 -#define CL_DX9_RESOURCE_NOT_ACQUIRED_INTEL -1013 +/* Error codes */ +#define CL_INVALID_DX9_DEVICE_INTEL -1010 +#define CL_INVALID_DX9_RESOURCE_INTEL -1011 +#define CL_DX9_RESOURCE_ALREADY_ACQUIRED_INTEL -1012 +#define CL_DX9_RESOURCE_NOT_ACQUIRED_INTEL -1013 /* cl_dx9_device_source_intel */ -#define CL_D3D9_DEVICE_INTEL 0x4022 -#define CL_D3D9EX_DEVICE_INTEL 0x4070 -#define CL_DXVA_DEVICE_INTEL 0x4071 +#define CL_D3D9_DEVICE_INTEL 0x4022 +#define CL_D3D9EX_DEVICE_INTEL 0x4070 +#define CL_DXVA_DEVICE_INTEL 0x4071 /* cl_dx9_device_set_intel */ -#define CL_PREFERRED_DEVICES_FOR_DX9_INTEL 0x4024 -#define CL_ALL_DEVICES_FOR_DX9_INTEL 0x4025 +#define CL_PREFERRED_DEVICES_FOR_DX9_INTEL 0x4024 +#define CL_ALL_DEVICES_FOR_DX9_INTEL 0x4025 /* cl_context_info */ -#define CL_CONTEXT_D3D9_DEVICE_INTEL 0x4026 -#define CL_CONTEXT_D3D9EX_DEVICE_INTEL 0x4072 -#define CL_CONTEXT_DXVA_DEVICE_INTEL 0x4073 +#define CL_CONTEXT_D3D9_DEVICE_INTEL 0x4026 +#define CL_CONTEXT_D3D9EX_DEVICE_INTEL 0x4072 +#define CL_CONTEXT_DXVA_DEVICE_INTEL 0x4073 /* cl_mem_info */ -#define CL_MEM_DX9_RESOURCE_INTEL 0x4027 -#define CL_MEM_DX9_SHARED_HANDLE_INTEL 0x4074 +#define CL_MEM_DX9_RESOURCE_INTEL 0x4027 +#define CL_MEM_DX9_SHARED_HANDLE_INTEL 0x4074 /* cl_image_info */ -#define CL_IMAGE_DX9_PLANE_INTEL 0x4075 +#define CL_IMAGE_DX9_PLANE_INTEL 0x4075 /* cl_command_type */ -#define CL_COMMAND_ACQUIRE_DX9_OBJECTS_INTEL 0x402A -#define CL_COMMAND_RELEASE_DX9_OBJECTS_INTEL 0x402B -/******************************************************************************/ +#define CL_COMMAND_ACQUIRE_DX9_OBJECTS_INTEL 0x402A +#define CL_COMMAND_RELEASE_DX9_OBJECTS_INTEL 0x402B + + +typedef cl_int (CL_API_CALL * +clGetDeviceIDsFromDX9INTEL_fn)( + cl_platform_id platform, + cl_dx9_device_source_intel dx9_device_source, + void* dx9_object, + cl_dx9_device_set_intel dx9_device_set, + cl_uint num_entries, + cl_device_id* devices, + cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_1; + +typedef cl_mem (CL_API_CALL * +clCreateFromDX9MediaSurfaceINTEL_fn)( + cl_context context, + cl_mem_flags flags, + IDirect3DSurface9* resource, + HANDLE sharedHandle, + UINT plane, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1; + +typedef cl_int (CL_API_CALL * +clEnqueueAcquireDX9ObjectsINTEL_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_1; + +typedef cl_int (CL_API_CALL * +clEnqueueReleaseDX9ObjectsINTEL_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + cl_mem* mem_objects, + cl_uint 
num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_1; + +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) extern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceIDsFromDX9INTEL( - cl_platform_id platform, - cl_dx9_device_source_intel dx9_device_source, - void* dx9_object, - cl_dx9_device_set_intel dx9_device_set, - cl_uint num_entries, - cl_device_id* devices, - cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_1; - -typedef cl_int (CL_API_CALL* clGetDeviceIDsFromDX9INTEL_fn)( - cl_platform_id platform, - cl_dx9_device_source_intel dx9_device_source, - void* dx9_object, - cl_dx9_device_set_intel dx9_device_set, - cl_uint num_entries, - cl_device_id* devices, - cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_1; + cl_platform_id platform, + cl_dx9_device_source_intel dx9_device_source, + void* dx9_object, + cl_dx9_device_set_intel dx9_device_set, + cl_uint num_entries, + cl_device_id* devices, + cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_1; extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromDX9MediaSurfaceINTEL( - cl_context context, - cl_mem_flags flags, - IDirect3DSurface9* resource, - HANDLE sharedHandle, - UINT plane, - cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1; - -typedef cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceINTEL_fn)( - cl_context context, - cl_mem_flags flags, - IDirect3DSurface9* resource, - HANDLE sharedHandle, - UINT plane, - cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1; + cl_context context, + cl_mem_flags flags, + IDirect3DSurface9* resource, + HANDLE sharedHandle, + UINT plane, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueAcquireDX9ObjectsINTEL( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem* mem_objects, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) CL_API_SUFFIX__VERSION_1_1; - -typedef cl_int (CL_API_CALL *clEnqueueAcquireDX9ObjectsINTEL_fn)( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem* mem_objects, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) CL_API_SUFFIX__VERSION_1_1; + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_1; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseDX9ObjectsINTEL( - cl_command_queue command_queue, - cl_uint num_objects, - cl_mem* mem_objects, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) CL_API_SUFFIX__VERSION_1_1; - -typedef cl_int (CL_API_CALL *clEnqueueReleaseDX9ObjectsINTEL_fn)( - cl_command_queue command_queue, - cl_uint num_objects, - cl_mem* mem_objects, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) CL_API_SUFFIX__VERSION_1_1; + cl_command_queue command_queue, + cl_uint num_objects, + cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_1; + +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ + +/*************************************************************** +* cl_intel_sharing_format_query_dx9 +***************************************************************/ +#define cl_intel_sharing_format_query_dx9 1 +#define CL_INTEL_SHARING_FORMAT_QUERY_DX9_EXTENSION_NAME \ + "cl_intel_sharing_format_query_dx9" + +/* when cl_khr_dx9_media_sharing or 
cl_intel_dx9_media_sharing is supported */ + +typedef cl_int (CL_API_CALL * +clGetSupportedDX9MediaSurfaceFormatsINTEL_fn)( + cl_context context, + cl_mem_flags flags, + cl_mem_object_type image_type, + cl_uint plane, + cl_uint num_entries, + D3DFORMAT* dx9_formats, + cl_uint* num_surface_formats) ; + +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSupportedDX9MediaSurfaceFormatsINTEL( + cl_context context, + cl_mem_flags flags, + cl_mem_object_type image_type, + cl_uint plane, + cl_uint num_entries, + D3DFORMAT* dx9_formats, + cl_uint* num_surface_formats) ; + +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ #ifdef __cplusplus } #endif -#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_H */ - +#endif /* OPENCL_CL_DX9_MEDIA_SHARING_H_ */ diff --git a/include/CL/cl_egl.h b/include/CL/cl_egl.h index 357a37c021cf47efcdf23eea2fde880c790b89aa..e2db9025d481fff8bd278f1b219df305260825aa 100644 --- a/include/CL/cl_egl.h +++ b/include/CL/cl_egl.h @@ -1,5 +1,5 @@ /******************************************************************************* - * Copyright (c) 2008-2020 The Khronos Group Inc. + * Copyright (c) 2008-2023 The Khronos Group Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,107 +14,154 @@ * limitations under the License. ******************************************************************************/ -#ifndef __OPENCL_CL_EGL_H -#define __OPENCL_CL_EGL_H +#ifndef OPENCL_CL_EGL_H_ +#define OPENCL_CL_EGL_H_ + +/* +** This header is generated from the Khronos OpenCL XML API Registry. +*/ #include +/* CL_NO_PROTOTYPES implies CL_NO_EXTENSION_PROTOTYPES: */ +#if defined(CL_NO_PROTOTYPES) && !defined(CL_NO_EXTENSION_PROTOTYPES) +#define CL_NO_EXTENSION_PROTOTYPES +#endif + +/* CL_NO_EXTENSION_PROTOTYPES implies + CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES and + CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES: */ +#if defined(CL_NO_EXTENSION_PROTOTYPES) && \ + !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) +#define CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES +#endif +#if defined(CL_NO_EXTENSION_PROTOTYPES) && \ + !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) +#define CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES +#endif + #ifdef __cplusplus extern "C" { #endif +/*************************************************************** +* cl_khr_egl_image +***************************************************************/ +#define cl_khr_egl_image 1 +#define CL_KHR_EGL_IMAGE_EXTENSION_NAME \ + "cl_khr_egl_image" /* Command type for events created with clEnqueueAcquireEGLObjectsKHR */ -#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR 0x202F -#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR 0x202D -#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR 0x202E +#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR 0x202F +#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR 0x202D +#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR 0x202E /* Error type for clCreateFromEGLImageKHR */ -#define CL_INVALID_EGL_OBJECT_KHR -1093 -#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR -1092 +#define CL_INVALID_EGL_OBJECT_KHR -1093 +#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR -1092 /* CLeglImageKHR is an opaque handle to an EGLImage */ -typedef void* CLeglImageKHR; +typedef void* CLeglImageKHR; /* CLeglDisplayKHR is an opaque handle to an EGLDisplay */ -typedef void* CLeglDisplayKHR; - -/* CLeglSyncKHR is an opaque handle to an EGLSync object */ -typedef void* CLeglSyncKHR; +typedef void* CLeglDisplayKHR; /* properties 
passed to clCreateFromEGLImageKHR */ -typedef intptr_t cl_egl_image_properties_khr; +typedef intptr_t cl_egl_image_properties_khr; -#define cl_khr_egl_image 1 +typedef cl_mem (CL_API_CALL * +clCreateFromEGLImageKHR_fn)( + cl_context context, + CLeglDisplayKHR egldisplay, + CLeglImageKHR eglimage, + cl_mem_flags flags, + const cl_egl_image_properties_khr* properties, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0; -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateFromEGLImageKHR(cl_context context, - CLeglDisplayKHR egldisplay, - CLeglImageKHR eglimage, - cl_mem_flags flags, - const cl_egl_image_properties_khr * properties, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)( - cl_context context, - CLeglDisplayKHR egldisplay, - CLeglImageKHR eglimage, - cl_mem_flags flags, - const cl_egl_image_properties_khr * properties, - cl_int * errcode_ret); +typedef cl_int (CL_API_CALL * +clEnqueueAcquireEGLObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int (CL_API_CALL * +clEnqueueReleaseEGLObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_0; +#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromEGLImageKHR( + cl_context context, + CLeglDisplayKHR egldisplay, + CLeglImageKHR eglimage, + cl_mem_flags flags, + const cl_egl_image_properties_khr* properties, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueAcquireEGLObjectsKHR(cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)( +clEnqueueAcquireEGLObjectsKHR( cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event); - + cl_uint num_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueReleaseEGLObjectsKHR(cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)( +clEnqueueReleaseEGLObjectsKHR( cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event); + cl_uint num_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_0; +#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ +/*************************************************************** +* cl_khr_egl_event +***************************************************************/ #define cl_khr_egl_event 1 +#define CL_KHR_EGL_EVENT_EXTENSION_NAME \ + "cl_khr_egl_event" + +/* CLeglDisplayKHR 
is an opaque handle to an EGLDisplay */ +/* type CLeglDisplayKHR */ + +/* CLeglSyncKHR is an opaque handle to an EGLSync object */ +typedef void* CLeglSyncKHR; + + +typedef cl_event (CL_API_CALL * +clCreateEventFromEGLSyncKHR_fn)( + cl_context context, + CLeglSyncKHR sync, + CLeglDisplayKHR display, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) extern CL_API_ENTRY cl_event CL_API_CALL -clCreateEventFromEGLSyncKHR(cl_context context, - CLeglSyncKHR sync, - CLeglDisplayKHR display, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)( - cl_context context, - CLeglSyncKHR sync, +clCreateEventFromEGLSyncKHR( + cl_context context, + CLeglSyncKHR sync, CLeglDisplayKHR display, - cl_int * errcode_ret); + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ #ifdef __cplusplus } #endif -#endif /* __OPENCL_CL_EGL_H */ +#endif /* OPENCL_CL_EGL_H_ */ diff --git a/include/CL/cl_ext.h b/include/CL/cl_ext.h index 36d0256843f44e2e21cff90a186bc01d17698473..12ebd043748a297e1898d34a6963a0a8e728af82 100644 --- a/include/CL/cl_ext.h +++ b/include/CL/cl_ext.h @@ -1,5 +1,5 @@ /******************************************************************************* - * Copyright (c) 2008-2020 The Khronos Group Inc. + * Copyright (c) 2008-2023 The Khronos Group Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,635 +14,1373 @@ * limitations under the License. ******************************************************************************/ -/* cl_ext.h contains OpenCL extensions which don't have external */ -/* (OpenGL, D3D) dependencies. */ +#ifndef OPENCL_CL_EXT_H_ +#define OPENCL_CL_EXT_H_ -#ifndef __CL_EXT_H -#define __CL_EXT_H +/* +** This header is generated from the Khronos OpenCL XML API Registry. 
+*/ + +#include + +/* CL_NO_PROTOTYPES implies CL_NO_EXTENSION_PROTOTYPES: */ +#if defined(CL_NO_PROTOTYPES) && !defined(CL_NO_EXTENSION_PROTOTYPES) +#define CL_NO_EXTENSION_PROTOTYPES +#endif + +/* CL_NO_EXTENSION_PROTOTYPES implies + CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES and + CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES: */ +#if defined(CL_NO_EXTENSION_PROTOTYPES) && \ + !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) +#define CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES +#endif +#if defined(CL_NO_EXTENSION_PROTOTYPES) && \ + !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) +#define CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES +#endif #ifdef __cplusplus extern "C" { #endif -#include +/*************************************************************** +* cl_khr_command_buffer +***************************************************************/ +#define cl_khr_command_buffer 1 +#define CL_KHR_COMMAND_BUFFER_EXTENSION_NAME \ + "cl_khr_command_buffer" + +typedef cl_bitfield cl_device_command_buffer_capabilities_khr; +typedef struct _cl_command_buffer_khr* cl_command_buffer_khr; +typedef cl_uint cl_sync_point_khr; +typedef cl_uint cl_command_buffer_info_khr; +typedef cl_uint cl_command_buffer_state_khr; +typedef cl_properties cl_command_buffer_properties_khr; +typedef cl_bitfield cl_command_buffer_flags_khr; +typedef cl_properties cl_ndrange_kernel_command_properties_khr; +typedef struct _cl_mutable_command_khr* cl_mutable_command_khr; -/* cl_khr_fp64 extension - no extension #define since it has no functions */ -/* CL_DEVICE_DOUBLE_FP_CONFIG is defined in CL.h for OpenCL >= 120 */ +/* cl_device_info */ +#define CL_DEVICE_COMMAND_BUFFER_CAPABILITIES_KHR 0x12A9 +#define CL_DEVICE_COMMAND_BUFFER_REQUIRED_QUEUE_PROPERTIES_KHR 0x12AA + +/* cl_device_command_buffer_capabilities_khr - bitfield */ +#define CL_COMMAND_BUFFER_CAPABILITY_KERNEL_PRINTF_KHR (1 << 0) +#define CL_COMMAND_BUFFER_CAPABILITY_DEVICE_SIDE_ENQUEUE_KHR (1 << 1) +#define CL_COMMAND_BUFFER_CAPABILITY_SIMULTANEOUS_USE_KHR (1 << 2) +#define CL_COMMAND_BUFFER_CAPABILITY_OUT_OF_ORDER_KHR (1 << 3) + +/* cl_command_buffer_properties_khr */ +#define CL_COMMAND_BUFFER_FLAGS_KHR 0x1293 + +/* cl_command_buffer_flags_khr - bitfield */ +#define CL_COMMAND_BUFFER_SIMULTANEOUS_USE_KHR (1 << 0) + +/* Error codes */ +#define CL_INVALID_COMMAND_BUFFER_KHR -1138 +#define CL_INVALID_SYNC_POINT_WAIT_LIST_KHR -1139 +#define CL_INCOMPATIBLE_COMMAND_QUEUE_KHR -1140 + +/* cl_command_buffer_info_khr */ +#define CL_COMMAND_BUFFER_QUEUES_KHR 0x1294 +#define CL_COMMAND_BUFFER_NUM_QUEUES_KHR 0x1295 +#define CL_COMMAND_BUFFER_REFERENCE_COUNT_KHR 0x1296 +#define CL_COMMAND_BUFFER_STATE_KHR 0x1297 +#define CL_COMMAND_BUFFER_PROPERTIES_ARRAY_KHR 0x1298 +#define CL_COMMAND_BUFFER_CONTEXT_KHR 0x1299 + +/* cl_command_buffer_state_khr */ +#define CL_COMMAND_BUFFER_STATE_RECORDING_KHR 0 +#define CL_COMMAND_BUFFER_STATE_EXECUTABLE_KHR 1 +#define CL_COMMAND_BUFFER_STATE_PENDING_KHR 2 +#define CL_COMMAND_BUFFER_STATE_INVALID_KHR 3 -#if CL_TARGET_OPENCL_VERSION <= 110 -#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032 -#endif +/* cl_command_type */ +#define CL_COMMAND_COMMAND_BUFFER_KHR 0x12A8 -/* cl_khr_fp16 extension - no extension #define since it has no functions */ -#define CL_DEVICE_HALF_FP_CONFIG 0x1033 -/* Memory object destruction - * - * Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR - * - * Registers a user callback function that will be called when the memory object is deleted and its resources - * freed. 
Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback - * stack associated with memobj. The registered user callback functions are called in the reverse order in - * which they were registered. The user callback functions are called and then the memory object is deleted - * and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be - * notified when the memory referenced by host_ptr, specified when the memory object is created and used as - * the storage bits for the memory object, can be reused or freed. - * - * The application may not call CL api's with the cl_mem object passed to the pfn_notify. - * - * Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS) - * before using. - */ +typedef cl_command_buffer_khr (CL_API_CALL * +clCreateCommandBufferKHR_fn)( + cl_uint num_queues, + const cl_command_queue* queues, + const cl_command_buffer_properties_khr* properties, + cl_int* errcode_ret) ; + +typedef cl_int (CL_API_CALL * +clFinalizeCommandBufferKHR_fn)( + cl_command_buffer_khr command_buffer) ; + +typedef cl_int (CL_API_CALL * +clRetainCommandBufferKHR_fn)( + cl_command_buffer_khr command_buffer) ; + +typedef cl_int (CL_API_CALL * +clReleaseCommandBufferKHR_fn)( + cl_command_buffer_khr command_buffer) ; + +typedef cl_int (CL_API_CALL * +clEnqueueCommandBufferKHR_fn)( + cl_uint num_queues, + cl_command_queue* queues, + cl_command_buffer_khr command_buffer, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) ; + +typedef cl_int (CL_API_CALL * +clCommandBarrierWithWaitListKHR_fn)( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +typedef cl_int (CL_API_CALL * +clCommandCopyBufferKHR_fn)( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_mem src_buffer, + cl_mem dst_buffer, + size_t src_offset, + size_t dst_offset, + size_t size, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +typedef cl_int (CL_API_CALL * +clCommandCopyBufferRectKHR_fn)( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_mem src_buffer, + cl_mem dst_buffer, + const size_t* src_origin, + const size_t* dst_origin, + const size_t* region, + size_t src_row_pitch, + size_t src_slice_pitch, + size_t dst_row_pitch, + size_t dst_slice_pitch, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +typedef cl_int (CL_API_CALL * +clCommandCopyBufferToImageKHR_fn)( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_mem src_buffer, + cl_mem dst_image, + size_t src_offset, + const size_t* dst_origin, + const size_t* region, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +typedef cl_int (CL_API_CALL * +clCommandCopyImageKHR_fn)( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_mem src_image, + cl_mem dst_image, + const size_t* src_origin, + const size_t* dst_origin, + const size_t* region, + cl_uint 
num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +typedef cl_int (CL_API_CALL * +clCommandCopyImageToBufferKHR_fn)( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_mem src_image, + cl_mem dst_buffer, + const size_t* src_origin, + const size_t* region, + size_t dst_offset, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +typedef cl_int (CL_API_CALL * +clCommandFillBufferKHR_fn)( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_mem buffer, + const void* pattern, + size_t pattern_size, + size_t offset, + size_t size, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +typedef cl_int (CL_API_CALL * +clCommandFillImageKHR_fn)( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_mem image, + const void* fill_color, + const size_t* origin, + const size_t* region, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +typedef cl_int (CL_API_CALL * +clCommandNDRangeKernelKHR_fn)( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + const cl_ndrange_kernel_command_properties_khr* properties, + cl_kernel kernel, + cl_uint work_dim, + const size_t* global_work_offset, + const size_t* global_work_size, + const size_t* local_work_size, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +typedef cl_int (CL_API_CALL * +clGetCommandBufferInfoKHR_fn)( + cl_command_buffer_khr command_buffer, + cl_command_buffer_info_khr param_name, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) ; + +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) + +extern CL_API_ENTRY cl_command_buffer_khr CL_API_CALL +clCreateCommandBufferKHR( + cl_uint num_queues, + const cl_command_queue* queues, + const cl_command_buffer_properties_khr* properties, + cl_int* errcode_ret) ; + +extern CL_API_ENTRY cl_int CL_API_CALL +clFinalizeCommandBufferKHR( + cl_command_buffer_khr command_buffer) ; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainCommandBufferKHR( + cl_command_buffer_khr command_buffer) ; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseCommandBufferKHR( + cl_command_buffer_khr command_buffer) ; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCommandBufferKHR( + cl_uint num_queues, + cl_command_queue* queues, + cl_command_buffer_khr command_buffer, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) ; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCommandBarrierWithWaitListKHR( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCommandCopyBufferKHR( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_mem src_buffer, + cl_mem dst_buffer, + size_t src_offset, + size_t dst_offset, + size_t size, + cl_uint 
num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCommandCopyBufferRectKHR( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_mem src_buffer, + cl_mem dst_buffer, + const size_t* src_origin, + const size_t* dst_origin, + const size_t* region, + size_t src_row_pitch, + size_t src_slice_pitch, + size_t dst_row_pitch, + size_t dst_slice_pitch, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCommandCopyBufferToImageKHR( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_mem src_buffer, + cl_mem dst_image, + size_t src_offset, + const size_t* dst_origin, + const size_t* region, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCommandCopyImageKHR( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_mem src_image, + cl_mem dst_image, + const size_t* src_origin, + const size_t* dst_origin, + const size_t* region, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCommandCopyImageToBufferKHR( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_mem src_image, + cl_mem dst_buffer, + const size_t* src_origin, + const size_t* region, + size_t dst_offset, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCommandFillBufferKHR( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_mem buffer, + const void* pattern, + size_t pattern_size, + size_t offset, + size_t size, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCommandFillImageKHR( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_mem image, + const void* fill_color, + const size_t* origin, + const size_t* region, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCommandNDRangeKernelKHR( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + const cl_ndrange_kernel_command_properties_khr* properties, + cl_kernel kernel, + cl_uint work_dim, + const size_t* global_work_offset, + const size_t* global_work_size, + const size_t* local_work_size, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetCommandBufferInfoKHR( + cl_command_buffer_khr command_buffer, + cl_command_buffer_info_khr param_name, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) ; + +#endif /* 
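The entry points above cover the whole record/finalize/replay flow of cl_khr_command_buffer. A minimal sketch, assuming an already created in-order queue and kernel on a device that reports CL_DEVICE_COMMAND_BUFFER_CAPABILITIES_KHR, and calling the prototypes directly; the helper name and the single-dispatch shape are illustrative only:

#include <CL/cl_ext.h>

/* Record one ND-range dispatch into a command buffer and replay it once. */
static cl_int run_once_via_command_buffer(cl_command_queue queue,
                                          cl_kernel kernel,
                                          size_t global_size)
{
    cl_int err = CL_SUCCESS;

    /* A command buffer is created against one or more queues. */
    cl_command_buffer_khr cmdbuf = clCreateCommandBufferKHR(1, &queue, NULL, &err);
    if (err != CL_SUCCESS)
        return err;

    /* Record a single dispatch; no sync points are needed here, so the
     * wait list, sync point and mutable handle outputs are all NULL. */
    err = clCommandNDRangeKernelKHR(cmdbuf, NULL, NULL, kernel,
                                    1, NULL, &global_size, NULL,
                                    0, NULL, NULL, NULL);

    /* Finalizing moves the buffer from the recording state to the
     * executable state; it can then be enqueued (repeatedly, if the
     * device reports CL_COMMAND_BUFFER_CAPABILITY_SIMULTANEOUS_USE_KHR). */
    if (err == CL_SUCCESS)
        err = clFinalizeCommandBufferKHR(cmdbuf);
    if (err == CL_SUCCESS)
        err = clEnqueueCommandBufferKHR(0, NULL, cmdbuf, 0, NULL, NULL);

    clReleaseCommandBufferKHR(cmdbuf);
    return err;
}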
!defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ + +/*************************************************************** +* cl_khr_command_buffer_multi_device +***************************************************************/ +#define cl_khr_command_buffer_multi_device 1 +#define CL_KHR_COMMAND_BUFFER_MULTI_DEVICE_EXTENSION_NAME \ + "cl_khr_command_buffer_multi_device" + +typedef cl_bitfield cl_platform_command_buffer_capabilities_khr; + +/* cl_platform_info */ +#define CL_PLATFORM_COMMAND_BUFFER_CAPABILITIES_KHR 0x0908 + +/* cl_platform_command_buffer_capabilities_khr - bitfield */ +#define CL_COMMAND_BUFFER_PLATFORM_UNIVERSAL_SYNC_KHR (1 << 0) +#define CL_COMMAND_BUFFER_PLATFORM_REMAP_QUEUES_KHR (1 << 1) +#define CL_COMMAND_BUFFER_PLATFORM_AUTOMATIC_REMAP_KHR (1 << 2) + +/* cl_device_info */ +#define CL_DEVICE_COMMAND_BUFFER_NUM_SYNC_DEVICES_KHR 0x12AB +#define CL_DEVICE_COMMAND_BUFFER_SYNC_DEVICES_KHR 0x12AC + +/* cl_device_command_buffer_capabilities_khr - bitfield */ +#define CL_COMMAND_BUFFER_CAPABILITY_MULTIPLE_QUEUE_KHR (1 << 4) + +/* cl_command_buffer_flags_khr - bitfield */ +#define CL_COMMAND_BUFFER_DEVICE_SIDE_SYNC_KHR (1 << 2) + + +typedef cl_command_buffer_khr (CL_API_CALL * +clRemapCommandBufferKHR_fn)( + cl_command_buffer_khr command_buffer, + cl_bool automatic, + cl_uint num_queues, + const cl_command_queue* queues, + cl_uint num_handles, + const cl_mutable_command_khr* handles, + cl_mutable_command_khr* handles_ret, + cl_int* errcode_ret) ; + +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) + +extern CL_API_ENTRY cl_command_buffer_khr CL_API_CALL +clRemapCommandBufferKHR( + cl_command_buffer_khr command_buffer, + cl_bool automatic, + cl_uint num_queues, + const cl_command_queue* queues, + cl_uint num_handles, + const cl_mutable_command_khr* handles, + cl_mutable_command_khr* handles_ret, + cl_int* errcode_ret) ; + +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ + +/*************************************************************** +* cl_khr_command_buffer_mutable_dispatch +***************************************************************/ +#define cl_khr_command_buffer_mutable_dispatch 1 +#define CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_EXTENSION_NAME \ + "cl_khr_command_buffer_mutable_dispatch" + +typedef cl_uint cl_command_buffer_structure_type_khr; +typedef cl_bitfield cl_mutable_dispatch_fields_khr; +typedef cl_uint cl_mutable_command_info_khr; +typedef struct _cl_mutable_dispatch_arg_khr { + cl_uint arg_index; + size_t arg_size; + const void* arg_value; +} cl_mutable_dispatch_arg_khr; +typedef struct _cl_mutable_dispatch_exec_info_khr { + cl_uint param_name; + size_t param_value_size; + const void* param_value; +} cl_mutable_dispatch_exec_info_khr; +typedef struct _cl_mutable_dispatch_config_khr { + cl_command_buffer_structure_type_khr type; + const void* next; + cl_mutable_command_khr command; + cl_uint num_args; + cl_uint num_svm_args; + cl_uint num_exec_infos; + cl_uint work_dim; + const cl_mutable_dispatch_arg_khr* arg_list; + const cl_mutable_dispatch_arg_khr* arg_svm_list; + const cl_mutable_dispatch_exec_info_khr* exec_info_list; + const size_t* global_work_offset; + const size_t* global_work_size; + const size_t* local_work_size; +} cl_mutable_dispatch_config_khr; +typedef struct _cl_mutable_base_config_khr { + cl_command_buffer_structure_type_khr type; + const void* next; + cl_uint num_mutable_dispatch; + const cl_mutable_dispatch_config_khr* mutable_dispatch_list; +} cl_mutable_base_config_khr; + +/* cl_command_buffer_flags_khr - 
bitfield */ +#define CL_COMMAND_BUFFER_MUTABLE_KHR (1 << 1) + +/* Error codes */ +#define CL_INVALID_MUTABLE_COMMAND_KHR -1141 + +/* cl_device_info */ +#define CL_DEVICE_MUTABLE_DISPATCH_CAPABILITIES_KHR 0x12B0 + +/* cl_ndrange_kernel_command_properties_khr */ +#define CL_MUTABLE_DISPATCH_UPDATABLE_FIELDS_KHR 0x12B1 + +/* cl_mutable_dispatch_fields_khr - bitfield */ +#define CL_MUTABLE_DISPATCH_GLOBAL_OFFSET_KHR (1 << 0) +#define CL_MUTABLE_DISPATCH_GLOBAL_SIZE_KHR (1 << 1) +#define CL_MUTABLE_DISPATCH_LOCAL_SIZE_KHR (1 << 2) +#define CL_MUTABLE_DISPATCH_ARGUMENTS_KHR (1 << 3) +#define CL_MUTABLE_DISPATCH_EXEC_INFO_KHR (1 << 4) + +/* cl_mutable_command_info_khr */ +#define CL_MUTABLE_COMMAND_COMMAND_QUEUE_KHR 0x12A0 +#define CL_MUTABLE_COMMAND_COMMAND_BUFFER_KHR 0x12A1 +#define CL_MUTABLE_COMMAND_COMMAND_TYPE_KHR 0x12AD +#define CL_MUTABLE_DISPATCH_PROPERTIES_ARRAY_KHR 0x12A2 +#define CL_MUTABLE_DISPATCH_KERNEL_KHR 0x12A3 +#define CL_MUTABLE_DISPATCH_DIMENSIONS_KHR 0x12A4 +#define CL_MUTABLE_DISPATCH_GLOBAL_WORK_OFFSET_KHR 0x12A5 +#define CL_MUTABLE_DISPATCH_GLOBAL_WORK_SIZE_KHR 0x12A6 +#define CL_MUTABLE_DISPATCH_LOCAL_WORK_SIZE_KHR 0x12A7 + +/* cl_command_buffer_structure_type_khr */ +#define CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR 0 +#define CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR 1 + + +typedef cl_int (CL_API_CALL * +clUpdateMutableCommandsKHR_fn)( + cl_command_buffer_khr command_buffer, + const cl_mutable_base_config_khr* mutable_config) ; + +typedef cl_int (CL_API_CALL * +clGetMutableCommandInfoKHR_fn)( + cl_mutable_command_khr command, + cl_mutable_command_info_khr param_name, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) ; + +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) + +extern CL_API_ENTRY cl_int CL_API_CALL +clUpdateMutableCommandsKHR( + cl_command_buffer_khr command_buffer, + const cl_mutable_base_config_khr* mutable_config) ; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetMutableCommandInfoKHR( + cl_mutable_command_khr command, + cl_mutable_command_info_khr param_name, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) ; + +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ + +/*************************************************************** +* cl_khr_fp64 +***************************************************************/ +#define cl_khr_fp64 1 +#define CL_KHR_FP64_EXTENSION_NAME \ + "cl_khr_fp64" + +#if !defined(CL_VERSION_1_2) +/* cl_device_info - defined in CL.h for OpenCL 1.2 and newer */ +#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032 + +#endif /* !defined(CL_VERSION_1_2) */ + +/*************************************************************** +* cl_khr_fp16 +***************************************************************/ +#define cl_khr_fp16 1 +#define CL_KHR_FP16_EXTENSION_NAME \ + "cl_khr_fp16" + +/* cl_device_info */ +#define CL_DEVICE_HALF_FP_CONFIG 0x1033 + +/*************************************************************** +* cl_APPLE_SetMemObjectDestructor +***************************************************************/ #define cl_APPLE_SetMemObjectDestructor 1 -extern CL_API_ENTRY cl_int CL_API_CALL clSetMemObjectDestructorAPPLE( cl_mem memobj, - void (* pfn_notify)(cl_mem memobj, void * user_data), - void * user_data) CL_API_SUFFIX__VERSION_1_0; +#define CL_APPLE_SETMEMOBJECTDESTRUCTOR_EXTENSION_NAME \ + "cl_APPLE_SetMemObjectDestructor" -/* Context Logging Functions - * - * The next three convenience functions are intended to be used as the pfn_notify parameter to 
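The cl_khr_command_buffer_mutable_dispatch structures above let a finalized command buffer be patched between replays instead of being re-recorded. A sketch of re-binding one buffer argument of a previously recorded dispatch, assuming the command buffer was created with CL_COMMAND_BUFFER_MUTABLE_KHR and the dispatch requested CL_MUTABLE_DISPATCH_ARGUMENTS_KHR in its properties; the helper name is illustrative:

#include <CL/cl_ext.h>

/* `command` is the cl_mutable_command_khr returned through the
 * mutable_handle parameter of clCommandNDRangeKernelKHR. */
static cl_int rebind_kernel_arg(cl_command_buffer_khr cmdbuf,
                                cl_mutable_command_khr command,
                                cl_uint arg_index,
                                cl_mem new_buffer)
{
    cl_mutable_dispatch_arg_khr arg;
    arg.arg_index = arg_index;
    arg.arg_size  = sizeof(cl_mem);
    arg.arg_value = &new_buffer;

    cl_mutable_dispatch_config_khr dispatch = { 0 };
    dispatch.type     = CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR;
    dispatch.command  = command;
    dispatch.num_args = 1;
    dispatch.arg_list = &arg;

    cl_mutable_base_config_khr base = { 0 };
    base.type                  = CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR;
    base.num_mutable_dispatch  = 1;
    base.mutable_dispatch_list = &dispatch;

    return clUpdateMutableCommandsKHR(cmdbuf, &base);
}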
clCreateContext(). - * Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS) - * before using. - * - * clLogMessagesToSystemLog forwards on all log messages to the Apple System Logger - */ +typedef cl_int (CL_API_CALL * +clSetMemObjectDestructorAPPLE_fn)( + cl_mem memobj, + void (CL_CALLBACK* pfn_notify)(cl_mem memobj, void* user_data), + void* user_data) CL_API_SUFFIX__VERSION_1_0; + +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetMemObjectDestructorAPPLE( + cl_mem memobj, + void (CL_CALLBACK* pfn_notify)(cl_mem memobj, void* user_data), + void* user_data) CL_API_SUFFIX__VERSION_1_0; + +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ + +/*************************************************************** +* cl_APPLE_ContextLoggingFunctions +***************************************************************/ #define cl_APPLE_ContextLoggingFunctions 1 -extern CL_API_ENTRY void CL_API_CALL clLogMessagesToSystemLogAPPLE( const char * errstr, - const void * private_info, - size_t cb, - void * user_data) CL_API_SUFFIX__VERSION_1_0; - -/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */ -extern CL_API_ENTRY void CL_API_CALL clLogMessagesToStdoutAPPLE( const char * errstr, - const void * private_info, - size_t cb, - void * user_data) CL_API_SUFFIX__VERSION_1_0; - -/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */ -extern CL_API_ENTRY void CL_API_CALL clLogMessagesToStderrAPPLE( const char * errstr, - const void * private_info, - size_t cb, - void * user_data) CL_API_SUFFIX__VERSION_1_0; - - -/************************ -* cl_khr_icd extension * -************************/ +#define CL_APPLE_CONTEXTLOGGINGFUNCTIONS_EXTENSION_NAME \ + "cl_APPLE_ContextLoggingFunctions" + + +typedef void (CL_API_CALL * +clLogMessagesToSystemLogAPPLE_fn)( + const char* errstr, + const void* private_info, + size_t cb, + void* user_data) CL_API_SUFFIX__VERSION_1_0; + +typedef void (CL_API_CALL * +clLogMessagesToStdoutAPPLE_fn)( + const char* errstr, + const void* private_info, + size_t cb, + void* user_data) CL_API_SUFFIX__VERSION_1_0; + +typedef void (CL_API_CALL * +clLogMessagesToStderrAPPLE_fn)( + const char* errstr, + const void* private_info, + size_t cb, + void* user_data) CL_API_SUFFIX__VERSION_1_0; + +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) + +extern CL_API_ENTRY void CL_API_CALL +clLogMessagesToSystemLogAPPLE( + const char* errstr, + const void* private_info, + size_t cb, + void* user_data) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY void CL_API_CALL +clLogMessagesToStdoutAPPLE( + const char* errstr, + const void* private_info, + size_t cb, + void* user_data) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY void CL_API_CALL +clLogMessagesToStderrAPPLE( + const char* errstr, + const void* private_info, + size_t cb, + void* user_data) CL_API_SUFFIX__VERSION_1_0; + +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ + +/*************************************************************** +* cl_khr_icd +***************************************************************/ #define cl_khr_icd 1 +#define CL_KHR_ICD_EXTENSION_NAME \ + "cl_khr_icd" -/* cl_platform_info */ -#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920 +/* cl_platform_info */ +#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920 -/* Additional Error Codes */ -#define CL_PLATFORM_NOT_FOUND_KHR -1001 +/* Error codes */ +#define CL_PLATFORM_NOT_FOUND_KHR 
-1001 -extern CL_API_ENTRY cl_int CL_API_CALL -clIcdGetPlatformIDsKHR(cl_uint num_entries, - cl_platform_id * platforms, - cl_uint * num_platforms); -typedef cl_int -(CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(cl_uint num_entries, - cl_platform_id * platforms, - cl_uint * num_platforms); +typedef cl_int (CL_API_CALL * +clIcdGetPlatformIDsKHR_fn)( + cl_uint num_entries, + cl_platform_id* platforms, + cl_uint* num_platforms) ; + +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) + +extern CL_API_ENTRY cl_int CL_API_CALL +clIcdGetPlatformIDsKHR( + cl_uint num_entries, + cl_platform_id* platforms, + cl_uint* num_platforms) ; +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ -/******************************* - * cl_khr_il_program extension * - *******************************/ +/*************************************************************** +* cl_khr_il_program +***************************************************************/ #define cl_khr_il_program 1 +#define CL_KHR_IL_PROGRAM_EXTENSION_NAME \ + "cl_khr_il_program" -/* New property to clGetDeviceInfo for retrieving supported intermediate - * languages - */ -#define CL_DEVICE_IL_VERSION_KHR 0x105B +/* cl_device_info */ +#define CL_DEVICE_IL_VERSION_KHR 0x105B -/* New property to clGetProgramInfo for retrieving for retrieving the IL of a - * program - */ -#define CL_PROGRAM_IL_KHR 0x1169 +/* cl_program_info */ +#define CL_PROGRAM_IL_KHR 0x1169 -extern CL_API_ENTRY cl_program CL_API_CALL -clCreateProgramWithILKHR(cl_context context, - const void * il, - size_t length, - cl_int * errcode_ret); - -typedef cl_program -(CL_API_CALL *clCreateProgramWithILKHR_fn)(cl_context context, - const void * il, - size_t length, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -/* Extension: cl_khr_image2d_from_buffer - * - * This extension allows a 2D image to be created from a cl_mem buffer without - * a copy. The type associated with a 2D image created from a buffer in an - * OpenCL program is image2d_t. Both the sampler and sampler-less read_image - * built-in functions are supported for 2D images and 2D images created from - * a buffer. Similarly, the write_image built-ins are also supported for 2D - * images created from a buffer. - * - * When the 2D image from buffer is created, the client must specify the - * width, height, image format (i.e. channel order and channel data type) - * and optionally the row pitch. - * - * The pitch specified must be a multiple of - * CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR pixels. - * The base address of the buffer must be aligned to - * CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR pixels. 
- */ -#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR 0x104A -#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR 0x104B +typedef cl_program (CL_API_CALL * +clCreateProgramWithILKHR_fn)( + cl_context context, + const void* il, + size_t length, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) -/************************************** - * cl_khr_initialize_memory extension * - **************************************/ +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithILKHR( + cl_context context, + const void* il, + size_t length, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; -#define CL_CONTEXT_MEMORY_INITIALIZE_KHR 0x2030 +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ +/*************************************************************** +* cl_khr_image2d_from_buffer +***************************************************************/ +#define cl_khr_image2d_from_buffer 1 +#define CL_KHR_IMAGE2D_FROM_BUFFER_EXTENSION_NAME \ + "cl_khr_image2d_from_buffer" -/************************************** - * cl_khr_terminate_context extension * - **************************************/ +/* cl_device_info */ +#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR 0x104A +#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR 0x104B -#define CL_CONTEXT_TERMINATED_KHR -1121 +/*************************************************************** +* cl_khr_initialize_memory +***************************************************************/ +#define cl_khr_initialize_memory 1 +#define CL_KHR_INITIALIZE_MEMORY_EXTENSION_NAME \ + "cl_khr_initialize_memory" -#define CL_DEVICE_TERMINATE_CAPABILITY_KHR 0x2031 -#define CL_CONTEXT_TERMINATE_KHR 0x2032 +/* Interop tokens */ +#define CL_CONTEXT_MEMORY_INITIALIZE_KHR 0x2030 +/*************************************************************** +* cl_khr_terminate_context +***************************************************************/ #define cl_khr_terminate_context 1 -extern CL_API_ENTRY cl_int CL_API_CALL -clTerminateContextKHR(cl_context context) CL_API_SUFFIX__VERSION_1_2; +#define CL_KHR_TERMINATE_CONTEXT_EXTENSION_NAME \ + "cl_khr_terminate_context" -typedef cl_int -(CL_API_CALL *clTerminateContextKHR_fn)(cl_context context) CL_API_SUFFIX__VERSION_1_2; +typedef cl_bitfield cl_device_terminate_capability_khr; +/* cl_device_info */ +#define CL_DEVICE_TERMINATE_CAPABILITY_KHR 0x2031 -/* - * Extension: cl_khr_spir - * - * This extension adds support to create an OpenCL program object from a - * Standard Portable Intermediate Representation (SPIR) instance - */ +/* cl_context_properties */ +#define CL_CONTEXT_TERMINATE_KHR 0x2032 + +/* cl_device_terminate_capability_khr */ +#define CL_DEVICE_TERMINATE_CAPABILITY_CONTEXT_KHR (1 << 0) + +/* Error codes */ +#define CL_CONTEXT_TERMINATED_KHR -1121 + + +typedef cl_int (CL_API_CALL * +clTerminateContextKHR_fn)( + cl_context context) CL_API_SUFFIX__VERSION_1_2; + +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) -#define CL_DEVICE_SPIR_VERSIONS 0x40E0 -#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE 0x40E1 +extern CL_API_ENTRY cl_int CL_API_CALL +clTerminateContextKHR( + cl_context context) CL_API_SUFFIX__VERSION_1_2; + +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ + +/*************************************************************** +* cl_khr_spir +***************************************************************/ +#define cl_khr_spir 1 +#define CL_KHR_SPIR_EXTENSION_NAME \ + "cl_khr_spir" +/* cl_device_info */ +#define 
CL_DEVICE_SPIR_VERSIONS 0x40E0 + +/* cl_program_binary_type */ +#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE 0x40E1 -/***************************************** - * cl_khr_create_command_queue extension * - *****************************************/ +/*************************************************************** +* cl_khr_create_command_queue +***************************************************************/ #define cl_khr_create_command_queue 1 +#define CL_KHR_CREATE_COMMAND_QUEUE_EXTENSION_NAME \ + "cl_khr_create_command_queue" + +typedef cl_properties cl_queue_properties_khr; -typedef cl_properties cl_queue_properties_khr; + +typedef cl_command_queue (CL_API_CALL * +clCreateCommandQueueWithPropertiesKHR_fn)( + cl_context context, + cl_device_id device, + const cl_queue_properties_khr* properties, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) extern CL_API_ENTRY cl_command_queue CL_API_CALL -clCreateCommandQueueWithPropertiesKHR(cl_context context, - cl_device_id device, - const cl_queue_properties_khr* properties, - cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_command_queue -(CL_API_CALL *clCreateCommandQueueWithPropertiesKHR_fn)(cl_context context, - cl_device_id device, - const cl_queue_properties_khr* properties, - cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; - - -/****************************************** -* cl_nv_device_attribute_query extension * -******************************************/ - -/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */ -#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000 -#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001 -#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002 -#define CL_DEVICE_WARP_SIZE_NV 0x4003 -#define CL_DEVICE_GPU_OVERLAP_NV 0x4004 -#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005 -#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006 - - -/********************************* -* cl_amd_device_attribute_query * -*********************************/ - -#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD 0x4036 -#define CL_DEVICE_TOPOLOGY_AMD 0x4037 -#define CL_DEVICE_BOARD_NAME_AMD 0x4038 -#define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD 0x4039 -#define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD 0x4040 -#define CL_DEVICE_SIMD_WIDTH_AMD 0x4041 -#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD 0x4042 -#define CL_DEVICE_WAVEFRONT_WIDTH_AMD 0x4043 -#define CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD 0x4044 -#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD 0x4045 -#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD 0x4046 -#define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD 0x4047 -#define CL_DEVICE_LOCAL_MEM_BANKS_AMD 0x4048 -#define CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD 0x4049 -#define CL_DEVICE_GFXIP_MAJOR_AMD 0x404A -#define CL_DEVICE_GFXIP_MINOR_AMD 0x404B -#define CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD 0x404C -#define CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_AMD 0x4030 -#define CL_DEVICE_MAX_WORK_GROUP_SIZE_AMD 0x4031 -#define CL_DEVICE_PREFERRED_CONSTANT_BUFFER_SIZE_AMD 0x4033 -#define CL_DEVICE_PCIE_ID_AMD 0x4034 - - -/********************************* -* cl_arm_printf extension -*********************************/ - -#define CL_PRINTF_CALLBACK_ARM 0x40B0 -#define CL_PRINTF_BUFFERSIZE_ARM 0x40B1 - - -/*********************************** -* cl_ext_device_fission extension -***********************************/ -#define cl_ext_device_fission 1 +clCreateCommandQueueWithPropertiesKHR( + cl_context context, + cl_device_id device, + const cl_queue_properties_khr* 
properties, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; -extern CL_API_ENTRY cl_int CL_API_CALL -clReleaseDeviceEXT(cl_device_id device) CL_API_SUFFIX__VERSION_1_1; +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ -typedef cl_int -(CL_API_CALL *clReleaseDeviceEXT_fn)(cl_device_id device) CL_API_SUFFIX__VERSION_1_1; +/*************************************************************** +* cl_nv_device_attribute_query +***************************************************************/ +#define cl_nv_device_attribute_query 1 +#define CL_NV_DEVICE_ATTRIBUTE_QUERY_EXTENSION_NAME \ + "cl_nv_device_attribute_query" -extern CL_API_ENTRY cl_int CL_API_CALL -clRetainDeviceEXT(cl_device_id device) CL_API_SUFFIX__VERSION_1_1; +/* cl_device_info */ +#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000 +#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001 +#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002 +#define CL_DEVICE_WARP_SIZE_NV 0x4003 +#define CL_DEVICE_GPU_OVERLAP_NV 0x4004 +#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005 +#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006 -typedef cl_int -(CL_API_CALL *clRetainDeviceEXT_fn)(cl_device_id device) CL_API_SUFFIX__VERSION_1_1; +/*************************************************************** +* cl_amd_device_attribute_query +***************************************************************/ +#define cl_amd_device_attribute_query 1 +#define CL_AMD_DEVICE_ATTRIBUTE_QUERY_EXTENSION_NAME \ + "cl_amd_device_attribute_query" -typedef cl_ulong cl_device_partition_property_ext; -extern CL_API_ENTRY cl_int CL_API_CALL -clCreateSubDevicesEXT(cl_device_id in_device, - const cl_device_partition_property_ext * properties, - cl_uint num_entries, - cl_device_id * out_devices, - cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_1; - -typedef cl_int -(CL_API_CALL * clCreateSubDevicesEXT_fn)(cl_device_id in_device, - const cl_device_partition_property_ext * properties, - cl_uint num_entries, - cl_device_id * out_devices, - cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_1; +/* cl_device_info */ +#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD 0x4036 +#define CL_DEVICE_TOPOLOGY_AMD 0x4037 +#define CL_DEVICE_BOARD_NAME_AMD 0x4038 +#define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD 0x4039 +#define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD 0x4040 +#define CL_DEVICE_SIMD_WIDTH_AMD 0x4041 +#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD 0x4042 +#define CL_DEVICE_WAVEFRONT_WIDTH_AMD 0x4043 +#define CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD 0x4044 +#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD 0x4045 +#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD 0x4046 +#define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD 0x4047 +#define CL_DEVICE_LOCAL_MEM_BANKS_AMD 0x4048 +#define CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD 0x4049 +#define CL_DEVICE_GFXIP_MAJOR_AMD 0x404A +#define CL_DEVICE_GFXIP_MINOR_AMD 0x404B +#define CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD 0x404C +#define CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_AMD 0x4030 +#define CL_DEVICE_MAX_WORK_GROUP_SIZE_AMD 0x4031 +#define CL_DEVICE_PREFERRED_CONSTANT_BUFFER_SIZE_AMD 0x4033 +#define CL_DEVICE_PCIE_ID_AMD 0x4034 + +/*************************************************************** +* cl_arm_printf +***************************************************************/ +#define cl_arm_printf 1 +#define CL_ARM_PRINTF_EXTENSION_NAME \ + "cl_arm_printf" + +/* cl_context_properties */ +#define CL_PRINTF_CALLBACK_ARM 0x40B0 +#define CL_PRINTF_BUFFERSIZE_ARM 0x40B1 + +/*************************************************************** +* cl_ext_device_fission 
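clCreateCommandQueueWithPropertiesKHR gives pre-2.0 platforms the property-list style of queue creation. A minimal sketch, assuming a valid context and device; the helper name and the choice of profiling as the only property are illustrative:

#include <CL/cl_ext.h>

/* Create an in-order queue with profiling enabled.  The property list is
 * a zero-terminated sequence of (name, value) pairs, as in OpenCL 2.0. */
static cl_command_queue make_profiling_queue(cl_context context,
                                             cl_device_id device,
                                             cl_int* errcode_ret)
{
    const cl_queue_properties_khr props[] = {
        CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE,
        0
    };
    return clCreateCommandQueueWithPropertiesKHR(context, device, props, errcode_ret);
}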
+***************************************************************/ +#define cl_ext_device_fission 1 +#define CL_EXT_DEVICE_FISSION_EXTENSION_NAME \ + "cl_ext_device_fission" + +typedef cl_ulong cl_device_partition_property_ext; + +/* Error codes */ +#define CL_DEVICE_PARTITION_FAILED_EXT -1057 +#define CL_INVALID_PARTITION_COUNT_EXT -1058 +#define CL_INVALID_PARTITION_NAME_EXT -1059 + +/* cl_device_info */ +#define CL_DEVICE_PARENT_DEVICE_EXT 0x4054 +#define CL_DEVICE_PARTITION_TYPES_EXT 0x4055 +#define CL_DEVICE_AFFINITY_DOMAINS_EXT 0x4056 +#define CL_DEVICE_REFERENCE_COUNT_EXT 0x4057 +#define CL_DEVICE_PARTITION_STYLE_EXT 0x4058 /* cl_device_partition_property_ext */ -#define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050 -#define CL_DEVICE_PARTITION_BY_COUNTS_EXT 0x4051 -#define CL_DEVICE_PARTITION_BY_NAMES_EXT 0x4052 -#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT 0x4053 - -/* clDeviceGetInfo selectors */ -#define CL_DEVICE_PARENT_DEVICE_EXT 0x4054 -#define CL_DEVICE_PARTITION_TYPES_EXT 0x4055 -#define CL_DEVICE_AFFINITY_DOMAINS_EXT 0x4056 -#define CL_DEVICE_REFERENCE_COUNT_EXT 0x4057 -#define CL_DEVICE_PARTITION_STYLE_EXT 0x4058 - -/* error codes */ -#define CL_DEVICE_PARTITION_FAILED_EXT -1057 -#define CL_INVALID_PARTITION_COUNT_EXT -1058 -#define CL_INVALID_PARTITION_NAME_EXT -1059 - -/* CL_AFFINITY_DOMAINs */ -#define CL_AFFINITY_DOMAIN_L1_CACHE_EXT 0x1 -#define CL_AFFINITY_DOMAIN_L2_CACHE_EXT 0x2 -#define CL_AFFINITY_DOMAIN_L3_CACHE_EXT 0x3 -#define CL_AFFINITY_DOMAIN_L4_CACHE_EXT 0x4 -#define CL_AFFINITY_DOMAIN_NUMA_EXT 0x10 -#define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT 0x100 - -/* cl_device_partition_property_ext list terminators */ -#define CL_PROPERTIES_LIST_END_EXT ((cl_device_partition_property_ext) 0) -#define CL_PARTITION_BY_COUNTS_LIST_END_EXT ((cl_device_partition_property_ext) 0) -#define CL_PARTITION_BY_NAMES_LIST_END_EXT ((cl_device_partition_property_ext) 0 - 1) - - -/*********************************** - * cl_ext_migrate_memobject extension definitions - ***********************************/ -#define cl_ext_migrate_memobject 1 +#define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050 +#define CL_DEVICE_PARTITION_BY_COUNTS_EXT 0x4051 +#define CL_DEVICE_PARTITION_BY_NAMES_EXT 0x4052 +#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT 0x4053 + +/* cl_device_partition_property_ext - affinity domains */ +#define CL_AFFINITY_DOMAIN_L1_CACHE_EXT 0x1 +#define CL_AFFINITY_DOMAIN_L2_CACHE_EXT 0x2 +#define CL_AFFINITY_DOMAIN_L3_CACHE_EXT 0x3 +#define CL_AFFINITY_DOMAIN_L4_CACHE_EXT 0x4 +#define CL_AFFINITY_DOMAIN_NUMA_EXT 0x10 +#define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT 0x100 + +/* cl_device_partition_property_ext - list terminators */ +#define CL_PROPERTIES_LIST_END_EXT ((cl_device_partition_property_ext)0) +#define CL_PARTITION_BY_COUNTS_LIST_END_EXT ((cl_device_partition_property_ext)0) +#define CL_PARTITION_BY_NAMES_LIST_END_EXT ((cl_device_partition_property_ext)0 - 1) + -typedef cl_bitfield cl_mem_migration_flags_ext; +typedef cl_int (CL_API_CALL * +clReleaseDeviceEXT_fn)( + cl_device_id device) CL_API_SUFFIX__VERSION_1_1; -#define CL_MIGRATE_MEM_OBJECT_HOST_EXT 0x1 +typedef cl_int (CL_API_CALL * +clRetainDeviceEXT_fn)( + cl_device_id device) CL_API_SUFFIX__VERSION_1_1; -#define CL_COMMAND_MIGRATE_MEM_OBJECT_EXT 0x4040 +typedef cl_int (CL_API_CALL * +clCreateSubDevicesEXT_fn)( + cl_device_id in_device, + const cl_device_partition_property_ext* properties, + cl_uint num_entries, + cl_device_id* out_devices, + cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_1; + +#if 
!defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueMigrateMemObjectEXT(cl_command_queue command_queue, - cl_uint num_mem_objects, - const cl_mem * mem_objects, - cl_mem_migration_flags_ext flags, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event); - -typedef cl_int -(CL_API_CALL *clEnqueueMigrateMemObjectEXT_fn)(cl_command_queue command_queue, - cl_uint num_mem_objects, - const cl_mem * mem_objects, - cl_mem_migration_flags_ext flags, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event); - - -/********************************* -* cl_ext_cxx_for_opencl extension -*********************************/ -#define cl_ext_cxx_for_opencl 1 +clReleaseDeviceEXT( + cl_device_id device) CL_API_SUFFIX__VERSION_1_1; -#define CL_DEVICE_CXX_FOR_OPENCL_NUMERIC_VERSION_EXT 0x4230 +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainDeviceEXT( + cl_device_id device) CL_API_SUFFIX__VERSION_1_1; -/********************************* -* cl_qcom_ext_host_ptr extension -*********************************/ -#define cl_qcom_ext_host_ptr 1 +extern CL_API_ENTRY cl_int CL_API_CALL +clCreateSubDevicesEXT( + cl_device_id in_device, + const cl_device_partition_property_ext* properties, + cl_uint num_entries, + cl_device_id* out_devices, + cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_1; + +#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ + +/*************************************************************** +* cl_ext_migrate_memobject +***************************************************************/ +#define cl_ext_migrate_memobject 1 +#define CL_EXT_MIGRATE_MEMOBJECT_EXTENSION_NAME \ + "cl_ext_migrate_memobject" + +typedef cl_bitfield cl_mem_migration_flags_ext; + +/* cl_mem_migration_flags_ext */ +#define CL_MIGRATE_MEM_OBJECT_HOST_EXT (1 << 0) + +/* cl_command_type */ +#define CL_COMMAND_MIGRATE_MEM_OBJECT_EXT 0x4040 -#define CL_MEM_EXT_HOST_PTR_QCOM (1 << 29) -#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM 0x40A0 -#define CL_DEVICE_PAGE_SIZE_QCOM 0x40A1 -#define CL_IMAGE_ROW_ALIGNMENT_QCOM 0x40A2 -#define CL_IMAGE_SLICE_ALIGNMENT_QCOM 0x40A3 -#define CL_MEM_HOST_UNCACHED_QCOM 0x40A4 -#define CL_MEM_HOST_WRITEBACK_QCOM 0x40A5 -#define CL_MEM_HOST_WRITETHROUGH_QCOM 0x40A6 -#define CL_MEM_HOST_WRITE_COMBINING_QCOM 0x40A7 +typedef cl_int (CL_API_CALL * +clEnqueueMigrateMemObjectEXT_fn)( + cl_command_queue command_queue, + cl_uint num_mem_objects, + const cl_mem* mem_objects, + cl_mem_migration_flags_ext flags, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) ; -typedef cl_uint cl_image_pitch_info_qcom; +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) extern CL_API_ENTRY cl_int CL_API_CALL -clGetDeviceImageInfoQCOM(cl_device_id device, - size_t image_width, - size_t image_height, - const cl_image_format *image_format, - cl_image_pitch_info_qcom param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret); - -typedef struct _cl_mem_ext_host_ptr -{ - /* Type of external memory allocation. */ - /* Legal values will be defined in layered extensions. */ - cl_uint allocation_type; - - /* Host cache policy for this external memory allocation. 
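The cl_ext_device_fission entry points above predate the core OpenCL 1.2 partitioning API but follow the same shape, with EXT-specific list terminators. A sketch of an equal partition, assuming the device supports CL_DEVICE_PARTITION_EQUALLY_EXT; the helper name is illustrative and the property-list layout should be checked against the extension specification:

#include <CL/cl_ext.h>

/* Partition `parent` so that each sub-device receives `units_per_sub`
 * compute units.  Pass out_count == 0 and out == NULL first to discover
 * how many sub-devices would be produced. */
static cl_int partition_equally(cl_device_id parent,
                                cl_uint units_per_sub,
                                cl_uint out_count,
                                cl_device_id* out,
                                cl_uint* num_created)
{
    const cl_device_partition_property_ext props[] = {
        CL_DEVICE_PARTITION_EQUALLY_EXT,
        (cl_device_partition_property_ext)units_per_sub,
        CL_PROPERTIES_LIST_END_EXT
    };
    return clCreateSubDevicesEXT(parent, props, out_count, out, num_created);
}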
*/ - cl_uint host_cache_policy; +clEnqueueMigrateMemObjectEXT( + cl_command_queue command_queue, + cl_uint num_mem_objects, + const cl_mem* mem_objects, + cl_mem_migration_flags_ext flags, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) ; -} cl_mem_ext_host_ptr; +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ +/*************************************************************** +* cl_ext_cxx_for_opencl +***************************************************************/ +#define cl_ext_cxx_for_opencl 1 +#define CL_EXT_CXX_FOR_OPENCL_EXTENSION_NAME \ + "cl_ext_cxx_for_opencl" -/******************************************* -* cl_qcom_ext_host_ptr_iocoherent extension -********************************************/ +/* cl_device_info */ +#define CL_DEVICE_CXX_FOR_OPENCL_NUMERIC_VERSION_EXT 0x4230 -/* Cache policy specifying io-coherence */ -#define CL_MEM_HOST_IOCOHERENT_QCOM 0x40A9 +/*************************************************************** +* cl_qcom_ext_host_ptr +***************************************************************/ +#define cl_qcom_ext_host_ptr 1 +#define CL_QCOM_EXT_HOST_PTR_EXTENSION_NAME \ + "cl_qcom_ext_host_ptr" +typedef cl_uint cl_image_pitch_info_qcom; +typedef struct _cl_mem_ext_host_ptr { + cl_uint allocation_type; + cl_uint host_cache_policy; +} cl_mem_ext_host_ptr; -/********************************* -* cl_qcom_ion_host_ptr extension -*********************************/ +/* cl_mem_flags */ +#define CL_MEM_EXT_HOST_PTR_QCOM (1 << 29) -#define CL_MEM_ION_HOST_PTR_QCOM 0x40A8 +/* cl_device_info */ +#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM 0x40A0 +#define CL_DEVICE_PAGE_SIZE_QCOM 0x40A1 -typedef struct _cl_mem_ion_host_ptr -{ - /* Type of external memory allocation. */ - /* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. 
*/ - cl_mem_ext_host_ptr ext_host_ptr; +/* cl_image_pitch_info_qcom */ +#define CL_IMAGE_ROW_ALIGNMENT_QCOM 0x40A2 +#define CL_IMAGE_SLICE_ALIGNMENT_QCOM 0x40A3 - /* ION file descriptor */ - int ion_filedesc; +/* cl_uint host_cache_policy */ +#define CL_MEM_HOST_UNCACHED_QCOM 0x40A4 +#define CL_MEM_HOST_WRITEBACK_QCOM 0x40A5 +#define CL_MEM_HOST_WRITETHROUGH_QCOM 0x40A6 +#define CL_MEM_HOST_WRITE_COMBINING_QCOM 0x40A7 - /* Host pointer to the ION allocated memory */ - void* ion_hostptr; -} cl_mem_ion_host_ptr; +typedef cl_int (CL_API_CALL * +clGetDeviceImageInfoQCOM_fn)( + cl_device_id device, + size_t image_width, + size_t image_height, + const cl_image_format* image_format, + cl_image_pitch_info_qcom param_name, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) ; + +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceImageInfoQCOM( + cl_device_id device, + size_t image_width, + size_t image_height, + const cl_image_format* image_format, + cl_image_pitch_info_qcom param_name, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) ; + +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ -/********************************* -* cl_qcom_android_native_buffer_host_ptr extension -*********************************/ +/*************************************************************** +* cl_qcom_ext_host_ptr_iocoherent +***************************************************************/ +#define cl_qcom_ext_host_ptr_iocoherent 1 +#define CL_QCOM_EXT_HOST_PTR_IOCOHERENT_EXTENSION_NAME \ + "cl_qcom_ext_host_ptr_iocoherent" -#define CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM 0x40C6 +/* cl_uint host_cache_policy */ +#define CL_MEM_HOST_IOCOHERENT_QCOM 0x40A9 -typedef struct _cl_mem_android_native_buffer_host_ptr -{ - /* Type of external memory allocation. */ - /* Must be CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM for Android native buffers. 
*/ - cl_mem_ext_host_ptr ext_host_ptr; +/*************************************************************** +* cl_qcom_ion_host_ptr +***************************************************************/ +#define cl_qcom_ion_host_ptr 1 +#define CL_QCOM_ION_HOST_PTR_EXTENSION_NAME \ + "cl_qcom_ion_host_ptr" + +/* type cl_mem_ext_host_ptr */ +typedef struct _cl_mem_ion_host_ptr { + cl_mem_ext_host_ptr ext_host_ptr; + int ion_filedesc; + void* ion_hostptr; +} cl_mem_ion_host_ptr; - /* Virtual pointer to the android native buffer */ - void* anb_ptr; +/* cl_uint allocation_type */ +#define CL_MEM_ION_HOST_PTR_QCOM 0x40A8 +/*************************************************************** +* cl_qcom_android_native_buffer_host_ptr +***************************************************************/ +#define cl_qcom_android_native_buffer_host_ptr 1 +#define CL_QCOM_ANDROID_NATIVE_BUFFER_HOST_PTR_EXTENSION_NAME \ + "cl_qcom_android_native_buffer_host_ptr" + +/* type cl_mem_ext_host_ptr */ +typedef struct _cl_mem_android_native_buffer_host_ptr { + cl_mem_ext_host_ptr ext_host_ptr; + void* anb_ptr; } cl_mem_android_native_buffer_host_ptr; +/* cl_uint allocation_type */ +#define CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM 0x40C6 -/****************************************** - * cl_img_yuv_image extension * - ******************************************/ +/*************************************************************** +* cl_img_yuv_image +***************************************************************/ +#define cl_img_yuv_image 1 +#define CL_IMG_YUV_IMAGE_EXTENSION_NAME \ + "cl_img_yuv_image" + +/* cl_channel_order */ +#define CL_NV21_IMG 0x40D0 +#define CL_YV12_IMG 0x40D1 + +/*************************************************************** +* cl_img_cached_allocations +***************************************************************/ +#define cl_img_cached_allocations 1 +#define CL_IMG_CACHED_ALLOCATIONS_EXTENSION_NAME \ + "cl_img_cached_allocations" -/* Image formats used in clCreateImage */ -#define CL_NV21_IMG 0x40D0 -#define CL_YV12_IMG 0x40D1 +/* cl_mem_flags */ +#define CL_MEM_USE_UNCACHED_CPU_MEMORY_IMG (1 << 26) +#define CL_MEM_USE_CACHED_CPU_MEMORY_IMG (1 << 27) +/*************************************************************** +* cl_img_use_gralloc_ptr +***************************************************************/ +#define cl_img_use_gralloc_ptr 1 +#define CL_IMG_USE_GRALLOC_PTR_EXTENSION_NAME \ + "cl_img_use_gralloc_ptr" -/****************************************** - * cl_img_cached_allocations extension * - ******************************************/ +/* Error codes */ +#define CL_GRALLOC_RESOURCE_NOT_ACQUIRED_IMG 0x40D4 +#define CL_INVALID_GRALLOC_OBJECT_IMG 0x40D5 -/* Flag values used by clCreateBuffer */ -#define CL_MEM_USE_UNCACHED_CPU_MEMORY_IMG (1 << 26) -#define CL_MEM_USE_CACHED_CPU_MEMORY_IMG (1 << 27) +/* cl_mem_flags */ +#define CL_MEM_USE_GRALLOC_PTR_IMG (1 << 28) +/* cl_command_type */ +#define CL_COMMAND_ACQUIRE_GRALLOC_OBJECTS_IMG 0x40D2 +#define CL_COMMAND_RELEASE_GRALLOC_OBJECTS_IMG 0x40D3 -/****************************************** - * cl_img_use_gralloc_ptr extension * - ******************************************/ -#define cl_img_use_gralloc_ptr 1 -/* Flag values used by clCreateBuffer */ -#define CL_MEM_USE_GRALLOC_PTR_IMG (1 << 28) +typedef cl_int (CL_API_CALL * +clEnqueueAcquireGrallocObjectsIMG_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) 
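The QCOM structures above are consumed through the ordinary buffer-creation path: a cl_mem_ext_host_ptr-derived struct is passed as the host_ptr argument together with CL_MEM_EXT_HOST_PTR_QCOM. A sketch for an ION allocation, assuming the fd and host mapping were obtained elsewhere; the flag combination and cache policy are assumptions to verify against the vendor documentation:

#include <CL/cl_ext.h>

/* Wrap an already mapped ION allocation in a cl_mem without copying. */
static cl_mem wrap_ion_allocation(cl_context context,
                                  size_t size,
                                  int ion_fd,
                                  void* ion_host_addr,
                                  cl_int* errcode_ret)
{
    cl_mem_ion_host_ptr ion = { { 0 } };
    ion.ext_host_ptr.allocation_type   = CL_MEM_ION_HOST_PTR_QCOM;
    ion.ext_host_ptr.host_cache_policy = CL_MEM_HOST_WRITEBACK_QCOM;  /* assumed policy */
    ion.ion_filedesc                   = ion_fd;
    ion.ion_hostptr                    = ion_host_addr;

    return clCreateBuffer(context,
                          CL_MEM_USE_HOST_PTR | CL_MEM_EXT_HOST_PTR_QCOM,
                          size, &ion, errcode_ret);
}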
CL_API_SUFFIX__VERSION_1_2; -/* To be used by clGetEventInfo: */ -#define CL_COMMAND_ACQUIRE_GRALLOC_OBJECTS_IMG 0x40D2 -#define CL_COMMAND_RELEASE_GRALLOC_OBJECTS_IMG 0x40D3 +typedef cl_int (CL_API_CALL * +clEnqueueReleaseGrallocObjectsIMG_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; -/* Error codes from clEnqueueAcquireGrallocObjectsIMG and clEnqueueReleaseGrallocObjectsIMG */ -#define CL_GRALLOC_RESOURCE_NOT_ACQUIRED_IMG 0x40D4 -#define CL_INVALID_GRALLOC_OBJECT_IMG 0x40D5 +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueAcquireGrallocObjectsIMG(cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; +clEnqueueAcquireGrallocObjectsIMG( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueReleaseGrallocObjectsIMG(cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; - -/****************************************** - * cl_img_generate_mipmap extension * - ******************************************/ +clEnqueueReleaseGrallocObjectsIMG( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; + +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ + +/*************************************************************** +* cl_img_generate_mipmap +***************************************************************/ #define cl_img_generate_mipmap 1 +#define CL_IMG_GENERATE_MIPMAP_EXTENSION_NAME \ + "cl_img_generate_mipmap" -typedef cl_uint cl_mipmap_filter_mode_img; +typedef cl_uint cl_mipmap_filter_mode_img; -/* To be used by clEnqueueGenerateMipmapIMG */ -#define CL_MIPMAP_FILTER_ANY_IMG 0x0 -#define CL_MIPMAP_FILTER_BOX_IMG 0x1 +/* cl_mipmap_filter_mode_img */ +#define CL_MIPMAP_FILTER_ANY_IMG 0x0 +#define CL_MIPMAP_FILTER_BOX_IMG 0x1 + +/* cl_command_type */ +#define CL_COMMAND_GENERATE_MIPMAP_IMG 0x40D6 -/* To be used by clGetEventInfo */ -#define CL_COMMAND_GENERATE_MIPMAP_IMG 0x40D6 + +typedef cl_int (CL_API_CALL * +clEnqueueGenerateMipmapIMG_fn)( + cl_command_queue command_queue, + cl_mem src_image, + cl_mem dst_image, + cl_mipmap_filter_mode_img mipmap_filter_mode, + const size_t* array_region, + const size_t* mip_region, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; + +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueGenerateMipmapIMG(cl_command_queue command_queue, - cl_mem src_image, - cl_mem dst_image, - cl_mipmap_filter_mode_img mipmap_filter_mode, - const size_t *array_region, - const size_t *mip_region, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_2; - -/****************************************** - * cl_img_mem_properties extension * 
- ******************************************/ -#define cl_img_mem_properties 1 +clEnqueueGenerateMipmapIMG( + cl_command_queue command_queue, + cl_mem src_image, + cl_mem dst_image, + cl_mipmap_filter_mode_img mipmap_filter_mode, + const size_t* array_region, + const size_t* mip_region, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; + +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ -/* To be used by clCreateBufferWithProperties */ -#define CL_MEM_ALLOC_FLAGS_IMG 0x40D7 +/*************************************************************** +* cl_img_mem_properties +***************************************************************/ +#define cl_img_mem_properties 1 +#define CL_IMG_MEM_PROPERTIES_EXTENSION_NAME \ + "cl_img_mem_properties" -/* To be used wiith the CL_MEM_ALLOC_FLAGS_IMG property */ -typedef cl_bitfield cl_mem_alloc_flags_img; +/* cl_mem_properties */ +#define CL_MEM_ALLOC_FLAGS_IMG 0x40D7 -/* To be used with cl_mem_alloc_flags_img */ -#define CL_MEM_ALLOC_RELAX_REQUIREMENTS_IMG (1 << 0) +/* cl_mem_alloc_flags_img */ +#define CL_MEM_ALLOC_RELAX_REQUIREMENTS_IMG (1 << 0) -/********************************* -* cl_khr_subgroups extension -*********************************/ +/*************************************************************** +* cl_khr_subgroups +***************************************************************/ #define cl_khr_subgroups 1 +#define CL_KHR_SUBGROUPS_EXTENSION_NAME \ + "cl_khr_subgroups" #if !defined(CL_VERSION_2_1) -/* For OpenCL 2.1 and newer, cl_kernel_sub_group_info is declared in CL.h. - In hindsight, there should have been a khr suffix on this type for - the extension, but keeping it un-suffixed to maintain backwards - compatibility. 
*/ +/* defined in CL.h for OpenCL 2.1 and newer */ typedef cl_uint cl_kernel_sub_group_info; -#endif + +#endif /* !defined(CL_VERSION_2_1) */ /* cl_kernel_sub_group_info */ -#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR 0x2033 -#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR 0x2034 +#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR 0x2033 +#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR 0x2034 + + +typedef cl_int (CL_API_CALL * +clGetKernelSubGroupInfoKHR_fn)( + cl_kernel in_kernel, + cl_device_id in_device, + cl_kernel_sub_group_info param_name, + size_t input_value_size, + const void* input_value, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_2_0_DEPRECATED; + +#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) extern CL_API_ENTRY cl_int CL_API_CALL -clGetKernelSubGroupInfoKHR(cl_kernel in_kernel, - cl_device_id in_device, - cl_kernel_sub_group_info param_name, - size_t input_value_size, - const void * input_value, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_2_0_DEPRECATED; - -typedef cl_int -(CL_API_CALL * clGetKernelSubGroupInfoKHR_fn)(cl_kernel in_kernel, - cl_device_id in_device, - cl_kernel_sub_group_info param_name, - size_t input_value_size, - const void * input_value, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_2_0_DEPRECATED; - - -/********************************* -* cl_khr_mipmap_image extension -*********************************/ +clGetKernelSubGroupInfoKHR( + cl_kernel in_kernel, + cl_device_id in_device, + cl_kernel_sub_group_info param_name, + size_t input_value_size, + const void* input_value, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_2_0_DEPRECATED; + +#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ -/* cl_sampler_properties */ -#define CL_SAMPLER_MIP_FILTER_MODE_KHR 0x1155 -#define CL_SAMPLER_LOD_MIN_KHR 0x1156 -#define CL_SAMPLER_LOD_MAX_KHR 0x1157 +/*************************************************************** +* cl_khr_mipmap_image +***************************************************************/ +#define cl_khr_mipmap_image 1 +#define CL_KHR_MIPMAP_IMAGE_EXTENSION_NAME \ + "cl_khr_mipmap_image" +/* cl_sampler_properties */ +#define CL_SAMPLER_MIP_FILTER_MODE_KHR 0x1155 +#define CL_SAMPLER_LOD_MIN_KHR 0x1156 +#define CL_SAMPLER_LOD_MAX_KHR 0x1157 -/********************************* -* cl_khr_priority_hints extension -*********************************/ -/* This extension define is for backwards compatibility. - It shouldn't be required since this extension has no new functions. 
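clGetKernelSubGroupInfoKHR takes the proposed local work size as its input value, so the sub-group shape can be inspected before enqueueing. A minimal 1-D sketch, assuming a built kernel and its device; the helper name is illustrative:

#include <CL/cl_ext.h>

/* Maximum sub-group size the kernel would use for a 1-D work-group of
 * `local_size` work-items (returns 0 on failure, for brevity). */
static size_t max_subgroup_size_for(cl_kernel kernel,
                                    cl_device_id device,
                                    size_t local_size)
{
    size_t result = 0;
    clGetKernelSubGroupInfoKHR(kernel, device,
                               CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,
                               sizeof(local_size), &local_size,
                               sizeof(result), &result, NULL);
    return result;
}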
*/ +/*************************************************************** +* cl_khr_priority_hints +***************************************************************/ #define cl_khr_priority_hints 1 +#define CL_KHR_PRIORITY_HINTS_EXTENSION_NAME \ + "cl_khr_priority_hints" -typedef cl_uint cl_queue_priority_khr; +/* To be used by clGetEventInfo */ +typedef cl_uint cl_queue_priority_khr; -/* cl_command_queue_properties */ -#define CL_QUEUE_PRIORITY_KHR 0x1096 +/* cl_queue_properties */ +#define CL_QUEUE_PRIORITY_KHR 0x1096 /* cl_queue_priority_khr */ -#define CL_QUEUE_PRIORITY_HIGH_KHR (1<<0) -#define CL_QUEUE_PRIORITY_MED_KHR (1<<1) -#define CL_QUEUE_PRIORITY_LOW_KHR (1<<2) - +#define CL_QUEUE_PRIORITY_HIGH_KHR (1 << 0) +#define CL_QUEUE_PRIORITY_MED_KHR (1 << 1) +#define CL_QUEUE_PRIORITY_LOW_KHR (1 << 2) -/********************************* -* cl_khr_throttle_hints extension -*********************************/ -/* This extension define is for backwards compatibility. - It shouldn't be required since this extension has no new functions. */ +/*************************************************************** +* cl_khr_throttle_hints +***************************************************************/ #define cl_khr_throttle_hints 1 +#define CL_KHR_THROTTLE_HINTS_EXTENSION_NAME \ + "cl_khr_throttle_hints" -typedef cl_uint cl_queue_throttle_khr; +/* To be used by clGetEventInfo */ +typedef cl_uint cl_queue_throttle_khr; -/* cl_command_queue_properties */ -#define CL_QUEUE_THROTTLE_KHR 0x1097 +/* cl_queue_properties */ +#define CL_QUEUE_THROTTLE_KHR 0x1097 /* cl_queue_throttle_khr */ -#define CL_QUEUE_THROTTLE_HIGH_KHR (1<<0) -#define CL_QUEUE_THROTTLE_MED_KHR (1<<1) -#define CL_QUEUE_THROTTLE_LOW_KHR (1<<2) +#define CL_QUEUE_THROTTLE_HIGH_KHR (1 << 0) +#define CL_QUEUE_THROTTLE_MED_KHR (1 << 1) +#define CL_QUEUE_THROTTLE_LOW_KHR (1 << 2) - -/********************************* +/*************************************************************** * cl_khr_subgroup_named_barrier -*********************************/ -/* This extension define is for backwards compatibility. - It shouldn't be required since this extension has no new functions. 
*/ +***************************************************************/ #define cl_khr_subgroup_named_barrier 1 +#define CL_KHR_SUBGROUP_NAMED_BARRIER_EXTENSION_NAME \ + "cl_khr_subgroup_named_barrier" /* cl_device_info */ -#define CL_DEVICE_MAX_NAMED_BARRIER_COUNT_KHR 0x2035 - +#define CL_DEVICE_MAX_NAMED_BARRIER_COUNT_KHR 0x2035 -/********************************* +/*************************************************************** * cl_khr_extended_versioning -*********************************/ - +***************************************************************/ #define cl_khr_extended_versioning 1 +#define CL_KHR_EXTENDED_VERSIONING_EXTENSION_NAME \ + "cl_khr_extended_versioning" -#define CL_VERSION_MAJOR_BITS_KHR (10) -#define CL_VERSION_MINOR_BITS_KHR (10) -#define CL_VERSION_PATCH_BITS_KHR (12) +#define CL_VERSION_MAJOR_BITS_KHR 10 +#define CL_VERSION_MINOR_BITS_KHR 10 +#define CL_VERSION_PATCH_BITS_KHR 12 #define CL_VERSION_MAJOR_MASK_KHR ((1 << CL_VERSION_MAJOR_BITS_KHR) - 1) #define CL_VERSION_MINOR_MASK_KHR ((1 << CL_VERSION_MINOR_BITS_KHR) - 1) @@ -657,47 +1395,49 @@ typedef cl_uint cl_queue_throttle_khr; (((minor) & CL_VERSION_MINOR_MASK_KHR) << CL_VERSION_PATCH_BITS_KHR) | \ ((patch) & CL_VERSION_PATCH_MASK_KHR)) -typedef cl_uint cl_version_khr; - -#define CL_NAME_VERSION_MAX_NAME_SIZE_KHR 64 +#define CL_NAME_VERSION_MAX_NAME_SIZE_KHR 64 -typedef struct _cl_name_version_khr -{ +typedef cl_uint cl_version_khr; +typedef struct _cl_name_version_khr { cl_version_khr version; char name[CL_NAME_VERSION_MAX_NAME_SIZE_KHR]; } cl_name_version_khr; /* cl_platform_info */ -#define CL_PLATFORM_NUMERIC_VERSION_KHR 0x0906 -#define CL_PLATFORM_EXTENSIONS_WITH_VERSION_KHR 0x0907 +#define CL_PLATFORM_NUMERIC_VERSION_KHR 0x0906 +#define CL_PLATFORM_EXTENSIONS_WITH_VERSION_KHR 0x0907 /* cl_device_info */ -#define CL_DEVICE_NUMERIC_VERSION_KHR 0x105E -#define CL_DEVICE_OPENCL_C_NUMERIC_VERSION_KHR 0x105F -#define CL_DEVICE_EXTENSIONS_WITH_VERSION_KHR 0x1060 -#define CL_DEVICE_ILS_WITH_VERSION_KHR 0x1061 -#define CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION_KHR 0x1062 +#define CL_DEVICE_NUMERIC_VERSION_KHR 0x105E +#define CL_DEVICE_OPENCL_C_NUMERIC_VERSION_KHR 0x105F +#define CL_DEVICE_EXTENSIONS_WITH_VERSION_KHR 0x1060 +#define CL_DEVICE_ILS_WITH_VERSION_KHR 0x1061 +#define CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION_KHR 0x1062 - -/********************************* -* cl_khr_device_uuid extension -*********************************/ +/*************************************************************** +* cl_khr_device_uuid +***************************************************************/ #define cl_khr_device_uuid 1 +#define CL_KHR_DEVICE_UUID_EXTENSION_NAME \ + "cl_khr_device_uuid" -#define CL_UUID_SIZE_KHR 16 -#define CL_LUID_SIZE_KHR 8 - -#define CL_DEVICE_UUID_KHR 0x106A -#define CL_DRIVER_UUID_KHR 0x106B -#define CL_DEVICE_LUID_VALID_KHR 0x106C -#define CL_DEVICE_LUID_KHR 0x106D -#define CL_DEVICE_NODE_MASK_KHR 0x106E +/* Size Constants */ +#define CL_UUID_SIZE_KHR 16 +#define CL_LUID_SIZE_KHR 8 +/* cl_device_info */ +#define CL_DEVICE_UUID_KHR 0x106A +#define CL_DRIVER_UUID_KHR 0x106B +#define CL_DEVICE_LUID_VALID_KHR 0x106C +#define CL_DEVICE_LUID_KHR 0x106D +#define CL_DEVICE_NODE_MASK_KHR 0x106E /*************************************************************** * cl_khr_pci_bus_info ***************************************************************/ #define cl_khr_pci_bus_info 1 +#define CL_KHR_PCI_BUS_INFO_EXTENSION_NAME \ + "cl_khr_pci_bus_info" typedef struct _cl_device_pci_bus_info_khr { cl_uint pci_domain; 
@@ -709,14 +1449,16 @@ typedef struct _cl_device_pci_bus_info_khr { /* cl_device_info */ #define CL_DEVICE_PCI_BUS_INFO_KHR 0x410F - /*************************************************************** * cl_khr_suggested_local_work_size ***************************************************************/ #define cl_khr_suggested_local_work_size 1 +#define CL_KHR_SUGGESTED_LOCAL_WORK_SIZE_EXTENSION_NAME \ + "cl_khr_suggested_local_work_size" -extern CL_API_ENTRY cl_int CL_API_CALL -clGetKernelSuggestedLocalWorkSizeKHR( + +typedef cl_int (CL_API_CALL * +clGetKernelSuggestedLocalWorkSizeKHR_fn)( cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, @@ -724,8 +1466,10 @@ clGetKernelSuggestedLocalWorkSizeKHR( const size_t* global_work_size, size_t* suggested_local_work_size) CL_API_SUFFIX__VERSION_3_0; -typedef cl_int (CL_API_CALL * -clGetKernelSuggestedLocalWorkSizeKHR_fn)( +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelSuggestedLocalWorkSizeKHR( cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, @@ -733,13 +1477,24 @@ clGetKernelSuggestedLocalWorkSizeKHR_fn)( const size_t* global_work_size, size_t* suggested_local_work_size) CL_API_SUFFIX__VERSION_3_0; +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ /*************************************************************** * cl_khr_integer_dot_product ***************************************************************/ #define cl_khr_integer_dot_product 1 +#define CL_KHR_INTEGER_DOT_PRODUCT_EXTENSION_NAME \ + "cl_khr_integer_dot_product" typedef cl_bitfield cl_device_integer_dot_product_capabilities_khr; +typedef struct _cl_device_integer_dot_product_acceleration_properties_khr { + cl_bool signed_accelerated; + cl_bool unsigned_accelerated; + cl_bool mixed_signedness_accelerated; + cl_bool accumulating_saturating_signed_accelerated; + cl_bool accumulating_saturating_unsigned_accelerated; + cl_bool accumulating_saturating_mixed_signedness_accelerated; +} cl_device_integer_dot_product_acceleration_properties_khr; /* cl_device_integer_dot_product_capabilities_khr */ #define CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_PACKED_KHR (1 << 0) @@ -747,508 +1502,1003 @@ typedef cl_bitfield cl_device_integer_dot_product_capabilities_khr; /* cl_device_info */ #define CL_DEVICE_INTEGER_DOT_PRODUCT_CAPABILITIES_KHR 0x1073 +#define CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR 0x1074 +#define CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_4x8BIT_PACKED_KHR 0x1075 + +/*************************************************************** +* cl_khr_external_memory +***************************************************************/ +#define cl_khr_external_memory 1 +#define CL_KHR_EXTERNAL_MEMORY_EXTENSION_NAME \ + "cl_khr_external_memory" +typedef cl_uint cl_external_memory_handle_type_khr; -/********************************** - * cl_arm_import_memory extension * - **********************************/ -#define cl_arm_import_memory 1 +/* cl_platform_info */ +#define CL_PLATFORM_EXTERNAL_MEMORY_IMPORT_HANDLE_TYPES_KHR 0x2044 + +/* cl_device_info */ +#define CL_DEVICE_EXTERNAL_MEMORY_IMPORT_HANDLE_TYPES_KHR 0x204F -typedef intptr_t cl_import_properties_arm; +/* cl_mem_properties */ +#define CL_DEVICE_HANDLE_LIST_KHR 0x2051 +#define CL_DEVICE_HANDLE_LIST_END_KHR 0 -/* Default and valid proporties name for cl_arm_import_memory */ -#define CL_IMPORT_TYPE_ARM 0x40B2 +/* cl_command_type */ +#define CL_COMMAND_ACQUIRE_EXTERNAL_MEM_OBJECTS_KHR 0x2047 +#define 
CL_COMMAND_RELEASE_EXTERNAL_MEM_OBJECTS_KHR 0x2048 -/* Host process memory type default value for CL_IMPORT_TYPE_ARM property */ -#define CL_IMPORT_TYPE_HOST_ARM 0x40B3 -/* DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */ -#define CL_IMPORT_TYPE_DMA_BUF_ARM 0x40B4 +typedef cl_int (CL_API_CALL * +clEnqueueAcquireExternalMemObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_mem_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_3_0; -/* Protected memory property */ -#define CL_IMPORT_TYPE_PROTECTED_ARM 0x40B5 +typedef cl_int (CL_API_CALL * +clEnqueueReleaseExternalMemObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_mem_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_3_0; -/* Android hardware buffer type value for CL_IMPORT_TYPE_ARM property */ -#define CL_IMPORT_TYPE_ANDROID_HARDWARE_BUFFER_ARM 0x41E2 +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) -/* Data consistency with host property */ -#define CL_IMPORT_DMA_BUF_DATA_CONSISTENCY_WITH_HOST_ARM 0x41E3 +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireExternalMemObjectsKHR( + cl_command_queue command_queue, + cl_uint num_mem_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_3_0; -/* Index of plane in a multiplanar hardware buffer */ -#define CL_IMPORT_ANDROID_HARDWARE_BUFFER_PLANE_INDEX_ARM 0x41EF +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseExternalMemObjectsKHR( + cl_command_queue command_queue, + cl_uint num_mem_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_3_0; -/* Index of layer in a multilayer hardware buffer */ -#define CL_IMPORT_ANDROID_HARDWARE_BUFFER_LAYER_INDEX_ARM 0x41F0 +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ -/* Import memory size value to indicate a size for the whole buffer */ -#define CL_IMPORT_MEMORY_WHOLE_ALLOCATION_ARM SIZE_MAX +/*************************************************************** +* cl_khr_external_memory_dma_buf +***************************************************************/ +#define cl_khr_external_memory_dma_buf 1 +#define CL_KHR_EXTERNAL_MEMORY_DMA_BUF_EXTENSION_NAME \ + "cl_khr_external_memory_dma_buf" + +/* cl_external_memory_handle_type_khr */ +#define CL_EXTERNAL_MEMORY_HANDLE_DMA_BUF_KHR 0x2067 + +/*************************************************************** +* cl_khr_external_memory_dx +***************************************************************/ +#define cl_khr_external_memory_dx 1 +#define CL_KHR_EXTERNAL_MEMORY_DX_EXTENSION_NAME \ + "cl_khr_external_memory_dx" + +/* cl_external_memory_handle_type_khr */ +#define CL_EXTERNAL_MEMORY_HANDLE_D3D11_TEXTURE_KHR 0x2063 +#define CL_EXTERNAL_MEMORY_HANDLE_D3D11_TEXTURE_KMT_KHR 0x2064 +#define CL_EXTERNAL_MEMORY_HANDLE_D3D12_HEAP_KHR 0x2065 +#define CL_EXTERNAL_MEMORY_HANDLE_D3D12_RESOURCE_KHR 0x2066 + +/*************************************************************** +* cl_khr_external_memory_opaque_fd +***************************************************************/ +#define cl_khr_external_memory_opaque_fd 1 +#define CL_KHR_EXTERNAL_MEMORY_OPAQUE_FD_EXTENSION_NAME \ + "cl_khr_external_memory_opaque_fd" + +/* cl_external_memory_handle_type_khr */ 
+#define CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_FD_KHR 0x2060 + +/*************************************************************** +* cl_khr_external_memory_win32 +***************************************************************/ +#define cl_khr_external_memory_win32 1 +#define CL_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME \ + "cl_khr_external_memory_win32" + +/* cl_external_memory_handle_type_khr */ +#define CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KHR 0x2061 +#define CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KMT_KHR 0x2062 + +/*************************************************************** +* cl_khr_external_semaphore +***************************************************************/ +#define cl_khr_external_semaphore 1 +#define CL_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME \ + "cl_khr_external_semaphore" + +typedef struct _cl_semaphore_khr * cl_semaphore_khr; +typedef cl_uint cl_external_semaphore_handle_type_khr; + +/* cl_platform_info */ +#define CL_PLATFORM_SEMAPHORE_IMPORT_HANDLE_TYPES_KHR 0x2037 +#define CL_PLATFORM_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR 0x2038 + +/* cl_device_info */ +#define CL_DEVICE_SEMAPHORE_IMPORT_HANDLE_TYPES_KHR 0x204D +#define CL_DEVICE_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR 0x204E + +/* cl_semaphore_properties_khr */ +#define CL_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR 0x203F +#define CL_SEMAPHORE_EXPORT_HANDLE_TYPES_LIST_END_KHR 0 + + +typedef cl_int (CL_API_CALL * +clGetSemaphoreHandleForTypeKHR_fn)( + cl_semaphore_khr sema_object, + cl_device_id device, + cl_external_semaphore_handle_type_khr handle_type, + size_t handle_size, + void* handle_ptr, + size_t* handle_size_ret) CL_API_SUFFIX__VERSION_1_2; + +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSemaphoreHandleForTypeKHR( + cl_semaphore_khr sema_object, + cl_device_id device, + cl_external_semaphore_handle_type_khr handle_type, + size_t handle_size, + void* handle_ptr, + size_t* handle_size_ret) CL_API_SUFFIX__VERSION_1_2; + +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ + +/*************************************************************** +* cl_khr_external_semaphore_dx_fence +***************************************************************/ +#define cl_khr_external_semaphore_dx_fence 1 +#define CL_KHR_EXTERNAL_SEMAPHORE_DX_FENCE_EXTENSION_NAME \ + "cl_khr_external_semaphore_dx_fence" + +/* cl_external_semaphore_handle_type_khr */ +#define CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR 0x2059 + +/*************************************************************** +* cl_khr_external_semaphore_opaque_fd +***************************************************************/ +#define cl_khr_external_semaphore_opaque_fd 1 +#define CL_KHR_EXTERNAL_SEMAPHORE_OPAQUE_FD_EXTENSION_NAME \ + "cl_khr_external_semaphore_opaque_fd" + +/* cl_external_semaphore_handle_type_khr */ +#define CL_SEMAPHORE_HANDLE_OPAQUE_FD_KHR 0x2055 + +/*************************************************************** +* cl_khr_external_semaphore_sync_fd +***************************************************************/ +#define cl_khr_external_semaphore_sync_fd 1 +#define CL_KHR_EXTERNAL_SEMAPHORE_SYNC_FD_EXTENSION_NAME \ + "cl_khr_external_semaphore_sync_fd" + +/* cl_external_semaphore_handle_type_khr */ +#define CL_SEMAPHORE_HANDLE_SYNC_FD_KHR 0x2058 + +/*************************************************************** +* cl_khr_external_semaphore_win32 +***************************************************************/ +#define cl_khr_external_semaphore_win32 1 +#define CL_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME \ 
+ "cl_khr_external_semaphore_win32" + +/* cl_external_semaphore_handle_type_khr */ +#define CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KHR 0x2056 +#define CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KMT_KHR 0x2057 + +/*************************************************************** +* cl_khr_semaphore +***************************************************************/ +#define cl_khr_semaphore 1 +#define CL_KHR_SEMAPHORE_EXTENSION_NAME \ + "cl_khr_semaphore" + +/* type cl_semaphore_khr */ +typedef cl_properties cl_semaphore_properties_khr; +typedef cl_uint cl_semaphore_info_khr; +typedef cl_uint cl_semaphore_type_khr; +typedef cl_ulong cl_semaphore_payload_khr; + +/* cl_semaphore_type */ +#define CL_SEMAPHORE_TYPE_BINARY_KHR 1 + +/* cl_platform_info */ +#define CL_PLATFORM_SEMAPHORE_TYPES_KHR 0x2036 + +/* cl_device_info */ +#define CL_DEVICE_SEMAPHORE_TYPES_KHR 0x204C + +/* cl_semaphore_info_khr */ +#define CL_SEMAPHORE_CONTEXT_KHR 0x2039 +#define CL_SEMAPHORE_REFERENCE_COUNT_KHR 0x203A +#define CL_SEMAPHORE_PROPERTIES_KHR 0x203B +#define CL_SEMAPHORE_PAYLOAD_KHR 0x203C + +/* cl_semaphore_info_khr or cl_semaphore_properties_khr */ +#define CL_SEMAPHORE_TYPE_KHR 0x203D +/* enum CL_DEVICE_HANDLE_LIST_KHR */ +/* enum CL_DEVICE_HANDLE_LIST_END_KHR */ + +/* cl_command_type */ +#define CL_COMMAND_SEMAPHORE_WAIT_KHR 0x2042 +#define CL_COMMAND_SEMAPHORE_SIGNAL_KHR 0x2043 + +/* Error codes */ +#define CL_INVALID_SEMAPHORE_KHR -1142 + + +typedef cl_semaphore_khr (CL_API_CALL * +clCreateSemaphoreWithPropertiesKHR_fn)( + cl_context context, + const cl_semaphore_properties_khr* sema_props, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_int (CL_API_CALL * +clEnqueueWaitSemaphoresKHR_fn)( + cl_command_queue command_queue, + cl_uint num_sema_objects, + const cl_semaphore_khr* sema_objects, + const cl_semaphore_payload_khr* sema_payload_list, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_int (CL_API_CALL * +clEnqueueSignalSemaphoresKHR_fn)( + cl_command_queue command_queue, + cl_uint num_sema_objects, + const cl_semaphore_khr* sema_objects, + const cl_semaphore_payload_khr* sema_payload_list, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_int (CL_API_CALL * +clGetSemaphoreInfoKHR_fn)( + cl_semaphore_khr sema_object, + cl_semaphore_info_khr param_name, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_int (CL_API_CALL * +clReleaseSemaphoreKHR_fn)( + cl_semaphore_khr sema_object) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_int (CL_API_CALL * +clRetainSemaphoreKHR_fn)( + cl_semaphore_khr sema_object) CL_API_SUFFIX__VERSION_1_2; + +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) + +extern CL_API_ENTRY cl_semaphore_khr CL_API_CALL +clCreateSemaphoreWithPropertiesKHR( + cl_context context, + const cl_semaphore_properties_khr* sema_props, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWaitSemaphoresKHR( + cl_command_queue command_queue, + cl_uint num_sema_objects, + const cl_semaphore_khr* sema_objects, + const cl_semaphore_payload_khr* sema_payload_list, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSignalSemaphoresKHR( + cl_command_queue command_queue, + cl_uint num_sema_objects, + const 
cl_semaphore_khr* sema_objects, + const cl_semaphore_payload_khr* sema_payload_list, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSemaphoreInfoKHR( + cl_semaphore_khr sema_object, + cl_semaphore_info_khr param_name, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseSemaphoreKHR( + cl_semaphore_khr sema_object) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainSemaphoreKHR( + cl_semaphore_khr sema_object) CL_API_SUFFIX__VERSION_1_2; + +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ + +/*************************************************************** +* cl_arm_import_memory +***************************************************************/ +#define cl_arm_import_memory 1 +#define CL_ARM_IMPORT_MEMORY_EXTENSION_NAME \ + "cl_arm_import_memory" + +typedef intptr_t cl_import_properties_arm; + +/* cl_import_properties_arm */ +#define CL_IMPORT_TYPE_ARM 0x40B2 +#define CL_IMPORT_TYPE_HOST_ARM 0x40B3 +#define CL_IMPORT_TYPE_DMA_BUF_ARM 0x40B4 +#define CL_IMPORT_TYPE_PROTECTED_ARM 0x40B5 +#define CL_IMPORT_TYPE_ANDROID_HARDWARE_BUFFER_ARM 0x41E2 +#define CL_IMPORT_DMA_BUF_DATA_CONSISTENCY_WITH_HOST_ARM 0x41E3 +#define CL_IMPORT_MEMORY_WHOLE_ALLOCATION_ARM SIZE_MAX +#define CL_IMPORT_ANDROID_HARDWARE_BUFFER_PLANE_INDEX_ARM 0x41EF +#define CL_IMPORT_ANDROID_HARDWARE_BUFFER_LAYER_INDEX_ARM 0x41F0 + + +typedef cl_mem (CL_API_CALL * +clImportMemoryARM_fn)( + cl_context context, + cl_mem_flags flags, + const cl_import_properties_arm* properties, + void* memory, + size_t size, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) -/* This extension adds a new function that allows for direct memory import into - * OpenCL via the clImportMemoryARM function. - * - * Memory imported through this interface will be mapped into the device's page - * tables directly, providing zero copy access. It will never fall back to copy - * operations and aliased buffers. - * - * Types of memory supported for import are specified as additional extension - * strings. - * - * This extension produces cl_mem allocations which are compatible with all other - * users of cl_mem in the standard API. - * - * This extension maps pages with the same properties as the normal buffer creation - * function clCreateBuffer. 
- */ extern CL_API_ENTRY cl_mem CL_API_CALL -clImportMemoryARM( cl_context context, - cl_mem_flags flags, - const cl_import_properties_arm *properties, - void *memory, - size_t size, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; +clImportMemoryARM( + cl_context context, + cl_mem_flags flags, + const cl_import_properties_arm* properties, + void* memory, + size_t size, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0; +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ -/****************************************** - * cl_arm_shared_virtual_memory extension * - ******************************************/ +/*************************************************************** +* cl_arm_shared_virtual_memory +***************************************************************/ #define cl_arm_shared_virtual_memory 1 +#define CL_ARM_SHARED_VIRTUAL_MEMORY_EXTENSION_NAME \ + "cl_arm_shared_virtual_memory" + +typedef cl_bitfield cl_svm_mem_flags_arm; +typedef cl_uint cl_kernel_exec_info_arm; +typedef cl_bitfield cl_device_svm_capabilities_arm; + +/* cl_device_info */ +#define CL_DEVICE_SVM_CAPABILITIES_ARM 0x40B6 + +/* cl_mem_info */ +#define CL_MEM_USES_SVM_POINTER_ARM 0x40B7 + +/* cl_kernel_exec_info_arm */ +#define CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM 0x40B8 +#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_ARM 0x40B9 + +/* cl_command_type */ +#define CL_COMMAND_SVM_FREE_ARM 0x40BA +#define CL_COMMAND_SVM_MEMCPY_ARM 0x40BB +#define CL_COMMAND_SVM_MEMFILL_ARM 0x40BC +#define CL_COMMAND_SVM_MAP_ARM 0x40BD +#define CL_COMMAND_SVM_UNMAP_ARM 0x40BE + +/* cl_device_svm_capabilities_arm */ +#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_ARM (1 << 0) +#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER_ARM (1 << 1) +#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM_ARM (1 << 2) +#define CL_DEVICE_SVM_ATOMICS_ARM (1 << 3) + +/* cl_svm_mem_flags_arm */ +#define CL_MEM_SVM_FINE_GRAIN_BUFFER_ARM (1 << 10) +#define CL_MEM_SVM_ATOMICS_ARM (1 << 11) + -/* Used by clGetDeviceInfo */ -#define CL_DEVICE_SVM_CAPABILITIES_ARM 0x40B6 +typedef void* (CL_API_CALL * +clSVMAllocARM_fn)( + cl_context context, + cl_svm_mem_flags_arm flags, + size_t size, + cl_uint alignment) CL_API_SUFFIX__VERSION_1_2; + +typedef void (CL_API_CALL * +clSVMFreeARM_fn)( + cl_context context, + void* svm_pointer) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_int (CL_API_CALL * +clEnqueueSVMFreeARM_fn)( + cl_command_queue command_queue, + cl_uint num_svm_pointers, + void* svm_pointers[], + void (CL_CALLBACK* pfn_free_func)(cl_command_queue queue, cl_uint num_svm_pointers, void * svm_pointers[], void *user_data), + void* user_data, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_int (CL_API_CALL * +clEnqueueSVMMemcpyARM_fn)( + cl_command_queue command_queue, + cl_bool blocking_copy, + void* dst_ptr, + const void* src_ptr, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; -/* Used by clGetMemObjectInfo */ -#define CL_MEM_USES_SVM_POINTER_ARM 0x40B7 +typedef cl_int (CL_API_CALL * +clEnqueueSVMMemFillARM_fn)( + cl_command_queue command_queue, + void* svm_ptr, + const void* pattern, + size_t pattern_size, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; -/* Used by clSetKernelExecInfoARM: */ -#define CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM 0x40B8 -#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_ARM 0x40B9 +typedef cl_int 
(CL_API_CALL * +clEnqueueSVMMapARM_fn)( + cl_command_queue command_queue, + cl_bool blocking_map, + cl_map_flags flags, + void* svm_ptr, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; -/* To be used by clGetEventInfo: */ -#define CL_COMMAND_SVM_FREE_ARM 0x40BA -#define CL_COMMAND_SVM_MEMCPY_ARM 0x40BB -#define CL_COMMAND_SVM_MEMFILL_ARM 0x40BC -#define CL_COMMAND_SVM_MAP_ARM 0x40BD -#define CL_COMMAND_SVM_UNMAP_ARM 0x40BE +typedef cl_int (CL_API_CALL * +clEnqueueSVMUnmapARM_fn)( + cl_command_queue command_queue, + void* svm_ptr, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; -/* Flag values returned by clGetDeviceInfo with CL_DEVICE_SVM_CAPABILITIES_ARM as the param_name. */ -#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_ARM (1 << 0) -#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER_ARM (1 << 1) -#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM_ARM (1 << 2) -#define CL_DEVICE_SVM_ATOMICS_ARM (1 << 3) +typedef cl_int (CL_API_CALL * +clSetKernelArgSVMPointerARM_fn)( + cl_kernel kernel, + cl_uint arg_index, + const void* arg_value) CL_API_SUFFIX__VERSION_1_2; -/* Flag values used by clSVMAllocARM: */ -#define CL_MEM_SVM_FINE_GRAIN_BUFFER_ARM (1 << 10) -#define CL_MEM_SVM_ATOMICS_ARM (1 << 11) +typedef cl_int (CL_API_CALL * +clSetKernelExecInfoARM_fn)( + cl_kernel kernel, + cl_kernel_exec_info_arm param_name, + size_t param_value_size, + const void* param_value) CL_API_SUFFIX__VERSION_1_2; -typedef cl_bitfield cl_svm_mem_flags_arm; -typedef cl_uint cl_kernel_exec_info_arm; -typedef cl_bitfield cl_device_svm_capabilities_arm; +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) -extern CL_API_ENTRY void * CL_API_CALL -clSVMAllocARM(cl_context context, - cl_svm_mem_flags_arm flags, - size_t size, - cl_uint alignment) CL_API_SUFFIX__VERSION_1_2; +extern CL_API_ENTRY void* CL_API_CALL +clSVMAllocARM( + cl_context context, + cl_svm_mem_flags_arm flags, + size_t size, + cl_uint alignment) CL_API_SUFFIX__VERSION_1_2; extern CL_API_ENTRY void CL_API_CALL -clSVMFreeARM(cl_context context, - void * svm_pointer) CL_API_SUFFIX__VERSION_1_2; +clSVMFreeARM( + cl_context context, + void* svm_pointer) CL_API_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueSVMFreeARM(cl_command_queue command_queue, - cl_uint num_svm_pointers, - void * svm_pointers[], - void (CL_CALLBACK * pfn_free_func)(cl_command_queue queue, - cl_uint num_svm_pointers, - void * svm_pointers[], - void * user_data), - void * user_data, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; +clEnqueueSVMFreeARM( + cl_command_queue command_queue, + cl_uint num_svm_pointers, + void* svm_pointers[], + void (CL_CALLBACK* pfn_free_func)(cl_command_queue queue, cl_uint num_svm_pointers, void * svm_pointers[], void *user_data), + void* user_data, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueSVMMemcpyARM(cl_command_queue command_queue, - cl_bool blocking_copy, - void * dst_ptr, - const void * src_ptr, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; +clEnqueueSVMMemcpyARM( + cl_command_queue command_queue, + cl_bool blocking_copy, + void* dst_ptr, + const void* src_ptr, + size_t size, + cl_uint num_events_in_wait_list, + const 
cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueSVMMemFillARM(cl_command_queue command_queue, - void * svm_ptr, - const void * pattern, - size_t pattern_size, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; +clEnqueueSVMMemFillARM( + cl_command_queue command_queue, + void* svm_ptr, + const void* pattern, + size_t pattern_size, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueSVMMapARM(cl_command_queue command_queue, - cl_bool blocking_map, - cl_map_flags flags, - void * svm_ptr, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; +clEnqueueSVMMapARM( + cl_command_queue command_queue, + cl_bool blocking_map, + cl_map_flags flags, + void* svm_ptr, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueSVMUnmapARM(cl_command_queue command_queue, - void * svm_ptr, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_2; +clEnqueueSVMUnmapARM( + cl_command_queue command_queue, + void* svm_ptr, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL -clSetKernelArgSVMPointerARM(cl_kernel kernel, - cl_uint arg_index, - const void * arg_value) CL_API_SUFFIX__VERSION_1_2; +clSetKernelArgSVMPointerARM( + cl_kernel kernel, + cl_uint arg_index, + const void* arg_value) CL_API_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL -clSetKernelExecInfoARM(cl_kernel kernel, - cl_kernel_exec_info_arm param_name, - size_t param_value_size, - const void * param_value) CL_API_SUFFIX__VERSION_1_2; +clSetKernelExecInfoARM( + cl_kernel kernel, + cl_kernel_exec_info_arm param_name, + size_t param_value_size, + const void* param_value) CL_API_SUFFIX__VERSION_1_2; -/******************************** - * cl_arm_get_core_id extension * - ********************************/ +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ -#ifdef CL_VERSION_1_2 +/*************************************************************** +* cl_arm_get_core_id +***************************************************************/ +#if defined(CL_VERSION_1_2) #define cl_arm_get_core_id 1 +#define CL_ARM_GET_CORE_ID_EXTENSION_NAME \ + "cl_arm_get_core_id" -/* Device info property for bitfield of cores present */ -#define CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM 0x40BF +/* cl_device_info */ +#define CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM 0x40BF -#endif /* CL_VERSION_1_2 */ +#endif /* defined(CL_VERSION_1_2) */ -/********************************* +/*************************************************************** * cl_arm_job_slot_selection -*********************************/ - +***************************************************************/ #define cl_arm_job_slot_selection 1 +#define CL_ARM_JOB_SLOT_SELECTION_EXTENSION_NAME \ + "cl_arm_job_slot_selection" /* cl_device_info */ -#define CL_DEVICE_JOB_SLOTS_ARM 0x41E0 +#define CL_DEVICE_JOB_SLOTS_ARM 0x41E0 -/* cl_command_queue_properties */ -#define CL_QUEUE_JOB_SLOT_ARM 0x41E1 +/* cl_queue_properties */ +#define CL_QUEUE_JOB_SLOT_ARM 0x41E1 
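/*
 * Editor's illustrative sketch -- not part of the patch above. It shows one
 * plausible way to combine the cl_arm_job_slot_selection tokens declared in
 * this hunk with the standard clGetDeviceInfo and
 * clCreateCommandQueueWithProperties entry points. The cl_uint result type
 * assumed for CL_DEVICE_JOB_SLOTS_ARM, the choice of slot index, and the
 * helper name are assumptions; error handling is omitted for brevity.
 */
static cl_command_queue example_create_queue_on_job_slot(cl_context context,
                                                         cl_device_id device)
{
    cl_uint num_slots = 0; /* assumed to be reported as a cl_uint count */
    clGetDeviceInfo(device, CL_DEVICE_JOB_SLOTS_ARM,
                    sizeof(num_slots), &num_slots, NULL);

    /* Request the last reported slot (or slot 0) via the queue property. */
    cl_queue_properties props[] = {
        CL_QUEUE_JOB_SLOT_ARM, (num_slots > 0) ? (num_slots - 1) : 0,
        0
    };
    return clCreateCommandQueueWithProperties(context, device, props, NULL);
}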
-/********************************* +/*************************************************************** * cl_arm_scheduling_controls -*********************************/ - +***************************************************************/ #define cl_arm_scheduling_controls 1 +#define CL_ARM_SCHEDULING_CONTROLS_EXTENSION_NAME \ + "cl_arm_scheduling_controls" -typedef cl_bitfield cl_device_scheduling_controls_capabilities_arm; - -/* cl_device_info */ -#define CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM 0x41E4 +/* Types */ +typedef cl_bitfield cl_device_scheduling_controls_capabilities_arm; -#define CL_DEVICE_SCHEDULING_KERNEL_BATCHING_ARM (1 << 0) -#define CL_DEVICE_SCHEDULING_WORKGROUP_BATCH_SIZE_ARM (1 << 1) +/* cl_device_scheduling_controls_capabilities_arm */ +#define CL_DEVICE_SCHEDULING_KERNEL_BATCHING_ARM (1 << 0) +#define CL_DEVICE_SCHEDULING_WORKGROUP_BATCH_SIZE_ARM (1 << 1) #define CL_DEVICE_SCHEDULING_WORKGROUP_BATCH_SIZE_MODIFIER_ARM (1 << 2) -#define CL_DEVICE_SCHEDULING_DEFERRED_FLUSH_ARM (1 << 3) -#define CL_DEVICE_SCHEDULING_REGISTER_ALLOCATION_ARM (1 << 4) +#define CL_DEVICE_SCHEDULING_DEFERRED_FLUSH_ARM (1 << 3) +#define CL_DEVICE_SCHEDULING_REGISTER_ALLOCATION_ARM (1 << 4) +#define CL_DEVICE_SCHEDULING_WARP_THROTTLING_ARM (1 << 5) +#define CL_DEVICE_SCHEDULING_COMPUTE_UNIT_BATCH_QUEUE_SIZE_ARM (1 << 6) + +/* cl_device_info */ +#define CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM 0x41E4 +#define CL_DEVICE_SUPPORTED_REGISTER_ALLOCATIONS_ARM 0x41EB +#define CL_DEVICE_MAX_WARP_COUNT_ARM 0x41EA -#define CL_DEVICE_SUPPORTED_REGISTER_ALLOCATIONS_ARM 0x41EB +/* cl_kernel_exec_info */ +#define CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_ARM 0x41E5 +#define CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM 0x41E6 +#define CL_KERNEL_EXEC_INFO_WARP_COUNT_LIMIT_ARM 0x41E8 +#define CL_KERNEL_EXEC_INFO_COMPUTE_UNIT_MAX_QUEUED_BATCHES_ARM 0x41F1 /* cl_kernel_info */ -#define CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_ARM 0x41E5 -#define CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM 0x41E6 +#define CL_KERNEL_MAX_WARP_COUNT_ARM 0x41E9 /* cl_queue_properties */ -#define CL_QUEUE_KERNEL_BATCHING_ARM 0x41E7 -#define CL_QUEUE_DEFERRED_FLUSH_ARM 0x41EC +#define CL_QUEUE_KERNEL_BATCHING_ARM 0x41E7 +#define CL_QUEUE_DEFERRED_FLUSH_ARM 0x41EC -/************************************** +/*************************************************************** * cl_arm_controlled_kernel_termination -***************************************/ - +***************************************************************/ #define cl_arm_controlled_kernel_termination 1 +#define CL_ARM_CONTROLLED_KERNEL_TERMINATION_EXTENSION_NAME \ + "cl_arm_controlled_kernel_termination" -/* Error code to indicate kernel terminated with failure */ -#define CL_COMMAND_TERMINATED_ITSELF_WITH_FAILURE_ARM -1108 +/* Types */ +typedef cl_bitfield cl_device_controlled_termination_capabilities_arm; -/* cl_device_info */ -#define CL_DEVICE_CONTROLLED_TERMINATION_CAPABILITIES_ARM 0x41EE +/* Error codes */ +#define CL_COMMAND_TERMINATED_ITSELF_WITH_FAILURE_ARM -1108 -/* Bit fields for controlled termination feature query */ -typedef cl_bitfield cl_device_controlled_termination_capabilities_arm; +/* cl_device_controlled_termination_capabilities_arm */ +#define CL_DEVICE_CONTROLLED_TERMINATION_SUCCESS_ARM (1 << 0) +#define CL_DEVICE_CONTROLLED_TERMINATION_FAILURE_ARM (1 << 1) +#define CL_DEVICE_CONTROLLED_TERMINATION_QUERY_ARM (1 << 2) -#define CL_DEVICE_CONTROLLED_TERMINATION_SUCCESS_ARM (1 << 0) -#define 
CL_DEVICE_CONTROLLED_TERMINATION_FAILURE_ARM (1 << 1) -#define CL_DEVICE_CONTROLLED_TERMINATION_QUERY_ARM (1 << 2) +/* cl_device_info */ +#define CL_DEVICE_CONTROLLED_TERMINATION_CAPABILITIES_ARM 0x41EE /* cl_event_info */ -#define CL_EVENT_COMMAND_TERMINATION_REASON_ARM 0x41ED +#define CL_EVENT_COMMAND_TERMINATION_REASON_ARM 0x41ED + +/* cl_command_termination_reason_arm */ +#define CL_COMMAND_TERMINATION_COMPLETION_ARM 0 +#define CL_COMMAND_TERMINATION_CONTROLLED_SUCCESS_ARM 1 +#define CL_COMMAND_TERMINATION_CONTROLLED_FAILURE_ARM 2 +#define CL_COMMAND_TERMINATION_ERROR_ARM 3 -/* Values returned for event termination reason query */ -typedef cl_uint cl_command_termination_reason_arm; +/*************************************************************** +* cl_arm_protected_memory_allocation +***************************************************************/ +#define cl_arm_protected_memory_allocation 1 +#define CL_ARM_PROTECTED_MEMORY_ALLOCATION_EXTENSION_NAME \ + "cl_arm_protected_memory_allocation" -#define CL_COMMAND_TERMINATION_COMPLETION_ARM 0 -#define CL_COMMAND_TERMINATION_CONTROLLED_SUCCESS_ARM 1 -#define CL_COMMAND_TERMINATION_CONTROLLED_FAILURE_ARM 2 -#define CL_COMMAND_TERMINATION_ERROR_ARM 3 +#define CL_MEM_PROTECTED_ALLOC_ARM ((cl_bitfield)1 << 36) + +/*************************************************************** +* cl_intel_exec_by_local_thread +***************************************************************/ +#define cl_intel_exec_by_local_thread 1 +#define CL_INTEL_EXEC_BY_LOCAL_THREAD_EXTENSION_NAME \ + "cl_intel_exec_by_local_thread" -/*************************************** -* cl_intel_thread_local_exec extension * -****************************************/ +/* cl_command_queue_properties - bitfield */ +#define CL_QUEUE_THREAD_LOCAL_EXEC_ENABLE_INTEL ((cl_bitfield)1 << 31) -#define cl_intel_thread_local_exec 1 +/*************************************************************** +* cl_intel_device_attribute_query +***************************************************************/ +#define cl_intel_device_attribute_query 1 +#define CL_INTEL_DEVICE_ATTRIBUTE_QUERY_EXTENSION_NAME \ + "cl_intel_device_attribute_query" -#define CL_QUEUE_THREAD_LOCAL_EXEC_ENABLE_INTEL (((cl_bitfield)1) << 31) +typedef cl_bitfield cl_device_feature_capabilities_intel; -/*********************************************** -* cl_intel_device_partition_by_names extension * -************************************************/ +/* cl_device_feature_capabilities_intel */ +#define CL_DEVICE_FEATURE_FLAG_DP4A_INTEL (1 << 0) +#define CL_DEVICE_FEATURE_FLAG_DPAS_INTEL (1 << 1) -#define cl_intel_device_partition_by_names 1 +/* cl_device_info */ +#define CL_DEVICE_IP_VERSION_INTEL 0x4250 +#define CL_DEVICE_ID_INTEL 0x4251 +#define CL_DEVICE_NUM_SLICES_INTEL 0x4252 +#define CL_DEVICE_NUM_SUB_SLICES_PER_SLICE_INTEL 0x4253 +#define CL_DEVICE_NUM_EUS_PER_SUB_SLICE_INTEL 0x4254 +#define CL_DEVICE_NUM_THREADS_PER_EU_INTEL 0x4255 +#define CL_DEVICE_FEATURE_CAPABILITIES_INTEL 0x4256 -#define CL_DEVICE_PARTITION_BY_NAMES_INTEL 0x4052 -#define CL_PARTITION_BY_NAMES_LIST_END_INTEL -1 +/*************************************************************** +* cl_intel_device_partition_by_names +***************************************************************/ +#define cl_intel_device_partition_by_names 1 +#define CL_INTEL_DEVICE_PARTITION_BY_NAMES_EXTENSION_NAME \ + "cl_intel_device_partition_by_names" -/************************************************ -* cl_intel_accelerator extension * -* cl_intel_motion_estimation extension * -* 
cl_intel_advanced_motion_estimation extension * -*************************************************/ +#define CL_DEVICE_PARTITION_BY_NAMES_INTEL 0x4052 +#define CL_PARTITION_BY_NAMES_LIST_END_INTEL -1 +/*************************************************************** +* cl_intel_accelerator +***************************************************************/ #define cl_intel_accelerator 1 -#define cl_intel_motion_estimation 1 -#define cl_intel_advanced_motion_estimation 1 +#define CL_INTEL_ACCELERATOR_EXTENSION_NAME \ + "cl_intel_accelerator" typedef struct _cl_accelerator_intel* cl_accelerator_intel; -typedef cl_uint cl_accelerator_type_intel; -typedef cl_uint cl_accelerator_info_intel; +typedef cl_uint cl_accelerator_type_intel; +typedef cl_uint cl_accelerator_info_intel; -typedef struct _cl_motion_estimation_desc_intel { - cl_uint mb_block_type; - cl_uint subpixel_mode; - cl_uint sad_adjust_mode; - cl_uint search_path_type; -} cl_motion_estimation_desc_intel; +/* cl_accelerator_info_intel */ +#define CL_ACCELERATOR_DESCRIPTOR_INTEL 0x4090 +#define CL_ACCELERATOR_REFERENCE_COUNT_INTEL 0x4091 +#define CL_ACCELERATOR_CONTEXT_INTEL 0x4092 +#define CL_ACCELERATOR_TYPE_INTEL 0x4093 + +/* Error codes */ +#define CL_INVALID_ACCELERATOR_INTEL -1094 +#define CL_INVALID_ACCELERATOR_TYPE_INTEL -1095 +#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL -1096 +#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL -1097 + + +typedef cl_accelerator_intel (CL_API_CALL * +clCreateAcceleratorINTEL_fn)( + cl_context context, + cl_accelerator_type_intel accelerator_type, + size_t descriptor_size, + const void* descriptor, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_int (CL_API_CALL * +clGetAcceleratorInfoINTEL_fn)( + cl_accelerator_intel accelerator, + cl_accelerator_info_intel param_name, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_2; -/* error codes */ -#define CL_INVALID_ACCELERATOR_INTEL -1094 -#define CL_INVALID_ACCELERATOR_TYPE_INTEL -1095 -#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL -1096 -#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL -1097 +typedef cl_int (CL_API_CALL * +clRetainAcceleratorINTEL_fn)( + cl_accelerator_intel accelerator) CL_API_SUFFIX__VERSION_1_2; -/* cl_accelerator_type_intel */ -#define CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL 0x0 +typedef cl_int (CL_API_CALL * +clReleaseAcceleratorINTEL_fn)( + cl_accelerator_intel accelerator) CL_API_SUFFIX__VERSION_1_2; -/* cl_accelerator_info_intel */ -#define CL_ACCELERATOR_DESCRIPTOR_INTEL 0x4090 -#define CL_ACCELERATOR_REFERENCE_COUNT_INTEL 0x4091 -#define CL_ACCELERATOR_CONTEXT_INTEL 0x4092 -#define CL_ACCELERATOR_TYPE_INTEL 0x4093 - -/* cl_motion_detect_desc_intel flags */ -#define CL_ME_MB_TYPE_16x16_INTEL 0x0 -#define CL_ME_MB_TYPE_8x8_INTEL 0x1 -#define CL_ME_MB_TYPE_4x4_INTEL 0x2 - -#define CL_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0 -#define CL_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1 -#define CL_ME_SUBPIXEL_MODE_QPEL_INTEL 0x2 - -#define CL_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0 -#define CL_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x1 - -#define CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL 0x0 -#define CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL 0x1 -#define CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL 0x5 - -#define CL_ME_SKIP_BLOCK_TYPE_16x16_INTEL 0x0 -#define CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL 0x1 -#define CL_ME_LUMA_INTRA_PREDICT_ENABLED_INTEL 0x2 -#define CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL 0x4 - -#define CL_ME_FORWARD_INPUT_MODE_INTEL 0x1 -#define CL_ME_BACKWARD_INPUT_MODE_INTEL 0x2 -#define 
CL_ME_BIDIRECTION_INPUT_MODE_INTEL 0x3 - -#define CL_ME_BIDIR_WEIGHT_QUARTER_INTEL 16 -#define CL_ME_BIDIR_WEIGHT_THIRD_INTEL 21 -#define CL_ME_BIDIR_WEIGHT_HALF_INTEL 32 -#define CL_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 43 -#define CL_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 48 - -#define CL_ME_COST_PENALTY_NONE_INTEL 0x0 -#define CL_ME_COST_PENALTY_LOW_INTEL 0x1 -#define CL_ME_COST_PENALTY_NORMAL_INTEL 0x2 -#define CL_ME_COST_PENALTY_HIGH_INTEL 0x3 - -#define CL_ME_COST_PRECISION_QPEL_INTEL 0x0 -#define CL_ME_COST_PRECISION_HPEL_INTEL 0x1 -#define CL_ME_COST_PRECISION_PEL_INTEL 0x2 -#define CL_ME_COST_PRECISION_DPEL_INTEL 0x3 - -#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0 -#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 -#define CL_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2 -#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3 - -#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4 -#define CL_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4 -#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5 -#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6 -#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7 -#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8 - -#define CL_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0 -#define CL_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 -#define CL_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2 -#define CL_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3 +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) -/* cl_device_info */ -#define CL_DEVICE_ME_VERSION_INTEL 0x407E +extern CL_API_ENTRY cl_accelerator_intel CL_API_CALL +clCreateAcceleratorINTEL( + cl_context context, + cl_accelerator_type_intel accelerator_type, + size_t descriptor_size, + const void* descriptor, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; -#define CL_ME_VERSION_LEGACY_INTEL 0x0 -#define CL_ME_VERSION_ADVANCED_VER_1_INTEL 0x1 -#define CL_ME_VERSION_ADVANCED_VER_2_INTEL 0x2 +extern CL_API_ENTRY cl_int CL_API_CALL +clGetAcceleratorInfoINTEL( + cl_accelerator_intel accelerator, + cl_accelerator_info_intel param_name, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainAcceleratorINTEL( + cl_accelerator_intel accelerator) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseAcceleratorINTEL( + cl_accelerator_intel accelerator) CL_API_SUFFIX__VERSION_1_2; + +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ + +/*************************************************************** +* cl_intel_motion_estimation +***************************************************************/ +#define cl_intel_motion_estimation 1 +#define CL_INTEL_MOTION_ESTIMATION_EXTENSION_NAME \ + "cl_intel_motion_estimation" + +typedef struct _cl_motion_estimation_desc_intel { + cl_uint mb_block_type; + cl_uint subpixel_mode; + cl_uint sad_adjust_mode; + cl_uint search_path_type; +} cl_motion_estimation_desc_intel; -extern CL_API_ENTRY cl_accelerator_intel CL_API_CALL -clCreateAcceleratorINTEL( - cl_context context, - cl_accelerator_type_intel accelerator_type, - size_t descriptor_size, - const void* descriptor, - cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_accelerator_intel (CL_API_CALL *clCreateAcceleratorINTEL_fn)( - cl_context context, - cl_accelerator_type_intel accelerator_type, - size_t descriptor_size, - const void* descriptor, - cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; +/* cl_accelerator_type_intel */ +#define 
CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL 0x0 -extern CL_API_ENTRY cl_int CL_API_CALL -clGetAcceleratorInfoINTEL( - cl_accelerator_intel accelerator, - cl_accelerator_info_intel param_name, - size_t param_value_size, - void* param_value, - size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_int (CL_API_CALL *clGetAcceleratorInfoINTEL_fn)( - cl_accelerator_intel accelerator, - cl_accelerator_info_intel param_name, - size_t param_value_size, - void* param_value, - size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_2; +/* cl_uint mb_block_type */ +#define CL_ME_MB_TYPE_16x16_INTEL 0x0 +#define CL_ME_MB_TYPE_8x8_INTEL 0x1 +#define CL_ME_MB_TYPE_4x4_INTEL 0x2 -extern CL_API_ENTRY cl_int CL_API_CALL -clRetainAcceleratorINTEL( - cl_accelerator_intel accelerator) CL_API_SUFFIX__VERSION_1_2; +/* cl_uint subpixel_mode */ +#define CL_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0 +#define CL_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1 +#define CL_ME_SUBPIXEL_MODE_QPEL_INTEL 0x2 -typedef cl_int (CL_API_CALL *clRetainAcceleratorINTEL_fn)( - cl_accelerator_intel accelerator) CL_API_SUFFIX__VERSION_1_2; +/* cl_uint sad_adjust_mode */ +#define CL_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0 +#define CL_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x1 -extern CL_API_ENTRY cl_int CL_API_CALL -clReleaseAcceleratorINTEL( - cl_accelerator_intel accelerator) CL_API_SUFFIX__VERSION_1_2; +/* cl_uint search_path_type */ +#define CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL 0x0 +#define CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL 0x1 +#define CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL 0x5 -typedef cl_int (CL_API_CALL *clReleaseAcceleratorINTEL_fn)( - cl_accelerator_intel accelerator) CL_API_SUFFIX__VERSION_1_2; +/*************************************************************** +* cl_intel_advanced_motion_estimation +***************************************************************/ +#define cl_intel_advanced_motion_estimation 1 +#define CL_INTEL_ADVANCED_MOTION_ESTIMATION_EXTENSION_NAME \ + "cl_intel_advanced_motion_estimation" -/****************************************** -* cl_intel_simultaneous_sharing extension * -*******************************************/ +/* cl_device_info */ +#define CL_DEVICE_ME_VERSION_INTEL 0x407E + +#define CL_ME_VERSION_LEGACY_INTEL 0x0 +#define CL_ME_VERSION_ADVANCED_VER_1_INTEL 0x1 +#define CL_ME_VERSION_ADVANCED_VER_2_INTEL 0x2 + +#define CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL 0x1 +#define CL_ME_LUMA_INTRA_PREDICT_ENABLED_INTEL 0x2 + +#define CL_ME_SKIP_BLOCK_TYPE_16x16_INTEL 0x0 +#define CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL 0x1 + +#define CL_ME_COST_PENALTY_NONE_INTEL 0x0 +#define CL_ME_COST_PENALTY_LOW_INTEL 0x1 +#define CL_ME_COST_PENALTY_NORMAL_INTEL 0x2 +#define CL_ME_COST_PENALTY_HIGH_INTEL 0x3 + +#define CL_ME_COST_PRECISION_QPEL_INTEL 0x0 +#define CL_ME_COST_PRECISION_HPEL_INTEL 0x1 +#define CL_ME_COST_PRECISION_PEL_INTEL 0x2 +#define CL_ME_COST_PRECISION_DPEL_INTEL 0x3 + +#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0 +#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 +#define CL_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2 +#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3 +#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4 +#define CL_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4 +#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5 +#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6 +#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7 +#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8 + +#define CL_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0 +#define 
CL_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 +#define CL_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2 +#define CL_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3 + +#define CL_ME_FORWARD_INPUT_MODE_INTEL 0x1 +#define CL_ME_BACKWARD_INPUT_MODE_INTEL 0x2 +#define CL_ME_BIDIRECTION_INPUT_MODE_INTEL 0x3 + +#define CL_ME_BIDIR_WEIGHT_QUARTER_INTEL 16 +#define CL_ME_BIDIR_WEIGHT_THIRD_INTEL 21 +#define CL_ME_BIDIR_WEIGHT_HALF_INTEL 32 +#define CL_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 43 +#define CL_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 48 +/*************************************************************** +* cl_intel_simultaneous_sharing +***************************************************************/ #define cl_intel_simultaneous_sharing 1 +#define CL_INTEL_SIMULTANEOUS_SHARING_EXTENSION_NAME \ + "cl_intel_simultaneous_sharing" -#define CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL 0x4104 -#define CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL 0x4105 - -/*********************************** -* cl_intel_egl_image_yuv extension * -************************************/ +/* cl_device_info */ +#define CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL 0x4104 +#define CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL 0x4105 +/*************************************************************** +* cl_intel_egl_image_yuv +***************************************************************/ #define cl_intel_egl_image_yuv 1 +#define CL_INTEL_EGL_IMAGE_YUV_EXTENSION_NAME \ + "cl_intel_egl_image_yuv" -#define CL_EGL_YUV_PLANE_INTEL 0x4107 - -/******************************** -* cl_intel_packed_yuv extension * -*********************************/ +/* cl_egl_image_properties_khr */ +#define CL_EGL_YUV_PLANE_INTEL 0x4107 +/*************************************************************** +* cl_intel_packed_yuv +***************************************************************/ #define cl_intel_packed_yuv 1 +#define CL_INTEL_PACKED_YUV_EXTENSION_NAME \ + "cl_intel_packed_yuv" -#define CL_YUYV_INTEL 0x4076 -#define CL_UYVY_INTEL 0x4077 -#define CL_YVYU_INTEL 0x4078 -#define CL_VYUY_INTEL 0x4079 - -/******************************************** -* cl_intel_required_subgroup_size extension * -*********************************************/ +/* cl_channel_order */ +#define CL_YUYV_INTEL 0x4076 +#define CL_UYVY_INTEL 0x4077 +#define CL_YVYU_INTEL 0x4078 +#define CL_VYUY_INTEL 0x4079 +/*************************************************************** +* cl_intel_required_subgroup_size +***************************************************************/ #define cl_intel_required_subgroup_size 1 +#define CL_INTEL_REQUIRED_SUBGROUP_SIZE_EXTENSION_NAME \ + "cl_intel_required_subgroup_size" -#define CL_DEVICE_SUB_GROUP_SIZES_INTEL 0x4108 -#define CL_KERNEL_SPILL_MEM_SIZE_INTEL 0x4109 -#define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL 0x410A +/* cl_device_info */ +#define CL_DEVICE_SUB_GROUP_SIZES_INTEL 0x4108 -/**************************************** -* cl_intel_driver_diagnostics extension * -*****************************************/ +/* cl_kernel_work_group_info */ +#define CL_KERNEL_SPILL_MEM_SIZE_INTEL 0x4109 -#define cl_intel_driver_diagnostics 1 +/* cl_kernel_sub_group_info */ +#define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL 0x410A -typedef cl_uint cl_diagnostics_verbose_level; +/*************************************************************** +* cl_intel_driver_diagnostics +***************************************************************/ +#define cl_intel_driver_diagnostics 1 +#define CL_INTEL_DRIVER_DIAGNOSTICS_EXTENSION_NAME \ + "cl_intel_driver_diagnostics" -#define 
CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106 +typedef cl_uint cl_diagnostics_verbose_level; -#define CL_CONTEXT_DIAGNOSTICS_LEVEL_ALL_INTEL ( 0xff ) -#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL ( 1 ) -#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL ( 1 << 1 ) -#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL ( 1 << 2 ) +/* cl_context_properties */ +#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106 +#define CL_CONTEXT_DIAGNOSTICS_LEVEL_ALL_INTEL 0xff +#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL (1 << 0) +#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL (1 << 1) +#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL (1 << 2) -/******************************** -* cl_intel_planar_yuv extension * -*********************************/ +/*************************************************************** +* cl_intel_planar_yuv +***************************************************************/ +#define cl_intel_planar_yuv 1 +#define CL_INTEL_PLANAR_YUV_EXTENSION_NAME \ + "cl_intel_planar_yuv" +/* cl_channel_order */ #define CL_NV12_INTEL 0x410E -#define CL_MEM_NO_ACCESS_INTEL ( 1 << 24 ) -#define CL_MEM_ACCESS_FLAGS_UNRESTRICTED_INTEL ( 1 << 25 ) +/* cl_mem_flags */ +#define CL_MEM_NO_ACCESS_INTEL (1 << 24) +#define CL_MEM_ACCESS_FLAGS_UNRESTRICTED_INTEL (1 << 25) +/* cl_device_info */ #define CL_DEVICE_PLANAR_YUV_MAX_WIDTH_INTEL 0x417E #define CL_DEVICE_PLANAR_YUV_MAX_HEIGHT_INTEL 0x417F -/******************************************************* -* cl_intel_device_side_avc_motion_estimation extension * -********************************************************/ +/*************************************************************** +* cl_intel_device_side_avc_motion_estimation +***************************************************************/ +#define cl_intel_device_side_avc_motion_estimation 1 +#define CL_INTEL_DEVICE_SIDE_AVC_MOTION_ESTIMATION_EXTENSION_NAME \ + "cl_intel_device_side_avc_motion_estimation" +/* cl_device_info */ #define CL_DEVICE_AVC_ME_VERSION_INTEL 0x410B #define CL_DEVICE_AVC_ME_SUPPORTS_TEXTURE_SAMPLER_USE_INTEL 0x410C #define CL_DEVICE_AVC_ME_SUPPORTS_PREEMPTION_INTEL 0x410D -#define CL_AVC_ME_VERSION_0_INTEL 0x0 /* No support. */ -#define CL_AVC_ME_VERSION_1_INTEL 0x1 /* First supported version. 
*/ +/* returned by CL_DEVICE_AVC_ME_VERSION_INTEL */ +#define CL_AVC_ME_VERSION_0_INTEL 0x0 +#define CL_AVC_ME_VERSION_1_INTEL 0x1 +/* Inter macro-block major shape values */ #define CL_AVC_ME_MAJOR_16x16_INTEL 0x0 #define CL_AVC_ME_MAJOR_16x8_INTEL 0x1 #define CL_AVC_ME_MAJOR_8x16_INTEL 0x2 #define CL_AVC_ME_MAJOR_8x8_INTEL 0x3 +/* Inter macro-block minor shape values */ #define CL_AVC_ME_MINOR_8x8_INTEL 0x0 #define CL_AVC_ME_MINOR_8x4_INTEL 0x1 #define CL_AVC_ME_MINOR_4x8_INTEL 0x2 #define CL_AVC_ME_MINOR_4x4_INTEL 0x3 +/* Inter macro-block major direction values */ #define CL_AVC_ME_MAJOR_FORWARD_INTEL 0x0 #define CL_AVC_ME_MAJOR_BACKWARD_INTEL 0x1 #define CL_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL 0x2 +/* Inter (IME) partition mask values */ #define CL_AVC_ME_PARTITION_MASK_ALL_INTEL 0x0 #define CL_AVC_ME_PARTITION_MASK_16x16_INTEL 0x7E #define CL_AVC_ME_PARTITION_MASK_16x8_INTEL 0x7D @@ -1258,6 +2508,7 @@ typedef cl_uint cl_diagnostics_verbose_level; #define CL_AVC_ME_PARTITION_MASK_4x8_INTEL 0x5F #define CL_AVC_ME_PARTITION_MASK_4x4_INTEL 0x3F +/* Search window configuration */ #define CL_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL 0x0 #define CL_AVC_ME_SEARCH_WINDOW_SMALL_INTEL 0x1 #define CL_AVC_ME_SEARCH_WINDOW_TINY_INTEL 0x2 @@ -1271,143 +2522,152 @@ typedef cl_uint cl_diagnostics_verbose_level; #define CL_AVC_ME_SEARCH_WINDOW_4x4_RADIUS_INTEL 0x2 #define CL_AVC_ME_SEARCH_WINDOW_2x2_RADIUS_INTEL 0xa +/* SAD adjustment mode */ #define CL_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0 #define CL_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x2 +/* Pixel resolution */ #define CL_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0 #define CL_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1 #define CL_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL 0x3 +/* Cost precision values */ #define CL_AVC_ME_COST_PRECISION_QPEL_INTEL 0x0 #define CL_AVC_ME_COST_PRECISION_HPEL_INTEL 0x1 #define CL_AVC_ME_COST_PRECISION_PEL_INTEL 0x2 #define CL_AVC_ME_COST_PRECISION_DPEL_INTEL 0x3 +/* Inter bidirectional weights */ #define CL_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL 0x10 #define CL_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL 0x15 #define CL_AVC_ME_BIDIR_WEIGHT_HALF_INTEL 0x20 #define CL_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 0x2B #define CL_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 0x30 +/* Inter border reached values */ #define CL_AVC_ME_BORDER_REACHED_LEFT_INTEL 0x0 #define CL_AVC_ME_BORDER_REACHED_RIGHT_INTEL 0x2 #define CL_AVC_ME_BORDER_REACHED_TOP_INTEL 0x4 #define CL_AVC_ME_BORDER_REACHED_BOTTOM_INTEL 0x8 +/* Inter skip block partition type */ #define CL_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL 0x0 #define CL_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL 0x4000 -#define CL_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL ( 0x1 << 24 ) -#define CL_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL ( 0x2 << 24 ) -#define CL_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL ( 0x3 << 24 ) -#define CL_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL ( 0x55 << 24 ) -#define CL_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL ( 0xAA << 24 ) -#define CL_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL ( 0xFF << 24 ) -#define CL_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL ( 0x1 << 24 ) -#define CL_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL ( 0x2 << 24 ) -#define CL_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL ( 0x1 << 26 ) -#define CL_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL ( 0x2 << 26 ) -#define CL_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL ( 0x1 << 28 ) -#define CL_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL ( 0x2 << 28 ) -#define CL_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL ( 0x1 << 30 ) -#define 
CL_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL ( 0x2 << 30 ) - +/* Inter skip motion vector mask */ +#define CL_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL (0x1 << 24) +#define CL_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL (0x2 << 24) +#define CL_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL (0x3 << 24) +#define CL_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL (0x55 << 24) +#define CL_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL (0xAA << 24) +#define CL_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL (0xFF << 24) +#define CL_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL (0x1 << 24) +#define CL_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL (0x2 << 24) +#define CL_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL (0x1 << 26) +#define CL_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL (0x2 << 26) +#define CL_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL (0x1 << 28) +#define CL_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL (0x2 << 28) +#define CL_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL (0x1 << 30) +#define CL_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL (0x2 << 30) + +/* Block based skip type values */ #define CL_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL 0x00 #define CL_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL 0x80 +/* cl_intel_device_side_avc_motion_estimation.?? */ #define CL_AVC_ME_INTRA_16x16_INTEL 0x0 #define CL_AVC_ME_INTRA_8x8_INTEL 0x1 #define CL_AVC_ME_INTRA_4x4_INTEL 0x2 +/* Luma intra partition mask values */ #define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL 0x6 #define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL 0x5 #define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL 0x3 -#define CL_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL 0x60 -#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL 0x10 -#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8 -#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x4 - -#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0 -#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 -#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2 -#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3 +/* Intra neighbor availability mask values */ +#define CL_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL 0x60 +#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL 0x10 +#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8 +#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x4 + +/* Luma intra modes */ +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3 #define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4 -#define CL_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4 -#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5 -#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6 -#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7 -#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8 -#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0 -#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 -#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2 -#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3 - +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7 +#define 
CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8 + +/* Chroma intra modes */ +#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0 +#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 +#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2 +#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3 + +/* Reference image select values */ #define CL_AVC_ME_FRAME_FORWARD_INTEL 0x1 #define CL_AVC_ME_FRAME_BACKWARD_INTEL 0x2 #define CL_AVC_ME_FRAME_DUAL_INTEL 0x3 +/* Slice type values */ #define CL_AVC_ME_SLICE_TYPE_PRED_INTEL 0x0 #define CL_AVC_ME_SLICE_TYPE_BPRED_INTEL 0x1 #define CL_AVC_ME_SLICE_TYPE_INTRA_INTEL 0x2 +/* Interlaced image field polarity values */ #define CL_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL 0x0 #define CL_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL 0x1 -/******************************************* -* cl_intel_unified_shared_memory extension * -********************************************/ - -/* These APIs are in sync with Revision Q of the cl_intel_unified_shared_memory spec! */ - +/*************************************************************** +* cl_intel_unified_shared_memory +***************************************************************/ #define cl_intel_unified_shared_memory 1 +#define CL_INTEL_UNIFIED_SHARED_MEMORY_EXTENSION_NAME \ + "cl_intel_unified_shared_memory" + +typedef cl_bitfield cl_device_unified_shared_memory_capabilities_intel; +typedef cl_properties cl_mem_properties_intel; +typedef cl_bitfield cl_mem_alloc_flags_intel; +typedef cl_uint cl_mem_info_intel; +typedef cl_uint cl_unified_shared_memory_type_intel; +typedef cl_uint cl_mem_advice_intel; /* cl_device_info */ -#define CL_DEVICE_HOST_MEM_CAPABILITIES_INTEL 0x4190 -#define CL_DEVICE_DEVICE_MEM_CAPABILITIES_INTEL 0x4191 -#define CL_DEVICE_SINGLE_DEVICE_SHARED_MEM_CAPABILITIES_INTEL 0x4192 -#define CL_DEVICE_CROSS_DEVICE_SHARED_MEM_CAPABILITIES_INTEL 0x4193 -#define CL_DEVICE_SHARED_SYSTEM_MEM_CAPABILITIES_INTEL 0x4194 - -typedef cl_bitfield cl_device_unified_shared_memory_capabilities_intel; - -/* cl_device_unified_shared_memory_capabilities_intel - bitfield */ -#define CL_UNIFIED_SHARED_MEMORY_ACCESS_INTEL (1 << 0) -#define CL_UNIFIED_SHARED_MEMORY_ATOMIC_ACCESS_INTEL (1 << 1) -#define CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ACCESS_INTEL (1 << 2) +#define CL_DEVICE_HOST_MEM_CAPABILITIES_INTEL 0x4190 +#define CL_DEVICE_DEVICE_MEM_CAPABILITIES_INTEL 0x4191 +#define CL_DEVICE_SINGLE_DEVICE_SHARED_MEM_CAPABILITIES_INTEL 0x4192 +#define CL_DEVICE_CROSS_DEVICE_SHARED_MEM_CAPABILITIES_INTEL 0x4193 +#define CL_DEVICE_SHARED_SYSTEM_MEM_CAPABILITIES_INTEL 0x4194 + +/* cl_unified_shared_memory_capabilities_intel - bitfield */ +#define CL_UNIFIED_SHARED_MEMORY_ACCESS_INTEL (1 << 0) +#define CL_UNIFIED_SHARED_MEMORY_ATOMIC_ACCESS_INTEL (1 << 1) +#define CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ACCESS_INTEL (1 << 2) #define CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ATOMIC_ACCESS_INTEL (1 << 3) -typedef cl_properties cl_mem_properties_intel; - /* cl_mem_properties_intel */ -#define CL_MEM_ALLOC_FLAGS_INTEL 0x4195 - -typedef cl_bitfield cl_mem_alloc_flags_intel; +#define CL_MEM_ALLOC_FLAGS_INTEL 0x4195 /* cl_mem_alloc_flags_intel - bitfield */ -#define CL_MEM_ALLOC_WRITE_COMBINED_INTEL (1 << 0) - -typedef cl_uint cl_mem_info_intel; +#define CL_MEM_ALLOC_WRITE_COMBINED_INTEL (1 << 0) +#define CL_MEM_ALLOC_INITIAL_PLACEMENT_DEVICE_INTEL (1 << 1) +#define CL_MEM_ALLOC_INITIAL_PLACEMENT_HOST_INTEL (1 << 2) /* cl_mem_alloc_info_intel */ -#define CL_MEM_ALLOC_TYPE_INTEL 0x419A -#define CL_MEM_ALLOC_BASE_PTR_INTEL 0x419B 
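/*
 * --- Illustrative usage sketch (not part of the header or this patch) ---
 * A minimal, hedged example of driving the cl_intel_unified_shared_memory
 * entry points declared in this section: allocate a shared USM buffer,
 * query its allocation type, and free it. It assumes `platform`, `context`
 * and `device` handles were created elsewhere and that the platform really
 * reports the extension; the helper name `usm_alloc_type_demo` is made up
 * purely for illustration.
 */
#define CL_TARGET_OPENCL_VERSION 300
#include <stdio.h>
#include <CL/cl_ext.h>

static cl_int usm_alloc_type_demo(cl_platform_id platform,
                                  cl_context context,
                                  cl_device_id device)
{
    /* USM entry points are extension APIs, so fetch them per platform. */
    clSharedMemAllocINTEL_fn pSharedMemAlloc = (clSharedMemAllocINTEL_fn)
        clGetExtensionFunctionAddressForPlatform(platform, "clSharedMemAllocINTEL");
    clGetMemAllocInfoINTEL_fn pGetMemAllocInfo = (clGetMemAllocInfoINTEL_fn)
        clGetExtensionFunctionAddressForPlatform(platform, "clGetMemAllocInfoINTEL");
    clMemFreeINTEL_fn pMemFree = (clMemFreeINTEL_fn)
        clGetExtensionFunctionAddressForPlatform(platform, "clMemFreeINTEL");
    if (!pSharedMemAlloc || !pGetMemAllocInfo || !pMemFree)
        return CL_INVALID_OPERATION;   /* extension not exposed by this platform */

    cl_int err = CL_SUCCESS;
    void *ptr = pSharedMemAlloc(context, device,
                                NULL /* default properties */,
                                1024 /* bytes */,
                                0    /* default alignment */,
                                &err);
    if (err != CL_SUCCESS)
        return err;

    /* CL_MEM_ALLOC_TYPE_INTEL should report CL_MEM_TYPE_SHARED_INTEL here. */
    cl_unified_shared_memory_type_intel type = CL_MEM_TYPE_UNKNOWN_INTEL;
    err = pGetMemAllocInfo(context, ptr, CL_MEM_ALLOC_TYPE_INTEL,
                           sizeof(type), &type, NULL);
    if (err == CL_SUCCESS)
        printf("USM allocation type: 0x%x\n", (unsigned)type);

    return pMemFree(context, ptr);
}
/* Design note: using the *_fn pointer typedefs (rather than the extern
 * prototypes) keeps the sketch valid even when an application builds with
 * CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES defined, since the loader is
 * not required to export these extension symbols directly. */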
-#define CL_MEM_ALLOC_SIZE_INTEL 0x419C -#define CL_MEM_ALLOC_DEVICE_INTEL 0x419D -/* Enum values 0x419E-0x419F are reserved for future queries. */ - -typedef cl_uint cl_unified_shared_memory_type_intel; +#define CL_MEM_ALLOC_TYPE_INTEL 0x419A +#define CL_MEM_ALLOC_BASE_PTR_INTEL 0x419B +#define CL_MEM_ALLOC_SIZE_INTEL 0x419C +#define CL_MEM_ALLOC_DEVICE_INTEL 0x419D /* cl_unified_shared_memory_type_intel */ -#define CL_MEM_TYPE_UNKNOWN_INTEL 0x4196 -#define CL_MEM_TYPE_HOST_INTEL 0x4197 -#define CL_MEM_TYPE_DEVICE_INTEL 0x4198 -#define CL_MEM_TYPE_SHARED_INTEL 0x4199 - -typedef cl_uint cl_mem_advice_intel; - -/* cl_mem_advice_intel */ -/* Enum values 0x4208-0x420F are reserved for future memory advices. */ +#define CL_MEM_TYPE_UNKNOWN_INTEL 0x4196 +#define CL_MEM_TYPE_HOST_INTEL 0x4197 +#define CL_MEM_TYPE_DEVICE_INTEL 0x4198 +#define CL_MEM_TYPE_SHARED_INTEL 0x4199 /* cl_kernel_exec_info */ #define CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL 0x4200 @@ -1416,259 +2676,366 @@ typedef cl_uint cl_mem_advice_intel; #define CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL 0x4203 /* cl_command_type */ -#define CL_COMMAND_MEMFILL_INTEL 0x4204 -#define CL_COMMAND_MEMCPY_INTEL 0x4205 -#define CL_COMMAND_MIGRATEMEM_INTEL 0x4206 -#define CL_COMMAND_MEMADVISE_INTEL 0x4207 +#define CL_COMMAND_MEMFILL_INTEL 0x4204 +#define CL_COMMAND_MEMCPY_INTEL 0x4205 +#define CL_COMMAND_MIGRATEMEM_INTEL 0x4206 +#define CL_COMMAND_MEMADVISE_INTEL 0x4207 -extern CL_API_ENTRY void* CL_API_CALL -clHostMemAllocINTEL( - cl_context context, - const cl_mem_properties_intel* properties, - size_t size, - cl_uint alignment, - cl_int* errcode_ret); typedef void* (CL_API_CALL * clHostMemAllocINTEL_fn)( - cl_context context, - const cl_mem_properties_intel* properties, - size_t size, - cl_uint alignment, - cl_int* errcode_ret); - -extern CL_API_ENTRY void* CL_API_CALL -clDeviceMemAllocINTEL( - cl_context context, - cl_device_id device, - const cl_mem_properties_intel* properties, - size_t size, - cl_uint alignment, - cl_int* errcode_ret); + cl_context context, + const cl_mem_properties_intel* properties, + size_t size, + cl_uint alignment, + cl_int* errcode_ret) ; typedef void* (CL_API_CALL * clDeviceMemAllocINTEL_fn)( - cl_context context, - cl_device_id device, - const cl_mem_properties_intel* properties, - size_t size, - cl_uint alignment, - cl_int* errcode_ret); - -extern CL_API_ENTRY void* CL_API_CALL -clSharedMemAllocINTEL( - cl_context context, - cl_device_id device, - const cl_mem_properties_intel* properties, - size_t size, - cl_uint alignment, - cl_int* errcode_ret); + cl_context context, + cl_device_id device, + const cl_mem_properties_intel* properties, + size_t size, + cl_uint alignment, + cl_int* errcode_ret) ; typedef void* (CL_API_CALL * clSharedMemAllocINTEL_fn)( - cl_context context, - cl_device_id device, - const cl_mem_properties_intel* properties, - size_t size, - cl_uint alignment, - cl_int* errcode_ret); - -extern CL_API_ENTRY cl_int CL_API_CALL -clMemFreeINTEL( - cl_context context, - void* ptr); + cl_context context, + cl_device_id device, + const cl_mem_properties_intel* properties, + size_t size, + cl_uint alignment, + cl_int* errcode_ret) ; typedef cl_int (CL_API_CALL * clMemFreeINTEL_fn)( - cl_context context, - void* ptr); - -extern CL_API_ENTRY cl_int CL_API_CALL -clMemBlockingFreeINTEL( - cl_context context, - void* ptr); + cl_context context, + void* ptr) ; typedef cl_int (CL_API_CALL * clMemBlockingFreeINTEL_fn)( - cl_context context, - void* ptr); - -extern CL_API_ENTRY cl_int CL_API_CALL 
-clGetMemAllocInfoINTEL( - cl_context context, - const void* ptr, - cl_mem_info_intel param_name, - size_t param_value_size, - void* param_value, - size_t* param_value_size_ret); + cl_context context, + void* ptr) ; typedef cl_int (CL_API_CALL * clGetMemAllocInfoINTEL_fn)( - cl_context context, - const void* ptr, - cl_mem_info_intel param_name, - size_t param_value_size, - void* param_value, - size_t* param_value_size_ret); - -extern CL_API_ENTRY cl_int CL_API_CALL -clSetKernelArgMemPointerINTEL( - cl_kernel kernel, - cl_uint arg_index, - const void* arg_value); + cl_context context, + const void* ptr, + cl_mem_info_intel param_name, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) ; typedef cl_int (CL_API_CALL * clSetKernelArgMemPointerINTEL_fn)( - cl_kernel kernel, - cl_uint arg_index, - const void* arg_value); + cl_kernel kernel, + cl_uint arg_index, + const void* arg_value) ; -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueMemsetINTEL( /* Deprecated */ - cl_command_queue command_queue, - void* dst_ptr, - cl_int value, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); +typedef cl_int (CL_API_CALL * +clEnqueueMemFillINTEL_fn)( + cl_command_queue command_queue, + void* dst_ptr, + const void* pattern, + size_t pattern_size, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) ; + +typedef cl_int (CL_API_CALL * +clEnqueueMemcpyINTEL_fn)( + cl_command_queue command_queue, + cl_bool blocking, + void* dst_ptr, + const void* src_ptr, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) ; typedef cl_int (CL_API_CALL * -clEnqueueMemsetINTEL_fn)( /* Deprecated */ - cl_command_queue command_queue, - void* dst_ptr, - cl_int value, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); +clEnqueueMemAdviseINTEL_fn)( + cl_command_queue command_queue, + const void* ptr, + size_t size, + cl_mem_advice_intel advice, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) ; + +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) + +extern CL_API_ENTRY void* CL_API_CALL +clHostMemAllocINTEL( + cl_context context, + const cl_mem_properties_intel* properties, + size_t size, + cl_uint alignment, + cl_int* errcode_ret) ; + +extern CL_API_ENTRY void* CL_API_CALL +clDeviceMemAllocINTEL( + cl_context context, + cl_device_id device, + const cl_mem_properties_intel* properties, + size_t size, + cl_uint alignment, + cl_int* errcode_ret) ; + +extern CL_API_ENTRY void* CL_API_CALL +clSharedMemAllocINTEL( + cl_context context, + cl_device_id device, + const cl_mem_properties_intel* properties, + size_t size, + cl_uint alignment, + cl_int* errcode_ret) ; extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueMemFillINTEL( - cl_command_queue command_queue, - void* dst_ptr, - const void* pattern, - size_t pattern_size, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); +clMemFreeINTEL( + cl_context context, + void* ptr) ; -typedef cl_int (CL_API_CALL * -clEnqueueMemFillINTEL_fn)( - cl_command_queue command_queue, - void* dst_ptr, - const void* pattern, - size_t pattern_size, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); +extern CL_API_ENTRY cl_int CL_API_CALL +clMemBlockingFreeINTEL( + cl_context context, + void* ptr) ; extern CL_API_ENTRY 
cl_int CL_API_CALL -clEnqueueMemcpyINTEL( - cl_command_queue command_queue, - cl_bool blocking, - void* dst_ptr, - const void* src_ptr, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); +clGetMemAllocInfoINTEL( + cl_context context, + const void* ptr, + cl_mem_info_intel param_name, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) ; -typedef cl_int (CL_API_CALL * -clEnqueueMemcpyINTEL_fn)( - cl_command_queue command_queue, - cl_bool blocking, - void* dst_ptr, - const void* src_ptr, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelArgMemPointerINTEL( + cl_kernel kernel, + cl_uint arg_index, + const void* arg_value) ; -#ifdef CL_VERSION_1_2 +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMemFillINTEL( + cl_command_queue command_queue, + void* dst_ptr, + const void* pattern, + size_t pattern_size, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) ; -/* Because these APIs use cl_mem_migration_flags, they require - OpenCL 1.2: */ +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMemcpyINTEL( + cl_command_queue command_queue, + cl_bool blocking, + void* dst_ptr, + const void* src_ptr, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) ; extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueMigrateMemINTEL( - cl_command_queue command_queue, - const void* ptr, - size_t size, - cl_mem_migration_flags flags, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); +clEnqueueMemAdviseINTEL( + cl_command_queue command_queue, + const void* ptr, + size_t size, + cl_mem_advice_intel advice, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) ; + +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ + +#if defined(CL_VERSION_1_2) +/* Requires OpenCL 1.2 for cl_mem_migration_flags: */ typedef cl_int (CL_API_CALL * clEnqueueMigrateMemINTEL_fn)( - cl_command_queue command_queue, - const void* ptr, - size_t size, - cl_mem_migration_flags flags, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); + cl_command_queue command_queue, + const void* ptr, + size_t size, + cl_mem_migration_flags flags, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) ; -#endif +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueMemAdviseINTEL( - cl_command_queue command_queue, - const void* ptr, - size_t size, - cl_mem_advice_intel advice, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); +clEnqueueMigrateMemINTEL( + cl_command_queue command_queue, + const void* ptr, + size_t size, + cl_mem_migration_flags flags, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) ; + +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ + +#endif /* defined(CL_VERSION_1_2) */ + +/* deprecated, use clEnqueueMemFillINTEL instead */ typedef cl_int (CL_API_CALL * -clEnqueueMemAdviseINTEL_fn)( - cl_command_queue command_queue, - const void* ptr, - size_t size, - cl_mem_advice_intel advice, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); - -/*************************************************** -* 
cl_intel_create_buffer_with_properties extension * -****************************************************/ +clEnqueueMemsetINTEL_fn)( + cl_command_queue command_queue, + void* dst_ptr, + cl_int value, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) ; + +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMemsetINTEL( + cl_command_queue command_queue, + void* dst_ptr, + cl_int value, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) ; + +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ + +/*************************************************************** +* cl_intel_mem_alloc_buffer_location +***************************************************************/ +#define cl_intel_mem_alloc_buffer_location 1 +#define CL_INTEL_MEM_ALLOC_BUFFER_LOCATION_EXTENSION_NAME \ + "cl_intel_mem_alloc_buffer_location" + +/* cl_mem_properties_intel */ +#define CL_MEM_ALLOC_BUFFER_LOCATION_INTEL 0x419E + +/* cl_mem_alloc_info_intel */ +/* enum CL_MEM_ALLOC_BUFFER_LOCATION_INTEL */ +/*************************************************************** +* cl_intel_create_buffer_with_properties +***************************************************************/ #define cl_intel_create_buffer_with_properties 1 +#define CL_INTEL_CREATE_BUFFER_WITH_PROPERTIES_EXTENSION_NAME \ + "cl_intel_create_buffer_with_properties" + +/* type cl_mem_properties_intel */ -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateBufferWithPropertiesINTEL( - cl_context context, - const cl_mem_properties_intel* properties, - cl_mem_flags flags, - size_t size, - void * host_ptr, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; typedef cl_mem (CL_API_CALL * clCreateBufferWithPropertiesINTEL_fn)( - cl_context context, + cl_context context, + const cl_mem_properties_intel* properties, + cl_mem_flags flags, + size_t size, + void* host_ptr, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateBufferWithPropertiesINTEL( + cl_context context, const cl_mem_properties_intel* properties, cl_mem_flags flags, - size_t size, - void * host_ptr, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + size_t size, + void* host_ptr, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ + +/*************************************************************** +* cl_intel_program_scope_host_pipe +***************************************************************/ +#define cl_intel_program_scope_host_pipe 1 +#define CL_INTEL_PROGRAM_SCOPE_HOST_PIPE_EXTENSION_NAME \ + "cl_intel_program_scope_host_pipe" + +/* clGetEventInfo response when param_name is CL_EVENT_COMMAND_TYPE */ +#define CL_COMMAND_READ_HOST_PIPE_INTEL 0x4214 +#define CL_COMMAND_WRITE_HOST_PIPE_INTEL 0x4215 + +/* clGetProgramInfo param_name */ +#define CL_PROGRAM_NUM_HOST_PIPES_INTEL 0x4216 +#define CL_PROGRAM_HOST_PIPE_NAMES_INTEL 0x4217 + + +typedef cl_int (CL_API_CALL * +clEnqueueReadHostPipeINTEL_fn)( + cl_command_queue command_queue, + cl_program program, + const char* pipe_symbol, + cl_bool blocking_read, + void* ptr, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int (CL_API_CALL * +clEnqueueWriteHostPipeINTEL_fn)( + cl_command_queue command_queue, + cl_program 
program, + const char* pipe_symbol, + cl_bool blocking_write, + const void* ptr, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_0; + +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) -/****************************************** -* cl_intel_mem_channel_property extension * -*******************************************/ +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadHostPipeINTEL( + cl_command_queue command_queue, + cl_program program, + const char* pipe_symbol, + cl_bool blocking_read, + void* ptr, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteHostPipeINTEL( + cl_command_queue command_queue, + cl_program program, + const char* pipe_symbol, + cl_bool blocking_write, + const void* ptr, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_0; + +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ -#define CL_MEM_CHANNEL_INTEL 0x4213 +/*************************************************************** +* cl_intel_mem_channel_property +***************************************************************/ +#define cl_intel_mem_channel_property 1 +#define CL_INTEL_MEM_CHANNEL_PROPERTY_EXTENSION_NAME \ + "cl_intel_mem_channel_property" -/********************************* -* cl_intel_mem_force_host_memory * -**********************************/ +/* cl_mem_properties_intel */ +#define CL_MEM_CHANNEL_INTEL 0x4213 +/*************************************************************** +* cl_intel_mem_force_host_memory +***************************************************************/ #define cl_intel_mem_force_host_memory 1 +#define CL_INTEL_MEM_FORCE_HOST_MEMORY_EXTENSION_NAME \ + "cl_intel_mem_force_host_memory" /* cl_mem_flags */ #define CL_MEM_FORCE_HOST_MEMORY_INTEL (1 << 20) @@ -1677,6 +3044,8 @@ clCreateBufferWithPropertiesINTEL_fn)( * cl_intel_command_queue_families ***************************************************************/ #define cl_intel_command_queue_families 1 +#define CL_INTEL_COMMAND_QUEUE_FAMILIES_EXTENSION_NAME \ + "cl_intel_command_queue_families" typedef cl_bitfield cl_command_queue_capabilities_intel; @@ -1715,9 +3084,194 @@ typedef struct _cl_queue_family_properties_intel { #define CL_QUEUE_CAPABILITY_BARRIER_INTEL (1 << 25) #define CL_QUEUE_CAPABILITY_KERNEL_INTEL (1 << 26) +/*************************************************************** +* cl_intel_queue_no_sync_operations +***************************************************************/ +#define cl_intel_queue_no_sync_operations 1 +#define CL_INTEL_QUEUE_NO_SYNC_OPERATIONS_EXTENSION_NAME \ + "cl_intel_queue_no_sync_operations" + +/* cl_command_queue_properties */ +#define CL_QUEUE_NO_SYNC_OPERATIONS_INTEL (1 << 29) + +/*************************************************************** +* cl_intel_sharing_format_query +***************************************************************/ +#define cl_intel_sharing_format_query 1 +#define CL_INTEL_SHARING_FORMAT_QUERY_EXTENSION_NAME \ + "cl_intel_sharing_format_query" + +/*************************************************************** +* cl_ext_image_requirements_info +***************************************************************/ +#if defined(CL_VERSION_3_0) + +#define cl_ext_image_requirements_info 1 +#define CL_EXT_IMAGE_REQUIREMENTS_INFO_EXTENSION_NAME \ + 
"cl_ext_image_requirements_info" + +/* Types */ +typedef cl_uint cl_image_requirements_info_ext; + +/* cl_image_requirements_info_ext */ +#define CL_IMAGE_REQUIREMENTS_BASE_ADDRESS_ALIGNMENT_EXT 0x1292 +#define CL_IMAGE_REQUIREMENTS_ROW_PITCH_ALIGNMENT_EXT 0x1290 +#define CL_IMAGE_REQUIREMENTS_SIZE_EXT 0x12B2 +#define CL_IMAGE_REQUIREMENTS_MAX_WIDTH_EXT 0x12B3 +#define CL_IMAGE_REQUIREMENTS_MAX_HEIGHT_EXT 0x12B4 +#define CL_IMAGE_REQUIREMENTS_MAX_DEPTH_EXT 0x12B5 +#define CL_IMAGE_REQUIREMENTS_MAX_ARRAY_SIZE_EXT 0x12B6 + +/* Enqueued Commands APIs */ + +typedef cl_int (CL_API_CALL * +clGetImageRequirementsInfoEXT_fn)( + cl_context context, + const cl_mem_properties* properties, + cl_mem_flags flags, + const cl_image_format* image_format, + const cl_image_desc* image_desc, + cl_image_requirements_info_ext param_name, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_3_0; + +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetImageRequirementsInfoEXT( + cl_context context, + const cl_mem_properties* properties, + cl_mem_flags flags, + const cl_image_format* image_format, + const cl_image_desc* image_desc, + cl_image_requirements_info_ext param_name, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_3_0; + +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ + +#endif /* defined(CL_VERSION_3_0) */ + +/*************************************************************** +* cl_ext_image_from_buffer +***************************************************************/ +#if defined(CL_VERSION_3_0) + +#define cl_ext_image_from_buffer 1 +#define CL_EXT_IMAGE_FROM_BUFFER_EXTENSION_NAME \ + "cl_ext_image_from_buffer" + +/* cl_image_requirements_info_ext */ +#define CL_IMAGE_REQUIREMENTS_SLICE_PITCH_ALIGNMENT_EXT 0x1291 + +#endif /* defined(CL_VERSION_3_0) */ + +/*************************************************************** +* cl_loader_info +***************************************************************/ +#define cl_loader_info 1 +#define CL_LOADER_INFO_EXTENSION_NAME \ + "cl_loader_info" + +typedef cl_uint cl_icdl_info; + +/* cl_icdl_info */ +#define CL_ICDL_OCL_VERSION 1 +#define CL_ICDL_VERSION 2 +#define CL_ICDL_NAME 3 +#define CL_ICDL_VENDOR 4 + + +typedef cl_int (CL_API_CALL * +clGetICDLoaderInfoOCLICD_fn)( + cl_icdl_info param_name, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) ; + +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetICDLoaderInfoOCLICD( + cl_icdl_info param_name, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) ; + +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ + +/*************************************************************** +* cl_khr_depth_images +***************************************************************/ +#define cl_khr_depth_images 1 +#define CL_KHR_DEPTH_IMAGES_EXTENSION_NAME \ + "cl_khr_depth_images" + +#if !defined(CL_VERSION_1_2) +/* cl_channel_order - defined in CL.h for OpenCL 1.2 (?) 
and newer */ +#define CL_DEPTH 0x10BD + +#endif /* !defined(CL_VERSION_1_2) */ + +/*************************************************************** +* cl_ext_float_atomics +***************************************************************/ +#define cl_ext_float_atomics 1 +#define CL_EXT_FLOAT_ATOMICS_EXTENSION_NAME \ + "cl_ext_float_atomics" + +typedef cl_bitfield cl_device_fp_atomic_capabilities_ext; + +/* cl_device_fp_atomic_capabilities_ext */ +#define CL_DEVICE_GLOBAL_FP_ATOMIC_LOAD_STORE_EXT (1 << 0) +#define CL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT (1 << 1) +#define CL_DEVICE_GLOBAL_FP_ATOMIC_MIN_MAX_EXT (1 << 2) +#define CL_DEVICE_LOCAL_FP_ATOMIC_LOAD_STORE_EXT (1 << 16) +#define CL_DEVICE_LOCAL_FP_ATOMIC_ADD_EXT (1 << 17) +#define CL_DEVICE_LOCAL_FP_ATOMIC_MIN_MAX_EXT (1 << 18) + +/* cl_device_info */ +#define CL_DEVICE_SINGLE_FP_ATOMIC_CAPABILITIES_EXT 0x4231 +#define CL_DEVICE_DOUBLE_FP_ATOMIC_CAPABILITIES_EXT 0x4232 +#define CL_DEVICE_HALF_FP_ATOMIC_CAPABILITIES_EXT 0x4233 + +/*************************************************************** +* cl_intel_create_mem_object_properties +***************************************************************/ +#define cl_intel_create_mem_object_properties 1 +#define CL_INTEL_CREATE_MEM_OBJECT_PROPERTIES_EXTENSION_NAME \ + "cl_intel_create_mem_object_properties" + +/* cl_mem_properties */ +#define CL_MEM_LOCALLY_UNCACHED_RESOURCE_INTEL 0x4218 +#define CL_MEM_DEVICE_ID_INTEL 0x4219 + +/*************************************************************** +* cl_pocl_content_size +***************************************************************/ +#define cl_pocl_content_size 1 +#define CL_POCL_CONTENT_SIZE_EXTENSION_NAME \ + "cl_pocl_content_size" + + +typedef cl_int (CL_API_CALL * +clSetContentSizeBufferPoCL_fn)( + cl_mem buffer, + cl_mem content_size_buffer) CL_API_SUFFIX__VERSION_1_0; + +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetContentSizeBufferPoCL( + cl_mem buffer, + cl_mem content_size_buffer) CL_API_SUFFIX__VERSION_1_0; + +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ + #ifdef __cplusplus } #endif - -#endif /* __CL_EXT_H */ +#endif /* OPENCL_CL_EXT_H_ */ diff --git a/include/CL/cl_gl.h b/include/CL/cl_gl.h index 5ea0fd8b7f4274bf28a2bbea83b896593c1f1a5b..f88348e25a42e439220688569941691e82ce9e96 100644 --- a/include/CL/cl_gl.h +++ b/include/CL/cl_gl.h @@ -1,5 +1,5 @@ /******************************************************************************* - * Copyright (c) 2008-2021 The Khronos Group Inc. + * Copyright (c) 2008-2023 The Khronos Group Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,156 +14,359 @@ * limitations under the License. ******************************************************************************/ -#ifndef __OPENCL_CL_GL_H -#define __OPENCL_CL_GL_H +#ifndef OPENCL_CL_GL_H_ +#define OPENCL_CL_GL_H_ + +/* +** This header is generated from the Khronos OpenCL XML API Registry. 
+*/ #include -#ifdef __cplusplus -extern "C" { +/* CL_NO_PROTOTYPES implies CL_NO_EXTENSION_PROTOTYPES: */ +#if defined(CL_NO_PROTOTYPES) && !defined(CL_NO_EXTENSION_PROTOTYPES) +#define CL_NO_EXTENSION_PROTOTYPES #endif -typedef cl_uint cl_gl_object_type; -typedef cl_uint cl_gl_texture_info; -typedef cl_uint cl_gl_platform_info; -typedef struct __GLsync *cl_GLsync; - -/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken */ -#define CL_GL_OBJECT_BUFFER 0x2000 -#define CL_GL_OBJECT_TEXTURE2D 0x2001 -#define CL_GL_OBJECT_TEXTURE3D 0x2002 -#define CL_GL_OBJECT_RENDERBUFFER 0x2003 -#ifdef CL_VERSION_1_2 -#define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E -#define CL_GL_OBJECT_TEXTURE1D 0x200F -#define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010 -#define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011 +/* CL_NO_EXTENSION_PROTOTYPES implies + CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES and + CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES: */ +#if defined(CL_NO_EXTENSION_PROTOTYPES) && \ + !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) +#define CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES +#endif +#if defined(CL_NO_EXTENSION_PROTOTYPES) && \ + !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) +#define CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES #endif -/* cl_gl_texture_info */ -#define CL_GL_TEXTURE_TARGET 0x2004 -#define CL_GL_MIPMAP_LEVEL 0x2005 -#ifdef CL_VERSION_1_2 -#define CL_GL_NUM_SAMPLES 0x2012 +#ifdef __cplusplus +extern "C" { #endif +/*************************************************************** +* cl_khr_gl_sharing +***************************************************************/ +#define cl_khr_gl_sharing 1 +#define CL_KHR_GL_SHARING_EXTENSION_NAME \ + "cl_khr_gl_sharing" + +typedef cl_uint cl_gl_context_info; + +/* Error codes */ +#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000 + +/* cl_gl_context_info */ +#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006 +#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007 + +/* Additional cl_context_properties */ +#define CL_GL_CONTEXT_KHR 0x2008 +#define CL_EGL_DISPLAY_KHR 0x2009 +#define CL_GLX_DISPLAY_KHR 0x200A +#define CL_WGL_HDC_KHR 0x200B +#define CL_CGL_SHAREGROUP_KHR 0x200C + +typedef cl_uint cl_gl_object_type; +typedef cl_uint cl_gl_texture_info; +typedef cl_uint cl_gl_platform_info; + +/* cl_gl_object_type */ +#define CL_GL_OBJECT_BUFFER 0x2000 +#define CL_GL_OBJECT_TEXTURE2D 0x2001 +#define CL_GL_OBJECT_TEXTURE3D 0x2002 +#define CL_GL_OBJECT_RENDERBUFFER 0x2003 + +#if defined(CL_VERSION_1_2) +/* cl_gl_object_type */ +#define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E +#define CL_GL_OBJECT_TEXTURE1D 0x200F +#define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010 +#define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011 + +#endif /* defined(CL_VERSION_1_2) */ + +/* cl_gl_texture_info */ +#define CL_GL_TEXTURE_TARGET 0x2004 +#define CL_GL_MIPMAP_LEVEL 0x2005 + + +typedef cl_int (CL_API_CALL * +clGetGLContextInfoKHR_fn)( + const cl_context_properties* properties, + cl_gl_context_info param_name, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_mem (CL_API_CALL * +clCreateFromGLBuffer_fn)( + cl_context context, + cl_mem_flags flags, + cl_GLuint bufobj, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetGLContextInfoKHR( + const cl_context_properties* properties, + cl_gl_context_info param_name, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; extern 
CL_API_ENTRY cl_mem CL_API_CALL -clCreateFromGLBuffer(cl_context context, - cl_mem_flags flags, - cl_GLuint bufobj, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; +clCreateFromGLBuffer( + cl_context context, + cl_mem_flags flags, + cl_GLuint bufobj, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ + +#if defined(CL_VERSION_1_2) -#ifdef CL_VERSION_1_2 +typedef cl_mem (CL_API_CALL * +clCreateFromGLTexture_fn)( + cl_context context, + cl_mem_flags flags, + cl_GLenum target, + cl_GLint miplevel, + cl_GLuint texture, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateFromGLTexture(cl_context context, - cl_mem_flags flags, - cl_GLenum target, - cl_GLint miplevel, - cl_GLuint texture, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; +clCreateFromGLTexture( + cl_context context, + cl_mem_flags flags, + cl_GLenum target, + cl_GLint miplevel, + cl_GLuint texture, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; -#endif +#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ + +#endif /* defined(CL_VERSION_1_2) */ + + +typedef cl_mem (CL_API_CALL * +clCreateFromGLRenderbuffer_fn)( + cl_context context, + cl_mem_flags flags, + cl_GLuint renderbuffer, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int (CL_API_CALL * +clGetGLObjectInfo_fn)( + cl_mem memobj, + cl_gl_object_type* gl_object_type, + cl_GLuint* gl_object_name) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int (CL_API_CALL * +clGetGLTextureInfo_fn)( + cl_mem memobj, + cl_gl_texture_info param_name, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int (CL_API_CALL * +clEnqueueAcquireGLObjects_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int (CL_API_CALL * +clEnqueueReleaseGLObjects_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_0; + +#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateFromGLRenderbuffer(cl_context context, - cl_mem_flags flags, - cl_GLuint renderbuffer, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; +clCreateFromGLRenderbuffer( + cl_context context, + cl_mem_flags flags, + cl_GLuint renderbuffer, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL -clGetGLObjectInfo(cl_mem memobj, - cl_gl_object_type * gl_object_type, - cl_GLuint * gl_object_name) CL_API_SUFFIX__VERSION_1_0; +clGetGLObjectInfo( + cl_mem memobj, + cl_gl_object_type* gl_object_type, + cl_GLuint* gl_object_name) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL -clGetGLTextureInfo(cl_mem memobj, - cl_gl_texture_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; +clGetGLTextureInfo( + cl_mem memobj, + cl_gl_texture_info param_name, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueAcquireGLObjects(cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * 
mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; +clEnqueueAcquireGLObjects( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueReleaseGLObjects(cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - - -/* Deprecated OpenCL 1.1 APIs */ -extern CL_API_ENTRY CL_API_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL -clCreateFromGLTexture2D(cl_context context, - cl_mem_flags flags, - cl_GLenum target, - cl_GLint miplevel, - cl_GLuint texture, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_1_DEPRECATED; - -extern CL_API_ENTRY CL_API_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL -clCreateFromGLTexture3D(cl_context context, - cl_mem_flags flags, - cl_GLenum target, - cl_GLint miplevel, - cl_GLuint texture, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_1_DEPRECATED; - -/* cl_khr_gl_sharing extension */ +clEnqueueReleaseGLObjects( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_0; -#define cl_khr_gl_sharing 1 +#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ -typedef cl_uint cl_gl_context_info; +/* OpenCL 1.0 APIs that were deprecated in OpenCL 1.2 */ -/* Additional Error Codes */ -#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000 +typedef cl_mem (CL_API_CALL * +clCreateFromGLTexture2D_fn)( + cl_context context, + cl_mem_flags flags, + cl_GLenum target, + cl_GLint miplevel, + cl_GLuint texture, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1_DEPRECATED; -/* cl_gl_context_info */ -#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006 -#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007 +typedef cl_mem (CL_API_CALL * +clCreateFromGLTexture3D_fn)( + cl_context context, + cl_mem_flags flags, + cl_GLenum target, + cl_GLint miplevel, + cl_GLuint texture, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1_DEPRECATED; -/* Additional cl_context_properties */ -#define CL_GL_CONTEXT_KHR 0x2008 -#define CL_EGL_DISPLAY_KHR 0x2009 -#define CL_GLX_DISPLAY_KHR 0x200A -#define CL_WGL_HDC_KHR 0x200B -#define CL_CGL_SHAREGROUP_KHR 0x200C +#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) -extern CL_API_ENTRY cl_int CL_API_CALL -clGetGLContextInfoKHR(const cl_context_properties * properties, - cl_gl_context_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)( - const cl_context_properties * properties, - cl_gl_context_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret); - -/* - * cl_khr_gl_event extension - */ -#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLTexture2D( + cl_context context, + cl_mem_flags flags, + cl_GLenum target, + cl_GLint miplevel, + cl_GLuint texture, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLTexture3D( + cl_context context, + cl_mem_flags flags, + cl_GLenum target, + cl_GLint miplevel, + 
cl_GLuint texture, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1_DEPRECATED; + +#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ + +/*************************************************************** +* cl_khr_gl_event +***************************************************************/ +#define cl_khr_gl_event 1 +#define CL_KHR_GL_EVENT_EXTENSION_NAME \ + "cl_khr_gl_event" + +typedef struct __GLsync * cl_GLsync; + +/* cl_command_type */ +#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D + + +typedef cl_event (CL_API_CALL * +clCreateEventFromGLsyncKHR_fn)( + cl_context context, + cl_GLsync sync, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1; + +#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) extern CL_API_ENTRY cl_event CL_API_CALL -clCreateEventFromGLsyncKHR(cl_context context, - cl_GLsync sync, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_1; +clCreateEventFromGLsyncKHR( + cl_context context, + cl_GLsync sync, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1; + +#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ + +/*************************************************************** +* cl_khr_gl_depth_images +***************************************************************/ +#define cl_khr_gl_depth_images 1 +#define CL_KHR_GL_DEPTH_IMAGES_EXTENSION_NAME \ + "cl_khr_gl_depth_images" + +#if !defined(CL_VERSION_1_2) +/* cl_channel_order - defined in CL.h for OpenCL 1.2 and newer */ +#define CL_DEPTH_STENCIL 0x10BE + +#endif /* !defined(CL_VERSION_1_2) */ + +#if !defined(CL_VERSION_1_2) +/* cl_channel_type - defined in CL.h for OpenCL 1.2 and newer */ +#define CL_UNORM_INT24 0x10DF + +#endif /* !defined(CL_VERSION_1_2) */ + +/*************************************************************** +* cl_khr_gl_msaa_sharing +***************************************************************/ +#define cl_khr_gl_msaa_sharing 1 +#define CL_KHR_GL_MSAA_SHARING_EXTENSION_NAME \ + "cl_khr_gl_msaa_sharing" + +/* cl_gl_texture_info */ +#define CL_GL_NUM_SAMPLES 0x2012 + +/*************************************************************** +* cl_intel_sharing_format_query_gl +***************************************************************/ +#define cl_intel_sharing_format_query_gl 1 +#define CL_INTEL_SHARING_FORMAT_QUERY_GL_EXTENSION_NAME \ + "cl_intel_sharing_format_query_gl" + +/* when cl_khr_gl_sharing is supported */ + +typedef cl_int (CL_API_CALL * +clGetSupportedGLTextureFormatsINTEL_fn)( + cl_context context, + cl_mem_flags flags, + cl_mem_object_type image_type, + cl_uint num_entries, + cl_GLenum* gl_formats, + cl_uint* num_texture_formats) ; + +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSupportedGLTextureFormatsINTEL( + cl_context context, + cl_mem_flags flags, + cl_mem_object_type image_type, + cl_uint num_entries, + cl_GLenum* gl_formats, + cl_uint* num_texture_formats) ; + +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ #ifdef __cplusplus } #endif -#endif /* __OPENCL_CL_GL_H */ +#endif /* OPENCL_CL_GL_H_ */ diff --git a/include/CL/cl_gl_ext.h b/include/CL/cl_gl_ext.h index 8ec8181674dc3e4f904733e8a24f8de218157e5e..b5da13eb67050daeb1e79044fa1f08d5ab8026ef 100644 --- a/include/CL/cl_gl_ext.h +++ b/include/CL/cl_gl_ext.h @@ -15,4 +15,4 @@ ******************************************************************************/ #include -#pragma message("All OpenGL-related extensions have been moved into cl_gl.h. 
Please include cl_gl.h directly.") +#pragma message("The extensions in cl_gl_ext.h have been moved into cl_gl.h. Please include cl_gl.h directly.") diff --git a/include/CL/cl_icd.h b/include/CL/cl_icd.h index d5ac1de09908775f828582e3bb7a6bad44ca2b8f..360b870305523f2fef692d24a8ce0f9672e6c65d 100644 --- a/include/CL/cl_icd.h +++ b/include/CL/cl_icd.h @@ -1099,7 +1099,7 @@ typedef void *cl_api_clGetHostTimer; #endif -/* Vendor dispatch table struture */ +/* Vendor dispatch table structure */ typedef struct _cl_icd_dispatch { /* OpenCL 1.0 */ diff --git a/include/CL/cl_platform.h b/include/CL/cl_platform.h index 8ae655d143553c24434cba2ceff6449af8d1c94a..e7a0d6f4761771a1e4d54ce185ed47e3861639dc 100644 --- a/include/CL/cl_platform.h +++ b/include/CL/cl_platform.h @@ -135,6 +135,11 @@ extern "C" { #if (defined (_WIN32) && defined(_MSC_VER)) +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlanguage-extension-token" +#endif + /* intptr_t is used in cl.h and provided by stddef.h in Visual C++, but not in clang */ /* stdint.h was missing before Visual Studio 2010, include it for later versions and for clang */ #if defined(__clang__) || _MSC_VER >= 1600 @@ -155,6 +160,10 @@ typedef unsigned __int16 cl_half; typedef float cl_float; typedef double cl_double; +#if defined(__clang__) +#pragma clang diagnostic pop +#endif + /* Macro names and corresponding values defined by OpenCL */ #define CL_CHAR_BIT 8 #define CL_SCHAR_MAX 127 @@ -501,25 +510,26 @@ typedef unsigned int cl_GLenum; #if !defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L #define __CL_HAS_ANON_STRUCT__ 1 #define __CL_ANON_STRUCT__ -#elif defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) +#elif defined(_WIN32) && defined(_MSC_VER) && !defined(__STDC__) +#define __CL_HAS_ANON_STRUCT__ 1 +#define __CL_ANON_STRUCT__ +#elif defined(__GNUC__) && ! defined(__STRICT_ANSI__) +#define __CL_HAS_ANON_STRUCT__ 1 +#define __CL_ANON_STRUCT__ __extension__ +#elif defined(__clang__) #define __CL_HAS_ANON_STRUCT__ 1 #define __CL_ANON_STRUCT__ __extension__ -#elif defined( _WIN32) && defined(_MSC_VER) && ! defined(__STDC__) - #if _MSC_VER >= 1500 - /* Microsoft Developer Studio 2008 supports anonymous structs, but - * complains by default. */ - #define __CL_HAS_ANON_STRUCT__ 1 - #define __CL_ANON_STRUCT__ - /* Disable warning C4201: nonstandard extension used : nameless - * struct/union */ - #pragma warning( push ) - #pragma warning( disable : 4201 ) - #endif #else #define __CL_HAS_ANON_STRUCT__ 0 #define __CL_ANON_STRUCT__ #endif +#if defined(_WIN32) && defined(_MSC_VER) && __CL_HAS_ANON_STRUCT__ + /* Disable warning C4201: nonstandard extension used : nameless struct/union */ + #pragma warning( push ) + #pragma warning( disable : 4201 ) +#endif + /* Define alignment keys */ #if defined( __GNUC__ ) || defined(__INTEGRITY) #define CL_ALIGNED(_x) __attribute__ ((aligned(_x))) @@ -1395,10 +1405,8 @@ typedef union } #endif -#if defined( _WIN32) && defined(_MSC_VER) && ! 
defined(__STDC__) - #if _MSC_VER >=1500 +#if defined(_WIN32) && defined(_MSC_VER) && __CL_HAS_ANON_STRUCT__ #pragma warning( pop ) - #endif #endif #endif /* __CL_PLATFORM_H */ diff --git a/include/CL/cl_va_api_media_sharing_intel.h b/include/CL/cl_va_api_media_sharing_intel.h index 7ba2ec83d1db33b53d1af65c66583ade1f42af83..767da05fc8d25e9287382b0f563f119e560a29bc 100644 --- a/include/CL/cl_va_api_media_sharing_intel.h +++ b/include/CL/cl_va_api_media_sharing_intel.h @@ -1,5 +1,5 @@ /******************************************************************************* - * Copyright (c) 2008-2020 The Khronos Group Inc. + * Copyright (c) 2008-2023 The Khronos Group Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,24 +14,82 @@ * limitations under the License. ******************************************************************************/ -#ifndef __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H -#define __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H +#ifndef OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H_ +#define OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H_ + +/* +** This header is generated from the Khronos OpenCL XML API Registry. +*/ -#include -#include #include +#include + +/* CL_NO_PROTOTYPES implies CL_NO_EXTENSION_PROTOTYPES: */ +#if defined(CL_NO_PROTOTYPES) && !defined(CL_NO_EXTENSION_PROTOTYPES) +#define CL_NO_EXTENSION_PROTOTYPES +#endif + +/* CL_NO_EXTENSION_PROTOTYPES implies + CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES and + CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES: */ +#if defined(CL_NO_EXTENSION_PROTOTYPES) && \ + !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) +#define CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES +#endif +#if defined(CL_NO_EXTENSION_PROTOTYPES) && \ + !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) +#define CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES +#endif + #ifdef __cplusplus extern "C" { #endif -/****************************************** -* cl_intel_va_api_media_sharing extension * -*******************************************/ +/*************************************************************** +* cl_intel_sharing_format_query_va_api +***************************************************************/ +#define cl_intel_sharing_format_query_va_api 1 +#define CL_INTEL_SHARING_FORMAT_QUERY_VA_API_EXTENSION_NAME \ + "cl_intel_sharing_format_query_va_api" + +/* when cl_intel_va_api_media_sharing is supported */ +typedef cl_int (CL_API_CALL * +clGetSupportedVA_APIMediaSurfaceFormatsINTEL_fn)( + cl_context context, + cl_mem_flags flags, + cl_mem_object_type image_type, + cl_uint plane, + cl_uint num_entries, + VAImageFormat* va_api_formats, + cl_uint* num_surface_formats) ; + +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSupportedVA_APIMediaSurfaceFormatsINTEL( + cl_context context, + cl_mem_flags flags, + cl_mem_object_type image_type, + cl_uint plane, + cl_uint num_entries, + VAImageFormat* va_api_formats, + cl_uint* num_surface_formats) ; + +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ + +/*************************************************************** +* cl_intel_va_api_media_sharing +***************************************************************/ #define cl_intel_va_api_media_sharing 1 +#define CL_INTEL_VA_API_MEDIA_SHARING_EXTENSION_NAME \ + "cl_intel_va_api_media_sharing" -/* error codes */ +typedef cl_uint cl_va_api_device_source_intel; +typedef cl_uint cl_va_api_device_set_intel; + +/* Error codes */ #define 
CL_INVALID_VA_API_MEDIA_ADAPTER_INTEL -1098 #define CL_INVALID_VA_API_MEDIA_SURFACE_INTEL -1099 #define CL_VA_API_MEDIA_SURFACE_ALREADY_ACQUIRED_INTEL -1100 @@ -57,80 +115,85 @@ extern "C" { #define CL_COMMAND_ACQUIRE_VA_API_MEDIA_SURFACES_INTEL 0x409A #define CL_COMMAND_RELEASE_VA_API_MEDIA_SURFACES_INTEL 0x409B -typedef cl_uint cl_va_api_device_source_intel; -typedef cl_uint cl_va_api_device_set_intel; + +typedef cl_int (CL_API_CALL * +clGetDeviceIDsFromVA_APIMediaAdapterINTEL_fn)( + cl_platform_id platform, + cl_va_api_device_source_intel media_adapter_type, + void* media_adapter, + cl_va_api_device_set_intel media_adapter_set, + cl_uint num_entries, + cl_device_id* devices, + cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_mem (CL_API_CALL * +clCreateFromVA_APIMediaSurfaceINTEL_fn)( + cl_context context, + cl_mem_flags flags, + VASurfaceID* surface, + cl_uint plane, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_int (CL_API_CALL * +clEnqueueAcquireVA_APIMediaSurfacesINTEL_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_int (CL_API_CALL * +clEnqueueReleaseVA_APIMediaSurfacesINTEL_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; + +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) extern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceIDsFromVA_APIMediaAdapterINTEL( - cl_platform_id platform, - cl_va_api_device_source_intel media_adapter_type, - void* media_adapter, - cl_va_api_device_set_intel media_adapter_set, - cl_uint num_entries, - cl_device_id* devices, - cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_int (CL_API_CALL * clGetDeviceIDsFromVA_APIMediaAdapterINTEL_fn)( - cl_platform_id platform, + cl_platform_id platform, cl_va_api_device_source_intel media_adapter_type, - void* media_adapter, - cl_va_api_device_set_intel media_adapter_set, - cl_uint num_entries, - cl_device_id* devices, - cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_2; + void* media_adapter, + cl_va_api_device_set_intel media_adapter_set, + cl_uint num_entries, + cl_device_id* devices, + cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromVA_APIMediaSurfaceINTEL( - cl_context context, - cl_mem_flags flags, - VASurfaceID* surface, - cl_uint plane, - cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_mem (CL_API_CALL * clCreateFromVA_APIMediaSurfaceINTEL_fn)( - cl_context context, - cl_mem_flags flags, - VASurfaceID* surface, - cl_uint plane, - cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; + cl_context context, + cl_mem_flags flags, + VASurfaceID* surface, + cl_uint plane, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueAcquireVA_APIMediaSurfacesINTEL( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem* mem_objects, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_int (CL_API_CALL *clEnqueueAcquireVA_APIMediaSurfacesINTEL_fn)( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem* mem_objects, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) 
CL_API_SUFFIX__VERSION_1_2; + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseVA_APIMediaSurfacesINTEL( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem* mem_objects, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) CL_API_SUFFIX__VERSION_1_2; - -typedef cl_int (CL_API_CALL *clEnqueueReleaseVA_APIMediaSurfacesINTEL_fn)( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem* mem_objects, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) CL_API_SUFFIX__VERSION_1_2; + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; + +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ #ifdef __cplusplus } #endif -#endif /* __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H */ - +#endif /* OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H_ */ diff --git a/include/CL/opencl.hpp b/include/CL/opencl.hpp index 75b9c92c27a9b0e85f8ce6ae8655e7436ca363f6..d8d227e1b2d6c86343651ca54dfebe4c133424b6 100644 --- a/include/CL/opencl.hpp +++ b/include/CL/opencl.hpp @@ -1,5 +1,5 @@ // -// Copyright (c) 2008-2020 The Khronos Group Inc. +// Copyright (c) 2008-2023 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -33,14 +33,12 @@ * * Optional extension support * - * cl_ext_device_fission - * #define CL_HPP_USE_CL_DEVICE_FISSION * cl_khr_d3d10_sharing * #define CL_HPP_USE_DX_INTEROP + * cl_khr_il_program + * #define CL_HPP_USE_IL_KHR * cl_khr_sub_groups * #define CL_HPP_USE_CL_SUB_GROUPS_KHR - * cl_khr_image2d_from_buffer - * #define CL_HPP_USE_CL_IMAGE2D_FROM_BUFFER_KHR * * Doxygen documentation for this header is available here: * @@ -82,9 +80,9 @@ * * The combination of preprocessor macros CL_HPP_TARGET_OPENCL_VERSION and * CL_HPP_MINIMUM_OPENCL_VERSION control this range. These are three digit - * decimal values representing OpenCL runime versions. The default for - * the target is 200, representing OpenCL 2.0 and the minimum is also - * defined as 200. These settings would use 2.0 API calls only. + * decimal values representing OpenCL runtime versions. The default for + * the target is 300, representing OpenCL 3.0. The minimum is defined as 200. + * These settings would use 2.0 and newer API calls only. * If backward compatibility with a 1.2 runtime is required, the minimum * version may be set to 120. * @@ -139,6 +137,11 @@ * - CL_HPP_TARGET_OPENCL_VERSION * * Defines the target OpenCL runtime version to build the header + * against. Defaults to 300, representing OpenCL 3.0. + * + * - CL_HPP_MINIMUM_OPENCL_VERSION + * + * Defines the minimum OpenCL runtime version to build the header * against. Defaults to 200, representing OpenCL 2.0. * * - CL_HPP_NO_STD_STRING @@ -165,10 +168,6 @@ * the cl::allocate_pointer functions are not defined and may be * defined by the user before opencl.hpp is included. * - * - CL_HPP_ENABLE_DEVICE_FISSION - * - * Enables device fission for OpenCL 1.2 platforms. - * * - CL_HPP_ENABLE_EXCEPTIONS * * Enable exceptions for use in the C++ bindings header. 
This is the @@ -194,10 +193,15 @@ * applies to use of cl::Program construction and other program * build variants. * + * * - CL_HPP_USE_CL_SUB_GROUPS_KHR * * Enable the cl_khr_subgroups extension. * + * - CL_HPP_USE_DX_INTEROP + * + * Enable the cl_khr_d3d10_sharing extension. + * * - CL_HPP_USE_IL_KHR * * Enable the cl_khr_il_program extension. @@ -209,6 +213,10 @@ * bindings, including support for the optional exception feature and * also the supplied vector and string classes, see following sections for * decriptions of these features. + * + * Note: the C++ bindings use std::call_once and therefore may need to be + * compiled using special command-line options (such as "-pthread") on some + * platforms! * * \code #define CL_HPP_ENABLE_EXCEPTIONS @@ -224,28 +232,30 @@ int main(void) { - // Filter for a 2.0 platform and set it as the default + // Filter for a 2.0 or newer platform and set it as the default std::vector platforms; cl::Platform::get(&platforms); cl::Platform plat; for (auto &p : platforms) { std::string platver = p.getInfo(); - if (platver.find("OpenCL 2.") != std::string::npos) { + if (platver.find("OpenCL 2.") != std::string::npos || + platver.find("OpenCL 3.") != std::string::npos) { + // Note: an OpenCL 3.x platform may not support all required features! plat = p; } } - if (plat() == 0) { - std::cout << "No OpenCL 2.0 platform found."; + if (plat() == 0) { + std::cout << "No OpenCL 2.0 or newer platform found.\n"; return -1; } cl::Platform newP = cl::Platform::setDefault(plat); if (newP != plat) { - std::cout << "Error setting default platform."; + std::cout << "Error setting default platform.\n"; return -1; } - // Use C++11 raw string literals for kernel source code + // C++11 raw string literal for the first kernel std::string kernel1{R"CLC( global int globalA; kernel void updateGlobal() @@ -253,6 +263,8 @@ globalA = 75; } )CLC"}; + + // Raw string literal for the second kernel std::string kernel2{R"CLC( typedef struct { global int *bar; } Foo; kernel void vectorAdd(global const Foo* aNum, global const int *inputA, global const int *inputB, @@ -279,8 +291,9 @@ } )CLC"}; - // New simpler string interface style - std::vector programStrings {kernel1, kernel2}; + std::vector programStrings; + programStrings.push_back(kernel1); + programStrings.push_back(kernel2); cl::Program vectorAddProgram(programStrings); try { @@ -319,10 +332,9 @@ std::vector>> inputA(numElements, 1, svmAlloc); cl::coarse_svm_vector inputB(numElements, 2, svmAlloc); - // ////////////// - // Traditional cl_mem allocations + std::vector output(numElements, 0xdeadbeef); cl::Buffer outputBuffer(begin(output), end(output), false); cl::Pipe aPipe(sizeof(cl_int), numElements / 2); @@ -346,14 +358,8 @@ // This one was not passed as a parameter vectorAddKernel.setSVMPointers(anSVMInt); - // Hand control of coarse allocations to runtime - cl::enqueueUnmapSVM(anSVMInt); - cl::enqueueUnmapSVM(fooPointer); - cl::unmapSVM(inputB); - cl::unmapSVM(output2); - - cl_int error; - vectorAddKernel( + cl_int error; + vectorAddKernel( cl::EnqueueArgs( cl::NDRange(numElements/2), cl::NDRange(numElements/2)), @@ -364,12 +370,10 @@ 3, aPipe, defaultDeviceQueue, - error + error ); cl::copy(outputBuffer, begin(output), end(output)); - // Grab the SVM output vector using a map - cl::mapSVM(output2); cl::Device d = cl::Device::getDefault(); @@ -396,10 +400,6 @@ # pragma message("opencl.hpp: USE_DX_INTEROP is deprecated. 
Define CL_HPP_USE_DX_INTEROP instead") # define CL_HPP_USE_DX_INTEROP #endif -#if !defined(CL_HPP_USE_CL_DEVICE_FISSION) && defined(USE_CL_DEVICE_FISSION) -# pragma message("opencl.hpp: USE_CL_DEVICE_FISSION is deprecated. Define CL_HPP_USE_CL_DEVICE_FISSION instead") -# define CL_HPP_USE_CL_DEVICE_FISSION -#endif #if !defined(CL_HPP_ENABLE_EXCEPTIONS) && defined(__CL_ENABLE_EXCEPTIONS) # pragma message("opencl.hpp: __CL_ENABLE_EXCEPTIONS is deprecated. Define CL_HPP_ENABLE_EXCEPTIONS instead") # define CL_HPP_ENABLE_EXCEPTIONS @@ -519,11 +519,6 @@ #error Visual studio 2013 or another C++11-supporting compiler required #endif -// -#if defined(CL_HPP_USE_CL_DEVICE_FISSION) || defined(CL_HPP_USE_CL_SUB_GROUPS_KHR) -#include -#endif - #if defined(__APPLE__) || defined(__MACOSX) #include #else @@ -549,18 +544,25 @@ // Define deprecated prefixes and suffixes to ensure compilation // in case they are not pre-defined #if !defined(CL_API_PREFIX__VERSION_1_1_DEPRECATED) -#define CL_API_PREFIX__VERSION_1_1_DEPRECATED +#define CL_API_PREFIX__VERSION_1_1_DEPRECATED #endif // #if !defined(CL_API_PREFIX__VERSION_1_1_DEPRECATED) #if !defined(CL_API_SUFFIX__VERSION_1_1_DEPRECATED) #define CL_API_SUFFIX__VERSION_1_1_DEPRECATED -#endif // #if !defined(CL_API_PREFIX__VERSION_1_1_DEPRECATED) +#endif // #if !defined(CL_API_SUFFIX__VERSION_1_1_DEPRECATED) #if !defined(CL_API_PREFIX__VERSION_1_2_DEPRECATED) -#define CL_API_PREFIX__VERSION_1_2_DEPRECATED +#define CL_API_PREFIX__VERSION_1_2_DEPRECATED #endif // #if !defined(CL_API_PREFIX__VERSION_1_2_DEPRECATED) #if !defined(CL_API_SUFFIX__VERSION_1_2_DEPRECATED) #define CL_API_SUFFIX__VERSION_1_2_DEPRECATED -#endif // #if !defined(CL_API_PREFIX__VERSION_1_2_DEPRECATED) +#endif // #if !defined(CL_API_SUFFIX__VERSION_1_2_DEPRECATED) + +#if !defined(CL_API_PREFIX__VERSION_2_2_DEPRECATED) +#define CL_API_PREFIX__VERSION_2_2_DEPRECATED +#endif // #if !defined(CL_API_PREFIX__VERSION_2_2_DEPRECATED) +#if !defined(CL_API_SUFFIX__VERSION_2_2_DEPRECATED) +#define CL_API_SUFFIX__VERSION_2_2_DEPRECATED +#endif // #if !defined(CL_API_SUFFIX__VERSION_2_2_DEPRECATED) #if !defined(CL_CALLBACK) #define CL_CALLBACK @@ -703,24 +705,26 @@ namespace cl { * */ namespace cl { - class Memory; -#define CL_HPP_INIT_CL_EXT_FCN_PTR_(name) \ - if (!pfn_##name) { \ - pfn_##name = (PFN_##name) \ - clGetExtensionFunctionAddress(#name); \ - if (!pfn_##name) { \ - } \ +#define CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(name) \ + using PFN_##name = name##_fn + +#define CL_HPP_INIT_CL_EXT_FCN_PTR_(name) \ + if (!pfn_##name) { \ + pfn_##name = (PFN_##name)clGetExtensionFunctionAddress(#name); \ } -#define CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, name) \ - if (!pfn_##name) { \ - pfn_##name = (PFN_##name) \ - clGetExtensionFunctionAddressForPlatform(platform, #name); \ - if (!pfn_##name) { \ - } \ +#define CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, name) \ + if (!pfn_##name) { \ + pfn_##name = (PFN_##name) \ + clGetExtensionFunctionAddressForPlatform(platform, #name); \ } +#ifdef cl_khr_external_memory + enum class ExternalMemoryType : cl_external_memory_handle_type_khr; +#endif + + class Memory; class Program; class Device; class Context; @@ -729,6 +733,13 @@ namespace cl { class Memory; class Buffer; class Pipe; +#ifdef cl_khr_semaphore + class Semaphore; +#endif +#if defined(cl_khr_command_buffer) + class CommandBufferKhr; + class MutableCommandKhr; +#endif // cl_khr_command_buffer #if defined(CL_HPP_ENABLE_EXCEPTIONS) /*! 
\brief Exception class @@ -750,7 +761,7 @@ namespace cl { * handling of the exception has concluded. If set, it * will be returned by what(). */ - Error(cl_int err, const char * errStr = NULL) : err_(err), errStr_(errStr) + Error(cl_int err, const char * errStr = nullptr) : err_(err), errStr_(errStr) {} ~Error() throw() {} @@ -761,7 +772,7 @@ namespace cl { */ virtual const char * what() const throw () { - if (errStr_ == NULL) { + if (errStr_ == nullptr) { return "empty"; } else { @@ -777,7 +788,7 @@ namespace cl { }; #define CL_HPP_ERR_STR_(x) #x #else -#define CL_HPP_ERR_STR_(x) NULL +#define CL_HPP_ERR_STR_(x) nullptr #endif // CL_HPP_ENABLE_EXCEPTIONS @@ -786,7 +797,7 @@ namespace detail #if defined(CL_HPP_ENABLE_EXCEPTIONS) static inline cl_int errHandler ( cl_int err, - const char * errStr = NULL) + const char * errStr = nullptr) { if (err != CL_SUCCESS) { throw Error(err, errStr); @@ -794,7 +805,7 @@ static inline cl_int errHandler ( return err; } #else -static inline cl_int errHandler (cl_int err, const char * errStr = NULL) +static inline cl_int errHandler (cl_int err, const char * errStr = nullptr) { (void) errStr; // suppress unused variable warning return err; @@ -820,9 +831,11 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL) #if CL_HPP_TARGET_OPENCL_VERSION >= 120 #define __GET_KERNEL_ARG_INFO_ERR CL_HPP_ERR_STR_(clGetKernelArgInfo) #endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 +#if CL_HPP_TARGET_OPENCL_VERSION >= 210 #define __GET_KERNEL_SUB_GROUP_INFO_ERR CL_HPP_ERR_STR_(clGetKernelSubGroupInfo) -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200 +#else +#define __GET_KERNEL_SUB_GROUP_INFO_ERR CL_HPP_ERR_STR_(clGetKernelSubGroupInfoKHR) +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 210 #define __GET_KERNEL_WORK_GROUP_INFO_ERR CL_HPP_ERR_STR_(clGetKernelWorkGroupInfo) #define __GET_PROGRAM_INFO_ERR CL_HPP_ERR_STR_(clGetProgramInfo) #define __GET_PROGRAM_BUILD_INFO_ERR CL_HPP_ERR_STR_(clGetProgramBuildInfo) @@ -831,6 +844,9 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL) #define __CREATE_CONTEXT_ERR CL_HPP_ERR_STR_(clCreateContext) #define __CREATE_CONTEXT_FROM_TYPE_ERR CL_HPP_ERR_STR_(clCreateContextFromType) #define __GET_SUPPORTED_IMAGE_FORMATS_ERR CL_HPP_ERR_STR_(clGetSupportedImageFormats) +#if CL_HPP_TARGET_OPENCL_VERSION >= 300 +#define __SET_CONTEXT_DESCTRUCTOR_CALLBACK_ERR CL_HPP_ERR_STR_(clSetContextDestructorCallback) +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 300 #define __CREATE_BUFFER_ERR CL_HPP_ERR_STR_(clCreateBuffer) #define __COPY_ERR CL_HPP_ERR_STR_(cl::copy) @@ -853,12 +869,11 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL) #define __CREATE_KERNEL_ERR CL_HPP_ERR_STR_(clCreateKernel) #define __SET_KERNEL_ARGS_ERR CL_HPP_ERR_STR_(clSetKernelArg) #define __CREATE_PROGRAM_WITH_SOURCE_ERR CL_HPP_ERR_STR_(clCreateProgramWithSource) -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 -#define __CREATE_PROGRAM_WITH_IL_ERR CL_HPP_ERR_STR_(clCreateProgramWithIL) -#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 #define __CREATE_PROGRAM_WITH_BINARY_ERR CL_HPP_ERR_STR_(clCreateProgramWithBinary) #if CL_HPP_TARGET_OPENCL_VERSION >= 210 #define __CREATE_PROGRAM_WITH_IL_ERR CL_HPP_ERR_STR_(clCreateProgramWithIL) +#else +#define __CREATE_PROGRAM_WITH_IL_ERR CL_HPP_ERR_STR_(clCreateProgramWithILKHR) #endif // CL_HPP_TARGET_OPENCL_VERSION >= 210 #if CL_HPP_TARGET_OPENCL_VERSION >= 120 #define __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR 
CL_HPP_ERR_STR_(clCreateProgramWithBuiltInKernels) @@ -908,7 +923,6 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL) #define __CREATE_PIPE_ERR CL_HPP_ERR_STR_(clCreatePipe) #define __GET_PIPE_INFO_ERR CL_HPP_ERR_STR_(clGetPipeInfo) - #define __RETAIN_ERR CL_HPP_ERR_STR_(Retain Object) #define __RELEASE_ERR CL_HPP_ERR_STR_(Release Object) #define __FLUSH_ERR CL_HPP_ERR_STR_(clFlush) @@ -924,6 +938,38 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL) #define __SET_PROGRAM_SPECIALIZATION_CONSTANT_ERR CL_HPP_ERR_STR_(clSetProgramSpecializationConstant) #endif +#ifdef cl_khr_external_memory +#define __ENQUEUE_ACQUIRE_EXTERNAL_MEMORY_ERR CL_HPP_ERR_STR_(clEnqueueAcquireExternalMemObjectsKHR) +#define __ENQUEUE_RELEASE_EXTERNAL_MEMORY_ERR CL_HPP_ERR_STR_(clEnqueueReleaseExternalMemObjectsKHR) +#endif + +#ifdef cl_khr_semaphore +#define __GET_SEMAPHORE_KHR_INFO_ERR CL_HPP_ERR_STR_(clGetSemaphoreInfoKHR) +#define __CREATE_SEMAPHORE_KHR_WITH_PROPERTIES_ERR CL_HPP_ERR_STR_(clCreateSemaphoreWithPropertiesKHR) +#define __ENQUEUE_WAIT_SEMAPHORE_KHR_ERR CL_HPP_ERR_STR_(clEnqueueWaitSemaphoresKHR) +#define __ENQUEUE_SIGNAL_SEMAPHORE_KHR_ERR CL_HPP_ERR_STR_(clEnqueueSignalSemaphoresKHR) +#define __RETAIN_SEMAPHORE_KHR_ERR CL_HPP_ERR_STR_(clRetainSemaphoreKHR) +#define __RELEASE_SEMAPHORE_KHR_ERR CL_HPP_ERR_STR_(clReleaseSemaphoreKHR) +#endif +#if defined(cl_khr_command_buffer) +#define __CREATE_COMMAND_BUFFER_KHR_ERR CL_HPP_ERR_STR_(clCreateCommandBufferKHR) +#define __GET_COMMAND_BUFFER_INFO_KHR_ERR CL_HPP_ERR_STR_(clGetCommandBufferInfoKHR) +#define __FINALIZE_COMMAND_BUFFER_KHR_ERR CL_HPP_ERR_STR_(clFinalizeCommandBufferKHR) +#define __ENQUEUE_COMMAND_BUFFER_KHR_ERR CL_HPP_ERR_STR_(clEnqueueCommandBufferKHR) +#define __COMMAND_BARRIER_WITH_WAIT_LIST_KHR_ERR CL_HPP_ERR_STR_(clCommandBarrierWithWaitListKHR) +#define __COMMAND_COPY_BUFFER_KHR_ERR CL_HPP_ERR_STR_(clCommandCopyBufferKHR) +#define __COMMAND_COPY_BUFFER_RECT_KHR_ERR CL_HPP_ERR_STR_(clCommandCopyBufferRectKHR) +#define __COMMAND_COPY_BUFFER_TO_IMAGE_KHR_ERR CL_HPP_ERR_STR_(clCommandCopyBufferToImageKHR) +#define __COMMAND_COPY_IMAGE_KHR_ERR CL_HPP_ERR_STR_(clCommandCopyImageKHR) +#define __COMMAND_COPY_IMAGE_TO_BUFFER_KHR_ERR CL_HPP_ERR_STR_(clCommandCopyImageToBufferKHR) +#define __COMMAND_FILL_BUFFER_KHR_ERR CL_HPP_ERR_STR_(clCommandFillBufferKHR) +#define __COMMAND_FILL_IMAGE_KHR_ERR CL_HPP_ERR_STR_(clCommandFillImageKHR) +#define __COMMAND_NDRANGE_KERNEL_KHR_ERR CL_HPP_ERR_STR_(clCommandNDRangeKernelKHR) +#define __UPDATE_MUTABLE_COMMANDS_KHR_ERR CL_HPP_ERR_STR_(clUpdateMutableCommandsKHR) +#define __GET_MUTABLE_COMMAND_INFO_KHR_ERR CL_HPP_ERR_STR_(clGetMutableCommandInfoKHR) +#define __RETAIN_COMMAND_BUFFER_KHR_ERR CL_HPP_ERR_STR_(clRetainCommandBufferKHR) +#define __RELEASE_COMMAND_BUFFER_KHR_ERR CL_HPP_ERR_STR_(clReleaseCommandBufferKHR) +#endif // cl_khr_command_buffer /** * CL 1.2 version that uses device fission. @@ -972,6 +1018,71 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL) #endif // CL_HPP_USER_OVERRIDE_ERROR_STRINGS //! 
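// --- Editor's note: illustrative sketch, not part of the patch. ---
// The cl_khr_semaphore / cl_khr_command_buffer entry points named in the error
// strings above are not part of the core dispatch table; the bindings resolve
// them at runtime with clGetExtensionFunctionAddressForPlatform(), which is
// what CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_ expands to. Hand-written, the same
// pattern looks roughly like this (the typedef and function names here are
// local to the sketch; real code would use the *_fn typedefs from the C
// extension headers):

typedef cl_semaphore_khr (CL_API_CALL *create_semaphore_fn)(
    cl_context, const cl_semaphore_properties_khr *, cl_int *);

static create_semaphore_fn pfn_create_semaphore = nullptr;

static void resolve_semaphore_entry_point(cl_platform_id platform) // platform supplied by the caller
{
    if (!pfn_create_semaphore) {
        pfn_create_semaphore = (create_semaphore_fn)
            clGetExtensionFunctionAddressForPlatform(
                platform, "clCreateSemaphoreWithPropertiesKHR");
    }
    // A null result simply means the platform does not expose the extension;
    // the bindings surface that case as CL_INVALID_OPERATION.
}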
\endcond +#ifdef cl_khr_external_memory +CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clEnqueueAcquireExternalMemObjectsKHR); +CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clEnqueueReleaseExternalMemObjectsKHR); + +CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clEnqueueAcquireExternalMemObjectsKHR pfn_clEnqueueAcquireExternalMemObjectsKHR = nullptr; +CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clEnqueueReleaseExternalMemObjectsKHR pfn_clEnqueueReleaseExternalMemObjectsKHR = nullptr; +#endif // cl_khr_external_memory + +#ifdef cl_khr_semaphore +CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCreateSemaphoreWithPropertiesKHR); +CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clReleaseSemaphoreKHR); +CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clRetainSemaphoreKHR); +CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clEnqueueWaitSemaphoresKHR); +CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clEnqueueSignalSemaphoresKHR); +CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clGetSemaphoreInfoKHR); + +CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCreateSemaphoreWithPropertiesKHR pfn_clCreateSemaphoreWithPropertiesKHR = nullptr; +CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clReleaseSemaphoreKHR pfn_clReleaseSemaphoreKHR = nullptr; +CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clRetainSemaphoreKHR pfn_clRetainSemaphoreKHR = nullptr; +CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clEnqueueWaitSemaphoresKHR pfn_clEnqueueWaitSemaphoresKHR = nullptr; +CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clEnqueueSignalSemaphoresKHR pfn_clEnqueueSignalSemaphoresKHR = nullptr; +CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clGetSemaphoreInfoKHR pfn_clGetSemaphoreInfoKHR = nullptr; +#endif // cl_khr_semaphore + +#if defined(cl_khr_command_buffer) +CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCreateCommandBufferKHR); +CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clFinalizeCommandBufferKHR); +CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clRetainCommandBufferKHR); +CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clReleaseCommandBufferKHR); +CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clGetCommandBufferInfoKHR); +CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clEnqueueCommandBufferKHR); +CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCommandBarrierWithWaitListKHR); +CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCommandCopyBufferKHR); +CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCommandCopyBufferRectKHR); +CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCommandCopyBufferToImageKHR); +CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCommandCopyImageKHR); +CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCommandCopyImageToBufferKHR); +CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCommandFillBufferKHR); +CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCommandFillImageKHR); +CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCommandNDRangeKernelKHR); + +CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCreateCommandBufferKHR pfn_clCreateCommandBufferKHR = nullptr; +CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clFinalizeCommandBufferKHR pfn_clFinalizeCommandBufferKHR = nullptr; +CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clRetainCommandBufferKHR pfn_clRetainCommandBufferKHR = nullptr; +CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clReleaseCommandBufferKHR pfn_clReleaseCommandBufferKHR = nullptr; +CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clGetCommandBufferInfoKHR pfn_clGetCommandBufferInfoKHR = nullptr; +CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clEnqueueCommandBufferKHR pfn_clEnqueueCommandBufferKHR = nullptr; +CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCommandBarrierWithWaitListKHR pfn_clCommandBarrierWithWaitListKHR = nullptr; +CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCommandCopyBufferKHR pfn_clCommandCopyBufferKHR = nullptr; +CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCommandCopyBufferRectKHR pfn_clCommandCopyBufferRectKHR = nullptr; +CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCommandCopyBufferToImageKHR 
pfn_clCommandCopyBufferToImageKHR = nullptr; +CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCommandCopyImageKHR pfn_clCommandCopyImageKHR = nullptr; +CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCommandCopyImageToBufferKHR pfn_clCommandCopyImageToBufferKHR = nullptr; +CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCommandFillBufferKHR pfn_clCommandFillBufferKHR = nullptr; +CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCommandFillImageKHR pfn_clCommandFillImageKHR = nullptr; +CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCommandNDRangeKernelKHR pfn_clCommandNDRangeKernelKHR = nullptr; +#endif /* cl_khr_command_buffer */ + +#if defined(cl_khr_command_buffer_mutable_dispatch) +CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clUpdateMutableCommandsKHR); +CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clGetMutableCommandInfoKHR); + +CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clUpdateMutableCommandsKHR pfn_clUpdateMutableCommandsKHR = nullptr; +CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clGetMutableCommandInfoKHR pfn_clGetMutableCommandInfoKHR = nullptr; +#endif /* cl_khr_command_buffer_mutable_dispatch */ namespace detail { @@ -982,7 +1093,7 @@ namespace detail { template inline cl_int getInfoHelper(Functor f, cl_uint name, T* param, long) { - return f(name, sizeof(T), param, NULL); + return f(name, sizeof(T), param, nullptr); } // Specialized for getInfo @@ -1003,7 +1114,7 @@ inline cl_int getInfoHelper(Func f, cl_uint name, vector>* binariesPointers[i] = (*param)[i].data(); } - cl_int err = f(name, numBinaries * sizeof(unsigned char*), binariesPointers.data(), NULL); + cl_int err = f(name, numBinaries * sizeof(unsigned char*), binariesPointers.data(), nullptr); if (err != CL_SUCCESS) { return err; @@ -1019,7 +1130,7 @@ template inline cl_int getInfoHelper(Func f, cl_uint name, vector* param, long) { size_type required; - cl_int err = f(name, 0, NULL, &required); + cl_int err = f(name, 0, nullptr, &required); if (err != CL_SUCCESS) { return err; } @@ -1027,7 +1138,7 @@ inline cl_int getInfoHelper(Func f, cl_uint name, vector* param, long) // Temporary to avoid changing param on an error vector localData(elements); - err = f(name, required, localData.data(), NULL); + err = f(name, required, localData.data(), nullptr); if (err != CL_SUCCESS) { return err; } @@ -1049,7 +1160,7 @@ inline cl_int getInfoHelper( Func f, cl_uint name, vector* param, int, typename T::cl_type = 0) { size_type required; - cl_int err = f(name, 0, NULL, &required); + cl_int err = f(name, 0, nullptr, &required); if (err != CL_SUCCESS) { return err; } @@ -1057,7 +1168,7 @@ inline cl_int getInfoHelper( const size_type elements = required / sizeof(typename T::cl_type); vector value(elements); - err = f(name, required, value.data(), NULL); + err = f(name, required, value.data(), nullptr); if (err != CL_SUCCESS) { return err; } @@ -1080,7 +1191,7 @@ template inline cl_int getInfoHelper(Func f, cl_uint name, string* param, long) { size_type required; - cl_int err = f(name, 0, NULL, &required); + cl_int err = f(name, 0, nullptr, &required); if (err != CL_SUCCESS) { return err; } @@ -1089,7 +1200,7 @@ inline cl_int getInfoHelper(Func f, cl_uint name, string* param, long) // a char vector does not if (required > 0) { vector value(required); - err = f(name, required, value.data(), NULL); + err = f(name, required, value.data(), nullptr); if (err != CL_SUCCESS) { return err; } @@ -1108,7 +1219,7 @@ template inline cl_int getInfoHelper(Func f, cl_uint name, array* param, long) { size_type required; - cl_int err = f(name, 0, NULL, &required); + cl_int err = f(name, 0, nullptr, &required); if (err != CL_SUCCESS) { return err; } @@ 
-1116,7 +1227,7 @@ inline cl_int getInfoHelper(Func f, cl_uint name, array* param, lo size_type elements = required / sizeof(size_type); vector value(elements, 0); - err = f(name, required, value.data(), NULL); + err = f(name, required, value.data(), nullptr); if (err != CL_SUCCESS) { return err; } @@ -1145,12 +1256,12 @@ template inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_type = 0) { typename T::cl_type value; - cl_int err = f(name, sizeof(value), &value, NULL); + cl_int err = f(name, sizeof(value), &value, nullptr); if (err != CL_SUCCESS) { return err; } *param = value; - if (value != NULL) + if (value != nullptr) { err = param->retain(); if (err != CL_SUCCESS) { @@ -1389,8 +1500,8 @@ inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_ F(cl_program_info, CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT, cl_bool) \ F(cl_program_info, CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT, cl_bool) -#define CL_HPP_PARAM_NAME_DEVICE_FISSION_(F) \ - F(cl_device_info, CL_DEVICE_PARENT_DEVICE_EXT, cl_device_id) \ +#define CL_HPP_PARAM_NAME_DEVICE_FISSION_EXT_(F) \ + F(cl_device_info, CL_DEVICE_PARENT_DEVICE_EXT, cl::Device) \ F(cl_device_info, CL_DEVICE_PARTITION_TYPES_EXT, cl::vector) \ F(cl_device_info, CL_DEVICE_AFFINITY_DOMAINS_EXT, cl::vector) \ F(cl_device_info, CL_DEVICE_REFERENCE_COUNT_EXT , cl_uint) \ @@ -1408,6 +1519,15 @@ inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_ #define CL_HPP_PARAM_NAME_CL_KHR_EXTENDED_VERSIONING_KHRONLY_(F) \ F(cl_device_info, CL_DEVICE_OPENCL_C_NUMERIC_VERSION_KHR, cl_version_khr) +#define CL_HPP_PARAM_NAME_CL_KHR_SEMAPHORE_(F) \ + F(cl_semaphore_info_khr, CL_SEMAPHORE_PROPERTIES_KHR, cl::vector) \ + F(cl_platform_info, CL_PLATFORM_SEMAPHORE_TYPES_KHR, cl::vector) \ + F(cl_device_info, CL_DEVICE_SEMAPHORE_TYPES_KHR, cl::vector) \ + +#define CL_HPP_PARAM_NAME_CL_KHR_EXTERNAL_MEMORY_(F) \ + F(cl_device_info, CL_DEVICE_EXTERNAL_MEMORY_IMPORT_HANDLE_TYPES_KHR, cl::vector) \ + F(cl_platform_info, CL_PLATFORM_EXTERNAL_MEMORY_IMPORT_HANDLE_TYPES_KHR, cl::vector) + #define CL_HPP_PARAM_NAME_INFO_3_0_(F) \ F(cl_platform_info, CL_PLATFORM_NUMERIC_VERSION, cl_version) \ F(cl_platform_info, CL_PLATFORM_EXTENSIONS_WITH_VERSION, cl::vector) \ @@ -1431,7 +1551,7 @@ inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_ F(cl_command_queue_info, CL_QUEUE_PROPERTIES_ARRAY, cl::vector) \ F(cl_mem_info, CL_MEM_PROPERTIES, cl::vector) \ F(cl_pipe_info, CL_PIPE_PROPERTIES, cl::vector) \ - F(cl_sampler_info, CL_SAMPLER_PROPERTIES, cl::vector) + F(cl_sampler_info, CL_SAMPLER_PROPERTIES, cl::vector) \ template struct param_traits {}; @@ -1465,13 +1585,13 @@ CL_HPP_PARAM_NAME_INFO_2_2_(CL_HPP_DECLARE_PARAM_TRAITS_) CL_HPP_PARAM_NAME_INFO_3_0_(CL_HPP_DECLARE_PARAM_TRAITS_) #endif // CL_HPP_TARGET_OPENCL_VERSION >= 300 -#if defined(CL_HPP_USE_CL_SUB_GROUPS_KHR) && CL_HPP_TARGET_OPENCL_VERSION < 210 +#if defined(cl_khr_subgroups) && CL_HPP_TARGET_OPENCL_VERSION < 210 CL_HPP_PARAM_NAME_INFO_SUBGROUP_KHR_(CL_HPP_DECLARE_PARAM_TRAITS_) -#endif // #if defined(CL_HPP_USE_CL_SUB_GROUPS_KHR) && CL_HPP_TARGET_OPENCL_VERSION < 210 +#endif // #if defined(cl_khr_subgroups) && CL_HPP_TARGET_OPENCL_VERSION < 210 -#if defined(CL_HPP_USE_IL_KHR) +#if defined(cl_khr_il_program) && CL_HPP_TARGET_OPENCL_VERSION < 210 CL_HPP_PARAM_NAME_INFO_IL_KHR_(CL_HPP_DECLARE_PARAM_TRAITS_) -#endif // #if defined(CL_HPP_USE_IL_KHR) +#endif // #if defined(cl_khr_il_program) && CL_HPP_TARGET_OPENCL_VERSION < 210 // Flags 
deprecated in OpenCL 2.0 @@ -1496,9 +1616,9 @@ CL_HPP_PARAM_NAME_INFO_1_1_DEPRECATED_IN_2_0_(CL_HPP_DECLARE_PARAM_TRAITS_) CL_HPP_PARAM_NAME_INFO_1_2_DEPRECATED_IN_2_0_(CL_HPP_DECLARE_PARAM_TRAITS_) #endif // CL_HPP_MINIMUM_OPENCL_VERSION < 200 -#if defined(CL_HPP_USE_CL_DEVICE_FISSION) -CL_HPP_PARAM_NAME_DEVICE_FISSION_(CL_HPP_DECLARE_PARAM_TRAITS_); -#endif // CL_HPP_USE_CL_DEVICE_FISSION +#if defined(cl_ext_device_fission) +CL_HPP_PARAM_NAME_DEVICE_FISSION_EXT_(CL_HPP_DECLARE_PARAM_TRAITS_) +#endif // cl_ext_device_fission #if defined(cl_khr_extended_versioning) #if CL_HPP_TARGET_OPENCL_VERSION < 300 @@ -1507,6 +1627,14 @@ CL_HPP_PARAM_NAME_CL_KHR_EXTENDED_VERSIONING_CL3_SHARED_(CL_HPP_DECLARE_PARAM_TR CL_HPP_PARAM_NAME_CL_KHR_EXTENDED_VERSIONING_KHRONLY_(CL_HPP_DECLARE_PARAM_TRAITS_) #endif // cl_khr_extended_versioning +#if defined(cl_khr_semaphore) +CL_HPP_PARAM_NAME_CL_KHR_SEMAPHORE_(CL_HPP_DECLARE_PARAM_TRAITS_) +#endif // cl_khr_semaphore + +#ifdef cl_khr_external_memory +CL_HPP_PARAM_NAME_CL_KHR_EXTERNAL_MEMORY_(CL_HPP_DECLARE_PARAM_TRAITS_) +#endif // cl_khr_external_memory + #if defined(cl_khr_device_uuid) using uuid_array = array; using luid_array = array; @@ -1521,9 +1649,23 @@ CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_NODE_MASK_KHR, cl_uint) CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_PCI_BUS_INFO_KHR, cl_device_pci_bus_info_khr) #endif +// Note: some headers do not define cl_khr_image2d_from_buffer +#if CL_HPP_TARGET_OPENCL_VERSION < 200 +#if defined(CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR, cl_uint) +#endif +#if defined(CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR, cl_uint) +#endif +#endif // CL_HPP_TARGET_OPENCL_VERSION < 200 + #if defined(cl_khr_integer_dot_product) CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_INTEGER_DOT_PRODUCT_CAPABILITIES_KHR, cl_device_integer_dot_product_capabilities_khr) -#endif +#if defined(CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR, cl_device_integer_dot_product_acceleration_properties_khr) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_4x8BIT_PACKED_KHR, cl_device_integer_dot_product_acceleration_properties_khr) +#endif // defined(CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR) +#endif // defined(cl_khr_integer_dot_product) #ifdef CL_PLATFORM_ICD_SUFFIX_KHR CL_HPP_DECLARE_PARAM_TRAITS_(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, string) @@ -1562,6 +1704,9 @@ CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUT #ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LOCAL_MEM_BANKS_AMD, cl_uint) #endif +#ifdef CL_DEVICE_BOARD_NAME_AMD +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_BOARD_NAME_AMD, string) +#endif #ifdef CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM, cl_ulong) @@ -1572,12 +1717,27 @@ CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_JOB_SLOTS_ARM, cl_uint) #ifdef CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM, cl_bitfield) #endif +#ifdef CL_DEVICE_SUPPORTED_REGISTER_ALLOCATIONS_ARM 
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SUPPORTED_REGISTER_ALLOCATIONS_ARM, vector) +#endif +#ifdef CL_DEVICE_MAX_WARP_COUNT_ARM +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_MAX_WARP_COUNT_ARM, cl_uint) +#endif +#ifdef CL_KERNEL_MAX_WARP_COUNT_ARM +CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_info, CL_KERNEL_MAX_WARP_COUNT_ARM, cl_uint) +#endif #ifdef CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_ARM CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_ARM, cl_uint) #endif #ifdef CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM, cl_int) #endif +#ifdef CL_KERNEL_EXEC_INFO_WARP_COUNT_LIMIT_ARM +CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_WARP_COUNT_LIMIT_ARM, cl_uint) +#endif +#ifdef CL_KERNEL_EXEC_INFO_COMPUTE_UNIT_MAX_QUEUED_BATCHES_ARM +CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_COMPUTE_UNIT_MAX_QUEUED_BATCHES_ARM, cl_uint) +#endif #ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint) @@ -1601,6 +1761,28 @@ CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, c CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_INTEGRATED_MEMORY_NV, cl_bool) #endif +#if defined(cl_khr_command_buffer) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMMAND_BUFFER_CAPABILITIES_KHR, cl_device_command_buffer_capabilities_khr) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMMAND_BUFFER_REQUIRED_QUEUE_PROPERTIES_KHR, cl_command_buffer_properties_khr) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_command_buffer_info_khr, CL_COMMAND_BUFFER_QUEUES_KHR, cl::vector) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_command_buffer_info_khr, CL_COMMAND_BUFFER_NUM_QUEUES_KHR, cl_uint) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_command_buffer_info_khr, CL_COMMAND_BUFFER_REFERENCE_COUNT_KHR, cl_uint) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_command_buffer_info_khr, CL_COMMAND_BUFFER_STATE_KHR, cl_command_buffer_state_khr) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_command_buffer_info_khr, CL_COMMAND_BUFFER_PROPERTIES_ARRAY_KHR, cl::vector) +#endif /* cl_khr_command_buffer */ + +#if defined(cl_khr_command_buffer_mutable_dispatch) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, CL_MUTABLE_COMMAND_COMMAND_QUEUE_KHR, CommandQueue) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, CL_MUTABLE_COMMAND_COMMAND_BUFFER_KHR, CommandBufferKhr) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, CL_MUTABLE_COMMAND_COMMAND_TYPE_KHR, cl_command_type) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, CL_MUTABLE_DISPATCH_PROPERTIES_ARRAY_KHR, cl::vector) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, CL_MUTABLE_DISPATCH_KERNEL_KHR, cl_kernel) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, CL_MUTABLE_DISPATCH_DIMENSIONS_KHR, cl_uint) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, CL_MUTABLE_DISPATCH_GLOBAL_WORK_OFFSET_KHR, cl::vector) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, CL_MUTABLE_DISPATCH_GLOBAL_WORK_SIZE_KHR, cl::vector) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, CL_MUTABLE_DISPATCH_LOCAL_WORK_SIZE_KHR, cl::vector) +#endif /* cl_khr_command_buffer_mutable_dispatch */ + // Convenience functions template @@ -1769,6 +1951,62 @@ struct ReferenceHandler { return ::clReleaseEvent(event); } }; +#ifdef cl_khr_semaphore +template <> +struct 
ReferenceHandler +{ + static cl_int retain(cl_semaphore_khr semaphore) + { + if (pfn_clRetainSemaphoreKHR != nullptr) { + return pfn_clRetainSemaphoreKHR(semaphore); + } + + return CL_INVALID_OPERATION; + } + + static cl_int release(cl_semaphore_khr semaphore) + { + if (pfn_clReleaseSemaphoreKHR != nullptr) { + return pfn_clReleaseSemaphoreKHR(semaphore); + } + + return CL_INVALID_OPERATION; + } +}; +#endif // cl_khr_semaphore +#if defined(cl_khr_command_buffer) +template <> +struct ReferenceHandler +{ + static cl_int retain(cl_command_buffer_khr cmdBufferKhr) + { + if (pfn_clRetainCommandBufferKHR == nullptr) { + return detail::errHandler(CL_INVALID_OPERATION, __RETAIN_COMMAND_BUFFER_KHR_ERR); + } + return pfn_clRetainCommandBufferKHR(cmdBufferKhr); + } + + static cl_int release(cl_command_buffer_khr cmdBufferKhr) + { + if (pfn_clReleaseCommandBufferKHR == nullptr) { + return detail::errHandler(CL_INVALID_OPERATION, __RELEASE_COMMAND_BUFFER_KHR_ERR); + } + return pfn_clReleaseCommandBufferKHR(cmdBufferKhr); + } +}; + +template <> +struct ReferenceHandler +{ + // cl_mutable_command_khr does not have retain(). + static cl_int retain(cl_mutable_command_khr) + { return CL_SUCCESS; } + // cl_mutable_command_khr does not have release(). + static cl_int release(cl_mutable_command_khr) + { return CL_SUCCESS; } +}; +#endif // cl_khr_command_buffer + #if CL_HPP_TARGET_OPENCL_VERSION >= 120 && CL_HPP_MINIMUM_OPENCL_VERSION < 120 // Extracts version number with major in the upper 16 bits, minor in the lower 16 @@ -1794,7 +2032,7 @@ static cl_uint getVersion(const vector &versionInfo) static cl_uint getPlatformVersion(cl_platform_id platform) { size_type size = 0; - clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, NULL, &size); + clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, nullptr, &size); vector versionInfo(size); clGetPlatformInfo(platform, CL_PLATFORM_VERSION, size, versionInfo.data(), &size); @@ -1804,7 +2042,7 @@ static cl_uint getPlatformVersion(cl_platform_id platform) static cl_uint getDevicePlatformVersion(cl_device_id device) { cl_platform_id platform; - clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform), &platform, NULL); + clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform), &platform, nullptr); return getPlatformVersion(platform); } @@ -1813,11 +2051,11 @@ static cl_uint getContextPlatformVersion(cl_context context) // The platform cannot be queried directly, so we first have to grab a // device and obtain its context size_type size = 0; - clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &size); + clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, nullptr, &size); if (size == 0) return 0; vector devices(size/sizeof(cl_device_id)); - clGetContextInfo(context, CL_CONTEXT_DEVICES, size, devices.data(), NULL); + clGetContextInfo(context, CL_CONTEXT_DEVICES, size, devices.data(), nullptr); return getDevicePlatformVersion(devices[0]); } #endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 && CL_HPP_MINIMUM_OPENCL_VERSION < 120 @@ -1832,7 +2070,7 @@ protected: cl_type object_; public: - Wrapper() : object_(NULL) { } + Wrapper() : object_(nullptr) { } Wrapper(const cl_type &obj, bool retainObject) : object_(obj) { @@ -1843,7 +2081,7 @@ public: ~Wrapper() { - if (object_ != NULL) { release(); } + if (object_ != nullptr) { release(); } } Wrapper(const Wrapper& rhs) @@ -1855,7 +2093,7 @@ public: Wrapper(Wrapper&& rhs) CL_HPP_NOEXCEPT_ { object_ = rhs.object_; - rhs.object_ = NULL; + rhs.object_ = nullptr; } Wrapper& operator = (const Wrapper& rhs) @@ -1873,7 +2111,7 @@ 
public: if (this != &rhs) { detail::errHandler(release(), __RELEASE_ERR); object_ = rhs.object_; - rhs.object_ = NULL; + rhs.object_ = nullptr; } return *this; } @@ -1931,7 +2169,7 @@ protected: bool retVal = false; #if CL_HPP_TARGET_OPENCL_VERSION >= 120 #if CL_HPP_MINIMUM_OPENCL_VERSION < 120 - if (device != NULL) { + if (device != nullptr) { int version = getDevicePlatformVersion(device); if(version > ((1 << 16) + 1)) { retVal = true; @@ -1941,11 +2179,12 @@ protected: retVal = true; #endif // CL_HPP_MINIMUM_OPENCL_VERSION < 120 #endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 + (void)device; return retVal; } public: - Wrapper() : object_(NULL), referenceCountable_(false) + Wrapper() : object_(nullptr), referenceCountable_(false) { } @@ -1976,7 +2215,7 @@ public: { object_ = rhs.object_; referenceCountable_ = rhs.referenceCountable_; - rhs.object_ = NULL; + rhs.object_ = nullptr; rhs.referenceCountable_ = false; } @@ -1997,7 +2236,7 @@ public: detail::errHandler(release(), __RELEASE_ERR); object_ = rhs.object_; referenceCountable_ = rhs.referenceCountable_; - rhs.object_ = NULL; + rhs.object_ = nullptr; rhs.referenceCountable_ = false; } return *this; @@ -2061,51 +2300,7 @@ inline bool operator!=(const Wrapper &lhs, const Wrapper &rhs) //! \endcond -using BuildLogType = vector::param_type>>; -#if defined(CL_HPP_ENABLE_EXCEPTIONS) -/** -* Exception class for build errors to carry build info -*/ -class BuildError : public Error -{ -private: - BuildLogType buildLogs; -public: - BuildError(cl_int err, const char * errStr, const BuildLogType &vec) : Error(err, errStr), buildLogs(vec) - { - } - - BuildLogType getBuildLog() const - { - return buildLogs; - } -}; -namespace detail { - static inline cl_int buildErrHandler( - cl_int err, - const char * errStr, - const BuildLogType &buildLogs) - { - if (err != CL_SUCCESS) { - throw BuildError(err, errStr, buildLogs); - } - return err; - } -} // namespace detail -#else -namespace detail { - static inline cl_int buildErrHandler( - cl_int err, - const char * errStr, - const BuildLogType &buildLogs) - { - (void)buildLogs; // suppress unused variable warning - (void)errStr; - return err; - } -} // namespace detail -#endif // #if defined(CL_HPP_ENABLE_EXCEPTIONS) /*! \stuct ImageFormat @@ -2182,7 +2377,7 @@ public: } #endif // #ifdef CL_HPP_UNIT_TEST_ENABLE - //! \brief Default constructor - initializes to NULL. + //! \brief Default constructor - initializes to nullptr. Device() : detail::Wrapper() { } /*! \brief Constructor from cl_device_id. @@ -2197,11 +2392,11 @@ public: * \see Context::getDefault() */ static Device getDefault( - cl_int *errResult = NULL) + cl_int *errResult = nullptr) { std::call_once(default_initialized_, makeDefault); detail::errHandler(default_error_); - if (errResult != NULL) { + if (errResult != nullptr) { *errResult = default_error_; } return default_; @@ -2230,34 +2425,7 @@ public: detail::Wrapper::operator=(rhs); return *this; } - - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Device(const Device& dev) : detail::Wrapper(dev) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Device& operator = (const Device &dev) - { - detail::Wrapper::operator=(dev); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Device(Device&& dev) CL_HPP_NOEXCEPT_ : detail::Wrapper(std::move(dev)) {} - - /*! 
\brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Device& operator = (Device &&dev) - { - detail::Wrapper::operator=(std::move(dev)); - return *this; - } + //! \brief Wrapper for clGetDeviceInfo(). template @@ -2271,18 +2439,17 @@ public: //! \brief Wrapper for clGetDeviceInfo() that returns by value. template typename detail::param_traits::param_type - getInfo(cl_int* err = NULL) const + getInfo(cl_int* err = nullptr) const { typename detail::param_traits< detail::cl_device_info, name>::param_type param; cl_int result = getInfo(name, ¶m); - if (err != NULL) { + if (err != nullptr) { *err = result; } return param; } - #if CL_HPP_TARGET_OPENCL_VERSION >= 210 /** * Return the current value of the host clock as seen by the device. @@ -2329,9 +2496,6 @@ public: } #endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 - /** - * CL 1.2 version - */ #if CL_HPP_TARGET_OPENCL_VERSION >= 120 //! \brief Wrapper for clCreateSubDevices(). cl_int createSubDevices( @@ -2339,13 +2503,13 @@ public: vector* devices) { cl_uint n = 0; - cl_int err = clCreateSubDevices(object_, properties, 0, NULL, &n); + cl_int err = clCreateSubDevices(object_, properties, 0, nullptr, &n); if (err != CL_SUCCESS) { return detail::errHandler(err, __CREATE_SUB_DEVICES_ERR); } vector ids(n); - err = clCreateSubDevices(object_, properties, n, ids.data(), NULL); + err = clCreateSubDevices(object_, properties, n, ids.data(), nullptr); if (err != CL_SUCCESS) { return detail::errHandler(err, __CREATE_SUB_DEVICES_ERR); } @@ -2366,11 +2530,10 @@ public: return CL_SUCCESS; } -#elif defined(CL_HPP_USE_CL_DEVICE_FISSION) +#endif -/** - * CL 1.1 version that uses device fission extension. - */ +#if defined(cl_ext_device_fission) + //! \brief Wrapper for clCreateSubDevices(). 
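// --- Editor's note: illustrative sketch, not part of the patch. ---
// Caller-side use of the CL 1.2 createSubDevices() wrapper above: split a
// device into two equal compute-unit partitions. Assumes the device reports
// CL_DEVICE_PARTITION_EQUALLY among its supported partition types and that
// cl::vector is the default std::vector alias.

static void example_partition(cl::Device &device)
{
    cl_device_partition_property props[] = { CL_DEVICE_PARTITION_EQUALLY, 2, 0 };
    cl::vector<cl::Device> subDevices;
    cl_int err = device.createSubDevices(props, &subDevices);
    // On CL_SUCCESS each element owns its sub-device handle and releases it
    // when the wrapper is destroyed.
    (void)err;
}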
cl_int createSubDevices( const cl_device_partition_property_ext * properties, vector* devices) @@ -2383,17 +2546,24 @@ public: cl_device_id * /*out_devices*/, cl_uint * /*num_devices*/ ) CL_API_SUFFIX__VERSION_1_1; - static PFN_clCreateSubDevicesEXT pfn_clCreateSubDevicesEXT = NULL; + static PFN_clCreateSubDevicesEXT pfn_clCreateSubDevicesEXT = nullptr; +#if CL_HPP_TARGET_OPENCL_VERSION >= 120 + cl::Device device(object_); + cl_platform_id platform = device.getInfo(); + CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCreateSubDevicesEXT); +#endif +#if CL_HPP_MINIMUM_OPENCL_VERSION < 120 CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateSubDevicesEXT); +#endif cl_uint n = 0; - cl_int err = pfn_clCreateSubDevicesEXT(object_, properties, 0, NULL, &n); + cl_int err = pfn_clCreateSubDevicesEXT(object_, properties, 0, nullptr, &n); if (err != CL_SUCCESS) { return detail::errHandler(err, __CREATE_SUB_DEVICES_ERR); } vector ids(n); - err = pfn_clCreateSubDevicesEXT(object_, properties, n, ids.data(), NULL); + err = pfn_clCreateSubDevicesEXT(object_, properties, n, ids.data(), nullptr); if (err != CL_SUCCESS) { return detail::errHandler(err, __CREATE_SUB_DEVICES_ERR); } @@ -2410,10 +2580,57 @@ public: (*devices)[i] = Device(ids[i], false); } } + return CL_SUCCESS; } -#endif // defined(CL_HPP_USE_CL_DEVICE_FISSION) +#endif // defined(cl_ext_device_fission) +}; + +using BuildLogType = vector::param_type>>; +#if defined(CL_HPP_ENABLE_EXCEPTIONS) +/** +* Exception class for build errors to carry build info +*/ +class BuildError : public Error +{ +private: + BuildLogType buildLogs; +public: + BuildError(cl_int err, const char * errStr, const BuildLogType &vec) : Error(err, errStr), buildLogs(vec) + { + } + + BuildLogType getBuildLog() const + { + return buildLogs; + } }; +namespace detail { + static inline cl_int buildErrHandler( + cl_int err, + const char * errStr, + const BuildLogType &buildLogs) + { + if (err != CL_SUCCESS) { + throw BuildError(err, errStr, buildLogs); + } + return err; + } +} // namespace detail + +#else +namespace detail { + static inline cl_int buildErrHandler( + cl_int err, + const char * errStr, + const BuildLogType &buildLogs) + { + (void)buildLogs; // suppress unused variable warning + (void)errStr; + return err; + } +} // namespace detail +#endif // #if defined(CL_HPP_ENABLE_EXCEPTIONS) CL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag Device::default_initialized_; CL_HPP_DEFINE_STATIC_MEMBER_ Device Device::default_; @@ -2450,7 +2667,7 @@ private: // Otherwise set it cl_uint n = 0; - cl_int err = ::clGetPlatformIDs(0, NULL, &n); + cl_int err = ::clGetPlatformIDs(0, nullptr, &n); if (err != CL_SUCCESS) { default_error_ = err; return; @@ -2461,7 +2678,7 @@ private: } vector ids(n); - err = ::clGetPlatformIDs(n, ids.data(), NULL); + err = ::clGetPlatformIDs(n, ids.data(), nullptr); if (err != CL_SUCCESS) { default_error_ = err; return; @@ -2498,7 +2715,7 @@ public: } #endif // #ifdef CL_HPP_UNIT_TEST_ENABLE - //! \brief Default constructor - initializes to NULL. + //! \brief Default constructor - initializes to nullptr. Platform() : detail::Wrapper() { } /*! \brief Constructor from cl_platform_id. @@ -2522,11 +2739,11 @@ public: } static Platform getDefault( - cl_int *errResult = NULL) + cl_int *errResult = nullptr) { std::call_once(default_initialized_, makeDefault); detail::errHandler(default_error_); - if (errResult != NULL) { + if (errResult != nullptr) { *errResult = default_error_; } return default_; @@ -2558,12 +2775,12 @@ public: //! 
\brief Wrapper for clGetPlatformInfo() that returns by value. template typename detail::param_traits::param_type - getInfo(cl_int* err = NULL) const + getInfo(cl_int* err = nullptr) const { typename detail::param_traits< detail::cl_platform_info, name>::param_type param; cl_int result = getInfo(name, ¶m); - if (err != NULL) { + if (err != nullptr) { *err = result; } return param; @@ -2578,17 +2795,17 @@ public: vector* devices) const { cl_uint n = 0; - if( devices == NULL ) { + if( devices == nullptr ) { return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR); } - cl_int err = ::clGetDeviceIDs(object_, type, 0, NULL, &n); + cl_int err = ::clGetDeviceIDs(object_, type, 0, nullptr, &n); if (err != CL_SUCCESS && err != CL_DEVICE_NOT_FOUND) { return detail::errHandler(err, __GET_DEVICE_IDS_ERR); } vector ids(n); if (n>0) { - err = ::clGetDeviceIDs(object_, type, n, ids.data(), NULL); + err = ::clGetDeviceIDs(object_, type, n, ids.data(), nullptr); if (err != CL_SUCCESS) { return detail::errHandler(err, __GET_DEVICE_IDS_ERR); } @@ -2621,7 +2838,7 @@ public: * * \param devices returns a vector of OpenCL D3D10 devices found. The cl::Device * values returned in devices can be used to identify a specific OpenCL - * device. If \a devices argument is NULL, this argument is ignored. + * device. If \a devices argument is nullptr, this argument is ignored. * * \return One of the following values: * - CL_SUCCESS if the function is executed successfully. @@ -2649,12 +2866,17 @@ public: cl_device_id * devices, cl_uint* num_devices); - if( devices == NULL ) { + if( devices == nullptr ) { return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR); } - static PFN_clGetDeviceIDsFromD3D10KHR pfn_clGetDeviceIDsFromD3D10KHR = NULL; + static PFN_clGetDeviceIDsFromD3D10KHR pfn_clGetDeviceIDsFromD3D10KHR = nullptr; +#if CL_HPP_TARGET_OPENCL_VERSION >= 120 CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(object_, clGetDeviceIDsFromD3D10KHR); +#endif +#if CL_HPP_MINIMUM_OPENCL_VERSION < 120 + CL_HPP_INIT_CL_EXT_FCN_PTR_(clGetDeviceIDsFromD3D10KHR); +#endif cl_uint n = 0; cl_int err = pfn_clGetDeviceIDsFromD3D10KHR( @@ -2663,7 +2885,7 @@ public: d3d_object, d3d_device_set, 0, - NULL, + nullptr, &n); if (err != CL_SUCCESS) { return detail::errHandler(err, __GET_DEVICE_IDS_ERR); @@ -2677,7 +2899,7 @@ public: d3d_device_set, n, ids.data(), - NULL); + nullptr); if (err != CL_SUCCESS) { return detail::errHandler(err, __GET_DEVICE_IDS_ERR); } @@ -2708,17 +2930,17 @@ public: { cl_uint n = 0; - if( platforms == NULL ) { + if( platforms == nullptr ) { return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR); } - cl_int err = ::clGetPlatformIDs(0, NULL, &n); + cl_int err = ::clGetPlatformIDs(0, nullptr, &n); if (err != CL_SUCCESS) { return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); } vector ids(n); - err = ::clGetPlatformIDs(n, ids.data(), NULL); + err = ::clGetPlatformIDs(n, ids.data(), nullptr); if (err != CL_SUCCESS) { return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); } @@ -2758,7 +2980,7 @@ public: * Wraps clGetPlatformIDs(), returning the first result. 
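// --- Editor's note: illustrative sketch, not part of the patch. ---
// The get()/getDevices() wrappers above hide the usual two-call size-query
// pattern of clGetPlatformIDs()/clGetDeviceIDs(). Typical caller-side code
// (assumes the default cl::vector/cl::string aliases, i.e. std::vector and
// std::string; the function name is local to the sketch):

static void example_enumerate()
{
    std::vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);                        // all platforms
    if (platforms.empty()) return;

    std::vector<cl::Device> gpus;
    platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &gpus);   // left empty if none present

    for (const cl::Device &d : gpus) {
        std::string name = d.getInfo<CL_DEVICE_NAME>();   // by-value getInfo overload above
        (void)name;
    }
}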
*/ static Platform get( - cl_int * errResult = NULL) + cl_int * errResult = nullptr) { cl_int err; Platform default_platform = Platform::getDefault(&err); @@ -2842,8 +3064,8 @@ private: default_ = Context( CL_DEVICE_TYPE_DEFAULT, properties, - NULL, - NULL, + nullptr, + nullptr, &default_error_); } #if defined(CL_HPP_ENABLE_EXCEPTIONS) @@ -2882,14 +3104,14 @@ public: */ Context( const vector& devices, - const cl_context_properties* properties = NULL, + const cl_context_properties* properties = nullptr, void (CL_CALLBACK * notifyFptr)( const char *, const void *, size_type, - void *) = NULL, - void* data = NULL, - cl_int* err = NULL) + void *) = nullptr, + void* data = nullptr, + cl_int* err = nullptr) { cl_int error; @@ -2906,7 +3128,7 @@ public: notifyFptr, data, &error); detail::errHandler(error, __CREATE_CONTEXT_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -2917,14 +3139,14 @@ public: */ Context( const Device& device, - const cl_context_properties* properties = NULL, + const cl_context_properties* properties = nullptr, void (CL_CALLBACK * notifyFptr)( const char *, const void *, size_type, - void *) = NULL, - void* data = NULL, - cl_int* err = NULL) + void *) = nullptr, + void* data = nullptr, + cl_int* err = nullptr) { cl_int error; @@ -2936,7 +3158,7 @@ public: notifyFptr, data, &error); detail::errHandler(error, __CREATE_CONTEXT_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -2947,27 +3169,27 @@ public: */ Context( cl_device_type type, - const cl_context_properties* properties = NULL, + const cl_context_properties* properties = nullptr, void (CL_CALLBACK * notifyFptr)( const char *, const void *, size_type, - void *) = NULL, - void* data = NULL, - cl_int* err = NULL) + void *) = nullptr, + void* data = nullptr, + cl_int* err = nullptr) { cl_int error; #if !defined(__APPLE__) && !defined(__MACOS) cl_context_properties prop[4] = {CL_CONTEXT_PLATFORM, 0, 0, 0 }; - if (properties == NULL) { + if (properties == nullptr) { // Get a valid platform ID as we cannot send in a blank one vector platforms; error = Platform::get(&platforms); if (error != CL_SUCCESS) { detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } return; @@ -2996,7 +3218,7 @@ public: // Only squash CL_SUCCESS and CL_DEVICE_NOT_FOUND if (error != CL_SUCCESS && error != CL_DEVICE_NOT_FOUND) { detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -3009,7 +3231,7 @@ public: if (platform_id == 0) { detail::errHandler(CL_DEVICE_NOT_FOUND, __CREATE_CONTEXT_FROM_TYPE_ERR); - if (err != NULL) { + if (err != nullptr) { *err = CL_DEVICE_NOT_FOUND; } return; @@ -3023,49 +3245,21 @@ public: properties, type, notifyFptr, data, &error); detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Context(const Context& ctx) : detail::Wrapper(ctx) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Context& operator = (const Context &ctx) - { - detail::Wrapper::operator=(ctx); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Context(Context&& ctx) CL_HPP_NOEXCEPT_ : detail::Wrapper(std::move(ctx)) {} - - /*! 
\brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Context& operator = (Context &&ctx) - { - detail::Wrapper::operator=(std::move(ctx)); - return *this; - } - /*! \brief Returns a singleton context including all devices of CL_DEVICE_TYPE_DEFAULT. * * \note All calls to this function return the same cl_context as the first. */ - static Context getDefault(cl_int * err = NULL) + static Context getDefault(cl_int * err = nullptr) { std::call_once(default_initialized_, makeDefault); detail::errHandler(default_error_); - if (err != NULL) { + if (err != nullptr) { *err = default_error_; } return default_; @@ -3085,7 +3279,7 @@ public: return default_; } - //! \brief Default constructor - initializes to NULL. + //! \brief Default constructor - initializes to nullptr. Context() : detail::Wrapper() { } /*! \brief Constructor from cl_context - takes ownership. @@ -3119,12 +3313,12 @@ public: //! \brief Wrapper for clGetContextInfo() that returns by value. template typename detail::param_traits::param_type - getInfo(cl_int* err = NULL) const + getInfo(cl_int* err = nullptr) const { typename detail::param_traits< detail::cl_context_info, name>::param_type param; cl_int result = getInfo(name, ¶m); - if (err != NULL) { + if (err != nullptr) { *err = result; } return param; @@ -3150,7 +3344,7 @@ public: flags, type, 0, - NULL, + nullptr, &numEntries); if (err != CL_SUCCESS) { return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR); @@ -3164,7 +3358,7 @@ public: type, numEntries, (cl_image_format*)value.data(), - NULL); + nullptr); if (err != CL_SUCCESS) { return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR); } @@ -3178,6 +3372,30 @@ public: return CL_SUCCESS; } + +#if CL_HPP_TARGET_OPENCL_VERSION >= 300 + /*! \brief Registers a destructor callback function with a context. + * + * Wraps clSetContextDestructorCallback(). + * + * Each call to this function registers the specified callback function on + * a destructor callback stack associated with context. The registered + * callback functions are called in the reverse order in which they were registered. + * If a context callback function was specified when context was created, + * it will not be called after any context destructor callback is called. + */ + cl_int setDestructorCallback( + void (CL_CALLBACK * pfn_notify)(cl_context, void *), + void * user_data = nullptr) + { + return detail::errHandler( + ::clSetContextDestructorCallback( + object_, + pfn_notify, + user_data), + __SET_CONTEXT_DESCTRUCTOR_CALLBACK_ERR); + } +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 300 }; inline void Device::makeDefault() @@ -3224,7 +3442,7 @@ CL_HPP_DEFINE_STATIC_MEMBER_ cl_int Context::default_error_ = CL_SUCCESS; class Event : public detail::Wrapper { public: - //! \brief Default constructor - initializes to NULL. + //! \brief Default constructor - initializes to nullptr. Event() : detail::Wrapper() { } /*! \brief Constructor from cl_event - takes ownership. @@ -3261,12 +3479,12 @@ public: //! \brief Wrapper for clGetEventInfo() that returns by value. template typename detail::param_traits::param_type - getInfo(cl_int* err = NULL) const + getInfo(cl_int* err = nullptr) const { typename detail::param_traits< detail::cl_event_info, name>::param_type param; cl_int result = getInfo(name, ¶m); - if (err != NULL) { + if (err != nullptr) { *err = result; } return param; @@ -3284,12 +3502,12 @@ public: //! \brief Wrapper for clGetEventProfilingInfo() that returns by value. 
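// --- Editor's note: illustrative sketch, not part of the patch. ---
// Minimal use of the new CL 3.0 Context::setDestructorCallback() wrapper
// introduced above (requires CL_HPP_TARGET_OPENCL_VERSION >= 300). The
// callback must be a plain function with the CL_CALLBACK convention;
// registered callbacks run in reverse registration order when the underlying
// cl_context is destroyed. Function names here are local to the sketch.

static void CL_CALLBACK on_context_destroyed(cl_context /*context*/, void *user_data)
{
    (void)user_data; // e.g. drop caches keyed on this context
}

static void example_register_destructor(cl::Context &ctx)
{
    cl_int err = ctx.setDestructorCallback(on_context_destroyed, /*user_data=*/nullptr);
    (void)err;
}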
template typename detail::param_traits::param_type - getProfilingInfo(cl_int* err = NULL) const + getProfilingInfo(cl_int* err = nullptr) const { typename detail::param_traits< detail::cl_profiling_info, name>::param_type param; cl_int result = getProfilingInfo(name, ¶m); - if (err != NULL) { + if (err != nullptr) { *err = result; } return param; @@ -3314,7 +3532,7 @@ public: cl_int setCallback( cl_int type, void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *), - void * user_data = NULL) + void * user_data = nullptr) { return detail::errHandler( ::clSetEventCallback( @@ -3333,9 +3551,12 @@ public: static cl_int waitForEvents(const vector& events) { + static_assert(sizeof(cl::Event) == sizeof(cl_event), + "Size of cl::Event must be equal to size of cl_event"); + return detail::errHandler( ::clWaitForEvents( - (cl_uint) events.size(), (events.size() > 0) ? (cl_event*)&events.front() : NULL), + (cl_uint) events.size(), (events.size() > 0) ? (cl_event*)&events.front() : nullptr), __WAIT_FOR_EVENTS_ERR); } }; @@ -3354,7 +3575,7 @@ public: */ UserEvent( const Context& context, - cl_int * err = NULL) + cl_int * err = nullptr) { cl_int error; object_ = ::clCreateUserEvent( @@ -3362,12 +3583,12 @@ public: &error); detail::errHandler(error, __CREATE_USER_EVENT_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } - //! \brief Default constructor - initializes to NULL. + //! \brief Default constructor - initializes to nullptr. UserEvent() : Event() { } /*! \brief Sets the execution status of a user event object. @@ -3392,7 +3613,7 @@ WaitForEvents(const vector& events) { return detail::errHandler( ::clWaitForEvents( - (cl_uint) events.size(), (events.size() > 0) ? (cl_event*)&events.front() : NULL), + (cl_uint) events.size(), (events.size() > 0) ? (cl_event*)&events.front() : nullptr), __WAIT_FOR_EVENTS_ERR); } @@ -3407,7 +3628,7 @@ WaitForEvents(const vector& events) class Memory : public detail::Wrapper { public: - //! \brief Default constructor - initializes to NULL. + //! \brief Default constructor - initializes to nullptr. Memory() : detail::Wrapper() { } /*! \brief Constructor from cl_mem - takes ownership. @@ -3435,38 +3656,9 @@ public: return *this; } - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Memory(const Memory& mem) : detail::Wrapper(mem) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Memory& operator = (const Memory &mem) - { - detail::Wrapper::operator=(mem); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Memory(Memory&& mem) CL_HPP_NOEXCEPT_ : detail::Wrapper(std::move(mem)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Memory& operator = (Memory &&mem) - { - detail::Wrapper::operator=(std::move(mem)); - return *this; - } - - - //! \brief Wrapper for clGetMemObjectInfo(). - template - cl_int getInfo(cl_mem_info name, T* param) const + //! \brief Wrapper for clGetMemObjectInfo(). + template + cl_int getInfo(cl_mem_info name, T* param) const { return detail::errHandler( detail::getInfo(&::clGetMemObjectInfo, object_, name, param), @@ -3476,12 +3668,12 @@ public: //! \brief Wrapper for clGetMemObjectInfo() that returns by value. 
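// --- Editor's note: illustrative sketch, not part of the patch. ---
// Combining the UserEvent and Event::setCallback() wrappers above: gate
// enqueued work on a host-side signal and be notified once the gate event
// reaches CL_COMPLETE. Function names are local to the sketch.

static void CL_CALLBACK on_gate_complete(cl_event, cl_int /*status*/, void *) { /* notify host */ }

static void example_user_event(const cl::Context &ctx)
{
    cl::UserEvent gate(ctx);                          // wraps clCreateUserEvent
    gate.setCallback(CL_COMPLETE, on_gate_complete);  // wraps clSetEventCallback
    // ... enqueue commands whose wait list contains 'gate' ...
    gate.setStatus(CL_COMPLETE);                      // wraps clSetUserEventStatus; releases the gated work
}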
template typename detail::param_traits::param_type - getInfo(cl_int* err = NULL) const + getInfo(cl_int* err = nullptr) const { typename detail::param_traits< detail::cl_mem_info, name>::param_type param; cl_int result = getInfo(name, ¶m); - if (err != NULL) { + if (err != nullptr) { *err = result; } return param; @@ -3503,7 +3695,7 @@ public: */ cl_int setDestructorCallback( void (CL_CALLBACK * pfn_notify)(cl_mem, void *), - void * user_data = NULL) + void * user_data = nullptr) { return detail::errHandler( ::clSetMemObjectDestructorCallback( @@ -3615,8 +3807,8 @@ inline cl_int enqueueMapSVM( cl_bool blocking, cl_map_flags flags, size_type size, - const vector* events = NULL, - Event* event = NULL); + const vector* events = nullptr, + Event* event = nullptr); /** * STL-like allocator class for managing SVM objects provided for convenience. @@ -3845,7 +4037,7 @@ cl::pointer> allocate_pointer(const Alloc &alloc_, Arg return cl::pointer>(tmp, detail::Deleter{alloc, copies}); } - catch (std::bad_alloc& b) + catch (std::bad_alloc&) { std::allocator_traits::deallocate(alloc, tmp, copies); throw; @@ -3909,44 +4101,90 @@ public: const Context& context, cl_mem_flags flags, size_type size, - void* host_ptr = NULL, - cl_int* err = NULL) + void* host_ptr = nullptr, + cl_int* err = nullptr) { cl_int error; object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error); detail::errHandler(error, __CREATE_BUFFER_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } - /*! \brief Constructs a Buffer in the default context. +#if CL_HPP_TARGET_OPENCL_VERSION >= 300 + /*! \brief Constructs a Buffer in a specified context and with specified properties. * - * Wraps clCreateBuffer(). + * Wraps clCreateBufferWithProperties(). * + * \param properties Optional list of properties for the buffer object and + * their corresponding values. The non-empty list must + * end with 0. * \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was - * specified. Note alignment & exclusivity requirements. - * - * \see Context::getDefault() + * specified. Note alignment & exclusivity requirements. */ Buffer( - cl_mem_flags flags, + const Context& context, + const vector& properties, + cl_mem_flags flags, size_type size, - void* host_ptr = NULL, - cl_int* err = NULL) + void* host_ptr = nullptr, + cl_int* err = nullptr) { cl_int error; - Context context = Context::getDefault(err); - - object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error); + if (properties.empty()) { + object_ = ::clCreateBufferWithProperties(context(), nullptr, flags, + size, host_ptr, &error); + } + else { + object_ = ::clCreateBufferWithProperties( + context(), properties.data(), flags, size, host_ptr, &error); + } detail::errHandler(error, __CREATE_BUFFER_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } +#endif + + /*! \brief Constructs a Buffer in the default context. + * + * Wraps clCreateBuffer(). + * + * \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was + * specified. Note alignment & exclusivity requirements. + * + * \see Context::getDefault() + */ + Buffer( + cl_mem_flags flags, + size_type size, + void* host_ptr = nullptr, + cl_int* err = nullptr) : Buffer(Context::getDefault(err), flags, size, host_ptr, err) { } + +#if CL_HPP_TARGET_OPENCL_VERSION >= 300 + /*! \brief Constructs a Buffer in the default context and with specified properties. + * + * Wraps clCreateBufferWithProperties(). 
+ * + * \param properties Optional list of properties for the buffer object and + * their corresponding values. The non-empty list must + * end with 0. + * \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was + * specified. Note alignment & exclusivity requirements. + * + * \see Context::getDefault() + */ + Buffer( + const vector& properties, + cl_mem_flags flags, + size_type size, + void* host_ptr = nullptr, + cl_int* err = nullptr) : Buffer(Context::getDefault(err), properties, flags, size, host_ptr, err) { } +#endif /*! * \brief Construct a Buffer from a host container via iterators. @@ -3959,7 +4197,7 @@ public: IteratorType endIterator, bool readOnly, bool useHostPtr = false, - cl_int* err = NULL) + cl_int* err = nullptr) { typedef typename std::iterator_traits::value_type DataType; cl_int error; @@ -3980,20 +4218,20 @@ public: Context context = Context::getDefault(err); if( useHostPtr ) { - object_ = ::clCreateBuffer(context(), flags, size, static_cast(&*startIterator), &error); + object_ = ::clCreateBuffer(context(), flags, size, const_cast(&*startIterator), &error); } else { object_ = ::clCreateBuffer(context(), flags, size, 0, &error); } detail::errHandler(error, __CREATE_BUFFER_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } if( !useHostPtr ) { error = cl::copy(startIterator, endIterator, *this); detail::errHandler(error, __CREATE_BUFFER_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -4006,7 +4244,7 @@ public: */ template< typename IteratorType > Buffer(const Context &context, IteratorType startIterator, IteratorType endIterator, - bool readOnly, bool useHostPtr = false, cl_int* err = NULL); + bool readOnly, bool useHostPtr = false, cl_int* err = nullptr); /*! * \brief Construct a Buffer from a host container via iterators using a specified queue. @@ -4014,9 +4252,9 @@ public: */ template< typename IteratorType > Buffer(const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator, - bool readOnly, bool useHostPtr = false, cl_int* err = NULL); + bool readOnly, bool useHostPtr = false, cl_int* err = nullptr); - //! \brief Default constructor - initializes to NULL. + //! \brief Default constructor - initializes to nullptr. Buffer() : Memory() { } /*! \brief Constructor from cl_mem - takes ownership. @@ -4039,33 +4277,6 @@ public: return *this; } - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Buffer(const Buffer& buf) : Memory(buf) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Buffer& operator = (const Buffer &buf) - { - Memory::operator=(buf); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Buffer(Buffer&& buf) CL_HPP_NOEXCEPT_ : Memory(std::move(buf)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Buffer& operator = (Buffer &&buf) - { - Memory::operator=(std::move(buf)); - return *this; - } #if CL_HPP_TARGET_OPENCL_VERSION >= 110 /*! \brief Creates a new buffer object from this. 
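// Usage sketch (illustrative, not part of the patch): the two constructors above
// wrap clCreateBufferWithProperties() (OpenCL 3.0). The properties vector
// (cl_mem_properties values in the upstream header) may be empty; a non-empty
// list must be terminated with 0. `ctx` is a placeholder context.
//
//   std::vector<cl_mem_properties> props;   // no optional properties requested
//   cl_int err = CL_SUCCESS;
//   cl::Buffer buf(ctx, props, CL_MEM_READ_WRITE, 1024 /*bytes*/, nullptr, &err);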
@@ -4076,7 +4287,7 @@ public: cl_mem_flags flags, cl_buffer_create_type buffer_create_type, const void * buffer_create_info, - cl_int * err = NULL) + cl_int * err = nullptr) { Buffer result; cl_int error; @@ -4088,7 +4299,7 @@ public: &error); detail::errHandler(error, __CREATE_SUBBUFFER_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } @@ -4120,7 +4331,7 @@ public: const Context& context, cl_mem_flags flags, ID3D10Buffer* bufobj, - cl_int * err = NULL) : pfn_clCreateFromD3D10BufferKHR(nullptr) + cl_int * err = nullptr) : pfn_clCreateFromD3D10BufferKHR(nullptr) { typedef CL_API_ENTRY cl_mem (CL_API_CALL *PFN_clCreateFromD3D10BufferKHR)( cl_context context, cl_mem_flags flags, ID3D10Buffer* buffer, @@ -4128,14 +4339,15 @@ public: PFN_clCreateFromD3D10BufferKHR pfn_clCreateFromD3D10BufferKHR; #if CL_HPP_TARGET_OPENCL_VERSION >= 120 vector props = context.getInfo(); - cl_platform platform = -1; + cl_platform platform = nullptr; for( int i = 0; i < props.size(); ++i ) { if( props[i] == CL_CONTEXT_PLATFORM ) { platform = props[i+1]; } } CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCreateFromD3D10BufferKHR); -#elif CL_HPP_TARGET_OPENCL_VERSION >= 110 +#endif +#if CL_HPP_MINIMUM_OPENCL_VERSION < 120 CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateFromD3D10BufferKHR); #endif @@ -4146,13 +4358,14 @@ public: bufobj, &error); + // TODO: This should really have a D3D10 rerror code! detail::errHandler(error, __CREATE_GL_BUFFER_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } - //! \brief Default constructor - initializes to NULL. + //! \brief Default constructor - initializes to nullptr. BufferD3D10() : Buffer() { } /*! \brief Constructor from cl_mem - takes ownership. @@ -4174,35 +4387,6 @@ public: Buffer::operator=(rhs); return *this; } - - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - BufferD3D10(const BufferD3D10& buf) : - Buffer(buf) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - BufferD3D10& operator = (const BufferD3D10 &buf) - { - Buffer::operator=(buf); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - BufferD3D10(BufferD3D10&& buf) CL_HPP_NOEXCEPT_ : Buffer(std::move(buf)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - BufferD3D10& operator = (BufferD3D10 &&buf) - { - Buffer::operator=(std::move(buf)); - return *this; - } }; #endif @@ -4226,7 +4410,7 @@ public: const Context& context, cl_mem_flags flags, cl_GLuint bufobj, - cl_int * err = NULL) + cl_int * err = nullptr) { cl_int error; object_ = ::clCreateFromGLBuffer( @@ -4236,12 +4420,12 @@ public: &error); detail::errHandler(error, __CREATE_GL_BUFFER_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } - //! \brief Default constructor - initializes to NULL. + //! \brief Default constructor - initializes to nullptr. BufferGL() : Buffer() { } /*! \brief Constructor from cl_mem - takes ownership. @@ -4264,33 +4448,6 @@ public: return *this; } - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - BufferGL(const BufferGL& buf) : Buffer(buf) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - BufferGL& operator = (const BufferGL &buf) - { - Buffer::operator=(buf); - return *this; - } - - /*! 
\brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - BufferGL(BufferGL&& buf) CL_HPP_NOEXCEPT_ : Buffer(std::move(buf)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - BufferGL& operator = (BufferGL &&buf) - { - Buffer::operator=(std::move(buf)); - return *this; - } //! \brief Wrapper for clGetGLObjectInfo(). cl_int getObjectInfo( @@ -4323,7 +4480,7 @@ public: const Context& context, cl_mem_flags flags, cl_GLuint bufobj, - cl_int * err = NULL) + cl_int * err = nullptr) { cl_int error; object_ = ::clCreateFromGLRenderbuffer( @@ -4333,12 +4490,12 @@ public: &error); detail::errHandler(error, __CREATE_GL_RENDER_BUFFER_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } - //! \brief Default constructor - initializes to NULL. + //! \brief Default constructor - initializes to nullptr. BufferRenderGL() : Buffer() { } /*! \brief Constructor from cl_mem - takes ownership. @@ -4361,33 +4518,6 @@ public: return *this; } - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - BufferRenderGL(const BufferRenderGL& buf) : Buffer(buf) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - BufferRenderGL& operator = (const BufferRenderGL &buf) - { - Buffer::operator=(buf); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - BufferRenderGL(BufferRenderGL&& buf) CL_HPP_NOEXCEPT_ : Buffer(std::move(buf)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - BufferRenderGL& operator = (BufferRenderGL &&buf) - { - Buffer::operator=(std::move(buf)); - return *this; - } //! \brief Wrapper for clGetGLObjectInfo(). cl_int getObjectInfo( @@ -4409,7 +4539,7 @@ public: class Image : public Memory { protected: - //! \brief Default constructor - initializes to NULL. + //! \brief Default constructor - initializes to nullptr. Image() : Memory() { } /*! \brief Constructor from cl_mem - takes ownership. @@ -4432,34 +4562,6 @@ protected: return *this; } - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image(const Image& img) : Memory(img) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image& operator = (const Image &img) - { - Memory::operator=(img); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Image(Image&& img) CL_HPP_NOEXCEPT_ : Memory(std::move(img)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Image& operator = (Image &&img) - { - Memory::operator=(std::move(img)); - return *this; - } - public: //! \brief Wrapper for clGetImageInfo(). @@ -4474,12 +4576,12 @@ public: //! \brief Wrapper for clGetImageInfo() that returns by value. 
template typename detail::param_traits::param_type - getImageInfo(cl_int* err = NULL) const + getImageInfo(cl_int* err = nullptr) const { typename detail::param_traits< detail::cl_image_info, name>::param_type param; cl_int result = getImageInfo(name, ¶m); - if (err != NULL) { + if (err != nullptr) { *err = result; } return param; @@ -4505,12 +4607,12 @@ public: cl_mem_flags flags, ImageFormat format, size_type width, - void* host_ptr = NULL, - cl_int* err = NULL) + void* host_ptr = nullptr, + cl_int* err = nullptr) { cl_int error; - cl_image_desc desc = {0}; + cl_image_desc desc = {}; desc.image_type = CL_MEM_OBJECT_IMAGE1D; desc.image_width = width; @@ -4523,12 +4625,12 @@ public: &error); detail::errHandler(error, __CREATE_IMAGE_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } - //! \brief Default constructor - initializes to NULL. + //! \brief Default constructor - initializes to nullptr. Image1D() { } /*! \brief Constructor from cl_mem - takes ownership. @@ -4551,33 +4653,6 @@ public: return *this; } - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image1D(const Image1D& img) : Image(img) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image1D& operator = (const Image1D &img) - { - Image::operator=(img); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Image1D(Image1D&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Image1D& operator = (Image1D &&img) - { - Image::operator=(std::move(img)); - return *this; - } }; @@ -4593,11 +4668,11 @@ public: ImageFormat format, size_type width, const Buffer &buffer, - cl_int* err = NULL) + cl_int* err = nullptr) { cl_int error; - cl_image_desc desc = {0}; + cl_image_desc desc = {}; desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; desc.image_width = width; desc.buffer = buffer(); @@ -4607,11 +4682,11 @@ public: flags, &format, &desc, - NULL, + nullptr, &error); detail::errHandler(error, __CREATE_IMAGE_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -4634,33 +4709,7 @@ public: return *this; } - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image1DBuffer(const Image1DBuffer& img) : Image(img) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image1DBuffer& operator = (const Image1DBuffer &img) - { - Image::operator=(img); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Image1DBuffer(Image1DBuffer&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {} - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. 
- */ - Image1DBuffer& operator = (Image1DBuffer &&img) - { - Image::operator=(std::move(img)); - return *this; - } }; @@ -4677,12 +4726,12 @@ public: size_type arraySize, size_type width, size_type rowPitch, - void* host_ptr = NULL, - cl_int* err = NULL) + void* host_ptr = nullptr, + cl_int* err = nullptr) { cl_int error; - cl_image_desc desc = {0}; + cl_image_desc desc = {}; desc.image_type = CL_MEM_OBJECT_IMAGE1D_ARRAY; desc.image_width = width; desc.image_array_size = arraySize; @@ -4697,7 +4746,7 @@ public: &error); detail::errHandler(error, __CREATE_IMAGE_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -4721,33 +4770,6 @@ public: return *this; } - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image1DArray(const Image1DArray& img) : Image(img) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image1DArray& operator = (const Image1DArray &img) - { - Image::operator=(img); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Image1DArray(Image1DArray&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Image1DArray& operator = (Image1DArray &&img) - { - Image::operator=(std::move(img)); - return *this; - } }; #endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 120 @@ -4773,8 +4795,8 @@ public: size_type width, size_type height, size_type row_pitch = 0, - void* host_ptr = NULL, - cl_int* err = NULL) + void* host_ptr = nullptr, + cl_int* err = nullptr) { cl_int error; bool useCreateImage; @@ -4794,7 +4816,7 @@ public: #if CL_HPP_TARGET_OPENCL_VERSION >= 120 if (useCreateImage) { - cl_image_desc desc = {0}; + cl_image_desc desc = {}; desc.image_type = CL_MEM_OBJECT_IMAGE2D; desc.image_width = width; desc.image_height = height; @@ -4809,7 +4831,7 @@ public: &error); detail::errHandler(error, __CREATE_IMAGE_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -4821,17 +4843,20 @@ public: context(), flags,&format, width, height, row_pitch, host_ptr, &error); detail::errHandler(error, __CREATE_IMAGE2D_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } #endif // CL_HPP_MINIMUM_OPENCL_VERSION < 120 } -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 || defined(CL_HPP_USE_CL_IMAGE2D_FROM_BUFFER_KHR) +#if CL_HPP_TARGET_OPENCL_VERSION >= 120 /*! \brief Constructs a 2D Image from a buffer. * \note This will share storage with the underlying buffer. * + * Requires OpenCL 2.0 or newer or OpenCL 1.2 and the + * cl_khr_image2d_from_buffer extension. + * * Wraps clCreateImage(). */ Image2D( @@ -4845,7 +4870,7 @@ public: { cl_int error; - cl_image_desc desc = {0}; + cl_image_desc desc = {}; desc.image_type = CL_MEM_OBJECT_IMAGE2D; desc.image_width = width; desc.image_height = height; @@ -4865,7 +4890,7 @@ public: *err = error; } } -#endif //#if CL_HPP_TARGET_OPENCL_VERSION >= 200 || defined(CL_HPP_USE_CL_IMAGE2D_FROM_BUFFER_KHR) +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 #if CL_HPP_TARGET_OPENCL_VERSION >= 200 /*! \brief Constructs a 2D Image from an image. @@ -4906,7 +4931,7 @@ public: // Channel format inherited from source. 
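// Usage sketch (illustrative, not part of the patch): the Image2D constructor
// above now zero-initializes cl_image_desc with {} and picks clCreateImage() or
// the deprecated clCreateImage2D() based on the device's OpenCL version.
// `width`, `height` and `hostPixels` are placeholders.
//
//   cl::ImageFormat fmt(CL_RGBA, CL_UNORM_INT8);
//   cl::Image2D img(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
//                   fmt, width, height, /*row_pitch=*/0, hostPixels);
//
// The image-from-buffer overload shown above is now available from OpenCL 1.2
// onward when the cl_khr_image2d_from_buffer extension is supported.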
sourceFormat.image_channel_order = order; - cl_image_desc desc = {0}; + cl_image_desc desc = {}; desc.image_type = CL_MEM_OBJECT_IMAGE2D; desc.image_width = sourceWidth; desc.image_height = sourceHeight; @@ -4930,7 +4955,7 @@ public: } #endif //#if CL_HPP_TARGET_OPENCL_VERSION >= 200 - //! \brief Default constructor - initializes to NULL. + //! \brief Default constructor - initializes to nullptr. Image2D() { } /*! \brief Constructor from cl_mem - takes ownership. @@ -4953,33 +4978,8 @@ public: return *this; } - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image2D(const Image2D& img) : Image(img) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image2D& operator = (const Image2D &img) - { - Image::operator=(img); - return *this; - } - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Image2D(Image2D&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {} - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Image2D& operator = (Image2D &&img) - { - Image::operator=(std::move(img)); - return *this; - } }; @@ -5008,7 +5008,7 @@ public: cl_GLenum target, cl_GLint miplevel, cl_GLuint texobj, - cl_int * err = NULL) + cl_int * err = nullptr) { cl_int error; object_ = ::clCreateFromGLTexture2D( @@ -5020,13 +5020,13 @@ public: &error); detail::errHandler(error, __CREATE_GL_TEXTURE_2D_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } - //! \brief Default constructor - initializes to NULL. + //! \brief Default constructor - initializes to nullptr. Image2DGL() : Image2D() { } /*! \brief Constructor from cl_mem - takes ownership. @@ -5049,33 +5049,7 @@ public: return *this; } - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image2DGL(const Image2DGL& img) : Image2D(img) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image2DGL& operator = (const Image2DGL &img) - { - Image2D::operator=(img); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Image2DGL(Image2DGL&& img) CL_HPP_NOEXCEPT_ : Image2D(std::move(img)) {} - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Image2DGL& operator = (Image2DGL &&img) - { - Image2D::operator=(std::move(img)); - return *this; - } } CL_API_SUFFIX__VERSION_1_1_DEPRECATED; #endif // CL_USE_DEPRECATED_OPENCL_1_1_APIS @@ -5096,12 +5070,12 @@ public: size_type height, size_type rowPitch, size_type slicePitch, - void* host_ptr = NULL, - cl_int* err = NULL) + void* host_ptr = nullptr, + cl_int* err = nullptr) { cl_int error; - cl_image_desc desc = {0}; + cl_image_desc desc = {}; desc.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY; desc.image_width = width; desc.image_height = height; @@ -5118,7 +5092,7 @@ public: &error); detail::errHandler(error, __CREATE_IMAGE_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -5140,33 +5114,6 @@ public: return *this; } - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image2DArray(const Image2DArray& img) : Image(img) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. 
- */ - Image2DArray& operator = (const Image2DArray &img) - { - Image::operator=(img); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Image2DArray(Image2DArray&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Image2DArray& operator = (Image2DArray &&img) - { - Image::operator=(std::move(img)); - return *this; - } }; #endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 120 @@ -5192,8 +5139,8 @@ public: size_type depth, size_type row_pitch = 0, size_type slice_pitch = 0, - void* host_ptr = NULL, - cl_int* err = NULL) + void* host_ptr = nullptr, + cl_int* err = nullptr) { cl_int error; bool useCreateImage; @@ -5213,7 +5160,7 @@ public: #if CL_HPP_TARGET_OPENCL_VERSION >= 120 if (useCreateImage) { - cl_image_desc desc = {0}; + cl_image_desc desc = {}; desc.image_type = CL_MEM_OBJECT_IMAGE3D; desc.image_width = width; desc.image_height = height; @@ -5230,7 +5177,7 @@ public: &error); detail::errHandler(error, __CREATE_IMAGE_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -5243,14 +5190,14 @@ public: slice_pitch, host_ptr, &error); detail::errHandler(error, __CREATE_IMAGE3D_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } #endif // CL_HPP_MINIMUM_OPENCL_VERSION < 120 } - //! \brief Default constructor - initializes to NULL. + //! \brief Default constructor - initializes to nullptr. Image3D() : Image() { } /*! \brief Constructor from cl_mem - takes ownership. @@ -5273,33 +5220,6 @@ public: return *this; } - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image3D(const Image3D& img) : Image(img) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image3D& operator = (const Image3D &img) - { - Image::operator=(img); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Image3D(Image3D&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Image3D& operator = (Image3D &&img) - { - Image::operator=(std::move(img)); - return *this; - } }; #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) @@ -5325,7 +5245,7 @@ public: cl_GLenum target, cl_GLint miplevel, cl_GLuint texobj, - cl_int * err = NULL) + cl_int * err = nullptr) { cl_int error; object_ = ::clCreateFromGLTexture3D( @@ -5337,12 +5257,12 @@ public: &error); detail::errHandler(error, __CREATE_GL_TEXTURE_3D_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } - //! \brief Default constructor - initializes to NULL. + //! \brief Default constructor - initializes to nullptr. Image3DGL() : Image3D() { } /*! \brief Constructor from cl_mem - takes ownership. @@ -5365,33 +5285,6 @@ public: return *this; } - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image3DGL(const Image3DGL& img) : Image3D(img) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Image3DGL& operator = (const Image3DGL &img) - { - Image3D::operator=(img); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Image3DGL(Image3DGL&& img) CL_HPP_NOEXCEPT_ : Image3D(std::move(img)) {} - - /*! 
\brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Image3DGL& operator = (Image3DGL &&img) - { - Image3D::operator=(std::move(img)); - return *this; - } }; #endif // CL_USE_DEPRECATED_OPENCL_1_1_APIS @@ -5411,7 +5304,7 @@ public: cl_GLenum target, cl_GLint miplevel, cl_GLuint texobj, - cl_int * err = NULL) + cl_int * err = nullptr) { cl_int error; object_ = ::clCreateFromGLTexture( @@ -5423,7 +5316,7 @@ public: &error); detail::errHandler(error, __CREATE_GL_TEXTURE_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -5446,35 +5339,8 @@ public: return *this; } - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - ImageGL(const ImageGL& img) : Image(img) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - ImageGL& operator = (const ImageGL &img) - { - Image::operator=(img); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - ImageGL(ImageGL&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - ImageGL& operator = (ImageGL &&img) - { - Image::operator=(std::move(img)); - return *this; - } -}; -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 +}; +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 @@ -5502,7 +5368,7 @@ public: const Context& context, cl_uint packet_size, cl_uint max_packets, - cl_int* err = NULL) + cl_int* err = nullptr) { cl_int error; @@ -5510,7 +5376,7 @@ public: object_ = ::clCreatePipe(context(), flags, packet_size, max_packets, nullptr, &error); detail::errHandler(error, __CREATE_PIPE_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -5526,7 +5392,7 @@ public: Pipe( cl_uint packet_size, cl_uint max_packets, - cl_int* err = NULL) + cl_int* err = nullptr) { cl_int error; @@ -5536,12 +5402,12 @@ public: object_ = ::clCreatePipe(context(), flags, packet_size, max_packets, nullptr, &error); detail::errHandler(error, __CREATE_PIPE_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } - //! \brief Default constructor - initializes to NULL. + //! \brief Default constructor - initializes to nullptr. Pipe() : Memory() { } /*! \brief Constructor from cl_mem - takes ownership. @@ -5564,33 +5430,7 @@ public: return *this; } - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Pipe(const Pipe& pipe) : Memory(pipe) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Pipe& operator = (const Pipe &pipe) - { - Memory::operator=(pipe); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Pipe(Pipe&& pipe) CL_HPP_NOEXCEPT_ : Memory(std::move(pipe)) {} - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Pipe& operator = (Pipe &&pipe) - { - Memory::operator=(std::move(pipe)); - return *this; - } //! \brief Wrapper for clGetMemObjectInfo(). template @@ -5604,12 +5444,12 @@ public: //! \brief Wrapper for clGetMemObjectInfo() that returns by value. 
template typename detail::param_traits::param_type - getInfo(cl_int* err = NULL) const + getInfo(cl_int* err = nullptr) const { typename detail::param_traits< detail::cl_pipe_info, name>::param_type param; cl_int result = getInfo(name, ¶m); - if (err != NULL) { + if (err != nullptr) { *err = result; } return param; @@ -5629,7 +5469,7 @@ public: class Sampler : public detail::Wrapper { public: - //! \brief Default constructor - initializes to NULL. + //! \brief Default constructor - initializes to nullptr. Sampler() { } /*! \brief Constructs a Sampler in a specified context. @@ -5641,7 +5481,7 @@ public: cl_bool normalized_coords, cl_addressing_mode addressing_mode, cl_filter_mode filter_mode, - cl_int* err = NULL) + cl_int* err = nullptr) { cl_int error; @@ -5657,7 +5497,7 @@ public: &error); detail::errHandler(error, __CREATE_SAMPLER_WITH_PROPERTIES_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } #else @@ -5669,7 +5509,7 @@ public: &error); detail::errHandler(error, __CREATE_SAMPLER_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } #endif @@ -5697,33 +5537,7 @@ public: return *this; } - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Sampler(const Sampler& sam) : detail::Wrapper(sam) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Sampler& operator = (const Sampler &sam) - { - detail::Wrapper::operator=(sam); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Sampler(Sampler&& sam) CL_HPP_NOEXCEPT_ : detail::Wrapper(std::move(sam)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Sampler& operator = (Sampler &&sam) - { - detail::Wrapper::operator=(std::move(sam)); - return *this; - } + //! \brief Wrapper for clGetSamplerInfo(). template @@ -5737,12 +5551,12 @@ public: //! \brief Wrapper for clGetSamplerInfo() that returns by value. template typename detail::param_traits::param_type - getInfo(cl_int* err = NULL) const + getInfo(cl_int* err = nullptr) const { typename detail::param_traits< detail::cl_sampler_info, name>::param_type param; cl_int result = getInfo(name, ¶m); - if (err != NULL) { + if (err != nullptr) { *err = result; } return param; @@ -5798,6 +5612,15 @@ public: sizes_[2] = size2; } + //! \brief Constructs one-dimensional range. + NDRange(array a) : NDRange(a[0]){} + + //! \brief Constructs two-dimensional range. + NDRange(array a) : NDRange(a[0], a[1]){} + + //! \brief Constructs three-dimensional range. + NDRange(array a) : NDRange(a[0], a[1], a[2]){} + /*! \brief Conversion operator to const size_type *. * * \returns a pointer to the size of the first dimension. @@ -5868,7 +5691,7 @@ template <> struct KernelArgumentHandler { static size_type size(const LocalSpaceArg& value) { return value.size_; } - static const void* ptr(const LocalSpaceArg&) { return NULL; } + static const void* ptr(const LocalSpaceArg&) { return nullptr; } }; } @@ -5895,9 +5718,9 @@ Local(size_type size) class Kernel : public detail::Wrapper { public: - inline Kernel(const Program& program, const char* name, cl_int* err = NULL); + inline Kernel(const Program& program, const char* name, cl_int* err = nullptr); - //! \brief Default constructor - initializes to NULL. + //! \brief Default constructor - initializes to nullptr. Kernel() { } /*! \brief Constructor from cl_kernel - takes ownership. 
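// Usage sketch (illustrative, not part of the patch): the three new NDRange
// constructors above accept cl::array (std::array by default), so a dimension
// tuple computed at runtime can be passed directly. `kernel` and `queue` are
// placeholder names.
//
//   std::array<size_t, 2> globalSize = {1024, 768};
//   cl::NDRange global(globalSize);              // same as cl::NDRange(1024, 768)
//   queue.enqueueNDRangeKernel(kernel, cl::NullRange, global, cl::NullRange);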
@@ -5922,33 +5745,8 @@ public: return *this; } - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Kernel(const Kernel& kernel) : detail::Wrapper(kernel) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Kernel& operator = (const Kernel &kernel) - { - detail::Wrapper::operator=(kernel); - return *this; - } - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Kernel(Kernel&& kernel) CL_HPP_NOEXCEPT_ : detail::Wrapper(std::move(kernel)) {} - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Kernel& operator = (Kernel &&kernel) - { - detail::Wrapper::operator=(std::move(kernel)); - return *this; - } template cl_int getInfo(cl_kernel_info name, T* param) const @@ -5960,12 +5758,12 @@ public: template typename detail::param_traits::param_type - getInfo(cl_int* err = NULL) const + getInfo(cl_int* err = nullptr) const { typename detail::param_traits< detail::cl_kernel_info, name>::param_type param; cl_int result = getInfo(name, ¶m); - if (err != NULL) { + if (err != nullptr) { *err = result; } return param; @@ -5982,12 +5780,12 @@ public: template typename detail::param_traits::param_type - getArgInfo(cl_uint argIndex, cl_int* err = NULL) const + getArgInfo(cl_uint argIndex, cl_int* err = nullptr) const { typename detail::param_traits< detail::cl_kernel_arg_info, name>::param_type param; cl_int result = getArgInfo(argIndex, name, ¶m); - if (err != NULL) { + if (err != nullptr) { *err = result; } return param; @@ -6006,18 +5804,18 @@ public: template typename detail::param_traits::param_type - getWorkGroupInfo(const Device& device, cl_int* err = NULL) const + getWorkGroupInfo(const Device& device, cl_int* err = nullptr) const { typename detail::param_traits< detail::cl_kernel_work_group_info, name>::param_type param; cl_int result = getWorkGroupInfo(device, name, ¶m); - if (err != NULL) { + if (err != nullptr) { *err = result; } return param; } -#if (CL_HPP_TARGET_OPENCL_VERSION >= 200 && defined(CL_HPP_USE_CL_SUB_GROUPS_KHR)) || CL_HPP_TARGET_OPENCL_VERSION >= 210 +#if defined(CL_HPP_USE_CL_SUB_GROUPS_KHR) || CL_HPP_TARGET_OPENCL_VERSION >= 210 cl_int getSubGroupInfo(const cl::Device &dev, cl_kernel_sub_group_info name, const cl::NDRange &range, size_type* param) const { #if CL_HPP_TARGET_OPENCL_VERSION >= 210 @@ -6029,7 +5827,7 @@ public: #else // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 typedef clGetKernelSubGroupInfoKHR_fn PFN_clGetKernelSubGroupInfoKHR; - static PFN_clGetKernelSubGroupInfoKHR pfn_clGetKernelSubGroupInfoKHR = NULL; + static PFN_clGetKernelSubGroupInfoKHR pfn_clGetKernelSubGroupInfoKHR = nullptr; CL_HPP_INIT_CL_EXT_FCN_PTR_(clGetKernelSubGroupInfoKHR); return detail::errHandler( @@ -6040,16 +5838,16 @@ public: } template - size_type getSubGroupInfo(const cl::Device &dev, const cl::NDRange &range, cl_int* err = NULL) const + size_type getSubGroupInfo(const cl::Device &dev, const cl::NDRange &range, cl_int* err = nullptr) const { size_type param; cl_int result = getSubGroupInfo(dev, name, range, ¶m); - if (err != NULL) { + if (err != nullptr) { *err = result; } return param; } -#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 +#endif // defined(CL_HPP_USE_CL_SUB_GROUPS_KHR) || CL_HPP_TARGET_OPENCL_VERSION >= 210 #if CL_HPP_TARGET_OPENCL_VERSION >= 200 /*! 
\brief setArg overload taking a shared_ptr type @@ -6255,7 +6053,7 @@ public: Program( const string& source, bool build = false, - cl_int* err = NULL) + cl_int* err = nullptr) { cl_int error; @@ -6274,19 +6072,19 @@ public: error = ::clBuildProgram( object_, 0, - NULL, + nullptr, #if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD) "-cl-std=CL2.0", #else "", #endif // #if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD) - NULL, - NULL); + nullptr, + nullptr); detail::buildErrHandler(error, __BUILD_PROGRAM_ERR, getBuildInfo()); } - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -6295,7 +6093,7 @@ public: const Context& context, const string& source, bool build = false, - cl_int* err = NULL) + cl_int* err = nullptr) { cl_int error; @@ -6311,19 +6109,19 @@ public: error = ::clBuildProgram( object_, 0, - NULL, + nullptr, #if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD) "-cl-std=CL2.0", #else "", #endif // #if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD) - NULL, - NULL); + nullptr, + nullptr); detail::buildErrHandler(error, __BUILD_PROGRAM_ERR, getBuildInfo()); } - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -6334,7 +6132,7 @@ public: */ Program( const Sources& sources, - cl_int* err = NULL) + cl_int* err = nullptr) { cl_int error; Context context = Context::getDefault(err); @@ -6358,7 +6156,7 @@ public: context(), (cl_uint)n, strings.data(), lengths.data(), &error); detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -6370,7 +6168,7 @@ public: Program( const Context& context, const Sources& sources, - cl_int* err = NULL) + cl_int* err = nullptr) { cl_int error; @@ -6393,21 +6191,22 @@ public: context(), (cl_uint)n, strings.data(), lengths.data(), &error); detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } -#if CL_HPP_TARGET_OPENCL_VERSION >= 210 || (CL_HPP_TARGET_OPENCL_VERSION==200 && defined(CL_HPP_USE_IL_KHR)) +#if defined(CL_HPP_USE_IL_KHR) || CL_HPP_TARGET_OPENCL_VERSION >= 210 /** * Program constructor to allow construction of program from SPIR-V or another IL. - * Valid for either OpenCL >= 2.1 or when CL_HPP_USE_IL_KHR is defined. + * + * Requires OpenCL 2.1 or newer or the cl_khr_il_program extension. */ Program( const vector& IL, bool build = false, - cl_int* err = NULL) + cl_int* err = nullptr) { cl_int error; @@ -6421,11 +6220,10 @@ public: #else // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 typedef clCreateProgramWithILKHR_fn PFN_clCreateProgramWithILKHR; - static PFN_clCreateProgramWithILKHR pfn_clCreateProgramWithILKHR = NULL; + static PFN_clCreateProgramWithILKHR pfn_clCreateProgramWithILKHR = nullptr; CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateProgramWithILKHR); - return detail::errHandler( - pfn_clCreateProgramWithILKHR( + object_ = pfn_clCreateProgramWithILKHR( context(), static_cast(IL.data()), IL.size(), &error); #endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 @@ -6437,19 +6235,19 @@ public: error = ::clBuildProgram( object_, 0, - NULL, + nullptr, #if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD) "-cl-std=CL2.0", #else "", #endif // #if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD) - NULL, - NULL); + nullptr, + nullptr); detail::buildErrHandler(error, __BUILD_PROGRAM_ERR, getBuildInfo()); } - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -6457,13 +6255,14 @@ public: /** * Program constructor to allow construction of program from SPIR-V or another IL * for a specific context. 
- * Valid for either OpenCL >= 2.1 or when CL_HPP_USE_IL_KHR is defined. + * + * Requires OpenCL 2.1 or newer or the cl_khr_il_program extension. */ Program( const Context& context, const vector& IL, bool build = false, - cl_int* err = NULL) + cl_int* err = nullptr) { cl_int error; @@ -6475,11 +6274,10 @@ public: #else // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 typedef clCreateProgramWithILKHR_fn PFN_clCreateProgramWithILKHR; - static PFN_clCreateProgramWithILKHR pfn_clCreateProgramWithILKHR = NULL; + static PFN_clCreateProgramWithILKHR pfn_clCreateProgramWithILKHR = nullptr; CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateProgramWithILKHR); - return detail::errHandler( - pfn_clCreateProgramWithILKHR( + object_ = pfn_clCreateProgramWithILKHR( context(), static_cast(IL.data()), IL.size(), &error); #endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 @@ -6490,23 +6288,23 @@ public: error = ::clBuildProgram( object_, 0, - NULL, + nullptr, #if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD) "-cl-std=CL2.0", #else "", #endif // #if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD) - NULL, - NULL); + nullptr, + nullptr); detail::buildErrHandler(error, __BUILD_PROGRAM_ERR, getBuildInfo()); } - if (err != NULL) { + if (err != nullptr) { *err = error; } } -#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 +#endif // defined(CL_HPP_USE_IL_KHR) || CL_HPP_TARGET_OPENCL_VERSION >= 210 /** * Construct a program object from a list of devices and a per-device list of binaries. @@ -6517,12 +6315,12 @@ public: * match the size of binaries and filled with values to specify if each binary * was successfully loaded. * Set to CL_SUCCESS if the binary was successfully loaded. - * Set to CL_INVALID_VALUE if the length is 0 or the binary pointer is NULL. + * Set to CL_INVALID_VALUE if the length is 0 or the binary pointer is nullptr. * Set to CL_INVALID_BINARY if the binary provided is not valid for the matching device. - * \param err if non-NULL will be set to CL_SUCCESS on successful operation or one of the following errors: + * \param err if non-nullptr will be set to CL_SUCCESS on successful operation or one of the following errors: * CL_INVALID_CONTEXT if context is not a valid context. * CL_INVALID_VALUE if the length of devices is zero; or if the length of binaries does not match the length of devices; - * or if any entry in binaries is NULL or has length 0. + * or if any entry in binaries is nullptr or has length 0. * CL_INVALID_DEVICE if OpenCL devices listed in devices are not in the list of devices associated with context. * CL_INVALID_BINARY if an invalid program binary was encountered for any device. binaryStatus will return specific status for each device. * CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the OpenCL implementation on the host. @@ -6531,8 +6329,8 @@ public: const Context& context, const vector& devices, const Binaries& binaries, - vector* binaryStatus = NULL, - cl_int* err = NULL) + vector* binaryStatus = nullptr, + cl_int* err = nullptr) { cl_int error; @@ -6542,7 +6340,7 @@ public: if(binaries.size() != numDevices) { error = CL_INVALID_VALUE; detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } return; @@ -6575,12 +6373,12 @@ public: object_ = ::clCreateProgramWithBinary( context(), (cl_uint) devices.size(), deviceIDs.data(), - lengths.data(), images.data(), (binaryStatus != NULL && numDevices > 0) + lengths.data(), images.data(), (binaryStatus != nullptr && numDevices > 0) ? 
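// Usage sketch (illustrative, not part of the patch): the IL constructors above
// wrap clCreateProgramWithIL() (OpenCL 2.1+) or the cl_khr_il_program entry
// point and can optionally build the program at construction. `loadSpirv` is a
// placeholder for reading a SPIR-V module into a byte vector.
//
//   std::vector<char> il = loadSpirv("kernels.spv");
//   cl_int err = CL_SUCCESS;
//   cl::Program prog(ctx, il, /*build=*/true, &err);
//   cl::Kernel k(prog, "my_kernel", &err);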
&binaryStatus->front() - : NULL, &error); + : nullptr, &error); detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -6595,7 +6393,7 @@ public: const Context& context, const vector& devices, const string& kernelNames, - cl_int* err = NULL) + cl_int* err = nullptr) { cl_int error; @@ -6614,7 +6412,7 @@ public: &error); detail::errHandler(error, __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -6638,39 +6436,12 @@ public: return *this; } - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - Program(const Program& program) : detail::Wrapper(program) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - Program& operator = (const Program &program) - { - detail::Wrapper::operator=(program); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - Program(Program&& program) CL_HPP_NOEXCEPT_ : detail::Wrapper(std::move(program)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - Program& operator = (Program &&program) - { - detail::Wrapper::operator=(std::move(program)); - return *this; - } cl_int build( const vector& devices, - const char* options = NULL, - void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, - void* data = NULL) const + const char* options = nullptr, + void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr, + void* data = nullptr) const { size_type numDevices = devices.size(); vector deviceIDs(numDevices); @@ -6693,9 +6464,9 @@ public: cl_int build( const Device& device, - const char* options = NULL, - void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, - void* data = NULL) const + const char* options = nullptr, + void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr, + void* data = nullptr) const { cl_device_id deviceID = device(); @@ -6707,20 +6478,20 @@ public: notifyFptr, data); - BuildLogType buildLog(1); + BuildLogType buildLog(0); buildLog.push_back(std::make_pair(device, getBuildInfo(device))); return detail::buildErrHandler(buildError, __BUILD_PROGRAM_ERR, buildLog); } cl_int build( - const char* options = NULL, - void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, - void* data = NULL) const + const char* options = nullptr, + void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr, + void* data = nullptr) const { cl_int buildError = ::clBuildProgram( object_, 0, - NULL, + nullptr, options, notifyFptr, data); @@ -6730,18 +6501,18 @@ public: #if CL_HPP_TARGET_OPENCL_VERSION >= 120 cl_int compile( - const char* options = NULL, - void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, - void* data = NULL) const + const char* options = nullptr, + void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr, + void* data = nullptr) const { cl_int error = ::clCompileProgram( object_, 0, - NULL, + nullptr, options, 0, - NULL, - NULL, + nullptr, + nullptr, notifyFptr, data); return detail::buildErrHandler(error, __COMPILE_PROGRAM_ERR, getBuildInfo()); @@ -6758,12 +6529,12 @@ public: template typename detail::param_traits::param_type - getInfo(cl_int* err = NULL) const + getInfo(cl_int* err = nullptr) const { typename detail::param_traits< detail::cl_program_info, name>::param_type param; cl_int result = getInfo(name, ¶m); - if (err != NULL) { + if (err != nullptr) { 
*err = result; } return param; @@ -6781,12 +6552,12 @@ public: template typename detail::param_traits::param_type - getBuildInfo(const Device& device, cl_int* err = NULL) const + getBuildInfo(const Device& device, cl_int* err = nullptr) const { typename detail::param_traits< detail::cl_program_build_info, name>::param_type param; cl_int result = getBuildInfo(device, name, ¶m); - if (err != NULL) { + if (err != nullptr) { *err = result; } return param; @@ -6799,7 +6570,7 @@ public: */ template vector::param_type>> - getBuildInfo(cl_int *err = NULL) const + getBuildInfo(cl_int *err = nullptr) const { cl_int result = CL_SUCCESS; @@ -6809,7 +6580,7 @@ public: // If there was an initial error from getInfo return the error if (result != CL_SUCCESS) { - if (err != NULL) { + if (err != nullptr) { *err = result; } return devInfo; @@ -6827,7 +6598,7 @@ public: break; } } - if (err != NULL) { + if (err != nullptr) { *err = result; } if (result != CL_SUCCESS) { @@ -6839,7 +6610,7 @@ public: cl_int createKernels(vector* kernels) { cl_uint numKernels; - cl_int err = ::clCreateKernelsInProgram(object_, 0, NULL, &numKernels); + cl_int err = ::clCreateKernelsInProgram(object_, 0, nullptr, &numKernels); if (err != CL_SUCCESS) { return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR); } @@ -6847,7 +6618,7 @@ public: vector value(numKernels); err = ::clCreateKernelsInProgram( - object_, numKernels, value.data(), NULL); + object_, numKernels, value.data(), nullptr); if (err != CL_SUCCESS) { return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR); } @@ -6880,7 +6651,7 @@ public: */ CL_API_PREFIX__VERSION_2_2_DEPRECATED cl_int setReleaseCallback( void (CL_CALLBACK * pfn_notify)(cl_program program, void * user_data), - void * user_data = NULL) CL_API_SUFFIX__VERSION_2_2_DEPRECATED + void * user_data = nullptr) CL_API_SUFFIX__VERSION_2_2_DEPRECATED { return detail::errHandler( ::clSetProgramReleaseCallback( @@ -6927,15 +6698,14 @@ public: #if CL_HPP_TARGET_OPENCL_VERSION >= 120 inline Program linkProgram( - Program input1, - Program input2, - const char* options = NULL, - void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, - void* data = NULL, - cl_int* err = NULL) + const Program& input1, + const Program& input2, + const char* options = nullptr, + void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr, + void* data = nullptr, + cl_int* err = nullptr) { cl_int error_local = CL_SUCCESS; - cl_program programs[2] = { input1(), input2() }; Context ctx = input1.getInfo(&error_local); @@ -6946,7 +6716,7 @@ inline Program linkProgram( cl_program prog = ::clLinkProgram( ctx(), 0, - NULL, + nullptr, options, 2, programs, @@ -6955,7 +6725,7 @@ inline Program linkProgram( &error_local); detail::errHandler(error_local,__COMPILE_PROGRAM_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error_local; } @@ -6963,44 +6733,42 @@ inline Program linkProgram( } inline Program linkProgram( - vector inputPrograms, - const char* options = NULL, - void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, - void* data = NULL, - cl_int* err = NULL) + const vector& inputPrograms, + const char* options = nullptr, + void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr, + void* data = nullptr, + cl_int* err = nullptr) { cl_int error_local = CL_SUCCESS; + Context ctx; - vector programs(inputPrograms.size()); + static_assert(sizeof(cl::Program) == sizeof(cl_program), + "Size of cl::Program must be equal to size of cl_program"); - for (unsigned int i = 0; i < inputPrograms.size(); i++) { - 
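// Usage sketch (illustrative, not part of the patch): linkProgram() above wraps
// clLinkProgram() and now takes its inputs by const reference. Programs
// compiled separately with Program::compile() can then be linked into one
// executable program. `srcA` and `srcB` are placeholder source strings.
//
//   cl::Program a(ctx, srcA), b(ctx, srcB);
//   a.compile();                                  // clCompileProgram
//   b.compile();
//   cl_int err = CL_SUCCESS;
//   cl::Program linked = cl::linkProgram(a, b, "", nullptr, nullptr, &err);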
programs[i] = inputPrograms[i](); - } - - Context ctx; if(inputPrograms.size() > 0) { ctx = inputPrograms[0].getInfo(&error_local); if(error_local!=CL_SUCCESS) { detail::errHandler(error_local, __LINK_PROGRAM_ERR); } } + cl_program prog = ::clLinkProgram( ctx(), 0, - NULL, + nullptr, options, - (cl_uint)inputPrograms.size(), - programs.data(), + static_cast(inputPrograms.size()), + reinterpret_cast(inputPrograms.data()), notifyFptr, data, &error_local); detail::errHandler(error_local,__COMPILE_PROGRAM_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error_local; } - return Program(prog, false); + return Program(prog); } #endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 @@ -7038,7 +6806,7 @@ inline vector> cl::Program::getInfo(c vector> binariesVectors; cl_int result = getInfo(CL_PROGRAM_BINARIES, &binariesVectors); - if (err != NULL) { + if (err != nullptr) { *err = result; } return binariesVectors; @@ -7067,12 +6835,31 @@ inline Kernel::Kernel(const Program& program, const char* name, cl_int* err) object_ = ::clCreateKernel(program(), name, &error); detail::errHandler(error, __CREATE_KERNEL_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } +#ifdef cl_khr_external_memory +enum class ExternalMemoryType : cl_external_memory_handle_type_khr +{ + None = 0, + + OpaqueFd = CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_FD_KHR, + OpaqueWin32 = CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KHR, + OpaqueWin32Kmt = CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KMT_KHR, + + D3D11Texture = CL_EXTERNAL_MEMORY_HANDLE_D3D11_TEXTURE_KHR, + D3D11TextureKmt = CL_EXTERNAL_MEMORY_HANDLE_D3D11_TEXTURE_KMT_KHR, + + D3D12Heap = CL_EXTERNAL_MEMORY_HANDLE_D3D12_HEAP_KHR, + D3D12Resource = CL_EXTERNAL_MEMORY_HANDLE_D3D12_RESOURCE_KHR, + + DmaBuf = CL_EXTERNAL_MEMORY_HANDLE_DMA_BUF_KHR, +}; +#endif + enum class QueueProperties : cl_command_queue_properties { None = 0, @@ -7085,6 +6872,11 @@ inline QueueProperties operator|(QueueProperties lhs, QueueProperties rhs) return static_cast(static_cast(lhs) | static_cast(rhs)); } +inline QueueProperties operator&(QueueProperties lhs, QueueProperties rhs) +{ + return static_cast(static_cast(lhs) & static_cast(rhs)); +} + /*! \class CommandQueue * \brief CommandQueue interface for cl_command_queue. */ @@ -7136,6 +6928,24 @@ private: default_ = c; } +#ifdef cl_khr_external_memory + static std::once_flag ext_memory_initialized_; + + static void initMemoryExtension(const cl::Device& device) + { + auto platform = device.getInfo(); + + CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clEnqueueAcquireExternalMemObjectsKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clEnqueueReleaseExternalMemObjectsKHR); + + if ((pfn_clEnqueueAcquireExternalMemObjectsKHR == nullptr) + && (pfn_clEnqueueReleaseExternalMemObjectsKHR == nullptr)) + { + detail::errHandler(CL_INVALID_VALUE, __ENQUEUE_ACQUIRE_EXTERNAL_MEMORY_ERR); + } + } +#endif // cl_khr_external_memory + public: #ifdef CL_HPP_UNIT_TEST_ENABLE /*! \brief Reset the default. 
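// Usage sketch (illustrative, not part of the patch): the new operator&
// complements the existing operator| so QueueProperties flags can be both
// combined and tested without casting back to cl_command_queue_properties.
//
//   cl::QueueProperties qp =
//       cl::QueueProperties::Profiling | cl::QueueProperties::OutOfOrder;
//   bool profiling =
//       (qp & cl::QueueProperties::Profiling) == cl::QueueProperties::Profiling;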
@@ -7156,7 +6966,7 @@ public: */ CommandQueue( cl_command_queue_properties properties, - cl_int* err = NULL) + cl_int* err = nullptr) { cl_int error; @@ -7164,7 +6974,7 @@ public: detail::errHandler(error, __CREATE_CONTEXT_ERR); if (error != CL_SUCCESS) { - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -7197,7 +7007,7 @@ public: } detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -7208,7 +7018,7 @@ public: context(), device(), properties, &error); detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -7222,7 +7032,7 @@ public: */ CommandQueue( QueueProperties properties, - cl_int* err = NULL) + cl_int* err = nullptr) { cl_int error; @@ -7230,7 +7040,7 @@ public: detail::errHandler(error, __CREATE_CONTEXT_ERR); if (error != CL_SUCCESS) { - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -7259,7 +7069,7 @@ public: context(), device(), queue_properties, &error); detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -7270,7 +7080,7 @@ public: context(), device(), static_cast(properties), &error); detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -7286,7 +7096,7 @@ public: explicit CommandQueue( const Context& context, cl_command_queue_properties properties = 0, - cl_int* err = NULL) + cl_int* err = nullptr) { cl_int error; bool useWithProperties; @@ -7297,7 +7107,7 @@ public: if (error != CL_SUCCESS) { - if (err != NULL) { + if (err != nullptr) { *err = error; } return; @@ -7328,7 +7138,7 @@ public: } detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -7339,7 +7149,7 @@ public: context(), devices[0](), properties, &error); detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -7353,7 +7163,7 @@ public: explicit CommandQueue( const Context& context, QueueProperties properties, - cl_int* err = NULL) + cl_int* err = nullptr) { cl_int error; bool useWithProperties; @@ -7364,7 +7174,7 @@ public: if (error != CL_SUCCESS) { - if (err != NULL) { + if (err != nullptr) { *err = error; } return; @@ -7390,7 +7200,7 @@ public: context(), devices[0](), queue_properties, &error); detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -7401,7 +7211,7 @@ public: context(), devices[0](), static_cast(properties), &error); detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -7416,7 +7226,7 @@ public: const Context& context, const Device& device, cl_command_queue_properties properties = 0, - cl_int* err = NULL) + cl_int* err = nullptr) { cl_int error; bool useWithProperties; @@ -7441,7 +7251,7 @@ public: context(), device(), queue_properties, &error); detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -7452,7 +7262,7 @@ public: context(), device(), properties, &error); detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -7467,7 +7277,7 @@ public: const Context& context, const Device& device, QueueProperties properties, - cl_int* 
err = NULL) + cl_int* err = nullptr) { cl_int error; bool useWithProperties; @@ -7492,7 +7302,7 @@ public: context(), device(), queue_properties, &error); detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -7503,14 +7313,14 @@ public: context(), device(), static_cast(properties), &error); detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } #endif // CL_HPP_MINIMUM_OPENCL_VERSION < 200 } - static CommandQueue getDefault(cl_int * err = NULL) + static CommandQueue getDefault(cl_int * err = nullptr) { std::call_once(default_initialized_, makeDefault); #if CL_HPP_TARGET_OPENCL_VERSION >= 200 @@ -7518,7 +7328,7 @@ public: #else // CL_HPP_TARGET_OPENCL_VERSION >= 200 detail::errHandler(default_error_, __CREATE_COMMAND_QUEUE_ERR); #endif // CL_HPP_TARGET_OPENCL_VERSION >= 200 - if (err != NULL) { + if (err != nullptr) { *err = default_error_; } return default_; @@ -7556,34 +7366,6 @@ public: return *this; } - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - CommandQueue(const CommandQueue& queue) : detail::Wrapper(queue) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - CommandQueue& operator = (const CommandQueue &queue) - { - detail::Wrapper::operator=(queue); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - CommandQueue(CommandQueue&& queue) CL_HPP_NOEXCEPT_ : detail::Wrapper(std::move(queue)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. - */ - CommandQueue& operator = (CommandQueue &&queue) - { - detail::Wrapper::operator=(std::move(queue)); - return *this; - } - template cl_int getInfo(cl_command_queue_info name, T* param) const { @@ -7595,12 +7377,12 @@ public: template typename detail::param_traits::param_type - getInfo(cl_int* err = NULL) const + getInfo(cl_int* err = nullptr) const { typename detail::param_traits< detail::cl_command_queue_info, name>::param_type param; cl_int result = getInfo(name, ¶m); - if (err != NULL) { + if (err != nullptr) { *err = result; } return param; @@ -7612,20 +7394,20 @@ public: size_type offset, size_type size, void* ptr, - const vector* events = NULL, - Event* event = NULL) const + const vector* events = nullptr, + Event* event = nullptr) const { cl_event tmp; cl_int err = detail::errHandler( ::clEnqueueReadBuffer( object_, buffer(), blocking, offset, size, ptr, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), + (events != nullptr) ? (cl_uint) events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr, + (event != nullptr) ? &tmp : nullptr), __ENQUEUE_READ_BUFFER_ERR); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; @@ -7637,20 +7419,20 @@ public: size_type offset, size_type size, const void* ptr, - const vector* events = NULL, - Event* event = NULL) const + const vector* events = nullptr, + Event* event = nullptr) const { cl_event tmp; cl_int err = detail::errHandler( ::clEnqueueWriteBuffer( object_, buffer(), blocking, offset, size, ptr, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? 
(cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), + (events != nullptr) ? (cl_uint) events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr, + (event != nullptr) ? &tmp : nullptr), __ENQUEUE_WRITE_BUFFER_ERR); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; @@ -7662,19 +7444,19 @@ public: size_type src_offset, size_type dst_offset, size_type size, - const vector* events = NULL, - Event* event = NULL) const + const vector* events = nullptr, + Event* event = nullptr) const { cl_event tmp; cl_int err = detail::errHandler( ::clEnqueueCopyBuffer( object_, src(), dst(), src_offset, dst_offset, size, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), + (events != nullptr) ? (cl_uint) events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr, + (event != nullptr) ? &tmp : nullptr), __ENQEUE_COPY_BUFFER_ERR); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; @@ -7691,8 +7473,8 @@ public: size_type host_row_pitch, size_type host_slice_pitch, void *ptr, - const vector* events = NULL, - Event* event = NULL) const + const vector* events = nullptr, + Event* event = nullptr) const { cl_event tmp; cl_int err = detail::errHandler( @@ -7708,17 +7490,46 @@ public: host_row_pitch, host_slice_pitch, ptr, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), + (events != nullptr) ? (cl_uint) events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr, + (event != nullptr) ? &tmp : nullptr), __ENQUEUE_READ_BUFFER_RECT_ERR); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; } + cl_int enqueueReadBufferRect( + const Buffer& buffer, + cl_bool blocking, + const array& buffer_offset, + const array& host_offset, + const array& region, + size_type buffer_row_pitch, + size_type buffer_slice_pitch, + size_type host_row_pitch, + size_type host_slice_pitch, + void* ptr, + const vector* events = nullptr, + Event* event = nullptr) const + { + return enqueueReadBufferRect( + buffer, + blocking, + { buffer_offset[0], buffer_offset[1], 0 }, + { host_offset[0], host_offset[1], 0 }, + { region[0], region[1], 1 }, + buffer_row_pitch, + buffer_slice_pitch, + host_row_pitch, + host_slice_pitch, + ptr, + events, + event); + } + cl_int enqueueWriteBufferRect( const Buffer& buffer, cl_bool blocking, @@ -7730,8 +7541,8 @@ public: size_type host_row_pitch, size_type host_slice_pitch, const void *ptr, - const vector* events = NULL, - Event* event = NULL) const + const vector* events = nullptr, + Event* event = nullptr) const { cl_event tmp; cl_int err = detail::errHandler( @@ -7747,17 +7558,46 @@ public: host_row_pitch, host_slice_pitch, ptr, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), + (events != nullptr) ? (cl_uint) events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr, + (event != nullptr) ? 
&tmp : nullptr), __ENQUEUE_WRITE_BUFFER_RECT_ERR); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; } + cl_int enqueueWriteBufferRect( + const Buffer& buffer, + cl_bool blocking, + const array& buffer_offset, + const array& host_offset, + const array& region, + size_type buffer_row_pitch, + size_type buffer_slice_pitch, + size_type host_row_pitch, + size_type host_slice_pitch, + const void* ptr, + const vector* events = nullptr, + Event* event = nullptr) const + { + return enqueueWriteBufferRect( + buffer, + blocking, + { buffer_offset[0], buffer_offset[1], 0 }, + { host_offset[0], host_offset[1], 0 }, + { region[0], region[1], 1 }, + buffer_row_pitch, + buffer_slice_pitch, + host_row_pitch, + host_slice_pitch, + ptr, + events, + event); + } + cl_int enqueueCopyBufferRect( const Buffer& src, const Buffer& dst, @@ -7768,8 +7608,8 @@ public: size_type src_slice_pitch, size_type dst_row_pitch, size_type dst_slice_pitch, - const vector* events = NULL, - Event* event = NULL) const + const vector* events = nullptr, + Event* event = nullptr) const { cl_event tmp; cl_int err = detail::errHandler( @@ -7784,16 +7624,44 @@ public: src_slice_pitch, dst_row_pitch, dst_slice_pitch, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), + (events != nullptr) ? (cl_uint) events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr, + (event != nullptr) ? &tmp : nullptr), __ENQEUE_COPY_BUFFER_RECT_ERR); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; } + + cl_int enqueueCopyBufferRect( + const Buffer& src, + const Buffer& dst, + const array& src_origin, + const array& dst_origin, + const array& region, + size_type src_row_pitch, + size_type src_slice_pitch, + size_type dst_row_pitch, + size_type dst_slice_pitch, + const vector* events = nullptr, + Event* event = nullptr) const + { + return enqueueCopyBufferRect( + src, + dst, + { src_origin[0], src_origin[1], 0 }, + { dst_origin[0], dst_origin[1], 0 }, + { region[0], region[1], 1 }, + src_row_pitch, + src_slice_pitch, + dst_row_pitch, + dst_slice_pitch, + events, + event); + } + #endif // CL_HPP_TARGET_OPENCL_VERSION >= 110 #if CL_HPP_TARGET_OPENCL_VERSION >= 120 /** @@ -7813,8 +7681,8 @@ public: PatternType pattern, size_type offset, size_type size, - const vector* events = NULL, - Event* event = NULL) const + const vector* events = nullptr, + Event* event = nullptr) const { cl_event tmp; cl_int err = detail::errHandler( @@ -7825,12 +7693,12 @@ public: sizeof(PatternType), offset, size, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), + (events != nullptr) ? (cl_uint) events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr, + (event != nullptr) ? 
&tmp : nullptr), __ENQUEUE_FILL_BUFFER_ERR); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; @@ -7845,8 +7713,8 @@ public: size_type row_pitch, size_type slice_pitch, void* ptr, - const vector* events = NULL, - Event* event = NULL) const + const vector* events = nullptr, + Event* event = nullptr) const { cl_event tmp; cl_int err = detail::errHandler( @@ -7859,17 +7727,40 @@ public: row_pitch, slice_pitch, ptr, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), + (events != nullptr) ? (cl_uint) events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr, + (event != nullptr) ? &tmp : nullptr), __ENQUEUE_READ_IMAGE_ERR); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; } + cl_int enqueueReadImage( + const Image& image, + cl_bool blocking, + const array& origin, + const array& region, + size_type row_pitch, + size_type slice_pitch, + void* ptr, + const vector* events = nullptr, + Event* event = nullptr) const + { + return enqueueReadImage( + image, + blocking, + { origin[0], origin[1], 0 }, + { region[0], region[1], 1 }, + row_pitch, + slice_pitch, + ptr, + events, + event); + } + cl_int enqueueWriteImage( const Image& image, cl_bool blocking, @@ -7878,8 +7769,8 @@ public: size_type row_pitch, size_type slice_pitch, const void* ptr, - const vector* events = NULL, - Event* event = NULL) const + const vector* events = nullptr, + Event* event = nullptr) const { cl_event tmp; cl_int err = detail::errHandler( @@ -7892,25 +7783,48 @@ public: row_pitch, slice_pitch, ptr, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), + (events != nullptr) ? (cl_uint) events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr, + (event != nullptr) ? &tmp : nullptr), __ENQUEUE_WRITE_IMAGE_ERR); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; } + cl_int enqueueWriteImage( + const Image& image, + cl_bool blocking, + const array& origin, + const array& region, + size_type row_pitch, + size_type slice_pitch, + const void* ptr, + const vector* events = nullptr, + Event* event = nullptr) const + { + return enqueueWriteImage( + image, + blocking, + { origin[0], origin[1], 0 }, + { region[0], region[1], 1 }, + row_pitch, + slice_pitch, + ptr, + events, + event); + } + cl_int enqueueCopyImage( const Image& src, const Image& dst, const array& src_origin, const array& dst_origin, const array& region, - const vector* events = NULL, - Event* event = NULL) const + const vector* events = nullptr, + Event* event = nullptr) const { cl_event tmp; cl_int err = detail::errHandler( @@ -7921,118 +7835,102 @@ public: src_origin.data(), dst_origin.data(), region.data(), - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), + (events != nullptr) ? (cl_uint) events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr, + (event != nullptr) ? 
&tmp : nullptr), __ENQUEUE_COPY_IMAGE_ERR); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; } -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 - /** - * Enqueue a command to fill an image object with a specified color. - * \param fillColor is the color to use to fill the image. - * This is a four component RGBA floating-point color value if - * the image channel data type is not an unnormalized signed or - * unsigned data type. - */ - cl_int enqueueFillImage( - const Image& image, - cl_float4 fillColor, - const array& origin, - const array& region, - const vector* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueFillImage( - object_, - image(), - static_cast(&fillColor), - origin.data(), - region.data(), - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_FILL_IMAGE_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; + cl_int enqueueCopyImage( + const Image& src, + const Image& dst, + const array& src_origin, + const array& dst_origin, + const array& region, + const vector* events = nullptr, + Event* event = nullptr) const + { + return enqueueCopyImage( + src, + dst, + { src_origin[0], src_origin[1], 0 }, + { dst_origin[0], dst_origin[1], 0 }, + { region[0], region[1], 1 }, + events, + event); } +#if CL_HPP_TARGET_OPENCL_VERSION >= 120 /** * Enqueue a command to fill an image object with a specified color. * \param fillColor is the color to use to fill the image. - * This is a four component RGBA signed integer color value if - * the image channel data type is an unnormalized signed integer - * type. + * This is a four component RGBA floating-point, signed integer + * or unsigned integer color value if the image channel data + * type is an unnormalized signed integer type. */ - cl_int enqueueFillImage( - const Image& image, - cl_int4 fillColor, - const array& origin, - const array& region, - const vector* events = NULL, - Event* event = NULL) const + template + typename std::enable_if::value || + std::is_same::value || + std::is_same::value, + cl_int>::type + enqueueFillImage( + const Image& image, + T fillColor, + const array& origin, + const array& region, + const vector* events = nullptr, + Event* event = nullptr) const { cl_event tmp; cl_int err = detail::errHandler( ::clEnqueueFillImage( - object_, + object_, image(), - static_cast(&fillColor), + static_cast(&fillColor), origin.data(), region.data(), - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_FILL_IMAGE_ERR); + (events != nullptr) ? (cl_uint)events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : nullptr), + __ENQUEUE_FILL_IMAGE_ERR); - if (event != NULL && err == CL_SUCCESS) - *event = tmp; + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; } - /** + /** * Enqueue a command to fill an image object with a specified color. * \param fillColor is the color to use to fill the image. - * This is a four component RGBA unsigned integer color value if - * the image channel data type is an unnormalized unsigned integer - * type. 
+ * This is a four component RGBA floating-point, signed integer + * or unsigned integer color value if the image channel data + * type is an unnormalized signed integer type. */ - cl_int enqueueFillImage( + template + typename std::enable_if::value || + std::is_same::value || + std::is_same::value, cl_int>::type + enqueueFillImage( const Image& image, - cl_uint4 fillColor, - const array& origin, - const array& region, - const vector* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueFillImage( - object_, - image(), - static_cast(&fillColor), - origin.data(), - region.data(), - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_FILL_IMAGE_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; + T fillColor, + const array& origin, + const array& region, + const vector* events = nullptr, + Event* event = nullptr) const + { + return enqueueFillImage( + image, + fillColor, + { origin[0], origin[1], 0 }, + { region[0], region[1], 1 }, + events, + event + ); } #endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 @@ -8042,8 +7940,8 @@ public: const array& src_origin, const array& region, size_type dst_offset, - const vector* events = NULL, - Event* event = NULL) const + const vector* events = nullptr, + Event* event = nullptr) const { cl_event tmp; cl_int err = detail::errHandler( @@ -8054,25 +7952,44 @@ public: src_origin.data(), region.data(), dst_offset, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), + (events != nullptr) ? (cl_uint) events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr, + (event != nullptr) ? &tmp : nullptr), __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; } - cl_int enqueueCopyBufferToImage( + cl_int enqueueCopyImageToBuffer( + const Image& src, + const Buffer& dst, + const array& src_origin, + const array& region, + size_type dst_offset, + const vector* events = nullptr, + Event* event = nullptr) const + { + return enqueueCopyImageToBuffer( + src, + dst, + { src_origin[0], src_origin[1], 0 }, + { region[0], region[1], 1 }, + dst_offset, + events, + event); + } + + cl_int enqueueCopyBufferToImage( const Buffer& src, const Image& dst, size_type src_offset, const array& dst_origin, const array& region, - const vector* events = NULL, - Event* event = NULL) const + const vector* events = nullptr, + Event* event = nullptr) const { cl_event tmp; cl_int err = detail::errHandler( @@ -8083,79 +8000,116 @@ public: src_offset, dst_origin.data(), region.data(), - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), + (events != nullptr) ? (cl_uint) events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr, + (event != nullptr) ? 
&tmp : nullptr), __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; } + cl_int enqueueCopyBufferToImage( + const Buffer& src, + const Image& dst, + size_type src_offset, + const array& dst_origin, + const array& region, + const vector* events = nullptr, + Event* event = nullptr) const + { + return enqueueCopyBufferToImage( + src, + dst, + src_offset, + { dst_origin[0], dst_origin[1], 0 }, + { region[0], region[1], 1 }, + events, + event); + } + void* enqueueMapBuffer( const Buffer& buffer, cl_bool blocking, cl_map_flags flags, size_type offset, size_type size, - const vector* events = NULL, - Event* event = NULL, - cl_int* err = NULL) const + const vector* events = nullptr, + Event* event = nullptr, + cl_int* err = nullptr) const { cl_event tmp; cl_int error; void * result = ::clEnqueueMapBuffer( object_, buffer(), blocking, flags, offset, size, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL, + (events != nullptr) ? (cl_uint) events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr, + (event != nullptr) ? &tmp : nullptr, &error); detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } - if (event != NULL && error == CL_SUCCESS) + if (event != nullptr && error == CL_SUCCESS) *event = tmp; return result; } void* enqueueMapImage( - const Image& buffer, + const Image& image, cl_bool blocking, cl_map_flags flags, const array& origin, const array& region, size_type * row_pitch, size_type * slice_pitch, - const vector* events = NULL, - Event* event = NULL, - cl_int* err = NULL) const + const vector* events = nullptr, + Event* event = nullptr, + cl_int* err = nullptr) const { cl_event tmp; cl_int error; void * result = ::clEnqueueMapImage( - object_, buffer(), blocking, flags, + object_, image(), blocking, flags, origin.data(), region.data(), row_pitch, slice_pitch, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL, + (events != nullptr) ? (cl_uint) events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr, + (event != nullptr) ? &tmp : nullptr, &error); detail::errHandler(error, __ENQUEUE_MAP_IMAGE_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } - if (event != NULL && error == CL_SUCCESS) + if (event != nullptr && error == CL_SUCCESS) *event = tmp; return result; } + void* enqueueMapImage( + const Image& image, + cl_bool blocking, + cl_map_flags flags, + const array& origin, + const array& region, + size_type* row_pitch, + size_type* slice_pitch, + const vector* events = nullptr, + Event* event = nullptr, + cl_int* err = nullptr) const + { + return enqueueMapImage(image, blocking, flags, + { origin[0], origin[1], 0 }, + { region[0], region[1], 1 }, row_pitch, + slice_pitch, events, event, err); + } + #if CL_HPP_TARGET_OPENCL_VERSION >= 200 /** * Enqueues a command that will allow the host to update a region of a coarse-grained SVM buffer. 
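Note on the hunks above: the new two-element-array overloads (enqueueReadBufferRect, enqueueWriteBufferRect, enqueueCopyBufferRect, enqueueReadImage, enqueueWriteImage, enqueueCopyImage, enqueueCopyImageToBuffer, enqueueCopyBufferToImage, enqueueMapImage) are thin conveniences that forward to the existing three-dimensional entry points with a z-origin of 0 and a depth of 1. A minimal usage sketch of the 2D enqueueReadBufferRect overload follows; it assumes a valid queue and buffer, assumes the bindings are included as CL/opencl.hpp, and uses an illustrative helper name (read_tile) that is not part of this patch.

    // Read a width x height byte region out of a pitched device buffer into
    // tightly packed host memory, using the 2D enqueueReadBufferRect overload
    // added above. It forwards to the 3D variant with z = 0 and depth = 1.
    #define CL_HPP_TARGET_OPENCL_VERSION 300
    #include <CL/opencl.hpp>   // include path assumed; adjust to the bundled header name
    #include <vector>

    cl_int read_tile(cl::CommandQueue &queue, const cl::Buffer &buf,
                     std::vector<unsigned char> &host,
                     cl::size_type width, cl::size_type height,
                     cl::size_type device_row_pitch)
    {
        host.resize(width * height);

        const cl::array<cl::size_type, 2> buffer_offset = {0, 0};
        const cl::array<cl::size_type, 2> host_offset   = {0, 0};
        const cl::array<cl::size_type, 2> region        = {width, height}; // bytes x rows

        // Blocking read; host rows are tightly packed, so host_row_pitch == width.
        return queue.enqueueReadBufferRect(buf, CL_TRUE,
                                           buffer_offset, host_offset, region,
                                           device_row_pitch, /*buffer_slice_pitch*/ 0,
                                           /*host_row_pitch*/ width, /*host_slice_pitch*/ 0,
                                           host.data());
    }

Passing explicitly typed cl::array<cl::size_type, 2> arguments (rather than bare braced lists) keeps overload resolution unambiguous between the 2D and 3D variants.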
@@ -8167,18 +8121,18 @@ public: cl_bool blocking, cl_map_flags flags, size_type size, - const vector* events = NULL, - Event* event = NULL) const + const vector* events = nullptr, + Event* event = nullptr) const { cl_event tmp; cl_int err = detail::errHandler(::clEnqueueSVMMap( object_, blocking, flags, static_cast(ptr), size, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), + (events != nullptr) ? (cl_uint)events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*)&events->front() : nullptr, + (event != nullptr) ? &tmp : nullptr), __ENQUEUE_MAP_BUFFER_ERR); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; @@ -8195,18 +8149,18 @@ public: cl_bool blocking, cl_map_flags flags, size_type size, - const vector* events = NULL, - Event* event = NULL) const + const vector* events = nullptr, + Event* event = nullptr) const { cl_event tmp; cl_int err = detail::errHandler(::clEnqueueSVMMap( object_, blocking, flags, static_cast(ptr.get()), size, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), + (events != nullptr) ? (cl_uint)events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*)&events->front() : nullptr, + (event != nullptr) ? &tmp : nullptr), __ENQUEUE_MAP_BUFFER_ERR); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; @@ -8221,18 +8175,18 @@ public: cl::vector &container, cl_bool blocking, cl_map_flags flags, - const vector* events = NULL, - Event* event = NULL) const + const vector* events = nullptr, + Event* event = nullptr) const { cl_event tmp; cl_int err = detail::errHandler(::clEnqueueSVMMap( object_, blocking, flags, static_cast(container.data()), container.size()*sizeof(T), - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), + (events != nullptr) ? (cl_uint)events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*)&events->front() : nullptr, + (event != nullptr) ? &tmp : nullptr), __ENQUEUE_MAP_BUFFER_ERR); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; @@ -8242,19 +8196,19 @@ public: cl_int enqueueUnmapMemObject( const Memory& memory, void* mapped_ptr, - const vector* events = NULL, - Event* event = NULL) const + const vector* events = nullptr, + Event* event = nullptr) const { cl_event tmp; cl_int err = detail::errHandler( ::clEnqueueUnmapMemObject( object_, memory(), mapped_ptr, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), + (events != nullptr) ? (cl_uint) events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr, + (event != nullptr) ? 
&tmp : nullptr), __ENQUEUE_UNMAP_MEM_OBJECT_ERR); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; @@ -8269,19 +8223,19 @@ public: template cl_int enqueueUnmapSVM( T* ptr, - const vector* events = NULL, - Event* event = NULL) const + const vector* events = nullptr, + Event* event = nullptr) const { cl_event tmp; cl_int err = detail::errHandler( ::clEnqueueSVMUnmap( object_, static_cast(ptr), - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), + (events != nullptr) ? (cl_uint)events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*)&events->front() : nullptr, + (event != nullptr) ? &tmp : nullptr), __ENQUEUE_UNMAP_MEM_OBJECT_ERR); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; @@ -8294,19 +8248,19 @@ public: template cl_int enqueueUnmapSVM( cl::pointer &ptr, - const vector* events = NULL, - Event* event = NULL) const + const vector* events = nullptr, + Event* event = nullptr) const { cl_event tmp; cl_int err = detail::errHandler( ::clEnqueueSVMUnmap( object_, static_cast(ptr.get()), - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), + (events != nullptr) ? (cl_uint)events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*)&events->front() : nullptr, + (event != nullptr) ? &tmp : nullptr), __ENQUEUE_UNMAP_MEM_OBJECT_ERR); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; @@ -8319,19 +8273,19 @@ public: template cl_int enqueueUnmapSVM( cl::vector &container, - const vector* events = NULL, - Event* event = NULL) const + const vector* events = nullptr, + Event* event = nullptr) const { cl_event tmp; cl_int err = detail::errHandler( ::clEnqueueSVMUnmap( object_, static_cast(container.data()), - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), + (events != nullptr) ? (cl_uint)events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*)&events->front() : nullptr, + (event != nullptr) ? &tmp : nullptr), __ENQUEUE_UNMAP_MEM_OBJECT_ERR); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; @@ -8351,19 +8305,19 @@ public: * have completed. */ cl_int enqueueMarkerWithWaitList( - const vector *events = 0, - Event *event = 0) const + const vector *events = nullptr, + Event *event = nullptr) const { cl_event tmp; cl_int err = detail::errHandler( ::clEnqueueMarkerWithWaitList( object_, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), + (events != nullptr) ? (cl_uint) events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr, + (event != nullptr) ? &tmp : nullptr), __ENQUEUE_MARKER_WAIT_LIST_ERR); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; @@ -8381,19 +8335,19 @@ public: * before this command to command_queue, have completed. 
*/ cl_int enqueueBarrierWithWaitList( - const vector *events = 0, - Event *event = 0) const + const vector *events = nullptr, + Event *event = nullptr) const { cl_event tmp; cl_int err = detail::errHandler( ::clEnqueueBarrierWithWaitList( object_, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), + (events != nullptr) ? (cl_uint) events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr, + (event != nullptr) ? &tmp : nullptr), __ENQUEUE_BARRIER_WAIT_LIST_ERR); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; @@ -8406,8 +8360,8 @@ public: cl_int enqueueMigrateMemObjects( const vector &memObjects, cl_mem_migration_flags flags, - const vector* events = NULL, - Event* event = NULL + const vector* events = nullptr, + Event* event = nullptr ) const { cl_event tmp; @@ -8424,12 +8378,12 @@ public: (cl_uint)memObjects.size(), localMemObjects.data(), flags, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), + (events != nullptr) ? (cl_uint) events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr, + (event != nullptr) ? &tmp : nullptr), __ENQUEUE_UNMAP_MEM_OBJECT_ERR); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; @@ -8448,8 +8402,8 @@ public: const cl::vector &svmRawPointers, const cl::vector &sizes, cl_mem_migration_flags flags = 0, - const vector* events = NULL, - Event* event = NULL) const + const vector* events = nullptr, + Event* event = nullptr) const { cl_event tmp; cl_int err = detail::errHandler(::clEnqueueSVMMigrateMem( @@ -8457,12 +8411,12 @@ public: svmRawPointers.size(), static_cast(svmRawPointers.data()), sizes.data(), // array of sizes not passed flags, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), + (events != nullptr) ? (cl_uint)events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*)&events->front() : nullptr, + (event != nullptr) ? 
&tmp : nullptr), __ENQUEUE_MIGRATE_SVM_ERR); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; @@ -8476,8 +8430,8 @@ public: cl_int enqueueMigrateSVM( const cl::vector &svmRawPointers, cl_mem_migration_flags flags = 0, - const vector* events = NULL, - Event* event = NULL) const + const vector* events = nullptr, + Event* event = nullptr) const { return enqueueMigrateSVM(svmRawPointers, cl::vector(svmRawPointers.size()), flags, events, event); } @@ -8493,8 +8447,8 @@ public: const cl::vector> &svmPointers, const cl::vector &sizes, cl_mem_migration_flags flags = 0, - const vector* events = NULL, - Event* event = NULL) const + const vector* events = nullptr, + Event* event = nullptr) const { cl::vector svmRawPointers; svmRawPointers.reserve(svmPointers.size()); @@ -8514,8 +8468,8 @@ public: cl_int enqueueMigrateSVM( const cl::vector> &svmPointers, cl_mem_migration_flags flags = 0, - const vector* events = NULL, - Event* event = NULL) const + const vector* events = nullptr, + Event* event = nullptr) const { return enqueueMigrateSVM(svmPointers, cl::vector(svmPointers.size()), flags, events, event); } @@ -8530,8 +8484,8 @@ public: const cl::vector> &svmContainers, const cl::vector &sizes, cl_mem_migration_flags flags = 0, - const vector* events = NULL, - Event* event = NULL) const + const vector* events = nullptr, + Event* event = nullptr) const { cl::vector svmRawPointers; svmRawPointers.reserve(svmContainers.size()); @@ -8550,8 +8504,8 @@ public: cl_int enqueueMigrateSVM( const cl::vector> &svmContainers, cl_mem_migration_flags flags = 0, - const vector* events = NULL, - Event* event = NULL) const + const vector* events = nullptr, + Event* event = nullptr) const { return enqueueMigrateSVM(svmContainers, cl::vector(svmContainers.size()), flags, events, event); } @@ -8563,22 +8517,22 @@ public: const NDRange& offset, const NDRange& global, const NDRange& local = NullRange, - const vector* events = NULL, - Event* event = NULL) const + const vector* events = nullptr, + Event* event = nullptr) const { cl_event tmp; cl_int err = detail::errHandler( ::clEnqueueNDRangeKernel( object_, kernel(), (cl_uint) global.dimensions(), - offset.dimensions() != 0 ? (const size_type*) offset : NULL, + offset.dimensions() != 0 ? (const size_type*) offset : nullptr, (const size_type*) global, - local.dimensions() != 0 ? (const size_type*) local : NULL, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), + local.dimensions() != 0 ? (const size_type*) local : nullptr, + (events != nullptr) ? (cl_uint) events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr, + (event != nullptr) ? &tmp : nullptr), __ENQUEUE_NDRANGE_KERNEL_ERR); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; @@ -8587,19 +8541,19 @@ public: #if defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) CL_API_PREFIX__VERSION_1_2_DEPRECATED cl_int enqueueTask( const Kernel& kernel, - const vector* events = NULL, - Event* event = NULL) const CL_API_SUFFIX__VERSION_1_2_DEPRECATED + const vector* events = nullptr, + Event* event = nullptr) const CL_API_SUFFIX__VERSION_1_2_DEPRECATED { cl_event tmp; cl_int err = detail::errHandler( ::clEnqueueTask( object_, kernel(), - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? 
(cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), + (events != nullptr) ? (cl_uint) events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr, + (event != nullptr) ? &tmp : nullptr), __ENQUEUE_TASK_ERR); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; @@ -8609,33 +8563,24 @@ public: cl_int enqueueNativeKernel( void (CL_CALLBACK *userFptr)(void *), std::pair args, - const vector* mem_objects = NULL, - const vector* mem_locs = NULL, - const vector* events = NULL, - Event* event = NULL) const + const vector* mem_objects = nullptr, + const vector* mem_locs = nullptr, + const vector* events = nullptr, + Event* event = nullptr) const { - size_type elements = 0; - if (mem_objects != NULL) { - elements = mem_objects->size(); - } - vector mems(elements); - for (unsigned int i = 0; i < elements; i++) { - mems[i] = ((*mem_objects)[i])(); - } - cl_event tmp; cl_int err = detail::errHandler( ::clEnqueueNativeKernel( object_, userFptr, args.first, args.second, - (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, - mems.data(), - (mem_locs != NULL && mem_locs->size() > 0) ? (const void **) &mem_locs->front() : NULL, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), + (mem_objects != nullptr) ? (cl_uint) mem_objects->size() : 0, + (mem_objects->size() > 0 ) ? reinterpret_cast(mem_objects->data()) : nullptr, + (mem_locs != nullptr && mem_locs->size() > 0) ? (const void **) &mem_locs->front() : nullptr, + (events != nullptr) ? (cl_uint) events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr, + (event != nullptr) ? &tmp : nullptr), __ENQUEUE_NATIVE_KERNEL); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; @@ -8646,16 +8591,16 @@ public: */ #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) CL_API_PREFIX__VERSION_1_1_DEPRECATED - cl_int enqueueMarker(Event* event = NULL) const CL_API_SUFFIX__VERSION_1_1_DEPRECATED + cl_int enqueueMarker(Event* event = nullptr) const CL_API_SUFFIX__VERSION_1_1_DEPRECATED { cl_event tmp; cl_int err = detail::errHandler( ::clEnqueueMarker( object_, - (event != NULL) ? &tmp : NULL), + (event != nullptr) ? &tmp : nullptr), __ENQUEUE_MARKER_ERR); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; @@ -8668,50 +8613,50 @@ public: ::clEnqueueWaitForEvents( object_, (cl_uint) events.size(), - events.size() > 0 ? (const cl_event*) &events.front() : NULL), + events.size() > 0 ? (const cl_event*) &events.front() : nullptr), __ENQUEUE_WAIT_FOR_EVENTS_ERR); } #endif // defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) cl_int enqueueAcquireGLObjects( - const vector* mem_objects = NULL, - const vector* events = NULL, - Event* event = NULL) const + const vector* mem_objects = nullptr, + const vector* events = nullptr, + Event* event = nullptr) const { cl_event tmp; cl_int err = detail::errHandler( ::clEnqueueAcquireGLObjects( object_, - (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, - (mem_objects != NULL && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): NULL, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? 
&tmp : NULL), + (mem_objects != nullptr) ? (cl_uint) mem_objects->size() : 0, + (mem_objects != nullptr && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): nullptr, + (events != nullptr) ? (cl_uint) events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr, + (event != nullptr) ? &tmp : nullptr), __ENQUEUE_ACQUIRE_GL_ERR); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; } cl_int enqueueReleaseGLObjects( - const vector* mem_objects = NULL, - const vector* events = NULL, - Event* event = NULL) const + const vector* mem_objects = nullptr, + const vector* events = nullptr, + Event* event = nullptr) const { cl_event tmp; cl_int err = detail::errHandler( ::clEnqueueReleaseGLObjects( object_, - (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, - (mem_objects != NULL && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): NULL, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), + (mem_objects != nullptr) ? (cl_uint) mem_objects->size() : 0, + (mem_objects != nullptr && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): nullptr, + (events != nullptr) ? (cl_uint) events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr, + (event != nullptr) ? &tmp : nullptr), __ENQUEUE_RELEASE_GL_ERR); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; @@ -8728,18 +8673,18 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)( const cl_event* event_wait_list, cl_event* event); cl_int enqueueAcquireD3D10Objects( - const vector* mem_objects = NULL, - const vector* events = NULL, - Event* event = NULL) const + const vector* mem_objects = nullptr, + const vector* events = nullptr, + Event* event = nullptr) const { - static PFN_clEnqueueAcquireD3D10ObjectsKHR pfn_clEnqueueAcquireD3D10ObjectsKHR = NULL; + static PFN_clEnqueueAcquireD3D10ObjectsKHR pfn_clEnqueueAcquireD3D10ObjectsKHR = nullptr; #if CL_HPP_TARGET_OPENCL_VERSION >= 120 cl_context context = getInfo(); cl::Device device(getInfo()); cl_platform_id platform = device.getInfo(); CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clEnqueueAcquireD3D10ObjectsKHR); #endif -#if CL_HPP_TARGET_OPENCL_VERSION >= 110 +#if CL_HPP_MINIMUM_OPENCL_VERSION < 120 CL_HPP_INIT_CL_EXT_FCN_PTR_(clEnqueueAcquireD3D10ObjectsKHR); #endif @@ -8747,47 +8692,47 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)( cl_int err = detail::errHandler( pfn_clEnqueueAcquireD3D10ObjectsKHR( object_, - (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, - (mem_objects != NULL && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): NULL, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), + (mem_objects != nullptr) ? (cl_uint) mem_objects->size() : 0, + (mem_objects != nullptr && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): nullptr, + (events != nullptr) ? (cl_uint) events->size() : 0, + (events != nullptr) ? (cl_event*) &events->front() : nullptr, + (event != nullptr) ? 
&tmp : nullptr), __ENQUEUE_ACQUIRE_GL_ERR); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; } cl_int enqueueReleaseD3D10Objects( - const vector* mem_objects = NULL, - const vector* events = NULL, - Event* event = NULL) const + const vector* mem_objects = nullptr, + const vector* events = nullptr, + Event* event = nullptr) const { - static PFN_clEnqueueReleaseD3D10ObjectsKHR pfn_clEnqueueReleaseD3D10ObjectsKHR = NULL; + static PFN_clEnqueueReleaseD3D10ObjectsKHR pfn_clEnqueueReleaseD3D10ObjectsKHR = nullptr; #if CL_HPP_TARGET_OPENCL_VERSION >= 120 cl_context context = getInfo(); cl::Device device(getInfo()); cl_platform_id platform = device.getInfo(); CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clEnqueueReleaseD3D10ObjectsKHR); -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 -#if CL_HPP_TARGET_OPENCL_VERSION >= 110 +#endif +#if CL_HPP_MINIMUM_OPENCL_VERSION < 120 CL_HPP_INIT_CL_EXT_FCN_PTR_(clEnqueueReleaseD3D10ObjectsKHR); -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110 +#endif cl_event tmp; cl_int err = detail::errHandler( pfn_clEnqueueReleaseD3D10ObjectsKHR( object_, - (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, - (mem_objects != NULL && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): NULL, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), + (mem_objects != nullptr) ? (cl_uint) mem_objects->size() : 0, + (mem_objects != nullptr && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): nullptr, + (events != nullptr) ? (cl_uint) events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr, + (event != nullptr) ? &tmp : nullptr), __ENQUEUE_RELEASE_GL_ERR); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; @@ -8816,8 +8761,86 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)( { return detail::errHandler(::clFinish(object_), __FINISH_ERR); } + +#ifdef cl_khr_external_memory + cl_int enqueueAcquireExternalMemObjects( + const vector& mem_objects, + const vector* events_wait = nullptr, + Event *event = nullptr) + { + cl_int err = CL_INVALID_OPERATION; + cl_event tmp; + + std::call_once(ext_memory_initialized_, initMemoryExtension, this->getInfo()); + + if (pfn_clEnqueueAcquireExternalMemObjectsKHR) + { + err = pfn_clEnqueueAcquireExternalMemObjectsKHR( + object_, + static_cast(mem_objects.size()), + (mem_objects.size() > 0) ? reinterpret_cast(mem_objects.data()) : nullptr, + (events_wait != nullptr) ? static_cast(events_wait->size()) : 0, + (events_wait != nullptr && events_wait->size() > 0) ? reinterpret_cast(events_wait->data()) : nullptr, + &tmp); + } + + detail::errHandler(err, __ENQUEUE_ACQUIRE_EXTERNAL_MEMORY_ERR); + + if (event != nullptr && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueReleaseExternalMemObjects( + const vector& mem_objects, + const vector* events_wait = nullptr, + Event *event = nullptr) + { + cl_int err = CL_INVALID_OPERATION; + cl_event tmp; + + std::call_once(ext_memory_initialized_, initMemoryExtension, this->getInfo()); + + if (pfn_clEnqueueReleaseExternalMemObjectsKHR) + { + err = pfn_clEnqueueReleaseExternalMemObjectsKHR( + object_, + static_cast(mem_objects.size()), + (mem_objects.size() > 0) ? reinterpret_cast(mem_objects.data()) : nullptr, + (events_wait != nullptr) ? 
static_cast(events_wait->size()) : 0, + (events_wait != nullptr && events_wait->size() > 0) ? reinterpret_cast(events_wait->data()) : nullptr, + &tmp); + } + + detail::errHandler(err, __ENQUEUE_RELEASE_EXTERNAL_MEMORY_ERR); + + if (event != nullptr && err == CL_SUCCESS) + *event = tmp; + + return err; + } +#endif // cl_khr_external_memory && CL_HPP_TARGET_OPENCL_VERSION >= 300 + +#ifdef cl_khr_semaphore + cl_int enqueueWaitSemaphores( + const vector &sema_objects, + const vector &sema_payloads = {}, + const vector* events_wait_list = nullptr, + Event *event = nullptr) const; + + cl_int enqueueSignalSemaphores( + const vector &sema_objects, + const vector& sema_payloads = {}, + const vector* events_wait_list = nullptr, + Event* event = nullptr); +#endif // cl_khr_semaphore }; // CommandQueue +#ifdef cl_khr_external_memory +CL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag CommandQueue::ext_memory_initialized_; +#endif + CL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag CommandQueue::default_initialized_; CL_HPP_DEFINE_STATIC_MEMBER_ CommandQueue CommandQueue::default_; CL_HPP_DEFINE_STATIC_MEMBER_ cl_int CommandQueue::default_error_ = CL_SUCCESS; @@ -8850,7 +8873,7 @@ public: /*! * Default construct device command queue on default context and device */ - DeviceCommandQueue(DeviceQueueProperties properties, cl_int* err = NULL) + DeviceCommandQueue(DeviceQueueProperties properties, cl_int* err = nullptr) { cl_int error; cl::Context context = cl::Context::getDefault(); @@ -8865,7 +8888,7 @@ public: context(), device(), queue_properties, &error); detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -8877,7 +8900,7 @@ public: const Context& context, const Device& device, DeviceQueueProperties properties = DeviceQueueProperties::None, - cl_int* err = NULL) + cl_int* err = nullptr) { cl_int error; @@ -8889,7 +8912,7 @@ public: context(), device(), queue_properties, &error); detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -8902,7 +8925,7 @@ public: const Device& device, cl_uint queueSize, DeviceQueueProperties properties = DeviceQueueProperties::None, - cl_int* err = NULL) + cl_int* err = nullptr) { cl_int error; @@ -8916,7 +8939,7 @@ public: context(), device(), queue_properties, &error); detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -8936,34 +8959,6 @@ public: return *this; } - /*! \brief Copy constructor to forward copy to the superclass correctly. - * Required for MSVC. - */ - DeviceCommandQueue(const DeviceCommandQueue& queue) : detail::Wrapper(queue) {} - - /*! \brief Copy assignment to forward copy to the superclass correctly. - * Required for MSVC. - */ - DeviceCommandQueue& operator = (const DeviceCommandQueue &queue) - { - detail::Wrapper::operator=(queue); - return *this; - } - - /*! \brief Move constructor to forward move to the superclass correctly. - * Required for MSVC. - */ - DeviceCommandQueue(DeviceCommandQueue&& queue) CL_HPP_NOEXCEPT_ : detail::Wrapper(std::move(queue)) {} - - /*! \brief Move assignment to forward move to the superclass correctly. - * Required for MSVC. 
- */ - DeviceCommandQueue& operator = (DeviceCommandQueue &&queue) - { - detail::Wrapper::operator=(std::move(queue)); - return *this; - } - template cl_int getInfo(cl_command_queue_info name, T* param) const { @@ -8975,12 +8970,12 @@ public: template typename detail::param_traits::param_type - getInfo(cl_int* err = NULL) const + getInfo(cl_int* err = nullptr) const { typename detail::param_traits< detail::cl_command_queue_info, name>::param_type param; cl_int result = getInfo(name, ¶m); - if (err != NULL) { + if (err != nullptr) { *err = result; } return param; @@ -9009,7 +9004,7 @@ public: context(), device(), queue_properties, &error)); detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } @@ -9037,7 +9032,7 @@ public: context(), device(), queue_properties, &error)); detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } @@ -9066,7 +9061,7 @@ public: context(), device(), queue_properties, &error)); detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } @@ -9088,7 +9083,7 @@ public: error = clSetDefaultDeviceCommandQueue(context.get(), device.get(), default_queue.get()); detail::errHandler(error, __SET_DEFAULT_DEVICE_COMMAND_QUEUE_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } return default_queue; @@ -9097,7 +9092,7 @@ public: /*! * Return the current default command queue for the specified command queue */ - static DeviceCommandQueue getDefault(const CommandQueue &queue, cl_int * err = NULL) + static DeviceCommandQueue getDefault(const CommandQueue &queue, cl_int * err = nullptr) { return queue.getInfo(err); } @@ -9145,26 +9140,26 @@ Buffer::Buffer( size_type size = sizeof(DataType)*(endIterator - startIterator); if( useHostPtr ) { - object_ = ::clCreateBuffer(context(), flags, size, static_cast(&*startIterator), &error); + object_ = ::clCreateBuffer(context(), flags, size, const_cast(&*startIterator), &error); } else { object_ = ::clCreateBuffer(context(), flags, size, 0, &error); } detail::errHandler(error, __CREATE_BUFFER_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } if( !useHostPtr ) { CommandQueue queue(context, 0, &error); detail::errHandler(error, __CREATE_BUFFER_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } error = cl::copy(queue, startIterator, endIterator, *this); detail::errHandler(error, __CREATE_BUFFER_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -9198,21 +9193,21 @@ Buffer::Buffer( Context context = queue.getInfo(); if (useHostPtr) { - object_ = ::clCreateBuffer(context(), flags, size, static_cast(&*startIterator), &error); + object_ = ::clCreateBuffer(context(), flags, size, const_cast(&*startIterator), &error); } else { object_ = ::clCreateBuffer(context(), flags, size, 0, &error); } detail::errHandler(error, __CREATE_BUFFER_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } if (!useHostPtr) { error = cl::copy(queue, startIterator, endIterator, *this); detail::errHandler(error, __CREATE_BUFFER_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } } @@ -9224,8 +9219,8 @@ inline cl_int enqueueReadBuffer( size_type offset, size_type size, void* ptr, - const vector* events = NULL, - Event* event = NULL) + const vector* events = nullptr, + Event* event = nullptr) { cl_int error; CommandQueue queue = 
CommandQueue::getDefault(&error); @@ -9243,8 +9238,8 @@ inline cl_int enqueueWriteBuffer( size_type offset, size_type size, const void* ptr, - const vector* events = NULL, - Event* event = NULL) + const vector* events = nullptr, + Event* event = nullptr) { cl_int error; CommandQueue queue = CommandQueue::getDefault(&error); @@ -9262,26 +9257,26 @@ inline void* enqueueMapBuffer( cl_map_flags flags, size_type offset, size_type size, - const vector* events = NULL, - Event* event = NULL, - cl_int* err = NULL) + const vector* events = nullptr, + Event* event = nullptr, + cl_int* err = nullptr) { cl_int error; CommandQueue queue = CommandQueue::getDefault(&error); detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } void * result = ::clEnqueueMapBuffer( queue(), buffer(), blocking, flags, offset, size, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (events != nullptr) ? (cl_uint) events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr, (cl_event*) event, &error); detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); - if (err != NULL) { + if (err != nullptr) { *err = error; } return result; @@ -9320,12 +9315,12 @@ inline cl_int enqueueMapSVM( */ template inline cl_int enqueueMapSVM( - cl::pointer ptr, + cl::pointer &ptr, cl_bool blocking, cl_map_flags flags, size_type size, - const vector* events = NULL, - Event* event = NULL) + const vector* events = nullptr, + Event* event = nullptr) { cl_int error; CommandQueue queue = CommandQueue::getDefault(&error); @@ -9344,11 +9339,11 @@ inline cl_int enqueueMapSVM( */ template inline cl_int enqueueMapSVM( - cl::vector container, + cl::vector &container, cl_bool blocking, cl_map_flags flags, - const vector* events = NULL, - Event* event = NULL) + const vector* events = nullptr, + Event* event = nullptr) { cl_int error; CommandQueue queue = CommandQueue::getDefault(&error); @@ -9365,8 +9360,8 @@ inline cl_int enqueueMapSVM( inline cl_int enqueueUnmapMemObject( const Memory& memory, void* mapped_ptr, - const vector* events = NULL, - Event* event = NULL) + const vector* events = nullptr, + Event* event = nullptr) { cl_int error; CommandQueue queue = CommandQueue::getDefault(&error); @@ -9379,12 +9374,12 @@ inline cl_int enqueueUnmapMemObject( cl_int err = detail::errHandler( ::clEnqueueUnmapMemObject( queue(), memory(), mapped_ptr, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), + (events != nullptr) ? (cl_uint)events->size() : 0, + (events != nullptr && events->size() > 0) ? (cl_event*)&events->front() : nullptr, + (event != nullptr) ? 
&tmp : nullptr), __ENQUEUE_UNMAP_MEM_OBJECT_ERR); - if (event != NULL && err == CL_SUCCESS) + if (event != nullptr && err == CL_SUCCESS) *event = tmp; return err; @@ -9399,8 +9394,8 @@ inline cl_int enqueueUnmapMemObject( template inline cl_int enqueueUnmapSVM( T* ptr, - const vector* events = NULL, - Event* event = NULL) + const vector* events = nullptr, + Event* event = nullptr) { cl_int error; CommandQueue queue = CommandQueue::getDefault(&error); @@ -9421,8 +9416,8 @@ inline cl_int enqueueUnmapSVM( template inline cl_int enqueueUnmapSVM( cl::pointer &ptr, - const vector* events = NULL, - Event* event = NULL) + const vector* events = nullptr, + Event* event = nullptr) { cl_int error; CommandQueue queue = CommandQueue::getDefault(&error); @@ -9442,8 +9437,8 @@ inline cl_int enqueueUnmapSVM( template inline cl_int enqueueUnmapSVM( cl::vector &container, - const vector* events = NULL, - Event* event = NULL) + const vector* events = nullptr, + Event* event = nullptr) { cl_int error; CommandQueue queue = CommandQueue::getDefault(&error); @@ -9463,8 +9458,8 @@ inline cl_int enqueueCopyBuffer( size_type src_offset, size_type dst_offset, size_type size, - const vector* events = NULL, - Event* event = NULL) + const vector* events = nullptr, + Event* event = nullptr) { cl_int error; CommandQueue queue = CommandQueue::getDefault(&error); @@ -9612,8 +9607,8 @@ inline cl_int enqueueReadBufferRect( size_type host_row_pitch, size_type host_slice_pitch, void *ptr, - const vector* events = NULL, - Event* event = NULL) + const vector* events = nullptr, + Event* event = nullptr) { cl_int error; CommandQueue queue = CommandQueue::getDefault(&error); @@ -9637,6 +9632,35 @@ inline cl_int enqueueReadBufferRect( event); } +inline cl_int enqueueReadBufferRect( + const Buffer& buffer, + cl_bool blocking, + const array& buffer_offset, + const array& host_offset, + const array& region, + size_type buffer_row_pitch, + size_type buffer_slice_pitch, + size_type host_row_pitch, + size_type host_slice_pitch, + void* ptr, + const vector* events = nullptr, + Event* event = nullptr) +{ + return enqueueReadBufferRect( + buffer, + blocking, + { buffer_offset[0], buffer_offset[1], 0 }, + { host_offset[0], host_offset[1], 0 }, + { region[0], region[1], 1 }, + buffer_row_pitch, + buffer_slice_pitch, + host_row_pitch, + host_slice_pitch, + ptr, + events, + event); +} + inline cl_int enqueueWriteBufferRect( const Buffer& buffer, cl_bool blocking, @@ -9648,8 +9672,8 @@ inline cl_int enqueueWriteBufferRect( size_type host_row_pitch, size_type host_slice_pitch, const void *ptr, - const vector* events = NULL, - Event* event = NULL) + const vector* events = nullptr, + Event* event = nullptr) { cl_int error; CommandQueue queue = CommandQueue::getDefault(&error); @@ -9673,6 +9697,35 @@ inline cl_int enqueueWriteBufferRect( event); } +inline cl_int enqueueWriteBufferRect( + const Buffer& buffer, + cl_bool blocking, + const array& buffer_offset, + const array& host_offset, + const array& region, + size_type buffer_row_pitch, + size_type buffer_slice_pitch, + size_type host_row_pitch, + size_type host_slice_pitch, + const void* ptr, + const vector* events = nullptr, + Event* event = nullptr) +{ + return enqueueWriteBufferRect( + buffer, + blocking, + { buffer_offset[0], buffer_offset[1], 0 }, + { host_offset[0], host_offset[1], 0 }, + { region[0], region[1], 1 }, + buffer_row_pitch, + buffer_slice_pitch, + host_row_pitch, + host_slice_pitch, + ptr, + events, + event); +} + inline cl_int enqueueCopyBufferRect( const Buffer& src, const 
Buffer& dst, @@ -9683,8 +9736,8 @@ inline cl_int enqueueCopyBufferRect( size_type src_slice_pitch, size_type dst_row_pitch, size_type dst_slice_pitch, - const vector* events = NULL, - Event* event = NULL) + const vector* events = nullptr, + Event* event = nullptr) { cl_int error; CommandQueue queue = CommandQueue::getDefault(&error); @@ -9706,6 +9759,33 @@ inline cl_int enqueueCopyBufferRect( events, event); } + +inline cl_int enqueueCopyBufferRect( + const Buffer& src, + const Buffer& dst, + const array& src_origin, + const array& dst_origin, + const array& region, + size_type src_row_pitch, + size_type src_slice_pitch, + size_type dst_row_pitch, + size_type dst_slice_pitch, + const vector* events = nullptr, + Event* event = nullptr) +{ + return enqueueCopyBufferRect( + src, + dst, + { src_origin[0], src_origin[1], 0 }, + { dst_origin[0], dst_origin[1], 0 }, + { region[0], region[1], 1 }, + src_row_pitch, + src_slice_pitch, + dst_row_pitch, + dst_slice_pitch, + events, + event); +} #endif // CL_HPP_TARGET_OPENCL_VERSION >= 110 inline cl_int enqueueReadImage( @@ -9716,8 +9796,8 @@ inline cl_int enqueueReadImage( size_type row_pitch, size_type slice_pitch, void* ptr, - const vector* events = NULL, - Event* event = NULL) + const vector* events = nullptr, + Event* event = nullptr) { cl_int error; CommandQueue queue = CommandQueue::getDefault(&error); @@ -9738,6 +9818,29 @@ inline cl_int enqueueReadImage( event); } +inline cl_int enqueueReadImage( + const Image& image, + cl_bool blocking, + const array& origin, + const array& region, + size_type row_pitch, + size_type slice_pitch, + void* ptr, + const vector* events = nullptr, + Event* event = nullptr) +{ + return enqueueReadImage( + image, + blocking, + { origin[0], origin[1], 0 }, + { region[0], region[1], 1 }, + row_pitch, + slice_pitch, + ptr, + events, + event); +} + inline cl_int enqueueWriteImage( const Image& image, cl_bool blocking, @@ -9746,8 +9849,8 @@ inline cl_int enqueueWriteImage( size_type row_pitch, size_type slice_pitch, const void* ptr, - const vector* events = NULL, - Event* event = NULL) + const vector* events = nullptr, + Event* event = nullptr) { cl_int error; CommandQueue queue = CommandQueue::getDefault(&error); @@ -9768,14 +9871,37 @@ inline cl_int enqueueWriteImage( event); } +inline cl_int enqueueWriteImage( + const Image& image, + cl_bool blocking, + const array& origin, + const array& region, + size_type row_pitch, + size_type slice_pitch, + const void* ptr, + const vector* events = nullptr, + Event* event = nullptr) +{ + return enqueueWriteImage( + image, + blocking, + { origin[0], origin[1], 0 }, + { region[0], region[1], 1 }, + row_pitch, + slice_pitch, + ptr, + events, + event); +} + inline cl_int enqueueCopyImage( const Image& src, const Image& dst, const array& src_origin, const array& dst_origin, const array& region, - const vector* events = NULL, - Event* event = NULL) + const vector* events = nullptr, + Event* event = nullptr) { cl_int error; CommandQueue queue = CommandQueue::getDefault(&error); @@ -9794,14 +9920,33 @@ inline cl_int enqueueCopyImage( event); } +inline cl_int enqueueCopyImage( + const Image& src, + const Image& dst, + const array& src_origin, + const array& dst_origin, + const array& region, + const vector* events = nullptr, + Event* event = nullptr) +{ + return enqueueCopyImage( + src, + dst, + { src_origin[0], src_origin[1], 0 }, + { dst_origin[0], dst_origin[1], 0 }, + { region[0], region[1], 1 }, + events, + event); +} + inline cl_int enqueueCopyImageToBuffer( const Image& src, const 
Buffer& dst, const array& src_origin, const array& region, size_type dst_offset, - const vector* events = NULL, - Event* event = NULL) + const vector* events = nullptr, + Event* event = nullptr) { cl_int error; CommandQueue queue = CommandQueue::getDefault(&error); @@ -9820,14 +9965,33 @@ inline cl_int enqueueCopyImageToBuffer( event); } +inline cl_int enqueueCopyImageToBuffer( + const Image& src, + const Buffer& dst, + const array& src_origin, + const array& region, + size_type dst_offset, + const vector* events = nullptr, + Event* event = nullptr) +{ + return enqueueCopyImageToBuffer( + src, + dst, + { src_origin[0], src_origin[1], 0 }, + { region[0], region[1], 1 }, + dst_offset, + events, + event); +} + inline cl_int enqueueCopyBufferToImage( const Buffer& src, const Image& dst, size_type src_offset, const array& dst_origin, const array& region, - const vector* events = NULL, - Event* event = NULL) + const vector* events = nullptr, + Event* event = nullptr) { cl_int error; CommandQueue queue = CommandQueue::getDefault(&error); @@ -9846,6 +10010,31 @@ inline cl_int enqueueCopyBufferToImage( event); } +inline cl_int enqueueCopyBufferToImage( + const Buffer& src, + const Image& dst, + size_type src_offset, + const array& dst_origin, + const array& region, + const vector* events = nullptr, + Event* event = nullptr) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return enqueueCopyBufferToImage( + src, + dst, + src_offset, + { dst_origin[0], dst_origin[1], 0 }, + { region[0], region[1], 1 }, + events, + event); +} inline cl_int flush(void) { @@ -10094,7 +10283,7 @@ public: KernelFunctor( const Program& program, const string name, - cl_int * err = NULL) : + cl_int * err = nullptr) : kernel_(program, name.c_str(), err) {} @@ -10183,7 +10372,7 @@ namespace compatibility { make_kernel( const Program& program, const string name, - cl_int * err = NULL) : + cl_int * err = nullptr) : functor_(FunctorType(program, name, err)) {} @@ -10210,7 +10399,737 @@ namespace compatibility { }; } // namespace compatibility +#ifdef cl_khr_semaphore +class Semaphore : public detail::Wrapper +{ +public: + Semaphore() : detail::Wrapper() {} + Semaphore( + const Context &context, + const vector& sema_props, + cl_int *err = nullptr) + { + /* initialization of addresses to extension functions (it is done only once) */ + std::call_once(ext_init_, initExtensions, context); + + cl_int error = CL_INVALID_OPERATION; + if (pfn_clCreateSemaphoreWithPropertiesKHR) + { + object_ = pfn_clCreateSemaphoreWithPropertiesKHR( + context(), + sema_props.data(), + &error); + } + + detail::errHandler(error, __CREATE_SEMAPHORE_KHR_WITH_PROPERTIES_ERR); + + if (err != nullptr) { + *err = error; + } + } + Semaphore( + const vector& sema_props, + cl_int* err = nullptr):Semaphore(Context::getDefault(err), sema_props, err) {} + + explicit Semaphore(const cl_semaphore_khr& semaphore, bool retainObject = false) : + detail::Wrapper(semaphore, retainObject) {} + Semaphore& operator = (const cl_semaphore_khr& rhs) { + detail::Wrapper::operator=(rhs); + return *this; + } + template + cl_int getInfo(cl_semaphore_info_khr name, T* param) const + { + if (pfn_clGetSemaphoreInfoKHR == nullptr) { + return detail::errHandler(CL_INVALID_OPERATION, + __GET_SEMAPHORE_KHR_INFO_ERR); + } + + return detail::errHandler( + detail::getInfo(&pfn_clGetSemaphoreInfoKHR, object_, name, param), + __GET_SEMAPHORE_KHR_INFO_ERR); + } + template typename + detail::param_traits::param_type + 
getInfo(cl_int* err = nullptr) const + { + typename detail::param_traits< + detail::cl_semaphore_info_khr, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != nullptr) { + *err = result; + } + return param; + } + + cl_int retain() + { + if (pfn_clRetainSemaphoreKHR == nullptr) { + return detail::errHandler(CL_INVALID_OPERATION, + __RETAIN_SEMAPHORE_KHR_ERR); + } + return pfn_clRetainSemaphoreKHR(object_); + } + + cl_int release() + { + if (pfn_clReleaseSemaphoreKHR == nullptr) { + return detail::errHandler(CL_INVALID_OPERATION, + __RELEASE_SEMAPHORE_KHR_ERR); + } + return pfn_clReleaseSemaphoreKHR(object_); + } + +private: + static std::once_flag ext_init_; + + static void initExtensions(const Context& context) + { +#if CL_HPP_TARGET_OPENCL_VERSION >= 120 + Device device = context.getInfo().at(0); + cl_platform_id platform = device.getInfo(); + CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCreateSemaphoreWithPropertiesKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clReleaseSemaphoreKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clRetainSemaphoreKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clEnqueueWaitSemaphoresKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clEnqueueSignalSemaphoresKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clGetSemaphoreInfoKHR); +#else + CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateSemaphoreWithPropertiesKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_(clReleaseSemaphoreKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_(clRetainSemaphoreKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_(clEnqueueWaitSemaphoresKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_(clEnqueueSignalSemaphoresKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_(clGetSemaphoreInfoKHR); +#endif + if ((pfn_clCreateSemaphoreWithPropertiesKHR == nullptr) && + (pfn_clReleaseSemaphoreKHR == nullptr) && + (pfn_clRetainSemaphoreKHR == nullptr) && + (pfn_clEnqueueWaitSemaphoresKHR == nullptr) && + (pfn_clEnqueueSignalSemaphoresKHR == nullptr) && + (pfn_clGetSemaphoreInfoKHR == nullptr)) + { + detail::errHandler(CL_INVALID_VALUE, __CREATE_SEMAPHORE_KHR_WITH_PROPERTIES_ERR); + } + } + +}; + +CL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag Semaphore::ext_init_; + +inline cl_int CommandQueue::enqueueWaitSemaphores( + const vector &sema_objects, + const vector &sema_payloads, + const vector* events_wait_list, + Event *event) const +{ + cl_event tmp; + cl_int err = CL_INVALID_OPERATION; + + if (pfn_clEnqueueWaitSemaphoresKHR != nullptr) { + err = pfn_clEnqueueWaitSemaphoresKHR( + object_, + (cl_uint)sema_objects.size(), + (const cl_semaphore_khr *) &sema_objects.front(), + (sema_payloads.size() > 0) ? &sema_payloads.front() : nullptr, + (events_wait_list != nullptr) ? (cl_uint) events_wait_list->size() : 0, + (events_wait_list != nullptr && events_wait_list->size() > 0) ? (cl_event*) &events_wait_list->front() : nullptr, + (event != nullptr) ? &tmp : nullptr); + } + + detail::errHandler(err, __ENQUEUE_WAIT_SEMAPHORE_KHR_ERR); + + if (event != nullptr && err == CL_SUCCESS) + *event = tmp; + + return err; +} + +inline cl_int CommandQueue::enqueueSignalSemaphores( + const vector &sema_objects, + const vector& sema_payloads, + const vector* events_wait_list, + Event* event) +{ + cl_event tmp; + cl_int err = CL_INVALID_OPERATION; + + if (pfn_clEnqueueSignalSemaphoresKHR != nullptr) { + err = pfn_clEnqueueSignalSemaphoresKHR( + object_, + (cl_uint)sema_objects.size(), + (const cl_semaphore_khr*) &sema_objects.front(), + (sema_payloads.size() > 0) ? &sema_payloads.front() : nullptr, + (events_wait_list != nullptr) ? 
(cl_uint) events_wait_list->size() : 0, + (events_wait_list != nullptr && events_wait_list->size() > 0) ? (cl_event*) &events_wait_list->front() : nullptr, + (event != nullptr) ? &tmp : nullptr); + } + + detail::errHandler(err, __ENQUEUE_SIGNAL_SEMAPHORE_KHR_ERR); + + if (event != nullptr && err == CL_SUCCESS) + *event = tmp; + + return err; +} + +#endif // cl_khr_semaphore + +#if defined(cl_khr_command_buffer) +/*! \class CommandBufferKhr + * \brief CommandBufferKhr interface for cl_command_buffer_khr. + */ +class CommandBufferKhr : public detail::Wrapper +{ +public: + //! \brief Default constructor - initializes to nullptr. + CommandBufferKhr() : detail::Wrapper() { } + + explicit CommandBufferKhr(const vector &queues, + cl_command_buffer_properties_khr properties = 0, + cl_int* errcode_ret = nullptr) + { + cl_command_buffer_properties_khr command_buffer_properties[] = { + CL_COMMAND_BUFFER_FLAGS_KHR, properties, 0 + }; + + /* initialization of addresses to extension functions (it is done only once) */ + std::call_once(ext_init_, [&] { initExtensions(queues[0].getInfo()); }); + cl_int error = CL_INVALID_OPERATION; + + static_assert(sizeof(cl::CommandQueue) == sizeof(cl_command_queue), + "Size of cl::CommandQueue must be equal to size of cl_command_queue"); + + if (pfn_clCreateCommandBufferKHR) + { + object_ = pfn_clCreateCommandBufferKHR((cl_uint) queues.size(), + (cl_command_queue *) &queues.front(), + command_buffer_properties, + &error); + } + + detail::errHandler(error, __CREATE_COMMAND_BUFFER_KHR_ERR); + if (errcode_ret != nullptr) { + *errcode_ret = error; + } + } + + explicit CommandBufferKhr(const cl_command_buffer_khr& commandBufferKhr, bool retainObject = false) : + detail::Wrapper(commandBufferKhr, retainObject) { } + + CommandBufferKhr& operator=(const cl_command_buffer_khr& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + template + cl_int getInfo(cl_command_buffer_info_khr name, T* param) const + { + if (pfn_clGetCommandBufferInfoKHR == nullptr) { + return detail::errHandler(CL_INVALID_OPERATION, + __GET_COMMAND_BUFFER_INFO_KHR_ERR); + } + return detail::errHandler( + detail::getInfo(pfn_clGetCommandBufferInfoKHR, object_, name, param), + __GET_COMMAND_BUFFER_INFO_KHR_ERR); + } + + template typename + detail::param_traits::param_type + getInfo(cl_int* err = nullptr) const + { + typename detail::param_traits< + detail::cl_command_buffer_info_khr, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != nullptr) { + *err = result; + } + return param; + } + + cl_int finalizeCommandBuffer() const + { + return detail::errHandler(::clFinalizeCommandBufferKHR(object_), __FINALIZE_COMMAND_BUFFER_KHR_ERR); + } + + cl_int enqueueCommandBuffer(vector &queues, + const vector* events = nullptr, + Event* event = nullptr) + { + if (pfn_clEnqueueCommandBufferKHR == nullptr) { + return detail::errHandler(CL_INVALID_OPERATION, + __ENQUEUE_COMMAND_BUFFER_KHR_ERR); + } + + static_assert(sizeof(cl::CommandQueue) == sizeof(cl_command_queue), + "Size of cl::CommandQueue must be equal to size of cl_command_queue"); + + return detail::errHandler(pfn_clEnqueueCommandBufferKHR((cl_uint) queues.size(), + (cl_command_queue *) &queues.front(), + object_, + (events != nullptr) ? (cl_uint) events->size() : 0, + (events != nullptr && events->size() > 0) ? 
(cl_event*) &events->front() : nullptr, + (cl_event*) event), + __ENQUEUE_COMMAND_BUFFER_KHR_ERR); + } + + cl_int commandBarrierWithWaitList(const vector* sync_points_vec = nullptr, + cl_sync_point_khr* sync_point = nullptr, + MutableCommandKhr* mutable_handle = nullptr, + const CommandQueue* command_queue = nullptr) + { + if (pfn_clCommandBarrierWithWaitListKHR == nullptr) { + return detail::errHandler(CL_INVALID_OPERATION, + __COMMAND_BARRIER_WITH_WAIT_LIST_KHR_ERR); + } + + cl_sync_point_khr tmp_sync_point; + cl_int error = detail::errHandler( + pfn_clCommandBarrierWithWaitListKHR(object_, + (command_queue != nullptr) ? (*command_queue)() : nullptr, + (sync_points_vec != nullptr) ? (cl_uint) sync_points_vec->size() : 0, + (sync_points_vec != nullptr && sync_points_vec->size() > 0) ? &sync_points_vec->front() : nullptr, + (sync_point != nullptr) ? &tmp_sync_point : nullptr, + (cl_mutable_command_khr*) mutable_handle), + __COMMAND_BARRIER_WITH_WAIT_LIST_KHR_ERR); + + if (sync_point != nullptr && error == CL_SUCCESS) + *sync_point = tmp_sync_point; + + return error; + } + + cl_int commandCopyBuffer(const Buffer& src, + const Buffer& dst, + size_type src_offset, + size_type dst_offset, + size_type size, + const vector* sync_points_vec = nullptr, + cl_sync_point_khr* sync_point = nullptr, + MutableCommandKhr* mutable_handle = nullptr, + const CommandQueue* command_queue = nullptr) + { + if (pfn_clCommandCopyBufferKHR == nullptr) { + return detail::errHandler(CL_INVALID_OPERATION, + __COMMAND_COPY_BUFFER_KHR_ERR); + } + + cl_sync_point_khr tmp_sync_point; + cl_int error = detail::errHandler( + pfn_clCommandCopyBufferKHR(object_, + (command_queue != nullptr) ? (*command_queue)() : nullptr, + src(), + dst(), + src_offset, + dst_offset, + size, + (sync_points_vec != nullptr) ? (cl_uint) sync_points_vec->size() : 0, + (sync_points_vec != nullptr && sync_points_vec->size() > 0) ? &sync_points_vec->front() : nullptr, + (sync_point != nullptr) ? &tmp_sync_point : nullptr, + (cl_mutable_command_khr*) mutable_handle), + __COMMAND_COPY_BUFFER_KHR_ERR); + + if (sync_point != nullptr && error == CL_SUCCESS) + *sync_point = tmp_sync_point; + + return error; + } + + cl_int commandCopyBufferRect(const Buffer& src, + const Buffer& dst, + const array& src_origin, + const array& dst_origin, + const array& region, + size_type src_row_pitch, + size_type src_slice_pitch, + size_type dst_row_pitch, + size_type dst_slice_pitch, + const vector* sync_points_vec = nullptr, + cl_sync_point_khr* sync_point = nullptr, + MutableCommandKhr* mutable_handle = nullptr, + const CommandQueue* command_queue = nullptr) + { + if (pfn_clCommandCopyBufferRectKHR == nullptr) { + return detail::errHandler(CL_INVALID_OPERATION, + __COMMAND_COPY_BUFFER_RECT_KHR_ERR); + } + + cl_sync_point_khr tmp_sync_point; + cl_int error = detail::errHandler( + pfn_clCommandCopyBufferRectKHR(object_, + (command_queue != nullptr) ? (*command_queue)() : nullptr, + src(), + dst(), + src_origin.data(), + dst_origin.data(), + region.data(), + src_row_pitch, + src_slice_pitch, + dst_row_pitch, + dst_slice_pitch, + (sync_points_vec != nullptr) ? (cl_uint) sync_points_vec->size() : 0, + (sync_points_vec != nullptr && sync_points_vec->size() > 0) ? &sync_points_vec->front() : nullptr, + (sync_point != nullptr) ? 
&tmp_sync_point : nullptr, + (cl_mutable_command_khr*) mutable_handle), + __COMMAND_COPY_BUFFER_RECT_KHR_ERR); + + if (sync_point != nullptr && error == CL_SUCCESS) + *sync_point = tmp_sync_point; + + return error; + } + + cl_int commandCopyBufferToImage(const Buffer& src, + const Image& dst, + size_type src_offset, + const array& dst_origin, + const array& region, + const vector* sync_points_vec = nullptr, + cl_sync_point_khr* sync_point = nullptr, + MutableCommandKhr* mutable_handle = nullptr, + const CommandQueue* command_queue = nullptr) + { + if (pfn_clCommandCopyBufferToImageKHR == nullptr) { + return detail::errHandler(CL_INVALID_OPERATION, + __COMMAND_COPY_BUFFER_TO_IMAGE_KHR_ERR); + } + + cl_sync_point_khr tmp_sync_point; + cl_int error = detail::errHandler( + pfn_clCommandCopyBufferToImageKHR(object_, + (command_queue != nullptr) ? (*command_queue)() : nullptr, + src(), + dst(), + src_offset, + dst_origin.data(), + region.data(), + (sync_points_vec != nullptr) ? (cl_uint) sync_points_vec->size() : 0, + (sync_points_vec != nullptr && sync_points_vec->size() > 0) ? &sync_points_vec->front() : nullptr, + (sync_point != nullptr) ? &tmp_sync_point : nullptr, + (cl_mutable_command_khr*) mutable_handle), + __COMMAND_COPY_BUFFER_TO_IMAGE_KHR_ERR); + + if (sync_point != nullptr && error == CL_SUCCESS) + *sync_point = tmp_sync_point; + + return error; + } + + cl_int commandCopyImage(const Image& src, + const Image& dst, + const array& src_origin, + const array& dst_origin, + const array& region, + const vector* sync_points_vec = nullptr, + cl_sync_point_khr* sync_point = nullptr, + MutableCommandKhr* mutable_handle = nullptr, + const CommandQueue* command_queue = nullptr) + { + if (pfn_clCommandCopyImageKHR == nullptr) { + return detail::errHandler(CL_INVALID_OPERATION, + __COMMAND_COPY_IMAGE_KHR_ERR); + } + + cl_sync_point_khr tmp_sync_point; + cl_int error = detail::errHandler( + pfn_clCommandCopyImageKHR(object_, + (command_queue != nullptr) ? (*command_queue)() : nullptr, + src(), + dst(), + src_origin.data(), + dst_origin.data(), + region.data(), + (sync_points_vec != nullptr) ? (cl_uint) sync_points_vec->size() : 0, + (sync_points_vec != nullptr && sync_points_vec->size() > 0) ? &sync_points_vec->front() : nullptr, + (sync_point != nullptr) ? &tmp_sync_point : nullptr, + (cl_mutable_command_khr*) mutable_handle), + __COMMAND_COPY_IMAGE_KHR_ERR); + + if (sync_point != nullptr && error == CL_SUCCESS) + *sync_point = tmp_sync_point; + + return error; + } + + cl_int commandCopyImageToBuffer(const Image& src, + const Buffer& dst, + const array& src_origin, + const array& region, + size_type dst_offset, + const vector* sync_points_vec = nullptr, + cl_sync_point_khr* sync_point = nullptr, + MutableCommandKhr* mutable_handle = nullptr, + const CommandQueue* command_queue = nullptr) + { + if (pfn_clCommandCopyImageToBufferKHR == nullptr) { + return detail::errHandler(CL_INVALID_OPERATION, + __COMMAND_COPY_IMAGE_TO_BUFFER_KHR_ERR); + } + + cl_sync_point_khr tmp_sync_point; + cl_int error = detail::errHandler( + pfn_clCommandCopyImageToBufferKHR(object_, + (command_queue != nullptr) ? (*command_queue)() : nullptr, + src(), + dst(), + src_origin.data(), + region.data(), + dst_offset, + (sync_points_vec != nullptr) ? (cl_uint) sync_points_vec->size() : 0, + (sync_points_vec != nullptr && sync_points_vec->size() > 0) ? &sync_points_vec->front() : nullptr, + (sync_point != nullptr) ? 
&tmp_sync_point : nullptr, + (cl_mutable_command_khr*) mutable_handle), + __COMMAND_COPY_IMAGE_TO_BUFFER_KHR_ERR); + + if (sync_point != nullptr && error == CL_SUCCESS) + *sync_point = tmp_sync_point; + + return error; + } + + template + cl_int commandFillBuffer(const Buffer& buffer, + PatternType pattern, + size_type offset, + size_type size, + const vector* sync_points_vec = nullptr, + cl_sync_point_khr* sync_point = nullptr, + MutableCommandKhr* mutable_handle = nullptr, + const CommandQueue* command_queue = nullptr) + { + if (pfn_clCommandFillBufferKHR == nullptr) { + return detail::errHandler(CL_INVALID_OPERATION, + __COMMAND_FILL_BUFFER_KHR_ERR); + } + + cl_sync_point_khr tmp_sync_point; + cl_int error = detail::errHandler( + pfn_clCommandFillBufferKHR(object_, + (command_queue != nullptr) ? (*command_queue)() : nullptr, + buffer(), + static_cast(&pattern), + sizeof(PatternType), + offset, + size, + (sync_points_vec != nullptr) ? (cl_uint) sync_points_vec->size() : 0, + (sync_points_vec != nullptr && sync_points_vec->size() > 0) ? &sync_points_vec->front() : nullptr, + (sync_point != nullptr) ? &tmp_sync_point : nullptr, + (cl_mutable_command_khr*) mutable_handle), + __COMMAND_FILL_BUFFER_KHR_ERR); + + if (sync_point != nullptr && error == CL_SUCCESS) + *sync_point = tmp_sync_point; + + return error; + } + + cl_int commandFillImage(const Image& image, + cl_float4 fillColor, + const array& origin, + const array& region, + const vector* sync_points_vec = nullptr, + cl_sync_point_khr* sync_point = nullptr, + MutableCommandKhr* mutable_handle = nullptr, + const CommandQueue* command_queue = nullptr) + { + if (pfn_clCommandFillImageKHR == nullptr) { + return detail::errHandler(CL_INVALID_OPERATION, + __COMMAND_FILL_IMAGE_KHR_ERR); + } + + cl_sync_point_khr tmp_sync_point; + cl_int error = detail::errHandler( + pfn_clCommandFillImageKHR(object_, + (command_queue != nullptr) ? (*command_queue)() : nullptr, + image(), + static_cast(&fillColor), + origin.data(), + region.data(), + (sync_points_vec != nullptr) ? (cl_uint) sync_points_vec->size() : 0, + (sync_points_vec != nullptr && sync_points_vec->size() > 0) ? &sync_points_vec->front() : nullptr, + (sync_point != nullptr) ? &tmp_sync_point : nullptr, + (cl_mutable_command_khr*) mutable_handle), + __COMMAND_FILL_IMAGE_KHR_ERR); + + if (sync_point != nullptr && error == CL_SUCCESS) + *sync_point = tmp_sync_point; + + return error; + } + + cl_int commandNDRangeKernel(const cl::vector &properties, + const Kernel& kernel, + const NDRange& offset, + const NDRange& global, + const NDRange& local = NullRange, + const vector* sync_points_vec = nullptr, + cl_sync_point_khr* sync_point = nullptr, + MutableCommandKhr* mutable_handle = nullptr, + const CommandQueue* command_queue = nullptr) + { + if (pfn_clCommandNDRangeKernelKHR == nullptr) { + return detail::errHandler(CL_INVALID_OPERATION, + __COMMAND_NDRANGE_KERNEL_KHR_ERR); + } + + cl_sync_point_khr tmp_sync_point; + cl_int error = detail::errHandler( + pfn_clCommandNDRangeKernelKHR(object_, + (command_queue != nullptr) ? (*command_queue)() : nullptr, + &properties[0], + kernel(), + (cl_uint) global.dimensions(), + offset.dimensions() != 0 ? (const size_type*) offset : nullptr, + (const size_type*) global, + local.dimensions() != 0 ? (const size_type*) local : nullptr, + (sync_points_vec != nullptr) ? (cl_uint) sync_points_vec->size() : 0, + (sync_points_vec != nullptr && sync_points_vec->size() > 0) ? &sync_points_vec->front() : nullptr, + (sync_point != nullptr) ? 
&tmp_sync_point : nullptr, + (cl_mutable_command_khr*) mutable_handle), + __COMMAND_NDRANGE_KERNEL_KHR_ERR); + + if (sync_point != nullptr && error == CL_SUCCESS) + *sync_point = tmp_sync_point; + + return error; + } + +#if defined(cl_khr_command_buffer_mutable_dispatch) + cl_int updateMutableCommands(const cl_mutable_base_config_khr* mutable_config) + { + if (pfn_clUpdateMutableCommandsKHR == nullptr) { + return detail::errHandler(CL_INVALID_OPERATION, + __UPDATE_MUTABLE_COMMANDS_KHR_ERR); + } + return detail::errHandler(pfn_clUpdateMutableCommandsKHR(object_, mutable_config), + __UPDATE_MUTABLE_COMMANDS_KHR_ERR); + } +#endif /* cl_khr_command_buffer_mutable_dispatch */ + +private: + static std::once_flag ext_init_; + + static void initExtensions(const cl::Device& device) + { +#if CL_HPP_TARGET_OPENCL_VERSION >= 120 + cl_platform_id platform = device.getInfo(); + CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCreateCommandBufferKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clFinalizeCommandBufferKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clRetainCommandBufferKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clReleaseCommandBufferKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clGetCommandBufferInfoKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clEnqueueCommandBufferKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCommandBarrierWithWaitListKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCommandCopyBufferKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCommandCopyBufferRectKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCommandCopyBufferToImageKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCommandCopyImageKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCommandCopyImageToBufferKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCommandFillBufferKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCommandFillImageKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCommandNDRangeKernelKHR); +#if defined(cl_khr_command_buffer_mutable_dispatch) + CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clUpdateMutableCommandsKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clGetMutableCommandInfoKHR); +#endif /* cl_khr_command_buffer_mutable_dispatch */ +#elif CL_HPP_TARGET_OPENCL_VERSION >= 110 + CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateCommandBufferKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_(clFinalizeCommandBufferKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_(clRetainCommandBufferKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_(clReleaseCommandBufferKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_(clGetCommandBufferInfoKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_(clEnqueueCommandBufferKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_(clCommandBarrierWithWaitListKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_(clCommandCopyBufferKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_(clCommandCopyBufferRectKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_(clCommandCopyBufferToImageKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_(clCommandCopyImageKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_(clCommandCopyImageToBufferKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_(clCommandFillBufferKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_(clCommandFillImageKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_(clCommandNDRangeKernelKHR); +#if defined(cl_khr_command_buffer_mutable_dispatch) + CL_HPP_INIT_CL_EXT_FCN_PTR_(clUpdateMutableCommandsKHR); + CL_HPP_INIT_CL_EXT_FCN_PTR_(clGetMutableCommandInfoKHR); +#endif /* cl_khr_command_buffer_mutable_dispatch */ +#endif + if ((pfn_clCreateCommandBufferKHR == nullptr) && + (pfn_clFinalizeCommandBufferKHR == nullptr) && + 
(pfn_clRetainCommandBufferKHR == nullptr) && + (pfn_clReleaseCommandBufferKHR == nullptr) && + (pfn_clGetCommandBufferInfoKHR == nullptr) && + (pfn_clEnqueueCommandBufferKHR == nullptr) && + (pfn_clCommandBarrierWithWaitListKHR == nullptr) && + (pfn_clCommandCopyBufferKHR == nullptr) && + (pfn_clCommandCopyBufferRectKHR == nullptr) && + (pfn_clCommandCopyBufferToImageKHR == nullptr) && + (pfn_clCommandCopyImageKHR == nullptr) && + (pfn_clCommandCopyImageToBufferKHR == nullptr) && + (pfn_clCommandFillBufferKHR == nullptr) && + (pfn_clCommandFillImageKHR == nullptr) && + (pfn_clCommandNDRangeKernelKHR == nullptr) +#if defined(cl_khr_command_buffer_mutable_dispatch) + && (pfn_clUpdateMutableCommandsKHR == nullptr) + && (pfn_clGetMutableCommandInfoKHR == nullptr) +#endif /* cl_khr_command_buffer_mutable_dispatch */ + ) + { + detail::errHandler(CL_INVALID_VALUE, __CREATE_COMMAND_BUFFER_KHR_ERR); + } + } +}; // CommandBufferKhr + +CL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag CommandBufferKhr::ext_init_; + +#if defined(cl_khr_command_buffer_mutable_dispatch) +/*! \class MutableCommandKhr + * \brief MutableCommandKhr interface for cl_mutable_command_khr. + */ +class MutableCommandKhr : public detail::Wrapper +{ +public: + //! \brief Default constructor - initializes to nullptr. + MutableCommandKhr() : detail::Wrapper() { } + + explicit MutableCommandKhr(const cl_mutable_command_khr& mutableCommandKhr, bool retainObject = false) : + detail::Wrapper(mutableCommandKhr, retainObject) { } + + MutableCommandKhr& operator=(const cl_mutable_command_khr& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + template + cl_int getInfo(cl_mutable_command_info_khr name, T* param) const + { + if (pfn_clGetMutableCommandInfoKHR == nullptr) { + return detail::errHandler(CL_INVALID_OPERATION, + __GET_MUTABLE_COMMAND_INFO_KHR_ERR); + } + return detail::errHandler( + detail::getInfo(pfn_clGetMutableCommandInfoKHR, object_, name, param), + __GET_MUTABLE_COMMAND_INFO_KHR_ERR); + } + + template typename + detail::param_traits::param_type + getInfo(cl_int* err = nullptr) const + { + typename detail::param_traits< + detail::cl_mutable_command_info_khr, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != nullptr) { + *err = result; + } + return param; + } +}; // MutableCommandKhr +#endif /* cl_khr_command_buffer_mutable_dispatch */ + +#endif // cl_khr_command_buffer //---------------------------------------------------------------------------------------------------------------------- #undef CL_HPP_ERR_STR_ @@ -10233,8 +11152,26 @@ namespace compatibility { #undef __GET_PROGRAM_BUILD_INFO_ERR #undef __GET_COMMAND_QUEUE_INFO_ERR #undef __CREATE_CONTEXT_ERR -#undef __CREATE_CONTEXT_FROM_TYPE_ERR +#undef __CREATE_CONTEXT_FROM_TYPE_ERR +#undef __CREATE_COMMAND_BUFFER_KHR_ERR +#undef __GET_COMMAND_BUFFER_INFO_KHR_ERR +#undef __FINALIZE_COMMAND_BUFFER_KHR_ERR +#undef __ENQUEUE_COMMAND_BUFFER_KHR_ERR +#undef __COMMAND_BARRIER_WITH_WAIT_LIST_KHR_ERR +#undef __COMMAND_COPY_BUFFER_KHR_ERR +#undef __COMMAND_COPY_BUFFER_RECT_KHR_ERR +#undef __COMMAND_COPY_BUFFER_TO_IMAGE_KHR_ERR +#undef __COMMAND_COPY_IMAGE_KHR_ERR +#undef __COMMAND_COPY_IMAGE_TO_BUFFER_KHR_ERR +#undef __COMMAND_FILL_BUFFER_KHR_ERR +#undef __COMMAND_FILL_IMAGE_KHR_ERR +#undef __COMMAND_NDRANGE_KERNEL_KHR_ERR +#undef __UPDATE_MUTABLE_COMMANDS_KHR_ERR +#undef __GET_MUTABLE_COMMAND_INFO_KHR_ERR +#undef __RETAIN_COMMAND_BUFFER_KHR_ERR +#undef __RELEASE_COMMAND_BUFFER_KHR_ERR #undef __GET_SUPPORTED_IMAGE_FORMATS_ERR +#undef 
__SET_CONTEXT_DESCTRUCTOR_CALLBACK_ERR #undef __CREATE_BUFFER_ERR #undef __COPY_ERR #undef __CREATE_SUBBUFFER_ERR @@ -10252,7 +11189,6 @@ namespace compatibility { #undef __CREATE_KERNEL_ERR #undef __SET_KERNEL_ARGS_ERR #undef __CREATE_PROGRAM_WITH_SOURCE_ERR -#undef __CREATE_PROGRAM_WITH_IL_ERR #undef __CREATE_PROGRAM_WITH_BINARY_ERR #undef __CREATE_PROGRAM_WITH_IL_ERR #undef __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR @@ -10292,8 +11228,9 @@ namespace compatibility { #undef __FLUSH_ERR #undef __FINISH_ERR #undef __VECTOR_CAPACITY_ERR -#undef __CREATE_SUB_DEVICES_ERR -#undef __CREATE_SUB_DEVICES_ERR +#undef __CREATE_SUB_DEVICES_ERR +#undef __ENQUEUE_ACQUIRE_EXTERNAL_MEMORY_ERR +#undef __ENQUEUE_RELEASE_EXTERNAL_MEMORY_ERR #undef __ENQUEUE_MARKER_ERR #undef __ENQUEUE_WAIT_FOR_EVENTS_ERR #undef __ENQUEUE_BARRIER_ERR @@ -10310,17 +11247,18 @@ namespace compatibility { #undef __CLONE_KERNEL_ERR #undef __GET_HOST_TIMER_ERR #undef __GET_DEVICE_AND_HOST_TIMER_ERR +#undef __GET_SEMAPHORE_KHR_INFO_ERR +#undef __CREATE_SEMAPHORE_KHR_WITH_PROPERTIES_ERR +#undef __ENQUEUE_WAIT_SEMAPHORE_KHR_ERR +#undef __ENQUEUE_SIGNAL_SEMAPHORE_KHR_ERR #endif //CL_HPP_USER_OVERRIDE_ERROR_STRINGS // Extensions +#undef CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_ #undef CL_HPP_INIT_CL_EXT_FCN_PTR_ #undef CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_ -#if defined(CL_HPP_USE_CL_DEVICE_FISSION) -#undef CL_HPP_PARAM_NAME_DEVICE_FISSION_ -#endif // CL_HPP_USE_CL_DEVICE_FISSION - #undef CL_HPP_NOEXCEPT_ #undef CL_HPP_DEFINE_STATIC_MEMBER_ diff --git a/include/half/ChangeLog.txt b/include/half/ChangeLog.txt new file mode 100644 index 0000000000000000000000000000000000000000..9100b6ab74d3aa529b86884f9b73acb8fbb21ebf --- /dev/null +++ b/include/half/ChangeLog.txt @@ -0,0 +1,184 @@ +Release Notes {#changelog} +============= + +1.12.0 release (2017-03-06): +---------------------------- + +- Changed behaviour of `half_cast` to perform conversions to/from `double` + and `long double` directly according to specified rounding mode, without an + intermediate `float` conversion. +- Added `noexcept` specifiers to constructors. +- Fixed minor portability problem with `logb` and `ilogb`. +- Tested for *VC++ 2015*. + + +1.11.0 release (2013-11-16): +---------------------------- + +- Made tie-breaking behaviour in round to nearest configurable by + `HALF_ROUND_TIES_TO_EVEN` macro. +- Completed support for all C++11 mathematical functions even if single- + precision versions from `` are unsupported. +- Fixed inability to disable support for C++11 mathematical functions on + *VC++ 2013*. + + +1.10.0 release (2013-11-09): +---------------------------- + +- Made default rounding mode configurable by `HALF_ROUND_STYLE` macro. +- Added support for non-IEEE single-precision implementations. +- Added `HALF_ENABLE_CPP11_TYPE_TRAITS` preprocessor flag for checking + support for C++11 type traits and TMP features. +- Restricted `half_cast` to support built-in arithmetic types only. +- Changed behaviour of `half_cast` to respect rounding mode when casting + to/from integer types. + + +1.9.2 release (2013-11-01): +--------------------------- + +- Tested for *gcc 4.8*. +- Tested and fixed for *VC++ 2013*. +- Removed unnecessary warnings in *MSVC*. + + +1.9.1 release (2013-08-08): +--------------------------- + +- Fixed problems with older gcc and MSVC versions. +- Small fix to non-C++11 implementations of `remainder` and `remquo`. 
+ + +1.9.0 release (2013-08-07): +--------------------------- + +- Changed behaviour of `nearbyint`, `rint`, `lrint` and `llrint` to use + rounding mode of half-precision implementation (which is + truncating/indeterminate) instead of single-precision rounding mode. +- Added support for more C++11 mathematical functions even if single- + precision versions from `` are unsupported, in particular + `remainder`, `remquo` and `cbrt`. +- Minor implementation changes. + + +1.8.1 release (2013-01-22): +--------------------------- + +- Fixed bug resulting in multiple definitions of the `nanh` function due to + a missing `inline` specification. + + +1.8.0 release (2013-01-19): +--------------------------- + +- Added support for more C++11 mathematical functions even if single- + precision versions from `` are unsupported, in particular + exponential and logarithm functions, hyperbolic area functions and the + hypotenuse function. +- Made `fma` function use default implementation if single-precision version + from `` is not faster and thus `FP_FAST_FMAH` to be defined always. +- Fixed overload resolution issues when invoking certain mathematical + functions by unqualified calls. + + +1.7.0 release (2012-10-26): +--------------------------- + +- Added support for C++11 `noexcept` specifiers. +- Changed C++11 `long long` to be supported on *VC++ 2003* and up. + + +1.6.1 release (2012-09-13): +--------------------------- + +- Made `fma` and `fdim` functions available even if corresponding + single-precision functions are not. + + +1.6.0 release (2012-09-12): +--------------------------- + +- Added `HALF_ENABLE_CPP11_LONG_LONG` to control support for `long long` + integers and corresponding mathematical functions. +- Fixed C++98 compatibility on non-VC compilers. + + +1.5.1 release (2012-08-17): +--------------------------- + +- Recorrected `std::numeric_limits::round_style` to always return + `std::round_indeterminate`, due to overflow-handling deviating from + correct round-toward-zero behaviour. + + +1.5.0 release (2012-08-16): +--------------------------- + +- Added `half_cast` for explicitly casting between half and any type + convertible to/from `float` and allowing the explicit specification of + the rounding mode to use. + + +1.4.0 release (2012-08-12): +--------------------------- + +- Added support for C++11 generalized constant expressions (`constexpr`). + + +1.3.1 release (2012-08-11): +--------------------------- + +- Fixed requirement for `std::signbit` and `std::isnan` (even if C++11 + `` functions disabled) on non-VC compilers. + + +1.3.0 release (2012-08-10): +--------------------------- + +- Made requirement for `` and `static_assert` optional and thus + made the library C++98-compatible. +- Made support for C++11 features user-overridable through explicit + definition of corresponding preprocessor symbols to either 0 or 1. +- Renamed `HALF_ENABLE_HASH` to `HALF_ENABLE_CPP11_HASH` in correspondence + with other C++11 preprocessor symbols. + + +1.2.0 release (2012-08-07): +--------------------------- + +- Added proper preprocessor definitions for `HUGE_VALH` and `FP_FAST_FMAH` + in correspondence with their single-precision counterparts from ``. +- Fixed internal preprocessor macros to be properly undefined after use. + + +1.1.2 release (2012-08-07): +--------------------------- + +- Revised `std::numeric_limits::round_style` to return + `std::round_toward_zero` if the `float` version also does and + `std::round_indeterminate` otherwise. 
+- Fixed `std::numeric_limits::round_error` to reflect worst-case round + toward zero behaviour. + + +1.1.1 release (2012-08-06): +--------------------------- + +- Fixed `std::numeric_limits::min` to return smallest positive normal + number, instead of subnormal number. +- Fixed `std::numeric_limits::round_style` to return + `std::round_indeterminate` due to mixture of separately rounded + single-precision arithmetics with truncating single-to-half conversions. + + +1.1.0 release (2012-08-06): +--------------------------- + +- Added half-precision literals. + + +1.0.0 release (2012-08-05): +--------------------------- + +- First release. diff --git a/include/half/LICENSE.txt b/include/half/LICENSE.txt new file mode 100644 index 0000000000000000000000000000000000000000..9e4618bb77d000e29573dbb18a3dc028c70a5be7 --- /dev/null +++ b/include/half/LICENSE.txt @@ -0,0 +1,21 @@ +The MIT License + +Copyright (c) 2012-2017 Christian Rau + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/include/half/README.txt b/include/half/README.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a0960c1258c25c9c2ccd160e4b1ff4c43dd3778 --- /dev/null +++ b/include/half/README.txt @@ -0,0 +1,288 @@ +HALF-PRECISION FLOATING POINT LIBRARY (Version 1.12.0) +------------------------------------------------------ + +This is a C++ header-only library to provide an IEEE 754 conformant 16-bit +half-precision floating point type along with corresponding arithmetic +operators, type conversions and common mathematical functions. It aims for both +efficiency and ease of use, trying to accurately mimic the behaviour of the +builtin floating point types at the best performance possible. + + +INSTALLATION AND REQUIREMENTS +----------------------------- + +Comfortably enough, the library consists of just a single header file +containing all the functionality, which can be directly included by your +projects, without the neccessity to build anything or link to anything. + +Whereas this library is fully C++98-compatible, it can profit from certain +C++11 features. Support for those features is checked automatically at compile +(or rather preprocessing) time, but can be explicitly enabled or disabled by +defining the corresponding preprocessor symbols to either 1 or 0 yourself. 
This +is useful when the automatic detection fails (for more exotic implementations) +or when a feature should be explicitly disabled: + + - 'long long' integer type for mathematical functions returning 'long long' + results (enabled for VC++ 2003 and newer, gcc and clang, overridable with + 'HALF_ENABLE_CPP11_LONG_LONG'). + + - Static assertions for extended compile-time checks (enabled for VC++ 2010, + gcc 4.3, clang 2.9 and newer, overridable with 'HALF_ENABLE_CPP11_STATIC_ASSERT'). + + - Generalized constant expressions (enabled for VC++ 2015, gcc 4.6, clang 3.1 + and newer, overridable with 'HALF_ENABLE_CPP11_CONSTEXPR'). + + - noexcept exception specifications (enabled for VC++ 2015, gcc 4.6, clang 3.0 + and newer, overridable with 'HALF_ENABLE_CPP11_NOEXCEPT'). + + - User-defined literals for half-precision literals to work (enabled for + VC++ 2015, gcc 4.7, clang 3.1 and newer, overridable with + 'HALF_ENABLE_CPP11_USER_LITERALS'). + + - Type traits and template meta-programming features from <type_traits> + (enabled for VC++ 2010, libstdc++ 4.3, libc++ and newer, overridable with + 'HALF_ENABLE_CPP11_TYPE_TRAITS'). + + - Special integer types from <cstdint> (enabled for VC++ 2010, libstdc++ 4.3, + libc++ and newer, overridable with 'HALF_ENABLE_CPP11_CSTDINT'). + + - Certain C++11 single-precision mathematical functions from <cmath> for + an improved implementation of their half-precision counterparts to work + (enabled for VC++ 2013, libstdc++ 4.3, libc++ and newer, overridable with + 'HALF_ENABLE_CPP11_CMATH'). + + - Hash functor 'std::hash' from <functional> (enabled for VC++ 2010, + libstdc++ 4.3, libc++ and newer, overridable with 'HALF_ENABLE_CPP11_HASH'). + +The library has been tested successfully with Visual C++ 2005-2015, gcc 4.4-4.8 +and clang 3.1. Please contact me if you have any problems, suggestions or even +just success testing it on other platforms. + + +DOCUMENTATION +------------- + +Here follow some general words about the usage of the library and its +implementation. For a complete documentation of its interface look at the +corresponding website http://half.sourceforge.net. You may also generate the +complete developer documentation from the library's only include file's doxygen +comments, but this is more relevant to developers rather than mere users (for +reasons described below). + +BASIC USAGE + +To make use of the library just include its only header file half.hpp, which +defines all half-precision functionality inside the 'half_float' namespace. The +actual 16-bit half-precision data type is represented by the 'half' type.
This +type behaves like the builtin floating point types as much as possible, +supporting the usual arithmetic, comparison and streaming operators, which +makes its use pretty straight-forward: + + using half_float::half; + half a(3.4), b(5); + half c = a * b; + c += 3; + if(c > a) + std::cout << c << std::endl; + +Additionally the 'half_float' namespace also defines half-precision versions +for all mathematical functions of the C++ standard library, which can be used +directly through ADL: + + half a(-3.14159); + half s = sin(abs(a)); + long l = lround(s); + +You may also specify explicit half-precision literals, since the library +provides a user-defined literal inside the 'half_float::literal' namespace, +which you just need to import (assuming support for C++11 user-defined literals): + + using namespace half_float::literal; + half x = 1.0_h; + +Furthermore the library provides proper specializations for +'std::numeric_limits', defining various implementation properties, and +'std::hash' for hashing half-precision numbers (assuming support for C++11 +'std::hash'). Similar to the corresponding preprocessor symbols from <cmath>, +the library also defines the 'HUGE_VALH' constant and maybe the 'FP_FAST_FMAH' +symbol. + +CONVERSIONS AND ROUNDING + +The half is explicitly constructible/convertible from a single-precision float +argument. Thus it is also explicitly constructible/convertible from any type +implicitly convertible to float, but constructing it from types like double or +int will involve the usual warnings arising when implicitly converting those to +float because of the lost precision. On the one hand those warnings are +intentional, because converting those types to half necessarily also reduces +precision. But on the other hand they are raised for explicit conversions from +those types, when the user knows what he is doing. So if those warnings keep +bugging you, then you won't get around first explicitly converting to float +before converting to half, or use the 'half_cast' described below. In addition +you can also directly assign float values to halfs. + +In contrast to the float-to-half conversion, which reduces precision, the +conversion from half to float (and thus to any other type implicitly +convertible from float) is implicit, because all values representable with +half-precision are also representable with single-precision. This way the +half-to-float conversion behaves similar to the builtin float-to-double +conversion and all arithmetic expressions involving both half-precision and +single-precision arguments will be of single-precision type. This way you can +also directly use the mathematical functions of the C++ standard library, +though in this case you will invoke the single-precision versions which will +also return single-precision values, which is (even if maybe performing the +exact same computation, see below) not as conceptually clean when working in a +half-precision environment. + +The default rounding mode for conversions from float to half uses truncation +(round toward zero, but mapping overflows to infinity) for rounding values not +representable exactly in half-precision. This is the fastest rounding possible +and is usually sufficient.
But by redefining the 'HALF_ROUND_STYLE' +preprocessor symbol (before including half.hpp) this default can be overridden +with one of the other standard rounding modes using their respective constants +or the equivalent values of 'std::float_round_style' (it can even be +synchronized with the underlying single-precision implementation by defining it +to 'std::numeric_limits<float>::round_style'): + + - 'std::round_indeterminate' or -1 for the fastest rounding (default). + + - 'std::round_toward_zero' or 0 for rounding toward zero. + + - 'std::round_to_nearest' or 1 for rounding to the nearest value. + + - 'std::round_toward_infinity' or 2 for rounding toward positive infinity. + + - 'std::round_toward_neg_infinity' or 3 for rounding toward negative infinity. + +In addition to changing the overall default rounding mode one can also use the +'half_cast'. This converts between half and any built-in arithmetic type using +a configurable rounding mode (or the default rounding mode if none is +specified). In addition to a configurable rounding mode, 'half_cast' has +another big difference to a mere 'static_cast': Any conversions are performed +directly using the given rounding mode, without any intermediate conversion +to/from 'float'. This is especially relevant for conversions to integer types, +which don't necessarily truncate anymore. But also for conversions from +'double' or 'long double' this may produce more precise results than a +pre-conversion to 'float' using the single-precision implementation's current +rounding mode would. + + half a = half_cast<half>(4.2); + half b = half_cast<half,std::numeric_limits<float>::round_style>(4.2f); + assert( half_cast<int,std::round_to_nearest>( 0.7_h ) == 1 ); + assert( half_cast<half,std::round_to_nearest>( 4097 ) == 4096.0_h ); + assert( half_cast<half,std::round_toward_infinity>( 4097 ) == 4100.0_h ); + assert( half_cast<half,std::round_toward_infinity>( std::numeric_limits<double>::min() ) > 0.0_h ); + +When using round to nearest (either as default or through 'half_cast') ties are +by default resolved by rounding them away from zero (and thus equal to the +behaviour of the 'round' function). But by redefining the +'HALF_ROUND_TIES_TO_EVEN' preprocessor symbol to 1 (before including half.hpp) +this default can be changed to the slightly slower but less biased and more +IEEE-conformant behaviour of rounding half-way cases to the nearest even value. + + #define HALF_ROUND_TIES_TO_EVEN 1 + #include <half.hpp> + ... + assert( half_cast<int>(3.5_h) + == half_cast<int>(4.5_h) ); + +IMPLEMENTATION + +For performance reasons (and ease of implementation) many of the mathematical +functions provided by the library as well as all arithmetic operations are +actually carried out in single-precision under the hood, calling to the C++ +standard library implementations of those functions whenever appropriate, +meaning the arguments are converted to floats and the result back to half. But +to reduce the conversion overhead as much as possible any temporary values +inside of lengthy expressions are kept in single-precision as long as possible, +while still maintaining a strong half-precision type to the outside world. Only +when finally assigning the value to a half or calling a function that works +directly on halfs is the actual conversion done (or never, when further +converting the result to float). + +This approach has two implications. First of all you have to treat the +library's documentation at http://half.sourceforge.net as a simplified version, +describing the behaviour of the library as if implemented this way.
The actual +argument and return types of functions and operators may involve other internal +types (feel free to generate the exact developer documentation from the Doxygen +comments in the library's header file if you really need to). But nevertheless +the behaviour is exactly like specified in the documentation. The other +implication is, that in the presence of rounding errors or over-/underflows +arithmetic expressions may produce different results when compared to +converting to half-precision after each individual operation: + + half a = std::numeric_limits<half>::max() * 2.0_h / 2.0_h; // a = MAX + half b = half(std::numeric_limits<half>::max() * 2.0_h) / 2.0_h; // b = INF + assert( a != b ); + +But this should only be a problem in very few cases. One last word has to be +said when talking about performance. Even with its efforts in reducing +conversion overhead as much as possible, the software half-precision +implementation can most probably not beat the direct use of single-precision +computations. Usually using actual float values for all computations and +temporaries and using halfs only for storage is the recommended way. On the +one hand this somehow makes the provided mathematical functions obsolete +(especially in light of the implicit conversion from half to float), but +nevertheless the goal of this library was to provide a complete and +conceptually clean half-precision implementation, to which the standard +mathematical functions belong, even if usually not needed. + +IEEE CONFORMANCE + +The half type uses the standard IEEE representation with 1 sign bit, 5 exponent +bits and 10 mantissa bits (11 when counting the hidden bit). It supports all +types of special values, like subnormal values, infinity and NaNs. But there +are some limitations to the complete conformance to the IEEE 754 standard: + + - The implementation does not differentiate between signalling and quiet + NaNs, this means operations on halfs are not specified to trap on + signalling NaNs (though they may, see last point). + + - Though arithmetic operations are internally rounded to single-precision + using the underlying single-precision implementation's current rounding + mode, those values are then converted to half-precision using the default + half-precision rounding mode (changed by defining 'HALF_ROUND_STYLE' + accordingly). This mixture of rounding modes is also the reason why + 'std::numeric_limits<half>::round_style' may actually return + 'std::round_indeterminate' when half- and single-precision rounding modes + don't match. + + - Because of internal truncation it may also be that certain single-precision + NaNs will be wrongly converted to half-precision infinity, though this is + very unlikely to happen, since most single-precision implementations don't + tend to only set the lowest bits of a NaN mantissa. + + - The implementation does not provide any floating point exceptions, thus + arithmetic operations or mathematical functions are not specified to invoke + proper floating point exceptions. But due to many functions implemented in + single-precision, those may still invoke floating point exceptions of the + underlying single-precision implementation. + +Some of those points could have been circumvented by controlling the floating +point environment using <cfenv> or implementing a similar exception mechanism. +But this would have required excessive runtime checks giving too high an impact +on performance for something that is rarely ever needed.
If you really need to +rely on proper floating point exceptions, it is recommended to explicitly +perform computations using the built-in floating point types to be on the safe +side. In the same way, if you really need to rely on a particular rounding +behaviour, it is recommended to either use single-precision computations and +explicitly convert the result to half-precision using 'half_cast' and +specifying the desired rounding mode, or synchronize the default half-precision +rounding mode to the rounding mode of the single-precision implementation (most +likely 'HALF_ROUND_STYLE=1', 'HALF_ROUND_TIES_TO_EVEN=1'). But this is really +considered an expert-scenario that should be used only when necessary, since +actually working with half-precision usually comes with a certain +tolerance/ignorance of exactness considerations and proper rounding comes with +a certain performance cost. + + +CREDITS AND CONTACT +------------------- + +This library is developed by CHRISTIAN RAU and released under the MIT License +(see LICENSE.txt). If you have any questions or problems with it, feel free to +contact me at rauy@users.sourceforge.net. + +Additional credit goes to JEROEN VAN DER ZIJP for his paper on "Fast Half Float +Conversions", whose algorithms have been used in the library for converting +between half-precision and single-precision values. diff --git a/python/requirements.txt b/python/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..e90ff3c328b0fcd46486da6e8c0bd98ceb26dd07 --- /dev/null +++ b/python/requirements.txt @@ -0,0 +1,3 @@ +flatbuffers==2.0 +numpy==1.19.5 +tflite==2.4.0 diff --git a/scripts/check_header_guards.py b/scripts/check_header_guards.py new file mode 100644 index 0000000000000000000000000000000000000000..5c48b7501fb1cfd5fb79a0f06869d100663d1466 --- /dev/null +++ b/scripts/check_header_guards.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Copyright (c) 2023 Arm Limited. +# +# SPDX-License-Identifier: MIT +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
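+# Illustrative usage sketch (not part of the upstream script): the file path,
+# extension list and prefix below are placeholder assumptions; adjust them to
+# the tree being checked. The flags correspond to the argparse options defined
+# further down in this file:
+#
+#   python3 scripts/check_header_guards.py src/core/helpers/Utils.h \
+#       --extensions=h,hpp --comment_style=double_slash --prefix=ACL
+#
+# The script rewrites non-conforming guards in place and exits with a non-zero
+# status when a file had to be modified, so it can back a pre-commit hook or a
+# CI check.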
+ +import argparse +from typing import List, Tuple +import logging +import re + +logger = logging.getLogger("check_header_guards") + +def find_code_boundaries(lines: List[str]) -> (int, int): + inside_comment : bool = False + + start = len(lines) + end = -1 + line_num = 0 + for line in lines: + stripped_line : str = line.strip() + if stripped_line.startswith("/*"): # block comment start + inside_comment = True + + if not inside_comment and not stripped_line.startswith("//") and stripped_line != "": + start = min(line_num, start) + end = line_num + + if inside_comment and stripped_line.endswith("*/"): + inside_comment = False + + line_num += 1 + + return start, end + + +def is_define(line: str) -> bool: + return line.strip().startswith("#define") + +def is_endif(line: str) -> bool: + return line.strip().startswith("#endif") + +def is_ifndef(line: str) -> bool: + return line.strip().startswith("#ifndef") + +# Strips the given line from // and /* */ blocks +def strip_comments(line: str) -> str: + line = re.sub(r"/\*.*\*/", "", line) + line = re.sub(r"//.*", "", line) + return line.strip() + +# If the line +# 1) startswith #ifndef +# 2) is all uppercase +# 3) does not start with double underscore, i.e. __ +# Then +# It "looks" like a header guard +def looks_like_header_guard(line: str) -> bool: + sline = line.strip() + guard_candidate = strip_comments(sline[len("#ifndef"):]) + + return is_ifndef(sline) and not guard_candidate.startswith("__") and guard_candidate.isupper() + + +def fix_header_guard(lines: List[str], expected_header_guard: str, comment_style: str) -> Tuple[List[str], bool]: + start_line, next_line, last_line = "", "", "" + start_index, last_index = find_code_boundaries(lines) + guards_updated: bool = True + + if start_index < len(lines): + # if not, the file is full of comments + start_line = lines[start_index] + + if start_index + 1 < len(lines): + # if not, the file has only one line of code + next_line = lines[start_index + 1] + + if last_index < len(lines) and last_index > start_index + 1: + # if not, either the file is full of comments OR it has less than three code lines + last_line = lines[last_index] + + expected_start_line = f"#ifndef {expected_header_guard}\n" + expected_next_line = f"#define {expected_header_guard}\n" + + if comment_style == 'double_slash': + expected_last_line = f"#endif // {expected_header_guard}\n" + elif comment_style == 'slash_asterix': + expected_last_line = f"#endif /* {expected_header_guard} */\n" + + empty_line = "\n" + + if looks_like_header_guard(start_line) and is_define(next_line) and is_endif(last_line): + # modify the current header guard if necessary + lines = lines[:start_index] + [expected_start_line, expected_next_line] + \ + lines[start_index+2:last_index] + [expected_last_line] + lines[last_index+1:] + + guards_updated = (start_line != expected_start_line) or (next_line != expected_next_line) \ + or (last_line != expected_last_line) + else: + # header guard could not be detected, add header guards + lines = lines[:start_index] + [empty_line, expected_start_line, expected_next_line] + \ + [empty_line] + lines[start_index:] + [empty_line, expected_last_line] + + + return lines, guards_updated + + +def find_expected_header_guard(filepath: str, prefix: str, add_extension: str, drop_outermost_subdir: str) -> str: + if drop_outermost_subdir: + arr : List[str] = filepath.split("/") + arr = arr[min(1, len(arr)-1):] + filepath = "/".join(arr) + + if not add_extension: + filepath = ".".join(filepath.split(".")[:-1]) + + guard = 
filepath.replace("/", "_").replace(".", "_").upper() # snake case full path + return prefix + "_" + guard + + +def skip_file(filepath: str, extensions: List[str], exclude: List[str], include: List[str]) -> bool: + extension = filepath.split(".")[-1] + + if extension.lower() not in extensions: + return True + + if exclude and any([filepath.startswith(exc) for exc in exclude]): + print(exclude) + return True + + if include: + return not any([filepath.startswith(inc) for inc in include]) + + return False + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description="Header Guard Checker. It adds full path snake case header guards with or without extension.", + ) + + parser.add_argument("files", type=str, nargs="+", help="Files to check the header guards") + parser.add_argument("--extensions", type=str, help="Comma separated list of extensions to run the checks. \ + If the input file does not have any of the extensions, it'll be skipped", required=True) + parser.add_argument("--comment_style", choices=['double_slash', 'slash_asterix'], required=True) + parser.add_argument("--exclude", type=str, help="Comma separated list of paths to exclude from header guard checks", default="") + parser.add_argument("--include", type=str, help="Comma separated list of paths to include. Defaults to empty string, \ + which means all the paths are included", default="") + parser.add_argument("--prefix", help="Prefix to apply to header guards", required=True) + parser.add_argument("--add_extension", action="store_true", help="If true, it adds the file extension to the end of the guard") + parser.add_argument("--drop_outermost_subdir", action="store_true", help="If true, it'll not use the outermost folder in the path. \ + This is intended for using in subdirs with different rules") + + args = parser.parse_args() + + files = args.files + extensions = args.extensions.split(",") + exclude = args.exclude.split(",") if args.exclude != '' else [] + include = args.include.split(",") if args.include != '' else [] + prefix = args.prefix + add_extension = args.add_extension + drop_outermost_subdir = args.drop_outermost_subdir + comment_style = args.comment_style + + logging_level = logging.INFO + logging.basicConfig(level=logging_level) + + retval = 0 + for file in files: + if skip_file(file, extensions, exclude, include): + logger.info(f"File {file} is SKIPPED") + continue + + expected_header_guard : str = find_expected_header_guard(file, prefix, add_extension, drop_outermost_subdir) + + with open(file, "r") as fd: + lines: List = fd.readlines() + + new_lines, guards_updated = fix_header_guard(lines, expected_header_guard, comment_style) + + with open(file, "w") as fd: + fd.writelines([f"{line}" for line in new_lines]) + + if guards_updated: + logger.info("File has been modified") + retval = 1 + + exit(retval) diff --git a/scripts/copyright_eula.txt b/scripts/copyright_eula.txt new file mode 100644 index 0000000000000000000000000000000000000000..16cd734a68d1c053340fd7f89a3df2ec3112d5b1 --- /dev/null +++ b/scripts/copyright_eula.txt @@ -0,0 +1,19 @@ +Copyright (c) 2016, 2017 Arm Limited. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/scripts/copyright_mit.txt b/scripts/copyright_mit.txt new file mode 100644 index 0000000000000000000000000000000000000000..049d44488777193882d1ea7b3c17b6c710e3e051 --- /dev/null +++ b/scripts/copyright_mit.txt @@ -0,0 +1,21 @@ +Copyright (c) 2018 Arm Limited. + +SPDX-License-Identifier: MIT + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/scripts/ensure_single_eol.py b/scripts/ensure_single_eol.py new file mode 100755 index 0000000000000000000000000000000000000000..0eb105e0910c852b97dcd0bc88699ba0be993e56 --- /dev/null +++ b/scripts/ensure_single_eol.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2023 Arm Limited. +# +# SPDX-License-Identifier: MIT +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +""" +This script ensures a file ends with exactly one end-of-line character +""" + +import argparse +import pathlib as pl +import os + + +def main(args): + f_p = pl.Path(args.file) + with open(f_p, "r") as f: + lines = f.read() + lines = lines.rstrip(os.linesep) + lines += os.linesep + with open(f_p, "w") as f: + f.write(lines) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("file") + args = parser.parse_args() + main(args) diff --git a/scripts/format_code.py b/scripts/format_code.py new file mode 100755 index 0000000000000000000000000000000000000000..f1ee7a731c41bf59a7d89ea5a4d6c4e2b0ee64e9 --- /dev/null +++ b/scripts/format_code.py @@ -0,0 +1,425 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2023 Arm Limited. +# +# SPDX-License-Identifier: MIT +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
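As a quick sketch of the end-of-line fixer just above (scripts/ensure_single_eol.py): it reads the whole file, strips any trailing end-of-line characters and writes the content back with exactly one. Assuming a POSIX os.linesep of "\n" and hypothetical file contents:

    "int x;\n\n\n".rstrip("\n") + "\n"   # -> "int x;\n"
    "int x;".rstrip("\n") + "\n"         # -> "int x;\n"

The script that follows, scripts/format_code.py, drives the remaining pre-commit checks (copyright headers, Doxygen formatting, master-header includes and Android.bp generation).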
+ +import argparse +import datetime +import difflib +import filecmp +import logging +import os +import re +import subprocess +import sys + +from modules.Shell import Shell + +logger = logging.getLogger("format_code") + +# List of directories to exclude +exceptions = [ + "src/core/NEON/kernels/assembly/gemm", + "src/core/NEON/kernels/assembly/arm", + "/winograd/", + "/convolution/", + "/arm_gemm/", + "/arm_conv/", + "SConscript", + "SConstruct" +] + +def adjust_copyright_year(copyright_years, curr_year): + ret_copyright_year = str() + # Read last year in the Copyright + last_year = int(copyright_years[-4:]) + if last_year == curr_year: + ret_copyright_year = copyright_years + elif last_year == (curr_year - 1): + # Create range if latest year on the copyright is the previous + if len(copyright_years) > 4 and copyright_years[-5] == "-": + # Range already exists, update year to current + ret_copyright_year = copyright_years[:-5] + "-" + str(curr_year) + else: + # Create a new range + ret_copyright_year = copyright_years + "-" + str(curr_year) + else: + ret_copyright_year = copyright_years + ", " + str(curr_year) + return ret_copyright_year + +def check_copyright( filename ): + f = open(filename, "r") + content = f.readlines() + f.close() + f = open(filename, "w") + year = datetime.datetime.now().year + ref = open("scripts/copyright_mit.txt","r").readlines() + + # Need to handle python files separately + if("SConstruct" in filename or "SConscript" in filename): + start = 2 + if("SConscript" in filename): + start = 3 + m = re.match("(# Copyright \(c\) )(.*\d{4})( [Arm|ARM].*)", content[start]) + line = m.group(1) + + if m.group(2): # Is there a year already? + # Yes: adjust accordingly + line += adjust_copyright_year(m.group(2), year) + else: + # No: add current year + line += str(year) + line += m.group(3).replace("ARM", "Arm") + if("SConscript" in filename): + f.write('#!/usr/bin/python\n') + + f.write('# -*- coding: utf-8 -*-\n\n') + f.write(line+"\n") + # Copy the rest of the file's content: + f.write("".join(content[start + 1:])) + f.close() + + return + + # This only works until year 9999 + m = re.match("(.*Copyright \(c\) )(.*\d{4})( [Arm|ARM].*)", content[1]) + start =len(ref)+2 + if content[0] != "/*\n" or not m: + start = 0 + f.write("/*\n * Copyright (c) %d Arm Limited.\n" % year) + else: + logger.debug("Found Copyright start") + logger.debug("\n\t".join([ g or "" for g in m.groups()])) + line = m.group(1) + + if m.group(2): # Is there a year already? 
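            # Illustrative behaviour of adjust_copyright_year() above (inputs are
            # hypothetical, assuming the current year is 2023):
            #   "2017-2022" -> "2017-2023"   (extend an existing range)
            #   "2022"      -> "2022-2023"   (previous year: start a range)
            #   "2019"      -> "2019, 2023"  (older year: append the current year)
            #   "2023"      -> "2023"        (already up to date)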
+ # Yes: adjust accordingly + line += adjust_copyright_year(m.group(2), year) + else: + # No: add current year + line += str(year) + line += m.group(3).replace("ARM", "Arm") + f.write("/*\n"+line+"\n") + logger.debug(line) + # Write out the rest of the Copyright header: + for i in range(1, len(ref)): + line = ref[i] + f.write(" *") + if line.rstrip() != "": + f.write(" %s" % line) + else: + f.write("\n") + f.write(" */\n") + # Copy the rest of the file's content: + f.write("".join(content[start:])) + f.close() + +def check_license(filename): + """ + Check that the license file is up-to-date + """ + f = open(filename, "r") + content = f.readlines() + f.close() + + f = open(filename, "w") + f.write("".join(content[:2])) + + year = datetime.datetime.now().year + # This only works until year 9999 + m = re.match("(.*Copyright \(c\) )(.*\d{4})( [Arm|ARM].*)", content[2]) + + if not m: + f.write("Copyright (c) {} Arm Limited\n".format(year)) + else: + updated_year = adjust_copyright_year(m.group(2), year) + f.write("Copyright (c) {} Arm Limited\n".format(updated_year)) + + # Copy the rest of the file's content: + f.write("".join(content[3:])) + f.close() + + +class OtherChecksRun: + def __init__(self, folder, error_diff=False, strategy="all"): + self.folder = folder + self.error_diff=error_diff + self.strategy = strategy + + def error_on_diff(self, msg): + retval = 0 + if self.error_diff: + diff = self.shell.run_single_to_str("git diff") + if len(diff) > 0: + retval = -1 + logger.error(diff) + logger.error("\n"+msg) + return retval + + def run(self): + retval = 0 + self.shell = Shell() + self.shell.save_cwd() + this_dir = os.path.dirname(__file__) + self.shell.cd(self.folder) + self.shell.prepend_env("PATH","%s/../bin" % this_dir) + + to_check = "" + if self.strategy != "all": + to_check, skip_copyright = FormatCodeRun.get_files(self.folder, self.strategy) + #FIXME: Exclude shaders! 
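        # File-selection strategies, as implemented by FormatCodeRun.get_files() below:
        #   "git-head" - files added/modified in the HEAD commit
        #   "git-diff" - files currently staged for commit
        #   "all"      - every tracked file (copyright year checks are skipped)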
+ + logger.info("Running ./scripts/format_doxygen.py") + logger.debug(self.shell.run_single_to_str("./scripts/format_doxygen.py %s" % " ".join(to_check))) + retval = self.error_on_diff("Doxygen comments badly formatted (check above diff output for more details) try to run ./scripts/format_doxygen.py on your patch and resubmit") + if retval == 0: + logger.info("Running ./scripts/include_functions_kernels.py") + logger.debug(self.shell.run_single_to_str("python ./scripts/include_functions_kernels.py")) + retval = self.error_on_diff("Some kernels or functions are not included in their corresponding master header (check above diff output to see which includes are missing)") + if retval == 0: + try: + logger.info("Running ./scripts/check_bad_style.sh") + logger.debug(self.shell.run_single_to_str("./scripts/check_bad_style.sh")) + #logger.debug(self.shell.run_single_to_str("./scripts/check_bad_style.sh %s" % " ".join(to_check))) + except subprocess.CalledProcessError as e: + logger.error("Command %s returned:\n%s" % (e.cmd, e.output)) + retval -= 1 + + if retval != 0: + raise Exception("format-code failed with error code %d" % retval) + +class FormatCodeRun: + @staticmethod + def get_files(folder, strategy="git-head"): + shell = Shell() + shell.cd(folder) + skip_copyright = False + if strategy == "git-head": + cmd = "git diff-tree --no-commit-id --name-status -r HEAD | grep \"^[AMRT]\" | cut -f 2" + elif strategy == "git-diff": + cmd = "git diff --name-status --cached -r HEAD | grep \"^[AMRT]\" | cut -f 2" + else: + cmd = "git ls-tree -r HEAD --name-only" + # Skip copyright checks when running on all files because we don't know when they were last modified + # Therefore we can't tell if their copyright dates are correct + skip_copyright = True + + grep_folder = "grep -e \"^\\(arm_compute\\|src\\|examples\\|tests\\|utils\\|support\\)/\"" + grep_extension = "grep -e \"\\.\\(cpp\\|h\\|hh\\|inl\\|cl\\|cs\\|hpp\\)$\"" + list_files = shell.run_single_to_str(cmd+" | { "+ grep_folder+" | "+grep_extension + " || true; }") + to_check = [ f for f in list_files.split("\n") if len(f) > 0] + + # Check for scons files as they are excluded from the above list + list_files = shell.run_single_to_str(cmd+" | { grep -e \"SC\" || true; }") + to_check += [ f for f in list_files.split("\n") if len(f) > 0] + + return (to_check, skip_copyright) + + def __init__(self, files, folder, error_diff=False, skip_copyright=False): + self.files = files + self.folder = folder + self.skip_copyright = skip_copyright + self.error_diff=error_diff + + def error_on_diff(self, msg): + retval = 0 + if self.error_diff: + diff = self.shell.run_single_to_str("git diff") + if len(diff) > 0: + retval = -1 + logger.error(diff) + logger.error("\n"+msg) + return retval + + def run(self): + if len(self.files) < 1: + logger.debug("No file: early exit") + retval = 0 + self.shell = Shell() + self.shell.save_cwd() + this_dir = os.path.dirname(__file__) + try: + self.shell.cd(self.folder) + self.shell.prepend_env("PATH","%s/../bin" % this_dir) + + for f in self.files: + skip_this_file = False + for e in exceptions: + if e in f: + logger.warning("Skipping '%s' file: %s" % (e,f)) + skip_this_file = True + break + if skip_this_file: + continue + + logger.info("Formatting %s" % f) + if not self.skip_copyright: + check_copyright(f) + + check_license("LICENSE") + + except subprocess.CalledProcessError as e: + retval = -1 + logger.error(e) + logger.error("OUTPUT= %s" % e.output) + + retval += self.error_on_diff("See above for clang-tidy errors") + + if retval 
!= 0: + raise Exception("format-code failed with error code %d" % retval) + +class GenerateAndroidBP: + def __init__(self, folder): + self.folder = folder + self.bp_output_file = "Generated_Android.bp" + + def run(self): + retval = 0 + self.shell = Shell() + self.shell.save_cwd() + this_dir = os.path.dirname(__file__) + + logger.debug("Running Android.bp check") + try: + self.shell.cd(self.folder) + cmd = "%s/generate_android_bp.py --folder %s --output_file %s" % (this_dir, self.folder, self.bp_output_file) + output = self.shell.run_single_to_str(cmd) + if len(output) > 0: + logger.info(output) + except subprocess.CalledProcessError as e: + retval = -1 + logger.error(e) + logger.error("OUTPUT= %s" % e.output) + + # Compare the genereated file with the one in the review + if not filecmp.cmp(self.bp_output_file, self.folder + "/Android.bp"): + is_mismatched = True + + with open(self.bp_output_file, 'r') as generated_file: + with open(self.folder + "/Android.bp", 'r') as review_file: + diff = list(difflib.unified_diff(generated_file.readlines(), review_file.readlines(), + fromfile='Generated_Android.bp', tofile='Android.bp')) + + # If the only mismatch in Android.bp file is the copyright year, + # the content of the file is considered unchanged and we don't need to update + # the copyright year. This will resolve the issue that emerges every new year. + num_added_lines = 0 + num_removed_lines = 0 + last_added_line = "" + last_removed_line = "" + expect_add_line = False + + for line in diff: + if line.startswith("-") and not line.startswith("---"): + num_removed_lines += 1 + if num_removed_lines > 1: + break + last_removed_line = line + expect_add_line = True + elif line.startswith("+") and not line.startswith("+++"): + num_added_lines += 1 + if num_added_lines > 1: + break + if expect_add_line: + last_added_line = line + else: + expect_add_line = False + + if num_added_lines == 1 and num_removed_lines == 1: + re_copyright = re.compile("^(?:\+|\-)// Copyright © ([0-9]+)\-([0-9]+) Arm Ltd. 
All rights reserved.\n$") + generated_matches = re_copyright.search(last_removed_line) + review_matches = re_copyright.search(last_added_line) + + if generated_matches is not None and review_matches is not None: + if generated_matches.group(1) == review_matches.group(1) and \ + int(generated_matches.group(2)) > int(review_matches.group(2)): + is_mismatched = False + + if is_mismatched: + logger.error("Lines with '-' need to be added to Android.bp") + logger.error("Lines with '+' need to be removed from Android.bp") + + for line in diff: + logger.error(line.rstrip()) + if is_mismatched: + raise Exception("Android bp file is not updated") + + if retval != 0: + raise Exception("generate Android bp file failed with error code %d" % retval) + +def run_fix_code_formatting( files="git-head", folder=".", num_threads=1, error_on_diff=True): + try: + retval = 0 + + # Genereate Android.bp file and test it + gen_android_bp = GenerateAndroidBP(folder) + gen_android_bp.run() + + to_check, skip_copyright = FormatCodeRun.get_files(folder, files) + other_checks = OtherChecksRun(folder,error_on_diff, files) + other_checks.run() + + logger.debug(to_check) + num_files = len(to_check) + per_thread = max( num_files / num_threads,1) + start=0 + logger.info("Files to format:\n\t%s" % "\n\t".join(to_check)) + + for i in range(num_threads): + if i == num_threads -1: + end = num_files + else: + end= min(start+per_thread, num_files) + sub = to_check[start:end] + logger.debug("[%d] [%d,%d] %s" % (i, start, end, sub)) + start = end + format_code_run = FormatCodeRun(sub, folder, skip_copyright=skip_copyright) + format_code_run.run() + + return retval + except Exception as e: + logger.error("Exception caught in run_fix_code_formatting: %s" % e) + return -1 + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description="Build & run pre-commit tests", + ) + + file_sources=["git-diff","git-head","all"] + parser.add_argument("-D", "--debug", action='store_true', help="Enable script debugging output") + parser.add_argument("--error_on_diff", action='store_true', help="Show diff on error and stop") + parser.add_argument("--files", nargs='?', metavar="source", choices=file_sources, help="Which files to run fix_code_formatting on, choices=%s" % file_sources, default="git-head") + parser.add_argument("--folder", metavar="path", help="Folder in which to run fix_code_formatting", default=".") + + args = parser.parse_args() + + logging_level = logging.INFO + if args.debug: + logging_level = logging.DEBUG + + logging.basicConfig(level=logging_level) + + logger.debug("Arguments passed: %s" % str(args.__dict__)) + + exit(run_fix_code_formatting(args.files, args.folder, 1, error_on_diff=args.error_on_diff)) diff --git a/scripts/format_doxygen.py b/scripts/format_doxygen.py index 5882958fc574060393db23fd3988331d77bef510..8ac5e630b97033cc9e03806827bc7dc421cfdfdc 100755 --- a/scripts/format_doxygen.py +++ b/scripts/format_doxygen.py @@ -81,7 +81,7 @@ def process_comment(fd, comment, first_param, last_param): if __name__ == "__main__": n_file=0 - if len(sys.argv) == 1: + if len(sys.argv) == 2 and sys.argv[1] == '--all': paths = [] for top_level in ["./arm_compute", "./src", "./examples", "./tests", "./utils", "./framework", "./support"]: diff --git a/scripts/generate_android_bp.py b/scripts/generate_android_bp.py new file mode 100755 index 0000000000000000000000000000000000000000..350b52f01c19bee8c90242f870c23694cd9949e1 --- /dev/null +++ 
b/scripts/generate_android_bp.py @@ -0,0 +1,213 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2023 Arm Limited. +# +# SPDX-License-Identifier: MIT +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import argparse +import os +from jinja2 import Template +import datetime + +# Paths to exclude +excluded_paths = ["build", + "compute_kernel_writer/build", + "compute_kernel_writer/src", + "compute_kernel_writer/validation", + "docs/", + "documentation/", + "examples/", + "opencl-1.2-stubs/", + "release_repository/", + "opengles-3.1-stubs/", + "scripts/", + "tests/", + "/GLES_COMPUTE/", + "/graph/", + "/sve/", + "/SVE/", + "/sve2/", + "/SVE2/" + ] + +excluded_files = ["TracePoint.cpp"] + +# Android bp template to render +year = datetime.datetime.now().year + +bp_tm = Template( +"""// +// Copyright © 2020-""" + str(year) + """ Arm Ltd. All rights reserved. +// SPDX-License-Identifier: MIT +// + +// OpenCL sources are NOT required by ArmNN or its Android NNAPI driver and are used for CI purposes only. 
+opencl_srcs = [ + {% for cl_src in cl_srcs -%} + "{{ cl_src }}", + {% endfor %} +] + +bootstrap_go_package { + name: "arm_compute_library_nn_driver", + pkgPath: "arm_compute_library_nn_driver", + deps: [ + "blueprint", + "blueprint-pathtools", + "blueprint-proptools", + "soong", + "soong-android", + "soong-cc", + ], + srcs: [ + "scripts/arm_compute_library_nn_driver.go", + ], + pluginFor: [ "soong_build" ], +} + +arm_compute_library_defaults { + name: "acl-default-cppflags", + cppflags: [ + "-std=c++14", + "-fexceptions", + "-DBOOST_NO_AUTO_PTR", + "-DEMBEDDED_KERNELS", + "-DARM_COMPUTE_ASSERTS_ENABLED", + "-DARM_COMPUTE_CPP_SCHEDULER", + "-DENABLE_NEON", + "-DARM_COMPUTE_ENABLE_NEON", + "-Wno-unused-parameter", + "-DNO_DOT_IN_TOOLCHAIN", + "-Wno-implicit-fallthrough", + "-fPIC", + "-DACL_INTERNAL_TEST_CKW_IN_DF" + ], + rtti: true, +} + +cc_library_static { + name: "arm_compute_library", + defaults: ["acl-default-cppflags"], + proprietary: true, + local_include_dirs: ["build/android-arm64v8a/src/core", + "build/android-arm64v8a/src/core/CL", + "compute_kernel_writer/prototype/include", + "compute_kernel_writer/prototype", + "src/core/common", + "src/core/helpers", + "src/core/NEON/kernels/arm_gemm", + "src/core/NEON/kernels/assembly", + "src/core/NEON/kernels/convolution/common", + "src/core/NEON/kernels/convolution/winograd", + "src/cpu/kernels/assembly"], + export_include_dirs: [".", "./include"], + srcs: [ + {% for src in srcs -%} + "{{ src }}", + {% endfor %} + ], + arch: { + arm: { + srcs: [ + {% for arm_src in arm_srcs -%} + "{{ arm_src }}", + {% endfor %} + ], + }, + arm64: { + srcs: [ + {% for arm64_src in arm64_srcs -%} + "{{ arm64_src }}", + {% endfor %} + ], + }, + }, + rtti: true, +} +""") + + +def generate_bp_file(cpp_files, opencl_files): + arm_files = [f for f in cpp_files if "a32_" in f] + arm64_files = [f for f in cpp_files if any(a64 in f for a64 in ["a64_", "sve_", 'sme_', 'sme2_'])] + gen_files = [x for x in cpp_files if x not in arm_files + arm64_files] + + arm_files.sort() + arm64_files.sort() + gen_files.sort() + opencl_files.sort() + + bp_file = bp_tm.render(srcs=gen_files, + arm_srcs=arm_files, + arm64_srcs=arm64_files, + cl_srcs=opencl_files) + return bp_file + + +def list_all_files(repo_path): + """ Gets the list of files to include to the Android.bp + + :param repo_path: Path of the repository + :return: The filtered list of useful filess + """ + if not repo_path.endswith('/'): + repo_path = repo_path + "/" + + # Get cpp files + cpp_files = [] + cl_files = [] + for path, subdirs, files in os.walk(repo_path): + for file in files: + if file.endswith(".cpp"): + cpp_files.append(os.path.join(path, file)) + elif file.endswith(".cl"): + cl_files.append(os.path.join(path, file)) + # Include CL headers + if "src/core/CL/cl_kernels" in path and file.endswith(".h"): + cl_files.append(os.path.join(path, file)) + # Filter out unused cpp files + filtered_cpp_files = [] + for cpp_file in cpp_files: + if any(ep in cpp_file for ep in excluded_paths) or any(ef in cpp_file for ef in excluded_files): + continue + filtered_cpp_files.append(cpp_file.replace(repo_path, "")) + # Filter out unused cl files + filtered_cl_files = [] + for cl_file in cl_files: + if any(ep in cl_file for ep in excluded_paths): + continue + filtered_cl_files.append(cl_file.replace(repo_path, "")) + + return filtered_cpp_files, filtered_cl_files + + +if __name__ == "__main__": + # Parse arguments + parser = argparse.ArgumentParser('Generate Android.bp file for ComputeLibrary') + 
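    # Typical invocation, as issued by GenerateAndroidBP in scripts/format_code.py:
    #   ./scripts/generate_android_bp.py --folder . --output_file Generated_Android.bp
    # The result is diffed against the checked-in Android.bp; any mismatch other
    # than the copyright year fails the check.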
parser.add_argument('--folder', default=".", metavar="folder", dest='folder', type=str, required=False, help='Compute Library source path') + parser.add_argument('--output_file', metavar="output_file", default='Android.bp', type=str, required=False, help='Specify Android bp output file') + args = parser.parse_args() + + cpp_files, opencl_files = list_all_files(args.folder) + bp_file = generate_bp_file(cpp_files, opencl_files) + + with open(args.output_file, 'w') as f: + f.write(bp_file) diff --git a/scripts/generate_build_files.py b/scripts/generate_build_files.py index 0ca27179ebc77d4ad4a66628a1ed4c95f3a03880..17cf49c0a917dea6fb11cc149efee7f3b2696847 100644 --- a/scripts/generate_build_files.py +++ b/scripts/generate_build_files.py @@ -202,8 +202,6 @@ def gather_sources(): # Common backend files lib_files = filelist['common'] - # TODO Add Fixed format GEMM kernels ? - # Logging files lib_files += filelist['logging'] @@ -278,16 +276,17 @@ if "__main__" in __name__: graph_files, lib_files_sve, lib_files_sve2, lib_files = gather_sources() if args.bazel: + # 8562a4ec: Remove CommonGraphOptions from Utils target and warnings + graph_files += ["//utils:CommonGraphOptions.cpp"] + bazel_build_string = build_from_template_bazel( graph_files, lib_files_sve, lib_files_sve2, lib_files) - print(bazel_build_string) with open("src/BUILD.bazel", "w") as fp: fp.write(bazel_build_string) if args.cmake: cmake_build_string = build_from_template_cmake( graph_files, lib_files_sve, lib_files_sve2, lib_files) - print(cmake_build_string) with open("src/CMakeLists.txt", "w") as fp: fp.write(cmake_build_string) diff --git a/scripts/include_functions_kernels.py b/scripts/include_functions_kernels.py index 82b40f0e36a043e2ba61c69e08b18409c1515114..49c12867e6a6b0d71493b1a19b0663652c85ffb8 100644 --- a/scripts/include_functions_kernels.py +++ b/scripts/include_functions_kernels.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright (c) 2017-2018, 2020-2021 Arm Limited. +# Copyright (c) 2017-2018, 2020-2021, 2023 Arm Limited. # # SPDX-License-Identifier: MIT # @@ -74,7 +74,7 @@ def create_include_list(folder): files_path = folder + "/*.h" files = glob.glob(files_path) updated_files = [include_str + folder + "/" + x.rsplit('/',1)[1] + "\"\n" for x in files] - updated_files.sort() + updated_files.sort(key=lambda x: x.lower()) return updated_files @@ -86,7 +86,7 @@ def include_components(target, path, header_prefix, folder, subfolders=None): include_list = create_include_list(target_path + folder) for s in subfolders or []: include_list += create_include_list( target_path + folder + "/" + s) - include_list.sort() + include_list.sort(key=lambda x: x.lower()) lines = read_file(components_file) lines, first_pos = remove_existing_includes(lines) lines = add_updated_includes(lines, first_pos, include_list) diff --git a/scripts/modules/Shell.py b/scripts/modules/Shell.py new file mode 100755 index 0000000000000000000000000000000000000000..f3fd0bd242484c20f5d1bd58a718f13ce585a794 --- /dev/null +++ b/scripts/modules/Shell.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python + +# Copyright (c) 2023 Arm Limited. 
+# +# SPDX-License-Identifier: MIT +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import os +import logging +import subprocess + +logger = logging.getLogger("Shell") + +class Shell: + def __init__(self, is_interactive=False): + self.line="" + self.env=os.environ.copy() + self.initial_path = self.env["PATH"] + self.save_cwd() + self.is_interactive = is_interactive + + def reset_path(self): + self.env["PATH"]=self.initial_path + + def set_env(self, key, value): + self.env[key] = value + + def append_env(self, key, value): + logger.debug("Appending '%s' to '%s'" % (value, key)) + if key not in list(self.env.keys()): + self.set_env(key,value) + else: + self.env[key] += ":"+value + def prepend_env(self, key, value): + logger.debug("Prepending '%s' to '%s'" % (value, key)) + if key not in list(self.env.keys()): + self.set_env(key,value) + else: + self.env[key] = value+":"+self.env[key] + def run(self, cmd): + if isinstance(cmd, list): + for c in cmd: + self.run_single(c) + else: + self.run_single(cmd) + def run_to_str(self, cmd): + out = "" + if isinstance(cmd, list): + for c in cmd: + out += self.run_single_to_str(c) + else: + out = self.run_single_to_str(cmd) + return out + def cd(self, dirname): + os.chdir(dirname) + + def save_cwd(self): + self.cwd = os.getcwd() + + def restore_cwd(self): + self.cd( self.cwd ) + + def run_single_interactive(self,cmd): + subprocess.check_call(cmd, env=self.env,stderr=subprocess.STDOUT, shell=True) + logger.debug("%s returned" % cmd) + + def run_single(self,cmd): + if self.is_interactive: + self.run_single_interactive(cmd) + else: + self.run_single_to_str(cmd) + + def run_single_to_str_no_output_check(self,cmd): + try: + out = subprocess.check_output(cmd, env=self.env, stderr=subprocess.STDOUT, shell=True) + except subprocess.CalledProcessError as cpe: + out = cpe.output + if (len(out.strip()) > 0): + logger.debug(out) + logger.debug("%s returned" % cmd) + return out + + def run_single_to_str(self,cmd): + out = subprocess.check_output(cmd, env=self.env, stderr=subprocess.STDOUT, shell=True).decode('utf-8') + if (len(out.strip()) > 0): + logger.debug(out) + logger.debug("%s returned" % cmd) + return out diff --git a/src/BUILD.bazel b/src/BUILD.bazel index ca90107a5c99a1d88c1b6f0e8d0224522ac96a47..a22632e1f5029b70e9b09a64198fdf7b8b399244 100644 --- a/src/BUILD.bazel +++ b/src/BUILD.bazel @@ -72,8 +72,6 @@ filegroup( "graph/nodes/FlattenLayerNode.cpp", "graph/nodes/FullyConnectedLayer.cpp", "graph/nodes/FusedConvolutionBatchNormalizationNode.cpp", - 
"graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.cpp", - "graph/nodes/FusedConvolutionWithPostOpNode.cpp", "graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.cpp", "graph/nodes/GenerateProposalsLayerNode.cpp", "graph/nodes/InputNode.cpp", @@ -98,7 +96,8 @@ filegroup( "graph/nodes/SplitLayerNode.cpp", "graph/nodes/StackLayerNode.cpp", "graph/nodes/StridedSliceLayerNode.cpp", - "graph/printers/DotGraphPrinter.cpp"] + + "graph/printers/DotGraphPrinter.cpp", + "//utils:CommonGraphOptions.cpp"] + glob(["**/*.h", "**/*.hpp", "**/*.inl"]), @@ -247,6 +246,7 @@ filegroup( "core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp", "core/NEON/kernels/arm_gemm/interleave_indirect-sve.cpp", "core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp", + "core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp16fp32fp16_dot_16VL/generic.cpp", "core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp", "core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp", "core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp", @@ -644,6 +644,7 @@ filegroup( "core/Validate.cpp", "core/Version.cpp", "core/helpers/SoftmaxHelpers.cpp", + "core/helpers/Utils.cpp", "core/helpers/WindowHelpers.cpp", "core/utils/ActivationFunctionUtils.cpp", "core/utils/AssemblyUtils.cpp", @@ -651,7 +652,9 @@ filegroup( "core/utils/DataTypeUtils.cpp", "core/utils/FormatUtils.cpp", "core/utils/InterpolationPolicyUtils.cpp", + "core/utils/Math.cpp", "core/utils/ScaleUtils.cpp", + "core/utils/StringUtils.cpp", "core/utils/helpers/fft.cpp", "core/utils/helpers/tensor_transform.cpp", "core/utils/io/FileHandler.cpp", @@ -661,7 +664,6 @@ filegroup( "core/utils/logging/LoggerRegistry.cpp", "core/utils/misc/MMappedFile.cpp", "core/utils/quantization/AsymmHelpers.cpp", - "core/utils/StringUtils.cpp", "cpu/CpuContext.cpp", "cpu/CpuQueue.cpp", "cpu/CpuTensor.cpp", @@ -733,11 +735,9 @@ filegroup( "cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp", "cpu/kernels/boundingboxtransform/generic/neon/impl.cpp", "cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp", - "cpu/kernels/cast/generic/neon/bfloat16.cpp", "cpu/kernels/cast/generic/neon/fp16.cpp", "cpu/kernels/crop/generic/neon/fp16.cpp", "cpu/kernels/crop/generic/neon/fp32.cpp", - "cpu/kernels/crop/generic/neon/impl.cpp", "cpu/kernels/crop/generic/neon/integer.cpp", "cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp", "cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp", @@ -745,8 +745,11 @@ filegroup( "cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp", "cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp", "cpu/kernels/directconv2d/nchw/all.cpp", + "cpu/kernels/directconv2d/nchw/fp16.cpp", + "cpu/kernels/directconv2d/nhwc/neon/fp16.cpp", "cpu/kernels/directconv2d/nhwc/neon/fp32.cpp", "cpu/kernels/directconv2d/nhwc/neon/impl.cpp", + "cpu/kernels/directconv2d/nhwc/neon/qasymm8.cpp", "cpu/kernels/elementwise_binary/generic/neon/fp16.cpp", "cpu/kernels/elementwise_binary/generic/neon/fp32.cpp", "cpu/kernels/elementwise_binary/generic/neon/integer.cpp", @@ -754,7 +757,6 @@ filegroup( "cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp", "cpu/kernels/elementwise_unary/generic/neon/fp16.cpp", "cpu/kernels/elementwise_unary/generic/neon/fp32.cpp", - "cpu/kernels/elementwise_unary/generic/neon/impl.cpp", "cpu/kernels/elementwise_unary/generic/neon/integer.cpp", "cpu/kernels/elementwise_unary/generic/neon/q8.cpp", 
"cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp", @@ -763,11 +765,9 @@ filegroup( "cpu/kernels/floor/neon/fp32.cpp", "cpu/kernels/fuse_batch_normalization/generic/fp16.cpp", "cpu/kernels/fuse_batch_normalization/generic/fp32.cpp", - "cpu/kernels/fuse_batch_normalization/generic/impl.cpp", "cpu/kernels/fuse_batch_normalization/nchw/all.cpp", "cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp", "cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp", - "cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.cpp", "cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp", "cpu/kernels/gemm_matrix_add/generic/neon/fp32.cpp", "cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp", @@ -785,11 +785,9 @@ filegroup( "cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp", "cpu/kernels/l2normlayer/generic/neon/fp16.cpp", "cpu/kernels/l2normlayer/generic/neon/fp32.cpp", - "cpu/kernels/l2normlayer/generic/neon/impl.cpp", "cpu/kernels/lut/generic/neon/u8.cpp", "cpu/kernels/maxunpool/generic/neon/fp16.cpp", "cpu/kernels/maxunpool/generic/neon/fp32.cpp", - "cpu/kernels/maxunpool/generic/neon/impl.cpp", "cpu/kernels/maxunpool/generic/neon/qasymm8.cpp", "cpu/kernels/maxunpool/generic/neon/qasymm8_signed.cpp", "cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp", @@ -803,16 +801,13 @@ filegroup( "cpu/kernels/pool2d/neon/qasymm8_signed.cpp", "cpu/kernels/pool3d/neon/fp16.cpp", "cpu/kernels/pool3d/neon/fp32.cpp", - "cpu/kernels/pool3d/neon/impl.cpp", "cpu/kernels/pool3d/neon/qasymm8.cpp", "cpu/kernels/pool3d/neon/qasymm8_signed.cpp", "cpu/kernels/range/generic/neon/fp16.cpp", "cpu/kernels/range/generic/neon/fp32.cpp", - "cpu/kernels/range/generic/neon/impl.cpp", "cpu/kernels/range/generic/neon/integer.cpp", "cpu/kernels/roialign/generic/neon/fp16.cpp", "cpu/kernels/roialign/generic/neon/fp32.cpp", - "cpu/kernels/roialign/generic/neon/impl.cpp", "cpu/kernels/roialign/generic/neon/qasymm8.cpp", "cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp", "cpu/kernels/scale/neon/fp16.cpp", @@ -821,13 +816,13 @@ filegroup( "cpu/kernels/scale/neon/qasymm8_signed.cpp", "cpu/kernels/select/generic/neon/fp16.cpp", "cpu/kernels/select/generic/neon/fp32.cpp", - "cpu/kernels/select/generic/neon/impl.cpp", "cpu/kernels/select/generic/neon/integer.cpp", "cpu/kernels/softmax/generic/neon/fp16.cpp", "cpu/kernels/softmax/generic/neon/fp32.cpp", "cpu/kernels/softmax/generic/neon/impl.cpp", "cpu/kernels/softmax/generic/neon/qasymm8.cpp", "cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp", + "cpu/kernels/sub/neon/fp16.cpp", "cpu/kernels/sub/neon/qasymm8.cpp", "cpu/kernels/sub/neon/qasymm8_signed.cpp", "cpu/kernels/sub/neon/qsymm16.cpp", diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 20379dd5f484fd5f4c6d1c3293990f9ed7aeba8a..37599cdadddd8b07598d83bb86d1732e99955b8a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -73,8 +73,6 @@ target_sources( graph/nodes/FlattenLayerNode.cpp graph/nodes/FullyConnectedLayer.cpp graph/nodes/FusedConvolutionBatchNormalizationNode.cpp - graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.cpp - graph/nodes/FusedConvolutionWithPostOpNode.cpp graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.cpp graph/nodes/GenerateProposalsLayerNode.cpp graph/nodes/InputNode.cpp @@ -223,6 +221,7 @@ target_sources( core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp core/NEON/kernels/arm_gemm/interleave_indirect-sve.cpp core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp + 
core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp16fp32fp16_dot_16VL/generic.cpp core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp @@ -636,6 +635,7 @@ target_sources( core/Validate.cpp core/Version.cpp core/helpers/SoftmaxHelpers.cpp + core/helpers/Utils.cpp core/helpers/WindowHelpers.cpp core/utils/ActivationFunctionUtils.cpp core/utils/AssemblyUtils.cpp @@ -643,6 +643,7 @@ target_sources( core/utils/DataTypeUtils.cpp core/utils/FormatUtils.cpp core/utils/InterpolationPolicyUtils.cpp + core/utils/Math.cpp core/utils/ScaleUtils.cpp core/utils/StringUtils.cpp core/utils/helpers/fft.cpp @@ -725,11 +726,9 @@ target_sources( cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp cpu/kernels/boundingboxtransform/generic/neon/impl.cpp cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp - cpu/kernels/cast/generic/neon/bfloat16.cpp cpu/kernels/cast/generic/neon/fp16.cpp cpu/kernels/crop/generic/neon/fp16.cpp cpu/kernels/crop/generic/neon/fp32.cpp - cpu/kernels/crop/generic/neon/impl.cpp cpu/kernels/crop/generic/neon/integer.cpp cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp @@ -737,8 +736,11 @@ target_sources( cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp cpu/kernels/directconv2d/nchw/all.cpp + cpu/kernels/directconv2d/nchw/fp16.cpp + cpu/kernels/directconv2d/nhwc/neon/fp16.cpp cpu/kernels/directconv2d/nhwc/neon/fp32.cpp cpu/kernels/directconv2d/nhwc/neon/impl.cpp + cpu/kernels/directconv2d/nhwc/neon/qasymm8.cpp cpu/kernels/elementwise_binary/generic/neon/fp16.cpp cpu/kernels/elementwise_binary/generic/neon/fp32.cpp cpu/kernels/elementwise_binary/generic/neon/integer.cpp @@ -746,7 +748,6 @@ target_sources( cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp cpu/kernels/elementwise_unary/generic/neon/fp16.cpp cpu/kernels/elementwise_unary/generic/neon/fp32.cpp - cpu/kernels/elementwise_unary/generic/neon/impl.cpp cpu/kernels/elementwise_unary/generic/neon/integer.cpp cpu/kernels/elementwise_unary/generic/neon/q8.cpp cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp @@ -755,11 +756,9 @@ target_sources( cpu/kernels/floor/neon/fp32.cpp cpu/kernels/fuse_batch_normalization/generic/fp16.cpp cpu/kernels/fuse_batch_normalization/generic/fp32.cpp - cpu/kernels/fuse_batch_normalization/generic/impl.cpp cpu/kernels/fuse_batch_normalization/nchw/all.cpp cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp - cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.cpp cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp cpu/kernels/gemm_matrix_add/generic/neon/fp32.cpp cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp @@ -777,11 +776,9 @@ target_sources( cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp cpu/kernels/l2normlayer/generic/neon/fp16.cpp cpu/kernels/l2normlayer/generic/neon/fp32.cpp - cpu/kernels/l2normlayer/generic/neon/impl.cpp cpu/kernels/lut/generic/neon/u8.cpp cpu/kernels/maxunpool/generic/neon/fp16.cpp cpu/kernels/maxunpool/generic/neon/fp32.cpp - cpu/kernels/maxunpool/generic/neon/impl.cpp cpu/kernels/maxunpool/generic/neon/qasymm8.cpp cpu/kernels/maxunpool/generic/neon/qasymm8_signed.cpp cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp @@ -795,16 +792,13 @@ target_sources( cpu/kernels/pool2d/neon/qasymm8_signed.cpp 
cpu/kernels/pool3d/neon/fp16.cpp cpu/kernels/pool3d/neon/fp32.cpp - cpu/kernels/pool3d/neon/impl.cpp cpu/kernels/pool3d/neon/qasymm8.cpp cpu/kernels/pool3d/neon/qasymm8_signed.cpp cpu/kernels/range/generic/neon/fp16.cpp cpu/kernels/range/generic/neon/fp32.cpp - cpu/kernels/range/generic/neon/impl.cpp cpu/kernels/range/generic/neon/integer.cpp cpu/kernels/roialign/generic/neon/fp16.cpp cpu/kernels/roialign/generic/neon/fp32.cpp - cpu/kernels/roialign/generic/neon/impl.cpp cpu/kernels/roialign/generic/neon/qasymm8.cpp cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp cpu/kernels/scale/neon/fp16.cpp @@ -813,13 +807,13 @@ target_sources( cpu/kernels/scale/neon/qasymm8_signed.cpp cpu/kernels/select/generic/neon/fp16.cpp cpu/kernels/select/generic/neon/fp32.cpp - cpu/kernels/select/generic/neon/impl.cpp cpu/kernels/select/generic/neon/integer.cpp cpu/kernels/softmax/generic/neon/fp16.cpp cpu/kernels/softmax/generic/neon/fp32.cpp cpu/kernels/softmax/generic/neon/impl.cpp cpu/kernels/softmax/generic/neon/qasymm8.cpp cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp + cpu/kernels/sub/neon/fp16.cpp cpu/kernels/sub/neon/qasymm8.cpp cpu/kernels/sub/neon/qasymm8_signed.cpp cpu/kernels/sub/neon/qsymm16.cpp @@ -981,4 +975,4 @@ target_sources( runtime/Tensor.cpp runtime/TensorAllocator.cpp runtime/Utils.cpp -) +) \ No newline at end of file diff --git a/src/c/AclContext.cpp b/src/c/AclContext.cpp index 9b8ffea6198ee520d4617c0588fdf3b1ff28be78..c6c0820c926e79c319a117f3bed48ba911c47d53 100644 --- a/src/c/AclContext.cpp +++ b/src/c/AclContext.cpp @@ -22,7 +22,6 @@ * SOFTWARE. */ #include "arm_compute/AclEntrypoints.h" - #include "arm_compute/core/Error.h" #include "src/common/IContext.h" @@ -42,25 +41,25 @@ namespace template arm_compute::IContext *create_backend_ctx(const AclContextOptions *options) { - return new(std::nothrow) ContextType(options); + return new (std::nothrow) ContextType(options); } bool is_target_valid(AclTarget target) { - return arm_compute::utils::is_in(target, { AclCpu, AclGpuOcl }); + return arm_compute::utils::is_in(target, {AclCpu, AclGpuOcl}); } bool are_context_options_valid(const AclContextOptions *options) { ARM_COMPUTE_ASSERT_NOT_NULLPTR(options); - return arm_compute::utils::is_in(options->mode, { AclPreferFastRerun, AclPreferFastStart }); + return arm_compute::utils::is_in(options->mode, {AclPreferFastRerun, AclPreferFastStart}); } arm_compute::IContext *create_context(AclTarget target, const AclContextOptions *options) { ARM_COMPUTE_UNUSED(options); - switch(target) + switch (target) { #ifdef ARM_COMPUTE_CPU_ENABLED case AclCpu: @@ -77,24 +76,22 @@ arm_compute::IContext *create_context(AclTarget target, const AclContextOptions } } // namespace -extern "C" AclStatus AclCreateContext(AclContext *external_ctx, - AclTarget target, - const AclContextOptions *options) +extern "C" AclStatus AclCreateContext(AclContext *external_ctx, AclTarget target, const AclContextOptions *options) { - if(!is_target_valid(target)) + if (!is_target_valid(target)) { ARM_COMPUTE_LOG_ERROR_WITH_FUNCNAME_ACL("Target is invalid!"); return AclUnsupportedTarget; } - if(options != nullptr && !are_context_options_valid(options)) + if (options != nullptr && !are_context_options_valid(options)) { ARM_COMPUTE_LOG_ERROR_WITH_FUNCNAME_ACL("Context options are invalid!"); return AclInvalidArgument; } auto ctx = create_context(target, options); - if(ctx == nullptr) + if (ctx == nullptr) { ARM_COMPUTE_LOG_ERROR_WITH_FUNCNAME_ACL("Couldn't allocate internal resources for context creation!"); return 
AclOutOfMemory; @@ -113,7 +110,7 @@ extern "C" AclStatus AclDestroyContext(AclContext external_ctx) StatusCode status = detail::validate_internal_context(ctx); ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status); - if(ctx->refcount() != 0) + if (ctx->refcount() != 0) { ARM_COMPUTE_LOG_ERROR_WITH_FUNCNAME_ACL("Context has references on it that haven't been released!"); // TODO: Fix the refcount with callback when reaches 0 diff --git a/src/c/AclQueue.cpp b/src/c/AclQueue.cpp index 020c6ed53182bbe2f47457e4ee3ff76d5d9a2cee..c3e867bffcd1f5fb1e205f32b7512c2aa2ab2814 100644 --- a/src/c/AclQueue.cpp +++ b/src/c/AclQueue.cpp @@ -38,7 +38,7 @@ namespace bool is_mode_valid(const AclQueueOptions *options) { ARM_COMPUTE_ASSERT_NOT_NULLPTR(options); - return arm_compute::utils::is_in(options->mode, { AclTuningModeNone, AclRapid, AclNormal, AclExhaustive }); + return arm_compute::utils::is_in(options->mode, {AclTuningModeNone, AclRapid, AclNormal, AclExhaustive}); } } // namespace @@ -51,14 +51,14 @@ extern "C" AclStatus AclCreateQueue(AclQueue *external_queue, AclContext externa StatusCode status = detail::validate_internal_context(ctx); ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status); - if(options != nullptr && !is_mode_valid(options)) + if (options != nullptr && !is_mode_valid(options)) { ARM_COMPUTE_LOG_ERROR_ACL("Queue options are invalid"); return AclInvalidArgument; } auto queue = ctx->create_queue(options); - if(queue == nullptr) + if (queue == nullptr) { ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources"); return AclOutOfMemory; diff --git a/src/c/AclTensor.cpp b/src/c/AclTensor.cpp index 5b184697aa983e2dcf2e5ba16ee522e8f1804f3a..c4cd08ac70aae3eacb75931e55269cae1ea9342c 100644 --- a/src/c/AclTensor.cpp +++ b/src/c/AclTensor.cpp @@ -24,6 +24,7 @@ #include "arm_compute/AclEntrypoints.h" #include "arm_compute/AclUtils.h" #include "arm_compute/core/Error.h" + #include "src/common/ITensorV2.h" #include "src/common/utils/Macros.h" @@ -41,17 +42,17 @@ constexpr int32_t max_allowed_dims = 6; */ bool is_desc_valid(const AclTensorDescriptor &desc) { - if(desc.data_type > AclFloat32 || desc.data_type <= AclDataTypeUnknown) + if (desc.data_type > AclFloat32 || desc.data_type <= AclDataTypeUnknown) { ARM_COMPUTE_LOG_ERROR_ACL("[AclCreateTensor]: Unknown data type!"); return false; } - if(desc.ndims > max_allowed_dims) + if (desc.ndims > max_allowed_dims) { ARM_COMPUTE_LOG_ERROR_ACL("[AclCreateTensor]: Dimensions surpass the maximum allowed value!"); return false; } - if(desc.ndims > 0 && desc.shape == nullptr) + if (desc.ndims > 0 && desc.shape == nullptr) { ARM_COMPUTE_LOG_ERROR_ACL("[AclCreateTensor]: Dimensions values are empty while dimensionality is > 0!"); return false; @@ -66,10 +67,8 @@ StatusCode convert_and_validate_tensor(AclTensor tensor, ITensorV2 **internal_te } } // namespace -extern "C" AclStatus AclCreateTensor(AclTensor *external_tensor, - AclContext external_ctx, - const AclTensorDescriptor *desc, - bool allocate) +extern "C" AclStatus +AclCreateTensor(AclTensor *external_tensor, AclContext external_ctx, const AclTensorDescriptor *desc, bool allocate) { using namespace arm_compute; @@ -78,14 +77,14 @@ extern "C" AclStatus AclCreateTensor(AclTensor *external_tensor, StatusCode status = detail::validate_internal_context(ctx); ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status); - if(desc == nullptr || !is_desc_valid(*desc)) + if (desc == nullptr || !is_desc_valid(*desc)) { ARM_COMPUTE_LOG_ERROR_ACL("[AclCreateTensor]: Descriptor is invalid!"); return AclInvalidArgument; } auto tensor = 
ctx->create_tensor(*desc, allocate); - if(tensor == nullptr) + if (tensor == nullptr) { ARM_COMPUTE_LOG_ERROR_ACL("[AclCreateTensor]: Couldn't allocate internal resources for tensor creation!"); return AclOutOfMemory; @@ -103,7 +102,7 @@ extern "C" AclStatus AclMapTensor(AclTensor external_tensor, void **handle) StatusCode status = detail::validate_internal_tensor(tensor); ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status); - if(handle == nullptr) + if (handle == nullptr) { ARM_COMPUTE_LOG_ERROR_ACL("[AclMapTensor]: Handle object is nullptr!"); return AclInvalidArgument; @@ -160,12 +159,12 @@ extern "C" AclStatus AclGetTensorSize(AclTensor tensor, uint64_t *size) { using namespace arm_compute; - if(size == nullptr) + if (size == nullptr) { return AclStatus::AclInvalidArgument; } - ITensorV2 *internal_tensor{ nullptr }; + ITensorV2 *internal_tensor{nullptr}; auto status = convert_and_validate_tensor(tensor, &internal_tensor); ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status); @@ -177,15 +176,15 @@ extern "C" AclStatus AclGetTensorDescriptor(AclTensor tensor, AclTensorDescripto { using namespace arm_compute; - if(desc == nullptr) + if (desc == nullptr) { return AclStatus::AclInvalidArgument; } - ITensorV2 *internal_tensor{ nullptr }; + ITensorV2 *internal_tensor{nullptr}; const auto status = convert_and_validate_tensor(tensor, &internal_tensor); ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status); *desc = internal_tensor->get_descriptor(); return utils::as_cenum(status); -} \ No newline at end of file +} diff --git a/src/c/AclTensorPack.cpp b/src/c/AclTensorPack.cpp index 6202524ca7486a8580a1aaad9668bcb7c1f83223..daf1be4f4436729c6816b631b3ebe0d5df403494 100644 --- a/src/c/AclTensorPack.cpp +++ b/src/c/AclTensorPack.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/AclEntrypoints.h" + #include "src/common/ITensorV2.h" #include "src/common/TensorPack.h" #include "src/common/utils/Macros.h" @@ -36,7 +37,7 @@ StatusCode PackTensorInternal(TensorPack &pack, AclTensor external_tensor, int32 status = detail::validate_internal_tensor(tensor); - if(status != StatusCode::Success) + if (status != StatusCode::Success) { return status; } @@ -57,7 +58,7 @@ extern "C" AclStatus AclCreateTensorPack(AclTensorPack *external_pack, AclContex ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status); auto pack = new TensorPack(ctx); - if(pack == nullptr) + if (pack == nullptr) { ARM_COMPUTE_LOG_ERROR_WITH_FUNCNAME_ACL("Couldn't allocate internal resources!"); return AclOutOfMemory; @@ -77,14 +78,15 @@ extern "C" AclStatus AclPackTensor(AclTensorPack external_pack, AclTensor extern return AclStatus::AclSuccess; } -extern "C" AclStatus AclPackTensors(AclTensorPack external_pack, AclTensor *external_tensors, int32_t *slot_ids, size_t num_tensors) +extern "C" AclStatus +AclPackTensors(AclTensorPack external_pack, AclTensor *external_tensors, int32_t *slot_ids, size_t num_tensors) { using namespace arm_compute; auto pack = get_internal(external_pack); ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(detail::validate_internal_pack(pack)); - for(unsigned i = 0; i < num_tensors; ++i) + for (unsigned i = 0; i < num_tensors; ++i) { ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(PackTensorInternal(*pack, external_tensors[i], slot_ids[i])); } diff --git a/src/c/AclVersion.cpp b/src/c/AclVersion.cpp index 971189a6d41a828bdd605c53d50006f5d9042dd0..a659e90837be698064866ea470d6cd0fc0294b74 100644 --- a/src/c/AclVersion.cpp +++ b/src/c/AclVersion.cpp @@ -25,8 +25,7 @@ namespace { -constexpr AclVersion version_info -{ +constexpr AclVersion version_info{ 
ARM_COMPUTE_LIBRARY_VERSION_MAJOR, ARM_COMPUTE_LIBRARY_VERSION_MINOR, ARM_COMPUTE_LIBRARY_VERSION_PATCH, diff --git a/src/c/cl/AclOpenClExt.cpp b/src/c/cl/AclOpenClExt.cpp index e72babcae864af25b0b11781b3dc966ebf07182d..8e42cf551039de8cfba74667f18df4d27b834fb4 100644 --- a/src/c/cl/AclOpenClExt.cpp +++ b/src/c/cl/AclOpenClExt.cpp @@ -23,13 +23,12 @@ */ #include "arm_compute/AclOpenClExt.h" +#include "arm_compute/core/CL/ICLTensor.h" + #include "src/common/ITensorV2.h" #include "src/common/Types.h" #include "src/gpu/cl/ClContext.h" #include "src/gpu/cl/ClQueue.h" - -#include "arm_compute/core/CL/ICLTensor.h" - #include "support/Cast.h" extern "C" AclStatus AclGetClContext(AclContext external_ctx, cl_context *opencl_context) @@ -37,17 +36,17 @@ extern "C" AclStatus AclGetClContext(AclContext external_ctx, cl_context *opencl using namespace arm_compute; IContext *ctx = get_internal(external_ctx); - if(detail::validate_internal_context(ctx) != StatusCode::Success) + if (detail::validate_internal_context(ctx) != StatusCode::Success) { return AclStatus::AclInvalidArgument; } - if(ctx->type() != Target::GpuOcl) + if (ctx->type() != Target::GpuOcl) { return AclStatus::AclInvalidTarget; } - if(opencl_context == nullptr) + if (opencl_context == nullptr) { return AclStatus::AclInvalidArgument; } @@ -62,23 +61,23 @@ extern "C" AclStatus AclSetClContext(AclContext external_ctx, cl_context opencl_ using namespace arm_compute; IContext *ctx = get_internal(external_ctx); - if(detail::validate_internal_context(ctx) != StatusCode::Success) + if (detail::validate_internal_context(ctx) != StatusCode::Success) { return AclStatus::AclInvalidArgument; } - if(ctx->type() != Target::GpuOcl) + if (ctx->type() != Target::GpuOcl) { return AclStatus::AclInvalidTarget; } - if(ctx->refcount() != 0) + if (ctx->refcount() != 0) { return AclStatus::AclUnsupportedConfig; } auto cl_ctx = utils::cast::polymorphic_downcast(ctx); - if(!cl_ctx->set_cl_ctx(::cl::Context(opencl_context))) + if (!cl_ctx->set_cl_ctx(::cl::Context(opencl_context))) { return AclStatus::AclRuntimeError; } @@ -91,17 +90,17 @@ extern "C" AclStatus AclGetClDevice(AclContext external_ctx, cl_device_id *openc using namespace arm_compute; IContext *ctx = get_internal(external_ctx); - if(detail::validate_internal_context(ctx) != StatusCode::Success) + if (detail::validate_internal_context(ctx) != StatusCode::Success) { return AclStatus::AclInvalidArgument; } - if(ctx->type() != Target::GpuOcl) + if (ctx->type() != Target::GpuOcl) { return AclStatus::AclInvalidTarget; } - if(opencl_device == nullptr) + if (opencl_device == nullptr) { return AclStatus::AclInvalidArgument; } @@ -116,17 +115,17 @@ extern "C" AclStatus AclGetClQueue(AclQueue external_queue, cl_command_queue *op using namespace arm_compute; IQueue *queue = get_internal(external_queue); - if(detail::validate_internal_queue(queue) != StatusCode::Success) + if (detail::validate_internal_queue(queue) != StatusCode::Success) { return AclStatus::AclInvalidArgument; } - if(queue->header.ctx->type() != Target::GpuOcl) + if (queue->header.ctx->type() != Target::GpuOcl) { return AclStatus::AclInvalidTarget; } - if(opencl_queue == nullptr) + if (opencl_queue == nullptr) { return AclStatus::AclInvalidArgument; } @@ -141,18 +140,18 @@ extern "C" AclStatus AclSetClQueue(AclQueue external_queue, cl_command_queue ope using namespace arm_compute; IQueue *queue = get_internal(external_queue); - if(detail::validate_internal_queue(queue) != StatusCode::Success) + if (detail::validate_internal_queue(queue) != 
StatusCode::Success) { return AclStatus::AclInvalidArgument; } - if(queue->header.ctx->type() != Target::GpuOcl) + if (queue->header.ctx->type() != Target::GpuOcl) { return AclStatus::AclInvalidTarget; } auto cl_queue = utils::cast::polymorphic_downcast(queue); - if(!cl_queue->set_cl_queue(::cl::CommandQueue(opencl_queue))) + if (!cl_queue->set_cl_queue(::cl::CommandQueue(opencl_queue))) { return AclStatus::AclRuntimeError; } @@ -165,17 +164,17 @@ extern "C" AclStatus AclGetClMem(AclTensor external_tensor, cl_mem *opencl_mem) using namespace arm_compute; ITensorV2 *tensor = get_internal(external_tensor); - if(detail::validate_internal_tensor(tensor) != StatusCode::Success) + if (detail::validate_internal_tensor(tensor) != StatusCode::Success) { return AclStatus::AclInvalidArgument; } - if(tensor->header.ctx->type() != Target::GpuOcl) + if (tensor->header.ctx->type() != Target::GpuOcl) { return AclStatus::AclInvalidTarget; } - if(opencl_mem == nullptr) + if (opencl_mem == nullptr) { return AclStatus::AclInvalidArgument; } @@ -184,4 +183,4 @@ extern "C" AclStatus AclGetClMem(AclTensor external_tensor, cl_mem *opencl_mem) *opencl_mem = cl_tensor->cl_buffer().get(); return AclStatus::AclSuccess; -} \ No newline at end of file +} diff --git a/src/common/AllocatorWrapper.cpp b/src/common/AllocatorWrapper.cpp index 7b5bb3443358a29d834b28d686347118affcadb8..28d81a9fa442e343d02c4196d46708b5948cd886 100644 --- a/src/common/AllocatorWrapper.cpp +++ b/src/common/AllocatorWrapper.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "src/common/AllocatorWrapper.h" + #include "arm_compute/core/Error.h" namespace arm_compute @@ -57,7 +58,7 @@ void AllocatorWrapper::aligned_free(void *ptr) void AllocatorWrapper::set_user_data(void *user_data) { - if(user_data != nullptr) + if (user_data != nullptr) { _backing_allocator.user_data = user_data; } diff --git a/src/common/AllocatorWrapper.h b/src/common/AllocatorWrapper.h index 5e1f138f16203b581b0587d51b02b8e03f764ad8..bbf70a2cb1fa2116cee9a49dd45b3ff556dc3369 100644 --- a/src/common/AllocatorWrapper.h +++ b/src/common/AllocatorWrapper.h @@ -37,8 +37,8 @@ public: * @param[in] backing_allocator Backing memory allocator to be used */ AllocatorWrapper(const AclAllocator &backing_allocator) noexcept; - AllocatorWrapper(const AllocatorWrapper &) noexcept = default; - AllocatorWrapper(AllocatorWrapper &&) noexcept = default; + AllocatorWrapper(const AllocatorWrapper &) noexcept = default; + AllocatorWrapper(AllocatorWrapper &&) noexcept = default; AllocatorWrapper &operator=(const AllocatorWrapper &) noexcept = delete; AllocatorWrapper &operator=(AllocatorWrapper &&other) noexcept = default; /** Allocate a chunk of memory of a given size in bytes @@ -78,4 +78,4 @@ private: }; } // namespace arm_compute -#endif /* SRC_COMMON_ALLOCATORWRAPPER_H */ \ No newline at end of file +#endif /* SRC_COMMON_ALLOCATORWRAPPER_H */ diff --git a/src/common/IContext.h b/src/common/IContext.h index 65bb76744d800ba62fa5192c6702bcca04ef263a..a221e5db6108561505ccbc1999dee483a4ece42a 100644 --- a/src/common/IContext.h +++ b/src/common/IContext.h @@ -33,7 +33,7 @@ struct AclContext_ { - arm_compute::detail::Header header{ arm_compute::detail::ObjectType::Context, nullptr }; + arm_compute::detail::Header header{arm_compute::detail::ObjectType::Context, nullptr}; protected: AclContext_() = default; @@ -51,8 +51,7 @@ class IOperator; class IContext : public AclContext_ { public: - IContext(Target target) - : AclContext_(), _target(target), _refcount(0) + IContext(Target target) : AclContext_(), 
_target(target), _refcount(0) { } /** Virtual Destructor */ @@ -108,11 +107,11 @@ public: * * @return A pointer to the created queue object */ - virtual IQueue *create_queue(const AclQueueOptions *options) = 0; - virtual std::tuple create_activation(const AclTensorDescriptor &src, + virtual IQueue *create_queue(const AclQueueOptions *options) = 0; + virtual std::tuple create_activation(const AclTensorDescriptor &src, const AclTensorDescriptor &dst, const AclActivationDescriptor &act, - bool is_validate) = 0; + bool is_validate) = 0; private: Target _target; /**< Target type of context */ @@ -140,7 +139,7 @@ namespace detail */ inline StatusCode validate_internal_context(const IContext *ctx) { - if(ctx == nullptr || !ctx->is_valid()) + if (ctx == nullptr || !ctx->is_valid()) { ARM_COMPUTE_LOG_ERROR_ACL("Invalid context object"); return StatusCode::InvalidArgument; diff --git a/src/common/IOperator.cpp b/src/common/IOperator.cpp index b56f0e97fb62aa8898481a1bb6ce4e564a279e88..90e3473814a9de4cb0802887cbd3fb964b9ddfc5 100644 --- a/src/common/IOperator.cpp +++ b/src/common/IOperator.cpp @@ -22,13 +22,13 @@ * SOFTWARE. */ #include "src/common/IOperator.h" + #include "src/common/utils/Validate.h" namespace arm_compute { #ifndef DOXYGEN_SKIP_THIS -IOperator::IOperator(IContext *ctx) - : AclOperator_() +IOperator::IOperator(IContext *ctx) : AclOperator_() { ARM_COMPUTE_ASSERT_NOT_NULLPTR(ctx); this->header.ctx = ctx; diff --git a/src/common/IOperator.h b/src/common/IOperator.h index 1b65a09e0d8bf07197a9e5ebf848b18858352bc6..e86e11fe25d547eb535013cbab9fa154f664d04d 100644 --- a/src/common/IOperator.h +++ b/src/common/IOperator.h @@ -30,13 +30,14 @@ // TODO: Remove when all functions have been ported #include "arm_compute/core/experimental/Types.h" #include "arm_compute/runtime/IOperator.h" + #include "src/common/utils/Validate.h" #include struct AclOperator_ { - arm_compute::detail::Header header{ arm_compute::detail::ObjectType::Operator, nullptr }; + arm_compute::detail::Header header{arm_compute::detail::ObjectType::Operator, nullptr}; protected: AclOperator_() = default; @@ -100,7 +101,7 @@ public: } private: - std::unique_ptr _op{ nullptr }; + std::unique_ptr _op{nullptr}; }; /** Extract internal representation of an Operator @@ -124,7 +125,7 @@ namespace detail */ inline StatusCode validate_internal_operator(const IOperator *op) { - if(op == nullptr || !op->is_valid()) + if (op == nullptr || !op->is_valid()) { ARM_COMPUTE_LOG_ERROR_ACL("[IOperator]: Invalid operator object"); return StatusCode::InvalidArgument; diff --git a/src/common/IQueue.h b/src/common/IQueue.h index 6a0cbc75da151b281ee57b19c86bde462a6e68e5..60745d206e57286cecdfad4e33282b9f4b686297 100644 --- a/src/common/IQueue.h +++ b/src/common/IQueue.h @@ -28,7 +28,7 @@ struct AclQueue_ { - arm_compute::detail::Header header{ arm_compute::detail::ObjectType::Queue, nullptr }; + arm_compute::detail::Header header{arm_compute::detail::ObjectType::Queue, nullptr}; protected: AclQueue_() = default; @@ -88,7 +88,7 @@ namespace detail */ inline StatusCode validate_internal_queue(const IQueue *queue) { - if(queue == nullptr || !queue->is_valid()) + if (queue == nullptr || !queue->is_valid()) { ARM_COMPUTE_LOG_ERROR_ACL("[IQueue]: Invalid queue object"); return StatusCode::InvalidArgument; diff --git a/src/common/ITensorV2.cpp b/src/common/ITensorV2.cpp index 39bf1c6fb3cec758331fb06646f43240c0829985..bf3d9639262b2b9457490398090bc00957b8576c 100644 --- a/src/common/ITensorV2.cpp +++ b/src/common/ITensorV2.cpp @@ -22,7 +22,9 @@ * SOFTWARE. 
*/ #include "src/common/ITensorV2.h" + #include "arm_compute/core/TensorInfo.h" + #include "src/common/utils/LegacySupport.h" namespace arm_compute @@ -36,4 +38,4 @@ AclTensorDescriptor ITensorV2::get_descriptor() const { return detail::convert_to_descriptor(*tensor()->info()); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/common/ITensorV2.h b/src/common/ITensorV2.h index 965aacea23d6fbee4877ac2ed63e7a3ab7de9ebf..903bfad66a4ba827bea134249520ab103cb05b24 100644 --- a/src/common/ITensorV2.h +++ b/src/common/ITensorV2.h @@ -29,7 +29,7 @@ struct AclTensor_ { - arm_compute::detail::Header header{ arm_compute::detail::ObjectType::Tensor, nullptr }; + arm_compute::detail::Header header{arm_compute::detail::ObjectType::Tensor, nullptr}; protected: AclTensor_() = default; @@ -49,8 +49,7 @@ public: * * @param[in] ctx Context to be used by the operator */ - explicit ITensorV2(IContext *ctx) - : AclTensor_() + explicit ITensorV2(IContext *ctx) : AclTensor_() { ARM_COMPUTE_ASSERT_NOT_NULLPTR(ctx); this->header.ctx = ctx; @@ -128,7 +127,7 @@ namespace detail */ inline StatusCode validate_internal_tensor(const ITensorV2 *tensor) { - if(tensor == nullptr || !tensor->is_valid()) + if (tensor == nullptr || !tensor->is_valid()) { ARM_COMPUTE_LOG_ERROR_ACL("[ITensorV2]: Invalid tensor object"); return StatusCode::InvalidArgument; diff --git a/src/common/TensorPack.cpp b/src/common/TensorPack.cpp index 6c2c7f96226bd0f72134c1f4cfd37b46eb2853e0..b51fc0bdd8796b9bd1729e43f85a64fc6f337d7b 100644 --- a/src/common/TensorPack.cpp +++ b/src/common/TensorPack.cpp @@ -22,13 +22,13 @@ * SOFTWARE. */ #include "src/common/TensorPack.h" + #include "src/common/ITensorV2.h" #include "src/common/utils/Validate.h" namespace arm_compute { -TensorPack::TensorPack(IContext *ctx) - : AclTensorPack_(), _pack() +TensorPack::TensorPack(IContext *ctx) : AclTensorPack_(), _pack() { ARM_COMPUTE_ASSERT_NOT_NULLPTR(ctx); this->header.ctx = ctx; diff --git a/src/common/TensorPack.h b/src/common/TensorPack.h index f330eee7402c5ec941d73b7d215e4df2218d357b..b3d1624daea23e3523cdc7be90117d9338214927 100644 --- a/src/common/TensorPack.h +++ b/src/common/TensorPack.h @@ -25,11 +25,12 @@ #define SRC_COMMON_ITENSORPACK_H_ #include "arm_compute/core/ITensorPack.h" + #include "src/common/IContext.h" struct AclTensorPack_ { - arm_compute::detail::Header header{ arm_compute::detail::ObjectType::TensorPack, nullptr }; + arm_compute::detail::Header header{arm_compute::detail::ObjectType::TensorPack, nullptr}; protected: AclTensorPack_() = default; @@ -118,7 +119,7 @@ namespace detail */ inline StatusCode validate_internal_pack(const TensorPack *pack) { - if(pack == nullptr || !pack->is_valid()) + if (pack == nullptr || !pack->is_valid()) { ARM_COMPUTE_LOG_ERROR_ACL("[TensorPack]: Invalid tensor pack object"); return StatusCode::InvalidArgument; diff --git a/src/common/cpuinfo/CpuInfo.cpp b/src/common/cpuinfo/CpuInfo.cpp index cdcdea916c71f15482a2b6b28b32c350a0d733f9..23a477332a23deb590bfc624941fee958bb80524 100644 --- a/src/common/cpuinfo/CpuInfo.cpp +++ b/src/common/cpuinfo/CpuInfo.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Log.h" + #include "support/StringSupport.h" #include "support/ToolchainSupport.h" @@ -53,16 +54,16 @@ #endif /* defined(__APPLE__) && defined(__aarch64__)) */ #endif /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */ -#define ARM_COMPUTE_CPU_FEATURE_HWCAP_CPUID (1 << 
11) -#define ARM_COMPUTE_GET_FEATURE_REG(var, freg) __asm __volatile("MRS %0, " #freg \ - : "=r"(var)) +#define ARM_COMPUTE_CPU_FEATURE_HWCAP_CPUID (1 << 11) +#define ARM_COMPUTE_GET_FEATURE_REG(var, freg) __asm __volatile("MRS %0, " #freg : "=r"(var)) namespace arm_compute { namespace cpuinfo { namespace { -#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) +#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) /** Extract MIDR using CPUID information that are exposed to user-space * * @param[in] max_num_cpus Maximum number of possible CPUs @@ -72,15 +73,15 @@ namespace std::vector midr_from_cpuid(uint32_t max_num_cpus) { std::vector cpus; - for(unsigned int i = 0; i < max_num_cpus; ++i) + for (unsigned int i = 0; i < max_num_cpus; ++i) { std::stringstream str; str << "/sys/devices/system/cpu/cpu" << i << "/regs/identification/midr_el1"; std::ifstream file(str.str(), std::ios::in); - if(file.is_open()) + if (file.is_open()) { std::string line; - if(bool(getline(file, line))) + if (bool(getline(file, line))) { cpus.emplace_back(support::cpp11::stoul(line, nullptr, support::cpp11::NumericBase::BASE_16)); } @@ -122,34 +123,35 @@ std::vector midr_from_proc_cpuinfo(int max_num_cpus) ARM_COMPUTE_ERROR_ON_MSG(ret_status != 0, "Regex compilation failed."); std::ifstream file("/proc/cpuinfo", std::ios::in); - if(file.is_open()) + if (file.is_open()) { std::string line; int midr = 0; int curcpu = -1; - while(bool(getline(file, line))) + while (bool(getline(file, line))) { std::array match; ret_status = regexec(&proc_regex, line.c_str(), 2, match.data(), 0); - if(ret_status == 0) + if (ret_status == 0) { std::string id = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so)); int newcpu = support::cpp11::stoi(id, nullptr); - if(curcpu >= 0 && midr == 0) + if (curcpu >= 0 && midr == 0) { // Matched a new CPU ID without any description of the previous one - looks like old format. 
return {}; } - if(curcpu >= 0 && curcpu < max_num_cpus) + if (curcpu >= 0 && curcpu < max_num_cpus) { cpus.emplace_back(midr); } else { - ARM_COMPUTE_LOG_INFO_MSG_CORE("Trying to populate a core id with id greater than the expected number of cores!"); + ARM_COMPUTE_LOG_INFO_MSG_CORE( + "Trying to populate a core id with id greater than the expected number of cores!"); } midr = 0; @@ -159,7 +161,7 @@ std::vector midr_from_proc_cpuinfo(int max_num_cpus) } ret_status = regexec(&imp_regex, line.c_str(), 2, match.data(), 0); - if(ret_status == 0) + if (ret_status == 0) { std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so)); int impv = support::cpp11::stoi(subexp, nullptr, support::cpp11::NumericBase::BASE_16); @@ -169,7 +171,7 @@ std::vector midr_from_proc_cpuinfo(int max_num_cpus) } ret_status = regexec(&var_regex, line.c_str(), 2, match.data(), 0); - if(ret_status == 0) + if (ret_status == 0) { std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so)); int varv = support::cpp11::stoi(subexp, nullptr, support::cpp11::NumericBase::BASE_16); @@ -179,7 +181,7 @@ std::vector midr_from_proc_cpuinfo(int max_num_cpus) } ret_status = regexec(&part_regex, line.c_str(), 2, match.data(), 0); - if(ret_status == 0) + if (ret_status == 0) { std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so)); int partv = support::cpp11::stoi(subexp, nullptr, support::cpp11::NumericBase::BASE_16); @@ -189,7 +191,7 @@ std::vector midr_from_proc_cpuinfo(int max_num_cpus) } ret_status = regexec(&rev_regex, line.c_str(), 2, match.data(), 0); - if(ret_status == 0) + if (ret_status == 0) { std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so)); int regv = support::cpp11::stoi(subexp, nullptr); @@ -200,13 +202,14 @@ std::vector midr_from_proc_cpuinfo(int max_num_cpus) } } - if(curcpu >= 0 && curcpu < max_num_cpus) + if (curcpu >= 0 && curcpu < max_num_cpus) { cpus.emplace_back(midr); } else { - ARM_COMPUTE_LOG_INFO_MSG_CORE("Trying to populate a core id with id greater than the expected number of cores!"); + ARM_COMPUTE_LOG_INFO_MSG_CORE( + "Trying to populate a core id with id greater than the expected number of cores!"); } } @@ -231,11 +234,11 @@ int get_max_cpus() CPUspresent.open("/sys/devices/system/cpu/present", std::ios::in); bool success = false; - if(CPUspresent.is_open()) + if (CPUspresent.is_open()) { std::string line; - if(bool(getline(CPUspresent, line))) + if (bool(getline(CPUspresent, line))) { /* The content of this file is a list of ranges or single values, e.g. * 0-5, or 1-3,5,7 or similar. As we are interested in the @@ -244,9 +247,9 @@ int get_max_cpus() */ auto startfrom = line.begin(); - for(auto i = line.begin(); i < line.end(); ++i) + for (auto i = line.begin(); i < line.end(); ++i) { - if(*i == '-' || *i == ',') + if (*i == '-' || *i == ',') { startfrom = i + 1; } @@ -260,13 +263,14 @@ int get_max_cpus() } // Return std::thread::hardware_concurrency() as a fallback. 
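A minimal stand-alone sketch of the parsing described above (take only the value after the last '-' or ',' in /sys/devices/system/cpu/present); the helper name is invented and this illustrates the approach rather than reproducing code from the patch:

#include <string>

// Illustrative helper (not part of the patch): derive the CPU count from the
// tail of a "present" list such as "0-7" or "1-3,5,7". Only the value after
// the last '-' or ',' matters; IDs are zero-based, so the count is last + 1.
int max_cpus_from_present_line(const std::string &line)
{
    const std::size_t pos  = line.find_last_of("-,");
    const std::string tail = (pos == std::string::npos) ? line : line.substr(pos + 1);
    return std::stoi(tail) + 1; // "0-7" -> 8, "1-3,5,7" -> 8
}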
- if(!success) + if (!success) { max_cpus = std::thread::hardware_concurrency(); } return max_cpus; } -#elif defined(__aarch64__) && defined(__APPLE__) /* !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__)) */ +#elif defined(__aarch64__) && \ + defined(__APPLE__) /* !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__)) */ /** Query features through sysctlbyname * * @return int value queried @@ -278,46 +282,45 @@ int get_hw_capability(const std::string &cap) sysctlbyname(cap.c_str(), &result, &size, NULL, 0); return result; } -#endif /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */ +#endif /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */ #if defined(BARE_METAL) && defined(__aarch64__) uint64_t get_sve_feature_reg() { uint64_t svefr0 = 0; - __asm __volatile( - ".inst 0xd5380483 // mrs x3, ID_AA64ZFR0_EL1\n" - "MOV %0, X3" - : "=r"(svefr0) - : - : "x3"); + __asm __volatile(".inst 0xd5380483 // mrs x3, ID_AA64ZFR0_EL1\n" + "MOV %0, X3" + : "=r"(svefr0) + : + : "x3"); return svefr0; } #endif /* defined(BARE_METAL) && defined(__aarch64__) */ } // namespace -CpuInfo::CpuInfo(CpuIsaInfo isa, std::vector cpus) - : _isa(std::move(isa)), _cpus(std::move(cpus)) +CpuInfo::CpuInfo(CpuIsaInfo isa, std::vector cpus) : _isa(std::move(isa)), _cpus(std::move(cpus)) { } CpuInfo CpuInfo::build() { -#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) +#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) const uint32_t hwcaps = getauxval(AT_HWCAP); const uint32_t hwcaps2 = getauxval(AT_HWCAP2); const uint32_t max_cpus = get_max_cpus(); // Populate midr values std::vector cpus_midr; - if(hwcaps & ARM_COMPUTE_CPU_FEATURE_HWCAP_CPUID) + if (hwcaps & ARM_COMPUTE_CPU_FEATURE_HWCAP_CPUID) { cpus_midr = midr_from_cpuid(max_cpus); } - if(cpus_midr.empty()) + if (cpus_midr.empty()) { cpus_midr = midr_from_proc_cpuinfo(max_cpus); } - if(cpus_midr.empty()) + if (cpus_midr.empty()) { cpus_midr.resize(max_cpus, 0); } @@ -333,7 +336,9 @@ CpuInfo CpuInfo::build() CpuInfo info(isa, cpus_model); return info; -#elif(BARE_METAL) && defined(__aarch64__) /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */ +#elif (BARE_METAL) && \ + defined( \ + __aarch64__) /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */ // Assume single CPU in bare metal mode. Just read the ID register and feature bits directly. 
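For reference, the sysctlbyname helper defined above is typically queried with one of macOS's hw.optional.* feature keys; the key below is an assumption chosen for illustration and is not taken from this hunk:

// Illustrative usage only; the sysctl key is an assumed Apple feature flag.
const bool assumed_fp16_support = get_hw_capability("hw.optional.arm.FEAT_FP16") != 0;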
uint64_t isar0 = 0, isar1 = 0, pfr0 = 0, pfr1 = 0, svefr0 = 0, midr = 0; @@ -342,7 +347,7 @@ CpuInfo CpuInfo::build() ARM_COMPUTE_GET_FEATURE_REG(pfr0, ID_AA64PFR0_EL1); ARM_COMPUTE_GET_FEATURE_REG(pfr1, ID_AA64PFR1_EL1); ARM_COMPUTE_GET_FEATURE_REG(midr, MIDR_EL1); - if((pfr0 >> 32) & 0xf) + if ((pfr0 >> 32) & 0xf) { svefr0 = get_sve_feature_reg(); } @@ -361,14 +366,14 @@ CpuInfo CpuInfo::build() CpuInfo info(isainfo, cpus_model); return info; #else /* #elif defined(__aarch64__) && defined(__APPLE__) */ - CpuInfo info(CpuIsaInfo(), { CpuModel::GENERIC }); + CpuInfo info(CpuIsaInfo(), {CpuModel::GENERIC}); return info; -#endif /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */ +#endif /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */ } CpuModel CpuInfo::cpu_model(uint32_t cpuid) const { - if(cpuid < _cpus.size()) + if (cpuid < _cpus.size()) { return _cpus[cpuid]; } @@ -377,9 +382,10 @@ CpuModel CpuInfo::cpu_model(uint32_t cpuid) const CpuModel CpuInfo::cpu_model() const { -#if defined(_WIN64) || defined(BARE_METAL) || defined(__APPLE__) || defined(__OpenBSD__) || (!defined(__arm__) && !defined(__aarch64__)) +#if defined(_WIN64) || defined(BARE_METAL) || defined(__APPLE__) || defined(__OpenBSD__) || \ + (!defined(__arm__) && !defined(__aarch64__)) return cpu_model(0); -#else /* defined(BARE_METAL) || defined(__APPLE__) || defined(__OpenBSD__) || (!defined(__arm__) && !defined(__aarch64__)) */ +#else /* defined(BARE_METAL) || defined(__APPLE__) || defined(__OpenBSD__) || (!defined(__arm__) && !defined(__aarch64__)) */ return cpu_model(sched_getcpu()); #endif /* defined(BARE_METAL) || defined(__APPLE__) || defined(__OpenBSD__) || (!defined(__arm__) && !defined(__aarch64__)) */ } @@ -406,13 +412,13 @@ uint32_t num_threads_hint() // Read cpuinfo and get occurrence of each core std::ifstream cpuinfo_file("/proc/cpuinfo", std::ios::in); - if(cpuinfo_file.is_open()) + if (cpuinfo_file.is_open()) { std::string line; - while(bool(getline(cpuinfo_file, line))) + while (bool(getline(cpuinfo_file, line))) { std::array match; - if(regexec(&cpu_part_rgx, line.c_str(), 2, match.data(), 0) == 0) + if (regexec(&cpu_part_rgx, line.c_str(), 2, match.data(), 0) == 0) { cpus.emplace_back(line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so))); } @@ -425,13 +431,13 @@ uint32_t num_threads_hint() auto least_frequent_cpu_occurences = [](const std::vector &cpus) -> uint32_t { std::unordered_map cpus_freq; - for(const auto &cpu : cpus) + for (const auto &cpu : cpus) { cpus_freq[cpu]++; } uint32_t vmin = cpus.size() + 1; - for(const auto &cpu_freq : cpus_freq) + for (const auto &cpu_freq : cpus_freq) { vmin = std::min(vmin, cpu_freq.second); } diff --git a/src/common/cpuinfo/CpuIsaInfo.cpp b/src/common/cpuinfo/CpuIsaInfo.cpp index 23da54a35d1c9cd50e00c4c7de408f20949cd81a..597768530ba8dcc5ee3b7b18b9c3514abb4d8e89 100644 --- a/src/common/cpuinfo/CpuIsaInfo.cpp +++ b/src/common/cpuinfo/CpuIsaInfo.cpp @@ -24,6 +24,7 @@ #include "src/common/cpuinfo/CpuIsaInfo.h" #include "arm_compute/core/Error.h" + #include "src/common/cpuinfo/CpuModel.h" /* Arm Feature flags */ @@ -31,18 +32,18 @@ #define ARM_COMPUTE_CPU_FEATURE_HWCAP_NEON (1 << 12) /* Arm64 Feature flags */ -#define ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMD (1 << 1) -#define ARM_COMPUTE_CPU_FEATURE_HWCAP_FPHP (1 << 9) -#define ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMDHP (1 << 10) -#define ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMDDP (1 << 
20) -#define ARM_COMPUTE_CPU_FEATURE_HWCAP_SVE (1 << 22) -#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVE2 (1 << 1) -#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVEI8MM (1 << 9) +#define ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMD (1 << 1) +#define ARM_COMPUTE_CPU_FEATURE_HWCAP_FPHP (1 << 9) +#define ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMDHP (1 << 10) +#define ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMDDP (1 << 20) +#define ARM_COMPUTE_CPU_FEATURE_HWCAP_SVE (1 << 22) +#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVE2 (1 << 1) +#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVEI8MM (1 << 9) #define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVEF32MM (1 << 10) -#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVEBF16 (1 << 12) -#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_I8MM (1 << 13) -#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_BF16 (1 << 14) -#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SME (1 << 23) +#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVEBF16 (1 << 12) +#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_I8MM (1 << 13) +#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_BF16 (1 << 14) +#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SME (1 << 23) namespace arm_compute { @@ -71,12 +72,12 @@ void decode_hwcaps(CpuIsaInfo &isa, const uint32_t hwcaps, const uint32_t hwcaps isa.sve2 = is_feature_supported(hwcaps2, ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVE2); // Detection of SME from type HWCAP2 in the auxillary vector - isa.sme = is_feature_supported(hwcaps2, ARM_COMPUTE_CPU_FEATURE_HWCAP2_SME); - isa.sme2 = isa.sme; // Needs to be set properly + isa.sme = is_feature_supported(hwcaps2, ARM_COMPUTE_CPU_FEATURE_HWCAP2_SME); + isa.sme2 = isa.sme; // Needs to be set properly // Data-type support - isa.fp16 = is_feature_supported(hwcaps, ARM_COMPUTE_CPU_FEATURE_HWCAP_FPHP | ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMDHP); - isa.bf16 = is_feature_supported(hwcaps2, ARM_COMPUTE_CPU_FEATURE_HWCAP2_BF16); + isa.fp16 = is_feature_supported(hwcaps, ARM_COMPUTE_CPU_FEATURE_HWCAP_FPHP | ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMDHP); + isa.bf16 = is_feature_supported(hwcaps2, ARM_COMPUTE_CPU_FEATURE_HWCAP2_BF16); isa.svebf16 = is_feature_supported(hwcaps2, ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVEBF16); // Instruction extensions @@ -92,12 +93,15 @@ void decode_hwcaps(CpuIsaInfo &isa, const uint32_t hwcaps, const uint32_t hwcaps } #endif /* defined(__aarch64__) */ -void decode_regs(CpuIsaInfo &isa, const uint64_t isar0, const uint64_t isar1, const uint64_t pfr0, const uint64_t pfr1, const uint64_t svefr0) +void decode_regs(CpuIsaInfo &isa, + const uint64_t isar0, + const uint64_t isar1, + const uint64_t pfr0, + const uint64_t pfr1, + const uint64_t svefr0) { auto is_supported = [](uint64_t feature_reg, uint8_t feature_pos) -> bool - { - return ((feature_reg >> feature_pos) & 0xf); - }; + { return ((feature_reg >> feature_pos) & 0xf); }; // High-level SIMD support isa.sve = is_supported(pfr0, 32); @@ -124,11 +128,11 @@ void decode_regs(CpuIsaInfo &isa, const uint64_t isar0, const uint64_t isar1, co */ void allowlisted_model_features(CpuIsaInfo &isa, CpuModel model) { - if(isa.dot == false) + if (isa.dot == false) { isa.dot = model_supports_dot(model); } - if(isa.fp16 == false) + if (isa.fp16 == false) { isa.fp16 = model_supports_fp16(model); } @@ -147,7 +151,8 @@ CpuIsaInfo init_cpu_isa_from_hwcaps(uint32_t hwcaps, uint32_t hwcaps2, uint32_t return isa; } -CpuIsaInfo init_cpu_isa_from_regs(uint64_t isar0, uint64_t isar1, uint64_t pfr0, uint64_t pfr1, uint64_t svefr0, uint64_t midr) +CpuIsaInfo +init_cpu_isa_from_regs(uint64_t isar0, uint64_t isar1, uint64_t pfr0, uint64_t pfr1, uint64_t svefr0, uint64_t midr) { CpuIsaInfo isa; diff --git 
a/src/common/cpuinfo/CpuIsaInfo.h b/src/common/cpuinfo/CpuIsaInfo.h index b92b6538b6ce03c34e70717e6569ff1934669d61..9d6bc07b67a98013fe06ffb94c3899917e625d86 100644 --- a/src/common/cpuinfo/CpuIsaInfo.h +++ b/src/common/cpuinfo/CpuIsaInfo.h @@ -37,22 +37,22 @@ namespace cpuinfo struct CpuIsaInfo { /* SIMD extension support */ - bool neon{ false }; - bool sve{ false }; - bool sve2{ false }; - bool sme{ false }; - bool sme2{ false }; + bool neon{false}; + bool sve{false}; + bool sve2{false}; + bool sme{false}; + bool sme2{false}; /* Data-type extensions support */ - bool fp16{ false }; - bool bf16{ false }; - bool svebf16{ false }; + bool fp16{false}; + bool bf16{false}; + bool svebf16{false}; /* Instruction support */ - bool dot{ false }; - bool i8mm{ false }; - bool svei8mm{ false }; - bool svef32mm{ false }; + bool dot{false}; + bool i8mm{false}; + bool svei8mm{false}; + bool svef32mm{false}; }; /** Identify ISA related information through system information @@ -76,7 +76,8 @@ CpuIsaInfo init_cpu_isa_from_hwcaps(uint32_t hwcaps, uint32_t hwcaps2, uint32_t * * @return CpuIsaInfo A populated ISA feature structure */ -CpuIsaInfo init_cpu_isa_from_regs(uint64_t isar0, uint64_t isar1, uint64_t pfr0, uint64_t pfr1, uint64_t svefr0, uint64_t midr); +CpuIsaInfo +init_cpu_isa_from_regs(uint64_t isar0, uint64_t isar1, uint64_t pfr0, uint64_t pfr1, uint64_t svefr0, uint64_t midr); } // namespace cpuinfo } // namespace arm_compute diff --git a/src/common/cpuinfo/CpuModel.cpp b/src/common/cpuinfo/CpuModel.cpp index d6d91df1332c29d747a047d92f50179dc68efcbc..8c3f8a8fafaa31424781459949645a7ce1c435ff 100644 --- a/src/common/cpuinfo/CpuModel.cpp +++ b/src/common/cpuinfo/CpuModel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,12 +29,12 @@ namespace cpuinfo { std::string cpu_model_to_string(CpuModel model) { - switch(model) + switch (model) { #define X(MODEL) \ -case CpuModel::MODEL: \ - return #MODEL; - ARM_COMPUTE_CPU_MODEL_LIST + case CpuModel::MODEL: \ + return #MODEL; + ARM_COMPUTE_CPU_MODEL_LIST #undef X default: { @@ -45,7 +45,7 @@ case CpuModel::MODEL: \ bool model_supports_fp16(CpuModel model) { - switch(model) + switch (model) { case CpuModel::GENERIC_FP16: case CpuModel::GENERIC_FP16_DOT: @@ -63,7 +63,7 @@ bool model_supports_fp16(CpuModel model) bool model_supports_dot(CpuModel model) { - switch(model) + switch (model) { case CpuModel::GENERIC_FP16_DOT: case CpuModel::A55r1: @@ -87,16 +87,16 @@ CpuModel midr_to_model(uint32_t midr) const int cpunum = (midr >> 4) & 0xFFF; // Only CPUs we have code paths for are detected. 
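As a worked example of the field extraction above, here is one hypothetical MIDR value decoded with the architectural MIDR_EL1 layout (implementer [31:24], variant [23:20], part [15:4], revision [3:0]), which the part-number extraction shown follows; the value and variable names are illustrative only:

#include <cstdint>

// Hypothetical MIDR value decoded per the MIDR_EL1 field layout (illustrative).
const uint32_t midr        = 0x412FD050;
const int      implementer = (midr >> 24) & 0xFF; // 0x41  -> Arm
const int      variant     = (midr >> 20) & 0xF;  // 0x2   -> r2, i.e. variant != 0
const int      cpunum      = (midr >> 4) & 0xFFF; // 0xd05 -> Cortex-A55, so A55r1 in the switch below
const int      revision    = midr & 0xF;          // 0x0   -> p0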
All other CPUs can be safely classed as "GENERIC" - if(implementer == 0x41) // Arm CPUs + if (implementer == 0x41) // Arm CPUs { - switch(cpunum) + switch (cpunum) { case 0xd03: // A53 case 0xd04: // A35 model = CpuModel::A53; break; case 0xd05: // A55 - if(variant != 0) + if (variant != 0) { model = CpuModel::A55r1; } @@ -109,7 +109,7 @@ CpuModel midr_to_model(uint32_t midr) model = CpuModel::A73; break; case 0xd0a: // A75 - if(variant != 0) + if (variant != 0) { model = CpuModel::GENERIC_FP16_DOT; } @@ -136,17 +136,21 @@ CpuModel midr_to_model(uint32_t midr) case 0xd44: // X1 model = CpuModel::X1; break; - case 0xd46: + case 0xd46: // A510 + case 0xd80: // A520 model = CpuModel::A510; break; + case 0xd15: // R82 + model = CpuModel::A55r1; + break; default: model = CpuModel::GENERIC; break; } } - else if(implementer == 0x46) + else if (implementer == 0x46) { - switch(cpunum) + switch (cpunum) { case 0x001: // A64FX model = CpuModel::A64FX; @@ -156,9 +160,9 @@ CpuModel midr_to_model(uint32_t midr) break; } } - else if(implementer == 0x48) + else if (implementer == 0x48) { - switch(cpunum) + switch (cpunum) { case 0xd40: // A76 model = CpuModel::GENERIC_FP16_DOT; @@ -168,9 +172,9 @@ CpuModel midr_to_model(uint32_t midr) break; } } - else if(implementer == 0x51) + else if (implementer == 0x51) { - switch(cpunum) + switch (cpunum) { case 0x800: // A73 model = CpuModel::A73; @@ -196,4 +200,4 @@ CpuModel midr_to_model(uint32_t midr) return model; } } // namespace cpuinfo -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/common/cpuinfo/CpuModel.h b/src/common/cpuinfo/CpuModel.h index 4fe6c29e535cf6a033dcb4f23dde835ef21ac870..3b9d9e3494e56795e5f776e02d9c6576e0dbbc39 100644 --- a/src/common/cpuinfo/CpuModel.h +++ b/src/common/cpuinfo/CpuModel.h @@ -24,11 +24,11 @@ #ifndef SRC_COMMON_CPUINFO_CPUMODEL_H #define SRC_COMMON_CPUINFO_CPUMODEL_H +#include "arm_compute/core/CPP/CPPTypes.h" + #include #include -#include "arm_compute/core/CPP/CPPTypes.h" - namespace arm_compute { namespace cpuinfo diff --git a/src/common/utils/LegacySupport.cpp b/src/common/utils/LegacySupport.cpp index 06b1693bd1cbd23b9cb9ba5730ffbda3db11978b..102644227eec131d263f2075e43efb49b0f1f92a 100644 --- a/src/common/utils/LegacySupport.cpp +++ b/src/common/utils/LegacySupport.cpp @@ -33,7 +33,7 @@ namespace { DataType convert_to_legacy_data_type(AclDataType data_type) { - switch(data_type) + switch (data_type) { case AclDataType::AclFloat32: return DataType::F32; @@ -48,7 +48,7 @@ DataType convert_to_legacy_data_type(AclDataType data_type) AclDataType convert_to_c_data_type(DataType data_type) { - switch(data_type) + switch (data_type) { case DataType::F32: return AclDataType::AclFloat32; @@ -64,7 +64,7 @@ AclDataType convert_to_c_data_type(DataType data_type) TensorShape create_legacy_tensor_shape(int32_t ndims, int32_t *shape) { TensorShape legacy_shape{}; - for(int32_t d = 0; d < ndims; ++d) + for (int32_t d = 0; d < ndims; ++d) { legacy_shape.set(d, shape[d], false); } @@ -73,14 +73,14 @@ TensorShape create_legacy_tensor_shape(int32_t ndims, int32_t *shape) int32_t *create_tensor_shape_array(const TensorInfo &info) { const auto num_dims = info.num_dimensions(); - if(num_dims <= 0) + if (num_dims <= 0) { return nullptr; } int32_t *shape_array = new int32_t[num_dims]; - for(size_t d = 0; d < num_dims; ++d) + for (size_t d = 0; d < num_dims; ++d) { shape_array[d] = info.tensor_shape()[d]; } @@ -92,28 +92,23 @@ int32_t *create_tensor_shape_array(const TensorInfo &info) TensorInfo 
convert_to_legacy_tensor_info(const AclTensorDescriptor &desc) { TensorInfo legacy_desc; - legacy_desc.init(create_legacy_tensor_shape(desc.ndims, desc.shape), 1, convert_to_legacy_data_type(desc.data_type)); + legacy_desc.init(create_legacy_tensor_shape(desc.ndims, desc.shape), 1, + convert_to_legacy_data_type(desc.data_type)); return legacy_desc; } AclTensorDescriptor convert_to_descriptor(const TensorInfo &info) { const auto num_dims = info.num_dimensions(); - AclTensorDescriptor desc - { - static_cast(num_dims), - create_tensor_shape_array(info), - convert_to_c_data_type(info.data_type()), - nullptr, - 0 - }; + AclTensorDescriptor desc{static_cast(num_dims), create_tensor_shape_array(info), + convert_to_c_data_type(info.data_type()), nullptr, 0}; return desc; } ActivationLayerInfo convert_to_activation_info(const AclActivationDescriptor &desc) { ActivationLayerInfo::ActivationFunction act; - switch(desc.type) + switch (desc.type) { case AclActivationType::AclIdentity: act = ActivationLayerInfo::ActivationFunction::IDENTITY; diff --git a/src/common/utils/Log.h b/src/common/utils/Log.h index bbfe1ce1b3def8ebfc03ceadfd11486460d554cb..6ebfed366ea649d35329d49993c29ba309e91c6e 100644 --- a/src/common/utils/Log.h +++ b/src/common/utils/Log.h @@ -38,20 +38,22 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/utils/logging/Macros.h" + #include "utils/TypePrinter.h" /** Create a logger * * @note It will eventually create all default loggers in don't exist */ -#define ARM_COMPUTE_CREATE_ACL_LOGGER() \ - do \ - { \ - if(arm_compute::logging::LoggerRegistry::get().logger("ComputeLibrary") == nullptr) \ - { \ - arm_compute::logging::LoggerRegistry::get().create_logger("ComputeLibrary", arm_compute::logging::LogLevel::INFO); \ - } \ - } while(false) +#define ARM_COMPUTE_CREATE_ACL_LOGGER() \ + do \ + { \ + if (arm_compute::logging::LoggerRegistry::get().logger("ComputeLibrary") == nullptr) \ + { \ + arm_compute::logging::LoggerRegistry::get().create_logger("ComputeLibrary", \ + arm_compute::logging::LogLevel::INFO); \ + } \ + } while (false) /** Log a message to the logger * @@ -63,7 +65,7 @@ { \ ARM_COMPUTE_CREATE_ACL_LOGGER(); \ ARM_COMPUTE_LOG_MSG("ComputeLibrary", log_level, msg); \ - } while(false) + } while (false) /** Log a message with format to the logger * @@ -76,7 +78,7 @@ { \ ARM_COMPUTE_CREATE_ACL_LOGGER(); \ ARM_COMPUTE_LOG_MSG_WITH_FORMAT("ComputeLibrary", log_level, fmt, __VA_ARGS__); \ - } while(false) + } while (false) /** Log an error message to the logger * @@ -87,7 +89,7 @@ { \ ARM_COMPUTE_CREATE_ACL_LOGGER(); \ ARM_COMPUTE_LOG_MSG("ComputeLibrary", arm_compute::logging::LogLevel::ERROR, msg); \ - } while(false) + } while (false) /** Log an error message to the logger with function name before the message * @@ -98,7 +100,7 @@ { \ ARM_COMPUTE_CREATE_ACL_LOGGER(); \ ARM_COMPUTE_LOG_MSG_WITH_FUNCNAME("ComputeLibrary", arm_compute::logging::LogLevel::ERROR, msg); \ - } while(false) + } while (false) /** Log an information message to the logger with function name before the message * @@ -109,7 +111,7 @@ { \ ARM_COMPUTE_CREATE_ACL_LOGGER(); \ ARM_COMPUTE_LOG_MSG_WITH_FUNCNAME("ComputeLibrary", arm_compute::logging::LogLevel::INFO, msg); \ - } while(false) + } while (false) /** Function template specialization for the out of bound element at index = tuple_size * @@ -131,12 +133,13 @@ logParamsImpl(std::vector &data_registry, const std::tuple & * @param[in] in_params_tuple Constant reference to a tuple of different input data types */ template -inline typename 
std::enable_if < Index::type -logParamsImpl(std::vector &data_registry, const std::tuple &in_params_tuple) + inline typename std::enable_if < + Index::type logParamsImpl(std::vector &data_registry, + const std::tuple &in_params_tuple) { data_registry.push_back(arm_compute::to_string(std::get(in_params_tuple))); // Unfold the next tuple element - logParamsImpl < Index + 1, Tp... > (data_registry, in_params_tuple); + logParamsImpl(data_registry, in_params_tuple); } /** Function Template with variable number of inputs to collect all the passed parameters from @@ -149,10 +152,10 @@ logParamsImpl(std::vector &data_registry, const std::tuple & * @return Vector of the parameters' data in a string format */ template -const std::vector logParams(Ts &&... ins) +const std::vector logParams(Ts &&...ins) { std::vector data_registry{}; - std::tuple in_params_tuple{ ins... }; + std::tuple in_params_tuple{ins...}; // Start logging the tuple elements, starting from 0 to tuple_size-1 logParamsImpl<0>(data_registry, in_params_tuple); @@ -178,11 +181,11 @@ inline const std::vector getParamsNames(const std::string &in_param // Usually the input parameters string would be name of parameters separated // by ',' e.g. "src0, src1, policy" - while(std::getline(ss, temp, ',')) + while (std::getline(ss, temp, ',')) { names.push_back(temp); } - for(auto &name : names) + for (auto &name : names) { // Totally get rid of white space characters name.erase(std::remove(name.begin(), name.end(), ' '), name.end()); @@ -205,7 +208,7 @@ inline const std::string constructDataLog(const std::vector ¶ms { std::string dataLog = "\n "; ARM_COMPUTE_ERROR_ON(params_names.size() != data_registry.size()); - for(uint8_t i = 0; i < params_names.size(); ++i) + for (uint8_t i = 0; i < params_names.size(); ++i) { dataLog += params_names[i] + ": " + data_registry.at(i) + "\n "; } @@ -220,11 +223,11 @@ inline const std::string constructDataLog(const std::vector ¶ms * * @param[in] ... Input parameters */ -#define ARM_COMPUTE_LOG_PARAMS(...) \ - do \ - { \ - ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL(constructDataLog(getParamsNames(#__VA_ARGS__), \ - logParams(__VA_ARGS__))); \ - } while(false) +#define ARM_COMPUTE_LOG_PARAMS(...) 
\ + do \ + { \ + ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL( \ + constructDataLog(getParamsNames(#__VA_ARGS__), logParams(__VA_ARGS__))); \ + } while (false) #endif /* ARM_COMPUTE_LOGGING_ENABLED */ #endif /* SRC_COMMON_LOG_H */ diff --git a/src/common/utils/Macros.h b/src/common/utils/Macros.h index 2e44ea599ecff86e554eecd7d486cc7483c8c5ff..35f7e759d33b2bb63bda570310443a19692bc42d 100644 --- a/src/common/utils/Macros.h +++ b/src/common/utils/Macros.h @@ -28,7 +28,7 @@ #define ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status) \ { \ - if(status != arm_compute::StatusCode::Success) \ + if (status != arm_compute::StatusCode::Success) \ { \ return arm_compute::utils::as_cenum(status); \ } \ diff --git a/src/common/utils/Object.h b/src/common/utils/Object.h index 1f194737d48eba497e822bb73a410c258a4bfef9..b73de8e4303da2f8952107b7c2eec9725cfb0ad3 100644 --- a/src/common/utils/Object.h +++ b/src/common/utils/Object.h @@ -52,14 +52,12 @@ struct Header * @param[in] type_ Object identification type * @param[in] ctx_ Context to reference */ - Header(ObjectType type_, IContext *ctx_) noexcept - : type(type_), - ctx(ctx_) + Header(ObjectType type_, IContext *ctx_) noexcept : type(type_), ctx(ctx_) { } - ObjectType type{ ObjectType::Invalid }; - IContext *ctx{ nullptr }; + ObjectType type{ObjectType::Invalid}; + IContext *ctx{nullptr}; }; } // namespace detail } // namespace arm_compute diff --git a/src/common/utils/Utils.h b/src/common/utils/Utils.h index 1bd1c7ec5726525334007fb5df243ab2eca0f675..33fe6c0e816e6a40f1479522194b126da6105f39 100644 --- a/src/common/utils/Utils.h +++ b/src/common/utils/Utils.h @@ -74,10 +74,7 @@ constexpr SE as_enum(const E val) noexcept template bool is_in(E check, std::initializer_list list) { - return std::any_of(list.begin(), list.end(), [&check](E e) - { - return check == e; - }); + return std::any_of(list.begin(), list.end(), [&check](E e) { return check == e; }); } } // namespace utils } // namespace arm_compute diff --git a/src/common/utils/Validate.h b/src/common/utils/Validate.h index 4e8807273abe273d7c641971a3d5ed8dc8e898d5..97819c619f4d11358ec4a5c2a0bf7229f887511f 100644 --- a/src/common/utils/Validate.h +++ b/src/common/utils/Validate.h @@ -29,7 +29,7 @@ #include -#define ARM_COMPUTE_ASSERT(cond) assert(cond) +#define ARM_COMPUTE_ASSERT(cond) assert(cond) #define ARM_COMPUTE_ASSERT_NOT_NULLPTR(ptr) assert((ptr) != nullptr) #else /* defined(ARM_COMPUTE_ASSERTS_ENABLED) */ diff --git a/src/core/AccessWindowAutoPadding.cpp b/src/core/AccessWindowAutoPadding.cpp index ca2f7d238fbdf21c25cc1bcb3e8d05721146667a..52be6990ab911160f6d1b2b31d3c14a4b5c4f773 100644 --- a/src/core/AccessWindowAutoPadding.cpp +++ b/src/core/AccessWindowAutoPadding.cpp @@ -28,12 +28,14 @@ using namespace arm_compute; -AccessWindowAutoPadding::AccessWindowAutoPadding(ITensorInfo *info) - : _info(info) +AccessWindowAutoPadding::AccessWindowAutoPadding(ITensorInfo *info) : _info(info) { } -ValidRegion AccessWindowAutoPadding::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const +ValidRegion AccessWindowAutoPadding::compute_valid_region(const Window &window, + ValidRegion input_valid_region, + bool border_undefined, + BorderSize border_size) const { ARM_COMPUTE_UNUSED(window); ARM_COMPUTE_UNUSED(input_valid_region); @@ -45,17 +47,17 @@ ValidRegion AccessWindowAutoPadding::compute_valid_region(const Window &window, ValidRegion AccessWindowAutoPadding::compute_valid_region() const { - if(_info == nullptr) + if (_info == nullptr) { return 
ValidRegion{}; } - return ValidRegion{ Coordinates(), _info->tensor_shape() }; + return ValidRegion{Coordinates(), _info->tensor_shape()}; } void AccessWindowAutoPadding::set_valid_region() { - if(_info == nullptr) + if (_info == nullptr) { return; } @@ -75,7 +77,7 @@ bool AccessWindowAutoPadding::update_padding_if_needed(const Window &window) ARM_COMPUTE_UNUSED(window); // Only update the padding if the tensor allows it - if(_info == nullptr || !_info->is_resizable()) + if (_info == nullptr || !_info->is_resizable()) { return false; } diff --git a/src/core/AccessWindowAutoPadding.h b/src/core/AccessWindowAutoPadding.h index b8d15086799bb9a1368b944a78cdd9b551bddedb..406bdba0d8badd622276e6583afed45db4784b16 100644 --- a/src/core/AccessWindowAutoPadding.h +++ b/src/core/AccessWindowAutoPadding.h @@ -74,9 +74,12 @@ public: ValidRegion compute_valid_region() const; // Inherited methods overridden: - bool update_window_if_needed(Window &window) const override; - bool update_padding_if_needed(const Window &window) override; - ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override; + bool update_window_if_needed(Window &window) const override; + bool update_padding_if_needed(const Window &window) override; + ValidRegion compute_valid_region(const Window &window, + ValidRegion input_valid_region, + bool border_undefined, + BorderSize border_size) const override; private: ITensorInfo *_info; diff --git a/src/core/AccessWindowStatic.cpp b/src/core/AccessWindowStatic.cpp index 0607011bc5ebfd7c8f1a91a3ec401b8e80f1f2e1..98182b12022ad776629a00eb78b8e1fd9e1588bf 100644 --- a/src/core/AccessWindowStatic.cpp +++ b/src/core/AccessWindowStatic.cpp @@ -34,7 +34,10 @@ AccessWindowStatic::AccessWindowStatic(ITensorInfo *info, int start_x, int start { } -ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const +ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, + ValidRegion input_valid_region, + bool border_undefined, + BorderSize border_size) const { ARM_COMPUTE_UNUSED(border_undefined); ARM_COMPUTE_UNUSED(border_size); @@ -44,7 +47,7 @@ ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, Valid ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, ValidRegion input_valid_region) const { - if(_info == nullptr) + if (_info == nullptr) { return input_valid_region; } @@ -57,7 +60,7 @@ ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, Valid // Start of the valid region is equal to the start of the static access but // never outside of the tensor. anchor.set(0, std::max(0, _start_x)); - if(_info->num_dimensions() > 1) + if (_info->num_dimensions() > 1) { anchor.set(1, std::max(0, _start_y)); } @@ -65,7 +68,7 @@ ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, Valid // End of the valid region is equal to the end of the static access but // never outside of the tensor. 
shape.set(0, std::min(_end_x, _info->tensor_shape()[0])); - if(_info->num_dimensions() > 1) + if (_info->num_dimensions() > 1) { shape.set(1, std::min(_end_y, _info->tensor_shape()[1])); } @@ -75,7 +78,7 @@ ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, Valid void AccessWindowStatic::set_valid_region(const Window &window, const ValidRegion &input_valid_region) { - if(_info != nullptr) + if (_info != nullptr) { _info->set_valid_region(compute_valid_region(window, input_valid_region)); } @@ -84,7 +87,7 @@ void AccessWindowStatic::set_valid_region(const Window &window, const ValidRegio bool AccessWindowStatic::update_window_if_needed(Window &window) const { // If the padding is not enough and the tensor is not resizable, shrink the window to size 0 - if(_info == nullptr || _info->is_resizable()) + if (_info == nullptr || _info->is_resizable()) { return false; } @@ -96,48 +99,50 @@ bool AccessWindowStatic::update_window_if_needed(Window &window) const bool window_modified = false; // Calculate if padding is enough - if(_start_y < 0) + if (_start_y < 0) { const int front_pad_y_available = -static_cast(offset_first_element / strides[1]); - if(_start_y < front_pad_y_available) + if (_start_y < front_pad_y_available) { window_modified = true; } } - if(!window_modified) + if (!window_modified) { - if(_end_y > static_cast(shape[1])) + if (_end_y > static_cast(shape[1])) { const int stride_z = _info->num_dimensions() > 2 ? strides[2] : _info->total_size(); const int tail_pad_y_available = (stride_z / strides[1]) - shape[1]; - if(static_cast(shape[1]) + tail_pad_y_available < _end_y) + if (static_cast(shape[1]) + tail_pad_y_available < _end_y) { window_modified = true; } } - if(!window_modified) + if (!window_modified) { const int stride_y = _info->num_dimensions() > 1 ? 
strides[1] : _info->total_size(); - if(_start_x < 0) + if (_start_x < 0) { - const int front_pad_x_available = -std::min(static_cast(offset_first_element), stride_y - shape[0] * strides[0]) / static_cast(strides[0]); + const int front_pad_x_available = + -std::min(static_cast(offset_first_element), stride_y - shape[0] * strides[0]) / + static_cast(strides[0]); - if(_start_x < front_pad_x_available) + if (_start_x < front_pad_x_available) { window_modified = true; } } - if(!window_modified && _end_x > static_cast(shape[0])) + if (!window_modified && _end_x > static_cast(shape[0])) { const int tail_pad_x_available = (stride_y / strides[0]) - shape[0]; - if(static_cast(shape[0]) + tail_pad_x_available < _end_x) + if (static_cast(shape[0]) + tail_pad_x_available < _end_x) { window_modified = true; } @@ -146,9 +151,9 @@ bool AccessWindowStatic::update_window_if_needed(Window &window) const } // If padding is not enough - if(window_modified) + if (window_modified) { - for(size_t i = 0; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = 0; i < Coordinates::num_max_dimensions; ++i) { window.set(i, Window::Dimension(0, 0, 1)); } @@ -162,7 +167,7 @@ bool AccessWindowStatic::update_padding_if_needed(const Window &window) ARM_COMPUTE_UNUSED(window); // Only update the padding if the tensor allows it - if(_info == nullptr || !_info->is_resizable()) + if (_info == nullptr || !_info->is_resizable()) { return false; } diff --git a/src/core/AccessWindowStatic.h b/src/core/AccessWindowStatic.h index f7d43cbb55197c2ec4a6e09e1508f4e9a0aa4614..5c6d2c7db07519a2429daab8e62f50664ca73b62 100644 --- a/src/core/AccessWindowStatic.h +++ b/src/core/AccessWindowStatic.h @@ -86,9 +86,12 @@ public: ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region) const; // Inherited methods overriden: - bool update_window_if_needed(Window &window) const override; - bool update_padding_if_needed(const Window &window) override; - ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override; + bool update_window_if_needed(Window &window) const override; + bool update_padding_if_needed(const Window &window) override; + ValidRegion compute_valid_region(const Window &window, + ValidRegion input_valid_region, + bool border_undefined, + BorderSize border_size) const override; private: ITensorInfo *_info; diff --git a/src/core/AccessWindowTranspose.cpp b/src/core/AccessWindowTranspose.cpp index d8bd4c4de14ab7df83ad2a74f0d6f32cbf547567..42f0081c14b1142c419b6f0e2b3251d7d310d2c6 100644 --- a/src/core/AccessWindowTranspose.cpp +++ b/src/core/AccessWindowTranspose.cpp @@ -29,9 +29,12 @@ using namespace arm_compute; -ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const +ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, + ValidRegion input_valid_region, + bool border_undefined, + BorderSize border_size) const { - if(_info == nullptr) + if (_info == nullptr) { return input_valid_region; } @@ -41,7 +44,7 @@ ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, Va Coordinates old_anchor(anchor); TensorShape old_shape(shape); - if(!border_undefined) + if (!border_undefined) { border_size = BorderSize(0); } @@ -53,7 +56,7 @@ ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, Va // the kernel to write back output values. 
// As the relation between input and output is transposed window.y() is // used for x anchor and window.x() for y anchor. - if(_info->dimension(0) > 1) + if (_info->dimension(0) > 1) { anchor.set(0, std::max(window.y().start() * _scale_x, anchor[1] + border_size.top) + _x); } @@ -69,15 +72,19 @@ ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, Va // a size of the region. // As the relation between input and output is transposed window.y() is // used for x shape and window.x() for y shape. - if(_info->dimension(0) > 1) + if (_info->dimension(0) > 1) { - shape.set(0, std::min((old_anchor[1] + old_shape[0]) * _scale_x - border_size.right, (window.y().end() - window.y().step()) * _scale_x + _width) - anchor[0]); + shape.set(0, std::min((old_anchor[1] + old_shape[0]) * _scale_x - border_size.right, + (window.y().end() - window.y().step()) * _scale_x + _width) - + anchor[0]); } - shape.set(1, std::min((old_anchor[0] + old_shape[1]) * _scale_y - border_size.bottom, (window.x().end() - window.x().step()) * _scale_y + _height) - anchor[1]); + shape.set(1, std::min((old_anchor[0] + old_shape[1]) * _scale_y - border_size.bottom, + (window.x().end() - window.x().step()) * _scale_y + _height) - + anchor[1]); // For higher dimensions use the intersection of the window size and the // valid region of the input - for(size_t d = 2; d < _info->num_dimensions(); ++d) + for (size_t d = 2; d < _info->num_dimensions(); ++d) { anchor.set(d, std::max(window[d].start(), input_valid_region.anchor[d])); shape.set(d, std::min(window[d].end(), input_valid_region.shape[d]) - anchor[d]); @@ -89,7 +96,7 @@ ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, Va bool AccessWindowTranspose::update_window_if_needed(Window &window) const { // Only update the window size if we can't use padding - if(_info == nullptr || _info->is_resizable()) + if (_info == nullptr || _info->is_resizable()) { return false; } @@ -107,12 +114,12 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const const int max_y = window.x().end() * _scale_y + _y; // Adjust window start for output's Y dimension (so X in (input) window) - if(min_y < 0) + if (min_y < 0) { // Calculate rows available above the tensor const int front_pad_y_available = -offset_first_element / strides[1]; - if(min_y < front_pad_y_available) + if (min_y < front_pad_y_available) { // Not enough padding available, need to shrink the window const int start = adjust_up(min_y, front_pad_y_available, window.x().step() * _scale_y) - _y; @@ -126,17 +133,18 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const } // Adjust window end for Y dimension - if(max_y > static_cast(shape[1])) + if (max_y > static_cast(shape[1])) { const int stride_z = _info->num_dimensions() > 2 ? 
strides[2] : _info->total_size(); // Calculate rows available below the tensor const int tail_pad_y_available = (stride_z / strides[1]) - shape[1] - front_pad_y; - if(static_cast(shape[1]) + tail_pad_y_available < max_y) + if (static_cast(shape[1]) + tail_pad_y_available < max_y) { // Not enough padding available, need to shrink the window - const int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.x().step() * _scale_y) + window.x().step() * _scale_y - _y - _height; + const int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.x().step() * _scale_y) + + window.x().step() * _scale_y - _y - _height; window.set(0, Window::Dimension(window.x().start(), end / _scale_y, window.x().step())); window_modified = true; } @@ -151,11 +159,14 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const const int stride_y = _info->num_dimensions() > 1 ? strides[1] : _info->total_size(); // Adjust window start for X dimension - if(min_x < 0) + if (min_x < 0) { - const int front_pad_x_available = -std::min(static_cast(offset_first_element) - front_pad_y * strides[1], stride_y - shape[0] * strides[0]) / static_cast(strides[0]); + const int front_pad_x_available = + -std::min(static_cast(offset_first_element) - front_pad_y * strides[1], + stride_y - shape[0] * strides[0]) / + static_cast(strides[0]); - if(min_x < front_pad_x_available) + if (min_x < front_pad_x_available) { // Not enough padding available, need to shrink the window const int start = adjust_up(min_x, front_pad_x_available, window.y().step() * _scale_x) - _x; @@ -168,14 +179,15 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const } // Adjust window end for X dimension - if(max_x > static_cast(shape[0])) + if (max_x > static_cast(shape[0])) { const int tail_pad_x_available = (stride_y / strides[0]) - shape[0] - front_pad_x; - if(static_cast(shape[0]) + tail_pad_x_available < max_x) + if (static_cast(shape[0]) + tail_pad_x_available < max_x) { // Not enough padding available, need to shrink the window - const int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.y().step() * _scale_x) + window.y().step() * _scale_x - _x - _width; + const int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.y().step() * _scale_x) + + window.y().step() * _scale_x - _x - _width; window.set(1, Window::Dimension(window.y().start(), end / _scale_x, window.y().step())); window_modified = true; } @@ -189,7 +201,7 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const bool AccessWindowTranspose::update_padding_if_needed(const Window &window) { // Only update the padding if the tensor allows it - if(_info == nullptr || !_info->is_resizable()) + if (_info == nullptr || !_info->is_resizable()) { return false; } diff --git a/src/core/AccessWindowTranspose.h b/src/core/AccessWindowTranspose.h index 0306076d6ea236169f7670c3801a411801f25c51..12bb9a535b9c59499b6f0fa6eb1739ecab62b44c 100644 --- a/src/core/AccessWindowTranspose.h +++ b/src/core/AccessWindowTranspose.h @@ -42,7 +42,10 @@ public: bool update_window_if_needed(Window &window) const override; bool update_padding_if_needed(const Window &window) override; using AccessWindowRectangle::compute_valid_region; - ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override; + ValidRegion compute_valid_region(const Window &window, + ValidRegion input_valid_region, + bool border_undefined, + BorderSize border_size) 
const override; }; } // namespace arm_compute #endif /*ARM_COMPUTE_IACCESS_WINDOW_TRANSPOSE_H*/ diff --git a/src/core/CL/CLCommandBuffer.cpp b/src/core/CL/CLCommandBuffer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d094dcdaea2326848779feba61edbeeb62b82997 --- /dev/null +++ b/src/core/CL/CLCommandBuffer.cpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/CL/CLCommandBuffer.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" + +#include "src/core/CL/CLCompatCommandBuffer.h" +#include "src/core/CL/CLMutableCommandBuffer.h" + +namespace arm_compute +{ + +std::unique_ptr CLCommandBuffer::create(cl_command_queue queue) +{ + const auto &cl_device = CLKernelLibrary::get().get_device(); + const auto has_mutable_dispatch = command_buffer_mutable_dispatch_supported(cl_device); + + if (has_mutable_dispatch) + { + return std::make_unique(queue); + } + else + { + return std::make_unique(queue); + } +} + +CLCommandBuffer::CLCommandBuffer() = default; +CLCommandBuffer::~CLCommandBuffer() = default; + +CLCommandBuffer::State CLCommandBuffer::state() const +{ + return _state; +} + +CLCommandBuffer &CLCommandBuffer::state(CLCommandBuffer::State state) +{ + _state = state; + + return *this; +} + +} // namespace arm_compute diff --git a/src/core/CL/CLCommandBuffer.h b/src/core/CL/CLCommandBuffer.h new file mode 100644 index 0000000000000000000000000000000000000000..90e434161edc1a6b07ea7d265e702d9ef14713d1 --- /dev/null +++ b/src/core/CL/CLCommandBuffer.h @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
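Reviewer note: the create() factory above resolves the implementation purely from the device's extension list. A minimal caller-side sketch, assuming CLKernelLibrary has already been initialised, that 'queue' is a valid cl_command_queue, and that the two capability helpers added later in this patch are declared in arm_compute/core/CL/CLHelpers.h:

    #include <memory>

    #include "arm_compute/core/CL/CLHelpers.h"
    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/core/Error.h"
    #include "src/core/CL/CLCommandBuffer.h"

    std::unique_ptr<arm_compute::CLCommandBuffer> make_command_buffer(cl_command_queue queue)
    {
        const cl::Device &device = arm_compute::CLKernelLibrary::get().get_device();

        // Capability queries added by this patch (see CLHelpers.cpp further down).
        const bool has_cmd_buffer       = arm_compute::command_buffer_supported(device);
        const bool has_mutable_dispatch = arm_compute::command_buffer_mutable_dispatch_supported(device);
        ARM_COMPUTE_UNUSED(has_cmd_buffer, has_mutable_dispatch);

        // create() repeats the mutable-dispatch check internally and returns either
        // CLMutableCommandBuffer or the CLCompatCommandBuffer fallback.
        return arm_compute::CLCommandBuffer::create(queue);
    }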
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ACL_SRC_CORE_CL_CLCOMMANDBUFFER_H +#define ACL_SRC_CORE_CL_CLCOMMANDBUFFER_H + +#include "arm_compute/core/CL/OpenCL.h" + +#include +#include +#include + +namespace arm_compute +{ + +/** Command buffer contains a list of commands that is constructed once and later enqueued multiple times. + * + * To prepare a command buffer: + * - Construct a new command buffer targeting a command queue using @ref CLCommandBuffer::create. + * - Add kernel enqueue command to the buffer using @ref CLCommandBuffer::add_kernel. + * The kernel must be ready to be enqueued with all the arguments set. + * - Specify which kernel argument is mutable after the command buffer has been finalized. + * - When all the kernel enqueue commands have been added, call @ref CLCommandBuffer::finalize. + * After this point the command buffer is ready to be executed. + * + * To execute the command buffer: + * - Make any changes in the value which the mutable arguments are pointing to. + * - Call @ref CLCommandBuffer::update to apply the argument value changes. + * - Call @ref CLCommandBuffer::enqueue to enqueue the command buffer to execute. + */ +class CLCommandBuffer +{ +public: + /** Create a new command buffer targeting the specified command queue. + * + * @param[in] queue The command queue to execute the command buffer. + * + * @return A unique pointer to the newly created command buffer. + */ + static std::unique_ptr create(cl_command_queue queue); + + /** Constructor. */ + CLCommandBuffer(); + + /** Destructor. */ + virtual ~CLCommandBuffer(); + + /** Disallow copy constructor. */ + CLCommandBuffer(const CLCommandBuffer &) = delete; + + /** Disallow copy assignment. */ + CLCommandBuffer &operator=(const CLCommandBuffer &) = delete; + + /** Disallow move constructor. */ + CLCommandBuffer(CLCommandBuffer &&other) = delete; + + /** Disallow move assignment. */ + CLCommandBuffer &operator=(CLCommandBuffer &&other) = delete; + + /** Add a kernel enqueue command to the command queue. + * + * This function must be called before the command buffer has been finalized. + * + * @param[in] kernel The CL kernel. + * @param[in] offset The global work offset. + * @param[in] global The global work size. + * @param[in] local The local work size. + */ + virtual void + add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) = 0; + + /** Add the mutable argument to the current kernel enqueue command. + * + * This function must be called after @ref CLCommandBuffer::add_kernel but before the command buffer + * has been finalized. + * + * The pointer must be valid and it must point to the correct value at the time + * @ref CLCommandBuffer::update is called so that the value of the argument + * can be applied successfully to the kernel enqueue command. + * + * @param[in] arg_idx The index of the argument in the current kernel program. + * @param[in] value The pointer to the value of the argument. 
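Reviewer note: the class comment above explains the record-then-replay workflow; a condensed usage sketch, assuming 'kernel' is a cl_kernel with all arguments already set, 'queue' is a valid queue, and the NDRanges and argument index 3 are illustrative values only:

    #include "src/core/CL/CLCommandBuffer.h"

    void record_and_replay(cl_command_queue queue, cl_kernel kernel)
    {
        auto cb = arm_compute::CLCommandBuffer::create(queue);

        float scale = 1.0f; // storage the mutable argument keeps pointing at

        cb->add_kernel(kernel, cl::NullRange, cl::NDRange(256), cl::NullRange);
        cb->add_mutable_argument(3, &scale); // hypothetical argument index
        cb->finalize();                      // no further recording after this point

        for (int i = 0; i < 4; ++i)
        {
            scale = static_cast<float>(i); // change the pointed-to value...
            cb->update();                  // ...apply it to the command buffer...
            cb->enqueue();                 // ...and replay the recorded kernels
        }
    }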
+ */ + template ::value || std::is_pointer::value>> + void add_mutable_argument(cl_uint arg_idx, const T *value) + { + add_mutable_argument_generic(arg_idx, value, sizeof(T)); + } + + /** Finalize the command buffer. */ + virtual void finalize() = 0; + + /** Update the command buffer with new kernel argument values. + * + * This function must be called after the command buffer has been finalized. + * + * All the value pointed by the mutable argument will be applied to the command buffer. + */ + virtual void update() = 0; + + /** Enqueue the command buffer. + * + * This function must be called after the command buffer has been finalized. + */ + virtual void enqueue() = 0; + + /** Check if the command buffer has been finalized. + * + * @return true if the command buffer has been finalized. + */ + virtual bool is_finalized() const = 0; + +protected: + /** Add the mutable argument to the current kernel enqueue command. + * + * @see CLCommandBuffer::add_mutable_argument for more information. + */ + virtual void add_mutable_argument_generic(cl_uint arg_idx, const void *value, size_t size) = 0; + + /** The state of the command buffer. */ + enum class State : int32_t + { + /** The command buffer has been created and is being specified. */ + Created, + + /** The command buffer has been finalized and is ready to be executed. */ + Finalized, + }; + + /** Get the state of the command buffer. */ + State state() const; + + /** Set the state of the command buffer. */ + CLCommandBuffer &state(State state); + +private: + State _state{State::Created}; +}; + +} // namespace arm_compute + +#endif // ACL_SRC_CORE_CL_CLCOMMANDBUFFER_H diff --git a/src/core/CL/CLCompatCommandBuffer.cpp b/src/core/CL/CLCompatCommandBuffer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..242fd7719cb18de600f50ef4555bdaff217c7dac --- /dev/null +++ b/src/core/CL/CLCompatCommandBuffer.cpp @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/core/CL/CLCompatCommandBuffer.h" + +#include "arm_compute/core/Error.h" + +#include "src/core/CL/CLUtils.h" + +namespace arm_compute +{ + +CLCompatCommandBuffer::CLCompatCommandBuffer(cl_command_queue queue) : _queue(queue) +{ +} + +CLCompatCommandBuffer::~CLCompatCommandBuffer() +{ +} + +void CLCompatCommandBuffer::add_kernel(cl_kernel kernel, + const cl::NDRange &offset, + const cl::NDRange &global, + const cl::NDRange &local) +{ + ARM_COMPUTE_ERROR_ON(state() != State::Created); + + _kernel_cmds.push_back(KernelCommand{kernel, offset, global, local, {}}); +} + +void CLCompatCommandBuffer::add_mutable_argument_generic(cl_uint arg_idx, const void *value, size_t size) +{ + ARM_COMPUTE_ERROR_ON(state() != State::Created); + ARM_COMPUTE_ERROR_ON(_kernel_cmds.empty()); + + _kernel_cmds.back().mutable_args.push_back(cl_mutable_dispatch_arg_khr{arg_idx, size, value}); +} + +void CLCompatCommandBuffer::finalize() +{ + ARM_COMPUTE_ERROR_ON(state() != State::Created); + + _kernel_cmds.shrink_to_fit(); + + for (auto &cmd : _kernel_cmds) + { + cmd.mutable_args.shrink_to_fit(); + } + + state(State::Finalized); +} + +void CLCompatCommandBuffer::update() +{ + ARM_COMPUTE_ERROR_ON(state() != State::Finalized); + + // Nothing to do here - The kernel arguments will be updated when each command is enqueued. +} + +void CLCompatCommandBuffer::enqueue() +{ + ARM_COMPUTE_ERROR_ON(state() != State::Finalized); + + for (const auto &cmd : _kernel_cmds) + { + for (const auto &arg : cmd.mutable_args) + { + const auto error = clSetKernelArg(cmd.kernel, arg.arg_index, arg.arg_size, arg.arg_value); + + handle_cl_error("clSetKernelArg", error); + } + + const auto error = + clEnqueueNDRangeKernel(_queue, cmd.kernel, static_cast(cmd.global.dimensions()), + cmd.offset.dimensions() != 0 ? cmd.offset.get() : nullptr, cmd.global.get(), + cmd.local.dimensions() != 0 ? cmd.local.get() : nullptr, 0, nullptr, nullptr); + + handle_cl_error("clEnqueueNDRangeKernel", error); + } +} + +bool CLCompatCommandBuffer::is_finalized() const +{ + return state() == State::Finalized; +} + +} // namespace arm_compute diff --git a/src/core/CL/CLCompatCommandBuffer.h b/src/core/CL/CLCompatCommandBuffer.h new file mode 100644 index 0000000000000000000000000000000000000000..d5df106425c2550e49fa671497bb7f5bc437abe2 --- /dev/null +++ b/src/core/CL/CLCompatCommandBuffer.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ACL_SRC_CORE_CL_CLCOMPATCOMMANDBUFFER_H +#define ACL_SRC_CORE_CL_CLCOMPATCOMMANDBUFFER_H + +#include "src/core/CL/CLCommandBuffer.h" + +#include + +namespace arm_compute +{ + +/** Command buffer implementation for platform without mutable dispatch command buffer extension. */ +class CLCompatCommandBuffer final : public CLCommandBuffer +{ +public: + /** Create a new command buffer targeting the specified command queue. + * + * @param[in] queue The command queue to execute the command buffer. + */ + CLCompatCommandBuffer(cl_command_queue queue); + + /** Destructor. */ + virtual ~CLCompatCommandBuffer(); + + /** Disallow copy constructor. */ + CLCompatCommandBuffer(const CLCompatCommandBuffer &) = delete; + + /** Disallow copy assignment. */ + CLCompatCommandBuffer &operator=(const CLCompatCommandBuffer &) = delete; + + /** Disallow move constructor. */ + CLCompatCommandBuffer(CLCompatCommandBuffer &&) = delete; + + /** Disallow move assignment. */ + CLCompatCommandBuffer &operator=(CLCompatCommandBuffer &&) = delete; + + void add_kernel(cl_kernel kernel, + const cl::NDRange &offset, + const cl::NDRange &global, + const cl::NDRange &local) override; + + void finalize() override; + + void update() override; + + void enqueue() override; + + bool is_finalized() const override; + +protected: + void add_mutable_argument_generic(cl_uint arg_idx, const void *value, size_t size) override; + +private: + struct KernelCommand + { + cl_kernel kernel; + cl::NDRange offset; + cl::NDRange global; + cl::NDRange local; + + std::vector mutable_args; + }; + +private: + cl_command_queue _queue{}; + std::vector _kernel_cmds{}; +}; + +} // namespace arm_compute + +#endif // ACL_SRC_CORE_CL_CLCOMPATCOMMANDBUFFER_H diff --git a/src/core/CL/CLCompileContext.cpp b/src/core/CL/CLCompileContext.cpp index 2d024f9c2f76c114473e0c88711a18622d69a4cd..9bbc32657ec859154f22996c896469e3a4985c26 100644 --- a/src/core/CL/CLCompileContext.cpp +++ b/src/core/CL/CLCompileContext.cpp @@ -22,19 +22,19 @@ * SOFTWARE. 
*/ #include "arm_compute/core/CL/CLCompileContext.h" -#include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Utils.h" + #include "support/StringSupport.h" #include namespace arm_compute { -CLBuildOptions::CLBuildOptions() - : _build_opts() +CLBuildOptions::CLBuildOptions() : _build_opts() { } @@ -45,7 +45,7 @@ void CLBuildOptions::add_option(std::string option) void CLBuildOptions::add_option_if(bool cond, std::string option) { - if(cond) + if (cond) { add_option(std::move(option)); } @@ -63,7 +63,7 @@ void CLBuildOptions::add_options(const StringSet &options) void CLBuildOptions::add_options_if(bool cond, const StringSet &options) { - if(cond) + if (cond) { add_options(options); } @@ -79,26 +79,35 @@ bool CLBuildOptions::operator==(const CLBuildOptions &other) const return _build_opts == other._build_opts; } -Program::Program() - : _context(), _device(), _is_binary(false), _name(), _source(), _binary() +Program::Program() : _context(), _device(), _is_binary(false), _name(), _source(), _binary() { } Program::Program(cl::Context context, std::string name, std::string source) - : _context(std::move(context)), _device(), _is_binary(false), _name(std::move(name)), _source(std::move(source)), _binary() + : _context(std::move(context)), + _device(), + _is_binary(false), + _name(std::move(name)), + _source(std::move(source)), + _binary() { } Program::Program(cl::Context context, cl::Device device, std::string name, std::vector binary) - : _context(std::move(context)), _device(std::move(device)), _is_binary(true), _name(std::move(name)), _source(), _binary(std::move(binary)) + : _context(std::move(context)), + _device(std::move(device)), + _is_binary(true), + _name(std::move(name)), + _source(), + _binary(std::move(binary)) { } Program::operator cl::Program() const { - if(_is_binary) + if (_is_binary) { - return cl::Program(_context, { _device }, { _binary }); + return cl::Program(_context, {_device}, {_binary}); } else { @@ -112,12 +121,12 @@ bool Program::build(const cl::Program &program, const std::string &build_options { return program.build(build_options.c_str()) == CL_SUCCESS; } - catch(const cl::Error &e) + catch (const cl::Error &e) { cl_int err = CL_SUCCESS; const auto build_info = program.getBuildInfo(&err); - for(auto &pair : build_info) + for (auto &pair : build_info) { std::cerr << pair.second << std::endl; } @@ -133,14 +142,12 @@ cl::Program Program::build(const std::string &build_options) const return cl_program; } -Kernel::Kernel() - : _name(), _kernel() +Kernel::Kernel() : _name(), _kernel() { } Kernel::Kernel(std::string name, const cl::Program &program) - : _name(std::move(name)), - _kernel(cl::Kernel(program, _name.c_str())) + : _name(std::move(name)), _kernel(cl::Kernel(program, _name.c_str())) { } CLCompileContext::CLCompileContext() @@ -156,15 +163,19 @@ CLCompileContext::CLCompileContext(cl::Context context, const cl::Device &device _is_wbsm_supported = get_wbsm_support_info(device); } -Kernel CLCompileContext::create_kernel(const std::string &kernel_name, const std::string &program_name, const std::string &program_source, - const std::string &kernel_path, const StringSet &build_options_set, bool is_binary) const +Kernel CLCompileContext::create_kernel(const std::string &kernel_name, + const std::string &program_name, + const std::string &program_source, + const std::string &kernel_path, + const StringSet &build_options_set, + bool is_binary) 
const { const std::string build_options = generate_build_options(build_options_set, kernel_path); const std::string built_program_name = program_name + "_" + build_options; auto built_program_it = _built_programs_map.find(built_program_name); cl::Program cl_program; - if(_built_programs_map.end() != built_program_it) + if (_built_programs_map.end() != built_program_it) { // If program has been built, retrieve to create kernel from it cl_program = built_program_it->second; @@ -184,11 +195,12 @@ Kernel CLCompileContext::create_kernel(const std::string &kernel_name, const std return Kernel(kernel_name, cl_program); } -const Program &CLCompileContext::load_program(const std::string &program_name, const std::string &program_source, bool is_binary) const +const Program & +CLCompileContext::load_program(const std::string &program_name, const std::string &program_source, bool is_binary) const { const auto program_it = _programs_map.find(program_name); - if(program_it != _programs_map.end()) + if (program_it != _programs_map.end()) { return program_it->second; } @@ -199,9 +211,10 @@ const Program &CLCompileContext::load_program(const std::string &program_name, c ARM_COMPUTE_UNUSED(is_binary); program = Program(_context, program_name, program_source); #else /* EMBEDDED_KERNELS */ - if(is_binary) + if (is_binary) { - program = Program(_context, _device.cl_device(), program_name, std::vector(program_source.begin(), program_source.end())); + program = Program(_context, _device.cl_device(), program_name, + std::vector(program_source.begin(), program_source.end())); } else { @@ -218,18 +231,19 @@ const Program &CLCompileContext::load_program(const std::string &program_name, c void CLCompileContext::set_context(cl::Context context) { _context = std::move(context); - if(_context.get() != nullptr) + if (_context.get() != nullptr) { const auto cl_devices = _context.getInfo(); - if(!cl_devices.empty()) + if (!cl_devices.empty()) { _device = CLDevice(cl_devices[0]); } } } -std::string CLCompileContext::generate_build_options(const StringSet &build_options_set, const std::string &kernel_path) const +std::string CLCompileContext::generate_build_options(const StringSet &build_options_set, + const std::string &kernel_path) const { std::string concat_str; bool ext_supported = false; @@ -241,27 +255,27 @@ std::string CLCompileContext::generate_build_options(const StringSet &build_opti #endif // defined(ARM_COMPUTE_DEBUG_ENABLED) GPUTarget gpu_arch = get_arch_from_target(_device.target()); - concat_str += " -DGPU_ARCH=" + support::cpp11::to_string( - static_cast::type>(gpu_arch)); + concat_str += + " -DGPU_ARCH=" + support::cpp11::to_string(static_cast::type>(gpu_arch)); - if(_device.supported("cl_khr_fp16")) + if (_device.supported("cl_khr_fp16")) { concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 "; } - if(_device.supported("cl_arm_integer_dot_product_int8") || _device.supported("cl_khr_integer_dot_product")) + if (_device.supported("cl_arm_integer_dot_product_int8") || _device.supported("cl_khr_integer_dot_product")) { concat_str += " -DARM_COMPUTE_OPENCL_DOT8_ENABLED=1 "; } - if(_device.supported("cl_arm_integer_dot_product_accumulate_int8")) + if (_device.supported("cl_arm_integer_dot_product_accumulate_int8")) { concat_str += " -DARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED=1 "; } std::tie(ext_supported, ext_buildopts) = _device.is_non_uniform_workgroup_supported(); - if(ext_supported) + if (ext_supported) { concat_str += ext_buildopts; } @@ -270,7 +284,7 @@ std::string CLCompileContext::generate_build_options(const 
StringSet &build_opti ARM_COMPUTE_ERROR("Non uniform workgroup size is not supported!!"); } - if(gpu_arch != GPUTarget::UNKNOWN && gpu_arch != GPUTarget::MIDGARD && get_ddk_version() >= 11) + if (gpu_arch != GPUTarget::UNKNOWN && gpu_arch != GPUTarget::MIDGARD && get_ddk_version() >= 11) { concat_str += " -DUNROLL_WITH_PRAGMA "; } @@ -295,7 +309,7 @@ std::string CLCompileContext::stringify_set(const StringSet &s, const std::strin #endif /* EMBEDDED_KERNELS */ // Concatenate set - for(const auto &el : s) + for (const auto &el : s) { concat_set += " " + el; } @@ -340,7 +354,7 @@ cl::NDRange CLCompileContext::default_ndrange() const GPUTarget _target = get_target_from_device(_device.cl_device()); cl::NDRange default_range; - switch(_target) + switch (_target) { case GPUTarget::MIDGARD: case GPUTarget::T600: @@ -370,7 +384,8 @@ size_t CLCompileContext::max_local_workgroup_size(const cl::Kernel &kernel) cons size_t result; size_t err = kernel.getWorkGroupInfo(_device.cl_device(), CL_KERNEL_WORK_GROUP_SIZE, &result); - ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel"); + ARM_COMPUTE_ERROR_ON_MSG(err != 0, + "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel"); ARM_COMPUTE_UNUSED(err); return result; @@ -392,7 +407,7 @@ int32_t CLCompileContext::get_ddk_version() const const std::regex ddk_regex("r([0-9]*)p[0-9]"); std::smatch ddk_match; - if(std::regex_search(device_version, ddk_match, ddk_regex)) + if (std::regex_search(device_version, ddk_match, ddk_regex)) { return std::stoi(ddk_match[1]); } diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp index a32bcca65522f336685d5ef4a70ef1e8f6619a7a..5ea99d360a6dedaa830c6efb2ce8e8769e8be45d 100644 --- a/src/core/CL/CLHelpers.cpp +++ b/src/core/CL/CLHelpers.cpp @@ -22,14 +22,15 @@ * SOFTWARE. 
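Reviewer note: generate_build_options() above derives compile definitions from device capabilities by string concatenation; the same pattern can also be expressed with the CLBuildOptions helper from this file. A sketch, assuming 'device' is the CLDevice wrapper used by the compile context and that CLCompileContext.h pulls in its declaration; the two defines are copied from the code above:

    #include "arm_compute/core/CL/CLCompileContext.h"

    arm_compute::CLBuildOptions make_fp16_dot8_options(const arm_compute::CLDevice &device)
    {
        arm_compute::CLBuildOptions opts;
        // Only emit a define when the device reports the corresponding extension.
        opts.add_option_if(device.supported("cl_khr_fp16"), "-DARM_COMPUTE_OPENCL_FP16_ENABLED=1");
        opts.add_option_if(device.supported("cl_arm_integer_dot_product_int8") ||
                               device.supported("cl_khr_integer_dot_product"),
                           "-DARM_COMPUTE_OPENCL_DOT8_ENABLED=1");
        return opts;
    }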
*/ #include "arm_compute/core/CL/CLHelpers.h" + #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/CLTypes.h" -#include "arm_compute/core/utils/DataTypeUtils.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Log.h" #include "arm_compute/core/Types.h" -#include "src/gpu/cl/ClCompileContext.h" +#include "arm_compute/core/utils/DataTypeUtils.h" +#include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/ClKernelLibrary.h" #include @@ -39,7 +40,7 @@ namespace arm_compute { std::string get_cl_type_from_data_type(const DataType &dt) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::QASYMM8: @@ -75,7 +76,7 @@ std::string get_cl_type_from_data_type(const DataType &dt) std::string get_cl_promoted_type_from_data_type(const DataType &dt) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::QASYMM8: @@ -105,7 +106,7 @@ std::string get_cl_promoted_type_from_data_type(const DataType &dt) std::string get_cl_unsigned_type_from_element_size(size_t element_size) { - switch(element_size) + switch (element_size) { case 1: return "uchar"; @@ -123,7 +124,7 @@ std::string get_cl_unsigned_type_from_element_size(size_t element_size) std::string get_cl_signed_type_from_element_size(size_t element_size) { - switch(element_size) + switch (element_size) { case 1: return "char"; @@ -141,7 +142,7 @@ std::string get_cl_signed_type_from_element_size(size_t element_size) std::string get_cl_select_type_from_data_type(const DataType &dt) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::QASYMM8: @@ -174,7 +175,7 @@ std::string get_cl_select_type_from_data_type(const DataType &dt) std::string get_cl_dot8_acc_type_from_data_type(const DataType &dt) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::QASYMM8: @@ -192,7 +193,7 @@ std::string get_cl_dot8_acc_type_from_data_type(const DataType &dt) std::string get_data_size_from_data_type(const DataType &dt) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::S8: @@ -244,8 +245,9 @@ bool dot8_supported(const cl::Device &device) const GPUTarget gpu_target = get_target_from_name(device_name); // SW_WORKAROUND: Workaround for DDK revision r14p0.to enable cl_arm_integer_dot_product_int8 - std::set sw_workaround_issue = { GPUTarget::G76 }; - return (device_supports_extension(device, "cl_arm_integer_dot_product_int8") || sw_workaround_issue.count(gpu_target) != 0); + std::set sw_workaround_issue = {GPUTarget::G76}; + return (device_supports_extension(device, "cl_arm_integer_dot_product_int8") || + sw_workaround_issue.count(gpu_target) != 0); } bool dot8_acc_supported(const cl::Device &device) @@ -256,23 +258,23 @@ bool dot8_acc_supported(const cl::Device &device) CLVersion get_cl_version(const cl::Device &device) { std::string version_str = device.getInfo(); - if(version_str.find("OpenCL 3") != std::string::npos) + if (version_str.find("OpenCL 3") != std::string::npos) { return CLVersion::CL30; } - else if(version_str.find("OpenCL 2") != std::string::npos) + else if (version_str.find("OpenCL 2") != std::string::npos) { return CLVersion::CL20; } - else if(version_str.find("OpenCL 1.2") != std::string::npos) + else if (version_str.find("OpenCL 1.2") != std::string::npos) { return CLVersion::CL12; } - else if(version_str.find("OpenCL 1.1") != std::string::npos) + else if (version_str.find("OpenCL 1.1") != std::string::npos) { return CLVersion::CL11; } - else if(version_str.find("OpenCL 1.0") != std::string::npos) + else if (version_str.find("OpenCL 1.0") != std::string::npos) 
{ return CLVersion::CL10; } @@ -287,14 +289,15 @@ bool device_supports_extension(const cl::Device &device, const char *extension_n return (pos != std::string::npos); } -bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Size2D &kernel_size, DataLayout data_layout) +bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, + const Size2D &kernel_size, + DataLayout data_layout) { ARM_COMPUTE_ERROR_ON(data_layout == DataLayout::UNKNOWN); using WinogradConfiguration = std::pair, std::pair>; - std::vector winograd_configs_nchw = - { + std::vector winograd_configs_nchw = { WinogradConfiguration(std::pair(1, 2), std::pair(1, 3)), WinogradConfiguration(std::pair(1, 4), std::pair(1, 3)), WinogradConfiguration(std::pair(2, 1), std::pair(3, 1)), @@ -303,11 +306,9 @@ bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Si WinogradConfiguration(std::pair(4, 4), std::pair(3, 3)), WinogradConfiguration(std::pair(4, 4), std::pair(5, 5)), WinogradConfiguration(std::pair(4, 1), std::pair(5, 1)), - WinogradConfiguration(std::pair(1, 4), std::pair(1, 5)) - }; + WinogradConfiguration(std::pair(1, 4), std::pair(1, 5))}; - std::vector winograd_configs_nhwc = - { + std::vector winograd_configs_nhwc = { WinogradConfiguration(std::pair(2, 2), std::pair(3, 3)), WinogradConfiguration(std::pair(1, 4), std::pair(1, 3)), WinogradConfiguration(std::pair(4, 1), std::pair(3, 1)), @@ -324,19 +325,21 @@ bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Si std::pair(kernel_size.width, kernel_size.height)); // Return true if supported - if(data_layout == DataLayout::NCHW) + if (data_layout == DataLayout::NCHW) { - return (std::find(winograd_configs_nchw.begin(), winograd_configs_nchw.end(), p) != winograd_configs_nchw.end()); + return (std::find(winograd_configs_nchw.begin(), winograd_configs_nchw.end(), p) != + winograd_configs_nchw.end()); } else { - return (std::find(winograd_configs_nhwc.begin(), winograd_configs_nhwc.end(), p) != winograd_configs_nhwc.end()); + return (std::find(winograd_configs_nhwc.begin(), winograd_configs_nhwc.end(), p) != + winograd_configs_nhwc.end()); } } size_t preferred_vector_width(const cl::Device &device, const DataType dt) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::S8: @@ -382,7 +385,7 @@ size_t get_cl_image_pitch_alignment(const cl::Device &device) cl_int err = clGetDeviceInfo(device(), CL_DEVICE_IMAGE_PITCH_ALIGNMENT, sizeof(cl_uint), &pixel_aligment, nullptr); - if(err == CL_SUCCESS) + if (err == CL_SUCCESS) { return pixel_aligment; } @@ -396,12 +399,14 @@ bool get_cl_non_uniform_work_group_supported(const cl::Device &device) { cl_bool supported = CL_FALSE; - cl_int err = clGetDeviceInfo(device(), CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool), &supported, nullptr); + cl_int err = + clGetDeviceInfo(device(), CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool), &supported, nullptr); return (err == CL_SUCCESS && supported == CL_TRUE); } -cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_name, const std::set &build_opts) +cl::Kernel +create_kernel(const CLCompileContext &ctx, const std::string &kernel_name, const std::set &build_opts) { opencl::ClKernelLibrary &klib = opencl::ClKernelLibrary::get(); @@ -409,7 +414,8 @@ cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_ auto kernel_src = klib.program(program_name); const std::string kernel_path = klib.kernel_path(); - return 
static_cast(ctx.create_kernel(kernel_name, program_name, kernel_src.program, kernel_path, build_opts, kernel_src.is_binary)); + return static_cast(ctx.create_kernel(kernel_name, program_name, kernel_src.program, kernel_path, + build_opts, kernel_src.is_binary)); } cl::NDRange create_lws_hint_parallel_implementations(unsigned int input_dimension, unsigned int vector_size) @@ -423,8 +429,9 @@ cl::NDRange create_lws_hint_parallel_implementations(unsigned int input_dimensio bool get_wbsm_support_info(const cl::Device &device) { cl_bitfield capabilities = 0; - cl_int err = clGetDeviceInfo(device.get(), CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM, sizeof(cl_bitfield), &capabilities, nullptr); - if((err == CL_SUCCESS) && (capabilities & CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM)) + cl_int err = clGetDeviceInfo(device.get(), CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM, sizeof(cl_bitfield), + &capabilities, nullptr); + if ((err == CL_SUCCESS) && (capabilities & CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM)) { return true; } @@ -433,35 +440,33 @@ bool get_wbsm_support_info(const cl::Device &device) void set_wbsm(cl::Kernel &kernel, cl_int wbsm_hint) { - cl_int err = clSetKernelExecInfo(kernel.get(), - CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM, - sizeof(cl_int), - &wbsm_hint); + cl_int err = clSetKernelExecInfo(kernel.get(), CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM, + sizeof(cl_int), &wbsm_hint); ARM_COMPUTE_UNUSED(err); ARM_COMPUTE_ERROR_ON(err != CL_SUCCESS); } bool export_to_cl_image(const ITensorInfo *tensor) { - if(tensor->tensor_shape()[0] % 4 != 0) + if (tensor->tensor_shape()[0] % 4 != 0) { return false; } // If not floating point - if(!is_data_type_float(tensor->data_type())) + if (!is_data_type_float(tensor->data_type())) { return false; } // Check if the cl_khr_image2d_from_buffer extension is supported on the target platform - if(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device())) + if (!image2d_from_buffer_supported(CLKernelLibrary::get().get_device())) { return false; } // Check cl image pitch alignment - if(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0) + if (get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0) { return false; } @@ -471,7 +476,7 @@ bool export_to_cl_image(const ITensorInfo *tensor) const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo(); const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo(); - if(image_w > max_image_w || image_h > max_image_h) + if (image_w > max_image_w || image_h > max_image_h) { return false; } @@ -481,9 +486,9 @@ bool export_to_cl_image(const ITensorInfo *tensor) void set_unroll_with_pragma(CLBuildOptions &built_opts, std::initializer_list values) { - for(const int value : values) + for (const int value : values) { - if(value > max_manual_loop_unrolling) + if (value > max_manual_loop_unrolling) { built_opts.add_option("-DUNROLL_WITH_PRAGMA"); return; @@ -495,4 +500,15 @@ bool arm_matrix_multiply_supported(const cl::Device &device) { return device_supports_extension(device, "cl_arm_matrix_multiply"); } + +bool command_buffer_supported(const cl::Device &device) +{ + return device_supports_extension(device, "cl_khr_command_buffer"); +} + +bool command_buffer_mutable_dispatch_supported(const cl::Device &device) +{ + return device_supports_extension(device, "cl_khr_command_buffer_mutable_dispatch"); +} + } // namespace arm_compute diff --git a/src/core/CL/CLKernelLibrary.cpp 
b/src/core/CL/CLKernelLibrary.cpp index c5a0796c3aa316e147eb693c817b950eb5c98087..e69d006750119f4c1f085942a628fd6af4b724d7 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -22,8 +22,11 @@ * SOFTWARE. */ #include "arm_compute/core/CL/CLKernelLibrary.h" + #include "arm_compute/core/Error.h" + #include "src/gpu/cl/ClKernelLibrary.h" + #include #include #include @@ -31,8 +34,7 @@ #include namespace arm_compute { -CLKernelLibrary::CLKernelLibrary() - : _compile_context() +CLKernelLibrary::CLKernelLibrary() : _compile_context() { opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the CLKernelLibrary is built } @@ -41,13 +43,15 @@ CLKernelLibrary &CLKernelLibrary::get() static CLKernelLibrary _kernel_library; return _kernel_library; } -Kernel CLKernelLibrary::create_kernel(const std::string &kernel_name, const std::set &build_options_set) const +Kernel CLKernelLibrary::create_kernel(const std::string &kernel_name, + const std::set &build_options_set) const { const opencl::ClKernelLibrary &klib = opencl::ClKernelLibrary::get(); const std::string program_name = klib.program_name(kernel_name); auto program = klib.program(program_name); const std::string &kernel_path = CLKernelLibrary::get().get_kernel_path(); - return _compile_context.create_kernel(kernel_name, program_name, program.program, kernel_path, build_options_set, program.is_binary); + return _compile_context.create_kernel(kernel_name, program_name, program.program, kernel_path, build_options_set, + program.is_binary); } std::string CLKernelLibrary::get_program_name(const std::string &kernel_name) const { @@ -131,4 +135,4 @@ CLCompileContext &CLKernelLibrary::get_compile_context() { return _compile_context; } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/core/CL/CLKernels.h b/src/core/CL/CLKernels.h index 63be7b1ea89453bcd732e97711c9615313ece538..9a681a4f457916c993afb8232eaeeb7e046bb614 100644 --- a/src/core/CL/CLKernels.h +++ b/src/core/CL/CLKernels.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2022 Arm Limited. + * Copyright (c) 2016-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
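Reviewer note: CLKernelLibrary::create_kernel() above maps a kernel name to its program, compiles it with the given build options and wraps the result in a Kernel. A minimal sketch; the kernel name "transpose" and the defines are illustrative, and the option set is assumed to be std::set<std::string>:

    #include <set>
    #include <string>

    #include "arm_compute/core/CL/CLKernelLibrary.h"

    arm_compute::Kernel build_example_kernel()
    {
        const std::set<std::string> build_opts = {"-DDATA_TYPE=float", "-DVEC_SIZE=4"};
        return arm_compute::CLKernelLibrary::get().create_kernel("transpose", build_opts);
    }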
*/ -#ifndef ARM_COMPUTE_CLKERNELS_H -#define ARM_COMPUTE_CLKERNELS_H +#ifndef ACL_SRC_CORE_CL_CLKERNELS_H +#define ACL_SRC_CORE_CL_CLKERNELS_H /* Header regrouping all the CL kernels */ #include "src/core/CL/kernels/CLArgMinMaxLayerKernel.h" @@ -52,12 +52,12 @@ #include "src/core/CL/kernels/CLPadLayerKernel.h" #include "src/core/CL/kernels/CLPriorBoxLayerKernel.h" #include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h" -#include "src/core/CL/kernels/CLROIAlignLayerKernel.h" -#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h" #include "src/core/CL/kernels/CLRangeKernel.h" #include "src/core/CL/kernels/CLReductionOperationKernel.h" #include "src/core/CL/kernels/CLReorgLayerKernel.h" #include "src/core/CL/kernels/CLReverseKernel.h" +#include "src/core/CL/kernels/CLROIAlignLayerKernel.h" +#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h" #include "src/core/CL/kernels/CLSelectKernel.h" #include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h" #include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h" @@ -65,4 +65,4 @@ #include "src/core/CL/kernels/CLStridedSliceKernel.h" #include "src/core/CL/kernels/CLTileKernel.h" -#endif /* ARM_COMPUTE_CLKERNELS_H */ +#endif // ACL_SRC_CORE_CL_CLKERNELS_H diff --git a/src/core/CL/CLMutableCommandBuffer.cpp b/src/core/CL/CLMutableCommandBuffer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..05b351fc259bf4669a916d0c80cb155b4e8978ae --- /dev/null +++ b/src/core/CL/CLMutableCommandBuffer.cpp @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/core/CL/CLMutableCommandBuffer.h" + +#include "arm_compute/core/Error.h" + +#include "src/core/CL/CLUtils.h" + +namespace arm_compute +{ + +CLMutableCommandBuffer::CLMutableCommandBuffer(cl_command_queue queue) : CLCommandBuffer() +{ + cl_int status = CL_SUCCESS; + + cl_command_buffer_properties_khr properties[] = { + CL_COMMAND_BUFFER_FLAGS_KHR, + CL_COMMAND_BUFFER_MUTABLE_KHR, + 0, + }; + + _cb = clCreateCommandBufferKHR(1, &queue, properties, &status); + handle_cl_error("clCreateCommandBufferKHR", status); +} + +CLMutableCommandBuffer::~CLMutableCommandBuffer() +{ + const auto status = clReleaseCommandBufferKHR(_cb); + handle_cl_error("clReleaseCommandBufferKHR", status); +} + +void CLMutableCommandBuffer::add_kernel(cl_kernel kernel, + const cl::NDRange &offset, + const cl::NDRange &global, + const cl::NDRange &local) +{ + ARM_COMPUTE_ERROR_ON(state() != State::Created); + + cl_mutable_command_khr mutable_handle = nullptr; + + cl_ndrange_kernel_command_properties_khr properties[] = { + CL_MUTABLE_DISPATCH_UPDATABLE_FIELDS_KHR, + CL_MUTABLE_DISPATCH_ARGUMENTS_KHR, + 0, + }; + + const auto error = clCommandNDRangeKernelKHR( + _cb, nullptr, properties, kernel, global.dimensions(), offset.dimensions() != 0 ? offset.get() : nullptr, + global.get(), local.dimensions() != 0 ? local.get() : nullptr, 0, nullptr, nullptr, &mutable_handle); + + handle_cl_error("clCommandNDRangeKernelKHR", error); + + cl_mutable_dispatch_config_khr mut_dispatch_cfg{}; + mut_dispatch_cfg.type = CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR; + mut_dispatch_cfg.command = mutable_handle; + + _mut_dispatch_cfgs.emplace_back(mut_dispatch_cfg); +} + +void CLMutableCommandBuffer::add_mutable_argument_generic(cl_uint arg_idx, const void *value, size_t size) +{ + ARM_COMPUTE_ERROR_ON(state() != State::Created); + + cl_mutable_dispatch_arg_khr cfg{}; + cfg.arg_index = arg_idx; + cfg.arg_size = size; + cfg.arg_value = value; + + _mut_arg_cfgs.emplace_back(cfg); + ++_mut_dispatch_cfgs.back().num_args; +} + +void CLMutableCommandBuffer::finalize() +{ + ARM_COMPUTE_ERROR_ON(state() != State::Created); + + const auto error = clFinalizeCommandBufferKHR(_cb); + handle_cl_error("clFinalizeCommandBufferKHR", error); + + state(State::Finalized); + + _mut_dispatch_cfgs.shrink_to_fit(); + _mut_arg_cfgs.shrink_to_fit(); + + size_t arg_no = 0; + + for (auto &mut_dispatch_cfg : _mut_dispatch_cfgs) + { + ARM_COMPUTE_ERROR_ON(arg_no >= _mut_arg_cfgs.size()); + mut_dispatch_cfg.arg_list = &_mut_arg_cfgs[arg_no]; + + arg_no += mut_dispatch_cfg.num_args; + } + + _mut_cfg.type = CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR; + _mut_cfg.next = nullptr; + _mut_cfg.num_mutable_dispatch = _mut_dispatch_cfgs.size(); + _mut_cfg.mutable_dispatch_list = &_mut_dispatch_cfgs[0]; +} + +void CLMutableCommandBuffer::update() +{ + ARM_COMPUTE_ERROR_ON(state() != State::Finalized); + + const auto error = clUpdateMutableCommandsKHR(_cb, &_mut_cfg); + + handle_cl_error("clUpdateMutableCommandsKHR", error); +} + +void CLMutableCommandBuffer::enqueue() +{ + ARM_COMPUTE_ERROR_ON(state() != State::Finalized); + + const auto error = clEnqueueCommandBufferKHR(0, nullptr, _cb, 0, nullptr, nullptr); + + handle_cl_error("clEnqueueCommandBufferKHR", error); +} + +bool CLMutableCommandBuffer::is_finalized() const +{ + return state() == State::Finalized; +} + +} // namespace arm_compute diff --git a/src/core/CL/CLMutableCommandBuffer.h b/src/core/CL/CLMutableCommandBuffer.h new file mode 100644 index 
0000000000000000000000000000000000000000..8997d7d1fd568b6c749f2974f24a6f0e33b43ec6 --- /dev/null +++ b/src/core/CL/CLMutableCommandBuffer.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ACL_SRC_CORE_CL_CLMUTABLECOMMANDBUFFER_H +#define ACL_SRC_CORE_CL_CLMUTABLECOMMANDBUFFER_H + +#include "src/core/CL/CLCommandBuffer.h" + +#include + +namespace arm_compute +{ + +/** Command buffer implementaton based on CL mutable dispatch command buffer extension. */ +class CLMutableCommandBuffer : public CLCommandBuffer +{ +public: + /** Create a new mutable dispatch command buffer targeting the specified command queue. + * + * @param[in] queue The command queue to execute the command buffer. + */ + CLMutableCommandBuffer(cl_command_queue queue); + + /** Destructor. */ + virtual ~CLMutableCommandBuffer(); + + /** Disallow copy constructor. */ + CLMutableCommandBuffer(const CLMutableCommandBuffer &) = delete; + + /** Disallow copy assignment. */ + CLMutableCommandBuffer &operator=(const CLMutableCommandBuffer &) = delete; + + /** Disallow move constructor. */ + CLMutableCommandBuffer(CLMutableCommandBuffer &&) = delete; + + /** Disallow move assignment. 
*/ + CLMutableCommandBuffer &operator=(CLMutableCommandBuffer &&) = delete; + + void add_kernel(cl_kernel kernel, + const cl::NDRange &offset, + const cl::NDRange &global, + const cl::NDRange &local) override; + + void finalize() override; + + void update() override; + + void enqueue() override; + + bool is_finalized() const override; + +protected: + void add_mutable_argument_generic(cl_uint arg_idx, const void *value, size_t size) override; + +private: + cl_command_buffer_khr _cb{}; + cl_mutable_base_config_khr _mut_cfg{}; + std::vector _mut_dispatch_cfgs{}; + std::vector _mut_arg_cfgs{}; +}; + +} // namespace arm_compute + +#endif // ACL_SRC_CORE_CL_CLMUTABLECOMMANDBUFFER_H diff --git a/src/core/CL/CLUtils.cpp b/src/core/CL/CLUtils.cpp index 03f78697bc9cab661e9b1559dc6341c975f5160e..290ed3264807505f59fb5bc4c7d946b8dc01a8f4 100644 --- a/src/core/CL/CLUtils.cpp +++ b/src/core/CL/CLUtils.cpp @@ -23,15 +23,14 @@ */ #include "src/core/CL/CLUtils.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLCompileContext.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/StringUtils.h" -#include "support/StringSupport.h" +#include "arm_compute/core/Validate.h" -#include "src/core/experimental/PostOpUtils.h" +#include "support/StringSupport.h" namespace arm_compute { @@ -42,13 +41,13 @@ cl::Image2D create_image2d_from_tensor(const ICLTensor *tensor, CLImage2DType im const cl::Context &ctx = CLKernelLibrary::get().context(); const cl::Buffer &buffer = tensor->cl_buffer(); const ITensorInfo *info = tensor->info(); - ARM_COMPUTE_ERROR_ON_MSG(info->lock_paddings(), - "Tensor paddings must not be locked to allow extending paddings to satisfy cl_image pitch alignment requirement"); + ARM_COMPUTE_ERROR_ON_MSG(info->lock_paddings(), "Tensor paddings must not be locked to allow extending paddings to " + "satisfy cl_image pitch alignment requirement"); - const size_t image_w{ info->dimension(0) / 4 }; - const size_t image_h{ info->tensor_shape().total_size() / info->dimension(0) }; - const size_t max_image_w{ CLKernelLibrary::get().get_device().getInfo() }; - const size_t max_image_h{ CLKernelLibrary::get().get_device().getInfo() }; + const size_t image_w{info->dimension(0) / 4}; + const size_t image_h{info->tensor_shape().total_size() / info->dimension(0)}; + const size_t max_image_w{CLKernelLibrary::get().get_device().getInfo()}; + const size_t max_image_h{CLKernelLibrary::get().get_device().getInfo()}; ARM_COMPUTE_UNUSED(max_image_w, max_image_h); ARM_COMPUTE_ERROR_ON_MSG(image_w > max_image_w, "Image width exceeds maximum width for exporting to cl_image"); @@ -60,18 +59,22 @@ cl::Image2D create_image2d_from_tensor(const ICLTensor *tensor, CLImage2DType im return create_image2d_from_buffer(ctx, buffer, shape2d, info->data_type(), image_row_pitch, image_type); } -cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch, CLImage2DType image_type) +cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, + const cl::Buffer &buffer, + const TensorShape &shape2d, + DataType data_type, + size_t image_row_pitch, + CLImage2DType image_type) { ARM_COMPUTE_ERROR_ON_MSG(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()), "The extension cl_khr_image2d_from_buffer is not supported 
on the target platform"); ARM_COMPUTE_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0, "Impossible to retrieve the cl_image pitch alignment"); - ARM_COMPUTE_ERROR_ON_MSG(buffer.get() == nullptr, - "Cannot create cl_image from empty cl_buffer"); + ARM_COMPUTE_ERROR_ON_MSG(buffer.get() == nullptr, "Cannot create cl_image from empty cl_buffer"); cl_channel_type cl_data_type; - switch(data_type) + switch (data_type) { case DataType::F32: cl_data_type = CL_FLOAT; @@ -86,7 +89,7 @@ cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer cl_mem cl_image; cl_int err = CL_SUCCESS; - const cl_image_format format = { CL_RGBA, cl_data_type }; + const cl_image_format format = {CL_RGBA, cl_data_type}; cl_image_desc desc; memset(&desc, 0, sizeof(desc)); @@ -96,7 +99,7 @@ cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer desc.image_width = shape2d[0]; desc.image_height = shape2d[1]; - switch(image_type) + switch (image_type) { case CLImage2DType::ReadOnly: cl_image = clCreateImage(ctx(), CL_MEM_READ_ONLY, &format, &desc, nullptr, &err); @@ -114,111 +117,13 @@ cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer return cl::Image2D(cl_image); } -namespace experimental -{ -PostOpCLKernelUtils::PostOpCLKernelUtils(const Config &supported_config) - : _supported_config(supported_config) -{ - ARM_COMPUTE_ERROR_ON_MSG(supported_config.empty(), "Empty PostOp CL kernel support configuration is not allowed"); - for(auto it = _supported_config.begin(); it != _supported_config.end(); ++it) - { - auto post_op_sequence = it->first; - auto post_op_slots = std::get<1>(it->second); - ARM_COMPUTE_ERROR_ON_MSG(post_op_sequence.size() != post_op_slots.size(), "The number of PostOps must be the same as that of the assigned slots"); - } -} - -bool PostOpCLKernelUtils::are_post_op_shapes_compliant(const ITensorInfo *dst, const experimental::PostOpList &post_ops) -{ - for(const auto &op : post_ops.get_list()) - { - for(const auto &tensor : op->arguments()) - { - const TensorShape &out_shape = TensorShape::broadcast_shape(dst->tensor_shape(), (*tensor)->tensor_shape()); - // All post ops must be elementwise and must not alter the shape of the original dst tensor after broadcasting - if(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0)) - { - return false; - } - // NOTE: Kernel limitation: currently only the following broadcasting types are supported: - // 1. Post op arg is scalar, broadcast in both first and second dims - // 2. 
Post op arg is of shape: second dim=1, first dim=N, broadcast only in second dim - // This means this case: Post op arg is of shape: second dim=M, first dim=1, broadcast only in first dim, is NOT supported - if(dst->dimension(0) > 1 && dst->dimension(1) > 1 && (*tensor)->dimension(0) == 1 && (*tensor)->dimension(1) > 1) - { - return false; - } - } - } - return true; -} - -bool PostOpCLKernelUtils::is_post_op_sequence_supported(const PostOpList &post_ops) const +void handle_cl_error(const std::string &function_name, cl_int error_code) { - if(post_ops.size() == 0) + if (error_code != CL_SUCCESS) { - return true; // Always support cases where no post op is specified + std::string error_message = function_name + " - Error code: " + std::to_string(error_code); + ARM_COMPUTE_ERROR(error_message.c_str()); } - const auto post_op_sequence = get_post_op_sequence(post_ops); - - return _supported_config.find(post_op_sequence) != _supported_config.end(); -} - -void PostOpCLKernelUtils::set_post_ops_cl_build_options(CLBuildOptions &build_opts, const PostOpList &post_ops) const -{ - const auto post_op_sequence = get_post_op_sequence(post_ops); - const auto slots = std::get<1>(_supported_config.at(post_op_sequence)); - for(size_t post_op_id = 0; post_op_id < post_ops.size(); ++post_op_id) - { - const auto &post_op = post_ops.get_list().at(post_op_id); - const auto slot_prefix = "-DP" + support::cpp11::to_string(slots[post_op_id]); - if(post_op->type() == experimental::PostOpType::Activation) - { - const auto _post_op = utils::cast::polymorphic_downcast *>(post_op.get()); - const auto act_type = slot_prefix + "_ACTIVATION_TYPE=" + lower_string(string_from_activation_func(_post_op->_act_info.activation())); - const auto act_a_val = slot_prefix + "_ACTIVATION_A_VAL=" + float_to_string_with_full_precision(_post_op->_act_info.a()); - const auto act_b_val = slot_prefix + "_ACTIVATION_B_VAL=" + float_to_string_with_full_precision(_post_op->_act_info.b()); - build_opts.add_option(act_type); - build_opts.add_option(act_a_val); - build_opts.add_option(act_b_val); - } - else if(post_op->type() == experimental::PostOpType::Eltwise_Add) - { - size_t arg_id = 1; - const auto eltwise_op = slot_prefix + "_ELTWISE_OP=ADD" + "_X_POS_" + support::cpp11::to_string(post_op->prev_dst_pos()); - build_opts.add_option(eltwise_op); - for(const auto &tensor : post_op->arguments()) - { - const auto height = slot_prefix + "_ELTWISE_ARG" + support::cpp11::to_string(arg_id) + "_HEIGHT=" + support::cpp11::to_string((*tensor)->dimension(1)); - const auto width = slot_prefix + "_ELTWISE_ARG" + support::cpp11::to_string(arg_id) + "_WIDTH=" + support::cpp11::to_string((*tensor)->dimension(0)); - build_opts.add_option(height); - build_opts.add_option(width); - ++arg_id; - } - } - else if(post_op->type() == experimental::PostOpType::Eltwise_PRelu) - { - size_t arg_id = 1; - const auto eltwise_op = slot_prefix + "_ELTWISE_OP=PRELU" + "_X_POS_" + support::cpp11::to_string(post_op->prev_dst_pos()); - build_opts.add_option(eltwise_op); - for(const auto &tensor : post_op->arguments()) - { - const auto height = slot_prefix + "_ELTWISE_ARG" + support::cpp11::to_string(arg_id) + "_HEIGHT=" + support::cpp11::to_string((*tensor)->dimension(1)); - const auto width = slot_prefix + "_ELTWISE_ARG" + support::cpp11::to_string(arg_id) + "_WIDTH=" + support::cpp11::to_string((*tensor)->dimension(0)); - build_opts.add_option(height); - build_opts.add_option(width); - ++arg_id; - } - } - } -} - -void PostOpCLKernelUtils::set_post_ops_cl_kernel_name(std::string 
&kernel_name, const PostOpList &post_ops) const -{ - const auto post_op_sequence = get_post_op_sequence(post_ops); - const auto postfix = std::get<0>(_supported_config.at(post_op_sequence)); - kernel_name += postfix; } -} // namespace experimental } // namespace arm_compute diff --git a/src/core/CL/CLUtils.h b/src/core/CL/CLUtils.h index e3f12d4b53b88dffafa6fd8105d48ae6c6160a78..f9dcfeac3a5ed9194f84b8183a8c43185ccd305e 100644 --- a/src/core/CL/CLUtils.h +++ b/src/core/CL/CLUtils.h @@ -22,11 +22,10 @@ * SOFTWARE. */ -#ifndef ARM_COMPUTE_CL_CLUTILS_H -#define ARM_COMPUTE_CL_CLUTILS_H +#ifndef ACL_SRC_CORE_CL_CLUTILS_H +#define ACL_SRC_CORE_CL_CLUTILS_H #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/experimental/IPostOp.h" #include @@ -73,89 +72,20 @@ cl::Image2D create_image2d_from_tensor(const ICLTensor *tensor, CLImage2DType im * * @return cl::Image2D object */ -cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch, CLImage2DType image_type); +cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, + const cl::Buffer &buffer, + const TensorShape &shape2d, + DataType data_type, + size_t image_row_pitch, + CLImage2DType image_type); -namespace experimental -{ -/** @name (EXPERIMENTAL_POST_OPS) - * @{ +/** Check for CL error code and throw exception accordingly. + * + * @param[in] function_name The name of the CL function being called. + * @param[in] error_code The error returned by the CL function. */ +void handle_cl_error(const std::string &function_name, cl_int error_code); -/** Manage validation, building and configurations of PostOp CL kernels */ -class PostOpCLKernelUtils final -{ -public: - /** CL kernel name postfix for post ops */ - using NamePostfix = std::string; - /** CL kernels that supports post ops assign each post op to a 'slot', in accordance with the postfix - * For example, for a kernel with postfix '_act_prelu_eltwiseadd', there are 3 slots - * slot 1: (unary) activation, slot 2: pRelu, slot 3: elementwise addition - * - * Some kernels may allow some slots to be optional, to support multiple combinations of post op sequences. - * In such cases, we need to explicitly set up a mapping between each post op and the slots for that kernel. - * For example, suppose we have 2 kernels with postfixes: _eltwiseadd_prelu, _act_eltwiseadd_act_prelu, where the activations in the - * second kernel are optional. Say we want to support an eltwise addition, followed by a prelu (sequence { eltwiseadd, prelu }). - * Now we can choose which one of the 2 kernels to use, since they both support this post op sequence. - * We can either: - * 1. assign the elementwise to slot 1 and prelu to slot 2 of kernel 1 - * { { Eltwise_Add, PRelu } -> {"_eltwise_act", {1, 2} } } or - * 2. 
assign the elementwise to slot 2 and prelu to slot 4 of kernel 1 - * { { Eltwise_Add, PRelu } -> {"_act_eltwiseadd_act_prelu", {2, 4} } } - */ - using Slots = std::vector; - using Config = std::map>; - -public: - explicit PostOpCLKernelUtils(const Config &config); - - /** Check if post op argument tensor shapes are compliant - * All post ops must not alter the shape of the original dst tensor (even after broadcasting) - * - * @param[in] dst Dst tensor to apply the post ops to - * @param[in] post_ops Post ops - * - * @return true if shapes are compliant and false otherwise - */ - static bool are_post_op_shapes_compliant(const ITensorInfo *dst, const experimental::PostOpList &post_ops); - /** Check if the post op sequence is supported in the current configuration - * - * @param[in] post_ops Post ops - * - * @return true if the post op sequence is supported and false otherwise - */ - bool is_post_op_sequence_supported(const PostOpList &post_ops) const; - /** Helper function to set PostOp related build options - * @note Convention - * 1. Each post op "slot" is prefixed with "P", followed by the usual parameters for that post op. - * E.g. If the first slot is an activation, we need to pass 3 definitions in this way: - * -P1_ACTIVATION_TYPE=... -P1_ACTIVATION_A_VAL=... -P1_ACTIVATION_B_VAL=... - * - * 2. For multi-ary post ops, to pass the position of the previous op's dest tensor, - * we append "_X_POS_" to the post op type. - * E.g. for a single post op add(dst, x), where dst is the result of the main op. - * In this case, the position of the previous op's dest is 0, so we pass - * -P1_ELTWISE_OP=ADD_X_POS_0 - * - * @param[out] built_opts OpenCL kernel build options - * @param[in] post_ops Post ops - * - */ - void set_post_ops_cl_build_options(CLBuildOptions &built_opts, const PostOpList &post_ops) const; - /** Helper function to set PostOp kernel name - * - * @param[out] kernel_name OpenCL kernel name - * @param[in] post_ops Post ops - * - */ - void set_post_ops_cl_kernel_name(std::string &kernel_name, const PostOpList &post_ops) const; - -private: - Config _supported_config{}; -}; -/** @} */ // end of group (EXPERIMENTAL_POST_OPS) - -} // namespace experimental - -} // arm_compute +} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_CLUTILS_H */ +#endif // ACL_SRC_CORE_CL_CLUTILS_H diff --git a/src/core/CL/CLValidate.h b/src/core/CL/CLValidate.h index 7b5294e45255c34489b7831d016c6d32626fbd00..50d224f1c0f2a1068ce58bbaa42968a96e6bff70 100644 --- a/src/core/CL/CLValidate.h +++ b/src/core/CL/CLValidate.h @@ -29,11 +29,13 @@ namespace arm_compute { -#define ARM_COMPUTE_ERROR_ON_F16_UNSUPPORTED(tensor) \ - ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, CLKernelLibrary::get().fp16_supported())) +#define ARM_COMPUTE_ERROR_ON_F16_UNSUPPORTED(tensor) \ + ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, \ + CLKernelLibrary::get().fp16_supported())) -#define ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(tensor) \ - ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, CLKernelLibrary::get().fp16_supported())) +#define ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(tensor) \ + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, \ + CLKernelLibrary::get().fp16_supported())) /** Return an error if int64_base_atomics extension is not supported by the device. 
* @@ -43,11 +45,13 @@ namespace arm_compute * * @return Status */ -inline arm_compute::Status error_on_unsupported_int64_base_atomics(const char *function, const char *file, const int line) +inline arm_compute::Status +error_on_unsupported_int64_base_atomics(const char *function, const char *file, const int line) { - if(!CLKernelLibrary::get().int64_base_atomics_supported()) + if (!CLKernelLibrary::get().int64_base_atomics_supported()) { - return ARM_COMPUTE_CREATE_ERROR_LOC(arm_compute::ErrorCode::UNSUPPORTED_EXTENSION_USE, function, file, line, "Atomic functions are not supported"); + return ARM_COMPUTE_CREATE_ERROR_LOC(arm_compute::ErrorCode::UNSUPPORTED_EXTENSION_USE, function, file, line, + "Atomic functions are not supported"); } return arm_compute::Status{}; } diff --git a/src/core/CL/DefaultLWSHeuristics.cpp b/src/core/CL/DefaultLWSHeuristics.cpp index a53fdbbab6d8700439fb9f797f45cedcfe44fe4c..f96b24d2a92871e8e0cff39766572f8957c14583 100644 --- a/src/core/CL/DefaultLWSHeuristics.cpp +++ b/src/core/CL/DefaultLWSHeuristics.cpp @@ -31,13 +31,13 @@ cl::NDRange get_gemm_lws(size_t gws_x, size_t gws_y, size_t gws_z) { ARM_COMPUTE_UNUSED(gws_y); - if(gws_z != 1) + if (gws_z != 1) { return cl::NDRange(4, 4, 2); } else { - if(gws_x > 256) + if (gws_x > 256) { return cl::NDRange(2, 16, 1); } @@ -59,9 +59,9 @@ cl::NDRange get_direct_lws(size_t gws_x, size_t gws_y, size_t gws_z) { ARM_COMPUTE_UNUSED(gws_z); - if(gws_x < gws_y) + if (gws_x < gws_y) { - if(gws_x < 4) + if (gws_x < 4) { return cl::NDRange(std::min(gws_x, static_cast(2u)), 32, 1); } @@ -81,7 +81,7 @@ cl::NDRange get_dwc_lws(size_t gws_x, size_t gws_y, size_t gws_z) ARM_COMPUTE_UNUSED(gws_y); ARM_COMPUTE_UNUSED(gws_z); - if(gws_x < 32) + if (gws_x < 32) { return cl::NDRange(gws_x, 4, 4); } @@ -100,7 +100,7 @@ cl::NDRange get_default_lws_for_type(CLKernelType kernel_type, cl::NDRange gws) const size_t gws_y = gws[1]; const size_t gws_z = gws[2]; - switch(kernel_type) + switch (kernel_type) { case CLKernelType::GEMM: { @@ -124,4 +124,4 @@ cl::NDRange get_default_lws_for_type(CLKernelType kernel_type, cl::NDRange gws) } } } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp index dc3a86a528f7ddca6a06037980c0af268b931890..ac53e7f1d2bd119c8cf702755bf96bf0b8981d07 100644 --- a/src/core/CL/ICLKernel.cpp +++ b/src/core/CL/ICLKernel.cpp @@ -25,18 +25,23 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" + #include "src/core/helpers/Utils.h" #include -void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint, bool use_dummy_work_items) +void arm_compute::enqueue(cl::CommandQueue &queue, + ICLKernel &kernel, + const Window &window, + const cl::NDRange &lws_hint, + bool use_dummy_work_items) { - if(kernel.kernel()() == nullptr) + if (kernel.kernel()() == nullptr) { return; } - for(unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i) + for (unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_ERROR_ON(window[i].step() == 0); // Make sure that dimensions > Z are 1 @@ -46,7 +51,7 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind cl::NDRange gws = ICLKernel::gws_from_window(window, use_dummy_work_items); // Check for empty NDRange - if(gws.dimensions() == 0) + if (gws.dimensions() == 0) { return; } @@ -54,7 +59,7 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const 
Wind kernel.cache_gws(gws); cl::NDRange valid_lws; - if(lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size()) + if (lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size()) { valid_lws = cl::NullRange; } @@ -65,12 +70,12 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind cl::NDRange lws = cl::NullRange; - if((valid_lws[0] <= gws[0]) && (valid_lws[1] <= gws[1]) && (valid_lws[2] <= gws[2])) + if ((valid_lws[0] <= gws[0]) && (valid_lws[1] <= gws[1]) && (valid_lws[2] <= gws[2])) { lws = valid_lws; } - if(CLKernelLibrary::get().is_wbsm_supported()) + if (CLKernelLibrary::get().is_wbsm_supported()) { set_wbsm(kernel.kernel(), kernel.wbsm_hint()); } @@ -90,7 +95,7 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons // Calculate offset to the start of the window unsigned int offset_first_element = info->offset_first_element_in_bytes(); - for(unsigned int n = 0; n < info->num_dimensions(); ++n) + for (unsigned int n = 0; n < info->num_dimensions(); ++n) { offset_first_element += (window.is_broadcasted(n) ? 0 : window[n].start()) * strides[n]; } @@ -98,7 +103,7 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons unsigned int idx_start = idx; _kernel.setArg(idx++, tensor->cl_buffer()); - for(unsigned int d = 0; d < dimension_size; ++d) + for (unsigned int d = 0; d < dimension_size; ++d) { _kernel.setArg(idx++, window.is_broadcasted(d) ? 0 : strides[d]); _kernel.setArg(idx++, window.is_broadcasted(d) ? 0 : (strides[d] * window[d].step())); @@ -107,7 +112,8 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons _kernel.setArg(idx++, offset_first_element); ARM_COMPUTE_ERROR_ON_MSG_VAR(idx_start + num_arguments_per_tensor() != idx, - "add_%dD_tensor_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_tensor()); + "add_%dD_tensor_argument() is supposed to add exactly %d arguments to the kernel", + dimension_size, num_arguments_per_tensor()); ARM_COMPUTE_UNUSED(idx_start); } @@ -178,7 +184,7 @@ void ICLKernel::set_target(cl::Device &device) size_t ICLKernel::get_max_workgroup_size() { - if(_max_workgroup_size == 0) + if (_max_workgroup_size == 0) { _max_workgroup_size = CLKernelLibrary::get().max_local_workgroup_size(_kernel); } @@ -187,7 +193,7 @@ size_t ICLKernel::get_max_workgroup_size() cl::NDRange ICLKernel::gws_from_window(const Window &window, bool use_dummy_work_items) { - if((window.x().end() - window.x().start()) == 0 || (window.y().end() - window.y().start()) == 0) + if ((window.x().end() - window.x().start()) == 0 || (window.y().end() - window.y().start()) == 0) { return cl::NullRange; } @@ -196,7 +202,7 @@ cl::NDRange ICLKernel::gws_from_window(const Window &window, bool use_dummy_work (window.y().end() - window.y().start()) / window.y().step(), (window.z().end() - window.z().start()) / window.z().step()); - if(use_dummy_work_items) + if (use_dummy_work_items) { gws.get()[0] = get_next_power_two(gws[0]); gws.get()[1] = get_next_power_two(gws[1]); diff --git a/src/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h index c82809cef35930fbbb8169c8aa08c17d82026d2e..6aebef15a567350511f5c46337aaaff53d749612 100644 --- a/src/core/CL/ICLKernel.h +++ b/src/core/CL/ICLKernel.h @@ -27,10 +27,10 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/CLTypes.h" #include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/experimental/Types.h" #include 
"arm_compute/core/GPUTarget.h" #include "arm_compute/core/IKernel.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/experimental/Types.h" #include "arm_compute/runtime/CL/CLTuningParams.h" #include "src/core/CL/DefaultLWSHeuristics.h" @@ -43,14 +43,14 @@ namespace { bool is_same_lws(cl::NDRange lws0, cl::NDRange lws1) { - if(lws0.dimensions() != lws1.dimensions()) + if (lws0.dimensions() != lws1.dimensions()) { return false; } - for(size_t i = 0; i < lws0.dimensions(); ++i) + for (size_t i = 0; i < lws0.dimensions(); ++i) { - if(lws0.get()[i] != lws1.get()[i]) + if (lws0.get()[i] != lws1.get()[i]) { return false; } @@ -71,7 +71,7 @@ private: * * @return The number of arguments enqueued per array object. */ - template + template constexpr static unsigned int num_arguments_per_array() { return num_arguments_per_tensor(); @@ -80,7 +80,7 @@ private: * * @return The number of arguments enqueued per tensor object. */ - template + template constexpr static unsigned int num_arguments_per_tensor() { return 2 + 2 * dimension_size; @@ -116,11 +116,13 @@ protected: * @param[in] window The maximum window which will be returned by window() * @param[in] tuning_params_hint (Optional) Tuning parameters to use. */ - void configure_internal(const Window &window, CLTuningParams tuning_params_hint = CLTuningParams(CLKernelLibrary::get().default_ndrange(), 0)) + void configure_internal(const Window &window, + CLTuningParams tuning_params_hint = CLTuningParams(CLKernelLibrary::get().default_ndrange(), + 0)) { _tuning_params_hint = tuning_params_hint; - if(is_same_lws(_tuning_params_hint.get_lws(), CLKernelLibrary::get().default_ndrange())) + if (is_same_lws(_tuning_params_hint.get_lws(), CLKernelLibrary::get().default_ndrange())) { // Disable use_dummy_work_items at configure time. Because dummy work items only affect gws size, which // will be recalculated with use_dummy_work_items flag at run time again anyway. @@ -133,7 +135,13 @@ protected: public: /** Constructor */ ICLKernel() - : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _type(CLKernelType::UNKNOWN), _tuning_params_hint(), _cached_gws(cl::NullRange) + : _kernel(nullptr), + _target(GPUTarget::MIDGARD), + _config_id(arm_compute::default_config_id), + _max_workgroup_size(0), + _type(CLKernelType::UNKNOWN), + _tuning_params_hint(), + _cached_gws(cl::NullRange) { } /** Returns a reference to the OpenCL kernel of this object. @@ -161,7 +169,11 @@ public: * @param[in] window Window the kernel will be executed on. */ template - void add_1D_array_argument(unsigned int &idx, const ICLArray *array, const Strides &strides, unsigned int num_dimensions, const Window &window) + void add_1D_array_argument(unsigned int &idx, + const ICLArray *array, + const Strides &strides, + unsigned int num_dimensions, + const Window &window) { add_array_argument(idx, array, strides, num_dimensions, window); } @@ -184,7 +196,7 @@ public: */ void add_1D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window) { - if(cond) + if (cond) { add_1D_tensor_argument(idx, tensor, window); } @@ -208,7 +220,7 @@ public: */ void add_2D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window) { - if(cond) + if (cond) { add_2D_tensor_argument(idx, tensor, window); } @@ -469,7 +481,11 @@ private: * @param[in] window Window the kernel will be executed on. 
*/ template - void add_array_argument(unsigned int &idx, const ICLArray *array, const Strides &strides, unsigned int num_dimensions, const Window &window); + void add_array_argument(unsigned int &idx, + const ICLArray *array, + const Strides &strides, + unsigned int num_dimensions, + const Window &window); /** Add the passed tensor's parameters to the object's kernel's arguments starting from the index idx. * * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set. @@ -505,7 +521,11 @@ private: * * @note If any dimension of the lws is greater than the global workgroup size then no lws will be passed. */ -void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint = CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items = false); +void enqueue(cl::CommandQueue &queue, + ICLKernel &kernel, + const Window &window, + const cl::NDRange &lws_hint = CLKernelLibrary::get().default_ndrange(), + bool use_dummy_work_items = false); /** Add the passed array's parameters to the object's kernel's arguments starting from the index idx. * @@ -516,14 +536,15 @@ void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, c * @param[in] window Window the kernel will be executed on. */ template -void ICLKernel::add_array_argument(unsigned &idx, const ICLArray *array, const Strides &strides, unsigned int num_dimensions, const Window &window) +void ICLKernel::add_array_argument( + unsigned &idx, const ICLArray *array, const Strides &strides, unsigned int num_dimensions, const Window &window) { ARM_COMPUTE_ERROR_ON(array == nullptr); // Calculate offset to the start of the window unsigned int offset_first_element = 0; - for(unsigned int n = 0; n < num_dimensions; ++n) + for (unsigned int n = 0; n < num_dimensions; ++n) { offset_first_element += window[n].start() * strides[n]; } @@ -531,7 +552,7 @@ void ICLKernel::add_array_argument(unsigned &idx, const ICLArray *array, cons unsigned int idx_start = idx; _kernel.setArg(idx++, array->cl_buffer()); - for(unsigned int dimension = 0; dimension < dimension_size; dimension++) + for (unsigned int dimension = 0; dimension < dimension_size; dimension++) { _kernel.setArg(idx++, strides[dimension]); _kernel.setArg(idx++, strides[dimension] * window[dimension].step()); @@ -540,8 +561,9 @@ void ICLKernel::add_array_argument(unsigned &idx, const ICLArray *array, cons _kernel.setArg(idx++, offset_first_element); ARM_COMPUTE_ERROR_ON_MSG_VAR(idx_start + num_arguments_per_array() != idx, - "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_array()); + "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel", + dimension_size, num_arguments_per_array()); ARM_COMPUTE_UNUSED(idx_start); } -} +} // namespace arm_compute #endif /*ARM_COMPUTE_ICLKERNEL_H */ diff --git a/src/core/CL/ICLSimple2DKernel.cpp b/src/core/CL/ICLSimple2DKernel.cpp index 5d8295bdfeed0327dd5919593ae776599b107291..3f7edbb88db0d5df2da255d3a6a64de7c9db6f95 100644 --- a/src/core/CL/ICLSimple2DKernel.cpp +++ b/src/core/CL/ICLSimple2DKernel.cpp @@ -40,6 +40,5 @@ void ICLSimple2DKernel::run(const Window &window, cl::CommandQueue &queue) add_2D_tensor_argument(idx, _input, slice); add_2D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); + } while (window.slide_window_slice_2D(slice)); } diff --git 
a/src/core/CL/ICLSimple2DKernel.h b/src/core/CL/ICLSimple2DKernel.h index 5246492401ea599a2c78610492d824e0b42a5904..97bc1e58c291a7d93513b7207554b2f76ea32c57 100644 --- a/src/core/CL/ICLSimple2DKernel.h +++ b/src/core/CL/ICLSimple2DKernel.h @@ -37,5 +37,5 @@ public: // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; }; -} +} // namespace arm_compute #endif /*ARM_COMPUTE_ICLSIMPLE2DKERNEL_H */ diff --git a/src/core/CL/ICLSimple3DKernel.cpp b/src/core/CL/ICLSimple3DKernel.cpp index fef1a86125010cb4e4dbe4dd155ec2a98fffee42..71d7d1f07bfbd9a2ba11a69d8352c3d6add3ccf5 100644 --- a/src/core/CL/ICLSimple3DKernel.cpp +++ b/src/core/CL/ICLSimple3DKernel.cpp @@ -42,6 +42,5 @@ void ICLSimple3DKernel::run(const Window &window, cl::CommandQueue &queue) add_3D_tensor_argument(idx, _input, slice); add_3D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } diff --git a/src/core/CL/ICLSimple3DKernel.h b/src/core/CL/ICLSimple3DKernel.h index ff0b2746636d0427e9acc0d3d277fcebd5d2eada..5071b6b33995c78b7084bfb7faf93d668fcefb46 100644 --- a/src/core/CL/ICLSimple3DKernel.h +++ b/src/core/CL/ICLSimple3DKernel.h @@ -39,5 +39,5 @@ public: // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; }; -} +} // namespace arm_compute #endif /*ARM_COMPUTE_ICLSIMPLE3DKERNEL_H */ diff --git a/src/core/CL/ICLSimpleKernel.cpp b/src/core/CL/ICLSimpleKernel.cpp index d67fefdf710f51b9876e350fc2965230d0a442db..c31db8355f63fd1287b26b4155f732b51d737ab9 100644 --- a/src/core/CL/ICLSimpleKernel.cpp +++ b/src/core/CL/ICLSimpleKernel.cpp @@ -22,30 +22,35 @@ * SOFTWARE. */ #include "src/core/CL/ICLSimpleKernel.h" + #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/helpers/WindowHelpers.h" using namespace arm_compute; -ICLSimpleKernel::ICLSimpleKernel() - : _input(nullptr), _output(nullptr) +ICLSimpleKernel::ICLSimpleKernel() : _input(nullptr), _output(nullptr) { } -void ICLSimpleKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined, const BorderSize &border_size) +void ICLSimpleKernel::configure(const ICLTensor *input, + ICLTensor *output, + unsigned int num_elems_processed_per_iteration, + bool border_undefined, + const BorderSize &border_size) { _input = input; _output = output; // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size); + Window win = + calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size); AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - update_window_and_padding(win, - AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), + update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), output_access); output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size); diff --git a/src/core/CL/ICLSimpleKernel.h b/src/core/CL/ICLSimpleKernel.h index b35547a217363e0e5299524543dcc9ad87a0dbcc..6afd7309aac2b692791e44a16505d5024adcd024 100644 --- a/src/core/CL/ICLSimpleKernel.h +++ b/src/core/CL/ICLSimpleKernel.h @@ -26,6 
+26,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -55,12 +56,16 @@ public: * @param[in] border_undefined (Optional) True if the border mode is undefined. False if it's replicate or constant. * @param[in] border_size (Optional) Size of the border. */ - void configure(const ICLTensor *input, ICLTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined = false, const BorderSize &border_size = BorderSize()); + void configure(const ICLTensor *input, + ICLTensor *output, + unsigned int num_elems_processed_per_iteration, + bool border_undefined = false, + const BorderSize &border_size = BorderSize()); protected: const ICLTensor *_input; ICLTensor *_output; }; -} +} // namespace arm_compute #endif /*ARM_COMPUTE_ICLSIMPLEKERNEL_H */ diff --git a/src/core/CL/ICLTensor.cpp b/src/core/CL/ICLTensor.cpp index b541bff04a9b2ad2d21055c4a4ef5b3303ea1508..0771db7f50a3f4a044642db5a57127accd66ba4a 100644 --- a/src/core/CL/ICLTensor.cpp +++ b/src/core/CL/ICLTensor.cpp @@ -27,8 +27,7 @@ using namespace arm_compute; -ICLTensor::ICLTensor() - : _mapping(nullptr) +ICLTensor::ICLTensor() : _mapping(nullptr) { } diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp index 8aa9b2bc1e7e68454f187ea362069deb1d6aba80..35421d025eb855c80d275c9f73be525a9614aa7f 100644 --- a/src/core/CL/OpenCL.cpp +++ b/src/core/CL/OpenCL.cpp @@ -36,11 +36,7 @@ namespace arm_compute { -CLSymbols::CLSymbols() noexcept(false) - : _loaded( -{ - false, false -}) +CLSymbols::CLSymbols() noexcept(false) : _loaded({false, false}) { } @@ -52,9 +48,9 @@ CLSymbols &CLSymbols::get() bool CLSymbols::load_default() { - static const std::vector libraries_filenames{ "libOpenCL.so", "libGLES_mali.so", "libmali.so" }; + static const std::vector libraries_filenames{"libOpenCL.so", "libGLES_mali.so", "libmali.so"}; - if(_loaded.first) + if (_loaded.first) { return _loaded.second; } @@ -62,34 +58,32 @@ bool CLSymbols::load_default() // Indicate that default loading has been tried _loaded.first = true; - if(load(libraries_filenames, /* use_loader */ false)) + if (load(libraries_filenames, /* use_loader */ false)) { - ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr, "Failed to load OpenCL symbols from shared library"); + ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr, + "Failed to load OpenCL symbols from shared library"); return true; } #ifdef __ANDROID__ // When running in NDK environment, the above libraries are not accessible. 
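A minimal standalone sketch (not part of the patch) of the two symbol-resolution paths this loader code distinguishes: a regular OpenCL library whose entry points are resolved directly with dlsym(), versus the Android vendor loader libraries (libOpenCL-pixel.so / libOpenCL-car.so) that instead export enableOpenCL() and loadOpenCLPointer(). The helper name resolve_cl_symbol is hypothetical; only the enableOpenCL / loadOpenCLPointer entry points come from the code above.

    // Hypothetical helper, for illustration only. Assumes `handle` was obtained
    // with dlopen() as in CLSymbols::load().
    #include <dlfcn.h>

    using loadOpenCLPointer_t = void *(*)(const char *name);

    static void *resolve_cl_symbol(void *handle, const char *name, bool use_loader)
    {
        if (!use_loader)
        {
            // Regular library (e.g. libOpenCL.so): the entry point is exported directly.
            return dlsym(handle, name);
        }

        // Android vendor loader: OpenCL must be enabled, then every entry point is
        // fetched through loadOpenCLPointer(). The real code calls enableOpenCL()
        // a single time before resolving all symbols; it is done per lookup here
        // only to keep the sketch self-contained.
        auto enable_opencl = reinterpret_cast<void (*)()>(dlsym(handle, "enableOpenCL"));
        auto load_pointer  = reinterpret_cast<loadOpenCLPointer_t>(dlsym(handle, "loadOpenCLPointer"));
        if (enable_opencl == nullptr || load_pointer == nullptr)
        {
            return nullptr;
        }
        enable_opencl();
        return load_pointer(name);
    }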
- static const std::vector android_libraries_filenames{ "libOpenCL-pixel.so", "libOpenCL-car.so" }; + static const std::vector android_libraries_filenames{"libOpenCL-pixel.so", "libOpenCL-car.so"}; - if(load(android_libraries_filenames, /* use_loader */ true)) + if (load(android_libraries_filenames, /* use_loader */ true)) { - ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr, "Failed to load OpenCL symbols from android shared library"); + ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr, + "Failed to load OpenCL symbols from android shared library"); return true; } #endif // __ANDROID__ // If not returned till here then libraries not found std::stringstream ss; - std::for_each(libraries_filenames.begin(), libraries_filenames.end(), [&ss](const std::string & s) - { - ss << s << " "; - }); + std::for_each(libraries_filenames.begin(), libraries_filenames.end(), + [&ss](const std::string &s) { ss << s << " "; }); #ifdef __ANDROID__ - std::for_each(android_libraries_filenames.begin(), android_libraries_filenames.end(), [&ss](const std::string & s) - { - ss << s << " "; - }); + std::for_each(android_libraries_filenames.begin(), android_libraries_filenames.end(), + [&ss](const std::string &s) { ss << s << " "; }); #endif // __ANDROID__ std::cerr << "Couldn't find any of the following OpenCL library: " << ss.str() << std::endl; return false; @@ -99,15 +93,15 @@ bool CLSymbols::load(const std::vector &libraries_filenames, bool u { void *handle = nullptr; unsigned int index = 0; - for(index = 0; index < libraries_filenames.size(); ++index) + for (index = 0; index < libraries_filenames.size(); ++index) { handle = dlopen(libraries_filenames[index].c_str(), RTLD_LAZY | RTLD_LOCAL); - if(handle != nullptr) + if (handle != nullptr) { break; } } - if(index == libraries_filenames.size()) + if (index == libraries_filenames.size()) { // Set status of loading to failed _loaded.second = false; @@ -115,22 +109,23 @@ bool CLSymbols::load(const std::vector &libraries_filenames, bool u } #ifdef __ANDROID__ - typedef void* (*loadOpenCLPointer_t)(const char* name); + typedef void *(*loadOpenCLPointer_t)(const char *name); loadOpenCLPointer_t loadOpenCLPointer; - if (use_loader) { + if (use_loader) + { typedef void (*enableOpenCL_t)(); - enableOpenCL_t enableOpenCL = - reinterpret_cast(dlsym(handle, "enableOpenCL")); + enableOpenCL_t enableOpenCL = reinterpret_cast(dlsym(handle, "enableOpenCL")); enableOpenCL(); - loadOpenCLPointer = reinterpret_cast( - dlsym(handle, "loadOpenCLPointer")); - } else { + loadOpenCLPointer = reinterpret_cast(dlsym(handle, "loadOpenCLPointer")); + } + else + { loadOpenCLPointer = nullptr; } -#define LOAD_FUNCTION_PTR(func_name, _handle) \ - func_name##_ptr = reinterpret_cast( use_loader ? \ - loadOpenCLPointer(#func_name) : dlsym(handle, #func_name)); +#define LOAD_FUNCTION_PTR(func_name, _handle) \ + func_name##_ptr = reinterpret_cast(use_loader ? 
loadOpenCLPointer(#func_name) \ + : dlsym(handle, #func_name)); #else /* __ANDROID__ */ (void)use_loader; // Avoid unused warning #define LOAD_FUNCTION_PTR(func_name, handle) \ @@ -187,6 +182,16 @@ bool CLSymbols::load(const std::vector &libraries_filenames, bool u LOAD_FUNCTION_PTR(clCreateImage, handle); LOAD_FUNCTION_PTR(clSetKernelExecInfo, handle); + // Command buffer and mutable dispatch command buffer extensions + LOAD_FUNCTION_PTR(clCreateCommandBufferKHR, handle); + LOAD_FUNCTION_PTR(clRetainCommandBufferKHR, handle); + LOAD_FUNCTION_PTR(clReleaseCommandBufferKHR, handle); + LOAD_FUNCTION_PTR(clFinalizeCommandBufferKHR, handle); + LOAD_FUNCTION_PTR(clEnqueueCommandBufferKHR, handle); + LOAD_FUNCTION_PTR(clCommandNDRangeKernelKHR, handle); + + LOAD_FUNCTION_PTR(clUpdateMutableCommandsKHR, handle); + // Third-party extensions LOAD_FUNCTION_PTR(clImportMemoryARM, handle); @@ -224,12 +229,11 @@ bool opencl_is_available() } } // namespace arm_compute -cl_int clEnqueueMarker(cl_command_queue command_queue, - cl_event *event) +cl_int clEnqueueMarker(cl_command_queue command_queue, cl_event *event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueMarker_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue, event); } @@ -239,12 +243,11 @@ cl_int clEnqueueMarker(cl_command_queue command_queue, } } -cl_int clWaitForEvents(cl_uint num_events, - const cl_event *event_list) +cl_int clWaitForEvents(cl_uint num_events, const cl_event *event_list) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clWaitForEvents_ptr; - if(func != nullptr) + if (func != nullptr) { return func(num_events, event_list); } @@ -254,12 +257,18 @@ cl_int clWaitForEvents(cl_uint num_events, } } -cl_int clEnqueueSVMMap(cl_command_queue command_queue, cl_bool blocking_map, cl_map_flags flags, void *svm_ptr, - size_t size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) +cl_int clEnqueueSVMMap(cl_command_queue command_queue, + cl_bool blocking_map, + cl_map_flags flags, + void *svm_ptr, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueSVMMap_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue, blocking_map, flags, svm_ptr, size, num_events_in_wait_list, event_wait_list, event); } @@ -269,12 +278,15 @@ cl_int clEnqueueSVMMap(cl_command_queue command_queue, cl_bool blocking_map, cl_ } } -cl_int clEnqueueSVMUnmap(cl_command_queue command_queue, void *svm_ptr, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, cl_event *event) +cl_int clEnqueueSVMUnmap(cl_command_queue command_queue, + void *svm_ptr, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueSVMUnmap_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue, svm_ptr, num_events_in_wait_list, event_wait_list, event); } @@ -288,7 +300,7 @@ void *clSVMAlloc(cl_context context, cl_svm_mem_flags_arm flags, size_t size, cl { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clSVMAlloc_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, flags, size, alignment); } @@ -302,7 +314,7 @@ void clSVMFree(cl_context context, void 
*svm_pointer) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clSVMFree_ptr; - if(func != nullptr) + if (func != nullptr) { func(context, svm_pointer); } @@ -316,7 +328,7 @@ cl_int clGetContextInfo(cl_context context, { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetContextInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, param_name, param_value_size, param_value, param_value_size_ret); } @@ -333,7 +345,7 @@ cl_command_queue clCreateCommandQueue(cl_context context, { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateCommandQueue_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, device, properties, errcode_ret); } @@ -350,7 +362,7 @@ cl_command_queue clCreateCommandQueueWithProperties(cl_context c { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateCommandQueueWithProperties_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, device, properties, errcode_ret); } @@ -360,17 +372,16 @@ cl_command_queue clCreateCommandQueueWithProperties(cl_context c } } -cl_context clCreateContext( - const cl_context_properties *properties, - cl_uint num_devices, - const cl_device_id *devices, - void (*pfn_notify)(const char *, const void *, size_t, void *), - void *user_data, - cl_int *errcode_ret) +cl_context clCreateContext(const cl_context_properties *properties, + cl_uint num_devices, + const cl_device_id *devices, + void (*pfn_notify)(const char *, const void *, size_t, void *), + void *user_data, + cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateContext_ptr; - if(func != nullptr) + if (func != nullptr) { return func(properties, num_devices, devices, pfn_notify, user_data, errcode_ret); } @@ -388,7 +399,7 @@ cl_context clCreateContextFromType(const cl_context_properties *properties, { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateContextFromType_ptr; - if(func != nullptr) + if (func != nullptr) { return func(properties, device_type, pfn_notify, user_data, errcode_ret); } @@ -398,17 +409,16 @@ cl_context clCreateContextFromType(const cl_context_properties *properties, } } -cl_int clBuildProgram( - cl_program program, - cl_uint num_devices, - const cl_device_id *device_list, - const char *options, - void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), - void *user_data) +cl_int clBuildProgram(cl_program program, + cl_uint num_devices, + const cl_device_id *device_list, + const char *options, + void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), + void *user_data) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clBuildProgram_ptr; - if(func != nullptr) + if (func != nullptr) { return func(program, num_devices, device_list, options, pfn_notify, user_data); } @@ -418,22 +428,22 @@ cl_int clBuildProgram( } } -cl_int clEnqueueNDRangeKernel( - cl_command_queue command_queue, - cl_kernel kernel, - cl_uint work_dim, - const size_t *global_work_offset, - const size_t *global_work_size, - const size_t *local_work_size, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) +cl_int clEnqueueNDRangeKernel(cl_command_queue command_queue, + cl_kernel kernel, + cl_uint work_dim, + const size_t *global_work_offset, + const size_t *global_work_size, + 
const size_t *local_work_size, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueNDRangeKernel_ptr; - if(func != nullptr) + if (func != nullptr) { - return func(command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_events_in_wait_list, event_wait_list, event); + return func(command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, + num_events_in_wait_list, event_wait_list, event); } else { @@ -441,15 +451,11 @@ cl_int clEnqueueNDRangeKernel( } } -cl_int clSetKernelArg( - cl_kernel kernel, - cl_uint arg_index, - size_t arg_size, - const void *arg_value) +cl_int clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clSetKernelArg_ptr; - if(func != nullptr) + if (func != nullptr) { return func(kernel, arg_index, arg_size, arg_value); } @@ -463,7 +469,7 @@ cl_int clRetainMemObject(cl_mem memobj) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clRetainMemObject_ptr; - if(func != nullptr) + if (func != nullptr) { return func(memobj); } @@ -477,7 +483,7 @@ cl_int clReleaseMemObject(cl_mem memobj) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clReleaseMemObject_ptr; - if(func != nullptr) + if (func != nullptr) { return func(memobj); } @@ -487,17 +493,16 @@ cl_int clReleaseMemObject(cl_mem memobj) } } -cl_int clEnqueueUnmapMemObject( - cl_command_queue command_queue, - cl_mem memobj, - void *mapped_ptr, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) +cl_int clEnqueueUnmapMemObject(cl_command_queue command_queue, + cl_mem memobj, + void *mapped_ptr, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueUnmapMemObject_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event); } @@ -511,7 +516,7 @@ cl_int clRetainCommandQueue(cl_command_queue command_queue) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clRetainCommandQueue_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue); } @@ -525,7 +530,7 @@ cl_int clReleaseContext(cl_context context) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clReleaseContext_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context); } @@ -538,7 +543,7 @@ cl_int clReleaseEvent(cl_event event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clReleaseEvent_ptr; - if(func != nullptr) + if (func != nullptr) { return func(event); } @@ -548,22 +553,22 @@ cl_int clReleaseEvent(cl_event event) } } -cl_int clEnqueueWriteBuffer( - cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_write, - size_t offset, - size_t size, - const void *ptr, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) +cl_int clEnqueueWriteBuffer(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_write, + size_t offset, + size_t size, + const void *ptr, + cl_uint num_events_in_wait_list, + const 
cl_event *event_wait_list, + cl_event *event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueWriteBuffer_ptr; - if(func != nullptr) + if (func != nullptr) { - return func(command_queue, buffer, blocking_write, offset, size, ptr, num_events_in_wait_list, event_wait_list, event); + return func(command_queue, buffer, blocking_write, offset, size, ptr, num_events_in_wait_list, event_wait_list, + event); } else { @@ -571,22 +576,22 @@ cl_int clEnqueueWriteBuffer( } } -cl_int clEnqueueReadBuffer( - cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_read, - size_t offset, - size_t size, - void *ptr, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) +cl_int clEnqueueReadBuffer(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_read, + size_t offset, + size_t size, + void *ptr, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueReadBuffer_ptr; - if(func != nullptr) + if (func != nullptr) { - return func(command_queue, buffer, blocking_read, offset, size, ptr, num_events_in_wait_list, event_wait_list, event); + return func(command_queue, buffer, blocking_read, offset, size, ptr, num_events_in_wait_list, event_wait_list, + event); } else { @@ -594,17 +599,16 @@ cl_int clEnqueueReadBuffer( } } -cl_int clGetProgramBuildInfo( - cl_program program, - cl_device_id device, - cl_program_build_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) +cl_int clGetProgramBuildInfo(cl_program program, + cl_device_id device, + cl_program_build_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetProgramBuildInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(program, device, param_name, param_value_size, param_value, param_value_size_ret); } @@ -618,7 +622,7 @@ cl_int clRetainProgram(cl_program program) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clRetainProgram_ptr; - if(func != nullptr) + if (func != nullptr) { return func(program); } @@ -628,27 +632,27 @@ cl_int clRetainProgram(cl_program program) } } -void *clEnqueueMapBuffer( - cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_map, - cl_map_flags map_flags, - size_t offset, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event, - cl_int *errcode_ret) +void *clEnqueueMapBuffer(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_map, + cl_map_flags map_flags, + size_t offset, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event, + cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueMapBuffer_ptr; - if(func != nullptr) + if (func != nullptr) { - return func(command_queue, buffer, blocking_map, map_flags, offset, size, num_events_in_wait_list, event_wait_list, event, errcode_ret); + return func(command_queue, buffer, blocking_map, map_flags, offset, size, num_events_in_wait_list, + event_wait_list, event, errcode_ret); } else { - if(errcode_ret != nullptr) + if (errcode_ret != nullptr) { *errcode_ret = CL_OUT_OF_RESOURCES; } @@ -660,7 +664,7 @@ cl_int 
clReleaseCommandQueue(cl_command_queue command_queue) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clReleaseCommandQueue_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue); } @@ -670,24 +674,23 @@ cl_int clReleaseCommandQueue(cl_command_queue command_queue) } } -cl_program clCreateProgramWithBinary( - cl_context context, - cl_uint num_devices, - const cl_device_id *device_list, - const size_t *lengths, - const unsigned char **binaries, - cl_int *binary_status, - cl_int *errcode_ret) +cl_program clCreateProgramWithBinary(cl_context context, + cl_uint num_devices, + const cl_device_id *device_list, + const size_t *lengths, + const unsigned char **binaries, + cl_int *binary_status, + cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateProgramWithBinary_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, num_devices, device_list, lengths, binaries, binary_status, errcode_ret); } else { - if(errcode_ret != nullptr) + if (errcode_ret != nullptr) { *errcode_ret = CL_OUT_OF_RESOURCES; } @@ -699,7 +702,7 @@ cl_int clRetainContext(cl_context context) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clRetainContext_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context); } @@ -713,7 +716,7 @@ cl_int clReleaseProgram(cl_program program) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clReleaseProgram_ptr; - if(func != nullptr) + if (func != nullptr) { return func(program); } @@ -727,7 +730,7 @@ cl_int clFlush(cl_command_queue command_queue) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clFlush_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue); } @@ -741,7 +744,7 @@ cl_int clFinish(cl_command_queue command_queue) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clFinish_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue); } @@ -751,16 +754,15 @@ cl_int clFinish(cl_command_queue command_queue) } } -cl_int clGetProgramInfo( - cl_program program, - cl_program_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) +cl_int clGetProgramInfo(cl_program program, + cl_program_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetProgramInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(program, param_name, param_value_size, param_value, param_value_size_ret); } @@ -770,20 +772,17 @@ cl_int clGetProgramInfo( } } -cl_kernel clCreateKernel( - cl_program program, - const char *kernel_name, - cl_int *errcode_ret) +cl_kernel clCreateKernel(cl_program program, const char *kernel_name, cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateKernel_ptr; - if(func != nullptr) + if (func != nullptr) { return func(program, kernel_name, errcode_ret); } else { - if(errcode_ret != nullptr) + if (errcode_ret != nullptr) { *errcode_ret = CL_OUT_OF_RESOURCES; } @@ -795,7 +794,7 @@ cl_int clRetainKernel(cl_kernel kernel) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clRetainKernel_ptr; - if(func != nullptr) + if (func != nullptr) { 
return func(kernel); } @@ -805,22 +804,17 @@ cl_int clRetainKernel(cl_kernel kernel) } } -cl_mem clCreateBuffer( - cl_context context, - cl_mem_flags flags, - size_t size, - void *host_ptr, - cl_int *errcode_ret) +cl_mem clCreateBuffer(cl_context context, cl_mem_flags flags, size_t size, void *host_ptr, cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateBuffer_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, flags, size, host_ptr, errcode_ret); } else { - if(errcode_ret != nullptr) + if (errcode_ret != nullptr) { *errcode_ret = CL_OUT_OF_RESOURCES; } @@ -829,21 +823,17 @@ cl_mem clCreateBuffer( } cl_program clCreateProgramWithSource( - cl_context context, - cl_uint count, - const char **strings, - const size_t *lengths, - cl_int *errcode_ret) + cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateProgramWithSource_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, count, strings, lengths, errcode_ret); } else { - if(errcode_ret != nullptr) + if (errcode_ret != nullptr) { *errcode_ret = CL_OUT_OF_RESOURCES; } @@ -855,7 +845,7 @@ cl_int clReleaseKernel(cl_kernel kernel) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clReleaseKernel_ptr; - if(func != nullptr) + if (func != nullptr) { return func(kernel); } @@ -868,12 +858,12 @@ cl_int clReleaseKernel(cl_kernel kernel) cl_int clGetDeviceIDs(cl_platform_id platform, cl_device_type device_type, cl_uint num_entries, - cl_device_id *devices, + cl_device_id *devices, cl_uint *num_devices) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetDeviceIDs_ptr; - if(func != nullptr) + if (func != nullptr) { return func(platform, device_type, num_entries, devices, num_devices); } @@ -891,7 +881,7 @@ cl_int clGetDeviceInfo(cl_device_id device, { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetDeviceInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(device, param_name, param_value_size, param_value, param_value_size_ret); } @@ -901,15 +891,12 @@ cl_int clGetDeviceInfo(cl_device_id device, } } -cl_int clGetMemObjectInfo(cl_mem memobj, - cl_mem_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) +cl_int clGetMemObjectInfo( + cl_mem memobj, cl_mem_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetMemObjectInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(memobj, param_name, param_value_size, param_value, param_value_size_ret); } @@ -923,7 +910,7 @@ cl_int clRetainEvent(cl_event event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clRetainEvent_ptr; - if(func != nullptr) + if (func != nullptr) { return func(event); } @@ -941,7 +928,7 @@ cl_int clGetPlatformInfo(cl_platform_id platform, { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetPlatformInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(platform, param_name, param_value_size, param_value, param_value_size_ret); } @@ -955,7 +942,7 @@ cl_int clGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms, 
cl_uint { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetPlatformIDs_ptr; - if(func != nullptr) + if (func != nullptr) { return func(num_entries, platforms, num_platforms); } @@ -965,17 +952,16 @@ cl_int clGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms, cl_uint } } -cl_int -clGetKernelWorkGroupInfo(cl_kernel kernel, - cl_device_id device, - cl_kernel_work_group_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) +cl_int clGetKernelWorkGroupInfo(cl_kernel kernel, + cl_device_id device, + cl_kernel_work_group_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetKernelWorkGroupInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(kernel, device, param_name, param_value_size, param_value, param_value_size_ret); } @@ -985,16 +971,15 @@ clGetKernelWorkGroupInfo(cl_kernel kernel, } } -cl_int -clGetCommandQueueInfo(cl_command_queue command_queue, - cl_command_queue_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) +cl_int clGetCommandQueueInfo(cl_command_queue command_queue, + cl_command_queue_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetCommandQueueInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue, param_name, param_value_size, param_value, param_value_size_ret); } @@ -1004,16 +989,15 @@ clGetCommandQueueInfo(cl_command_queue command_queue, } } -cl_int -clGetKernelInfo(cl_kernel kernel, - cl_kernel_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) +cl_int clGetKernelInfo(cl_kernel kernel, + cl_kernel_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetKernelInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(kernel, param_name, param_value_size, param_value, param_value_size_ret); } @@ -1023,16 +1007,15 @@ clGetKernelInfo(cl_kernel kernel, } } -cl_int -clGetEventProfilingInfo(cl_event event, - cl_profiling_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) +cl_int clGetEventProfilingInfo(cl_event event, + cl_profiling_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetEventProfilingInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(event, param_name, param_value_size, param_value, param_value_size_ret); } @@ -1042,23 +1025,22 @@ clGetEventProfilingInfo(cl_event event, } } -cl_mem -clCreateImage(cl_context context, - cl_mem_flags flags, - const cl_image_format *image_format, - const cl_image_desc *image_desc, - void *host_ptr, - cl_int *errcode_ret) +cl_mem clCreateImage(cl_context context, + cl_mem_flags flags, + const cl_image_format *image_format, + const cl_image_desc *image_desc, + void *host_ptr, + cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateImage_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, flags, image_format, 
image_desc, host_ptr, errcode_ret); } else { - if(errcode_ret != nullptr) + if (errcode_ret != nullptr) { *errcode_ret = CL_OUT_OF_RESOURCES; } @@ -1066,14 +1048,12 @@ clCreateImage(cl_context context, } } -cl_int clSetKernelExecInfo(cl_kernel kernel, - cl_kernel_exec_info param_name, - size_t param_value_size, - const void *param_value) +cl_int +clSetKernelExecInfo(cl_kernel kernel, cl_kernel_exec_info param_name, size_t param_value_size, const void *param_value) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clSetKernelExecInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(kernel, param_name, param_value_size, param_value); } @@ -1083,23 +1063,153 @@ cl_int clSetKernelExecInfo(cl_kernel kernel, } } -cl_mem -clImportMemoryARM(cl_context context, - cl_mem_flags flags, - const cl_import_properties_arm *properties, - void *memory, - size_t size, - cl_int *errcode_ret) +cl_command_buffer_khr clCreateCommandBufferKHR(cl_uint num_queues, + const cl_command_queue *queues, + const cl_command_buffer_properties_khr *properties, + cl_int *errcode_ret) +{ + arm_compute::CLSymbols::get().load_default(); + const auto func = arm_compute::CLSymbols::get().clCreateCommandBufferKHR_ptr; + + if (func != nullptr) + { + return func(num_queues, queues, properties, errcode_ret); + } + else + { + if (errcode_ret != nullptr) + { + *errcode_ret = CL_INVALID_OPERATION; + } + + return {}; + } +} + +cl_int clFinalizeCommandBufferKHR(cl_command_buffer_khr command_buffer) +{ + arm_compute::CLSymbols::get().load_default(); + const auto func = arm_compute::CLSymbols::get().clFinalizeCommandBufferKHR_ptr; + + if (func != nullptr) + { + return func(command_buffer); + } + else + { + return CL_INVALID_OPERATION; + } +} + +cl_int clRetainCommandBufferKHR(cl_command_buffer_khr command_buffer) +{ + arm_compute::CLSymbols::get().load_default(); + const auto func = arm_compute::CLSymbols::get().clRetainCommandBufferKHR_ptr; + + if (func != nullptr) + { + return func(command_buffer); + } + else + { + return CL_INVALID_OPERATION; + } +} + +cl_int clReleaseCommandBufferKHR(cl_command_buffer_khr command_buffer) +{ + arm_compute::CLSymbols::get().load_default(); + const auto func = arm_compute::CLSymbols::get().clReleaseCommandBufferKHR_ptr; + + if (func != nullptr) + { + return func(command_buffer); + } + else + { + return CL_INVALID_OPERATION; + } +} + +cl_int clEnqueueCommandBufferKHR(cl_uint num_queues, + cl_command_queue *queues, + cl_command_buffer_khr command_buffer, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) +{ + arm_compute::CLSymbols::get().load_default(); + const auto func = arm_compute::CLSymbols::get().clEnqueueCommandBufferKHR_ptr; + + if (func != nullptr) + { + return func(num_queues, queues, command_buffer, num_events_in_wait_list, event_wait_list, event); + } + else + { + return CL_INVALID_OPERATION; + } +} + +cl_int clCommandNDRangeKernelKHR(cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + const cl_ndrange_kernel_command_properties_khr *properties, + cl_kernel kernel, + cl_uint work_dim, + const size_t *global_work_offset, + const size_t *global_work_size, + const size_t *local_work_size, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr *sync_point_wait_list, + cl_sync_point_khr *sync_point, + cl_mutable_command_khr *mutable_handle) +{ + arm_compute::CLSymbols::get().load_default(); + const auto func = arm_compute::CLSymbols::get().clCommandNDRangeKernelKHR_ptr; + + if 
(func != nullptr) + { + return func(command_buffer, command_queue, properties, kernel, work_dim, global_work_offset, global_work_size, + local_work_size, num_sync_points_in_wait_list, sync_point_wait_list, sync_point, mutable_handle); + } + else + { + return CL_INVALID_OPERATION; + } +} + +cl_int clUpdateMutableCommandsKHR(cl_command_buffer_khr command_buffer, + const cl_mutable_base_config_khr *mutable_config) +{ + arm_compute::CLSymbols::get().load_default(); + const auto func = arm_compute::CLSymbols::get().clUpdateMutableCommandsKHR_ptr; + + if (func != nullptr) + { + return func(command_buffer, mutable_config); + } + else + { + return CL_INVALID_OPERATION; + } +} + +cl_mem clImportMemoryARM(cl_context context, + cl_mem_flags flags, + const cl_import_properties_arm *properties, + void *memory, + size_t size, + cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clImportMemoryARM_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, flags, properties, memory, size, errcode_ret); } else { - if(errcode_ret != nullptr) + if (errcode_ret != nullptr) { *errcode_ret = CL_OUT_OF_RESOURCES; } diff --git a/src/core/CL/cl_kernels/activation_float_helpers.h b/src/core/CL/cl_kernels/activation_float_helpers.h index 3f93c8d6fc61bd6d50c198b2e2f8e152ac6c8e9e..02faae2369aec87d573df889806ebb984b3abeb5 100644 --- a/src/core/CL/cl_kernels/activation_float_helpers.h +++ b/src/core/CL/cl_kernels/activation_float_helpers.h @@ -31,7 +31,8 @@ #endif // GPU_ARCH == GPU_ARCH_BIFROST // Hard-Swish -#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667)) +#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \ + (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667)) // Logistic Activation #define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x))) @@ -49,13 +50,16 @@ #define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL)) // Leaky RELU Activation -#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0)) +#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \ + ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0)) // Soft RELU Activation #define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x))) // ELU Activation -#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0))) +#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \ + (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, \ + (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0))) // Absolute Activation #define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs(x)) @@ -70,7 +74,8 @@ #define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x)) // GELU Activation -#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf(x / (DATA_TYPE)1.41421356237))) +#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \ + (x * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf(x / (DATA_TYPE)1.41421356237))) // Identity Activation #define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x) diff --git 
a/src/core/CL/cl_kernels/activation_quant_helpers.h b/src/core/CL/cl_kernels/activation_quant_helpers.h index c420578546eaa3d6e83bd45521744ef90c92ceeb..c758ff127844d7182d4933dd12d9e69dd46fd587 100644 --- a/src/core/CL/cl_kernels/activation_quant_helpers.h +++ b/src/core/CL/cl_kernels/activation_quant_helpers.h @@ -60,17 +60,17 @@ inline TYPE identiy_op(TYPE x) } #define ACTIVATION_OP2(op, x) op##_op(x) -#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x) +#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x) #if defined(S1_VAL) && defined(S2_VAL) #if defined(O1_VAL) && defined(O2_VAL) #define PERFORM_ACTIVATION_QUANT(act, data) \ ({ \ data = ACTIVATION_OP(act, data); \ - \ + \ VEC_DATA_TYPE(float, VEC_SIZE) \ fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE)); \ - \ + \ fdata = round((fdata - (float)O1_VAL) * ((float)S1_VAL / (float)S2_VAL) + (float)O2_VAL); \ data = CONVERT_SAT(fdata, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); \ }) @@ -78,17 +78,14 @@ inline TYPE identiy_op(TYPE x) #define PERFORM_ACTIVATION_QUANT(act, data) \ ({ \ data = ACTIVATION_OP(act, data); \ - \ + \ VEC_DATA_TYPE(float, VEC_SIZE) \ fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE)); \ - \ + \ fdata = round((fdata) * ((float)S1_VAL / (float)S2_VAL)); \ data = CONVERT_SAT(fdata, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); \ }) #endif /* defined(O1_VAL) && defined(O2_VAL) */ #else /* defined(S1_VAL) && defined(S2_VAL) */ -#define PERFORM_ACTIVATION_QUANT(act, data) \ - ({ \ - data = ACTIVATION_OP(act, data); \ - }) +#define PERFORM_ACTIVATION_QUANT(act, data) ({ data = ACTIVATION_OP(act, data); }) #endif /* defined(S1_VAL) && defined(S2_VAL) */ diff --git a/src/core/CL/cl_kernels/common/comparisons.cl b/src/core/CL/cl_kernels/common/comparisons.cl index f05cb8783504215e7addc623c9528c83fee09c2c..00bb491f853db94ed4e56157ff8605454cea1753 100644 --- a/src/core/CL/cl_kernels/common/comparisons.cl +++ b/src/core/CL/cl_kernels/common/comparisons.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -30,11 +30,13 @@ #define LESS(x, y) ((x) < (y)) #define LESSEQUAL(x, y) ((x) <= (y)) -#define DEFINE_KERNEL_STR(name) compare_##name -#define DEFINE_KERNEL(name) DEFINE_KERNEL_STR(name) +#ifdef IS_QUANTIZED +# define DEFINE_KERNEL_STR(name) compare_##name##_quantized +#else // IS_QUANTIZED +# define DEFINE_KERNEL_STR(name) compare_##name +#endif // IS_QUANTIZED -#define DEFINE_KERNEL_QUANTIZED_STR(name) compare_##name##_quantized -#define DEFINE_KERNEL_QUANTIZED(name) DEFINE_KERNEL_QUANTIZED_STR(name) +#define DEFINE_KERNEL(name) DEFINE_KERNEL_STR(name) #if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OP) && defined(OP_NAME) /** This function compares two tensors. 
@@ -73,78 +75,49 @@ __kernel void DEFINE_KERNEL(OP_NAME)( TENSOR3D_DECLARATION(in2), TENSOR3D_DECLARATION(out)) { - // Get pixels pointer - Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); - Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + int dst_x = max((int)get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE, 0); - // Load values - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - in_a = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1.ptr); - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - in_b = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2.ptr); +#if VEC_SIZE_IN1 == 1 + int in1_x = 0; +#else // VEC_SIZE_IN1 == 1 + int in1_x = dst_x; +#endif // VEC_SIZE_IN1 == 1 - // Calculate and store result - VSTORE(VEC_SIZE) - (CONVERT(OP(in_a, in_b), VEC_DATA_TYPE(uchar, VEC_SIZE)), 0, (__global uchar *)out.ptr); -} -#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OP) && defined(OP_NAME) */ +#if VEC_SIZE_IN2 == 1 + int in2_x = 0; +#else // VEC_SIZE_IN2 == 1 + int in2_x = dst_x; +#endif // VEC_SIZE_IN2 == 1 -#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) -/** This function compares two quantized tensors. - * - * @note The inputs' data type need to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar - * @note The quantization offset of the first operand must be passed at compile time using -DOFFSET_IN1, i.e. -DOFFSET_IN1=10 - * @note The quantization offset of the second operand must be passed at compile time using -DOFFSET_IN2, i.e. -DOFFSET_IN2=10 - * @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, i.e. -DSCALE_IN1=10 - * @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, i.e. -DSCALE_IN2=10 - * - * @param[in] in1_ptr Pointer to the source tensor. Supported data types: All quantized data types. - * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] in2_ptr Pointer to the source tensor. Supported data types: same as @p in1_ptr - * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] out_ptr Pointer to the destination tensor. 
Supported data types: U8 - * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void DEFINE_KERNEL_QUANTIZED(OP_NAME)( - TENSOR3D_DECLARATION(in1), - TENSOR3D_DECLARATION(in2), - TENSOR3D_DECLARATION(out)) -{ - // Get pixels pointer - Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); - Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + int y = get_global_id(1); + int z = get_global_id(2); - int16 in_a = CONVERT(vload16(0, (__global DATA_TYPE *)in1.ptr), int16); - int16 in_b = CONVERT(vload16(0, (__global DATA_TYPE *)in2.ptr), int16); + in1_ptr += in1_offset_first_element_in_bytes + z * in1_stride_z + y * in1_stride_y + in1_x * sizeof(DATA_TYPE); + in2_ptr += in2_offset_first_element_in_bytes + z * in2_stride_z + y * in2_stride_y + in2_x * sizeof(DATA_TYPE); + out_ptr += out_offset_first_element_in_bytes + z * out_stride_z + y * out_stride_y + dst_x * sizeof(uchar); - in_a = in_a - (int16)((int)OFFSET_IN1); - in_b = in_b - (int16)((int)OFFSET_IN2); + // Load values + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) in_a = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE *)in1_ptr); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) in_b = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE *)in2_ptr); + + // Calculate and store result +#ifdef IS_QUANTIZED + VEC_DATA_TYPE(int, VEC_SIZE) in_a_i32 = CONVERT(in_a, VEC_DATA_TYPE(int, VEC_SIZE)); + VEC_DATA_TYPE(int, VEC_SIZE) in_b_i32 = CONVERT(in_b, VEC_DATA_TYPE(int, VEC_SIZE)); - const float16 in1f32 = convert_float16(in_a) * (float16)((float)SCALE_IN1); - const float16 in2f32 = convert_float16(in_b) * (float16)((float)SCALE_IN2); - const int16 res = OP(in1f32, in2f32); + VEC_DATA_TYPE(float, VEC_SIZE) in_a_fp = CONVERT(in_a_i32 - OFFSET_IN1, VEC_DATA_TYPE(float, VEC_SIZE)) * SCALE_IN1; + VEC_DATA_TYPE(float, VEC_SIZE) in_b_fp = CONVERT(in_b_i32 - OFFSET_IN2, VEC_DATA_TYPE(float, VEC_SIZE)) * SCALE_IN2; +#else // IS_QUANTIZED + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) in_a_fp = in_a; + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) in_b_fp = in_b; +#endif // IS_QUANTIZED - // Store result - vstore16(convert_uchar16(res), 0, (__global uchar *)out.ptr); +#if VEC_SIZE == 1 + uchar res0 = (uchar)select(0, 255, OP(in_a_fp, in_b_fp)); +#else // VEC_SIZE == 1 + VEC_DATA_TYPE(uchar, VEC_SIZE) res0 = CONVERT(OP(in_a_fp, in_b_fp), VEC_DATA_TYPE(uchar, VEC_SIZE)); +#endif // VEC_SIZE == 1 + + STORE_VECTOR_SELECT(res, uchar, out_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) } -#endif /* defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) */ \ No newline at end of file +#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OP) && defined(OP_NAME) */ diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h 
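Note (not part of the patch): the rewritten compare kernel above folds the old dedicated quantized kernel into one code path selected by -DIS_QUANTIZED; each operand is dequantized with its own OFFSET/SCALE pair, compared in float, and the U8 result is 255 for true and 0 for false, with VEC_SIZE_IN1/IN2 == 1 handling per-input broadcast and VEC_SIZE_LEFTOVER the ragged tail. A scalar model of the quantized arithmetic, using a hypothetical helper name and a "greater" comparison (the real OP and quantization info come from -D build options):

#include <cstdint>

// Scalar reference of compare_<op>_quantized for one element pair.
inline uint8_t compare_greater_quantized(uint8_t a, uint8_t b,
                                         int offset_in1, float scale_in1,
                                         int offset_in2, float scale_in2)
{
    const float a_fp = (static_cast<int>(a) - offset_in1) * scale_in1; // dequantize operand 1
    const float b_fp = (static_cast<int>(b) - offset_in2) * scale_in2; // dequantize operand 2
    return (a_fp > b_fp) ? 255 : 0;                                    // U8 output: 255 = true
}
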
b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h deleted file mode 100644 index 2c2d60ed13f67b25187b448f45ad7e739e64c8fa..0000000000000000000000000000000000000000 --- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2021-2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h" - -/** (EXPERIMENTAL_POST_OPS) Post Op expansions for the post op sequence: - * act (optional): POST_OP1_ACTIVATION_OPTIONAL - * eltwise_op : POST_OP2_ELTWISE_OP - * act (optional): POST_OP3_ACTIVATION_OPTIONAL - */ - -/** Post Op 1: Activation Block (Optional) - * @name POST_OP1_ACTIVATION_OPTIONAL - * Toggled by -DP1_ACTIVATION_TYPE - * params: same as those in @ref MIXED_PRECISION_ACTIVATION_BLOCK - * @{ - */ -#if defined(P1_ACTIVATION_TYPE) && defined(P1_ACTIVATION_A_VAL) && defined(P1_ACTIVATION_B_VAL) -#define POST_OP1_ACTIVATION_OPTIONAL(N, DATA_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME) \ - MIXED_PRECISION_ACTIVATION_BLOCK(N, P1_ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, P1_ACTIVATION_A_VAL, P1_ACTIVATION_B_VAL, DATA_TYPE_ACCUMULATOR); -#else // defined(P1_ACTIVATION_TYPE) && defined(P1_ACTIVATION_A_VAL) && defined(P1_ACTIVATION_B_VAL) -#define POST_OP1_ACTIVATION_OPTIONAL(N, DATA_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME) // noop -#endif // defined(P1_ACTIVATION_TYPE) && defined(P1_ACTIVATION_A_VAL) && defined(P1_ACTIVATION_B_VAL) -/** @} */ // end of group POST_OP1_ACTIVATION_OPTIONAL - -/** Post Op 2: Eltwise Op Block - * Handles both broadcasting and non-broadcasting cases - * @name POST_OP2_ELTWISE_OP - * - * @param[in] P2_ELTWISE_ARG1_HEIGHT Height (number of rows) of the @ref ELTWISE_OPERAND_NAME tensor - * @param[in] P2_ELTWISE_ARG1_WIDTH Width (number of columns) of the @ref ELTWISE_OPERAND_NAME tensor - * @param[in] OP The elementwise post op - * @param[in] M0 The number of consecutive rows - * @param[in] N0 The number of consecutive columns - * @param[in] BASENAME The basename of the result variables - * @param[in] ELTWISE_OPERAND_NAME The basename of the other operand variables - * @param[in] ELTWISE_OPERAND_ROW The starting row of the other operand variables. 
Required as different boundary handling strategies are used by different kernels - * E.g. reshaped_only_rhs and native kernels shifts rows (by using COMPUTE_M0_START_ROW) to handle boundary rows, - * whereas reshaped kernels do not shift rows - * @param[in] DATA_TYPE Data type of the result variables - * @param[in] DATA_TYPE_ACCUMULATR Higher-precision accumulator data type in case of mixed-precision op - * @param[in] ZERO Zero vector for z offset - * @param[in] PARTIAL_LOAD_M0 The partial size in y, for partial blocks. Supported: [0, @p M0) - * @param[in] PARTIAL_LOAD_N0 The partial size in x, for partial blocks. Supported: [0, @p N0) - * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial load Y. True to use PARTIAL_LOAD_M0 rather than M0. - * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial load X. True to use PARTIAL_LOAD_N0 rather than N0. - * @{ - */ -#if defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH) -#if P2_ELTWISE_ARG1_HEIGHT == 1 -#if P2_ELTWISE_ARG1_WIDTH == 1 // Case 1: Broadcasting in both X and Y; op2 arg tile shape[YxX] == [1x1] -#define POST_OP2_ELTWISE_OP(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, ELTWISE_OPERAND_ROW, DATA_TYPE, DATA_TYPE_ACCUMULATOR, ZERO, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ - __global uchar *ELTWISE_OPERAND_NAME##_addr = ELTWISE_OPERAND_NAME##_ptr + ELTWISE_OPERAND_NAME##_offset_first_element_in_bytes + get_global_id(2) * ELTWISE_OPERAND_NAME##_stride_z; \ - VEC_DATA_TYPE(DATA_TYPE, 1) \ - ELTWISE_OPERAND_NAME##0 = VLOAD(1)(0, (__global DATA_TYPE *)ELTWISE_OPERAND_NAME##_addr); \ - MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, 1, BASENAME, ELTWISE_OPERAND_NAME, DATA_TYPE_ACCUMULATOR, ELTWISE_OPERAND_NAME##_hp); -#else // P2_ELTWISE_ARG1_WIDTH == 1; Case 2: Broadcasting in only Y; op2 arg tile shape[YxX] == [1xN0] -#define POST_OP2_ELTWISE_OP(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, ELTWISE_OPERAND_ROW, DATA_TYPE, DATA_TYPE_ACCUMULATOR, ZERO, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ - __global uchar *ELTWISE_OPERAND_NAME##_addr = ELTWISE_OPERAND_NAME##_ptr + ELTWISE_OPERAND_NAME##_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + get_global_id(2) * ELTWISE_OPERAND_NAME##_stride_z; \ - LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, ELTWISE_OPERAND_NAME, ELTWISE_OPERAND_NAME##_addr, 0, ELTWISE_OPERAND_NAME##_stride_y, ZERO, 1, PARTIAL_LOAD_N0, false, PARTIAL_COND_X); \ - MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, DATA_TYPE_ACCUMULATOR, ELTWISE_OPERAND_NAME##_hp); -#endif // P2_ELTWISE_ARG1_WIDTH == 1 -#else // P2_ELTWISE_ARG1_HEIGHT == 1; Case 3: No broadcasting; op2 arg tile shape[YxX] == [M0xN0] -#define POST_OP2_ELTWISE_OP(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, ELTWISE_OPERAND_ROW, DATA_TYPE, DATA_TYPE_ACCUMULATOR, ZERO, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ - __global uchar *ELTWISE_OPERAND_NAME##_addr = ELTWISE_OPERAND_NAME##_ptr + ELTWISE_OPERAND_NAME##_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (ELTWISE_OPERAND_ROW * ELTWISE_OPERAND_NAME##_stride_y) + get_global_id(2) * ELTWISE_OPERAND_NAME##_stride_z; \ - LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, ELTWISE_OPERAND_NAME, ELTWISE_OPERAND_NAME##_addr, 0, ELTWISE_OPERAND_NAME##_stride_y, ZERO, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X); \ - MIXED_PRECISION_ELTWISE_OP_BLOCK(OP, M0, N0, BASENAME, 
ELTWISE_OPERAND_NAME, DATA_TYPE_ACCUMULATOR, ELTWISE_OPERAND_NAME##_hp); -#endif // P2_ELTWISE_ARG1_HEIGHT == 1 -#endif // defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH) -/** @} */ // end of group POST_OP2_ELTWISE_OP -/** Post Op 3: Activation Block (Optional) - * @name POST_OP3_ACTIVATION_OPTIONAL - * Toggled by -DP3_ACTIVATION_TYPE - * params: same as those in @ref MIXED_PRECISION_ACTIVATION_BLOCK - * @{ - */ -#if defined(P3_ACTIVATION_TYPE) && defined(P3_ACTIVATION_A_VAL) && defined(P3_ACTIVATION_B_VAL) -#define POST_OP3_ACTIVATION_OPTIONAL(N, DATA_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME) \ - MIXED_PRECISION_ACTIVATION_BLOCK(N, P3_ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, P3_ACTIVATION_A_VAL, P3_ACTIVATION_B_VAL, DATA_TYPE_ACCUMULATOR); -#else // defined(P3_ACTIVATION_TYPE) && defined(P3_ACTIVATION_A_VAL) && defined(P3_ACTIVATION_B_VAL) -#define POST_OP3_ACTIVATION_OPTIONAL(N, DATA_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME) // noop -#endif // defined(P3_ACTIVATION_TYPE) && defined(P3_ACTIVATION_A_VAL) && defined(P3_ACTIVATION_B_VAL) -/** @} */ // end of group POST_OP3_ACTIVATION_OPTIONAL diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl deleted file mode 100644 index 22ae0987722cc8bf211c1ebfe3f0d68e9cfe4bf6..0000000000000000000000000000000000000000 --- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl +++ /dev/null @@ -1,372 +0,0 @@ -/* - * Copyright (c) 2021-2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
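Note (not part of the patch): the deleted fp_post_ops_act_eltwise_op_act.h expanded a fused act -> eltwise_op -> act chain over each GEMM output tile. A scalar sketch of what that chain computed per element, assuming the P2 elementwise op is ADD and both optional activation slots are bounded ReLU (lu_brelu); in the deleted header these choices come from -DP1_ACTIVATION_TYPE / -DP2_ELTWISE_OP / -DP3_ACTIVATION_TYPE defines:

#include <algorithm>

// Scalar model of the removed post-op sequence applied to one GEMM output value.
inline float post_op_chain(float gemm_out, float eltwise_operand,
                           float a1, float b1,  // P1 activation clamp [b1, a1]
                           float a3, float b3)  // P3 activation clamp [b3, a3]
{
    float v = std::min(std::max(gemm_out, b1), a1); // post op 1: optional activation
    v += eltwise_operand;                           // post op 2: elementwise op (ADD here)
    return std::min(std::max(v, b3), a3);           // post op 3: optional activation
}
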
- */ - -#include "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h" -#include "common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h" -#include "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h" - -#include "gemm_helpers.h" -#include "repeat.h" - -/** (EXPERIMENTAL_POST_OPS) gemm_mm_native kernel */ -#if defined(M0) && defined(N0) && defined(K0) && defined(DATA_TYPE) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) -#if defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH) - -#define VFMA(a, b, c) \ - ({ \ - c = fma(a, b, c); \ - }) - -#if M0 == 1 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - }) -#elif M0 == 2 // M0 == 2 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - }) -#elif M0 == 3 // M0 == 3 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - }) -#elif M0 == 4 // M0 == 4 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - }) -#elif M0 == 5 // M0 == 5 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - }) -#elif M0 == 6 // M0 == 6 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ - }) -#elif M0 == 7 // M0 == 7 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ - }) -#elif M0 == 8 // M0 == 8 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, 
(c##6)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \ - }) -#else // M0 not supported -#error "M0 not supported" -#endif // M0 not supported - -#if defined(GEMM_MM_NATIVE_POST_ACT_ELTWISE_OP_ACT) -/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops: - * Post op 1: activation (optional) - * Post op 2: elementwise op - * Post op 3: activation (optional) - * - * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform - * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * - * All parameters are similarly defined in kernel gemm_mm_native, with these additions: - * - * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32 - * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes) - * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes) - * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes) - */ -__kernel void gemm_mm_native_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs), - IMAGE_DECLARATION(rhs), -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - // Post Op arguments - IMAGE_DECLARATION(eltwise_operand), - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z, - uint eltwise_operand_stride_z, - const int M, - const int N, - const int K -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint lhs_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - // Block size -#define RHS_BLOCK_SIZE ((K0) * (N0)) - - // RHS offset and step X -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) - - uint x = get_global_id(0); - uint y = get_global_id(1); - uint z = get_global_id(2); - -#if defined(DUMMY_WORK_ITEMS) - if((x * N0 >= N) || (y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; - - // Compute RHS matrix address - uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE); - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; -#else // defined(MATRIX_B_DEPTH) - rhs_offset += z * rhs_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); - REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); - -#if 
defined(REINTERPRET_INPUT_AS_3D) - // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply lhs_stride_z by DEPTH_GEMM3D - lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - lhs_offset += z * lhs_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0; - - int i = 0; -#if K0 > 1 - for(; i <= (K - K0); i += K0) - { - // Supported cases (M0, K0): - // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 - // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 - // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 - // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 - // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 - // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 - // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 - // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 - // Load values from LHS matrix - LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); - - // Load values from RHS matrix - LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero); - - RHS_VFMA_M0xN0(0, a, b0, c); - RHS_VFMA_M0xN0(1, a, b1, c); -#if K0 > 2 - RHS_VFMA_M0xN0(2, a, b2, c); -#endif // K0 > 2 -#if K0 > 3 - RHS_VFMA_M0xN0(3, a, b3, c); -#endif // K0 > 3 -#if K0 > 4 - RHS_VFMA_M0xN0(4, a, b4, c); - RHS_VFMA_M0xN0(5, a, b5, c); - RHS_VFMA_M0xN0(6, a, b6, c); - RHS_VFMA_M0xN0(7, a, b7, c); -#endif // K0 > 4 -#if K0 > 8 - RHS_VFMA_M0xN0(8, a, b8, c); - RHS_VFMA_M0xN0(9, a, b9, c); - RHS_VFMA_M0xN0(A, a, bA, c); - RHS_VFMA_M0xN0(B, a, bB, c); - RHS_VFMA_M0xN0(C, a, bC, c); - RHS_VFMA_M0xN0(D, a, bD, c); - RHS_VFMA_M0xN0(E, a, bE, c); - RHS_VFMA_M0xN0(F, a, bF, c); -#endif // K0 > 8 - - lhs_offset += K0 * sizeof(DATA_TYPE); - rhs_offset += K0 * rhs_stride_y; - } -#endif // K0 > 1 - // Left-over accumulations - for(; i < K; ++i) - { - // Load values from LHS matrix - VEC_DATA_TYPE(DATA_TYPE, 2) - a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0)); -#if M0 > 1 - VEC_DATA_TYPE(DATA_TYPE, 2) - a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1)); -#endif // M0 > 1 -#if M0 > 2 - VEC_DATA_TYPE(DATA_TYPE, 2) - a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2)); -#endif // M0 > 2 -#if M0 > 3 - VEC_DATA_TYPE(DATA_TYPE, 2) - a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3)); -#endif // M0 > 3 -#if M0 > 4 - VEC_DATA_TYPE(DATA_TYPE, 2) - a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4)); -#endif // M0 > 4 -#if M0 > 5 - VEC_DATA_TYPE(DATA_TYPE, 2) - a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5)); -#endif // M0 > 5 -#if M0 > 6 - VEC_DATA_TYPE(DATA_TYPE, 2) - a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6)); -#endif // M0 > 6 -#if M0 > 7 - VEC_DATA_TYPE(DATA_TYPE, 2) - a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7)); -#endif // M0 > 7 - - VEC_DATA_TYPE(DATA_TYPE, N0) - b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y)); - RHS_VFMA_M0xN0(0, a, b, c); - - lhs_offset += sizeof(DATA_TYPE); - rhs_offset += rhs_stride_y; - } - - __global uchar *dst_addr = dst_ptr + 
dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); - -#if defined(REINTERPRET_OUTPUT_AS_3D) - // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) -#if defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(M0, c, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z; - - LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias - ADD_BLOCK(M0, c, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - - const bool cond_y = y == 0; - const bool cond_x = ((x + 1) * N0 >= N); - - // c = act(c) - POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - // c = c + eltwise_operand (mix-precision, broadcast, boundary aware) - POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, 1, PARTIAL_STORE_N0, false, cond_x); - // c = act(c) - POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - - // Store output block - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); -} -#endif // defined(GEMM_MM_NATIVE_POST_ACT_ELTWISE_OP_ACT) -#endif // defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH) -#endif // defined(M0) && defined(N0) && defined(K0) && defined(DATA_TYPE) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl deleted file mode 100644 index 89577e9ebd9efd763685ae6de25682e632fdc646..0000000000000000000000000000000000000000 --- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl +++ /dev/null @@ -1,1424 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "fp_post_ops_act_eltwise_op_act.h" -#include "gemm_helpers.h" -#include "repeat.h" - -/** (EXPERIMENTAL_POST_OPS) gemm_mm_reshaped kernel */ - -#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) -#if defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH) - -#if defined(MIXED_PRECISION) -#if K0 == 2 -#define ARM_DOT_K0(a, b, c) \ - ({ \ - c += a.s0 * b.s0; \ - c += a.s1 * b.s1; \ - }) -#elif K0 == 3 // K0 == 3 -#define ARM_DOT_K0(a, b, c) \ - ({ \ - c += a.s0 * b.s0; \ - c += a.s1 * b.s1; \ - c += a.s2 * b.s2; \ - }) -#elif K0 == 4 // K0 == 4 -#define ARM_DOT_K0(a, b, c) \ - ({ \ - c += a.s0 * b.s0; \ - c += a.s1 * b.s1; \ - c += a.s2 * b.s2; \ - c += a.s3 * b.s3; \ - }) -#elif K0 == 8 // K0 == 8 -#define ARM_DOT_K0(a, b, c) \ - ({ \ - c += a.s0 * b.s0; \ - c += a.s1 * b.s1; \ - c += a.s2 * b.s2; \ - c += a.s3 * b.s3; \ - c += a.s4 * b.s4; \ - c += a.s5 * b.s5; \ - c += a.s6 * b.s6; \ - c += a.s7 * b.s7; \ - }) -#elif K0 == 16 // K0 == 16 -#define ARM_DOT_K0(a, b, c) \ - ({ \ - c += a.s0 * b.s0; \ - c += a.s1 * b.s1; \ - c += a.s2 * b.s2; \ - c += a.s3 * b.s3; \ - c += a.s4 * b.s4; \ - c += a.s5 * b.s5; \ - c += a.s6 * b.s6; \ - c += a.s7 * b.s7; \ - c += a.s8 * b.s8; \ - c += a.s9 * b.s9; \ - c += a.sA * b.sA; \ - c += a.sB * b.sB; \ - c += a.sC * b.sC; \ - c += a.sD * b.sD; \ - c += a.sE * b.sE; \ - c += a.sF * b.sF; \ - }) -#else // K0 not supported -#error "K0 value not supported" -#endif // K0 conditions -#else // defined(MIXED_PRECISION) -#if K0 == 2 -#define ARM_DOT_K0(a, b, c) \ - ({ \ - c = fma(a.s0, b.s0, c); \ - c = fma(a.s1, b.s1, c); \ - }) -#elif K0 == 3 // K0 == 3 -#define ARM_DOT_K0(a, b, c) \ - ({ \ - c = fma(a.s0, b.s0, c); \ - c = fma(a.s1, b.s1, c); \ - c = fma(a.s2, b.s2, c); \ - }) -#elif K0 == 4 // K0 == 4 -#define ARM_DOT_K0(a, b, c) \ - ({ \ - c = fma(a.s0, b.s0, c); \ - c = fma(a.s1, b.s1, c); \ - c = fma(a.s2, b.s2, c); \ - c = fma(a.s3, b.s3, c); \ - }) -#elif K0 == 8 // K0 == 8 -#define ARM_DOT_K0(a, b, c) \ - ({ \ - c = fma(a.s0, b.s0, c); \ - c = fma(a.s1, b.s1, c); \ - c = fma(a.s2, b.s2, c); \ - c = fma(a.s3, b.s3, c); \ - c = fma(a.s4, b.s4, c); \ - c = fma(a.s5, b.s5, c); \ - c = fma(a.s6, b.s6, c); \ - c = fma(a.s7, b.s7, c); \ - }) -#elif K0 == 16 // K0 == 16 -#define ARM_DOT_K0(a, b, c) \ - ({ \ - c = fma(a.s0, 
b.s0, c); \ - c = fma(a.s1, b.s1, c); \ - c = fma(a.s2, b.s2, c); \ - c = fma(a.s3, b.s3, c); \ - c = fma(a.s4, b.s4, c); \ - c = fma(a.s5, b.s5, c); \ - c = fma(a.s6, b.s6, c); \ - c = fma(a.s7, b.s7, c); \ - c = fma(a.s8, b.s8, c); \ - c = fma(a.s9, b.s9, c); \ - c = fma(a.sA, b.sA, c); \ - c = fma(a.sB, b.sB, c); \ - c = fma(a.sC, b.sC, c); \ - c = fma(a.sD, b.sD, c); \ - c = fma(a.sE, b.sE, c); \ - c = fma(a.sF, b.sF, c); \ - }) -#else // K0 not supported -#error "K0 value not supported" -#endif // K0 conditions -#endif // defined(MIXED_PRECISION) - -#if defined(ARM_DOT_K0XN0) -#undef ARM_DOT_K0XN0 -#endif // defined(ARM_DOT_K0XN0) - -#if N0 == 2 -#define ARM_DOT_K0XN0(a, b, c) \ - ({ \ - ARM_DOT_K0((a), (b##0), (c.s0)); \ - ARM_DOT_K0((a), (b##1), (c.s1)); \ - }) -#elif N0 == 3 // N0 == 3 -#define ARM_DOT_K0XN0(a, b, c) \ - ({ \ - ARM_DOT_K0((a), (b##0), (c.s0)); \ - ARM_DOT_K0((a), (b##1), (c.s1)); \ - ARM_DOT_K0((a), (b##2), (c.s2)); \ - }) -#elif N0 == 4 // N0 == 4 -#define ARM_DOT_K0XN0(a, b, c) \ - ({ \ - ARM_DOT_K0((a), (b##0), (c.s0)); \ - ARM_DOT_K0((a), (b##1), (c.s1)); \ - ARM_DOT_K0((a), (b##2), (c.s2)); \ - ARM_DOT_K0((a), (b##3), (c.s3)); \ - }) -#elif N0 == 8 // N0 == 8 -#define ARM_DOT_K0XN0(a, b, c) \ - ({ \ - ARM_DOT_K0((a), (b##0), (c.s0)); \ - ARM_DOT_K0((a), (b##1), (c.s1)); \ - ARM_DOT_K0((a), (b##2), (c.s2)); \ - ARM_DOT_K0((a), (b##3), (c.s3)); \ - ARM_DOT_K0((a), (b##4), (c.s4)); \ - ARM_DOT_K0((a), (b##5), (c.s5)); \ - ARM_DOT_K0((a), (b##6), (c.s6)); \ - ARM_DOT_K0((a), (b##7), (c.s7)); \ - }) -#elif N0 == 16 // N0 == 16 -#define ARM_DOT_K0XN0(a, b, c) \ - ({ \ - ARM_DOT_K0((a), (b##0), (c.s0)); \ - ARM_DOT_K0((a), (b##1), (c.s1)); \ - ARM_DOT_K0((a), (b##2), (c.s2)); \ - ARM_DOT_K0((a), (b##3), (c.s3)); \ - ARM_DOT_K0((a), (b##4), (c.s4)); \ - ARM_DOT_K0((a), (b##5), (c.s5)); \ - ARM_DOT_K0((a), (b##6), (c.s6)); \ - ARM_DOT_K0((a), (b##7), (c.s7)); \ - ARM_DOT_K0((a), (b##8), (c.s8)); \ - ARM_DOT_K0((a), (b##9), (c.s9)); \ - ARM_DOT_K0((a), (b##A), (c.sA)); \ - ARM_DOT_K0((a), (b##B), (c.sB)); \ - ARM_DOT_K0((a), (b##C), (c.sC)); \ - ARM_DOT_K0((a), (b##D), (c.sD)); \ - ARM_DOT_K0((a), (b##E), (c.sE)); \ - ARM_DOT_K0((a), (b##F), (c.sF)); \ - }) -#else // N0 not supported -#error "N0 value not supported" -#endif // N0 conditions - -#if defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_POST_ACT_ELTWISE_OP_ACT) -/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops: - * Post op 1: activation (optional) - * Post op 2: elementwise op - * Post op 3: activation (optional) - * - * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform - * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * - * All parameters are similarly defined in kernel gemm_mm_reshaped_lhs_nt_rhs_t, with these additions: - * - * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. 
Supported data type: F16/F32 - * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes) - * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes) - * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes) - */ -__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs), - IMAGE_DECLARATION(rhs), -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - // Post Op arguments - IMAGE_DECLARATION(eltwise_operand), - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z, - uint eltwise_operand_stride_z -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - , - const int M, - const int N, - const int K) -{ - // Block size -#define LHS_BLOCK_SIZE ((K0) * (M0)) - -#if defined(LHS_INTERLEAVE) -#define LHS_OFFSET_X (K0) -#define LHS_STEP_X ((K0) * (V0)) -#define LHS_STEP_LOOP (1) -#else // defined(INTERLEAVE) -#define LHS_OFFSET_X (LHS_BLOCK_SIZE) -#define LHS_STEP_X (K0) -#define LHS_STEP_LOOP (V0) -#endif // defined(INTERLEAVE) - - // Block size -#define RHS_BLOCK_SIZE ((K0) * (N0)) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (K0) -#define RHS_STEP_X ((K0) * (H0)) -#define RHS_STEP_LOOP (1) -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) -#define RHS_STEP_X (K0) -#define RHS_STEP_LOOP (H0) -#endif // defined(RHS_INTERLEAVE) - -#if defined(DUMMY_WORK_ITEMS) - if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y + - (get_global_id(2) * lhs_stride_z); - - // Compute RHS matrix address - __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z; -#else // defined(MATRIX_B_DEPTH) - rhs_addr += get_global_id(2) * rhs_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0); - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... 
zlhs7=0; - REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); - - for(int i = 0; i < K; i += K0) - { - // Supported cases (M0, K0): - // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 - // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 - // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 - // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 - // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 - // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 - // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 - // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 - // Load values from LHS matrix - LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs); - - // Load values from RHS matrix - LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero); - - // Accumulate - ARM_DOT_K0XN0(a0, b, c0); -#if M0 > 1 - ARM_DOT_K0XN0(a1, b, c1); -#endif // M0 > 1 -#if M0 > 2 - ARM_DOT_K0XN0(a2, b, c2); -#endif // M0 > 2 -#if M0 > 3 - ARM_DOT_K0XN0(a3, b, c3); -#endif // M0 > 3 -#if M0 > 4 - ARM_DOT_K0XN0(a4, b, c4); -#endif // M0 > 4 -#if M0 > 5 - ARM_DOT_K0XN0(a5, b, c5); -#endif // M0 > 5 -#if M0 > 6 - ARM_DOT_K0XN0(a6, b, c6); -#endif // M0 > 6 -#if M0 > 7 - ARM_DOT_K0XN0(a7, b, c7); -#endif // M0 > 7 - - lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE); - rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE); - } - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); - - // Boundary conditions: detect if current block is at the "bottom" or "right" boundary - const bool cond_y = ((get_global_id(1) + 1) * M0 >= M); - const bool cond_x = ((get_global_id(0) + 1) * N0 >= N); - -#if defined(REINTERPRET_OUTPUT_AS_3D) - - // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += get_global_id(2) * dst_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) -#if defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias[broadcasted] - MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp); - -#else // defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id( - 2) * bias_stride_z; - - LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias - MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - - // c = act(c) - POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - // c = c + eltwise_operand (mix-precision, broadcast, boundary aware) - POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, get_global_id(1) * (uint)M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - // c = act(c) - POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - - // Store output block - MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x, c_lp); - -#undef LHS_BLOCK_SIZE -#undef LHS_OFFSET_X -#undef LHS_STEP_X -#undef RHS_BLOCK_SIZE -#undef RHS_OFFSET_X -#undef RHS_STEP_X -#undef LHS_STEP_LOOP -#undef RHS_STEP_LOOP -} -#endif // defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_POST_ACT_ELTWISE_OP_ACT) - -#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_TEXTURE_POST_ACT_ELTWISE_OP_ACT) -/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object. 
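Note (not part of the patch): the deleted reshaped kernels guard their tile loads/stores with cond_y/cond_x and the PARTIAL_STORE_M0/PARTIAL_STORE_N0 leftovers. A scalar model of that edge-tile bookkeeping; in the real kernels the leftovers are compile-time -D defines, here they are computed at runtime purely for illustration:

// Each work-item owns an M0 x N0 tile; at the bottom/right borders only the
// leftover rows/columns are valid and must be loaded/stored.
struct TileBounds
{
    bool cond_y; // tile touches the bottom border
    bool cond_x; // tile touches the right border
    int  rows;   // valid rows    (PARTIAL_STORE_M0 at the border, M0 otherwise)
    int  cols;   // valid columns (PARTIAL_STORE_N0 at the border, N0 otherwise)
};

inline TileBounds tile_bounds(int gid_y, int gid_x, int M, int N, int M0, int N0)
{
    TileBounds tb;
    tb.cond_y = (gid_y + 1) * M0 >= M;
    tb.cond_x = (gid_x + 1) * N0 >= N;
    const int partial_m = M % M0;
    const int partial_n = N % N0;
    tb.rows = (tb.cond_y && partial_m != 0) ? partial_m : M0;
    tb.cols = (tb.cond_x && partial_n != 0) ? partial_n : N0;
    return tb;
}
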
- * Post op 1: activation (optional) - * Post op 2: elementwise op - * Post op 3: activation (optional) - * - * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform - * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * - * All parameters are similarly defined in kernel gemm_mm_reshaped_lhs_nt_rhs_t_texture, with these additions: - * - * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32 - * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes) - * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes) - * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes) - */ -__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs), - __read_only image2d_t rhs_img, -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - // Post Op arguments - IMAGE_DECLARATION(eltwise_operand), - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z, - uint eltwise_operand_stride_z -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - , - const int M, - const int N, - const int K) -{ - // Pixel unit -#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0) - - // Block size -#define LHS_BLOCK_SIZE ((K0) * (M0)) - -#if defined(LHS_INTERLEAVE) -#define LHS_OFFSET_X (K0) -#define LHS_STEP_X ((K0) * (V0)) -#define LHS_STEP_LOOP (1) -#else // defined(INTERLEAVE) -#define LHS_OFFSET_X (LHS_BLOCK_SIZE) -#define LHS_STEP_X (K0) -#define LHS_STEP_LOOP (V0) -#endif // defined(INTERLEAVE) - - // Block size -#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0)) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (PIXEL_UNIT) -#define RHS_STEP_X (PIXEL_UNIT * (H0)) -#define RHS_STEP_LOOP (1) -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) -#define RHS_STEP_X PIXEL_UNIT -#define RHS_STEP_LOOP (H0) -#endif // defined(RHS_INTERLEAVE) - -#if defined(DUMMY_WORK_ITEMS) - if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y + - (get_global_id(2) * lhs_stride_z); - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - const uint z_rhs = (get_global_id(2) % 
MATRIX_B_DEPTH); -#else // defined(MATRIX_B_DEPTH) - const uint z_rhs = get_global_id(2); -#endif // defined(MATRIX_B_DEPTH) - - // Compute RHS matrix coordinates - uint x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X; - const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT; - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0); - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0; - REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); - - for(int i = 0; i < K; i += K0) - { - // Load values from LHS matrix - LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs); - - // Load values from RHS matrix stored in a cl_image - REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0); - LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0); - - // Accumulate - ARM_DOT_K0XN0(a0, b, c0); -#if M0 > 1 - ARM_DOT_K0XN0(a1, b, c1); -#endif // M0 > 1 -#if M0 > 2 - ARM_DOT_K0XN0(a2, b, c2); -#endif // M0 > 2 -#if M0 > 3 - ARM_DOT_K0XN0(a3, b, c3); -#endif // M0 > 3 -#if M0 > 4 - ARM_DOT_K0XN0(a4, b, c4); -#endif // M0 > 4 -#if M0 > 5 - ARM_DOT_K0XN0(a5, b, c5); -#endif // M0 > 5 -#if M0 > 6 - ARM_DOT_K0XN0(a6, b, c6); -#endif // M0 > 6 -#if M0 > 7 - ARM_DOT_K0XN0(a7, b, c7); -#endif // M0 > 7 - - lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE); - - x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP; - } - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); - - // Boundary conditions: detect if current block is at the "bottom" or "right" boundary - const bool cond_y = ((get_global_id(1) + 1) * M0 >= M); - const bool cond_x = ((get_global_id(0) + 1) * N0 >= N); - -#if defined(REINTERPRET_OUTPUT_AS_3D) - - // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += get_global_id(2) * dst_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) -#if defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias[broadcasted] - MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp); - -#else // defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id( - 2) * bias_stride_z; - - LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias - MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - - // c = act(c) - POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - // c = c + eltwise_operand (mix-precision, broadcast, boundary aware) - POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, get_global_id(1) * (uint)M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - // c = act(c) - POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - - // Store output block - MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x, c_lp); - -#undef LHS_BLOCK_SIZE -#undef LHS_OFFSET_X -#undef LHS_STEP_X -#undef RHS_BLOCK_SIZE -#undef RHS_OFFSET_X -#undef RHS_STEP_X -#undef PIXEL_UNIT -#undef LHS_STEP_LOOP -#undef RHS_STEP_LOOP -} -#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_TEXTURE_POST_ACT_ELTWISE_OP_ACT) - -#if defined(LHS_TRANSPOSE) - -#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE) - -#if defined(MIXED_PRECISION) - -#if(GPU_ARCH == GPU_ARCH_MIDGARD) -#define ARM_VFMA(N0, a, b, c) c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))); -#else // GPU_ARCH == GPU_ARCH_MIDGARD -#define ARM_VFMA(N0, a, b, c) c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c)); -#endif // GPU_ARCH == GPU_ARCH_MIDGARD - -#else // defined(MIXED_PRECISION - -#if(GPU_ARCH == GPU_ARCH_MIDGARD) -#define ARM_VFMA(N0, a, b, c) c += (a) * (b); -#else // GPU_ARCH == GPU_ARCH_MIDGARD -#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c)); -#endif // GPU_ARCH == GPU_ARCH_MIDGARD - -#endif // defined(MIXED_PRECISION) - -#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C) \ - ({ \ - ARM_VFMA(N0, (VTYPE(TYPE, 
N0))(a), b, (C##0)); \ - }) -#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C) \ - ({ \ - ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \ - ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \ - }) -#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C) \ - ({ \ - ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C); \ - ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \ - }) -#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C) \ - ({ \ - ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C); \ - ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \ - }) -#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C) \ - ({ \ - ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C); \ - ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \ - ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \ - ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \ - ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \ - }) - -// Factory macro for the column-vector (transposed) by row-vector (not transposed) multiplication. K0 = 1 -// a is the column-vector (transposed) -// b is the row-vector (not transposed) -// C is the output matrix -// Lower case is a vector (a, b) -// Upper case is a matrix (C) -#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C) - -#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C) \ - ({ \ - ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); \ - }) -#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C) \ - ({ \ - ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C); \ - ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \ - }) -#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C) \ - ({ \ - ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C); \ - ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \ - }) -#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C) \ - ({ \ - ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C); \ - ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \ - }) -#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C) \ - ({ \ - ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C); \ - ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \ - ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \ - ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \ - ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \ - }) -#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, A, B, C) \ - ({ \ - ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C); \ - ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##8), (B##8), C); \ - ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##9), (B##9), C); \ - ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##A), (B##A), C); \ - ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##B), (B##B), C); \ - ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##C), (B##C), C); \ - ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##D), (B##D), C); \ - ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##E), (B##E), C); \ - ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##F), (B##F), C); \ - }) - -// Factory macro for the matrix (transposed) by matrix (not transposed) multiplication. 
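// (Illustrative expansion added for clarity; not part of the original kernel source.) As a sketch of
// how this macro family composes: with M0 = 2, N0 = 4, K0 = 1 and TYPE = float, the call
//   ARM_MM_T_NT(2, 4, 1, float, a, b, c)
// resolves to ARM_VVM_T_NT_2xN0x1(4, float, a0, b0, c), which (without MIXED_PRECISION, on a
// non-Midgard GPU_ARCH) expands to one fused multiply-accumulate per output row:
//   c0 = fma((float4)(a0.s0), b0, c0);
//   c1 = fma((float4)(a0.s1), b0, c1);
// i.e. each element of the transposed LHS column a0 scales the whole non-transposed RHS row b0.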
-// The dimensions for this matrix multiplications are defined through M0, N0 and K0 -// The dimensions supported are: -// M0: 1, 2, 3, 4, 8 -// N0: 1, 2, 3, 4, 8, 16 -// K0: 1, 2, 3, 4, 8, 16 -// This macro calls the vector-by-matrix macro K0 times -// A, B and C are matrices -#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \ - CONCAT(ARM_MM_T_NT_M0xN0x, K0) \ - (M0, N0, TYPE, A, B, C) - -#if defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_POST_ACT_ELTWISE_OP_ACT) -/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops: - * Post op 1: activation (optional) - * Post op 2: elementwise op - * Post op 3: activation (optional) - * - * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform - * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * - * All parameters are similarly defined in kernel gemm_mm_reshaped_lhs_t_rhs_nt, with these additions: - * - * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32 - * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes) - * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes) - * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes) - * @param[in] M Number of rows in LHS matrix not reshaped. - * @param[in] N Number of columns in RHS matrix not reshaped. - * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped. 
- */ -__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs), - IMAGE_DECLARATION(rhs), -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - // Post Op arguments - IMAGE_DECLARATION(eltwise_operand), - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z, - uint eltwise_operand_stride_z -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - , - const int M, - const int N, - const int K) -{ - // Block size -#define LHS_BLOCK_SIZE ((K0) * (M0)) - -#if defined(LHS_INTERLEAVE) -#define LHS_OFFSET_X (M0) -#define LHS_STEP_X ((M0) * (V0)) -#define LHS_STEP_LOOP (1) -#else // defined(INTERLEAVE) -#define LHS_OFFSET_X (LHS_BLOCK_SIZE) -#define LHS_STEP_X (M0) -#define LHS_STEP_LOOP (V0) -#endif // defined(INTERLEAVE) - - // Block size -#define RHS_BLOCK_SIZE ((K0) * (N0)) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (N0) -#define RHS_STEP_X ((N0) * (H0)) -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) -#define RHS_STEP_X (N0) -#endif // defined(RHS_INTERLEAVE) - - const uint x = get_global_id(0); - const uint y = get_global_id(1); - const uint z = get_global_id(2); - - // Boundary conditions: detect if current block is at the "bottom" or "right" boundary - const bool cond_y = ((get_global_id(1) + 1) * M0 >= M); - const bool cond_x = ((get_global_id(0) + 1) * N0 >= N); - -#if defined(DUMMY_WORK_ITEMS) - if((x * N0 >= N) || (y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z); - - // Compute RHS matrix address - __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z; -#else // defined(MATRIX_B_DEPTH) - rhs_addr += z * rhs_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0); - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0); - - __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr); - __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr); - - for(int i = 0; i < K; i += K0) - { - VEC_DATA_TYPE(DATA_TYPE, M0) - a0; - VEC_DATA_TYPE(DATA_TYPE, N0) - b0; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; - -#if K0 > 1 - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; -#endif // K0 > 1 - -#if K0 > 2 - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; -#endif // K0 > 2 - -#if K0 > 3 - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; -#endif // K0 > 3 - -#if K0 > 4 - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += 
LHS_STEP_X; - rhs += RHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; -#endif // K0 > 4 - -#if K0 > 8 - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; -#endif // K0 > 8 - -#ifndef LHS_INTERLEAVE - lhs += (M0 * K0 * (V0 - 1)); -#endif // LHS_INTERLEAVE - -#ifndef RHS_INTERLEAVE - rhs += (N0 * K0 * (H0 - 1)); -#endif // RHS_INTERLEAVE - } - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); - -#if defined(REINTERPRET_OUTPUT_AS_3D) - - // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) -#if defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias[broadcasted] - MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp); - -#else // defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id( - 2) * bias_stride_z; - - LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias - MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - - // c = act(c) - POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - // c = c + eltwise_operand (mix-precision, broadcast, boundary aware) - POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, get_global_id(1) * (uint)M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - // c = act(c) - POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - - // Store output block - MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x, c_lp); - -#undef LHS_BLOCK_SIZE -#undef LHS_OFFSET_X -#undef LHS_STEP_X -#undef RHS_BLOCK_SIZE -#undef RHS_OFFSET_X -#undef RHS_STEP_X -} -#endif // defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_POST_ACT_ELTWISE_OP_ACT) - -#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_TEXTURE_POST_ACT_ELTWISE_OP_ACT) -/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object. 
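 * (Illustrative summary added for clarity, not generated code; act1 and act3 stand for the optional
 * P1/P3 activations.) Per output block the kernel effectively computes:
 *   c  = lhs^T * rhs;                        // main GEMM accumulation, RHS read via the image object
 *   c *= ALPHA;                              // only if -DALPHA is defined
 *   c += BETA * bias;                        // only if -DBETA is defined (broadcast with -DBROADCAST_BIAS)
 *   c  = act1(c);                            // post op 1 (optional)
 *   c  = P2_ELTWISE_OP(c, eltwise_operand);  // post op 2 (mixed-precision, boundary aware)
 *   c  = act3(c);                            // post op 3 (optional)
 * before the boundary-aware store writes the block to dst.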
- * Post op 1: activation (optional) - * Post op 2: elementwise op - * Post op 3: activation (optional) - * - * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform - * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * - * All parameters are similarly defined in kernel gemm_mm_reshaped_lhs_t_rhs_nt_texture, with these additions: - * - * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32 - * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes) - * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes) - * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes) - */ -__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs), - __read_only image2d_t rhs_img, -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - // Post Op arguments - IMAGE_DECLARATION(eltwise_operand), - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z, - uint eltwise_operand_stride_z -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - , - const int M, - const int N, - const int K) -{ - // Pixel unit -#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0) - - // Block size -#define LHS_BLOCK_SIZE ((K0) * (M0)) - -#if defined(LHS_INTERLEAVE) -#define LHS_OFFSET_X (M0) -#define LHS_STEP_X ((M0) * (V0)) -#define LHS_STEP_LOOP (1) -#else // defined(INTERLEAVE) -#define LHS_OFFSET_X (LHS_BLOCK_SIZE) -#define LHS_STEP_X (M0) -#define LHS_STEP_LOOP (V0) -#endif // defined(INTERLEAVE) - - // Block size -#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT)) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (PIXEL_UNIT) -#define RHS_STEP_X ((PIXEL_UNIT) * (H0)) -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) -#define RHS_STEP_X (PIXEL_UNIT) -#endif // defined(RHS_INTERLEAVE) - - const uint x = get_global_id(0); - const uint y = get_global_id(1); - const uint z = get_global_id(2); - -#if defined(DUMMY_WORK_ITEMS) - if((x * N0 >= N) || (y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z); - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - const uint z_rhs = (z % MATRIX_B_DEPTH); -#else // 
defined(MATRIX_B_DEPTH) - const uint z_rhs = z; -#endif // defined(MATRIX_B_DEPTH) - - // Compute RHS matrix coordinates - uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X; - const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT; - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0); - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0); - - __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr); - - for(int i = 0; i < K; i += K0) - { - VEC_DATA_TYPE(DATA_TYPE, M0) - a0; - VEC_DATA_TYPE(DATA_TYPE, N0) - b0; - - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - -#if K0 > 1 - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; -#endif // K0 > 1 - -#if K0 > 2 - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; -#endif // K0 > 2 - -#if K0 > 3 - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; -#endif // K0 > 3 - -#if K0 > 4 - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; -#endif // K0 > 4 - -#if K0 > 8 - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = 
READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; -#endif // K0 > 8 - -#ifndef LHS_INTERLEAVE - lhs += (M0 * K0 * (V0 - 1)); -#endif // LHS_INTERLEAVE - - x_rhs += K0 * RHS_STEP_X; -#ifndef RHS_INTERLEAVE - x_rhs += (PIXEL_UNIT * K0 * (H0 - 1)); -#endif // RHS_INTERLEAVE - } - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); - - // Boundary conditions: detect if current block is at the "bottom" or "right" boundary - const bool cond_y = ((get_global_id(1) + 1) * M0 >= M); - const bool cond_x = ((get_global_id(0) + 1) * N0 >= N); - -#if defined(REINTERPRET_OUTPUT_AS_3D) - - // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) -#if defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias[broadcasted] - MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp); - -#else // defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z; - - LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - - // c = act(c) - POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - // c = c + eltwise_operand (mix-precision, broadcast, boundary aware) - POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, get_global_id(1) * (uint)M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - // c = act(c) - POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - - // Store output block - MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x, c_lp); - -#undef LHS_BLOCK_SIZE -#undef LHS_OFFSET_X -#undef LHS_STEP_X -#undef RHS_BLOCK_SIZE -#undef RHS_OFFSET_X -#undef RHS_STEP_X -#undef PIXEL_UNIT -#undef LHS_STEP_LOOP -#undef RHS_STEP_LOOP -} -#endif // defined(OPENCL_IMAGE_SUPPORT) && 
defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_TEXTURE_POST_ACT_ELTWISE_OP_ACT) - -#endif // defined(LHS_TRANSPOSE) -#endif // defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH) -#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl deleted file mode 100644 index 09ddcde043699230c91078dc70a66350686dccca..0000000000000000000000000000000000000000 --- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl +++ /dev/null @@ -1,1399 +0,0 @@ -/* - * Copyright (c) 2021-2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "fp_post_ops_act_eltwise_op_act.h" -#include "gemm_helpers.h" -#include "repeat.h" - -/** (EXPERIMENTAL_POST_OPS) gemm_mm_reshaped_only_rhs kernel */ -#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) -#if defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH) - -#define CONCAT(a, b) a##b - -#define ARM_DOT1(a, b, c) \ - ({ \ - c = fma(a, b, c); \ - }) -#define ARM_DOT2(a, b, c) \ - ({ \ - c = fma(a.s0, b.s0, c); \ - c = fma(a.s1, b.s1, c); \ - }) -#define ARM_DOT3(a, b, c) \ - ({ \ - ARM_DOT2(a, b, c); \ - c = fma((a.s2), (b.s2), c); \ - }) -#define ARM_DOT4(a, b, c) \ - ({ \ - ARM_DOT3(a, b, c); \ - c = fma((a.s3), (b.s3), c); \ - }) -#define ARM_DOT8(a, b, c) \ - ({ \ - ARM_DOT4((a.lo), (b.lo), c); \ - ARM_DOT4((a.hi), (b.hi), c); \ - }) -#define ARM_DOT16(a, b, c) \ - ({ \ - ARM_DOT8((a.lo), (b.lo), c); \ - ARM_DOT8((a.hi), (b.hi), c); \ - }) - -#if N0 == 2 -#define ARM_DOT_K0XN0(k0, a, b, c) \ - ({ \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##0), (c.s0)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##1), (c.s1)); \ - }) -#elif N0 == 3 // N0 == 3 -#define ARM_DOT_K0XN0(k0, a, b, c) \ - ({ \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##0), (c.s0)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##1), (c.s1)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##2), (c.s2)); \ - }) -#elif N0 == 4 // N0 == 4 -#define ARM_DOT_K0XN0(k0, a, b, c) \ - ({ \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##0), (c.s0)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##1), (c.s1)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##2), (c.s2)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##3), (c.s3)); \ - }) -#elif N0 == 8 // N0 == 8 -#define ARM_DOT_K0XN0(k0, a, b, c) \ - ({ \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##0), (c.s0)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##1), (c.s1)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##2), (c.s2)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##3), (c.s3)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##4), (c.s4)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##5), (c.s5)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##6), (c.s6)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##7), (c.s7)); \ - }) -#elif N0 == 16 // N0 == 16 -#define ARM_DOT_K0XN0(k0, a, b, c) \ - ({ \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##0), (c.s0)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##1), (c.s1)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##2), (c.s2)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##3), (c.s3)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##4), (c.s4)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##5), (c.s5)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##6), (c.s6)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##7), (c.s7)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##8), (c.s8)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##9), (c.s9)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##A), (c.sA)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##B), (c.sB)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##C), (c.sC)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##D), (c.sD)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##E), (c.sE)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##F), (c.sF)); \ - }) -#else // N0 not supported -#error "N0 value not supported" -#endif // N0 conditions - -#if defined(GEMM_MM_RESHAPED_ONLY_RHS_T_POST_ACT_ELTWISE_OP_ACT) -/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops: - * Post op 1: activation (optional) - * Post op 2: elementwise op - * Post op 3: activation (optional) - * - * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * @note (Required) 
-DP2_ELTWISE_OP: The (binary) elementwise post op to perform - * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * - * All parameters are similarly defined in kernel gemm_mm_reshaped_only_rhs_t, with these additions: - * - * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32 - * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes) - * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes) - * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes) - */ -__kernel void gemm_mm_reshaped_only_rhs_t_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs), - IMAGE_DECLARATION(rhs), -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - // Post Op arguments - IMAGE_DECLARATION(eltwise_operand), - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z, - uint eltwise_operand_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint lhs_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - , - const int M, - const int N, - const int K) -{ - // Block size -#define RHS_BLOCK_SIZE ((K0) * (N0)) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (K0) -#define RHS_STEP_X ((K0) * (H0)) -#define RHS_STEP_LOOP (1) -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) -#define RHS_STEP_X (K0) -#define RHS_STEP_LOOP (H0) -#endif // defined(RHS_INTERLEAVE) - - uint x = get_global_id(0); - uint y = get_global_id(1); - uint z = get_global_id(2); - - const bool cond_y = y == 0; - const bool cond_x = ((x + 1) * N0 >= N); - -#if defined(DUMMY_WORK_ITEMS) - if((x * N0 >= N) || (y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; - - // Compute RHS reshaped matrix address - uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; -#else // defined(MATRIX_B_DEPTH) - rhs_offset += z * rhs_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... 
zlhs7=0; - REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); - -#if defined(REINTERPRET_INPUT_AS_3D) - // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply lhs_stride_z by DEPTH_GEMM3D - lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - lhs_offset += z * lhs_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0; - - int i = 0; - for(; i <= (K - K0); i += K0) - { - // Supported cases (M0, K0): - // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 - // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 - // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 - // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 - // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 - // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 - // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 - // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 - // Load values from LHS matrix - LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); - - // Load values from RHS reshaped matrix - LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero); - - // Accumulate - ARM_DOT_K0XN0(K0, a0, b, c0); -#if M0 > 1 - ARM_DOT_K0XN0(K0, a1, b, c1); -#endif // M0 > 1 -#if M0 > 2 - ARM_DOT_K0XN0(K0, a2, b, c2); -#endif // M0 > 2 -#if M0 > 3 - ARM_DOT_K0XN0(K0, a3, b, c3); -#endif // M0 > 3 -#if M0 > 4 - ARM_DOT_K0XN0(K0, a4, b, c4); -#endif // M0 > 4 -#if M0 > 5 - ARM_DOT_K0XN0(K0, a5, b, c5); -#endif // M0 > 5 -#if M0 > 6 - ARM_DOT_K0XN0(K0, a6, b, c6); -#endif // M0 > 6 -#if M0 > 7 - ARM_DOT_K0XN0(K0, a7, b, c7); -#endif // M0 > 7 - - lhs_offset += K0 * sizeof(DATA_TYPE); - rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE); - } - - // Left-over accumulations - for(; i < K; ++i) - { - // Load values from LHS matrix - LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); - - // Load values from RHS reshaped matrix - LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero); - - // Accumulate - ARM_DOT_K0XN0(1, a0, b, c0); -#if M0 > 1 - ARM_DOT_K0XN0(1, a1, b, c1); -#endif // M0 > 1 -#if M0 > 2 - ARM_DOT_K0XN0(1, a2, b, c2); -#endif // M0 > 2 -#if M0 > 3 - ARM_DOT_K0XN0(1, a3, b, c3); -#endif // M0 > 3 -#if M0 > 4 - ARM_DOT_K0XN0(1, a4, b, c4); -#endif // M0 > 4 -#if M0 > 5 - ARM_DOT_K0XN0(1, a5, b, c5); -#endif // M0 > 5 -#if M0 > 6 - ARM_DOT_K0XN0(1, a6, b, c6); -#endif // M0 > 6 -#if M0 > 7 - ARM_DOT_K0XN0(1, a7, b, c7); -#endif // M0 > 7 - - lhs_offset += sizeof(DATA_TYPE); - rhs_offset += sizeof(DATA_TYPE); - } - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - - // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); - - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) -#if defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(M0, c, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z; - - LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias - ADD_BLOCK(M0, c, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - - // c = act(c) - POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - // c = c + eltwise_operand (mix-precision, broadcast, boundary aware) - POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, 1, PARTIAL_STORE_N0, false, cond_x); - // c = act(c) - POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - - // Store output block - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#undef RHS_BLOCK_SIZE -#undef RHS_OFFSET_X -#undef RHS_STEP_X -} -#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_T_POST_ACT_ELTWISE_OP_ACT) - -#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE_POST_ACT_ELTWISE_OP_ACT) -/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object. - * Post op 1: activation (optional) - * Post op 2: elementwise op - * Post op 3: activation (optional) - * - * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform - * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * - * All parameters are similarly defined in kernel gemm_mm_reshaped_only_rhs_t_texture, with these additions: - * - * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. 
Supported data type: F16/F32 - * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes) - * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes) - * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes) - * @param[in] M Number of rows in LHS matrix not reshaped. - * @param[in] N Number of columns in RHS matrix not reshaped. - * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped. - */ -__kernel void gemm_mm_reshaped_only_rhs_t_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs), - __read_only image2d_t rhs_img, -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - // Post Op arguments - IMAGE_DECLARATION(eltwise_operand), - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z, - uint eltwise_operand_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint lhs_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - , - const int M, - const int N, - const int K) -{ - // Pixel unit -#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0) - - const uint LEFTOVER_K = K % K0; - - // Block size -#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0)) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (PIXEL_UNIT) -#define RHS_STEP_X (PIXEL_UNIT * (H0)) -#define RHS_STEP_LOOP (1) -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) -#define RHS_STEP_X PIXEL_UNIT -#define RHS_STEP_LOOP (H0) -#endif // defined(RHS_INTERLEAVE) - - uint x = get_global_id(0); - uint y = get_global_id(1); - uint z = get_global_id(2); - - const bool cond_y = y == 0; - const bool cond_x = ((x + 1) * N0 >= N); - -#if defined(DUMMY_WORK_ITEMS) - if((x * N0 >= N) || (y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH); -#else // defined(MATRIX_B_DEPTH) - const uint z_rhs = get_global_id(2); -#endif // defined(MATRIX_B_DEPTH) - - // Compute RHS matrix coordinates - uint x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X; - const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT; - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); - REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); - -#if defined(REINTERPRET_INPUT_AS_3D) - // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); - - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we - // multiply lhs_stride_z by DEPTH_GEMM3D - lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - lhs_offset += z * lhs_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); - - int i = 0; - for(; i <= (K - K0); i += K0) - { - // Load values from LHS matrix - LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); - - // Load values from RHS matrix stored in a cl_image - REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0); - LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0); - - // Accumulate - ARM_DOT_K0XN0(K0, a0, b, c0); -#if M0 > 1 - ARM_DOT_K0XN0(K0, a1, b, c1); -#endif // M0 > 1 -#if M0 > 2 - ARM_DOT_K0XN0(K0, a2, b, c2); -#endif // M0 > 2 -#if M0 > 3 - ARM_DOT_K0XN0(K0, a3, b, c3); -#endif // M0 > 3 -#if M0 > 4 - ARM_DOT_K0XN0(K0, a4, b, c4); -#endif // M0 > 4 -#if M0 > 5 - ARM_DOT_K0XN0(K0, a5, b, c5); -#endif // M0 > 5 -#if M0 > 6 - ARM_DOT_K0XN0(K0, a6, b, c6); -#endif // M0 > 6 -#if M0 > 7 - ARM_DOT_K0XN0(K0, a7, b, c7); -#endif // M0 > 7 - - lhs_offset += K0 * sizeof(DATA_TYPE); - x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP; - } - - if(LEFTOVER_K != 0) - { - // Note: We cannot read out-of-bound elements from the RHS matrix because - // the RHS width is always multiple of K0. This is not be true for the LHS matrix - - union UNION_VEC_TYPE - { - DATA_TYPE s[K0]; - VEC_DATA_TYPE(DATA_TYPE, K0) - v; - }; - - union UNION_VEC_TYPE a0 = {.v = 0 }; -#if M0 > 1 - union UNION_VEC_TYPE a1 = {.v = 0 }; -#endif // M0 > 1 -#if M0 > 2 - union UNION_VEC_TYPE a2 = {.v = 0 }; -#endif // M0 > 2 -#if M0 > 3 - union UNION_VEC_TYPE a3 = {.v = 0 }; -#endif // M0 > 3 -#if M0 > 4 - union UNION_VEC_TYPE a4 = {.v = 0 }; -#endif // M0 > 4 -#if M0 > 5 - union UNION_VEC_TYPE a5 = {.v = 0 }; -#endif // M0 > 5 -#if M0 > 6 - union UNION_VEC_TYPE a6 = {.v = 0 }; -#endif // M0 > 6 -#if M0 > 7 - union UNION_VEC_TYPE a7 = {.v = 0 }; -#endif // M0 > 7 - - REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0); - - // Load from RHS matrix - LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0); - - // Load from LHS matrix - for(int k = 0; k < LEFTOVER_K; ++k) - { - a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0); -#if M0 > 1 - a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1); -#endif // M0 > 1 -#if M0 > 2 - a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2); -#endif // M0 > 2 -#if M0 > 3 - a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3); -#endif // M0 > 3 -#if M0 > 4 - a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4); -#endif // M0 > 4 -#if M0 > 5 - a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5); -#endif // M0 > 5 -#if M0 > 6 - a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6); -#endif // M0 > 6 -#if M0 > 7 - a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7); -#endif // M0 > 7 - - lhs_offset += sizeof(DATA_TYPE); - } - - // Accumulate - ARM_DOT_K0XN0(K0, a0.v, b, c0); -#if M0 > 1 - ARM_DOT_K0XN0(K0, a1.v, b, c1); -#endif // M0 > 1 -#if M0 > 2 - ARM_DOT_K0XN0(K0, a2.v, b, c2); -#endif // M0 > 2 -#if M0 > 3 - 
ARM_DOT_K0XN0(K0, a3.v, b, c3); -#endif // M0 > 3 -#if M0 > 4 - ARM_DOT_K0XN0(K0, a4.v, b, c4); -#endif // M0 > 4 -#if M0 > 5 - ARM_DOT_K0XN0(K0, a5.v, b, c5); -#endif // M0 > 5 -#if M0 > 6 - ARM_DOT_K0XN0(K0, a6.v, b, c6); -#endif // M0 > 6 -#if M0 > 7 - ARM_DOT_K0XN0(K0, a7.v, b, c7); -#endif // M0 > 7 - } - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - - // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) -#if defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(M0, c, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z; - - LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias - ADD_BLOCK(M0, c, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - - // c = act(c) - POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - // c = c + eltwise_operand (mix-precision, broadcast, boundary aware) - POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, 1, PARTIAL_STORE_N0, false, cond_x); - // c = act(c) - POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - - // Store output block - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#undef RHS_BLOCK_SIZE -#undef RHS_OFFSET_X -#undef RHS_STEP_X -#undef PIXEL_UNIT -} -#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE_POST_ACT_ELTWISE_OP_ACT) - -#define VFMA(a, b, c) \ - ({ \ - c = fma(a, b, c); \ - }) - -#if M0 == 1 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - }) -#elif M0 == 2 // M0 == 2 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - 
VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - }) -#elif M0 == 3 // M0 == 3 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - }) -#elif M0 == 4 // M0 == 4 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - }) -#elif M0 == 5 // M0 == 5 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - }) -#elif M0 == 6 // M0 == 6 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ - }) -#elif M0 == 7 // M0 == 7 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ - }) -#elif M0 == 8 // M0 == 8 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \ - }) -#else // M0 not supported -#error "M0 not supported" -#endif // M0 not supported - -#if defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_POST_ACT_ELTWISE_OP_ACT) -/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops: - * Post op 1: activation (optional) - * Post op 2: elementwise op - * Post op 3: activation (optional) - * - * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform - * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * 
@note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * - * All parameters are similarly defined in kernel gemm_mm_reshaped_only_rhs_nt, with these additions: - * - * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32 - * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes) - * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes) - * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes) - * @param[in] M Number of rows in LHS matrix not reshaped. - * @param[in] N Number of columns in RHS matrix not reshaped. - * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped. - */ -__kernel void gemm_mm_reshaped_only_rhs_nt_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs), - IMAGE_DECLARATION(rhs), -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - // Post Op arguments - IMAGE_DECLARATION(eltwise_operand), - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z, - uint eltwise_operand_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint lhs_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - , - const int M, - const int N, - const int K) -{ - // Block size -#define RHS_BLOCK_SIZE ((K0) * (N0)) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (N0) -#define RHS_STEP_X ((N0) * (H0)) -#define RHS_STEP_LOOP (1) -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) -#define RHS_STEP_X (N0) -#define RHS_STEP_LOOP (H0) -#endif // defined(RHS_INTERLEAVE) - - uint x = get_global_id(0); - uint y = get_global_id(1); - uint z = get_global_id(2); - - const bool cond_y = y == 0; - const bool cond_x = ((x + 1) * N0 >= N); - -#if defined(DUMMY_WORK_ITEMS) - if((x * N0 >= N) || (y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; - - // Compute RHS reshaped matrix address - uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; -#else // defined(MATRIX_B_DEPTH) - rhs_offset += z * rhs_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); //uint zin0=0,zin1=0,zin2=0,... zin7=0; - REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); //uint zero0=0,zero1=0,zero2=0,... 
zero7=0; - -#if defined(REINTERPRET_INPUT_AS_3D) - - // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply lhs_stride_z by DEPTH_GEMM3D - lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - lhs_offset += z * lhs_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(N0-1)=0; - - int i = 0; - for(; i <= (K - K0); i += K0) - { - // Supported cases (M0, K0): - // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 - // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 - // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 - // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 - // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 - // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 - // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 - // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 - // Load values from LHS matrix - LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin); - - VEC_DATA_TYPE(DATA_TYPE, N0) - b0; - - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(0, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(1, a, b0, c); -#if K0 > 2 - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(2, a, b0, c); -#endif // K0 > 2 -#if K0 > 3 - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(3, a, b0, c); -#endif // K0 > 3 -#if K0 > 4 - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(4, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(5, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(6, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(7, a, b0, c); -#endif // K0 > 4 -#if K0 > 8 - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(8, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(9, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(A, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(B, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(C, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(D, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(E, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(F, a, b0, c); -#endif // K0 > 8 - - lhs_offset += K0 * 
sizeof(DATA_TYPE); - rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE); - } - - // Left-over accumulations - for(; i < K; ++i) - { - // Load values from LHS matrix - VEC_DATA_TYPE(DATA_TYPE, 2) - a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0)); -#if M0 > 1 - VEC_DATA_TYPE(DATA_TYPE, 2) - a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1)); -#endif // M0 > 1 -#if M0 > 2 - VEC_DATA_TYPE(DATA_TYPE, 2) - a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2)); -#endif // M0 > 2 -#if M0 > 3 - VEC_DATA_TYPE(DATA_TYPE, 2) - a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3)); -#endif // M0 > 3 -#if M0 > 4 - VEC_DATA_TYPE(DATA_TYPE, 2) - a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4)); -#endif // M0 > 4 -#if M0 > 5 - VEC_DATA_TYPE(DATA_TYPE, 2) - a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5)); -#endif // M0 > 5 -#if M0 > 6 - VEC_DATA_TYPE(DATA_TYPE, 2) - a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6)); -#endif // M0 > 6 -#if M0 > 7 - VEC_DATA_TYPE(DATA_TYPE, 2) - a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7)); -#endif // M0 > 7 - - VEC_DATA_TYPE(DATA_TYPE, N0) - b0; - - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(0, a, b0, c); - - lhs_offset += sizeof(DATA_TYPE); - rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE); - } - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); - - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) -#if defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(M0, c, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z; - - LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias - ADD_BLOCK(M0, c, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - - // c = act(c) - POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - // c = c + eltwise_operand (mix-precision, broadcast, boundary aware) - POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, 1, PARTIAL_STORE_N0, false, cond_x); - // c = act(c) - POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - - // Store output block - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#undef RHS_BLOCK_SIZE -#undef RHS_OFFSET_X -#undef RHS_STEP_X -#undef RHS_STEP_LOOP -} -#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_POST_ACT_ELTWISE_OP_ACT) - -#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE_POST_ACT_ELTWISE_OP_ACT) -/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object. - * Post op 1: activation (optional) - * Post op 2: elementwise op - * Post op 3: activation (optional) - * - * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform - * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * - * All parameters are similarly defined in kernel gemm_mm_reshaped_only_rhs_nt_texture, with these additions: - * - * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. 
Supported data type: F16/F32 - * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes) - * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes) - * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes) - * @param[in] M Number of rows in LHS matrix not reshaped. - * @param[in] N Number of columns in RHS matrix not reshaped. - * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped. - */ -__kernel void gemm_mm_reshaped_only_rhs_nt_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs), - __read_only image2d_t rhs_img, -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - // Post Op arguments - IMAGE_DECLARATION(eltwise_operand), - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z, - uint eltwise_operand_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint lhs_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - , - const int M, - const int N, - const int K) -{ - // Pixel unit -#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0) - - // Block size -#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT)) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (PIXEL_UNIT) -#define RHS_STEP_X ((PIXEL_UNIT) * (H0)) -#define RHS_STEP_LOOP (1) -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) -#define RHS_STEP_X (PIXEL_UNIT) -#define RHS_STEP_LOOP (H0) -#endif // defined(RHS_INTERLEAVE) - - uint x = get_global_id(0); - uint y = get_global_id(1); - uint z = get_global_id(2); - - const bool cond_y = y == 0; - const bool cond_x = ((x + 1) * N0 >= N); - -#if defined(DUMMY_WORK_ITEMS) - if((x * N0 >= N) || (y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - const uint z_rhs = (z % MATRIX_B_DEPTH); -#else // defined(MATRIX_B_DEPTH) - const uint z_rhs = z; -#endif // defined(MATRIX_B_DEPTH) - - // Compute RHS matrix coordinates - uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X; - const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT; - - REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); - REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); - -#if defined(REINTERPRET_INPUT_AS_3D) - - // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); - - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we - // multiply lhs_stride_z by DEPTH_GEMM3D - lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - lhs_offset += z * lhs_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); - - int i = 0; - for(; i <= (K - K0); i += K0) - { - // Load values from LHS matrix - LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin); - - VEC_DATA_TYPE(DATA_TYPE, N0) - b0; - - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(0, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(1, a, b0, c); -#if K0 > 2 - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(2, a, b0, c); -#endif // K0 > 2 -#if K0 > 3 - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(3, a, b0, c); -#endif // K0 > 3 -#if K0 > 4 - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(4, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(5, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(6, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(7, a, b0, c); -#endif // K0 > 4 -#if K0 > 8 - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(8, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(9, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(A, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(B, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(C, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(D, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(E, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(F, a, b0, c); -#endif // K0 > 8 - - lhs_offset += K0 * sizeof(DATA_TYPE); - x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP; - } - - // Left-over accumulations - for(; i < K; ++i) - { - // Load values from LHS matrix - VEC_DATA_TYPE(DATA_TYPE, 2) - a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0)); -#if M0 > 1 - VEC_DATA_TYPE(DATA_TYPE, 2) - a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1)); -#endif // M0 > 1 -#if M0 > 2 - VEC_DATA_TYPE(DATA_TYPE, 2) - a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2)); -#endif // M0 > 2 -#if M0 > 3 - VEC_DATA_TYPE(DATA_TYPE, 2) - a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3)); -#endif // M0 > 3 -#if M0 > 4 - VEC_DATA_TYPE(DATA_TYPE, 2) - a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4)); -#endif // M0 > 4 -#if M0 > 5 - VEC_DATA_TYPE(DATA_TYPE, 2) - a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * 
lhs_stride_y + zin5)); -#endif // M0 > 5 -#if M0 > 6 - VEC_DATA_TYPE(DATA_TYPE, 2) - a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6)); -#endif // M0 > 6 -#if M0 > 7 - VEC_DATA_TYPE(DATA_TYPE, 2) - a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7)); -#endif // M0 > 7 - - VEC_DATA_TYPE(DATA_TYPE, N0) - b0; - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs)); - - VFMA_M0xN0(0, a, b0, c); - - lhs_offset += sizeof(DATA_TYPE); - x_rhs += RHS_STEP_X; - } - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) -#if defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(M0, c, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z; - - LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias - ADD_BLOCK(M0, c, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - - // c = act(c) - POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - // c = c + eltwise_operand (mix-precision, broadcast, boundary aware) - POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, 1, PARTIAL_STORE_N0, false, cond_x); - // c = act(c) - POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - - // Store output block - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#undef RHS_BLOCK_SIZE -#undef RHS_OFFSET_X -#undef RHS_STEP_X -#undef RHS_STEP_LOOP -} -#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE_POST_ACT_ELTWISE_OP_ACT) -#endif // defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && 
defined(P2_ELTWISE_ARG1_WIDTH) -#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h deleted file mode 100644 index b584251c2aa0017f02f778995d6e3fbc1099b8af..0000000000000000000000000000000000000000 --- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h +++ /dev/null @@ -1,274 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** (EXPERIMENTAL_POST_OPS) Macros for (binary) elementwise operations */ - -/** List of (binary) elementwise operators, accounting for the argument position of argument X - * @note X_Pos denotes the position of argument X. e.g. X_POS_0 means X is in the first place whereas X_POS_1 means X is in the second place - * @name elementwise_post_ops - * @{ - */ -#if defined(N0) && !defined(VEC_SIZE) -#define VEC_SIZE N0 -#endif // defined(N0) && !defined(VEC_SIZE) - -#if defined(VEC_SIZE) && defined(DATA_TYPE) - -#define ADD_X_POS_0(x, y) (x) + (y) -#define SUB_X_POS_0(x, y) (x) - (y) -#define MAX_X_POS_0(x, y) max(x, y) -#define MIN_X_POS_0(x, y) min(x, y) -#define SQUARED_DIFF_X_POS_0(x, y) (x - y) * (x - y) -#define POWER_X_POS_0(x, y) pow(x, y) -#if VEC_SIZE == 1 -#define PRELU_X_POS_0(x, y) (x > 0 ? 
x : x * y) -#else // VEC_SIZE == 1 - -#if defined(MIXED_PRECISION) -#define PRELU_X_POS_0(x, y) (select(y * x, x, CONVERT((x > (DATA_TYPE_ACCUMULATOR)0), SELECT_VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, VEC_SIZE)))) -#else // MIXED_PRECISION -#define PRELU_X_POS_0(x, y) (select(y * x, x, CONVERT((x > (DATA_TYPE)0), SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)))) -#endif // MIXED_PRECISION - -#endif // VEC_SIZE == 1 -#define DIV_X_POS_0(x, y) (x / y) -#define AND_X_POS_0(x, y) (CONVERT((x && y), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)) & ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))1)) -#define OR_X_POS_0(x, y) (CONVERT((x || y), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)) & ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))1)) - -#define ADD_X_POS_1(x, y) ADD_X_POS_0(x, y) -#define SUB_X_POS_1(x, y) (y) - (x) -#define MAX_X_POS_1(x, y) MAX_X_POS_0(x, y) -#define MIN_X_POS_1(x, y) MIN_X_POS_0(x, y) -#define SQUARED_DIFF_X_POS_1(x, y) SQUARED_DIFF_X_POS_0(x, y) -#define POWER_X_POS_1(x, y) pow(y, x) -#if VEC_SIZE == 1 -#define PRELU_X_POS_1(x, y) (y > 0 ? y : y * x) -#else // VEC_SIZE == 1 - -#if defined(MIXED_PRECISION) -#define PRELU_X_POS_1(x, y) (select(x * y, y, CONVERT((y > (DATA_TYPE_ACCUMULATOR)0), SELECT_VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, VEC_SIZE)))) -#else // MIXED_PRECISION -#define PRELU_X_POS_1(x, y) (select(x * y, y, CONVERT((y > (DATA_TYPE)0), SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)))) -#endif // MIXED_PRECISION - -#endif // VEC_SIZE == 1 -#define DIV_X_POS_1(x, y) (y / x) -#define AND_X_POS_1(x, y) AND_X_POS_0(x, y) -#define OR_X_POS_1(x, y) OR_X_POS_0(x, y) - -// By default use the order of the arguments as they are passed in, ie. _X_POS_0 -#define ADD(x, y) ADD_X_POS_0(x, y) -#define SUB(x, y) SUB_X_POS_0(x, y) -#define MAX(x, y) MAX_X_POS_0(x, y) -#define MIN(x, y) MIN_X_POS_0(x, y) -#define SQUARED_DIFF(x, y) SQUARED_DIFF_X_POS_0(x, y) -#define POWER(x, y) POWER_X_POS_0(x, y) -#define PRELU(x, y) PRELU_X_POS_0(x, y) -#define DIV(x, y) DIV_X_POS_0(x, y) -#define AND(x, y) AND_X_POS_0(x, y) -#define OR(x, y) OR_X_POS_0(x, y) - -#endif // defined(VEC_SIZE) && defined(DATA_TYPE) -/** @} */ // end of group elementwise_post_ops - -/** Performs OPERAND1 = OP(OPERAND1, OPERAND2) - * @name ELTWISE_OP_ROW_n - * - * @param[in] OP The elementwise post op - * @param[in, out] OPERAND1 The basename of the destination and operand 1 variables - * @param[in] OPERAND2 The basename of the operand 2 variables - * @{ - */ -#define ELTWISE_OP_ROW_1(OP, OPERAND1, OPERAND2) \ - OPERAND1##0 = OP(OPERAND1##0, OPERAND2##0); - -#define ELTWISE_OP_ROW_2(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_1(OP, OPERAND1, OPERAND2) \ - OPERAND1##1 = OP(OPERAND1##1, OPERAND2##1); - -#define ELTWISE_OP_ROW_3(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_2(OP, OPERAND1, OPERAND2) \ - OPERAND1##2 = OP(OPERAND1##2, OPERAND2##2); - -#define ELTWISE_OP_ROW_4(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_3(OP, OPERAND1, OPERAND2) \ - OPERAND1##3 = OP(OPERAND1##3, OPERAND2##3); - -#define ELTWISE_OP_ROW_5(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_4(OP, OPERAND1, OPERAND2) \ - OPERAND1##4 = OP(OPERAND1##4, OPERAND2##4); - -#define ELTWISE_OP_ROW_6(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_5(OP, OPERAND1, OPERAND2) \ - OPERAND1##5 = OP(OPERAND1##5, OPERAND2##5); - -#define ELTWISE_OP_ROW_7(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_6(OP, OPERAND1, OPERAND2) \ - OPERAND1##6 = OP(OPERAND1##6, OPERAND2##6); - -#define ELTWISE_OP_ROW_8(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_7(OP, OPERAND1, OPERAND2) \ - OPERAND1##7 = OP(OPERAND1##7, OPERAND2##7); - -#define 
ELTWISE_OP_ROW_9(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_8(OP, OPERAND1, OPERAND2) \ - OPERAND1##8 = OP(OPERAND1##8, OPERAND2##8); - -#define ELTWISE_OP_ROW_10(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_9(OP, OPERAND1, OPERAND2) \ - OPERAND1##9 = OP(OPERAND1##9, OPERAND2##9); - -#define ELTWISE_OP_ROW_11(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_10(OP, OPERAND1, OPERAND2) \ - OPERAND1##A = OP(OPERAND1##A, OPERAND2##A); - -#define ELTWISE_OP_ROW_12(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_11(OP, OPERAND1, OPERAND2) \ - OPERAND1##B = OP(OPERAND1##B, OPERAND2##B); - -#define ELTWISE_OP_ROW_13(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_12(OP, OPERAND1, OPERAND2) \ - OPERAND1##C = OP(OPERAND1##C, OPERAND2##C); - -#define ELTWISE_OP_ROW_14(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_13(OP, OPERAND1, OPERAND2) \ - OPERAND1##D = OP(OPERAND1##D, OPERAND2##D); - -#define ELTWISE_OP_ROW_15(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_14(OP, OPERAND1, OPERAND2) \ - OPERAND1##E = OP(OPERAND1##E, OPERAND2##E); - -#define ELTWISE_OP_ROW_16(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_15(OP, OPERAND1, OPERAND2) \ - OPERAND1##F = OP(OPERAND1##F, OPERAND2##F); - -/** @} */ // end of group ELTWISE_OP_ROW_n - -/** Performs OPERAND1 = OP(OPERAND1, OPERAND2) - * @name ELTWISE_OP_BLOCK - * - * Supported cases are N=1,2,3,...,16 - * - * @param[in] OP The elementwise post op - * @param[in] N The number of vectors in the block - * @param[in] OPERAND1 The basename of the destination and operand 1 variables - * @param[in] OPERAND2 The basename of the operand 2 variables - * @{ - */ -#define ELTWISE_OP_BLOCK_STR(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_ROW_##N(OP, OPERAND1, OPERAND2) -#define ELTWISE_OP_BLOCK(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_BLOCK_STR(OP, N, OPERAND1, OPERAND2) -/** @} */ // end of group ELTWISE_OP_BLOCK - -/** Performs OPERAND1 = OP(OPERAND1, OPERAND2) with broadcasting - * @name ELTWISE_OP_ROW_BROADCAST_n - * - * @param[in] OP The elementwise post op - * @param[in, out] OPERAND1 The basename of the destination and operand 1 variables - * @param[in] OPERAND2 The basename of the broadcast operand 2 variables - * @{ - */ -#define ELTWISE_OP_ROW_BROADCAST_1(OP, OPERAND1, OPERAND2) \ - OPERAND1##0 = OP(OPERAND1##0, OPERAND2); - -#define ELTWISE_OP_ROW_BROADCAST_2(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_BROADCAST_1(OP, OPERAND1, OPERAND2) \ - OPERAND1##1 = OP(OPERAND1##1, OPERAND2); - -#define ELTWISE_OP_ROW_BROADCAST_3(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_BROADCAST_2(OP, OPERAND1, OPERAND2) \ - OPERAND1##2 = OP(OPERAND1##2, OPERAND2); - -#define ELTWISE_OP_ROW_BROADCAST_4(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_BROADCAST_3(OP, OPERAND1, OPERAND2) \ - OPERAND1##3 = OP(OPERAND1##3, OPERAND2); - -#define ELTWISE_OP_ROW_BROADCAST_5(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_BROADCAST_4(OP, OPERAND1, OPERAND2) \ - OPERAND1##4 = OP(OPERAND1##4, OPERAND2); - -#define ELTWISE_OP_ROW_BROADCAST_6(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_BROADCAST_5(OP, OPERAND1, OPERAND2) \ - OPERAND1##5 = OP(OPERAND1##5, OPERAND2); - -#define ELTWISE_OP_ROW_BROADCAST_7(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_BROADCAST_6(OP, OPERAND1, OPERAND2) \ - OPERAND1##6 = OP(OPERAND1##6, OPERAND2); - -#define ELTWISE_OP_ROW_BROADCAST_8(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_BROADCAST_7(OP, OPERAND1, OPERAND2) \ - OPERAND1##7 = OP(OPERAND1##7, OPERAND2); - -#define ELTWISE_OP_ROW_BROADCAST_9(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_BROADCAST_8(OP, OPERAND1, OPERAND2) \ - OPERAND1##8 = OP(OPERAND1##8, OPERAND2); - 
-#define ELTWISE_OP_ROW_BROADCAST_10(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_BROADCAST_9(OP, OPERAND1, OPERAND2) \ - OPERAND1##9 = OP(OPERAND1##9, OPERAND2); - -#define ELTWISE_OP_ROW_BROADCAST_11(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_BROADCAST_10(OP, OPERAND1, OPERAND2) \ - OPERAND1##A = OP(OPERAND1##A, OPERAND2); - -#define ELTWISE_OP_ROW_BROADCAST_12(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_BROADCAST_11(OP, OPERAND1, OPERAND2) \ - OPERAND1##B = OP(OPERAND1##B, OPERAND2); - -#define ELTWISE_OP_ROW_BROADCAST_13(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_BROADCAST_12(OP, OPERAND1, OPERAND2) \ - OPERAND1##C = OP(OPERAND1##C, OPERAND2); - -#define ELTWISE_OP_ROW_BROADCAST_14(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_BROADCAST_13(OP, OPERAND1, OPERAND2) \ - OPERAND1##D = OP(OPERAND1##D, OPERAND2); - -#define ELTWISE_OP_ROW_BROADCAST_15(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_BROADCAST_14(OP, OPERAND1, OPERAND2) \ - OPERAND1##E = OP(OPERAND1##E, OPERAND2); - -#define ELTWISE_OP_ROW_BROADCAST_16(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_BROADCAST_15(OP, OPERAND1, OPERAND2) \ - OPERAND1##F = OP(OPERAND1##F, OPERAND2); - -/** @} */ // end of group ELTWISE_OP_ROW_BROADCAST_n - -/** Performs OPERAND1 = OP(OPERAND1, OPERAND2) with broadcasting - * @name ELTWISE_OP_BLOCK_BROADCAST - * @note Only support: - * case 1 broadcast in Y dimension : Operand1 [YxX] + Operand2 [1xX]; - * case 2 broadcast in both Y and X dimensions : Operand1 [YxX] + Operand2 [1x1] (scalar); - * Does NOT support broad cast in X dimension: Operand1 [YxX] + Operand2 [Yx1]; - * - * Supported cases are N=1,2,3,...,16 - * - * @param[in] OP The elementwise post op - * @param[in] N The number of vectors in the block - * @param[in] OPERAND1 The basename of the destination and operand 1 variables - * @param[in] OPERAND2 The basename of the operand 2 variables - * @{ - */ -#define ELTWISE_OP_BLOCK_BROADCAST_STR(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_ROW_BROADCAST_##N(OP, OPERAND1, OPERAND2) -#define ELTWISE_OP_BLOCK_BROADCAST(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_BLOCK_BROADCAST_STR(OP, N, OPERAND1, OPERAND2) -/** @} */ // end of group ELTWISE_OP_BLOCK_BROADCAST \ No newline at end of file diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h deleted file mode 100644 index e107f4452dd82eaf5e53323853bbdd595b05f61a..0000000000000000000000000000000000000000 --- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) 2021-2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h" -#include "gemm_helpers.h" -#include "load_store_utility.h" - -/** (EXPERIMENTAL_POST_OPS) Convenience macros for automatically handling mixed precision (fp16 and fp32) operations - * -DMIXED_PRECISION toggles mixed precision mode - */ - -/** Mixed-Precision-Aware Activation Block - * @name MIXED_PRECISION_ACTIVATION_BLOCK - * params N ... B_VAL: same as those in @ref ACTIVATION_BLOCK - * - * @param[in] DATA_TYPE_ACCUMULATR Higher-precision accumulator data type in case of mixed-precision op - * @{ - */ -#if defined(MIXED_PRECISION) -#define MIXED_PRECISION_ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL, DATA_TYPE_ACCUMULATOR) \ - ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME, A_VAL, B_VAL); -#else // defined(MIXED_PRECISION) -#define MIXED_PRECISION_ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL, DATA_TYPE_ACCUMULATOR) \ - ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL); -#endif // defined(MIXED_PRECISION) -/** @} */ // end of group MIXED_PRECISION_ACTIVATION_BLOCK - -/** Mixed-Precision-Aware Elementwise Op Block - * Performs OPERAND1 = OP(OPERAND1, OPERAND2) - * @name MIXED_PRECISION_ELTWISE_OP_BLOCK - * - * @param[in] OP The elementwise post op - * @param[in] M0 The number of consecutive rows - * @param[in] N0 The number of consecutive columns - * @param[in] OPERAND1 The basename of the first and result operand variables - * @param[in] OPERAND2 The basename of the second operand variables - * @param[in] DATA_TYPE_ACCUMULATR Higher-precision accumulator data type in case of mixed-precision op - * @param[in] CONVERTED_OPERAND2 The basename of the second operand variables converted to higher-precision in case of mixed-precision op - * @{ - */ -#if defined(MIXED_PRECISION) -#define MIXED_PRECISION_ELTWISE_OP_BLOCK(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \ - CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, OPERAND2, CONVERTED_OPERAND2); \ - ELTWISE_OP_BLOCK(OP, M0, OPERAND1, CONVERTED_OPERAND2); -#else // defined(MIXED_PRECISION) -#define MIXED_PRECISION_ELTWISE_OP_BLOCK(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \ - ELTWISE_OP_BLOCK(OP, M0, OPERAND1, OPERAND2); -#endif // defined(MIXED_PRECISION) -/** @} */ // end of group MIXED_PRECISION_ELTWISE_OP_BLOCK - -/** Mixed-Precision-Aware Elementwise Op Broadcast Block - * Performs OPERAND1 = OP(OPERAND1, OPERAND2) - * @name MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST - * @note Only support: - * case 1 broadcast in Y dimension : Operand1 [YxX] + Operand2 [1xX]; this means @p N0 > 1 - * case 2 broadcast in both Y and X dimensions : Operand1 [YxX] + Operand2 [1x1] (scalar) ; this means @p N0 == 1 - * Does NOT support broad cast in X dimension: Operand1 [YxX] + Operand2 [Yx1]; this means @p M0 should never == 1 - * - * @param[in] OP The elementwise post op - * @param[in] M0 The number of consecutive rows, > 1 - * @param[in] N0 The number of consecutive columns, >= 1 - * @param[in] OPERAND1 The basename of the first and result operand variables - * @param[in] OPERAND2 The basename of the second operand 
variables - * @param[in] DATA_TYPE_ACCUMULATR Higher-precision accumulator data type in case of mixed-precision op - * @param[in] CONVERTED_OPERAND2 The basename of the second operand variables converted to higher-precision in case of mixed-precision op - * @{ - */ -#if defined(MIXED_PRECISION) -#define MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \ - CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, OPERAND2, CONVERTED_OPERAND2); \ - ELTWISE_OP_BLOCK_BROADCAST(OP, M0, OPERAND1, CONVERTED_OPERAND2##0); -#else // defined(MIXED_PRECISION) -#define MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \ - ELTWISE_OP_BLOCK_BROADCAST(OP, M0, OPERAND1, OPERAND2##0); -#endif // defined(MIXED_PRECISION) -/** @} */ // end of group MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST - -/** Mixed-Precision-Aware Boundary-Aware Store Block - * @name MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE - * params M0 ... PARTIAL_COND_X, same as those in STORE_BLOCK_BOUNDARY_AWARE - * - * @param[in] BASENAME_LP The name of the low precision variables, converted from BASENAME, in case of mixed-precision op - * @{ - */ -#if defined(MIXED_PRECISION) -#define MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X, BASENAME_LP) \ - CONVERT_BLOCK(M0, N0, DATA_TYPE, BASENAME, BASENAME_LP); \ - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME_LP, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X); -#else // defined(MIXED_PRECISION) -#define MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X, BASENAME_LP) \ - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X); -#endif // defined(MIXED_PRECISION) -/** @} */ // end of group MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE \ No newline at end of file diff --git a/src/core/CL/cl_kernels/common/gemm.cl b/src/core/CL/cl_kernels/common/gemm.cl index a32301d8e3293c485920a95ea163764e9158b16c..0c30c0e626011fe4f3464bf8ad57652a24d488c4 100644 --- a/src/core/CL/cl_kernels/common/gemm.cl +++ b/src/core/CL/cl_kernels/common/gemm.cl @@ -152,7 +152,6 @@ /** This OpenCL kernel computes the matrix multiplication between 2 matrices. * The LHS matrix is NOT reshaped * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed - * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl * * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters. @@ -453,7 +452,6 @@ __kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs), /** This OpenCL kernel computes the matrix multiplication between 2 matrices. 
The RHS matrix is stored in OpenCL image * The LHS matrix is NOT reshaped * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed - * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl * * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. @@ -887,7 +885,6 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs), /** This OpenCL kernel computes the matrix multiplication between 2 matrices. * The LHS matrix is NOT reshaped * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed - * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl * * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters. @@ -1213,7 +1210,6 @@ __kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs), /** This OpenCL kernel computes the matrix multiplication between 2 matrices. * The LHS matrix is NOT reshaped * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed - * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl * * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. @@ -1713,7 +1709,6 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs), /** This OpenCL kernel computes the matrix multiplication between 2 matrices. * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed - * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl * * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) * @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float) @@ -1993,7 +1988,6 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), /** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object. * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed - * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl * * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. 
-DDATA_TYPE=float) @@ -2380,7 +2374,6 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs), /** This OpenCL kernel computes the matrix multiplication between 2 matrices. * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed - * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl * * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE). * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. @@ -2767,7 +2760,6 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs), /** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object. * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed - * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl * * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE). @@ -3226,7 +3218,6 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs), /** This OpenCL kernel computes the matrix multiplication between 2 matrices. * The LHS matrix is NOT reshaped * The RHS matrix is NOT reshaped - * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl * * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters. diff --git a/src/core/CL/cl_kernels/common/mat_mul_quantized_mmul.cl b/src/core/CL/cl_kernels/common/mat_mul_quantized_mmul.cl new file mode 100644 index 0000000000000000000000000000000000000000..fdfb75d39ccae744d41f2929be38f8af6a667094 --- /dev/null +++ b/src/core/CL/cl_kernels/common/mat_mul_quantized_mmul.cl @@ -0,0 +1,832 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#ifdef BIAS
+// This function performs in-place bias addition for integer data types when bias is enabled.
+// Note: the tile's dimensions used for the LHS and RHS matrices (M0, N0) must be passed at compile time using -DN0, -DM0 (e.g. -DN0=8, -DM0=4).
+inline void perform_bias_addition(uchar *bias_ptr, uint bias_offset_first_element_in_bytes, TILE(int, M0, N0, acc), uint x)
+{
+    TILE(int, 1, N0, bias_tile);
+
+    // Below expands to use bias_ptr and bias_offset_first_element_in_bytes
+    T_LOAD(int, 1, N0, BUFFER, bias, x, 0, 1, 0, bias_tile);
+
+    // c = c + bias[broadcasted]
+    T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, acc, bias_tile, acc);
+}
+#endif // defined(BIAS)
+
+#define MMUL_BLOCK_SIZE (MMUL_M0 * MMUL_N0) // MMUL block size for the output matrix
+
+#if defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_NT_NT)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only
+ *
+ * @note The "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension.
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
+ * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at
+ *       compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The number of leftover output rows/columns must be passed using -DN0_LEFTOVER and -DM0_LEFTOVER
+ *       (e.g. -DN0_LEFTOVER=2, -DM0_LEFTOVER=3)
+ * @note The dimensions M, N, K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=5, -DN=8, -DK=6).
+ *       K must be a multiple of 16.
+ * @note MMUL block sizes must be passed at compile time using -DMMUL_K0, -DMMUL_M0, -DMMUL_N0
+ *       (e.g. -DMMUL_K0=16, -DMMUL_M0=4, -DMMUL_N0=4)
+ * @note If there is a bias, the -DBIAS option must be passed at compile time
+ * @note Quantization offsets of lhs, rhs and dst tensors must be passed at compile time using -DLHS_OFFSET,
+ *       -DRHS_OFFSET, -DDST_OFFSET (e.g. -DLHS_OFFSET=10, -DRHS_OFFSET=0, -DDST_OFFSET=-6)
+ * @note Effective quantization multiplier and shift for the destination tensor must be passed at compile time using
+ *       -DDST_MULTIPLIER and -DDST_SHIFT (e.g. -DDST_MULTIPLIER=2091, -DDST_SHIFT=8)
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_MMUL_NT_NT)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 > 0
+ *  - N0 = 1, 2, 3, 4, 8, 16
+ *  - K0 = 4
+ * @note For a generic view on how the MMUL works, see mat_mul_mmul.cl
+ *
+ * @param[in] lhs_ptr Pointer to the lhs matrix.
Supported data types: QASYMM8_SIGNED/QASYMM8 + * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes) + * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes) + * @param[in] lhs_w The width of the lhs tensor + * @param[in] lhs_h The height of the lhs tensor + * @param[in] lhs_n Number of the matrices (buffers) in the batch + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix + * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr + * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes) + * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes) + * @param[in] rhs_w The width of the rhs tensor + * @param[in] rhs_h The height of the rhs tensor + * @param[in] rhs_n Number of the matrices (buffers) in the batch + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix + * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: S32 + * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes) + * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes) + * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor + * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor + * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor + * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor + * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr + * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes) + * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes) + * @param[in] dst_w The width of the dst tensor + * @param[in] dst_h The height of the dst tensor + * @param[in] dst_n Number of the matrices (buffers) in the batch + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix + */ +__kernel void mat_mul_native_quantized_mmul_nt_nt( + TENSOR3D_T(lhs, BUFFER), + TENSOR3D_T(rhs, BUFFER), +#ifdef BIAS + TENSOR3D_T(bias, BUFFER), +#endif // defined(BIAS) + TENSOR3D_T(dst, BUFFER)) +{ + // The explanation of how this kernel works is very similar to the explanation given in + // mat_mul_mmul.cl. The MMUL logic, and terminology is the same. The only difference is + // in quantization multiplication, the MMUL block sizes are (4 x 16) for Lhs matrix and + // (16 x 4) for Rhs matrix, resulting in (4 x 4) MMUL block size for the destination. + // + // Figures 1, 2 and 3 in the previous explanation works the same. Since the Lhs and Rhs + // MMUL block sizes are different in quantized extension, the thread access pattern is + // slightly different. We can redraw Figure 4 (Thread access pattern) as follows: + // + // (Modified Figure 4 from mat_mul_mmul.cl) + // Thread Access Layouts in LHS & RHS matrices + // + // LHS matrix + // 4 times 4 times 4 times 4 times + // _______________________________________________________________ + // |T0_|T0_|T0_|T0_|T1_|T1_|T1_|T1_|T2_|T2_|T2_|T2_|T3_|T3_|T3_|T3_| + // |T0_| ... | + // M0 | . . | + // Times | . . | + // | . . 
| + // |T0_|T0_|T0_|T0_|T1_|T1_|T1_|T1_|T2_|T2_|T2_|T2_|T3_|T3_|T3_|T3_| + // |T4_|T4_|T4_|T4_|T5_|T5_|T5_|T5_|T6_|T6_|T6_|T6_|T7_|T7_|T7_|T7_| + // |T4_|T4_|T4_|T4_|T5_|T5_|T5_|T5_|T6_|T6_|T6_|T6_|T7_|T7_|T7_|T7_| + // M0 | . . | + // Times | . . | + // | . . | + // |T4_|T4_|T4_|T4_|T5_|T5_|T5_|T5_|T6_|T6_|T6_|T6_|T7_|T7_|T7_|T7_| + // |T8_|T8_|T8_|T8_|T9_|T9_|T9_|T9_|T10|T10|T10|T10|T11|T11|T11|T11| + // M0 | . | + // Times | . | + // | . | + // |T8_|T8_|T8_|T8_|T9_|T9_|T9_|T9_|T10|T10|T10|T10|T11|T11|T11|T11| + // M0 | . | + // Times | . | + // | . | + // |T12|T12|T12|T12|T13|T13|T13|T13|T14|T14|T14|T14|T15|T15|T15|T15| + // + // + // RHS Matrix + // + // __________N0 times______N0 times____________________N0 times_______ + // |__T0__| ... |__T0__|__T1__| ... |__T1__| ... |__T3__| ... |__T3__| + // 4 times |__T0__| ... |__T0__|__T1__| ... |__T1__| ... |__T3__| ... |__T3__| + // |__T0__| ... |__T0__|__T1__| ... |__T1__| ... |__T3__| ... |__T3__| + // |__T0__| ... |__T0__|__T1__| ... |__T1__| ... |__T3__| ... |__T3__| + // |__T4__| ... |__T4__|__T5__| ... |__T5__| ... |__T7__| ... |__T7__| + // 4 times |__T4__| ... |__T4__|__T5__| ... |__T5__| ... |__T7__| ... |__T7__| + // |__T4__| ... |__T4__|__T5__| ... |__T5__| ... |__T7__| ... |__T7__| + // X |__T4__| ... |__T4__|__T5__| ... |__T5__| ... |__T7__| ... |__T7__| + // |__T8__| ... |__T8__|__T9__| ... |__T9__| ... |__T11_| ... |__T11_| + // |__T8__| ... |__T8__|__T9__| ... |__T9__| ... |__T11_| ... |__T11_| + // 4 times |__T8__| ... |__T8__|__T9__| ... |__T9__| ... |__T11_| ... |__T11_| + // |__T8__| ... |__T8__|__T9__| ... |__T9__| ... |__T11_| ... |__T11_| + // |__T12_| ... |__T12_|__T13_| ... |__T13_| ... |__T15_| ... |__T15_| + // 4 times |__T12_| ... |__T12_|__T13_| ... |__T13_| ... |__T15_| ... |__T15_| + // |__T12_| ... |__T12_|__T13_| ... |__T13_| ... |__T15_| ... |__T15_| + // |__T12_|_____|__T12_|__T13_|______|__T13_|_____|__T15_|_____|__T15_| + // + // + // The logic behind this thread access pattern is already descried in the explanation + // in mat_mul_mmul.cl. The only change is threads accesses are extended to 4 elements + // from 1, in rightward direction in Lhs, and in downward direction in Rhs, because they + // are now operating on 4 char/uchar's (again 32-bit data), instead of one 32-bit floating point. + // + // The mathematical view of the matrix multiplication explained in Figure 5 also holds for this, + // except the dimension 4 is 16 instead, but the vector notations do not change, i.e. it's as follows: + // + // Settings: + // - a 8 x 16 LHS section + // - 16 x 8 RHS section + // - Each vector variable ai, bj represent a 16x1 vector + // - ^T (superscript T) denotes transpose + // - M0 = N0 = 2 + // - MMUL_N0 = MMUL_M0 = 4, MMUL_K0 = 16 + // + // + // (Modified Figure 5) + // Mathematical view of the Matrix Multiplication + // + // LHS RHS DST + // [ a1^T ] [ b1 b2 b3 b4 b5 b6 b7 ] [ a1^Tb1 a1^Tb2 a1^Tb3 ... a1^Tb7 ] + // [ a2^T ] 16 x 8 [ a2^Tb1 a2^Tb2 a2^Tb3 ... a2^Tb7 ] + // [ a3^T ] [ ] + // [ a4^T ] = [ . . ] + // [ a5^T ] X [ . . ] + // [ a6^T ] [ . . ] + // [ a7^T ] [ ] + // [ a8^T ] [ a7^Tb1 a7^Tb2 a7^Tb3 ... a7^Tb7 ] + // 8 x 16 8 x 8 + // + // + // For the first iteration, i.e. 
(m0, n0) = (0, 0), arm_matrix_multiply() would multiply the following matrices:
+    //
+    //  [ a1^T ]   [ b1 b3 b5 b7 ]     [ a1^Tb1  a1^Tb3  a1^Tb5  a1^Tb7 ]
+    //  [ a3^T ] x      4 x 4        = [ a3^Tb1  a3^Tb3  a3^Tb5  a3^Tb7 ]
+    //  [ a5^T ]                       [ a5^Tb1  a5^Tb3  a5^Tb5  a5^Tb7 ]
+    //  [ a7^T ]                       [ a7^Tb1  a7^Tb3  a7^Tb5  a7^Tb7 ]
+    //    4 x 4                                      4 x 4
+    // The elements calculated in the 4x4 output block are the "interleaved" elements in the DST above.
+    // When this is repeated for each combination of (m0, n0), every element of the DST matrix "section" is filled.
+    //
+    // Please refer to mat_mul_mmul.cl for more details.
+
+    const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)
+    // The upper limit is a simplified version of ((N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE
+    const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)
+    const uint z = get_global_id(2);  // Batch
+
+    // Get section coordinates
+    const uint section_x = (x0 / MMUL_BLOCK_SIZE);
+    const uint section_y = y0;
+
+    // Get thread coordinates within an mmul block
+    const uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+    const uint thread_x = thread_id % MMUL_N0;
+    const uint thread_y = (thread_id / MMUL_N0);
+
+    // Calculate dst coordinates
+    const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0;
+    const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0;
+    const uint dst_x = min(dst_x_unclamped, (uint)(N - N0));
+    const uint dst_y = min(dst_y_unclamped, (uint)(M - M0));
+
+    // Starting LHS coordinates
+    const uint lhs_x = K0 * thread_x;
+    const uint lhs_y = dst_y;
+
+    // Starting RHS coordinates
+    const uint rhs_x = dst_x;
+    const uint rhs_y = K0 * thread_y;
+
+    // Compute LHS/RHS/DST matrix address
+    lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+    rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+    dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+    // Initialize the accumulators
+    TILE(int, M0, N0, c);
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        c[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
+    })
+
+    // Calculate row and column sums
+    TILE(int, 1, N0, b_sum);
+    b_sum[0].v = 0;
+
+    TILE(int, 1, M0, a_sum);
+    a_sum[0].v = 0;
+
+    VEC_DATA_TYPE(DATA_TYPE, K0)
+    vec_1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(1, 1, 1, 1);
+
+    for(int k = 0; k < lhs_w; k += MMUL_K0)
+    {
+        // A tile of M0xK0, where K0 must be set to 4
+        TILE(DATA_TYPE, M0, K0, a);
+        // A tile of K0xN0, where K0 must be set to 4
+        TILE(DATA_TYPE, K0, N0, b);
+
+        // Load tile from the lhs/rhs tensors
+        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+        T_LOAD(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+        LOOP_UNROLLING(int, n0, 0, 1, N0,
+        {
+            VEC_DATA_TYPE(DATA_TYPE, K0)
+            vec_b = (VEC_DATA_TYPE(DATA_TYPE, K0))(b[0].s[n0], b[1].s[n0], b[2].s[n0], b[3].s[n0]);
+
+            LOOP_UNROLLING(int, m0, 0, 1, M0,
+            {
+                c[m0].s[n0] = arm_matrix_multiply(a[m0].v, vec_b, c[m0].s[n0]);
+            })
+
+#if LHS_OFFSET != 0
+            // Column sum of B: calculate the sum of columns by multiplying B
+            // with a matrix of 1's from the left
+            b_sum[0].s[n0] = arm_matrix_multiply(vec_1, vec_b, b_sum[0].s[n0]);
+#endif // LHS_OFFSET != 0
+        })
+
+#if RHS_OFFSET != 0
+        // Row sum of A: calculate the sum of rows by multiplying A with
+        // a matrix of 1's from the right
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            a_sum[0].s[m0] = arm_matrix_multiply(a[m0].v, vec_1, a_sum[0].s[m0]);
+        })
+#endif // RHS_OFFSET != 0
+
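+        // Why the row/column sums above are needed (a worked expansion of the zero-point
+        // handling; only identifiers already used in this kernel appear below):
+        //
+        //   sum_k (a_k - LHS_OFFSET) * (b_k - RHS_OFFSET)
+        //       =   sum_k a_k * b_k                 (accumulated by arm_matrix_multiply)
+        //         - RHS_OFFSET * sum_k a_k          (a_sum, needed only when RHS_OFFSET != 0)
+        //         - LHS_OFFSET * sum_k b_k          (b_sum, needed only when LHS_OFFSET != 0)
+        //         + K * LHS_OFFSET * RHS_OFFSET     (the value c was initialized with)
+        //
+        // so after the loop the two offset terms are subtracted from c using a_sum and b_sum.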
lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE); + rhs_offset_first_element_in_bytes += MMUL_K0 * rhs_stride_y; + } + + // Do not write if the coordinates are out of bound + // But, read has to happen as arm_matrix_multiply() expects certain number of calls + if(dst_x_unclamped >= N || dst_y_unclamped >= M) + { + return; + } + +#if RHS_OFFSET != 0 || LHS_OFFSET != 0 + LOOP_UNROLLING(int, i, 0, 1, M0, + { + const int A = ((int)RHS_OFFSET) * a_sum[0].s[i]; + LOOP_UNROLLING(int, j, 0, 1, N0, + { + c[i].s[j] -= A + ((int)(LHS_OFFSET)) * b_sum[0].s[j]; + }) + }) +#endif // RHS_OFFSET != 0 || LHS_OFFSET != 0 + +#ifdef BIAS + perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x); +#endif // defined(BIAS) + + // Quantize the tile + TILE(DATA_TYPE, M0, N0, cq); + T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq); + + if(dst_x + N0 <= N || N0_LEFTOVER == 0) + { + LOOP_UNROLLING(int, m0, 0, 1, M0, + { + if(dst_y + m0 < M || M0_LEFTOVER == 0) + { + VSTORE(N0) + (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y)); + } + }) + } + else + { + LOOP_UNROLLING(int, m0, 0, 1, M0, + { + if(dst_y + m0 < M || M0_LEFTOVER == 0) + { + VSTORE_PARTIAL(N0, N0_LEFTOVER) + (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y)); + } + }) + } +} +#endif // defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_NT_NT) + +#if defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_NT_T) +/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS transposed - buffer only + * + * Supported block configurations: + * - M0 > 0 + * - N0 = 1, 2, 3, 4, 8, 16 + * - K0 = 4 + * + * Similar to mat_mul_native_quantized_mmul_nt_nt() + */ +__kernel void mat_mul_native_quantized_mmul_nt_t( + TENSOR3D_T(lhs, BUFFER), + TENSOR3D_T(rhs, BUFFER), +#ifdef BIAS + TENSOR3D_T(bias, BUFFER), +#endif // defined(BIAS) + TENSOR3D_T(dst, BUFFER)) +{ + const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0) + // The upper limit is a simplified version of (N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE) + const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0) + const uint z = get_global_id(2); // Batch + + // Get section coordinates + const uint section_x = (x0 / MMUL_BLOCK_SIZE); + const uint section_y = y0; + + // Get thread coordinates within an mmul block + const uint thread_id = (x0 % MMUL_BLOCK_SIZE); + const uint thread_x = thread_id % MMUL_N0; + const uint thread_y = (thread_id / MMUL_N0); + + // Calculate dst coordinates + const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0; + const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0; + const uint dst_x = min(dst_x_unclamped, (uint)(N - N0)); + const uint dst_y = min(dst_y_unclamped, (uint)(M - M0)); + + // Starting LHS coordinates + const uint lhs_x = K0 * thread_x; + const uint lhs_y = dst_y; + + // Starting RHS coordinates + const uint rhs_x = K0 * thread_y; + const uint rhs_y = dst_x; + + // Compute LHS/RHS/DST matrix address + lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z; + rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z; + dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z; + + // Initialize the accumulators + TILE(int, M0, N0, c); + LOOP_UNROLLING(int, i, 0, 1, M0, + { + c[i].v = K * ((int)LHS_OFFSET) 
* ((int)RHS_OFFSET); + }) + + // Calculate row and column sums + TILE(int, 1, N0, b_sum); + b_sum[0].v = 0; + + TILE(int, 1, M0, a_sum); + a_sum[0].v = 0; + + VEC_DATA_TYPE(DATA_TYPE, K0) + vec_1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(1, 1, 1, 1); + + for(int k = 0; k < lhs_w; k += MMUL_K0) + { + // A tile of M0xK0 but K0 must be set to K0 + TILE(DATA_TYPE, M0, K0, a); + // A tile of K0xN0 but K0 must be set to K0 + TILE(DATA_TYPE, N0, K0, b); + + // Load tile from the lhs/rhs tensors + T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a); + T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b); + + LOOP_UNROLLING(int, m0, 0, 1, M0, + { + LOOP_UNROLLING(int, n0, 0, 1, N0, + { + c[m0].s[n0] = arm_matrix_multiply(a[m0].v, b[n0].v, c[m0].s[n0]); + }) + }) + +#if RHS_OFFSET != 0 + // Row Sum of A: Calculate the sum of rows by multiplying A with + // a matrix of 1's from Right + LOOP_UNROLLING(int, m0, 0, 1, M0, + { + a_sum[0].s[m0] = arm_matrix_multiply(a[m0].v, vec_1, a_sum[0].s[m0]); + }) +#endif // RHS_OFFSET != 0 + +#if LHS_OFFSET != 0 + // Column Sum of B: Calculate the sum of columns by multiplying B + // with a matrix of 1's from Left + LOOP_UNROLLING(int, n0, 0, 1, N0, + { + b_sum[0].s[n0] = arm_matrix_multiply(vec_1, b[n0].v, b_sum[0].s[n0]); + }) +#endif // LHS_OFFSET != 0 + + lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE); + rhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE); + } + + // Do not write if the coordinates are out of bound + // But, read has to happen as arm_matrix_multiply() expects certain number of calls + if(dst_x_unclamped >= N || dst_y_unclamped >= M) + { + return; + } + +#if RHS_OFFSET != 0 || LHS_OFFSET != 0 + LOOP_UNROLLING(int, i, 0, 1, M0, + { + const int A = ((int)RHS_OFFSET) * a_sum[0].s[i]; + LOOP_UNROLLING(int, j, 0, 1, N0, + { + c[i].s[j] -= A + ((int)(LHS_OFFSET)) * b_sum[0].s[j]; + }) + }) +#endif // RHS_OFFSET != 0 || LHS_OFFSET != 0 + +#ifdef BIAS + perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x); +#endif // defined(BIAS) + + // Quantize the tile + TILE(DATA_TYPE, M0, N0, cq); + T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq); + + if(dst_x + N0 <= N || N0_LEFTOVER == 0) + { + LOOP_UNROLLING(int, m0, 0, 1, M0, + { + if(dst_y + m0 < M || M0_LEFTOVER == 0) + { + VSTORE(N0) + (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y)); + } + }) + } + else + { + LOOP_UNROLLING(int, m0, 0, 1, M0, + { + if(dst_y + m0 < M || M0_LEFTOVER == 0) + { + VSTORE_PARTIAL(N0, N0_LEFTOVER) + (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y)); + } + }) + } +} +#endif // defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_NT_T) + +#if defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_T_NT) +/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS non-transposed + * + * Supported block configurations: + * - M0 = 1, 2, 3, 4, 8, 16 + * - N0 = 1, 2, 3, 4, 8, 16 + * - K0 = 4 + * + * Similar to mat_mul_native_quantized_mmul_nt_nt() + */ +__kernel void mat_mul_native_quantized_mmul_t_nt( + TENSOR3D_T(lhs, BUFFER), + TENSOR3D_T(rhs, BUFFER), +#ifdef BIAS + TENSOR3D_T(bias, BUFFER), +#endif // defined(BIAS) + TENSOR3D_T(dst, BUFFER)) +{ + const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0) + // The upper limit is a simplified version of (N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE) + const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0) + 
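As in the other variants, each work-item first works out which mmul block ("section") it belongs to and which lane it occupies inside that block, and from those derives the origin of its M0 x N0 destination tile. A small C sketch of that mapping, assuming MMUL_BLOCK_SIZE equals MMUL_M0 * MMUL_N0 (i.e. 16 work-items cooperate on one mmul block); the struct and function names are illustrative only:

typedef struct
{
    unsigned int dst_x; // dst_x_unclamped in the kernels
    unsigned int dst_y; // dst_y_unclamped in the kernels
} tile_origin;

static tile_origin map_work_item(unsigned int x0, unsigned int y0,
                                 unsigned int M0, unsigned int N0,
                                 unsigned int MMUL_M0, unsigned int MMUL_N0)
{
    const unsigned int MMUL_BLOCK_SIZE = MMUL_M0 * MMUL_N0;

    const unsigned int section_x = x0 / MMUL_BLOCK_SIZE; // mmul block index along N
    const unsigned int section_y = y0;                   // mmul block index along M

    const unsigned int thread_id = x0 % MMUL_BLOCK_SIZE; // lane inside the mmul block
    const unsigned int thread_x  = thread_id % MMUL_N0;
    const unsigned int thread_y  = thread_id / MMUL_N0;

    tile_origin t;
    t.dst_x = thread_x * N0 + section_x * N0 * MMUL_N0;
    t.dst_y = thread_y * M0 + section_y * M0 * MMUL_M0;
    return t;
}

The kernel then clamps these coordinates to N - N0 and M - M0 so that out-of-range lanes still take part in every arm_matrix_multiply call (the extension expects all lanes of a block to execute it) but return before the final store.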
const uint z = get_global_id(2); // Batch + + // Get section coordinates + const uint section_x = (x0 / MMUL_BLOCK_SIZE); + const uint section_y = y0; + + // Get thread coordinates within an mmul block + const uint thread_id = (x0 % MMUL_BLOCK_SIZE); + const uint thread_x = thread_id % MMUL_N0; + const uint thread_y = (thread_id / MMUL_N0); + + // Calculate dst coordinates + const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0; + const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0; + const uint dst_x = min(dst_x_unclamped, (uint)(N - N0)); + const uint dst_y = min(dst_y_unclamped, (uint)(M - M0)); + + // Starting LHS coordinates + const uint lhs_x = dst_y; + const uint lhs_y = K0 * thread_x; + + // Starting RHS coordinates + const uint rhs_x = dst_x; + const uint rhs_y = K0 * thread_y; + + // Compute LHS/RHS/DST matrix address + lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z; + rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z; + dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z; + + // Initialize the accumulators + TILE(int, M0, N0, c); + LOOP_UNROLLING(int, i, 0, 1, M0, + { + c[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET); + }) + + // Calculate row and column sums + TILE(int, 1, N0, b_sum); + b_sum[0].v = 0; + + TILE(int, 1, M0, a_sum); + a_sum[0].v = 0; + + VEC_DATA_TYPE(DATA_TYPE, K0) + vec_1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(1, 1, 1, 1); + + for(int k = 0; k < lhs_h; k += MMUL_K0) + { + TILE(DATA_TYPE, K0, M0, a); + TILE(DATA_TYPE, K0, N0, b); + + // Load tile from the lhs/rhs tensors + T_LOAD(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a); + T_LOAD(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b); + + LOOP_UNROLLING(int, m0, 0, 1, M0, + { + VEC_DATA_TYPE(DATA_TYPE, K0) + vec_a = (VEC_DATA_TYPE(DATA_TYPE, K0))(a[0].s[m0], a[1].s[m0], a[2].s[m0], a[3].s[m0]); + + LOOP_UNROLLING(int, n0, 0, 1, N0, + { + VEC_DATA_TYPE(DATA_TYPE, K0) + vec_b = (VEC_DATA_TYPE(DATA_TYPE, K0))(b[0].s[n0], b[1].s[n0], b[2].s[n0], b[3].s[n0]); + + c[m0].s[n0] = arm_matrix_multiply(vec_a, vec_b, c[m0].s[n0]); + }) + +#if RHS_OFFSET != 0 + // Row Sum of A: Calculate the sum of rows by multiplying A with + // a matrix of 1's from Right + a_sum[0].s[m0] = arm_matrix_multiply(vec_a, vec_1, a_sum[0].s[m0]); +#endif // RHS_OFFSET != 0 + }) + +#if LHS_OFFSET != 0 + // Column Sum of B: Calculate the sum of columns by multiplying B + // with a matrix of 1's from Left + LOOP_UNROLLING(int, n0, 0, 1, N0, + { + VEC_DATA_TYPE(DATA_TYPE, K0) + vec_b = (VEC_DATA_TYPE(DATA_TYPE, K0))(b[0].s[n0], b[1].s[n0], b[2].s[n0], b[3].s[n0]); + + b_sum[0].s[n0] = arm_matrix_multiply(vec_1, vec_b, b_sum[0].s[n0]); + }) +#endif // LHS_OFFSET != 0 + + lhs_offset_first_element_in_bytes += MMUL_K0 * lhs_stride_y; + rhs_offset_first_element_in_bytes += MMUL_K0 * rhs_stride_y; + } + + // Do not write if the coordinates are out of bound + // But, read has to happen as arm_matrix_multiply() expects certain number of calls + if(dst_x_unclamped >= N || dst_y_unclamped >= M) + { + return; + } + +#if RHS_OFFSET != 0 || LHS_OFFSET != 0 + LOOP_UNROLLING(int, i, 0, 1, M0, + { + const int A = ((int)RHS_OFFSET) * a_sum[0].s[i]; + LOOP_UNROLLING(int, j, 0, 1, N0, + { + c[i].s[j] -= A + ((int)(LHS_OFFSET)) * b_sum[0].s[j]; + }) + }) +#endif // RHS_OFFSET != 0 || LHS_OFFSET != 0 + +#ifdef BIAS + perform_bias_addition(bias_ptr, 
bias_offset_first_element_in_bytes, c, dst_x); +#endif // defined(BIAS) + + // Quantize the tile + TILE(DATA_TYPE, M0, N0, cq); + T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq); + + if(dst_x + N0 <= N || N0_LEFTOVER == 0) + { + LOOP_UNROLLING(int, m0, 0, 1, M0, + { + if(dst_y + m0 < M || M0_LEFTOVER == 0) + { + VSTORE(N0) + (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y)); + } + }) + } + else + { + LOOP_UNROLLING(int, m0, 0, 1, M0, + { + if(dst_y + m0 < M || M0_LEFTOVER == 0) + { + VSTORE_PARTIAL(N0, N0_LEFTOVER) + (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y)); + } + }) + } +} +#endif // defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_T_NT) + +#if defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_T_T) +/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS transposed + * + * Supported block configurations: + * - M0 = 1, 2, 3, 4, 8, 16 + * - N0 = 1, 2, 3, 4, 8, 16 + * - K0 = 4 + * + * Similar to mat_mul_native_quantized_mmul_nt_nt() + */ +__kernel void mat_mul_native_quantized_mmul_t_t( + TENSOR3D_T(lhs, BUFFER), + TENSOR3D_T(rhs, BUFFER), +#ifdef BIAS + TENSOR3D_T(bias, BUFFER), +#endif // defined(BIAS) + TENSOR3D_T(dst, BUFFER)) +{ + const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0) + // The upper limit is a simplified version of (N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE) + const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0) + const uint z = get_global_id(2); // Batch + + // Get section coordinates + const uint section_x = (x0 / MMUL_BLOCK_SIZE); + const uint section_y = y0; + + // Get thread coordinates within an mmul block + const uint thread_id = (x0 % MMUL_BLOCK_SIZE); + const uint thread_x = thread_id % MMUL_N0; + const uint thread_y = (thread_id / MMUL_N0); + + // Calculate dst coordinates + const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0; + const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0; + const uint dst_x = min(dst_x_unclamped, (uint)(N - N0)); + const uint dst_y = min(dst_y_unclamped, (uint)(M - M0)); + + // Starting LHS coordinates + const uint lhs_x = dst_y; + const uint lhs_y = K0 * thread_x; + + // Starting RHS coordinates + const uint rhs_x = K0 * thread_y; + const uint rhs_y = dst_x; + + // Compute LHS/RHS/DST matrix address + lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z; + rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z; + dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z; + + // Initialize the accumulators + TILE(int, M0, N0, c); + LOOP_UNROLLING(int, i, 0, 1, M0, + { + c[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET); + }) + + // Calculate row and column sums + TILE(int, 1, N0, b_sum); + b_sum[0].v = 0; + + TILE(int, 1, M0, a_sum); + a_sum[0].v = 0; + + VEC_DATA_TYPE(DATA_TYPE, K0) + vec_1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(1, 1, 1, 1); + + for(int k = 0; k < lhs_h; k += MMUL_K0) + { + TILE(DATA_TYPE, K0, M0, a); + TILE(DATA_TYPE, N0, K0, b); + + // Load tile from the lhs/rhs tensors + T_LOAD(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a); + T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b); + + LOOP_UNROLLING(int, m0, 0, 1, M0, + { + VEC_DATA_TYPE(DATA_TYPE, K0) + vec_a = (VEC_DATA_TYPE(DATA_TYPE, K0))(a[0].s[m0], a[1].s[m0], a[2].s[m0], 
a[3].s[m0]); + + LOOP_UNROLLING(int, n0, 0, 1, N0, + { + c[m0].s[n0] = arm_matrix_multiply(vec_a, b[n0].v, c[m0].s[n0]); + }) +#if RHS_OFFSET != 0 + // Row Sum of A: Calculate the sum of rows by multiplying A with + // a matrix of 1's from Right + a_sum[0].s[m0] = arm_matrix_multiply(vec_a, vec_1, a_sum[0].s[m0]); +#endif // RHS_OFFSET != 0 + }) + +#if LHS_OFFSET != 0 + // Column Sum of B: Calculate the sum of columns by multiplying B + // with a matrix of 1's from Left + LOOP_UNROLLING(int, n0, 0, 1, N0, + { + b_sum[0].s[n0] = arm_matrix_multiply(vec_1, b[n0].v, b_sum[0].s[n0]); + }) +#endif // LHS_OFFSET != 0 + + lhs_offset_first_element_in_bytes += MMUL_K0 * lhs_stride_y; + rhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE); + } + + // Do not write if the coordinates are out of bound + // But, read has to happen as arm_matrix_multiply() expects certain number of calls + if(dst_x_unclamped >= N || dst_y_unclamped >= M) + { + return; + } + +#if RHS_OFFSET != 0 || LHS_OFFSET != 0 + LOOP_UNROLLING(int, i, 0, 1, M0, + { + const int A = ((int)RHS_OFFSET) * a_sum[0].s[i]; + LOOP_UNROLLING(int, j, 0, 1, N0, + { + c[i].s[j] -= A + ((int)(LHS_OFFSET)) * b_sum[0].s[j]; + }) + }) +#endif // RHS_OFFSET != 0 || LHS_OFFSET != 0 + +#ifdef BIAS + perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x); +#endif // defined(BIAS) + + // Quantize the tile + TILE(DATA_TYPE, M0, N0, cq); + T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq); + + if(dst_x + N0 <= N || N0_LEFTOVER == 0) + { + LOOP_UNROLLING(int, m0, 0, 1, M0, + { + if(dst_y + m0 < M || M0_LEFTOVER == 0) + { + VSTORE(N0) + (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y)); + } + }) + } + else + { + LOOP_UNROLLING(int, m0, 0, 1, M0, + { + if(dst_y + m0 < M || M0_LEFTOVER == 0) + { + VSTORE_PARTIAL(N0, N0_LEFTOVER) + (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y)); + } + }) + } +} +#endif // defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_T_T) diff --git a/src/core/CL/cl_kernels/common/reduction_operation.cl b/src/core/CL/cl_kernels/common/reduction_operation.cl index 9f2c6e23b579a154c3a7d692e7548766b3eff2ba..99369be19a9623a6190dfc8aaa6eeb1cecd52e0a 100644 --- a/src/core/CL/cl_kernels/common/reduction_operation.cl +++ b/src/core/CL/cl_kernels/common/reduction_operation.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2021 Arm Limited. + * Copyright (c) 2016-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -47,6 +47,8 @@ #define sum(in0, in1, size) (in0 + SUM_REDUCE(in1, size)) #define square_sum(in0, in1, size) (in0 + SUM_REDUCE((in1 * in1), size)) #define product(in0, in1, size) (in0 * PROD_REDUCE(in1, size)) +#define min_(in0, in1, size) (min(in0, MIN_REDUCE(in1, size))) +#define max_(in0, in1, size) (max(in0, MAX_REDUCE(in1, size))) /** This kernel performs parallel reduction given an operation on x-axis. 
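The new min_ and max_ reducers added above also change how the x-axis reduction seeds its accumulator (see the hunk below): sum and square-sum start from 0, product from 1, while MIN and MAX now start from the first input element rather than from a type-dependent sentinel, so the result is always a value present in the input. A scalar C reference of that rule for the float case; the enum and function names are illustrative only, not library API:

#include <stddef.h>

typedef enum { RED_SUM, RED_PROD, RED_MIN, RED_MAX } red_op;

// Assumes width >= 1.
static float reduce_x_ref(const float *in, size_t width, red_op op)
{
    // Seed: neutral element for sum/product, first element for min/max.
    float res = (op == RED_PROD) ? 1.0f : (op == RED_SUM) ? 0.0f : in[0];

    for (size_t x = 0; x < width; ++x)
    {
        switch (op)
        {
            case RED_SUM:  res += in[x]; break;
            case RED_PROD: res *= in[x]; break;
            case RED_MIN:  res = (in[x] < res) ? in[x] : res; break;
            case RED_MAX:  res = (in[x] > res) ? in[x] : res; break;
        }
    }
    return res;
}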
* @@ -79,12 +81,15 @@ __kernel void reduction_operation_x( __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + y * input_stride_y + z * input_stride_z; __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + y * output_stride_y + z * output_stride_z; +#if !defined(MIN) && !defined(MAX) #if defined(PROD) DATA_TYPE res = (DATA_TYPE)1; #else // defined(PROD) DATA_TYPE res = (DATA_TYPE)0; #endif // defined(PROD) - +#else // #if !defined(MIN) && !defined(MAX) + DATA_TYPE res = *((__global DATA_TYPE *)input_addr); +#endif // #if defined(MIN) || defined(MAX) int x = 0; for(; x <= (WIDTH - VEC_SIZE); x += VEC_SIZE) @@ -181,27 +186,28 @@ __kernel void reduction_operation_non_parallel_x( * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128 * * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor * @param[in] output_ptr The local buffer to hold sumed values. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes) * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor */ __kernel void reduction_operation_y( - IMAGE_DECLARATION(input), - IMAGE_DECLARATION(output)) + __global uchar *input_ptr, + uint input_stride_y, + uint input_stride_z, + uint input_offset_first_element_in_bytes, + + __global uchar *output_ptr, + uint output_stride_z, + uint output_offset_first_element_in_bytes) { int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); - int y = get_global_id(1); + int z = get_global_id(1); - __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y; - __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y; + __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + z * input_stride_z; + __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + z * output_stride_z; VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE) res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)); @@ -270,32 +276,33 @@ __kernel void reduction_operation_y( * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128 * * @param[in] input_ptr Pointer to the source tensor. 
Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor * @param[in] output_ptr The local buffer to hold sumed values. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the output tensor in W dimension (in bytes) * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor */ __kernel void reduction_operation_z( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) + __global uchar *input_ptr, + uint input_stride_y, + uint input_stride_z, + uint input_stride_w, + uint input_offset_first_element_in_bytes, + + __global uchar *output_ptr, + uint output_stride_y, + uint output_stride_w, + uint output_offset_first_element_in_bytes) { int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); int y = get_global_id(1); - int z = get_global_id(2); + int w = get_global_id(2); - __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y + z * input_stride_z; - __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y + z * output_stride_z; + __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y + w * input_stride_w; + __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y + w * output_stride_w; VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE) res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)); @@ -364,39 +371,43 @@ __kernel void reduction_operation_z( * * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128 - * @note The depth size must be passed at compile time using -DBATCH e.g. -DDEPTH=128 + * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128 * * @param[in] input_ptr Pointer to the source tensor. 
Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] input_stride_v Stride of the source tensor in V dimension (in bytes) * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor * @param[in] output_ptr The local buffer to hold sumed values. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the output tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_stride_v Stride of the output tensor in V dimension (in bytes) * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor */ __kernel void reduction_operation_w( - TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output)) + __global uchar *input_ptr, + uint input_stride_y, + uint input_stride_z, + uint input_stride_w, + uint input_stride_v, + uint input_offset_first_element_in_bytes, + + __global uchar *output_ptr, + uint output_stride_y, + uint output_stride_z, + uint output_stride_v, + uint output_offset_first_element_in_bytes) { int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); int y = get_global_id(1); - int z = get_global_id(2); - __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y + (z % DEPTH) * input_stride_z + (z / DEPTH) * input_stride_w; - __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y + (z % DEPTH) * output_stride_z + (z / DEPTH) * output_stride_z; + int gid_2 = get_global_id(2); + int z = get_global_id(2) % DEPTH; + int v = get_global_id(2) / DEPTH; + + __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y + z * input_stride_z + v * input_stride_v; + __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y + z * output_stride_z + v * output_stride_v; VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE) res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE 
*)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)); diff --git a/src/core/CL/cl_kernels/common/reverse.cl b/src/core/CL/cl_kernels/common/reverse.cl index 6b0afb9c2c95af46fd9b40a6ac740a069cf35407..f94bfb66406c1220e89dd880b59a7c5b990bdd0c 100644 --- a/src/core/CL/cl_kernels/common/reverse.cl +++ b/src/core/CL/cl_kernels/common/reverse.cl @@ -1,5 +1,5 @@ /* -* Copyright (c) 2018-2021 Arm Limited. +* Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -33,6 +33,8 @@ * * @note The data type must be given as a preprocessor argument using -DDATA_TYPE=num. e.g. -DDATA_TYPE=uint * @note The number of dimensions to reverse must be given as a preprocessor argument using -DNUM_REVERSE_DIMS=num, e.g. -DNUM_REVERSE_DIMS=3 + * @note The number of dimensions of the source tensor must be given as a preprocessor argument using -DRANK=num, e.g. -DRANK=3 + * @note The values in axis_tensor must be within [-rank, rank-1]. * * @param[in] src_ptr Pointer to the source tensor. Supported data types: All * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes) @@ -78,20 +80,24 @@ __kernel void reverse(TENSOR4D_DECLARATION(src), const uint4 dims = (uint4)(0, 1, 2, 3); int4 to_reverse = (int4)(0, 0, 0, 0); + + VEC_DATA_TYPE(int, NUM_REVERSE_DIMS) indices = VLOAD(NUM_REVERSE_DIMS)(0,(__global int *)axis.ptr); +#if defined(USE_INVERTED_AXIS) + indices = select((VEC_DATA_TYPE(int, NUM_REVERSE_DIMS)) RANK - 1, -1, indices < 0) - indices; +#else /* defined(USE_INVERTED_AXIS) */ + indices = select(indices, indices + RANK, indices < 0); +#endif /* defined(USE_INVERTED_AXIS) */ + #if NUM_REVERSE_DIMS == 1 - const uint index = *((__global uint *)axis.ptr); - to_reverse = (uint4)index == dims; + to_reverse = ((uint4)indices == dims); #elif NUM_REVERSE_DIMS == 2 - const uint2 indices = vload2(0, (__global uint *)axis.ptr); - to_reverse = ((uint4)indices.s0 == dims) || ((uint4)indices.s1 == dims); + to_reverse = ((uint4)indices.s0 == dims) || ((uint4)indices.s1 == dims); #elif NUM_REVERSE_DIMS == 3 - const uint2 indices01 = vload2(0, (__global uint *)axis.ptr); - const uint index2 = *((__global uint *)axis.ptr + 2); - to_reverse = ((uint4)indices01.s0 == dims) || ((uint4)indices01.s1 == dims) || ((uint4)index2 == dims); -#else /* NUM_REVERSE_DIMS == 3 */ - const uint4 indices = vload4(0, (__global uint *)axis.ptr); - to_reverse = ((uint4)indices.s0 == dims) || ((uint4)indices.s1 == dims) || ((uint4)indices.s2 == dims) || ((uint4)indices.s3 == dims); + to_reverse = ((uint4)indices.s0 == dims) || ((uint4)indices.s1 == dims) || ((uint4)indices.s2 == dims); +#else /* NUM_REVERSE_DIMS == 1 */ + to_reverse = ((uint4)indices.s0 == dims) || ((uint4)indices.s1 == dims) || ((uint4)indices.s2 == dims) || ((uint4)indices.s3 == dims); #endif /* NUM_REVERSE_DIMS == 1 */ + const uint x_out = to_reverse.s0 ? width - x_in - 1 : x_in; const uint y_out = to_reverse.s1 ? height - y_in - 1 : y_in; const uint z_out = to_reverse.s2 ? depth - z_in - 1 : z_in; diff --git a/src/core/CL/cl_kernels/common/softmax_layer.cl b/src/core/CL/cl_kernels/common/softmax_layer.cl index 4d2d89dd734e335f903faf21049a5c1fbbedbbe5..bfc0995bb8997de15ae6d6442b7acb828cffc754 100644 --- a/src/core/CL/cl_kernels/common/softmax_layer.cl +++ b/src/core/CL/cl_kernels/common/softmax_layer.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -21,511 +21,351 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ + #include "helpers.h" -#if defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) +#define MIN_VALUE_float -FLT_MAX +#define MIN_VALUE_half -HALF_MAX +#define MIN_VALUE_char CHAR_MIN +#define MIN_VALUE_uchar 0 + +#define MIN_VALUE_TYPE_STR(data_type) MIN_VALUE_##data_type +#define MIN_VALUE_TYPE(data_type) MIN_VALUE_TYPE_STR(data_type) +#define MIN_VALUE MIN_VALUE_TYPE(DATA_TYPE) + +#ifdef SOFTMAX_X -/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel. +/** 3-pass softmax in the x dimension. * - * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float - * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0 - * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16 - * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE - * @note In case of log softmax, -DLOG_SOFTMAX must be passed. + * List of preprocessors: + * - DATA_TYPE: the input/output data type. + * - TMP_DATA_TYPE: the data type used for computing and temporary tensor storage. + * If DATA_TYPE is quantized, TMP_DATA_TYPE is floating-point, otherwise TMP_DATA_TYPE is the same as DATA_TYPE. + * - IS_LOG (optional): indicating whether this is log softmax. + * - LENGTH: the number of elements in softmax axis in the input/output tensors. + * - BETA: the beta coefficient. + * - IS_QUANTIZED (optional): indicating whether the input/output data type is quantized data. + * - VEC_SIZE: the size of the vector. * - * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr - * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes) - * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes) - * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes) - * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor - * @param[out] dst_ptr Pointer to the destination tensor slice. 
Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * Additional preprocessors in case IS_QUANTIZED is present: + * - SRC_SCALE and SRC_OFFSET: the quantization information of the source tensor. + * - DST_SCALE and DST_OFFSET: the quantization information of the destination tensor. + * + * @param[in] src_ptr Pointer to the source tensor. + * @param[in] src_stride_0 Stride in bytes of the source tensor in the dimension corresponding to global ID 0. + * @param[in] src_stride_1 Stride in bytes of the source tensor in the dimension corresponding to global ID 1. + * @param[in] src_stride_2 Stride in bytes of the source tensor in the dimension corresponding to global ID 2. + * @param[in] src_offset_first_element Offset of the first element in the source tensor. + * @param[in] dst_ptr Pointer to the destination tensor. + * @param[in] dst_stride_0 Stride in bytes of the destination tensor in the dimension corresponding to global ID 0. + * @param[in] dst_stride_1 Stride in bytes of the destination tensor in the dimension corresponding to global ID 1. + * @param[in] dst_stride_2 Stride in bytes of the destination tensor in the dimension corresponding to global ID 2. + * @param[in] dst_offset_first_element Offset of the first element in the destination tensor. + * @param[in] tmp_ptr Pointer to the temporary tensor. + * @param[in] tmp_stride_0 Stride in bytes of the temporary tensor in the dimension corresponding to global ID 0. + * @param[in] tmp_stride_1 Stride in bytes of the temporary tensor in the dimension corresponding to global ID 1. + * @param[in] tmp_stride_2 Stride in bytes of the temporary tensor in the dimension corresponding to global ID 2. + * @param[in] tmp_offset_first_element Offset of the first element in the temporary tensor. 
*/ -__kernel void softmax_layer_norm( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(sum), - TENSOR3D_DECLARATION(dst)) +__kernel void softmax_x( + __global uchar *src_ptr, + uint src_stride_0, + uint src_stride_1, + uint src_stride_2, + uint src_offset_first_element, + + __global uchar *dst_ptr, + uint dst_stride_0, + uint dst_stride_1, + uint dst_stride_2, + uint dst_offset_first_element + +#ifdef IS_QUANTIZED + , + __global uchar *tmp_ptr, + uint tmp_stride_0, + uint tmp_stride_1, + uint tmp_stride_2, + uint tmp_offset_first_element +#endif // IS_QUANTIZED +) { - const int x_offs = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VECTOR_SIZE_LEFTOVER) % VECTOR_SIZE), 0) * sizeof(DATA_TYPE); - - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z; - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z; + const int dim_0 = get_global_id(0); + const int dim_1 = get_global_id(1); + const int dim_2 = get_global_id(2); - Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum); + src_ptr += src_offset_first_element + dim_2 * src_stride_2 + dim_1 * src_stride_1 + dim_0 * src_stride_0; + dst_ptr += dst_offset_first_element + dim_2 * dst_stride_2 + dim_1 * dst_stride_1 + dim_0 * dst_stride_0; - // Load max value of 1D logits vector (row) - DATA_TYPE sum_val = *((__global DATA_TYPE *)offset(&sum, 0, get_global_id(1))); - VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) - data0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr); +#ifdef IS_QUANTIZED + tmp_ptr += tmp_offset_first_element + dim_2 * tmp_stride_2 + dim_1 * tmp_stride_1 + dim_0 * tmp_stride_0; +#else // IS_QUANTIZED + __global uchar *tmp_ptr = dst_ptr; +#endif // IS_QUANTIZED -#if defined(LOG_SOFTMAX) - sum_val = log(sum_val); - data0 -= sum_val; -#else // defined(LOG_SOFTMAX) - data0 /= sum_val; -#endif // defined(LOG_SOFTMAX) + // Calculate max value. + DATA_TYPE max_value = MIN_VALUE; + int i = 0; - STORE_VECTOR_SELECT(data, DATA_TYPE, dst_addr, VECTOR_SIZE, VECTOR_SIZE_LEFTOVER, VECTOR_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) -} + for (i = 0; i < LENGTH - VEC_SIZE; i += VEC_SIZE) + { + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_ptr + i * sizeof(DATA_TYPE))); -#if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) && defined(MINVAL) + max_value = max(max_value, MAX_REDUCE(data, VEC_SIZE)); + } -/* Number of workitems in dimension 0. */ -#if !defined(GRID_SIZE) -#define GRID_SIZE 1 -#endif /* !defined(GRID_SIZE) */ + for (; i < LENGTH; ++i) + { + DATA_TYPE data = *(__global DATA_TYPE *)(src_ptr + i * sizeof(DATA_TYPE)); -#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) -#define SELECT_TYPE SELECT_VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) + max_value = max(max_value, data); + } -/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value, - * then gets the exponent of each element as sums all elements across each row. - * - * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float - * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0 - * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. 
-DVECTOR_SIZE=16 - * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE - * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed. - * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0). - * @note In case of log softmax, -DLOG_SOFTMAX must be passed. - * @note Based on the data type, the minimum possible value must be passed using -DMINVAL. For float it should be defined as -FLT_MAX, while for half it should be -HALF_MAX - * - * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr - * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes) - * @param[in] maxo_step_x max_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes) - * @param[in] maxo_step_y max_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes) - * @param[in] maxo_step_z max_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor - * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[out] sum_ptr Pointer to the sum values tensor slice. 
Supported data types: same as @p src_ptr - * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes) - * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes) - * @param[in] sum_step_y sum_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes) - * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor - */ -__kernel void softmax_layer_max_shift_exp_sum_serial( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(maxo), - TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(sum)) -{ - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z; - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z; + // Regularize the data. + TMP_DATA_TYPE sum_value = 0; - Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo); - Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum); +#ifdef IS_QUANTIZED + TMP_DATA_TYPE max_value_f = (CONVERT(max_value, TMP_DATA_TYPE) - SRC_OFFSET) * SRC_SCALE; + TMP_DATA_TYPE regularize_offset = -SRC_OFFSET * SRC_SCALE * (TMP_DATA_TYPE)BETA - max_value_f * (TMP_DATA_TYPE)BETA; +# define REGULARIZE(x) ((x) * SRC_SCALE * (TMP_DATA_TYPE)BETA + regularize_offset) +#else // IS_QUANTIZED +# define REGULARIZE(x) (((x) - max_value) * (TMP_DATA_TYPE)BETA) +#endif // IS_QUANTIZED -#ifdef BETA - // Initialize beta - VEC_TYPE beta = (VEC_TYPE)BETA; -#endif /* BETA */ + for (i = 0; i < LENGTH - VEC_SIZE; i += VEC_SIZE) + { + VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) data = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_ptr + i * sizeof(DATA_TYPE))), VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE)); - // Initialize local maximum - VEC_TYPE max_val_vec = (VEC_TYPE)(MINVAL); + data = REGULARIZE(data); -#ifdef NON_MULTIPLE_OF_VECTOR_SIZE - VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr); - SELECT_TYPE widx = (SELECT_TYPE)VECTOR_SIZE_LEFTOVER > VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VECTOR_SIZE); - max_val_vec = max(max_val_vec, select((VEC_TYPE)(MINVAL), data, widx)); -#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */ +#ifdef IS_LOG + sum_value += SUM_REDUCE(exp(data), VEC_SIZE); +#else // IS_LOG + data = exp(data); + sum_value += SUM_REDUCE(data, VEC_SIZE); +#endif // IS_LOG - for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE) - { - VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE))); - max_val_vec = max(data, max_val_vec); + VSTORE(VEC_SIZE)(data, 0, (__global TMP_DATA_TYPE *)(tmp_ptr + i * sizeof(TMP_DATA_TYPE))); } - // Perform max reduction - DATA_TYPE max_val = MAX_REDUCE(max_val_vec, VECTOR_SIZE); - *((__global DATA_TYPE *)maxo.ptr) = max_val; - - /* Second section */ - - // Set sum vector - VEC_TYPE sum1D = 0; - -#ifdef NON_MULTIPLE_OF_VECTOR_SIZE - data -= max_val; -#ifdef BETA - data *= beta; -#endif /* BETA */ -#ifdef LOG_SOFTMAX - VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER) - (data, 0, (__global DATA_TYPE *)dst_addr); - data = exp(data); - data = select(0, data, widx); -#else /* LOG_SOFTMAX */ - data = exp(data); - data = select(0, data, widx); - 
VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER) - (data, 0, (__global DATA_TYPE *)dst_addr); -#endif /* LOG_SOFTMAX */ - sum1D += data; -#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */ - - // Shift values, exp and sum - for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE) + for (; i < LENGTH; ++i) { - VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE))); - data -= max_val; -#ifdef BETA - data *= beta; -#endif /* BETA */ -#ifdef LOG_SOFTMAX - VSTORE(VECTOR_SIZE) - (data, 0, (__global DATA_TYPE *)(dst_addr + i * sizeof(DATA_TYPE))); - data = exp(data); -#else /* LOG_SOFTMAX */ + TMP_DATA_TYPE data = CONVERT(*(__global DATA_TYPE *)(src_ptr + i * sizeof(DATA_TYPE)), TMP_DATA_TYPE); + + data = REGULARIZE(data); + +#ifdef IS_LOG + sum_value += exp(data); +#else // IS_LOG data = exp(data); - VSTORE(VECTOR_SIZE) - (data, 0, (__global DATA_TYPE *)(dst_addr + i * sizeof(DATA_TYPE))); -#endif /* LOG_SOFTMAX */ - sum1D += data; + sum_value += data; +#endif // IS_LOG + + *(__global TMP_DATA_TYPE *)(tmp_ptr + i * sizeof(TMP_DATA_TYPE)) = data; } - // Perform sum reduction - *((__global DATA_TYPE *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE); -} +#undef REGULARIZE + + // Normalize the data. +#ifdef IS_QUANTIZED +# if IS_LOG + TMP_DATA_TYPE norm_offset = -log(sum_value) + DST_OFFSET; +# define NORMALIZE(SIZE, x) CONVERT_SAT_ROUND((x) / DST_SCALE + norm_offset, VEC_DATA_TYPE(DATA_TYPE, SIZE), rte) +# else // IS_LOG + TMP_DATA_TYPE norm_div = sum_value * DST_SCALE; +# define NORMALIZE(SIZE, x) CONVERT_SAT(add_sat(CONVERT_SAT_ROUND((x) / norm_div, VEC_DATA_TYPE(int, SIZE), rte), DST_OFFSET), VEC_DATA_TYPE(DATA_TYPE, SIZE)) +# endif // IS_LOG +#else // IS_QUANTIZED +# if IS_LOG +# define NORMALIZE(SIZE, x) ((x) - log(sum_value)) +# else // IS_LOG +# define NORMALIZE(SIZE, x) ((x) / sum_value) +# endif // IS_LOG +#endif // IS_QUANTIZED + + for (i = 0; i < LENGTH - VEC_SIZE; i += VEC_SIZE) + { + VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) data = VLOAD(VEC_SIZE)(0, (__global TMP_DATA_TYPE *)(tmp_ptr + i * sizeof(TMP_DATA_TYPE))); -/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value, - * then gets the exponent of each element as sums all elements across each row. - * - * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float - * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0 - * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16 - * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE - * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed. - * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0). - * @note In case of log softmax, -DLOG_SOFTMAX must be passed. - * @note Based on the data type, the minimum possible value must be passed using -DMINVAL. For float it should be defined as -FLT_MAX, while for half it should be -HALF_MAX - * - * @param[in] src_ptr Pointer to the source tensor slice. 
Supported data types: F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr - * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes) - * @param[in] maxo_step_x max_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes) - * @param[in] maxo_step_y max_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes) - * @param[in] maxo_step_z max_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor - * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[out] sum_ptr Pointer to the sum values tensor slice. 
Supported data types: same as @p src_ptr - * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes) - * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes) - * @param[in] sum_step_y sum_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes) - * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor - */ -__kernel void softmax_layer_max_shift_exp_sum_parallel( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(maxo), - TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(sum)) -{ - const uint lid = get_local_id(0); - const uint x_offs = (VECTOR_SIZE_LEFTOVER + lid * VECTOR_SIZE) * sizeof(DATA_TYPE); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) result = NORMALIZE(VEC_SIZE, data); - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z; - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z; + VSTORE(VEC_SIZE)(result, 0, (__global DATA_TYPE *)(dst_ptr + i * sizeof(DATA_TYPE))); + } - Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo); - Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum); + for (; i < LENGTH; ++i) + { + TMP_DATA_TYPE data = *(__global TMP_DATA_TYPE *)(tmp_ptr + i * sizeof(TMP_DATA_TYPE)); -#ifdef BETA - // Initialize beta - VEC_TYPE beta = (VEC_TYPE)BETA; -#endif /* BETA */ + DATA_TYPE result = NORMALIZE(1, data); - // Define one temporary vector per work-item. - __local VEC_TYPE tmp_local[GRID_SIZE]; - __local DATA_TYPE max_local; + *(__global DATA_TYPE *)(dst_ptr + i * sizeof(DATA_TYPE)) = result; + } - VEC_TYPE max_val_vec = (VEC_TYPE)(MINVAL); +#undef NORMALIZE +} - // Number of iterations per work-item. - const uint width = (SRC_WIDTH / GRID_SIZE) >> LOG_VECTOR_SIZE; - // Calculate max of row - uint i = 0; - for(; i < width; ++i) - { - VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); - max_val_vec = max(data_max, max_val_vec); - } -#ifdef NON_MULTIPLE_OF_GRID_SIZE - // How many work-items needed to complete the computation. 
- int boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE; - if(lid < boundary_workitems) - { - VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); - max_val_vec = max(data_max, max_val_vec); - } -#ifdef NON_MULTIPLE_OF_VECTOR_SIZE - SELECT_TYPE widx; - if(lid == 0) - { - // Handle non multiple of 4 - VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE))); - widx = (SELECT_TYPE)VECTOR_SIZE_LEFTOVER > VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VECTOR_SIZE); - max_val_vec = max(max_val_vec, select((VEC_TYPE)(MINVAL), data_max, widx)); - } -#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */ -#endif /* NON_MULTIPLE_OF_GRID_SIZE */ - tmp_local[lid] = max_val_vec; +#endif // SOFTMAX_X - barrier(CLK_LOCAL_MEM_FENCE); +#ifdef SOFTMAX_NON_X - if(GRID_SIZE >= 256) - { - if(lid < 128) - { - tmp_local[lid] = max(tmp_local[lid + 128], tmp_local[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 128) - { - if(lid < 64) - { - tmp_local[lid] = max(tmp_local[lid + 64], tmp_local[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 64) - { - if(lid < 32) - { - tmp_local[lid] = max(tmp_local[lid + 32], tmp_local[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 32) - { - if(lid < 16) - { - tmp_local[lid] = max(tmp_local[lid + 16], tmp_local[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 16) - { - if(lid < 8) - { - tmp_local[lid] = max(tmp_local[lid + 8], tmp_local[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 8) - { - if(lid < 4) - { - tmp_local[lid] = max(tmp_local[lid + 4], tmp_local[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 4) - { - if(lid < 2) - { - tmp_local[lid] = max(tmp_local[lid + 2], tmp_local[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(lid == 0) +/** 3-pass softmax in any dimension higher than the x dimension. + * + * List of preprocessors: + * - DATA_TYPE: the input/output data type. + * - TMP_DATA_TYPE: the data type used for computing and temporary tensor storage. + * If DATA_TYPE is quantized, TMP_DATA_TYPE is floating-point, otherwise TMP_DATA_TYPE is the same as DATA_TYPE. + * - IS_LOG (optional): indicating whether this is log softmax. + * - LENGTH: the number of elements in softmax axis in the input/output tensors. + * - BETA: the beta coefficient. + * - IS_QUANTIZED (optional): indicating whether the input/output data type is quantized data. + * - VEC_SIZE: the size of the vector. + * - VEC_SIZE_LEFTOVER: the size of the leftover part. + * + * Additional preprocessors in case IS_QUANTIZED is present: + * - SRC_SCALE and SRC_OFFSET: the quantization information of the source tensor. + * - DST_SCALE and DST_OFFSET: the quantization information of the destination tensor. + * + * @param[in] src_ptr Pointer to the source tensor. + * @param[in] src_stride_0 Stride in bytes of the source tensor in the dimension corresponding to global ID 0. + * @param[in] src_stride_1 Stride in bytes of the source tensor in the dimension corresponding to global ID 1. + * @param[in] src_stride_2 Stride in bytes of the source tensor in the dimension corresponding to global ID 2. + * @param[in] src_offset_first_element Offset of the first element in the source tensor. + * @param[in] dst_ptr Pointer to the destination tensor. + * @param[in] dst_stride_0 Stride in bytes of the destination tensor in the dimension corresponding to global ID 0. 
+ * @param[in] dst_stride_1 Stride in bytes of the destination tensor in the dimension corresponding to global ID 1. + * @param[in] dst_stride_2 Stride in bytes of the destination tensor in the dimension corresponding to global ID 2. + * @param[in] dst_offset_first_element Offset of the first element in the destination tensor. + * @param[in] tmp_ptr Pointer to the temporary tensor. + * @param[in] tmp_stride_0 Stride in bytes of the temporary tensor in the dimension corresponding to global ID 0. + * @param[in] tmp_stride_1 Stride in bytes of the temporary tensor in the dimension corresponding to global ID 1. + * @param[in] tmp_stride_2 Stride in bytes of the temporary tensor in the dimension corresponding to global ID 2. + * @param[in] tmp_offset_first_element Offset of the first element in the temporary tensor. + */ +__kernel void softmax_non_x( + __global uchar *src_ptr, + uint src_stride_0, + uint src_stride_1, + uint src_stride_2, + uint src_offset_first_element, + + __global uchar *dst_ptr, + uint dst_stride_0, + uint dst_stride_1, + uint dst_stride_2, + uint dst_offset_first_element, + + __global uchar *tmp_ptr, + uint tmp_stride_0, + uint tmp_stride_1, + uint tmp_stride_2, + uint tmp_offset_first_element, + + uint src_stride_axis, + uint dst_stride_axis +) +{ + const int dim_0 = max((int)get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE, 0); + const int dim_1 = get_global_id(1); + const int dim_2 = get_global_id(2); + + src_ptr += src_offset_first_element + dim_2 * src_stride_2 + dim_1 * src_stride_1 + dim_0 * src_stride_0; + dst_ptr += dst_offset_first_element + dim_2 * dst_stride_2 + dim_1 * dst_stride_1 + dim_0 * dst_stride_0; + tmp_ptr += tmp_offset_first_element + dim_2 * tmp_stride_2 + dim_1 * tmp_stride_1 + dim_0 * tmp_stride_0; + + // In case of processing quantized data, i.e. when DATA_TYPE is smaller than TMP_DATA_TYPE: + // + // In the first pass (finding max), the quantized data is copied from the input tensor to the temporary tensor. + // Dequantization is not needed to find the max value and since dequantization widens the data, we defer it + // to the second pass to reduce the memory bandwidth of the first pass. + // + // In the second pass, the kernel reads the quantized data from the temporary tensor and writes the dequantized data + // back to the temporary tensor. + // + // To avoid the dequantized data overwriting the unprocessed quantized data in the temporary tensor, + // this extra offset is introduced to store the quantized data at the end of the temporary tensor. + // + // Note: Another approach is to perform the second pass in reverse order, but for an unexplained reason + // it doesn't work on some devices. + uint tmp_extra_offset = LENGTH * VEC_SIZE * (sizeof(TMP_DATA_TYPE) - sizeof(DATA_TYPE)); + + // Calculate the max value and store the input data to the temporary tensor in a suitable format. + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) max_value = MIN_VALUE; + int i = 0; + + for (i = 0; i < LENGTH; ++i) { - max_val_vec = max(tmp_local[lid + 1], tmp_local[lid]); - max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_ptr + i * src_stride_axis)); + + max_value = max(max_value, data); + + VSTORE(VEC_SIZE)(data, 0, (__global DATA_TYPE *)(tmp_ptr + tmp_extra_offset + i * VEC_SIZE * sizeof(DATA_TYPE))); } - barrier(CLK_LOCAL_MEM_FENCE); - /* Second section */ + // Regularize the data.
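For reference, the algebra behind the quantized REGULARIZE macro defined just below (a restatement of the code, not new behaviour; here $s_{src}$ is SRC_SCALE, $z_{src}$ is SRC_OFFSET, $\beta$ is BETA and $m$ is the dequantized per-lane maximum max_value_f): dequantization, max subtraction and the beta scaling collapse into a single multiply-add,
$$\text{REGULARIZE}(q) \;=\; \beta\big((q - z_{src})\,s_{src} - m\big) \;=\; \beta\,s_{src}\,q \;+\; \underbrace{\big(-\beta\,s_{src}\,z_{src} - \beta\,m\big)}_{\texttt{regularize\_offset}},$$
while in the non-quantized branch it reduces to $\beta\,(x - \text{max\_value})$.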
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) sum_value = 0; - // Set sum vector - VEC_TYPE sum1D = 0; - DATA_TYPE max_val = max_local; +#ifdef IS_QUANTIZED + VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) max_value_f = (CONVERT(max_value, VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE)) - SRC_OFFSET) * SRC_SCALE; + VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) regularize_offset = -SRC_OFFSET * SRC_SCALE * (TMP_DATA_TYPE)BETA - max_value_f * (TMP_DATA_TYPE)BETA; +# define REGULARIZE(x) ((x) * SRC_SCALE * (TMP_DATA_TYPE)BETA + regularize_offset) +#else // IS_QUANTIZED +# define REGULARIZE(x) (((x) - max_value) * (TMP_DATA_TYPE)BETA) +#endif // IS_QUANTIZED - // Shift values, exp and sum - for(i = 0; i < width; ++i) + for (i = 0; i < LENGTH; ++i) { - VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); - data -= max_val; -#ifdef BETA - data *= beta; -#endif /* BETA */ -#ifdef LOG_SOFTMAX - VSTORE(VECTOR_SIZE) - (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); - data = exp(data); -#else /* LOG_SOFTMAX */ - data = exp(data); - VSTORE(VECTOR_SIZE) - (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); -#endif /* LOG_SOFTMAX */ - sum1D += data; - } -#ifdef NON_MULTIPLE_OF_GRID_SIZE - boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE; - if(lid < boundary_workitems) - { - VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); - data -= max_val; -#ifdef BETA - data *= beta; -#endif /* BETA */ -#ifdef LOG_SOFTMAX - VSTORE(VECTOR_SIZE) - (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); - data = exp(data); -#else /* LOG_SOFTMAX */ + VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) data = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(tmp_ptr + tmp_extra_offset + i * VEC_SIZE * sizeof(DATA_TYPE))), VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE)); + + data = REGULARIZE(data); + +#ifdef IS_LOG + sum_value += exp(data); +#else // IS_LOG data = exp(data); - VSTORE(VECTOR_SIZE) - (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); -#endif /* LOG_SOFTMAX */ - sum1D += data; + sum_value += data; +#endif // IS_LOG + + VSTORE(VEC_SIZE)(data, 0, (__global TMP_DATA_TYPE *)(tmp_ptr + i * VEC_SIZE * sizeof(TMP_DATA_TYPE))); } -#ifdef NON_MULTIPLE_OF_VECTOR_SIZE - if(lid == 0) + +#undef REGULARIZE + + // Normalize the data. 
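The fused requantization in the NORMALIZE variants defined just below can be read the same way (again only restating the code; $S$ is sum_value, $s_{dst}$ is DST_SCALE, $z_{dst}$ is DST_OFFSET). In the non-quantized branch, $\text{NORMALIZE}(x) = x / S$ for softmax and $x - \log S$ for log softmax. In the quantized non-log branch the division by $S$ and the requantization are fused into one division plus a saturating add,
$$\text{NORMALIZE}(x) \;=\; \mathrm{sat}\!\Big(\mathrm{round}\big(x / (S\,s_{dst})\big) + z_{dst}\Big),$$
which is exactly the quantized form of $x/S$; the quantized log branch instead folds the constant $-\log S + z_{dst}$ into a single offset added to $x / s_{dst}$ before the saturating round.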
+#ifdef IS_QUANTIZED +# if IS_LOG + VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) norm_offset = -log(sum_value) + DST_OFFSET; +# define NORMALIZE(x) CONVERT_SAT_ROUND((x) / DST_SCALE + norm_offset, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE), rte) +# else // IS_LOG + VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) norm_div = sum_value * DST_SCALE; +# define NORMALIZE(x) CONVERT_SAT(add_sat(CONVERT_SAT_ROUND((x) / norm_div, VEC_DATA_TYPE(int, VEC_SIZE), rte), DST_OFFSET), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)) +# endif // IS_LOG +#else // IS_QUANTIZED +# if IS_LOG +# define NORMALIZE(x) ((x) - log(sum_value)) +# else // IS_LOG +# define NORMALIZE(x) ((x) / sum_value) +# endif // IS_LOG +#endif // IS_QUANTIZED + + for (i = 0; i < LENGTH; ++i) { - // Handle non multiple of vector size ((GRID_SIZE * i * 4) + 4, 0); move 4 float positions ahead, *4 is due to the stride - VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE))); - data -= max_val; -#ifdef BETA - data *= beta; -#endif /* BETA */ -#ifdef LOG_SOFTMAX - VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER) - (data, 0, (__global DATA_TYPE *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE))); - data = exp(data); - data = select(0, data, widx); -#else /* LOG_SOFTMAX */ - data = exp(data); - data = select(0, data, widx); - VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER) - (data, 0, (__global DATA_TYPE *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE))); -#endif /* LOG_SOFTMAX */ - sum1D += data; - } -#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */ -#endif /* NON_MULTIPLE_OF_GRID_SIZE */ - tmp_local[lid] = sum1D; + VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) data = VLOAD(VEC_SIZE)(0, (__global TMP_DATA_TYPE *)(tmp_ptr + i * VEC_SIZE * sizeof(TMP_DATA_TYPE))); - barrier(CLK_LOCAL_MEM_FENCE); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) result0 = NORMALIZE(data); - if(GRID_SIZE >= 256) - { - if(lid < 128) - { - tmp_local[lid] += tmp_local[lid + 128]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 128) - { - if(lid < 64) - { - tmp_local[lid] += tmp_local[lid + 64]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 64) - { - if(lid < 32) - { - tmp_local[lid] += tmp_local[lid + 32]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 32) - { - if(lid < 16) - { - tmp_local[lid] += tmp_local[lid + 16]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 16) - { - if(lid < 8) - { - tmp_local[lid] += tmp_local[lid + 8]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 8) - { - if(lid < 4) - { - tmp_local[lid] += tmp_local[lid + 4]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 4) - { - if(lid < 2) - { - tmp_local[lid] += tmp_local[lid + 2]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(lid == 0) - { - sum1D = (tmp_local[lid + 1] + tmp_local[lid]); - // Perform sum reduction - *((__global DATA_TYPE *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE); + STORE_VECTOR_SELECT(result, DATA_TYPE, dst_ptr + i * dst_stride_axis, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) } + +#undef NORMALIZE } -#endif // defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) && defined(MINVAL) -#endif // defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) \ No newline at end of file +#endif // SOFTMAX_NON_X + +#undef MIN_VALUE +#undef MIN_VALUE_TYPE +#undef MIN_VALUE_TYPE_STR + +#undef MIN_VALUE_float +#undef MIN_VALUE_half +#undef MIN_VALUE_char +#undef MIN_VALUE_uchar diff --git a/src/core/CL/cl_kernels/common/softmax_layer_quantized.cl 
b/src/core/CL/cl_kernels/common/softmax_layer_quantized.cl deleted file mode 100644 index 192c5f97a18a91e576fdf6fcf2ba52fa59f65a06..0000000000000000000000000000000000000000 --- a/src/core/CL/cl_kernels/common/softmax_layer_quantized.cl +++ /dev/null @@ -1,529 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers_asymm.h" - -#if defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) && defined(DIFF_MIN) - -#define VEC_BASE VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) -#define VEC_INT VEC_DATA_TYPE(int, VECTOR_SIZE) -#define VEC_FLOAT VEC_DATA_TYPE(float, VECTOR_SIZE) - -/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel. - * - * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar - * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128 - * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16 - * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE - * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0) - * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS. - * @note -DDIFF_MIN must be passed at compile time. It is threshold difference between maximum value of input data and current processed value, it defines whether the value will be taken into account or not. - * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed. - * - * @param[in] src_ptr Pointer to the source tensor slice. 
Supported data types: S32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr - * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes) - * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes) - * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes) - * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor - * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void softmax_layer_norm_quantized( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(sum), - TENSOR3D_DECLARATION(dst)) -{ - const int x_offs = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VECTOR_SIZE_LEFTOVER) % VECTOR_SIZE), 0); - - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(int) + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z; - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z; - - Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum); - -#ifdef BETA - // Initialize beta - VEC_FLOAT beta = (VEC_FLOAT)BETA; - VEC_FLOAT scale_beta = -BETA * SCALE; -#else /* BETA */ - VEC_FLOAT scale_beta = -SCALE; -#endif /* BETA */ - - // Load max value of 1D logits vector (row) - float sum_val = *((__global float *)offset(&sum, 0, get_global_id(1))); - float sum_val_inverse = 256.f / sum_val; - - VEC_INT data_diff = VLOAD(VECTOR_SIZE)(0, (__global int *)src_addr); - VEC_FLOAT data_diff_f = CONVERT(data_diff, VEC_FLOAT); - - data_diff_f *= scale_beta; - data_diff_f = exp(data_diff_f); - data_diff_f *= sum_val_inverse; - -#ifdef QASYMM8_SIGNED - data_diff_f -= 128.f; -#endif /* QASYMM8_SIGNED */ - VEC_INT 
data = CONVERT(data_diff_f, VEC_INT); - VEC_BASE data0 = CONVERT_SAT(data, VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)); - STORE_VECTOR_SELECT(data, DATA_TYPE, dst_addr, VECTOR_SIZE, VECTOR_SIZE_LEFTOVER, VECTOR_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); -} - -#if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) - -/* Number of workitems in dimension 0. */ -#if !defined(GRID_SIZE) -#define GRID_SIZE 1 -#endif /* !defined(GRID_SIZE) */ - -#define VEC_UINT VEC_DATA_TYPE(uint, VECTOR_SIZE) - -VEC_INT mult_by_quantized_multiplier(VEC_INT data) -{ -#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) - if(INPUT_BETA_MULTIPLIER > 1) - { - return ASYMM_MULT(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, VECTOR_SIZE); - } -#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */ - return data; -} - -/** Shifts the values of the input tensor by the max calculated in softmax_layer_max kernel, - * then gets the exponent of each element as sums all elements across each row. - * - * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar - * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128 - * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16 - * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE - * @note In case the input is not multiple of VECTOR_SIZE -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed. - * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0) - * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS. - * @note -DDIFF_MIN must be passed at compile time. It is threshold difference between maximum value of input data and current processed value, it defines whether the value will be taken into account or not. - * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed. - * - * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] max_ptr Pointer to the max values tensor slice. 
Supported data types: same as @p src_ptr - * @param[in] max_stride_x Stride of the max values tensor in X dimension (in bytes) - * @param[in] max_step_x max_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] max_stride_y Stride of the max values tensor in Y dimension (in bytes) - * @param[in] max_step_y max_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] max_stride_z Stride of the max values tensor in Z dimension (in bytes) - * @param[in] max_step_z max_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] max_offset_first_element_in_bytes The offset of the first element in the max values tensor - * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: S32 - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr - * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes) - * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes) - * @param[in] sum_step_y sum_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes) - * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor - */ -__kernel void softmax_layer_max_shift_exp_sum_quantized_serial( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(maxo), - TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(sum)) -{ - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z; - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z; - - Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo); - Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum); - - VEC_BASE max_val_vec = (VEC_BASE)(MIN_VALUE); - -#ifdef BETA - // Initialize beta - VEC_FLOAT beta = (VEC_FLOAT)BETA; - VEC_FLOAT scale_beta = -BETA * SCALE; -#else /* BETA */ - VEC_FLOAT scale_beta = -SCALE; -#endif /* BETA */ - - // Calculate max of row -#ifdef NON_MULTIPLE_OF_VECTOR_SIZE - VEC_BASE vec_min_val = (VEC_BASE)(MIN_VALUE); - VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr); - VEC_INT widx = (VEC_INT)VECTOR_SIZE_LEFTOVER > VEC_OFFS(int, VECTOR_SIZE); - max_val_vec = max(max_val_vec, select(vec_min_val, data, CONVERT(widx, VEC_BASE))); -#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */ - - for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE) - { - VEC_BASE data = 
VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE))); - max_val_vec = max(data, max_val_vec); - } - - // Perform max reduction - DATA_TYPE max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE); - *((__global DATA_TYPE *)maxo.ptr) = max_local; - - // Second part - - // Load max value of 1D logits vector (row) - int max_val = convert_int(max_local); - VEC_FLOAT sum1D_f = 0.f; - // Start with the leftover items -#ifdef NON_MULTIPLE_OF_VECTOR_SIZE - VEC_INT data_fp = CONVERT(data, VEC_INT); - VEC_INT data_diff = max_val - data_fp; - VEC_FLOAT data_fp_f = CONVERT(data_diff, VEC_FLOAT); - data_fp_f *= scale_beta; - data_fp_f = exp(data_fp_f); - data_fp_f = select(0, data_fp_f, widx); - VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER) - (data_diff, 0, (__global int *)dst_addr); - sum1D_f += data_fp_f; -#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */ - // Do the rest and compute exp and sum - for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE) - { - VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE))); - VEC_INT data_fp = CONVERT(data, VEC_INT); - VEC_INT data_diff = max_val - data_fp; - VEC_FLOAT data_fp_f = CONVERT(data_diff, VEC_FLOAT); - data_fp_f *= scale_beta; - data_fp_f = exp(data_fp_f); - sum1D_f += data_fp_f; - VSTORE(VECTOR_SIZE) - (data_diff, 0, (__global int *)(dst_addr + i * sizeof(int))); - } - // Perform sum reduction - *((__global float *)sum.ptr) = SUM_REDUCE(sum1D_f, VECTOR_SIZE); -} - -/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value, - * then gets the exponent of each element as sums all elements across each row. - * - * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar - * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128 - * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16 - * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE - * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed. - * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0) - * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS. - * @note -DDIFF_MIN must be passed at compile time. It is threshold difference between maximum value of input data and current processed value, it defines whether the value will be taken into account or not. - * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed. - * - * @param[in] src_ptr Pointer to the source tensor slice. 
Supported data types: F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr - * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes) - * @param[in] maxo_step_x max_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes) - * @param[in] maxo_step_y max_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes) - * @param[in] maxo_step_z max_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor - * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[out] sum_ptr Pointer to the sum values tensor slice. 
Supported data types: same as @p src_ptr - * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes) - * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes) - * @param[in] sum_step_y sum_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes) - * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor - */ -__kernel void softmax_layer_max_shift_exp_sum_quantized_parallel( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(maxo), - TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(sum)) -{ - const uint lid = get_local_id(0); - const uint x_offs = (VECTOR_SIZE_LEFTOVER + lid * VECTOR_SIZE); - - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z; - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(int) + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z; - - Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo); - Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum); - - // Define one temporary vector per work-item. - __local VEC_INT tmp_local[GRID_SIZE]; - __local DATA_TYPE max_local; - - VEC_BASE vec_min_val = (VEC_BASE)(MIN_VALUE); - VEC_BASE max_val_vec = vec_min_val; - - // Number of iterations per work-item. - const uint width = (SRC_WIDTH / GRID_SIZE) >> LOG_VECTOR_SIZE; - // Calculate max of row - uint i = 0; - for(; i < width; ++i) - { - VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); - max_val_vec = max(data_max, max_val_vec); - } -#ifdef NON_MULTIPLE_OF_GRID_SIZE - // How many work-items needed to complete the computation. 
- int boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE; - if(lid < boundary_workitems) - { - VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); - max_val_vec = max(data_max, max_val_vec); - } -#ifdef NON_MULTIPLE_OF_VECTOR_SIZE - VEC_INT widx; - if(lid == 0) - { - // Handle non multiple of 4 - VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE))); - widx = (VEC_INT)VECTOR_SIZE_LEFTOVER > VEC_OFFS(int, VECTOR_SIZE); - max_val_vec = max(max_val_vec, select(vec_min_val, data_max, CONVERT(widx, VEC_BASE))); - } -#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */ -#endif /* NON_MULTIPLE_OF_GRID_SIZE */ - tmp_local[lid] = CONVERT(max_val_vec, VEC_INT); - - barrier(CLK_LOCAL_MEM_FENCE); - - if(GRID_SIZE >= 256) - { - if(lid < 128) - { - tmp_local[lid] = max(tmp_local[lid + 128], tmp_local[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 128) - { - if(lid < 64) - { - tmp_local[lid] = max(tmp_local[lid + 64], tmp_local[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 64) - { - if(lid < 32) - { - tmp_local[lid] = max(tmp_local[lid + 32], tmp_local[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 32) - { - if(lid < 16) - { - tmp_local[lid] = max(tmp_local[lid + 16], tmp_local[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 16) - { - if(lid < 8) - { - tmp_local[lid] = max(tmp_local[lid + 8], tmp_local[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 8) - { - if(lid < 4) - { - tmp_local[lid] = max(tmp_local[lid + 4], tmp_local[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 4) - { - if(lid < 2) - { - tmp_local[lid] = max(tmp_local[lid + 2], tmp_local[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(lid == 0) - { - max_val_vec = max(CONVERT((tmp_local[lid + 1]), VEC_BASE), CONVERT((tmp_local[lid]), VEC_BASE)); - max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE); - } - barrier(CLK_LOCAL_MEM_FENCE); - - /* Second section */ - - // Set sum vector - VEC_INT sum1D = 0; - int max_val = convert_int(max_local); - - // Shift values, exp and sum - for(i = 0; i < width; ++i) - { - VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); - VEC_INT data_fp = CONVERT(data, VEC_INT); - VEC_INT data_diff = data_fp - max_val; - VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff); - data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE); - data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE); - VSTORE(VECTOR_SIZE) - (data_diff, 0, (__global int *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(int))); - sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN)); - } -#ifdef NON_MULTIPLE_OF_GRID_SIZE - boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE; - if(lid < boundary_workitems) - { - VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); - VEC_INT data_fp = CONVERT(data, VEC_INT); - VEC_INT data_diff = data_fp - max_val; - VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff); - data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE); - data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE); - VSTORE(VECTOR_SIZE) - (data_diff, 0, (__global int *)(dst_addr 
+ (i * GRID_SIZE * VECTOR_SIZE) * sizeof(int))); - sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN)); - } -#ifdef NON_MULTIPLE_OF_VECTOR_SIZE - if(lid == 0) - { - // Handle non multiple of vector size ((GRID_SIZE * i * 4) + 4, 0); move 4 float positions ahead, *4 is due to the stride - VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE))); - VEC_INT data_fp = CONVERT(data, VEC_INT); - VEC_INT data_diff = data_fp - max_val; - VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff); - data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE); - data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE); - VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER) - (data_diff, 0, (__global int *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(int))); - data_fp = select(MIN_VALUE, data_fp, data_diff >= (VEC_INT)(DIFF_MIN)); - data_fp = select(0, data_fp, widx); - sum1D = sum1D + data_fp; - } -#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */ -#endif /* NON_MULTIPLE_OF_GRID_SIZE */ - tmp_local[lid] = sum1D; - - barrier(CLK_LOCAL_MEM_FENCE); - - if(GRID_SIZE >= 256) - { - if(lid < 128) - { - tmp_local[lid] += tmp_local[lid + 128]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 128) - { - if(lid < 64) - { - tmp_local[lid] += tmp_local[lid + 64]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 64) - { - if(lid < 32) - { - tmp_local[lid] += tmp_local[lid + 32]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 32) - { - if(lid < 16) - { - tmp_local[lid] += tmp_local[lid + 16]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 16) - { - if(lid < 8) - { - tmp_local[lid] += tmp_local[lid + 8]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 8) - { - if(lid < 4) - { - tmp_local[lid] += tmp_local[lid + 4]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 4) - { - if(lid < 2) - { - tmp_local[lid] += tmp_local[lid + 2]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(lid == 0) - { - sum1D = (tmp_local[lid + 1] + tmp_local[lid]); - // Perform sum reduction - *((__global int *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE); - } -} -#endif // #if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) -#endif /* defined(DATA_TYPE) && defined(DIFF_MIN) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) && defined(MIN_VALUE) */ diff --git a/src/core/CL/cl_kernels/common/transpose.cl b/src/core/CL/cl_kernels/common/transpose.cl index 82db2908b56e2d02908a373b3e105a6a25521efb..5b4c68ca1053a56bb7ccfcbaf311198bbf4e0afc 100644 --- a/src/core/CL/cl_kernels/common/transpose.cl +++ b/src/core/CL/cl_kernels/common/transpose.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -124,23 +124,28 @@ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes) * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source matrix in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as src_ptr * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination matrix in Z dimension (in bytes) + * @param[in] dst_step_z dst_gx_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix */ -__kernel void transpose(IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) +__kernel void transpose(TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) { uint x_offs = max((int)(get_global_id(0) * VEC_SIZE_X - (VEC_SIZE_X - VEC_SIZE_LEFTOVER_X) % VEC_SIZE_X), 0); uint y_offs = max((int)(get_global_id(1) * VEC_SIZE_Y - (VEC_SIZE_Y - VEC_SIZE_LEFTOVER_Y) % VEC_SIZE_Y), 0); + uint z_offs = get_global_id(2); // Compute addresses - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * DATA_TYPE_IN_BYTES + y_offs * src_stride_y; - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + y_offs * DATA_TYPE_IN_BYTES + x_offs * dst_stride_y; + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * DATA_TYPE_IN_BYTES + y_offs * src_stride_y + z_offs * src_stride_z; + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + y_offs * DATA_TYPE_IN_BYTES + x_offs * dst_stride_y + z_offs * dst_stride_z; // Load the NxM block at (x, y) VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) @@ -237,4 +242,4 @@ __kernel void transpose(IMAGE_DECLARATION(src), VEC_SIZE_LEFTOVER_Y != 0 && get_global_id(1) == 0); } -#endif // defined(DATA_TYPE_IN_BYTES) && defined(VEC_SIZE_X) && defined(VEC_SIZE_LEFTOVER_X) && defined(VEC_SIZE_Y) && defined(VEC_SIZE_LEFTOVER_Y) \ No newline at end of file +#endif // defined(DATA_TYPE_IN_BYTES) && defined(VEC_SIZE_X) && defined(VEC_SIZE_LEFTOVER_X) && defined(VEC_SIZE_Y) && defined(VEC_SIZE_LEFTOVER_Y) diff --git a/src/core/CL/cl_kernels/gemm_helpers.h b/src/core/CL/cl_kernels/gemm_helpers.h index 0e938cb6682d813273bde4cba03d051780a182be..4bef02314f1edc3ec639617e2f221cb1c8973138 100644 --- a/src/core/CL/cl_kernels/gemm_helpers.h +++ b/src/core/CL/cl_kernels/gemm_helpers.h @@ -34,14 +34,14 @@ * */ #define SCALAR_ACCESS_STR(offset, n0, x) scalar_access_##offset##_##n0(x) -#define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x) +#define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x) // offset == 0 -#define scalar_access_0_1(x) ((x).s0) -#define scalar_access_0_2(x) ((x).s01) -#define scalar_access_0_3(x) ((x).s012) -#define 
scalar_access_0_4(x) ((x).s0123) -#define scalar_access_0_8(x) ((x).s01234567) +#define scalar_access_0_1(x) ((x).s0) +#define scalar_access_0_2(x) ((x).s01) +#define scalar_access_0_3(x) ((x).s012) +#define scalar_access_0_4(x) ((x).s0123) +#define scalar_access_0_8(x) ((x).s01234567) #define scalar_access_0_16(x) ((x).s0123456789ABCDEF) // offset == 1 @@ -100,8 +100,7 @@ * @param[in] Z The z-axis offset vector * @{ */ -#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ - ({}) +#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) ({}) #define LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##0) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); @@ -186,8 +185,10 @@ * @param[in] Z The z-axis offset vector * @{ */ -#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) -#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) +#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ + LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) +#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ + LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) /** @} */ // end of group LOAD_TENSOR /** Load 2D tensor (consecutive rows and columns) with Z offset. @@ -202,8 +203,7 @@ * @param[in] Z The z-axis offset vector * @{ */ -#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ - ({}) +#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) ({}) #define LOAD_TENSOR_M0X1(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); @@ -279,8 +279,10 @@ * @param[in] Z The z-axis offset vector * @{ */ -#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) -#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) /** @}*/ // end of group LOAD_TENSOR_M0XN0 /** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1). 
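A concrete instance may help when reading the reflowed LOAD_TENSOR chain above (illustrative only; it assumes the usual helpers.h definitions where VLOAD(4) resolves to vload4, and a0 is a previously declared float4). LOAD_TENSOR(1, 4, float, a, ptr, 0, src_stride_y, zin) resolves through LOAD_TENSOR_STR, LOAD_TENSOR_ROW_1 and SCALAR_ACCESS(0, 4, a0) to:

(a0).s0123 = vload4(0, (__global float *)(ptr + 0 * src_stride_y + zin0));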
@@ -395,8 +397,10 @@ * @param[in] Z The z-axis offset vector * @{ */ -#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) -#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) +#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) +#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) /** @} */ // end of group LOAD_BLOCK /** Partially load the 0 to (n-1)th rows of the given variables @@ -517,8 +521,10 @@ * @param[in] Z The offset in z-axis direction * @{ */ -#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) -#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) +#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) +#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) /** Load a block that can be partial in both x and y dimensions * * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring small performance penalty. @@ -541,22 +547,23 @@ * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial load Y. True to use PARTIAL_STORE_M0 rather than M0. * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0. 
*/ -#define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ - if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ - { \ - LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ - } \ - else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ - { \ - LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ - } \ - else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ - { \ - LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ - } \ - else \ - { \ - LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ +#define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ + if (!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ + { \ + LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ + } \ + else if ((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ + { \ + LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ + } \ + else if (!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ + { \ + LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ + } \ + else \ + { \ + LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ } /** Load a block that can only be partial in x but not y. * @@ -578,14 +585,15 @@ * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0) * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0. */ -#define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ - if(!(PARTIAL_COND_X)) \ - { \ - LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ - } \ - else \ - { \ - LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ +#define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, \ + PARTIAL_COND_X) \ + if (!(PARTIAL_COND_X)) \ + { \ + LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ + } \ + else \ + { \ + LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ } /** Load a block that can only be partial in y but not x. * @@ -607,14 +615,15 @@ * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0) * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0. 
*/ -#define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ - if(!(PARTIAL_COND_Y)) \ - { \ - LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ - } \ - else \ - { \ - LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ +#define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_COND_Y) \ + if (!(PARTIAL_COND_Y)) \ + { \ + LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ + } \ + else \ + { \ + LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ } /** @} */ // end of group LOAD_BLOCK_PARTIAL /** Boundary-aware GeMM block load @@ -676,28 +685,33 @@ */ #if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 // Case1: No partial blocks in either x or y -#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ +#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) #elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 // Case2: Partial blocks in y -#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ +#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) #elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 // Case3: Partial blocks in x -#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ +#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) #else // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 // Case4: Partial blocks in both x and y -#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ - LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) +#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ + LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) 
-#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 +#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 /** @} */ // end of group LOAD_BLOCK_BOUNDARY_AWARE /** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1). @@ -795,8 +809,10 @@ * @param[in] Y_STEP_ROW The incremental step row for the y coordinate (in pixels) * @{ */ -#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) -#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) +#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) +#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) /** @} */ // end of group LOAD_TEXTURE2D /** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1) passing the Y index for each row to be loaded. @@ -815,7 +831,7 @@ #define LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##0; \ - if(Y_MASK##0 != 0) \ + if (Y_MASK##0 != 0) \ BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##0 * STRIDE_Y)); \ else \ BASENAME##0 = 0; @@ -824,7 +840,7 @@ LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##1; \ - if(Y_MASK##1 != 0) \ + if (Y_MASK##1 != 0) \ BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##1 * STRIDE_Y)); \ else \ BASENAME##1 = 0; @@ -833,7 +849,7 @@ LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##2; \ - if(Y_MASK##2 != 0) \ + if (Y_MASK##2 != 0) \ BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##2 * STRIDE_Y)); \ else \ BASENAME##2 = 0; @@ -842,7 +858,7 @@ LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##3; \ - if(Y_MASK##3 != 0) \ + if (Y_MASK##3 != 0) \ BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##3 * STRIDE_Y)); \ else \ BASENAME##3 = 0; @@ -851,7 +867,7 @@ LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##4; \ - if(Y_MASK##4 != 0) \ + if (Y_MASK##4 != 0) \ BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##4 * STRIDE_Y)); \ else \ BASENAME##4 = 0; @@ -860,7 +876,7 @@ LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##5; \ - if(Y_MASK##5 != 0) \ + if (Y_MASK##5 != 0) \ BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##5 * STRIDE_Y)); \ else \ BASENAME##5 = 0; @@ -869,7 +885,7 @@ LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##6; \ - if(Y_MASK##6 != 0) \ + if (Y_MASK##6 != 0) \ BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##6 * STRIDE_Y)); \ else \ BASENAME##6 = 0; @@ -878,7 +894,7 @@ LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, 
STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##7; \ - if(Y_MASK##7 != 0) \ + if (Y_MASK##7 != 0) \ BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##7 * STRIDE_Y)); \ else \ BASENAME##7 = 0; @@ -887,7 +903,7 @@ LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##8; \ - if(Y_MASK##8 != 0) \ + if (Y_MASK##8 != 0) \ BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##8 * STRIDE_Y)); \ else \ BASENAME##8 = 0; @@ -896,7 +912,7 @@ LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##9; \ - if(Y_MASK##9 != 0) \ + if (Y_MASK##9 != 0) \ BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##9 * STRIDE_Y)); \ else \ BASENAME##9 = 0; @@ -905,7 +921,7 @@ LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##A; \ - if(Y_MASK##A != 0) \ + if (Y_MASK##A != 0) \ BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##A * STRIDE_Y)); \ else \ BASENAME##A = 0; @@ -914,7 +930,7 @@ LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##B; \ - if(Y_MASK##B != 0) \ + if (Y_MASK##B != 0) \ BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##B * STRIDE_Y)); \ else \ BASENAME##B = 0; @@ -923,7 +939,7 @@ LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##C; \ - if(Y_MASK##C != 0) \ + if (Y_MASK##C != 0) \ BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##C * STRIDE_Y)); \ else \ BASENAME##C = 0; @@ -932,7 +948,7 @@ LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##D; \ - if(Y_MASK##D != 0) \ + if (Y_MASK##D != 0) \ BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##D * STRIDE_Y)); \ else \ BASENAME##D = 0; @@ -941,7 +957,7 @@ LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##E; \ - if(Y_MASK##E != 0) \ + if (Y_MASK##E != 0) \ BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##E * STRIDE_Y)); \ else \ BASENAME##E = 0; @@ -950,7 +966,7 @@ LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##F; \ - if(Y_MASK##F != 0) \ + if (Y_MASK##F != 0) \ BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##F * STRIDE_Y)); \ else \ BASENAME##F = 0; @@ -976,8 +992,10 @@ * @param[in] Y_MASK The y-axis mask vector. 
If 0, forces BASENAMEn to 0 * @{ */ -#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) -#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) +#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ + LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) +#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ + LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) /** @} */ // end of group LOAD_BLOCK_INDIRECT /** Loads the elements from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1). @@ -1088,8 +1106,10 @@ * @param[in] STRIDE_Y The stride in y-axis direction * @{ */ -#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) -#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) +#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) +#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) /** @} */ // end of group LOAD_SCALAR_AS_VECTOR /** Basic macros to calculate Z offset values from Z0 to Zn-1 @@ -1187,8 +1207,10 @@ * @param[in] STRIDE_Y The stride value in y-axis direction * @{ */ -#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) -#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) +#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ + CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) +#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ + CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) /** @} */ // end of group CALCULATE_Z_OFFSET /** Scale the rows in the given variables (BASENAME0 to BASENAMEn-1) @@ -1199,8 +1221,7 @@ * @param[in] SCALE The scale factor * @{ */ -#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \ - BASENAME##0 *= (DATA_TYPE)SCALE; +#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) BASENAME##0 *= (DATA_TYPE)SCALE; #define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \ SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \ @@ -1275,7 +1296,7 @@ * @{ */ #define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE) -#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) +#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) /** @} */ // end of group SCALE_BLOCK /** Create a new vector containing the values at the given index for a set of given vectors @@ 
-1287,8 +1308,7 @@ * @param[in] TYPE The data type of the destination vectors * @{ */ -#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \ - TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL); +#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL); #define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \ VEC_DATA_TYPE(TYPE, 2) \ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL); @@ -1297,13 +1317,20 @@ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL); #define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \ VEC_DATA_TYPE(TYPE, 4) \ - BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL); -#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \ - VEC_DATA_TYPE(TYPE, 8) \ - BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL); -#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \ - VEC_DATA_TYPE(TYPE, 16) \ - BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL); + BASENAME##IDX_COL = \ + (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL); +#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 8) \ + BASENAME##IDX_COL = \ + (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, \ + (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL); +#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 16) \ + BASENAME##IDX_COL = \ + (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, \ + (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, \ + (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, \ + (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL); /** @} */ // end of group COLUMN_VECTORn /** Create a new vector containing the values at the given index. 
Utility macros for transposing a colum-vector @@ -1315,8 +1342,7 @@ * @param[in] TYPE The data type of the destination vectors * @{ */ -#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) \ - TYPE BASENAME##IDX_COL = (TYPE)((X##0)); +#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) TYPE BASENAME##IDX_COL = (TYPE)((X##0)); #define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \ VEC_DATA_TYPE(TYPE, 2) \ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0), (X##1)); @@ -1329,9 +1355,10 @@ #define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \ VEC_DATA_TYPE(TYPE, 8) \ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7)); -#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \ - VEC_DATA_TYPE(TYPE, 16) \ - BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F)); +#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 16) \ + BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), \ + (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F)); /** @} */ // end of group COLUMN_VECTOR_SCALARn /** Create transposed vectors of the given vectors @@ -1343,8 +1370,7 @@ * @param[in] TYPE The data type of the transposed vectors * @{ */ -#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) \ - COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE); +#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE); #define TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE) \ COLUMN_VECTOR(K0, 0, BASENAME, BS, TYPE); \ COLUMN_VECTOR(K0, 1, BASENAME, BS, TYPE); @@ -1417,8 +1443,7 @@ * @param[in] BIAS The basename of the added variables * @{ */ -#define ADD_ROW_1(BASENAME, BIAS) \ - BASENAME##0 += BIAS##0; +#define ADD_ROW_1(BASENAME, BIAS) BASENAME##0 += BIAS##0; #define ADD_ROW_2(BASENAME, BIAS) \ ADD_ROW_1(BASENAME, BIAS) \ @@ -1493,7 +1518,7 @@ * @{ */ #define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS) -#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS) +#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS) /** @} */ // end of group ADD_BLOCK /** Broadcast (add single value) to the each element of the destination variables @@ -1503,8 +1528,7 @@ * @param[in] BIAS The variable containing the value to add * @{ */ -#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) \ - BASENAME##0 += BIAS; +#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) BASENAME##0 += BIAS; #define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \ ADD_ROW_BROADCAST_1(BASENAME, BIAS) \ @@ -1578,7 +1602,7 @@ * @{ */ #define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS) -#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) +#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) /** @} */ // end of group ADD_BLOCK_BROADCAST /** Apply activation to the given variables @@ -1668,8 +1692,10 @@ * @param[in] B_VAL Additional value required by the activation * @{ */ -#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) -#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) +#define 
ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) +#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) /** @} */ // end of group ACTIVATION_BLOCK /** Apply convert_ to the given variables @@ -1773,6 +1799,8 @@ * @param[in] BASENAME_DST The basename of the destination variables * @{ */ -#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) -#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) +#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) +#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) /** @} */ // end of group CONVERT_BLOCK diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h index e0fd8dc3e379c7f0c1150453afc79c6cad89fe40..87a1875f93790deadbb9d85c244d241f345e36f9 100644 --- a/src/core/CL/cl_kernels/helpers.h +++ b/src/core/CL/cl_kernels/helpers.h @@ -81,11 +81,11 @@ * @return The reversed vector * @{ */ -#define REV1(x) ((x)) -#define REV2(x) ((x).s10) -#define REV3(x) ((x).s210) -#define REV4(x) ((x).s3210) -#define REV8(x) ((x).s76543210) +#define REV1(x) ((x)) +#define REV2(x) ((x).s10) +#define REV3(x) ((x).s210) +#define REV4(x) ((x).s3210) +#define REV8(x) ((x).s76543210) #define REV16(x) ((x).sFEDCBA9876543210) /** @} */ // end of group REVn @@ -99,7 +99,7 @@ * @{ */ #define REVERSE_STR(x, s) REV##s((x)) -#define REVERSE(x, s) REVERSE_STR(x, s) +#define REVERSE(x, s) REVERSE_STR(x, s) /** @} */ // end of group REVERSE /** Circular-right-shift (rotate-right) the vector of size s by the amount of n. @@ -138,16 +138,16 @@ #define ROT8_7(x) ((x).s12345670) #define ROT8_8(x) ((x)) -#define ROT16_0(x) ((x)) -#define ROT16_1(x) ((x).sF0123456789ABCDE) -#define ROT16_2(x) ((x).sEF0123456789ABCD) -#define ROT16_3(x) ((x).sDEF0123456789ABC) -#define ROT16_4(x) ((x).sCDEF0123456789AB) -#define ROT16_5(x) ((x).sBCDEF0123456789A) -#define ROT16_6(x) ((x).sABCDEF0123456789) -#define ROT16_7(x) ((x).s9ABCDEF012345678) -#define ROT16_8(x) ((x).s89ABCDEF01234567) -#define ROT16_9(x) ((x).s789ABCDEF0123456) +#define ROT16_0(x) ((x)) +#define ROT16_1(x) ((x).sF0123456789ABCDE) +#define ROT16_2(x) ((x).sEF0123456789ABCD) +#define ROT16_3(x) ((x).sDEF0123456789ABC) +#define ROT16_4(x) ((x).sCDEF0123456789AB) +#define ROT16_5(x) ((x).sBCDEF0123456789A) +#define ROT16_6(x) ((x).sABCDEF0123456789) +#define ROT16_7(x) ((x).s9ABCDEF012345678) +#define ROT16_8(x) ((x).s89ABCDEF01234567) +#define ROT16_9(x) ((x).s789ABCDEF0123456) #define ROT16_10(x) ((x).s6789ABCDEF012345) #define ROT16_11(x) ((x).s56789ABCDEF01234) #define ROT16_12(x) ((x).s456789ABCDEF0123) @@ -168,7 +168,7 @@ * @{ */ #define ROTATE_STR(x, s, n) ROT##s##_##n(x) -#define ROTATE(x, s, n) ROTATE_STR(x, s, n) +#define ROTATE(x, s, n) ROTATE_STR(x, s, n) /** @} */ // end of group ROTATE /** Creates a vector of size n filled with offset values corresponding to the location of each element. 
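A minimal usage sketch (not part of the patch, and assuming helpers.h is included) of why each helper comes as a FOO / FOO_STR pair: the outer macro expands its arguments before the inner one token-pastes, so a size supplied through another macro still resolves to the right REVn variant. VEC_SIZE below is an illustrative name, not something defined by the library.

#include "helpers.h"

#define VEC_SIZE 4 // illustrative compile-time vector size

__kernel void reverse_example(__global float4 *buf)
{
    float4 v = buf[get_global_id(0)];
    // REVERSE(v, VEC_SIZE) -> REVERSE_STR(v, 4) -> REV4((v)) -> ((v).s3210).
    // Calling REVERSE_STR(v, VEC_SIZE) directly would paste the token REVVEC_SIZE and fail.
    buf[get_global_id(0)] = REVERSE(v, VEC_SIZE);
}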
@@ -179,11 +179,11 @@ * @return The vector filled with offset values * @{ */ -#define V_OFFS1(dt) (dt##1)(0) -#define V_OFFS2(dt) (dt##2)(0, 1) -#define V_OFFS3(dt) (dt##3)(0, 1, 2) -#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) -#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) +#define V_OFFS1(dt) (dt##1)(0) +#define V_OFFS2(dt) (dt##2)(0, 1) +#define V_OFFS3(dt) (dt##3)(0, 1, 2) +#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) +#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) #define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) /** @} */ // end of group V_OFFSn @@ -197,11 +197,11 @@ * @{ */ #define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) -#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) +#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) /** @} */ // end of group VEC_OFFS #define VLOAD_STR(size) vload##size -#define VLOAD(size) VLOAD_STR(size) +#define VLOAD(size) VLOAD_STR(size) /** Extended partial vload that correctly handles scalar values as well. * Load the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of load ops @@ -219,23 +219,23 @@ * @{ */ #define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size -#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size) +#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size) #define NO_LOAD(data, offs, ptr) \ { \ } // Size == 1 (scalar) -#define vload_partial_1_0 NO_LOAD -#define vload_partial_1_1 vload1 -#define vload_partial_1_2 NO_LOAD -#define vload_partial_1_3 NO_LOAD -#define vload_partial_1_4 NO_LOAD -#define vload_partial_1_5 NO_LOAD -#define vload_partial_1_6 NO_LOAD -#define vload_partial_1_7 NO_LOAD -#define vload_partial_1_8 NO_LOAD -#define vload_partial_1_9 NO_LOAD +#define vload_partial_1_0 NO_LOAD +#define vload_partial_1_1 vload1 +#define vload_partial_1_2 NO_LOAD +#define vload_partial_1_3 NO_LOAD +#define vload_partial_1_4 NO_LOAD +#define vload_partial_1_5 NO_LOAD +#define vload_partial_1_6 NO_LOAD +#define vload_partial_1_7 NO_LOAD +#define vload_partial_1_8 NO_LOAD +#define vload_partial_1_9 NO_LOAD #define vload_partial_1_10 NO_LOAD #define vload_partial_1_11 NO_LOAD #define vload_partial_1_12 NO_LOAD @@ -244,16 +244,16 @@ #define vload_partial_1_15 NO_LOAD #define vload_partial_1_16 NO_LOAD // Size == 2 -#define vload_partial_2_0 NO_LOAD -#define vload_partial_2_1 vload_partial_1 -#define vload_partial_2_2 vload_partial_2 -#define vload_partial_2_3 NO_LOAD -#define vload_partial_2_4 NO_LOAD -#define vload_partial_2_5 NO_LOAD -#define vload_partial_2_6 NO_LOAD -#define vload_partial_2_7 NO_LOAD -#define vload_partial_2_8 NO_LOAD -#define vload_partial_2_9 NO_LOAD +#define vload_partial_2_0 NO_LOAD +#define vload_partial_2_1 vload_partial_1 +#define vload_partial_2_2 vload_partial_2 +#define vload_partial_2_3 NO_LOAD +#define vload_partial_2_4 NO_LOAD +#define vload_partial_2_5 NO_LOAD +#define vload_partial_2_6 NO_LOAD +#define vload_partial_2_7 NO_LOAD +#define vload_partial_2_8 NO_LOAD +#define vload_partial_2_9 NO_LOAD #define vload_partial_2_10 NO_LOAD #define vload_partial_2_11 NO_LOAD #define vload_partial_2_12 NO_LOAD @@ -262,16 +262,16 @@ #define vload_partial_2_15 NO_LOAD #define vload_partial_2_16 NO_LOAD // Size == 3 -#define vload_partial_3_0 NO_LOAD -#define vload_partial_3_1 vload_partial_1 -#define vload_partial_3_2 vload_partial_2 -#define vload_partial_3_3 vload_partial_3 -#define vload_partial_3_4 NO_LOAD -#define vload_partial_3_5 NO_LOAD -#define vload_partial_3_6 NO_LOAD -#define vload_partial_3_7 NO_LOAD 
-#define vload_partial_3_8 NO_LOAD -#define vload_partial_3_9 NO_LOAD +#define vload_partial_3_0 NO_LOAD +#define vload_partial_3_1 vload_partial_1 +#define vload_partial_3_2 vload_partial_2 +#define vload_partial_3_3 vload_partial_3 +#define vload_partial_3_4 NO_LOAD +#define vload_partial_3_5 NO_LOAD +#define vload_partial_3_6 NO_LOAD +#define vload_partial_3_7 NO_LOAD +#define vload_partial_3_8 NO_LOAD +#define vload_partial_3_9 NO_LOAD #define vload_partial_3_10 NO_LOAD #define vload_partial_3_11 NO_LOAD #define vload_partial_3_12 NO_LOAD @@ -280,16 +280,16 @@ #define vload_partial_3_15 NO_LOAD #define vload_partial_3_16 NO_LOAD // Size == 4 -#define vload_partial_4_0 NO_LOAD -#define vload_partial_4_1 vload_partial_1 -#define vload_partial_4_2 vload_partial_2 -#define vload_partial_4_3 vload_partial_3 -#define vload_partial_4_4 vload_partial_4 -#define vload_partial_4_5 NO_LOAD -#define vload_partial_4_6 NO_LOAD -#define vload_partial_4_7 NO_LOAD -#define vload_partial_4_8 NO_LOAD -#define vload_partial_4_9 NO_LOAD +#define vload_partial_4_0 NO_LOAD +#define vload_partial_4_1 vload_partial_1 +#define vload_partial_4_2 vload_partial_2 +#define vload_partial_4_3 vload_partial_3 +#define vload_partial_4_4 vload_partial_4 +#define vload_partial_4_5 NO_LOAD +#define vload_partial_4_6 NO_LOAD +#define vload_partial_4_7 NO_LOAD +#define vload_partial_4_8 NO_LOAD +#define vload_partial_4_9 NO_LOAD #define vload_partial_4_10 NO_LOAD #define vload_partial_4_11 NO_LOAD #define vload_partial_4_12 NO_LOAD @@ -298,16 +298,16 @@ #define vload_partial_4_15 NO_LOAD #define vload_partial_4_16 NO_LOAD // Size == 8 -#define vload_partial_8_0 NO_LOAD -#define vload_partial_8_1 vload_partial_1 -#define vload_partial_8_2 vload_partial_2 -#define vload_partial_8_3 vload_partial_3 -#define vload_partial_8_4 vload_partial_4 -#define vload_partial_8_5 vload_partial_5 -#define vload_partial_8_6 vload_partial_6 -#define vload_partial_8_7 vload_partial_7 -#define vload_partial_8_8 vload_partial_8 -#define vload_partial_8_9 NO_LOAD +#define vload_partial_8_0 NO_LOAD +#define vload_partial_8_1 vload_partial_1 +#define vload_partial_8_2 vload_partial_2 +#define vload_partial_8_3 vload_partial_3 +#define vload_partial_8_4 vload_partial_4 +#define vload_partial_8_5 vload_partial_5 +#define vload_partial_8_6 vload_partial_6 +#define vload_partial_8_7 vload_partial_7 +#define vload_partial_8_8 vload_partial_8 +#define vload_partial_8_9 NO_LOAD #define vload_partial_8_10 NO_LOAD #define vload_partial_8_11 NO_LOAD #define vload_partial_8_12 NO_LOAD @@ -316,16 +316,16 @@ #define vload_partial_8_15 NO_LOAD #define vload_partial_8_16 NO_LOAD // Size == 16 -#define vload_partial_16_0 NO_LOAD -#define vload_partial_16_1 vload_partial_1 -#define vload_partial_16_2 vload_partial_2 -#define vload_partial_16_3 vload_partial_3 -#define vload_partial_16_4 vload_partial_4 -#define vload_partial_16_5 vload_partial_5 -#define vload_partial_16_6 vload_partial_6 -#define vload_partial_16_7 vload_partial_7 -#define vload_partial_16_8 vload_partial_8 -#define vload_partial_16_9 vload_partial_9 +#define vload_partial_16_0 NO_LOAD +#define vload_partial_16_1 vload_partial_1 +#define vload_partial_16_2 vload_partial_2 +#define vload_partial_16_3 vload_partial_3 +#define vload_partial_16_4 vload_partial_4 +#define vload_partial_16_5 vload_partial_5 +#define vload_partial_16_6 vload_partial_6 +#define vload_partial_16_7 vload_partial_7 +#define vload_partial_16_8 vload_partial_8 +#define vload_partial_16_9 vload_partial_9 #define 
vload_partial_16_10 vload_partial_10 #define vload_partial_16_11 vload_partial_11 #define vload_partial_16_12 vload_partial_12 @@ -351,17 +351,13 @@ * @param[in] PTR The base pointer * @{ */ -#define vload_partial_1(DATA, OFFSET, PTR) \ - DATA.s0 = vload1(OFFSET, PTR); +#define vload_partial_1(DATA, OFFSET, PTR) DATA.s0 = vload1(OFFSET, PTR); -#define vload_partial_2(DATA, OFFSET, PTR) \ - DATA.s01 = vload2(OFFSET, PTR); +#define vload_partial_2(DATA, OFFSET, PTR) DATA.s01 = vload2(OFFSET, PTR); -#define vload_partial_3(DATA, OFFSET, PTR) \ - DATA.s012 = vload3(OFFSET, PTR); +#define vload_partial_3(DATA, OFFSET, PTR) DATA.s012 = vload3(OFFSET, PTR); -#define vload_partial_4(DATA, OFFSET, PTR) \ - DATA.s0123 = vload4(OFFSET, PTR); +#define vload_partial_4(DATA, OFFSET, PTR) DATA.s0123 = vload4(OFFSET, PTR); #define vload_partial_5(DATA, OFFSET, PTR) \ vload_partial_4(DATA.s0123, OFFSET, PTR); \ @@ -375,8 +371,7 @@ vload_partial_4(DATA.s0123, OFFSET, PTR); \ vload_partial_3(DATA.s456, OFFSET, PTR + 4); -#define vload_partial_8(DATA, OFFSET, PTR) \ - DATA.s01234567 = vload8(OFFSET, PTR); +#define vload_partial_8(DATA, OFFSET, PTR) DATA.s01234567 = vload8(OFFSET, PTR); #define vload_partial_9(DATA, OFFSET, PTR) \ vload_partial_8(DATA.s01234567, OFFSET, PTR); \ @@ -406,13 +401,12 @@ vload_partial_8(DATA.s01234567, OFFSET, PTR); \ vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8); -#define vload_partial_16(DATA, OFFSET, PTR) \ - DATA = vload16(OFFSET, PTR); +#define vload_partial_16(DATA, OFFSET, PTR) DATA = vload16(OFFSET, PTR); /** @} */ // end of groupd vload_partial_n /** @} */ // end of groupd VLOAD_PARTIAL -#define PIXEL_UNIT4 1 -#define PIXEL_UNIT8 2 +#define PIXEL_UNIT4 1 +#define PIXEL_UNIT8 2 #define PIXEL_UNIT16 4 /** Utility macro to convert a vector size in pixel unit. 
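A usage sketch (not part of the patch, assuming helpers.h is included): the partial-load table above composes full vloads, so VLOAD_PARTIAL(8, 7) resolves through vload_partial_8_7 to vload_partial_7, which issues a vload4 into .s0123 followed by a vload3 into .s456. The kernel and buffer names are illustrative only.

#include "helpers.h"

__kernel void tail_load_example(__global const float *src, __global float *dst)
{
    float8 block = 0; // lanes that are not loaded keep their initial value
    // Loads src[0..6] into block.s0123456; block.s7 stays 0.
    VLOAD_PARTIAL(8, 7)(block, 0, src);
    vstore8(block, 0, dst);
}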
@@ -425,27 +419,45 @@ * @{ */ #define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size -#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) +#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) /** @} */ // end of group CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT #define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord))); -#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord))); -#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord))); +#define read_image2d_floatx2(img, x_coord, y_coord) \ + (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord))); +#define read_image2d_floatx4(img, x_coord, y_coord) \ + (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), \ + read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord))); #if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) #define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord))); -#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord))); -#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord))); +#define read_image2d_halfx2(img, x_coord, y_coord) \ + (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord))); +#define read_image2d_halfx4(img, x_coord, y_coord) \ + (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), \ + read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord))); #endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) #define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values)); -#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567)); -#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); +#define write_image2d_floatx2(img, x_coord, y_coord, values) \ + (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), \ + write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567)); +#define write_image2d_floatx4(img, x_coord, y_coord, values) \ + (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), \ + write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), \ + write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), \ + write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); #if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && 
defined(cl_khr_fp16) #define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values)); -#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567)); -#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); +#define write_image2d_halfx2(img, x_coord, y_coord, values) \ + (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), \ + write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567)); +#define write_image2d_halfx4(img, x_coord, y_coord, values) \ + (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), \ + write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), \ + write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), \ + write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); #endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) /** Utility macro to read a 2D OpenCL image object. @@ -462,7 +474,7 @@ * @{ */ #define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord) -#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) +#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) /** @} */ /** Utility macro to write a 2D OpenCL image object. @@ -478,26 +490,28 @@ * * @{ */ -#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values) -#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) +#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) \ + write_image2d_##data_type##x##n0(img, x_coord, y_coord, values) +#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) \ + WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) /** @} */ #define VSTORE_STR(size) vstore##size -#define VSTORE(size) VSTORE_STR(size) +#define VSTORE(size) VSTORE_STR(size) -#define float1 float -#define half1 half -#define char1 char -#define uchar1 uchar -#define short1 short +#define float1 float +#define half1 half +#define char1 char +#define uchar1 uchar +#define short1 short #define ushort1 ushort -#define int1 int -#define uint1 uint -#define long1 long -#define ulong1 ulong +#define int1 int +#define uint1 uint +#define long1 long +#define ulong1 ulong #define double1 double -#define vload1(OFFSET, PTR) *(OFFSET + PTR) +#define vload1(OFFSET, PTR) *(OFFSET + PTR) #define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA /** Extended partial vstore that correctly handles scalar values as well. 
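A usage sketch (not part of the patch, assuming helpers.h is included and an image created with four floats per pixel): READ_IMAGE2D(float, 4, ...) resolves to read_image2d_floatx4, i.e. four samplerless read_imagef calls at consecutive x coordinates packed into a single float16. Kernel and buffer names are illustrative.

#include "helpers.h"

__kernel void image_read_example(__read_only image2d_t src_img, __global float *dst)
{
    // x advances in pixel units; 4 consecutive pixels give 16 floats.
    const int x = get_global_id(0) * 4;
    const int y = get_global_id(1);
    float16 row = READ_IMAGE2D(float, 4, src_img, x, y);
    vstore16(row, 0, dst + (y * get_image_width(src_img) + x) * 4);
}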
@@ -516,23 +530,23 @@ * @{ */ #define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size -#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size) +#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size) #define NO_STORE(data, offs, ptr) \ { \ } // Size == 1 (scalar) -#define vstore_partial_1_0 NO_STORE -#define vstore_partial_1_1 vstore1 -#define vstore_partial_1_2 NO_STORE -#define vstore_partial_1_3 NO_STORE -#define vstore_partial_1_4 NO_STORE -#define vstore_partial_1_5 NO_STORE -#define vstore_partial_1_6 NO_STORE -#define vstore_partial_1_7 NO_STORE -#define vstore_partial_1_8 NO_STORE -#define vstore_partial_1_9 NO_STORE +#define vstore_partial_1_0 NO_STORE +#define vstore_partial_1_1 vstore1 +#define vstore_partial_1_2 NO_STORE +#define vstore_partial_1_3 NO_STORE +#define vstore_partial_1_4 NO_STORE +#define vstore_partial_1_5 NO_STORE +#define vstore_partial_1_6 NO_STORE +#define vstore_partial_1_7 NO_STORE +#define vstore_partial_1_8 NO_STORE +#define vstore_partial_1_9 NO_STORE #define vstore_partial_1_10 NO_STORE #define vstore_partial_1_11 NO_STORE #define vstore_partial_1_12 NO_STORE @@ -541,16 +555,16 @@ #define vstore_partial_1_15 NO_STORE #define vstore_partial_1_16 NO_STORE // Size == 2 -#define vstore_partial_2_0 NO_STORE -#define vstore_partial_2_1 vstore_partial_1 -#define vstore_partial_2_2 vstore_partial_2 -#define vstore_partial_2_3 NO_STORE -#define vstore_partial_2_4 NO_STORE -#define vstore_partial_2_5 NO_STORE -#define vstore_partial_2_6 NO_STORE -#define vstore_partial_2_7 NO_STORE -#define vstore_partial_2_8 NO_STORE -#define vstore_partial_2_9 NO_STORE +#define vstore_partial_2_0 NO_STORE +#define vstore_partial_2_1 vstore_partial_1 +#define vstore_partial_2_2 vstore_partial_2 +#define vstore_partial_2_3 NO_STORE +#define vstore_partial_2_4 NO_STORE +#define vstore_partial_2_5 NO_STORE +#define vstore_partial_2_6 NO_STORE +#define vstore_partial_2_7 NO_STORE +#define vstore_partial_2_8 NO_STORE +#define vstore_partial_2_9 NO_STORE #define vstore_partial_2_10 NO_STORE #define vstore_partial_2_11 NO_STORE #define vstore_partial_2_12 NO_STORE @@ -559,16 +573,16 @@ #define vstore_partial_2_15 NO_STORE #define vstore_partial_2_16 NO_STORE // Size == 3 -#define vstore_partial_3_0 NO_STORE -#define vstore_partial_3_1 vstore_partial_1 -#define vstore_partial_3_2 vstore_partial_2 -#define vstore_partial_3_3 vstore_partial_3 -#define vstore_partial_3_4 NO_STORE -#define vstore_partial_3_5 NO_STORE -#define vstore_partial_3_6 NO_STORE -#define vstore_partial_3_7 NO_STORE -#define vstore_partial_3_8 NO_STORE -#define vstore_partial_3_9 NO_STORE +#define vstore_partial_3_0 NO_STORE +#define vstore_partial_3_1 vstore_partial_1 +#define vstore_partial_3_2 vstore_partial_2 +#define vstore_partial_3_3 vstore_partial_3 +#define vstore_partial_3_4 NO_STORE +#define vstore_partial_3_5 NO_STORE +#define vstore_partial_3_6 NO_STORE +#define vstore_partial_3_7 NO_STORE +#define vstore_partial_3_8 NO_STORE +#define vstore_partial_3_9 NO_STORE #define vstore_partial_3_10 NO_STORE #define vstore_partial_3_11 NO_STORE #define vstore_partial_3_12 NO_STORE @@ -577,16 +591,16 @@ #define vstore_partial_3_15 NO_STORE #define vstore_partial_3_16 NO_STORE // Size == 4 -#define vstore_partial_4_0 NO_STORE -#define vstore_partial_4_1 vstore_partial_1 -#define vstore_partial_4_2 vstore_partial_2 -#define vstore_partial_4_3 vstore_partial_3 -#define vstore_partial_4_4 vstore_partial_4 -#define vstore_partial_4_5 NO_STORE -#define 
vstore_partial_4_6 NO_STORE -#define vstore_partial_4_7 NO_STORE -#define vstore_partial_4_8 NO_STORE -#define vstore_partial_4_9 NO_STORE +#define vstore_partial_4_0 NO_STORE +#define vstore_partial_4_1 vstore_partial_1 +#define vstore_partial_4_2 vstore_partial_2 +#define vstore_partial_4_3 vstore_partial_3 +#define vstore_partial_4_4 vstore_partial_4 +#define vstore_partial_4_5 NO_STORE +#define vstore_partial_4_6 NO_STORE +#define vstore_partial_4_7 NO_STORE +#define vstore_partial_4_8 NO_STORE +#define vstore_partial_4_9 NO_STORE #define vstore_partial_4_10 NO_STORE #define vstore_partial_4_11 NO_STORE #define vstore_partial_4_12 NO_STORE @@ -595,16 +609,16 @@ #define vstore_partial_4_15 NO_STORE #define vstore_partial_4_16 NO_STORE // Size == 8 -#define vstore_partial_8_0 NO_STORE -#define vstore_partial_8_1 vstore_partial_1 -#define vstore_partial_8_2 vstore_partial_2 -#define vstore_partial_8_3 vstore_partial_3 -#define vstore_partial_8_4 vstore_partial_4 -#define vstore_partial_8_5 vstore_partial_5 -#define vstore_partial_8_6 vstore_partial_6 -#define vstore_partial_8_7 vstore_partial_7 -#define vstore_partial_8_8 vstore_partial_8 -#define vstore_partial_8_9 NO_STORE +#define vstore_partial_8_0 NO_STORE +#define vstore_partial_8_1 vstore_partial_1 +#define vstore_partial_8_2 vstore_partial_2 +#define vstore_partial_8_3 vstore_partial_3 +#define vstore_partial_8_4 vstore_partial_4 +#define vstore_partial_8_5 vstore_partial_5 +#define vstore_partial_8_6 vstore_partial_6 +#define vstore_partial_8_7 vstore_partial_7 +#define vstore_partial_8_8 vstore_partial_8 +#define vstore_partial_8_9 NO_STORE #define vstore_partial_8_10 NO_STORE #define vstore_partial_8_11 NO_STORE #define vstore_partial_8_12 NO_STORE @@ -613,16 +627,16 @@ #define vstore_partial_8_15 NO_STORE #define vstore_partial_8_16 NO_STORE // Size == 16 -#define vstore_partial_16_0 NO_STORE -#define vstore_partial_16_1 vstore_partial_1 -#define vstore_partial_16_2 vstore_partial_2 -#define vstore_partial_16_3 vstore_partial_3 -#define vstore_partial_16_4 vstore_partial_4 -#define vstore_partial_16_5 vstore_partial_5 -#define vstore_partial_16_6 vstore_partial_6 -#define vstore_partial_16_7 vstore_partial_7 -#define vstore_partial_16_8 vstore_partial_8 -#define vstore_partial_16_9 vstore_partial_9 +#define vstore_partial_16_0 NO_STORE +#define vstore_partial_16_1 vstore_partial_1 +#define vstore_partial_16_2 vstore_partial_2 +#define vstore_partial_16_3 vstore_partial_3 +#define vstore_partial_16_4 vstore_partial_4 +#define vstore_partial_16_5 vstore_partial_5 +#define vstore_partial_16_6 vstore_partial_6 +#define vstore_partial_16_7 vstore_partial_7 +#define vstore_partial_16_8 vstore_partial_8 +#define vstore_partial_16_9 vstore_partial_9 #define vstore_partial_16_10 vstore_partial_10 #define vstore_partial_16_11 vstore_partial_11 #define vstore_partial_16_12 vstore_partial_12 @@ -648,17 +662,13 @@ * @param[in] PTR The base pointer * @{ */ -#define vstore_partial_1(DATA, OFFSET, PTR) \ - vstore1(DATA.s0, OFFSET, PTR); +#define vstore_partial_1(DATA, OFFSET, PTR) vstore1(DATA.s0, OFFSET, PTR); -#define vstore_partial_2(DATA, OFFSET, PTR) \ - vstore2(DATA.s01, OFFSET, PTR); +#define vstore_partial_2(DATA, OFFSET, PTR) vstore2(DATA.s01, OFFSET, PTR); -#define vstore_partial_3(DATA, OFFSET, PTR) \ - vstore3(DATA.s012, OFFSET, PTR); +#define vstore_partial_3(DATA, OFFSET, PTR) vstore3(DATA.s012, OFFSET, PTR); -#define vstore_partial_4(DATA, OFFSET, PTR) \ - vstore4(DATA.s0123, OFFSET, PTR); +#define vstore_partial_4(DATA, 
OFFSET, PTR) vstore4(DATA.s0123, OFFSET, PTR); #define vstore_partial_5(DATA, OFFSET, PTR) \ vstore_partial_4(DATA.s0123, OFFSET, PTR); \ @@ -672,8 +682,7 @@ vstore_partial_4(DATA.s0123, OFFSET, PTR); \ vstore_partial_3(DATA.s456, OFFSET, PTR + 4); -#define vstore_partial_8(DATA, OFFSET, PTR) \ - vstore8(DATA.s01234567, OFFSET, PTR); +#define vstore_partial_8(DATA, OFFSET, PTR) vstore8(DATA.s01234567, OFFSET, PTR); #define vstore_partial_9(DATA, OFFSET, PTR) \ vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ @@ -703,176 +712,156 @@ vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8); -#define vstore_partial_16(DATA, OFFSET, PTR) \ - vstore16(DATA, OFFSET, PTR); +#define vstore_partial_16(DATA, OFFSET, PTR) vstore16(DATA, OFFSET, PTR); /** @} */ // end of groupd vstore_partial_n /** @} */ // end of groupd VSTORE_PARTIAL // Convert built-in functions with _sat modifier are not supported in floating point so we create defines // without _sat to overcome this issue -#define convert_float_sat convert_float -#define convert_float1_sat convert_float -#define convert_float2_sat convert_float2 -#define convert_float3_sat convert_float3 -#define convert_float4_sat convert_float4 -#define convert_float8_sat convert_float8 +#define convert_float_sat convert_float +#define convert_float1_sat convert_float +#define convert_float2_sat convert_float2 +#define convert_float3_sat convert_float3 +#define convert_float4_sat convert_float4 +#define convert_float8_sat convert_float8 #define convert_float16_sat convert_float16 -#define convert_half_sat convert_float -#define convert_half1_sat convert_half -#define convert_half2_sat convert_half2 -#define convert_half3_sat convert_half3 -#define convert_half4_sat convert_half4 -#define convert_half8_sat convert_half8 -#define convert_half16_sat convert_half16 - -#define convert_float1 convert_float -#define convert_half1 convert_half -#define convert_char1 convert_char -#define convert_uchar1 convert_uchar -#define convert_short1 convert_short +#define convert_half_sat convert_float +#define convert_half1_sat convert_half +#define convert_half2_sat convert_half2 +#define convert_half3_sat convert_half3 +#define convert_half4_sat convert_half4 +#define convert_half8_sat convert_half8 +#define convert_half16_sat convert_half16 + +#define convert_float1 convert_float +#define convert_half1 convert_half +#define convert_char1 convert_char +#define convert_uchar1 convert_uchar +#define convert_short1 convert_short #define convert_ushort1 convert_ushort -#define convert_int1 convert_int -#define convert_uint1 convert_uint -#define convert_long1 convert_long -#define convert_ulong1 convert_ulong +#define convert_int1 convert_int +#define convert_uint1 convert_uint +#define convert_long1 convert_long +#define convert_ulong1 convert_ulong #define convert_double1 convert_double -#define convert_char1_sat convert_char_sat -#define convert_uchar1_sat convert_uchar_sat -#define convert_uchar2_sat convert_uchar2_sat -#define convert_uchar3_sat convert_uchar3_sat -#define convert_uchar4_sat convert_uchar4_sat -#define convert_uchar8_sat convert_uchar8_sat +#define convert_char1_sat convert_char_sat +#define convert_uchar1_sat convert_uchar_sat +#define convert_uchar2_sat convert_uchar2_sat +#define convert_uchar3_sat convert_uchar3_sat +#define convert_uchar4_sat convert_uchar4_sat +#define convert_uchar8_sat convert_uchar8_sat #define convert_uchar16_sat convert_uchar16_sat -#define convert_short1_sat convert_short_sat +#define 
convert_short1_sat convert_short_sat #define convert_ushort1_sat convert_ushort_sat -#define convert_int1_sat convert_int_sat -#define convert_uint1_sat convert_uint_sat -#define convert_long1_sat convert_long_sat -#define convert_ulong1_sat convert_ulong_sat +#define convert_int1_sat convert_int_sat +#define convert_uint1_sat convert_uint_sat +#define convert_long1_sat convert_long_sat +#define convert_ulong1_sat convert_ulong_sat #define convert_double1_sat convert_double_sat #define VEC_DATA_TYPE_STR(type, size) type##size -#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) +#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) #define CONVERT_STR(x, type) (convert_##type((x))) -#define CONVERT(x, type) CONVERT_STR(x, type) +#define CONVERT(x, type) CONVERT_STR(x, type) #define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x))) -#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) +#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) #define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x))) -#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) +#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) -#define select_vec_dt_uchar(size) uchar##size -#define select_vec_dt_char(size) char##size +#define select_vec_dt_uchar(size) uchar##size +#define select_vec_dt_char(size) char##size #define select_vec_dt_ushort(size) ushort##size -#define select_vec_dt_short(size) short##size -#define select_vec_dt_half(size) short##size -#define select_vec_dt_uint(size) uint##size -#define select_vec_dt_int(size) int##size -#define select_vec_dt_float(size) int##size -#define select_vec_dt_ulong(size) ulong##size -#define select_vec_dt_long(size) long##size +#define select_vec_dt_short(size) short##size +#define select_vec_dt_half(size) short##size +#define select_vec_dt_uint(size) uint##size +#define select_vec_dt_int(size) int##size +#define select_vec_dt_float(size) int##size +#define select_vec_dt_ulong(size) ulong##size +#define select_vec_dt_long(size) long##size #define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size) -#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size) -#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1) +#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size) +#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1) -#define signed_int_vec_dt_uchar(size) char##size -#define signed_int_vec_dt_char(size) char##size +#define signed_int_vec_dt_uchar(size) char##size +#define signed_int_vec_dt_char(size) char##size #define signed_int_vec_dt_ushort(size) short##size -#define signed_int_vec_dt_short(size) short##size -#define signed_int_vec_dt_half(size) short##size -#define signed_int_vec_dt_uint(size) int##size -#define signed_int_vec_dt_int(size) int##size -#define signed_int_vec_dt_float(size) int##size -#define signed_int_vec_dt_ulong(size) long##size -#define signed_int_vec_dt_long(size) long##size +#define signed_int_vec_dt_short(size) short##size +#define signed_int_vec_dt_half(size) short##size +#define signed_int_vec_dt_uint(size) int##size +#define signed_int_vec_dt_int(size) int##size +#define signed_int_vec_dt_float(size) int##size +#define signed_int_vec_dt_ulong(size) long##size +#define signed_int_vec_dt_long(size) long##size #define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size) -#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, 
size) -#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1) - -#define sum_reduce_1(x) (x) -#define sum_reduce_2(x) ((x).s0) + ((x).s1) -#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2) -#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23) -#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567) +#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size) +#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1) + +#define sum_reduce_1(x) (x) +#define sum_reduce_2(x) ((x).s0) + ((x).s1) +#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2) +#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23) +#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567) #define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF) #define SUM_REDUCE_STR(x, size) sum_reduce_##size(x) -#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size) +#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size) -#define prod_reduce_1(x) (x) -#define prod_reduce_2(x) ((x).s0) * ((x).s1) -#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2) -#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23) -#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567) +#define prod_reduce_1(x) (x) +#define prod_reduce_2(x) ((x).s0) * ((x).s1) +#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2) +#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23) +#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567) #define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF) #define PROD_REDUCE_STR(x, size) prod_reduce_##size(x) -#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size) +#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size) -#define max_reduce_1(x) (x) -#define max_reduce_2(x) max(((x).s0), ((x).s1)) -#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2)) -#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23)) -#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567)) +#define max_reduce_1(x) (x) +#define max_reduce_2(x) max(((x).s0), ((x).s1)) +#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2)) +#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23)) +#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567)) #define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF)) #define MAX_REDUCE_STR(x, size) max_reduce_##size(x) -#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size) - -#define VECTOR_DECLARATION(name) \ - __global uchar *name##_ptr, \ - uint name##_stride_x, \ - uint name##_step_x, \ - uint name##_offset_first_element_in_bytes - -#define IMAGE_DECLARATION(name) \ - __global uchar *name##_ptr, \ - uint name##_stride_x, \ - uint name##_step_x, \ - uint name##_stride_y, \ - uint name##_step_y, \ - uint name##_offset_first_element_in_bytes - -#define TENSOR3D_DECLARATION(name) \ - __global uchar *name##_ptr, \ - uint name##_stride_x, \ - uint name##_step_x, \ - uint name##_stride_y, \ - uint name##_step_y, \ - uint name##_stride_z, \ - uint name##_step_z, \ - uint name##_offset_first_element_in_bytes - -#define TENSOR4D_DECLARATION(name) \ - __global uchar *name##_ptr, \ - uint name##_stride_x, \ - uint name##_step_x, \ - uint name##_stride_y, \ - uint name##_step_y, \ - uint name##_stride_z, \ - uint name##_step_z, \ - uint name##_stride_w, \ - uint 
name##_step_w, \ - uint name##_offset_first_element_in_bytes - -#define TENSOR5D_DECLARATION(name) \ - __global uchar *name##_ptr, \ - uint name##_stride_x, \ - uint name##_step_x, \ - uint name##_stride_y, \ - uint name##_step_y, \ - uint name##_stride_z, \ - uint name##_step_z, \ - uint name##_stride_w, \ - uint name##_step_w, \ - uint name##_stride_v, \ - uint name##_step_v, \ - uint name##_offset_first_element_in_bytes +#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size) + +#define min_reduce_1(x) (x) +#define min_reduce_2(x) min(((x).s0), ((x).s1)) +#define min_reduce_3(x) min(min_reduce_2((x).s01), ((x).s2)) +#define min_reduce_4(x) min(min_reduce_2((x).s01), min_reduce_2((x).s23)) +#define min_reduce_8(x) min(min_reduce_4((x).s0123), min_reduce_4((x).s4567)) +#define min_reduce_16(x) min(min_reduce_8((x).s01234567), min_reduce_8((x).s89ABCDEF)) + +#define MIN_REDUCE_STR(x, size) min_reduce_##size(x) +#define MIN_REDUCE(x, size) MIN_REDUCE_STR(x, size) + +#define VECTOR_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_offset_first_element_in_bytes + +#define IMAGE_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \ + uint name##_offset_first_element_in_bytes + +#define TENSOR3D_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \ + uint name##_stride_z, uint name##_step_z, uint name##_offset_first_element_in_bytes + +#define TENSOR4D_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \ + uint name##_stride_z, uint name##_step_z, uint name##_stride_w, uint name##_step_w, \ + uint name##_offset_first_element_in_bytes + +#define TENSOR5D_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \ + uint name##_stride_z, uint name##_step_z, uint name##_stride_w, uint name##_step_w, uint name##_stride_v, \ + uint name##_step_v, uint name##_offset_first_element_in_bytes #define CONVERT_TO_VECTOR_STRUCT(name) \ update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x) @@ -880,38 +869,47 @@ #define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) -#define CONVERT_TO_IMAGE_STRUCT(name) \ - update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) +#define CONVERT_TO_IMAGE_STRUCT(name) \ + update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \ + name##_stride_y, name##_step_y) #define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) -#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ - update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) +#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ + update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ + name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \ + name##_step_z) -#define 
CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ - update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z) +#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ + update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \ + name##_stride_y, 0, name##_stride_z, name##_step_z) -#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ - update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) +#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ + update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ + name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \ + name##_step_z) -#define CONVERT_TO_TENSOR3D_STRUCT(name) \ - update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ - name##_stride_z, name##_step_z) +#define CONVERT_TO_TENSOR3D_STRUCT(name) \ + update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \ + name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) -#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ - update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0) +#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ + update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \ + name##_stride_y, 0, name##_stride_z, 0) -#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ - update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ - name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size) +#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ + update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \ + name##_stride_y, name##_step_y, name##_stride_z, name##_step_z, name##_stride_w, \ + name##_step_w, mod_size) -#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ - update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) +#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ + update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \ + name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) -#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ - tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ - name##_stride_z, name##_step_z) +#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ + tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \ + name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) /** Structure to hold Vector information */ typedef struct Vector @@ -960,10 +958,10 @@ typedef struct Tensor4D * * @return An image object */ -inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) +inline Vector 
+update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) { - Vector vector = - { + Vector vector = { .ptr = ptr, .offset_first_element_in_bytes = offset_first_element_in_bytes, .stride_x = stride_x, @@ -983,15 +981,13 @@ inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_ * * @return An image object */ -inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) +inline Image update_image_workitem_ptr( + __global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) { - Image img = - { - .ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, - .stride_y = stride_y - }; + Image img = {.ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y}; img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; return img; } @@ -1009,16 +1005,21 @@ inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_el * * @return A 3D tensor object */ -inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) +inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, + uint offset_first_element_in_bytes, + uint stride_x, + uint step_x, + uint stride_y, + uint step_y, + uint stride_z, + uint step_z) { - Image img = - { - .ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, - .stride_y = stride_y - }; - img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; + Image img = {.ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y}; + img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + + get_global_id(2) * step_z; return img; } @@ -1035,17 +1036,22 @@ inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint o * * @return A 3D tensor object */ -inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) +inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, + uint offset_first_element_in_bytes, + uint stride_x, + uint step_x, + uint stride_y, + uint step_y, + uint stride_z, + uint step_z) { - Tensor3D tensor = - { - .ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, - .stride_y = stride_y, - .stride_z = stride_z - }; - tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; + Tensor3D tensor = {.ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y, + .stride_z = stride_z}; + tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + + get_global_id(2) * step_z; return tensor; } @@ -1062,34 +1068,44 @@ inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_fi * * @return A 3D tensor object */ -inline 
Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) +inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, + uint offset_first_element_in_bytes, + uint stride_x, + uint step_x, + uint stride_y, + uint step_y, + uint stride_z, + uint step_z) { - Tensor3D tensor = - { - .ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, - .stride_y = stride_y, - .stride_z = stride_z - }; + Tensor3D tensor = {.ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y, + .stride_z = stride_z}; return tensor; } -inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w, - uint step_w, - uint mod_size) +inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, + uint offset_first_element_in_bytes, + uint stride_x, + uint step_x, + uint stride_y, + uint step_y, + uint stride_z, + uint step_z, + uint stride_w, + uint step_w, + uint mod_size) { - Tensor4D tensor = - { - .ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, - .stride_y = stride_y, - .stride_z = stride_z, - .stride_w = stride_w - }; - - tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; + Tensor4D tensor = {.ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y, + .stride_z = stride_z, + .stride_w = stride_w}; + + tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; return tensor; } @@ -1161,7 +1177,8 @@ inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint wid const uint x = index; - return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes; + return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + + tensor->offset_first_element_in_bytes; } #endif // _HELPER_H diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h index 562c5d323660c3e2d17f70113346ecc32a73fa15..166260a3c048cd217d49c33f2efac4036bf71229 100644 --- a/src/core/CL/cl_kernels/helpers_asymm.h +++ b/src/core/CL/cl_kernels/helpers_asymm.h @@ -34,7 +34,7 @@ * @return The converted vector */ #define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x))) -#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type) +#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type) /** Quantize a floating-point scalar value to 8-bit asymmetric * @@ -84,14 +84,15 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return quantized values */ -#define QUANTIZE_IMPL(type, size) \ - inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \ - { \ - VEC_DATA_TYPE(float, size) \ - out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \ - VEC_DATA_TYPE(type, size) \ - res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, 
size)), VEC_DATA_TYPE(type, size)); \ - return res; \ +#define QUANTIZE_IMPL(type, size) \ + inline VEC_DATA_TYPE(type, size) \ + quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \ + { \ + VEC_DATA_TYPE(float, size) \ + out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \ + VEC_DATA_TYPE(type, size) \ + res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size)); \ + return res; \ } /** Dequantize a vector of values to floating-point @@ -101,10 +102,11 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return dequantized values in floating point */ -#define DEQUANTIZE_IMPL(type, size) \ - inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \ - { \ - return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \ +#define DEQUANTIZE_IMPL(type, size) \ + inline VEC_DATA_TYPE(float, size) \ + dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \ + { \ + return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \ } /** Correctly-rounded-to-nearest division by a power-of-two. @@ -113,18 +115,17 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Correctly-rounded-to-nearest division by a power-of-two. */ -#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \ - { \ - const VEC_DATA_TYPE(int, size) \ - zero = (VEC_DATA_TYPE(int, size))0; \ - const VEC_DATA_TYPE(int, size) \ - one = (VEC_DATA_TYPE(int, size))1; \ - VEC_DATA_TYPE(int, size) \ - mask = (one << exponent) - one; \ - VEC_DATA_TYPE(int, size) \ - threshold = (mask >> 1) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0)); \ - return (x >> exponent) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))((x & mask) > threshold)); \ +#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \ + { \ + const VEC_DATA_TYPE(int, size) zero = (VEC_DATA_TYPE(int, size))0; \ + const VEC_DATA_TYPE(int, size) one = (VEC_DATA_TYPE(int, size))1; \ + VEC_DATA_TYPE(int, size) \ + mask = (one << exponent) - one; \ + VEC_DATA_TYPE(int, size) \ + threshold = (mask >> 1) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0)); \ + return (x >> exponent) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))((x & mask) > threshold)); \ } /** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1), @@ -167,27 +168,29 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Result in fixed-point format Q0. 
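/* A minimal standalone C sketch (not part of the patch) of the scalar arithmetic behind the
 * QUANTIZE_IMPL / DEQUANTIZE_IMPL and ASYMM_ROUNDING_DIVIDE_BY_POW2 macros reflowed above,
 * assuming QASYMM8 (uchar) storage. Helper names here are illustrative only. */
#include <math.h>
#include <stdint.h>
#include <stdio.h>

static uint8_t quantize_qasymm8(float x, float offset, float scale)
{
    const float v = x / scale + offset; /* out_f32 in the macro */
    long r = lrintf(v);                 /* round to nearest even, like CONVERT_DOWN_RTE */
    if (r < 0)   r = 0;                 /* CONVERT_SAT clamps to the uchar range */
    if (r > 255) r = 255;
    return (uint8_t)r;
}

static float dequantize_qasymm8(uint8_t q, float offset, float scale)
{
    return ((float)q - offset) * scale; /* mirrors DEQUANTIZE_IMPL */
}

/* asymm_rounding_divide_by_POW2 per lane: arithmetic shift right with
 * round-half-away-from-zero (arithmetic >> on negatives assumed, as OpenCL guarantees). */
static int32_t rounding_divide_by_pow2(int32_t x, int exponent)
{
    const int32_t mask      = (1 << exponent) - 1;
    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    return (x >> exponent) + (((x & mask) > threshold) ? 1 : 0);
}

int main(void)
{
    const float   scale = 0.1f, offset = 128.0f;
    const uint8_t q     = quantize_qasymm8(1.25f, offset, scale);
    printf("q=%u back=%.2f rdiv=%d\n", (unsigned)q, dequantize_qasymm8(q, offset, scale),
           rounding_divide_by_pow2(-7, 1)); /* -7/2 -> -4: half rounds away from zero */
    return 0;
}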
*/ -#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \ - { \ - const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \ - const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \ - const int k_fractional_bits = 31; \ - VEC_DATA_TYPE(int, size) \ - x = a + (1 << (k_fractional_bits - 3)); \ - VEC_DATA_TYPE(int, size) \ - x2 = ASYMM_MULT(x, x, size); \ - VEC_DATA_TYPE(int, size) \ - x3 = ASYMM_MULT(x2, x, size); \ - VEC_DATA_TYPE(int, size) \ - x4 = ASYMM_MULT(x2, x2, size); \ - VEC_DATA_TYPE(int, size) \ - x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \ - VEC_DATA_TYPE(int, size) \ - x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \ - VEC_DATA_TYPE(int, size) \ - x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \ - return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \ +#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \ + { \ + const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \ + const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \ + const int k_fractional_bits = 31; \ + VEC_DATA_TYPE(int, size) \ + x = a + (1 << (k_fractional_bits - 3)); \ + VEC_DATA_TYPE(int, size) \ + x2 = ASYMM_MULT(x, x, size); \ + VEC_DATA_TYPE(int, size) \ + x3 = ASYMM_MULT(x2, x, size); \ + VEC_DATA_TYPE(int, size) \ + x4 = ASYMM_MULT(x2, x2, size); \ + VEC_DATA_TYPE(int, size) \ + x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \ + VEC_DATA_TYPE(int, size) \ + x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \ + VEC_DATA_TYPE(int, size) \ + x4_over_24_plus_x3_over_6_plus_x2_over_2 = \ + ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \ + return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \ } /** Each bit of the result is set to the corresponding bit of either then_val or @@ -198,10 +201,11 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @returns Result contaning bits from @p then_val or from @p else_val depending on corresponding bit in @p if_mask is set or not. 
*/ -#define ASYMM_SELECT_USING_MASK_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \ - { \ - return (if_mask & then_val) ^ (~if_mask & else_val); \ +#define ASYMM_SELECT_USING_MASK_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size( \ + VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \ + { \ + return (if_mask & then_val) ^ (~if_mask & else_val); \ } /** For each element of input vector, the corresponding bits of the result item are set @@ -234,18 +238,19 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) return select(all_zeros, all_ones, (SELECT_VEC_DATA_TYPE(int, size))(a != 0)); \ } -#define EXP_BARREL_SHIFTER_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \ - { \ - if(k_integer_bits > exponent) \ - { \ - const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \ - return ASYMM_SELECT_USING_MASK( \ - ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \ - ASYMM_MULT(result, fp_multiplier, size), result, size); \ - } \ - \ - return result; \ +#define EXP_BARREL_SHIFTER_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \ + int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \ + { \ + if (k_integer_bits > exponent) \ + { \ + const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \ + return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \ + ASYMM_MULT(result, fp_multiplier, size), result, size); \ + } \ + \ + return result; \ } /** Calculates \f$ exp(x) \f$ for x < 0. @@ -254,39 +259,40 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Result in fixed-point format Q0. 
*/ -#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \ - { \ - const int k_fractional_bits = 31 - k_integer_bits; \ - VEC_DATA_TYPE(int, size) \ - k_one_quarter = 1 << (k_fractional_bits - 2); \ - VEC_DATA_TYPE(int, size) \ - mask = k_one_quarter - 1; \ - VEC_DATA_TYPE(int, size) \ - a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \ - VEC_DATA_TYPE(int, size) \ - a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \ - VEC_DATA_TYPE(int, size) \ - result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \ - VEC_DATA_TYPE(int, size) \ - remainder = a_mod_quarter_minus_one_quarter - a; \ - \ - result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size); \ - result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \ - \ - if(k_integer_bits > 5) \ - { \ - const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \ - result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \ - } \ - \ - const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \ - return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \ +#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \ + { \ + const int k_fractional_bits = 31 - k_integer_bits; \ + VEC_DATA_TYPE(int, size) \ + k_one_quarter = 1 << (k_fractional_bits - 2); \ + VEC_DATA_TYPE(int, size) \ + mask = k_one_quarter - 1; \ + VEC_DATA_TYPE(int, size) \ + a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \ + VEC_DATA_TYPE(int, size) \ + a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \ + VEC_DATA_TYPE(int, size) \ + result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, \ + size); \ + VEC_DATA_TYPE(int, size) \ + remainder = a_mod_quarter_minus_one_quarter - a; \ + \ + result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size); \ + result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size); \ + result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size); \ + result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size); \ + result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size); \ + result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size); \ + result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \ + \ + if (k_integer_bits > 5) \ + { \ + 
const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \ + result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \ + } \ + \ + const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \ + return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \ } /** Calculates the product of a integer value by a power of two, with either a positive exponent @@ -297,26 +303,27 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Arithmetic left or right shift. */ -#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \ - { \ - if(exponent < 0) \ - { \ - return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \ - } \ - \ - const VEC_DATA_TYPE(int, size) min = INT_MIN; \ - const VEC_DATA_TYPE(int, size) max = INT_MAX; \ - int threshold = ((1 << (31 - exponent)) - 1); \ - VEC_DATA_TYPE(int, size) \ - positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \ - VEC_DATA_TYPE(int, size) \ - negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \ - VEC_DATA_TYPE(int, size) \ - result = x << exponent; \ - result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \ - result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \ - return result; \ +#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \ + { \ + if (exponent < 0) \ + { \ + return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \ + } \ + \ + const VEC_DATA_TYPE(int, size) min = INT_MIN; \ + const VEC_DATA_TYPE(int, size) max = INT_MAX; \ + int threshold = ((1 << (31 - exponent)) - 1); \ + VEC_DATA_TYPE(int, size) \ + positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \ + VEC_DATA_TYPE(int, size) \ + negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \ + VEC_DATA_TYPE(int, size) \ + result = x << exponent; \ + result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \ + result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \ + return result; \ } /** Calculates (a+b)/2, rounded to the nearest integer. @@ -326,20 +333,21 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return (a+b)/2, rounded to the nearest integer. 
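/* A minimal C sketch (not part of the patch) of the decomposition performed by
 * asymm_exp_on_negative_values above: a < 0 is split into a value in [-1/4, 0) plus a
 * multiple of 1/4. The first factor comes from the interval polynomial; the remainder is
 * folded in by the EXP_BARREL_SHIFTER steps, whose constants are exp(-1/4), exp(-1/2),
 * ..., exp(-16) in Q0.31 (e.g. 1672461947 ~ exp(-0.25) * 2^31, 242 ~ exp(-16) * 2^31).
 * The a == 0 case is handled separately by returning Q0_one. */
#include <math.h>

static double exp_on_negative_values_reference(double a) /* expects a < 0 */
{
    const double frac  = a - floor(a / 0.25) * 0.25; /* (a & mask): a mod 1/4, in [0, 1/4) */
    const double a_mod = frac - 0.25;                /* a_mod_quarter_minus_one_quarter    */
    const double rem   = a_mod - a;                  /* remainder, a multiple of 1/4       */
    return exp(a_mod) * exp(-rem);                   /* exp(a) = exp(a_mod) * exp(-rem)    */
}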
*/ -#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ - { \ - VEC_DATA_TYPE(long, size) \ - a64 = convert_long##size(a); \ - VEC_DATA_TYPE(long, size) \ - b64 = convert_long##size(b); \ - VEC_DATA_TYPE(long, size) \ - sum = a64 + b64; \ - const VEC_DATA_TYPE(long, size) one = 1; \ - const VEC_DATA_TYPE(long, size) minus_one = -1; \ - VEC_DATA_TYPE(long, size) \ - sign = select(minus_one, one, (SELECT_VEC_DATA_TYPE(long, size))(sum >= 0)); \ - return convert_int##size((sum + sign) / 2); \ +#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ + { \ + VEC_DATA_TYPE(long, size) \ + a64 = convert_long##size(a); \ + VEC_DATA_TYPE(long, size) \ + b64 = convert_long##size(b); \ + VEC_DATA_TYPE(long, size) \ + sum = a64 + b64; \ + const VEC_DATA_TYPE(long, size) one = 1; \ + const VEC_DATA_TYPE(long, size) minus_one = -1; \ + VEC_DATA_TYPE(long, size) \ + sign = select(minus_one, one, (SELECT_VEC_DATA_TYPE(long, size))(sum >= 0)); \ + return convert_int##size((sum + sign) / 2); \ } /** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1). @@ -354,12 +362,12 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \ const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \ VEC_DATA_TYPE(int, size) \ - half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \ + half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \ const VEC_DATA_TYPE(int, size) Q2_48_over_17 = 1515870810; \ const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540; \ VEC_DATA_TYPE(int, size) \ x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size); \ - for(int i = 0; i < 3; i++) \ + for (int i = 0; i < 3; i++) \ { \ VEC_DATA_TYPE(int, size) \ half_denominator_times_x = ASYMM_MULT(half_denominator, x, size); \ @@ -378,48 +386,57 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Rescaled value. 
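/* A minimal C sketch (not part of the patch) of asymm_rounding_half_sum: the sum is
 * widened to 64 bits and (a + b) / 2 is rounded half away from zero by adding the sign
 * before the truncating division. In the 1/(1+x) helper that follows, 1515870810 and
 * -1010580540 are 48/17 and -32/17 in Q2.29, the usual Newton-Raphson reciprocal seed. */
#include <stdint.h>

static int32_t rounding_half_sum(int32_t a, int32_t b)
{
    const int64_t sum  = (int64_t)a + (int64_t)b;
    const int64_t sign = (sum >= 0) ? 1 : -1;
    return (int32_t)((sum + sign) / 2); /* C division truncates toward zero */
}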
*/ -#define ASYMM_RESCALE_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \ - { \ - int exponent = src_integer_bits - dst_integer_bits; \ - return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \ +#define ASYMM_RESCALE_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \ + { \ + int exponent = src_integer_bits - dst_integer_bits; \ + return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \ } -#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale) -#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size) +#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale) +#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size) #define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale) -#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size) +#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size) #define ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent) -#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) -#define ASYMM_MULT_STR(a, b, size) asymm_mult##size(a, b) -#define ASYMM_MULT(a, b, size) ASYMM_MULT_STR(a, b, size) +#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) +#define ASYMM_MULT_STR(a, b, size) asymm_mult##size(a, b) +#define ASYMM_MULT(a, b, size) ASYMM_MULT_STR(a, b, size) #define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \ ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size) #define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \ ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size) -#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a) -#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val) -#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a) +#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \ + asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a) +#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) \ + asymm_select_using_mask##size(if_mask, then_val, else_val) +#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a) #define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a) -#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder) +#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) \ + exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder) #define ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) 
asymm_exp_on_negative_values##size(a, k_integer_bits) -#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) -#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a) -#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) -#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent) +#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) +#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a) +#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) +#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) \ + asymm_saturating_rounding_mult_by_pow2##size(x, exponent) #define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b) -#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits) -#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) - -#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \ - { \ - const int left_shift = shift > 0 ? shift : 0; \ - const int right_shift = shift > 0 ? 0 : -shift; \ - return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), right_shift, size); \ +#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) \ + asymm_rescale##size(value, src_integer_bits, dst_integer_bits) +#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \ + ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) + +#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \ + { \ + const int left_shift = shift > 0 ? shift : 0; \ + const int right_shift = shift > 0 ? 
0 : -shift; \ + return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), right_shift, size); \ } -#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) multiply_by_quantized_multiplier##size(input, qmul, shift) +#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) \ + multiply_by_quantized_multiplier##size(input, qmul, shift) QUANTIZE_IMPL(uchar, 1) QUANTIZE_IMPL(char, 1) diff --git a/src/core/CL/cl_kernels/load_store_utility.h b/src/core/CL/cl_kernels/load_store_utility.h index 4ba2b2ca3a2b4db2dee248d4e4f585ef7ce81360..4daf0adc89275c4815fad8206f4b0f70615b5c92 100644 --- a/src/core/CL/cl_kernels/load_store_utility.h +++ b/src/core/CL/cl_kernels/load_store_utility.h @@ -223,8 +223,10 @@ * @param[in] Z The offset in z-axis direction * @{ */ -#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) -#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) /** @} */ // end of group STORE_BLOCK /** Convert and store a block of the given size M0xN0 @@ -245,8 +247,10 @@ * @param[in] Z The offset in z-axis direction * @{ */ -#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) -#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) /** @} */ // end of group CONVERT_STORE_BLOCK /** Partially store the 0 to (n-1)th rows of the given variables @@ -365,8 +369,10 @@ * @param[in] Z The offset in z-axis direction * @{ */ -#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) -#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) /** Store a block that can be partial in both x and y dimensions * * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. @@ -388,22 +394,23 @@ * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0. * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0. 
*/ -#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ - if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ - { \ - STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ - } \ - else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ - { \ - STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ - } \ - else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ - { \ - STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ - } \ - else \ - { \ - STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ +#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ + if (!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ + { \ + STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ + } \ + else if ((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ + { \ + STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ + } \ + else if (!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ + { \ + STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ + } \ + else \ + { \ + STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ } /** Store a block that can only be partial in x but not y. * @@ -425,7 +432,7 @@ * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0. */ #define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ - if(!(PARTIAL_COND_X)) \ + if (!(PARTIAL_COND_X)) \ { \ STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ } \ @@ -453,7 +460,7 @@ * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0. 
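/* A minimal C sketch (not part of the patch) of the case selection that
 * STORE_BLOCK_PARTIAL_IN_X_AND_Y expands to: the partial row/column counts are used only
 * on boundary work-items flagged by PARTIAL_COND_Y / PARTIAL_COND_X. store_block() is a
 * hypothetical stand-in for the STORE_BLOCK_PARTIAL expansion. */
#include <stdio.h>

static void store_block(int rows, int cols) /* hypothetical stand-in */
{
    printf("store %dx%d block\n", rows, cols);
}

static void store_block_boundary_aware(int m0, int n0,
                                       int partial_store_m0, int partial_store_n0,
                                       int partial_cond_y, int partial_cond_x)
{
    const int rows = partial_cond_y ? partial_store_m0 : m0;
    const int cols = partial_cond_x ? partial_store_n0 : n0;
    store_block(rows, cols); /* covers all four branches of the macro */
}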
*/ #define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ - if(!(PARTIAL_COND_Y)) \ + if (!(PARTIAL_COND_Y)) \ { \ STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ } \ @@ -517,23 +524,28 @@ #if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) #if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 // Case1: No partial blocks in either x or y -#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ +#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \ + PARTIAL_COND_Y, PARTIAL_COND_X) \ STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) #elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 // Case2: Partial blocks in y -#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ +#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \ + PARTIAL_COND_Y, PARTIAL_COND_X) \ STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) #elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 // Case3: Partial blocks in x -#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ +#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \ + PARTIAL_COND_Y, PARTIAL_COND_X) \ STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) #else // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 // Case4: Partial blocks in both x and y -#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ - STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) +#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \ + PARTIAL_COND_Y, PARTIAL_COND_X) \ + STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \ + PARTIAL_COND_Y, PARTIAL_COND_X) #endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 @@ -560,8 +572,7 @@ #define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0)))) #else // defined(PARTIAL_STORE_M0) -#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ - ((uint)(y * M0)) +#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) ((uint)(y * M0)) #endif // defined(PARTIAL_STORE_M0) /** @} */ // end of group COMPUTE_M0_START_ROW diff --git a/src/core/CL/cl_kernels/repeat.h b/src/core/CL/cl_kernels/repeat.h index bed94a7b3b81b51ca4a1f239478826f2cc51cc29..cb2f4b0319770c828ed28b2e802751b9f8e78f45 100644 --- a/src/core/CL/cl_kernels/repeat.h +++ b/src/core/CL/cl_kernels/repeat.h @@ -75,7 +75,9 @@ P_X##_DEF(F, P_A, P_B, P_C); \ REPEAT_3_15(P_X, P_A, P_B, P_C) -#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM +#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) \ + 
REPEAT_3_##P_NUM(P_OP, P_A, P_B, \ + P_C) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM #define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) // Repeat macros with 4 param, excluding the implicit ID param @@ -126,52 +128,59 @@ P_X##_DEF(F, P_A, P_B, P_C, P_D); \ REPEAT_4_15(P_X, P_A, P_B, P_C, P_D) -#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, P_D) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM +#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) \ + REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, \ + P_D) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM #define REPEAT_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) // Macro for initializing N variables. Generates N statements that defines VAR##N = RHS_ACCESSOR_DEF(...) -#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL +#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL #define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL) // Macro for initializing N variables by converting the data type. Generates N statements that defines VAR##N = RHS_ACCESSOR_DEF(...) -#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT) +#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT) #define REPEAT_VAR_INIT_CONVERT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT, TYPE_OUT, VAR_IN, VAR_OUT) // Macro for initializing N variables by converting the data type with saturation. Generates N statements that defines VAR##N = RHS_ACCESSOR_DEF(...) #define VAR_INIT_CONVERT_SAT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT_SAT(VAR_IN##ID, TYPE_OUT) -#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT) +#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) \ + REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT) // Macro for adding a constant to N variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...) -#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL +#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL #define REPEAT_ADD_CONST_TO_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, ADD_CONST_TO_VAR, TYPE, VAR, VAL) // Macro for multiplying N variables (VAR_B) by a constant (VAL) and adding to other N variables (VAR_A). Generates N statements that defines VAR_A##N =RHS_ACCESSOR_DEF(...) -#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL +#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL #define REPEAT_MLA_VAR_WITH_CONST_VEC(N, VAR_A, VAR_B, VAL) REPEAT_3_N(N, MLA_VAR_WITH_CONST_VEC, VAR_A, VAR_B, VAL) // Macro for adding a vector to N-variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...) #define ADD_VECTOR_TO_VAR_DEF(ID, TYPE, VAR, VEC) VAR##ID += VEC -#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC) +#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC) // Macro for adding a two N-variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...) 
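/* A minimal sketch (not part of the patch) of what these repeat macros expand to.
 * With the definitions above, REPEAT_VAR_INIT_TO_CONST(3, int, acc, 0) emits one
 * VAR_INIT_TO_CONST_DEF statement per ID 0..2 (expansion order aside), roughly: */
int acc0 = 0;
int acc1 = 0;
int acc2 = 0;
/* REPEAT_ADD_CONST_TO_VAR(3, int, acc, 5) would similarly emit
 * acc0 += (int)5; acc1 += (int)5; acc2 += (int)5; */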
#define ADD_TWO_VARS_DEF(ID, TYPE, VAR_A, VAR_B) VAR_A##ID += VAR_B##ID -#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B) +#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B) // Macro for performing Max between a constant and N variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...) -#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL) +#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL) #define REPEAT_MAX_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MAX_CONST_VAR, TYPE, VAR, VAL) // Macro for performing Min between a constant and N variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...) -#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL) +#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL) #define REPEAT_MIN_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MIN_CONST_VAR, TYPE, VAR, VAL) // Macro for performing ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE to N variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...) -#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE) -#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT) +#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \ + VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE) +#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \ + REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT) // Macro for performing ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE to N variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...) -#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE) -#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT) +#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \ + VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE) +#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \ + REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT) // Macro for performing per-channel ASYMM_MULT_BY_QUANT_MULTIPLIER to N variables. 
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \ @@ -182,6 +191,7 @@ VAR##ID_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \ VAR##ID = select(VAR##ID_shift_lt0, VAR##ID_shift_gt0, RES_SHIFT >= 0); \ }) -#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT) +#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) \ + REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT) #endif // ARM_COMPUTE_REPEAT_H diff --git a/src/core/CL/cl_kernels/warp_helpers.h b/src/core/CL/cl_kernels/warp_helpers.h index 642483ab3cbdf73cdc11a1626f76a4c830467efd..6595bd198104f7880775810a8dbe63d226c1151e 100644 --- a/src/core/CL/cl_kernels/warp_helpers.h +++ b/src/core/CL/cl_kernels/warp_helpers.h @@ -31,11 +31,13 @@ * @param[in] border_size Border size of the image * */ -inline const float8 clamp_to_border_with_size(float8 coords, const float width, const float height, const float border_size) +inline const float8 +clamp_to_border_with_size(float8 coords, const float width, const float height, const float border_size) { const float4 clamped_x = clamp(coords.even, 0.0f - border_size, width - 1 + border_size); const float4 clamped_y = clamp(coords.odd, 0.0f - border_size, height - 1 + border_size); - return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3); + return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, + clamped_y.s3); } /** Clamps the given coordinates to the borders. @@ -74,7 +76,8 @@ inline const VEC_DATA_TYPE(DATA_TYPE, 4) read_texels4(const Image *in, const int */ inline const float8 get_neighbour_coords(const float2 coord) { - return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1); + return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, + /*br*/ coord.s0 + 1, coord.s1 + 1); } /** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values @@ -85,37 +88,38 @@ inline const float8 get_neighbour_coords(const float2 coord) * @param[in] height Height of the image * @param[in] border_size Border size */ -inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border(const Image *in, const float8 coords, const float width, const float height, const float border_size) +inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border( + const Image *in, const float8 coords, const float width, const float height, const float border_size) { // If any of the 4 texels is out of the image's boundaries we use the border value (REPLICATE or CONSTANT) for any texel out of the image. 
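/* A minimal C sketch (not part of the patch) of the scalar weighting computed below for
 * each output pixel: a = coords - floor(coords) is the fractional part, b = 1 - a, and
 * the four clamped texels tl/tr/bl/br are blended with the usual bilinear weights. */
static float bilinear_reference(float tl, float tr, float bl, float br, float ax, float ay)
{
    const float bx = 1.0f - ax;
    const float by = 1.0f - ay;
    return tl * bx * by + tr * ax * by + bl * bx * ay + br * ax * ay;
}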
// Sets the 4x4 coordinates for each of the four input texels const float8 fc = floor(coords); - const float16 c1 = (float16)( - clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height, border_size), - clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height, border_size)); - const float16 c2 = (float16)( - clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height, border_size), - clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height, border_size)); + const float16 c1 = + (float16)(clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height, border_size), + clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height, border_size)); + const float16 c2 = + (float16)(clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height, border_size), + clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height, border_size)); // Loads the values from the input image const float16 t = (float16)( - /* tl, tr, bl, br */ - * ((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)), - *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)), - *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)), - *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)), - *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)), - *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)), - *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)), - *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf))); - const float8 a = coords - fc; - const float8 b = ((float8)(1.f)) - a; - const float4 fr = (float4)( - ((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)), - ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)), - ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)), - ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7))); + /* tl, tr, bl, br */ + *((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)), + *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)), + *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)), + *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)), + *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)), + *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)), + *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)), + *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf))); + const float8 a = coords - fc; + const float8 b = ((float8)(1.f)) - a; + const float4 fr = + (float4)(((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)), + ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * 
b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)), + ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)), + ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7))); return CONVERT(fr, VEC_DATA_TYPE(DATA_TYPE, 4)); } @@ -126,7 +130,8 @@ inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border(const * @param[in] width Width of the image * @param[in] height Height of the image */ -inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height) +inline const VEC_DATA_TYPE(DATA_TYPE, 4) + bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height) { return bilinear_interpolate_with_border(in, coords, width, height, 1); } diff --git a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp index 2728958addd55abf3efe94b2cdd6fd41c458da97..5b72354abea769438b9d3582b84ce65cab5e0742 100644 --- a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp +++ b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -44,16 +45,20 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::S64); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Only ARG_IDX_MAX and ARG_IDX_MIN are supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, + "Only ARG_IDX_MAX and ARG_IDX_MIN are supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, + "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); - if(output->total_size() != 0) + if (output->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32, DataType::S64, DataType::U64); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32, DataType::S64, + DataType::U64); } return Status{}; @@ -66,22 +71,34 @@ CLArgMinMaxLayerKernel::CLArgMinMaxLayerKernel() _type = CLKernelType::ELEMENTWISE; } -void CLArgMinMaxLayerKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op) +void CLArgMinMaxLayerKernel::configure(const ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op) { configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op); } -void CLArgMinMaxLayerKernel::configure(const 
CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op) +void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - TensorShape output_shape{ input->info()->tensor_shape() }; + TensorShape output_shape{input->info()->tensor_shape()}; output_shape.set(axis, 1); - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(DataType::S32).reset_padding().set_is_resizable(true)); + auto_init_if_empty(*output->info(), input->info() + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(DataType::S32) + .reset_padding() + .set_is_resizable(true)); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op)); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; @@ -90,11 +107,14 @@ void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context, // Set build options const auto adjusted_vector_size = adjust_vec_size(16U, input->info()->dimension(0)); - const auto vector_size = (adjusted_vector_size == 3U && axis == 0U) ? 2U : adjusted_vector_size; // the opencl kernel only supports sizes 2, 4, 8 and 16. + const auto vector_size = (adjusted_vector_size == 3U && axis == 0U) + ? 2U + : adjusted_vector_size; // the opencl kernel only supports sizes 2, 4, 8 and 16. CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % vector_size)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(input->info()->dimension(0) % vector_size)); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vector_size)); build_opts.add_option_if(is_data_type_float(input->info()->data_type()), "-DFLOAT_DATA_TYPE"); build_opts.add_option_if_else(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX", "-DARG_MIN"); @@ -104,7 +124,7 @@ void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context, // Create kernel std::string kernel_axis_name; - switch(axis) + switch (axis) { case 0: build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0))); @@ -135,7 +155,10 @@ void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context, ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLArgMinMaxLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op) +Status CLArgMinMaxLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + unsigned int axis, + ReductionOperation op) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); return Status{}; @@ -146,7 +169,7 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - switch(_reduction_axis) + switch (_reduction_axis) { case 0: { @@ -154,7 +177,8 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue) Window out_window(window); Window in_window(window); out_window.set(Window::DimX, Window::Dimension(0, 0, 0)); - 
in_window.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0))); + in_window.set(Window::DimX, + Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0))); in_window.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), 1u)); // Get first input and output slices @@ -166,15 +190,15 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue) add_2D_tensor_argument(idx, _input, in_slice); add_2D_tensor_argument(idx, _output, out_slice); enqueue(queue, *this, in_slice, lws_hint()); - } - while(in_window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice)); + } while (in_window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice)); } break; case 1: { // Get first input and output slices - Window window_in{ window }; - window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1))); + Window window_in{window}; + window_in.set(Window::DimY, + Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1))); Window in_slice = window_in.first_slice_window_2D(); Window out_slice = window.first_slice_window_2D(); @@ -184,15 +208,15 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue) add_2D_tensor_argument(idx, _input, in_slice); add_2D_tensor_argument(idx, _output, out_slice); enqueue(queue, *this, in_slice, lws_hint()); - } - while(window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); + } while (window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); } break; case 2: { // Get first input and output slices - Window window_in{ window }; - window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2))); + Window window_in{window}; + window_in.set(Window::DimZ, + Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2))); Window in_slice = window_in.first_slice_window_3D(); Window out_slice = window.first_slice_window_3D(); @@ -202,14 +226,13 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue) add_3D_tensor_argument(idx, _input, in_slice); add_3D_tensor_argument(idx, _output, out_slice); enqueue(queue, *this, in_slice, lws_hint()); - } - while(window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice)); + } while (window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice)); } break; case 3: { // Get first input and output slices - Window window_in{ window }; + Window window_in{window}; window_in.set(3, Window::Dimension(0, 1, 1)); Window in_slice = window_in.first_slice_window_4D(); Window out_slice = window.first_slice_window_4D(); @@ -220,8 +243,7 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue) add_4D_tensor_argument(idx, _input, in_slice); add_4D_tensor_argument(idx, _output, out_slice); enqueue(queue, *this, in_slice, lws_hint()); - } - while(window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice)); + } while (window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice)); } break; default: diff --git a/src/core/CL/kernels/CLArgMinMaxLayerKernel.h b/src/core/CL/kernels/CLArgMinMaxLayerKernel.h index 5f36bdf113249b4ec1616840a56b98616675a716..fb3b41b0de0aa084b2cbe94d9ca43de6b46ae1fb 100644 --- a/src/core/CL/kernels/CLArgMinMaxLayerKernel.h +++ 
b/src/core/CL/kernels/CLArgMinMaxLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLARGMINMAXLAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -72,7 +73,11 @@ public: * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3 * @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op); /** Static function to check if given info will lead to a valid configuration of @ref CLArgMinMaxLayerKernel. * @@ -84,7 +89,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp index 3fa8a8edaa133e080c444b834e05cff5e8819e08..c88a852a44eb289ba61fe5ad3fd8796d8495a9e7 100644 --- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp +++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp @@ -23,58 +23,64 @@ */ #include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/StringSupport.h" using namespace arm_compute; namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta, const ITensorInfo *gamma, - float epsilon, ActivationLayerInfo act_info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta, + const ITensorInfo *gamma, + float epsilon, + ActivationLayerInfo act_info) { ARM_COMPUTE_UNUSED(epsilon); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0)); - if(beta != nullptr) + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index( + input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0)); + if (beta != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta); 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta); } - if(gamma != nullptr) + if (gamma != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma); } - if(act_info.enabled()) + if (act_info.enabled()) { ActivationLayerInfo::ActivationFunction act = act_info.activation(); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32 && input->data_type() != DataType::F16); - ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU - && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU - && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); + ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU && + act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && + act != + ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); ARM_COMPUTE_RETURN_ERROR_ON(act_info.b() > act_info.a()); } - if(output != nullptr && output->total_size() != 0) + if (output != nullptr && output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -86,14 +92,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, std::pair validate_and_configure_window_nchw(ITensorInfo *input, ITensorInfo *output) { - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->element_size(), input->dimension(0)); + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(16 / input->element_size(), input->dimension(0)); // Configure kernel window Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); bool window_changed = false; - if(output != nullptr) + if (output != nullptr) { AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); window_changed = update_window_and_padding(win, input_access, output_access); @@ -104,30 +111,50 @@ std::pair validate_and_configure_window_nchw(ITensorInfo *input, window_changed = update_window_and_padding(win, input_access); } - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace CLBatchNormalizationLayerKernel::CLBatchNormalizationLayerKernel() - : _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _beta(nullptr), _gamma(nullptr), _epsilon(0), _run_in_place(false) + : _input(nullptr), + _output(nullptr), + _mean(nullptr), + _var(nullptr), + _beta(nullptr), + _gamma(nullptr), + _epsilon(0), + _run_in_place(false) { _type = CLKernelType::ELEMENTWISE; } -void CLBatchNormalizationLayerKernel::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, - float epsilon, ActivationLayerInfo act_info) +void CLBatchNormalizationLayerKernel::configure(ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor *beta, + const ICLTensor *gamma, + float epsilon, + ActivationLayerInfo act_info) { configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, var, beta, gamma, epsilon, act_info); } -void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, - const ICLTensor *gamma, - float epsilon, ActivationLayerInfo act_info) +void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor *beta, + const ICLTensor *gamma, + float epsilon, + ActivationLayerInfo act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var); - auto padding_info = get_padding_info({ input, output, mean, var, beta, gamma }); + auto padding_info = get_padding_info({input, output, mean, var, beta, gamma}); _input = input; _output = output; _mean = mean; @@ -142,13 +169,15 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_ mean->info(), var->info(), (beta != nullptr) ? beta->info() : nullptr, (gamma != nullptr) ? 
gamma->info() : nullptr, epsilon, act_info)); - unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0)); + unsigned int num_elems_processed_per_iteration = + adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0)); // Set build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); @@ -157,29 +186,33 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_ build_opts.add_option_if(gamma == nullptr, "-DUSE_DEFAULT_GAMMA"); // Create kernel - _kernel = create_kernel(compile_context, "batchnormalization_layer_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()); + _kernel = + create_kernel(compile_context, + "batchnormalization_layer_" + lower_string(string_from_data_layout(input->info()->data_layout())), + build_opts.options()); // Set kernel static arguments unsigned int include_output = (!_run_in_place) ? 
1 : 0; - unsigned int idx = (1 + include_output) * num_arguments_per_3D_tensor() + 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters - if(_beta != nullptr) + unsigned int idx = (1 + include_output) * num_arguments_per_3D_tensor() + + 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters + if (_beta != nullptr) { idx += num_arguments_per_1D_tensor(); // Skip beta parameter } - if(_gamma != nullptr) + if (_gamma != nullptr) { idx += num_arguments_per_1D_tensor(); // Skip gamma parameter } _kernel.setArg(idx++, _epsilon); - if(output != nullptr) + if (output != nullptr) { // Output tensor auto initialization if not yet initialized auto_init_if_empty(*output->info(), *input->info()->clone()); } // Configure kernel window - if(input->info()->data_layout() == DataLayout::NHWC) + if (input->info()->data_layout() == DataLayout::NHWC) { Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); ICLKernel::configure_internal(win); @@ -205,18 +238,23 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_ _config_id += lower_string(string_from_data_layout(input->info()->data_layout())); } -Status CLBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta, const ITensorInfo *gamma, - float epsilon, ActivationLayerInfo act_info) +Status CLBatchNormalizationLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta, + const ITensorInfo *gamma, + float epsilon, + ActivationLayerInfo act_info) { const bool run_in_place = (output == nullptr) || (output == input); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info)); - if(input->data_layout() != DataLayout::NHWC) + if (input->data_layout() != DataLayout::NHWC) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_nchw(input->clone().get(), (run_in_place) ? nullptr : output->clone().get()) - .first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_nchw(input->clone().get(), (run_in_place) ? 
nullptr : output->clone().get()) + .first); } return Status{}; @@ -236,11 +274,11 @@ void CLBatchNormalizationLayerKernel::run(const Window &window, cl::CommandQueue unsigned int idx = (1 + include_output) * num_arguments_per_3D_tensor(); add_1D_tensor_argument(idx, _mean, vector_slice); add_1D_tensor_argument(idx, _var, vector_slice); - if(_beta != nullptr) + if (_beta != nullptr) { add_1D_tensor_argument(idx, _beta, vector_slice); } - if(_gamma != nullptr) + if (_gamma != nullptr) { add_1D_tensor_argument(idx, _gamma, vector_slice); } @@ -249,11 +287,10 @@ void CLBatchNormalizationLayerKernel::run(const Window &window, cl::CommandQueue { idx = 0; add_3D_tensor_argument(idx, _input, slice); - if(!_run_in_place) + if (!_run_in_place) { add_3D_tensor_argument(idx, _output, slice); } enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h index acbe0f2a26f49de04a40e616833585a317dd0a9a..1a88d2a8c50addee0b1b39a59d8f85115021567c 100644 --- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h +++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -64,7 +65,13 @@ public: * @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. */ - void configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr, const ICLTensor *gamma = nullptr, float epsilon = 0.001f, + void configure(ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor *beta = nullptr, + const ICLTensor *gamma = nullptr, + float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo()); /** Set the input and output tensors. * @@ -82,8 +89,15 @@ public: * @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr, - const ICLTensor *gamma = nullptr, float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor *beta = nullptr, + const ICLTensor *gamma = nullptr, + float epsilon = 0.001f, + ActivationLayerInfo act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLBatchNormalizationLayerKernel * * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result. 
@@ -99,10 +113,14 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr, - float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta = nullptr, + const ITensorInfo *gamma = nullptr, + float epsilon = 0.001f, + ActivationLayerInfo act_info = ActivationLayerInfo()); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp index 143a842d02d8959e80f20f880d2514b2a43f4ffd..c640b5a8d6f152b71565b34704c00d6515aeb1c7 100644 --- a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp +++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp @@ -25,13 +25,14 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" -#include "arm_compute/core/TensorInfo.h" using namespace arm_compute::misc::shape_calculator; namespace arm_compute @@ -46,7 +47,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -54,7 +55,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf return Status{}; } -Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const ITensorInfo *output, const CropInfo &crop_info) +Status validate_arguments_static(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); @@ -66,10 +71,11 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorShape expected_output_shape = compute_batch_to_space_shape(input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info); - const TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape); + const TensorShape expected_output_shape = compute_batch_to_space_shape( + input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info); + const TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &expected_output); ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, 
output); @@ -79,8 +85,7 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape } } // namespace -CLBatchToSpaceLayerKernel::CLBatchToSpaceLayerKernel() - : _input(nullptr), _block_shape(nullptr), _output(nullptr) +CLBatchToSpaceLayerKernel::CLBatchToSpaceLayerKernel() : _input(nullptr), _block_shape(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } @@ -90,11 +95,14 @@ void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const ICLTenso configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output); } -void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output) +void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - auto padding_info = get_padding_info({ input, block_shape, output }); + auto padding_info = get_padding_info({input, block_shape, output}); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), output->info())); @@ -106,8 +114,9 @@ void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_contex CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(3))); - _kernel = create_kernel(compile_context, "batch_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()); - + _kernel = create_kernel(compile_context, + "batch_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), + build_opts.options()); // Configure kernel window Window win = calculate_max_window(*output->info(), Steps()); @@ -116,47 +125,65 @@ void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_contex ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info) +void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, + const int32_t block_shape_x, + const int32_t block_shape_y, + ICLTensor *output, + const CropInfo &crop_info) { configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output, crop_info); } -void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output, - const CropInfo &crop_info) +void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const int32_t block_shape_x, + const int32_t block_shape_y, + ICLTensor *output, + const CropInfo &crop_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - const TensorShape output_shape = compute_batch_to_space_shape(input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y); + const TensorShape output_shape = compute_batch_to_space_shape( + input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y); auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, 
output->info(), crop_info)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info)); _input = input; _output = output; // Create kernel CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); + build_opts.add_option("-DDATA_TYPE=" + + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(3))); build_opts.add_option("-DBLOCK_SHAPE_X=" + support::cpp11::to_string(block_shape_x)); build_opts.add_option("-DBLOCK_SHAPE_Y=" + support::cpp11::to_string(block_shape_y)); build_opts.add_option("-DCROP_LEFT=" + support::cpp11::to_string(crop_info.left)); build_opts.add_option("-DCROP_TOP=" + support::cpp11::to_string(crop_info.top)); - _kernel = create_kernel(compile_context, "batch_to_space_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()); + _kernel = create_kernel( + compile_context, "batch_to_space_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), + build_opts.options()); // Configure kernel window Window win = calculate_max_window(*output->info(), Steps()); ICLKernel::configure_internal(win); } -Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) +Status +CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_shape, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, output)); return Status{}; } -Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info) +Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, + const int32_t block_shape_x, + const int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, output, crop_info)); @@ -185,7 +212,7 @@ void CLBatchToSpaceLayerKernel::run(const Window &window, cl::CommandQueue &queu unsigned int idx = 0; add_4D_tensor_argument(idx, _input, slice_in); add_argument(idx, batch_id); - if(_block_shape != nullptr) + if (_block_shape != nullptr) { add_1D_tensor_argument(idx, _block_shape, vector_slice); } @@ -193,7 +220,6 @@ void CLBatchToSpaceLayerKernel::run(const Window &window, cl::CommandQueue &queu enqueue(queue, *this, slice_out, lws_hint()); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h index a05184cd5b1351728bce9b3a602e9b8a72885dfd..b9d3e66fe2383fe83c4d576c55e4ba16b906e069 100644 --- a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h +++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLBATCHTOSPACELAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -65,7 +66,10 @@ public: * * @deprecated This 
method for dynamic block shape is not fully mature and will be removed in 23.08 release */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + ICLTensor *output); /** Initialise the kernel's inputs and output (Static block shape). * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -74,7 +78,11 @@ public: * @param[out] output Tensor output. Data types supported: same as @p input * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed */ - void configure(const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info); + void configure(const ICLTensor *input, + const int32_t block_shape_x, + const int32_t block_shape_y, + ICLTensor *output, + const CropInfo &crop_info); /** Initialise the kernel's inputs and output (Static block shape). * * @param[in] compile_context The compile context to be used. @@ -84,7 +92,12 @@ public: * @param[out] output Tensor output. Data types supported: same as @p input * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const int32_t block_shape_x, + const int32_t block_shape_y, + ICLTensor *output, + const CropInfo &crop_info); /** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayerKernel * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. 
@@ -106,7 +119,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info); + static Status validate(const ITensorInfo *input, + const int32_t block_shape_x, + const int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLBitwiseKernel.cpp b/src/core/CL/kernels/CLBitwiseKernel.cpp index 11e6d021a5ff4d03d0d76333c453cba26cd1b2de..de3fb43de8cd96899f71903c3a1db7a8e4c36bc1 100644 --- a/src/core/CL/kernels/CLBitwiseKernel.cpp +++ b/src/core/CL/kernels/CLBitwiseKernel.cpp @@ -28,25 +28,29 @@ #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute { -CLBitwiseKernel::CLBitwiseKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) +CLBitwiseKernel::CLBitwiseKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void CLBitwiseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, BitwiseOperation op) +void CLBitwiseKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + BitwiseOperation op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8); - if(op != BitwiseOperation::NOT) + if (op != BitwiseOperation::NOT) { ARM_COMPUTE_ERROR_ON_NULLPTR(input2); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8); @@ -56,7 +60,7 @@ void CLBitwiseKernel::configure(const CLCompileContext &compile_context, const I // Output auto inizialitation if not yet initialized auto_init_if_empty(*(output->info()), *(input1->info())); - auto padding_info = get_padding_info({ input1, input2, output }); + auto padding_info = get_padding_info({input1, input2, output}); // Configure kernel window const unsigned int vec_size_x = adjust_vec_size(16 / output->info()->element_size(), output->info()->dimension(0)); @@ -68,7 +72,7 @@ void CLBitwiseKernel::configure(const CLCompileContext &compile_context, const I // Create kernel std::string kernel_name = ""; - switch(op) + switch (op) { case BitwiseOperation::AND: kernel_name = "bitwise_and"; @@ -107,13 +111,12 @@ void CLBitwiseKernel::run(const Window &window, cl::CommandQueue &queue) { unsigned int idx = 0; add_2D_tensor_argument(idx, _input1, slice); - if(_input2 != nullptr) + if (_input2 != nullptr) { add_2D_tensor_argument(idx, _input2, slice); } add_2D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); + } while (window.slide_window_slice_2D(slice)); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/core/CL/kernels/CLBitwiseKernel.h b/src/core/CL/kernels/CLBitwiseKernel.h index c5a999643dea79a76a7e5a7ad398c3e82678d113..2c74955ae4f4d6e8a7c954f615d0c20f2eada37e 100644 --- a/src/core/CL/kernels/CLBitwiseKernel.h +++ 
b/src/core/CL/kernels/CLBitwiseKernel.h @@ -59,7 +59,11 @@ public: * @param[out] output Destination tensor. Data types supported: U8. * @param[in] op Bitwise operation to perform. Supported: AND, OR, NOT, XOR. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, BitwiseOperation op); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + BitwiseOperation op); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp index 72de854afb139463f75ad302c6665376f0227042..f32c518e29c1a95919eb798a948ca9f8ef5147eb 100644 --- a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp +++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -40,7 +41,10 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info) +Status validate_arguments(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(boxes); @@ -53,7 +57,7 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe ARM_COMPUTE_RETURN_ERROR_ON(boxes->num_dimensions() > 2); const bool is_qasymm16 = boxes->data_type() == DataType::QASYMM16; - if(is_qasymm16) + if (is_qasymm16) { const UniformQuantizationInfo boxes_qinfo = boxes->quantization_info().uniform(); ARM_COMPUTE_RETURN_ERROR_ON(boxes_qinfo.scale != 0.125f); @@ -65,12 +69,12 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes, deltas); } - if(pred_boxes->total_size() > 0) + if (pred_boxes->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(pred_boxes->tensor_shape(), deltas->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(pred_boxes, boxes); ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes->num_dimensions() > 2); - if(is_qasymm16) + if (is_qasymm16) { const UniformQuantizationInfo pred_boxes_qinfo = pred_boxes->quantization_info().uniform(); ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes_qinfo.scale != 0.125f); @@ -83,22 +87,31 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe } } // namespace -CLBoundingBoxTransformKernel::CLBoundingBoxTransformKernel() - : _boxes(nullptr), _pred_boxes(nullptr), _deltas(nullptr) +CLBoundingBoxTransformKernel::CLBoundingBoxTransformKernel() : _boxes(nullptr), _pred_boxes(nullptr), _deltas(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void CLBoundingBoxTransformKernel::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info) +void CLBoundingBoxTransformKernel::configure(const ICLTensor *boxes, + ICLTensor *pred_boxes, + const ICLTensor *deltas, + const BoundingBoxTransformInfo &info) { 
configure(CLKernelLibrary::get().get_compile_context(), boxes, pred_boxes, deltas, info); } -void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info) +void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *boxes, + ICLTensor *pred_boxes, + const ICLTensor *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas); - auto padding_info = get_padding_info({ boxes, pred_boxes, deltas }); - auto_init_if_empty(*pred_boxes->info(), deltas->info()->clone()->set_data_type(boxes->info()->data_type()).set_quantization_info(boxes->info()->quantization_info())); + auto padding_info = get_padding_info({boxes, pred_boxes, deltas}); + auto_init_if_empty(*pred_boxes->info(), deltas->info() + ->clone() + ->set_data_type(boxes->info()->data_type()) + .set_quantization_info(boxes->info()->quantization_info())); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(boxes->info(), pred_boxes->info(), deltas->info(), info)); @@ -128,7 +141,7 @@ void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_con build_opts.add_option_if(info.apply_scale(), "-DSCALE_AFTER=" + float_to_string_with_full_precision(info.scale())); build_opts.add_option_if(info.correct_transform_coords(), "-DOFFSET=1"); - if(is_quantized) + if (is_quantized) { build_opts.add_option("-DDATA_TYPE_DELTAS=" + get_cl_type_from_data_type(deltas->info()->data_type())); const UniformQuantizationInfo boxes_qinfo = boxes->info()->quantization_info().uniform(); @@ -148,12 +161,15 @@ void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_con // Since the number of columns is a multiple of 4 by definition, we don't need to pad the tensor const unsigned int num_elems_processed_per_iteration = 4; - Window win = calculate_max_window(*deltas->info(), Steps(num_elems_processed_per_iteration)); + Window win = calculate_max_window(*deltas->info(), Steps(num_elems_processed_per_iteration)); ICLKernel::configure_internal(win); ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info) +Status CLBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(boxes, pred_boxes, deltas, info)); return Status{}; diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.h b/src/core/CL/kernels/CLBoundingBoxTransformKernel.h index 08f350e86a9491412f741db99633d655d9dd6052..9a1bb49bb9c74b935cf5f90521d5c43aee4726c3 100644 --- a/src/core/CL/kernels/CLBoundingBoxTransformKernel.h +++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.h @@ -58,7 +58,10 @@ public: * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct. * */ - void configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info); + void configure(const ICLTensor *boxes, + ICLTensor *pred_boxes, + const ICLTensor *deltas, + const BoundingBoxTransformInfo &info); /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. 
@@ -71,7 +74,11 @@ public: * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct. * */ - void configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *boxes, + ICLTensor *pred_boxes, + const ICLTensor *deltas, + const BoundingBoxTransformInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLBoundingBoxTransform * @@ -85,7 +92,10 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info); + static Status validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp index a2a0bc4fb4a97c346b78faa6e26b1d36839f3e90..ec58bf9e7a1436f3909ade4d1b04e7fe6ff2a58c 100644 --- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp +++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -46,15 +47,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups < 2, "Channel shuffling with less than 2 groups would be inefficient"); - const unsigned int channels = input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)); + const unsigned int channels = + input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups == channels, "Channel shuffling with same number of groups as number of channels would be inefficient"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + num_groups == channels, + "Channel shuffling with same number of groups as number of channels would be inefficient"); // There cannot be more groups than channels ARM_COMPUTE_RETURN_ERROR_ON(num_groups > channels); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, "The number of channels must be a multiple of the number of groups"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, + "The number of channels must be a multiple of the number of groups"); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); @@ -70,11 +75,12 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen auto_init_if_empty(*output, *input->clone()); const bool is_nhwc = input->data_layout() == DataLayout::NHWC; - if(is_nhwc) + if (is_nhwc) { - unsigned int num_elems_processed_per_iteration_x = adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0)); 
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x)); - Window win_collapsed = win.collapse(win, Window::DimZ); + unsigned int num_elems_processed_per_iteration_x = + adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0)); + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x)); + Window win_collapsed = win.collapse(win, Window::DimZ); return std::make_pair(Status{}, win_collapsed); } else @@ -83,22 +89,25 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen constexpr unsigned int num_elems_processed_per_iteration_y = 2; // Configure kernel window - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); - AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + Window win = calculate_max_window( + *input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, + num_elems_processed_per_iteration_y); + AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, + num_elems_processed_per_iteration_y); const bool window_changed = update_window_and_padding(win, input_access, output_access); Window win_collapsed = win.collapse(win, Window::DimZ); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win_collapsed); } } } // namespace -CLChannelShuffleLayerKernel::CLChannelShuffleLayerKernel() - : _input(nullptr), _output(nullptr) +CLChannelShuffleLayerKernel::CLChannelShuffleLayerKernel() : _input(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } @@ -108,23 +117,27 @@ void CLChannelShuffleLayerKernel::configure(const ICLTensor *input, ICLTensor *o configure(CLKernelLibrary::get().get_compile_context(), input, output, num_groups); } -void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups) +void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), num_groups)); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; - const DataLayout data_layout = input->info()->data_layout(); - const bool is_nhwc = data_layout == DataLayout::NHWC; - const unsigned int channels = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)); - unsigned int vec_size_x = 0; - unsigned int vec_size_x_leftovers = 0; - if(is_nhwc) + const DataLayout data_layout = input->info()->data_layout(); + const bool is_nhwc = data_layout == DataLayout::NHWC; + const unsigned int channels = + input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)); + unsigned int vec_size_x = 0; + unsigned int 
vec_size_x_leftovers = 0; + if (is_nhwc) { - vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0)); + vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0)); vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x; } else @@ -170,13 +183,14 @@ void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_cont _config_id += support::cpp11::to_string(output->info()->dimension(1)); _config_id += "_"; _config_id += support::cpp11::to_string(output->info()->dimension(2)); - if(data_layout == DataLayout::NHWC) + if (data_layout == DataLayout::NHWC) { ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } } -Status CLChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups) +Status +CLChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, num_groups)); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first); diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.h b/src/core/CL/kernels/CLChannelShuffleLayerKernel.h index 31c007f17e7d4bf24a5738c623beddc3516aff07..43c939ebd8e0d70c0f29fc26566152597963b238 100644 --- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.h +++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.h @@ -60,7 +60,10 @@ public: * @param[out] output Output tensor. Data type supported: Same as @p input * @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + unsigned int num_groups); /** Static function to check if given info will lead to a valid configuration of @ref CLChannelShuffleLayerKernel * * @param[in] input Input tensor info. Data types supported: All. 
diff --git a/src/core/CL/kernels/CLComparisonKernel.cpp b/src/core/CL/kernels/CLComparisonKernel.cpp index f4d6316517e5db8843412a46ec64aecec6e85927..a0f9aca54a8a1ee296d8139c35aa64f8601eb21a 100644 --- a/src/core/CL/kernels/CLComparisonKernel.cpp +++ b/src/core/CL/kernels/CLComparisonKernel.cpp @@ -25,7 +25,9 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -38,22 +40,16 @@ namespace arm_compute namespace { // Create supported comparisons map -const std::map supported_comparison_ops = -{ - { ComparisonOperation::Equal, "EQUAL" }, - { ComparisonOperation::NotEqual, "NOTEQUAL" }, - { ComparisonOperation::Greater, "GREATER" }, - { ComparisonOperation::GreaterEqual, "GREATEREQUAL" }, - { ComparisonOperation::Less, "LESS" }, - { ComparisonOperation::LessEqual, "LESSEQUAL" }, +const std::map supported_comparison_ops = { + {ComparisonOperation::Equal, "EQUAL"}, {ComparisonOperation::NotEqual, "NOTEQUAL"}, + {ComparisonOperation::Greater, "GREATER"}, {ComparisonOperation::GreaterEqual, "GREATEREQUAL"}, + {ComparisonOperation::Less, "LESS"}, {ComparisonOperation::LessEqual, "LESSEQUAL"}, }; -int calculate_num_elems_processed_per_iteration(const ITensorInfo &input) -{ - return 16 / input.element_size(); -} - -Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ComparisonOperation operation) +Status validate_arguments(const ITensorInfo &input1, + const ITensorInfo &input2, + const ITensorInfo &output, + ComparisonOperation operation) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input1); ARM_COMPUTE_RETURN_ERROR_ON(input1.data_type() == DataType::UNKNOWN); @@ -64,7 +60,7 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); // Validate in case of configured output - if(output.total_size() > 0) + if (output.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8); ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0), @@ -76,41 +72,37 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, std::pair validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) { - const TensorShape &out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape()); - const unsigned int num_elems_processed_per_iteration = calculate_num_elems_processed_per_iteration(input1); + const TensorShape &out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape()); + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(16 / input1.element_size(), output.dimension(0)); // Auto initialize output if not initialized auto_init_if_empty(output, out_shape, 1, DataType::U8, QuantizationInfo()); - Window win = calculate_max_window(out_shape, Steps(num_elems_processed_per_iteration)); - Window win_input1 = win.broadcast_if_dimension_le_one(input1); - Window win_input2 = win.broadcast_if_dimension_le_one(input2); - - AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal input2_access(&input2, 0, 
num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration); + Window win = calculate_max_window(out_shape, Steps(num_elems_processed_per_iteration)); - bool window_changed = update_window_and_padding(win_input1, input1_access) - || update_window_and_padding(win_input2, input2_access) - || update_window_and_padding(win, output_access); - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); + return std::make_pair(Status{}, win); } } // namespace -CLComparisonKernel::CLComparisonKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) +CLComparisonKernel::CLComparisonKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void CLComparisonKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation) +void CLComparisonKernel::configure(const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + ComparisonOperation operation) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, operation); } -void CLComparisonKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation) +void CLComparisonKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + ComparisonOperation operation) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), operation)); @@ -126,17 +118,29 @@ void CLComparisonKernel::configure(const CLCompileContext &compile_context, cons const std::string &operation_name = supported_comparison_ops.at(operation); std::string kernel_name = "compare_" + lower_string(operation_name); + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(16 / input1->info()->element_size(), output->info()->dimension(0)); + // Set kernel build options std::set build_opts; build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())); - build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(calculate_num_elems_processed_per_iteration(*input1->info()))); + build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.emplace("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(output->info()->dimension(0) % num_elems_processed_per_iteration)); + build_opts.emplace( + "-DVEC_SIZE_IN1=" + // + support::cpp11::to_string(input1->info()->dimension(0) == 1 ? 1 : num_elems_processed_per_iteration)); + build_opts.emplace( + "-DVEC_SIZE_IN2=" + // + support::cpp11::to_string(input2->info()->dimension(0) == 1 ? 
1 : num_elems_processed_per_iteration)); build_opts.emplace("-DOP=" + operation_name); build_opts.emplace("-DOP_NAME=" + lower_string(operation_name)); - if(is_data_type_quantized(input1->info()->data_type())) + if (is_data_type_quantized(input1->info()->data_type())) { const UniformQuantizationInfo iq1_info = input1->info()->quantization_info().uniform(); const UniformQuantizationInfo iq2_info = input2->info()->quantization_info().uniform(); + build_opts.emplace("-DIS_QUANTIZED"); build_opts.emplace("-DOFFSET_IN1=" + support::cpp11::to_string(iq1_info.offset)); build_opts.emplace("-DOFFSET_IN2=" + support::cpp11::to_string(iq2_info.offset)); build_opts.emplace("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale)); @@ -160,12 +164,16 @@ void CLComparisonKernel::configure(const CLCompileContext &compile_context, cons _config_id += lower_string(string_from_data_layout(input1->info()->data_layout())); } -Status CLComparisonKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation) +Status CLComparisonKernel::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ComparisonOperation operation) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, operation)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first); return Status{}; } @@ -181,17 +189,18 @@ void CLComparisonKernel::run(const Window &window, cl::CommandQueue &queue) bool can_collapse = true; const bool is_vector = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1; - if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector) + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector) { can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) { can_collapse = (in_shape1[d] == in_shape2[d]); } } bool has_collapsed = false; - Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; const TensorShape &in_shape2_collapsed = has_collapsed ? 
in_shape2.collapsed_from(Window::DimZ) : in_shape2; @@ -212,16 +221,7 @@ void CLComparisonKernel::run(const Window &window, cl::CommandQueue &queue) ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1)); ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2)); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } -BorderSize CLComparisonKernel::border_size() const -{ - const int num_elems_processed_per_iteration = calculate_num_elems_processed_per_iteration(*_input1->info()); - - const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); - const unsigned int border = std::min(num_elems_processed_per_iteration - 1U, replicateSize); - return BorderSize{ 0, border, 0, 0 }; -} } // namespace arm_compute diff --git a/src/core/CL/kernels/CLComparisonKernel.h b/src/core/CL/kernels/CLComparisonKernel.h index 0b9419018370c8296e3b8980aeab6497f9c56ca0..2fb4ba06b693b7b5458c9a5165fe7c0962c6df48 100644 --- a/src/core/CL/kernels/CLComparisonKernel.h +++ b/src/core/CL/kernels/CLComparisonKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,10 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CLCOMPARISONKERNEL_H -#define ARM_COMPUTE_CLCOMPARISONKERNEL_H +#ifndef ACL_SRC_CORE_CL_KERNELS_CLCOMPARISONKERNEL_H +#define ACL_SRC_CORE_CL_KERNELS_CLCOMPARISONKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -64,7 +65,11 @@ public: * @param[out] output Destination tensor. Data types supported: U8. * @param[in] operation Comparison operation to use. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + ComparisonOperation operation); /** Static function to check if given info will lead to a valid configuration of @ref CLComparisonKernel * * @param[in] input1 Source tensor. Data types supported: All. 
@@ -74,11 +79,13 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ComparisonOperation operation); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; private: const ICLTensor *_input1; /**< Source tensor 1 */ @@ -86,4 +93,4 @@ private: ICLTensor *_output; /**< Destination tensor */ }; } // namespace arm_compute -#endif /* ARM_COMPUTE_CLCOMPARISONKERNEL_H */ +#endif // ACL_SRC_CORE_CL_KERNELS_CLCOMPARISONKERNEL_H diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp index 76af5d564a61cb037f5796bc0d06b9c0f4e22559..f8ecc4c098f5558ad5d22c32e0cd0c43325de296 100644 --- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp +++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" @@ -40,7 +41,8 @@ CLDeconvolutionLayerUpsampleKernel::CLDeconvolutionLayerUpsampleKernel() _type = CLKernelType::ELEMENTWISE; } -Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, const ITensorInfo *output, +Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, const PadStrideInfo &info) { ARM_COMPUTE_UNUSED(info); @@ -60,7 +62,7 @@ Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, co ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_h) == 0); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(idx_c)); - for(size_t i = 3; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = 3; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i)); } @@ -68,20 +70,21 @@ Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, co return Status{}; } -void CLDeconvolutionLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output, - const PadStrideInfo &info) +void CLDeconvolutionLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output, const PadStrideInfo &info) { configure(CLKernelLibrary::get().get_compile_context(), input, output, info); } -void CLDeconvolutionLayerUpsampleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, - const PadStrideInfo &info) +void CLDeconvolutionLayerUpsampleKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const PadStrideInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayerUpsampleKernel::validate(input->info(), output->info(), info)); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; @@ -119,7 +122,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu const int out_end_y = _output->info()->dimension(idx_h) - _info.pad_bottom() + _info.stride().second - 1; const int out_step_y = _info.stride().second; - switch(_data_layout) + 
switch (_data_layout) { case DataLayout::NCHW: { @@ -137,8 +140,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu add_3D_tensor_argument(idx, _input, slice_in); add_3D_tensor_argument(idx, _output, slice_out); enqueue(queue, *this, slice_out, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice_in) && collapsed.slide_window_slice_3D(slice_out)); + } while (collapsed.slide_window_slice_3D(slice_in) && collapsed.slide_window_slice_3D(slice_out)); break; } case DataLayout::NHWC: @@ -156,8 +158,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu add_3D_tensor_argument(idx, _input, slice_in); add_3D_tensor_argument(idx, _output, slice_out); enqueue(queue, *this, slice_out, lws_hint()); - } - while(window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out)); break; } default: diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h index e0d1322341549bc31d5fb813ab5549c3b8a2f638..762989a836bc5a3607b4ec5f36564f06cf5781f5 100644 --- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h +++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h @@ -62,7 +62,10 @@ public: * @param[out] output Destination tensor. Data types supported: same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. * @param[in] info Contains padding and stride information described in @ref PadStrideInfo. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PadStrideInfo &info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const PadStrideInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayerUpsample * * @param[in] input Source tensor info. Data types supported: All. 
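For reference, below is a minimal caller-side sketch of how the validate()/configure() overloads reformatted above are typically driven. It is a hypothetical illustration, not code from this patch: the tensor shapes, stride value and helper function name are assumptions, and it presumes a build in which the internal kernel header is reachable on the include path.

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"

using namespace arm_compute;

// Illustrative only: validate first, then configure with an explicit compile context,
// mirroring the multi-line signatures introduced in the hunks above.
void run_upsample_sketch()
{
    CLScheduler::get().default_init(); // set up the CL context and command queue

    CLTensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(16U, 16U, 3U), 1, DataType::F32));

    const PadStrideInfo upsample_info(2U, 2U); // stride 2 in x and y, no padding (assumed values)

    ARM_COMPUTE_ERROR_THROW_ON(
        CLDeconvolutionLayerUpsampleKernel::validate(input.info(), output.info(), upsample_info));

    CLDeconvolutionLayerUpsampleKernel kernel;
    kernel.configure(CLKernelLibrary::get().get_compile_context(), &input, &output, upsample_info);

    input.allocator()->allocate();  // filling the input buffer is omitted here
    output.allocator()->allocate();

    CLScheduler::get().enqueue(kernel); // run the kernel on the default queue
    CLScheduler::get().sync();
}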
diff --git a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp index 0fc0ff816811b36f784b67d2403bcdc2429feff2..b33e0a8b6f84d0c90e38c385646f61879c9ceab2 100644 --- a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp +++ b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp @@ -27,9 +27,10 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" @@ -38,7 +39,11 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, const PadStrideInfo &deconv_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, input_info, weights_info); @@ -53,19 +58,21 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con ARM_COMPUTE_RETURN_ERROR_ON(weights_info->dimension(idx_w) != deconv_info.stride().first); ARM_COMPUTE_RETURN_ERROR_ON(weights_info->dimension(idx_h) != deconv_info.stride().second); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32); - if(!is_qasymm) + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S32); + if (!is_qasymm) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_info, weights_info); } - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_info->dimension(idx_w) * weights_info->dimension(idx_h) * weights_info->dimension(idx_b)); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_info->dimension(idx_w) * weights_info->dimension(idx_h) * + weights_info->dimension(idx_b)); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != input_info->dimension(idx_w)); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != input_info->dimension(idx_h)); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(3) != input_info->dimension(idx_b)); - if(bias != nullptr) + if (bias != nullptr) { - if(is_qasymm) + if (is_qasymm) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); } @@ -76,19 +83,26 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights_info->dimension(idx_b)); } - if(output->total_size() != 0) + if (output->total_size() != 0) { const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second); - auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info); + auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), + weights_info->dimension(idx_w), weights_info->dimension(idx_h), + stride_info); - const TensorShape output_shape = 
misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info); + const TensorShape output_shape = + misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); } return Status{}; } -std::pair validate_and_configure_window(const ITensorInfo *input, ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info) +std::pair validate_and_configure_window(const ITensorInfo *input, + ITensorInfo *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, + const PadStrideInfo &deconv_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -97,11 +111,17 @@ std::pair validate_and_configure_window(const ITensorInfo *input const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second); - auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info); + auto out_dims = + deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), + weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info); - const TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info); + const TensorShape output_shape = + misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info); - auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout).set_quantization_info(input->quantization_info())); + auto_init_if_empty(*output, input->clone() + ->set_tensor_shape(output_shape) + .set_data_layout(data_layout) + .set_quantization_info(input->quantization_info())); Window win = calculate_max_window(*input); @@ -109,29 +129,37 @@ std::pair validate_and_configure_window(const ITensorInfo *input } } // namespace -CLDeconvolutionReshapeOutputKernel::CLDeconvolutionReshapeOutputKernel() - : _add_bias(false), - _bias(nullptr) +CLDeconvolutionReshapeOutputKernel::CLDeconvolutionReshapeOutputKernel() : _add_bias(false), _bias(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void CLDeconvolutionReshapeOutputKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, +void CLDeconvolutionReshapeOutputKernel::configure(const ICLTensor *input, + const ICLTensor *bias, + ICLTensor *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, const PadStrideInfo &deconv_info) { configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, input_info, weights_info, deconv_info); } -void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, - const ITensorInfo *weights_info, - const PadStrideInfo &deconv_info) +void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *bias, + ICLTensor *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, + const PadStrideInfo &deconv_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, input_info, weights_info); 
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), input_info, weights_info, deconv_info)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr), + output->info(), input_info, weights_info, deconv_info)); - auto padding_info = get_padding_info({ input, bias, output }); + auto padding_info = get_padding_info({input, bias, output}); // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), output->info(), input_info, weights_info, deconv_info); + auto win_config = + validate_and_configure_window(input->info(), output->info(), input_info, weights_info, deconv_info); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); const DataLayout data_layout = input_info->data_layout(); @@ -178,7 +206,11 @@ void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compi ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLDeconvolutionReshapeOutputKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, +Status CLDeconvolutionReshapeOutputKernel::validate(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, const PadStrideInfo &deconv_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, input_info, weights_info, deconv_info)); @@ -194,7 +226,7 @@ void CLDeconvolutionReshapeOutputKernel::run(const Window &window, cl::CommandQu unsigned int idx = 0; add_3D_tensor_argument(idx, _input, collapsed); add_3D_tensor_argument(idx, _output, collapsed); - if(_add_bias) + if (_add_bias) { add_1D_tensor_argument(idx, _bias, collapsed); } diff --git a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h index ce354fa86f5348fcc7d3400844fef233dcf01bd4..8f436b07e39e740ef868eac681933f0c5c78cdf3 100644 --- a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h +++ b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h @@ -67,7 +67,12 @@ public: * @param[in] weights_info Deconvolution weights tensor info. Supported data types: same as @p input. Supported data layouts: same as @p input. * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported. */ - void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info); + void configure(const ICLTensor *input, + const ICLTensor *bias, + ICLTensor *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, + const PadStrideInfo &deconv_info); /** Initialise the kernel's source and destination. * * @param[in] compile_context The compile context to be used. @@ -79,8 +84,13 @@ public: * @param[in] weights_info Deconvolution weights tensor info. Supported data types: same as @p input. Supported data layouts: same as @p input. * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported. 
*/ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, - const PadStrideInfo &deconv_info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *bias, + ICLTensor *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, + const PadStrideInfo &deconv_info); /** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionReshapeOutputKernel. * @@ -93,7 +103,12 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, + const PadStrideInfo &deconv_info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp index 5c1dc4fbf6bb1bb0cd22cbd9aa49ad3035359ed1..cdf19ab2e1b48f5a4fe640c5cde245104ea32cdc 100644 --- a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp +++ b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -49,12 +50,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) != 0); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != (block_shape * input->tensor_shape()[idx_width])); - ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape * input->tensor_shape()[idx_height])); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != + (block_shape * input->tensor_shape()[idx_width])); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != + (block_shape * input->tensor_shape()[idx_height])); ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } @@ -63,8 +66,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i } } // namespace -CLDepthToSpaceLayerKernel::CLDepthToSpaceLayerKernel() - : _input(nullptr), _output(nullptr), _block_shape() +CLDepthToSpaceLayerKernel::CLDepthToSpaceLayerKernel() : _input(nullptr), _output(nullptr), _block_shape() { _type = CLKernelType::ELEMENTWISE; } @@ -74,14 +76,18 @@ void CLDepthToSpaceLayerKernel::configure(const ICLTensor *input, ICLTensor *out configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape); } -void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor 
*output, int32_t block_shape) +void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + int32_t block_shape) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - TensorShape output_shape = compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape); + TensorShape output_shape = + compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape); auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape)); @@ -98,7 +104,9 @@ void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_contex build_opts.add_option("-DCHANNEL_SIZE=" + support::cpp11::to_string(input->info()->dimension(idx_channel))); build_opts.add_option("-DBLOCK_SHAPE=" + support::cpp11::to_string(block_shape)); build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width))); - _kernel = create_kernel(compile_context, "depth_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()); + _kernel = create_kernel(compile_context, + "depth_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), + build_opts.options()); // Configure kernel window Window win = calculate_max_window(*input->info(), Steps()); @@ -137,7 +145,6 @@ void CLDepthToSpaceLayerKernel::run(const Window &window, cl::CommandQueue &queu enqueue(queue, *this, slice_in, lws_hint()); ++batch_id; - } - while(window.slide_window_slice_3D(slice_in)); + } while (window.slide_window_slice_3D(slice_in)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h index 1f7f77b5695eead1122a94269a794ee52a1cc2da..cef70c4dda5a337139d6fe37529d1a4b8b8493b4 100644 --- a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h +++ b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLDEPTHTOSPACELAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -61,7 +62,8 @@ public: * @param[out] output Tensor output. Data types supported: same as @p input * @param[in] block_shape Block shape value. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape); + void + configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape); /** Static function to check if given info will lead to a valid configuration of @ref CLDepthToSpaceLayerKernel. * * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All. 
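Similarly, a shape-level sketch of the depth-to-space constraints checked in validate_arguments() above: for an NCHW input of shape [W, H, C, N] and a block shape B, the channel count must be divisible by B*B, and the output is expected to be [W*B, H*B, C/(B*B), N]. The snippet below is a hypothetical illustration with assumed shapes and the conventional static validate() signature, not code from the patch.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h"

using namespace arm_compute;

// Illustrative only: block_shape = 2, NCHW layout (dimension 0 = W, 1 = H, 2 = C, 3 = N).
Status check_depth_to_space_shapes()
{
    const int32_t    block_shape = 2;
    const TensorInfo input(TensorShape(4U, 4U, 8U, 1U), 1, DataType::F32);  // C = 8 is divisible by 2 * 2
    const TensorInfo output(TensorShape(8U, 8U, 2U, 1U), 1, DataType::F32); // W and H doubled, C divided by 4

    return CLDepthToSpaceLayerKernel::validate(&input, &output, block_shape);
}

An OK Status here only confirms the shape and type constraints enforced above; building and running the kernel still goes through configure() and a CL queue as in the earlier sketch.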
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp index e34b6929e7f49df6b8e4299678dfe866ea923343..b95abe795f35d2bb37c7e78059ade56366a2f12a 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp @@ -23,16 +23,17 @@ */ #include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLUtils.h" #include "src/core/CL/CLValidate.h" #include "src/core/CL/ICLKernel.h" @@ -45,12 +46,18 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCComputeKernelInfo &dwc_info, - const ConvolutionInfo &conv_info, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const DWCComputeKernelInfo &dwc_info, + const ConvolutionInfo &conv_info, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { ARM_COMPUTE_UNUSED(dwc_info); bool in_place = false; - if(output == nullptr || output == input) + if (output == nullptr || output == input) { in_place = true; output = input; @@ -58,11 +65,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first > 1 && dwc_info.m0 != 1); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation.x() > 1 && dwc_info.m0 != 1); ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_input_to_cl_image == true)); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((dwc_info.export_weights_to_cl_image == true) && (export_to_cl_image(weights) == false), "Weights cannot be exported to cl_image!"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((dwc_info.export_weights_to_cl_image == true) && + (export_to_cl_image(weights) == false), + "Weights cannot be exported to cl_image!"); ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_weights_to_cl_image == true) && ((dwc_info.n0 % 4) != 0)); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first < 1); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().second < 1); @@ -72,33 +82,40 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_c) != (input->dimension(idx_c) * 
conv_info.depth_multiplier)); // In place restrictions - if(in_place) + if (in_place) { - const int weights_width_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); - const int weights_height_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[weights_width_idx] != 1U || weights->tensor_shape()[weights_height_idx] != 1U); + const int weights_width_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); + const int weights_height_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); + ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[weights_width_idx] != 1U || + weights->tensor_shape()[weights_height_idx] != 1U); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.depth_multiplier != 1U); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride() != std::make_pair(1U, 1U)); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation != Size2D(1U, 1U)); - ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.has_padding()); // Note that in princple padding can be supported with in_place but we choose not to support it + ARM_COMPUTE_RETURN_ERROR_ON( + conv_info.pad_stride_info + .has_padding()); // Note that in princple padding can be supported with in_place but we choose not to support it } - const ConvolutionInfo info{ conv_info.pad_stride_info, conv_info.depth_multiplier, ActivationLayerInfo(), conv_info.dilation }; - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info); + const ConvolutionInfo info{conv_info.pad_stride_info, conv_info.depth_multiplier, ActivationLayerInfo(), + conv_info.dilation}; + const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info); - if(conv_info.depth_multiplier > 1 && dwc_info.n0 > 1) + if (conv_info.depth_multiplier > 1 && dwc_info.n0 > 1) { ARM_COMPUTE_RETURN_ERROR_ON((conv_info.depth_multiplier % dwc_info.n0) != 0); } const bool is_quantized = is_data_type_quantized(input->data_type()); - if(biases != nullptr) + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != output_shape[idx_c]); ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - if(is_quantized) + if (is_quantized) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -108,7 +125,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, } } - if(is_quantized) + if (is_quantized) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output_multipliers, output_shifts); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32); @@ -116,7 +133,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); - if(is_data_type_quantized_per_channel(weights->data_type())) + if (is_data_type_quantized_per_channel(weights->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(output_shape[idx_c] != output_multipliers->dimension(0)); @@ -134,22 +151,24 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); } - 
if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } - if(is_data_type_quantized(input->data_type())) + if (is_data_type_quantized(input->data_type())) { const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); const UniformQuantizationInfo wq_info = weights->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = (output->total_size() != 0) ? output->quantization_info().uniform() : iq_info; + const UniformQuantizationInfo oq_info = + (output->total_size() != 0) ? output->quantization_info().uniform() : iq_info; float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; int output_multiplier = 0; int output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); } return Status{}; @@ -171,30 +190,48 @@ CLDepthwiseConvolutionLayerNativeKernel::CLDepthwiseConvolutionLayerNativeKernel _type = CLKernelType::DEPTHWISE; } -void CLDepthwiseConvolutionLayerNativeKernel::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const DWCComputeKernelInfo &dwc_info, const ConvolutionInfo &conv_info, - const ICLTensor *output_multipliers, const ICLTensor *output_shifts) +void CLDepthwiseConvolutionLayerNativeKernel::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const DWCComputeKernelInfo &dwc_info, + const ConvolutionInfo &conv_info, + const ICLTensor *output_multipliers, + const ICLTensor *output_shifts) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, dwc_info, conv_info, output_multipliers, output_shifts); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, dwc_info, conv_info, + output_multipliers, output_shifts); } -void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const DWCComputeKernelInfo &dwc_info, const ConvolutionInfo &conv_info, - const ICLTensor *output_multipliers, const ICLTensor *output_shifts) +void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const DWCComputeKernelInfo &dwc_info, + const ConvolutionInfo &conv_info, + const ICLTensor *output_multipliers, + const ICLTensor *output_shifts) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights); - if(output == nullptr) + if (output == nullptr) { // In-place output = input; } - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), - dwc_info, conv_info, (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, (output_shifts != nullptr) ? output_shifts->info() : nullptr)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments( + input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), dwc_info, + conv_info, (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, + (output_shifts != nullptr) ? 
output_shifts->info() : nullptr)); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*(input->info()), *(weights->info()), conv_info); - auto_init_if_empty(*(output->info()), input->info()->clone()->set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info())); + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape( + *(input->info()), *(weights->info()), conv_info); + auto_init_if_empty(*(output->info()), input->info() + ->clone() + ->set_tensor_shape(output_shape) + .set_quantization_info(output->info()->quantization_info())); _input = input; _output = output; @@ -214,12 +251,12 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext & CLBuildOptions build_opts; // Update the padding for the input/weights tensor if we can export to cl_image - if(_export_input_to_cl_image) + if (_export_input_to_cl_image) { arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(input->info()); } - if(_export_weights_to_cl_image) + if (_export_weights_to_cl_image) { arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(weights->info()); } @@ -229,9 +266,10 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext & const auto act_function = conv_info.act_info.activation(); const auto dst_data_type = _output->info()->data_type(); - if((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) - && (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - && (dst_data_type == DataType::F32 || dst_data_type == DataType::F16)) + if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) && + (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || + act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) && + (dst_data_type == DataType::F32 || dst_data_type == DataType::F16)) { // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations @@ -268,23 +306,24 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext & build_opts.add_option("-DN0=" + support::cpp11::to_string(n0)); build_opts.add_option("-DM0=" + support::cpp11::to_string(m0)); build_opts.add_option("-DM0_A=" + support::cpp11::to_string(_weights->info()->dimension(1) + m0 - 1)); - build_opts.add_option_if_else(conv_info.depth_multiplier > 1, "-DN0_A=1", "-DN0_A=" + support::cpp11::to_string(n0)); + build_opts.add_option_if_else(conv_info.depth_multiplier > 1, "-DN0_A=1", + "-DN0_A=" + support::cpp11::to_string(n0)); build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(_output->info()->dimension(0) % n0)); build_opts.add_option_if(_input->info()->num_dimensions() > 3, "-DBATCHED_EXECUTION"); // Force unroll with pragma when any of the following values exceed the maximum number of manual unroll - set_unroll_with_pragma(build_opts, { static_cast(_weights->info()->dimension(1) + m0 - 1), - static_cast(_weights->info()->dimension(1)), - static_cast(_weights->info()->dimension(2)) - }); + set_unroll_with_pragma(build_opts, {static_cast(_weights->info()->dimension(1) + m0 - 1), + 
static_cast(_weights->info()->dimension(1)), + static_cast(_weights->info()->dimension(2))}); - if(biases != nullptr) + if (biases != nullptr) { build_opts.add_option(std::string("-DHAS_BIAS")); - build_opts.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->info()->data_type()))); + build_opts.add_option( + std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->info()->data_type()))); } - if(_is_quantized) + if (_is_quantized) { kernel_name = "dwc_native_quantized_nhwc"; const UniformQuantizationInfo iqinfo = input->info()->quantization_info().uniform(); @@ -306,13 +345,17 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext & build_opts.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset)); build_opts.add_option("-DZERO_VALUE=" + support::cpp11::to_string(zero_value_s32)); build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32)); - build_opts.add_option("-DDST_MULTIPLIERS_DATA_TYPE=" + get_cl_type_from_data_type(_output_multipliers->info()->data_type())); - build_opts.add_option("-DDST_SHIFTS_DATA_TYPE=" + get_cl_type_from_data_type(_output_shifts->info()->data_type())); - build_opts.add_option_if_else(weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL, "-DQUANTIZATION_TYPE=PER_CHANNEL", "-DQUANTIZATION_TYPE=PER_TENSOR"); + build_opts.add_option("-DDST_MULTIPLIERS_DATA_TYPE=" + + get_cl_type_from_data_type(_output_multipliers->info()->data_type())); + build_opts.add_option("-DDST_SHIFTS_DATA_TYPE=" + + get_cl_type_from_data_type(_output_shifts->info()->data_type())); + build_opts.add_option_if_else(weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL, + "-DQUANTIZATION_TYPE=PER_CHANNEL", "-DQUANTIZATION_TYPE=PER_TENSOR"); // Note: We expect the input and output tensors to always adopt a per-tensor quantization approach int a_val{}; int b_val{}; - std::tie(b_val, a_val) = get_quantized_activation_min_max(conv_info.act_info, input->info()->data_type(), oqinfo); + std::tie(b_val, a_val) = + get_quantized_activation_min_max(conv_info.act_info, input->info()->data_type(), oqinfo); build_opts.add_option_if(conv_info.act_info.enabled(), "-DA_VAL=" + support::cpp11::to_string(a_val)); build_opts.add_option_if(conv_info.act_info.enabled(), "-DB_VAL=" + support::cpp11::to_string(b_val)); @@ -321,8 +364,10 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext & { kernel_name = "dwc_native_fp_nhwc"; build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.add_option_if(conv_info.act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(conv_info.act_info.a())); - build_opts.add_option_if(conv_info.act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(conv_info.act_info.b())); + build_opts.add_option_if(conv_info.act_info.enabled(), + "-DA_VAL=" + float_to_string_with_full_precision(conv_info.act_info.a())); + build_opts.add_option_if(conv_info.act_info.enabled(), + "-DB_VAL=" + float_to_string_with_full_precision(conv_info.act_info.b())); } Window win = calculate_max_window(*(output->info()), Steps(n0, m0)); @@ -350,10 +395,17 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext & _config_id += string_from_data_type(input->info()->data_type()); } -Status CLDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - const 
DWCComputeKernelInfo &dwc_info, const ConvolutionInfo &conv_info, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status CLDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const DWCComputeKernelInfo &dwc_info, + const ConvolutionInfo &conv_info, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, dwc_info, conv_info, output_multipliers, output_shifts)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(input, weights, biases, output, dwc_info, conv_info, output_multipliers, output_shifts)); return Status{}; } @@ -370,47 +422,52 @@ void CLDepthwiseConvolutionLayerNativeKernel::run(const Window &window, cl::Comm cl::Image2D input_cl_image; cl::Image2D weights_cl_image; - if(_export_input_to_cl_image || _export_weights_to_cl_image) + if (_export_input_to_cl_image || _export_weights_to_cl_image) { // Export cl_buffer to cl_image - if(_export_input_to_cl_image) + if (_export_input_to_cl_image) { - const size_t image_w = _input->info()->dimension(0) / 4; - const size_t image_h = _input->info()->dimension(1) * _input->info()->dimension(2) * _input->info()->dimension(3); + const size_t image_w = _input->info()->dimension(0) / 4; + const size_t image_h = + _input->info()->dimension(1) * _input->info()->dimension(2) * _input->info()->dimension(3); const TensorShape shape2d(image_w, image_h); const size_t image_row_pitch = _input->info()->strides_in_bytes()[1]; - input_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), _input->cl_buffer(), shape2d, _input->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + input_cl_image = + create_image2d_from_buffer(CLKernelLibrary::get().context(), _input->cl_buffer(), shape2d, + _input->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); } - if(_export_weights_to_cl_image) + if (_export_weights_to_cl_image) { - const size_t image_w = _weights->info()->dimension(0) / 4; - const size_t image_h = _weights->info()->dimension(1) * _weights->info()->dimension(2) * _weights->info()->dimension(3); + const size_t image_w = _weights->info()->dimension(0) / 4; + const size_t image_h = + _weights->info()->dimension(1) * _weights->info()->dimension(2) * _weights->info()->dimension(3); const TensorShape shape2d(image_w, image_h); const size_t image_row_pitch = _weights->info()->strides_in_bytes()[1]; - weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), _weights->cl_buffer(), shape2d, _weights->info()->data_type(), image_row_pitch, - CLImage2DType::ReadOnly); + weights_cl_image = + create_image2d_from_buffer(CLKernelLibrary::get().context(), _weights->cl_buffer(), shape2d, + _weights->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); } } unsigned int idx = 0; - if(_export_input_to_cl_image) + if (_export_input_to_cl_image) { _kernel.setArg(idx++, input_cl_image); } add_4d_tensor_nhwc_argument(idx, _input); add_4d_tensor_nhwc_argument(idx, _output); - if(_export_weights_to_cl_image) + if (_export_weights_to_cl_image) { _kernel.setArg(idx++, weights_cl_image); } add_4d_tensor_nhwc_argument(idx, _weights); - if(_is_quantized) + if (_is_quantized) { add_1D_tensor_argument(idx, _output_multipliers, slice); add_1D_tensor_argument(idx, _output_shifts, slice); } - if(_biases != nullptr) + if (_biases != nullptr) { add_1D_tensor_argument(idx, _biases, 
slice); } diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h index 8eee7b25004757f6c9e5f33dfc39b114dbda4c1d..d34a6629666be1bf69815625b337feac0721c246 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h @@ -24,11 +24,11 @@ #ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H #define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H -#include "src/core/CL/ICLKernel.h" - #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/function_info/ConvolutionInfo.h" +#include "src/core/CL/ICLKernel.h" + namespace arm_compute { class ICLTensor; @@ -74,15 +74,28 @@ public: * * no padding * * no change of data layout after configure */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCComputeKernelInfo &dwc_info, - const ConvolutionInfo &conv_info, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const DWCComputeKernelInfo &dwc_info, + const ConvolutionInfo &conv_info, + const ICLTensor *output_multipliers = nullptr, + const ICLTensor *output_shifts = nullptr); /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerNativeKernel * * Similar to @ref CLDepthwiseConvolutionLayerNativeKernel::configure() */ - void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCComputeKernelInfo &dwc_info, - const ConvolutionInfo &conv_info, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr); + void configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const DWCComputeKernelInfo &dwc_info, + const ConvolutionInfo &conv_info, + const ICLTensor *output_multipliers = nullptr, + const ICLTensor *output_shifts = nullptr); /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerNativeKernel * @@ -90,23 +103,29 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCComputeKernelInfo &dwc_info, - const ConvolutionInfo &conv_info, const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const DWCComputeKernelInfo &dwc_info, + const ConvolutionInfo &conv_info, + const ITensorInfo *output_multipliers = nullptr, + const ITensorInfo *output_shifts = nullptr); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; private: - const ICLTensor *_input {}; + const ICLTensor *_input{}; const ICLTensor *_weights{}; const ICLTensor *_biases{}; ICLTensor *_output{}; - unsigned int _depth_multiplier{ 0 }; + unsigned int _depth_multiplier{0}; const ICLTensor *_output_multipliers{}; const ICLTensor *_output_shifts{}; - bool _export_input_to_cl_image{ false }; - bool _export_weights_to_cl_image{ true }; - bool _is_quantized{ false }; + bool 
_export_input_to_cl_image{false}; + bool _export_weights_to_cl_image{true}; + bool _is_quantized{false}; }; } // namespace arm_compute #endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H */ diff --git a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp index 9b514ed705f915239dbc948ff2569b095da762cf..3d8f875ef7ba5a7f3c36d1f3669578030302d008 100644 --- a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp +++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -37,17 +38,20 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != 1 && input->num_channels() != 2); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(idx, 1, DataType::U32); - ARM_COMPUTE_RETURN_ERROR_ON(std::set({ 0, 1 }).count(config.axis) == 0); + ARM_COMPUTE_RETURN_ERROR_ON(std::set({0, 1}).count(config.axis) == 0); ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] != idx->tensor_shape().x()); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 2); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); @@ -57,7 +61,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *idx, const FFTDigitReverseKernelInfo &config) +std::pair validate_and_configure_window(ITensorInfo *input, + ITensorInfo *output, + ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config) { ARM_COMPUTE_UNUSED(idx, config); @@ -69,21 +76,27 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen } } // namespace -CLFFTDigitReverseKernel::CLFFTDigitReverseKernel() - : _input(nullptr), _output(nullptr), _idx(nullptr) +CLFFTDigitReverseKernel::CLFFTDigitReverseKernel() : _input(nullptr), _output(nullptr), _idx(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void CLFFTDigitReverseKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config) +void CLFFTDigitReverseKernel::configure(const ICLTensor *input, + ICLTensor *output, + const ICLTensor *idx, + const FFTDigitReverseKernelInfo &config) { configure(CLKernelLibrary::get().get_compile_context(), input, output, idx, config); } -void CLFFTDigitReverseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config) +void CLFFTDigitReverseKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *idx, + const FFTDigitReverseKernelInfo &config) { 
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, idx); - auto padding_info = get_padding_info({ input, output, idx }); + auto padding_info = get_padding_info({input, output, idx}); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), idx->info(), config)); _input = input; @@ -114,10 +127,14 @@ void CLFFTDigitReverseKernel::configure(const CLCompileContext &compile_context, ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLFFTDigitReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config) +Status CLFFTDigitReverseKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, idx, config)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first); return Status{}; } @@ -137,7 +154,6 @@ void CLFFTDigitReverseKernel::run(const Window &window, cl::CommandQueue &queue) add_3D_tensor_argument(idx, _output, slice); add_1D_tensor_argument(idx, _idx, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLFFTDigitReverseKernel.h b/src/core/CL/kernels/CLFFTDigitReverseKernel.h index e5583a4c22665e65168ceef9e15b773d499e9cc6..fdd1bcc3d3da750fa80c36fe177cdd748dbe6624 100644 --- a/src/core/CL/kernels/CLFFTDigitReverseKernel.h +++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.h @@ -24,10 +24,10 @@ #ifndef ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H #define ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H -#include "src/core/CL/ICLKernel.h" - #include "arm_compute/core/KernelDescriptors.h" +#include "src/core/CL/ICLKernel.h" + namespace arm_compute { // Forward declarations @@ -56,7 +56,8 @@ public: * @param[in] idx Digit reverse index tensor. Data type supported: U32 * @param[in] config Kernel configuration. */ - void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config); + void + configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config); /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. @@ -65,7 +66,11 @@ public: * @param[in] idx Digit reverse index tensor. Data type supported: U32 * @param[in] config Kernel configuration. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *idx, + const FFTDigitReverseKernelInfo &config); /** Static function to check if given info will lead to a valid configuration of @ref CLFFTDigitReverseKernel * * @param[in] input Source tensor info. Data types supported: F16/F32. 
@@ -75,7 +80,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp index 95f4b640bd4877fc7b9f8d271a21272b1194bd90..3729e6b77ddd7a018f67c8faaf70118861d05d16 100644 --- a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp +++ b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -46,11 +47,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(CLFFTRadixStageKernel::supported_radix().count(config.radix) == 0); - ARM_COMPUTE_RETURN_ERROR_ON(std::set({ 0, 1 }).count(config.axis) == 0); + ARM_COMPUTE_RETURN_ERROR_ON(std::set({0, 1}).count(config.axis) == 0); ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] % config.radix); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -59,9 +60,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config) +std::pair +validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config) { - if(output != nullptr) + if (output != nullptr) { auto_init_if_empty(*output, *input); } @@ -76,8 +78,7 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen } } // namespace -CLFFTRadixStageKernel::CLFFTRadixStageKernel() - : _input(nullptr), _output(nullptr), _run_in_place(false) +CLFFTRadixStageKernel::CLFFTRadixStageKernel() : _input(nullptr), _output(nullptr), _run_in_place(false) { _type = CLKernelType::ELEMENTWISE; } @@ -87,11 +88,15 @@ void CLFFTRadixStageKernel::configure(ICLTensor *input, ICLTensor *output, const configure(CLKernelLibrary::get().get_compile_context(), input, output, config); } -void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config) +void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const FFTRadixStageKernelInfo &config) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config)); - auto padding_info = get_padding_info({ input, output }); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), (output != nullptr) ? 
output->info() : nullptr, config)); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; @@ -110,11 +115,12 @@ void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, I _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Set static arguments if not the first stage - if(!config.is_first_stage) + if (!config.is_first_stage) { const unsigned int Ni = config.Nx * config.radix; const float exp_const = (-2.0 * M_PI) / static_cast(Ni); - unsigned int idx = (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters + unsigned int idx = + (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters _kernel.setArg(idx++, config.Nx); _kernel.setArg(idx++, Ni); _kernel.setArg(idx, exp_const); @@ -136,21 +142,22 @@ void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, I ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLFFTRadixStageKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config) +Status CLFFTRadixStageKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const FFTRadixStageKernelInfo &config) { const bool run_in_place = (output == nullptr) || (output == input); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, config)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), - (run_in_place) ? nullptr : output->clone().get(), - config) - .first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get(), config) + .first); return Status{}; } std::set CLFFTRadixStageKernel::supported_radix() { - return std::set { 2, 3, 4, 5, 7, 8 }; + return std::set{2, 3, 4, 5, 7, 8}; } void CLFFTRadixStageKernel::run(const Window &window, cl::CommandQueue &queue) @@ -165,12 +172,11 @@ void CLFFTRadixStageKernel::run(const Window &window, cl::CommandQueue &queue) { unsigned int idx = 0; add_3D_tensor_argument(idx, _input, slice); - if(!_run_in_place) + if (!_run_in_place) { add_3D_tensor_argument(idx, _output, slice); } enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLFFTRadixStageKernel.h b/src/core/CL/kernels/CLFFTRadixStageKernel.h index 9bb310db838f6f40c0c8ed47f7222a20e3806656..de80bfced3b79ed8a6555ae9f75349bb707af1b9 100644 --- a/src/core/CL/kernels/CLFFTRadixStageKernel.h +++ b/src/core/CL/kernels/CLFFTRadixStageKernel.h @@ -24,10 +24,10 @@ #ifndef ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H #define ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H -#include "src/core/CL/ICLKernel.h" - #include "arm_compute/core/KernelDescriptors.h" +#include "src/core/CL/ICLKernel.h" + #include namespace arm_compute @@ -69,7 +69,10 @@ public: * @param[out] output Destination tensor. Can be nullptr. Data type supported: same as @p input * @param[in] config FFT descriptor metadata. 
*/ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const FFTRadixStageKernelInfo &config); /** Static function to check if given info will lead to a valid configuration of @ref CLFFTRadixStageKernel * * @param[in] input Source tensor info. Data types supported: F16/F32. diff --git a/src/core/CL/kernels/CLFFTScaleKernel.cpp b/src/core/CL/kernels/CLFFTScaleKernel.cpp index 8a714d71bf93069552136baf80d0bee57bcc219d..be6e16b074bfbd4b50acda68999611b19bb2cf42 100644 --- a/src/core/CL/kernels/CLFFTScaleKernel.cpp +++ b/src/core/CL/kernels/CLFFTScaleKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -43,7 +44,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); @@ -54,8 +55,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) } } // namespace -CLFFTScaleKernel::CLFFTScaleKernel() - : _input(nullptr), _output(nullptr), _run_in_place(false) +CLFFTScaleKernel::CLFFTScaleKernel() : _input(nullptr), _output(nullptr), _run_in_place(false) { _type = CLKernelType::ELEMENTWISE; } @@ -65,11 +65,14 @@ void CLFFTScaleKernel::configure(ICLTensor *input, ICLTensor *output, const FFTS configure(CLKernelLibrary::get().get_compile_context(), input, output, config); } -void CLFFTScaleKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config) +void CLFFTScaleKernel::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const FFTScaleKernelInfo &config) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr)); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; @@ -78,20 +81,22 @@ void CLFFTScaleKernel::configure(const CLCompileContext &compile_context, ICLTen // Create kernel CLBuildOptions build_opts; build_opts.add_option_if(_run_in_place, "-DIN_PLACE"); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(output != nullptr ? output->info()->num_channels() : input->info()->num_channels())); + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(output != nullptr ? output->info()->num_channels() + : input->info()->num_channels())); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); build_opts.add_option_if(config.conjugate, "-DCONJ"); std::string kernel_name = "fft_scale_conj"; _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Set static arguments - unsigned int idx = (1 + (_run_in_place ? 
0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters + unsigned int idx = + (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters _kernel.setArg(idx, config.scale); // Configure kernel window Window win = calculate_max_window(*input->info(), Steps()); - if(output != nullptr) + if (output != nullptr) { // Output auto inizialitation if not yet initialized auto_init_if_empty(*output->info(), *input->info()->clone()); @@ -130,12 +135,11 @@ void CLFFTScaleKernel::run(const Window &window, cl::CommandQueue &queue) { unsigned int idx = 0; add_3D_tensor_argument(idx, _input, slice); - if(!_run_in_place) + if (!_run_in_place) { add_3D_tensor_argument(idx, _output, slice); } enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLFFTScaleKernel.h b/src/core/CL/kernels/CLFFTScaleKernel.h index cc518be193941da5006d98ee8687ffc555164f20..b995282e029d110c728084bd50b3f37b8e395d49 100644 --- a/src/core/CL/kernels/CLFFTScaleKernel.h +++ b/src/core/CL/kernels/CLFFTScaleKernel.h @@ -24,10 +24,10 @@ #ifndef ARM_COMPUTE_CLFFTSCALEKERNEL_H #define ARM_COMPUTE_CLFFTSCALEKERNEL_H -#include "src/core/CL/ICLKernel.h" - #include "arm_compute/core/KernelDescriptors.h" +#include "src/core/CL/ICLKernel.h" + namespace arm_compute { // Forward declarations @@ -63,7 +63,10 @@ public: * @param[out] output Destination tensor. Data type supported: same as @p input * @param[in] config Kernel configuration */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const FFTScaleKernelInfo &config); /** Static function to check if given info will lead to a valid configuration of @ref CLFFTScaleKernel * * @param[in] input Source tensor info. Data types supported: F16/F32. 
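For reference, a minimal sketch of how the configure()/validate() entry points being reformatted above are typically driven, using CLFFTScaleKernel as the example. This is illustrative only and not part of the patch: the helper function and tensor are hypothetical, the kernel is an internal class (its header sits under src/core/CL/kernels/), and the static validate(input, output, config) signature and the FFTScaleKernelInfo fields (scale, conjugate) are assumed from the hunks shown above.

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

#include "src/core/CL/kernels/CLFFTScaleKernel.h"

using namespace arm_compute;

// Scale (and conjugate) a complex spectrum in place, e.g. to normalise an N-point inverse FFT.
// Assumes CLScheduler::get().default_init() has been called and that 'spectrum' is an allocated
// 2-channel (complex) F32 tensor, as required by validate_arguments() above.
void scale_spectrum_in_place(CLTensor &spectrum, unsigned int fft_size)
{
    FFTScaleKernelInfo config{};
    config.scale     = 1.0f / static_cast<float>(fft_size); // passed to the kernel via setArg()
    config.conjugate = true;                                 // adds -DCONJ, per the hunk above

    // Passing nullptr as the output selects the in-place (-DIN_PLACE) path.
    ARM_COMPUTE_ERROR_THROW_ON(CLFFTScaleKernel::validate(spectrum.info(), nullptr, config));

    CLFFTScaleKernel kernel;
    kernel.configure(CLKernelLibrary::get().get_compile_context(), &spectrum, nullptr, config);
    CLScheduler::get().enqueue(kernel);
}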
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp index fcd99a4ed9180026b1421374aaea8f67fec721d3..86bb502da385448db8f5d11b9c5273603b9f67b5 100644 --- a/src/core/CL/kernels/CLFillBorderKernel.cpp +++ b/src/core/CL/kernels/CLFillBorderKernel.cpp @@ -31,14 +31,14 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/Validate.h" + #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" #include "support/StringSupport.h" namespace arm_compute { -CLFillBorderKernel::CLFillBorderKernel() - : ICLKernel(), _tensor(nullptr) +CLFillBorderKernel::CLFillBorderKernel() : ICLKernel(), _tensor(nullptr) { _type = CLKernelType::ELEMENTWISE; } @@ -56,27 +56,38 @@ void CLFillBorderKernel::set_constant_border(unsigned int idx, const PixelValue ICLKernel::add_argument(idx, static_cast(value)); } -void CLFillBorderKernel::configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value) +void CLFillBorderKernel::configure(ICLTensor *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value) { configure(CLKernelLibrary::get().get_compile_context(), tensor, border_size, border_mode, constant_border_value); } -void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value) +void CLFillBorderKernel::configure(const CLCompileContext &compile_context, + ICLTensor *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value) { _tensor = tensor; configure(compile_context, tensor->info(), border_size, border_mode, constant_border_value); } -void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value) +void CLFillBorderKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value) { ARM_COMPUTE_ERROR_ON(tensor == nullptr); ARM_COMPUTE_ERROR_ON(tensor->num_channels() != 1); - auto padding_info = get_padding_info({ tensor }); + auto padding_info = get_padding_info({tensor}); border_size.limit(tensor->padding()); // If there is no border: early exit - if(border_size.empty() || border_mode == BorderMode::UNDEFINED) + if (border_size.empty() || border_mode == BorderMode::UNDEFINED) { return; } @@ -98,25 +109,22 @@ void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ITen _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Create static kernel arguments - const unsigned int valid_width = tensor->valid_region().shape[0]; - const unsigned int valid_height = tensor->valid_region().shape[1]; - const cl_int2 valid_region_coords = - { - { - static_cast(tensor->valid_region().anchor[0]), - static_cast(tensor->valid_region().anchor[1]), - } - }; - const unsigned int total_valid_width = border_size.left + valid_width + border_size.right; + const unsigned int valid_width = tensor->valid_region().shape[0]; + const unsigned int valid_height = tensor->valid_region().shape[1]; + const cl_int2 valid_region_coords = {{ + static_cast(tensor->valid_region().anchor[0]), + static_cast(tensor->valid_region().anchor[1]), + }}; + const unsigned int total_valid_width = border_size.left + 
valid_width + border_size.right; // Set static kernel arguments unsigned int idx = num_arguments_per_3D_tensor(); //Skip the tensor parameters ICLKernel::add_argument(idx, valid_width); ICLKernel::add_argument(idx, valid_height); ICLKernel::add_argument(idx, valid_region_coords); - if(BorderMode::CONSTANT == border_mode) + if (BorderMode::CONSTANT == border_mode) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::QASYMM8: @@ -175,12 +183,13 @@ void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ITen void CLFillBorderKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) { // Border mode undefined or border width == 0 - if(_kernel() == nullptr) + if (_kernel() == nullptr) { return; } - const auto tensor = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto tensor = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); @@ -193,14 +202,13 @@ void CLFillBorderKernel::run_op(ITensorPack &tensors, const Window &window, cl:: unsigned int idx = 0; add_3D_tensor_argument(idx, tensor, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue) { // Border mode undefined or border width == 0 - if(_kernel() == nullptr) + if (_kernel() == nullptr) { return; } @@ -216,7 +224,6 @@ void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue) unsigned int idx = 0; add_3D_tensor_argument(idx, _tensor, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLFillBorderKernel.h b/src/core/CL/kernels/CLFillBorderKernel.h index 7951f48171be1e1f32be89847ff52a7bea63d199..5782143cf92dd28c461e60c99141ae1d3f0a1e62 100644 --- a/src/core/CL/kernels/CLFillBorderKernel.h +++ b/src/core/CL/kernels/CLFillBorderKernel.h @@ -26,6 +26,7 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -57,7 +58,11 @@ public: * @param[in] border_mode Border mode to use for the convolution. * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ - void configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); + void configure(const CLCompileContext &compile_context, + ICLTensor *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value = PixelValue()); /** Initialise the kernel's input, output and border mode. * * @param[in,out] tensor Tensor to process Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32. @@ -65,7 +70,10 @@ public: * @param[in] border_mode Border mode to use for the convolution. * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. 
*/ - void configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); + void configure(ICLTensor *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value = PixelValue()); /** Initialise the kernel's input, output and border mode. * * @param[in] compile_context The compile context to be used. @@ -74,7 +82,11 @@ public: * @param[in] border_mode Border mode to use for the convolution. * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); + void configure(const CLCompileContext &compile_context, + ITensorInfo *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value = PixelValue()); /** Function to set the constant value on fill border kernel depending on type. * diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp index 68fe324df6c3dfcdd4478619b4360054e21ac382..7da0679ae406cfa749ccd3283c2ad26b89bdbbdf 100644 --- a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp +++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp @@ -30,20 +30,26 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/StringSupport.h" namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +Status validate_arguments(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias, + const ITensorInfo *bn_beta, + const ITensorInfo *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { ARM_COMPUTE_UNUSED(epsilon); ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var); @@ -54,43 +60,44 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b ARM_COMPUTE_RETURN_ERROR_ON(input_bias == nullptr && fused_bias == nullptr); ARM_COMPUTE_RETURN_ERROR_ON(bn_mean->num_dimensions() > 1); - if(fbn_type == FuseBatchNormalizationType::CONVOLUTION) + if (fbn_type == FuseBatchNormalizationType::CONVOLUTION) { ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(3) != bn_mean->dimension(0)); } else { - const size_t channel_idx = get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL); + const size_t channel_idx = + get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(channel_idx) != bn_mean->dimension(0)); } // Validate bias - if(input_bias != nullptr) + if (input_bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, input_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, input_bias); } // 
Validate beta - if(bn_beta != nullptr) + if (bn_beta != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_beta); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_beta); } // Validate gamma - if(bn_gamma != nullptr) + if (bn_gamma != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_gamma); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_gamma); } // Validate output weights - if(fused_weights != nullptr && fused_weights->total_size() != 0) + if (fused_weights != nullptr && fused_weights->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_weights, fused_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input_weights, fused_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_weights); } // Validate output bias - if(fused_bias != nullptr && fused_bias->total_size() != 0) + if (fused_bias != nullptr && fused_bias->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, fused_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_bias); @@ -101,28 +108,52 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b } // namespace CLFuseBatchNormalizationKernel::CLFuseBatchNormalizationKernel() - : _input_weights(nullptr), _input_bias(nullptr), _bn_mean(nullptr), _bn_var(nullptr), _bn_gamma(nullptr), _bn_beta(nullptr), _fused_weights(nullptr), _fused_bias(nullptr), _epsilon(), - _run_in_place_weights(false), _run_in_place_bias(false) + : _input_weights(nullptr), + _input_bias(nullptr), + _bn_mean(nullptr), + _bn_var(nullptr), + _bn_gamma(nullptr), + _bn_beta(nullptr), + _fused_weights(nullptr), + _fused_bias(nullptr), + _epsilon(), + _run_in_place_weights(false), + _run_in_place_bias(false) { _type = CLKernelType::ELEMENTWISE; } -void CLFuseBatchNormalizationKernel::configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, - ICLTensor *fused_weights, ICLTensor *fused_bias, - const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +void CLFuseBatchNormalizationKernel::configure(const ICLTensor *input_weights, + const ICLTensor *bn_mean, + const ICLTensor *bn_var, + ICLTensor *fused_weights, + ICLTensor *fused_bias, + const ICLTensor *input_bias, + const ICLTensor *bn_beta, + const ICLTensor *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, + input_bias, bn_beta, bn_gamma, epsilon, fbn_type); } -void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, - ICLTensor *fused_weights, ICLTensor *fused_bias, - const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input_weights, + const ICLTensor *bn_mean, + const ICLTensor *bn_var, + ICLTensor *fused_weights, + ICLTensor *fused_bias, + const ICLTensor *input_bias, + const ICLTensor *bn_beta, + const ICLTensor *bn_gamma, + float epsilon, 
+ FuseBatchNormalizationType fbn_type) { ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var); - auto padding_info = get_padding_info({ input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma }); + auto padding_info = + get_padding_info({input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma}); _input_weights = input_weights; _input_bias = input_bias; @@ -135,28 +166,28 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c _epsilon = epsilon; _run_in_place_weights = (fused_weights == nullptr) || (fused_weights == input_weights); - _run_in_place_bias = (input_bias != nullptr && fused_bias == nullptr) || (input_bias != nullptr && fused_bias == input_bias); + _run_in_place_bias = + (input_bias != nullptr && fused_bias == nullptr) || (input_bias != nullptr && fused_bias == input_bias); // Auto initialize outputs - if(_fused_weights != nullptr) + if (_fused_weights != nullptr) { // Output tensor auto initialization if not yet initialized auto_init_if_empty(*_fused_weights->info(), *_input_weights->info()->clone()); } - if(_fused_bias != nullptr) + if (_fused_bias != nullptr) { // Output tensor auto initialization if not yet initialized auto_init_if_empty(*_fused_bias->info(), *_bn_mean->info()->clone()); } // Validate arguments - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_weights->info(), bn_mean->info(), bn_var->info(), - (fused_weights != nullptr) ? fused_weights->info() : nullptr, - (fused_bias != nullptr) ? fused_bias->info() : nullptr, - (input_bias != nullptr) ? input_bias->info() : nullptr, - (bn_beta != nullptr) ? bn_beta->info() : nullptr, - (bn_gamma != nullptr) ? bn_gamma->info() : nullptr, - epsilon, fbn_type)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments( + input_weights->info(), bn_mean->info(), bn_var->info(), + (fused_weights != nullptr) ? fused_weights->info() : nullptr, + (fused_bias != nullptr) ? fused_bias->info() : nullptr, (input_bias != nullptr) ? input_bias->info() : nullptr, + (bn_beta != nullptr) ? bn_beta->info() : nullptr, (bn_gamma != nullptr) ? 
bn_gamma->info() : nullptr, epsilon, + fbn_type)); // Configure kernel window Window win = calculate_max_window(*input_weights->info()); @@ -165,7 +196,8 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c // Set build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input_weights->info()->data_type())); - build_opts.add_option_if(fbn_type == FuseBatchNormalizationType::CONVOLUTION, "-DDIM2=" + support::cpp11::to_string(input_weights->info()->dimension(2))); + build_opts.add_option_if(fbn_type == FuseBatchNormalizationType::CONVOLUTION, + "-DDIM2=" + support::cpp11::to_string(input_weights->info()->dimension(2))); build_opts.add_option("-DEPSILON=" + float_to_string_with_full_precision(epsilon)); build_opts.add_option_if(_input_weights->info()->data_layout() == DataLayout::NHWC, "-DNHWC"); build_opts.add_option_if(_run_in_place_weights, "-DIN_PLACE_W"); @@ -180,12 +212,19 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +Status CLFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias, + const ITensorInfo *bn_beta, + const ITensorInfo *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias, + input_bias, bn_beta, bn_gamma, epsilon, fbn_type)); return Status{}; } @@ -202,25 +241,25 @@ void CLFuseBatchNormalizationKernel::run(const arm_compute::Window &window, cl:: // Add kernel arguments unsigned int idx = 0; add_3D_tensor_argument(idx, _input_weights, slice_3d); - if(_input_bias != nullptr) + if (_input_bias != nullptr) { add_1D_tensor_argument(idx, _input_bias, slice_1d); } add_1D_tensor_argument(idx, _bn_mean, slice_1d); add_1D_tensor_argument(idx, _bn_var, slice_1d); - if(!_run_in_place_weights) + if (!_run_in_place_weights) { add_3D_tensor_argument(idx, _fused_weights, slice_3d); } - if(!_run_in_place_bias) + if (!_run_in_place_bias) { add_1D_tensor_argument(idx, _fused_bias, slice_1d); } - if(_bn_beta != nullptr) + if (_bn_beta != nullptr) { add_1D_tensor_argument(idx, _bn_beta, slice_1d); } - if(_bn_gamma != nullptr) + if (_bn_gamma != nullptr) { add_1D_tensor_argument(idx, _bn_gamma, slice_1d); } diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h index 78b1e74cab242e38e6dbe70abbef312410bbce89..76ec7a759ffee197cced6e941459329bd257f0a7 100644 --- a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h +++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h @@ -62,9 +62,16 @@ public: * @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f. * @param[in] fbn_type (Optional) Fused batch normalization type. 
Defaults to CONVOLUTION. */ - void configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias, - const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr, - float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); + void configure(const ICLTensor *input_weights, + const ICLTensor *bn_mean, + const ICLTensor *bn_var, + ICLTensor *fused_weights, + ICLTensor *fused_bias, + const ICLTensor *input_bias = nullptr, + const ICLTensor *bn_beta = nullptr, + const ICLTensor *bn_gamma = nullptr, + float epsilon = 0.001f, + FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); /** Set the source, destination of the kernel * * @param[in] compile_context The compile context to be used. @@ -81,9 +88,17 @@ public: * @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f. * @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to CONVOLUTION. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias, - const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr, - float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input_weights, + const ICLTensor *bn_mean, + const ICLTensor *bn_var, + ICLTensor *fused_weights, + ICLTensor *fused_bias, + const ICLTensor *input_bias = nullptr, + const ICLTensor *bn_beta = nullptr, + const ICLTensor *bn_gamma = nullptr, + float epsilon = 0.001f, + FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); /** Static function to check if given info will lead to a valid configuration of @ref CLFuseBatchNormalizationKernel * * @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC @@ -101,10 +116,16 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr, - float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); + static Status validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias = nullptr, + const ITensorInfo *bn_beta = nullptr, + const ITensorInfo *bn_gamma = nullptr, + float epsilon = 0.001f, + FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLGatherKernel.cpp b/src/core/CL/kernels/CLGatherKernel.cpp index 5495023b806a90ecadc0f132e650592ae96a7fde..c11a18940adf960d61383fd122fb8c32e7cbd8bf 100644 --- a/src/core/CL/kernels/CLGatherKernel.cpp +++ b/src/core/CL/kernels/CLGatherKernel.cpp @@ -22,8 +22,10 @@ * SOFTWARE. 
*/ #include "src/core/CL/kernels/CLGatherKernel.h" + #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" @@ -34,7 +36,8 @@ namespace arm_compute { namespace { -inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis) +inline Status +validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output); const uint32_t actual_axis = wrap_around(axis, static_cast(input->num_dimensions())); @@ -43,11 +46,12 @@ inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *in ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); - TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis); + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape( + input->tensor_shape(), indices->tensor_shape(), actual_axis); ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); } @@ -56,12 +60,14 @@ inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *in return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices, ITensorInfo *output, int axis) +std::pair +validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices, ITensorInfo *output, int axis) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); const uint32_t actual_axis = wrap_around(axis, static_cast(input->num_dimensions())); // Output auto initialization if not yet initialized - TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis); + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape( + input->tensor_shape(), indices->tensor_shape(), actual_axis); auto_init_if_empty((*output), output_shape, 1, input->data_type()); // Create window @@ -72,8 +78,7 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen } // namespace -CLGatherKernel::CLGatherKernel() - : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0) +CLGatherKernel::CLGatherKernel() : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0) { _type = CLKernelType::ELEMENTWISE; } @@ -83,10 +88,14 @@ void CLGatherKernel::configure(const ICLTensor *input, const ICLTensor *indices, configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, axis); } -void CLGatherKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis) +void CLGatherKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *indices, + ICLTensor *output, + int axis) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); - auto padding_info = get_padding_info({ input, output, indices }); + auto padding_info = get_padding_info({input, output, indices}); 
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), indices->info(), output->info(), axis)); // Configure kernel window @@ -100,7 +109,8 @@ void CLGatherKernel::configure(const CLCompileContext &compile_context, const IC // Set build options CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); + build_opts.add_option("-DDATA_TYPE=" + + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); build_opts.add_option("-DOUTPUT_DIM_Z=" + support::cpp11::to_string(output->info()->dimension(2))); build_opts.add_option("-DINDICES_DIM_Z=" + support::cpp11::to_string(indices->info()->dimension(2))); build_opts.add_option("-DINPUT_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2))); @@ -114,10 +124,12 @@ void CLGatherKernel::configure(const CLCompileContext &compile_context, const IC ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis) +Status +CLGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), indices->clone().get(), output->clone().get(), axis).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), indices->clone().get(), output->clone().get(), axis).first); return Status{}; } diff --git a/src/core/CL/kernels/CLGatherKernel.h b/src/core/CL/kernels/CLGatherKernel.h index 8f472a4696ef82dbc630962b6518272d06f36526..db4b49d2f5fdab31f5de67260f9f62042bf4edfa 100644 --- a/src/core/CL/kernels/CLGatherKernel.h +++ b/src/core/CL/kernels/CLGatherKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLGATHERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -63,7 +64,11 @@ public: * @param[out] output Destination tensor. Data type supported: Same as @p input * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative values wrap around. 
Defaults to 0 */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *indices, + ICLTensor *output, + int axis = 0); /** Static function to check if given info will lead to a valid configuration of @ref CLGatherKernel * @@ -74,7 +79,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0); + static Status + validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp index 088c454f3c4b1e6974a99c4f199686e740708410..b9ff72b9289df09102f2b28767c54c3d807d2c07 100644 --- a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp +++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -47,7 +48,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc ARM_COMPUTE_RETURN_ERROR_ON(anchors->dimension(0) != info.values_per_roi()); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(anchors, DataType::QSYMM16, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(anchors->num_dimensions() > 2); - if(all_anchors->total_size() > 0) + if (all_anchors->total_size() > 0) { size_t feature_height = info.feat_height(); size_t feature_width = info.feat_width(); @@ -57,7 +58,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(0) != info.values_per_roi()); ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(1) != feature_height * feature_width * num_anchors); - if(is_data_type_quantized(anchors->data_type())) + if (is_data_type_quantized(anchors->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(anchors, all_anchors); } @@ -66,21 +67,25 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc } } // namespace -CLComputeAllAnchorsKernel::CLComputeAllAnchorsKernel() - : _anchors(nullptr), _all_anchors(nullptr) +CLComputeAllAnchorsKernel::CLComputeAllAnchorsKernel() : _anchors(nullptr), _all_anchors(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void CLComputeAllAnchorsKernel::configure(const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info) +void CLComputeAllAnchorsKernel::configure(const ICLTensor *anchors, + ICLTensor *all_anchors, + const ComputeAnchorsInfo &info) { configure(CLKernelLibrary::get().get_compile_context(), anchors, all_anchors, info); } -void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_context, const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info) +void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *anchors, + ICLTensor *all_anchors, + const ComputeAnchorsInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(anchors, all_anchors); - auto padding_info = get_padding_info({ anchors, all_anchors }); + 
auto padding_info = get_padding_info({anchors, all_anchors}); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(anchors->info(), all_anchors->info(), info)); // Metadata @@ -91,7 +96,8 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex // Initialize the output if empty const TensorShape output_shape(info.values_per_roi(), width * height * num_anchors); - auto_init_if_empty(*all_anchors->info(), TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info())); + auto_init_if_empty(*all_anchors->info(), + TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info())); // Set instance variables _anchors = anchors; @@ -108,7 +114,7 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex build_opts.add_option("-DNUM_ANCHORS=" + support::cpp11::to_string(num_anchors)); build_opts.add_option("-DNUM_ROI_FIELDS=" + support::cpp11::to_string(info.values_per_roi())); - if(is_quantized) + if (is_quantized) { const UniformQuantizationInfo qinfo = anchors->info()->quantization_info().uniform(); build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(qinfo.scale)); @@ -116,8 +122,9 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex } // Create kernel - const std::string kernel_name = (is_quantized) ? "generate_proposals_compute_all_anchors_quantized" : "generate_proposals_compute_all_anchors"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + const std::string kernel_name = + (is_quantized) ? "generate_proposals_compute_all_anchors_quantized" : "generate_proposals_compute_all_anchors"; + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // The tensor all_anchors can be interpreted as an array of structs (each structs has values_per_roi fields). 
// This means we don't need to pad on the X dimension, as we know in advance how many fields @@ -127,7 +134,9 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info) +Status CLComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, + const ITensorInfo *all_anchors, + const ComputeAnchorsInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(anchors, all_anchors, info)); return Status{}; diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h index d26795ac7d9b9abaea5bd4b60881e1b6d7431cc6..e08f281d6c7cc8cd3ceb4877e4f2161dda79dba4 100644 --- a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h +++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h @@ -62,7 +62,10 @@ public: * @param[in] info Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo * */ - void configure(const CLCompileContext &compile_context, const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *anchors, + ICLTensor *all_anchors, + const ComputeAnchorsInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLComputeAllAnchorsKernel * @@ -81,5 +84,5 @@ private: const ICLTensor *_anchors; ICLTensor *_all_anchors; }; -} // arm_compute +} // namespace arm_compute #endif // ARM_COMPUTE_CLGENERATEPROSPOSALSLAYERKERNEL_H diff --git a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp index 7ed323c95087228aef45e8d5da651ab7251b481d..b13eb1655654678d1906a6eefcb3f8ef420a510b 100644 --- a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp +++ b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -39,17 +40,20 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const InstanceNormalizationLayerKernelInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.epsilon == 0.f, "Epsilon must be different than 0"); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); - if(output != nullptr && output->total_size() != 0) + if (output != nullptr && output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), "Input and output have different number of channels"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), + "Input and output have different number of channels"); } return Status{}; @@ -59,27 +63,30 @@ Status validate_arguments_meanvar(const ITensorInfo *input, const ITensorInfo *o { 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); - if(output != nullptr && output->total_size() != 0) + if (output != nullptr && output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), "Input and output have different number of channels"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), + "Input and output have different number of channels"); } return Status{}; } } // namespace -CLComputeMeanVariance::CLComputeMeanVariance() - : _input(nullptr), _output(nullptr) +CLComputeMeanVariance::CLComputeMeanVariance() : _input(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void CLComputeMeanVariance::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, bool use_mixed_precision) +void CLComputeMeanVariance::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + bool use_mixed_precision) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output == nullptr ? input : output; @@ -88,7 +95,8 @@ void CLComputeMeanVariance::configure(const CLCompileContext &compile_context, I const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size(); CLBuildOptions build_opts; - build_opts.add_option("-DINTERNAL_DATA_TYPE=" + (use_mixed_precision ? "float" : get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.add_option("-DINTERNAL_DATA_TYPE=" + + (use_mixed_precision ? 
"float" : get_cl_type_from_data_type(input->info()->data_type()))); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); build_opts.add_option("-DDIM_X=" + support::cpp11::to_string(input->info()->dimension(0))); @@ -108,7 +116,7 @@ void CLComputeMeanVariance::configure(const CLCompileContext &compile_context, I const TensorShape out_shape(input_channel, 2u, input_batches); // Output auto initialization if not yet initialized - if(use_mixed_precision) + if (use_mixed_precision) { auto_init_if_empty(*_output->info(), out_shape, 1, DataType::F32); } @@ -134,7 +142,7 @@ void CLComputeMeanVariance::run(const Window &window, cl::CommandQueue &queue) Window collapsed_window = window.collapse(window, Window::DimZ); // We will process the planes together - if(_input->info()->data_layout() == DataLayout::NCHW) + if (_input->info()->data_layout() == DataLayout::NCHW) { collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1)); collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1)); @@ -157,10 +165,14 @@ CLInstanceNormalizationLayerKernel::CLInstanceNormalizationLayerKernel() _type = CLKernelType::ELEMENTWISE; } -void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *mean_var, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info) +void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *mean_var, + ICLTensor *output, + const InstanceNormalizationLayerKernelInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output == nullptr ? input : output; @@ -172,7 +184,9 @@ void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compi CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.add_option("-DINTERNAL_DATA_TYPE=" + (info.use_mixed_precision ? "float" : get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.add_option("-DINTERNAL_DATA_TYPE=" + (info.use_mixed_precision + ? 
"float" + : get_cl_type_from_data_type(input->info()->data_type()))); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); build_opts.add_option("-DDIM_X=" + support::cpp11::to_string(input->info()->dimension(0))); build_opts.add_option("-DDIM_Y=" + support::cpp11::to_string(input->info()->dimension(1))); @@ -188,7 +202,7 @@ void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compi // Configure kernel window Window win = calculate_max_window(*input->info(), Steps(1)); - if(output != nullptr) + if (output != nullptr) { auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type()); } @@ -197,7 +211,9 @@ void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compi ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info) +Status CLInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const InstanceNormalizationLayerKernelInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info)); return Status{}; @@ -211,7 +227,7 @@ void CLInstanceNormalizationLayerKernel::run(const Window &window, cl::CommandQu Window collapsed_window = window.collapse(window, Window::DimZ); // We will process the planes together - if(_input->info()->data_layout() == DataLayout::NCHW) + if (_input->info()->data_layout() == DataLayout::NCHW) { collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1)); collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1)); @@ -226,7 +242,7 @@ void CLInstanceNormalizationLayerKernel::run(const Window &window, cl::CommandQu add_4D_tensor_argument(idx, _input, collapsed_window); add_3D_tensor_argument(idx, _mean, collapsed_window); - if(!_run_in_place) + if (!_run_in_place) { add_4D_tensor_argument(idx, _output, collapsed_window); } diff --git a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h index 2f9014a651d88928b4a44379d295bb20a1591843..9f436da7f6bfecac22fdcaf7d803378e040d8234 100644 --- a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h +++ b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h @@ -24,10 +24,10 @@ #ifndef ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H #define ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H -#include "src/core/CL/ICLKernel.h" - #include "arm_compute/core/KernelDescriptors.h" +#include "src/core/CL/ICLKernel.h" + namespace arm_compute { // Forward declarations @@ -59,7 +59,11 @@ public: * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input. * @param[in] info Kernel meta-data descriptor */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *mean_var, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *mean_var, + ICLTensor *output, + const InstanceNormalizationLayerKernelInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLInstanceNormalizationLayer. 
* @@ -69,7 +73,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; @@ -106,7 +111,8 @@ public: * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input. * @param[in] use_mixed_precision Use mixed precision in case of FP16 execution */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, bool use_mixed_precision); + void + configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, bool use_mixed_precision); /** Static function to check if given info will lead to a valid configuration of @ref CLInstanceNormalizationLayer. * diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp index 542d380e4a8d674ebf45a44887df2f898bd474db..9ed9d7c5b00fd28d42208ecce144ff8d17aba305 100644 --- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp +++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp @@ -31,10 +31,10 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/StringSupport.h" namespace arm_compute @@ -43,7 +43,8 @@ namespace { constexpr int max_input_tensor_dim = 3; -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) +Status +validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) { ARM_COMPUTE_UNUSED(epsilon); @@ -53,14 +54,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, cons ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis > 2, "Actual axis greater than 2 is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions, "Actual normalization axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions, + "Actual normalization axis greater than max number of dimensions"); // Reduce shape on axis TensorShape sum_shape = input->tensor_shape(); sum_shape.set(actual_axis, 1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(sum->tensor_shape(), sum_shape); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -78,16 +80,22 @@ CLL2NormalizeLayerKernel::CLL2NormalizeLayerKernel() _type = CLKernelType::ELEMENTWISE; } -void CLL2NormalizeLayerKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon) +void CLL2NormalizeLayerKernel::configure( + const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon) { configure(CLKernelLibrary::get().get_compile_context(), input, sum, output, axis, epsilon); } -void CLL2NormalizeLayerKernel::configure(const 
CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon) +void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *sum, + ICLTensor *output, + int axis, + float epsilon) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), sum->info(), output->info(), axis, epsilon)); - auto padding_info = get_padding_info({ input, sum, output }); + auto padding_info = get_padding_info({input, sum, output}); _input = input; _sum = sum; @@ -95,8 +103,9 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context _actual_axis = wrap_around(axis, max_input_tensor_dim); _epsilon = epsilon; - const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0)); - const int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x; + const unsigned int vec_size_x = + adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0)); + const int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x; // Set build options CLBuildOptions build_opts; @@ -107,7 +116,7 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context // Create kernel std::string kernel_name; unsigned int idx = 0; - switch(_actual_axis) + switch (_actual_axis) { case 0: kernel_name = "l2_normalize_x"; @@ -127,7 +136,7 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Set epsilon argument - if(input->info()->data_type() == DataType::F32) + if (input->info()->data_type() == DataType::F32) { _kernel.setArg(idx, _epsilon); } @@ -146,7 +155,8 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLL2NormalizeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) +Status CLL2NormalizeLayerKernel::validate( + const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, sum, output, axis, epsilon)); return Status{}; @@ -159,7 +169,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue Window window_sum(window); - switch(_actual_axis) + switch (_actual_axis) { case 0: { @@ -173,8 +183,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue add_2D_tensor_argument(idx, _sum, sum_slice); add_2D_tensor_argument(idx, _output, in_slice); enqueue(queue, *this, in_slice, lws_hint()); - } - while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice)); + } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice)); } break; case 1: @@ -189,8 +198,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue add_2D_tensor_argument(idx, _sum, sum_slice); add_2D_tensor_argument(idx, _output, in_slice); enqueue(queue, *this, in_slice, lws_hint()); - } - while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice)); + } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice)); } break; case 2: @@ -205,8 +213,7 @@ void 
CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue add_3D_tensor_argument(idx, _sum, sum_slice); add_3D_tensor_argument(idx, _output, in_slice); enqueue(queue, *this, in_slice, lws_hint()); - } - while(window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(sum_slice)); + } while (window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(sum_slice)); } break; default: diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.h b/src/core/CL/kernels/CLL2NormalizeLayerKernel.h index edc0585217a3139136ecf7b68c57cc4fc316aa61..5c9ab94ce55d8dea426a764aa14848cfcd2e5d23 100644 --- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.h +++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLL2NORMALIZELAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -70,7 +71,12 @@ public: * @param[in] axis Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis : 2 * @param[in] epsilon Lower bound value for the normalization. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *sum, + ICLTensor *output, + int axis, + float epsilon); /** Static function to check if given info will lead to a valid configuration of @ref CLL2NormalizeLayerKernel. * @@ -84,7 +90,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon); + static Status + validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp index dc9d68626db1c3ab5b8ea06f9393afe0b3908256..e560f1de4a6409b984864ae19714d346f604ab0f 100644 --- a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp +++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -42,26 +43,31 @@ using namespace misc::shape_calculator; namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, indices); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, indices); - int pool_stride_x = 0; - int pool_stride_y = 0; - PoolingType pool_type = 
pool_info.pool_type; - const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; + int pool_stride_x = 0; + int pool_stride_y = 0; + PoolingType pool_type = pool_info.pool_type; + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); - const int pool_size_x = pool_info.pool_size.width; - const int pool_size_y = pool_info.pool_size.height; + const int pool_size_x = pool_info.pool_size.width; + const int pool_size_y = pool_info.pool_size.height; const Size2D pool_size(pool_size_x, pool_size_y); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, + "Pooling indices only supported for MAX pooling method"); ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2"); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -71,17 +77,20 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c } } // namespace -CLMaxUnpoolingLayerKernel::CLMaxUnpoolingLayerKernel() - : _input(nullptr), _output(nullptr), _indices(nullptr) +CLMaxUnpoolingLayerKernel::CLMaxUnpoolingLayerKernel() : _input(nullptr), _output(nullptr), _indices(nullptr) { _type = CLKernelType::POOL; } -void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info) +void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *indices, + ICLTensor *output, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, indices->info())); - auto padding_info = get_padding_info({ input, indices, output }); + auto padding_info = get_padding_info({input, indices, output}); _input = input; _output = output; @@ -119,7 +128,10 @@ void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_contex ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLMaxUnpoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info) +Status CLMaxUnpoolingLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *indices, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, indices)); @@ -140,7 +152,6 @@ void CLMaxUnpoolingLayerKernel::run(const Window &window, cl::CommandQueue &queu add_3D_tensor_argument(idx, _output, slice); add_3D_tensor_argument(idx, _indices, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h index 45481d0507aa8f7db50c3faa3730bab744d16d60..eb18a467846fe0adc608d005540b493e8ed3f658 100644 --- a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h +++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h @@ -59,7 +59,11 
@@ public: * @param[out] output Destination tensor. Data types supported: Same as @p input. * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *indices, + ICLTensor *output, + const PoolingLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration of @ref CLMaxUnpoolingLayerKernel * * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. @@ -72,7 +76,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *indices, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info); // Inherited methods overridden void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp index ac33468ad8e068eb57529a282226df4307667c9b..8632bdf623286d660c87573c1b2e565fda056b8f 100644 --- a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp +++ b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -49,7 +50,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -69,15 +70,19 @@ void CLMeanStdDevNormalizationKernel::configure(ICLTensor *input, ICLTensor *out configure(CLKernelLibrary::get().get_compile_context(), input, output, epsilon); } -void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float epsilon) +void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + float epsilon) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); _run_in_place = (output == nullptr) || (output == input); - ARM_COMPUTE_ERROR_THROW_ON(CLMeanStdDevNormalizationKernel::validate(input->info(), (output != nullptr) ? output->info() : nullptr, epsilon)); + ARM_COMPUTE_ERROR_THROW_ON(CLMeanStdDevNormalizationKernel::validate( + input->info(), (output != nullptr) ? 
output->info() : nullptr, epsilon)); - if(output != nullptr) + if (output != nullptr) { auto_init_if_empty(*output->info(), *input->info()); } @@ -85,7 +90,8 @@ void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_ _input = input; _output = output; - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0)); + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0)); // Set build options CLBuildOptions build_opts; @@ -134,7 +140,6 @@ void CLMeanStdDevNormalizationKernel::run(const Window &window, cl::CommandQueue add_2D_tensor_argument_if((!_run_in_place), idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); + } while (window.slide_window_slice_2D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h index a1ba2b905ea9fc5b5fd8cbb49ece540907dfaf2e..e02a3c58a3da45e11e0e0c497f98aabb3873b9d5 100644 --- a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h +++ b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h @@ -66,7 +66,10 @@ public: * @param[out] output (Optional) Destination tensor. It can be nullptr in case of in-place computation. Data type supported: same as @p input * @param[in] epsilon (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8. */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output = nullptr, float epsilon = 1e-8f); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output = nullptr, + float epsilon = 1e-8f); /** Static function to check if given info will lead to a valid configuration of @ref CLMeanStdDevNormalizationKernel * * @param[in] input Source tensor info with 2 dimensions. 
In case of @p output tensor info = nullptr, diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp index c6c4229c0062123711b329624ad99b298c6fc4e2..b636c485e7237a3570826159b8dfa1124d23b0fd 100644 --- a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp +++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp @@ -32,6 +32,7 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/Window.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -53,7 +54,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, N ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd"); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -63,7 +64,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, N return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, NormalizationLayerInfo norm_info) +std::pair +validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, NormalizationLayerInfo norm_info) { // Output tensor auto initialization if not yet initialized auto_init_if_empty(*output, *input->clone()); @@ -71,9 +73,10 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen bool window_changed = false; Window win; const DataLayout data_layout = input->data_layout(); - if(data_layout == DataLayout::NCHW) + if (data_layout == DataLayout::NCHW) { - const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0)); + const unsigned int vec_size_x = + adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0)); const unsigned int norm_idx = get_normalization_dimension_index(input->data_layout(), norm_info); const bool is_norm_across_width = norm_idx == 0; @@ -87,15 +90,16 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen // The output has 1 right padding because of the vec_size_x. // The input has 1 left padding because radius = 1. // The input has 2 right padding because of radius = 1 AND because of the extra output padding - const unsigned int border_width_left = is_norm_across_width ? norm_radius : 0; - const unsigned int border_width_right = is_norm_across_width ? norm_radius + (vec_size_x - input->dimension(0) % vec_size_x) : 0; - const BorderSize border_size = BorderSize(0, border_width_right, 0, border_width_left); + const unsigned int border_width_left = is_norm_across_width ? norm_radius : 0; + const unsigned int border_width_right = + is_norm_across_width ? 
norm_radius + (vec_size_x - input->dimension(0) % vec_size_x) : 0; + const BorderSize border_size = BorderSize(0, border_width_right, 0, border_width_left); win = calculate_max_window(*input, Steps(vec_size_x)); // We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside the kernel, avoiding padding // Reads can occur within the valid region of the input - if(is_norm_across_width) + if (is_norm_across_width) { AccessWindowStatic input_access(input, -border_size.left, 0, input->dimension(0) + border_size.right, 0); window_changed = window_changed || update_window_and_padding(win, input_access); @@ -112,13 +116,14 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen else { unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0)); - if(norm_info.is_cross_map()) + if (norm_info.is_cross_map()) { vec_size_x = 1; } win = calculate_max_window(*input, Steps(vec_size_x)); } - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace @@ -139,10 +144,13 @@ void CLNormalizationLayerKernel::configure(const ICLTensor *input, ICLTensor *ou configure(CLKernelLibrary::get().get_compile_context(), input, output, norm_info); } -void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info) +void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + NormalizationLayerInfo norm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), norm_info)); @@ -152,16 +160,17 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte _input = input; _output = output; - const DataLayout data_layout = input->info()->data_layout(); - unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0)); - int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x; - if(norm_info.is_cross_map() && data_layout == DataLayout::NHWC) + const DataLayout data_layout = input->info()->data_layout(); + unsigned int vec_size_x = + adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0)); + int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x; + if (norm_info.is_cross_map() && data_layout == DataLayout::NHWC) { vec_size_x = 1; vec_size_x_leftovers = 0; } - if(data_layout == DataLayout::NCHW) + if (data_layout == DataLayout::NCHW) { const unsigned int norm_idx = get_normalization_dimension_index(data_layout, norm_info); _is_norm_across_width = norm_idx == 0; @@ -175,9 +184,10 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte // The output has 1 right padding because of the vec_size_x. // The input has 1 left padding because radius = 1. // The input has 2 right padding because of radius = 1 AND the extra output padding - const unsigned int border_width_left = _is_norm_across_width ? 
norm_radius : 0; - const unsigned int border_width_right = _is_norm_across_width ? norm_radius + (vec_size_x - input->info()->dimension(0) % vec_size_x) : 0; - _border_size = BorderSize(0, border_width_right, 0, border_width_left); + const unsigned int border_width_left = _is_norm_across_width ? norm_radius : 0; + const unsigned int border_width_right = + _is_norm_across_width ? norm_radius + (vec_size_x - input->info()->dimension(0) % vec_size_x) : 0; + _border_size = BorderSize(0, border_width_right, 0, border_width_left); } const bool is_in_map_2D = (norm_info.type() == NormType::IN_MAP_2D); @@ -193,12 +203,14 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte build_opts.add_option(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size() / 2))); build_opts.add_option(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2)))); build_opts.add_option_if(is_in_map_2D, "-DIN_MAP_2D"); - build_opts.add_option_if(norm_info.is_in_map() || (data_layout == DataLayout::NHWC && norm_info.is_cross_map()), "-DWIDTH_SIZE=" + support::cpp11::to_string(input->info()->dimension(0))); - build_opts.add_option_if(norm_info.is_in_map() && data_layout == DataLayout::NHWC, "-DDIM1_SIZE=" + support::cpp11::to_string(input->info()->dimension(1))); + build_opts.add_option_if(norm_info.is_in_map() || (data_layout == DataLayout::NHWC && norm_info.is_cross_map()), + "-DWIDTH_SIZE=" + support::cpp11::to_string(input->info()->dimension(0))); + build_opts.add_option_if(norm_info.is_in_map() && data_layout == DataLayout::NHWC, + "-DDIM1_SIZE=" + support::cpp11::to_string(input->info()->dimension(1))); // Create kernel std::string kernel_name; - if(norm_info.is_in_map()) + if (norm_info.is_in_map()) { kernel_name = "normalization_layer_in_map_" + lower_string(string_from_data_layout(data_layout)); } @@ -222,16 +234,19 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte _config_id += support::cpp11::to_string(input->info()->dimension(0)); _config_id += "_"; _config_id += support::cpp11::to_string(input->info()->dimension(1)); - if(data_layout == DataLayout::NHWC) + if (data_layout == DataLayout::NHWC) { ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } } -Status CLNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info) +Status CLNormalizationLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + NormalizationLayerInfo norm_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, norm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first); return Status{}; } @@ -251,7 +266,6 @@ void CLNormalizationLayerKernel::run(const Window &window, cl::CommandQueue &que add_3D_tensor_argument(idx, _input, slice); add_3D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); + } while (window_collapsed.slide_window_slice_3D(slice)); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.h b/src/core/CL/kernels/CLNormalizationLayerKernel.h index 739a2ae9f107a7e35744da3e0475bc92abe87faf..5517ba690471c7dddbfc1e41481ad9e149d71914 100644 --- 
a/src/core/CL/kernels/CLNormalizationLayerKernel.h +++ b/src/core/CL/kernels/CLNormalizationLayerKernel.h @@ -63,7 +63,10 @@ public: * Data layouts supported: same as @p input. * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + NormalizationLayerInfo norm_info); /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizationLayerKernel * * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], @@ -77,7 +80,7 @@ public: static Status validate(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info); // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; + void run(const Window &window, cl::CommandQueue &queue) override; BorderSize border_size() const override; private: diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp index 6b0400d50e9ddeeaaea1f311df7a4afff1c0fa2d..59352a8fb7c4e41b240d115bc049a5ef4dc25080 100644 --- a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp +++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp @@ -31,32 +31,35 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/StringSupport.h" namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std) +Status +validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, std); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, std); ARM_COMPUTE_RETURN_ERROR_ON_MSG(mean->num_dimensions() > 1, "mean and std must be vectors"); - const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); + const unsigned int channel_idx = + get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != mean->dimension(0)); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); @@ -77,7 +80,8 @@ std::pair validate_and_configure_window_nchw(ITensorInfo *input, bool window_changed = update_window_and_padding(win, input_access, output_access); - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace @@ -88,12 +92,19 @@ CLNormalizePlanarYUVLayerKernel::CLNormalizePlanarYUVLayerKernel() _type = CLKernelType::ELEMENTWISE; } -void CLNormalizePlanarYUVLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std) +void CLNormalizePlanarYUVLayerKernel::configure(const ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *std) { configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, std); } -void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std) +void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *std) { // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, mean, std); @@ -102,7 +113,7 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_ // Output tensor auto initialization if not yet initialized auto_init_if_empty(*output->info(), *input->info()->clone()); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; @@ -112,9 +123,10 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_ const DataLayout data_layout = input->info()->data_layout(); // Get number of elements to process per iterations - const unsigned int num_elems_processed_per_iteration = (data_layout == DataLayout::NHWC) ? adjust_vec_size(16 / input->info()->element_size(), - input->info()->dimension(0)) : - (16 / input->info()->element_size()); + const unsigned int num_elems_processed_per_iteration = + (data_layout == DataLayout::NHWC) + ? 
adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0)) + : (16 / input->info()->element_size()); const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); const DataType dt = input->info()->data_type(); @@ -122,11 +134,12 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_ CLBuildOptions build_opts; build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt))); build_opts.add_option(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); - build_opts.add_option(("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration))); + build_opts.add_option(("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration))); build_opts.add_option(("-DNUM_CHANNELS=" + support::cpp11::to_string(input->info()->dimension(channel_idx)))); std::string kernel_name = "normalize_planar_yuv_layer_"; - if(is_data_type_quantized(dt)) + if (is_data_type_quantized(dt)) { const UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform(); build_opts.add_option(("-DOFFSET=" + support::cpp11::to_string(qinfo.offset))); @@ -139,7 +152,7 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_ _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Configure kernel window - if(data_layout == DataLayout::NHWC) + if (data_layout == DataLayout::NHWC) { Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); ICLKernel::configure_internal(win); @@ -165,12 +178,16 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_ _config_id += support::cpp11::to_string(input->info()->dimension(2)); } -Status CLNormalizePlanarYUVLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std) +Status CLNormalizePlanarYUVLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *std) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, std)); - if(input->data_layout() == DataLayout::NCHW) + if (input->data_layout() == DataLayout::NCHW) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_nchw(input->clone().get(), output->clone().get()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_nchw(input->clone().get(), output->clone().get()).first); } return Status{}; } @@ -196,7 +213,6 @@ void CLNormalizePlanarYUVLayerKernel::run(const Window &window, cl::CommandQueue add_3D_tensor_argument(idx, _input, slice); add_3D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h index 6db4433e78a40466857cf3cdfbe9c02eba531bf2..341b404e3d57a2305fab1a7c18231e235a4ea863 100644 --- a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h +++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h @@ -67,7 +67,11 @@ public: * @param[in] std Standard deviation values tensor. 1 dimension with size equal to the number of input channels. 
* Data types supported: same as @p input */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *std); /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizePlanarYUVLayerKernel * * @param[in] input Source tensor info. 3 lower dimensions represent a single input with dimensions [width, height, channels]. @@ -79,7 +83,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLPadLayerKernel.cpp b/src/core/CL/kernels/CLPadLayerKernel.cpp index 53f313c0d39bc8c569230f9196e688ffcd5e69e5..0ac285038ee35f3ed5f80a83fcf0255d39d78ba8 100644 --- a/src/core/CL/kernels/CLPadLayerKernel.cpp +++ b/src/core/CL/kernels/CLPadLayerKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" @@ -35,25 +36,29 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + PixelValue constant_value, + PaddingMode mode) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_UNUSED(constant_value); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON((padding.size() < 1) || (padding.size() > input->num_dimensions())); - if(mode == PaddingMode::REFLECT || mode == PaddingMode::SYMMETRIC) + if (mode == PaddingMode::REFLECT || mode == PaddingMode::SYMMETRIC) { ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 3); const auto is_reflect = static_cast(mode == PaddingMode::REFLECT); - for(size_t i = 0; i < padding.size(); ++i) + for (size_t i = 0; i < padding.size(); ++i) { ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).first > (input->dimension(i) - is_reflect)); ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).second > (input->dimension(i) - is_reflect)); } } - if(output->total_size() > 0) + if (output->total_size() > 0) { TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding); @@ -65,41 +70,51 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c } } // namespace -CLPadLayerKernel::CLPadLayerKernel() - : _input(nullptr), _output(nullptr), _4d_enabled(false) +CLPadLayerKernel::CLPadLayerKernel() : _input(nullptr), _output(nullptr), _4d_enabled(false) { _type = CLKernelType::ELEMENTWISE; } -void CLPadLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +void CLPadLayerKernel::configure( + const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, 
PaddingMode mode) { configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, mode); } -void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +void CLPadLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const PaddingList &padding, + PixelValue constant_value, + PaddingMode mode) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding))); + auto_init_if_empty(*output->info(), + input->info()->clone()->set_tensor_shape( + misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding))); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), padding, constant_value, mode)); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; _4d_enabled = (mode == PaddingMode::CONSTANT) && (padding.size() > 3); // Set build options - const DataType &data_type = input->info()->data_type(); - const unsigned int input_width = input->info()->dimension(0); - const unsigned int input_height = input->info()->dimension(1); - const unsigned int input_depth = input->info()->dimension(2); - const unsigned int pad_x_before = padding.at(0).first; - const unsigned int pad_y_before = padding.size() > 1 ? padding.at(1).first : 0; - const unsigned int pad_z_before = padding.size() > 2 ? padding.at(2).first : 0; - const unsigned int vec_size = adjust_vec_size(std::min(16U, 32U / static_cast(element_size_from_data_type(input->info()->data_type()))), input_width); - const unsigned int pad_right_start = input_width + pad_x_before; - const unsigned int pad_x_before_remainder = pad_x_before % vec_size; - const unsigned int vec_size_leftover_write = vec_size - (ceil_to_multiple(output->info()->dimension(0), vec_size) - output->info()->dimension(0)); + const DataType &data_type = input->info()->data_type(); + const unsigned int input_width = input->info()->dimension(0); + const unsigned int input_height = input->info()->dimension(1); + const unsigned int input_depth = input->info()->dimension(2); + const unsigned int pad_x_before = padding.at(0).first; + const unsigned int pad_y_before = padding.size() > 1 ? padding.at(1).first : 0; + const unsigned int pad_z_before = padding.size() > 2 ? 
padding.at(2).first : 0; + const unsigned int vec_size = adjust_vec_size( + std::min(16U, 32U / static_cast(element_size_from_data_type(input->info()->data_type()))), + input_width); + const unsigned int pad_right_start = input_width + pad_x_before; + const unsigned int pad_x_before_remainder = pad_x_before % vec_size; + const unsigned int vec_size_leftover_write = + vec_size - (ceil_to_multiple(output->info()->dimension(0), vec_size) - output->info()->dimension(0)); CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); @@ -108,12 +123,12 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width)); build_opts.add_option("-DPAD_X_BEFORE_REMAINDER=" + support::cpp11::to_string(pad_x_before_remainder)); build_opts.add_option("-DVEC_SIZE_LEFTOVER_WRITE=" + support::cpp11::to_string(vec_size_leftover_write)); - if(padding.size() > 1) + if (padding.size() > 1) { build_opts.add_option("-DPAD_Y_BEFORE=" + support::cpp11::to_string(pad_y_before)); build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height)); - if(padding.size() > 2) + if (padding.size() > 2) { build_opts.add_option("-DPAD_Z_BEFORE=" + support::cpp11::to_string(pad_z_before)); build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_depth)); @@ -121,23 +136,25 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const } std::string kernel_name = "pad_layer_"; - switch(mode) + switch (mode) { case PaddingMode::CONSTANT: { kernel_name += "constant"; - const unsigned int vec_size_leftover_read = vec_size - (ceil_to_multiple(pad_right_start, vec_size) - pad_right_start); + const unsigned int vec_size_leftover_read = + vec_size - (ceil_to_multiple(pad_right_start, vec_size) - pad_right_start); build_opts.add_option("-DCONST_VAL=" + string_from_pixel_value(constant_value, data_type)); build_opts.add_option("-DVEC_SIZE_LEFTOVER_READ=" + support::cpp11::to_string(vec_size_leftover_read)); - if(pad_x_before >= vec_size) + if (pad_x_before >= vec_size) { build_opts.add_option("-DTHREADS_TO_SKIP_BEFORE=" + support::cpp11::to_string(pad_x_before / vec_size)); - build_opts.add_option("-DTHREADS_TO_SKIP_AFTER=" + support::cpp11::to_string(pad_right_start / vec_size)); + build_opts.add_option("-DTHREADS_TO_SKIP_AFTER=" + + support::cpp11::to_string(pad_right_start / vec_size)); } - if(_4d_enabled) + if (_4d_enabled) { build_opts.add_option("-DPAD_W_BEFORE=" + support::cpp11::to_string(padding.at(3).first)); build_opts.add_option("-DSRC_BATCH=" + support::cpp11::to_string(input->info()->dimension(3))); @@ -154,14 +171,17 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const const unsigned int pad_x_after_remainder = pad_right_start % vec_size; const unsigned int after_pad_fact_x = (2 * input_width + pad_x_before) - is_reflect; - const unsigned int output_last_x = ceil_to_multiple(pad_right_start + padding.at(0).second, vec_size); + const unsigned int output_last_x = ceil_to_multiple(pad_right_start + padding.at(0).second, vec_size); build_opts.add_option("-DIS_REFLECT=" + support::cpp11::to_string(is_reflect)); build_opts.add_option("-DPAD_X_AFTER_REMAINDER=" + support::cpp11::to_string(pad_x_after_remainder)); - build_opts.add_option("-DPAD_X_BEFORE_REMAINDER_REFL=" + support::cpp11::to_string((pad_x_before_remainder + is_reflect) % vec_size)); - build_opts.add_option("-DPAD_X_AFTER_REMAINDER_REFL=" + 
support::cpp11::to_string((pad_x_after_remainder - is_reflect) % vec_size)); + build_opts.add_option("-DPAD_X_BEFORE_REMAINDER_REFL=" + + support::cpp11::to_string((pad_x_before_remainder + is_reflect) % vec_size)); + build_opts.add_option("-DPAD_X_AFTER_REMAINDER_REFL=" + + support::cpp11::to_string((pad_x_after_remainder - is_reflect) % vec_size)); build_opts.add_option("-DAFTER_PAD_FACT_X=" + support::cpp11::to_string(after_pad_fact_x)); - build_opts.add_option_if(after_pad_fact_x < output_last_x, "-DAFTER_PAD_REM=" + support::cpp11::to_string(after_pad_fact_x % vec_size)); + build_opts.add_option_if(after_pad_fact_x < output_last_x, + "-DAFTER_PAD_REM=" + support::cpp11::to_string(after_pad_fact_x % vec_size)); break; } @@ -179,7 +199,11 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLPadLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +Status CLPadLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + PixelValue constant_value, + PaddingMode mode) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, constant_value, mode)); return Status{}; @@ -197,13 +221,12 @@ void CLPadLayerKernel::run(const Window &window, cl::CommandQueue &queue) unsigned int idx = 0; add_3D_tensor_argument(idx, _input, slice); add_3D_tensor_argument(idx, _output, slice); - if(_4d_enabled) + if (_4d_enabled) { add_argument(idx, batch++); } enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLPadLayerKernel.h b/src/core/CL/kernels/CLPadLayerKernel.h index 90af337f947ee4cf67f2c858676d6411d950d952..dca121b6a1e3fff9d15aa90527a6af2541f2cd42 100644 --- a/src/core/CL/kernels/CLPadLayerKernel.h +++ b/src/core/CL/kernels/CLPadLayerKernel.h @@ -56,7 +56,11 @@ public: * @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT, * or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT). */ - void configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT); + void configure(const ICLTensor *input, + ICLTensor *output, + const PaddingList &padding, + PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); /** Set the input and output tensor. * * @param[in] compile_context The compile context to be used. @@ -68,8 +72,12 @@ public: * @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT, * or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT). 
*/ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), - PaddingMode mode = PaddingMode::CONSTANT); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const PaddingList &padding, + PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); /** Static function to check if given info will lead to a valid configuration of @ref CLPadLayerKernel * * @param[in] input Source tensor info. Data types supported: All. @@ -80,7 +88,11 @@ public: * @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT, * or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT). */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp index bf1b874dd0332bc296b04648d793582708f104f9..7dcdf1de6f11d5b0b13c5e56b46e130ea7a463d5 100644 --- a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp +++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/StringSupport.h" using namespace arm_compute::misc::shape_calculator; @@ -42,7 +42,10 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info) +Status validate_arguments(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32); @@ -51,10 +54,10 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, // Check variances const int var_size = info.variances().size(); - if(var_size > 1) + if (var_size > 1) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size != 4, "Must provide 4 variance values"); - for(int i = 0; i < var_size; ++i) + for (int i = 0; i < var_size; ++i) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size <= 0, "Must be greater than 0"); } @@ -62,17 +65,19 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[0] < 0.f, "Step x should be greater or equal to 0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[1] < 0.f, "Step y should be greater or equal to 0"); - if(!info.max_sizes().empty()) + if (!info.max_sizes().empty()) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), "Max and min sizes dimensions should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), + "Max and min sizes dimensions 
should match"); } - for(unsigned int i = 0; i < info.max_sizes().size(); ++i) + for (unsigned int i = 0; i < info.max_sizes().size(); ++i) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], "Max size should be greater than min size"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], + "Max size should be greater than min size"); } - if(output != nullptr && output->total_size() != 0) + if (output != nullptr && output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != 2); } @@ -80,7 +85,11 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, return Status{}; } -std::pair validate_and_configure_window(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, const PriorBoxLayerInfo &info, int num_priors) +std::pair validate_and_configure_window(const ITensorInfo *input1, + const ITensorInfo *input2, + ITensorInfo *output, + const PriorBoxLayerInfo &info, + int num_priors) { ARM_COMPUTE_UNUSED(input2); // Output tensor auto initialization if not yet initialized @@ -88,10 +97,11 @@ std::pair validate_and_configure_window(const ITensorInfo *input auto_init_if_empty(*output, output_shape, 1, input1->data_type()); const unsigned int num_elems_processed_per_iteration = 4 * num_priors; - Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); bool window_changed = update_window_and_padding(win, output_access); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace @@ -102,13 +112,25 @@ CLPriorBoxLayerKernel::CLPriorBoxLayerKernel() _type = CLKernelType::ELEMENTWISE; } -void CLPriorBoxLayerKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max, cl::Buffer *aspect_ratios) +void CLPriorBoxLayerKernel::configure(const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const PriorBoxLayerInfo &info, + cl::Buffer *min, + cl::Buffer *max, + cl::Buffer *aspect_ratios) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, info, min, max, aspect_ratios); } -void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, - cl::Buffer *max, cl::Buffer *aspect_ratios) +void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const PriorBoxLayerInfo &info, + cl::Buffer *min, + cl::Buffer *max, + cl::Buffer *aspect_ratios) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); @@ -135,7 +157,7 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c int img_width = info.img_size().x; int img_height = info.img_size().y; - if(img_width == 0 || img_height == 0) + if (img_width == 0 || img_height == 0) { img_width = input2->info()->dimension(width_idx); img_height = input2->info()->dimension(height_idx); @@ -143,7 +165,7 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c float step_x = info.steps()[0]; float step_y = info.steps()[0]; - if(step_x == 0.f || step_y == 0.f) + if (step_x == 0.f || step_y == 0.f) { step_x = static_cast(img_width) / layer_width; step_y = static_cast(img_height) / layer_height; @@ -162,18 +184,20 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(info.offset())); build_opts.add_option_if(info.clip(), "-DIN_PLACE"); - if(info.variances().size() > 1) + if (info.variances().size() > 1) { - for(unsigned int i = 0; i < info.variances().size(); ++i) + for (unsigned int i = 0; i < info.variances().size(); ++i) { - build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(info.variances().at(i))); + build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + + support::cpp11::to_string(info.variances().at(i))); } } else { - for(unsigned int i = 0; i < 4; ++i) + for (unsigned int i = 0; i < 4; ++i) { - build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(info.variances().at(0))); + build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + + support::cpp11::to_string(info.variances().at(0))); } } @@ -194,13 +218,17 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c ICLKernel::configure_internal(win_config.second); } -Status CLPriorBoxLayerKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info) +Status CLPriorBoxLayerKernel::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info) { 
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, info)); const int num_priors = info.aspect_ratios().size() * info.min_sizes().size() + info.max_sizes().size(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get(), info, num_priors) - .first); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), + output->clone().get(), info, num_priors) + .first); return Status{}; } @@ -211,8 +239,9 @@ void CLPriorBoxLayerKernel::run(const Window &window, cl::CommandQueue &queue) ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); queue.enqueueWriteBuffer(*_min, CL_TRUE, 0, _info.min_sizes().size() * sizeof(float), _info.min_sizes().data()); - queue.enqueueWriteBuffer(*_aspect_ratios, CL_TRUE, 0, _info.aspect_ratios().size() * sizeof(float), _info.aspect_ratios().data()); - if(!_info.max_sizes().empty()) + queue.enqueueWriteBuffer(*_aspect_ratios, CL_TRUE, 0, _info.aspect_ratios().size() * sizeof(float), + _info.aspect_ratios().data()); + if (!_info.max_sizes().empty()) { queue.enqueueWriteBuffer(*_max, CL_TRUE, 0, _info.max_sizes().size() * sizeof(float), _info.max_sizes().data()); } diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.h b/src/core/CL/kernels/CLPriorBoxLayerKernel.h index 6c369a7a4e01b0ac9d99625e838dfa1601ff6bfe..a50e0c5ff59c01bc3dd1c207aadbeca94a0d5fab 100644 --- a/src/core/CL/kernels/CLPriorBoxLayerKernel.h +++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.h @@ -57,7 +57,13 @@ public: * @param[in] max Maximum prior box values * @param[in] aspect_ratios Aspect ratio values */ - void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max, cl::Buffer *aspect_ratios); + void configure(const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const PriorBoxLayerInfo &info, + cl::Buffer *min, + cl::Buffer *max, + cl::Buffer *aspect_ratios); /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. @@ -69,8 +75,14 @@ public: * @param[in] max Maximum prior box values * @param[in] aspect_ratios Aspect ratio values */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max, - cl::Buffer *aspect_ratios); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const PriorBoxLayerInfo &info, + cl::Buffer *min, + cl::Buffer *max, + cl::Buffer *aspect_ratios); /** Static function to check if given info will lead to a valid configuration of @ref CLPriorBoxLayerKernel * * @param[in] input1 First source tensor info. Data types supported: F32. Data layouts supported: NCHW/NHWC. 
@@ -80,14 +92,17 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; private: - const ICLTensor *_input1; - const ICLTensor *_input2; + const ICLTensor *_input1; + const ICLTensor *_input2; ICLTensor *_output; PriorBoxLayerInfo _info; int _num_priors; diff --git a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp index bd573e54c8eeff775555b482af552c2a032556ee..731fcb8e048361666515a97483e3032714605862 100644 --- a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp +++ b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp @@ -22,10 +22,12 @@ * SOFTWARE. */ #include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h" + #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" @@ -49,14 +51,19 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen const uint32_t temp_num_elems_processed_per_iteration = max_cl_vector_width / input->element_size(); /* If width is less then step, then make step same as width to avoid global size being step instead of actual width. */ /* Or we should fix in arm_compute::enqueue() or arm_compute::calculate_max_window(). */ - const uint32_t num_elems_processed_per_iteration = (input->dimension(0) < temp_num_elems_processed_per_iteration) ? input->dimension(0) : temp_num_elems_processed_per_iteration; + const uint32_t num_elems_processed_per_iteration = (input->dimension(0) < temp_num_elems_processed_per_iteration) + ? 
input->dimension(0) + : temp_num_elems_processed_per_iteration; // This kernel doesn't need padding Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); return std::make_pair(Status{}, win); } -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *weight, + const ITensorInfo *bias) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weight, bias, output); @@ -72,7 +79,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(weight, bias); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -87,10 +94,14 @@ CLQLSTMLayerNormalizationKernel::CLQLSTMLayerNormalizationKernel() _type = CLKernelType::ELEMENTWISE; } -void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias) +void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *weight, + const ICLTensor *bias) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, bias, output); - auto padding_info = get_padding_info({ input, weight, bias, output }); + auto padding_info = get_padding_info({input, weight, bias, output}); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), weight->info(), bias->info())); @@ -104,7 +115,8 @@ void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_ int32_t output_multiplier{}; int32_t output_shift{}; const UniformQuantizationInfo quan_info = _weight->info()->quantization_info().uniform(); - const Status status = quantization::calculate_quantized_multiplier(quan_info.scale, &output_multiplier, &output_shift); + const Status status = + quantization::calculate_quantized_multiplier(quan_info.scale, &output_multiplier, &output_shift); output_shift *= -1; // Set build options @@ -114,8 +126,12 @@ void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_ build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0))); build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift)); - build_opts.add_option("-DMIN_BOUND=" + support::cpp11::to_string(std::get<0>(quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type())))); - build_opts.add_option("-DMAX_BOUND=" + support::cpp11::to_string(std::get<1>(quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type())))); + build_opts.add_option("-DMIN_BOUND=" + + support::cpp11::to_string(std::get<0>( + quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type())))); + build_opts.add_option("-DMAX_BOUND=" + + support::cpp11::to_string(std::get<1>( + quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type())))); // Create kernel _kernel = create_kernel(compile_context, "qlstm_layer_normalization", build_opts.options()); @@ -135,12 +151,18 @@ void CLQLSTMLayerNormalizationKernel::configure(const 
CLCompileContext &compile_ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -void CLQLSTMLayerNormalizationKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias) +void CLQLSTMLayerNormalizationKernel::configure(const ICLTensor *input, + ICLTensor *output, + const ICLTensor *weight, + const ICLTensor *bias) { configure(CLKernelLibrary::get().get_compile_context(), input, output, weight, bias); } -Status CLQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias) +Status CLQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *weight, + const ITensorInfo *bias) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, weight, bias)); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first); @@ -171,7 +193,6 @@ void CLQLSTMLayerNormalizationKernel::run(const Window &window, cl::CommandQueue add_2D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); + } while (window.slide_window_slice_2D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h index 31085c37ba170ddda3519605ffd8982ce39bdf69..ba912e1d2dc006bf6d04153c4076c8bf8e4781c0 100644 --- a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h +++ b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h @@ -63,7 +63,11 @@ public: * @param[in] weight Weight tensor. Data types supported: Same as @p input. * @param[in] bias Bias tensor. Data types supported: S32. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *weight, + const ICLTensor *bias); /** Static function to check if given info will lead to a valid configuration of @ref CLQLSTMLayerNormalizationKernel * * @param[in] input Source tensor info with 2 dimensions. Data types supported: QSYMM16. 
@@ -73,7 +77,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp index 69a6fa5fa01838dc2183a4ba565af154ab8ed0c3..c97910ef7901d098c6ef9bc76b98fc8a8621735a 100644 --- a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp +++ b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -42,24 +43,29 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output); ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(0) != 5); ARM_COMPUTE_RETURN_ERROR_ON(rois->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F32, DataType::F16); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC, DataLayout::NCHW); ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0)); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info), output->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info), + output->tensor_shape()); } - if(is_data_type_quantized_asymmetric(input->data_type())) + if (is_data_type_quantized_asymmetric(input->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::QASYMM16); @@ -82,12 +88,19 @@ CLROIAlignLayerKernel::CLROIAlignLayerKernel() _type = CLKernelType::ELEMENTWISE; } -void CLROIAlignLayerKernel::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIAlignLayerKernel::configure(const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info); } -void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const 
ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info)); @@ -97,7 +110,7 @@ void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, c auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); output->info()->set_data_layout(input->info()->data_layout()); - auto padding_info = get_padding_info({ input, rois, output }); + auto padding_info = get_padding_info({input, rois, output}); _input = input; _output = output; @@ -111,16 +124,23 @@ void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, c CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); build_opts.add_option("-DDATA_SIZE=" + get_data_size_from_data_type(input->info()->data_type())); - build_opts.add_option("-DMAX_DIM_X=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH)))); - build_opts.add_option("-DMAX_DIM_Y=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT)))); - build_opts.add_option("-DMAX_DIM_Z=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL)))); + build_opts.add_option("-DMAX_DIM_X=" + + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index( + input->info()->data_layout(), DataLayoutDimension::WIDTH)))); + build_opts.add_option("-DMAX_DIM_Y=" + + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index( + input->info()->data_layout(), DataLayoutDimension::HEIGHT)))); + build_opts.add_option("-DMAX_DIM_Z=" + + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index( + input->info()->data_layout(), DataLayoutDimension::CHANNEL)))); build_opts.add_option("-DPOOLED_DIM_X=" + support::cpp11::to_string(pool_info.pooled_width())); build_opts.add_option("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height())); build_opts.add_option("-DSPATIAL_SCALE=" + float_to_string_with_full_precision(pool_info.spatial_scale())); build_opts.add_option_if(input->info()->data_layout() == DataLayout::NHWC, "-DNHWC"); - build_opts.add_option_if(pool_info.sampling_ratio() > 0, "-DSAMPLING_RATIO=" + support::cpp11::to_string(pool_info.sampling_ratio())); + build_opts.add_option_if(pool_info.sampling_ratio() > 0, + "-DSAMPLING_RATIO=" + support::cpp11::to_string(pool_info.sampling_ratio())); - if(is_qasymm) + if (is_qasymm) { const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform(); const UniformQuantizationInfo roisq_info = rois->info()->quantization_info().uniform(); @@ -144,7 +164,10 @@ void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, c ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status CLROIAlignLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info)); return Status{}; diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.h 
b/src/core/CL/kernels/CLROIAlignLayerKernel.h index 5284a5913f7fd287db19f42716ac402639849c4d..2e84e5d303ef0e747ec8dc655825f9fdd1337f4e 100644 --- a/src/core/CL/kernels/CLROIAlignLayerKernel.h +++ b/src/core/CL/kernels/CLROIAlignLayerKernel.h @@ -61,7 +61,8 @@ public: * @note The z dimensions of @p output tensor and @p input tensor must be the same. * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. */ - void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info); + void + configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info); /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. @@ -77,7 +78,11 @@ public: * @note The z dimensions of @p output tensor and @p input tensor must be the same. * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration of @ref CLROIAlignLayerKernel * * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. @@ -93,7 +98,10 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue); diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp index f6933c6cfd241dbfa2e08756b0043cbb484f268a..1b2c414a496e2a2a3872fc73f9ed6defd5544020 100644 --- a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp +++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -48,7 +49,10 @@ CLROIPoolingLayerKernel::CLROIPoolingLayerKernel() _type = CLKernelType::ELEMENTWISE; } -Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *rois, + const ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output); @@ -61,10 +65,11 @@ Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensor ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8); ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0)); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - 
ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) || (output->dimension(1) != pool_info.pooled_height())); + ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) || + (output->dimension(1) != pool_info.pooled_height())); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != output->dimension(2)); ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(1) != output->dimension(3)); } @@ -72,20 +77,30 @@ Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensor return Status{}; } -void CLROIPoolingLayerKernel::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIPoolingLayerKernel::configure(const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info); } -void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, const ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *rois, + const ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { - ARM_COMPUTE_ERROR_THROW_ON(CLROIPoolingLayerKernel::validate(input->info(), rois->info(), output->info(), pool_info)); + ARM_COMPUTE_ERROR_THROW_ON( + CLROIPoolingLayerKernel::validate(input->info(), rois->info(), output->info(), pool_info)); - auto padding_info = get_padding_info({ input, rois, output }); + auto padding_info = get_padding_info({input, rois, output}); // Output auto initialization if not yet initialized - TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->info()->dimension(1)); - auto_init_if_empty(*(output->info()), output_shape, 1, input->info()->data_type(), output->info()->quantization_info()); + TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), + rois->info()->dimension(1)); + auto_init_if_empty(*(output->info()), output_shape, 1, input->info()->data_type(), + output->info()->quantization_info()); // Set instance variables _input = input; @@ -107,11 +122,12 @@ void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context, build_opts.add_option("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height())); build_opts.add_option("-DSPATIAL_SCALE=" + support::cpp11::to_string(pool_info.spatial_scale())); - if(is_qasymm) + if (is_qasymm) { // Determine quantization info scale, offset UniformQuantizationInfo uqinfo = UniformQuantizationInfo(); - uqinfo = compute_requantization_scale_offset(_input->info()->quantization_info().uniform(), _output->info()->quantization_info().uniform()); + uqinfo = compute_requantization_scale_offset(_input->info()->quantization_info().uniform(), + _output->info()->quantization_info().uniform()); build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(uqinfo.offset)); build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(uqinfo.scale)); diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.h b/src/core/CL/kernels/CLROIPoolingLayerKernel.h index 7b7b4576320bbaf9d5f1fcac5e32ad1b5432b2f1..80bfb63092532794d28df75908fc07fd3b3e9e40 100644 --- a/src/core/CL/kernels/CLROIPoolingLayerKernel.h +++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.h @@ -59,7 +59,8 
@@ public: * @note The z dimensions of @p output tensor and @p input tensor must be the same. * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. */ - void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info); + void + configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info); /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. @@ -74,7 +75,11 @@ public: * @note The z dimensions of @p output tensor and @p input tensor must be the same. * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, const ICLTensor *output, const ROIPoolingLayerInfo &pool_info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *rois, + const ICLTensor *output, + const ROIPoolingLayerInfo &pool_info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; @@ -92,7 +97,10 @@ public: * @note The z dimensions of @p output tensor and @p input tensor must be the same. * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. */ - static Status validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *rois, + const ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info); private: const ICLTensor *_input; diff --git a/src/core/CL/kernels/CLRangeKernel.cpp b/src/core/CL/kernels/CLRangeKernel.cpp index a06c2eed75f23525fda4b35aa129bad9cd4ea9ad..622f6210b9b1319c2d534a160ed454b8d6730136 100644 --- a/src/core/CL/kernels/CLRangeKernel.cpp +++ b/src/core/CL/kernels/CLRangeKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -42,11 +43,8 @@ constexpr unsigned int vector_size_byte_opencl = 16; Status validate_arguments(const ITensorInfo *output, const float start, const float end, const float step) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, - 1, - DataType::U8, DataType::S8, DataType::QASYMM8, - DataType::U16, DataType::S16, - DataType::U32, DataType::S32, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, DataType::QASYMM8, + DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(output); @@ -56,19 +54,22 @@ Status validate_arguments(const ITensorInfo *output, const float start, const fl ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output->data_type(), output->quantization_info()), "start value is outside the range of the data type"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output->data_type(), output->quantization_info()), "end value is outside the range of the data type"); - 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output->data_type(), output->quantization_info()), "step value is outside the range of the data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output->data_type(), output->quantization_info()), + "start value is outside the range of the data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output->data_type(), output->quantization_info()), + "end value is outside the range of the data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output->data_type(), output->quantization_info()), + "step value is outside the range of the data type"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->num_dimensions() != 1, "Output has to be a 1-D tensor"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() < num_of_elements_in_range(start, end, step), "Output tensor size is incorrect"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() < num_of_elements_in_range(start, end, step), + "Output tensor size is incorrect"); return Status{}; } } // namespace -CLRangeKernel::CLRangeKernel() - : _start(0), _end(1), _step(1), _output(nullptr) +CLRangeKernel::CLRangeKernel() : _start(0), _end(1), _step(1), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } @@ -78,16 +79,18 @@ void CLRangeKernel::configure(ICLTensor *output, const float start, const float configure(CLKernelLibrary::get().get_compile_context(), output, start, end, step); } -void CLRangeKernel::configure(const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step) +void CLRangeKernel::configure( + const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step) { ARM_COMPUTE_ERROR_ON_NULLPTR(output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(output->info(), start, end, step)); // Configure kernel window - unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / output->info()->element_size(), output->info()->dimension(0)); - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + unsigned int num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / output->info()->element_size(), output->info()->dimension(0)); + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); - auto padding_info = get_padding_info({ output }); + auto padding_info = get_padding_info({output}); _start = start; _end = end; @@ -100,10 +103,11 @@ void CLRangeKernel::configure(const CLCompileContext &compile_context, ICLTensor CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(output->info()->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(output->info()->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DSTART=" + support::cpp11::to_string(start)); build_opts.add_option("-DSTEP=" + support::cpp11::to_string(step)); - if(is_data_type_quantized_asymmetric(output->info()->data_type())) + if (is_data_type_quantized_asymmetric(output->info()->data_type())) { const UniformQuantizationInfo qinfo = output->info()->quantization_info().uniform(); 
build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(qinfo.offset)); diff --git a/src/core/CL/kernels/CLRangeKernel.h b/src/core/CL/kernels/CLRangeKernel.h index 1b94a099ed7021dabe87d428864d4d71d021b2e8..65251a11e5a5d40b10e633c1a96fb66a75c0842f 100644 --- a/src/core/CL/kernels/CLRangeKernel.h +++ b/src/core/CL/kernels/CLRangeKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLRANGEKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp index ee60b8e1df4c14035d8ddabd9372383368be05d6..c8665f8fbdac5ea777bc9c4c34dc9db89827efb8 100644 --- a/src/core/CL/kernels/CLReductionOperationKernel.cpp +++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp @@ -28,15 +28,15 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/StringSupport.h" namespace arm_compute @@ -47,23 +47,28 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - if(input->num_channels() == 1) + if (input->num_channels() == 1) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S32, DataType::F16, DataType::F32); } else { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(axis == 0); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8, "Not supported reduction operation for QASYMM8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8, + "Not supported reduction operation for QASYMM8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, + "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); - ARM_COMPUTE_RETURN_ERROR_ON((op == ReductionOperation::MEAN_SUM) && (axis == 0) && (input->dimension(0) == 0) && (input->data_type() != DataType::QASYMM8) - && (input->data_type() != DataType::QASYMM8_SIGNED)); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN), "Not supported reduction operation, use CLArgMinMaxLayer"); + ARM_COMPUTE_RETURN_ERROR_ON((op == ReductionOperation::MEAN_SUM) && (axis == 0) && (input->dimension(0) == 0) && + (input->data_type() != DataType::QASYMM8) && + (input->data_type() != DataType::QASYMM8_SIGNED)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((op == ReductionOperation::ARG_IDX_MAX) || (op == 
ReductionOperation::ARG_IDX_MIN), + "Not supported reduction operation, use CLArgMinMaxLayer"); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); @@ -79,33 +84,42 @@ CLReductionOperationKernel::CLReductionOperationKernel() _type = CLKernelType::ELEMENTWISE; } -void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op) +void CLReductionOperationKernel::configure(const ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op) { configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op); } -void CLReductionOperationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op) +void CLReductionOperationKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op)); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; _reduction_axis = axis; _op = op; - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, true); - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).reset_padding().set_is_resizable(true)); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, true); + auto_init_if_empty(*output->info(), + input->info()->clone()->set_tensor_shape(output_shape).reset_padding().set_is_resizable(true)); // Set build options CLBuildOptions build_opts; DataType data_type = input->info()->data_type(); std::string data_type_promoted{}; - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { data_type_promoted = "int"; } @@ -130,10 +144,14 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte build_opts.add_option_if(op == ReductionOperation::PROD, "-DPROD"); build_opts.add_option_if(op == ReductionOperation::MIN, "-DMIN"); build_opts.add_option_if(op == ReductionOperation::MAX, "-DMAX"); - build_opts.add_option_if(is_data_type_quantized(data_type), "-DOFFSET=" + support::cpp11::to_string(input->info()->quantization_info().uniform().offset)); - build_opts.add_option_if(is_data_type_quantized(data_type), "-DSCALE=" + float_to_string_with_full_precision(input->info()->quantization_info().uniform().scale)); - - switch(op) + build_opts.add_option_if(is_data_type_quantized(data_type), + "-DOFFSET=" + + support::cpp11::to_string(input->info()->quantization_info().uniform().offset)); + build_opts.add_option_if( + is_data_type_quantized(data_type), + "-DSCALE=" + float_to_string_with_full_precision(input->info()->quantization_info().uniform().scale)); + + switch (op) { case ReductionOperation::SUM_SQUARE: build_opts.add_option(("-DOPERATION=square_sum")); @@ -143,7 +161,10 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte build_opts.add_option(("-DOPERATION=sum")); break; case ReductionOperation::MIN: + build_opts.add_option(("-DOPERATION=min_")); + break; case 
ReductionOperation::MAX: + build_opts.add_option(("-DOPERATION=max_")); break; case ReductionOperation::PROD: build_opts.add_option(("-DOPERATION=product")); @@ -156,7 +177,7 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte std::string kernel_axis_name; const bool is_serial_op = needs_serialized_reduction(_op, _input->info()->data_type(), _reduction_axis); - switch(axis) + switch (axis) { case 0: { @@ -183,14 +204,19 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte _kernel = create_kernel(compile_context, "reduction_operation_" + kernel_axis_name, build_opts.options()); // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps(vec_size)); - win.set(Window::DimX, Window::Dimension(win.x().start(), win.x().end() * _input->info()->num_channels(), win.x().step())); + TensorShape actual_input_shape = input->info()->tensor_shape(); + actual_input_shape[0] = width; + + Window win = calculate_max_window(actual_input_shape, Steps(vec_size)); ICLKernel::configure_internal(win); ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op) +Status CLReductionOperationKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + unsigned int axis, + ReductionOperation op) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); return Status{}; @@ -202,18 +228,19 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); const bool is_serial_op = needs_serialized_reduction(_op, _input->info()->data_type(), _reduction_axis); - switch(_reduction_axis) + switch (_reduction_axis) { case 0: { // We use parallel reduction only in non quantized types - if(is_serial_op) + if (is_serial_op) { // Get first input and output slices - Window window_in{ window }; - window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0))); + Window window_in{window}; + window_in.set(Window::DimX, + Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0))); - Window out_window{ window }; + Window out_window{window}; out_window.set(Window::DimX, Window::Dimension(0, 0, 0)); Window in_slice = window_in.first_slice_window_1D(); @@ -225,8 +252,7 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que add_1D_tensor_argument(idx, _input, in_slice); add_1D_tensor_argument(idx, _output, out_slice); enqueue(queue, *this, in_slice); - } - while(window_in.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice)); + } while (window_in.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice)); } else { @@ -247,56 +273,92 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que break; case 1: { - // Get first input and output slices - Window window_in{ window }; - window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1))); - Window in_slice = window_in.first_slice_window_2D(); - Window out_slice = window.first_slice_window_2D(); + bool has_collapsed = true; + Window actual_window = window.collapse_if_possible(window, 2, &has_collapsed); + ARM_COMPUTE_ERROR_ON(!has_collapsed); - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, in_slice); - 
add_2D_tensor_argument(idx, _output, out_slice); - enqueue(queue, *this, in_slice); - } - while(window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); + actual_window = actual_window.shift_dimensions(1, Window::DimY); + + const ITensorInfo *input_info = _input->info(); + const Strides &input_strides = input_info->strides_in_bytes(); + + const ITensorInfo *output_info = _output->info(); + const Strides &output_strides = output_info->strides_in_bytes(); + + unsigned int idx = 0; + + _kernel.setArg(idx++, _input->cl_buffer()); + _kernel.setArg(idx++, input_strides[1]); + _kernel.setArg(idx++, input_strides[2]); + _kernel.setArg(idx++, input_info->offset_first_element_in_bytes()); + + _kernel.setArg(idx++, _output->cl_buffer()); + _kernel.setArg(idx++, output_strides[2]); + _kernel.setArg(idx++, output_info->offset_first_element_in_bytes()); + + enqueue(queue, *this, actual_window); } break; case 2: { - // Get first input and output slices - Window window_in{ window }; - window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2))); - Window in_slice = window_in.first_slice_window_3D(); - Window out_slice = window.first_slice_window_3D(); + bool has_collapsed = true; + Window actual_window = window.collapse_if_possible(window, 3, &has_collapsed); + ARM_COMPUTE_ERROR_ON(!has_collapsed); - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, in_slice); - add_3D_tensor_argument(idx, _output, out_slice); - enqueue(queue, *this, in_slice); - } - while(window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice)); + actual_window = actual_window.shift_dimensions(1, Window::DimZ); + + const ITensorInfo *input_info = _input->info(); + const Strides &input_strides = input_info->strides_in_bytes(); + + const ITensorInfo *output_info = _output->info(); + const Strides &output_strides = output_info->strides_in_bytes(); + + unsigned int idx = 0; + + _kernel.setArg(idx++, _input->cl_buffer()); + _kernel.setArg(idx++, input_strides[1]); + _kernel.setArg(idx++, input_strides[2]); + _kernel.setArg(idx++, input_strides[3]); + _kernel.setArg(idx++, input_info->offset_first_element_in_bytes()); + + _kernel.setArg(idx++, _output->cl_buffer()); + _kernel.setArg(idx++, output_strides[1]); + _kernel.setArg(idx++, output_strides[3]); + _kernel.setArg(idx++, output_info->offset_first_element_in_bytes()); + + enqueue(queue, *this, actual_window); } break; case 3: { - // Get first input and output slices - Window window_in{ window }; - window_in.set(3, Window::Dimension(0, 1, 1)); - Window in_slice = window_in.first_slice_window_4D(); - Window out_slice = window.first_slice_window_4D(); + bool has_collapsed = true; + Window actual_window = window.shift_dimensions(1, Window::DimW); - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, _input, in_slice); - add_4D_tensor_argument(idx, _output, out_slice); - enqueue(queue, *this, in_slice); - } - while(window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice)); + actual_window = actual_window.collapse_if_possible(actual_window, 2, &has_collapsed); + ARM_COMPUTE_ERROR_ON(!has_collapsed); + + const ITensorInfo *input_info = _input->info(); + const Strides &input_strides = input_info->strides_in_bytes(); + + const ITensorInfo *output_info = _output->info(); + const Strides &output_strides = output_info->strides_in_bytes(); + + unsigned int idx = 0; + + _kernel.setArg(idx++, _input->cl_buffer()); + _kernel.setArg(idx++, 
input_strides[1]); + _kernel.setArg(idx++, input_strides[2]); + _kernel.setArg(idx++, input_strides[3]); + _kernel.setArg(idx++, input_strides[4]); + _kernel.setArg(idx++, input_info->offset_first_element_in_bytes()); + + _kernel.setArg(idx++, _output->cl_buffer()); + _kernel.setArg(idx++, output_strides[1]); + _kernel.setArg(idx++, output_strides[2]); + _kernel.setArg(idx++, output_strides[4]); + _kernel.setArg(idx++, output_info->offset_first_element_in_bytes()); + + enqueue(queue, *this, actual_window); } break; default: diff --git a/src/core/CL/kernels/CLReductionOperationKernel.h b/src/core/CL/kernels/CLReductionOperationKernel.h index b456378746910cd6df523c64d4a17cda4c881911..2f94b2add327847b5e5a50b7fa5ab6d54c025e30 100644 --- a/src/core/CL/kernels/CLReductionOperationKernel.h +++ b/src/core/CL/kernels/CLReductionOperationKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -67,7 +68,11 @@ public: * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3 * @param[in] op Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op); /** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperationKernel. * @@ -79,7 +84,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLReorgLayerKernel.cpp b/src/core/CL/kernels/CLReorgLayerKernel.cpp index 3c74e80d33fc7b61518baf9170e45c87adb8abb3..9fd21943e890fdfe6e1d5dab93d3d313097ebb73 100644 --- a/src/core/CL/kernels/CLReorgLayerKernel.cpp +++ b/src/core/CL/kernels/CLReorgLayerKernel.cpp @@ -28,9 +28,10 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" @@ -51,13 +52,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); ARM_COMPUTE_RETURN_ERROR_ON(stride <= 0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0, "The width of the input tensor must be a multiple of stride"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0, "The height of the input tensor must be a multiple of stride"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0, + "The width of the input tensor must be a multiple of stride"); + 
ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0, + "The height of the input tensor must be a multiple of stride"); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride)); + const TensorInfo tensor_info_output = + output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } @@ -66,8 +70,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i } } // namespace -CLReorgLayerKernel::CLReorgLayerKernel() - : _input(nullptr), _output(nullptr) +CLReorgLayerKernel::CLReorgLayerKernel() : _input(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } @@ -77,17 +80,22 @@ void CLReorgLayerKernel::configure(const ICLTensor *input, ICLTensor *output, in configure(CLKernelLibrary::get().get_compile_context(), input, output, stride); } -void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t stride) +void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + int32_t stride) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), stride)); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; - std::string kernel_name = std::string("reorg_layer_") + lower_string(string_from_data_layout(input->info()->data_layout())); - const size_t idx_channel = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL); + std::string kernel_name = + std::string("reorg_layer_") + lower_string(string_from_data_layout(input->info()->data_layout())); + const size_t idx_channel = + get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL); // Create kernel CLBuildOptions build_opts; @@ -98,7 +106,9 @@ void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, cons // Configure window // auto inizialize the output tensor if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride))); + auto_init_if_empty(*output->info(), + input->info()->clone()->set_tensor_shape( + misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride))); Window win = calculate_max_window(*output->info(), Steps()); @@ -119,7 +129,9 @@ void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, cons ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLReorgLayerKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output, int32_t stride) +Status CLReorgLayerKernel::validate(const arm_compute::ITensorInfo *input, + const arm_compute::ITensorInfo *output, + int32_t stride) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, stride)); @@ -139,7 +151,6 @@ void CLReorgLayerKernel::run(const Window &window, cl::CommandQueue &queue) add_3D_tensor_argument(idx, _input, slice); add_3D_tensor_argument(idx, _output, slice); enqueue(queue, 
*this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLReorgLayerKernel.h b/src/core/CL/kernels/CLReorgLayerKernel.h index 455a6170c6cec94effa7cd7f363d52f2de5f0002..f335071e9fdac1eb5a2f3ff6fb48fb4a75e08966 100644 --- a/src/core/CL/kernels/CLReorgLayerKernel.h +++ b/src/core/CL/kernels/CLReorgLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLREORGLAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute diff --git a/src/core/CL/kernels/CLReverseKernel.cpp b/src/core/CL/kernels/CLReverseKernel.cpp index 0d70ff4f3c8646169bf853e1347e7300a1236da4..00241b161b5683e1262728ca90b2626639c436ba 100644 --- a/src/core/CL/kernels/CLReverseKernel.cpp +++ b/src/core/CL/kernels/CLReverseKernel.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -39,17 +40,21 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis) +Status +validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis) { + ARM_COMPUTE_UNUSED(use_inverted_axis); ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, axis); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->num_dimensions() > 1, "Axis must be a 1D tensor"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, + "Current implementation only supports up to 4 dimensions."); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->dimension(0) > 4, "Only up to 4 dimensions can be reversed"); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -60,21 +65,27 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c } } // namespace -CLReverseKernel::CLReverseKernel() - : _input(nullptr), _output(nullptr), _axis(nullptr) +CLReverseKernel::CLReverseKernel() : _input(nullptr), _output(nullptr), _axis(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void CLReverseKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis) +void CLReverseKernel::configure(const ICLTensor *input, + ICLTensor *output, + const ICLTensor *axis, + bool use_inverted_axis) { - configure(CLKernelLibrary::get().get_compile_context(), input, output, axis); + configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, use_inverted_axis); } -void CLReverseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis) +void CLReverseKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *axis, + bool use_inverted_axis) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, axis); - auto padding_info = get_padding_info({ input, 
output, axis }); + auto padding_info = get_padding_info({input, output, axis}); _input = input; _output = output; @@ -83,12 +94,14 @@ void CLReverseKernel::configure(const CLCompileContext &compile_context, const I // Output tensor auto initialization if not yet initialized auto_init_if_empty(*output->info(), *input->info()->clone()); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis->info())); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis->info(), use_inverted_axis)); // Set kernel build options CLBuildOptions build_opts; build_opts.add_option("-DNUM_REVERSE_DIMS=" + support::cpp11::to_string(axis->info()->dimension(0))); build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size())); + build_opts.add_option("-DRANK=" + support::cpp11::to_string(input->info()->num_dimensions())); + build_opts.add_option_if(use_inverted_axis, "-DUSE_INVERTED_AXIS"); // Create kernel _kernel = create_kernel(compile_context, "reverse", build_opts.options()); @@ -116,9 +129,12 @@ void CLReverseKernel::configure(const CLCompileContext &compile_context, const I ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis) +Status CLReverseKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *axis, + bool use_inverted_axis) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, use_inverted_axis)); return Status{}; } @@ -138,7 +154,6 @@ void CLReverseKernel::run(const Window &window, cl::CommandQueue &queue) add_1D_tensor_argument(idx, _axis, axis_slice); add_4D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_4D(slice)); + } while (collapsed.slide_window_slice_4D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLReverseKernel.h b/src/core/CL/kernels/CLReverseKernel.h index 4a21e4f8029bdf961e83c15099b95f959ad8d64a..a630aec15ab9f0c68149fba2e9681c9cdac30b9b 100644 --- a/src/core/CL/kernels/CLReverseKernel.h +++ b/src/core/CL/kernels/CLReverseKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CLREVERSEKERNEL_H -#define ARM_COMPUTE_CLREVERSEKERNEL_H +#ifndef ACL_SRC_CORE_CL_KERNELS_CLREVERSEKERNEL_H +#define ACL_SRC_CORE_CL_KERNELS_CLREVERSEKERNEL_H #include "src/core/CL/ICLKernel.h" @@ -48,29 +48,43 @@ public: ~CLReverseKernel() = default; /** Initialise the kernel's inputis and output * - * @param[in] input Input tensor. Data types supported: All. - * @param[out] output Output tensor. Data type supported: Same as @p input - * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32 + * @param[in] input Input tensor. Data types supported: All. + * @param[out] output Output tensor. Data type supported: Same as @p input + * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32/S32 + * @param[in] use_inverted_axis Reverse ACL axis indices convention i.e. 
acl.dim(0) = tensor_rank -1 + * + * @note The value of each axis should be between [-rank, rank) + * @note If there are duplicate values in the tensor, the subsequent axis values are ignored. e.g. an array of [2, 2] has the same effects as [2]. + * + * @deprecated Support for U32 in axis tensor will be removed in 24.02 release + * */ - void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis); + void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis, bool use_inverted_axis); /** Initialise the kernel's inputis and output * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor. Data types supported: All. - * @param[out] output Output tensor. Data type supported: Same as @p input - * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32 + * @param[in] compile_context The compile context to be used. + * @param[in] input Input tensor. Data types supported: All. + * @param[out] output Output tensor. Data type supported: Same as @p input + * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32/S32 + * @param[in] use_inverted_axis Reverse ACL axis indices convention i.e. acl.dim(0) = tensor_rank -1 */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *axis, + bool use_inverted_axis); /** Static function to check if given info will lead to a valid configuration of @ref CLReverseKernel * - * @param[in] input Input tensor info. Data types supported: All. - * @param[in] output Output tensor info. Data type supported: Same as @p input - * @param[in] axis Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32 + * @param[in] input Input tensor info. Data types supported: All. + * @param[in] output Output tensor info. Data type supported: Same as @p input + * @param[in] axis Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32/S32 + * @param[in] use_inverted_axis Reverse ACL axis indices convention i.e. 
acl.dim(0) = tensor_rank -1 * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; @@ -81,4 +95,4 @@ public: const ICLTensor *_axis; }; } // namespace arm_compute -#endif /*ARM_COMPUTE_CLREVERSEKERNEL_H */ +#endif // ACL_SRC_CORE_CL_KERNELS_CLREVERSEKERNEL_H diff --git a/src/core/CL/kernels/CLSelectKernel.cpp b/src/core/CL/kernels/CLSelectKernel.cpp index c0e014e8b88dd83fb338ea872d8115b6cb9cc17e..703c64d8d370af30f00987f43c318c9dc69e5e74 100644 --- a/src/core/CL/kernels/CLSelectKernel.cpp +++ b/src/core/CL/kernels/CLSelectKernel.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/StringSupport.h" namespace arm_compute @@ -51,9 +51,11 @@ Status validate_arguments(const ITensorInfo *c, const ITensorInfo *x, const ITen const bool is_same_rank = (c->tensor_shape().num_dimensions() == x->tensor_shape().num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(is_same_rank && (x->tensor_shape() != c->tensor_shape())); - ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank && ((c->tensor_shape().num_dimensions() > 1) || (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1]))); + ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank && + ((c->tensor_shape().num_dimensions() > 1) || + (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1]))); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(x, output); @@ -63,13 +65,16 @@ Status validate_arguments(const ITensorInfo *c, const ITensorInfo *x, const ITen } } // namespace -CLSelectKernel::CLSelectKernel() - : _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false) +CLSelectKernel::CLSelectKernel() : _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false) { _type = CLKernelType::ELEMENTWISE; } -void CLSelectKernel::configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output) +void CLSelectKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *c, + const ICLTensor *x, + const ICLTensor *y, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(c, x, y, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(c->info(), x->info(), y->info(), output->info())); @@ -80,7 +85,7 @@ void CLSelectKernel::configure(const CLCompileContext &compile_context, const IC _output = output; _has_same_rank = (c->info()->tensor_shape().num_dimensions() == x->info()->tensor_shape().num_dimensions()); - auto padding_info = get_padding_info({ c, x, y, output }); + auto padding_info = get_padding_info({c, x, y, output}); const unsigned int vec_size_x = adjust_vec_size(16 / x->info()->element_size(), x->info()->dimension(0)); const int vec_size_x_leftovers = output->info()->dimension(0) % vec_size_x; @@ -92,14 +97,14 @@ void CLSelectKernel::configure(const CLCompileContext &compile_context, const IC // Create kernel std::string kernel_name = "select"; 
- if(_has_same_rank) + if (_has_same_rank) { kernel_name += "_same_rank"; } else { const bool is_input_rank_greater_than_two = x->info()->tensor_shape().num_dimensions() > 2; - if(is_input_rank_greater_than_two) + if (is_input_rank_greater_than_two) { const size_t width = x->info()->tensor_shape().x(); const size_t height = x->info()->tensor_shape().y(); @@ -128,7 +133,8 @@ void CLSelectKernel::configure(const CLCompileContext &compile_context, const IC ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLSelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output) +Status +CLSelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(c, x, y, output)); return Status{}; @@ -142,7 +148,7 @@ void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &qu Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); Window slice = collapsed.first_slice_window_3D(); - if(!_has_same_rank) + if (!_has_same_rank) { Window vector_slice = window.first_slice_window_1D(); vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0)); @@ -153,7 +159,7 @@ void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &qu do { unsigned int idx = _has_same_rank ? 0 : num_arguments_per_1D_tensor(); - if(_has_same_rank) + if (_has_same_rank) { add_3D_tensor_argument(idx, _c, slice); } @@ -162,7 +168,6 @@ void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &qu add_3D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLSelectKernel.h b/src/core/CL/kernels/CLSelectKernel.h index b8c10cd7cf17731502087936e440fab99a96be7c..c4256fd74336ccc11f731d6c1bfcf5dabe5de6b0 100644 --- a/src/core/CL/kernels/CLSelectKernel.h +++ b/src/core/CL/kernels/CLSelectKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLSELECTKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -60,7 +61,11 @@ public: * @param[out] y Second input tensor. Data types supported: Same as @p x * @param[in] output Output tensor. Data types supported: Same as @p x. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output); + void configure(const CLCompileContext &compile_context, + const ICLTensor *c, + const ICLTensor *x, + const ICLTensor *y, + ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLSelectKernel * * @param[in] c Condition input tensor. Data types supported: U8. 
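Side note, not part of the patch: the hunks above extend CLReverseKernel with a use_inverted_axis flag and S32 axis support (U32 remains accepted but is marked deprecated for the 24.02 release). The sketch below is only a hedged illustration of the new validate() overload: the tensor shapes and data types are assumptions, the kernel-level header lives under src/ and is internal to the library (applications would normally go through the CLReverse function), and only the signature itself is taken from the patch.

    // Sketch only: validating a reverse op with the new inverted-axis convention.
    // Shapes and data types here are assumptions; the signature is from the patch.
    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/TensorShape.h"
    #include "arm_compute/core/Types.h"

    #include "src/core/CL/kernels/CLReverseKernel.h"

    using namespace arm_compute;

    bool reverse_config_is_valid()
    {
        // Input and output must share shape and data type; the kernel supports all types.
        TensorInfo input(TensorShape(8U, 4U, 2U, 1U), 1, DataType::F32);
        TensorInfo output(TensorShape(8U, 4U, 2U, 1U), 1, DataType::F32);

        // 1-D axis tensor listing the dimensions to reverse. S32 is the preferred type;
        // U32 is still accepted but deprecated per the note added in the header above.
        TensorInfo axis(TensorShape(2U), 1, DataType::S32);

        // use_inverted_axis = true interprets axis values with the inverted convention
        // documented above, i.e. acl.dim(0) = tensor_rank - 1.
        const Status status = CLReverseKernel::validate(&input, &output, &axis, /* use_inverted_axis */ true);
        return status.error_code() == ErrorCode::OK;
    }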
diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp index 3632ae2b03d45ad32396e7ca82521affce790559..f4c0839ad2a39ac2d26245a38b88ccfefce417fe 100644 --- a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp +++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -38,19 +39,22 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *paddings, const ITensorInfo *output) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *block_info, + const ITensorInfo *paddings, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, paddings, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON(block_info->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{ 2 }); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{2}); ARM_COMPUTE_RETURN_ERROR_ON(paddings->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{ 2, 2 }); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{2, 2}); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { const DataLayout data_layout = input->data_layout(); const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); @@ -61,7 +65,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf return Status{}; } -Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, +Status validate_arguments_static(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); @@ -70,9 +78,10 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x < 1 || block_shape_y < 1); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { - TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(input, block_shape_x, block_shape_y, padding_left, padding_right); + TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape( + input, block_shape_x, block_shape_y, padding_left, padding_right); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); @@ -88,16 +97,24 @@ CLSpaceToBatchLayerKernel::CLSpaceToBatchLayerKernel() _type = CLKernelType::ELEMENTWISE; } -void 
CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output) +void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, + const ICLTensor *block_shape, + const ICLTensor *paddings, + ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, paddings, output); } -void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output) +void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + const ICLTensor *paddings, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info())); - auto padding_info = get_padding_info({ input, block_shape, paddings, output }); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info())); + auto padding_info = get_padding_info({input, block_shape, paddings, output}); _input = input; _block_shape = block_shape; @@ -111,14 +128,17 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex // Create kernel CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); + build_opts.add_option("-DDATA_TYPE=" + + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width))); build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height))); build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch))); build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width))); build_opts.add_option("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(idx_height))); build_opts.add_option("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_batch))); - _kernel = create_kernel(compile_context, "space_to_batch_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()); + _kernel = create_kernel(compile_context, + "space_to_batch_" + lower_string(string_from_data_layout(input->info()->data_layout())), + build_opts.options()); // Configure kernel window Window win = calculate_max_window(*output->info(), Steps()); @@ -126,22 +146,34 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, - ICLTensor *output) +void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output) { - configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, padding_right, output); + configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, + padding_right, output); } 
-void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, - const Size2D &padding_right, - ICLTensor *output) +void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(input->info(), block_shape_x, block_shape_y, padding_left, padding_right); - auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info()); + TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape( + input->info(), block_shape_x, block_shape_y, padding_left, padding_right); + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, padding_right, output->info())); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, + padding_right, output->info())); _input = input; _output = output; @@ -153,7 +185,8 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex // Create kernel CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); + build_opts.add_option("-DDATA_TYPE=" + + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width))); build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height))); build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch))); @@ -166,22 +199,32 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex build_opts.add_option("-DPAD_RIGHT_X=" + support::cpp11::to_string(padding_right.x())); build_opts.add_option("-DPAD_LEFT_Y=" + support::cpp11::to_string(padding_left.y())); build_opts.add_option("-DPAD_RIGHT_Y=" + support::cpp11::to_string(padding_right.y())); - _kernel = create_kernel(compile_context, "space_to_batch_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()); + _kernel = create_kernel( + compile_context, "space_to_batch_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), + build_opts.options()); // Configure kernel window Window win = calculate_max_window(*output->info(), Steps()); ICLKernel::configure_internal(win); } -Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output) +Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, paddings, output)); return Status{}; } -Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const 
Size2D &padding_right, +Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); return Status{}; } @@ -218,7 +261,6 @@ void CLSpaceToBatchLayerKernel::run(const Window &window, cl::CommandQueue &queu add_3D_tensor_argument(idx, _output, slice_out); enqueue(queue, *this, slice_out, lws_hint()); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h index 4817cfeef2e5556143b336550352ce71cc1d9dbb..f9dce9db4764501ccb71523a86c1e0b7303b7288 100644 --- a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h +++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLSPACETOBATCHLAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -63,7 +64,11 @@ public: * @param[in] paddings 2-D tensor with shape [2, M] (First dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32 * @param[out] output Tensor output. Data types supported: same as @p input */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + const ICLTensor *paddings, + ICLTensor *output); /** Initialise the kernel's input and output. (Static block shape and paddings) * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -73,7 +78,12 @@ public: * @param[in] padding_right The padding at the end of every dimension of the output tensor. * @param[out] output Tensor output. Data types supported: same as @p input */ - void configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output); + void configure(const ICLTensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output); /** Initialise the kernel's input and output. (Static block shape and paddings) * * @param[in] compile_context The compile context to be used. @@ -84,8 +94,13 @@ public: * @param[in] padding_right The padding at the end of every dimension of the output tensor. * @param[out] output Tensor output. Data types supported: same as @p input */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, - ICLTensor *output); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayerKernel * * @param[in] input Tensor input. 
Supported tensor rank: 4. Data types supported: All. @@ -95,7 +110,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output); /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayerKernel (Static block shape and paddings) * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -107,7 +125,12 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + const ITensorInfo *output); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp index c5ffdb588ba2b2525783431f237fb04ebea1d0a6..25662b5c628e781fd9e2125d1a78d666e6653e79 100644 --- a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp +++ b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -45,7 +46,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { const DataLayout data_layout = input->data_layout(); const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); @@ -64,8 +65,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i } } // namespace -CLSpaceToDepthLayerKernel::CLSpaceToDepthLayerKernel() - : _input(nullptr), _output(nullptr), _block_shape() +CLSpaceToDepthLayerKernel::CLSpaceToDepthLayerKernel() : _input(nullptr), _output(nullptr), _block_shape() { _type = CLKernelType::ELEMENTWISE; } @@ -75,10 +75,13 @@ void CLSpaceToDepthLayerKernel::configure(const ICLTensor *input, ICLTensor *out configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape); } -void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape) +void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + int32_t block_shape) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); TensorShape output_shape = compute_space_to_depth_shape(input->info(), block_shape); auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); @@ -94,11 +97,14 @@ void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_contex // Create kernel CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + 
get_cl_unsigned_type_from_element_size(data_size_from_type(output->info()->data_type()))); + build_opts.add_option("-DDATA_TYPE=" + + get_cl_unsigned_type_from_element_size(data_size_from_type(output->info()->data_type()))); build_opts.add_option("-DCHANNEL_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_channel))); build_opts.add_option("-DBLOCK_SHAPE=" + support::cpp11::to_string(block_shape)); build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(output->info()->dimension(idx_width))); - _kernel = create_kernel(compile_context, "space_to_depth_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()); + _kernel = create_kernel(compile_context, + "space_to_depth_" + lower_string(string_from_data_layout(input->info()->data_layout())), + build_opts.options()); // Configure kernel window Window win = calculate_max_window(*output->info(), Steps()); @@ -136,7 +142,6 @@ void CLSpaceToDepthLayerKernel::run(const Window &window, cl::CommandQueue &queu enqueue(queue, *this, slice_out, lws_hint()); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h index bb1ac5f9a6d5e739e749135cbf292ef88d88eaee..d0932919e09c49ca9ecfef500d0cfa8e183e18ba 100644 --- a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h +++ b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLSPACETODEPTHLAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -61,7 +62,8 @@ public: * @param[out] output Tensor output. Data types supported: same as @p input * @param[in] block_shape Block shape value. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape); + void + configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape); /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToDepthLayerKernel. * * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All. 
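Side note, not part of the patch: the space-to-batch hunks above only reflow the static block-shape/padding overload, so as a hedged illustration the sketch below exercises that validate() signature. The 4-D shape, block sizes and paddings are assumptions, and leaving the output TensorInfo empty relies on validation skipping the output checks until configure() auto-initialises it; only the signature comes from the patch.

    // Sketch only: validating the static block-shape/padding space-to-batch overload.
    // The 4-D shape, block sizes and paddings are assumptions for illustration.
    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/Size2D.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/TensorShape.h"
    #include "arm_compute/core/Types.h"

    #include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h"

    using namespace arm_compute;

    bool space_to_batch_static_is_valid()
    {
        // 4-D input tensor (ACL dimension order dim0..dim3); the data type is arbitrary here.
        TensorInfo input(TensorShape(8U, 8U, 3U, 1U), 1, DataType::F32);

        // Empty output: validation skips the output checks and configure() would
        // auto-initialise it from compute_space_to_batch_shape().
        TensorInfo output;

        const int    block_shape_x = 2;
        const int    block_shape_y = 2;
        const Size2D padding_left(0, 0);
        const Size2D padding_right(0, 0);

        const Status status = CLSpaceToBatchLayerKernel::validate(&input, block_shape_x, block_shape_y,
                                                                  padding_left, padding_right, &output);
        return status.error_code() == ErrorCode::OK;
    }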
diff --git a/src/core/CL/kernels/CLStackLayerKernel.cpp b/src/core/CL/kernels/CLStackLayerKernel.cpp index 075c93ab6080830978ef86e9dc73249adea0da20..23e26716e7e6655767726be57192d2c839d7f458 100644 --- a/src/core/CL/kernels/CLStackLayerKernel.cpp +++ b/src/core/CL/kernels/CLStackLayerKernel.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/StringSupport.h" using namespace arm_compute::misc::shape_calculator; @@ -42,7 +42,11 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output) +Status validate_arguments(const ITensorInfo *input, + unsigned int axis, + unsigned int idx_input, + unsigned int num_tensors, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); @@ -51,9 +55,10 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned ARM_COMPUTE_RETURN_ERROR_ON(axis > input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); - if(output->total_size() != 0) + if (output->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_stack_shape(*input, axis, num_tensors)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), + compute_stack_shape(*input, axis, num_tensors)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); } @@ -61,7 +66,8 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output) +std::pair +validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output) { // Output auto inizialitation if not yet initialized auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_stack_shape(*input, axis, num_tensors))); @@ -73,18 +79,23 @@ std::pair validate_and_configure_window(ITensorInfo *input, unsi } } // namespace -CLStackLayerKernel::CLStackLayerKernel() - : _input(nullptr), _output(nullptr) +CLStackLayerKernel::CLStackLayerKernel() : _input(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void CLStackLayerKernel::configure(const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output) +void CLStackLayerKernel::configure( + const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, axis, idx_input, num_tensors, output); } -void CLStackLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output) +void CLStackLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + unsigned int axis, + unsigned int idx_input, + unsigned int num_tensors, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); 
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, idx_input, num_tensors, output->info())); @@ -112,10 +123,15 @@ void CLStackLayerKernel::configure(const CLCompileContext &compile_context, cons _kernel.setArg(idx, idx_input); } -Status CLStackLayerKernel::validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output) +Status CLStackLayerKernel::validate(const ITensorInfo *input, + unsigned int axis, + unsigned int idx_input, + unsigned int num_tensors, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, idx_input, num_tensors, output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first); return Status{}; } diff --git a/src/core/CL/kernels/CLStackLayerKernel.h b/src/core/CL/kernels/CLStackLayerKernel.h index 2865127a90c4f322ee1c5ae40d26501f0ee5e088..d3c17f529c144fbb1f7338b44b4a0e838b967c07 100644 --- a/src/core/CL/kernels/CLStackLayerKernel.h +++ b/src/core/CL/kernels/CLStackLayerKernel.h @@ -26,6 +26,7 @@ #define ARM_COMPUTE_CLSTACKLAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -60,7 +61,8 @@ public: * @param[out] output Output tensor. Data types supported: Same as @p input. * */ - void configure(const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output); + void configure( + const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output); /** Initialise the kernel's inputs and output * * @note Supported input tensor rank: up to 4 @@ -74,7 +76,12 @@ public: * @param[out] output Output tensor. Data types supported: Same as @p input. * */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + unsigned int axis, + unsigned int idx_input, + unsigned int num_tensors, + ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLStackLayerKernel * * @note Supported input tensor rank: up to 4 @@ -88,7 +95,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + unsigned int axis, + unsigned int idx_input, + unsigned int num_tensors, + const ITensorInfo *output); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLStridedSliceKernel.cpp b/src/core/CL/kernels/CLStridedSliceKernel.cpp index 9acbafdb190a6b33eeb1b0104a0b8c42a1767961..a8f611282047e8b1e0b6a6a3fa4b894248637002 100644 --- a/src/core/CL/kernels/CLStridedSliceKernel.cpp +++ b/src/core/CL/kernels/CLStridedSliceKernel.cpp @@ -22,11 +22,13 @@ * SOFTWARE. 
*/ #include "src/core/CL/kernels/CLStridedSliceKernel.h" + #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/helpers/bit_ops.h" @@ -37,9 +39,14 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); @@ -48,19 +55,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, ARM_COMPUTE_RETURN_ERROR_ON(starts.num_dimensions() > input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(ends.num_dimensions() > input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(strides.num_dimensions() > input->num_dimensions()); - ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i) - { - return i == 0; - })); + ARM_COMPUTE_RETURN_ERROR_ON( + std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i) { return i == 0; })); // Get expected output shape - const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input, - starts, ends, strides, - begin_mask, end_mask, shrink_axis_mask); + const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape( + *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); ARM_COMPUTE_RETURN_ERROR_ON(exp_output_shape.total_size() == 0); // Checks output if configured - if(output->total_size() != 0) + if (output->total_size() != 0) { const TensorInfo exp_output_info = output->clone()->set_tensor_shape(exp_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &exp_output_info); @@ -76,28 +80,33 @@ CLStridedSliceKernel::CLStridedSliceKernel() _type = CLKernelType::ELEMENTWISE; } -void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - auto padding_info = get_padding_info({ input, output }); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)); + auto padding_info = get_padding_info({input, output}); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)); const TensorShape &input_shape = 
input->tensor_shape(); Coordinates starts_abs; Coordinates ends_abs; Coordinates final_strides; - std::tie(starts_abs, ends_abs, final_strides) = arm_compute::helpers::tensor_transform::calculate_strided_slice_coords( - input_shape, - starts, ends, strides, - begin_mask, end_mask, shrink_axis_mask); + std::tie(starts_abs, ends_abs, final_strides) = + arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(input_shape, starts, ends, strides, + begin_mask, end_mask, shrink_axis_mask); // Configure kernel window - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input, - starts, ends, strides, - begin_mask, end_mask, shrink_axis_mask); + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape( + *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape)); Window win = calculate_max_window(*output, Steps()); @@ -108,29 +117,33 @@ void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, co const bool multi_access_x = !is_shrink_on_x && (final_strides.x() == 1) && (output_width_x / vec_size_x > 0); // Update window if needed - if(multi_access_x) + if (multi_access_x) { Window &updated_window = win; updated_window.set(Window::DimX, - Window::Dimension(updated_window.x().start(), ceil_to_multiple(updated_window.x().end(), vec_size_x), vec_size_x)); + Window::Dimension(updated_window.x().start(), + ceil_to_multiple(updated_window.x().end(), vec_size_x), vec_size_x)); } ICLKernel::configure_internal(win); // Create build options CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->data_type()))); - for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i) + build_opts.add_option("-DDATA_TYPE=" + + get_cl_unsigned_type_from_element_size(data_size_from_type(input->data_type()))); + for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i) { const bool is_shrink = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, i); - build_opts.add_option("-DSTART_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(starts_abs[i])); - build_opts.add_option("-DSTRIDE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(final_strides[i])); + build_opts.add_option("-DSTART_" + support::cpp11::to_string(i) + "=" + + support::cpp11::to_string(starts_abs[i])); + build_opts.add_option("-DSTRIDE_" + support::cpp11::to_string(i) + "=" + + support::cpp11::to_string(final_strides[i])); build_opts.add_option_if(is_shrink, "-DSHRINK_" + support::cpp11::to_string(i)); } - build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max(output_width_x - vec_size_x, 0))); + build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string( + std::max(output_width_x - vec_size_x, 0))); build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); build_opts.add_option_if_else(input_shape.num_dimensions() > 2, - "-DSRC_DEPTH=" + support::cpp11::to_string(input_shape.z()), - "-DSRC_DEPTH=1"); + "-DSRC_DEPTH=" + support::cpp11::to_string(input_shape.z()), "-DSRC_DEPTH=1"); build_opts.add_option_if_else(output->num_dimensions() > 2, "-DDST_DEPTH=" + support::cpp11::to_string(output->tensor_shape().z()), "-DDST_DEPTH=1"); @@ -142,7 +155,7 @@ void CLStridedSliceKernel::configure(const CLCompileContext 
&compile_context, co _config_id = "strided_slice"; _config_id += "_"; _config_id += lower_string(string_from_data_type(input->data_type())); - for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i) + for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i) { _config_id += "_"; _config_id += support::cpp11::to_string(input->dimension(i)); @@ -156,11 +169,17 @@ void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, co ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status CLStridedSliceKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)); return Status{}; } @@ -170,8 +189,9 @@ void CLStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, cl ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); Window slice = window_collapsed.first_slice_window_4D(); @@ -182,7 +202,6 @@ void CLStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, cl add_4D_tensor_argument(idx, src, slice); add_4D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_4D(slice)); + } while (window_collapsed.slide_window_slice_4D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLStridedSliceKernel.h b/src/core/CL/kernels/CLStridedSliceKernel.h index 4c201504f5e3412532e917873849167217dbae14..1cf5bcacec157afe0b987bd8c6982886c906332f 100644 --- a/src/core/CL/kernels/CLStridedSliceKernel.h +++ b/src/core/CL/kernels/CLStridedSliceKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_STRIDED_SLICE_KERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" #include @@ -53,9 +54,15 @@ public: * @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved. 
*/ - void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask); /** Static function to check if given info will lead to a valid configuration of @ref CLStridedSliceKernel * @@ -71,9 +78,14 @@ public: * @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved. */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLTileKernel.cpp b/src/core/CL/kernels/CLTileKernel.cpp index 3e7015cfd22492c94b242a61989dffa32057c545..fa996c4008ca8cd20fa0c984ddabd8b5cc60f9d3 100644 --- a/src/core/CL/kernels/CLTileKernel.cpp +++ b/src/core/CL/kernels/CLTileKernel.cpp @@ -22,9 +22,11 @@ * SOFTWARE. */ #include "src/core/CL/kernels/CLTileKernel.h" + #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" @@ -39,15 +41,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON(multiples.size() > 4); ARM_COMPUTE_RETURN_ERROR_ON(multiples.empty()); - ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e) - { - return e == 0; - })); + ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e) { return e == 0; })); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } @@ -55,8 +55,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c } } // namespace -CLTileKernel::CLTileKernel() - : _input(nullptr), _output(nullptr) +CLTileKernel::CLTileKernel() : _input(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } @@ -66,7 +65,10 @@ void CLTileKernel::configure(const ICLTensor *input, ICLTensor *output, const Mu configure(CLKernelLibrary::get().get_compile_context(), input, output, 
multiples); } -void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples) +void CLTileKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Multiples &multiples) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -104,15 +106,14 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT // Configure window without padding Window win = calculate_max_window(*output->info()); - if(multi_access_x) + if (multi_access_x) { // If multi-access is enabled, no thread should cross the tile boundaries. This means we need // as many threads as those to cover a single tile times multiples[0]. Note that if threads // do not cross the boundaries of the tiles, they won't cross the boundaries of the last tile, and // we don't need to pad the output const unsigned int size_win_x = ceil_to_multiple(input->info()->dimension(0), vec_size_x) * multiples[0]; - win.set(Window::DimX, - Window::Dimension(win.x().start(), size_win_x, vec_size_x)); + win.set(Window::DimX, Window::Dimension(win.x().start(), size_win_x, vec_size_x)); } ICLKernel::configure_internal(win); @@ -121,7 +122,7 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT _config_id = "tile"; _config_id += "_"; _config_id += lower_string(string_from_data_type(input->info()->data_type())); - for(unsigned int i = 0; i < multiples.size(); ++i) + for (unsigned int i = 0; i < multiples.size(); ++i) { _config_id += "_"; _config_id += support::cpp11::to_string(input->info()->dimension(i)); @@ -150,7 +151,6 @@ void CLTileKernel::run(const Window &window, cl::CommandQueue &queue) add_4D_tensor_argument(idx, _input, slice); add_4D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_4D(slice)); + } while (collapsed.slide_window_slice_4D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLTileKernel.h b/src/core/CL/kernels/CLTileKernel.h index 41752ca90b74eee472470df48b462d968da767e6..c3486aecef48d97d0ecef32122e3f7d8d646ab9e 100644 --- a/src/core/CL/kernels/CLTileKernel.h +++ b/src/core/CL/kernels/CLTileKernel.h @@ -64,7 +64,10 @@ public: * @param[out] output Destination tensor. Same as @p input * */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Multiples &multiples); /** Static function to check if given info will lead to a valid configuration of @ref CLTileKernel * * @param[in] input Source tensor info. Data type supported: All. 
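Side note, not part of the patch: the strided-slice hunks above only reformat the existing interface, so the sketch below is a hedged illustration of that validate() signature. The input shape, slice coordinates, strides and masks are assumptions, and the empty output TensorInfo relies on the same auto-initialisation path used by configure(); only the signature is taken from the patch.

    // Sketch only: checking a strided-slice configuration. Shapes, slice bounds,
    // strides and masks are assumptions; the signature is from the patch.
    #include "arm_compute/core/Coordinates.h"
    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/TensorShape.h"
    #include "arm_compute/core/Types.h"

    #include "src/core/CL/kernels/CLStridedSliceKernel.h"

    using namespace arm_compute;

    bool strided_slice_is_valid()
    {
        // 3-D input; the output is left empty so validate() derives the expected
        // strided-slice shape itself, as configure() does before auto-initialising.
        TensorInfo input(TensorShape(16U, 8U, 4U), 1, DataType::F32);
        TensorInfo output;

        const Coordinates starts(0, 0, 0);
        const Coordinates ends(8, 8, 4);
        const BiStrides   strides(1, 2, 1); // zero strides are rejected by validate_arguments()

        // No begin/end/shrink-axis mask bits set: starts/ends are taken literally.
        const Status status = CLStridedSliceKernel::validate(&input, &output, starts, ends, strides,
                                                             /* begin_mask */ 0, /* end_mask */ 0,
                                                             /* shrink_axis_mask */ 0);
        return status.error_code() == ErrorCode::OK;
    }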
diff --git a/src/core/CPP/CPPTypes.cpp b/src/core/CPP/CPPTypes.cpp index 6a3f66fd5ac457415d43a255ace0883f7e7860ca..9980db42f353240f5f6698a18ed9ae8be3ac0100 100644 --- a/src/core/CPP/CPPTypes.cpp +++ b/src/core/CPP/CPPTypes.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CPP/CPPTypes.h" #include "arm_compute/core/Error.h" + #include "src/common/cpuinfo/CpuInfo.h" #include "src/common/cpuinfo/CpuIsaInfo.h" @@ -43,8 +44,7 @@ CPUInfo &CPUInfo::get() return _cpuinfo; } -CPUInfo::CPUInfo() - : _impl(std::make_unique()) +CPUInfo::CPUInfo() : _impl(std::make_unique()) { _impl->info = cpuinfo::CpuInfo::build(); } diff --git a/src/core/CPP/Validate.h b/src/core/CPP/Validate.h index df192b51311c1de4489420f096d986b01b60d86c..fe253508cf275ba92c85ee27b7795254b0cd1796 100644 --- a/src/core/CPP/Validate.h +++ b/src/core/CPP/Validate.h @@ -38,8 +38,8 @@ namespace arm_compute * * @return Status */ -inline Status error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line, - const ITensorInfo *tensor_info) +inline Status +error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line, const ITensorInfo *tensor_info) { bool fp16_kernels_enabled = false; #if defined(ARM_COMPUTE_ENABLE_FP16) && defined(ENABLE_FP16_KERNELS) @@ -47,8 +47,9 @@ inline Status error_on_unsupported_cpu_fp16(const char *function, const char *fi #endif /* defined(ARM_COMPUTE_ENABLE_FP16) && defined(ENABLE_FP16_KERNELS) */ ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line); - ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG((tensor_info->data_type() == DataType::F16) && (!CPUInfo::get().has_fp16() || !fp16_kernels_enabled), - function, file, line, "This CPU architecture does not support F16 data type, you need v8.2 or above"); + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG( + (tensor_info->data_type() == DataType::F16) && (!CPUInfo::get().has_fp16() || !fp16_kernels_enabled), function, + file, line, "This CPU architecture does not support F16 data type, you need v8.2 or above"); return Status{}; } @@ -61,8 +62,8 @@ inline Status error_on_unsupported_cpu_fp16(const char *function, const char *fi * * @return Status */ -inline Status error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line, - const ITensorInfo *tensor_info) +inline Status +error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line, const ITensorInfo *tensor_info) { bool bf16_kernels_enabled = false; #if defined(ARM_COMPUTE_ENABLE_BF16) @@ -70,8 +71,9 @@ inline Status error_on_unsupported_cpu_bf16(const char *function, const char *fi #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line); - ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG((tensor_info->data_type() == DataType::BFLOAT16) && (!CPUInfo::get().has_bf16() || !bf16_kernels_enabled), - function, file, line, "This CPU architecture does not support BFloat16 data type, you need v8.6 or above"); + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG( + (tensor_info->data_type() == DataType::BFLOAT16) && (!CPUInfo::get().has_bf16() || !bf16_kernels_enabled), + function, file, line, "This CPU architecture does not support BFloat16 data type, you need v8.6 or above"); return Status{}; } @@ -84,8 +86,8 @@ inline Status error_on_unsupported_cpu_bf16(const char *function, const char *fi * * @return Status */ -inline Status error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line, - const ITensor *tensor) +inline Status 
+error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line, const ITensor *tensor) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line); ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_fp16(function, file, line, tensor->info())); @@ -101,8 +103,8 @@ inline Status error_on_unsupported_cpu_fp16(const char *function, const char *fi * * @return Status */ -inline Status error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line, - const ITensor *tensor) +inline Status +error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line, const ITensor *tensor) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line); ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_bf16(function, file, line, tensor->info())); diff --git a/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp index 0f405d8e83c79b8dd9316e7c14e2a31949148dbc..02686eb4f6ad91b1b391a18ae8d2773211283eff 100644 --- a/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp +++ b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp @@ -24,6 +24,7 @@ #include "arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h" #include "arm_compute/core/Helpers.h" + #include "src/core/helpers/WindowHelpers.h" #include @@ -34,7 +35,11 @@ namespace arm_compute namespace { template -std::vector SoftNMS(const ITensor *proposals, std::vector> &scores_in, std::vector inds, const BoxNMSLimitInfo &info, int class_id) +std::vector SoftNMS(const ITensor *proposals, + std::vector> &scores_in, + std::vector inds, + const BoxNMSLimitInfo &info, + int class_id) { std::vector keep; const int proposals_width = proposals->info()->dimension(1); @@ -45,7 +50,7 @@ std::vector SoftNMS(const ITensor *proposals, std::vector> & std::vector y2(proposals_width); std::vector areas(proposals_width); - for(int i = 0; i < proposals_width; ++i) + for (int i = 0; i < proposals_width; ++i) { x1[i] = *reinterpret_cast(proposals->ptr_to_element(Coordinates(class_id * 4, i))); y1[i] = *reinterpret_cast(proposals->ptr_to_element(Coordinates(class_id * 4 + 1, i))); @@ -56,13 +61,13 @@ std::vector SoftNMS(const ITensor *proposals, std::vector> & // Note: Soft NMS scores have already been initialized with input scores - while(!inds.empty()) + while (!inds.empty()) { // Find proposal with max score among remaining proposals int max_pos = 0; - for(unsigned int i = 1; i < inds.size(); ++i) + for (unsigned int i = 1; i < inds.size(); ++i) { - if(scores_in[class_id][inds.at(i)] > scores_in[class_id][inds.at(max_pos)]) + if (scores_in[class_id][inds.at(i)] > scores_in[class_id][inds.at(max_pos)]) { max_pos = i; } @@ -75,7 +80,7 @@ std::vector SoftNMS(const ITensor *proposals, std::vector> & inds.erase(inds.begin()); std::vector sorted_indices_temp; - for(auto idx : inds) + for (auto idx : inds) { const auto xx1 = std::max(x1[idx], x1[element]); const auto yy1 = std::max(y1[idx], y1[element]); @@ -89,7 +94,7 @@ std::vector SoftNMS(const ITensor *proposals, std::vector> & // Update scores based on computed IoU, overlap threshold and NMS method T weight; - switch(info.soft_nms_method()) + switch (info.soft_nms_method()) { case NMSType::LINEAR: weight = (ovr > info.nms()) ? 
(1.f - ovr) : 1.f; @@ -106,7 +111,7 @@ std::vector SoftNMS(const ITensor *proposals, std::vector> & // Discard boxes with new scores below min threshold and update pending indices scores_in[class_id][idx] *= weight; - if(scores_in[class_id][idx] >= info.soft_nms_min_score_thres()) + if (scores_in[class_id][idx] >= info.soft_nms_min_score_thres()) { sorted_indices_temp.push_back(idx); } @@ -118,7 +123,10 @@ std::vector SoftNMS(const ITensor *proposals, std::vector> & } template -std::vector NonMaximaSuppression(const ITensor *proposals, std::vector sorted_indices, const BoxNMSLimitInfo &info, int class_id) +std::vector NonMaximaSuppression(const ITensor *proposals, + std::vector sorted_indices, + const BoxNMSLimitInfo &info, + int class_id) { std::vector keep; @@ -130,7 +138,7 @@ std::vector NonMaximaSuppression(const ITensor *proposals, std::vector std::vector y2(proposals_width); std::vector areas(proposals_width); - for(int i = 0; i < proposals_width; ++i) + for (int i = 0; i < proposals_width; ++i) { x1[i] = *reinterpret_cast(proposals->ptr_to_element(Coordinates(class_id * 4, i))); y1[i] = *reinterpret_cast(proposals->ptr_to_element(Coordinates(class_id * 4 + 1, i))); @@ -139,7 +147,7 @@ std::vector NonMaximaSuppression(const ITensor *proposals, std::vector areas[i] = (x2[i] - x1[i] + 1.0) * (y2[i] - y1[i] + 1.0); } - while(!sorted_indices.empty()) + while (!sorted_indices.empty()) { int i = sorted_indices.at(0); keep.push_back(i); @@ -148,7 +156,7 @@ std::vector NonMaximaSuppression(const ITensor *proposals, std::vector std::vector new_indices; sorted_indices_temp.erase(sorted_indices_temp.begin()); - for(unsigned int j = 0; j < sorted_indices_temp.size(); ++j) + for (unsigned int j = 0; j < sorted_indices_temp.size(); ++j) { const float xx1 = std::max(x1[sorted_indices_temp.at(j)], x1[i]); const float yy1 = std::max(y1[sorted_indices_temp.at(j)], y1[i]); @@ -163,8 +171,9 @@ std::vector NonMaximaSuppression(const ITensor *proposals, std::vector const float ctr_y = yy1 + (h / 2); // If suppress_size is specified, filter the boxes based on their size and position - const bool keep_size = !info.suppress_size() || (w >= info.min_size() && h >= info.min_size() && ctr_x < info.im_width() && ctr_y < info.im_height()); - if(ovr <= info.nms() && keep_size) + const bool keep_size = !info.suppress_size() || (w >= info.min_size() && h >= info.min_size() && + ctr_x < info.im_width() && ctr_y < info.im_height()); + if (ovr <= info.nms() && keep_size) { new_indices.push_back(j); } @@ -172,7 +181,7 @@ std::vector NonMaximaSuppression(const ITensor *proposals, std::vector const unsigned int new_indices_size = new_indices.size(); std::vector new_sorted_indices(new_indices_size); - for(unsigned int i = 0; i < new_indices_size; ++i) + for (unsigned int i = 0; i < new_indices_size; ++i) { new_sorted_indices[i] = sorted_indices[new_indices[i] + 1]; } @@ -184,7 +193,15 @@ std::vector NonMaximaSuppression(const ITensor *proposals, std::vector } // namespace CPPBoxWithNonMaximaSuppressionLimitKernel::CPPBoxWithNonMaximaSuppressionLimitKernel() - : _scores_in(nullptr), _boxes_in(nullptr), _batch_splits_in(nullptr), _scores_out(nullptr), _boxes_out(nullptr), _classes(nullptr), _batch_splits_out(nullptr), _keeps(nullptr), _keeps_size(nullptr), + : _scores_in(nullptr), + _boxes_in(nullptr), + _batch_splits_in(nullptr), + _scores_out(nullptr), + _boxes_out(nullptr), + _classes(nullptr), + _batch_splits_out(nullptr), + _keeps(nullptr), + _keeps_size(nullptr), _info() { } @@ -197,7 +214,7 @@ bool 
CPPBoxWithNonMaximaSuppressionLimitKernel::is_parallelisable() const template void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit() { - const int batch_size = _batch_splits_in == nullptr ? 1 : _batch_splits_in->info()->dimension(0); + const int batch_size = _batch_splits_in == nullptr ? 1 : _batch_splits_in->info()->dimension(0); const int num_classes = _scores_in->info()->dimension(0); const int scores_count = _scores_in->info()->dimension(1); std::vector total_keep_per_batch(batch_size); @@ -205,51 +222,48 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit() int total_keep_count = 0; std::vector> in_scores(num_classes, std::vector(scores_count)); - for(int i = 0; i < scores_count; ++i) + for (int i = 0; i < scores_count; ++i) { - for(int j = 0; j < num_classes; ++j) + for (int j = 0; j < num_classes; ++j) { in_scores[j][i] = *reinterpret_cast(_scores_in->ptr_to_element(Coordinates(j, i))); } } int cur_start_idx = 0; - for(int b = 0; b < batch_size; ++b) + for (int b = 0; b < batch_size; ++b) { // Skip first class if there is more than 1 except if the number of classes is 1. const int j_start = (num_classes == 1 ? 0 : 1); - for(int j = j_start; j < num_classes; ++j) + for (int j = j_start; j < num_classes; ++j) { std::vector cur_scores(scores_count); std::vector inds; - for(int i = 0; i < scores_count; ++i) + for (int i = 0; i < scores_count; ++i) { const T score = in_scores[j][i]; cur_scores[i] = score; - if(score > _info.score_thresh()) + if (score > _info.score_thresh()) { inds.push_back(i); } } - if(_info.soft_nms_enabled()) + if (_info.soft_nms_enabled()) { keeps[j] = SoftNMS(_boxes_in, in_scores, inds, _info, j); } else { std::sort(inds.data(), inds.data() + inds.size(), - [&cur_scores](int lhs, int rhs) - { - return cur_scores[lhs] > cur_scores[rhs]; - }); + [&cur_scores](int lhs, int rhs) { return cur_scores[lhs] > cur_scores[rhs]; }); keeps[j] = NonMaximaSuppression(_boxes_in, inds, _info, j); } total_keep_count += keeps[j].size(); } - if(_info.detections_per_im() > 0 && total_keep_count > _info.detections_per_im()) + if (_info.detections_per_im() > 0 && total_keep_count > _info.detections_per_im()) { // merge all scores (represented by indices) together and sort auto get_all_scores_sorted = [&in_scores, &keeps, total_keep_count]() @@ -257,10 +271,10 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit() std::vector ret(total_keep_count); int ret_idx = 0; - for(unsigned int i = 1; i < keeps.size(); ++i) + for (unsigned int i = 1; i < keeps.size(); ++i) { auto &cur_keep = keeps[i]; - for(auto &ckv : cur_keep) + for (auto &ckv : cur_keep) { ret[ret_idx++] = in_scores[i][ckv]; } @@ -273,13 +287,13 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit() auto all_scores_sorted = get_all_scores_sorted(); const T image_thresh = all_scores_sorted[all_scores_sorted.size() - _info.detections_per_im()]; - for(int j = 1; j < num_classes; ++j) + for (int j = 1; j < num_classes; ++j) { auto &cur_keep = keeps[j]; std::vector new_keeps_j; - for(auto &k : cur_keep) + for (auto &k : cur_keep) { - if(in_scores[j][k] >= image_thresh) + if (in_scores[j][k] >= image_thresh) { new_keeps_j.push_back(k); } @@ -293,40 +307,52 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit() // Write results int cur_out_idx = 0; - for(int j = j_start; j < num_classes; ++j) + for (int j = j_start; j < num_classes; ++j) { - auto &cur_keep = keeps[j]; - auto cur_out_scores = reinterpret_cast(_scores_out->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx))); - 
auto cur_out_classes = reinterpret_cast(_classes->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx))); - const int box_column = (cur_start_idx + cur_out_idx) * 4; - - for(unsigned int k = 0; k < cur_keep.size(); ++k) + auto &cur_keep = keeps[j]; + auto cur_out_scores = + reinterpret_cast(_scores_out->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx))); + auto cur_out_classes = + reinterpret_cast(_classes->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx))); + const int box_column = (cur_start_idx + cur_out_idx) * 4; + + for (unsigned int k = 0; k < cur_keep.size(); ++k) { - cur_out_scores[k] = in_scores[j][cur_keep[k]]; - cur_out_classes[k] = static_cast(j); - auto cur_out_box_row0 = reinterpret_cast(_boxes_out->ptr_to_element(Coordinates(box_column + 0, k))); - auto cur_out_box_row1 = reinterpret_cast(_boxes_out->ptr_to_element(Coordinates(box_column + 1, k))); - auto cur_out_box_row2 = reinterpret_cast(_boxes_out->ptr_to_element(Coordinates(box_column + 2, k))); - auto cur_out_box_row3 = reinterpret_cast(_boxes_out->ptr_to_element(Coordinates(box_column + 3, k))); - *cur_out_box_row0 = *reinterpret_cast(_boxes_in->ptr_to_element(Coordinates(j * 4 + 0, cur_keep[k]))); - *cur_out_box_row1 = *reinterpret_cast(_boxes_in->ptr_to_element(Coordinates(j * 4 + 1, cur_keep[k]))); - *cur_out_box_row2 = *reinterpret_cast(_boxes_in->ptr_to_element(Coordinates(j * 4 + 2, cur_keep[k]))); - *cur_out_box_row3 = *reinterpret_cast(_boxes_in->ptr_to_element(Coordinates(j * 4 + 3, cur_keep[k]))); + cur_out_scores[k] = in_scores[j][cur_keep[k]]; + cur_out_classes[k] = static_cast(j); + auto cur_out_box_row0 = + reinterpret_cast(_boxes_out->ptr_to_element(Coordinates(box_column + 0, k))); + auto cur_out_box_row1 = + reinterpret_cast(_boxes_out->ptr_to_element(Coordinates(box_column + 1, k))); + auto cur_out_box_row2 = + reinterpret_cast(_boxes_out->ptr_to_element(Coordinates(box_column + 2, k))); + auto cur_out_box_row3 = + reinterpret_cast(_boxes_out->ptr_to_element(Coordinates(box_column + 3, k))); + *cur_out_box_row0 = + *reinterpret_cast(_boxes_in->ptr_to_element(Coordinates(j * 4 + 0, cur_keep[k]))); + *cur_out_box_row1 = + *reinterpret_cast(_boxes_in->ptr_to_element(Coordinates(j * 4 + 1, cur_keep[k]))); + *cur_out_box_row2 = + *reinterpret_cast(_boxes_in->ptr_to_element(Coordinates(j * 4 + 2, cur_keep[k]))); + *cur_out_box_row3 = + *reinterpret_cast(_boxes_in->ptr_to_element(Coordinates(j * 4 + 3, cur_keep[k]))); } cur_out_idx += cur_keep.size(); } - if(_keeps != nullptr) + if (_keeps != nullptr) { cur_out_idx = 0; - for(int j = 0; j < num_classes; ++j) + for (int j = 0; j < num_classes; ++j) { - for(unsigned int i = 0; i < keeps[j].size(); ++i) + for (unsigned int i = 0; i < keeps[j].size(); ++i) { - *reinterpret_cast(_keeps->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx + i))) = static_cast(keeps[j].at(i)); + *reinterpret_cast(_keeps->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx + i))) = + static_cast(keeps[j].at(i)); } - *reinterpret_cast(_keeps_size->ptr_to_element(Coordinates(j + b * num_classes))) = keeps[j].size(); + *reinterpret_cast(_keeps_size->ptr_to_element(Coordinates(j + b * num_classes))) = + keeps[j].size(); cur_out_idx += keeps[j].size(); } } @@ -334,17 +360,25 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit() cur_start_idx += total_keep_count; } - if(_batch_splits_out != nullptr) + if (_batch_splits_out != nullptr) { - for(int b = 0; b < batch_size; ++b) + for (int b = 0; b < batch_size; ++b) { 
*reinterpret_cast(_batch_splits_out->ptr_to_element(Coordinates(b))) = total_keep_per_batch[b]; } } } -void CPPBoxWithNonMaximaSuppressionLimitKernel::configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes, - ITensor *batch_splits_out, ITensor *keeps, ITensor *keeps_size, const BoxNMSLimitInfo info) +void CPPBoxWithNonMaximaSuppressionLimitKernel::configure(const ITensor *scores_in, + const ITensor *boxes_in, + const ITensor *batch_splits_in, + ITensor *scores_out, + ITensor *boxes_out, + ITensor *classes, + ITensor *batch_splits_out, + ITensor *keeps, + ITensor *keeps_size, + const BoxNMSLimitInfo info) { ARM_COMPUTE_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::F16, DataType::F32); @@ -352,25 +386,28 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::configure(const ITensor *scores_ const unsigned int num_classes = scores_in->info()->dimension(0); ARM_COMPUTE_UNUSED(num_classes); - ARM_COMPUTE_ERROR_ON_MSG((4 * num_classes) != boxes_in->info()->dimension(0), "First dimension of input boxes must be of size 4*num_classes"); - ARM_COMPUTE_ERROR_ON_MSG(scores_in->info()->dimension(1) != boxes_in->info()->dimension(1), "Input scores and input boxes must have the same number of rows"); + ARM_COMPUTE_ERROR_ON_MSG((4 * num_classes) != boxes_in->info()->dimension(0), + "First dimension of input boxes must be of size 4*num_classes"); + ARM_COMPUTE_ERROR_ON_MSG(scores_in->info()->dimension(1) != boxes_in->info()->dimension(1), + "Input scores and input boxes must have the same number of rows"); ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != boxes_out->info()->dimension(1)); ARM_COMPUTE_ERROR_ON(boxes_out->info()->dimension(0) != 4); ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != classes->info()->dimension(0)); - if(keeps != nullptr) + if (keeps != nullptr) { - ARM_COMPUTE_ERROR_ON_MSG(keeps_size == nullptr, "keeps_size cannot be nullptr if keeps has to be provided as output"); + ARM_COMPUTE_ERROR_ON_MSG(keeps_size == nullptr, + "keeps_size cannot be nullptr if keeps has to be provided as output"); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, keeps); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keeps_size, 1, DataType::U32); ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != keeps->info()->dimension(0)); ARM_COMPUTE_ERROR_ON(num_classes != keeps_size->info()->dimension(0)); } - if(batch_splits_in != nullptr) + if (batch_splits_in != nullptr) { ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, batch_splits_in); } - if(batch_splits_out != nullptr) + if (batch_splits_out != nullptr) { ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, batch_splits_out); } @@ -399,7 +436,7 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run(const Window &window, const ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IKernel::window(), window); - switch(_scores_in->info()->data_type()) + switch (_scores_in->info()->data_type()) { case DataType::F32: run_nmslimit(); diff --git a/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp b/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp index c1187ff2b3e214c33bb46531f823716011568328..1224ec14a713704b239a2ed56fc7376fb5220b31 100644 --- a/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp +++ b/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp @@ -35,15 +35,22 @@ namespace arm_compute 
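Both NMS kernels touched by this patch build on the same intersection-over-union test. As a reference, here is a standalone sketch of that comparison in corner format (the convention used by CPPNonMaximumSuppressionKernel below); the `Box` struct and function names are hypothetical, not library API.

// Illustrative IoU helper; `Box` is a hypothetical corner-format box (xmin, ymin, xmax, ymax).
#include <algorithm>

struct Box
{
    float xmin, ymin, xmax, ymax;
};

float iou(const Box &a, const Box &b)
{
    const float area_a = (a.xmax - a.xmin) * (a.ymax - a.ymin);
    const float area_b = (b.xmax - b.xmin) * (b.ymax - b.ymin);
    if (area_a <= 0.f || area_b <= 0.f)
    {
        return 0.f; // degenerate boxes never suppress anything
    }
    const float iw    = std::max(std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin), 0.f);
    const float ih    = std::max(std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin), 0.f);
    const float inter = iw * ih;
    return inter / (area_a + area_b - inter);
}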
{ namespace { -Status validate_arguments(const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *output_indices, unsigned int max_output_size, - const float score_threshold, const float iou_threshold) +Status validate_arguments(const ITensorInfo *bboxes, + const ITensorInfo *scores, + const ITensorInfo *output_indices, + unsigned int max_output_size, + const float score_threshold, + const float iou_threshold) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(bboxes, scores, output_indices); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bboxes, 1, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_indices, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(bboxes->num_dimensions() > 2, "The bboxes tensor must be a 2-D float tensor of shape [4, num_boxes]."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(scores->num_dimensions() > 1, "The scores tensor must be a 1-D float tensor of shape [num_boxes]."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_indices->num_dimensions() > 1, "The indices must be 1-D integer tensor of shape [M], where max_output_size <= M"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bboxes->num_dimensions() > 2, + "The bboxes tensor must be a 2-D float tensor of shape [4, num_boxes]."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(scores->num_dimensions() > 1, + "The scores tensor must be a 1-D float tensor of shape [num_boxes]."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_indices->num_dimensions() > 1, + "The indices must be 1-D integer tensor of shape [M], where max_output_size <= M"); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(bboxes, scores); ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_indices->dimension(0) == 0, "Indices tensor must be bigger than 0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(max_output_size == 0, "Max size cannot be 0"); @@ -55,15 +62,26 @@ Status validate_arguments(const ITensorInfo *bboxes, const ITensorInfo *scores, } // namespace CPPNonMaximumSuppressionKernel::CPPNonMaximumSuppressionKernel() - : _input_bboxes(nullptr), _input_scores(nullptr), _output_indices(nullptr), _max_output_size(0), _score_threshold(0.f), _iou_threshold(0.f), _num_boxes(0) + : _input_bboxes(nullptr), + _input_scores(nullptr), + _output_indices(nullptr), + _max_output_size(0), + _score_threshold(0.f), + _iou_threshold(0.f), + _num_boxes(0) { } -void CPPNonMaximumSuppressionKernel::configure(const ITensor *input_bboxes, const ITensor *input_scores, ITensor *output_indices, - unsigned int max_output_size, const float score_threshold, const float iou_threshold) +void CPPNonMaximumSuppressionKernel::configure(const ITensor *input_bboxes, + const ITensor *input_scores, + ITensor *output_indices, + unsigned int max_output_size, + const float score_threshold, + const float iou_threshold) { ARM_COMPUTE_ERROR_ON_NULLPTR(input_bboxes, input_scores, output_indices); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_bboxes->info(), input_scores->info(), output_indices->info(), max_output_size, score_threshold, iou_threshold)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_bboxes->info(), input_scores->info(), output_indices->info(), + max_output_size, score_threshold, iou_threshold)); auto_init_if_empty(*output_indices->info(), TensorShape(max_output_size), 1, DataType::U8, QuantizationInfo()); @@ -82,10 +100,15 @@ void CPPNonMaximumSuppressionKernel::configure(const ITensor *input_bboxes, cons ICPPKernel::configure(win); } -Status CPPNonMaximumSuppressionKernel::validate(const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *output_indices, - unsigned int 
max_output_size, const float score_threshold, const float iou_threshold) +Status CPPNonMaximumSuppressionKernel::validate(const ITensorInfo *bboxes, + const ITensorInfo *scores, + const ITensorInfo *output_indices, + unsigned int max_output_size, + const float score_threshold, + const float iou_threshold) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(bboxes, scores, output_indices, max_output_size, score_threshold, iou_threshold)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(bboxes, scores, output_indices, max_output_size, score_threshold, iou_threshold)); return Status{}; } @@ -99,10 +122,10 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo // Auxiliary tensors std::vector indices_above_thd; std::vector scores_above_thd; - for(unsigned int i = 0; i < _num_boxes; ++i) + for (unsigned int i = 0; i < _num_boxes; ++i) { const float score_i = *(reinterpret_cast(_input_scores->ptr_to_element(Coordinates(i)))); - if(score_i >= _score_threshold) + if (score_i >= _score_threshold) { scores_above_thd.emplace_back(score_i); indices_above_thd.emplace_back(i); @@ -114,12 +137,9 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo std::vector sorted_indices; sorted_indices.resize(num_above_thd); std::iota(sorted_indices.data(), sorted_indices.data() + num_above_thd, 0); - std::sort(std::begin(sorted_indices), - std::end(sorted_indices), + std::sort(std::begin(sorted_indices), std::end(sorted_indices), [&](unsigned int first, unsigned int second) - { - return scores_above_thd[first] > scores_above_thd[second]; - }); + { return scores_above_thd[first] > scores_above_thd[second]; }); // Number of output is the minimum between max_detection and the scores above the threshold const unsigned int num_output = std::min(_max_output_size, num_above_thd); @@ -127,19 +147,20 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo std::vector visited(num_above_thd, false); // Keep only boxes with small IoU - for(unsigned int i = 0; i < num_above_thd; ++i) + for (unsigned int i = 0; i < num_above_thd; ++i) { // Check if the output is full - if(output_idx >= num_output) + if (output_idx >= num_output) { break; } // Check if it was already visited, if not add it to the output and update the indices counter - if(!visited[sorted_indices[i]]) + if (!visited[sorted_indices[i]]) { - *(reinterpret_cast(_output_indices->ptr_to_element(Coordinates(output_idx)))) = indices_above_thd[sorted_indices[i]]; - visited[sorted_indices[i]] = true; + *(reinterpret_cast(_output_indices->ptr_to_element(Coordinates(output_idx)))) = + indices_above_thd[sorted_indices[i]]; + visited[sorted_indices[i]] = true; ++output_idx; } else @@ -148,28 +169,36 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo } // Once added one element at the output check if the next ones overlap and can be skipped - for(unsigned int j = i + 1; j < num_above_thd; ++j) + for (unsigned int j = i + 1; j < num_above_thd; ++j) { - if(!visited[sorted_indices[j]]) + if (!visited[sorted_indices[j]]) { // Calculate IoU const unsigned int i_index = indices_above_thd[sorted_indices[i]]; const unsigned int j_index = indices_above_thd[sorted_indices[j]]; // Box-corner format: xmin, ymin, xmax, ymax - const auto box_i_xmin = *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(0, i_index)))); - const auto box_i_ymin = *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(1, i_index)))); - const auto box_i_xmax = 
*(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(2, i_index)))); - const auto box_i_ymax = *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(3, i_index)))); - - const auto box_j_xmin = *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(0, j_index)))); - const auto box_j_ymin = *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(1, j_index)))); - const auto box_j_xmax = *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(2, j_index)))); - const auto box_j_ymax = *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(3, j_index)))); + const auto box_i_xmin = + *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(0, i_index)))); + const auto box_i_ymin = + *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(1, i_index)))); + const auto box_i_xmax = + *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(2, i_index)))); + const auto box_i_ymax = + *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(3, i_index)))); + + const auto box_j_xmin = + *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(0, j_index)))); + const auto box_j_ymin = + *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(1, j_index)))); + const auto box_j_xmax = + *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(2, j_index)))); + const auto box_j_ymax = + *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(3, j_index)))); const float area_i = (box_i_xmax - box_i_xmin) * (box_i_ymax - box_i_ymin); const float area_j = (box_j_xmax - box_j_xmin) * (box_j_ymax - box_j_ymin); float overlap; - if(area_i <= 0 || area_j <= 0) + if (area_i <= 0 || area_j <= 0) { overlap = 0.0f; } @@ -179,11 +208,12 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo const auto x_min_intersection = std::max(box_i_xmin, box_j_xmin); const auto y_max_intersection = std::min(box_i_ymax, box_j_ymax); const auto x_max_intersection = std::min(box_i_xmax, box_j_xmax); - const auto area_intersection = std::max(y_max_intersection - y_min_intersection, 0.0f) * std::max(x_max_intersection - x_min_intersection, 0.0f); - overlap = area_intersection / (area_i + area_j - area_intersection); + const auto area_intersection = std::max(y_max_intersection - y_min_intersection, 0.0f) * + std::max(x_max_intersection - x_min_intersection, 0.0f); + overlap = area_intersection / (area_i + area_j - area_intersection); } - if(overlap > _iou_threshold) + if (overlap > _iou_threshold) { visited[sorted_indices[j]] = true; } @@ -192,7 +222,7 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo } // The output could be full but not the output indices tensor // Instead return values not valid we put -1 - for(; output_idx < _max_output_size; ++output_idx) + for (; output_idx < _max_output_size; ++output_idx) { *(reinterpret_cast(_output_indices->ptr_to_element(Coordinates(output_idx)))) = -1; } diff --git a/src/core/CPP/kernels/CPPPermuteKernel.cpp b/src/core/CPP/kernels/CPPPermuteKernel.cpp index 054c7bf05a900cb6a7788e6db733f6bfc011101c..e68090d82bd55da240f5281c443bd465c1fedd23 100644 --- a/src/core/CPP/kernels/CPPPermuteKernel.cpp +++ b/src/core/CPP/kernels/CPPPermuteKernel.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -43,7 +44,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c const TensorShape 
output_shape = misc::shape_calculator::compute_permutation_output_shape(*input, perm); // Validate configured output - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -65,7 +66,7 @@ void CPPPermuteKernel::run_permute(const Window &window) // Create output window Window window_out(window); const Window::Dimension zero_window = Window::Dimension(0, 0, 0); - for(size_t d = 0; d <= _perm.num_dimensions(); ++d) + for (size_t d = 0; d <= _perm.num_dimensions(); ++d) { window_out.set(d, zero_window); } @@ -74,28 +75,32 @@ void CPPPermuteKernel::run_permute(const Window &window) Iterator in(_input, window); Iterator out(_output, window_out); - if(_input->info()->num_dimensions() <= 3) + if (_input->info()->num_dimensions() <= 3) { - execute_window_loop(window, [&](const Coordinates & id) - { - const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2]; - *(reinterpret_cast(out.ptr() + idx)) = *(reinterpret_cast(in.ptr())); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2]; + *(reinterpret_cast(out.ptr() + idx)) = *(reinterpret_cast(in.ptr())); + }, + in, out); } - else if(_input->info()->num_dimensions() >= 4) + else if (_input->info()->num_dimensions() >= 4) { - execute_window_loop(window, [&](const Coordinates & id) - { - const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_strides[3]; - *(reinterpret_cast(out.ptr() + idx)) = *(reinterpret_cast(in.ptr())); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + + id[3] * perm_strides[3]; + *(reinterpret_cast(out.ptr() + idx)) = *(reinterpret_cast(in.ptr())); + }, + in, out); } } -CPPPermuteKernel::CPPPermuteKernel() - : _func(), _input(nullptr), _output(nullptr), _perm() +CPPPermuteKernel::CPPPermuteKernel() : _func(), _input(nullptr), _output(nullptr), _perm() { } @@ -113,7 +118,7 @@ void CPPPermuteKernel::configure(const ITensor *input, ITensor *output, const Pe _output = output; _perm = perm; - switch(input->info()->element_size()) + switch (input->info()->element_size()) { case 1: _func = &CPPPermuteKernel::run_permute; @@ -152,7 +157,7 @@ void CPPPermuteKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); - if(_func != nullptr) + if (_func != nullptr) { (this->*_func)(window); } diff --git a/src/core/CPP/kernels/CPPTopKVKernel.cpp b/src/core/CPP/kernels/CPPTopKVKernel.cpp index d2b54e412e20a06df320a9dbea423a03f1c4313a..6ffb68e770b063752387ca50939bc29820b3f127 100644 --- a/src/core/CPP/kernels/CPPTopKVKernel.cpp +++ b/src/core/CPP/kernels/CPPTopKVKernel.cpp @@ -34,32 +34,34 @@ namespace arm_compute { namespace { -template ::value, int>::type = 0> +template ::value, int>::type = 0> inline bool greater_than(T a, T b) { const T epsilon = std::numeric_limits::epsilon(); return (a - b > epsilon); } -template < typename T, - typename std::enable_if < !utils::traits::is_floating_point::value, int >::type = 0 > +template ::value, int>::type = 0> inline bool greater_than(T a, T b) { return (a > b); } -Status 
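The index arithmetic in CPPPermuteKernel::run_permute above boils down to a dot product between the source coordinates and destination strides that have been permuted to follow the source dimension order. A minimal 3-D sketch of that computation, with hypothetical names and byte-valued strides:

// Illustrative only: destination byte offset of one element under a permutation,
// given per-source-dimension destination strides, as run_permute does above.
#include <array>
#include <cstddef>

// id:           source coordinates of the element being copied
// perm_strides: destination stride (bytes) associated with each source dimension
std::size_t permuted_offset(const std::array<int, 3> &id, const std::array<int, 3> &perm_strides)
{
    return static_cast<std::size_t>(id[0]) * perm_strides[0] +
           static_cast<std::size_t>(id[1]) * perm_strides[1] +
           static_cast<std::size_t>(id[2]) * perm_strides[2];
}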
validate_arguments(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k) +Status validate_arguments(const ITensorInfo *predictions, + const ITensorInfo *targets, + ITensorInfo *output, + const unsigned int k) { ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(predictions, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(predictions, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(targets, 1, DataType::U32); ARM_COMPUTE_RETURN_ERROR_ON(predictions->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(targets->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(targets->dimension(0) != predictions->dimension(1)); // Validate configured output - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), targets->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); @@ -72,22 +74,23 @@ Status validate_arguments(const ITensorInfo *predictions, const ITensorInfo *tar template void CPPTopKVKernel::run_topkv() { - for(unsigned int i = 0; i < _batch_size; ++i) + for (unsigned int i = 0; i < _batch_size; ++i) { - const auto target_class_id = *reinterpret_cast(_targets->ptr_to_element(Coordinates{ i })); - const auto predicted_value = *reinterpret_cast(_predictions->ptr_to_element(Coordinates{ target_class_id, i })); + const auto target_class_id = *reinterpret_cast(_targets->ptr_to_element(Coordinates{i})); + const auto predicted_value = + *reinterpret_cast(_predictions->ptr_to_element(Coordinates{target_class_id, i})); // The variable rank indicates how many values there are before the target_class_id unsigned int rank = 0; - for(unsigned int j = 0; (j < _num_classes) && (rank < _k); ++j) + for (unsigned int j = 0; (j < _num_classes) && (rank < _k); ++j) { - const auto current_prediction = *reinterpret_cast(_predictions->ptr_to_element(Coordinates{ j, i })); - if(greater_than(current_prediction, predicted_value)) + const auto current_prediction = *reinterpret_cast(_predictions->ptr_to_element(Coordinates{j, i})); + if (greater_than(current_prediction, predicted_value)) { rank++; } } - *(_output->ptr_to_element(Coordinates{ i })) = static_cast(rank < _k); + *(_output->ptr_to_element(Coordinates{i})) = static_cast(rank < _k); } } @@ -96,7 +99,10 @@ CPPTopKVKernel::CPPTopKVKernel() { } -void CPPTopKVKernel::configure(const ITensor *predictions, const ITensor *targets, ITensor *output, const unsigned int k) +void CPPTopKVKernel::configure(const ITensor *predictions, + const ITensor *targets, + ITensor *output, + const unsigned int k) { ARM_COMPUTE_ERROR_ON_NULLPTR(predictions, targets, output); @@ -115,7 +121,10 @@ void CPPTopKVKernel::configure(const ITensor *predictions, const ITensor *target ICPPKernel::configure(Window()); // Default 1 iteration window } -Status CPPTopKVKernel::validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k) +Status CPPTopKVKernel::validate(const ITensorInfo *predictions, + const ITensorInfo *targets, + ITensorInfo *output, + const unsigned int k) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(predictions, targets, output, k)); return Status{}; @@ -129,7 +138,7 @@ bool CPPTopKVKernel::is_parallelisable() const void 
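The rank computation in CPPTopKVKernel::run_topkv above amounts to counting how many classes score strictly higher than the target class and checking that count against k. A minimal sketch with plain float predictions (names hypothetical; the kernel additionally uses an epsilon-based comparison for floating-point types):

// Illustrative only: is the prediction for `target_class` among the top k values?
#include <cstddef>
#include <vector>

bool in_top_k(const std::vector<float> &predictions, std::size_t target_class, unsigned int k)
{
    const float target_value = predictions[target_class];
    unsigned int rank        = 0; // number of classes scoring strictly higher than the target
    for (std::size_t j = 0; j < predictions.size() && rank < k; ++j)
    {
        if (predictions[j] > target_value)
        {
            ++rank;
        }
    }
    return rank < k;
}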
CPPTopKVKernel::run(const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(window, info); - switch(_predictions->info()->data_type()) + switch (_predictions->info()->data_type()) { case DataType::F32: run_topkv(); diff --git a/src/core/CPP/kernels/CPPUpsampleKernel.cpp b/src/core/CPP/kernels/CPPUpsampleKernel.cpp index 7ef83fb2c49fd9023886dde40d4aa024b1c09758..b1efe324460e129a97e140b6a7b5e526a8274baf 100644 --- a/src/core/CPP/kernels/CPPUpsampleKernel.cpp +++ b/src/core/CPP/kernels/CPPUpsampleKernel.cpp @@ -24,6 +24,7 @@ #include "arm_compute/core/CPP/kernels/CPPUpsampleKernel.h" #include "arm_compute/core/Helpers.h" + #include "src/core/helpers/WindowHelpers.h" #include @@ -31,8 +32,7 @@ namespace arm_compute { -CPPUpsampleKernel::CPPUpsampleKernel() - : _input(nullptr), _output(nullptr), _info() +CPPUpsampleKernel::CPPUpsampleKernel() : _input(nullptr), _output(nullptr), _info() { } @@ -82,7 +82,7 @@ void CPPUpsampleKernel::run(const Window &window, const ThreadInfo &info) const size_t element_size = _input->info()->element_size(); // The fill value is normally 0, but for quantized types '0' corresponds to the offset - switch(_output->info()->data_type()) + switch (_output->info()->data_type()) { case DataType::QASYMM8: { @@ -102,7 +102,7 @@ void CPPUpsampleKernel::run(const Window &window, const ThreadInfo &info) // Create window Window window_out(window); - if(data_layout == DataLayout::NCHW) + if (data_layout == DataLayout::NCHW) { window_out.set(Window::DimX, Window::Dimension(start_width, end_width, stride_width)); window_out.set(Window::DimY, Window::Dimension(start_height, end_height, stride_height)); @@ -117,10 +117,7 @@ void CPPUpsampleKernel::run(const Window &window, const ThreadInfo &info) Iterator in(_input, window); Iterator out(_output, window_out); - execute_window_loop(window, [&](const Coordinates &) - { - memcpy(out.ptr(), in.ptr(), element_size); - }, - in, out); + execute_window_loop( + window, [&](const Coordinates &) { memcpy(out.ptr(), in.ptr(), element_size); }, in, out); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/core/Error.cpp b/src/core/Error.cpp index 5c8d45c987989f1d5e1aa313838c66b43d260e59..679a93f9aff5fd8649426395614b86af53251543 100644 --- a/src/core/Error.cpp +++ b/src/core/Error.cpp @@ -36,9 +36,10 @@ Status arm_compute::create_error(ErrorCode error_code, std::string msg) return Status(error_code, msg); } -Status arm_compute::create_error_msg(ErrorCode error_code, const char *func, const char *file, int line, const char *msg) +Status +arm_compute::create_error_msg(ErrorCode error_code, const char *func, const char *file, int line, const char *msg) { - std::array out{ 0 }; + std::array out{0}; snprintf(out.data(), out.size(), "in %s %s:%d: %s", func, file, line, msg); return Status(error_code, std::string(out.data())); } diff --git a/src/core/GPUTarget.cpp b/src/core/GPUTarget.cpp index 292acf86337f19768b68365513310102528f1851..2d1a13cb335b05bedfaeb32c33fdfab8fde04809 100644 --- a/src/core/GPUTarget.cpp +++ b/src/core/GPUTarget.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/core/GPUTarget.h" + #include "arm_compute/core/Log.h" #include @@ -31,47 +32,47 @@ namespace { arm_compute::GPUTarget get_valhall_target(const std::string &version) { - if(version.find("G77") != std::string::npos) + if (version.find("G77") != std::string::npos) { return arm_compute::GPUTarget::G77; } - else if(version.find("G57") != std::string::npos) + else if (version.find("G57") != std::string::npos) { return arm_compute::GPUTarget::G57; } - if(version.find("G68") != std::string::npos) + if (version.find("G68") != std::string::npos) { return arm_compute::GPUTarget::G68; } - if(version.find("G78AE") != std::string::npos) + if (version.find("G78AE") != std::string::npos) { return arm_compute::GPUTarget::G78AE; } - if(version.find("G78") != std::string::npos) + if (version.find("G78") != std::string::npos) { return arm_compute::GPUTarget::G78; } - else if(version.find("G710") != std::string::npos) + else if (version.find("G710") != std::string::npos) { return arm_compute::GPUTarget::G710; } - else if(version.find("G610") != std::string::npos) + else if (version.find("G610") != std::string::npos) { return arm_compute::GPUTarget::G610; } - else if(version.find("G510") != std::string::npos) + else if (version.find("G510") != std::string::npos) { return arm_compute::GPUTarget::G510; } - else if(version.find("G310") != std::string::npos) + else if (version.find("G310") != std::string::npos) { return arm_compute::GPUTarget::G310; } - else if(version.find("G715") != std::string::npos) + else if (version.find("G715") != std::string::npos) { return arm_compute::GPUTarget::G715; } - else if(version.find("G615") != std::string::npos) + else if (version.find("G615") != std::string::npos) { return arm_compute::GPUTarget::G615; } @@ -83,39 +84,39 @@ arm_compute::GPUTarget get_valhall_target(const std::string &version) arm_compute::GPUTarget get_bifrost_target(const std::string &version) { - if(version.find("G71") != std::string::npos) + if (version.find("G71") != std::string::npos) { return arm_compute::GPUTarget::G71; } - else if(version.find("G72") != std::string::npos) + else if (version.find("G72") != std::string::npos) { return arm_compute::GPUTarget::G72; } - else if(version.find("G51BIG") != std::string::npos) + else if (version.find("G51BIG") != std::string::npos) { return arm_compute::GPUTarget::G51BIG; } - else if(version.find("G51LIT") != std::string::npos) + else if (version.find("G51LIT") != std::string::npos) { return arm_compute::GPUTarget::G51LIT; } - else if(version.find("G51") != std::string::npos) + else if (version.find("G51") != std::string::npos) { return arm_compute::GPUTarget::G51; } - else if(version.find("G52LIT") != std::string::npos) + else if (version.find("G52LIT") != std::string::npos) { return arm_compute::GPUTarget::G52LIT; } - else if(version.find("G52") != std::string::npos) + else if (version.find("G52") != std::string::npos) { return arm_compute::GPUTarget::G52; } - else if(version.find("G76") != std::string::npos) + else if (version.find("G76") != std::string::npos) { return arm_compute::GPUTarget::G76; } - else if(version.find("G31") != std::string::npos) + else if (version.find("G31") != std::string::npos) { return arm_compute::GPUTarget::G31; } @@ -127,15 +128,15 @@ arm_compute::GPUTarget get_bifrost_target(const std::string &version) arm_compute::GPUTarget get_midgard_target(const std::string &version) { - if(version.find("T600") != std::string::npos) + if (version.find("T600") != std::string::npos) { return arm_compute::GPUTarget::T600; 
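The per-architecture helpers above (Valhall, Bifrost, Midgard) all follow the same shape: search the device version string for a product substring and fall back to UNKNOWN. A condensed sketch of that pattern, with a deliberately abbreviated table and a hypothetical enum rather than the library's GPUTarget:

// Illustrative only: map a Mali version string to a target by substring search,
// mirroring the get_*_target() helpers above (table abbreviated).
#include <string>
#include <utility>
#include <vector>

enum class Target { UNKNOWN, G77, G78, G710 };

Target target_from_version(const std::string &version)
{
    // Longer names must be tested before their prefixes (e.g. "G78AE" before "G78" in the real code).
    static const std::vector<std::pair<const char *, Target>> table = {
        {"G710", Target::G710}, {"G78", Target::G78}, {"G77", Target::G77}};
    for (const auto &entry : table)
    {
        if (version.find(entry.first) != std::string::npos)
        {
            return entry.second;
        }
    }
    return Target::UNKNOWN;
}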
} - else if(version.find("T700") != std::string::npos) + else if (version.find("T700") != std::string::npos) { return arm_compute::GPUTarget::T700; } - else if(version.find("T800") != std::string::npos) + else if (version.find("T800") != std::string::npos) { return arm_compute::GPUTarget::T800; } @@ -150,34 +151,16 @@ namespace arm_compute { const std::string &string_from_target(GPUTarget target) { - static std::map gpu_target_map = - { - { GPUTarget::MIDGARD, "midgard" }, - { GPUTarget::BIFROST, "bifrost" }, - { GPUTarget::VALHALL, "valhall" }, - { GPUTarget::T600, "t600" }, - { GPUTarget::T700, "t700" }, - { GPUTarget::T800, "t800" }, - { GPUTarget::G71, "g71" }, - { GPUTarget::G72, "g72" }, - { GPUTarget::G51, "g51" }, - { GPUTarget::G51BIG, "g51big" }, - { GPUTarget::G51LIT, "g51lit" }, - { GPUTarget::G31, "g31" }, - { GPUTarget::G76, "g76" }, - { GPUTarget::G52, "g52" }, - { GPUTarget::G52LIT, "g52lit" }, - { GPUTarget::G77, "g77" }, - { GPUTarget::G57, "g57" }, - { GPUTarget::G78, "g78" }, - { GPUTarget::G68, "g68" }, - { GPUTarget::G78AE, "g78ae" }, - { GPUTarget::G710, "g710" }, - { GPUTarget::G610, "g610" }, - { GPUTarget::G510, "g510" }, - { GPUTarget::G310, "g310" }, - { GPUTarget::G715, "g715" }, - { GPUTarget::G615, "g615" }, + static std::map gpu_target_map = { + {GPUTarget::MIDGARD, "midgard"}, {GPUTarget::BIFROST, "bifrost"}, {GPUTarget::VALHALL, "valhall"}, + {GPUTarget::T600, "t600"}, {GPUTarget::T700, "t700"}, {GPUTarget::T800, "t800"}, + {GPUTarget::G71, "g71"}, {GPUTarget::G72, "g72"}, {GPUTarget::G51, "g51"}, + {GPUTarget::G51BIG, "g51big"}, {GPUTarget::G51LIT, "g51lit"}, {GPUTarget::G31, "g31"}, + {GPUTarget::G76, "g76"}, {GPUTarget::G52, "g52"}, {GPUTarget::G52LIT, "g52lit"}, + {GPUTarget::G77, "g77"}, {GPUTarget::G57, "g57"}, {GPUTarget::G78, "g78"}, + {GPUTarget::G68, "g68"}, {GPUTarget::G78AE, "g78ae"}, {GPUTarget::G710, "g710"}, + {GPUTarget::G610, "g610"}, {GPUTarget::G510, "g510"}, {GPUTarget::G310, "g310"}, + {GPUTarget::G715, "g715"}, {GPUTarget::G615, "g615"}, }; return gpu_target_map[target]; @@ -189,7 +172,7 @@ GPUTarget get_target_from_name(const std::string &device_name) std::smatch name_parts; const bool found_mali = std::regex_search(device_name, name_parts, mali_regex); - if(!found_mali) + if (!found_mali) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Can't find valid Arm® Mali™ GPU. Target is set to default."); return GPUTarget::MIDGARD; @@ -203,22 +186,22 @@ GPUTarget get_target_from_name(const std::string &device_name) // Work-out gpu target GPUTarget gpu_target; - if(target == 'G' || is_future_gpu) + if (target == 'G' || is_future_gpu) { // Check for Valhall or Bifrost gpu_target = get_valhall_target(version); - if(gpu_target == GPUTarget::UNKNOWN) + if (gpu_target == GPUTarget::UNKNOWN) { gpu_target = get_bifrost_target(version); } // Default GPUTarget - if(gpu_target == GPUTarget::UNKNOWN) + if (gpu_target == GPUTarget::UNKNOWN) { gpu_target = GPUTarget::VALHALL; } } - else if(target == 'T') + else if (target == 'T') { gpu_target = get_midgard_target(version); } @@ -228,7 +211,7 @@ GPUTarget get_target_from_name(const std::string &device_name) } // Report in case of unknown target - if(gpu_target == GPUTarget::UNKNOWN) + if (gpu_target == GPUTarget::UNKNOWN) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Arm® Mali™ Mali GPU unknown. Target is set to the default one. 
(BIFROST)"); return GPUTarget::BIFROST; diff --git a/src/core/Helpers.cpp b/src/core/Helpers.cpp index 28e7f4c1e59048baceeaff271bc316c8838411f4..c801b097b5cc0f52d0864e8373f9066e5a6b3fcd 100644 --- a/src/core/Helpers.cpp +++ b/src/core/Helpers.cpp @@ -25,8 +25,11 @@ namespace arm_compute { -ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const TensorShape &dst_shape, - InterpolationPolicy interpolate_policy, SamplingPolicy sampling_policy, bool border_undefined) +ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, + const TensorShape &dst_shape, + InterpolationPolicy interpolate_policy, + SamplingPolicy sampling_policy, + bool border_undefined) { const DataLayout data_layout = src_info.data_layout(); const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); @@ -49,9 +52,9 @@ ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const Tens auto valid_end_out_y = std::min(std::ceil(valid_end_in_y * scale_y), dst_shape[idx_height]); // Handle valid points in case of the bi-linear interpolation - if(border_undefined) + if (border_undefined) { - switch(interpolate_policy) + switch (interpolate_policy) { case InterpolationPolicy::NEAREST_NEIGHBOR: { @@ -90,7 +93,7 @@ ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const Tens } // Setup output valid region - ValidRegion valid_region{ Coordinates(), dst_shape, dst_shape.num_dimensions() }; + ValidRegion valid_region{Coordinates(), dst_shape, dst_shape.num_dimensions()}; valid_region.anchor.set(idx_width, std::max(0, valid_start_out_x)); valid_region.anchor.set(idx_height, std::max(0, valid_start_out_y)); @@ -109,14 +112,12 @@ const std::map> &get_layout_map() constexpr DataLayoutDimension D = DataLayoutDimension::DEPTH; constexpr DataLayoutDimension N = DataLayoutDimension::BATCHES; - static const std::map> layout_map = - { - { DataLayout::NDHWC, { C, W, H, D, N } }, - { DataLayout::NCDHW, { W, H, D, C, N } }, - { DataLayout::NHWC, { C, W, H, N } }, - { DataLayout::NCHW, { W, H, C, N } } - }; + static const std::map> layout_map = { + {DataLayout::NDHWC, {C, W, H, D, N}}, + {DataLayout::NCDHW, {W, H, D, C, N}}, + {DataLayout::NHWC, {C, W, H, N}}, + {DataLayout::NCHW, {W, H, C, N}}}; return layout_map; } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/core/IAccessWindow.cpp b/src/core/IAccessWindow.cpp index 832801255fbda01dd68bfd843669ef4ad78b3c5e..923c5f8a8593a6d377e0eb1e414095d9f901e1a5 100644 --- a/src/core/IAccessWindow.cpp +++ b/src/core/IAccessWindow.cpp @@ -29,14 +29,18 @@ using namespace arm_compute; -ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, const ValidRegion &input_valid_region) const +ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, + const ValidRegion &input_valid_region) const { return compute_valid_region(window, input_valid_region, false, BorderSize(0)); } -ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const +ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, + ValidRegion input_valid_region, + bool border_undefined, + BorderSize border_size) const { - if(_info == nullptr) + if (_info == nullptr) { return input_valid_region; } @@ -45,7 +49,7 @@ ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, Va Coordinates old_anchor(anchor); TensorShape 
&shape = input_valid_region.shape; - if(!border_undefined) + if (!border_undefined) { border_size = BorderSize(0); } @@ -56,7 +60,7 @@ ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, Va // Additionally the valid region is shifted by the offset that is used by // the kernel to write back output values. anchor.set(0, std::max(window.x().start() * _scale_x, anchor[0] + border_size.left) + _x); - if(_info->num_dimensions() > 1) + if (_info->num_dimensions() > 1) { anchor.set(1, std::max(window.y().start() * _scale_y, anchor[1] + border_size.top) + _y); } @@ -69,15 +73,19 @@ ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, Va // old size is first converted into end points to compared against the // execution window. Afterwards the new end points are converted back into // a size of the region. - shape.set(0, std::min(old_anchor[0] + shape[0] - border_size.right, (window.x().end() - window.x().step()) * _scale_x + _width) - anchor[0]); - if(_info->num_dimensions() > 1) + shape.set(0, std::min(old_anchor[0] + shape[0] - border_size.right, + (window.x().end() - window.x().step()) * _scale_x + _width) - + anchor[0]); + if (_info->num_dimensions() > 1) { - shape.set(1, std::min(old_anchor[1] + shape[1] - border_size.bottom, (window.y().end() - window.y().step()) * _scale_y + _height) - anchor[1]); + shape.set(1, std::min(old_anchor[1] + shape[1] - border_size.bottom, + (window.y().end() - window.y().step()) * _scale_y + _height) - + anchor[1]); } // For higher dimensions use the intersection of the window size and the // valid region of the input - for(size_t d = 2; d < _info->num_dimensions(); ++d) + for (size_t d = 2; d < _info->num_dimensions(); ++d) { anchor.set(d, std::max(window[d].start(), input_valid_region.anchor[d])); shape.set(d, std::min(window[d].end(), input_valid_region.shape[d]) - anchor[d]); @@ -86,9 +94,12 @@ ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, Va return input_valid_region; } -void AccessWindowRectangle::set_valid_region(const Window &window, const ValidRegion &input_valid_region, bool border_undefined, const BorderSize &border_size) +void AccessWindowRectangle::set_valid_region(const Window &window, + const ValidRegion &input_valid_region, + bool border_undefined, + const BorderSize &border_size) { - if(_info != nullptr) + if (_info != nullptr) { _info->set_valid_region(compute_valid_region(window, input_valid_region, border_undefined, border_size)); } @@ -97,17 +108,16 @@ void AccessWindowRectangle::set_valid_region(const Window &window, const ValidRe bool AccessWindowRectangle::update_window_if_needed(Window &window) const { // Only update the window size if we can't use padding - if(_info == nullptr || _info->is_resizable()) + if (_info == nullptr || _info->is_resizable()) { return false; } - PaddingSize needed = get_needed_padding(window); + PaddingSize needed = get_needed_padding(window); PaddingSize available = _info->padding(); - if(needed.top <= available.top && needed.right <= available.right - && needed.bottom <= available.bottom - && needed.left <= available.left) + if (needed.top <= available.top && needed.right <= available.right && needed.bottom <= available.bottom && + needed.left <= available.left) { return false; } @@ -124,12 +134,12 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const const int max_y = (window.y().end() - window.y().step()) * _scale_y + _y + _height; // Adjust window start for Y dimension - if(min_y < 0) + if (min_y < 0) { 
// Calculate rows available above the tensor const int front_pad_y_available = -static_cast(offset_first_element / strides[1]); - if(min_y < front_pad_y_available) + if (min_y < front_pad_y_available) { // Not enough padding available, need to shrink the window int start = adjust_up(min_y, front_pad_y_available, window.y().step() * _scale_y) - _y; @@ -144,18 +154,19 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const } // Adjust window end for Y dimension - if(max_y > static_cast(shape[1])) + if (max_y > static_cast(shape[1])) { const int stride_z = _info->num_dimensions() > 2 ? strides[2] : _info->total_size(); // Calculate rows available below the tensor const int tail_pad_y_available = (stride_z / strides[1]) - shape[1] - front_pad_y; - if(static_cast(shape[1]) + tail_pad_y_available < max_y) + if (static_cast(shape[1]) + tail_pad_y_available < max_y) { // Not enough padding available, need to shrink the window - int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.y().step() * _scale_y) + window.y().step() * _scale_y - _y - _height; - end = std::max(window.y().start(), end / _scale_y); + int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.y().step() * _scale_y) + + window.y().step() * _scale_y - _y - _height; + end = std::max(window.y().start(), end / _scale_y); window.set(1, Window::Dimension(window.y().start(), end, window.y().step())); window_modified = true; @@ -170,11 +181,14 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const const int stride_y = _info->num_dimensions() > 1 ? strides[1] : _info->total_size(); // Adjust window start for X dimension - if(min_x < 0) + if (min_x < 0) { - const int front_pad_x_available = -std::min(static_cast(offset_first_element) - front_pad_y * strides[1], stride_y - shape[0] * strides[0]) / static_cast(strides[0]); + const int front_pad_x_available = + -std::min(static_cast(offset_first_element) - front_pad_y * strides[1], + stride_y - shape[0] * strides[0]) / + static_cast(strides[0]); - if(min_x < front_pad_x_available) + if (min_x < front_pad_x_available) { // Not enough padding available, need to shrink the window int start = adjust_up(min_x, front_pad_x_available, window.x().step() * _scale_x) - _x; @@ -189,15 +203,16 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const } // Adjust window end for X dimension - if(max_x > static_cast(shape[0])) + if (max_x > static_cast(shape[0])) { const int tail_pad_x_available = (stride_y / strides[0]) - shape[0] - front_pad_x; - if(static_cast(shape[0]) + tail_pad_x_available < max_x) + if (static_cast(shape[0]) + tail_pad_x_available < max_x) { // Not enough padding available, need to shrink the window - int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.x().step() * _scale_x) + window.x().step() * _scale_x - _x - _width; - end = std::max(window.x().start(), end / _scale_x); + int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.x().step() * _scale_x) + + window.x().step() * _scale_x - _x - _width; + end = std::max(window.x().start(), end / _scale_x); window.set(0, Window::Dimension(window.x().start(), end, window.x().step())); window_modified = true; @@ -212,15 +227,15 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const bool AccessWindowRectangle::update_padding_if_needed(const Window &window) { // Only update the padding if the tensor allows it - if(_info == nullptr || !_info->is_resizable()) + if (_info == nullptr || 
!_info->is_resizable()) { return false; } // Update strides in tensor info - return _info->extend_padding( get_needed_padding(window)); + return _info->extend_padding(get_needed_padding(window)); } -PaddingSize AccessWindowRectangle::get_needed_padding(const Window &window)const +PaddingSize AccessWindowRectangle::get_needed_padding(const Window &window) const { ARM_COMPUTE_ERROR_ON(_scale_x == 0); ARM_COMPUTE_ERROR_ON(_scale_y == 0); diff --git a/src/core/IKernel.cpp b/src/core/IKernel.cpp index 31f1ec7a3f81a553455f809c9b98fb624a757333..fb7e0950917f45d8a3d87ecdf6098b86c3ba1894 100644 --- a/src/core/IKernel.cpp +++ b/src/core/IKernel.cpp @@ -30,8 +30,7 @@ const Window &IKernel::window() const return _window; } -IKernel::IKernel() - : _window() +IKernel::IKernel() : _window() { // Create an empty window to make sure the children classes set the window values themselves _window.set(Window::DimX, Window::Dimension(0, 0, 1)); diff --git a/src/core/ITensor.cpp b/src/core/ITensor.cpp index 2f4354cc6f1938f8f0b8c9281b7c5789cdb9d58f..4dc8ea959b4970cc4a2a037f444c71e07990ff48 100644 --- a/src/core/ITensor.cpp +++ b/src/core/ITensor.cpp @@ -35,7 +35,7 @@ namespace arm_compute { void ITensor::copy_from(const ITensor &src) { - if(&src == this) + if (&src == this) { return; } @@ -47,7 +47,7 @@ void ITensor::copy_from(const ITensor &src) ARM_COMPUTE_ERROR_ON(src_info->num_channels() != dst_info->num_channels()); ARM_COMPUTE_ERROR_ON(src_info->element_size() != dst_info->element_size()); - for(size_t d = 0; d < src_info->num_dimensions(); d++) + for (size_t d = 0; d < src_info->num_dimensions(); d++) { ARM_COMPUTE_ERROR_ON(src_info->dimension(d) > dst_info->dimension(d)); } @@ -66,11 +66,7 @@ void ITensor::copy_from(const ITensor &src) const size_t line_size = src_info->element_size() * src_info->dimension(0); execute_window_loop( - win_src, [&](const Coordinates &) - { - memcpy(dst_it.ptr(), src_it.ptr(), line_size); - }, - src_it, dst_it); + win_src, [&](const Coordinates &) { memcpy(dst_it.ptr(), src_it.ptr(), line_size); }, src_it, dst_it); } #ifdef ARM_COMPUTE_ASSERTS_ENABLED @@ -87,10 +83,10 @@ void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const stream_status.copyfmt(s); // Set precision - if(is_data_type_float(dt) && (io_fmt.precision_type != IOFormatInfo::PrecisionType::Default)) + if (is_data_type_float(dt) && (io_fmt.precision_type != IOFormatInfo::PrecisionType::Default)) { int precision = io_fmt.precision; - if(io_fmt.precision_type == IOFormatInfo::PrecisionType::Full) + if (io_fmt.precision_type == IOFormatInfo::PrecisionType::Full) { precision = std::numeric_limits().max_digits10; } @@ -101,7 +97,7 @@ void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const size_t print_width = 0; size_t print_height = 0; int start_offset = 0; - switch(io_fmt.print_region) + switch (io_fmt.print_region) { case IOFormatInfo::PrintRegion::NoPadding: print_width = this->info()->dimension(0); @@ -111,13 +107,14 @@ void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const case IOFormatInfo::PrintRegion::ValidRegion: print_width = this->info()->valid_region().shape.x(); print_height = this->info()->valid_region().shape.y(); - start_offset = this->info()->offset_element_in_bytes(Coordinates(this->info()->valid_region().anchor.x(), - this->info()->valid_region().anchor.y())); + start_offset = this->info()->offset_element_in_bytes( + Coordinates(this->info()->valid_region().anchor.x(), this->info()->valid_region().anchor.y())); break; case IOFormatInfo::PrintRegion::Full: print_width = 
padding.left + this->info()->dimension(0) + padding.right; print_height = padding.top + this->info()->dimension(1) + padding.bottom; - start_offset = static_cast(this->info()->offset_first_element_in_bytes()) - padding.top * strides[1] - padding.left * strides[0]; + start_offset = static_cast(this->info()->offset_first_element_in_bytes()) - padding.top * strides[1] - + padding.left * strides[0]; break; default: break; @@ -129,16 +126,17 @@ void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const const uint8_t *ptr = this->buffer() + start_offset; // Start printing - for(size_t i = 0; i < slices2D; ++i) + for (size_t i = 0; i < slices2D; ++i) { // Find max_width of elements in slice to align columns int max_element_width = 0; - if(io_fmt.align_columns) + if (io_fmt.align_columns) { size_t offset = i * strides[2]; - for(size_t h = 0; h < print_height; ++h) + for (size_t h = 0; h < print_height; ++h) { - max_element_width = std::max(max_element_width, max_consecutive_elements_display_width(s, dt, ptr + offset, print_width)); + max_element_width = std::max( + max_element_width, max_consecutive_elements_display_width(s, dt, ptr + offset, print_width)); offset += strides[1]; } } @@ -146,7 +144,7 @@ void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const // Print slice { size_t offset = i * strides[2]; - for(size_t h = 0; h < print_height; ++h) + for (size_t h = 0; h < print_height; ++h) { print_consecutive_elements(s, dt, ptr + offset, print_width, max_element_width, io_fmt.element_delim); offset += strides[1]; diff --git a/src/core/ITensorPack.cpp b/src/core/ITensorPack.cpp index 90f9a45039f0d295b8f558363e25989e96c4aa3a..0f8b0824f81b973d67b1488d3e3743ca63b3c63d 100644 --- a/src/core/ITensorPack.cpp +++ b/src/core/ITensorPack.cpp @@ -27,10 +27,9 @@ namespace arm_compute { -ITensorPack::ITensorPack(std::initializer_list l) - : _pack() +ITensorPack::ITensorPack(std::initializer_list l) : _pack() { - for(auto &e : l) + for (auto &e : l) { _pack[e.id] = e; } @@ -54,7 +53,7 @@ void ITensorPack::add_const_tensor(int id, const ITensor *tensor) const ITensor *ITensorPack::get_const_tensor(int id) const { auto it = _pack.find(id); - if(it != _pack.end()) + if (it != _pack.end()) { return it->second.ctensor != nullptr ? 
it->second.ctensor : it->second.tensor; } @@ -81,4 +80,4 @@ bool ITensorPack::empty() const { return _pack.empty(); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/core/NEON/NEAsymm.h b/src/core/NEON/NEAsymm.h index e6d0e532c846eefd02e940db7d773ad56d278d2d..5f4d08d0f6cdf4e4ad972b4789c53d34ec08d56a 100644 --- a/src/core/NEON/NEAsymm.h +++ b/src/core/NEON/NEAsymm.h @@ -26,6 +26,7 @@ #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include namespace arm_compute @@ -90,7 +91,7 @@ inline uint8x16_t finalize_quantization(int32x4x4_t &in_s32, { const static int32x4_t zero_s32 = vdupq_n_s32(0); - if(result_shift < 0) + if (result_shift < 0) { in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << (-result_shift))); in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << (-result_shift))); @@ -130,18 +131,13 @@ inline uint8x16_t finalize_quantization(int32x4x4_t &in_s32, in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; + const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}}; // Convert S16 to U8 uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1])); - if(is_bounded_relu) + if (is_bounded_relu) { out_u8 = vmaxq_u8(out_u8, min_u8); out_u8 = vminq_u8(out_u8, max_u8); @@ -170,7 +166,7 @@ inline int8x16_t finalize_quantization(int32x4x4_t &in_s32, int8x16_t max_s8, bool is_bounded_relu) { - if(result_shift < 0) + if (result_shift < 0) { in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << (-result_shift))); in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << (-result_shift))); @@ -204,18 +200,13 @@ inline int8x16_t finalize_quantization(int32x4x4_t &in_s32, in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32); // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; + const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}}; // Convert S16 to S8 int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); - if(is_bounded_relu) + if (is_bounded_relu) { out_s8 = vmaxq_s8(out_s8, min_s8); out_s8 = vminq_s8(out_s8, max_s8); @@ -247,8 +238,7 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32, const static int32x4_t one_s32 = vdupq_n_s32(1); // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar - int32x4x4_t res_shift_gt0 = - { + int32x4x4_t res_shift_gt0 = { vqrdmulhq_s32(in_s32.val[0], result_fixedpoint_multiplier.val[0]), vqrdmulhq_s32(in_s32.val[1], result_fixedpoint_multiplier.val[1]), vqrdmulhq_s32(in_s32.val[2], result_fixedpoint_multiplier.val[2]), @@ -260,8 +250,7 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32, res_shift_gt0.val[2] = rounding_divide_by_pow2(res_shift_gt0.val[2], result_shift.val[2]); res_shift_gt0.val[3] = rounding_divide_by_pow2(res_shift_gt0.val[3], result_shift.val[3]); - int32x4x4_t res_shift_lt0 = - { + int32x4x4_t res_shift_lt0 = { vmulq_s32(in_s32.val[0], 
vshlq_s32(one_s32, vnegq_s32(result_shift.val[0]))), vmulq_s32(in_s32.val[1], vshlq_s32(one_s32, vnegq_s32(result_shift.val[1]))), vmulq_s32(in_s32.val[2], vshlq_s32(one_s32, vnegq_s32(result_shift.val[2]))), @@ -273,8 +262,7 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32, res_shift_lt0.val[3] = vqrdmulhq_s32(res_shift_lt0.val[3], result_fixedpoint_multiplier.val[3]); // Select result depending on shift value - const uint32x4x4_t mask_lt0 = - { + const uint32x4x4_t mask_lt0 = { #ifdef __aarch64__ vcltzq_s32(result_shift.val[0]), vcltzq_s32(result_shift.val[1]), @@ -300,18 +288,13 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32, in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32); // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; + const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}}; // Convert S16 to S8 int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); - if(is_bounded_relu) + if (is_bounded_relu) { out_s8 = vmaxq_s8(out_s8, min_s8); out_s8 = vminq_s8(out_s8, max_s8); @@ -332,15 +315,20 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32, * * @return Quantized value */ -inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier, - int32_t result_shift, int32_t result_offset_after_shift_s32, - uint8_t min_u8, uint8_t max_u8, bool is_bounded_relu) +inline uint8_t finalize_quantization(int32_t in_value, + int result_fixedpoint_multiplier, + int32_t result_shift, + int32_t result_offset_after_shift_s32, + uint8_t min_u8, + uint8_t max_u8, + bool is_bounded_relu) { int32x4_t in_s32 = vdupq_n_s32(in_value); - if(result_shift < 0) + if (result_shift < 0) { - in_value = vgetq_lane_s32(vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0); + in_value = vgetq_lane_s32( + vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0); } else { @@ -355,7 +343,7 @@ inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mul // Bound the result uint8_t out_u8 = static_cast(std::max(0, std::min(255, in_value))); - if(is_bounded_relu) + if (is_bounded_relu) { out_u8 = static_cast(std::max(min_u8, std::min(max_u8, out_u8))); } @@ -375,15 +363,20 @@ inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mul * * @return Quantized value */ -inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier, - int32_t result_shift, int32_t result_offset_after_shift_s32, - int8_t min_s8, int8_t max_s8, bool is_bounded_relu) +inline int8_t finalize_quantization(int32_t in_value, + int result_fixedpoint_multiplier, + int32_t result_shift, + int32_t result_offset_after_shift_s32, + int8_t min_s8, + int8_t max_s8, + bool is_bounded_relu) { int32x4_t in_s32 = vdupq_n_s32(in_value); - if(result_shift < 0) + if (result_shift < 0) { - in_value = vgetq_lane_s32(vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0); + in_value = vgetq_lane_s32( + vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0); } else { @@ -399,7 +392,7 @@ inline int8_t finalize_quantization(int32_t in_value, int 
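// --- Illustrative sketch, not part of the patch above ---------------------------------
// Scalar form of the fixed-point requantization that the finalize_quantization()
// overloads in this hunk implement with vqrdmulhq_s32 and a rounding shift. The helper
// names below are hypothetical stand-ins for the library's vector intrinsics.
#include <algorithm>
#include <cstdint>
#include <limits>

// Equivalent of vqrdmulhq_s32 for one lane: (a * b * 2 + rounding) >> 32, saturated.
static int32_t sat_rounding_doubling_high_mul(int32_t a, int32_t b)
{
    if (a == std::numeric_limits<int32_t>::min() && b == a)
    {
        return std::numeric_limits<int32_t>::max(); // The only case that overflows.
    }
    const int64_t ab    = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    const int64_t nudge = (ab >= 0) ? (INT64_C(1) << 30) : (1 - (INT64_C(1) << 30));
    return static_cast<int32_t>((ab + nudge) >> 31);
}

// Rounding arithmetic shift right by 'exponent' (rounds half away from zero).
static int32_t rounding_shift_right(int32_t x, int exponent)
{
    const int32_t mask      = (1 << exponent) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
    return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
}

// Scalar counterpart of finalize_quantization() for a single QASYMM8 accumulator
// (the bounded-ReLU clamp is omitted for brevity).
static uint8_t requantize_to_u8(int32_t acc, int32_t multiplier, int32_t shift, int32_t offset_after_shift)
{
    if (shift < 0)
    {
        // Negative shift: pre-scale by 2^(-shift) (wraps like vmulq_n_s32), then multiply-high.
        acc = sat_rounding_doubling_high_mul(static_cast<int32_t>(static_cast<uint32_t>(acc) << (-shift)), multiplier);
    }
    else
    {
        acc = rounding_shift_right(sat_rounding_doubling_high_mul(acc, multiplier), shift);
    }
    acc += offset_after_shift;
    return static_cast<uint8_t>(std::max(0, std::min(255, acc)));
}
// ---------------------------------------------------------------------------------------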
result_fixedpoint_mult // Bound the result int8_t out_s8 = static_cast(std::max(-128, std::min(127, in_value))); - if(is_bounded_relu) + if (is_bounded_relu) { out_s8 = static_cast(std::max(min_s8, std::min(max_s8, out_s8))); } @@ -416,17 +409,16 @@ inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mult */ inline float32x4x2_t vdequantize(const uint8x8_t &qv, const UniformQuantizationInfo &qi) { - const float scale = qi.scale; - const int offset = qi.offset; - const int32x4_t voffset = vdupq_n_s32(offset); - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x2_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(qv)))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(qv)))), voffset)), vscale), - } - }; + const float scale = qi.scale; + const int offset = qi.offset; + const int32x4_t voffset = vdupq_n_s32(offset); + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x2_t vdequantized_input = {{ + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(qv)))), voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(qv)))), voffset)), + vscale), + }}; return vdequantized_input; } @@ -439,17 +431,14 @@ inline float32x4x2_t vdequantize(const uint8x8_t &qv, const UniformQuantizationI */ inline float32x4x2_t vdequantize(const int8x8_t &qv, const UniformQuantizationInfo &qi) { - const float scale = qi.scale; - const int offset = qi.offset; - const int32x4_t voffset = vdupq_n_s32(offset); - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x2_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(qv))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(qv))), voffset)), vscale), - } - }; + const float scale = qi.scale; + const int offset = qi.offset; + const int32x4_t voffset = vdupq_n_s32(offset); + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x2_t vdequantized_input = {{ + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(qv))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(qv))), voffset)), vscale), + }}; return vdequantized_input; } @@ -462,19 +451,24 @@ inline float32x4x2_t vdequantize(const int8x8_t &qv, const UniformQuantizationIn */ inline float32x4x4_t vdequantize(const uint8x16_t &qv, const UniformQuantizationInfo &qi) { - const float scale = qi.scale; - const int offset = qi.offset; - const int32x4_t voffset = vdupq_n_s32(offset); - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x4_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale), - } - }; + const float scale = qi.scale; + const int offset = qi.offset; + const int32x4_t voffset = vdupq_n_s32(offset); + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x4_t 
vdequantized_input = {{ + vmulq_f32(vcvtq_f32_s32( + vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32( + vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32( + vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32( + vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), + vscale), + }}; return vdequantized_input; } @@ -487,19 +481,16 @@ inline float32x4x4_t vdequantize(const uint8x16_t &qv, const UniformQuantization */ inline float32x4x4_t vdequantize(const int8x16_t &qv, const UniformQuantizationInfo &qi) { - const float scale = qi.scale; - const int offset = qi.offset; - const int32x4_t voffset = vdupq_n_s32(offset); - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x4_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), - } - }; + const float scale = qi.scale; + const int offset = qi.offset; + const int32x4_t voffset = vdupq_n_s32(offset); + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x4_t vdequantized_input = {{ + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), + }}; return vdequantized_input; } @@ -513,17 +504,22 @@ inline float32x4x4_t vdequantize(const int8x16_t &qv, const UniformQuantizationI */ inline float32x4x4_t vdequantize(const uint8x16_t &qv, float scale, int32_t offset) { - const int32x4_t voffset = vdupq_n_s32(offset); - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x4_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale), - } - }; + const int32x4_t voffset = vdupq_n_s32(offset); + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x4_t vdequantized_input = {{ + vmulq_f32(vcvtq_f32_s32( + vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32( + vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32( + vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), 
voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32( + vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), + vscale), + }}; return vdequantized_input; } @@ -537,17 +533,14 @@ inline float32x4x4_t vdequantize(const uint8x16_t &qv, float scale, int32_t offs */ inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale, int32_t offset) { - const int32x4_t voffset = vdupq_n_s32(offset); - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x4_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), - } - }; + const int32x4_t voffset = vdupq_n_s32(offset); + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x4_t vdequantized_input = {{ + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), + }}; return vdequantized_input; } @@ -560,15 +553,12 @@ inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale, int32_t offse */ inline float32x4x4_t vdequantize(const int8x16_t &qv, const float32x4x4_t vscale) { - const float32x4x4_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[0]), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[1]), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[2]), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[3]), - } - }; + const float32x4x4_t vdequantized_input = {{ + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[0]), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[1]), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[2]), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[3]), + }}; return vdequantized_input; } @@ -581,16 +571,13 @@ inline float32x4x4_t vdequantize(const int8x16_t &qv, const float32x4x4_t vscale */ inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale) { - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x4_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale), - } - }; + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x4_t vdequantized_input = {{ + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale), + 
vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale), + }}; return vdequantized_input; } @@ -607,18 +594,15 @@ inline uint8x8_t vquantize(const float32x4x2_t &qv, const UniformQuantizationInf const int offset = qi.offset; const float32x4_t voffset = vdupq_n_f32(offset); const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); - const int32x4x4_t rf = - { - { + const int32x4x4_t rf = {{ #ifdef __aarch64__ - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), #else //__aarch64__ - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), #endif //__aarch64__ - } - }; + }}; return vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); } @@ -635,18 +619,15 @@ inline int8x8_t vquantize_signed(const float32x4x2_t &qv, const UniformQuantizat const int offset = qi.offset; const float32x4_t voffset = vdupq_n_f32(offset); const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); - const int32x4x4_t rf = - { - { + const int32x4x4_t rf = {{ #ifdef __aarch64__ - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), #else //__aarch64__ - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), #endif //__aarch64__ - } - }; + }}; return vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); } @@ -654,22 +635,19 @@ inline int32x4x4_t vquantize_internal(const float32x4x4_t &qv, float scale, int3 { const int32x4_t voffset = vdupq_n_s32(offset); const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); - const int32x4x4_t rf = - { - { + const int32x4x4_t rf = {{ #ifdef __aarch64__ - vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset), - vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset), - vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset), - vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset), + vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset), + vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset), + vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset), + vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset), #else //__aarch64__ - vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset), - vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset), - vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset), - vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset), + vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset), + vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset), + vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset), + 
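// --- Illustrative sketch, not part of the patch above ---------------------------------
// Scalar reference for the affine QASYMM8 mapping that the vdequantize()/vquantize()
// helpers in this hunk vectorize: real = scale * (q - offset) and
// q = round(real / scale) + offset, saturated to the 8-bit range. The vector code rounds
// with vcvtnq_s32_f32 (nearest, ties to even) or vcvtaq_s32_f32 (nearest, away from zero)
// depending on the helper; std::nearbyint below uses the default to-nearest-even mode.
#include <algorithm>
#include <cmath>
#include <cstdint>

static float dequantize_qasymm8(uint8_t q, float scale, int32_t offset)
{
    return scale * static_cast<float>(static_cast<int32_t>(q) - offset);
}

static uint8_t quantize_qasymm8(float x, float scale, int32_t offset)
{
    const int32_t q = static_cast<int32_t>(std::nearbyint(x / scale)) + offset;
    return static_cast<uint8_t>(std::max<int32_t>(0, std::min<int32_t>(255, q)));
}
// ---------------------------------------------------------------------------------------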
vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset), #endif //__aarch64__ - } - }; + }}; return rf; } @@ -715,7 +693,7 @@ inline uint16x8x2_t vquantize_qasymm16(const float32x4x4_t &qv, const UniformQua auto rf = vquantize_internal(qv, qi.scale, qi.offset); const uint16x8_t pa = vcombine_u16(vqmovun_s32(rf.val[0]), vqmovun_s32(rf.val[1])); const uint16x8_t pb = vcombine_u16(vqmovun_s32(rf.val[2]), vqmovun_s32(rf.val[3])); - return { pa, pb }; + return {pa, pb}; } } // namespace arm_compute diff --git a/src/core/NEON/NEAsymm.inl b/src/core/NEON/NEAsymm.inl index ca2aea1e18de67d75077f240e6200178be14033d..fd62fd4654d4f861b624b598f1dbcb0502d24c2b 100644 --- a/src/core/NEON/NEAsymm.inl +++ b/src/core/NEON/NEAsymm.inl @@ -51,14 +51,14 @@ inline qasymm8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t v D_f32x4 = vmlaq_f32(vo, D_f32x4, vs); // Convert float32 vectors to uint32 vectors #if __aarch64__ - if(round_policy == RoundingPolicy::TO_NEAREST_EVEN) + if (round_policy == RoundingPolicy::TO_NEAREST_EVEN) { A_u32x4 = vcvtnq_u32_f32(A_f32x4); B_u32x4 = vcvtnq_u32_f32(B_f32x4); C_u32x4 = vcvtnq_u32_f32(C_f32x4); D_u32x4 = vcvtnq_u32_f32(D_f32x4); } - else if(round_policy == RoundingPolicy::TO_NEAREST_UP) + else if (round_policy == RoundingPolicy::TO_NEAREST_UP) { A_u32x4 = vcvtaq_u32_f32(A_f32x4); B_u32x4 = vcvtaq_u32_f32(B_f32x4); @@ -86,7 +86,7 @@ inline qasymm8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t v return vcombine_u8(vqmovn_u16(vd_low_u16x8), vqmovn_u16(vd_high_u16x8)); } -template +template inline qasymm8x16_signed_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo) { // Convert uint8 vectors to int16 vectors @@ -110,14 +110,14 @@ inline qasymm8x16_signed_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x C_f32x4 = vmlaq_f32(vo, C_f32x4, vs); D_f32x4 = vmlaq_f32(vo, D_f32x4, vs); #if __aarch64__ - if(round_policy == RoundingPolicy::TO_NEAREST_EVEN) + if (round_policy == RoundingPolicy::TO_NEAREST_EVEN) { A_s32x4 = vcvtnq_s32_f32(A_f32x4); B_s32x4 = vcvtnq_s32_f32(B_f32x4); C_s32x4 = vcvtnq_s32_f32(C_f32x4); D_s32x4 = vcvtnq_s32_f32(D_f32x4); } - else if(round_policy == RoundingPolicy::TO_NEAREST_UP) + else if (round_policy == RoundingPolicy::TO_NEAREST_UP) { A_s32x4 = vcvtaq_s32_f32(A_f32x4); B_s32x4 = vcvtaq_s32_f32(B_f32x4); diff --git a/src/core/NEON/NEFixedPoint.inl b/src/core/NEON/NEFixedPoint.inl index 8bff9c4a8efc2f4584519b71183c32022ad14a5f..fb403b6d26bf77abca9a648a05fdf73d275bd3c5 100644 --- a/src/core/NEON/NEFixedPoint.inl +++ b/src/core/NEON/NEFixedPoint.inl @@ -30,13 +30,7 @@ namespace arm_compute inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b) { - float32x4x2_t res = - { - { - vmaxq_f32(a.val[0], b.val[0]), - vmaxq_f32(a.val[1], b.val[1]) - } - }; + float32x4x2_t res = {{vmaxq_f32(a.val[0], b.val[0]), vmaxq_f32(a.val[1], b.val[1])}}; return res; } #endif /* DOXYGEN_SKIP_THIS */ diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h index 75016b0db1e19fe1a0684119cb71512951cd5656..8675eec93ff8e29e7a9582acd3f0fa691bde132a 100644 --- a/src/core/NEON/NEKernels.h +++ b/src/core/NEON/NEKernels.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_NEKERNELS_H -#define ARM_COMPUTE_NEKERNELS_H +#ifndef ACL_SRC_CORE_NEON_NEKERNELS_H +#define ACL_SRC_CORE_NEON_NEKERNELS_H #include "src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h" #include "src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h" @@ -50,13 +50,13 @@ #include "src/core/NEON/kernels/NEPadLayerKernel.h" #include "src/core/NEON/kernels/NEPriorBoxLayerKernel.h" #include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h" -#include "src/core/NEON/kernels/NEROIAlignLayerKernel.h" -#include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h" #include "src/core/NEON/kernels/NERangeKernel.h" #include "src/core/NEON/kernels/NEReductionOperationKernel.h" #include "src/core/NEON/kernels/NEReorderKernel.h" #include "src/core/NEON/kernels/NEReorgLayerKernel.h" #include "src/core/NEON/kernels/NEReverseKernel.h" +#include "src/core/NEON/kernels/NEROIAlignLayerKernel.h" +#include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h" #include "src/core/NEON/kernels/NESelectKernel.h" #include "src/core/NEON/kernels/NESpaceToBatchLayerKernel.h" #include "src/core/NEON/kernels/NESpaceToDepthLayerKernel.h" @@ -64,4 +64,4 @@ #include "src/core/NEON/kernels/NEStridedSliceKernel.h" #include "src/core/NEON/kernels/NETileKernel.h" -#endif /* ARM_COMPUTE_NEKERNELS_H */ +#endif // ACL_SRC_CORE_NEON_NEKERNELS_H diff --git a/src/core/NEON/NEMath.inl b/src/core/NEON/NEMath.inl index 1cbe66937304a338d58148f8f24c6bd5217c9e8c..a5aba0bf23dd1477bbbd4d88cc7ccc2b60f46fce 100644 --- a/src/core/NEON/NEMath.inl +++ b/src/core/NEON/NEMath.inl @@ -21,6 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ + +#include "src/core/utils/Math.h" #include "support/ToolchainSupport.h" #include @@ -29,19 +31,16 @@ namespace arm_compute { /** Logarithm polynomial coefficients */ -const std::array log_tab = -{ - { - vdupq_n_f32(-2.29561495781f), - vdupq_n_f32(-2.47071170807f), - vdupq_n_f32(-5.68692588806f), - vdupq_n_f32(-0.165253549814f), - vdupq_n_f32(5.17591238022f), - vdupq_n_f32(0.844007015228f), - vdupq_n_f32(4.58445882797f), - vdupq_n_f32(0.0141278216615f), - } -}; +const std::array log_tab = {{ + vdupq_n_f32(-2.29561495781f), + vdupq_n_f32(-2.47071170807f), + vdupq_n_f32(-5.68692588806f), + vdupq_n_f32(-0.165253549814f), + vdupq_n_f32(5.17591238022f), + vdupq_n_f32(0.844007015228f), + vdupq_n_f32(4.58445882797f), + vdupq_n_f32(0.0141278216615f), +}}; /** Sin polynomial coefficients */ constexpr float te_sin_coeff2 = 0.166666666666f; // 1/(2*3) @@ -54,7 +53,7 @@ inline float32x4_t prefer_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) { #if __ARM_FEATURE_FMA return vfmaq_f32(a, b, c); -#else // __ARM_FEATURE_FMA +#else // __ARM_FEATURE_FMA return vmlaq_f32(a, b, c); #endif // __ARM_FEATURE_FMA } @@ -73,13 +72,14 @@ inline float32x4_t vroundq_rte_f32(float32x4_t val) { #ifdef __aarch64__ return vrndnq_f32(val); -#else // __aarch64__ +#else // __aarch64__ static const float32x4_t CONST_HALF_FLOAT = vdupq_n_f32(0.5f); static const float32x4_t CONST_1_FLOAT = vdupq_n_f32(1.f); static const int32x4_t CONST_1_INT = vdupq_n_s32(1); const float32x4_t floor_val = vfloorq_f32(val); const float32x4_t diff = vsubq_f32(val, floor_val); - const float32x4_t fp32_upper_limit = vreinterpretq_f32_u32(vdupq_n_u32(0x4B000000)); // 0x4B000000 = (23U + 127U) << 23U + const float32x4_t fp32_upper_limit = + vreinterpretq_f32_u32(vdupq_n_u32(0x4B000000)); // 0x4B000000 = (23U + 127U) << 23U /* * 1. 
Select the floor value when (diff<0.5 || (diff==0.5 && floor_val%2==0). @@ -95,12 +95,13 @@ inline float32x4_t vroundq_rte_f32(float32x4_t val) * Threshold upper limit with format |S|E(8bits)| Fraction(23bits) | = (23 + 127) << 23 (assuming positive sign): Adding 127, because 127 represents the actual zero in this format. */ - float32x4_t rounded_val = vbslq_f32(vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT), - vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT), - vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT),CONST_1_INT)))), - floor_val, vaddq_f32(floor_val, CONST_1_FLOAT)); + float32x4_t rounded_val = vbslq_f32( + vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT), + vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT), + vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT), CONST_1_INT)))), + floor_val, vaddq_f32(floor_val, CONST_1_FLOAT)); - float32x4_t result = vbslq_f32(vcgeq_f32(vabsq_f32(val), fp32_upper_limit), val, rounded_val); + float32x4_t result = vbslq_f32(vcgeq_f32(vabsq_f32(val), fp32_upper_limit), val, rounded_val); return result; #endif // __aarch64__ @@ -118,8 +119,8 @@ inline float32x2_t vinvsqrt_f32(float32x2_t x) inline float32x4_t vinvsqrtq_f32(float32x4_t x) { float32x4_t sqrt_reciprocal = vrsqrteq_f32(x); - sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); - sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); + sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); + sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); return sqrt_reciprocal; } @@ -152,8 +153,7 @@ inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const std::array::infinity()); const auto max_input = vdupq_n_f32(88.37f); // Approximately ln(2^127.5) @@ -224,35 +226,62 @@ inline float32x4_t vexpq_f32(float32x4_t x) #ifdef __aarch64__ inline float32x4_t verfq_f32(float32x4_t x) { - static const float erffdata[4] = { 0.278393f, 0.230389f, 0.000972f, 0.078108f }; - static const float32x4_t coeffdata = vld1q_f32(erffdata); - static const float32x4_t onev{ vdupq_n_f32(1.0f) }; + const float32x4_t max_value = vdupq_n_f32(3.9375); // 4 - 8/128 + const float32x4_t shift = vdupq_n_f32(65536); // 2^16 + const float32x4_t third = vdupq_n_f32(0.3333333333); // 1/3 + const float32x4_t one = vdupq_n_f32(1.f); + const uint32x4_t max_index = vdupq_n_u32(512); + const uint32x4_t sign_mask = vdupq_n_u32(0x7fffffff); - uint32x4_t selector = vcltzq_f32(x); + const float32x4_t x_abs = vabsq_f32(x); + + // erf(x) for x in [0, 3.9375] is approxiated as follows: + // + // erf(x) = erf(r) + scale(r) * d * (1 - r * d - 1/3 * d^2) + // + // where: + // r = floor(x * 128) / 128 + // d = x - r + // + // erf(r) and scale(r) are stored in a 513-entry lookup table. + // The LUT covers the range from 0 to 4 with the step of 1/128. + // + // Special cases: + // erf(x) = 1 for x > 3.9375 + // erf(x) = -1 for x < -3.9375 + + // Find the LUT indices by rounding the input value to the step of 1/128. + // + // `shift` is used to push out the 16 LSBs of the input value. Only 7 bits in the fraction part + // of the input value is preserved. 
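// --- Illustrative sketch, not part of the patch above ---------------------------------
// Scalar form of the piecewise erf approximation documented in the comments just above:
// erf(x) ~= erf(r) + scale(r) * d * (1 - r * d - 1/3 * d^2), with r on a 1/128 grid and
// saturation to +/-1 beyond 3.9375. The vector code reads erf(r) and scale(r) from the
// 513-entry erf_f32_lut; that table is not shown in this patch, so this sketch assumes
// scale(r) is the derivative erf'(r) = 2/sqrt(pi) * exp(-r^2) and computes both directly.
#include <cmath>

static float erf_piecewise(float x)
{
    const float x_abs = std::fabs(x);
    if (x_abs > 3.9375f) // Saturation threshold used above (4 - 8/128).
    {
        return (x < 0.0f) ? -1.0f : 1.0f;
    }
    const float r       = std::floor(x_abs * 128.0f) / 128.0f; // Grid point below x_abs.
    const float d       = x_abs - r;
    const float erf_r   = std::erf(r);
    const float scale_r = 1.1283791671f * std::exp(-r * r); // 2/sqrt(pi) * exp(-r^2), assumed.
    const float erf_abs = erf_r + scale_r * d * (1.0f - r * d - d * d / 3.0f);
    return (x < 0.0f) ? -erf_abs : erf_abs;
}
// ---------------------------------------------------------------------------------------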
+ const float32x4_t z = x_abs + shift; + const float32x4_t r = z - shift; - float32x4_t absx = vabsq_f32(x); - float32x4_t absx2 = vmulq_f32(x, x); - float32x4_t absx3 = vmulq_f32(absx2, absx); - float32x4_t absx4 = vmulq_f32(absx2, absx2); + uint32x4_t index = vreinterpretq_u32_f32(z) - vreinterpretq_u32_f32(shift); + index = vminq_u32(index, max_index); - float32x4_t denom = onev; - denom = vfmaq_laneq_f32(denom, absx, coeffdata, 0); - denom = vfmaq_laneq_f32(denom, absx2, coeffdata, 1); - denom = vfmaq_laneq_f32(denom, absx3, coeffdata, 2); - denom = vfmaq_laneq_f32(denom, absx4, coeffdata, 3); + // Lookup erf(r) and scale(r). + const float64_t entry_0 = *reinterpret_cast(&erf_f32_lut[index[0]]); + const float64_t entry_1 = *reinterpret_cast(&erf_f32_lut[index[1]]); + const float64_t entry_2 = *reinterpret_cast(&erf_f32_lut[index[2]]); + const float64_t entry_3 = *reinterpret_cast(&erf_f32_lut[index[3]]); - denom = vmulq_f32(denom, denom); - denom = vmulq_f32(denom, denom); + const float32x4_t entry_01 = vreinterpretq_f32_f64(float64x2_t{entry_0, entry_1}); + const float32x4_t entry_23 = vreinterpretq_f32_f64(float64x2_t{entry_2, entry_3}); - float32x4_t fract = onev; - fract = vdivq_f32(fract, denom); + const float32x4_t erf_r = vuzp1q_f32(entry_01, entry_23); + const float32x4_t scale_r = vuzp2q_f32(entry_01, entry_23); - float32x4_t result = onev; - result = vsubq_f32(result, fract); + // Approximate erf(x) = erf(r) + scale(r) * d * (1 - r * d - 1/3 * d^2). + const float32x4_t d = x_abs - r; + const float32x4_t d2 = d * d; - float32x4_t inverse = vnegq_f32(result); + const float32x4_t t0 = vfmaq_f32(r, third, d); // t0 = r + 1/3 * d. + const float32x4_t t1 = vfmsq_f32(d, d2, t0); // t1 = d - d2 * t0 = d * (1 - r * d - 1/3 * d^2). + const float32x4_t erf_x = vfmaq_f32(erf_r, scale_r, t1); - result = vbslq_f32(selector, inverse, result); + const float32x4_t clamped = vbslq_f32(x_abs > max_value, one, erf_x); + const float32x4_t result = vbslq_f32(sign_mask, clamped, x); return result; } @@ -287,10 +316,12 @@ inline float32x4_t vtanhq_f32(float32x4_t val) float32x4_t x = vminq_f32(vmaxq_f32(val, CONST_MIN_TANH), CONST_MAX_TANH); // x * (1 - x^2/3) if |x| < 5.e-3 or (exp2x - 1) / (exp2x + 1) otherwise - float32x4_t exp2x = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vexpq_f32(vmulq_f32(CONST_2, x)), vmulq_f32(x, x)); - float32x4_t num = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vsubq_f32(exp2x, CONST_1), vmulq_f32(CONST_1_3, exp2x)); - float32x4_t den = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vaddq_f32(exp2x, CONST_1), vsubq_f32(CONST_1, num)); - float32x4_t tanh = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vmulq_f32(num, vinvq_f32(den)), vmulq_f32(x, den)); + float32x4_t exp2x = + vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vexpq_f32(vmulq_f32(CONST_2, x)), vmulq_f32(x, x)); + float32x4_t num = + vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vsubq_f32(exp2x, CONST_1), vmulq_f32(CONST_1_3, exp2x)); + float32x4_t den = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vaddq_f32(exp2x, CONST_1), vsubq_f32(CONST_1, num)); + float32x4_t tanh = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vmulq_f32(num, vinvq_f32(den)), vmulq_f32(x, den)); return tanh; } @@ -456,30 +487,23 @@ inline float32x4x4_t convert_to_float32x4x4(const int8x16_t &in) inline void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const float32x4x3_t &in2, uint8x8x3_t &out) { - out.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[0])), - vqmovn_u32(vcvtq_u32_f32(in2.val[0])))); - 
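// --- Illustrative sketch, not part of the patch above ---------------------------------
// Scalar form of the piecewise tanh used by vtanhq_f32() in this hunk:
// x * (1 - x^2/3) for |x| < 5e-3, otherwise (e^(2x) - 1) / (e^(2x) + 1), after clamping
// the input. The clamp bound below is a placeholder; the actual CONST_MIN_TANH /
// CONST_MAX_TANH constants are defined elsewhere in NEMath.inl and are not shown here.
#include <algorithm>
#include <cmath>

static float tanh_piecewise(float x, float clamp_bound = 10.0f)
{
    x = std::max(-clamp_bound, std::min(clamp_bound, x));
    if (std::fabs(x) < 5e-3f)
    {
        return x * (1.0f - x * x / 3.0f); // Small-|x| branch from the comment above.
    }
    const float e2x = std::exp(2.0f * x);
    return (e2x - 1.0f) / (e2x + 1.0f);
}
// ---------------------------------------------------------------------------------------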
out.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[1])), - vqmovn_u32(vcvtq_u32_f32(in2.val[1])))); - out.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[2])), - vqmovn_u32(vcvtq_u32_f32(in2.val[2])))); + out.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[0])), vqmovn_u32(vcvtq_u32_f32(in2.val[0])))); + out.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[1])), vqmovn_u32(vcvtq_u32_f32(in2.val[1])))); + out.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[2])), vqmovn_u32(vcvtq_u32_f32(in2.val[2])))); } inline void convert_float32x4x4_to_uint8x16(const float32x4x4_t &in, uint8x16_t &out) { - const auto low = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[0])), - vqmovn_u32(vcvtq_u32_f32(in.val[1]))); - const auto high = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[2])), - vqmovn_u32(vcvtq_u32_f32(in.val[3]))); - out = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high)); + const auto low = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[0])), vqmovn_u32(vcvtq_u32_f32(in.val[1]))); + const auto high = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[2])), vqmovn_u32(vcvtq_u32_f32(in.val[3]))); + out = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high)); } inline void convert_float32x4x4_to_int8x16(const float32x4x4_t &in, int8x16_t &out) { - const auto low = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[0])), - vqmovn_s32(vcvtq_s32_f32(in.val[1]))); - const auto high = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[2])), - vqmovn_s32(vcvtq_s32_f32(in.val[3]))); - out = vcombine_s8(vqmovn_s16(low), vqmovn_s16(high)); + const auto low = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[0])), vqmovn_s32(vcvtq_s32_f32(in.val[1]))); + const auto high = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[2])), vqmovn_s32(vcvtq_s32_f32(in.val[3]))); + out = vcombine_s8(vqmovn_s16(low), vqmovn_s16(high)); } template <> @@ -552,8 +576,8 @@ inline float16x4_t vinvsqrt_f16(float16x4_t x) inline float16x8_t vinvsqrtq_f16(float16x8_t x) { float16x8_t sqrt_reciprocal = vrsqrteq_f16(x); - sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); - sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); + sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); + sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); return sqrt_reciprocal; } @@ -602,8 +626,8 @@ inline float16x4_t vtanh_rational_approx_f16(float16x4_t x16) inline float16x8_t vtanhq_f16(float16x8_t x) { // Split into high/low and use rational approximation on both parts exactly - const float16x8_t tanh = vcombine_f16(vtanh_rational_approx_f16(vget_low_f16(x)), - vtanh_rational_approx_f16(vget_high_f16(x))); + const float16x8_t tanh = + vcombine_f16(vtanh_rational_approx_f16(vget_low_f16(x)), vtanh_rational_approx_f16(vget_high_f16(x))); // tanh(x) == sign(x) to F16 precision for |x| >= 4.508, use sign after this const float16x8_t ONE = vdupq_n_f16(1.0f); diff --git a/src/core/NEON/NESymm.h b/src/core/NEON/NESymm.h index e6644577a17493cab07f86b2e128b76c391f7438..ec246efc8c4984a33534a8aca539a10ddc3f3f9a 100644 --- a/src/core/NEON/NESymm.h +++ b/src/core/NEON/NESymm.h @@ -25,7 +25,9 @@ #define ARM_COMPUTE_NESYMM_H #include "arm_compute/core/utils/quantization/AsymmHelpers.h" + #include "src/core/NEON/NEMath.h" + #include namespace arm_compute @@ -49,13 +51,10 @@ using qsymm16x8x2_t = 
int16x8x2_t; /**< 16 bit quantized symmetric vector with 1 * @return Quantized values */ template -int16x8_t finalize_quantization_int16(int32x4x2_t &in_s32, - int result_fixedpoint_multiplier, - int32_t result_shift, - int16x8_t min_s16, - int16x8_t max_s16) +int16x8_t finalize_quantization_int16( + int32x4x2_t &in_s32, int result_fixedpoint_multiplier, int32_t result_shift, int16x8_t min_s16, int16x8_t max_s16) { - if(result_shift < 0) + if (result_shift < 0) { in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << -result_shift)); in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << -result_shift)); @@ -76,7 +75,7 @@ int16x8_t finalize_quantization_int16(int32x4x2_t &in_s32, // Convert S32 to S16 int16x8_t out_s16 = vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])); - if(is_bounded_relu) + if (is_bounded_relu) { out_s16 = vmaxq_s16(out_s16, min_s16); out_s16 = vminq_s16(out_s16, max_s16); @@ -98,13 +97,14 @@ int16x8_t finalize_quantization_int16(int32x4x2_t &in_s32, * @return Quantized values */ template -inline int16_t finalize_quantization_int16(int32_t in_value, int result_fixedpoint_multiplier, - int32_t result_shift, int16_t min_s16, int16_t max_s16) +inline int16_t finalize_quantization_int16( + int32_t in_value, int result_fixedpoint_multiplier, int32_t result_shift, int16_t min_s16, int16_t max_s16) { - if(result_shift < 0) + if (result_shift < 0) { - const int64_t in_64 = static_cast(in_value) * (1 << (-result_shift)) * static_cast(result_fixedpoint_multiplier); - in_value = static_cast((in_64 + (1 << 30)) >> 31); + const int64_t in_64 = static_cast(in_value) * (1 << (-result_shift)) * + static_cast(result_fixedpoint_multiplier); + in_value = static_cast((in_64 + (1 << 30)) >> 31); } else { @@ -117,7 +117,7 @@ inline int16_t finalize_quantization_int16(int32_t in_value, int result_fixedpoi // Bound the result int16_t out_s16 = static_cast(std::max(-32768, std::min(32767, in_value))); - if(is_bounded_relu) + if (is_bounded_relu) { out_s16 = static_cast(std::max(min_s16, std::min(max_s16, out_s16))); } @@ -134,14 +134,9 @@ inline int16_t finalize_quantization_int16(int32_t in_value, int result_fixedpoi */ inline float32x4x2_t vdequantize_int16(const int16x8_t &qv, float scale) { - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x2_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv))), vscale), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv))), vscale) - } - }; + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x2_t vdequantized_input = {{vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv))), vscale), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv))), vscale)}}; return vdequantized_input; } @@ -156,18 +151,13 @@ inline int16x8_t vquantize_int16(const float32x4x2_t &qv, float scale) { const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); - const int32x4x2_t rf = - { - { + const int32x4x2_t rf = {{ #ifdef __aarch64__ - vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), - vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale)) + vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale)) #else //__aarch64__ - vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), - vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)) + vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)) #endif //__aarch64__ - } - }; + }}; return vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])); } @@ -180,17 +170,14 @@ inline int16x8_t vquantize_int16(const float32x4x2_t 
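// --- Illustrative sketch, not part of the patch above ---------------------------------
// The QSYMM16 helpers in this hunk (vdequantize_int16 / vquantize_int16) use a purely
// symmetric mapping -- a scale but no offset. Scalar reference, with the same
// round-to-nearest behaviour as the AArch64 vcvtnq_s32_f32 path:
#include <algorithm>
#include <cmath>
#include <cstdint>

static float dequantize_qsymm16(int16_t q, float scale)
{
    return static_cast<float>(q) * scale;
}

static int16_t quantize_qsymm16(float x, float scale)
{
    const int32_t q = static_cast<int32_t>(std::nearbyint(x / scale));
    return static_cast<int16_t>(std::max<int32_t>(-32768, std::min<int32_t>(32767, q)));
}
// ---------------------------------------------------------------------------------------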
&qv, float scale) */ inline float32x4x4_t vdequantize(const int16x8x2_t &qv, const UniformQuantizationInfo &qi) { - const float scale = qi.scale; - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x4_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[0]))), vscale), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[0]))), vscale), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[1]))), vscale), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[1]))), vscale), - } - }; + const float scale = qi.scale; + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x4_t vdequantized_input = {{ + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[0]))), vscale), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[0]))), vscale), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[1]))), vscale), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[1]))), vscale), + }}; return vdequantized_input; } @@ -206,24 +193,20 @@ inline qsymm16x8x2_t vquantize_qsymm16(const float32x4x4_t &qv, const UniformQua const float scale = qi.scale; ARM_COMPUTE_ERROR_ON(scale == 0.f); const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); - const int32x4x4_t rf = - { - { + const int32x4x4_t rf = {{ #ifdef __aarch64__ - vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), - vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), - vcvtnq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), - vcvtnq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), + vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), + vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), + vcvtnq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), + vcvtnq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), #else //__aarch64__ - vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), - vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), - vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), - vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), + vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), + vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), + vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), + vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), #endif //__aarch64__ - } - }; - const qsymm16x8x2_t res = - { + }}; + const qsymm16x8x2_t res = { vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])), vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])), }; diff --git a/src/core/NEON/SVEAsymm.h b/src/core/NEON/SVEAsymm.h index eea2627c620498a0f00362696ac563ebf4f18fae..a448cde4754190bbc2a82497c1e38b3ac0849b19 100644 --- a/src/core/NEON/SVEAsymm.h +++ b/src/core/NEON/SVEAsymm.h @@ -26,6 +26,7 @@ #if defined(ARM_COMPUTE_ENABLE_SVE2) #include "src/core/NEON/SVEMath.h" + #include namespace arm_compute @@ -70,10 +71,18 @@ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svuint8_t &qv, float scal const auto voffset = svdup_n_s32(offset); const auto vscale = svdup_n_f32(scale); const svfloat32x4_t vdequantized_input = svcreate4_f32( - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(qv))), voffset)), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(qv))), voffset)), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(qv))), voffset)), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(qv))), voffset)), vscale)); + svmul_f32_z(pg, + svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(qv))), 
voffset)), + vscale), + svmul_f32_z(pg, + svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(qv))), voffset)), + vscale), + svmul_f32_z(pg, + svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(qv))), voffset)), + vscale), + svmul_f32_z(pg, + svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(qv))), voffset)), + vscale)); return vdequantized_input; } @@ -104,10 +113,10 @@ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, float scale const auto voffset = svdup_n_s32(offset); const auto vscale = svdup_n_f32(scale); const svfloat32x4_t vdequantized_input = svcreate4_f32( - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(qv)), voffset)), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(qv)), voffset)), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(qv)), voffset)), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(qv)), voffset)), vscale)); + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(qv)), voffset)), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(qv)), voffset)), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(qv)), voffset)), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(qv)), voffset)), vscale)); return vdequantized_input; } @@ -135,11 +144,11 @@ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, const Unifo */ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, const svfloat32x4_t vscale) { - const svfloat32x4_t vdequantized_input = svcreate4_f32( - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), svget4_f32(vscale, 0)), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), svget4_f32(vscale, 1)), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), svget4_f32(vscale, 2)), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), svget4_f32(vscale, 3))); + const svfloat32x4_t vdequantized_input = + svcreate4_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), svget4_f32(vscale, 0)), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), svget4_f32(vscale, 1)), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), svget4_f32(vscale, 2)), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), svget4_f32(vscale, 3))); return vdequantized_input; } @@ -153,12 +162,12 @@ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, const svflo */ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, float scale) { - const auto vscale = svdup_n_f32(scale); - const svfloat32x4_t vdequantized_input = svcreate4_f32( - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), vscale)); + const auto vscale = svdup_n_f32(scale); + const svfloat32x4_t vdequantized_input = + svcreate4_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, 
svmovlb_s32(svmovlt_s16(qv))), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), vscale)); return vdequantized_input; } diff --git a/src/core/NEON/SVEMath.h b/src/core/NEON/SVEMath.h index 5ada7ae0ffd1cc7b7d032601e9404680c61935c3..49ed9df720826687ba09c8ca4a09bebc3258462d 100644 --- a/src/core/NEON/SVEMath.h +++ b/src/core/NEON/SVEMath.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021 Arm Limited. + * Copyright (c) 2020-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,13 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_SVEMATH_H -#define ARM_COMPUTE_SVEMATH_H +#ifndef ACL_SRC_CORE_NEON_SVEMATH_H +#define ACL_SRC_CORE_NEON_SVEMATH_H #if defined(ARM_COMPUTE_ENABLE_SVE) #include "src/core/NEON/wrapper/intrinsics/svcvt.h" #include "src/core/NEON/wrapper/intrinsics/svdup_n.h" #include "src/core/NEON/wrapper/intrinsics/svreinterpret.h" + #include #include @@ -95,6 +96,19 @@ svfloat16_t svtanh_f16_z(svbool_t pg, svfloat16_t val); */ svfloat16_t svexp_f16_z(svbool_t pg, svfloat16_t x); +#ifdef ARM_COMPUTE_ENABLE_SVE2 + +/** Calculate exponential + * + * @param[in] pg Input predicate. + * @param[in] x Input vector value in F16 format. + * + * @return The calculated exponent. + */ +svfloat16_t svexp_f16_z_sve2(svbool_t pg, svfloat16_t x); + +#endif // ARM_COMPUTE_ENABLE_SVE2 + /** Calculate reciprocal. * * @param[in] pg Input predicate. @@ -113,6 +127,19 @@ svfloat16_t svinv_f16_z(svbool_t pg, svfloat16_t x); */ svfloat16_t svlog_f16_z(svbool_t pg, svfloat16_t x); +#ifdef ARM_COMPUTE_ENABLE_SVE2 + +/** Calculate logarithm + * + * @param[in] pg Input predicate. + * @param[in] x Input vector value in F32 format. + * + * @return The calculated logarithm. + */ +svfloat16_t svlog_f16_z_sve2(svbool_t pg, svfloat16_t x); + +#endif // ARM_COMPUTE_ENABLE_SVE2 + /** Calculate inverse square root. * * @param[in] pg Input predicate. @@ -147,6 +174,19 @@ svfloat32_t svsin_f32_z(svbool_t pg, svfloat32_t val); */ svfloat16_t svsin_f16_z(svbool_t pg, svfloat16_t val); +#ifdef ARM_COMPUTE_ENABLE_SVE2 + +/** Calculate sine. + * + * @param[in] pg Input predicate. + * @param[in] val Input vector value in radians, F16 format. + * + * @return The calculated sine. + */ +svfloat16_t svsin_f16_z_sve2(svbool_t pg, svfloat16_t val); + +#endif // ARM_COMPUTE_ENABLE_SVE2 + /** Calculate n power of a number. * * pow(x,n) = e^(n*log(x)) @@ -171,6 +211,22 @@ svfloat32_t svpow_f32_z(svbool_t pg, svfloat32_t a, svfloat32_t b); */ svfloat16_t svpow_f16_z(svbool_t pg, svfloat16_t a, svfloat16_t b); +#ifdef ARM_COMPUTE_ENABLE_SVE2 + +/** Calculate n power of a number. + * + * pow(x,n) = e^(n*log(x)) + * + * @param[in] pg Input predicate. + * @param[in] a Input vector value in F16 format. + * @param[in] b Powers to raise the input to. + * + * @return The calculated power. 
+ */ +svfloat16_t svpow_f16_z_sve2(svbool_t pg, svfloat16_t a, svfloat16_t b); + +#endif // ARM_COMPUTE_ENABLE_SVE2 + /** Convert and pack four 32-bit float vectors into an 8-bit integer vector * * @param[in] in_0 The first float vector @@ -181,9 +237,12 @@ svfloat16_t svpow_f16_z(svbool_t pg, svfloat16_t a, svfloat16_t b); * @return The converted integer vector */ template -int_vec_type convert_float_to_int(const svfloat32_t &in_0, const svfloat32_t &in_1, const svfloat32_t &in_2, const svfloat32_t &in_3); +int_vec_type convert_float_to_int(const svfloat32_t &in_0, + const svfloat32_t &in_1, + const svfloat32_t &in_2, + const svfloat32_t &in_3); } // namespace arm_compute #include "src/core/NEON/SVEMath.inl" #endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ -#endif /* ARM_COMPUTE_SVEMATH_H */ \ No newline at end of file +#endif // ACL_SRC_CORE_NEON_SVEMATH_H diff --git a/src/core/NEON/SVEMath.inl b/src/core/NEON/SVEMath.inl index 8973d0b273be8d35cf99d4a9b0cfb0afdf0d3d40..fdf94f08594cc2c05748bc6c0675e9e7cc9cf913 100644 --- a/src/core/NEON/SVEMath.inl +++ b/src/core/NEON/SVEMath.inl @@ -21,6 +21,10 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ + +#ifndef ACL_SRC_CORE_NEON_SVEMATH_INL +#define ACL_SRC_CORE_NEON_SVEMATH_INL + #include #include @@ -32,8 +36,16 @@ namespace arm_compute { -inline svfloat32_t svtaylor_poly_f32_z(svbool_t pg, svfloat32_t x, svfloat32_t coeff_1, svfloat32_t coeff_2, svfloat32_t coeff_3, - svfloat32_t coeff_4, svfloat32_t coeff_5, svfloat32_t coeff_6, svfloat32_t coeff_7, svfloat32_t coeff_8) +inline svfloat32_t svtaylor_poly_f32_z(svbool_t pg, + svfloat32_t x, + svfloat32_t coeff_1, + svfloat32_t coeff_2, + svfloat32_t coeff_3, + svfloat32_t coeff_4, + svfloat32_t coeff_5, + svfloat32_t coeff_6, + svfloat32_t coeff_7, + svfloat32_t coeff_8) { const auto A = svmla_f32_z(pg, coeff_1, coeff_5, x); const auto B = svmla_f32_z(pg, coeff_3, coeff_7, x); @@ -45,8 +57,16 @@ inline svfloat32_t svtaylor_poly_f32_z(svbool_t pg, svfloat32_t x, svfloat32_t c return res; } -inline svfloat16_t svtaylor_poly_f16_z(svbool_t pg, svfloat16_t x, svfloat16_t coeff_1, svfloat16_t coeff_2, svfloat16_t coeff_3, - svfloat16_t coeff_4, svfloat16_t coeff_5, svfloat16_t coeff_6, svfloat16_t coeff_7, svfloat16_t coeff_8) +inline svfloat16_t svtaylor_poly_f16_z(svbool_t pg, + svfloat16_t x, + svfloat16_t coeff_1, + svfloat16_t coeff_2, + svfloat16_t coeff_3, + svfloat16_t coeff_4, + svfloat16_t coeff_5, + svfloat16_t coeff_6, + svfloat16_t coeff_7, + svfloat16_t coeff_8) { const auto A = svmla_f16_z(pg, coeff_1, coeff_5, x); const auto B = svmla_f16_z(pg, coeff_3, coeff_7, x); @@ -90,15 +110,17 @@ inline svfloat32_t svexp_f32_z(svbool_t pg, svfloat32_t x) const auto c4 = svreinterpret_f32_u32(svdup_n_u32(svexp_f32_coeff[3])); const auto c5 = svreinterpret_f32_u32(svdup_n_u32(svexp_f32_coeff[4])); - const auto shift = svreinterpret_f32_u32(svdup_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f - const auto inv_ln2 = svreinterpret_f32_u32(svdup_n_u32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f - const auto neg_ln2_hi = svreinterpret_f32_u32(svdup_n_u32(0xbf317200)); // -ln(2) from bits -1 to -19: -0x1.62e400p-1f - const auto neg_ln2_lo = svreinterpret_f32_u32(svdup_n_u32(0xb5bfbe8e)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f + const auto shift = svreinterpret_f32_u32(svdup_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f + const auto inv_ln2 = svreinterpret_f32_u32(svdup_n_u32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f + const auto 
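// --- Illustrative sketch, not part of the patch above ---------------------------------
// svtaylor_poly_f32_z() / svtaylor_poly_f16_z() above evaluate a degree-7 polynomial with
// an Estrin-style pairing so the multiply-adds at each level are independent. A scalar
// sketch of the scheme is below; note the library stores its coefficient tables in an
// interleaved order suited to this pairing, whereas a[] here is in plain power order.
#include <array>

static float estrin_poly7(float x, const std::array<float, 8> &a)
{
    const float x2  = x * x;
    const float x4  = x2 * x2;
    const float p01 = a[0] + a[1] * x; // a0 + a1*x
    const float p23 = a[2] + a[3] * x; // a2 + a3*x
    const float p45 = a[4] + a[5] * x; // a4 + a5*x
    const float p67 = a[6] + a[7] * x; // a6 + a7*x
    return (p01 + p23 * x2) + (p45 + p67 * x2) * x4; // a0 + a1*x + ... + a7*x^7
}
// ---------------------------------------------------------------------------------------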
neg_ln2_hi = + svreinterpret_f32_u32(svdup_n_u32(0xbf317200)); // -ln(2) from bits -1 to -19: -0x1.62e400p-1f + const auto neg_ln2_lo = + svreinterpret_f32_u32(svdup_n_u32(0xb5bfbe8e)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f const auto inf = svdup_n_f32(std::numeric_limits::infinity()); - const auto max_input = svdup_n_f32(88.37f); // Approximately ln(2^127.5) + const auto max_input = svdup_n_f32(88.37f); // Approximately ln(2^127.5) const auto zero = svdup_n_f32(0.f); - const auto min_input = svdup_n_f32(-86.64f); // Approximately ln(2^-125) + const auto min_input = svdup_n_f32(-86.64f); // Approximately ln(2^-125) // Range reduction: // e^x = 2^n * e^r @@ -114,23 +136,23 @@ inline svfloat32_t svexp_f32_z(svbool_t pg, svfloat32_t x) // (i.e. n) because the decimal part has been pushed out and lost. // * The addition of 127 makes the FP32 fraction part of z ready to be used as the exponent // in FP32 format. Left shifting z by 23 bits will result in 2^n. - const auto z = svmla_f32_z(pg, shift, x, inv_ln2); - const auto n = svsub_f32_z(pg, z, shift); - const auto scale = svreinterpret_f32_u32(svlsl_n_u32_z(pg, svreinterpret_u32_f32(z), 23)); // 2^n + const auto z = svmla_f32_z(pg, shift, x, inv_ln2); + const auto n = svsub_f32_z(pg, z, shift); + const auto scale = svreinterpret_f32_u32(svlsl_n_u32_z(pg, svreinterpret_u32_f32(z), 23)); // 2^n // The calculation of n * ln(2) is done using 2 steps to achieve accuracy beyond FP32. // This outperforms longer Taylor series (3-4 tabs) both in term of accuracy and performance. const auto r_hi = svmla_f32_z(pg, x, n, neg_ln2_hi); - const auto r = svmla_f32_z(pg, r_hi, n, neg_ln2_lo); + const auto r = svmla_f32_z(pg, r_hi, n, neg_ln2_lo); // Compute the truncated Taylor series of e^r. // poly = scale * (1 + c1 * r + c2 * r^2 + c3 * r^3 + c4 * r^4 + c5 * r^5) const auto r2 = svmul_f32_z(pg, r, r); - const auto p1 = svmul_f32_z(pg, c1, r); - const auto p23 = svmla_f32_z(pg, c2, c3, r); - const auto p45 = svmla_f32_z(pg, c4, c5, r); - const auto p2345 = svmla_f32_z(pg, p23, p45, r2); + const auto p1 = svmul_f32_z(pg, c1, r); + const auto p23 = svmla_f32_z(pg, c2, c3, r); + const auto p45 = svmla_f32_z(pg, c4, c5, r); + const auto p2345 = svmla_f32_z(pg, p23, p45, r2); const auto p12345 = svmla_f32_z(pg, p1, p2345, r2); auto poly = svmla_f32_z(pg, scale, p12345, scale); @@ -145,24 +167,31 @@ inline svfloat32_t svexp_f32_z(svbool_t pg, svfloat32_t x) inline svfloat16_t svexp_f16_z(svbool_t pg, svfloat16_t x) { auto bottom = svcvt_f32_z(pg, x); -#if defined(ARM_COMPUTE_ENABLE_SVE2) - auto top = svcvtlt_f32_x(pg, x); - auto pg_top = pg; -#else /* defined(ARM_COMPUTE_ENABLE_SVE2) */ auto pg_top = svptrue_b16(); auto top = svcvt_f32_z(pg_top, svreinterpret_f16(svrevh_z(svptrue_b16(), svreinterpret_u32(x)))); -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ bottom = svexp_f32_z(pg, bottom); top = svexp_f32_z(pg_top, top); -#if defined(ARM_COMPUTE_ENABLE_SVE2) - return svcvtnt_f16_m(svcvt_f16_z(pg, bottom), pg_top, top); -#else /* defined(ARM_COMPUTE_ENABLE_SVE2) */ return svtrn1(svcvt_f16_z(pg, bottom), svcvt_f16_z(pg_top, top)); -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ } +#ifdef ARM_COMPUTE_ENABLE_SVE2 + +inline svfloat16_t svexp_f16_z_sve2(svbool_t pg, svfloat16_t x) +{ + auto bottom = svcvt_f32_z(pg, x); + auto top = svcvtlt_f32_x(pg, x); + auto pg_top = pg; + + bottom = svexp_f32_z(pg, bottom); + top = svexp_f32_z(pg_top, top); + + return svcvtnt_f16_m(svcvt_f16_z(pg, bottom), pg_top, top); +} + +#endif // ARM_COMPUTE_ENABLE_SVE2 + inline 
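// --- Illustrative sketch, not part of the patch above ---------------------------------
// Scalar form of the range reduction documented in svexp_f32_z() above:
// e^x = 2^n * e^r with n = round(x / ln(2)) and r = x - n * ln(2), where ln(2) is applied
// in two parts (hi/lo) for accuracy beyond FP32. The constants are the ones quoted in the
// comments above; std::exp(r) stands in for the truncated Taylor polynomial, and the
// special-case clamping for x outside [min_input, max_input] is omitted.
#include <cmath>
#include <cstdint>
#include <cstring>

static float exp_range_reduced(float x)
{
    const float inv_ln2    = 0x1.715476p+0f;   // 1 / ln(2)
    const float neg_ln2_hi = -0x1.62e400p-1f;  // -ln(2), bits -1 to -19
    const float neg_ln2_lo = -0x1.7f7d1cp-20f; // -ln(2), bits -20 to -42

    const float n = std::nearbyint(x * inv_ln2);

    // Build 2^n by writing (n + 127) into the FP32 exponent field.
    const uint32_t scale_bits = static_cast<uint32_t>(static_cast<int32_t>(n) + 127) << 23;
    float scale;
    std::memcpy(&scale, &scale_bits, sizeof(scale));

    const float r = (x + n * neg_ln2_hi) + n * neg_ln2_lo;
    return scale * std::exp(r);
}
// ---------------------------------------------------------------------------------------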
svfloat32_t svtanh_f32_z(svbool_t pg, svfloat32_t val) { const svfloat32_t CONST_1 = svdup_n_f32(1.f); @@ -213,7 +242,8 @@ inline svfloat32_t svlog_f32_z(svbool_t pg, svfloat32_t x) auto val = svreinterpret_f32_s32(svsub_s32_z(pg, svreinterpret_s32_f32(x), svlsl_n_s32_z(pg, m, 23))); // Polynomial Approximation - auto poly = svtaylor_poly_f32_z(pg, val, log_tab_1, log_tab_2, log_tab_3, log_tab_4, log_tab_5, log_tab_6, log_tab_7, log_tab_8); + auto poly = svtaylor_poly_f32_z(pg, val, log_tab_1, log_tab_2, log_tab_3, log_tab_4, log_tab_5, log_tab_6, + log_tab_7, log_tab_8); // Reconstruct poly = svmla_f32_z(pg, poly, svcvt_f32_s32_z(pg, m), CONST_LN2); @@ -224,24 +254,31 @@ inline svfloat32_t svlog_f32_z(svbool_t pg, svfloat32_t x) inline svfloat16_t svlog_f16_z(svbool_t pg, svfloat16_t x) { auto bottom = svcvt_f32_z(pg, x); -#if defined(ARM_COMPUTE_ENABLE_SVE2) - auto top = svcvtlt_f32_x(pg, x); - auto pg_top = pg; -#else /* defined(ARM_COMPUTE_ENABLE_SVE2) */ auto pg_top = svptrue_b16(); auto top = svcvt_f32_z(pg_top, svreinterpret_f16(svrevh_z(svptrue_b16(), svreinterpret_u32(x)))); -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ bottom = svlog_f32_z(pg, bottom); top = svlog_f32_z(pg_top, top); -#if defined(ARM_COMPUTE_ENABLE_SVE2) - return svcvtnt_f16_m(svcvt_f16_z(pg, bottom), pg_top, top); -#else /* defined(ARM_COMPUTE_ENABLE_SVE2) */ return svtrn1(svcvt_f16_z(pg, bottom), svcvt_f16_z(pg_top, top)); -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ } +#ifdef ARM_COMPUTE_ENABLE_SVE2 + +inline svfloat16_t svlog_f16_z_sve2(svbool_t pg, svfloat16_t x) +{ + auto bottom = svcvt_f32_z(pg, x); + auto top = svcvtlt_f32_x(pg, x); + auto pg_top = pg; + + bottom = svlog_f32_z(pg, bottom); + top = svlog_f32_z(pg_top, top); + + return svcvtnt_f16_m(svcvt_f16_z(pg, bottom), pg_top, top); +} + +#endif // ARM_COMPUTE_ENABLE_SVE2 + inline svfloat32_t svsin_f32_z(svbool_t pg, svfloat32_t val) { using ScalarType = float; @@ -259,7 +296,8 @@ inline svfloat32_t svsin_f32_z(svbool_t pg, svfloat32_t val) //Find positive or negative const auto c_v = svabs_z(pg, wrapper::svcvt_z(pg, svmul_z(pg, val, ipi_v))); const auto sign_v = svcmple(pg, val, wrapper::svdup_n(ScalarType(0))); - const auto odd_v = svcmpne(pg, svand_z(pg, wrapper::svreinterpret(c_v), wrapper::svdup_n(IntType(1))), wrapper::svdup_n(IntType(0))); + const auto odd_v = svcmpne(pg, svand_z(pg, wrapper::svreinterpret(c_v), wrapper::svdup_n(IntType(1))), + wrapper::svdup_n(IntType(0))); auto neg_v = sveor_z(pg, odd_v, sign_v); @@ -297,24 +335,31 @@ inline svfloat32_t svsin_f32_z(svbool_t pg, svfloat32_t val) inline svfloat16_t svsin_f16_z(svbool_t pg, svfloat16_t val) { auto bottom = svcvt_f32_z(pg, val); -#if defined(ARM_COMPUTE_ENABLE_SVE2) - auto top = svcvtlt_f32_x(pg, val); - auto pg_top = pg; -#else /* defined(ARM_COMPUTE_ENABLE_SVE2) */ auto pg_top = svptrue_b16(); auto top = svcvt_f32_z(pg_top, svreinterpret_f16(svrevh_z(svptrue_b16(), svreinterpret_u32(val)))); -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ bottom = svsin_f32_z(pg, bottom); top = svsin_f32_z(pg_top, top); -#if defined(ARM_COMPUTE_ENABLE_SVE2) - return svcvtnt_f16_m(svcvt_f16_z(pg, bottom), pg_top, top); -#else /* defined(ARM_COMPUTE_ENABLE_SVE2) */ return svtrn1(svcvt_f16_z(pg, bottom), svcvt_f16_z(pg_top, top)); -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ } +#ifdef ARM_COMPUTE_ENABLE_SVE2 + +inline svfloat16_t svsin_f16_z_sve2(svbool_t pg, svfloat16_t val) +{ + auto bottom = svcvt_f32_z(pg, val); + auto top = svcvtlt_f32_x(pg, val); + auto pg_top = pg; + + bottom = 
svsin_f32_z(pg, bottom); + top = svsin_f32_z(pg_top, top); + + return svcvtnt_f16_m(svcvt_f16_z(pg, bottom), pg_top, top); +} + +#endif // ARM_COMPUTE_ENABLE_SVE2 + inline svfloat32_t svpow_f32_z(svbool_t pg, svfloat32_t a, svfloat32_t b) { return svexp_f32_z(pg, svmul_z(pg, b, svlog_f32_z(pg, a))); @@ -325,29 +370,41 @@ inline svfloat16_t svpow_f16_z(svbool_t pg, svfloat16_t a, svfloat16_t b) auto a_bottom = svcvt_f32_z(pg, a); auto b_bottom = svcvt_f32_z(pg, b); -#if defined(ARM_COMPUTE_ENABLE_SVE2) - auto pg_top = pg; - auto a_top = svcvtlt_f32_x(pg, a); - auto b_top = svcvtlt_f32_x(pg, b); -#else /* defined(ARM_COMPUTE_ENABLE_SVE2) */ auto pg_top = svptrue_b16(); auto a_top = svcvt_f32_z(pg_top, svreinterpret_f16(svrevh_z(svptrue_b16(), svreinterpret_u32(a)))); auto b_top = svcvt_f32_z(pg_top, svreinterpret_f16(svrevh_z(svptrue_b16(), svreinterpret_u32(b)))); -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ auto res_bottom = svpow_f32_z(pg, a_bottom, b_bottom); auto res_top = svpow_f32_z(pg_top, a_top, b_top); -#if defined(ARM_COMPUTE_ENABLE_SVE2) - return svcvtnt_f16_m(svcvt_f16_z(pg, res_bottom), pg_top, res_top); -#else /* defined(ARM_COMPUTE_ENABLE_SVE2) */ return svtrn1(svcvt_f16_z(pg, res_bottom), svcvt_f16_z(pg_top, res_top)); -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ } +#ifdef ARM_COMPUTE_ENABLE_SVE2 + +inline svfloat16_t svpow_f16_z_sve2(svbool_t pg, svfloat16_t a, svfloat16_t b) +{ + auto a_bottom = svcvt_f32_z(pg, a); + auto b_bottom = svcvt_f32_z(pg, b); + + auto pg_top = pg; + auto a_top = svcvtlt_f32_x(pg, a); + auto b_top = svcvtlt_f32_x(pg, b); + + auto res_bottom = svpow_f32_z(pg, a_bottom, b_bottom); + auto res_top = svpow_f32_z(pg_top, a_top, b_top); + + return svcvtnt_f16_m(svcvt_f16_z(pg, res_bottom), pg_top, res_top); +} + +#endif // ARM_COMPUTE_ENABLE_SVE2 + #if defined(ARM_COMPUTE_ENABLE_SVE2) template <> -inline svuint8_t convert_float_to_int(const svfloat32_t &in_0, const svfloat32_t &in_1, const svfloat32_t &in_2, const svfloat32_t &in_3) +inline svuint8_t convert_float_to_int(const svfloat32_t &in_0, + const svfloat32_t &in_1, + const svfloat32_t &in_2, + const svfloat32_t &in_3) { svuint8_t out; const auto all_true_pg = svptrue_b32(); @@ -381,7 +438,10 @@ inline svuint8_t convert_float_to_int(const svfloat32_t &in_0, const } template <> -inline svint8_t convert_float_to_int(const svfloat32_t &in_0, const svfloat32_t &in_1, const svfloat32_t &in_2, const svfloat32_t &in_3) +inline svint8_t convert_float_to_int(const svfloat32_t &in_0, + const svfloat32_t &in_1, + const svfloat32_t &in_2, + const svfloat32_t &in_3) { svint8_t out; const auto all_true_pg = svptrue_b32(); @@ -417,3 +477,5 @@ inline svint8_t convert_float_to_int(const svfloat32_t &in_0, const sv } // namespace arm_compute #endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ + +#endif // ACL_SRC_CORE_NEON_SVEMATH_INL diff --git a/src/core/NEON/SVESymm.h b/src/core/NEON/SVESymm.h index 6808577681386cc2c8a86f9ba195d3a960500f77..288d45d979310ab4f670dc334b890adb84ab935b 100644 --- a/src/core/NEON/SVESymm.h +++ b/src/core/NEON/SVESymm.h @@ -28,6 +28,7 @@ #if defined(ARM_COMPUTE_ENABLE_SVE2) #include "src/core/NEON/SVEMath.h" + #include namespace arm_compute @@ -42,8 +43,10 @@ namespace arm_compute */ inline svfloat32x2_t svdequantize_qsymm16_z(svbool_t pg, const svint16_t &qv, float scale) { - const auto vscale = svdup_n_f32(scale); - const svfloat32x2_t vdequantized_input = svcreate2_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(qv)), vscale), svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(qv)), 
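// A minimal sketch of the identity behind svpow_f32_z / svpow_f16_z above: pow is
// composed from the exp and log approximations, which is only defined for strictly
// positive bases (negative or zero bases would need separate handling, as with any
// exp(b * ln(a)) formulation).
#include <cmath>

inline float pow_via_exp_log(float a, float b)
{
    return std::exp(b * std::log(a)); // pow(a, b) = e^(b * ln a), a > 0
}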
vscale)); + const auto vscale = svdup_n_f32(scale); + const svfloat32x2_t vdequantized_input = + svcreate2_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(qv)), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(qv)), vscale)); return vdequantized_input; } @@ -76,13 +79,13 @@ inline svint16_t svquantize_qsymm16_z(svbool_t pg, const svfloat32x2_t qv, float */ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint16x2_t qv, const UniformQuantizationInfo &qi) { - const float scale = qi.scale; - const auto vscale = svdup_n_f32(scale); - const svfloat32x4_t vdequantized_input = svcreate4_f32( - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 0))), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 0))), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 1))), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 1))), vscale)); + const float scale = qi.scale; + const auto vscale = svdup_n_f32(scale); + const svfloat32x4_t vdequantized_input = + svcreate4_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 0))), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 0))), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 1))), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 1))), vscale)); return vdequantized_input; } @@ -112,4 +115,4 @@ inline svint16x2_t svquantize_qsymm16_z(svbool_t pg, const svfloat32x4_t qv, con } // namespace arm_compute #endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ -#endif // ARM_COMPUTE_NESYMM_H \ No newline at end of file +#endif // ARM_COMPUTE_NESYMM_H diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp index 108b199df717a936c92420b556675a57d3712112..deb89996a93ab12c904f2180fe5765e6104eb930 100644 --- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp @@ -28,18 +28,17 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + +#include "src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/NEON/NEMath.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - +#include "src/core/NEON/kernels/batchnormalization/impl/list.h" #include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/NEON/kernels/batchnormalization/impl/list.h" -#include "src/core/common/Registrars.h" - #include namespace arm_compute @@ -52,8 +51,15 @@ struct BatchNormalizationSelectorData const CPUInfo &ci; }; using BatchNormalizationSelectorPtr = std::add_pointer::type; -using BatchNormalizationKernelPtr = std::add_pointer::type; +using BatchNormalizationKernelPtr = std::add_pointer::type; struct BatchNormalizationKernel { @@ -62,41 +68,32 @@ struct BatchNormalizationKernel BatchNormalizationKernelPtr ukernel; }; -static const BatchNormalizationKernel available_kernels[] = -{ +static const BatchNormalizationKernel available_kernels[] = { #if defined(ARM_COMPUTE_ENABLE_SVE) - { - "sve_fp16_batch_normalization", - [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); }, - 
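// A minimal scalar sketch of the QSYMM16 dequantization performed by
// svdequantize_qsymm16_z above: symmetric quantization has no zero point, so each
// value is just the int16 code times the scale. The SVE2 version does the same on
// the widened bottom/top halves (svmovlb/svmovlt) of an int16 vector.
#include <cstddef>
#include <cstdint>

void dequantize_qsymm16_scalar(const int16_t *q, float *out, std::size_t n, float scale)
{
    for (std::size_t i = 0; i < n; ++i)
    {
        out[i] = static_cast<float>(q[i]) * scale;
    }
}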
REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_batch_normalization) - }, - { - "sve_fp32_batch_normalization", - [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); }, - REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_batch_normalization) - }, + {"sve_fp16_batch_normalization", + [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F16 && data.ci.has_sve(); }, + REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_batch_normalization)}, + {"sve_fp32_batch_normalization", + [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F32 && data.ci.has_sve(); }, + REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_batch_normalization)}, #endif /* !defined(ARM_COMPUTE_ENABLE_SVE) */ #if defined(ARM_COMPUTE_ENABLE_NEON) #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_fp16_batch_normalization", - [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_batch_normalization) - }, + {"neon_fp16_batch_normalization", + [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_batch_normalization)}, #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - { - "neon_fp32_batch_normalization", - [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_batch_normalization) - }, + {"neon_fp32_batch_normalization", + [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_batch_normalization)}, #endif /* !defined(ARM_COMPUTE_ENABLE_NEON) */ }; const BatchNormalizationKernel *get_implementation(const BatchNormalizationSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -104,25 +101,31 @@ const BatchNormalizationKernel *get_implementation(const BatchNormalizationSelec return nullptr; } -Status -validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta, const ITensorInfo *gamma, float epsilon, ActivationLayerInfo act_info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta, + const ITensorInfo *gamma, + float epsilon, + ActivationLayerInfo act_info) { ARM_COMPUTE_UNUSED(epsilon); - const auto *uk = get_implementation(BatchNormalizationSelectorData{ input->data_type(), CPUInfo::get() }); + const auto *uk = get_implementation(BatchNormalizationSelectorData{input->data_type(), CPUInfo::get()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - if(act_info.enabled()) + if (act_info.enabled()) { ActivationLayerInfo::ActivationFunction act = act_info.activation(); - ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU - && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU - && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); + ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU && + act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && + act != + 
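// A standalone sketch of the micro-kernel registry pattern being reformatted above
// (available_kernels + get_implementation): each entry pairs a name with a selector
// predicate and a function pointer, and the first matching entry wins, so entries
// are ordered from most to least specialised. The types and kernels below are
// illustrative placeholders, not the library's real ones.
#include <cstdio>

struct SelectorData
{
    bool has_sve;
    bool is_f16;
};

using KernelFn = void (*)();

struct KernelEntry
{
    const char *name;
    bool (*is_selected)(const SelectorData &);
    KernelFn ukernel;
};

inline void sve_fp16_kernel()
{
    std::puts("sve fp16");
}
inline void neon_fp32_kernel()
{
    std::puts("neon fp32");
}

static const KernelEntry registry[] = {
    {"sve_fp16", [](const SelectorData &d) { return d.has_sve && d.is_f16; }, sve_fp16_kernel},
    {"neon_fp32", [](const SelectorData &d) { return !d.is_f16; }, neon_fp32_kernel},
};

inline const KernelEntry *get_impl(const SelectorData &d)
{
    for (const auto &k : registry)
    {
        if (k.is_selected(d))
        {
            return &k; // first match wins
        }
    }
    return nullptr;
}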
ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); ARM_COMPUTE_RETURN_ERROR_ON(act_info.b() > act_info.a()); } - if(nullptr != output) + if (nullptr != output) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -131,17 +134,18 @@ validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const IT ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var); - if(beta != nullptr) + if (beta != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta); } - if(gamma != nullptr) + if (gamma != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma); } - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index( + input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0)); return Status{}; } @@ -169,10 +173,12 @@ void NEBatchNormalizationLayerKernel::batch_normalization_nchw(const Window &win // Only compute denominator and constants once per feature map. int slice = -1; - const auto input_mean = reinterpret_cast(_mean->ptr_to_element(Coordinates(0, 0))); - const auto input_var = reinterpret_cast(_var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = (_beta != nullptr) ? reinterpret_cast(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_mean = reinterpret_cast(_mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast(_var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = + (_gamma != nullptr) ? reinterpret_cast(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_beta = + (_beta != nullptr) ? 
reinterpret_cast(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr; T mean = static_cast(0); T var = static_cast(0); @@ -186,80 +192,83 @@ void NEBatchNormalizationLayerKernel::batch_normalization_nchw(const Window &win auto beta_vec = wrapper::vdup_n(beta, ExactTagType{}); auto denominator_vec = wrapper::vdup_n(denominator, ExactTagType{}); const auto epsilon_vec = wrapper::vdup_n(static_cast(_epsilon), ExactTagType{}); - execute_window_loop(win_to_use, [&](const Coordinates & id) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - if(slice != id.z()) + execute_window_loop( + win_to_use, + [&](const Coordinates &id) { - mean = input_mean[id.z()]; - var = input_var[id.z()]; - mean_vec = wrapper::vdup_n(mean, ExactTagType{}); - var_vec = wrapper::vdup_n(var, ExactTagType{}); - if(input_gamma != nullptr) - { - gamma = input_gamma[id.z()]; - gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); - } - if(input_beta != nullptr) + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + if (slice != id.z()) { - beta = input_beta[id.z()]; - beta_vec = wrapper::vdup_n(beta, ExactTagType{}); + mean = input_mean[id.z()]; + var = input_var[id.z()]; + mean_vec = wrapper::vdup_n(mean, ExactTagType{}); + var_vec = wrapper::vdup_n(var, ExactTagType{}); + if (input_gamma != nullptr) + { + gamma = input_gamma[id.z()]; + gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); + } + if (input_beta != nullptr) + { + beta = input_beta[id.z()]; + beta_vec = wrapper::vdup_n(beta, ExactTagType{}); + } + + // Calculate denominator + denominator_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); + denominator = wrapper::vgetlane(denominator_vec, 0); + slice = id.z(); } - // Calculate denominator - denominator_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); - denominator = wrapper::vgetlane(denominator_vec, 0); - slice = id.z(); - } - - // Perform core calculations using vector operations - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Calculate x bar - const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec); - const auto x_bar = wrapper::vmul(numerator, denominator_vec); - auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec); - - // Perform fused activation - if(fused_activation) + // Perform core calculations using vector operations + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - activation_functor(res); + // Calculate x bar + const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec); + const auto x_bar = wrapper::vmul(numerator, denominator_vec); + auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec); + + // Perform fused activation + if (fused_activation) + { + activation_functor(res); + } + + // Store results + wrapper::vstore(output_ptr + x, res); } - // Store results - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const T numerator = input_ptr[x] - mean; - const T x_bar = numerator * denominator; - T res = beta + x_bar * gamma; - - // Perform fused activation - if(fused_activation) + // Compute left-over elements + for (; x < window_end_x; ++x) { - activation_functor(res); + const T numerator = input_ptr[x] - mean; + const T x_bar = numerator * denominator; + T res = beta + x_bar * gamma; + + // Perform fused activation + if (fused_activation) + { + 
activation_functor(res); + } + + // Store results + *(output_ptr + x) = res; } - - // Store results - *(output_ptr + x) = res; - } - }, - input, output); + }, + input, output); } void NEBatchNormalizationLayerKernel::configure_non_fused() { - switch(_input->info()->data_type()) + switch (_input->info()->data_type()) { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - _func = &NEBatchNormalizationLayerKernel::batch_normalization_nchw>; + _func = &NEBatchNormalizationLayerKernel::batch_normalization_nchw>; break; #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F32: @@ -274,23 +283,25 @@ void NEBatchNormalizationLayerKernel::configure_non_fused() void NEBatchNormalizationLayerKernel::configure_fused() { // NCHW Fused Batched Normalization with activation functions : FP32 - static std::map bn_fused_map_f32_nchw = - { - { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw> }, - { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw> }, - { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw> } - }; + static std::map bn_fused_map_f32_nchw = { + {ActivationLayerInfo::ActivationFunction::RELU, + &NEBatchNormalizationLayerKernel::batch_normalization_nchw>}, + {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + &NEBatchNormalizationLayerKernel::batch_normalization_nchw>}, + {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + &NEBatchNormalizationLayerKernel::batch_normalization_nchw>}}; #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC // NCHW Fused Batched Normalization with activation functions : FP16 - static std::map bn_fused_map_f16_nchw = - { - { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw> }, - { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw> }, - { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw> } - }; + static std::map bn_fused_map_f16_nchw = { + {ActivationLayerInfo::ActivationFunction::RELU, + &NEBatchNormalizationLayerKernel::batch_normalization_nchw>}, + {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + &NEBatchNormalizationLayerKernel::batch_normalization_nchw>}, + {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + &NEBatchNormalizationLayerKernel::batch_normalization_nchw>}}; #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - switch(_input->info()->data_type()) + switch (_input->info()->data_type()) { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: @@ -307,22 +318,32 @@ void NEBatchNormalizationLayerKernel::configure_fused() } NEBatchNormalizationLayerKernel::NEBatchNormalizationLayerKernel() - : _func(nullptr), _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(), _act_info() + : _func(nullptr), + _input(nullptr), + _output(nullptr), + _mean(nullptr), + _var(nullptr), + _gamma(nullptr), + _beta(nullptr), + _epsilon(), + _act_info() { } -void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output, - const ITensor *mean, const ITensor *var, - const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo act_info) +void NEBatchNormalizationLayerKernel::configure(ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const 
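// A minimal scalar sketch of the per-element math inside the NCHW batch
// normalization loop above. The denominator 1/sqrt(var + eps) is computed once per
// feature map (slice) and reused across the whole plane; the kernel additionally
// applies an optional fused activation to each result.
#include <cmath>
#include <cstddef>

void batch_norm_plane(const float *in, float *out, std::size_t n,
                      float mean, float var, float gamma, float beta, float epsilon)
{
    const float denominator = 1.0f / std::sqrt(var + epsilon);
    for (std::size_t i = 0; i < n; ++i)
    {
        const float x_bar = (in[i] - mean) * denominator;
        out[i]            = beta + x_bar * gamma;
    }
}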
ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, - mean->info(), var->info(), - (beta != nullptr) ? beta->info() : nullptr, - (gamma != nullptr) ? gamma->info() : nullptr, - epsilon, act_info)); + mean->info(), var->info(), (beta != nullptr) ? beta->info() : nullptr, + (gamma != nullptr) ? gamma->info() : nullptr, epsilon, act_info)); _input = input; _output = input; @@ -334,16 +355,16 @@ void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output, _act_info = act_info; const bool run_in_place = (output == nullptr) || (output == input); - if(!run_in_place) + if (!run_in_place) { _output = output; } // Configure activation function to run const bool is_nchw = _input->info()->data_layout() == DataLayout::NCHW; - if(is_nchw) + if (is_nchw) { - if(_act_info.enabled()) + if (_act_info.enabled()) { configure_fused(); } @@ -357,17 +378,21 @@ void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output, Window win = calculate_max_window(*input->info(), Steps()); INEKernel::configure(win); - if(output != nullptr) + if (output != nullptr) { // Output auto initialization if not yet initialized auto_init_if_empty(*output->info(), *input->info()->clone()); } } -Status NEBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta, const ITensorInfo *gamma, - float epsilon, ActivationLayerInfo act_info) +Status NEBatchNormalizationLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta, + const ITensorInfo *gamma, + float epsilon, + ActivationLayerInfo act_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info)); @@ -382,13 +407,14 @@ void NEBatchNormalizationLayerKernel::run(const Window &window, const ThreadInfo ARM_COMPUTE_ERROR_ON(_func == nullptr && _input->info()->data_layout() == DataLayout::NCHW); const bool is_nchw = _input->info()->data_layout() == DataLayout::NCHW; - if(is_nchw) + if (is_nchw) { (this->*_func)(window); } else { - const auto *uk = get_implementation(BatchNormalizationSelectorData{ _input->info()->data_type(), CPUInfo::get() }); + const auto *uk = + get_implementation(BatchNormalizationSelectorData{_input->info()->data_type(), CPUInfo::get()}); uk->ukernel(_input, _output, _mean, _var, _beta, _gamma, _epsilon, _act_info, window); } } diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h index 0551ace30cebb5778fcb58b225b10298031fac0b..2e8ff0dc9ae5908adbd638dbc0e5df702f5ee934 100644 --- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h +++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -68,7 +69,13 @@ public: * @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. 
*/ - void configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta = nullptr, const ITensor *gamma = nullptr, float epsilon = 0.001f, + void configure(ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta = nullptr, + const ITensor *gamma = nullptr, + float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEBatchNormalizationLayerKernel * @@ -85,10 +92,14 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr, - float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta = nullptr, + const ITensorInfo *gamma = nullptr, + float epsilon = 0.001f, + ActivationLayerInfo act_info = ActivationLayerInfo()); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp index 83fb5f6f511d1853032fbc215715cf9148614490..f299bb94a418a3d219f8299f9b5d6c0943237cc2 100644 --- a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp +++ b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp @@ -27,8 +27,9 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -46,7 +47,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -54,7 +55,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf return Status{}; } -Status validate_arguments_static(const ITensorInfo *input, int block_shape_x, int block_shape_y, const ITensorInfo *output, const CropInfo &crop_info) +Status validate_arguments_static(const ITensorInfo *input, + int block_shape_x, + int block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); @@ -65,13 +70,14 @@ Status validate_arguments_static(const ITensorInfo *input, int block_shape_x, in const int idx_batch = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - const TensorShape expected_output_shape = compute_batch_to_space_shape(input->data_layout(), input->tensor_shape(), 
block_shape_x, block_shape_y, crop_info); - const TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape); + const TensorShape expected_output_shape = compute_batch_to_space_shape( + input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info); + const TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &expected_output); } @@ -80,7 +86,13 @@ Status validate_arguments_static(const ITensorInfo *input, int block_shape_x, in } // namespace NEBatchToSpaceLayerKernel::NEBatchToSpaceLayerKernel() - : _input(nullptr), _block_shape(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _block_shape_x(), _block_shape_y(), _crop_info() + : _input(nullptr), + _block_shape(nullptr), + _output(nullptr), + _data_layout(DataLayout::UNKNOWN), + _block_shape_x(), + _block_shape_y(), + _crop_info() { } @@ -99,15 +111,18 @@ void NEBatchToSpaceLayerKernel::configure(const ITensor *input, const ITensor *b ICPPKernel::configure(win); } -void NEBatchToSpaceLayerKernel::configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info) +void NEBatchToSpaceLayerKernel::configure( + const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - const TensorShape output_shape = compute_batch_to_space_shape(input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y); + const TensorShape output_shape = compute_batch_to_space_shape( + input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y); // Output auto initialization if not yet initialized auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info)); _input = input; _output = output; @@ -121,14 +136,19 @@ void NEBatchToSpaceLayerKernel::configure(const ITensor *input, int32_t block_sh ICPPKernel::configure(win); } -Status NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) +Status +NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_shape, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, output)); return Status{}; } -Status NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info) +Status NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, + int32_t block_shape_x, + int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, output, crop_info)); @@ -141,7 +161,7 @@ void NEBatchToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); - if(_block_shape != nullptr) + if 
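// A simplified sketch of what compute_batch_to_space_shape has to produce for the
// static block-shape overload above, with layout bookkeeping left out: the batch
// dimension shrinks by block_x * block_y while the spatial dimensions grow by the
// same factors, minus the requested crop. Shape4D is an illustrative stand-in for
// the library's TensorShape.
struct Shape4D
{
    int w, h, c, n;
};

inline Shape4D batch_to_space_shape(const Shape4D &in, int block_x, int block_y,
                                    int crop_left, int crop_right, int crop_top, int crop_bottom)
{
    Shape4D out;
    out.w = in.w * block_x - crop_left - crop_right;
    out.h = in.h * block_y - crop_top - crop_bottom;
    out.c = in.c;
    out.n = in.n / (block_x * block_y); // validated above to divide exactly
    return out;
}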
(_block_shape != nullptr) { // Retrieve the block shapes dynamically _block_shape_x = *(reinterpret_cast(_block_shape->ptr_to_element(0))); @@ -155,31 +175,32 @@ void NEBatchToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info int batch_id = 0; // Main loop for NCHW and NHWC - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { do { Iterator out(_output, slice_out); - execute_window_loop(slice_out, [&](const Coordinates & id) - { - - const int x = id.x(); - const int y = id.y(); - const int z = id.z(); - // Translate x, y to uncropped version - const int x_c = x + _crop_info.left; - const int y_c = y + _crop_info.top; - - const int in_batch = batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size; - const int in_x = x_c / _block_shape_x; - const int in_y = y_c / _block_shape_y; - Coordinates input_coords{ in_x, in_y, z, in_batch }; - memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); - }, - out); + execute_window_loop( + slice_out, + [&](const Coordinates &id) + { + const int x = id.x(); + const int y = id.y(); + const int z = id.z(); + // Translate x, y to uncropped version + const int x_c = x + _crop_info.left; + const int y_c = y + _crop_info.top; + + const int in_batch = + batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size; + const int in_x = x_c / _block_shape_x; + const int in_y = y_c / _block_shape_y; + Coordinates input_coords{in_x, in_y, z, in_batch}; + memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); + }, + out); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } else { @@ -188,26 +209,28 @@ void NEBatchToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info do { Iterator out(_output, slice_out); - execute_window_loop(slice_out, [&](const Coordinates & id) - { - - const int x = id.y(); - const int y = id.z(); - - // Translate x, y to uncropped version - const int x_c = x + _crop_info.left; - const int y_c = y + _crop_info.top; - - const int in_batch = batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size; - const int in_x = x_c / _block_shape_x; - const int in_y = y_c / _block_shape_y; - Coordinates input_coords{ 0, in_x, in_y, in_batch }; - memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size * _input->info()->dimension(0)); - }, - out); + execute_window_loop( + slice_out, + [&](const Coordinates &id) + { + const int x = id.y(); + const int y = id.z(); + + // Translate x, y to uncropped version + const int x_c = x + _crop_info.left; + const int y_c = y + _crop_info.top; + + const int in_batch = + batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size; + const int in_x = x_c / _block_shape_x; + const int in_y = y_c / _block_shape_y; + Coordinates input_coords{0, in_x, in_y, in_batch}; + memcpy(out.ptr(), _input->ptr_to_element(input_coords), + element_size * _input->info()->dimension(0)); + }, + out); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h index 5eceee0904ebaba0f06da510e5d5d403174bbe53..d98ac621b0192c3b69e05c65aaa09204d74071bf 100644 --- a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h +++ 
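// A scalar sketch of the batch-to-space index mapping used by both the NCHW and
// NHWC loops above, for a single output element: the output coordinate is first
// translated back to the uncropped space, then the remainder against the block
// shape selects the source batch and the quotient gives the source spatial
// position.
struct BatchToSpaceSrc
{
    int in_x;
    int in_y;
    int in_batch;
};

inline BatchToSpaceSrc map_output_to_input(int x, int y, int batch_id, int batch_size,
                                           int block_x, int block_y, int crop_left, int crop_top)
{
    const int x_c = x + crop_left; // uncropped coordinates
    const int y_c = y + crop_top;

    BatchToSpaceSrc src;
    src.in_x     = x_c / block_x;
    src.in_y     = y_c / block_y;
    src.in_batch = batch_id + ((x_c % block_x) + (y_c % block_y) * block_x) * batch_size;
    return src;
}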
b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NEBATCHTOSPACELAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -68,7 +69,11 @@ public: * @param[out] output Tensor output. Data types supported: same as @p input * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed */ - void configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info = CropInfo{}); + void configure(const ITensor *input, + int32_t block_shape_x, + int32_t block_shape_y, + ITensor *output, + const CropInfo &crop_info = CropInfo{}); /** Static function to check if given info will lead to a valid configuration of @ref NEBatchToSpaceLayerKernel * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -90,7 +95,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info = CropInfo{}); + static Status validate(const ITensorInfo *input, + int32_t block_shape_x, + int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info = CropInfo{}); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp index 677c5cddccbf8660520d1592bb97a7aef741b0a1..a59bbd233b957c5bb02fb3f63b6f1b79a8c965b3 100644 --- a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp +++ b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp @@ -27,9 +27,10 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include #include @@ -55,8 +56,7 @@ inline void bitwise_and(const T *__restrict input1, const T *__restrict input2, } } // namespace -NEBitwiseAndKernel::NEBitwiseAndKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) +NEBitwiseAndKernel::NEBitwiseAndKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr) { } @@ -86,8 +86,7 @@ void NEBitwiseAndKernel::configure(const ITensor *input1, const ITensor *input2, Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration)); AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - update_window_and_padding(win, - AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration), + update_window_and_padding(win, AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration), AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration), output_access); @@ -103,9 +102,7 @@ void NEBitwiseAndKernel::run(const Window &window, const ThreadInfo &info) Iterator input2(_input2, window); Iterator output(_output, window); - execute_window_loop(window, [&](const Coordinates &) - { - bitwise_and(input1.ptr(), input2.ptr(), output.ptr()); - }, - input1, input2, output); + execute_window_loop( + window, [&](const Coordinates &) { bitwise_and(input1.ptr(), input2.ptr(), output.ptr()); }, input1, + input2, output); } diff --git a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp index 
19b1af690a0fc20317821c6e2885a2c5db37dced..ecd181a7af0cefd735a99d6e5301cec7069ad4c4 100644 --- a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp +++ b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -50,8 +51,7 @@ inline void bitwise_not_U8_U8(const uint8_t *__restrict input, uint8_t *__restri } } // namespace -NEBitwiseNotKernel::NEBitwiseNotKernel() - : _input(nullptr), _output(nullptr) +NEBitwiseNotKernel::NEBitwiseNotKernel() : _input(nullptr), _output(nullptr) { } @@ -77,7 +77,8 @@ void NEBitwiseNotKernel::configure(const ITensor *input, ITensor *output) // Configure kernel window Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), output_access); + update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), + output_access); INEKernel::configure(win); } @@ -90,9 +91,6 @@ void NEBitwiseNotKernel::run(const Window &window, const ThreadInfo &info) Iterator input(_input, window); Iterator output(_output, window); - execute_window_loop(window, [&](const Coordinates &) - { - bitwise_not_U8_U8(input.ptr(), output.ptr()); - }, - input, output); + execute_window_loop( + window, [&](const Coordinates &) { bitwise_not_U8_U8(input.ptr(), output.ptr()); }, input, output); } diff --git a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp index 08094fbfcf1a362b67a2471e6b57b5c7cc6b1e55..4c906134aa1f3c571f0d45321559a973bbf0707b 100644 --- a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp +++ b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -42,7 +43,8 @@ class Coordinates; namespace { -inline void bitwise_or_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output) +inline void +bitwise_or_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output) { const uint8x16_t val1 = vld1q_u8(input1); const uint8x16_t val2 = vld1q_u8(input2); @@ -51,8 +53,7 @@ inline void bitwise_or_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t } } // namespace -NEBitwiseOrKernel::NEBitwiseOrKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) +NEBitwiseOrKernel::NEBitwiseOrKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr) { } @@ -82,8 +83,7 @@ void NEBitwiseOrKernel::configure(const ITensor *input1, const ITensor *input2, Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration)); AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - update_window_and_padding(win, - AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration), + update_window_and_padding(win, AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration), AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration), output_access); @@ -99,9 +99,7 @@ void 
NEBitwiseOrKernel::run(const Window &window, const ThreadInfo &info) Iterator input2(_input2, window); Iterator output(_output, window); - execute_window_loop(window, [&](const Coordinates &) - { - bitwise_or_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr()); - }, - input1, input2, output); + execute_window_loop( + window, [&](const Coordinates &) { bitwise_or_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr()); }, input1, + input2, output); } diff --git a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp index fc5b38b64fca0c12bda83705a193bb19cf6356b7..dbbed2483c3c9e05081925e91fbf3208cc283762 100644 --- a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp +++ b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -42,7 +43,8 @@ class Coordinates; namespace { -inline void bitwise_xor_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output) +inline void +bitwise_xor_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output) { const uint8x16_t val1 = vld1q_u8(input1); const uint8x16_t val2 = vld1q_u8(input2); @@ -51,8 +53,7 @@ inline void bitwise_xor_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t } } // namespace -NEBitwiseXorKernel::NEBitwiseXorKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) +NEBitwiseXorKernel::NEBitwiseXorKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr) { } @@ -82,7 +83,8 @@ void NEBitwiseXorKernel::configure(const ITensor *input1, const ITensor *input2, AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); update_window_and_padding(win, AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration), - AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration), output_access); + AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration), + output_access); INEKernel::configure(win); } @@ -96,9 +98,7 @@ void NEBitwiseXorKernel::run(const Window &window, const ThreadInfo &info) Iterator input2(_input2, window); Iterator output(_output, window); - execute_window_loop(window, [&](const Coordinates &) - { - bitwise_xor_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr()); - }, - input1, input2, output); + execute_window_loop( + window, [&](const Coordinates &) { bitwise_xor_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr()); }, input1, + input2, output); } diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp index 69bfd56ce0d4bedee5c19de2ff3e23f882f1bc03..cb869838e275e1d1e8b5104bb1f7b8b92ae9b457 100644 --- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp +++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp @@ -27,8 +27,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" -#include "src/core/CPP/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/boundingboxtransform/list.h" @@ -45,7 +46,11 @@ struct BoundingBoxTransformSelectorData }; using BoundingBoxTransformSelctorPtr = std::add_pointer::type; -using 
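// A minimal sketch of the shared shape of the bitwise AND/NOT/OR/XOR kernels above:
// load 16 bytes per input, apply a single NEON logical instruction, store 16 bytes
// (vandq_u8 here; the others use vmvnq_u8 / vorrq_u8 / veorq_u8). Assumes an
// AArch64/NEON target and 16-byte-wide steps as in the kernels' configure() windows.
#include <arm_neon.h>
#include <cstdint>

inline void bitwise_and_16bytes(const uint8_t *in1, const uint8_t *in2, uint8_t *out)
{
    const uint8x16_t a = vld1q_u8(in1);
    const uint8x16_t b = vld1q_u8(in2);
    vst1q_u8(out, vandq_u8(a, b));
}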
BoundingBoxTransformUKernelPtr = std::add_pointer::type; +using BoundingBoxTransformUKernelPtr = std::add_pointer::type; struct BoundingBoxTransformKernel { @@ -54,26 +59,19 @@ struct BoundingBoxTransformKernel BoundingBoxTransformUKernelPtr ukernel; }; -static const BoundingBoxTransformKernel available_kernels[] = -{ - { - "fp32_neon_boundingboxtransform", - [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_boundingboxtransform) - }, +static const BoundingBoxTransformKernel available_kernels[] = { + {"fp32_neon_boundingboxtransform", + [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_boundingboxtransform)}, #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { - "fp16_neon_boundingboxtransform", - [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_boundingboxtransform) - }, + {"fp16_neon_boundingboxtransform", + [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_boundingboxtransform)}, #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #if defined(ARM_COMPUTE_ENABLE_NEON) - { - "qu16_neon_boundingboxtransform", - [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::QASYMM16; }, - REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_boundingboxtransform) - }, + {"qu16_neon_boundingboxtransform", + [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::QASYMM16; }, + REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_boundingboxtransform)}, #endif //defined(ARM_COMPUTE_ENABLE_NEON) }; @@ -85,9 +83,9 @@ static const BoundingBoxTransformKernel available_kernels[] = */ const BoundingBoxTransformKernel *get_implementation(const BoundingBoxTransformSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -95,7 +93,10 @@ const BoundingBoxTransformKernel *get_implementation(const BoundingBoxTransformS return nullptr; } -Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info) +Status validate_arguments(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(boxes); @@ -108,7 +109,7 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe ARM_COMPUTE_RETURN_ERROR_ON(boxes->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(info.scale() <= 0); - if(boxes->data_type() == DataType::QASYMM16) + if (boxes->data_type() == DataType::QASYMM16) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(deltas, 1, DataType::QASYMM8); const UniformQuantizationInfo deltas_qinfo = deltas->quantization_info().uniform(); @@ -120,12 +121,12 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes, deltas); } - if(pred_boxes->total_size() > 0) + if (pred_boxes->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(pred_boxes->tensor_shape(), deltas->tensor_shape()); 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(pred_boxes, deltas); ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes->num_dimensions() > 2); - if(pred_boxes->data_type() == DataType::QASYMM16) + if (pred_boxes->data_type() == DataType::QASYMM16) { const UniformQuantizationInfo pred_qinfo = pred_boxes->quantization_info().uniform(); ARM_COMPUTE_RETURN_ERROR_ON(pred_qinfo.scale != 0.125f); @@ -142,13 +143,19 @@ NEBoundingBoxTransformKernel::NEBoundingBoxTransformKernel() { } -void NEBoundingBoxTransformKernel::configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info) +void NEBoundingBoxTransformKernel::configure(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(boxes->info(), pred_boxes->info(), deltas->info(), info)); // Configure kernel window - auto_init_if_empty(*pred_boxes->info(), deltas->info()->clone()->set_data_type(boxes->info()->data_type()).set_quantization_info(boxes->info()->quantization_info())); + auto_init_if_empty(*pred_boxes->info(), deltas->info() + ->clone() + ->set_data_type(boxes->info()->data_type()) + .set_quantization_info(boxes->info()->quantization_info())); // Set instance variables _boxes = boxes; @@ -164,7 +171,10 @@ void NEBoundingBoxTransformKernel::configure(const ITensor *boxes, ITensor *pred INEKernel::configure(win); } -Status NEBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info) +Status NEBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(boxes, pred_boxes, deltas, info)); return Status{}; @@ -176,7 +186,7 @@ void NEBoundingBoxTransformKernel::run(const Window &window, const ThreadInfo &i ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - const auto *uk = get_implementation(BoundingBoxTransformSelectorData{ _boxes->info()->data_type() }); + const auto *uk = get_implementation(BoundingBoxTransformSelectorData{_boxes->info()->data_type()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); uk->ukernel(_boxes, _pred_boxes, _deltas, _bbinfo, window); diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h index def827836ca067a2a8bace58b05ca531853d7539..3915994feb0265a1a49600f8f1d80702e0593444 100644 --- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h +++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h @@ -63,7 +63,8 @@ public: * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct. 
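// The bounding-box ukernels dispatched above are not part of this hunk; for
// reference, a rough sketch of the conventional delta decoding such a kernel
// performs, with the image scaling, clamping and quantization handled by
// BoundingBoxTransformInfo deliberately left out.
#include <cmath>

struct Box
{
    float x1, y1, x2, y2;
};

inline Box decode_box(const Box &b, float dx, float dy, float dw, float dh)
{
    const float w     = b.x2 - b.x1;
    const float h     = b.y2 - b.y1;
    const float ctr_x = b.x1 + 0.5f * w;
    const float ctr_y = b.y1 + 0.5f * h;

    const float pred_ctr_x = ctr_x + dx * w;   // shift centre by a width-relative delta
    const float pred_ctr_y = ctr_y + dy * h;
    const float pred_w     = w * std::exp(dw); // rescale size by exp(delta)
    const float pred_h     = h * std::exp(dh);

    return Box{pred_ctr_x - 0.5f * pred_w, pred_ctr_y - 0.5f * pred_h,
               pred_ctr_x + 0.5f * pred_w, pred_ctr_y + 0.5f * pred_h};
}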
* */ - void configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info); + void + configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLBoundingBoxTransform * @@ -77,7 +78,10 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info); + static Status validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp index 64da1f226265a6636f6d65b3b3ddcc3a4c85253a..3b53b7055f1bc10ed8bfb4756f582cbcbdd2dfba 100644 --- a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp +++ b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -44,15 +45,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NCHW, DataLayout::NHWC); - const unsigned int channels = input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)); + const unsigned int channels = + input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)); ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups < 2, "Channel shuffling with less than 2 groups would be inefficient"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups == channels, "Channel shuffling with same number of groups as number of channels would be inefficient"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + num_groups == channels, + "Channel shuffling with same number of groups as number of channels would be inefficient"); ARM_COMPUTE_RETURN_ERROR_ON(num_groups > channels); // There cannot be more groups than channels - ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, "The number of channels must be a multiple of the number of groups"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, + "The number of channels must be a multiple of the number of groups"); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -72,20 +77,22 @@ void channel_shuffle_nhwc(const ITensor *input, ITensor *output, unsigned int nu Iterator in(input, window); - execute_window_loop(window, [&](const Coordinates & id) - { - // Shuffle channel - const unsigned int curr_channel = id.x(); - const unsigned int group_id = curr_channel * rK; - const unsigned int r = group_id * K; - const unsigned int channel_id = curr_channel - r; - - // Calculate output coordinates - Coordinates out_coords = id; - out_coords.set(Window::DimX, channel_id * num_groups + group_id); - std::copy_n(in.ptr(), element_size, 
output->ptr_to_element(out_coords)); - }, - in); + execute_window_loop( + window, + [&](const Coordinates &id) + { + // Shuffle channel + const unsigned int curr_channel = id.x(); + const unsigned int group_id = curr_channel * rK; + const unsigned int r = group_id * K; + const unsigned int channel_id = curr_channel - r; + + // Calculate output coordinates + Coordinates out_coords = id; + out_coords.set(Window::DimX, channel_id * num_groups + group_id); + std::copy_n(in.ptr(), element_size, output->ptr_to_element(out_coords)); + }, + in); } void channel_shuffle_nchw(const ITensor *input, ITensor *output, unsigned int num_groups, const Window &window) { @@ -107,34 +114,35 @@ void channel_shuffle_nchw(const ITensor *input, ITensor *output, unsigned int nu Iterator in(input, win); - execute_window_loop(win, [&](const Coordinates & id) - { - // Shuffle channel - const unsigned int curr_channel = id.z(); - const unsigned int group_id = curr_channel * rK; - const unsigned int r = group_id * K; - const unsigned int channel_id = curr_channel - r; - - // Calculate output coordinates - Coordinates out_coords = id; - out_coords.set(Window::DimZ, channel_id * num_groups + group_id); - const uint8_t *input_ptr = in.ptr(); - uint8_t *output_ptr = output->ptr_to_element(out_coords); - - // Copy plane - for(unsigned int y = 0; y < height; ++y) + execute_window_loop( + win, + [&](const Coordinates &id) { - std::copy_n(input_ptr, row_size, output_ptr); - input_ptr += input_stride_y; - output_ptr += output_stride_y; - } - }, - in); + // Shuffle channel + const unsigned int curr_channel = id.z(); + const unsigned int group_id = curr_channel * rK; + const unsigned int r = group_id * K; + const unsigned int channel_id = curr_channel - r; + + // Calculate output coordinates + Coordinates out_coords = id; + out_coords.set(Window::DimZ, channel_id * num_groups + group_id); + const uint8_t *input_ptr = in.ptr(); + uint8_t *output_ptr = output->ptr_to_element(out_coords); + + // Copy plane + for (unsigned int y = 0; y < height; ++y) + { + std::copy_n(input_ptr, row_size, output_ptr); + input_ptr += input_stride_y; + output_ptr += output_stride_y; + } + }, + in); } } // namespace -NEChannelShuffleLayerKernel::NEChannelShuffleLayerKernel() - : _input(nullptr), _output(nullptr), _num_groups() +NEChannelShuffleLayerKernel::NEChannelShuffleLayerKernel() : _input(nullptr), _output(nullptr), _num_groups() { } @@ -158,7 +166,8 @@ void NEChannelShuffleLayerKernel::configure(const ITensor *input, ITensor *outpu INEKernel::configure(win); } -Status NEChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups) +Status +NEChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, num_groups)); return Status{}; @@ -170,7 +179,7 @@ void NEChannelShuffleLayerKernel::run(const Window &window, const ThreadInfo &in ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - switch(_input->info()->data_layout()) + switch (_input->info()->data_layout()) { case DataLayout::NHWC: channel_shuffle_nhwc(_input, _output, _num_groups, window); diff --git a/src/core/NEON/kernels/NECol2ImKernel.h b/src/core/NEON/kernels/NECol2ImKernel.h index 1976302036e90dc86cbc7fa58c785cb6d5a26e85..bc6652fd30ff63e613900f262a7ae6789a7a658f 100644 --- a/src/core/NEON/kernels/NECol2ImKernel.h +++ b/src/core/NEON/kernels/NECol2ImKernel.h 
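// A minimal sketch of the channel shuffle index math shared by the NHWC and NCHW
// loops above. With K = channels / num_groups, input channel c sits at position
// c % K inside group c / K, and the shuffled output channel interleaves the groups
// (the kernel computes the division via a precomputed reciprocal, omitted here).
#include <cstddef>

inline std::size_t shuffled_channel(std::size_t c, std::size_t channels, std::size_t num_groups)
{
    const std::size_t K          = channels / num_groups; // channels per group
    const std::size_t group_id   = c / K;
    const std::size_t channel_id = c % K;
    return channel_id * num_groups + group_id;
}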
@@ -24,10 +24,10 @@ #ifndef ARM_COMPUTE_NECOL2IMKERNEL_H #define ARM_COMPUTE_NECOL2IMKERNEL_H -#include "src/core/NEON/INEKernel.h" - #include "arm_compute/core/Size2D.h" +#include "src/core/NEON/INEKernel.h" + namespace arm_compute { class ITensor; diff --git a/src/core/NEON/kernels/NECropKernel.cpp b/src/core/NEON/kernels/NECropKernel.cpp index 94c455305ce94fa4fe12c9aab2501b5c722e1566..60271fbc743c86b883a1ed8a7cefd4e609d5749c 100644 --- a/src/core/NEON/kernels/NECropKernel.cpp +++ b/src/core/NEON/kernels/NECropKernel.cpp @@ -26,14 +26,15 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Window.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/core/utils/helpers/bit_ops.h" #include "src/cpu/kernels/crop/list.h" @@ -47,7 +48,8 @@ struct CropSelectorData }; using CropSelectorPtr = std::add_pointer::type; -using CropUKernelPtr = std::add_pointer::type; +using CropUKernelPtr = std::add_pointer::type; struct CropUKernel { @@ -56,48 +58,23 @@ struct CropUKernel CropUKernelPtr ukernel; }; -static const CropUKernel available_kernels[] = -{ - { - "fp16_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::fp16_in_bounds_crop_window) - }, - { - "f32_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::fp32_in_bounds_crop_window) - }, - { - "u8_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::U8; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::u8_in_bounds_crop_window) - }, - { - "u16_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::U16; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::u16_in_bounds_crop_window) - }, - { - "u32_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::U32; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::u32_in_bounds_crop_window) - }, - { - "s8_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::S8; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::s8_in_bounds_crop_window) - }, - { - "s16_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::S16; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::s16_in_bounds_crop_window) - }, - { - "s32_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::S32; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::s32_in_bounds_crop_window) - }, +static const CropUKernel available_kernels[] = { + {"fp16_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::fp16_in_bounds_crop_window)}, + {"f32_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::fp32_in_bounds_crop_window)}, + {"u8_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::U8; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::u8_in_bounds_crop_window)}, + {"u16_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::U16; }, + 
REGISTER_INTEGER_NEON(arm_compute::cpu::u16_in_bounds_crop_window)}, + {"u32_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::U32; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::u32_in_bounds_crop_window)}, + {"s8_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::S8; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s8_in_bounds_crop_window)}, + {"s16_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::S16; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s16_in_bounds_crop_window)}, + {"s32_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::S32; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s32_in_bounds_crop_window)}, }; /** Micro-kernel selector @@ -108,9 +85,9 @@ static const CropUKernel available_kernels[] = */ const CropUKernel *get_implementation(const CropSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -119,26 +96,40 @@ const CropUKernel *get_implementation(const CropSelectorData &data) return nullptr; } -inline void out_of_bounds_crop_window(const ITensor *output, float *output_ptr, float extrapolation_value, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit) +inline void out_of_bounds_crop_window(const ITensor *output, + float *output_ptr, + float extrapolation_value, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit) { - auto in = wrapper::vdup_n(extrapolation_value, wrapper::traits::vector_128_tag()); - int32_t x = 0; - int32_t limit = (output_width_limit - output_width_start) * static_cast(output->info()->dimension(0)); - float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0); - for(; x <= limit - window_step_x; x += window_step_x) + auto in = wrapper::vdup_n(extrapolation_value, wrapper::traits::vector_128_tag()); + int32_t x = 0; + int32_t limit = (output_width_limit - output_width_start) * static_cast(output->info()->dimension(0)); + float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0); + for (; x <= limit - window_step_x; x += window_step_x) { wrapper::vstore(output_start_ptr + x, in); } - for(; x < limit; ++x) + for (; x < limit; ++x) { *(output_start_ptr + x) = extrapolation_value; } } -inline void execute_window(const ITensor *input, const ITensor *output, Coordinates input_offset, float extrapolation_value, - const std::array &rows_out_of_bounds, const std::array &cols_out_of_bounds, NECropKernel::InBoundsCropFunction *in_bounds_crop_function, - bool is_height_flipped, bool has_cols_in_bounds, bool has_cols_out_of_bounds_before, bool has_cols_out_of_bounds_after, bool input_has_single_channel, bool is_width_flipped) +inline void execute_window(const ITensor *input, + const ITensor *output, + Coordinates input_offset, + float extrapolation_value, + const std::array &rows_out_of_bounds, + const std::array &cols_out_of_bounds, + NECropKernel::InBoundsCropFunction *in_bounds_crop_function, + bool is_height_flipped, + bool has_cols_in_bounds, + bool has_cols_out_of_bounds_before, + bool has_cols_out_of_bounds_after, + bool input_has_single_channel, + bool is_width_flipped) { // Output is always float. 
const int window_step_x = 16 / sizeof(float); @@ -159,45 +150,66 @@ inline void execute_window(const ITensor *input, const ITensor *output, Coordina // |------------------------------| // Fill all output rows that have no elements that are within the input bounds with the extrapolation value. // First for the rows before the in bounds rows. - out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, rows_out_of_bounds[0] * output->info()->dimension(1)); + out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, + rows_out_of_bounds[0] * output->info()->dimension(1)); output_ptr += rows_out_of_bounds[0] * output->info()->dimension(1) * output->info()->dimension(0); // Iterate through each row that has any elements within the input bounds. - for(uint32_t row = rows_out_of_bounds[0]; static_cast(row) < static_cast(output->info()->dimension(2) - rows_out_of_bounds[1]); - ++row, is_height_flipped ? --input_offset[2] : ++input_offset[2]) + for (uint32_t row = rows_out_of_bounds[0]; + static_cast(row) < static_cast(output->info()->dimension(2) - rows_out_of_bounds[1]); + ++row, is_height_flipped ? --input_offset[2] : ++input_offset[2]) { // Fill all elements in the row that are out of bounds with the extrapolation value. // First for the elements before the in bounds elements. - if(has_cols_out_of_bounds_before) + if (has_cols_out_of_bounds_before) { out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, cols_out_of_bounds[0]); } // Copy all elements within the input bounds from the input tensor. - if(has_cols_in_bounds) + if (has_cols_in_bounds) { (*in_bounds_crop_function)(input, output, output_ptr, input_offset, window_step_x, cols_out_of_bounds[0], - output->info()->dimension(1) - cols_out_of_bounds[1], input_has_single_channel, is_width_flipped); + output->info()->dimension(1) - cols_out_of_bounds[1], input_has_single_channel, + is_width_flipped); } // Fill all elements after the in bounds elements with the extrapolation value. - if(has_cols_out_of_bounds_after) + if (has_cols_out_of_bounds_after) { - out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, output->info()->dimension(1) - cols_out_of_bounds[1], output->info()->dimension(1)); + out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, + output->info()->dimension(1) - cols_out_of_bounds[1], + output->info()->dimension(1)); } output_ptr += output->info()->dimension(1) * output->info()->dimension(0); } // Fill all rows after the in bounds elements with the extrapolation value. 
- out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, rows_out_of_bounds[1] * output->info()->dimension(1)); + out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, + rows_out_of_bounds[1] * output->info()->dimension(1)); } } // namespace NECropKernel::NECropKernel() - : _input(nullptr), _crop_boxes(nullptr), _box_ind(nullptr), _output(nullptr), _start(), _end(), _crop_box_ind(0), _extrapolation_value(0), _rows_out_of_bounds(), _cols_out_of_bounds() + : _input(nullptr), + _crop_boxes(nullptr), + _box_ind(nullptr), + _output(nullptr), + _start(), + _end(), + _crop_box_ind(0), + _extrapolation_value(0), + _rows_out_of_bounds(), + _cols_out_of_bounds() { } -void NECropKernel::configure(const ITensor *input, const ITensor *crop_boxes, const ITensor *box_ind, ITensor *output, uint32_t crop_box_ind, float extrapolation_value) +void NECropKernel::configure(const ITensor *input, + const ITensor *crop_boxes, + const ITensor *box_ind, + ITensor *output, + uint32_t crop_box_ind, + float extrapolation_value) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), crop_boxes->info(), box_ind->info(), output->info(), crop_box_ind, extrapolation_value)); + ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), crop_boxes->info(), box_ind->info(), output->info(), + crop_box_ind, extrapolation_value)); _input = input; _crop_boxes = crop_boxes; @@ -207,21 +219,27 @@ void NECropKernel::configure(const ITensor *input, const ITensor *crop_boxes, co _extrapolation_value = extrapolation_value; } -Status NECropKernel::validate(const ITensorInfo *input, const ITensorInfo *crop_boxes, const ITensorInfo *box_ind, const ITensorInfo *output, uint32_t crop_box_ind, float extrapolation_value) +Status NECropKernel::validate(const ITensorInfo *input, + const ITensorInfo *crop_boxes, + const ITensorInfo *box_ind, + const ITensorInfo *output, + uint32_t crop_box_ind, + float extrapolation_value) { ARM_COMPUTE_UNUSED(extrapolation_value); - const auto *uk = get_implementation(CropSelectorData{ input->data_type() }); + const auto *uk = get_implementation(CropSelectorData{input->data_type()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::U16, DataType::S16, DataType::F16, DataType::U32, DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::U16, DataType::S16, + DataType::F16, DataType::U32, DataType::S32, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC); ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[0] != 4); ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[1] != box_ind->tensor_shape()[0]); ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[1] <= crop_box_ind); ARM_COMPUTE_RETURN_ERROR_ON(box_ind->tensor_shape()[0] <= crop_box_ind); - if(output->total_size() > 0) + if (output->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -242,48 +260,53 @@ void NECropKernel::configure_output_shape() // The normalized coordiantes are scaled to retrieve the floating point image coordinates which are rounded to integers. 
_start = Coordinates(std::floor(x0 * (_input->info()->tensor_shape()[1] - 1) + 0.5f), std::floor(y0 * (_input->info()->tensor_shape()[2] - 1) + 0.5f)); - _end = Coordinates(std::floor(x1 * (_input->info()->tensor_shape()[1] - 1) + 0.5f), - std::floor(y1 * (_input->info()->tensor_shape()[2] - 1) + 0.5f)); - const TensorShape out_shape(_input->info()->tensor_shape()[0], abs(_end[0] - _start[0]) + 1, abs(_end[1] - _start[1]) + 1); + _end = Coordinates(std::floor(x1 * (_input->info()->tensor_shape()[1] - 1) + 0.5f), + std::floor(y1 * (_input->info()->tensor_shape()[2] - 1) + 0.5f)); + const TensorShape out_shape(_input->info()->tensor_shape()[0], abs(_end[0] - _start[0]) + 1, + abs(_end[1] - _start[1]) + 1); _output->info()->set_tensor_shape(out_shape); bool is_width_flipped = _end[0] < _start[0]; bool is_height_flipped = _end[1] < _start[1]; - if(is_height_flipped) + if (is_height_flipped) { - _rows_out_of_bounds[0] = _start[1] >= static_cast(_input->info()->dimension(2)) ? std::min(static_cast(_start[1] - _input->info()->dimension(2) + 1), - static_cast(_output->info()->dimension(2))) : - 0; + _rows_out_of_bounds[0] = _start[1] >= static_cast(_input->info()->dimension(2)) + ? std::min(static_cast(_start[1] - _input->info()->dimension(2) + 1), + static_cast(_output->info()->dimension(2))) + : 0; _rows_out_of_bounds[1] = _end[1] < 0 ? std::min(static_cast(-_end[1]), - static_cast(_output->info()->dimension(2))) : - 0; + static_cast(_output->info()->dimension(2))) + : 0; } else { _rows_out_of_bounds[0] = _start[1] < 0 ? std::min(static_cast(-_start[1]), - static_cast(_output->info()->dimension(2))) : - 0; - _rows_out_of_bounds[1] = _end[1] >= static_cast(_input->info()->dimension(2)) ? std::min(static_cast(_end[1] - _input->info()->dimension(2) + 1), - static_cast(_output->info()->dimension(2))) : - 0; + static_cast(_output->info()->dimension(2))) + : 0; + _rows_out_of_bounds[1] = _end[1] >= static_cast(_input->info()->dimension(2)) + ? std::min(static_cast(_end[1] - _input->info()->dimension(2) + 1), + static_cast(_output->info()->dimension(2))) + : 0; } - if(is_width_flipped) + if (is_width_flipped) { - _cols_out_of_bounds[0] = _start[0] >= static_cast(_input->info()->dimension(1)) ? std::min(static_cast(_start[0] - _input->info()->dimension(1) + 1), - static_cast(_output->info()->dimension(1))) : - 0; + _cols_out_of_bounds[0] = _start[0] >= static_cast(_input->info()->dimension(1)) + ? std::min(static_cast(_start[0] - _input->info()->dimension(1) + 1), + static_cast(_output->info()->dimension(1))) + : 0; _cols_out_of_bounds[1] = _end[0] < 0 ? std::min(static_cast(-_end[0]), - static_cast(_output->info()->dimension(1))) : - 0; + static_cast(_output->info()->dimension(1))) + : 0; } else { _cols_out_of_bounds[0] = _start[0] < 0 ? std::min(static_cast(-_start[0]), - static_cast(_output->info()->dimension(1))) : - 0; - _cols_out_of_bounds[1] = _end[0] >= static_cast(_input->info()->dimension(1)) ? std::min(static_cast(_end[0] - _input->info()->dimension(1) + 1), - static_cast(_output->info()->dimension(1))) : - 0; + static_cast(_output->info()->dimension(1))) + : 0; + _cols_out_of_bounds[1] = _end[0] >= static_cast(_input->info()->dimension(1)) + ? 
std::min(static_cast(_end[0] - _input->info()->dimension(1) + 1), + static_cast(_output->info()->dimension(1))) + : 0; } INEKernel::configure(calculate_max_window(*_output->info())); @@ -298,13 +321,18 @@ void NECropKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON(_input->info()->has_padding()); ARM_COMPUTE_ERROR_ON(_output->info()->has_padding()); - const auto *uk = get_implementation(CropSelectorData{ _input->info()->data_type() }); + const auto *uk = get_implementation(CropSelectorData{_input->info()->data_type()}); uint32_t batch_index = *(reinterpret_cast(_box_ind->ptr_to_element(Coordinates(_crop_box_ind)))); - Coordinates input_offset(0, _end[0] < _start[0] ? _start[0] - _cols_out_of_bounds[0] : _start[0] + _cols_out_of_bounds[0], - _end[1] < _start[1] ? _start[1] - _rows_out_of_bounds[0] : _start[1] + _rows_out_of_bounds[0], batch_index); - execute_window(_input, _output, input_offset, _extrapolation_value, _rows_out_of_bounds, _cols_out_of_bounds, uk->ukernel, _end[1] < _start[1], - _cols_out_of_bounds[0] + _cols_out_of_bounds[1] < _output->info()->dimension(1), _cols_out_of_bounds[0] > 0, _cols_out_of_bounds[1] > 0, + Coordinates input_offset( + 0, _end[0] < _start[0] ? _start[0] - _cols_out_of_bounds[0] : _start[0] + _cols_out_of_bounds[0], + _end[1] < _start[1] ? _start[1] - _rows_out_of_bounds[0] : _start[1] + _rows_out_of_bounds[0], batch_index); + execute_window(_input, _output, input_offset, _extrapolation_value, _rows_out_of_bounds, _cols_out_of_bounds, + uk->ukernel, + _end[1]<_start[1], + _cols_out_of_bounds[0] + + _cols_out_of_bounds[1]<_output->info()->dimension(1), _cols_out_of_bounds[0]> 0, + _cols_out_of_bounds[1]> 0, _start[0] <= _end[0], _end[0] < _start[0]); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NECropKernel.h b/src/core/NEON/kernels/NECropKernel.h index 6c989c1d2ce7b83a40cd8d92a43859a3f2f61f04..da4a1b26e548721089e68fe95e1e9433082db851 100644 --- a/src/core/NEON/kernels/NECropKernel.h +++ b/src/core/NEON/kernels/NECropKernel.h @@ -25,7 +25,7 @@ #define ARM_COMPUTE_NEON_CROP_KERNEL_H #include "arm_compute/core/Types.h" -#include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -67,7 +67,12 @@ public: * @param[in] crop_box_ind Index of the crop box to be used from @p crop_boxes. Default is 0. * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0. */ - void configure(const ITensor *input, const ITensor *crop_boxes, const ITensor *box_ind, ITensor *output, uint32_t crop_box_ind = 0, float extrapolation_value = 0); + void configure(const ITensor *input, + const ITensor *crop_boxes, + const ITensor *box_ind, + ITensor *output, + uint32_t crop_box_ind = 0, + float extrapolation_value = 0); /** Static function to check if given info will lead to a valid configuration of @ref CLStridedSliceKernel * @@ -82,7 +87,12 @@ public: * @param[in] crop_box_ind Index of the crop box to be used from @p crop_boxes. Default is 0. * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0. 
*/ - static Status validate(const ITensorInfo *input, const ITensorInfo *crop_boxes, const ITensorInfo *box_ind, const ITensorInfo *output, uint32_t crop_box_ind = 0, float extrapolation_value = 0); + static Status validate(const ITensorInfo *input, + const ITensorInfo *crop_boxes, + const ITensorInfo *box_ind, + const ITensorInfo *output, + uint32_t crop_box_ind = 0, + float extrapolation_value = 0); /** Configure output tensor's shape as this can only be determined at runtime. */ void configure_output_shape(); @@ -91,7 +101,8 @@ public: void run(const Window &window, const ThreadInfo &info) override; /** Function to use for in bounds crop for the particular tensor types passed to configure() */ - using InBoundsCropFunction = void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool); + using InBoundsCropFunction = + void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool); private: const ITensor *_input; diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp index 6dcc85ec2e2486451e19bf2ae31226cb1d7fd4d1..de0079ee60dc7283f1b9fd60093a4f2859d01e84 100644 --- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp @@ -26,11 +26,12 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include #include @@ -52,12 +53,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) != 0); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != (block_shape * input->tensor_shape()[idx_width])); - ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape * input->tensor_shape()[idx_height])); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != + (block_shape * input->tensor_shape()[idx_width])); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != + (block_shape * input->tensor_shape()[idx_height])); ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } @@ -74,7 +77,8 @@ NEDepthToSpaceLayerKernel::NEDepthToSpaceLayerKernel() void NEDepthToSpaceLayerKernel::configure(const ITensor *input, ITensor *output, int32_t block_shape) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - TensorShape output_shape = compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape); + TensorShape output_shape = + compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape); // Output auto inizialitation if not yet initialized 
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); @@ -117,26 +121,27 @@ void NEDepthToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); // Main loop for NCHW and NHWC - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { Window slice_in = window.first_slice_window_2D(); do { Iterator in(_input, slice_in); - execute_window_loop(slice_in, [&](const Coordinates & id) - { - const int x = id.x(); - const int y = id.y(); - - const int z = id.z() % r; - const int out_x = x * _block_shape + (id.z() / r) % _block_shape; - const int out_y = y * _block_shape + (id.z() / r) / _block_shape; - Coordinates output_coords{ out_x, out_y, z, id[3] }; - memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); - }, - in); - } - while(window.slide_window_slice_2D(slice_in)); + execute_window_loop( + slice_in, + [&](const Coordinates &id) + { + const int x = id.x(); + const int y = id.y(); + + const int z = id.z() % r; + const int out_x = x * _block_shape + (id.z() / r) % _block_shape; + const int out_y = y * _block_shape + (id.z() / r) / _block_shape; + Coordinates output_coords{out_x, out_y, z, id[3]}; + memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); + }, + in); + } while (window.slide_window_slice_2D(slice_in)); } else { @@ -144,20 +149,21 @@ void NEDepthToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info do { Iterator in(_input, slice_in); - execute_window_loop(slice_in, [&](const Coordinates & id) - { - const int x = id.y(); - const int y = id.z(); - - const int z = id.x() % r; - const int out_x = x * _block_shape + (id.x() / r) % _block_shape; - const int out_y = y * _block_shape + (id.x() / r) / _block_shape; - Coordinates output_coords{ z, out_x, out_y, id[3] }; - memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); - }, - in); - } - while(window.slide_window_slice_3D(slice_in)); + execute_window_loop( + slice_in, + [&](const Coordinates &id) + { + const int x = id.y(); + const int y = id.z(); + + const int z = id.x() % r; + const int out_x = x * _block_shape + (id.x() / r) % _block_shape; + const int out_y = y * _block_shape + (id.x() / r) / _block_shape; + Coordinates output_coords{z, out_x, out_y, id[3]}; + memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); + }, + in); + } while (window.slide_window_slice_3D(slice_in)); } } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp index 261437f07d26382fcc03338a542e3074638d076b..a5969cd4972825620cf28a23d085dd3a67ee4f1b 100644 --- a/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp +++ b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -37,16 +38,19 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config) { ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() > 2); 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(idx, 1, DataType::U32); - ARM_COMPUTE_RETURN_ERROR_ON(std::set({ 0, 1 }).count(config.axis) == 0); + ARM_COMPUTE_RETURN_ERROR_ON(std::set({0, 1}).count(config.axis) == 0); ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] != idx->tensor_shape().x()); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 2); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); @@ -56,7 +60,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *idx, const FFTDigitReverseKernelInfo &config) +std::pair validate_and_configure_window(ITensorInfo *input, + ITensorInfo *output, + ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config) { ARM_COMPUTE_UNUSED(idx, config); @@ -68,12 +75,14 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen } } // namespace -NEFFTDigitReverseKernel::NEFFTDigitReverseKernel() - : _func(nullptr), _input(nullptr), _output(nullptr), _idx(nullptr) +NEFFTDigitReverseKernel::NEFFTDigitReverseKernel() : _func(nullptr), _input(nullptr), _output(nullptr), _idx(nullptr) { } -void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, const ITensor *idx, const FFTDigitReverseKernelInfo &config) +void NEFFTDigitReverseKernel::configure(const ITensor *input, + ITensor *output, + const ITensor *idx, + const FFTDigitReverseKernelInfo &config) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, idx); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), idx->info(), config)); @@ -91,11 +100,11 @@ void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, c ARM_COMPUTE_ERROR_THROW_ON(win_config.first); INEKernel::configure(win_config.second); - if(axis == 0) + if (axis == 0) { - if(is_input_complex) + if (is_input_complex) { - if(is_conj) + if (is_conj) { _func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0; } @@ -109,11 +118,11 @@ void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, c _func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0; } } - else if(axis == 1) + else if (axis == 1) { - if(is_input_complex) + if (is_input_complex) { - if(is_conj) + if (is_conj) { _func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_1; } @@ -133,10 +142,14 @@ void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, c } } -Status NEFFTDigitReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config) +Status NEFFTDigitReverseKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, idx, config)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first); return Status{}; } @@ -159,38 +172,40 @@ void NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0(const Window &window) std::vector buffer_row_out(2 * N); std::vector buffer_row_in(2 * N); - 
execute_window_loop(slice, [&](const Coordinates &) - { - if(is_input_complex) + execute_window_loop( + slice, + [&](const Coordinates &) { - // Load - memcpy(buffer_row_in.data(), reinterpret_cast(in.ptr()), 2 * N * sizeof(float)); - - // Shuffle - for(size_t x = 0; x < 2 * N; x += 2) + if (is_input_complex) { - size_t idx = buffer_idx[x / 2]; - buffer_row_out[x] = buffer_row_in[2 * idx]; - buffer_row_out[x + 1] = (is_conj ? -buffer_row_in[2 * idx + 1] : buffer_row_in[2 * idx + 1]); - } - } - else - { - // Load - memcpy(buffer_row_in.data(), reinterpret_cast(in.ptr()), N * sizeof(float)); + // Load + memcpy(buffer_row_in.data(), reinterpret_cast(in.ptr()), 2 * N * sizeof(float)); - // Shuffle - for(size_t x = 0; x < N; ++x) + // Shuffle + for (size_t x = 0; x < 2 * N; x += 2) + { + size_t idx = buffer_idx[x / 2]; + buffer_row_out[x] = buffer_row_in[2 * idx]; + buffer_row_out[x + 1] = (is_conj ? -buffer_row_in[2 * idx + 1] : buffer_row_in[2 * idx + 1]); + } + } + else { - size_t idx = buffer_idx[x]; - buffer_row_out[2 * x] = buffer_row_in[idx]; + // Load + memcpy(buffer_row_in.data(), reinterpret_cast(in.ptr()), N * sizeof(float)); + + // Shuffle + for (size_t x = 0; x < N; ++x) + { + size_t idx = buffer_idx[x]; + buffer_row_out[2 * x] = buffer_row_in[idx]; + } } - } - // Copy back - memcpy(reinterpret_cast(out.ptr()), buffer_row_out.data(), 2 * N * sizeof(float)); - }, - in, out); + // Copy back + memcpy(reinterpret_cast(out.ptr()), buffer_row_out.data(), 2 * N * sizeof(float)); + }, + in, out); } template @@ -215,39 +230,41 @@ void NEFFTDigitReverseKernel::digit_reverse_kernel_axis_1(const Window &window) const size_t stride_z = _input->info()->strides_in_bytes()[2]; const size_t stride_w = _input->info()->strides_in_bytes()[3]; - execute_window_loop(slice, [&](const Coordinates & id) - { - auto *out_ptr = reinterpret_cast(out.ptr()); - auto *in_ptr = reinterpret_cast(_input->buffer() + id.z() * stride_z + id[3] * stride_w); - const size_t y_shuffled = buffer_idx[id.y()]; - - if(is_input_complex) + execute_window_loop( + slice, + [&](const Coordinates &id) { - // Shuffle the entire row into the output - memcpy(out_ptr, in_ptr + 2 * Nx * y_shuffled, 2 * Nx * sizeof(float)); + auto *out_ptr = reinterpret_cast(out.ptr()); + auto *in_ptr = reinterpret_cast(_input->buffer() + id.z() * stride_z + id[3] * stride_w); + const size_t y_shuffled = buffer_idx[id.y()]; - // Conjugate if necessary - if(is_conj) + if (is_input_complex) { - for(size_t x = 0; x < 2 * Nx; x += 2) + // Shuffle the entire row into the output + memcpy(out_ptr, in_ptr + 2 * Nx * y_shuffled, 2 * Nx * sizeof(float)); + + // Conjugate if necessary + if (is_conj) { - out_ptr[x + 1] = -out_ptr[x + 1]; + for (size_t x = 0; x < 2 * Nx; x += 2) + { + out_ptr[x + 1] = -out_ptr[x + 1]; + } } } - } - else - { - // Shuffle the entire row into the buffer - memcpy(buffer_row.data(), in_ptr + Nx * y_shuffled, Nx * sizeof(float)); - - // Copy the buffer to the output, with a zero imaginary part - for(size_t x = 0; x < 2 * Nx; x += 2) + else { - out_ptr[x] = buffer_row[x / 2]; + // Shuffle the entire row into the buffer + memcpy(buffer_row.data(), in_ptr + Nx * y_shuffled, Nx * sizeof(float)); + + // Copy the buffer to the output, with a zero imaginary part + for (size_t x = 0; x < 2 * Nx; x += 2) + { + out_ptr[x] = buffer_row[x / 2]; + } } - } - }, - out); + }, + out); } void NEFFTDigitReverseKernel::run(const Window &window, const ThreadInfo &info) diff --git a/src/core/NEON/kernels/NEFFTDigitReverseKernel.h 
b/src/core/NEON/kernels/NEFFTDigitReverseKernel.h index f436c364b205ad1988c5da78d8fb5c267c33273f..ecf85ebc985ec75dd0362f53559672ce1d8c6579 100644 --- a/src/core/NEON/kernels/NEFFTDigitReverseKernel.h +++ b/src/core/NEON/kernels/NEFFTDigitReverseKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NEFFTDIGITREVERSEKERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -70,7 +71,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp index 44c841f626c8c0b3ddfd101beedf7b32824828d1..4b58a7b9ac1b9500d6707b8b0b8c4c3435d96fbe 100644 --- a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp +++ b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp @@ -28,10 +28,11 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/wrapper/traits.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/traits.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "support/ToolchainSupport.h" #include @@ -70,7 +71,7 @@ float32x2_t c_mul_neon(float32x2_t a, float32x2_t b) { using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - const float32x2_t mask = { -1.0, 1.0 }; + const float32x2_t mask = {-1.0, 1.0}; const float32x2_t tmp0 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); const float32x2_t tmp1 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); @@ -88,7 +89,7 @@ float32x2_t c_mul_neon_img(float32x2_t a, float img_constant) const float a_r = wrapper::vgetlane(a, 0); const float a_i = wrapper::vgetlane(a, 1); - const auto out = wrapper::vmul(float32x2_t{ -a_i, a_r }, float32x2_t{ img_constant, img_constant }); + const auto out = wrapper::vmul(float32x2_t{-a_i, a_r}, float32x2_t{img_constant, img_constant}); return out; } @@ -100,7 +101,8 @@ float32x2_t reduce_sum_5(float32x2_t a, float32x2_t b, float32x2_t c, float32x2_ return wrapper::vadd(t2, e); } -float32x2_t reduce_sum_7(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7) +float32x2_t reduce_sum_7( + float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7) { const auto t0 = wrapper::vadd(x1, x2); const auto t1 = wrapper::vadd(x3, x4); @@ -111,7 +113,14 @@ float32x2_t reduce_sum_7(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32 return wrapper::vadd(t00, t01); } -float32x2_t reduce_sum_8(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7, float32x2_t x8) +float32x2_t reduce_sum_8(float32x2_t x1, + float32x2_t x2, + float32x2_t x3, + float32x2_t x4, + float32x2_t x5, + float32x2_t x6, + float32x2_t x7, + float32x2_t x8) { const auto t0 = wrapper::vadd(x1, x2); const auto t1 = wrapper::vadd(x3, x4); @@ -141,15 +150,21 @@ void fft_3(float32x2_t &x, float32x2_t &y, float32x2_t &z, const float32x2_t &w, x = wrapper::vadd(a, 
b); x = wrapper::vadd(x, c); - const auto v1 = wrapper::vmul(float32x2_t{ 0.5f, 0.5 }, wrapper::vadd(b, c)); - const auto v2 = c_mul_neon(float32x2_t{ 0.f, -kSqrt3Div2 }, wrapper::vsub(b, c)); + const auto v1 = wrapper::vmul(float32x2_t{0.5f, 0.5}, wrapper::vadd(b, c)); + const auto v2 = c_mul_neon(float32x2_t{0.f, -kSqrt3Div2}, wrapper::vsub(b, c)); y = z = wrapper::vsub(a, v1); y = wrapper::vadd(y, v2); z = wrapper::vsub(z, v2); } -void fft_4(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3) +void fft_4(float32x2_t &x1, + float32x2_t &x2, + float32x2_t &x3, + float32x2_t &x4, + const float32x2_t &w, + const float32x2_t &w2, + const float32x2_t &w3) { float32x2_t a = x1; float32x2_t b = c_mul_neon(w, x2); @@ -173,7 +188,15 @@ void fft_4(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, c x4 = wrapper::vadd(x41, x42); } -void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3, const float32x2_t &w4) +void fft_5(float32x2_t &x1, + float32x2_t &x2, + float32x2_t &x3, + float32x2_t &x4, + float32x2_t &x5, + const float32x2_t &w, + const float32x2_t &w2, + const float32x2_t &w3, + const float32x2_t &w4) { const auto a = x1; const auto b = c_mul_neon(w, x2); @@ -181,25 +204,25 @@ void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f const auto d = c_mul_neon(w3, x4); const auto e = c_mul_neon(w4, x5); - const auto b0 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, b); - const auto b1 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, b); - const auto b2 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, b); - const auto b3 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, b); + const auto b0 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, b); + const auto b1 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, b); + const auto b2 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, b); + const auto b3 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, b); - const auto c0 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, c); - const auto c1 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, c); - const auto c2 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, c); - const auto c3 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, c); + const auto c0 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, c); + const auto c1 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, c); + const auto c2 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, c); + const auto c3 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, c); - const auto d0 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, d); - const auto d1 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, d); - const auto d2 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, d); - const auto d3 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, d); + const auto d0 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, d); + const auto d1 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, d); + const auto d2 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, d); + const auto d3 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, d); - const auto e0 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, e); - const auto e1 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, e); - const auto e2 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, e); - const auto e3 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, e); + const auto e0 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, e); + const auto e1 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, e); + const auto e2 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, e); + const auto e3 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, e); x1 = 
reduce_sum_5(a, b, c, d, e); x2 = reduce_sum_5(a, b0, c0, d0, e0); @@ -208,9 +231,19 @@ void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f x5 = reduce_sum_5(a, b3, c3, d3, e3); } -void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3, +void fft_7(float32x2_t &x1, + float32x2_t &x2, + float32x2_t &x3, + float32x2_t &x4, + float32x2_t &x5, + float32x2_t &x6, + float32x2_t &x7, + const float32x2_t &w, + const float32x2_t &w2, + const float32x2_t &w3, const float32x2_t &w4, - const float32x2_t &w5, const float32x2_t &w6) + const float32x2_t &w5, + const float32x2_t &w6) { const auto a = x1; const auto b = c_mul_neon(w, x2); @@ -220,47 +253,47 @@ void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f const auto f = c_mul_neon(w5, x6); const auto g = c_mul_neon(w6, x7); - const auto b0 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, b); - const auto b1 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, b); - const auto b2 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, b); - const auto b3 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, b); - const auto b4 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, b); - const auto b5 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, b); - - const auto c0 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, c); - const auto c1 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, c); - const auto c2 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, c); - const auto c3 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, c); - const auto c4 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, c); - const auto c5 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, c); - - const auto d0 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, d); - const auto d1 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, d); - const auto d2 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, d); - const auto d3 = c_mul_neon(float32x2_t{ -kW7_2, +kW7_3 }, d); - const auto d4 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, d); - const auto d5 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, d); - - const auto e0 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, e); - const auto e1 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, e); - const auto e2 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, e); - const auto e3 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, e); - const auto e4 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, e); - const auto e5 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, e); - - const auto f0 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, f); - const auto f1 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, f); - const auto f2 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, f); - const auto f3 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, f); - const auto f4 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, f); - const auto f5 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, f); - - const auto g0 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, g); - const auto g1 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, g); - const auto g2 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, g); - const auto g3 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, g); - const auto g4 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, g); - const auto g5 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, g); + const auto b0 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, b); + const auto b1 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, b); + const auto b2 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, b); + const auto b3 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, b); + const auto b4 = c_mul_neon(float32x2_t{-kW7_2, 
kW7_3}, b); + const auto b5 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, b); + + const auto c0 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, c); + const auto c1 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, c); + const auto c2 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, c); + const auto c3 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, c); + const auto c4 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, c); + const auto c5 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, c); + + const auto d0 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, d); + const auto d1 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, d); + const auto d2 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, d); + const auto d3 = c_mul_neon(float32x2_t{-kW7_2, +kW7_3}, d); + const auto d4 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, d); + const auto d5 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, d); + + const auto e0 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, e); + const auto e1 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, e); + const auto e2 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, e); + const auto e3 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, e); + const auto e4 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, e); + const auto e5 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, e); + + const auto f0 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, f); + const auto f1 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, f); + const auto f2 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, f); + const auto f3 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, f); + const auto f4 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, f); + const auto f5 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, f); + + const auto g0 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, g); + const auto g1 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, g); + const auto g2 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, g); + const auto g3 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, g); + const auto g4 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, g); + const auto g5 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, g); x1 = reduce_sum_7(a, b, c, d, e, f, g); x2 = reduce_sum_7(a, b0, c0, d0, e0, f0, g0); @@ -271,9 +304,20 @@ void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f x7 = reduce_sum_7(a, b5, c5, d5, e5, f5, g5); } -void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7, float32x2_t &x8, const float32x2_t &w, const float32x2_t &w2, +void fft_8(float32x2_t &x1, + float32x2_t &x2, + float32x2_t &x3, + float32x2_t &x4, + float32x2_t &x5, + float32x2_t &x6, + float32x2_t &x7, + float32x2_t &x8, + const float32x2_t &w, + const float32x2_t &w2, const float32x2_t &w3, - const float32x2_t &w4, const float32x2_t &w5, const float32x2_t &w6, + const float32x2_t &w4, + const float32x2_t &w5, + const float32x2_t &w6, const float32x2_t &w7) { const auto a = x1; @@ -285,61 +329,61 @@ void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f const auto g = c_mul_neon(w6, x7); const auto h = c_mul_neon(w7, x8); - const auto b0 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, b); - const auto b1 = c_mul_neon(float32x2_t{ 0, -1 }, b); - const auto b2 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, b); - const auto b3 = c_mul_neon(float32x2_t{ -1, 0 }, b); - const auto b4 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, b); - const auto b5 = c_mul_neon(float32x2_t{ 0, 1 }, b); - const auto b6 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, b); - - const auto c0 = c_mul_neon(float32x2_t{ 0, -1 }, c); - const auto c1 = c_mul_neon(float32x2_t{ -1, 0 }, c); - const auto c2 = c_mul_neon(float32x2_t{ 
0, 1 }, c); - const auto c3 = c_mul_neon(float32x2_t{ 1, 0 }, c); - const auto c4 = c_mul_neon(float32x2_t{ 0, -1 }, c); - const auto c5 = c_mul_neon(float32x2_t{ -1, 0 }, c); - const auto c6 = c_mul_neon(float32x2_t{ 0, 1 }, c); - - const auto d0 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, d); - const auto d1 = c_mul_neon(float32x2_t{ 0, 1 }, d); - const auto d2 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, d); - const auto d3 = c_mul_neon(float32x2_t{ -1, 0 }, d); - const auto d4 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, d); - const auto d5 = c_mul_neon(float32x2_t{ 0, -1 }, d); - const auto d6 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, d); - - const auto e0 = c_mul_neon(float32x2_t{ -1, 0 }, e); - const auto e1 = c_mul_neon(float32x2_t{ 1, 0 }, e); - const auto e2 = c_mul_neon(float32x2_t{ -1, 0 }, e); - const auto e3 = c_mul_neon(float32x2_t{ 1, 0 }, e); - const auto e4 = c_mul_neon(float32x2_t{ -1, 0 }, e); - const auto e5 = c_mul_neon(float32x2_t{ 1, 0 }, e); - const auto e6 = c_mul_neon(float32x2_t{ -1, 0 }, e); - - const auto f0 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, f); - const auto f1 = c_mul_neon(float32x2_t{ 0, -1 }, f); - const auto f2 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, f); - const auto f3 = c_mul_neon(float32x2_t{ -1, 0 }, f); - const auto f4 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, f); - const auto f5 = c_mul_neon(float32x2_t{ 0, 1 }, f); - const auto f6 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, f); - - const auto g0 = c_mul_neon(float32x2_t{ 0, 1 }, g); - const auto g1 = c_mul_neon(float32x2_t{ -1, 0 }, g); - const auto g2 = c_mul_neon(float32x2_t{ 0, -1 }, g); - const auto g3 = c_mul_neon(float32x2_t{ 1, 0 }, g); - const auto g4 = c_mul_neon(float32x2_t{ 0, 1 }, g); - const auto g5 = c_mul_neon(float32x2_t{ -1, 0 }, g); - const auto g6 = c_mul_neon(float32x2_t{ 0, -1 }, g); - - const auto h0 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, h); - const auto h1 = c_mul_neon(float32x2_t{ 0, 1 }, h); - const auto h2 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, h); - const auto h3 = c_mul_neon(float32x2_t{ -1, 0 }, h); - const auto h4 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, h); - const auto h5 = c_mul_neon(float32x2_t{ 0, -1 }, h); - const auto h6 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, h); + const auto b0 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, b); + const auto b1 = c_mul_neon(float32x2_t{0, -1}, b); + const auto b2 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, b); + const auto b3 = c_mul_neon(float32x2_t{-1, 0}, b); + const auto b4 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, b); + const auto b5 = c_mul_neon(float32x2_t{0, 1}, b); + const auto b6 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, b); + + const auto c0 = c_mul_neon(float32x2_t{0, -1}, c); + const auto c1 = c_mul_neon(float32x2_t{-1, 0}, c); + const auto c2 = c_mul_neon(float32x2_t{0, 1}, c); + const auto c3 = c_mul_neon(float32x2_t{1, 0}, c); + const auto c4 = c_mul_neon(float32x2_t{0, -1}, c); + const auto c5 = c_mul_neon(float32x2_t{-1, 0}, c); + const auto c6 = c_mul_neon(float32x2_t{0, 1}, c); + + const auto d0 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, d); + const auto d1 = c_mul_neon(float32x2_t{0, 1}, d); + const auto d2 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, d); + const auto d3 = c_mul_neon(float32x2_t{-1, 0}, d); + const auto d4 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, d); + const auto d5 = c_mul_neon(float32x2_t{0, -1}, 
d); + const auto d6 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, d); + + const auto e0 = c_mul_neon(float32x2_t{-1, 0}, e); + const auto e1 = c_mul_neon(float32x2_t{1, 0}, e); + const auto e2 = c_mul_neon(float32x2_t{-1, 0}, e); + const auto e3 = c_mul_neon(float32x2_t{1, 0}, e); + const auto e4 = c_mul_neon(float32x2_t{-1, 0}, e); + const auto e5 = c_mul_neon(float32x2_t{1, 0}, e); + const auto e6 = c_mul_neon(float32x2_t{-1, 0}, e); + + const auto f0 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, f); + const auto f1 = c_mul_neon(float32x2_t{0, -1}, f); + const auto f2 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, f); + const auto f3 = c_mul_neon(float32x2_t{-1, 0}, f); + const auto f4 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, f); + const auto f5 = c_mul_neon(float32x2_t{0, 1}, f); + const auto f6 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, f); + + const auto g0 = c_mul_neon(float32x2_t{0, 1}, g); + const auto g1 = c_mul_neon(float32x2_t{-1, 0}, g); + const auto g2 = c_mul_neon(float32x2_t{0, -1}, g); + const auto g3 = c_mul_neon(float32x2_t{1, 0}, g); + const auto g4 = c_mul_neon(float32x2_t{0, 1}, g); + const auto g5 = c_mul_neon(float32x2_t{-1, 0}, g); + const auto g6 = c_mul_neon(float32x2_t{0, -1}, g); + + const auto h0 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, h); + const auto h1 = c_mul_neon(float32x2_t{0, 1}, h); + const auto h2 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, h); + const auto h3 = c_mul_neon(float32x2_t{-1, 0}, h); + const auto h4 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, h); + const auto h5 = c_mul_neon(float32x2_t{0, -1}, h); + const auto h6 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, h); x1 = reduce_sum_8(a, b, c, d, e, f, g, h); x2 = reduce_sum_8(a, b0, c0, d0, e0, f0, g0, h0); @@ -352,18 +396,19 @@ void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f } template -void fft_radix_2_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) +void fft_radix_2_axes_0( + float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { - for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) { - auto a = float32x2_t{ 0, 0 }; - auto b = float32x2_t{ 0, 0 }; + auto a = float32x2_t{0, 0}; + auto b = float32x2_t{0, 0}; // Load inputs - if(first_stage) + if (first_stage) { const auto ab = wrapper::vloadq(in + k); a = wrapper::vgetlow(ab); @@ -379,7 +424,7 @@ void fft_radix_2_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR fft_2(a, b, w); // Write outputs - if(first_stage) + if (first_stage) { wrapper::vstore(out + k, wrapper::vcombine(a, b)); } @@ -394,12 +439,20 @@ void fft_radix_2_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR } } -void fft_radix_2_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x) +void fft_radix_2_axes_1(float *out, + float *in, + unsigned int Nx, + unsigned int NxRadix, + const float32x2_t &w_m, + unsigned int N, + unsigned int M, + unsigned int in_pad_x, + unsigned int out_pad_x) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { - 
for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) { // Load inputs float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k); @@ -418,20 +471,21 @@ void fft_radix_2_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR } template -void fft_radix_3_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) +void fft_radix_3_axes_0( + float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const auto w2 = c_mul_neon(w, w); - for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) { // Load inputs - float32x2_t a = { 0, 0 }; - float32x2_t b = { 0, 0 }; - float32x2_t c = { 0, 0 }; - if(first_stage) + float32x2_t a = {0, 0}; + float32x2_t b = {0, 0}; + float32x2_t c = {0, 0}; + if (first_stage) { const auto ab = wrapper::vloadq(in + k); a = wrapper::vgetlow(ab); @@ -447,7 +501,7 @@ void fft_radix_3_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR // Base-case prime transform fft_3(a, b, c, w, w2); - if(first_stage) + if (first_stage) { wrapper::vstore(out + k, wrapper::vcombine(a, b)); } @@ -462,14 +516,22 @@ void fft_radix_3_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR } } -void fft_radix_3_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x) +void fft_radix_3_axes_1(float *out, + float *in, + unsigned int Nx, + unsigned int NxRadix, + const float32x2_t &w_m, + unsigned int N, + unsigned int M, + unsigned int in_pad_x, + unsigned int out_pad_x) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const auto w2 = c_mul_neon(w, w); - for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) { // Load inputs float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k); @@ -489,21 +551,22 @@ void fft_radix_3_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR } template -void fft_radix_4_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) +void fft_radix_4_axes_0( + float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const auto w2 = c_mul_neon(w, w); const auto w3 = c_mul_neon(w2, w); - for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) { - float32x2_t a = { 0, 0 }; - float32x2_t b = { 0, 0 }; - float32x2_t c = { 0, 0 }; - float32x2_t d = { 0, 0 }; - if(first_stage) + float32x2_t a = {0, 0}; + float32x2_t b = {0, 0}; + float32x2_t c = {0, 0}; + float32x2_t d = {0, 0}; + if (first_stage) { const auto ab = wrapper::vloadq(in + k); const auto cd = wrapper::vloadq(in + k + 4 * Nx); @@ -524,7 +587,7 @@ void fft_radix_4_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR // Base-case prime transform fft_4(a, b, c, d, w, w2, w3); - if(first_stage) + if (first_stage) { wrapper::vstore(out + k, 
wrapper::vcombine(a, b)); wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d)); @@ -542,15 +605,23 @@ void fft_radix_4_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR } } -void fft_radix_4_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x) +void fft_radix_4_axes_1(float *out, + float *in, + unsigned int Nx, + unsigned int NxRadix, + const float32x2_t &w_m, + unsigned int N, + unsigned int M, + unsigned int in_pad_x, + unsigned int out_pad_x) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const auto w2 = c_mul_neon(w, w); const auto w3 = c_mul_neon(w2, w); - for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) { // Load inputs float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k); @@ -572,25 +643,26 @@ void fft_radix_4_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR } template -void fft_radix_5_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) +void fft_radix_5_axes_0( + float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const float32x2_t w2 = c_mul_neon(w, w); const float32x2_t w3 = c_mul_neon(w2, w); const float32x2_t w4 = c_mul_neon(w3, w); - for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) { - float32x2_t a = { 0, 0 }; - float32x2_t b = { 0, 0 }; - float32x2_t c = { 0, 0 }; - float32x2_t d = { 0, 0 }; - float32x2_t e = { 0, 0 }; + float32x2_t a = {0, 0}; + float32x2_t b = {0, 0}; + float32x2_t c = {0, 0}; + float32x2_t d = {0, 0}; + float32x2_t e = {0, 0}; // Load inputs - if(first_stage) + if (first_stage) { const auto ab = wrapper::vloadq(in + k); const auto cd = wrapper::vloadq(in + k + 4 * Nx); @@ -613,7 +685,7 @@ void fft_radix_5_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR fft_5(a, b, c, d, e, w, w2, w3, w4); // Store outputs - if(first_stage) + if (first_stage) { wrapper::vstore(out + k, wrapper::vcombine(a, b)); wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d)); @@ -632,16 +704,24 @@ void fft_radix_5_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR } } -void fft_radix_5_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x) +void fft_radix_5_axes_1(float *out, + float *in, + unsigned int Nx, + unsigned int NxRadix, + const float32x2_t &w_m, + unsigned int N, + unsigned int M, + unsigned int in_pad_x, + unsigned int out_pad_x) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const float32x2_t w2 = c_mul_neon(w, w); const float32x2_t w3 = c_mul_neon(w2, w); const float32x2_t w4 = c_mul_neon(w3, w); - for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) { // Load inputs float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k); @@ -666,10 +746,11 @@ void fft_radix_5_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR } template 
-void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) +void fft_radix_7_axes_0( + float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const float32x2_t w2 = c_mul_neon(w, w); const float32x2_t w3 = c_mul_neon(w2, w); @@ -677,18 +758,18 @@ void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR const float32x2_t w5 = c_mul_neon(w4, w); const float32x2_t w6 = c_mul_neon(w5, w); - for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) { - float32x2_t a = { 0, 0 }; - float32x2_t b = { 0, 0 }; - float32x2_t c = { 0, 0 }; - float32x2_t d = { 0, 0 }; - float32x2_t e = { 0, 0 }; - float32x2_t f = { 0, 0 }; - float32x2_t g = { 0, 0 }; + float32x2_t a = {0, 0}; + float32x2_t b = {0, 0}; + float32x2_t c = {0, 0}; + float32x2_t d = {0, 0}; + float32x2_t e = {0, 0}; + float32x2_t f = {0, 0}; + float32x2_t g = {0, 0}; // Load inputs - if(first_stage) + if (first_stage) { const auto ab = wrapper::vloadq(in + k); const auto cd = wrapper::vloadq(in + k + 4 * Nx); @@ -715,7 +796,7 @@ void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR // Base-case prime transform fft_7(a, b, c, d, e, f, g, w, w2, w3, w4, w5, w6); - if(first_stage) + if (first_stage) { wrapper::vstore(out + k, wrapper::vcombine(a, b)); wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d)); @@ -737,10 +818,18 @@ void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR } } -void fft_radix_7_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x) +void fft_radix_7_axes_1(float *out, + float *in, + unsigned int Nx, + unsigned int NxRadix, + const float32x2_t &w_m, + unsigned int N, + unsigned int M, + unsigned int in_pad_x, + unsigned int out_pad_x) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const float32x2_t w2 = c_mul_neon(w, w); const float32x2_t w3 = c_mul_neon(w2, w); @@ -748,7 +837,7 @@ void fft_radix_7_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR const float32x2_t w5 = c_mul_neon(w4, w); const float32x2_t w6 = c_mul_neon(w5, w); - for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) { // Load inputs float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k); @@ -777,10 +866,11 @@ void fft_radix_7_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR } template -void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) +void fft_radix_8_axes_0( + float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const float32x2_t w2 = c_mul_neon(w, w); const float32x2_t w3 = c_mul_neon(w2, w); @@ -789,20 +879,20 @@ void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR const float32x2_t w6 = c_mul_neon(w5, w); const float32x2_t w7 = c_mul_neon(w6, w); - for(unsigned int 
k = 2 * j; k < 2 * N; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) { // Load inputs - float32x2_t a = { 0, 0 }; - float32x2_t b = { 0, 0 }; - float32x2_t c = { 0, 0 }; - float32x2_t d = { 0, 0 }; - float32x2_t e = { 0, 0 }; - float32x2_t f = { 0, 0 }; - float32x2_t g = { 0, 0 }; - float32x2_t h = { 0, 0 }; + float32x2_t a = {0, 0}; + float32x2_t b = {0, 0}; + float32x2_t c = {0, 0}; + float32x2_t d = {0, 0}; + float32x2_t e = {0, 0}; + float32x2_t f = {0, 0}; + float32x2_t g = {0, 0}; + float32x2_t h = {0, 0}; // Base-case prime transform - if(first_stage) + if (first_stage) { const auto ab = wrapper::vloadq(in + k); const auto cd = wrapper::vloadq(in + k + 4 * Nx); @@ -834,7 +924,7 @@ void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR fft_8(a, b, c, d, e, f, g, h, w, w2, w3, w4, w5, w6, w7); // Store outputs - if(first_stage) + if (first_stage) { wrapper::vstore(out + k, wrapper::vcombine(a, b)); wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d)); @@ -858,10 +948,18 @@ void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR } } -void fft_radix_8_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x) +void fft_radix_8_axes_1(float *out, + float *in, + unsigned int Nx, + unsigned int NxRadix, + const float32x2_t &w_m, + unsigned int N, + unsigned int M, + unsigned int in_pad_x, + unsigned int out_pad_x) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const float32x2_t w2 = c_mul_neon(w, w); const float32x2_t w3 = c_mul_neon(w2, w); @@ -870,7 +968,7 @@ void fft_radix_8_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR const float32x2_t w6 = c_mul_neon(w5, w); const float32x2_t w7 = c_mul_neon(w6, w); - for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) { // Load inputs float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k); @@ -908,7 +1006,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_UNUSED(config); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -917,11 +1015,12 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config) +std::pair +validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config) { ARM_COMPUTE_UNUSED(config); - if(output != nullptr) + if (output != nullptr) { auto_init_if_empty(*output, *input); } @@ -942,7 +1041,7 @@ void NEFFTRadixStageKernel::set_radix_stage_axis0(const FFTRadixStageKernelInfo // FFT table axis 0: [radix, first_stage] static std::map> fft_table_axis0; - if(fft_table_axis0.empty()) + if (fft_table_axis0.empty()) { fft_table_axis0[2][false] = &fft_radix_2_axes_0; fft_table_axis0[3][false] = &fft_radix_3_axes_0; @@ -967,7 +1066,7 @@ void NEFFTRadixStageKernel::set_radix_stage_axis1(const FFTRadixStageKernelInfo // FFT table axis 1: [radix, first_stage] static std::map 
fft_table_axis1; - if(fft_table_axis1.empty()) + if (fft_table_axis1.empty()) { fft_table_axis1[2] = &fft_radix_2_axes_1; fft_table_axis1[3] = &fft_radix_3_axes_1; @@ -985,12 +1084,13 @@ void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFT ARM_COMPUTE_ERROR_ON_NULLPTR(input); // Output auto inizialitation if not yet initialized - if(output != nullptr) + if (output != nullptr) { auto_init_if_empty(*output->info(), *input->info()->clone()); } - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config)); _input = input; _output = (output == nullptr) ? input : output; @@ -998,7 +1098,7 @@ void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFT _axis = config.axis; _radix = config.radix; - switch(config.axis) + switch (config.axis) { case 0: set_radix_stage_axis0(config); @@ -1012,26 +1112,28 @@ void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFT } // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), (output != nullptr) ? output->info() : nullptr, config); + auto win_config = + validate_and_configure_window(input->info(), (output != nullptr) ? output->info() : nullptr, config); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); INEKernel::configure(win_config.second); } -Status NEFFTRadixStageKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config) +Status NEFFTRadixStageKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const FFTRadixStageKernelInfo &config) { const bool run_in_place = (output == nullptr) || (output == input); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, config)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), - (run_in_place) ? nullptr : output->clone().get(), - config) - .first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), (run_in_place) ? 
nullptr : output->clone().get(), config) + .first); return Status{}; } std::set NEFFTRadixStageKernel::supported_radix() { - return std::set { 2, 3, 4, 5, 7, 8 }; + return std::set{2, 3, 4, 5, 7, 8}; } void NEFFTRadixStageKernel::run(const Window &window, const ThreadInfo &info) @@ -1049,28 +1151,32 @@ void NEFFTRadixStageKernel::run(const Window &window, const ThreadInfo &info) // Precompute FFT constants const unsigned int NxRadix = _radix * _Nx; const float alpha = 2.0f * kPi / float(NxRadix); - const float32x2_t w_m{ cosf(alpha), -sinf(alpha) }; + const float32x2_t w_m{cosf(alpha), -sinf(alpha)}; - if(_axis == 0) + if (_axis == 0) { const unsigned int N = _input->info()->dimension(0); - execute_window_loop(input_window, [&](const Coordinates &) - { - _func_0(reinterpret_cast(out.ptr()), reinterpret_cast(in.ptr()), _Nx, NxRadix, w_m, N); - }, - in, out); + execute_window_loop( + input_window, + [&](const Coordinates &) { + _func_0(reinterpret_cast(out.ptr()), reinterpret_cast(in.ptr()), _Nx, NxRadix, w_m, + N); + }, + in, out); } else { const unsigned int N = _input->info()->dimension(0); const unsigned int M = _input->info()->dimension(1); - execute_window_loop(input_window, [&](const Coordinates &) - { - _func_1(reinterpret_cast(out.ptr()), reinterpret_cast(in.ptr()), _Nx, NxRadix, w_m, N, M, - _input->info()->padding().right + _input->info()->padding().left, - _output->info()->padding().right + _output->info()->padding().left); - }, - in, out); + execute_window_loop( + input_window, + [&](const Coordinates &) + { + _func_1(reinterpret_cast(out.ptr()), reinterpret_cast(in.ptr()), _Nx, NxRadix, w_m, N, + M, _input->info()->padding().right + _input->info()->padding().left, + _output->info()->padding().right + _output->info()->padding().left); + }, + in, out); } ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); diff --git a/src/core/NEON/kernels/NEFFTRadixStageKernel.h b/src/core/NEON/kernels/NEFFTRadixStageKernel.h index 2291a1068c9dd1c008a74e5cc0c5a5c8f361ce21..54f32efa230ec90341ef05532d1da49d68e07e1e 100644 --- a/src/core/NEON/kernels/NEFFTRadixStageKernel.h +++ b/src/core/NEON/kernels/NEFFTRadixStageKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NEFFTRADIXSTAGEKERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/NEON/INEKernel.h" #include @@ -92,8 +93,17 @@ private: void set_radix_stage_axis0(const FFTRadixStageKernelInfo &config); void set_radix_stage_axis1(const FFTRadixStageKernelInfo &config); - using FFTFunctionPointerAxis0 = std::function; - using FFTFunctionPointerAxis1 = std::function; + using FFTFunctionPointerAxis0 = + std::function; + using FFTFunctionPointerAxis1 = std::function; FFTFunctionPointerAxis0 _func_0; FFTFunctionPointerAxis1 _func_1; diff --git a/src/core/NEON/kernels/NEFFTScaleKernel.cpp b/src/core/NEON/kernels/NEFFTScaleKernel.cpp index 5ec330bebc8d70aab0a7f4ae55bf6c8221977e9c..9fe561fc59ac5b2c4c32532c0731717fc68ec19f 100644 --- a/src/core/NEON/kernels/NEFFTScaleKernel.cpp +++ b/src/core/NEON/kernels/NEFFTScaleKernel.cpp @@ -28,9 +28,10 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include @@ -41,8 +42,8 @@ namespace void scale_complex(float *c_in, float *c_out, bool is_conjugate, float scale) { const auto a = wrapper::vload(c_in); - auto b = wrapper::vdiv(a, float32x2_t{ scale, scale }); 
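
The radix kernels above all follow the same shape: walk the butterflies of one stage while a running twiddle factor advances by w_m = {cosf(alpha), -sinf(alpha)} with alpha = 2*pi/(radix*Nx). A minimal scalar sketch of a radix-2 stage is below, assuming the usual decimation-in-time butterfly (a, b) -> (a + w*b, a - w*b); std::complex<float> stands in for the interleaved float32x2_t lanes and c_mul_neon, so this illustrates the maths rather than the library's implementation.

#include <cmath>
#include <complex>
#include <vector>

// Scalar model of one radix-2 stage over N complex points (compare
// fft_radix_2_axes_0), assuming the usual DIT butterfly.
void radix2_stage(std::vector<std::complex<float>> &x, unsigned int Nx)
{
    constexpr float kPi     = 3.141592653589793f;
    const unsigned  N       = static_cast<unsigned>(x.size());
    const unsigned  NxRadix = 2 * Nx;
    const float     alpha   = 2.0f * kPi / float(NxRadix);
    // Same twiddle step as w_m{cosf(alpha), -sinf(alpha)} in the kernel.
    const std::complex<float> w_m{std::cos(alpha), -std::sin(alpha)};

    std::complex<float> w{1.0f, 0.0f};
    for (unsigned int j = 0; j < Nx; ++j)
    {
        for (unsigned int k = j; k < N; k += NxRadix)
        {
            const auto a  = x[k];
            const auto wb = w * x[k + Nx];
            x[k]      = a + wb; // even output of the butterfly
            x[k + Nx] = a - wb; // odd output of the butterfly
        }
        w *= w_m; // advance the running twiddle, as the kernels do with c_mul_neon
    }
}

The higher radices (3, 4, 5, 7, 8) differ only in how many inputs each butterfly mixes and in the precomputed constants (kSqrt2Div2 and friends) used inside fft_5/fft_7/fft_8.
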
- if(is_conjugate) + auto b = wrapper::vdiv(a, float32x2_t{scale, scale}); + if (is_conjugate) { const float img_part = wrapper::vgetlane(b, 1); b = wrapper::vsetlane(-img_part, b, 1); @@ -56,7 +57,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); @@ -71,7 +72,7 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen // Configure kernel window Window win = calculate_max_window(*input, Steps()); - if(output != nullptr) + if (output != nullptr) { // Output auto inizialitation if not yet initialized auto_init_if_empty(*output, *input->clone()); @@ -126,10 +127,10 @@ void NEFFTScaleKernel::run(const Window &window, const ThreadInfo &info) Iterator in(_input, input_window); Iterator out(_run_in_place ? _input : _output, input_window); - execute_window_loop(window, [&](const Coordinates &) - { - scale_complex(reinterpret_cast(in.ptr()), reinterpret_cast(out.ptr()), _is_conj, _scale); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &) + { scale_complex(reinterpret_cast(in.ptr()), reinterpret_cast(out.ptr()), _is_conj, _scale); }, + in, out); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEFFTScaleKernel.h b/src/core/NEON/kernels/NEFFTScaleKernel.h index 24a19f98ba3e71192311df534a1f0a09b3e9d86a..608cf5ea34212ccec75d63e16a9b04141fac16fc 100644 --- a/src/core/NEON/kernels/NEFFTScaleKernel.h +++ b/src/core/NEON/kernels/NEFFTScaleKernel.h @@ -24,10 +24,10 @@ #ifndef ARM_COMPUTE_NEFFTSCALEKERNEL_H #define ARM_COMPUTE_NEFFTSCALEKERNEL_H -#include "src/core/NEON/INEKernel.h" - #include "arm_compute/core/KernelDescriptors.h" +#include "src/core/NEON/INEKernel.h" + namespace arm_compute { // Forward declarations diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp index 1c7c1f9763b763595f1653c3c9434cc3332809e2..00b0c0ae8d43ccde88b57167f06b5a390052e53b 100644 --- a/src/core/NEON/kernels/NEFillBorderKernel.cpp +++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp @@ -30,14 +30,19 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/kernels/NEFillBorderKernel.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" namespace arm_compute { namespace { -inline void fill_constant_value_single_channel_special(ITensor *tensor, const Window &window, unsigned int right, unsigned int bottom, const PixelValue &constant_border_value) +inline void fill_constant_value_single_channel_special(ITensor *tensor, + const Window &window, + unsigned int right, + unsigned int bottom, + const PixelValue &constant_border_value) { float border_value; constant_border_value.get(border_value); @@ -52,39 +57,43 @@ inline void fill_constant_value_single_channel_special(ITensor *tensor, const Wi Iterator vertical_it(tensor, vertical); - execute_window_loop(vertical, [&](const Coordinates &) - { - const auto row_start = reinterpret_cast(start_valid_region + vertical_it.offset()); + execute_window_loop( + vertical, + [&](const Coordinates &) + { + const auto row_start = 
reinterpret_cast(start_valid_region + vertical_it.offset()); - // Fill left and right borders - *(row_start - 1) = border_value; - std::fill_n(row_start + width, right, border_value); - }, - vertical_it); + // Fill left and right borders + *(row_start - 1) = border_value; + std::fill_n(row_start + width, right, border_value); + }, + vertical_it); // Top and bottom border Iterator plane_it(tensor, window); // Iterate over all XY planes - execute_window_loop(window, [&](const Coordinates &) - { - uint8_t *base_addr = start_valid_region + plane_it.offset(); - // Top border - const auto row_start = reinterpret_cast(base_addr - stridey); - // Fill top rows including left/right borders - std::fill_n(row_start - 1, 1 + width + right, border_value); - - // Bottom border - const unsigned low_border_size = height + bottom; - for(unsigned int i = height; i < low_border_size; ++i) + execute_window_loop( + window, + [&](const Coordinates &) { - const auto row_start = reinterpret_cast(base_addr + i * stridey); - - // Fill bottom rows including left/right borders + uint8_t *base_addr = start_valid_region + plane_it.offset(); + // Top border + const auto row_start = reinterpret_cast(base_addr - stridey); + // Fill top rows including left/right borders std::fill_n(row_start - 1, 1 + width + right, border_value); - } - }, - plane_it); + + // Bottom border + const unsigned low_border_size = height + bottom; + for (unsigned int i = height; i < low_border_size; ++i) + { + const auto row_start = reinterpret_cast(base_addr + i * stridey); + + // Fill bottom rows including left/right borders + std::fill_n(row_start - 1, 1 + width + right, border_value); + } + }, + plane_it); } } // namespace @@ -93,14 +102,20 @@ NEFillBorderKernel::NEFillBorderKernel() { } -void NEFillBorderKernel::configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value) +void NEFillBorderKernel::configure(ITensor *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value) { ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); _tensor = tensor; configure(tensor->info(), border_size, border_mode, constant_border_value); } -void NEFillBorderKernel::configure(ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value) +void NEFillBorderKernel::configure(ITensorInfo *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value) { ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions. 
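
The fast path just shown, fill_constant_value_single_channel_special, writes a constant into the one-element left/top border plus the configurable right/bottom border of an F32 plane. A plain-pointer sketch of the same idea follows; width, height and stride are in elements here, whereas the kernel works with byte strides, iterators and padded regions, so treat it as an illustration rather than the kernel itself.

#include <algorithm>

// Fill a constant border around the valid width x height region of a float
// plane. `plane` points at the first valid element and the allocation is
// assumed to extend left/right/top/bottom elements beyond it on each side.
void fill_constant_border(float *plane, int width, int height, int stride,
                          int left, int right, int top, int bottom, float value)
{
    for (int y = 0; y < height; ++y) // left and right borders of each valid row
    {
        float *row = plane + y * stride;
        std::fill_n(row - left, left, value);
        std::fill_n(row + width, right, value);
    }
    for (int y = -top; y < 0; ++y) // top rows, including the corners
    {
        std::fill_n(plane + y * stride - left, left + width + right, value);
    }
    for (int y = height; y < height + bottom; ++y) // bottom rows, including the corners
    {
        std::fill_n(plane + y * stride - left, left + width + right, value);
    }
}
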
@@ -124,7 +139,7 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_UNUSED(info); // If there is no border: early exit - if(_border_size.empty()) + if (_border_size.empty()) { return; } @@ -132,13 +147,14 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - switch(_mode) + switch (_mode) { case BorderMode::CONSTANT: { - if(_border_size.left == 1 && _border_size.top == 1 && _tensor->info()->data_type() == DataType::F32) + if (_border_size.left == 1 && _border_size.top == 1 && _tensor->info()->data_type() == DataType::F32) { - fill_constant_value_single_channel_special(_tensor, window, _border_size.right, _border_size.bottom, _constant_border_value); + fill_constant_value_single_channel_special(_tensor, window, _border_size.right, _border_size.bottom, + _constant_border_value); } else { @@ -176,46 +192,56 @@ void NEFillBorderKernel::fill_replicate_single_channel(const Window &window) Iterator vertical_it(_tensor, vertical); - execute_window_loop(vertical, [&](const Coordinates &) - { - uint8_t *base_addr = start_valid_region + vertical_it.offset(); - // Fill left and right borders - for(unsigned int i = 0; i < _border_size.left; ++i) + execute_window_loop( + vertical, + [&](const Coordinates &) { - std::memcpy(base_addr + static_cast(i - _border_size.left) * element_size, vertical_it.ptr(), element_size); - } + uint8_t *base_addr = start_valid_region + vertical_it.offset(); + // Fill left and right borders + for (unsigned int i = 0; i < _border_size.left; ++i) + { + std::memcpy(base_addr + static_cast(i - _border_size.left) * element_size, vertical_it.ptr(), + element_size); + } - for(unsigned int i = 0; i < _border_size.right; ++i) - { - std::memcpy(base_addr + (width + i) * element_size, vertical_it.ptr() + (width - 1) * element_size, element_size); - } - }, - vertical_it); + for (unsigned int i = 0; i < _border_size.right; ++i) + { + std::memcpy(base_addr + (width + i) * element_size, vertical_it.ptr() + (width - 1) * element_size, + element_size); + } + }, + vertical_it); // Top and bottom border Iterator plane_it(_tensor, window); // Iterate over all XY planes - execute_window_loop(window, [&](const Coordinates &) - { - uint8_t *base_addr = start_valid_region + plane_it.offset(); - // Top border - for(int i = -_border_size.top; i < 0; ++i) + execute_window_loop( + window, + [&](const Coordinates &) { - // Copy top rows including left/right borders - std::memcpy(base_addr + i * static_cast(_tensor->info()->strides_in_bytes()[1]) - _border_size.left * element_size, - base_addr - _border_size.left * element_size, (_border_size.left + width + _border_size.right) * element_size); - } + uint8_t *base_addr = start_valid_region + plane_it.offset(); + // Top border + for (int i = -_border_size.top; i < 0; ++i) + { + // Copy top rows including left/right borders + std::memcpy(base_addr + i * static_cast(_tensor->info()->strides_in_bytes()[1]) - + _border_size.left * element_size, + base_addr - _border_size.left * element_size, + (_border_size.left + width + _border_size.right) * element_size); + } - // Bottom border - for(unsigned int i = height; i < height + _border_size.bottom; ++i) - { - // Copy bottom rows including left/right borders - std::memcpy(base_addr + i * _tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size, - base_addr + (height - 1) * _tensor->info()->strides_in_bytes()[1] - 
_border_size.left * element_size, (_border_size.left + width + _border_size.right) * element_size); - } - }, - plane_it); + // Bottom border + for (unsigned int i = height; i < height + _border_size.bottom; ++i) + { + // Copy bottom rows including left/right borders + std::memcpy(base_addr + i * _tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size, + base_addr + (height - 1) * _tensor->info()->strides_in_bytes()[1] - + _border_size.left * element_size, + (_border_size.left + width + _border_size.right) * element_size); + } + }, + plane_it); } void NEFillBorderKernel::fill_constant_value_single_channel(const Window &window) @@ -232,50 +258,57 @@ void NEFillBorderKernel::fill_constant_value_single_channel(const Window &window Iterator vertical_it(_tensor, vertical); - execute_window_loop(vertical, [&](const Coordinates &) - { - uint8_t *base_addr = start_valid_region + vertical_it.offset(); - // Fill left and right borders - for(unsigned int i = 0; i < _border_size.left; ++i) + execute_window_loop( + vertical, + [&](const Coordinates &) { - std::memcpy(base_addr + static_cast(i - _border_size.left) * element_size, &_constant_border_value, element_size); - } + uint8_t *base_addr = start_valid_region + vertical_it.offset(); + // Fill left and right borders + for (unsigned int i = 0; i < _border_size.left; ++i) + { + std::memcpy(base_addr + static_cast(i - _border_size.left) * element_size, &_constant_border_value, + element_size); + } - for(unsigned int i = 0; i < _border_size.right; ++i) - { - std::memcpy(base_addr + (width + i) * element_size, &_constant_border_value, element_size); - } - }, - vertical_it); + for (unsigned int i = 0; i < _border_size.right; ++i) + { + std::memcpy(base_addr + (width + i) * element_size, &_constant_border_value, element_size); + } + }, + vertical_it); // Top and bottom border Iterator plane_it(_tensor, window); // Iterate over all XY planes - execute_window_loop(window, [&](const Coordinates &) - { - uint8_t *base_addr = start_valid_region + plane_it.offset(); - // Top border - for(int i = -_border_size.top; i < 0; ++i) + execute_window_loop( + window, + [&](const Coordinates &) { - // Fill top rows including left/right borders - for(unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j) + uint8_t *base_addr = start_valid_region + plane_it.offset(); + // Top border + for (int i = -_border_size.top; i < 0; ++i) { - std::memcpy(base_addr + i * stridey + static_cast(j - _border_size.left) * element_size, &_constant_border_value, element_size); + // Fill top rows including left/right borders + for (unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j) + { + std::memcpy(base_addr + i * stridey + static_cast(j - _border_size.left) * element_size, + &_constant_border_value, element_size); + } } - } - // Bottom border - const unsigned low_border_size = height + _border_size.bottom; - for(unsigned int i = height; i < low_border_size; ++i) - { - // Fill bottom rows including left/right borders - for(unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j) + // Bottom border + const unsigned low_border_size = height + _border_size.bottom; + for (unsigned int i = height; i < low_border_size; ++i) { - std::memcpy(base_addr + i * stridey + static_cast(j - _border_size.left) * element_size, &_constant_border_value, element_size); + // Fill bottom rows including left/right borders + for (unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j) + { + 
std::memcpy(base_addr + i * stridey + static_cast(j - _border_size.left) * element_size, + &_constant_border_value, element_size); + } } - } - }, - plane_it); + }, + plane_it); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEFillBorderKernel.h b/src/core/NEON/kernels/NEFillBorderKernel.h index 2c851583ed1c75778807f278dd69721492d81923..aaad108bfa49122b64b24bc9905dae29964fa48b 100644 --- a/src/core/NEON/kernels/NEFillBorderKernel.h +++ b/src/core/NEON/kernels/NEFillBorderKernel.h @@ -26,6 +26,7 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -64,7 +65,10 @@ public: * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. * */ - void configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); + void configure(ITensor *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value = PixelValue()); /** Initialise the function. * * @note This kernel fills the borders within the XY-planes. @@ -75,7 +79,10 @@ public: * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. * */ - void configure(ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); + void configure(ITensorInfo *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value = PixelValue()); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp index 51a69046a90f289557b7db6dcd350891c3c8d3ce..cbe5136fb1b22e845f0b9b8ee2f481289a478786 100644 --- a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp +++ b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp @@ -22,7 +22,6 @@ * SOFTWARE. 
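
fill_replicate_single_channel, reflowed above, is the replicate-mode counterpart: it repeats the outermost valid elements into the side borders and copies whole edge rows into the top and bottom borders. A scalar sketch under the same plain-pointer assumptions as the previous example:

#include <algorithm>
#include <cstring>

// Replicate the outermost valid elements of a float plane into its border.
void fill_replicate_border(float *plane, int width, int height, int stride,
                           int left, int right, int top, int bottom)
{
    for (int y = 0; y < height; ++y) // extend each valid row sideways
    {
        float *row = plane + y * stride;
        std::fill_n(row - left, left, row[0]);           // repeat leftmost element
        std::fill_n(row + width, right, row[width - 1]); // repeat rightmost element
    }
    const int full_row = left + width + right;
    for (int y = -top; y < 0; ++y) // copy the first full row upwards
    {
        std::memcpy(plane + y * stride - left, plane - left, full_row * sizeof(float));
    }
    for (int y = height; y < height + bottom; ++y) // copy the last full row downwards
    {
        std::memcpy(plane + y * stride - left,
                    plane + (height - 1) * stride - left, full_row * sizeof(float));
    }
}

The kernel itself is element-size agnostic (it copies element_size bytes at a time), which is why it appears above with std::memcpy rather than typed stores.
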
*/ #include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h" -#include "src/cpu/kernels/fuse_batch_normalization/list.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" @@ -30,12 +29,14 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/common/cpuinfo/CpuIsaInfo.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/wrapper/wrapper.h" #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/fuse_batch_normalization/list.h" #include @@ -52,8 +53,16 @@ struct FuseBatchNormalizeSelectorData }; using FBNSelectorPtr = std::add_pointer::type; -using FBNUKernelPtr = std::add_pointer::type; +using FBNUKernelPtr = std::add_pointer::type; struct FBNUKernel { @@ -62,73 +71,63 @@ struct FBNUKernel FBNUKernelPtr ukernel; }; -static const FBNUKernel available_kernels[] = -{ - { - "fused_batch_normalization_conv_NHWC_F16", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; - }, - REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16) - }, - { - "fused_batch_normalization_conv_NCHW_F16", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; - }, - REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16) - }, - { - "fused_batch_normalization_dwc_NHWC_F16", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; - }, - REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f16) - }, - { - "fused_batch_normalization_dwc_NCHW_F16", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; - }, - REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f16) - }, - { - "fused_batch_normalization_conv_NHWC_F32", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt == DataType::F32 && data.dl == DataLayout::NHWC && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; - }, - REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32) - }, - { - "fused_batch_normalization_conv_NCHW_F32", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt == DataType::F32 && data.dl == DataLayout::NCHW && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; - }, - REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32) - }, - { - "fused_batch_normalization_dwc_NHWC_F32", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt == DataType::F32 && data.dl == DataLayout::NHWC && data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; - }, - REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f32) - }, - { - "fused_batch_normalization_dwc_NCHW_F32", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt == DataType::F32 && data.dl == DataLayout::NCHW && 
data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; - }, - REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f32) - } -}; +static const FBNUKernel available_kernels[] = { + {"fused_batch_normalization_conv_NHWC_F16", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 && + data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; + }, + REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16)}, + {"fused_batch_normalization_conv_NCHW_F16", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 && + data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; + }, + REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16)}, + {"fused_batch_normalization_dwc_NHWC_F16", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 && + data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; + }, + REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f16)}, + {"fused_batch_normalization_dwc_NCHW_F16", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 && + data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; + }, + REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f16)}, + {"fused_batch_normalization_conv_NHWC_F32", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F32 && data.dl == DataLayout::NHWC && + data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; + }, + REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32)}, + {"fused_batch_normalization_conv_NCHW_F32", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F32 && data.dl == DataLayout::NCHW && + data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; + }, + REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32)}, + {"fused_batch_normalization_dwc_NHWC_F32", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F32 && data.dl == DataLayout::NHWC && + data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; + }, + REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f32)}, + {"fused_batch_normalization_dwc_NCHW_F32", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F32 && data.dl == DataLayout::NCHW && + data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; + }, + REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f32)}}; /** Micro-kernel selector * @@ -140,9 +139,9 @@ static const FBNUKernel available_kernels[] = */ const FBNUKernel *get_implementation(const FuseBatchNormalizeSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -150,10 +149,16 @@ const FBNUKernel *get_implementation(const FuseBatchNormalizeSelectorData &data) return nullptr; } -Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma, - float epsilon, 
FuseBatchNormalizationType fbn_type) +Status validate_arguments(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias, + const ITensorInfo *bn_beta, + const ITensorInfo *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { ARM_COMPUTE_UNUSED(epsilon); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var); @@ -164,43 +169,44 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b ARM_COMPUTE_RETURN_ERROR_ON(input_bias == nullptr && fused_bias == nullptr); ARM_COMPUTE_RETURN_ERROR_ON(bn_mean->num_dimensions() > 1); - if(fbn_type == FuseBatchNormalizationType::CONVOLUTION) + if (fbn_type == FuseBatchNormalizationType::CONVOLUTION) { ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(3) != bn_mean->dimension(0)); } else { - const size_t channel_idx = get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL); + const size_t channel_idx = + get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(channel_idx) != bn_mean->dimension(0)); } // Validate bias - if(input_bias != nullptr) + if (input_bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, input_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, input_bias); } // Validate beta - if(bn_beta != nullptr) + if (bn_beta != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_beta); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_beta); } // Validate gamma - if(bn_gamma != nullptr) + if (bn_gamma != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_gamma); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_gamma); } // Validate output weights - if(fused_weights != nullptr && fused_weights->total_size() != 0) + if (fused_weights != nullptr && fused_weights->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_weights, fused_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input_weights, fused_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_weights); } // Validate output bias - if(fused_bias != nullptr && fused_bias->total_size() != 0) + if (fused_bias != nullptr && fused_bias->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, fused_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_bias); @@ -212,15 +218,31 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b } // namespace NEFuseBatchNormalizationKernel::NEFuseBatchNormalizationKernel() - : _input_weights(nullptr), _input_bias(nullptr), _bn_mean(nullptr), _bn_var(nullptr), _bn_gamma(nullptr), _bn_beta(nullptr), _fused_weights(nullptr), _fused_bias(nullptr), _epsilon(), - _run_in_place_weights(false), _run_in_place_bias(false), _func(nullptr) + : _input_weights(nullptr), + _input_bias(nullptr), + _bn_mean(nullptr), + _bn_var(nullptr), + _bn_gamma(nullptr), + _bn_beta(nullptr), + _fused_weights(nullptr), + _fused_bias(nullptr), + _epsilon(), + _run_in_place_weights(false), + _run_in_place_bias(false), + _func(nullptr) { } -void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var, - ITensor *fused_weights, ITensor *fused_bias, - const ITensor 
*input_bias, const ITensor *bn_beta, const ITensor *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights, + const ITensor *bn_mean, + const ITensor *bn_var, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *input_bias, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var); @@ -238,27 +260,27 @@ void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights, con _run_in_place_bias = (fused_bias == nullptr) || (input_bias != nullptr && fused_bias == input_bias); // Auto initialize outputs - if(_fused_weights != nullptr) + if (_fused_weights != nullptr) { // Output tensor auto initialization if not yet initialized auto_init_if_empty(*_fused_weights->info(), *_input_weights->info()->clone()); } - if(_fused_bias != nullptr) + if (_fused_bias != nullptr) { // Output tensor auto initialization if not yet initialized auto_init_if_empty(*_fused_bias->info(), *_bn_mean->info()->clone()); } // Validate arguments - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_weights->info(), bn_mean->info(), bn_var->info(), - (fused_weights != nullptr) ? fused_weights->info() : nullptr, - (fused_bias != nullptr) ? fused_bias->info() : nullptr, - (input_bias != nullptr) ? input_bias->info() : nullptr, - (bn_beta != nullptr) ? bn_beta->info() : nullptr, - (bn_gamma != nullptr) ? bn_gamma->info() : nullptr, - epsilon, fbn_type)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments( + input_weights->info(), bn_mean->info(), bn_var->info(), + (fused_weights != nullptr) ? fused_weights->info() : nullptr, + (fused_bias != nullptr) ? fused_bias->info() : nullptr, (input_bias != nullptr) ? input_bias->info() : nullptr, + (bn_beta != nullptr) ? bn_beta->info() : nullptr, (bn_gamma != nullptr) ? 
bn_gamma->info() : nullptr, epsilon, + fbn_type)); - const auto *uk = get_implementation(FuseBatchNormalizeSelectorData{ input_weights->info()->data_type(), input_weights->info()->data_layout(), fbn_type, CPUInfo::get().get_isa() }); + const auto *uk = get_implementation(FuseBatchNormalizeSelectorData{ + input_weights->info()->data_type(), input_weights->info()->data_layout(), fbn_type, CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr); _func = uk->ukernel; @@ -268,12 +290,19 @@ void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights, con INEKernel::configure(win); } -Status NEFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +Status NEFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias, + const ITensorInfo *bn_beta, + const ITensorInfo *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias, + input_bias, bn_beta, bn_gamma, epsilon, fbn_type)); return Status{}; } @@ -284,6 +313,7 @@ void NEFuseBatchNormalizationKernel::run(const Window &window, const ThreadInfo ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); ARM_COMPUTE_ERROR_ON(_func == nullptr); - (*_func)(_input_weights, _input_bias, _fused_weights, _fused_bias, _bn_mean, _bn_var, _bn_beta, _bn_gamma, _epsilon, window); + (*_func)(_input_weights, _input_bias, _fused_weights, _fused_bias, _bn_mean, _bn_var, _bn_beta, _bn_gamma, _epsilon, + window); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h index ee767b01c8eddc330ea4a2369a3dfb785691cc69..f23280d55a78819611cf6d85933f46d650d2f709 100644 --- a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h +++ b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h @@ -66,9 +66,16 @@ public: * @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f. * @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to CONVOLUTION. 
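
Behind these configure/validate signatures is the standard per-channel folding of batch normalization into the preceding convolution or depthwise convolution: w_fused = w * gamma / sqrt(var + eps) and b_fused = (b - mean) * gamma / sqrt(var + eps) + beta. The registered micro-kernels apply that identity per data type and layout; the scalar sketch below is only the reference arithmetic, with the optional gamma/beta/bias inputs mirrored as null pointers.

#include <cmath>
#include <cstddef>

// Fold the batch normalization parameters of one output channel into its
// convolution weights and bias (reference arithmetic, not the micro-kernel).
void fuse_batch_norm_channel(const float *weights, std::size_t weights_per_channel,
                             const float *bias, float mean, float var, float epsilon,
                             const float *gamma, const float *beta,
                             float *fused_weights, float *fused_bias)
{
    const float g     = (gamma != nullptr) ? *gamma : 1.0f;
    const float shift = (beta != nullptr) ? *beta : 0.0f;
    const float b     = (bias != nullptr) ? *bias : 0.0f;
    const float scale = g / std::sqrt(var + epsilon);

    for (std::size_t i = 0; i < weights_per_channel; ++i)
    {
        fused_weights[i] = weights[i] * scale;
    }
    *fused_bias = (b - mean) * scale + shift;
}
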
*/ - void configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *input_bias = nullptr, const ITensor *bn_beta = nullptr, const ITensor *bn_gamma = nullptr, - float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); + void configure(const ITensor *input_weights, + const ITensor *bn_mean, + const ITensor *bn_var, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *input_bias = nullptr, + const ITensor *bn_beta = nullptr, + const ITensor *bn_gamma = nullptr, + float epsilon = 0.001f, + FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); /** Static function to check if given info will lead to a valid configuration of @ref NEFuseBatchNormalizationKernel * * @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC @@ -86,10 +93,16 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr, - float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); + static Status validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias = nullptr, + const ITensorInfo *bn_beta = nullptr, + const ITensorInfo *bn_gamma = nullptr, + float epsilon = 0.001f, + FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; @@ -107,8 +120,16 @@ private: bool _run_in_place_weights; bool _run_in_place_bias; - using FuseBatchNormFunction = void(const ITensor *input_weights, const ITensor *input_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window); + using FuseBatchNormFunction = void(const ITensor *input_weights, + const ITensor *input_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window); FuseBatchNormFunction *_func; }; diff --git a/src/core/NEON/kernels/NEGatherKernel.cpp b/src/core/NEON/kernels/NEGatherKernel.cpp index 11332ffac8748bac7149c8b317374c2ba6a78235..f1d457d399f26b4c1217f5ef56706734d8962a72 100644 --- a/src/core/NEON/kernels/NEGatherKernel.cpp +++ b/src/core/NEON/kernels/NEGatherKernel.cpp @@ -27,9 +27,10 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -42,20 +43,22 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output); 
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); - if(axis < 0) + if (axis < 0) { axis += input->num_dimensions(); } ARM_COMPUTE_RETURN_ERROR_ON(0 > axis || axis >= static_cast(input->num_dimensions())); - ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > Coordinates::num_max_dimensions); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > + Coordinates::num_max_dimensions); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); - TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), axis); + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape( + input->tensor_shape(), indices->tensor_shape(), axis); ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); } @@ -81,23 +84,23 @@ void NEGatherKernel::gather_common(const Window &window, const ThreadInfo &info) const auto idx_info = _indices->info(); const auto dst_info = _output->info(); - const auto num_dims = dst_info->num_dimensions(); + const auto num_dims = dst_info->num_dimensions(); const auto chunk_stride = src_info->strides_in_bytes()[_axis]; const auto window_start_x = window.x().start(); - const auto window_end_x = window.x().end(); - auto window_size_x = src_info->element_size(); + const auto window_end_x = window.x().end(); + auto window_size_x = src_info->element_size(); const auto idx_limit = static_cast(src_info->tensor_shape()[_axis]); - if(_axis != 0) + if (_axis != 0) { dst_win.set(0, Window::Dimension(window_start_x, window_start_x + 1, 1)); window_size_x *= window_end_x - window_start_x; } // Compute source and index tensors window based on the output window. - auto src_win = dst_win; + auto src_win = dst_win; Window idx_win; for (size_t i = 0; i < idx_info->num_dimensions(); ++i) @@ -109,22 +112,27 @@ void NEGatherKernel::gather_common(const Window &window, const ThreadInfo &info) // Use the custom strides to access all three tensors using the same loop. 
Iterator src_it(num_dims, _src_it_strides, _input->buffer(), src_info->offset_first_element_in_bytes(), src_win); Iterator idx_it(num_dims, _idx_it_strides, _indices->buffer(), idx_info->offset_first_element_in_bytes(), idx_win); - Iterator dst_it(num_dims, dst_info->strides_in_bytes(), _output->buffer(), dst_info->offset_first_element_in_bytes(), dst_win); - - execute_window_loop(dst_win, [&](const Coordinates &) { - const auto idx = *reinterpret_cast(idx_it.ptr()); - - if(idx >= 0 && idx < idx_limit) - { - const auto src_ptr = src_it.ptr() + idx * chunk_stride; + Iterator dst_it(num_dims, dst_info->strides_in_bytes(), _output->buffer(), + dst_info->offset_first_element_in_bytes(), dst_win); - std::copy_n(src_ptr, window_size_x, dst_it.ptr()); - } - else + execute_window_loop( + dst_win, + [&](const Coordinates &) { - std::fill_n(dst_it.ptr(), window_size_x, 0); - } - }, src_it, idx_it, dst_it); + const auto idx = *reinterpret_cast(idx_it.ptr()); + + if (idx >= 0 && idx < idx_limit) + { + const auto src_ptr = src_it.ptr() + idx * chunk_stride; + + std::copy_n(src_ptr, window_size_x, dst_it.ptr()); + } + else + { + std::fill_n(dst_it.ptr(), window_size_x, 0); + } + }, + src_it, idx_it, dst_it); } void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis) @@ -137,13 +145,13 @@ void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITe _output = output; _axis = axis; - if(_axis < 0) + if (_axis < 0) { _axis += input->info()->num_dimensions(); } ARM_COMPUTE_ERROR_ON(0 > _axis || _axis >= static_cast(input->info()->num_dimensions())); - switch(_indices->info()->data_type()) + switch (_indices->info()->data_type()) { case DataType::U32: _func = &NEGatherKernel::gather_common; @@ -157,7 +165,8 @@ void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITe } // Output auto initialization if not yet initialized - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis); + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape( + input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis); auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); // Create window @@ -169,30 +178,31 @@ void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITe // These will be used to iterate lock-step through all tensors (input, indices and output). 
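
The lambda in gather_common reduces to the per-element logic below: read an index, bounds-check it against the size of the gathered axis, then copy one contiguous chunk from the source or zero-fill the destination. Raw byte pointers and an int32 index are used here purely for illustration (the kernel dispatches on U32 or S32 indices and drives the loop with Window iterators).

#include <algorithm>
#include <cstddef>
#include <cstdint>

// One gather step: copy chunk_size bytes from the idx-th chunk of src into
// dst, or zero-fill when the index is out of range, as the kernel above does.
void gather_chunk(const std::uint8_t *src, std::size_t chunk_stride, std::size_t chunk_size,
                  std::int32_t idx, std::int32_t idx_limit, std::uint8_t *dst)
{
    if (idx >= 0 && idx < idx_limit)
    {
        std::copy_n(src + static_cast<std::size_t>(idx) * chunk_stride, chunk_size, dst);
    }
    else
    {
        std::fill_n(dst, chunk_size, std::uint8_t{0});
    }
}
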
size_t dim_no = 0; - const auto input_info = input->info(); + const auto input_info = input->info(); const auto &input_strides = input_info->strides_in_bytes(); - const auto indices_info = indices->info(); - const auto &indices_strides = indices_info->strides_in_bytes(); - const auto indices_num_dims = indices_info->num_dimensions(); + const auto indices_info = indices->info(); + const auto &indices_strides = indices_info->strides_in_bytes(); + const auto indices_num_dims = indices_info->num_dimensions(); - for(; dim_no < static_cast(_axis); ++dim_no) + for (; dim_no < static_cast(_axis); ++dim_no) { _src_it_strides[dim_no] = input_strides[dim_no]; } - for(; dim_no < static_cast(_axis) + indices_num_dims; ++dim_no) + for (; dim_no < static_cast(_axis) + indices_num_dims; ++dim_no) { _idx_it_strides[dim_no] = indices_strides[dim_no - _axis]; } - for(; dim_no < Coordinates::num_max_dimensions; ++dim_no) + for (; dim_no < Coordinates::num_max_dimensions; ++dim_no) { _src_it_strides[dim_no] = input_strides[dim_no - indices_num_dims + 1]; } } -Status NEGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis) +Status +NEGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis)); return Status{}; diff --git a/src/core/NEON/kernels/NEGatherKernel.h b/src/core/NEON/kernels/NEGatherKernel.h index ce69daeda758d23dfc32ce9bd6526a29cc9d907e..b8c069f99e169d324310b9bcfd09422a4b34236a 100644 --- a/src/core/NEON/kernels/NEGatherKernel.h +++ b/src/core/NEON/kernels/NEGatherKernel.h @@ -26,6 +26,7 @@ #define ARM_COMPUTE_NEGATHERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -92,8 +93,8 @@ private: ITensor *_output; kernel_ptr _func; - Strides _src_it_strides; - Strides _idx_it_strides; + Strides _src_it_strides; + Strides _idx_it_strides; }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEGATHERKERNEL_H */ diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp index 7bba136e84b3371a02f2c2c905f70b53075da3a1..549319e49f61a43740a6a0b3f503d0710d4f56f4 100644 --- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp +++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp @@ -27,11 +27,13 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" -#include "src/core/CPP/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/genproposals/list.h" + #include namespace arm_compute @@ -44,7 +46,8 @@ struct ComputeAllAnchorsData }; using ComputeAllAnchorsSelectorPtr = std::add_pointer::type; -using ComputeAllAnchorsUKernelPtr = std::add_pointer::type; +using ComputeAllAnchorsUKernelPtr = std::add_pointer::type; struct ComputeAllAnchorsKernel { @@ -53,27 +56,17 @@ struct ComputeAllAnchorsKernel ComputeAllAnchorsUKernelPtr ukernel; }; -static const ComputeAllAnchorsKernel available_kernels[] = -{ +static const ComputeAllAnchorsKernel available_kernels[] = { #if defined(ARM_COMPUTE_ENABLE_NEON) - { - "neon_qu16_computeallanchors", - [](const ComputeAllAnchorsData & data) { return data.dt == DataType::QSYMM16; }, - 
REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_computeallanchors) - }, + {"neon_qu16_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::QSYMM16; }, + REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_computeallanchors)}, #endif //defined(ARM_COMPUTE_ENABLE_NEON) #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { - "neon_fp16_computeallanchors", - [](const ComputeAllAnchorsData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_computeallanchors) - }, + {"neon_fp16_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_computeallanchors)}, #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { - "neon_fp32_computeallanchors", - [](const ComputeAllAnchorsData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_computeallanchors) - }, + {"neon_fp32_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_computeallanchors)}, }; /** Micro-kernel selector @@ -84,9 +77,9 @@ static const ComputeAllAnchorsKernel available_kernels[] = */ const ComputeAllAnchorsKernel *get_implementation(const ComputeAllAnchorsData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -101,7 +94,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc ARM_COMPUTE_RETURN_ERROR_ON(anchors->dimension(0) != info.values_per_roi()); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(anchors, DataType::QSYMM16, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(anchors->num_dimensions() > 2); - if(all_anchors->total_size() > 0) + if (all_anchors->total_size() > 0) { const size_t feature_height = info.feat_height(); const size_t feature_width = info.feat_width(); @@ -111,7 +104,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(0) != info.values_per_roi()); ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(1) != feature_height * feature_width * num_anchors); - if(is_data_type_quantized(anchors->data_type())) + if (is_data_type_quantized(anchors->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(anchors, all_anchors); } @@ -139,7 +132,8 @@ void NEComputeAllAnchorsKernel::configure(const ITensor *anchors, ITensor *all_a // Initialize the output if empty const TensorShape output_shape(info.values_per_roi(), width * height * num_anchors); - auto_init_if_empty(*all_anchors->info(), TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info())); + auto_init_if_empty(*all_anchors->info(), + TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info())); // Set instance variables _anchors = anchors; @@ -151,7 +145,9 @@ void NEComputeAllAnchorsKernel::configure(const ITensor *anchors, ITensor *all_a INEKernel::configure(win); } -Status NEComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info) +Status NEComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, + const ITensorInfo *all_anchors, + const ComputeAnchorsInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(anchors, all_anchors, info)); return Status{}; @@ -163,7 +159,7 @@ void NEComputeAllAnchorsKernel::run(const 
Window &window, const ThreadInfo &info ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - const auto *uk = get_implementation(ComputeAllAnchorsData{ _anchors->info()->data_type() }); + const auto *uk = get_implementation(ComputeAllAnchorsData{_anchors->info()->data_type()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); uk->ukernel(_anchors, _all_anchors, _anchors_info, window); diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h index 297d6d4abe0f2de2ee2d7a207823d8d467650f30..30699eee01e63eedeb33381cbd6d32bf7d45142e 100644 --- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h +++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h @@ -78,5 +78,5 @@ private: ITensor *_all_anchors; ComputeAnchorsInfo _anchors_info; }; -} // arm_compute +} // namespace arm_compute #endif // ARM_COMPUTE_NEGENERATEPROPOSALSLAYERKERNEL_H diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp index 71641404bf9e07cffd8118946608c9f048bed4ce..0a1780f6eee3fee6a12ce866605181ec0d11535e 100644 --- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp @@ -31,12 +31,13 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/instancenorm/list.h" #include @@ -51,7 +52,13 @@ struct InstanceNormSelectorData }; using InstanceNormSelctorPtr = std::add_pointer::type; -using InstanceNormUKernelPtr = std::add_pointer::type; +using InstanceNormUKernelPtr = std::add_pointer::type; struct InstanceNormKernel { @@ -60,19 +67,12 @@ struct InstanceNormKernel InstanceNormUKernelPtr ukernel; }; -static const InstanceNormKernel available_kernels[] = -{ - { - "fp32_neon_instancenorm", - [](const InstanceNormSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_instancenorm) - }, +static const InstanceNormKernel available_kernels[] = { + {"fp32_neon_instancenorm", [](const InstanceNormSelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_instancenorm)}, #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { - "fp16_neon_instancenorm", - [](const InstanceNormSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_instancenorm) - }, + {"fp16_neon_instancenorm", [](const InstanceNormSelectorData &data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_instancenorm)}, #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC }; @@ -84,9 +84,9 @@ static const InstanceNormKernel available_kernels[] = */ const InstanceNormKernel *get_implementation(const InstanceNormSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -102,14 +102,16 @@ Status validate_arguments(const ITensorInfo 
*input, const ITensorInfo *output, f ARM_COMPUTE_RETURN_ERROR_ON_MSG(epsilon == 0.f, "Epsilon must be different than 0"); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC, "NHWC data layout is not supported by the kernel directly"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC, + "NHWC data layout is not supported by the kernel directly"); - if(output != nullptr && output->total_size() != 0) + if (output != nullptr && output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), "Input and output have different number of channels"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), + "Input and output have different number of channels"); } return Status{}; } @@ -132,7 +134,9 @@ NEInstanceNormalizationLayerKernel::NEInstanceNormalizationLayerKernel() { } -void NEInstanceNormalizationLayerKernel::configure(ITensor *input, ITensor *output, const InstanceNormalizationLayerKernelInfo &info) +void NEInstanceNormalizationLayerKernel::configure(ITensor *input, + ITensor *output, + const InstanceNormalizationLayerKernelInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); @@ -152,10 +156,13 @@ void NEInstanceNormalizationLayerKernel::configure(ITensor *input, ITensor *outp INEKernel::configure(std::get<1>(win_config)); } -Status NEInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info) +Status NEInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const InstanceNormalizationLayerKernelInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info.gamma, info.beta, info.epsilon)); - ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get())))); + ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window( + input->clone().get(), (output == nullptr ? 
input->clone().get() : output->clone().get())))); return Status{}; } @@ -165,7 +172,7 @@ void NEInstanceNormalizationLayerKernel::run(const Window &window, const ThreadI ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - const auto *uk = get_implementation(InstanceNormSelectorData{ _input->info()->data_type() }); + const auto *uk = get_implementation(InstanceNormSelectorData{_input->info()->data_type()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); uk->ukernel(_input, _output, _gamma, _beta, _epsilon, _use_mixed_precision, window); diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h index f166ce20587fa27afec75dde9f1a800cf5e0c2f4..024ccd9ef2bee46d649a9425810601757440d930 100644 --- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h +++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h @@ -68,7 +68,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; @@ -82,14 +83,15 @@ private: * @param[in] beta The offset scalar value applied to the normalized tensor. Defaults to 0.0 * @param[in] epsilon Lower bound value for the normalization. Defaults to 1e-12 */ - using NormalizationFunction = void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window); + using NormalizationFunction = + void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window); ITensor *_input; ITensor *_output; float _gamma; float _beta; float _epsilon; - bool _use_mixed_precision{ true }; + bool _use_mixed_precision{true}; }; } // namespace arm_compute #endif /*ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNEL_H */ diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp index 8ab0288ab1f22102ac13b60cede83a309b8ef346..eea57a17d3835bf514b981f00b4e26ba5d46b6c3 100644 --- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp +++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp @@ -30,11 +30,12 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/common/cpuinfo/CpuIsaInfo.h" -#include "src/core/NEON/NEMath.h" #include "src/core/common/Registrars.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEMath.h" #include "src/cpu/kernels/l2normlayer/list.h" #include @@ -55,7 +56,8 @@ struct L2NormalizeLayerSelectorData using L2NormalizeLayerKernelSelctorPtr = std::add_pointer::type; -using L2NormalizeLayerPtr = std::add_pointer::type; +using L2NormalizeLayerPtr = std::add_pointer::type; struct L2NormalizeLayerKernel { @@ -64,26 +66,25 @@ struct L2NormalizeLayerKernel L2NormalizeLayerPtr ukernel; }; -static const L2NormalizeLayerKernel available_kernels[] = -{ - { - "fp32_neon_l2normalize_x", - [](const L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F32 && data.actual_axis == Window::DimX; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_x) - }, - { - "fp32_neon_l2normalize_yz", - [](const 
L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F32 && data.actual_axis != Window::DimX; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_yz) - }, +static const L2NormalizeLayerKernel available_kernels[] = { + {"fp32_neon_l2normalize_x", + [](const L2NormalizeLayerSelectorData &data) + { return data.dt == DataType::F32 && data.actual_axis == Window::DimX; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_x)}, + {"fp32_neon_l2normalize_yz", + [](const L2NormalizeLayerSelectorData &data) + { return data.dt == DataType::F32 && data.actual_axis != Window::DimX; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_yz)}, { "fp16_neon_l2normalize_x", - [](const L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis == Window::DimX; }, + [](const L2NormalizeLayerSelectorData &data) + { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis == Window::DimX; }, REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_l2_normalize_x), }, { "fp16_neon_l2normalize_yz", - [](const L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis != Window::DimX; }, + [](const L2NormalizeLayerSelectorData &data) + { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis != Window::DimX; }, REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_l2_normalize_yz), }, }; @@ -96,9 +97,9 @@ static const L2NormalizeLayerKernel available_kernels[] = */ const L2NormalizeLayerKernel *get_implementation(const L2NormalizeLayerSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -106,7 +107,8 @@ const L2NormalizeLayerKernel *get_implementation(const L2NormalizeLayerSelectorD return nullptr; } -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) +Status +validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) { ARM_COMPUTE_UNUSED(epsilon); @@ -115,14 +117,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, cons ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis > 2, "Actual axis greater than 2 is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions, "Actual normalization axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions, + "Actual normalization axis greater than max number of dimensions"); // Reduce shape on axis TensorShape sum_shape = input->tensor_shape(); sum_shape.set(actual_axis, 1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(sum->tensor_shape(), sum_shape); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -151,7 +154,8 @@ NEL2NormalizeLayerKernel::NEL2NormalizeLayerKernel() { } -void NEL2NormalizeLayerKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output, int axis, float epsilon) +void NEL2NormalizeLayerKernel::configure( + const ITensor *input, const ITensor *sum, ITensor *output, int axis, float epsilon) 
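The selection tables above (available_kernels plus get_implementation) follow the same registry pattern used by the generate-proposals, instance-normalization and mean/std-dev kernels in this patch: an ordered array of {name, predicate, function pointer} entries scanned until the first predicate matches. A minimal standalone sketch of that pattern, assuming placeholder types (this DataType enum, SelectorData and the kernel signature are stand-ins, not the Compute Library definitions):

#include <cstddef>

enum class DataType { F16, F32 }; // stand-in for the real enum

struct SelectorData
{
    DataType dt;
};

using SelectorPtr = bool (*)(const SelectorData &);
using KernelPtr   = void (*)(const void *src, void *dst); // placeholder signature

struct MicroKernel
{
    const char *name;
    SelectorPtr is_selected;
    KernelPtr   ukernel;
};

void run_fp16(const void *, void *) { /* specialised path */ }
void run_fp32(const void *, void *) { /* generic path */ }

// Ordered table: the first entry whose predicate matches wins, so more
// specialised kernels are listed before generic fall-backs.
static const MicroKernel available_kernels[] = {
    {"fp16_kernel", [](const SelectorData &d) { return d.dt == DataType::F16; }, run_fp16},
    {"fp32_kernel", [](const SelectorData &d) { return d.dt == DataType::F32; }, run_fp32},
};

const MicroKernel *get_implementation(const SelectorData &data)
{
    for (const auto &uk : available_kernels)
    {
        if (uk.is_selected(data))
        {
            return &uk;
        }
    }
    return nullptr; // callers treat a null result as an error
}

Callers then look the kernel up once per run, e.g. const auto *uk = get_implementation({DataType::F32}); followed by uk->ukernel(src, dst), which is the shape of the run() methods in these kernels.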
{ ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), sum->info(), output->info(), axis, epsilon)); @@ -169,10 +173,12 @@ void NEL2NormalizeLayerKernel::configure(const ITensor *input, const ITensor *su INEKernel::configure(std::get<1>(win_config)); } -Status NEL2NormalizeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) +Status NEL2NormalizeLayerKernel::validate( + const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, sum, output, axis, epsilon)); - ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); + ARM_COMPUTE_RETURN_ON_ERROR( + std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); return Status{}; } @@ -183,12 +189,13 @@ void NEL2NormalizeLayerKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - if(_actual_axis > 2) + if (_actual_axis > 2) { ARM_COMPUTE_ERROR("Unsupported normalization axis"); } - const auto *uk = get_implementation(L2NormalizeLayerSelectorData{ _output->info()->data_type(), _actual_axis, CPUInfo::get().get_isa() }); + const auto *uk = get_implementation( + L2NormalizeLayerSelectorData{_output->info()->data_type(), _actual_axis, CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON(uk == nullptr); ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr); diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h index af3ad3403e4e0e46c225411a2f1092a75ac8c7a6..3524e66a211b0ed65d5cbc68a2fe24ae3bf30722 100644 --- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h +++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h @@ -74,7 +74,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon); + static Status + validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NELogicalKernel.cpp b/src/core/NEON/kernels/NELogicalKernel.cpp index 6939e08ef0324ceb69a63f0dcf66d7517fcb78ae..6be628452837d1d424b0528f941fe27c5fe63035 100644 --- a/src/core/NEON/kernels/NELogicalKernel.cpp +++ b/src/core/NEON/kernels/NELogicalKernel.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" + #include "src/common/utils/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -50,7 +51,7 @@ void neon_logical_and(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, ui ARM_COMPUTE_ASSERT_NOT_NULLPTR(src1); ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); - for(; len >= step; len -= step) + for (; len >= step; len -= step) { vst1q_u8(dst, vandq_u8(vminq_u8(vld1q_u8(src0), c1_x16), vminq_u8(vld1q_u8(src1), c1_x16))); src0 += step; @@ -58,7 +59,7 @@ void neon_logical_and(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, ui dst += step; } - for(; len >= half_step; len -= half_step) + for (; len >= half_step; len -= half_step) { vst1_u8(dst, vand_u8(vmin_u8(vld1_u8(src0), c1_x8), vmin_u8(vld1_u8(src1), c1_x8))); src0 += half_step; @@ 
-66,7 +67,7 @@ void neon_logical_and(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, ui dst += half_step; } - for(; len > 0; --len) + for (; len > 0; --len) { *dst = (*src0) && (*src1); ++src0; @@ -84,21 +85,21 @@ void neon_logical_and_broadcast(const uint8_t *src, uint8_t broadcast_val, uint8 const auto broadcast_val_clamped_x16 = vdupq_n_u8(broadcast_val_clamped_s); const auto broadcast_val_clamped_x8 = vdup_n_u8(broadcast_val_clamped_s); - for(; len >= step; len -= step) + for (; len >= step; len -= step) { vst1q_u8(dst, vandq_u8(vminq_u8(vld1q_u8(src), c1_x16), broadcast_val_clamped_x16)); src += step; dst += step; } - for(; len >= half_step; len -= half_step) + for (; len >= half_step; len -= half_step) { vst1_u8(dst, vand_u8(vmin_u8(vld1_u8(src), c1_x8), broadcast_val_clamped_x8)); src += half_step; dst += half_step; } - for(; len > 0; --len) + for (; len > 0; --len) { *dst = (*src) && broadcast_val_clamped_s; ++src; @@ -112,7 +113,7 @@ void neon_logical_or(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, uin ARM_COMPUTE_ASSERT_NOT_NULLPTR(src1); ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); - for(; len >= step; len -= step) + for (; len >= step; len -= step) { vst1q_u8(dst, vorrq_u8(vminq_u8(vld1q_u8(src0), c1_x16), vminq_u8(vld1q_u8(src1), c1_x16))); src0 += step; @@ -120,7 +121,7 @@ void neon_logical_or(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, uin dst += step; } - for(; len >= half_step; len -= half_step) + for (; len >= half_step; len -= half_step) { vst1_u8(dst, vorr_u8(vmin_u8(vld1_u8(src0), c1_x8), vmin_u8(vld1_u8(src1), c1_x8))); src0 += half_step; @@ -128,7 +129,7 @@ void neon_logical_or(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, uin dst += half_step; } - for(; len > 0; --len) + for (; len > 0; --len) { *dst = (*src0) || (*src1); ++src0; @@ -146,21 +147,21 @@ void neon_logical_or_broadcast(const uint8_t *src, uint8_t broadcast_val, uint8_ const auto broadcast_val_clamped_x16 = vdupq_n_u8(broadcast_val_clamped_s); const auto broadcast_val_clamped_x8 = vdup_n_u8(broadcast_val_clamped_s); - for(; len >= step; len -= step) + for (; len >= step; len -= step) { vst1q_u8(dst, vorrq_u8(vminq_u8(vld1q_u8(src), c1_x16), broadcast_val_clamped_x16)); src += step; dst += step; } - for(; len >= half_step; len -= half_step) + for (; len >= half_step; len -= half_step) { vst1_u8(dst, vorr_u8(vmin_u8(vld1_u8(src), c1_x8), broadcast_val_clamped_x8)); src += half_step; dst += half_step; } - for(; len > 0; --len) + for (; len > 0; --len) { *dst = (*src) || broadcast_val_clamped_s; ++src; @@ -173,21 +174,21 @@ void neon_logical_not(const uint8_t *src, uint8_t *dst, uint32_t len) ARM_COMPUTE_ASSERT_NOT_NULLPTR(src); ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); - for(; len >= step; len -= step) + for (; len >= step; len -= step) { vst1q_u8(dst, vbslq_u8(vceqq_u8(vld1q_u8(src), c0_x16), c1_x16, c0_x16)); src += step; dst += step; } - for(; len >= half_step; len -= half_step) + for (; len >= half_step; len -= half_step) { vst1_u8(dst, vbsl_u8(vceq_u8(vld1_u8(src), c0_x8), c1_x8, c0_x8)); src += half_step; dst += half_step; } - for(; len > 0; --len) + for (; len > 0; --len) { *dst = !(*src); ++src; @@ -197,18 +198,15 @@ void neon_logical_not(const uint8_t *src, uint8_t *dst, uint32_t len) void run_unary(const Window &window, const ITensor *src, ITensor *dst) { - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); const auto len = window.x().end() - window.x().start(); Iterator in(src, win); Iterator out(dst, win); - 
execute_window_loop(win, [&](const Coordinates &) - { - neon_logical_not(in.ptr(), out.ptr(), len); - }, - in, out); + execute_window_loop( + win, [&](const Coordinates &) { neon_logical_not(in.ptr(), out.ptr(), len); }, in, out); } void run_binary(const Window &window, const ITensor *src0, const ITensor *src1, ITensor *dst, LogicalOperation op) @@ -216,16 +214,17 @@ void run_binary(const Window &window, const ITensor *src0, const ITensor *src1, Window src0_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); Window src1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); const auto len = window.x().end() - window.x().start(); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { - using LogicalBroadcastUKernelPtr = std::add_pointer::type; - LogicalBroadcastUKernelPtr logical_func = op == LogicalOperation::Or ? &neon_logical_or_broadcast : &neon_logical_and_broadcast; + using LogicalBroadcastUKernelPtr = std::add_pointer::type; + LogicalBroadcastUKernelPtr logical_func = + op == LogicalOperation::Or ? &neon_logical_or_broadcast : &neon_logical_and_broadcast; const bool is_broadcast_input_1 = src1_win.x().step() == 0; Window broadcast_win = is_broadcast_input_1 ? src1_win : src0_win; @@ -238,17 +237,18 @@ void run_binary(const Window &window, const ITensor *src0, const ITensor *src1, Iterator non_broadcast_in(non_broadcast_tensor, non_broadcast_win); Iterator out(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const uint8_t broadcast_value = *broadcast_in.ptr(); - logical_func(non_broadcast_in.ptr(), broadcast_value, out.ptr(), len); - - }, - broadcast_in, non_broadcast_in, out); + execute_window_loop( + win, + [&](const Coordinates &) + { + const uint8_t broadcast_value = *broadcast_in.ptr(); + logical_func(non_broadcast_in.ptr(), broadcast_value, out.ptr(), len); + }, + broadcast_in, non_broadcast_in, out); } else { - using LogicalUKernelPtr = std::add_pointer::type; + using LogicalUKernelPtr = std::add_pointer::type; LogicalUKernelPtr logical_func = op == LogicalOperation::Or ? 
&neon_logical_or : &neon_logical_and; src0_win.set(Window::DimX, Window::Dimension(0, 1, 1)); @@ -257,11 +257,8 @@ void run_binary(const Window &window, const ITensor *src0, const ITensor *src1, Iterator in0(src0, src0_win); Iterator in1(src1, src1_win); Iterator out(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - logical_func(in0.ptr(), in1.ptr(), out.ptr(), len); - }, - in0, in1, out); + execute_window_loop( + win, [&](const Coordinates &) { logical_func(in0.ptr(), in1.ptr(), out.ptr(), len); }, in0, in1, out); } } } // namespace @@ -270,7 +267,10 @@ const char *NELogicalKernel::name() const return "NELogicalKernel"; } -void NELogicalKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, LogicalOperation op) +void NELogicalKernel::configure(const ITensorInfo *input1, + const ITensorInfo *input2, + ITensorInfo *output, + LogicalOperation op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, output); ARM_COMPUTE_ERROR_THROW_ON(validate(input1, input2, output, op)); @@ -279,7 +279,7 @@ void NELogicalKernel::configure(const ITensorInfo *input1, const ITensorInfo *in Window win = calculate_max_window(*input1, Steps()); TensorShape out_shape = input1->tensor_shape(); - if(op != LogicalOperation::Not) + if (op != LogicalOperation::Not) { ARM_COMPUTE_ERROR_ON_NULLPTR(input2); out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); @@ -292,13 +292,16 @@ void NELogicalKernel::configure(const ITensorInfo *input1, const ITensorInfo *in set_data_type_if_unknown(*output, input1->data_type()); } -Status NELogicalKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op) +Status NELogicalKernel::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + LogicalOperation op) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8); ARM_COMPUTE_RETURN_ERROR_ON(op == LogicalOperation::Unknown); TensorShape out_shape = input1->tensor_shape(); - if(op != LogicalOperation::Not) + if (op != LogicalOperation::Not) { out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); @@ -306,7 +309,7 @@ Status NELogicalKernel::validate(const ITensorInfo *input1, const ITensorInfo *i } // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); @@ -326,7 +329,7 @@ void NELogicalKernel::run_op(ITensorPack &tensors, const Window &window, const T const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1); ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - if(_op == LogicalOperation::Not) + if (_op == LogicalOperation::Not) { run_unary(window, src0, dst); } diff --git a/src/core/NEON/kernels/NELogicalKernel.h b/src/core/NEON/kernels/NELogicalKernel.h index caf69cf45da151ce3c4383ebacca3fe51503aa70..477a59d826e044a9ed87df5132dcda764bef2d74 100644 --- a/src/core/NEON/kernels/NELogicalKernel.h +++ b/src/core/NEON/kernels/NELogicalKernel.h @@ -58,10 +58,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op); + 
static Status + validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: diff --git a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp index 37e88a8565fd389cc6f528a73c837589ce1de8fd..451031d696b64e3e12254c9c1647fbde00f4892b 100644 --- a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp +++ b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp @@ -28,12 +28,13 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Window.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/meanstddevnorm/list.h" namespace arm_compute @@ -46,7 +47,8 @@ struct MeanStdDevNormSelectorData }; using MeanStdDevNormSelctorPtr = std::add_pointer::type; -using MeanStdDevNormUKernelPtr = std::add_pointer::type; +using MeanStdDevNormUKernelPtr = + std::add_pointer::type; struct MeanStdDevNormKernel { @@ -55,25 +57,15 @@ struct MeanStdDevNormKernel MeanStdDevNormUKernelPtr ukernel; }; -static const std::vector available_kernels = -{ - { - "fp32_neon_meanstddevnorm", - [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_meanstddevnorm) - }, +static const std::vector available_kernels = { + {"fp32_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_meanstddevnorm)}, #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { - "fp16_neon_meanstddevnorm", - [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_meanstddevnorm) - }, + {"fp16_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_meanstddevnorm)}, #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { - "qasymm8_neon_meanstddevnorm", - [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_meanstddevnorm) - }, + {"qasymm8_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_meanstddevnorm)}, }; /** Micro-kernel selector @@ -84,9 +76,9 @@ static const std::vector available_kernels = */ const MeanStdDevNormKernel *get_implementation(const MeanStdDevNormSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -103,7 +95,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QASYMM8); // Checks performed when 
output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -113,7 +105,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) { - if(output != nullptr) + if (output != nullptr) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); // Output auto inizialitation if not yet initialized @@ -128,8 +120,7 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen } } // namespace -NEMeanStdDevNormalizationKernel::NEMeanStdDevNormalizationKernel() - : _input(nullptr), _output(nullptr), _epsilon(1e-8f) +NEMeanStdDevNormalizationKernel::NEMeanStdDevNormalizationKernel() : _input(nullptr), _output(nullptr), _epsilon(1e-8f) { } @@ -137,7 +128,8 @@ void NEMeanStdDevNormalizationKernel::configure(ITensor *input, ITensor *output, { ARM_COMPUTE_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_ERROR_THROW_ON(NEMeanStdDevNormalizationKernel::validate(input->info(), (output != nullptr) ? output->info() : nullptr, epsilon)); + ARM_COMPUTE_ERROR_THROW_ON(NEMeanStdDevNormalizationKernel::validate( + input->info(), (output != nullptr) ? output->info() : nullptr, epsilon)); _input = input; _output = (output == nullptr) ? input : output; @@ -152,7 +144,9 @@ void NEMeanStdDevNormalizationKernel::configure(ITensor *input, ITensor *output, Status NEMeanStdDevNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, float epsilon) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, epsilon)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (output != nullptr) ? output->clone().get() : nullptr).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), (output != nullptr) ? 
output->clone().get() : nullptr) + .first); return Status{}; } @@ -162,7 +156,7 @@ void NEMeanStdDevNormalizationKernel::run(const Window &window, const ThreadInfo ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - const auto *uk = get_implementation(MeanStdDevNormSelectorData{ _output->info()->data_type() }); + const auto *uk = get_implementation(MeanStdDevNormSelectorData{_output->info()->data_type()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); uk->ukernel(_input, _output, _epsilon, window); diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp index 49a045382dd94fdbec151240c65ec8c622197e12..2c61bda14739045fc3de80a3cebc1557e0343c0e 100644 --- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp @@ -29,19 +29,23 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/NormalizationHelpers.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo &norm_info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *input_squared, + const ITensorInfo *output, + const NormalizationLayerInfo &norm_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_squared, output); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); @@ -52,7 +56,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *input_squ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd"); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); @@ -69,7 +73,10 @@ NENormalizationLayerKernel::NENormalizationLayerKernel() { } -void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info) +void NENormalizationLayerKernel::configure(const ITensor *input, + const ITensor *input_squared, + ITensor *output, + NormalizationLayerInfo norm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_squared, output); // Output tensor auto initialization if not yet initialized @@ -85,15 +92,15 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor * _output = output; _norm_info = norm_info; - switch(_input->info()->data_type()) + switch (_input->info()->data_type()) { case DataType::F32: { - switch(norm_idx) + switch (norm_idx) { case 0: { - if(norm_info.type() == NormType::IN_MAP_2D) + if (norm_info.type() == NormType::IN_MAP_2D) { _func = &NENormalizationLayerKernel::normalize_float; } @@ -104,7 +111,7 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor * break; } case 1: - if(norm_info.type() == NormType::IN_MAP_2D) + if (norm_info.type() == NormType::IN_MAP_2D) { 
_func = &NENormalizationLayerKernel::normalize_float; } @@ -124,11 +131,11 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor * #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: { - switch(norm_idx) + switch (norm_idx) { case 0: { - if(norm_info.type() == NormType::IN_MAP_2D) + if (norm_info.type() == NormType::IN_MAP_2D) { _func = &NENormalizationLayerKernel::normalize_float; } @@ -139,7 +146,7 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor * break; } case 1: - if(norm_info.type() == NormType::IN_MAP_2D) + if (norm_info.type() == NormType::IN_MAP_2D) { _func = &NENormalizationLayerKernel::normalize_float; } @@ -196,8 +203,9 @@ void NENormalizationLayerKernel::normalize_float(const Window &window) const auto beta_vec = wrapper::vdup_n(static_cast(_norm_info.beta()), ExactTagType{}); const auto kappa_vec = wrapper::vdup_n(static_cast(_norm_info.kappa()), ExactTagType{}); - auto sequential_normalization = [&](const int x, const Coordinates & id, const int current_row, const int first_row, const int last_row, const T * input_ptr, const uint8_t *input_squared_start_ptr, - T * output_ptr) + auto sequential_normalization = [&](const int x, const Coordinates &id, const int current_row, const int first_row, + const int last_row, const T *input_ptr, const uint8_t *input_squared_start_ptr, + T *output_ptr) { const int current_slice = dim == 0 ? x : id[dim]; const int first_slice = std::max(current_slice - radius, 0); @@ -206,75 +214,87 @@ void NENormalizationLayerKernel::normalize_float(const Window &window) const uint8_t *const input_squared_x_ptr = input_squared_start_ptr + x * input_squared_stride_x; // Accumulate 2D In-Map values auto accu = static_cast(0.f); - for(int j = first_row; j <= last_row; ++j) + for (int j = first_row; j <= last_row; ++j) { // Compute row displacement const uint8_t *const input_squared_ptr = input_squared_x_ptr + (j - current_row) * input_squared_stride_row; - for(int i = first_slice; i <= last_slice; ++i) + for (int i = first_slice; i <= last_slice; ++i) { - accu += *reinterpret_cast(input_squared_ptr + (i - current_slice) * input_squared_stride_slice); + accu += + *reinterpret_cast(input_squared_ptr + (i - current_slice) * input_squared_stride_slice); } } // Normalize - const auto normalized = std::pow(accu * static_cast(_norm_info.scale_coeff()) + static_cast(_norm_info.kappa()), _norm_info.beta()); + const auto normalized = std::pow( + accu * static_cast(_norm_info.scale_coeff()) + static_cast(_norm_info.kappa()), _norm_info.beta()); const auto normalized_pixel = (*(input_ptr + x)) / normalized; *(output_ptr + x) = normalized_pixel; }; - execute_window_loop(win, [&](const Coordinates & id) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - auto output_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &id) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + auto output_ptr = reinterpret_cast(output.ptr()); - // Get range to normalize - const int current_row = do_2D_norm ? id[dim_y] : 0; - const int first_row = do_2D_norm ? std::max(current_row - radius, 0) : 0; - const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0; + // Get range to normalize + const int current_row = do_2D_norm ? id[dim_y] : 0; + const int first_row = do_2D_norm ? std::max(current_row - radius, 0) : 0; + const int last_row = do_2D_norm ? 
std::min(current_row + radius, max_bottom) : 0; - int x = window_start_x; - // Compute serially starting elements for the case x dimension is width - for(; x < radius && x < window_end_x && dim == 0; ++x) - { - sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), output_ptr); - } + int x = window_start_x; + // Compute serially starting elements for the case x dimension is width + for (; x < radius && x < window_end_x && dim == 0; ++x) + { + sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), + output_ptr); + } - // Compute vectorized - for(; x <= window_end_x - window_step_x - radius; x += window_step_x) - { - const int current_slice = dim == 0 ? x : id[dim]; - const int first_slice = std::max(current_slice - radius, 0); - const int last_slice = std::min(current_slice + radius, max_right); - - const uint8_t *const input_squared_x_ptr = input_squared.ptr() + x * input_squared_stride_x; - // Accumulate 2D In-Map values - auto accu = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - for(int j = first_row; j <= last_row; ++j) + // Compute vectorized + for (; x <= window_end_x - window_step_x - radius; x += window_step_x) { - // Compute row displacement - const uint8_t *const input_squared_ptr = input_squared_x_ptr + (j - current_row) * input_squared_stride_row; - for(int i = first_slice; i <= last_slice; ++i) + const int current_slice = dim == 0 ? x : id[dim]; + const int first_slice = std::max(current_slice - radius, 0); + const int last_slice = std::min(current_slice + radius, max_right); + + const uint8_t *const input_squared_x_ptr = input_squared.ptr() + x * input_squared_stride_x; + // Accumulate 2D In-Map values + auto accu = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + for (int j = first_row; j <= last_row; ++j) { - accu = wrapper::vadd(accu, wrapper::vloadq(reinterpret_cast(input_squared_ptr + (i - current_slice) * input_squared_stride_slice))); + // Compute row displacement + const uint8_t *const input_squared_ptr = + input_squared_x_ptr + (j - current_row) * input_squared_stride_row; + for (int i = first_slice; i <= last_slice; ++i) + { + accu = wrapper::vadd( + accu, wrapper::vloadq(reinterpret_cast( + input_squared_ptr + (i - current_slice) * input_squared_stride_slice))); + } } - } - // Normalize - const auto normalized = wrapper::vpow(wrapper::vmla(kappa_vec, coeff_vec, accu), beta_vec); - const auto normalized_pixel = wrapper::vmul(wrapper::vloadq(input_ptr + x), wrapper::vinv(normalized)); - wrapper::vstore(reinterpret_cast(output_ptr + x), normalized_pixel); - } + // Normalize + const auto normalized = wrapper::vpow(wrapper::vmla(kappa_vec, coeff_vec, accu), beta_vec); + const auto normalized_pixel = wrapper::vmul(wrapper::vloadq(input_ptr + x), wrapper::vinv(normalized)); + wrapper::vstore(reinterpret_cast(output_ptr + x), normalized_pixel); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), output_ptr); - } - }, - input, input_squared, output); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), + output_ptr); + } + }, + input, input_squared, output); } -Status NENormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo norm_info) +Status 
NENormalizationLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *input_squared, + const ITensorInfo *output, + const NormalizationLayerInfo norm_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, input_squared, output, norm_info)); diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.h b/src/core/NEON/kernels/NENormalizationLayerKernel.h index 53a06b9ed9907d76a7edcc8435413fd659dc88c4..2d8d9f3d60750df8f8a039c577bae587503e2a23 100644 --- a/src/core/NEON/kernels/NENormalizationLayerKernel.h +++ b/src/core/NEON/kernels/NENormalizationLayerKernel.h @@ -60,7 +60,8 @@ public: * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type and layout supported: same as @p input. * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters. */ - void configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info); + void + configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info); /** Static function to check if given info will lead to a valid configuration of @ref NENormalizationLayerKernel * * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], @@ -72,7 +73,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, NormalizationLayerInfo norm_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *input_squared, + const ITensorInfo *output, + NormalizationLayerInfo norm_info); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEPadLayerKernel.cpp b/src/core/NEON/kernels/NEPadLayerKernel.cpp index 734510b637da46491a360bdba89db7269e34d578..c9bcbc912755b0f8d6a62ff61647621f4163b07b 100644 --- a/src/core/NEON/kernels/NEPadLayerKernel.cpp +++ b/src/core/NEON/kernels/NEPadLayerKernel.cpp @@ -28,26 +28,31 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &paddings, const PaddingMode mode) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &paddings, + const PaddingMode mode) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_MSG(mode != PaddingMode::CONSTANT, "Only constant padding mode is supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(paddings.size() > 4, "Padding list bigger than 4 dimensions"); - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorShape expected_output_shape = arm_compute::misc::shape_calculator::compute_padded_shape(input->tensor_shape(), paddings); - const TensorInfo expected_output_info = input->clone()->set_tensor_shape(expected_output_shape); + const TensorShape expected_output_shape = + 
arm_compute::misc::shape_calculator::compute_padded_shape(input->tensor_shape(), paddings); + const TensorInfo expected_output_info = input->clone()->set_tensor_shape(expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &expected_output_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } @@ -58,30 +63,34 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c template void NEPadLayerKernel::run_pad_constant(const Window &window) { - Window output_window{ window }; + Window output_window{window}; output_window.set(Window::DimX, Window::Dimension(0, 1, 1)); const size_t element_size = _input->info()->element_size(); Iterator output_it(_output, output_window); - execute_window_loop(output_window, [&](const Coordinates & id) - { - Coordinates idin{ id }; - for(size_t dim = _padding.size() - 1; dim > 0; --dim) + execute_window_loop( + output_window, + [&](const Coordinates &id) { - idin[dim] -= _padding[dim].first; - if(idin[dim] < 0 || static_cast(_input->info()->dimension(dim)) - 1 < idin[dim]) + Coordinates idin{id}; + for (size_t dim = _padding.size() - 1; dim > 0; --dim) { - std::fill_n(reinterpret_cast(output_it.ptr()), _output->info()->dimension(0), _constant_value.get()); - return; + idin[dim] -= _padding[dim].first; + if (idin[dim] < 0 || static_cast(_input->info()->dimension(dim)) - 1 < idin[dim]) + { + std::fill_n(reinterpret_cast(output_it.ptr()), _output->info()->dimension(0), + _constant_value.get()); + return; + } } - } - T *input_it_ptr = reinterpret_cast(_input->ptr_to_element(idin)); - T *output_it_ptr = reinterpret_cast(output_it.ptr()); - std::fill_n(output_it_ptr, _padding[0].first, _constant_value.get()); - memcpy(output_it_ptr + _padding[0].first, input_it_ptr, _input->info()->dimension(0) * element_size); - std::fill_n(output_it_ptr + _padding[0].first + _input->info()->dimension(0), _padding[0].second, _constant_value.get()); - }, - output_it); + T *input_it_ptr = reinterpret_cast(_input->ptr_to_element(idin)); + T *output_it_ptr = reinterpret_cast(output_it.ptr()); + std::fill_n(output_it_ptr, _padding[0].first, _constant_value.get()); + memcpy(output_it_ptr + _padding[0].first, input_it_ptr, _input->info()->dimension(0) * element_size); + std::fill_n(output_it_ptr + _padding[0].first + _input->info()->dimension(0), _padding[0].second, + _constant_value.get()); + }, + output_it); } void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window) @@ -92,7 +101,7 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window const size_t end_plane = window.z().end(); size_t start_plane_input = start_plane; - if(_padding.size() > 2) + if (_padding.size() > 2) { start_plane_input = (start_plane < _padding[2].first) ? 
0 : start_plane - _padding[2].first; } @@ -105,18 +114,20 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window const size_t jump_to_next_row_input = _input->info()->dimension(0); const size_t jump_to_next_row_output = _padding[0].first + _padding[0].second; - uint8_t *output_row_ptr = _output->buffer() + _output->info()->offset_first_element_in_bytes() + start_plane * output_plane_size; - const uint8_t *input_it_ptr = _input->buffer() + _input->info()->offset_first_element_in_bytes() + start_plane_input * input_plane_size; - const auto pad_value = _constant_value.get(); + uint8_t *output_row_ptr = + _output->buffer() + _output->info()->offset_first_element_in_bytes() + start_plane * output_plane_size; + const uint8_t *input_it_ptr = + _input->buffer() + _input->info()->offset_first_element_in_bytes() + start_plane_input * input_plane_size; + const auto pad_value = _constant_value.get(); - for(size_t z_i = start_plane; z_i < end_plane; ++z_i) + for (size_t z_i = start_plane; z_i < end_plane; ++z_i) { - if(_padding.size() > 2 && z_i < _padding[2].first) + if (_padding.size() > 2 && z_i < _padding[2].first) { memset(output_row_ptr, pad_value, output_plane_size); output_row_ptr += output_plane_size; } - else if(_padding.size() > 2 && z_i > (_input->info()->dimension(2) + _padding[2].first - 1)) + else if (_padding.size() > 2 && z_i > (_input->info()->dimension(2) + _padding[2].first - 1)) { memset(output_row_ptr, pad_value, output_plane_size); output_row_ptr += output_plane_size; @@ -127,7 +138,7 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window output_row_ptr += pad_y_elems_top; size_t y_i = _input->info()->dimension(1); // Basic loop unrolling - for(; y_i > 3; y_i -= 4) + for (; y_i > 3; y_i -= 4) { memset(output_row_ptr, pad_value, _padding[0].first); output_row_ptr += _padding[0].first; @@ -160,7 +171,7 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window memset(output_row_ptr, pad_value, _padding[0].second); output_row_ptr += _padding[0].second; } - for(; y_i > 0; --y_i) + for (; y_i > 0; --y_i) { memset(output_row_ptr, pad_value, _padding[0].first); output_row_ptr += _padding[0].first; @@ -183,12 +194,17 @@ NEPadLayerKernel::NEPadLayerKernel() { } -void NEPadLayerKernel::configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode) +void NEPadLayerKernel::configure(ITensor *input, + ITensor *output, + const PaddingList &padding, + const PixelValue constant_value, + const PaddingMode mode) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); // Auto-init - const TensorShape expected_output_shape = arm_compute::misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding); - const TensorInfo expected_output_info = input->info()->clone()->set_tensor_shape(expected_output_shape); + const TensorShape expected_output_shape = + arm_compute::misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding); + const TensorInfo expected_output_info = input->info()->clone()->set_tensor_shape(expected_output_shape); auto_init_if_empty(*output->info(), expected_output_info); // Perform validation step @@ -200,14 +216,14 @@ void NEPadLayerKernel::configure(ITensor *input, ITensor *output, const PaddingL _constant_value = constant_value; _mode = mode; - if(_mode == PaddingMode::CONSTANT) + if (_mode == PaddingMode::CONSTANT) { - switch(_input->info()->element_size()) + switch (_input->info()->element_size()) 
{ case 1: - if(_input->info()->num_dimensions() == 3 && // Is 3D - padding.size() <= 3 && // Has 3D padding - !_input->info()->has_padding() && !_output->info()->has_padding()) // Input & Output have no padding + if (_input->info()->num_dimensions() == 3 && // Is 3D + padding.size() <= 3 && // Has 3D padding + !_input->info()->has_padding() && !_output->info()->has_padding()) // Input & Output have no padding { _func = &NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad; } @@ -240,7 +256,11 @@ void NEPadLayerKernel::configure(ITensor *input, ITensor *output, const PaddingL ICPPKernel::configure(win); } -Status NEPadLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode) +Status NEPadLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + const PixelValue constant_value, + const PaddingMode mode) { ARM_COMPUTE_UNUSED(constant_value); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, mode)); @@ -253,7 +273,7 @@ void NEPadLayerKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - if(_func != nullptr) + if (_func != nullptr) { (this->*_func)(window); } @@ -263,7 +283,7 @@ size_t NEPadLayerKernel::get_mws(const CPUInfo &platform, size_t thread_count) c { ARM_COMPUTE_UNUSED(thread_count); ARM_COMPUTE_UNUSED(platform); - + return ICPPKernel::default_mws; } diff --git a/src/core/NEON/kernels/NEPadLayerKernel.h b/src/core/NEON/kernels/NEPadLayerKernel.h index f82af1558a57872e312ca026e1b73a5fe2c5c6a5..d432887d2c4f223268b6e2d8ebc29108e18363a7 100644 --- a/src/core/NEON/kernels/NEPadLayerKernel.h +++ b/src/core/NEON/kernels/NEPadLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NEPADLAYERKERNEL_H #include "arm_compute/core/PixelValue.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -62,7 +63,11 @@ public: * @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT. * Only CONSTANT padding mode is currently supported */ - void configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT); + void configure(ITensor *input, + ITensor *output, + const PaddingList &padding, + const PixelValue constant_value = PixelValue(), + const PaddingMode mode = PaddingMode::CONSTANT); /** Static function to check if given info will lead to a valid configuration of @ref NEPadLayer. * * @param[in] input Source tensor info. Data types supported: All. 
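To illustrate the constant-padding layout this kernel produces (and that the uint8 3-D fast path above writes plane by plane), here is a minimal standalone sketch for a 2-D row-major tensor; the function and parameter names are illustrative, not the Compute Library API, and per-dimension padding follows the (before, after) order of the PaddingList with dimension 0 being the width:

#include <algorithm>
#include <cstddef>
#include <vector>

// Pad a row-major [height, width] tensor with a constant value, producing an
// output of [top + height + bottom, left + width + right].
std::vector<float> pad_constant_2d(const std::vector<float> &src,
                                   size_t height, size_t width,
                                   size_t left, size_t right,
                                   size_t top, size_t bottom,
                                   float constant_value)
{
    const size_t out_w = left + width + right;
    const size_t out_h = top + height + bottom;

    // Pre-fill the whole output with the pad value, then copy the input into
    // the interior. The kernel instead writes each output row as
    // fill left pad / memcpy input row / fill right pad, which yields the
    // same result without touching interior elements twice.
    std::vector<float> dst(out_w * out_h, constant_value);
    for (size_t y = 0; y < height; ++y)
    {
        std::copy_n(src.data() + y * width, width, dst.data() + (y + top) * out_w + left);
    }
    return dst;
}

The kernel's fast path is only taken when input and output carry no internal padding, so entire rows and fully padded planes can be written with plain memcpy/memset instead of the per-element window iterator.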
@@ -75,7 +80,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + const PixelValue constant_value = PixelValue(), + const PaddingMode mode = PaddingMode::CONSTANT); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp index 3d89933377f856e4a920337270d9b8b4fc0a1279..15e933e66eb1714dbd0165f58b8de1bf184a3cd4 100644 --- a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp +++ b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -36,7 +37,10 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info) +Status validate_arguments(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32); @@ -45,10 +49,10 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, // Check variances const int var_size = info.variances().size(); - if(var_size > 1) + if (var_size > 1) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size != 4, "Must provide 4 variance values"); - for(int i = 0; i < var_size; ++i) + for (int i = 0; i < var_size; ++i) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size <= 0, "Must be greater than 0"); } @@ -56,17 +60,19 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[0] < 0.f, "Step x should be greater or equal to 0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[1] < 0.f, "Step y should be greater or equal to 0"); - if(!info.max_sizes().empty()) + if (!info.max_sizes().empty()) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), "Max and min sizes dimensions should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), + "Max and min sizes dimensions should match"); } - for(unsigned int i = 0; i < info.max_sizes().size(); ++i) + for (unsigned int i = 0; i < info.max_sizes().size(); ++i) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], "Max size should be greater than min size"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], + "Max size should be greater than min size"); } - if(output != nullptr && output->total_size() != 0) + if (output != nullptr && output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != 2); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); @@ -76,21 +82,26 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, } } // namespace -NEPriorBoxLayerKernel::NEPriorBoxLayerKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr), _info() +NEPriorBoxLayerKernel::NEPriorBoxLayerKernel() : 
_input1(nullptr), _input2(nullptr), _output(nullptr), _info() { } -void NEPriorBoxLayerKernel::store_coordinates(float *out, const int offset, const float center_x, const float center_y, const float box_width, const float box_height, const int width, - const int height) +void NEPriorBoxLayerKernel::store_coordinates(float *out, + const int offset, + const float center_x, + const float center_y, + const float box_width, + const float box_height, + const int width, + const int height) { float xmin = (center_x - box_width / 2.f) / width; float ymin = (center_y - box_height / 2.f) / height; float xmax = (center_x + box_width / 2.f) / width; float ymax = (center_y + box_height / 2.f) / height; - float32x4_t vec_elements = { xmin, ymin, xmax, ymax }; - if(_info.clip()) + float32x4_t vec_elements = {xmin, ymin, xmax, ymax}; + if (_info.clip()) { static const float32x4_t CONST_0 = vdupq_n_f32(0.f); static const float32x4_t CONST_1 = vdupq_n_f32(1.f); @@ -112,7 +123,7 @@ void NEPriorBoxLayerKernel::calculate_prior_boxes(const Window &window) int img_width = _info.img_size().x; int img_height = _info.img_size().y; - if(img_width == 0 || img_height == 0) + if (img_width == 0 || img_height == 0) { img_width = _input2->info()->dimension(width_idx); img_height = _input2->info()->dimension(height_idx); @@ -120,7 +131,7 @@ void NEPriorBoxLayerKernel::calculate_prior_boxes(const Window &window) float step_x = _info.steps()[0]; float step_y = _info.steps()[1]; - if(step_x == 0.f || step_y == 0.f) + if (step_x == 0.f || step_y == 0.f) { step_x = static_cast(img_width) / layer_width; step_y = static_cast(img_height) / layer_height; @@ -130,74 +141,80 @@ void NEPriorBoxLayerKernel::calculate_prior_boxes(const Window &window) slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 2)); Iterator output(_output, slice); - execute_window_loop(slice, [&](const Coordinates & id) - { - float center_x = 0; - float center_y = 0; - int idx = id.x() / (4 * num_priors); - center_x = (static_cast(idx % layer_width) + _info.offset()) * step_x; - center_y = (static_cast(idx / layer_width) + _info.offset()) * step_y; - - float box_width; - float box_height; - int offset = 0; - - auto out = reinterpret_cast(output.ptr()); - for(unsigned int i = 0; i < _info.min_sizes().size(); ++i) + execute_window_loop( + slice, + [&](const Coordinates &id) { - const float min_size = _info.min_sizes().at(i); - box_width = min_size; - box_height = min_size; - store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height); - offset += 4; - - if(!_info.max_sizes().empty()) + float center_x = 0; + float center_y = 0; + int idx = id.x() / (4 * num_priors); + center_x = (static_cast(idx % layer_width) + _info.offset()) * step_x; + center_y = (static_cast(idx / layer_width) + _info.offset()) * step_y; + + float box_width; + float box_height; + int offset = 0; + + auto out = reinterpret_cast(output.ptr()); + for (unsigned int i = 0; i < _info.min_sizes().size(); ++i) { - const float max_size = _info.max_sizes().at(i); - box_width = std::sqrt(min_size * max_size); - box_height = box_width; - + const float min_size = _info.min_sizes().at(i); + box_width = min_size; + box_height = min_size; store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height); offset += 4; - } - // rest of priors - for(auto ar : _info.aspect_ratios()) - { - if(fabs(ar - 1.) 
< 1e-6) + if (!_info.max_sizes().empty()) { - continue; + const float max_size = _info.max_sizes().at(i); + box_width = std::sqrt(min_size * max_size); + box_height = box_width; + + store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height); + offset += 4; } - box_width = min_size * sqrt(ar); - box_height = min_size / sqrt(ar); + // rest of priors + for (auto ar : _info.aspect_ratios()) + { + if (fabs(ar - 1.) < 1e-6) + { + continue; + } - store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height); - offset += 4; + box_width = min_size * sqrt(ar); + box_height = min_size / sqrt(ar); + + store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height); + offset += 4; + } } - } - // set the variance - out = reinterpret_cast(_output->ptr_to_element(Coordinates(id.x(), 1))); - float32x4_t var; - if(_info.variances().size() == 1) - { - var = vdupq_n_f32(_info.variances().at(0)); - } - else - { - const float32x4_t vars = { _info.variances().at(0), _info.variances().at(1), _info.variances().at(2), _info.variances().at(3) }; - var = vars; - } - for(int i = 0; i < num_priors; ++i) - { - vst1q_f32(out + 4 * i, var); - } - }, - output); + // set the variance + out = reinterpret_cast(_output->ptr_to_element(Coordinates(id.x(), 1))); + float32x4_t var; + if (_info.variances().size() == 1) + { + var = vdupq_n_f32(_info.variances().at(0)); + } + else + { + const float32x4_t vars = {_info.variances().at(0), _info.variances().at(1), _info.variances().at(2), + _info.variances().at(3)}; + var = vars; + } + for (int i = 0; i < num_priors; ++i) + { + vst1q_f32(out + 4 * i, var); + } + }, + output); } -void NEPriorBoxLayerKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, const PriorBoxLayerInfo &info) +void NEPriorBoxLayerKernel::configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + const PriorBoxLayerInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); @@ -215,7 +232,10 @@ void NEPriorBoxLayerKernel::configure(const ITensor *input1, const ITensor *inpu INEKernel::configure(win); } -Status NEPriorBoxLayerKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info) +Status NEPriorBoxLayerKernel::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, info)); @@ -231,4 +251,4 @@ void NEPriorBoxLayerKernel::run(const Window &window, const ThreadInfo &info) // Run function calculate_prior_boxes(window); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/core/NEON/kernels/NEPriorBoxLayerKernel.h b/src/core/NEON/kernels/NEPriorBoxLayerKernel.h index 430a47f9f8b79ff43a158a91a6030431041c31f4..460f80e08518510c9e01359061bd18e9c1e4293c 100644 --- a/src/core/NEON/kernels/NEPriorBoxLayerKernel.h +++ b/src/core/NEON/kernels/NEPriorBoxLayerKernel.h @@ -67,7 +67,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info); // Inherited methods overridden: void run(const Window 
&window, const ThreadInfo &info) override; @@ -84,7 +87,14 @@ private: * @param[in] width Input width. * @param[in] height Input height. */ - void store_coordinates(float *out, const int offset, const float center_x, const float center_y, const float box_width, const float box_height, const int width, const int height); + void store_coordinates(float *out, + const int offset, + const float center_x, + const float center_y, + const float box_width, + const float box_height, + const int width, + const int height); /** Function to calculate prior boxes. * * @param[in] window Input region on which to execute the kernel. diff --git a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp index a88b193b314fb4d3bb2e725ac9828376e21a89e8..8e1ed3a2a5ce7d1d1e596d57204d9cfacebb3e51 100644 --- a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp +++ b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021 Arm Limited. + * Copyright (c) 2020-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,17 +26,17 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" + #include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/NESymm.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/NESymm.h" #include @@ -72,8 +72,8 @@ inline int64x2x2_t mul_add(const int32x4_t &a, const int32x4_t &b, const int32x4 const int64_t b_3 = vgetlane(b_high, 1); int64x2x2_t result; - const int64x2_t result_0{ a_0 * b_0, a_1 * b_1 }; - const int64x2_t result_1{ a_2 * b_2, a_3 * b_3 }; + const int64x2_t result_0{a_0 * b_0, a_1 * b_1}; + const int64x2_t result_1{a_2 * b_2, a_3 * b_3}; result.val[0] = vadd(vmovl(vgetlow(bias)), result_0); result.val[1] = vadd(vmovl(vgethigh(bias)), result_1); @@ -81,15 +81,17 @@ inline int64x2x2_t mul_add(const int32x4_t &a, const int32x4_t &b, const int32x4 } } // namespace -void NEQLSTMLayerNormalizationKernel::configure(const ITensor *input, ITensor *output, const ITensor *weight, const ITensor *bias) +void NEQLSTMLayerNormalizationKernel::configure(const ITensor *input, + ITensor *output, + const ITensor *weight, + const ITensor *bias) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, bias, output); ARM_COMPUTE_ERROR_ON(input == output); ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), weight->info(), bias->info())); - static const std::map fn_map = - { - { DataType::QSYMM16, std::mem_fn(&NEQLSTMLayerNormalizationKernel::compute_qsymm16) }, + static const std::map fn_map = { + {DataType::QSYMM16, std::mem_fn(&NEQLSTMLayerNormalizationKernel::compute_qsymm16)}, }; _input = input; @@ -102,10 +104,10 @@ void NEQLSTMLayerNormalizationKernel::configure(const ITensor *input, ITensor *o _output->info()->set_quantization_info(compute_output_qinfo()); const UniformQuantizationInfo wq_info = _weight->info()->quantization_info().uniform(); - const Status s = quantization::calculate_quantized_multiplier(wq_info.scale, 
&_output_multiplier, &_output_shift); + const Status s = quantization::calculate_quantized_multiplier(wq_info.scale, &_output_multiplier, &_output_shift); _output_shift *= -1; - if(!bool(s)) + if (!bool(s)) { _output_multiplier = 0; _output_shift = 0; @@ -134,7 +136,10 @@ Window NEQLSTMLayerNormalizationKernel::configure_window(ITensor *target) return window; } -Status NEQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias) +Status NEQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *weight, + const ITensorInfo *bias) { ARM_COMPUTE_UNUSED(output, bias, weight, input); @@ -151,7 +156,7 @@ Status NEQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, const ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().x() != weight->tensor_shape().x()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(weight, bias); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); @@ -182,11 +187,11 @@ inline std::pair NEQLSTMLayerNormalizationKernel::sum_qsymm16( using AccType = int64_t; using InputDataType = int16_t; - AccType sum{ 0 }; - AccType sum_sq{ 0 }; + AccType sum{0}; + AccType sum_sq{0}; int32_t x = _window_start_x; - for(; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x) + for (; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x) { using namespace wrapper; const int16x8_t val = vloadq(input_ptr + x); @@ -200,6 +205,7 @@ inline std::pair NEQLSTMLayerNormalizationKernel::sum_qsymm16( sum_sq += static_cast(vaddv(vmul(val_low, val_low))); sum_sq += static_cast(vaddv(vmul(val_high, val_high))); #else // __aarch64__ + // only AArch64 supports vaddv const int64x2_t pair_sum_low = vpaddl(val_low); const int64x2_t pair_sum_high = vpaddl(val_high); @@ -215,7 +221,7 @@ inline std::pair NEQLSTMLayerNormalizationKernel::sum_qsymm16( #endif // __aarch64__ } - for(; x < _window_end_x; ++x) + for (; x < _window_end_x; ++x) { const InputDataType val = input_ptr[x]; sum += static_cast(val); @@ -229,7 +235,9 @@ inline void NEQLSTMLayerNormalizationKernel::normalize_qasymm16(const int16_t *i int16_t *output_ptr, const int16_t *weight_ptr, const int32_t *bias_ptr, - int32_t mean, int32_t inv_std_mul, int32_t inv_std_shift) + int32_t mean, + int32_t inv_std_mul, + int32_t inv_std_shift) { using OutputDataType = int16_t; @@ -237,7 +245,7 @@ inline void NEQLSTMLayerNormalizationKernel::normalize_qasymm16(const int16_t *i const int32x4_t mean_vec = vdup_n(mean, wrapper::traits::vector_128_tag{}); int32_t x = _window_start_x; - for(; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x) + for (; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x) { const int16x8_t val = vloadq(input_ptr + x); int32x4x2_t shifted; @@ -266,16 +274,18 @@ inline void NEQLSTMLayerNormalizationKernel::normalize_qasymm16(const int16_t *i vstore(output_ptr + x + 4, vqmovn(out_val.val[1])); } - for(; x < _window_end_x; ++x) + for (; x < _window_end_x; ++x) { - const auto val = static_cast(input_ptr[x]); - const int32_t shifted = (val << 10) - mean; - const int32_t rescaled = quantization::multiply_by_quantized_multiplier(shifted, inv_std_mul, inv_std_shift); - const int64_t weighted = rescaled * weight_ptr[x] + bias_ptr[x]; + 
const auto val = static_cast(input_ptr[x]); + const int32_t shifted = (val << 10) - mean; + const int32_t rescaled = quantization::multiply_by_quantized_multiplier(shifted, inv_std_mul, inv_std_shift); + const int64_t weighted = rescaled * weight_ptr[x] + bias_ptr[x]; const auto reverse_shifted = static_cast((weighted + 512) >> 10); - int32_t out_val = quantization::multiply_by_quantized_multiplier(reverse_shifted, _output_multiplier, _output_shift + 12); - out_val = utility::clamp(out_val, std::numeric_limits::min()); - output_ptr[x] = static_cast(out_val); + int32_t out_val = + quantization::multiply_by_quantized_multiplier(reverse_shifted, _output_multiplier, _output_shift + 12); + out_val = + utility::clamp(out_val, std::numeric_limits::min()); + output_ptr[x] = static_cast(out_val); } } @@ -286,35 +296,38 @@ void NEQLSTMLayerNormalizationKernel::compute_qsymm16() using BiasDataType = int32_t; using AccType = int64_t; - Iterator input_iterator{ _input, _inout_window }; - Iterator output_iterator{ _output, _inout_window }; - Iterator weight_iterator{ _weight, _weight_window }; - Iterator bias_iterator{ _bias, _weight_window }; + Iterator input_iterator{_input, _inout_window}; + Iterator output_iterator{_output, _inout_window}; + Iterator weight_iterator{_weight, _weight_window}; + Iterator bias_iterator{_bias, _weight_window}; const auto weight_ptr = reinterpret_cast(weight_iterator.ptr()); const auto bias_ptr = reinterpret_cast(bias_iterator.ptr()); const uint32_t column_size = _input->info()->tensor_shape()[0]; - execute_window_loop(_inout_window, [ &, this](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(input_iterator.ptr()); - auto out_ptr = reinterpret_cast(output_iterator.ptr()); - - AccType sum{ 0 }; - AccType sum_sq{ 0 }; - std::tie(sum, sum_sq) = sum_qsymm16(in_ptr); - - AccType mean{ 0 }; - AccType variance{ 0 }; - std::tie(mean, variance) = compute_mean_variance(sum, sum_sq, column_size); - - int32_t stddev_invsqrt_mul{}; - int32_t stddev_invsqrt_shift{}; - quantization::get_invsqrt_quantized_multiplier_exp(static_cast(variance), -1, stddev_invsqrt_mul, stddev_invsqrt_shift); - - normalize_qasymm16(in_ptr, out_ptr, weight_ptr, bias_ptr, mean, stddev_invsqrt_mul, stddev_invsqrt_shift); - }, - input_iterator, output_iterator); + execute_window_loop( + _inout_window, + [&, this](const Coordinates &) + { + const auto in_ptr = reinterpret_cast(input_iterator.ptr()); + auto out_ptr = reinterpret_cast(output_iterator.ptr()); + + AccType sum{0}; + AccType sum_sq{0}; + std::tie(sum, sum_sq) = sum_qsymm16(in_ptr); + + AccType mean{0}; + AccType variance{0}; + std::tie(mean, variance) = compute_mean_variance(sum, sum_sq, column_size); + + int32_t stddev_invsqrt_mul{}; + int32_t stddev_invsqrt_shift{}; + quantization::get_invsqrt_quantized_multiplier_exp(static_cast(variance), -1, stddev_invsqrt_mul, + stddev_invsqrt_shift); + + normalize_qasymm16(in_ptr, out_ptr, weight_ptr, bias_ptr, mean, stddev_invsqrt_mul, stddev_invsqrt_shift); + }, + input_iterator, output_iterator); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h index a3ff6e988f2512538ac75cced5edf9669d56a52c..af5b6a031584e2e9f1e1d4ef9952422214d61914 100644 --- a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h +++ b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NEQLSTMLAYERNORMALIZATIONKERNEL_H 
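The quantized compute_qsymm16 loop above follows the usual layer-normalization recipe: accumulate the sum and sum of squares per row, derive the mean and variance, scale by the inverse standard deviation through a quantized multiplier, then apply the per-element weight and bias and requantize. A float reference of that computation, as a sketch for orientation only (the kernel itself operates on QSYMM16 data with fixed-point multipliers):

// Illustrative float reference of per-row layer normalization.
#include <cmath>
#include <cstddef>

void layer_norm_reference(const float *x, const float *gamma, const float *beta,
                          float *y, size_t n)
{
    float sum = 0.f, sum_sq = 0.f;
    for (size_t i = 0; i < n; ++i)
    {
        sum    += x[i];
        sum_sq += x[i] * x[i];
    }
    const float mean     = sum / static_cast<float>(n);
    const float variance = sum_sq / static_cast<float>(n) - mean * mean; // E[x^2] - E[x]^2
    const float inv_std  = 1.f / std::sqrt(variance + 1e-12f);

    for (size_t i = 0; i < n; ++i)
    {
        y[i] = (x[i] - mean) * inv_std * gamma[i] + beta[i];
    }
}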
#include "src/core/NEON/INEKernel.h" + #include namespace arm_compute @@ -69,34 +70,26 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; private: // constants - static constexpr uint32_t max_input_dimension{ 2 }; /**< The maximum input dimension supported */ - static constexpr uint32_t max_weight_dimension{ 1 }; /**< The maximum weight dimension supported */ - static constexpr uint32_t max_bias_dimension{ 1 }; /**< The maximum bias dimension supported */ - static constexpr uint32_t vector_size_byte{ 16 }; /**< Computation vector size in byte */ + static constexpr uint32_t max_input_dimension{2}; /**< The maximum input dimension supported */ + static constexpr uint32_t max_weight_dimension{1}; /**< The maximum weight dimension supported */ + static constexpr uint32_t max_bias_dimension{1}; /**< The maximum bias dimension supported */ + static constexpr uint32_t vector_size_byte{16}; /**< Computation vector size in byte */ using ComputeFuncType = std::function; ComputeFuncType _fn{}; /**< Function pointer to computation function */ - const ITensor *_input - { - nullptr - }; /**< Input tensor */ - const ITensor *_weight - { - nullptr - }; /**< Weight tensor */ - const ITensor *_bias - { - nullptr - }; /**< Bias tensor */ - ITensor *_output{ nullptr }; /**< Output tensor */ + const ITensor *_input{nullptr}; /**< Input tensor */ + const ITensor *_weight{nullptr}; /**< Weight tensor */ + const ITensor *_bias{nullptr}; /**< Bias tensor */ + ITensor *_output{nullptr}; /**< Output tensor */ int32_t _output_multiplier{}; /**< Multiplier for output values */ int32_t _output_shift{}; /**< Shift value for output values */ @@ -138,7 +131,9 @@ private: int16_t *output_ptr, const int16_t *weight_ptr, const int32_t *bias_ptr, - int32_t mean, int32_t inv_std_mul, int32_t inv_std_shift); + int32_t mean, + int32_t inv_std_mul, + int32_t inv_std_shift); /** Function to compute output quantization information */ QuantizationInfo compute_output_qinfo(); }; diff --git a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp index 802aebb52645aa7dcfdad2ca4e218cf3baa8a5c0..486cd6d3317742c4f9c3150fa04bbc5c1aeaff78 100644 --- a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp +++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp @@ -26,11 +26,12 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/misc/Utility.h" -#include "src/core/CPP/Validate.h" +#include "arm_compute/core/Window.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/roialign/list.h" @@ -49,7 +50,12 @@ struct ROIAlignSelectorData }; using ROIAlignSelctorPtr = std::add_pointer::type; -using ROIAlignUKernelPtr = std::add_pointer::type; +using ROIAlignUKernelPtr = std::add_pointer::type; struct ROIAlignKernel { @@ -58,31 +64,18 @@ struct ROIAlignKernel ROIAlignUKernelPtr ukernel; }; -static const ROIAlignKernel available_kernels[] = -{ - 
{ - "fp32_neon_roialign", - [](const ROIAlignSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_roialign) - }, +static const ROIAlignKernel available_kernels[] = { + {"fp32_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_roialign)}, #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { - "fp16_neon_roialign", - [](const ROIAlignSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_roialign) - }, + {"fp16_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_roialign)}, #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #if defined(ARM_COMPUTE_ENABLE_NEON) - { - "qu8_neon_roialign", - [](const ROIAlignSelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qu8_roialign) - }, - { - "qs8_neon_roialign", - [](const ROIAlignSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qs8_roialign) - }, + {"qu8_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qu8_roialign)}, + {"qs8_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qs8_roialign)}, #endif //defined(ARM_COMPUTE_ENABLE_NEON) }; @@ -94,9 +87,9 @@ static const ROIAlignKernel available_kernels[] = */ const ROIAlignKernel *get_implementation(const ROIAlignSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -104,24 +97,29 @@ const ROIAlignKernel *get_implementation(const ROIAlignSelectorData &data) return nullptr; } -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output); ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(0) != 5); ARM_COMPUTE_RETURN_ERROR_ON(rois->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F32, DataType::F16); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC, DataLayout::NCHW); ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0)); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info), output->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info), + output->tensor_shape()); } - if(input->data_type() == DataType::QASYMM8 || input->data_type() == 
DataType::QASYMM8_SIGNED) + if (input->data_type() == DataType::QASYMM8 || input->data_type() == DataType::QASYMM8_SIGNED) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::QASYMM16); @@ -143,13 +141,17 @@ NEROIAlignLayerKernel::NEROIAlignLayerKernel() { } -void NEROIAlignLayerKernel::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info) +void NEROIAlignLayerKernel::configure(const ITensor *input, + const ITensor *rois, + ITensor *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info)); // Output auto inizialitation if not yet initialized const TensorShape output_shape = compute_roi_align_shape(*input->info(), *rois->info(), pool_info); - auto_init_if_empty((*output->info()), output_shape, 1, input->info()->data_type(), input->info()->quantization_info()); + auto_init_if_empty((*output->info()), output_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); output->info()->set_data_layout(input->info()->data_layout()); // Configure kernel window @@ -167,7 +169,10 @@ void NEROIAlignLayerKernel::configure(const ITensor *input, const ITensor *rois, INEKernel::configure(window); } -Status NEROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status NEROIAlignLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info)); return Status{}; @@ -176,9 +181,9 @@ Status NEROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorIn void NEROIAlignLayerKernel::run(const Window &window, const ThreadInfo &info) { const DataLayout data_layout = _input->info()->data_layout(); - if(data_layout == DataLayout::NCHW || data_layout == DataLayout::NHWC) + if (data_layout == DataLayout::NCHW || data_layout == DataLayout::NHWC) { - const auto *uk = get_implementation(ROIAlignSelectorData{ _input->info()->data_type() }); + const auto *uk = get_implementation(ROIAlignSelectorData{_input->info()->data_type()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); uk->ukernel(_input, _output, _rois, _pool_info, window, info); diff --git a/src/core/NEON/kernels/NEROIAlignLayerKernel.h b/src/core/NEON/kernels/NEROIAlignLayerKernel.h index 48a3de728551284d2c565434056c295bfdacf842..9cc538b429cac3e6fbacd2cb81096331de561e82 100644 --- a/src/core/NEON/kernels/NEROIAlignLayerKernel.h +++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.h @@ -83,7 +83,10 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp index 400e8291d6fcfb99486d0381e7bb5b0ee87a9ad8..1a3810fb56e07f9d55a95a5c1edf75ee4519a54a 100644 --- a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp +++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp @@ -22,9 +22,11 @@ * SOFTWARE. 
*/ #include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h" + #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -36,7 +38,10 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *rois, + const ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, rois); @@ -47,10 +52,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, con ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F32, DataType::QASYMM8); ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0)); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) || (output->dimension(1) != pool_info.pooled_height())); + ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) || + (output->dimension(1) != pool_info.pooled_height())); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != output->dimension(2)); ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(1) != output->dimension(3)); } @@ -73,19 +79,28 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, con * @param[in] roi_indx Index of image of coordinate in output Tensor to store value */ template -void template_eval(const ITensor *input, const ITensor *output, int region_start_x, int region_start_y, - int region_end_x, int region_end_y, int fm, int px, int py, int roi_batch, int roi_indx) +void template_eval(const ITensor *input, + const ITensor *output, + int region_start_x, + int region_start_y, + int region_end_x, + int region_end_y, + int fm, + int px, + int py, + int roi_batch, + int roi_indx) { - if((region_end_x <= region_start_x) || (region_end_y <= region_start_y)) + if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y)) { *reinterpret_cast(output->ptr_to_element(Coordinates(px, py, fm, roi_indx))) = 0; } else { T curr_max = std::numeric_limits::lowest(); // Min value of typename T - for(int j = region_start_y; j < region_end_y; ++j) + for (int j = region_start_y; j < region_end_y; ++j) { - for(int i = region_start_x; i < region_end_x; ++i) + for (int i = region_start_x; i < region_end_x; ++i) { const auto val = *reinterpret_cast(input->ptr_to_element(Coordinates(i, j, fm, roi_batch))); curr_max = std::max(val, curr_max); @@ -93,11 +108,13 @@ void template_eval(const ITensor *input, const ITensor *output, int region_start } // if quantized datatype, requantize then store in output tensor - if(is_data_type_quantized(input->info()->data_type())) + if (is_data_type_quantized(input->info()->data_type())) { // covert qasymm to new output quantization scale and offset - UniformQuantizationInfo uqinfo = compute_requantization_scale_offset(input->info()->quantization_info().uniform(), output->info()->quantization_info().uniform()); - *reinterpret_cast(output->ptr_to_element(Coordinates(px, py, fm, roi_indx))) = quantize_qasymm8(curr_max, uqinfo); + UniformQuantizationInfo uqinfo = compute_requantization_scale_offset( + 
input->info()->quantization_info().uniform(), output->info()->quantization_info().uniform()); + *reinterpret_cast(output->ptr_to_element(Coordinates(px, py, fm, roi_indx))) = + quantize_qasymm8(curr_max, uqinfo); } else { @@ -112,13 +129,19 @@ NEROIPoolingLayerKernel::NEROIPoolingLayerKernel() { } -Status NEROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status NEROIPoolingLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *rois, + const ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info)); return Status{}; } -void NEROIPoolingLayerKernel::configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info) +void NEROIPoolingLayerKernel::configure(const ITensor *input, + const ITensor *rois, + const ITensor *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois); @@ -126,12 +149,15 @@ void NEROIPoolingLayerKernel::configure(const ITensor *input, const ITensor *roi ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info)); // Output auto initialization if not yet initialized - TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->info()->dimension(1)); + TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), + rois->info()->dimension(1)); - auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), output->info()->quantization_info()); + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), + output->info()->quantization_info()); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || (output->info()->dimension(1) != pool_info.pooled_height())); + ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || + (output->info()->dimension(1) != pool_info.pooled_height())); // Set instance variables _input = input; @@ -167,7 +193,7 @@ void NEROIPoolingLayerKernel::run(const Window &window, const ThreadInfo &info) const auto *rois_ptr = reinterpret_cast(_rois->buffer()); const auto data_type = _input->info()->data_type(); - for(int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx) + for (int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx) { const unsigned int roi_batch = rois_ptr[values_per_roi * roi_indx]; const auto x1 = rois_ptr[values_per_roi * roi_indx + 1]; @@ -182,30 +208,35 @@ void NEROIPoolingLayerKernel::run(const Window &window, const ThreadInfo &info) const int roi_height = std::max(support::cpp11::round((y2 - y1) * spatial_scale), 1.f); // Iterate through all feature maps - for(int fm = 0; fm < fms; ++fm) + for (int fm = 0; fm < fms; ++fm) { // Iterate through all output pixels - for(int py = 0; py < pooled_h; ++py) + for (int py = 0; py < pooled_h; ++py) { - for(int px = 0; px < pooled_w; ++px) + for (int px = 0; px < pooled_w; ++px) { auto region_start_x = static_cast(std::floor((static_cast(px) / pooled_w) * roi_width)); - auto region_end_x = static_cast(std::floor((static_cast(px + 1) / pooled_w) * roi_width)); - auto region_start_y = static_cast(std::floor((static_cast(py) / pooled_h) * roi_height)); - auto region_end_y = 
static_cast(std::floor((static_cast(py + 1) / pooled_h) * roi_height)); + auto region_end_x = + static_cast(std::floor((static_cast(px + 1) / pooled_w) * roi_width)); + auto region_start_y = + static_cast(std::floor((static_cast(py) / pooled_h) * roi_height)); + auto region_end_y = + static_cast(std::floor((static_cast(py + 1) / pooled_h) * roi_height)); region_start_x = std::min(std::max(region_start_x + roi_anchor_x, 0), width); region_end_x = std::min(std::max(region_end_x + roi_anchor_x, 0), width); region_start_y = std::min(std::max(region_start_y + roi_anchor_y, 0), height); region_end_y = std::min(std::max(region_end_y + roi_anchor_y, 0), height); - switch(data_type) + switch (data_type) { case DataType::F32: - template_eval(_input, _output, region_start_x, region_start_y, region_end_x, region_end_y, fm, px, py, roi_batch, roi_indx); + template_eval(_input, _output, region_start_x, region_start_y, region_end_x, + region_end_y, fm, px, py, roi_batch, roi_indx); break; case DataType::QASYMM8: - template_eval(_input, _output, region_start_x, region_start_y, region_end_x, region_end_y, fm, px, py, roi_batch, roi_indx); + template_eval(_input, _output, region_start_x, region_start_y, region_end_x, + region_end_y, fm, px, py, roi_batch, roi_indx); break; default: ARM_COMPUTE_ERROR("DataType not Supported"); @@ -216,4 +247,4 @@ void NEROIPoolingLayerKernel::run(const Window &window, const ThreadInfo &info) } } } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.h b/src/core/NEON/kernels/NEROIPoolingLayerKernel.h index e7a7e90eef479cd306b4ac1c104e3a96dce24ade..81f6006ea287fbc19ef5803583c2ce68c16cfd5a 100644 --- a/src/core/NEON/kernels/NEROIPoolingLayerKernel.h +++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.h @@ -63,7 +63,8 @@ public: * @note The z dimensions of @p output tensor and @p input tensor must be the same. * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois tensor. 
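The bin boundaries computed in NEROIPoolingLayerKernel::run above partition each ROI with floor-based splits before taking the per-bin maximum. A standalone sketch of that boundary computation, using assumed helper names rather than the library's internals:

// Illustrative only: compute the input region covered by output pixel (px, py).
#include <algorithm>
#include <cmath>

struct Bin { int start_x, end_x, start_y, end_y; };

Bin roi_pool_bin(int px, int py, int pooled_w, int pooled_h,
                 int roi_anchor_x, int roi_anchor_y, int roi_width, int roi_height,
                 int fm_width, int fm_height)
{
    Bin b;
    b.start_x = static_cast<int>(std::floor((static_cast<float>(px) / pooled_w) * roi_width));
    b.end_x   = static_cast<int>(std::floor((static_cast<float>(px + 1) / pooled_w) * roi_width));
    b.start_y = static_cast<int>(std::floor((static_cast<float>(py) / pooled_h) * roi_height));
    b.end_y   = static_cast<int>(std::floor((static_cast<float>(py + 1) / pooled_h) * roi_height));

    // Shift into feature-map coordinates and clamp to the valid range.
    b.start_x = std::min(std::max(b.start_x + roi_anchor_x, 0), fm_width);
    b.end_x   = std::min(std::max(b.end_x + roi_anchor_x, 0), fm_width);
    b.start_y = std::min(std::max(b.start_y + roi_anchor_y, 0), fm_height);
    b.end_y   = std::min(std::max(b.end_y + roi_anchor_y, 0), fm_height);
    return b;
}
// Example: a 7-wide ROI pooled to 2 bins yields x ranges [0, 3) and [3, 7).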
*/ - void configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info); + void + configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; @@ -82,7 +83,10 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *rois, + const ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info); private: const ITensor *_input; diff --git a/src/core/NEON/kernels/NERangeKernel.cpp b/src/core/NEON/kernels/NERangeKernel.cpp index ec63a35de9b86bd2191d0bc13fd335beca7cd332..87b7b76b729544d249681564cb33ad77198782ab 100644 --- a/src/core/NEON/kernels/NERangeKernel.cpp +++ b/src/core/NEON/kernels/NERangeKernel.cpp @@ -29,11 +29,12 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/common/Registrars.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/range/list.h" namespace arm_compute @@ -55,48 +56,23 @@ struct RangeUKernel RangeUKernelPtr ukernel; }; -static const RangeUKernel available_kernels[] = -{ - { - "fp16_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_range_function) - }, - { - "f32_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_range_function) - }, - { - "u8_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::U8; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_range_function) - }, - { - "u16_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::U16; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::u16_neon_range_function) - }, - { - "u32_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::U32; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::u32_neon_range_function) - }, - { - "s8_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::S8; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_range_function) - }, - { - "s16_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::S16; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_range_function) - }, - { - "s32_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::S32; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::s32_neon_range_function) - }, +static const RangeUKernel available_kernels[] = { + {"fp16_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_range_function)}, + {"f32_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_range_function)}, + {"u8_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::U8; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_range_function)}, + {"u16_neon_range", [](const RangeSelectorData 
&data) { return data.dt == DataType::U16; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::u16_neon_range_function)}, + {"u32_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::U32; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::u32_neon_range_function)}, + {"s8_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::S8; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_range_function)}, + {"s16_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::S16; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_range_function)}, + {"s32_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::S32; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s32_neon_range_function)}, }; /** Micro-kernel selector @@ -107,9 +83,9 @@ static const RangeUKernel available_kernels[] = */ const RangeUKernel *get_implementation(const RangeSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -119,28 +95,31 @@ const RangeUKernel *get_implementation(const RangeSelectorData &data) Status validate_arguments(const ITensorInfo &output, const float start, const float end, const float step) { - const auto *uk = get_implementation(RangeSelectorData{ output.data_type() }); + const auto *uk = get_implementation(RangeSelectorData{output.data_type()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start < end) && (step <= 0)), "step must be greater than 0 when start < end"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start > end) && (step >= 0)), "step must be less than 0 when start > end"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output.data_type(), output.quantization_info()), "start value is outside the range of the data type"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output.data_type(), output.quantization_info()), "end value is outside the range of the data type"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output.data_type(), output.quantization_info()), "step value is outside the range of the data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output.data_type(), output.quantization_info()), + "start value is outside the range of the data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output.data_type(), output.quantization_info()), + "end value is outside the range of the data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output.data_type(), output.quantization_info()), + "step value is outside the range of the data type"); ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.num_dimensions() != 1, "Output has to be a 1-D tensor"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.tensor_shape().total_size() < num_of_elements_in_range(start, end, step), "Output tensor size is incorrect"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.tensor_shape().total_size() < num_of_elements_in_range(start, end, step), + "Output tensor size is incorrect"); return Status{}; } } // namespace -NERangeKernel::NERangeKernel() - : _start(0), _end(1), _step(1), _output(nullptr) +NERangeKernel::NERangeKernel() : _start(0), _end(1), _step(1), _output(nullptr) { } @@ -151,7 
+130,8 @@ void NERangeKernel::configure(ITensor *output, float start, float end, float ste ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*(output->info()), start, end, step)); // Auto initialize output if not initialized - auto_init_if_empty(*output->info(), TensorShape(num_of_elements_in_range(start, end, step)), 1, output->info()->data_type(), output->info()->quantization_info()); + auto_init_if_empty(*output->info(), TensorShape(num_of_elements_in_range(start, end, step)), 1, + output->info()->data_type(), output->info()->quantization_info()); // Configure kernel window Window win = calculate_max_window(*output->info(), Steps()); @@ -178,7 +158,7 @@ void NERangeKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - const auto *uk = get_implementation(RangeSelectorData{ _output->info()->data_type() }); + const auto *uk = get_implementation(RangeSelectorData{_output->info()->data_type()}); uk->ukernel(_output, _start, _step, window); } diff --git a/src/core/NEON/kernels/NERangeKernel.h b/src/core/NEON/kernels/NERangeKernel.h index 90560995e6d56a5d2045a91162037619ee9e2679..fa555c2c2ede01b5566aedd54575c18cad1478ab 100644 --- a/src/core/NEON/kernels/NERangeKernel.h +++ b/src/core/NEON/kernels/NERangeKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NERANGEKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp index 19955af4939b5f66be6199cd1323e97f196b06bb..455d604b3b53fd59eb2143d74ebd50da75c02bf1 100644 --- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp +++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp @@ -28,16 +28,17 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CPP/Validate.h" -#include "src/core/NEON/INEKernel.h" -#include "src/core/NEON/NEMath.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/INEKernel.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "support/SaturateCast.h" -#include "src/core/NEON/wrapper/wrapper.h" #include namespace arm_compute @@ -48,7 +49,7 @@ namespace template void combine_and_store(int16x8_t t1, int16x8_t t2, Iterator &output, int offset = 0) { - if(std::is_same::value) + if (std::is_same::value) { auto res = wrapper::vcombine(wrapper::vqmovun(t1), wrapper::vqmovun(t2)); wrapper::vstore(output.ptr() + offset, res); @@ -63,8 +64,8 @@ void combine_and_store(int16x8_t t1, int16x8_t t2, Iterator &output, int offset template uint32x4x4_t calculate_index(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOperation op, int axis) { - uint32x4_t mask{ 0 }; - if(op == ReductionOperation::ARG_IDX_MIN) + uint32x4_t mask{0}; + if (op == ReductionOperation::ARG_IDX_MIN) { mask = wrapper::vcgt(b, a); } @@ -73,12 +74,12 @@ uint32x4x4_t calculate_index(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOp mask = wrapper::vclt(b, a); } - uint32x4_t vec_idx = { idx, idx + 1, idx + 2, idx + 3 }; - if(axis != 0) + uint32x4_t vec_idx = {idx, idx + 1, idx + 2, idx + 3}; + if (axis != 0) { vec_idx = wrapper::vdup_n(idx, 
wrapper::traits::vector_128_tag{}); } - uint32x4x4_t res = { { wrapper::vbsl(mask, vec_idx, c.val[0]), 0, 0, 0 } }; + uint32x4x4_t res = {{wrapper::vbsl(mask, vec_idx, c.val[0]), 0, 0, 0}}; return res; } @@ -86,9 +87,9 @@ uint32x4x4_t calculate_index(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOp template uint32x4x4_t calculate_index_quantized(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOperation op, int axis) { - uint32x4x4_t mask{ { 0 } }; - uint8x16_t mask_u8{ 0 }; - if(op == ReductionOperation::ARG_IDX_MIN) + uint32x4x4_t mask{{0}}; + uint8x16_t mask_u8{0}; + if (op == ReductionOperation::ARG_IDX_MIN) { mask_u8 = wrapper::vcgt(b, a); } @@ -96,44 +97,43 @@ uint32x4x4_t calculate_index_quantized(uint32_t idx, T a, T b, uint32x4x4_t c, R { mask_u8 = wrapper::vclt(b, a); } - auto wide_u16_1 = wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8))); - auto wide_u16_2 = wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8))); - mask.val[0] = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1))); - mask.val[1] = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1))); - mask.val[2] = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2))); - mask.val[3] = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2))); - - uint32x4x4_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 }, - { idx + 4, idx + 5, idx + 6, idx + 7 }, - { idx + 8, idx + 9, idx + 10, idx + 11 }, - { idx + 12, idx + 13, idx + 14, idx + 15 } - } - }; - if(axis != 0) + auto wide_u16_1 = + wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8))); + auto wide_u16_2 = + wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8))); + mask.val[0] = + wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1))); + mask.val[1] = + wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1))); + mask.val[2] = + wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2))); + mask.val[3] = + wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2))); + + uint32x4x4_t vec_idx = {{{idx + 0, idx + 1, idx + 2, idx + 3}, + {idx + 4, idx + 5, idx + 6, idx + 7}, + {idx + 8, idx + 9, idx + 10, idx + 11}, + {idx + 12, idx + 13, idx + 14, idx + 15}}}; + if (axis != 0) { vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); vec_idx.val[2] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); vec_idx.val[3] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); } - uint32x4x4_t res = - { - { - vbslq_u32(mask.val[0], vec_idx.val[0], c.val[0]), - vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]), - vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]), - vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3]) - } - }; + uint32x4x4_t res = { + {vbslq_u32(mask.val[0], vec_idx.val[0], c.val[0]), vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]), + vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]), vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3])}}; return res; } // Helper function to calculate the 
minimum value of the input vector. All the elements in the output vector contain the min value. template -inline typename std::enable_if < std::is_same::value || std::is_same::value, - typename std::conditional::value, float32x2_t, int32x2_t>::type >::type - calculate_min(T in) +inline typename std::enable_if< + std::is_same::value || std::is_same::value, + typename std::conditional::value, float32x2_t, int32x2_t>::type>::type +calculate_min(T in) { auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in)); return wrapper::vpmin(pmin, pmin); @@ -141,9 +141,10 @@ inline typename std::enable_if < std::is_same::value || std::is_ // Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value. template -inline typename std::enable_if < std::is_same::value || std::is_same::value, - typename std::conditional::value, uint8x8_t, int8x8_t>::type >::type - calculate_min(T in) +inline typename std::enable_if< + std::is_same::value || std::is_same::value, + typename std::conditional::value, uint8x8_t, int8x8_t>::type>::type +calculate_min(T in) { auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in)); pmin = wrapper::vpmin(pmin, pmin); @@ -153,9 +154,10 @@ inline typename std::enable_if < std::is_same::value || std::is_s // Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value. template -inline typename std::enable_if < std::is_same::value || std::is_same::value, - typename std::conditional::value, float32x2_t, int32x2_t>::type >::type - calculate_max(T in) +inline typename std::enable_if< + std::is_same::value || std::is_same::value, + typename std::conditional::value, float32x2_t, int32x2_t>::type>::type +calculate_max(T in) { auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in)); return wrapper::vpmax(pmax, pmax); @@ -163,9 +165,10 @@ inline typename std::enable_if < std::is_same::value || std::is_ // Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value. 
template -inline typename std::enable_if < std::is_same::value || std::is_same::value, - typename std::conditional::value, uint8x8_t, int8x8_t>::type >::type - calculate_max(T in) +inline typename std::enable_if< + std::is_same::value || std::is_same::value, + typename std::conditional::value, uint8x8_t, int8x8_t>::type>::type +calculate_max(T in) { auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in)); pmax = wrapper::vpmax(pmax, pmax); @@ -176,10 +179,10 @@ inline typename std::enable_if < std::is_same::value || std::is_s template uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, T vec_res_value, ReductionOperation op) { - uint32x4_t res_idx_mask{ 0 }; + uint32x4_t res_idx_mask{0}; uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF); - if(op == ReductionOperation::ARG_IDX_MIN) + if (op == ReductionOperation::ARG_IDX_MIN) { auto pmin = calculate_min(vec_res_value); auto mask = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin)); @@ -203,10 +206,10 @@ uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, T vec_res_value, Reduc template uint32_t calculate_vector_index_quantized(uint32x4x4_t vec_res_idx, T vec_res_value, ReductionOperation op) { - uint32x4x4_t res_idx_mask{ { 0 } }; + uint32x4x4_t res_idx_mask{{0}}; uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF); - uint8x16_t mask_u8{ 0 }; - if(op == ReductionOperation::ARG_IDX_MIN) + uint8x16_t mask_u8{0}; + if (op == ReductionOperation::ARG_IDX_MIN) { auto pmin = calculate_min(vec_res_value); mask_u8 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin)); @@ -218,12 +221,18 @@ uint32_t calculate_vector_index_quantized(uint32x4x4_t vec_res_idx, T vec_res_va } // Widen vectors - auto wide_u16_1 = wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8))); - auto wide_u16_2 = wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8))); - auto wide_u32_1 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1))); - auto wide_u32_2 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1))); - auto wide_u32_3 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2))); - auto wide_u32_4 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2))); + auto wide_u16_1 = + wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8))); + auto wide_u16_2 = + wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8))); + auto wide_u32_1 = + wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1))); + auto wide_u32_2 = + wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1))); + auto wide_u32_3 = + wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2))); + auto wide_u32_4 = + wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2))); res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1); res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2); res_idx_mask.val[2] = wrapper::vand(vec_res_idx.val[2], wide_u32_3); @@ -241,19 +250,19 @@ uint32_t calculate_vector_index_quantized(uint32x4x4_t vec_res_idx, T vec_res_va pmin = 
wrapper::vpmin(pmin, pmin); res = std::min(wrapper::vgetlane(pmin, 0), res); iter++; - } - while(iter < 4); + } while (iter < 4); return (res - 0xFFFFFFFF); } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC template <> -uint32x4x4_t calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c, ReductionOperation op, int axis) +uint32x4x4_t +calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c, ReductionOperation op, int axis) { - uint32x4x2_t mask{ 0 }; - uint16x8_t mask_u16{ 0 }; - if(op == ReductionOperation::ARG_IDX_MIN) + uint32x4x2_t mask{0}; + uint16x8_t mask_u16{0}; + if (op == ReductionOperation::ARG_IDX_MIN) { mask_u16 = wrapper::vcgt(b, a); } @@ -263,19 +272,14 @@ uint32x4x4_t calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x } mask.val[0] = wrapper::vmovl(wrapper::vgetlow(mask_u16)); mask.val[1] = wrapper::vmovl(wrapper::vgethigh(mask_u16)); - uint32x4x2_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 }, - { idx + 4, idx + 5, idx + 6, idx + 7 } - } - }; - if(axis != 0) + uint32x4x2_t vec_idx = {{{idx + 0, idx + 1, idx + 2, idx + 3}, {idx + 4, idx + 5, idx + 6, idx + 7}}}; + if (axis != 0) { vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); } - uint32x4x4_t res = { wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]), - wrapper::vbsl(mask.val[1], vec_idx.val[1], c.val[1]), - 0, 0 - }; + uint32x4x4_t res = {wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]), + wrapper::vbsl(mask.val[1], vec_idx.val[1], c.val[1]), 0, 0}; return res; } @@ -298,10 +302,10 @@ inline float16x4_t calculate_max(float16x8_t in) template <> uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_value, ReductionOperation op) { - uint32x4x2_t res_idx_mask{ 0 }; + uint32x4x2_t res_idx_mask{0}; uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF); uint16x8_t mask_u16; - if(op == ReductionOperation::ARG_IDX_MIN) + if (op == ReductionOperation::ARG_IDX_MIN) { auto pmin = calculate_min(vec_res_value); mask_u16 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin)); @@ -313,8 +317,10 @@ uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_va } // Widen vectors - auto wide_u32_1 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(mask_u16), 8), wrapper::vmovl(wrapper::vgetlow(mask_u16))); - auto wide_u32_2 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(mask_u16), 8), wrapper::vmovl(wrapper::vgethigh(mask_u16))); + auto wide_u32_1 = + wrapper::vorr(vshll_n_u16(wrapper::vgetlow(mask_u16), 8), wrapper::vmovl(wrapper::vgetlow(mask_u16))); + auto wide_u32_2 = + wrapper::vorr(vshll_n_u16(wrapper::vgethigh(mask_u16), 8), wrapper::vmovl(wrapper::vgethigh(mask_u16))); res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1); res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2); res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones); @@ -328,8 +334,7 @@ uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_va pmin = wrapper::vpmin(pmin, pmin); res = std::min(wrapper::vgetlane(pmin, 0), res); iter++; - } - while(iter < 2); + } while (iter < 2); return (res - 0xFFFFFFFF); } @@ -388,7 +393,8 @@ struct RedOpX /** SIMD vector tag type. 
*/ using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op) + inline void operator()( + const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op) { const size_t input_dim_0 = in->info()->dimension(0); const int window_step_x = 16 / sizeof(T); @@ -402,211 +408,217 @@ struct RedOpX Iterator output(out, out_window); execute_window_loop( - in_win_no_pad, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - - auto init_res_value = static_cast(0.f); - switch(op) + in_win_no_pad, + [&](const Coordinates &) { - case ReductionOperation::ARG_IDX_MAX: - case ReductionOperation::ARG_IDX_MIN: - case ReductionOperation::MIN: - case ReductionOperation::MAX: - { - init_res_value = static_cast(*input_ptr); - break; - } - case ReductionOperation::PROD: - { - init_res_value = static_cast(1.f); - break; - } - default: - break; - } - auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{}); - uint32x4x4_t vec_res_idx{ { 0 } }; + const auto input_ptr = reinterpret_cast(input.ptr()); - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vec_elements = wrapper::vloadq(input_ptr + x); - switch(op) + auto init_res_value = static_cast(0.f); + switch (op) { - case ReductionOperation::SUM_SQUARE: - vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value); - break; - case ReductionOperation::MEAN_SUM: - case ReductionOperation::SUM: - vec_res_value = wrapper::vadd(vec_elements, vec_res_value); - break; - case ReductionOperation::PROD: - vec_res_value = wrapper::vmul(vec_elements, vec_res_value); - break; - case ReductionOperation::ARG_IDX_MIN: - { - auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - vec_res_idx = calculate_index(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); - vec_res_value = temp_vec_res_value; - break; - } case ReductionOperation::ARG_IDX_MAX: - { - auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - vec_res_idx = calculate_index(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); - vec_res_value = temp_vec_res_value; - break; - } + case ReductionOperation::ARG_IDX_MIN: case ReductionOperation::MIN: + case ReductionOperation::MAX: { - vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + init_res_value = static_cast(*input_ptr); break; } - case ReductionOperation::MAX: + case ReductionOperation::PROD: { - vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + init_res_value = static_cast(1.f); break; } default: - ARM_COMPUTE_ERROR("Not supported"); + break; } - } + auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{}); + uint32x4x4_t vec_res_idx{{0}}; - switch(op) - { - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - case ReductionOperation::SUM_SQUARE: + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { -#ifdef ARM_COMPUTE_DEBUG_ENABLED - auto res = static_cast(0.f); - for(int i = 0; i < S; ++i) + const auto vec_elements = wrapper::vloadq(input_ptr + x); + switch (op) { - res += wrapper::vgetlane(vec_res_value, i); + case ReductionOperation::SUM_SQUARE: + vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), 
vec_res_value); + break; + case ReductionOperation::MEAN_SUM: + case ReductionOperation::SUM: + vec_res_value = wrapper::vadd(vec_elements, vec_res_value); + break; + case ReductionOperation::PROD: + vec_res_value = wrapper::vmul(vec_elements, vec_res_value); + break; + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = calculate_index(x, temp_vec_res_value, vec_res_value, + vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = calculate_index(x, temp_vec_res_value, vec_res_value, + vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReductionOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); } -#else // ARM_COMPUTE_DEBUG_ENABLED - auto carry_res = wrapper::vpadd(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); - for(int i = 0; i < S / 4; ++i) + } + + switch (op) + { + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + case ReductionOperation::SUM_SQUARE: { - carry_res = wrapper::vpadd(carry_res, carry_res); - } - auto res = wrapper::vgetlane(carry_res, 0); +#ifdef ARM_COMPUTE_DEBUG_ENABLED + auto res = static_cast(0.f); + for (int i = 0; i < S; ++i) + { + res += wrapper::vgetlane(vec_res_value, i); + } +#else // ARM_COMPUTE_DEBUG_ENABLED + auto carry_res = + wrapper::vpadd(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); + for (int i = 0; i < S / 4; ++i) + { + carry_res = wrapper::vpadd(carry_res, carry_res); + } + auto res = wrapper::vgetlane(carry_res, 0); #endif // ARM_COMPUTE_DEBUG_ENABLED - if(op == ReductionOperation::SUM_SQUARE) - { - // Compute left-over elements - for(; x < window_end_x; ++x) + if (op == ReductionOperation::SUM_SQUARE) { - res += (*(input_ptr + x)) * (*(input_ptr + x)); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res += (*(input_ptr + x)) * (*(input_ptr + x)); + } + } + else + { + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res += *(input_ptr + x); + } + } + + if (op == ReductionOperation::MEAN_SUM) + { + res /= input_dim_0; } + + *(reinterpret_cast(output.ptr())) = res; + break; } - else + case ReductionOperation::PROD: { + auto carry_res = + wrapper::vmul(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); + T res = 1; + for (int i = 0; i < S / 2; ++i) + { + res *= wrapper::vgetlane(carry_res, i); + } + // Compute left-over elements - for(; x < window_end_x; ++x) + for (; x < window_end_x; ++x) { - res += *(input_ptr + x); + res *= *(input_ptr + x); } - } - if(op == ReductionOperation::MEAN_SUM) - { - res /= input_dim_0; + *(reinterpret_cast(output.ptr())) = res; + break; } - - *(reinterpret_cast(output.ptr())) = res; - break; - } - case ReductionOperation::PROD: - { - auto carry_res = wrapper::vmul(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); - T res = 1; - for(int i = 0; i < S / 2; ++i) + case ReductionOperation::ARG_IDX_MIN: { - res *= wrapper::vgetlane(carry_res, i); - } + auto idx = calculate_vector_index(vec_res_idx, vec_res_value, op); + auto res = static_cast(wrapper::vgetlane(calculate_min(vec_res_value), 0)); - // Compute left-over elements - for(; x < 
window_end_x; ++x) - { - res *= *(input_ptr + x); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + if (*(input_ptr + x) < res) + { + idx = x; + res = *(input_ptr + x); + } + } + *(reinterpret_cast(output.ptr())) = idx; + break; } - - *(reinterpret_cast(output.ptr())) = res; - break; - } - case ReductionOperation::ARG_IDX_MIN: - { - auto idx = calculate_vector_index(vec_res_idx, vec_res_value, op); - auto res = static_cast(wrapper::vgetlane(calculate_min(vec_res_value), 0)); - - // Compute left-over elements - for(; x < window_end_x; ++x) + case ReductionOperation::ARG_IDX_MAX: { - if(*(input_ptr + x) < res) + auto idx = calculate_vector_index(vec_res_idx, vec_res_value, op); + auto res = static_cast(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) { - idx = x; - res = *(input_ptr + x); + if (*(input_ptr + x) > res) + { + idx = x; + res = *(input_ptr + x); + } } + *(reinterpret_cast(output.ptr())) = idx; + break; } - *(reinterpret_cast(output.ptr())) = idx; - break; - } - case ReductionOperation::ARG_IDX_MAX: - { - auto idx = calculate_vector_index(vec_res_idx, vec_res_value, op); - auto res = static_cast(wrapper::vgetlane(calculate_max(vec_res_value), 0)); - - // Compute left-over elements - for(; x < window_end_x; ++x) + case ReductionOperation::MIN: { - if(*(input_ptr + x) > res) + auto res = static_cast(wrapper::vgetlane(calculate_min(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) { - idx = x; - res = *(input_ptr + x); + res = *(input_ptr + x) < res ? *(input_ptr + x) : res; } + *(reinterpret_cast(output.ptr())) = res; + break; } - *(reinterpret_cast(output.ptr())) = idx; - break; - } - case ReductionOperation::MIN: - { - auto res = static_cast(wrapper::vgetlane(calculate_min(vec_res_value), 0)); - - // Compute left-over elements - for(; x < window_end_x; ++x) + case ReductionOperation::MAX: { - res = *(input_ptr + x) < res ? *(input_ptr + x) : res; - } - *(reinterpret_cast(output.ptr())) = res; - break; - } - case ReductionOperation::MAX: - { - auto res = static_cast(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + auto res = static_cast(wrapper::vgetlane(calculate_max(vec_res_value), 0)); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - res = *(input_ptr + x) > res ? *(input_ptr + x) : res; + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res = *(input_ptr + x) > res ? 
*(input_ptr + x) : res; + } + *(reinterpret_cast(output.ptr())) = res; + break; } - *(reinterpret_cast(output.ptr())) = res; - break; + default: + ARM_COMPUTE_ERROR("Not supported"); } - default: - ARM_COMPUTE_ERROR("Not supported"); - } - }, - input, output); + }, + input, output); } }; template struct RedOpX_quantized { - inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op) + inline void operator()( + const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op) { using PromotedType = typename wrapper::traits::promote::type>::type; @@ -637,246 +649,257 @@ struct RedOpX_quantized const float B = out_offset - (in_scale * in_offset) / (out_scale); execute_window_loop( - in_win_no_pad, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); + in_win_no_pad, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + + auto vec_res_value1 = + wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + auto vec_res_value2 = + wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + auto vec_res_value3 = + wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + auto vec_res_value4 = + wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + + auto vec_res_value1_f = vdupq_n_f32(static_cast(1.f)); + auto vec_res_value2_f = vdupq_n_f32(static_cast(1.f)); + auto vec_res_value3_f = vdupq_n_f32(static_cast(1.f)); + auto vec_res_value4_f = vdupq_n_f32(static_cast(1.f)); + + typename wrapper::traits::neon_vector::type vec_res_value = {0}; + + if (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || + op == ReductionOperation::MIN || op == ReductionOperation::MAX) + { + vec_res_value = wrapper::vdup_n(*input_ptr, wrapper::traits::vector_128_tag{}); + } - auto vec_res_value1 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - auto vec_res_value2 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - auto vec_res_value3 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - auto vec_res_value4 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + uint32x4x4_t vec_res_idx{{0}}; + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vec_elements = wrapper::vloadq(input_ptr + x); + switch (op) + { + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + { + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - auto vec_res_value1_f = vdupq_n_f32(static_cast(1.f)); - auto vec_res_value2_f = vdupq_n_f32(static_cast(1.f)); - auto vec_res_value3_f = vdupq_n_f32(static_cast(1.f)); - auto vec_res_value4_f = vdupq_n_f32(static_cast(1.f)); + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); - typename wrapper::traits::neon_vector::type vec_res_value = { 0 }; + vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1); + vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2); + vec_res_value3 = 
wrapper::vadd(temp32x4t_3, vec_res_value3); + vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4); + break; + } + case ReductionOperation::PROD: + { + const auto offset32x4f_4 = vdupq_n_f32(iq_info.offset); + const auto scale32x4f_4 = vdupq_n_f32(iq_info.scale); - if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::MIN || op == ReductionOperation::MAX) - { - vec_res_value = wrapper::vdup_n(*input_ptr, wrapper::traits::vector_128_tag{}); - } + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - uint32x4x4_t vec_res_idx{ { 0 } }; - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vec_elements = wrapper::vloadq(input_ptr + x); - switch(op) - { - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - { - const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); - const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - - const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); - const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); - const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); - const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); - - vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1); - vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2); - vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3); - vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4); - break; - } - case ReductionOperation::PROD: - { - const auto offset32x4f_4 = vdupq_n_f32(iq_info.offset); - const auto scale32x4f_4 = vdupq_n_f32(iq_info.scale); - - const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); - const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - - const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); - const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); - const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); - const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); - - auto temp32x4f_1 = wrapper::vcvt(temp32x4t_1); - auto temp32x4f_2 = wrapper::vcvt(temp32x4t_2); - auto temp32x4f_3 = wrapper::vcvt(temp32x4t_3); - auto temp32x4f_4 = wrapper::vcvt(temp32x4t_4); - - //de-quantize vec_elements - temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4); - temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4); - temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4); - temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4); - - vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f); - vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f); - vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f); - vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f); - break; + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); + + auto temp32x4f_1 = wrapper::vcvt(temp32x4t_1); + auto temp32x4f_2 = wrapper::vcvt(temp32x4t_2); + auto temp32x4f_3 = 
wrapper::vcvt(temp32x4t_3); + auto temp32x4f_4 = wrapper::vcvt(temp32x4t_4); + + //de-quantize vec_elements + temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4); + temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4); + temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4); + temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4); + + vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f); + vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f); + vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f); + vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f); + break; + } + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = calculate_index_quantized( + x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = calculate_index_quantized( + x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReductionOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); } + } + + switch (op) + { case ReductionOperation::ARG_IDX_MIN: { - auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - vec_res_idx = calculate_index_quantized(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); - vec_res_value = temp_vec_res_value; + auto idx = + calculate_vector_index_quantized(vec_res_idx, vec_res_value, op); + auto res = static_cast(wrapper::vgetlane(calculate_min(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + if (*(input_ptr + x) < res) + { + idx = x; + res = *(input_ptr + x); + } + } + *(reinterpret_cast(output.ptr())) = idx; break; } case ReductionOperation::ARG_IDX_MAX: { - auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - vec_res_idx = calculate_index_quantized(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); - vec_res_value = temp_vec_res_value; + auto idx = + calculate_vector_index_quantized(vec_res_idx, vec_res_value, op); + auto res = static_cast(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + if (*(input_ptr + x) > res) + { + idx = x; + res = *(input_ptr + x); + } + } + *(reinterpret_cast(output.ptr())) = idx; break; } case ReductionOperation::MIN: { - vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + auto res = static_cast(wrapper::vgetlane(calculate_min(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res = *(input_ptr + x) < res ? 
*(input_ptr + x) : res; + } + *(reinterpret_cast(output.ptr())) = res; break; } case ReductionOperation::MAX: { - vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - break; - } - default: - ARM_COMPUTE_ERROR("Not supported"); - } - } - - switch(op) - { - case ReductionOperation::ARG_IDX_MIN: - { - auto idx = calculate_vector_index_quantized(vec_res_idx, vec_res_value, op); - auto res = static_cast(wrapper::vgetlane(calculate_min(vec_res_value), 0)); - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - if(*(input_ptr + x) < res) - { - idx = x; - res = *(input_ptr + x); - } - } - *(reinterpret_cast(output.ptr())) = idx; - break; - } - case ReductionOperation::ARG_IDX_MAX: - { - auto idx = calculate_vector_index_quantized(vec_res_idx, vec_res_value, op); - auto res = static_cast(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + auto res = static_cast(wrapper::vgetlane(calculate_max(vec_res_value), 0)); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - if(*(input_ptr + x) > res) + // Compute left-over elements + for (; x < window_end_x; ++x) { - idx = x; - res = *(input_ptr + x); + res = *(input_ptr + x) > res ? *(input_ptr + x) : res; } + *(reinterpret_cast(output.ptr())) = res; + break; } - *(reinterpret_cast(output.ptr())) = idx; - break; - } - case ReductionOperation::MIN: - { - auto res = static_cast(wrapper::vgetlane(calculate_min(vec_res_value), 0)); - - // Compute left-over elements - for(; x < window_end_x; ++x) + case ReductionOperation::PROD: { - res = *(input_ptr + x) < res ? *(input_ptr + x) : res; - } - *(reinterpret_cast(output.ptr())) = res; - break; - } - case ReductionOperation::MAX: - { - auto res = static_cast(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + auto carry_res = wrapper::vmul(vec_res_value1_f, vec_res_value2_f); + carry_res = wrapper::vmul(carry_res, vec_res_value3_f); + carry_res = wrapper::vmul(carry_res, vec_res_value4_f); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - res = *(input_ptr + x) > res ? 
*(input_ptr + x) : res; - } - *(reinterpret_cast(output.ptr())) = res; - break; - } - case ReductionOperation::PROD: - { - auto carry_res = wrapper::vmul(vec_res_value1_f, vec_res_value2_f); - carry_res = wrapper::vmul(carry_res, vec_res_value3_f); - carry_res = wrapper::vmul(carry_res, vec_res_value4_f); + float res = wrapper::vgetlane(carry_res, 0); + res *= wrapper::vgetlane(carry_res, 1); + res *= wrapper::vgetlane(carry_res, 2); + res *= wrapper::vgetlane(carry_res, 3); - float res = wrapper::vgetlane(carry_res, 0); - res *= wrapper::vgetlane(carry_res, 1); - res *= wrapper::vgetlane(carry_res, 2); - res *= wrapper::vgetlane(carry_res, 3); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + //de-quantize input + if (std::is_same::value) + { + res *= dequantize_qasymm8(*(input_ptr + x), iq_info); + } + else + { + res *= dequantize_qasymm8_signed(*(input_ptr + x), iq_info); + } + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - //de-quantize input - if(std::is_same::value) + //re-quantize result + if (std::is_same::value) { - res *= dequantize_qasymm8(*(input_ptr + x), iq_info); + res = quantize_qasymm8(res, iq_info); } else { - res *= dequantize_qasymm8_signed(*(input_ptr + x), iq_info); + res = quantize_qasymm8_signed(res, iq_info); } - } - //re-quantize result - if(std::is_same::value) - { - res = quantize_qasymm8(res, iq_info); + *reinterpret_cast(output.ptr()) = static_cast(res); + break; } - else + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: { - res = quantize_qasymm8_signed(res, iq_info); - } + auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2); + carry_res = wrapper::vadd(carry_res, vec_res_value3); + carry_res = wrapper::vadd(carry_res, vec_res_value4); - *reinterpret_cast(output.ptr()) = static_cast(res); - break; - } - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - { - auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2); - carry_res = wrapper::vadd(carry_res, vec_res_value3); - carry_res = wrapper::vadd(carry_res, vec_res_value4); + auto carry_paddition = + wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res)); + carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition); + auto res = static_cast(wrapper::vgetlane(carry_paddition, 0)); - auto carry_paddition = wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res)); - carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition); - auto res = static_cast(wrapper::vgetlane(carry_paddition, 0)); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res += *(input_ptr + x); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - res += *(input_ptr + x); - } + if (op == ReductionOperation::MEAN_SUM) + { + const int32_t resFinal = A * (static_cast(res)) + B; - if(op == ReductionOperation::MEAN_SUM) - { - const int32_t resFinal = A * (static_cast(res)) + B; + *reinterpret_cast(output.ptr()) = utils::cast::saturate_cast(resFinal); + } + else + { + // Subtract accumulated offsets + res -= (in_info.dimension(0) - 1) * iq_info.offset; + *reinterpret_cast(output.ptr()) = utils::cast::saturate_cast(res); + } - *reinterpret_cast(output.ptr()) = utils::cast::saturate_cast(resFinal); - } - else - { - // Subtract accumulated offsets - res -= (in_info.dimension(0) - 1) * iq_info.offset; - *reinterpret_cast(output.ptr()) = utils::cast::saturate_cast(res); + break; } - - break; + default: + ARM_COMPUTE_ERROR("Not supported"); } - default: - 
ARM_COMPUTE_ERROR("Not supported"); - } - }, - input, output); + }, + input, output); } }; @@ -887,7 +910,12 @@ struct RedOpYZW using ExactTagType = typename wrapper::traits::neon_vector::tag_type; using neon_vector = typename wrapper::traits::neon_vector::type; - inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int axis, const ReductionOperation op) + inline void operator()(const Window &in_window, + Window &out_window, + const ITensor *in, + ITensor *out, + int axis, + const ReductionOperation op) { const TensorInfo in_info = *(in->info()); const int window_step_x = 16 / sizeof(T); @@ -900,203 +928,210 @@ struct RedOpYZW Window in_win_no_pad = in_window; in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x())); Window out_win_no_pad = out_window; - out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); + out_win_no_pad.set(Window::DimX, + Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); Iterator input(in, in_win_no_pad); Iterator output(out, out_win_no_pad); execute_window_loop( - in_win_no_pad, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + in_win_no_pad, + [&](const Coordinates &) { - neon_vector vec_res_value = { 0 }; - switch(op) - { - case ReductionOperation::ARG_IDX_MAX: - case ReductionOperation::ARG_IDX_MIN: - case ReductionOperation::MIN: - case ReductionOperation::MAX: - { - vec_res_value = wrapper::vloadq(input_ptr + x); - break; - } - case ReductionOperation::PROD: - { - vec_res_value = wrapper::vdup_n(static_cast(1.f), ExactTagType{}); - break; - } - default: - { - vec_res_value = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - break; - } - } - uint32x4x4_t vec_res_idx{ { 0 } }; + const auto input_ptr = reinterpret_cast(input.ptr()); - for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const T *in_ptr = reinterpret_cast(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim); - const auto vec_elements = wrapper::vloadq(in_ptr); - switch(op) + neon_vector vec_res_value = {0}; + switch (op) { - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - vec_res_value = wrapper::vadd(vec_elements, vec_res_value); - break; - case ReductionOperation::SUM_SQUARE: - vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value); - break; - case ReductionOperation::PROD: - vec_res_value = wrapper::vmul(vec_elements, vec_res_value); - break; - case ReductionOperation::ARG_IDX_MIN: - { - auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); - vec_res_value = temp_vec_res_value; - break; - } case ReductionOperation::ARG_IDX_MAX: + case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::MIN: + case ReductionOperation::MAX: { - auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); - vec_res_value = temp_vec_res_value; + vec_res_value = wrapper::vloadq(input_ptr + x); break; } 
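// Note: the switch here only chooses the starting accumulator for the reduction along 'axis',
// i.e. the identity element of each operation. MIN/MAX and the ARG_IDX_* variants seed from the
// first real slice so no artificial sentinel value is needed, PROD starts from the multiplicative
// identity, and the SUM family starts from zero. A scalar sketch of the same rule, with a
// hypothetical helper name and assuming T is the element type:
//
//   template <typename T>
//   T reduction_seed(ReductionOperation op, T first_element)
//   {
//       switch (op)
//       {
//           case ReductionOperation::ARG_IDX_MIN:
//           case ReductionOperation::ARG_IDX_MAX:
//           case ReductionOperation::MIN:
//           case ReductionOperation::MAX:
//               return first_element;     // compare against real data
//           case ReductionOperation::PROD:
//               return static_cast<T>(1); // multiplicative identity
//           default:
//               return static_cast<T>(0); // SUM / MEAN_SUM / SUM_SQUARE
//       }
//   }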
- case ReductionOperation::MIN: + case ReductionOperation::PROD: { - vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_value = wrapper::vdup_n(static_cast(1.f), ExactTagType{}); break; } - case ReductionOperation::MAX: + default: { - vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_value = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); break; } - default: - ARM_COMPUTE_ERROR("Not supported"); } - } + uint32x4x4_t vec_res_idx{{0}}; - if(op == ReductionOperation::MEAN_SUM) - { - auto vec_width_inv = wrapper::vinv(wrapper::vdup_n(static_cast(in_info.dimension(axis)), ExactTagType{})); - vec_res_value = wrapper::vmul(vec_res_value, vec_width_inv); - } - - if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) - { - wrapper::vstore(reinterpret_cast(output.ptr()) + x, vec_res_idx.val[0]); -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - if(std::is_same::value) + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) { - wrapper::vstore(reinterpret_cast(output.ptr()) + x + 4, vec_res_idx.val[1]); + const T *in_ptr = + reinterpret_cast(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim); + const auto vec_elements = wrapper::vloadq(in_ptr); + switch (op) + { + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + vec_res_value = wrapper::vadd(vec_elements, vec_res_value); + break; + case ReductionOperation::SUM_SQUARE: + vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value); + break; + case ReductionOperation::PROD: + vec_res_value = wrapper::vmul(vec_elements, vec_res_value); + break; + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = + calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = + calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReductionOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } } -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - } - else - { - wrapper::vstore(reinterpret_cast(output.ptr() + x * sizeof(T)), vec_res_value); - } - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - auto res_value = 0.f; - switch(op) - { - case ReductionOperation::ARG_IDX_MAX: - case ReductionOperation::ARG_IDX_MIN: - case ReductionOperation::MIN: - case ReductionOperation::MAX: + if (op == ReductionOperation::MEAN_SUM) { - res_value = *(input_ptr + x); - break; + auto vec_width_inv = + wrapper::vinv(wrapper::vdup_n(static_cast(in_info.dimension(axis)), ExactTagType{})); + vec_res_value = wrapper::vmul(vec_res_value, vec_width_inv); } - case ReductionOperation::PROD: + + if (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) { - res_value = static_cast(1.f); - break; + wrapper::vstore(reinterpret_cast(output.ptr()) + x, vec_res_idx.val[0]); +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + if (std::is_same::value) + { + wrapper::vstore(reinterpret_cast(output.ptr()) + x + 4, vec_res_idx.val[1]); + } +#endif // 
__ARM_FEATURE_FP16_VECTOR_ARITHMETIC } - default: + else { - res_value = static_cast(0.f); - break; + wrapper::vstore(reinterpret_cast(output.ptr() + x * sizeof(T)), vec_res_value); } } - uint32_t res_idx = 0; - for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + // Compute left-over elements + for (; x < window_end_x; ++x) { - const T *in_ptr = reinterpret_cast(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim); - - switch(op) + auto res_value = 0.f; + switch (op) { - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - res_value += *in_ptr; - break; - case ReductionOperation::SUM_SQUARE: - res_value += *in_ptr * *in_ptr; - break; - case ReductionOperation::PROD: - res_value *= *in_ptr; - break; + case ReductionOperation::ARG_IDX_MAX: case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::MIN: + case ReductionOperation::MAX: { - if(*in_ptr < res_value) - { - res_value = *in_ptr; - res_idx = dim; - } + res_value = *(input_ptr + x); break; } - case ReductionOperation::ARG_IDX_MAX: + case ReductionOperation::PROD: { - if(*in_ptr > res_value) - { - res_value = *in_ptr; - res_idx = dim; - } + res_value = static_cast(1.f); break; } - case ReductionOperation::MIN: + default: { - res_value = *in_ptr < res_value ? *in_ptr : res_value; + res_value = static_cast(0.f); break; } - case ReductionOperation::MAX: + } + + uint32_t res_idx = 0; + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + const T *in_ptr = + reinterpret_cast(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim); + + switch (op) { - res_value = *in_ptr > res_value ? *in_ptr : res_value; - break; + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + res_value += *in_ptr; + break; + case ReductionOperation::SUM_SQUARE: + res_value += *in_ptr * *in_ptr; + break; + case ReductionOperation::PROD: + res_value *= *in_ptr; + break; + case ReductionOperation::ARG_IDX_MIN: + { + if (*in_ptr < res_value) + { + res_value = *in_ptr; + res_idx = dim; + } + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + if (*in_ptr > res_value) + { + res_value = *in_ptr; + res_idx = dim; + } + break; + } + case ReductionOperation::MIN: + { + res_value = *in_ptr < res_value ? *in_ptr : res_value; + break; + } + case ReductionOperation::MAX: + { + res_value = *in_ptr > res_value ? 
*in_ptr : res_value; + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); } - default: - ARM_COMPUTE_ERROR("Not supported"); } - } - if(op == ReductionOperation::MEAN_SUM) - { - res_value /= in_info.dimension(axis); - } + if (op == ReductionOperation::MEAN_SUM) + { + res_value /= in_info.dimension(axis); + } - if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) - { - *(reinterpret_cast(output.ptr()) + x) = res_idx; - } - else - { - *(reinterpret_cast(output.ptr() + x * sizeof(T))) = res_value; + if (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) + { + *(reinterpret_cast(output.ptr()) + x) = res_idx; + } + else + { + *(reinterpret_cast(output.ptr() + x * sizeof(T))) = res_value; + } } - } - }, - input, output); + }, + input, output); } }; @@ -1107,7 +1142,8 @@ struct RedOpYZW_complex using ExactTagType = typename wrapper::traits::neon_vector::tag_type; using neon_vector = typename wrapper::traits::neon_vector::type; - inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int, const ReductionOperation) + inline void operator()( + const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int, const ReductionOperation) { ARM_COMPUTE_ERROR_ON(axis != 2); ARM_COMPUTE_ERROR_ON(op != ReductionOperation::SUM); @@ -1124,70 +1160,77 @@ struct RedOpYZW_complex Window in_win_no_pad = in_window; in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x())); Window out_win_no_pad = out_window; - out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); + out_win_no_pad.set(Window::DimX, + Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); Iterator input(in, in_win_no_pad); Iterator output(out, out_win_no_pad); execute_window_loop( - in_win_no_pad, [&](const Coordinates &) - { - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + in_win_no_pad, + [&](const Coordinates &) { - neon_vector vec_res_value_0 = { 0 }; - neon_vector vec_res_value_1 = { 0 }; - - vec_res_value_0 = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - vec_res_value_1 = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - - T *out_ptr = reinterpret_cast(output.ptr() + 2 * x * sizeof(T)); - for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - T *in_ptr_0 = reinterpret_cast(input.ptr() + 2 * x * sizeof(T) + stride_z * dim); - T *in_ptr_1 = reinterpret_cast(input.ptr() + 2 * x * sizeof(T) + 16 + stride_z * dim); + neon_vector vec_res_value_0 = {0}; + neon_vector vec_res_value_1 = {0}; - const auto vec_elements_0 = wrapper::vloadq(in_ptr_0); - const auto vec_elements_1 = wrapper::vloadq(in_ptr_1); + vec_res_value_0 = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + vec_res_value_1 = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - vec_res_value_0 = wrapper::vadd(vec_elements_0, vec_res_value_0); - vec_res_value_1 = wrapper::vadd(vec_elements_1, vec_res_value_1); - } + T *out_ptr = reinterpret_cast(output.ptr() + 2 * x * sizeof(T)); + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + T *in_ptr_0 = reinterpret_cast(input.ptr() + 2 * x * sizeof(T) + stride_z * dim); + T *in_ptr_1 = 
reinterpret_cast(input.ptr() + 2 * x * sizeof(T) + 16 + stride_z * dim); - wrapper::vstore(out_ptr, vec_res_value_0); - wrapper::vstore(out_ptr + 4, vec_res_value_1); - } + const auto vec_elements_0 = wrapper::vloadq(in_ptr_0); + const auto vec_elements_1 = wrapper::vloadq(in_ptr_1); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - auto res_value_0 = 0.f; - auto res_value_1 = 0.f; + vec_res_value_0 = wrapper::vadd(vec_elements_0, vec_res_value_0); + vec_res_value_1 = wrapper::vadd(vec_elements_1, vec_res_value_1); + } - T *out_ptr = reinterpret_cast(output.ptr() + 2 * x * sizeof(T)); - for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + wrapper::vstore(out_ptr, vec_res_value_0); + wrapper::vstore(out_ptr + 4, vec_res_value_1); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) { - T *in_ptr = reinterpret_cast(input.ptr() + 2 * x * sizeof(T) + stride_z * dim); - res_value_0 += *in_ptr; - res_value_1 += *(in_ptr + 1); + auto res_value_0 = 0.f; + auto res_value_1 = 0.f; + + T *out_ptr = reinterpret_cast(output.ptr() + 2 * x * sizeof(T)); + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + T *in_ptr = reinterpret_cast(input.ptr() + 2 * x * sizeof(T) + stride_z * dim); + res_value_0 += *in_ptr; + res_value_1 += *(in_ptr + 1); + } + *out_ptr = res_value_0; + *(out_ptr + 1) = res_value_1; } - *out_ptr = res_value_0; - *(out_ptr + 1) = res_value_1; - } - }, - input, output); + }, + input, output); } }; template struct RedOpYZW_quantized { - inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int axis, const ReductionOperation op) + inline void operator()(const Window &in_window, + Window &out_window, + const ITensor *in, + ITensor *out, + int axis, + const ReductionOperation op) { const TensorInfo in_info = *(in->info()); const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform(); - using PromotedType = typename wrapper::traits::promote::type>::type; + using PromotedType = typename wrapper::traits::promote::type>::type; const auto oq_info = out->info()->quantization_info().uniform(); @@ -1201,12 +1244,14 @@ struct RedOpYZW_quantized Window in_win_no_pad = in_window; in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x())); Window out_win_no_pad = out_window; - out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); + out_win_no_pad.set(Window::DimX, + Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); Iterator input(in, in_win_no_pad); Iterator output(out, out_win_no_pad); - using vector_type = typename wrapper::traits::neon_bitvector::type; + using vector_type = + typename wrapper::traits::neon_bitvector::type; using vector_type_f = typename wrapper::traits::neon_vector::type; vector_type vec_res_value1{}; @@ -1234,362 +1279,384 @@ struct RedOpYZW_quantized const auto vec_B = wrapper::vdup_n(static_cast(B), wrapper::traits::vector_128_tag{}); execute_window_loop( - in_win_no_pad, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + in_win_no_pad, + [&](const Coordinates &) { - uint32x4x4_t vec_res_idx{ { 0 } }; - vec_res_value1 = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); - vec_res_value2 = 
wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); - vec_res_value3 = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); - vec_res_value4 = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); + const auto input_ptr = reinterpret_cast(input.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + uint32x4x4_t vec_res_idx{{0}}; + vec_res_value1 = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); + vec_res_value2 = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); + vec_res_value3 = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); + vec_res_value4 = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); - vec_res_value1_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); - vec_res_value2_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); - vec_res_value3_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); - vec_res_value4_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); + vec_res_value1_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); + vec_res_value2_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); + vec_res_value3_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); + vec_res_value4_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); - auto vec_res_value = wrapper::vloadq(input_ptr + x); + auto vec_res_value = wrapper::vloadq(input_ptr + x); - for(unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim) - { - const T *in_ptr = input_ptr + x + in_info.strides_in_bytes()[axis] * index_dim; - const auto vec_elements = wrapper::vloadq(in_ptr); - switch(op) + for (unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim) { - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: + const T *in_ptr = input_ptr + x + in_info.strides_in_bytes()[axis] * index_dim; + const auto vec_elements = wrapper::vloadq(in_ptr); + switch (op) { - const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); - const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - - const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); - const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); - const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); - const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); - - vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1); - vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2); - vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3); - vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4); - break; + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + { + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); + + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); + + vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1); + vec_res_value2 = wrapper::vadd(temp32x4t_2, 
vec_res_value2); + vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3); + vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4); + break; + } + case ReductionOperation::PROD: + { + const auto offset32x4f_4 = wrapper::vdup_n(static_cast(iq_info.offset), + wrapper::traits::vector_128_tag{}); + const auto scale32x4f_4 = + wrapper::vdup_n(iq_info.scale, wrapper::traits::vector_128_tag{}); + + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); + + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); + + auto temp32x4f_1 = wrapper::vcvt(temp32x4t_1); + auto temp32x4f_2 = wrapper::vcvt(temp32x4t_2); + auto temp32x4f_3 = wrapper::vcvt(temp32x4t_3); + auto temp32x4f_4 = wrapper::vcvt(temp32x4t_4); + + //de-quantize vec_elements + temp32x4f_1 = wrapper::vmul(wrapper::vsub(temp32x4f_1, offset32x4f_4), scale32x4f_4); + temp32x4f_2 = wrapper::vmul(wrapper::vsub(temp32x4f_2, offset32x4f_4), scale32x4f_4); + temp32x4f_3 = wrapper::vmul(wrapper::vsub(temp32x4f_3, offset32x4f_4), scale32x4f_4); + temp32x4f_4 = wrapper::vmul(wrapper::vsub(temp32x4f_4, offset32x4f_4), scale32x4f_4); + + vec_res_value1_f = wrapper::vmul(temp32x4f_1, vec_res_value1_f); + vec_res_value2_f = wrapper::vmul(temp32x4f_2, vec_res_value2_f); + vec_res_value3_f = wrapper::vmul(temp32x4f_3, vec_res_value3_f); + vec_res_value4_f = wrapper::vmul(temp32x4f_4, vec_res_value4_f); + break; + } + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, + vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, + vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReductionOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); } - case ReductionOperation::PROD: - { - const auto offset32x4f_4 = wrapper::vdup_n(static_cast(iq_info.offset), wrapper::traits::vector_128_tag{}); - const auto scale32x4f_4 = wrapper::vdup_n(iq_info.scale, wrapper::traits::vector_128_tag{}); - - const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); - const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - - const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); - const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); - const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); - const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); - - auto temp32x4f_1 = wrapper::vcvt(temp32x4t_1); - auto temp32x4f_2 = wrapper::vcvt(temp32x4t_2); - auto temp32x4f_3 = wrapper::vcvt(temp32x4t_3); - auto temp32x4f_4 = wrapper::vcvt(temp32x4t_4); + } - //de-quantize vec_elements - temp32x4f_1 = wrapper::vmul(wrapper::vsub(temp32x4f_1, offset32x4f_4), 
scale32x4f_4); - temp32x4f_2 = wrapper::vmul(wrapper::vsub(temp32x4f_2, offset32x4f_4), scale32x4f_4); - temp32x4f_3 = wrapper::vmul(wrapper::vsub(temp32x4f_3, offset32x4f_4), scale32x4f_4); - temp32x4f_4 = wrapper::vmul(wrapper::vsub(temp32x4f_4, offset32x4f_4), scale32x4f_4); - - vec_res_value1_f = wrapper::vmul(temp32x4f_1, vec_res_value1_f); - vec_res_value2_f = wrapper::vmul(temp32x4f_2, vec_res_value2_f); - vec_res_value3_f = wrapper::vmul(temp32x4f_3, vec_res_value3_f); - vec_res_value4_f = wrapper::vmul(temp32x4f_4, vec_res_value4_f); - break; - } + switch (op) + { case ReductionOperation::ARG_IDX_MIN: - { - auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); - vec_res_value = temp_vec_res_value; - break; - } case ReductionOperation::ARG_IDX_MAX: { - auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); - vec_res_value = temp_vec_res_value; + wrapper::vstore(reinterpret_cast(output.ptr() + 4 * x), vec_res_idx.val[0]); + wrapper::vstore(reinterpret_cast(output.ptr() + 4 * x) + 4, vec_res_idx.val[1]); + wrapper::vstore(reinterpret_cast(output.ptr() + 4 * x) + 8, vec_res_idx.val[2]); + wrapper::vstore(reinterpret_cast(output.ptr() + 4 * x) + 12, + vec_res_idx.val[3]); break; } case ReductionOperation::MIN: - { - vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - break; - } case ReductionOperation::MAX: { - vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + wrapper::vstore(reinterpret_cast(output.ptr() + x), vec_res_value); break; } - default: - ARM_COMPUTE_ERROR("Not supported"); - } - } - - switch(op) - { - case ReductionOperation::ARG_IDX_MIN: - case ReductionOperation::ARG_IDX_MAX: - { - wrapper::vstore(reinterpret_cast(output.ptr() + 4 * x), vec_res_idx.val[0]); - wrapper::vstore(reinterpret_cast(output.ptr() + 4 * x) + 4, vec_res_idx.val[1]); - wrapper::vstore(reinterpret_cast(output.ptr() + 4 * x) + 8, vec_res_idx.val[2]); - wrapper::vstore(reinterpret_cast(output.ptr() + 4 * x) + 12, vec_res_idx.val[3]); - break; - } - case ReductionOperation::MIN: - case ReductionOperation::MAX: - { - wrapper::vstore(reinterpret_cast(output.ptr() + x), vec_res_value); - break; - } - case ReductionOperation::SUM: - { - // Subtract offsets - auto offsets = vdupq_n_s32((in_info.dimension(axis) - 1) * iq_info.offset); + case ReductionOperation::SUM: + { + // Subtract offsets + auto offsets = vdupq_n_s32((in_info.dimension(axis) - 1) * iq_info.offset); - auto vec_res_s_value1 = wrapper::vreinterpret(vec_res_value1); - auto vec_res_s_value2 = wrapper::vreinterpret(vec_res_value2); - auto vec_res_s_value3 = wrapper::vreinterpret(vec_res_value3); - auto vec_res_s_value4 = wrapper::vreinterpret(vec_res_value4); + auto vec_res_s_value1 = wrapper::vreinterpret(vec_res_value1); + auto vec_res_s_value2 = wrapper::vreinterpret(vec_res_value2); + auto vec_res_s_value3 = wrapper::vreinterpret(vec_res_value3); + auto vec_res_s_value4 = wrapper::vreinterpret(vec_res_value4); - vec_res_s_value1 = wrapper::vsub(vec_res_s_value1, offsets); - vec_res_s_value2 = wrapper::vsub(vec_res_s_value2, offsets); - vec_res_s_value3 = wrapper::vsub(vec_res_s_value3, offsets); - vec_res_s_value4 = wrapper::vsub(vec_res_s_value4, offsets); + vec_res_s_value1 = wrapper::vsub(vec_res_s_value1, offsets); + vec_res_s_value2 = wrapper::vsub(vec_res_s_value2, 
offsets); + vec_res_s_value3 = wrapper::vsub(vec_res_s_value3, offsets); + vec_res_s_value4 = wrapper::vsub(vec_res_s_value4, offsets); - const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_s_value1), wrapper::vqmovn(vec_res_s_value2)); - const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_s_value3), wrapper::vqmovn(vec_res_s_value4)); + const auto temp16x8t_1 = + wrapper::vcombine(wrapper::vqmovn(vec_res_s_value1), wrapper::vqmovn(vec_res_s_value2)); + const auto temp16x8t_2 = + wrapper::vcombine(wrapper::vqmovn(vec_res_s_value3), wrapper::vqmovn(vec_res_s_value4)); - combine_and_store(temp16x8t_1, temp16x8t_2, output, x); - break; - } - case ReductionOperation::MEAN_SUM: - { - vec_res_value1_f = wrapper::vmla(vec_B, wrapper::vcvt(vec_res_value1), vec_A); - vec_res_value2_f = wrapper::vmla(vec_B, wrapper::vcvt(vec_res_value2), vec_A); - vec_res_value3_f = wrapper::vmla(vec_B, wrapper::vcvt(vec_res_value3), vec_A); - vec_res_value4_f = wrapper::vmla(vec_B, wrapper::vcvt(vec_res_value4), vec_A); + combine_and_store(temp16x8t_1, temp16x8t_2, output, x); + break; + } + case ReductionOperation::MEAN_SUM: + { + vec_res_value1_f = wrapper::vmla(vec_B, wrapper::vcvt(vec_res_value1), vec_A); + vec_res_value2_f = wrapper::vmla(vec_B, wrapper::vcvt(vec_res_value2), vec_A); + vec_res_value3_f = wrapper::vmla(vec_B, wrapper::vcvt(vec_res_value3), vec_A); + vec_res_value4_f = wrapper::vmla(vec_B, wrapper::vcvt(vec_res_value4), vec_A); #ifdef __aarch64__ - vec_res_value1 = wrapper::vcvta(vec_res_value1_f); - vec_res_value2 = wrapper::vcvta(vec_res_value2_f); - vec_res_value3 = wrapper::vcvta(vec_res_value3_f); - vec_res_value4 = wrapper::vcvta(vec_res_value4_f); + vec_res_value1 = wrapper::vcvta(vec_res_value1_f); + vec_res_value2 = wrapper::vcvta(vec_res_value2_f); + vec_res_value3 = wrapper::vcvta(vec_res_value3_f); + vec_res_value4 = wrapper::vcvta(vec_res_value4_f); #else // defined(__aarch64__) - vec_res_value1 = wrapper::vcvt(vec_res_value1_f); - vec_res_value2 = wrapper::vcvt(vec_res_value2_f); - vec_res_value3 = wrapper::vcvt(vec_res_value3_f); - vec_res_value4 = wrapper::vcvt(vec_res_value4_f); + vec_res_value1 = wrapper::vcvt(vec_res_value1_f); + vec_res_value2 = wrapper::vcvt(vec_res_value2_f); + vec_res_value3 = wrapper::vcvt(vec_res_value3_f); + vec_res_value4 = wrapper::vcvt(vec_res_value4_f); #endif // __aarch64__ - const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2)); - const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4)); - auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2)); + const auto temp16x8t_1 = + wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2)); + const auto temp16x8t_2 = + wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4)); + auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2)); - wrapper::vstore(reinterpret_cast(output.ptr() + x), res); - break; - } - case ReductionOperation::PROD: - { - const auto offset32x4f_4 = wrapper::vdup_n(static_cast(iq_info.offset), wrapper::traits::vector_128_tag{}); - const auto iscale32x4f_4 = vinvq_f32(vdupq_n_f32(iq_info.scale)); - - //re-quantize - vec_res_value1_f = wrapper::vadd(wrapper::vmul(vec_res_value1_f, iscale32x4f_4), offset32x4f_4); - vec_res_value2_f = wrapper::vadd(wrapper::vmul(vec_res_value2_f, iscale32x4f_4), offset32x4f_4); - vec_res_value3_f = 
wrapper::vadd(wrapper::vmul(vec_res_value3_f, iscale32x4f_4), offset32x4f_4); - vec_res_value4_f = wrapper::vadd(wrapper::vmul(vec_res_value4_f, iscale32x4f_4), offset32x4f_4); - - vec_res_value1 = wrapper::vcvt(vec_res_value1_f); - vec_res_value2 = wrapper::vcvt(vec_res_value2_f); - vec_res_value3 = wrapper::vcvt(vec_res_value3_f); - vec_res_value4 = wrapper::vcvt(vec_res_value4_f); - - const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2)); - const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4)); - auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2)); - - wrapper::vstore(reinterpret_cast(output.ptr() + x), res); - break; + wrapper::vstore(reinterpret_cast(output.ptr() + x), res); + break; + } + case ReductionOperation::PROD: + { + const auto offset32x4f_4 = + wrapper::vdup_n(static_cast(iq_info.offset), wrapper::traits::vector_128_tag{}); + const auto iscale32x4f_4 = vinvq_f32(vdupq_n_f32(iq_info.scale)); + + //re-quantize + vec_res_value1_f = + wrapper::vadd(wrapper::vmul(vec_res_value1_f, iscale32x4f_4), offset32x4f_4); + vec_res_value2_f = + wrapper::vadd(wrapper::vmul(vec_res_value2_f, iscale32x4f_4), offset32x4f_4); + vec_res_value3_f = + wrapper::vadd(wrapper::vmul(vec_res_value3_f, iscale32x4f_4), offset32x4f_4); + vec_res_value4_f = + wrapper::vadd(wrapper::vmul(vec_res_value4_f, iscale32x4f_4), offset32x4f_4); + + vec_res_value1 = wrapper::vcvt(vec_res_value1_f); + vec_res_value2 = wrapper::vcvt(vec_res_value2_f); + vec_res_value3 = wrapper::vcvt(vec_res_value3_f); + vec_res_value4 = wrapper::vcvt(vec_res_value4_f); + + const auto temp16x8t_1 = + wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2)); + const auto temp16x8t_2 = + wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4)); + auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2)); + + wrapper::vstore(reinterpret_cast(output.ptr() + x), res); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); } - default: - ARM_COMPUTE_ERROR("Not supported"); } - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - float res_value = 0.f; - int32_t res_value_q = 0; - - switch(op) + // Compute left-over elements + for (; x < window_end_x; ++x) { - case ReductionOperation::ARG_IDX_MAX: - case ReductionOperation::ARG_IDX_MIN: - case ReductionOperation::MIN: - case ReductionOperation::MAX: - { - res_value = *(input_ptr + x); - break; - } - case ReductionOperation::PROD: - { - res_value = static_cast(1.0f); - break; - } - default: - { - res_value = static_cast(0.0f); - break; - } - } - uint32_t res_idx = 0; + float res_value = 0.f; + int32_t res_value_q = 0; - for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) - { - const T *in_ptr = reinterpret_cast(input.ptr() + x + in_info.strides_in_bytes()[axis] * dim); - switch(op) + switch (op) { - case ReductionOperation::SUM: + case ReductionOperation::ARG_IDX_MAX: + case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::MIN: + case ReductionOperation::MAX: { - res_value += *in_ptr; + res_value = *(input_ptr + x); break; } - case ReductionOperation::MEAN_SUM: + case ReductionOperation::PROD: { - res_value_q += *in_ptr; + res_value = static_cast(1.0f); break; } - case ReductionOperation::SUM_SQUARE: + default: { - res_value += *in_ptr * *in_ptr; + res_value = static_cast(0.0f); break; } - case 
ReductionOperation::PROD: + } + uint32_t res_idx = 0; + + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + const T *in_ptr = + reinterpret_cast(input.ptr() + x + in_info.strides_in_bytes()[axis] * dim); + switch (op) { - //de-quantize input - if(std::is_same::value) + case ReductionOperation::SUM: { - res_value *= dequantize_qasymm8(*in_ptr, iq_info); + res_value += *in_ptr; + break; } - else + case ReductionOperation::MEAN_SUM: { - res_value *= dequantize_qasymm8_signed(*in_ptr, iq_info); + res_value_q += *in_ptr; + break; } - break; - } - case ReductionOperation::ARG_IDX_MIN: - { - if(*in_ptr < res_value) + case ReductionOperation::SUM_SQUARE: { - res_value = *in_ptr; - res_idx = dim; + res_value += *in_ptr * *in_ptr; + break; } - break; - } - case ReductionOperation::ARG_IDX_MAX: - { - if(*in_ptr > res_value) + case ReductionOperation::PROD: { - res_value = *in_ptr; - res_idx = dim; + //de-quantize input + if (std::is_same::value) + { + res_value *= dequantize_qasymm8(*in_ptr, iq_info); + } + else + { + res_value *= dequantize_qasymm8_signed(*in_ptr, iq_info); + } + break; } - break; + case ReductionOperation::ARG_IDX_MIN: + { + if (*in_ptr < res_value) + { + res_value = *in_ptr; + res_idx = dim; + } + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + if (*in_ptr > res_value) + { + res_value = *in_ptr; + res_idx = dim; + } + break; + } + case ReductionOperation::MIN: + { + res_value = *in_ptr < res_value ? *in_ptr : res_value; + break; + } + case ReductionOperation::MAX: + { + res_value = *in_ptr > res_value ? *in_ptr : res_value; + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); } - case ReductionOperation::MIN: + } + + switch (op) + { + case ReductionOperation::MEAN_SUM: { - res_value = *in_ptr < res_value ? *in_ptr : res_value; + // Apply previously calculated coefficients (with rounding on aarch64) +#ifdef __aarch64__ + const int32_t res = + arm_compute::support::cpp11::round(A * (static_cast(res_value_q)) + B); +#else // defined(__aarch64__) + const int32_t res = A * (static_cast(res_value_q)) + B; +#endif // __aarch64__ + *reinterpret_cast(output.ptr() + x) = utils::cast::saturate_cast(res); break; } - case ReductionOperation::MAX: + case ReductionOperation::SUM: { - res_value = *in_ptr > res_value ? 
*in_ptr : res_value; + // Subtract accumulated offsets + res_value -= (in_info.dimension(axis) - 1) * iq_info.offset; + *reinterpret_cast(output.ptr() + x) = utils::cast::saturate_cast(res_value); break; } - default: - ARM_COMPUTE_ERROR("Not supported"); - } - } - - switch(op) - { - case ReductionOperation::MEAN_SUM: - { - // Apply previously calculated coefficients (with rounding on aarch64) -#ifdef __aarch64__ - const int32_t res = arm_compute::support::cpp11::round(A * (static_cast(res_value_q)) + B); -#else // defined(__aarch64__) - const int32_t res = A * (static_cast(res_value_q)) + B; -#endif // __aarch64__ - *reinterpret_cast(output.ptr() + x) = utils::cast::saturate_cast(res); - break; - } - case ReductionOperation::SUM: - { - // Subtract accumulated offsets - res_value -= (in_info.dimension(axis) - 1) * iq_info.offset; - *reinterpret_cast(output.ptr() + x) = utils::cast::saturate_cast(res_value); - break; - } - case ReductionOperation::PROD: - { - //re-quantize result - T res = 0; - if(std::is_same::value) + case ReductionOperation::PROD: { - res = quantize_qasymm8(res_value, iq_info); + //re-quantize result + T res = 0; + if (std::is_same::value) + { + res = quantize_qasymm8(res_value, iq_info); + } + else + { + res = quantize_qasymm8_signed(res_value, iq_info); + } + *(reinterpret_cast(output.ptr() + x)) = res; + break; } - else + case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::ARG_IDX_MAX: { - res = quantize_qasymm8_signed(res_value, iq_info); + *(reinterpret_cast(output.ptr() + x * 4)) = res_idx; + break; } - *(reinterpret_cast(output.ptr() + x)) = res; - break; - } - case ReductionOperation::ARG_IDX_MIN: - case ReductionOperation::ARG_IDX_MAX: - { - *(reinterpret_cast(output.ptr() + x * 4)) = res_idx; - break; + default: + *(reinterpret_cast(output.ptr() + x)) = res_value; } - default: - *(reinterpret_cast(output.ptr() + x)) = res_value; } - } - }, - input, output); + }, + input, output); } }; -void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsigned int axis, const ReductionOperation op) +void reduce_op( + const Window &window, const ITensor *input, ITensor *output, unsigned int axis, const ReductionOperation op) { const bool is_complex = (input->info()->num_channels() == 2); - if(is_complex) + if (is_complex) { - switch(axis) + switch (axis) { case 2: - switch(input->info()->data_type()) + switch (input->info()->data_type()) { case DataType::F32: - switch(op) + switch (op) { case ReductionOperation::SUM: - return Reducer>::reduceZ(window, input, output, RedOpYZW_complex(), op); + return Reducer>::reduceZ( + window, input, output, RedOpYZW_complex(), + op); default: ARM_COMPUTE_ERROR("Not supported"); } @@ -1602,19 +1669,21 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi return; } - switch(axis) + switch (axis) { case 0: { - switch(input->info()->data_type()) + switch (input->info()->data_type()) { case DataType::QASYMM8: { - return Reducer>::reduceX(window, input, output, RedOpX_quantized(), op); + return Reducer>::reduceX(window, input, output, + RedOpX_quantized(), op); } case DataType::QASYMM8_SIGNED: { - return Reducer>::reduceX(window, input, output, RedOpX_quantized(), op); + return Reducer>::reduceX(window, input, output, RedOpX_quantized(), + op); } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: @@ -1635,19 +1704,22 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi } } case 1: - switch(input->info()->data_type()) + switch 
(input->info()->data_type()) { case DataType::QASYMM8: { - return Reducer>::reduceY(window, input, output, RedOpYZW_quantized(), op); + return Reducer>::reduceY(window, input, output, + RedOpYZW_quantized(), op); } case DataType::QASYMM8_SIGNED: { - return Reducer>::reduceY(window, input, output, RedOpYZW_quantized(), op); + return Reducer>::reduceY(window, input, output, + RedOpYZW_quantized(), op); } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - return Reducer>::reduceY(window, input, output, RedOpYZW(), op); + return Reducer>::reduceY(window, input, output, RedOpYZW(), + op); #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F32: return Reducer>::reduceY(window, input, output, RedOpYZW(), op); @@ -1657,15 +1729,18 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi ARM_COMPUTE_ERROR("Not supported"); } case 2: - switch(input->info()->data_type()) + switch (input->info()->data_type()) { case DataType::QASYMM8: - return Reducer>::reduceZ(window, input, output, RedOpYZW_quantized(), op); + return Reducer>::reduceZ(window, input, output, + RedOpYZW_quantized(), op); case DataType::QASYMM8_SIGNED: - return Reducer>::reduceZ(window, input, output, RedOpYZW_quantized(), op); + return Reducer>::reduceZ(window, input, output, + RedOpYZW_quantized(), op); #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - return Reducer>::reduceZ(window, input, output, RedOpYZW(), op); + return Reducer>::reduceZ(window, input, output, RedOpYZW(), + op); #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F32: return Reducer>::reduceZ(window, input, output, RedOpYZW(), op); @@ -1675,15 +1750,18 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi ARM_COMPUTE_ERROR("Not supported"); } case 3: - switch(input->info()->data_type()) + switch (input->info()->data_type()) { case DataType::QASYMM8: - return Reducer>::reduceW(window, input, output, RedOpYZW_quantized(), op); + return Reducer>::reduceW(window, input, output, + RedOpYZW_quantized(), op); case DataType::QASYMM8_SIGNED: - return Reducer>::reduceW(window, input, output, RedOpYZW_quantized(), op); + return Reducer>::reduceW(window, input, output, + RedOpYZW_quantized(), op); #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - return Reducer>::reduceW(window, input, output, RedOpYZW(), op); + return Reducer>::reduceW(window, input, output, RedOpYZW(), + op); #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F32: return Reducer>::reduceW(window, input, output, RedOpYZW(), op); @@ -1704,9 +1782,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - if(input->num_channels() == 1) + if (input->num_channels() == 1) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::S32, DataType::F16, DataType::F32); } else { @@ -1715,13 +1794,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u ARM_COMPUTE_RETURN_ERROR_ON(axis != 2); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, + 
"Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); - if(output->total_size() != 0) + if (output->total_size() != 0) { bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN); - if(!is_arg_min_max) + if (!is_arg_min_max) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != output->num_channels()); @@ -1731,8 +1811,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32); } - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis); - const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis); + const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_reshaped); } @@ -1745,7 +1826,10 @@ NEReductionOperationKernel::NEReductionOperationKernel() { } -void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op) +void NEReductionOperationKernel::configure(const ITensor *input, + ITensor *output, + unsigned int axis, + ReductionOperation op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -1761,14 +1845,23 @@ void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output INEKernel::configure(win); // Calculate output shape and set if empty - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis); // Output auto initialization if not yet initialized const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX); DataType output_data_type = is_arg_min_max ? 
DataType::S32 : input->info()->data_type(); - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true)); + auto_init_if_empty(*output->info(), input->info() + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); } -Status NEReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op) +Status NEReductionOperationKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + unsigned int axis, + ReductionOperation op) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.h b/src/core/NEON/kernels/NEReductionOperationKernel.h index 08e654fd2137ca49a52f74ff99fd4090788ba83e..78bec62c14ed9f2978339cfca4ed9efad728e217 100644 --- a/src/core/NEON/kernels/NEReductionOperationKernel.h +++ b/src/core/NEON/kernels/NEReductionOperationKernel.h @@ -77,7 +77,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEReorderKernel.cpp b/src/core/NEON/kernels/NEReorderKernel.cpp index 1a7f58bb08e0716fd6b19f2d3b8223d518c757e9..6c2c987eb720bed723534727e24766fd1d2d278e 100644 --- a/src/core/NEON/kernels/NEReorderKernel.cpp +++ b/src/core/NEON/kernels/NEReorderKernel.cpp @@ -24,11 +24,13 @@ #if defined(__aarch64__) #include "src/core/NEON/kernels/NEReorderKernel.h" -#include "src/common/utils/Log.h" -#include "src/core/NEON/kernels/arm_gemm/transform.hpp" + #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/arm_gemm/transform.hpp" + namespace arm_compute { @@ -37,29 +39,32 @@ void NEReorderKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - switch(_input->info()->data_type()) + switch (_input->info()->data_type()) { case DataType::F32: { const int ksize_rows_elements = _xmax * _ksize; - const int jump_rows = ksize_rows_elements * window.x().start(); - const int k_start = window.x().start() * _ksize; - const int k_end = std::min(window.x().end() * _ksize, _kmax); - const int stride = _kmax; - if(k_start < k_end) + const int jump_rows = ksize_rows_elements * window.x().start(); + const int k_start = window.x().start() * _ksize; + const int k_end = std::min(window.x().end() * _ksize, _kmax); + const int stride = _kmax; + if (k_start < k_end) { - - switch(_output_wf) + switch (_output_wf) { case WeightFormat::OHWIo4: { - arm_gemm::Transform<4, 1, true, arm_gemm::VLType::None>(reinterpret_cast(_output->buffer()) + jump_rows, reinterpret_cast(_input->buffer()), stride, k_start, k_end, 0, _xmax); + arm_gemm::Transform<4, 1, true, arm_gemm::VLType::None>( + reinterpret_cast(_output->buffer()) + jump_rows, + reinterpret_cast(_input->buffer()), stride, k_start, k_end, 0, _xmax); break; } #if defined(ARM_COMPUTE_ENABLE_SVE) case WeightFormat::OHWIo8: { - arm_gemm::Transform<1, 1, true, 
arm_gemm::VLType::SVE>(reinterpret_cast(_output->buffer()) + jump_rows, reinterpret_cast(_input->buffer()), stride, k_start, k_end, 0, _xmax); + arm_gemm::Transform<1, 1, true, arm_gemm::VLType::SVE>( + reinterpret_cast(_output->buffer()) + jump_rows, + reinterpret_cast(_input->buffer()), stride, k_start, k_end, 0, _xmax); break; } #endif /* ARM_COMPUTE_ENABLE_SVE */ @@ -78,11 +83,20 @@ void NEReorderKernel::run(const Window &window, const ThreadInfo &info) } NEReorderKernel::NEReorderKernel() - : _input(nullptr), _output(nullptr), _ksize(0), _kmax(0), _xmax(0), _input_wf(WeightFormat::ANY), _output_wf(WeightFormat::ANY) + : _input(nullptr), + _output(nullptr), + _ksize(0), + _kmax(0), + _xmax(0), + _input_wf(WeightFormat::ANY), + _output_wf(WeightFormat::ANY) { } -void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf) +void NEReorderKernel::configure(const ITensor *input, + ITensor *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf) { ARM_COMPUTE_LOG_PARAMS(input, output, input_wf, output_wf); ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -96,7 +110,7 @@ void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compu // Setting parameters for transform auto dims = input->info()->num_dimensions(); - switch(dims) + switch (dims) { case 2: { @@ -120,7 +134,7 @@ void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compu // Window size is set by rows / _ksize Window win; int window_size = 0; - switch(_output_wf) + switch (_output_wf) { #if defined(ARM_COMPUTE_ENABLE_SVE) case WeightFormat::OHWIo8: @@ -142,7 +156,7 @@ void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compu break; } } - if(_kmax % _ksize != 0) + if (_kmax % _ksize != 0) { window_size += 1; } @@ -152,11 +166,14 @@ void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compu INEKernel::configure(win); } -Status NEReorderKernel::validate(const ITensorInfo *input, const ITensorInfo *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf) +Status NEReorderKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); - if(output->tensor_shape().total_size() != 0) + if (output->tensor_shape().total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); @@ -167,20 +184,20 @@ Status NEReorderKernel::validate(const ITensorInfo *input, const ITensorInfo *ou int output_x_dim; int output_k_dim; auto dims = output->num_dimensions(); - switch(dims) + switch (dims) { case 2: { - input_x_dim = input->dimension(0); // Number of columns in input matrix - input_k_dim = input->dimension(1); // Number of rows in input matrix + input_x_dim = input->dimension(0); // Number of columns in input matrix + input_k_dim = input->dimension(1); // Number of rows in input matrix output_x_dim = output->dimension(0); // Number of columns in output matrix output_k_dim = output->dimension(1); // Number of rows in output matrix break; } case 4: { - input_x_dim = input->dimension(2); // Number of columns in input matrix - input_k_dim = input->dimension(3); // Number of rows in input matrix + input_x_dim = input->dimension(2); // 
Number of columns in input matrix + input_k_dim = input->dimension(3); // Number of rows in input matrix output_x_dim = output->dimension(2); // Number of columns in output matrix output_k_dim = output->dimension(3); // Number of rows in output matrix break; @@ -192,13 +209,15 @@ Status NEReorderKernel::validate(const ITensorInfo *input, const ITensorInfo *ou } int ksize; - switch(output_wf) + switch (output_wf) { +#if defined(ARM_COMPUTE_ENABLE_SVE) case WeightFormat::OHWIo8: { ksize = 8; break; } +#endif /* ARM_COMPUTE_ENABLE_SVE */ case WeightFormat::OHWIo4: { ksize = 4; @@ -216,11 +235,10 @@ Status NEReorderKernel::validate(const ITensorInfo *input, const ITensorInfo *ou ARM_COMPUTE_RETURN_ERROR_ON(rnd_up_input_kdim != output_k_dim); // output x_dim needs to be same as input ARM_COMPUTE_RETURN_ERROR_ON(input_x_dim != output_x_dim); - } return Status{}; } } // namespace arm_compute -#endif // defined(__aarch64__) \ No newline at end of file +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/NEReorderKernel.h b/src/core/NEON/kernels/NEReorderKernel.h index 07908890f4d5c8565cee9d30428814db170a2606..4528b25245703f84eef5d11597af273368c722c0 100644 --- a/src/core/NEON/kernels/NEReorderKernel.h +++ b/src/core/NEON/kernels/NEReorderKernel.h @@ -26,9 +26,10 @@ #ifndef ACL_SRC_CORE_NEON_KERNELS_NEREORDERKERNEL #define ACL_SRC_CORE_NEON_KERNELS_NEREORDERKERNEL -#include "src/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/NEON/INEKernel.h" + namespace arm_compute { @@ -36,7 +37,6 @@ namespace arm_compute class NEReorderKernel : public INEKernel { public: - const char *name() const override { return "NEReorderKernel"; @@ -62,7 +62,10 @@ public: * @param[in] input_wf WeightFormat of input. * @param[in] output_wf WeightFormat of output. 
*/ - void configure(const ITensor *input, ITensor *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf); + void configure(const ITensor *input, + ITensor *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf); /** Static function to check if given info will lead to a valid configuration of @ref NEReorderKernel * @@ -73,25 +76,27 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; - -/*****************************************************************************/ + /*****************************************************************************/ private: - const ITensor *_input{nullptr}; // Input tensor - ITensor *_output{nullptr}; // Output tensor - int32_t _ksize{0}; // Blocking parameter, how many rows kernel reorders on each call - int32_t _kmax{0}; // Rows in input tensor - int32_t _xmax{0}; // Columns in input tensor - WeightFormat _input_wf{WeightFormat::UNSPECIFIED}; // WeightFormat of input tensor - WeightFormat _output_wf{WeightFormat::UNSPECIFIED}; // WeightFormat of output tensor + const ITensor *_input{nullptr}; // Input tensor + ITensor *_output{nullptr}; // Output tensor + int32_t _ksize{0}; // Blocking parameter, how many rows kernel reorders on each call + int32_t _kmax{0}; // Rows in input tensor + int32_t _xmax{0}; // Columns in input tensor + WeightFormat _input_wf{WeightFormat::UNSPECIFIED}; // WeightFormat of input tensor + WeightFormat _output_wf{WeightFormat::UNSPECIFIED}; // WeightFormat of output tensor }; } // namespace arm_compute #endif /* ACL_SRC_CORE_NEON_KERNELS_NEREORDERKERNEL */ -#endif // defined(__aarch64__) \ No newline at end of file +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/NEReorgLayerKernel.cpp b/src/core/NEON/kernels/NEReorgLayerKernel.cpp index a7b830c066e7a919a721a25fba9e575196922c02..227570405cc5a1e7be4212001a59c50e94543cb2 100644 --- a/src/core/NEON/kernels/NEReorgLayerKernel.cpp +++ b/src/core/NEON/kernels/NEReorgLayerKernel.cpp @@ -28,8 +28,9 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -50,13 +51,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); ARM_COMPUTE_RETURN_ERROR_ON(stride <= 0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0, "The width of the input tensor must be a multiple of stride"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0, "The height of the input tensor must be a multiple of stride"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0, + "The width of the input tensor must be a multiple of stride"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0, + "The height of the input 
tensor must be a multiple of stride"); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride)); + const TensorInfo tensor_info_output = + output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } @@ -65,8 +69,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i } } // namespace -NEReorgLayerKernel::NEReorgLayerKernel() - : _input(nullptr), _output(nullptr), _stride(1) +NEReorgLayerKernel::NEReorgLayerKernel() : _input(nullptr), _output(nullptr), _stride(1) { } @@ -121,23 +124,26 @@ void NEReorgLayerKernel::run(const Window &window, const ThreadInfo &info) Iterator out(_output, collapsed_window); // Perform reorg - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - // Get spatial coords and channels - const unsigned int w = id[idx_w]; - const unsigned int h = id[idx_h]; - const unsigned int c = id[idx_c]; - - // Calculate mapping - const unsigned int offset = c / out_c; - Coordinates map_coords = id; - map_coords.set(idx_w, w * stride + offset % stride); - map_coords.set(idx_h, h * stride + offset / stride); - map_coords.set(idx_c, c % out_c); - - // Perform mapping - std::memcpy(out.ptr(), in_ptr + _input->info()->offset_element_in_bytes(map_coords), _input->info()->element_size()); - }, - out); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + // Get spatial coords and channels + const unsigned int w = id[idx_w]; + const unsigned int h = id[idx_h]; + const unsigned int c = id[idx_c]; + + // Calculate mapping + const unsigned int offset = c / out_c; + Coordinates map_coords = id; + map_coords.set(idx_w, w * stride + offset % stride); + map_coords.set(idx_h, h * stride + offset / stride); + map_coords.set(idx_c, c % out_c); + + // Perform mapping + std::memcpy(out.ptr(), in_ptr + _input->info()->offset_element_in_bytes(map_coords), + _input->info()->element_size()); + }, + out); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEReorgLayerKernel.h b/src/core/NEON/kernels/NEReorgLayerKernel.h index 38a7d9f196ce2df3d922dc93d3846fea1865c1a9..6e67eb364ebb745387f0eeb4985ad946ce189629 100644 --- a/src/core/NEON/kernels/NEReorgLayerKernel.h +++ b/src/core/NEON/kernels/NEReorgLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_NEREORGLAYERKERNEL_H -#define ARM_COMPUTE_NEREORGLAYERKERNEL_H +#ifndef ACL_SRC_CORE_NEON_KERNELS_NEREORGLAYERKERNEL_H +#define ACL_SRC_CORE_NEON_KERNELS_NEREORGLAYERKERNEL_H #include "src/core/NEON/INEKernel.h" @@ -60,7 +60,7 @@ public: */ void configure(const ITensor *input, ITensor *output, int32_t stride); - /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuReshapeKernel + /** Static function to check if given info will lead to a valid configuration * * @param[in] input Source tensor info. Data type supported: All * @param[in] output Destination tensor info. 
Data type supported: Same as @p input @@ -80,4 +80,4 @@ private: int32_t _stride; }; } // namespace arm_compute -#endif /*ARM_COMPUTE_NEREORGLAYERKERNEL_H */ +#endif // ACL_SRC_CORE_NEON_KERNELS_NEREORGLAYERKERNEL_H diff --git a/src/core/NEON/kernels/NEReverseKernel.cpp b/src/core/NEON/kernels/NEReverseKernel.cpp index 758433f89fb2fa630cc6e0fceccecf2f2755c44e..b3710555dfaf425d376e69fdcfc648962abc18a2 100644 --- a/src/core/NEON/kernels/NEReverseKernel.cpp +++ b/src/core/NEON/kernels/NEReverseKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,25 +26,30 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis) +Status +validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis) { + ARM_COMPUTE_UNUSED(use_inverted_axis); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, axis); //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions. ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->num_dimensions() > 1, "Axis must be a 1D tensor"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, + "Current implementation only supports up to 4 dimensions."); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->dimension(0) > 4, "Only up to 4 dimensions can be reversed"); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -55,42 +60,67 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c } } // namespace -NEReverseKernel::NEReverseKernel() - : _input(nullptr), _output(nullptr), _axis(nullptr) +NEReverseKernel::NEReverseKernel() : _input(nullptr), _output(nullptr), _axis(nullptr), _use_inverted_axis(false) { } -void NEReverseKernel::configure(const ITensor *input, ITensor *output, const ITensor *axis) +void NEReverseKernel::configure(const ITensor *input, ITensor *output, const ITensor *axis, bool use_inverted_axis) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, axis); - _input = input; - _output = output; - _axis = axis; + _input = input; + _output = output; + _axis = axis; + _use_inverted_axis = use_inverted_axis; // Output tensor auto initialization if not yet initialized auto_init_if_empty(*output->info(), *input->info()->clone()); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis->info())); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis->info(), use_inverted_axis)); // Configure kernel window INEKernel::configure(calculate_max_window(*output->info())); } -Status NEReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis) 
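// Illustrative sketch of how the new use_inverted_axis flag and the widened U32/S32
// axis support are resolved in run_reverse() below: negative axis values are wrapped
// by the tensor rank, the index is optionally mirrored into ACL's dimension
// convention, and the result is folded into a per-dimension bitmask (at most 4
// dimensions, matching the check in validate_arguments). The helper name and the use
// of std::bitset are assumptions made for this sketch only, not part of the kernel.
#include <bitset>
#include <cstdint>
#include <stdexcept>
#include <vector>

std::bitset<4> normalize_reverse_axes(const std::vector<int32_t> &axes, int rank, bool use_inverted_axis)
{
    std::bitset<4> axis_bit{};
    for (int32_t axis : axes)
    {
        // The values of the axis tensor must be within [-rank, rank-1]
        if (axis < -rank || axis >= rank)
        {
            throw std::out_of_range("axis must be within [-rank, rank-1]");
        }
        // Negative axis: targeted axis = rank + axis
        if (axis < 0)
        {
            axis = rank + axis;
        }
        // Reverse ACL axis indices convention: (inverted) axis = (rank - 1) - axis
        if (use_inverted_axis)
        {
            axis = (rank - 1) - axis;
        }
        axis_bit.set(static_cast<size_t>(axis));
    }
    return axis_bit;
}

// Example: for a rank-4 tensor, framework axis -1 with use_inverted_axis == true
// maps to ACL dimension 0 (the innermost dimension), so bit 0 of the mask is set.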
+Status NEReverseKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *axis, + bool use_inverted_axis) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, use_inverted_axis)); return Status{}; } template -void run_reverse(const Window &window, const ITensor *input, const ITensor *axis, ITensor *output) +void run_reverse( + const Window &window, const ITensor *input, const ITensor *axis, ITensor *output, bool use_inverted_axis) { - int axis_bit = 0; - for(unsigned int i = 0; i < axis->info()->dimension(0); ++i) + unsigned int axis_bit = 0; + const int rank = input->info()->num_dimensions(); + + for (unsigned int i = 0; i < axis->info()->dimension(0); ++i) { - const int axis_i = *(reinterpret_cast(axis->buffer()) + i); + int axis_i = *(reinterpret_cast(axis->buffer()) + i); + + // The values of axis tensor must be between [-rank, rank-1]. + if ((axis_i < -rank) || (axis_i >= rank)) + { + ARM_COMPUTE_ERROR("the values of the axis tensor must be within [-rank, rank-1]."); + } + + // In case of negative axis value i.e targeted axis(i) = rank + axis(i) + if (axis_i < 0) + { + axis_i = rank + axis_i; + } + + // Reverse ACL axis indices convention i.e. (inverted)axis = (tensor_rank - 1) - axis + if (use_inverted_axis) + { + axis_i = (rank - 1) - axis_i; + } + axis_bit |= 1 << axis_i; } @@ -103,43 +133,47 @@ void run_reverse(const Window &window, const ITensor *input, const ITensor *axis win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator input_it(input, win); - execute_window_loop(win, [&](const Coordinates & id) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &id) { - auto in = wrapper::vloadq(reinterpret_cast(input_it.ptr()) + x); - - // Reverse 0 axis - if(axis_bit & 0x1) + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - in = wrapper::vrev64(in); - in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in)); + auto in = wrapper::vloadq(reinterpret_cast(input_it.ptr()) + x); + + // Reverse 0 axis + if (axis_bit & 0x1) + { + in = wrapper::vrev64(in); + in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in)); + } + + const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - window_step_x : x; + const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y(); + const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z(); + const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3]; + + auto out_ptr = + reinterpret_cast(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w))); + wrapper::vstore(out_ptr, in); } - const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - window_step_x : x; - const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y(); - const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z(); - const int offset_w = (axis_bit & 0x8) ? 
output->info()->dimension(3) - id[3] - 1 : id[3]; - - auto out_ptr = reinterpret_cast(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w))); - wrapper::vstore(out_ptr, in); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto in = *(reinterpret_cast(input_it.ptr()) + x); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto in = *(reinterpret_cast(input_it.ptr()) + x); - const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - 1 : x; - const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y(); - const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z(); - const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3]; + const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - 1 : x; + const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y(); + const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z(); + const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3]; - *reinterpret_cast(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w))) = in; - } - }, - input_it); + *reinterpret_cast(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w))) = + in; + } + }, + input_it); } void NEReverseKernel::run(const Window &window, const ThreadInfo &info) @@ -148,16 +182,16 @@ void NEReverseKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - switch(_input->info()->element_size()) + switch (_input->info()->element_size()) { case 4: - run_reverse(window, _input, _axis, _output); + run_reverse(window, _input, _axis, _output, _use_inverted_axis); break; case 2: - run_reverse(window, _input, _axis, _output); + run_reverse(window, _input, _axis, _output, _use_inverted_axis); break; case 1: - run_reverse(window, _input, _axis, _output); + run_reverse(window, _input, _axis, _output, _use_inverted_axis); break; default: ARM_COMPUTE_ERROR("Element size not supported"); diff --git a/src/core/NEON/kernels/NEReverseKernel.h b/src/core/NEON/kernels/NEReverseKernel.h index 07b547a32767dd94b1982f2f4a1a3d4285225ac2..92261887f4148a18eefd47c187542f34c263e928 100644 --- a/src/core/NEON/kernels/NEReverseKernel.h +++ b/src/core/NEON/kernels/NEReverseKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_NEREVERSEKERNEL_H -#define ARM_COMPUTE_NEREVERSEKERNEL_H +#ifndef ACL_SRC_CORE_NEON_KERNELS_NEREVERSEKERNEL_H +#define ACL_SRC_CORE_NEON_KERNELS_NEREVERSEKERNEL_H #include "src/core/NEON/INEKernel.h" @@ -52,21 +52,24 @@ public: ~NEReverseKernel() = default; /** Initialise the kernel's inputs and output * - * @param[in] input Input tensor. Data types supported: All - * @param[out] output Output tensor. Data type supported: Same as @p input - * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32 + * @param[in] input Input tensor. Data types supported: All + * @param[out] output Output tensor. Data type supported: Same as @p input + * @param[in] axis Axis tensor. 
Contains the indices of the dimensions to reverse. Data type supported: U32/S32 + * @param[in] use_inverted_axis Reverse ACL axis indices convention i.e. acl.dim(0) = tensor_rank -1 */ - void configure(const ITensor *input, ITensor *output, const ITensor *axis); + void configure(const ITensor *input, ITensor *output, const ITensor *axis, bool use_inverted_axis); /** Static function to check if given info will lead to a valid configuration of @ref NEReverseKernel * - * @param[in] input Input tensor info. Data types supported: All - * @param[in] output Output tensor info. Data type supported: Same as @p input - * @param[in] axis Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32 + * @param[in] input Input tensor info. Data types supported: All + * @param[in] output Output tensor info. Data type supported: Same as @p input + * @param[in] axis Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32/S32 + * @param[in] use_inverted_axis Reverse ACL axis indices convention i.e. acl.dim(0) = tensor_rank -1 * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; @@ -75,6 +78,7 @@ private: const ITensor *_input; ITensor *_output; const ITensor *_axis; + bool _use_inverted_axis; }; } // namespace arm_compute -#endif /*ARM_COMPUTE_NEREVERSEKERNEL_H */ +#endif // ACL_SRC_CORE_NEON_KERNELS_NEREVERSEKERNEL_H diff --git a/src/core/NEON/kernels/NESelectKernel.cpp b/src/core/NEON/kernels/NESelectKernel.cpp index b8c9b244eec0e459e8c101fd107f946134ed45ef..7789b828ea55728ea8979ec13e5bbba49990f572 100644 --- a/src/core/NEON/kernels/NESelectKernel.cpp +++ b/src/core/NEON/kernels/NESelectKernel.cpp @@ -29,13 +29,12 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + +#include "src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" -#include "src/core/NEON/wrapper/wrapper.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - -#include "src/core/common/Registrars.h" - +#include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/select/list.h" #include @@ -54,7 +53,8 @@ struct SelectKernelSelectorData }; using SelectorPtr = std::add_pointer::type; -using KernelPtr = std::add_pointer::type; +using KernelPtr = + std::add_pointer::type; struct SelectKernelSelector { @@ -63,95 +63,62 @@ struct SelectKernelSelector KernelPtr ukernel; }; -static const SelectKernelSelector available_kernels[] = -{ - { - "neon_s8_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::S8 && data.is_same_rank == true; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_same_rank) - }, - { - "neon_s16_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::S16 && data.is_same_rank == true; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_same_rank) - }, - { - "neon_s32_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::S32 && data.is_same_rank == true; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_same_rank) - }, - { - "neon_u8_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == 
DataType::U8 && data.is_same_rank == true; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_same_rank) - }, - { - "neon_u16_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::U16 && data.is_same_rank == true; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_same_rank) - }, - { - "neon_u32_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::U32 && data.is_same_rank == true; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_same_rank) - }, - { - "neon_s8_not_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::S8 && data.is_same_rank == false; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_not_same_rank) - }, - { - "neon_s16_not_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::S16 && data.is_same_rank == false; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_not_same_rank) - }, - { - "neon_s32_not_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::S32 && data.is_same_rank == false; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_not_same_rank) - }, - { - "neon_u8_not_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::U8 && data.is_same_rank == false; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_not_same_rank) - }, - { - "neon_u16_not_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::U16 && data.is_same_rank == false; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_not_same_rank) - }, - { - "neon_u32_not_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::U32 && data.is_same_rank == false; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_not_same_rank) - }, - { - "neon_f16_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::F16 && data.is_same_rank == true; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_same_rank) - }, - { - "neon_f16_not_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::F16 && data.is_same_rank == false; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_not_same_rank) - }, - { - "neon_f32_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::F32 && data.is_same_rank == true; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_same_rank) - }, - { - "neon_f32_not_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::F32 && data.is_same_rank == false; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_not_same_rank) - }, +static const SelectKernelSelector available_kernels[] = { + {"neon_s8_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::S8 && data.is_same_rank == true; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_same_rank)}, + {"neon_s16_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::S16 && data.is_same_rank == true; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_same_rank)}, + {"neon_s32_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::S32 && data.is_same_rank == true; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_same_rank)}, + {"neon_u8_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::U8 && data.is_same_rank == true; }, + 
REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_same_rank)}, + {"neon_u16_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::U16 && data.is_same_rank == true; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_same_rank)}, + {"neon_u32_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::U32 && data.is_same_rank == true; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_same_rank)}, + {"neon_s8_not_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::S8 && data.is_same_rank == false; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_not_same_rank)}, + {"neon_s16_not_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::S16 && data.is_same_rank == false; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_not_same_rank)}, + {"neon_s32_not_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::S32 && data.is_same_rank == false; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_not_same_rank)}, + {"neon_u8_not_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::U8 && data.is_same_rank == false; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_not_same_rank)}, + {"neon_u16_not_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::U16 && data.is_same_rank == false; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_not_same_rank)}, + {"neon_u32_not_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::U32 && data.is_same_rank == false; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_not_same_rank)}, + {"neon_f16_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::F16 && data.is_same_rank == true; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_same_rank)}, + {"neon_f16_not_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::F16 && data.is_same_rank == false; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_not_same_rank)}, + {"neon_f32_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::F32 && data.is_same_rank == true; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_same_rank)}, + {"neon_f32_not_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::F32 && data.is_same_rank == false; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_not_same_rank)}, }; const SelectKernelSelector *get_implementation(const SelectKernelSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -184,7 +151,8 @@ void NESelectKernel::configure(const ITensor *c, const ITensor *x, const ITensor INEKernel::configure(win); } -Status NESelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output) +Status +NESelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(c, x, y); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(x); @@ -195,9 +163,11 @@ Status NESelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, cons const bool is_same_rank = (c->tensor_shape().num_dimensions() == x->tensor_shape().num_dimensions()); 
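// Illustrative restatement of the condition-shape rule enforced by the checks that
// follow: either the condition tensor c has exactly the same shape as x, or c is a
// 1D tensor whose length equals x's outermost dimension (one condition value per
// outer slice). The function name is an assumption made for this sketch only; the
// kernel itself expresses the rule through the ARM_COMPUTE_RETURN_ERROR_ON macros.
#include "arm_compute/core/TensorShape.h"

inline bool is_valid_select_condition_shape(const arm_compute::TensorShape &c, const arm_compute::TensorShape &x)
{
    const bool is_same_rank = (c.num_dimensions() == x.num_dimensions());
    if (is_same_rank)
    {
        return c == x; // shapes must match exactly
    }
    // Broadcast case: a single condition value selects a whole outer slice.
    return (c.num_dimensions() == 1) && (c.x() == x[x.num_dimensions() - 1]);
}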
ARM_COMPUTE_RETURN_ERROR_ON(is_same_rank && (x->tensor_shape() != c->tensor_shape())); - ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank && ((c->tensor_shape().num_dimensions() > 1) || (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1]))); + ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank && + ((c->tensor_shape().num_dimensions() > 1) || + (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1]))); - if(output != nullptr && output->total_size() != 0) + if (output != nullptr && output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(x, output); @@ -214,7 +184,7 @@ void NESelectKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON(_output == nullptr); ARM_COMPUTE_ERROR_ON(_output->info() == nullptr); - const auto *uk = get_implementation(SelectKernelSelectorData{ _output->info()->data_type(), _has_same_rank }); + const auto *uk = get_implementation(SelectKernelSelectorData{_output->info()->data_type(), _has_same_rank}); ARM_COMPUTE_ERROR_ON(uk == nullptr); ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr); uk->ukernel(_c, _x, _y, _output, window); diff --git a/src/core/NEON/kernels/NESelectKernel.h b/src/core/NEON/kernels/NESelectKernel.h index e82105a68ee1531d2cf7fc27fe3e25c395b3e1fc..4fec42b53673f2b4b148f5ce98b3e6b83db38ae0 100644 --- a/src/core/NEON/kernels/NESelectKernel.h +++ b/src/core/NEON/kernels/NESelectKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NESELECTKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -82,7 +83,6 @@ public: void run(const Window &window, const ThreadInfo &info) override; private: - const ITensor *_c; /**< Condition tensor */ const ITensor *_x; /**< Source tensor 1 */ const ITensor *_y; /**< Source tensor 2 */ diff --git a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp index 673eace3c1fbbf077ca7c6a075971eef8000681d..da023aeb9647499be58c75fcf93f91e716a49a41 100644 --- a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp +++ b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp @@ -26,11 +26,12 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include #include @@ -41,19 +42,22 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *paddings, const ITensorInfo *output) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *block_info, + const ITensorInfo *paddings, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, paddings, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON(block_info->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{ 2 }); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), 
TensorShape{2}); ARM_COMPUTE_RETURN_ERROR_ON(paddings->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{ 2, 2 }); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{2, 2}); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { const DataLayout data_layout = input->data_layout(); const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); @@ -64,7 +68,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf return Status{}; } -Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, +Status validate_arguments_static(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); @@ -73,9 +81,10 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x < 1 || block_shape_y < 1); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { - TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(input, block_shape_x, block_shape_y, padding_left, padding_right); + TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape( + input, block_shape_x, block_shape_y, padding_left, padding_right); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); @@ -86,14 +95,25 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape } // namespace NESpaceToBatchLayerKernel::NESpaceToBatchLayerKernel() - : _input(nullptr), _block_shape(nullptr), _paddings(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _padding_left(), _block_shape_x(), _block_shape_y() + : _input(nullptr), + _block_shape(nullptr), + _paddings(nullptr), + _output(nullptr), + _data_layout(DataLayout::UNKNOWN), + _padding_left(), + _block_shape_x(), + _block_shape_y() { } -void NESpaceToBatchLayerKernel::configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output) +void NESpaceToBatchLayerKernel::configure(const ITensor *input, + const ITensor *block_shape, + const ITensor *paddings, + ITensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info())); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info())); _input = input; _block_shape = block_shape; @@ -106,15 +126,22 @@ void NESpaceToBatchLayerKernel::configure(const ITensor *input, const ITensor *b ICPPKernel::configure(win); } -void NESpaceToBatchLayerKernel::configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, - ITensor *output) +void NESpaceToBatchLayerKernel::configure(const ITensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D 
&padding_right, + ITensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(input->info(), block_shape_x, block_shape_y, padding_left, padding_right); - auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info()); + TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape( + input->info(), block_shape_x, block_shape_y, padding_left, padding_right); + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, padding_right, output->info())); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, + padding_right, output->info())); _input = input; _output = output; @@ -128,15 +155,23 @@ void NESpaceToBatchLayerKernel::configure(const ITensor *input, const int block_ INEKernel::configure(win); } -Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output) +Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, paddings, output)); return Status{}; } -Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, +Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); return Status{}; } @@ -146,17 +181,17 @@ void NESpaceToBatchLayerKernel::run(const Window &window, const ThreadInfo &info ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); - if(_block_shape != nullptr) + if (_block_shape != nullptr) { // Retrieve the block shapes dynamically _block_shape_x = *(reinterpret_cast(_block_shape->ptr_to_element(0))); _block_shape_y = *(reinterpret_cast(_block_shape->ptr_to_element(1))); } - if(_paddings != nullptr) + if (_paddings != nullptr) { - const size_t pad_left_x = *reinterpret_cast(_paddings->ptr_to_element({ 0, 0 })); - const size_t pad_left_y = *reinterpret_cast(_paddings->ptr_to_element({ 1, 0 })); + const size_t pad_left_x = *reinterpret_cast(_paddings->ptr_to_element({0, 0})); + const size_t pad_left_y = *reinterpret_cast(_paddings->ptr_to_element({1, 0})); _padding_left = Size2D(pad_left_x, pad_left_y); } const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); @@ -173,57 +208,61 @@ void NESpaceToBatchLayerKernel::run(const Window &window, const ThreadInfo &info int batch_id = 0; // Main loop for NCHW and NHWC - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { do { Iterator out(_output, slice_out); - execute_window_loop(slice_out, [&](const Coordinates & id) - { 
- const size_t out_x = id.x(); - const size_t out_y = id.y(); - const size_t z = id.z(); - const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x; - const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x; - if(pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height && pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width) + execute_window_loop( + slice_out, + [&](const Coordinates &id) { - const int w = batch_id % batch_size; - const int in_x = pos_x - _padding_left.x(); - const int in_y = pos_y - _padding_left.y(); - Coordinates input_coords{ in_x, in_y, z, w }; - memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); - } - }, - out); + const size_t out_x = id.x(); + const size_t out_y = id.y(); + const size_t z = id.z(); + const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x; + const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x; + if (pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height && + pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width) + { + const int w = batch_id % batch_size; + const int in_x = pos_x - _padding_left.x(); + const int in_y = pos_y - _padding_left.y(); + Coordinates input_coords{in_x, in_y, z, w}; + memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); + } + }, + out); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } else { do { Iterator out(_output, slice_out); - execute_window_loop(slice_out, [&](const Coordinates & id) - { - const size_t out_x = id.y(); - const size_t out_y = id.z(); - const size_t z = id.x(); - const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x; - const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x; - if(pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height && pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width) + execute_window_loop( + slice_out, + [&](const Coordinates &id) { - const int w = batch_id % batch_size; - const int in_x = pos_x - _padding_left.x(); - const int in_y = pos_y - _padding_left.y(); - Coordinates input_coords{ z, in_x, in_y, w }; - memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); - } - }, - out); + const size_t out_x = id.y(); + const size_t out_y = id.z(); + const size_t z = id.x(); + const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x; + const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x; + if (pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height && + pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width) + { + const int w = batch_id % batch_size; + const int in_x = pos_x - _padding_left.x(); + const int in_y = pos_y - _padding_left.y(); + Coordinates input_coords{z, in_x, in_y, w}; + memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); + } + }, + out); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h index 44b8cbb514243a720b7662a437d1e1074f6b682f..6292c07136fba377629e641f06e8a55104139299 100644 --- a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h +++ b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h 
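Aside: the NCHW and NHWC loops reformatted above both apply the same inverse mapping from an output element to its source element: the block cell is recovered from the output batch index, the position is offset by the left padding, and elements falling inside the padded border are skipped (left zero). A standalone sketch of that index arithmetic follows; the function name and scalar-int interface are hypothetical, only the arithmetic mirrors the loop above.

// Illustrative sketch of the space-to-batch inverse index mapping (hypothetical
// helper, not the library's API). Returns true and fills in_x/in_y/in_batch when
// the output element maps to a real input element, false when it falls into the
// zero padding.
#include <cstdio>

bool space_to_batch_src_coords(int out_x, int out_y, int batch_id, int batch_size,
                               int block_x, int block_y, int pad_left_x, int pad_left_y,
                               int width, int height, int &in_x, int &in_y, int &in_batch)
{
    const int block_id = batch_id / batch_size; // which block cell this output batch holds
    const int pos_x    = out_x * block_x + block_id % block_x;
    const int pos_y    = out_y * block_y + block_id / block_x;

    if (pos_x < pad_left_x || pos_x >= pad_left_x + width ||
        pos_y < pad_left_y || pos_y >= pad_left_y + height)
    {
        return false; // padded region, output stays zero
    }

    in_x     = pos_x - pad_left_x;
    in_y     = pos_y - pad_left_y;
    in_batch = batch_id % batch_size; // original batch index
    return true;
}

int main()
{
    int x, y, b;
    // 2x2 block, no padding, single input batch: output batch 3 holds block cell (1,1).
    if (space_to_batch_src_coords(0, 0, 3, 1, 2, 2, 0, 0, 4, 4, x, y, b))
    {
        std::printf("src = (%d, %d), batch %d\n", x, y, b); // src = (1, 1), batch 0
    }
}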
@@ -25,6 +25,7 @@ #define ARM_COMPUTE_NESPACETOBATCHLAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -69,7 +70,12 @@ public: * @param[in] padding_right The padding at the end of every dimension of the output tensor. * @param[out] output Tensor output. Data types supported: same as @p input */ - void configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output); + void configure(const ITensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayerKernel * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -79,7 +85,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output); /** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayerKernel (Static block shape and paddings) * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -91,7 +100,12 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + const ITensorInfo *output); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp index 7687c50c40ea1200c8a7e79b7c14f1b30ed55851..b49c5ee344b2d3ffeff520f25fb9cf61ac3f673a 100644 --- a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp +++ b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp @@ -26,11 +26,12 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include #include @@ -50,7 +51,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { const DataLayout data_layout = input->data_layout(); const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); @@ -115,43 +116,45 @@ void NESpaceToDepthLayerKernel::run(const Window &window, const ThreadInfo &info int batch_id = 0; // Main loop for NCHW and NHWC - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { do { Iterator out(_output, slice_out); - execute_window_loop(slice_out, [&](const Coordinates & id) - { - const size_t 
channel_id = id.z(); - const size_t in_x = id.x() * _block_shape + (channel_id / channel_size) % _block_shape; - const size_t in_y = id.y() * _block_shape + (channel_id / channel_size) / _block_shape; - const int z = channel_id % channel_size; - Coordinates input_coords{ in_x, in_y, z, batch_id }; - memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); - }, - out); + execute_window_loop( + slice_out, + [&](const Coordinates &id) + { + const size_t channel_id = id.z(); + const size_t in_x = id.x() * _block_shape + (channel_id / channel_size) % _block_shape; + const size_t in_y = id.y() * _block_shape + (channel_id / channel_size) / _block_shape; + const int z = channel_id % channel_size; + Coordinates input_coords{in_x, in_y, z, batch_id}; + memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); + }, + out); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } else { do { Iterator out(_output, slice_out); - execute_window_loop(slice_out, [&](const Coordinates & id) - { - const size_t channel_id = id.x(); - const size_t in_x = id.y() * _block_shape + (channel_id / channel_size) % _block_shape; - const size_t in_y = id.z() * _block_shape + (channel_id / channel_size) / _block_shape; - const int z = channel_id % channel_size; - Coordinates input_coords{ z, in_x, in_y, batch_id }; - memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); - }, - out); + execute_window_loop( + slice_out, + [&](const Coordinates &id) + { + const size_t channel_id = id.x(); + const size_t in_x = id.y() * _block_shape + (channel_id / channel_size) % _block_shape; + const size_t in_y = id.z() * _block_shape + (channel_id / channel_size) / _block_shape; + const int z = channel_id % channel_size; + Coordinates input_coords{z, in_x, in_y, batch_id}; + memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); + }, + out); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h index 953b68a401d198a6640440a9e7b2d6c5afcf107f..7d147c5b94f20669cf77c9826d6fc5fa50474293 100644 --- a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h +++ b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NESPACETODEPTHLAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute diff --git a/src/core/NEON/kernels/NEStackLayerKernel.cpp b/src/core/NEON/kernels/NEStackLayerKernel.cpp index 93080e2ac71a26ad9f6932b1a306bb7907b42dbf..225e4fcfd2a7854202988c92046df1932fc4e5d6 100644 --- a/src/core/NEON/kernels/NEStackLayerKernel.cpp +++ b/src/core/NEON/kernels/NEStackLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -25,14 +25,15 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/Utils.h" #include "src/core/helpers/WindowHelpers.h" namespace arm_compute @@ -41,7 +42,12 @@ using namespace arm_compute::misc::shape_calculator; namespace { -Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output) +Status validate_arguments(const ITensorInfo *input, + uint32_t axis, + uint32_t idx_input, + uint32_t num_tensors, + uint32_t rank, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions. @@ -49,10 +55,12 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned ARM_COMPUTE_RETURN_ERROR_ON(idx_input >= num_tensors); ARM_COMPUTE_RETURN_ERROR_ON(axis > input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() != rank); - if(output->total_size() != 0) + if (output->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_stack_shape(*input, axis, num_tensors)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), + compute_stack_shape(*input, axis, num_tensors)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); } @@ -60,83 +68,162 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output) +inline Coordinates +shift_from_axis_and_replace_coordinate(const Coordinates &id, uint32_t axis, uint32_t idx_input, uint32_t num_dims) { - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_stack_shape(*input, axis, num_tensors))); + Coordinates id_out = id; + for (uint32_t i = num_dims; i > axis; --i) + { + id_out.set(i, id[i - 1]); + } + id_out.set(axis, idx_input); + return id_out; +} + +void elementwise_stack(const std::vector &input, ITensor *output, uint32_t axis, const Window &window) +{ + Window window_out; + window_out.use_tensor_dimensions(output->info()->tensor_shape()); - // Configure kernel window - Window win = calculate_max_window(*input); + const int32_t num_tensors = input.size(); + const size_t element_size = input[0]->info()->element_size(); + const uint32_t num_dims = static_cast(input[0]->info()->num_dimensions()); - return std::make_pair(Status{}, win); + for (int32_t idx_input = 0; idx_input < num_tensors; ++idx_input) + { + Iterator input_it(input[idx_input], window); + + execute_window_loop( + window, + [&](const Coordinates &id) + { + Coordinates id_out = shift_from_axis_and_replace_coordinate(id, axis, idx_input, num_dims); + std::memcpy(output->ptr_to_element(id_out), 
input_it.ptr(), element_size); + }, + input_it); + } } -inline Coordinates shift_from_axis_and_replace_coordinate(const Coordinates &id, unsigned int axis, unsigned int idx_input) +void memcpy_stack(const std::vector &input, ITensor *output, uint32_t axis, const Window &window) { - constexpr int max_out_coord = 5; // Input shape is max a 4D shape, output is max 5D - Coordinates id_out = id; - for(unsigned int i = max_out_coord - 1; i > axis; --i) + const int32_t element_size = input[0]->info()->element_size(); + const int32_t chunk_size = input[0]->info()->tensor_shape().total_size_lower(axis) * element_size; + const int32_t num_tensors = input.size(); + const int32_t out_chunk_step = chunk_size * num_tensors; + + const int32_t start_x = window.x().start(); + const int32_t end_x = window.x().end(); + const int32_t start_y = window.y().start(); + const int32_t end_y = window.y().end(); + + uint8_t *out_ptr_base = output->buffer() + output->info()->offset_first_element_in_bytes() + start_x * chunk_size; + + for (int32_t x = start_x; x < end_x; ++x) { - id_out.set(i, id[i - 1]); + const uint8_t *in_ptr = + input[x]->buffer() + input[x]->info()->offset_first_element_in_bytes() + start_y * chunk_size; + uint8_t *out_ptr = out_ptr_base + start_y * out_chunk_step; + + for (int32_t y = start_y; y < end_y; ++y) + { + std::memcpy(out_ptr, in_ptr, chunk_size); + + in_ptr += chunk_size; + out_ptr += out_chunk_step; + } + + out_ptr_base += chunk_size; } - id_out.set(axis, idx_input); - return id_out; } + } // namespace -NEStackLayerKernel::NEStackLayerKernel() - : _input(nullptr), _output(nullptr), _axis(), _idx_input() +NEStackLayerKernel::NEStackLayerKernel() : _input(), _output(nullptr), _axis(), _split_dimension(Window::DimY) { } -void NEStackLayerKernel::configure(const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output) +void NEStackLayerKernel::configure(const std::vector &input, uint32_t axis, ITensor *output) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, idx_input, num_tensors, output->info())); + ARM_COMPUTE_ERROR_ON_NULLPTR(output); - _input = input; - _output = output; - _axis = axis; - _idx_input = idx_input; + const int32_t num_tensors = input.size(); + ARM_COMPUTE_ERROR_ON(num_tensors == 0); - // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), axis, num_tensors, output->info()); + const uint32_t rank = input[0]->info()->num_dimensions(); + ARM_COMPUTE_UNUSED(rank); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - INEKernel::configure(win_config.second); + for (int32_t i = 0; i < num_tensors; ++i) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(input[i]); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input[i]->info(), axis, i, num_tensors, rank, output->info())); + } + + auto_init_if_empty(*output->info(), input[0]->info()->clone()->set_tensor_shape( + compute_stack_shape(*input[0]->info(), axis, num_tensors))); + + _input = input; + _output = output; + _axis = axis; } -Status NEStackLayerKernel::validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output) +Status NEStackLayerKernel::validate(const std::vector &input, uint32_t axis, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, idx_input, num_tensors, output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), axis, num_tensors, 
output->clone().get()).first); + const int32_t num_tensors = input.size(); + const size_t rank = input[0]->num_dimensions(); + + for (int32_t i = 0; i < num_tensors; ++i) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(input[i]); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input[i], axis, i, num_tensors, rank, output)); + } + return Status{}; } +void NEStackLayerKernel::prepare() +{ + // Prepare calculates the window at runtime, in case there is padding being added after configure() + const ITensorInfo *input_info = _input[0]->info(); + const int32_t num_dims = input_info->num_dimensions(); + const int32_t num_tensors = _input.size(); + + // Check if there are any paddings in the input tensors + bool has_padding = false; + for (const ITensor *in : _input) + { + if (has_holes(*in->info(), num_dims - 1)) + { + has_padding = true; + break; + } + } + + has_padding = has_padding || has_holes(*_output->info(), num_dims); + + Window win; + if (!has_padding) + { + _stack_fn = memcpy_stack; + + // 2D execution window (X,Y): [Num_tensors, Dimensions >= axis] + win.set(Window::DimX, Window::Dimension(0, num_tensors, 1)); + win.set(Window::DimY, Window::Dimension(0, input_info->tensor_shape().total_size_upper(_axis), 1)); + } + else + { + _stack_fn = elementwise_stack; + win = calculate_max_window(*input_info); + } + + INEKernel::configure(win); +} + void NEStackLayerKernel::run(const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - Window window_out; - window_out.use_tensor_dimensions(_output->info()->tensor_shape()); - - Iterator input(_input, window); - Iterator output(_output, window_out); - - const int stride_x = _output->info()->strides_in_bytes()[0]; - const int stride_y = _output->info()->num_dimensions() >= 1 ? _output->info()->strides_in_bytes()[1] : 0; - const int stride_z = _output->info()->num_dimensions() >= 2 ? _output->info()->strides_in_bytes()[2] : 0; - const int stride_w = _output->info()->num_dimensions() >= 3 ? _output->info()->strides_in_bytes()[3] : 0; - const int stride_k = _output->info()->num_dimensions() >= 4 ? _output->info()->strides_in_bytes()[4] : 0; - - execute_window_loop(window, [&](const Coordinates & id) - { - Coordinates id_out = shift_from_axis_and_replace_coordinate(id, _axis, _idx_input); - const int idx = id_out[0] * stride_x + id_out[1] * stride_y + id_out[2] * stride_z + id_out[3] * stride_w + id_out[4] * stride_k; - std::memcpy(output.ptr() + idx, input.ptr(), _input->info()->element_size()); - }, - input); + _stack_fn(_input, _output, _axis, window); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEStackLayerKernel.h b/src/core/NEON/kernels/NEStackLayerKernel.h index 9b36518e4db42e529f3d5a9f4c8951c366371c81..02ee776ea426b2d4f6ad2d0ef9e12344de173f3a 100644 --- a/src/core/NEON/kernels/NEStackLayerKernel.h +++ b/src/core/NEON/kernels/NEStackLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,12 +22,16 @@ * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_NESTACKLAYERKERNEL_H -#define ARM_COMPUTE_NESTACKLAYERKERNEL_H +#ifndef ACL_SRC_CORE_NEON_KERNELS_NESTACKLAYERKERNEL_H +#define ACL_SRC_CORE_NEON_KERNELS_NESTACKLAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" +#include +#include + namespace arm_compute { class ITensor; @@ -56,38 +60,48 @@ public: * * @note Supported input tensor rank: up to 4 * - * @param[in] input Input tensor. Data types supported: All - * @param[in] axis The dimension to stack the tensors along. It must be smaller than the number of input dimensions. - * @param[in] idx_input Index of the input tensor in the list of tensors to stack. - * All tensors in the list must have the same shape - * @param[in] num_tensors Number of tensors to stack - * @param[out] output Output tensor. Data types supported: Same as @p input. + * @param[in] input Input tensors. Data types supported: All + * @param[in] axis The dimension to stack the tensors along. It must be smaller than the number of input dimensions. + * @param[out] output Output tensor. Data types supported: Same as @p input. * */ - void configure(const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output); + void configure(const std::vector &input, uint32_t axis, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NEStackLayerKernel * * @note Supported input tensor rank: up to 4 * - * @param[in] input Input tensor info. Data types supported: All - * @param[in] axis The dimension to stack the tensors along. It must be smaller than the number of input dimensions. - * @param[in] idx_input Index of the input tensor in the list of tensors to stack - * All tensors in the list must have the same shape - * @param[in] num_tensors Number of tensors to stack - * @param[in] output Output tensor info. Data types supported: Same as @p input. + * @param[in] input Input tensor infos. Data types supported: All + * @param[in] axis The dimension to stack the tensors along. It must be smaller than the number of input dimensions. + * @param[in] output Output tensor info. Data types supported: Same as @p input. * * @return a status */ - static Status validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output); + static Status validate(const std::vector &input, uint32_t axis, const ITensorInfo *output); + + /** Prepare the reshape kernel for execution (Only executed once) for + * choosing the window and the algorithm. 
+ */ + void prepare(); // Inherited methods overridden void run(const Window &window, const ThreadInfo &info) override; + /** Get the dimension to split the kernel workload + * + * @return the split dimension + */ + uint32_t get_split_dimension() const + { + return _split_dimension; + } + private: - const ITensor *_input; - ITensor *_output; - unsigned int _axis; - unsigned int _idx_input; + std::vector _input; + ITensor *_output; + uint32_t _axis; + uint32_t _split_dimension; + + std::function &, ITensor *, uint32_t, const Window &)> _stack_fn{}; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_NESTACKLAYERKERNEL_H */ +#endif // ACL_SRC_CORE_NEON_KERNELS_NESTACKLAYERKERNEL_H diff --git a/src/core/NEON/kernels/NEStridedSliceKernel.cpp b/src/core/NEON/kernels/NEStridedSliceKernel.cpp index 2b406a8b8b5e4e561c463fdb94add275240991d0..efff51be9d73aca009debd889875516655a7883a 100644 --- a/src/core/NEON/kernels/NEStridedSliceKernel.cpp +++ b/src/core/NEON/kernels/NEStridedSliceKernel.cpp @@ -26,9 +26,10 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Window.h" + #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -38,9 +39,14 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); @@ -49,19 +55,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, ARM_COMPUTE_RETURN_ERROR_ON(starts.num_dimensions() > input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(ends.num_dimensions() > input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(strides.num_dimensions() > input->num_dimensions()); - ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i) - { - return i == 0; - })); + ARM_COMPUTE_RETURN_ERROR_ON( + std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i) { return i == 0; })); // Get expected output shape - const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input, - starts, ends, strides, - begin_mask, end_mask, shrink_axis_mask); + const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape( + *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); ARM_COMPUTE_RETURN_ERROR_ON(exp_output_shape.total_size() == 0); // Checks output if configured - if(output->total_size() != 0) + if (output->total_size() != 0) { const TensorInfo exp_output_info = output->clone()->set_tensor_shape(exp_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &exp_output_info); @@ -71,14 +74,18 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, return Status{}; } 
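Aside on the NEStackLayerKernel rework above: configure() now only validates and stores the tensor list, while prepare() chooses the execution strategy at run time, falling back from the bulk-memcpy path to the element-wise path whenever any input (or the output) has padding holes. A minimal sketch of that deferred std::function dispatch follows; the Tensor struct and has_holes flag are stand-ins for the library's helpers, not its API.

// Illustrative sketch of runtime strategy selection via std::function
// (hypothetical names; the real kernel dispatches on tensor padding).
#include <functional>
#include <iostream>
#include <vector>

struct Tensor { bool has_holes; };

static void memcpy_stack(const std::vector<Tensor> &, Tensor &)      { std::cout << "memcpy path\n"; }
static void elementwise_stack(const std::vector<Tensor> &, Tensor &) { std::cout << "elementwise path\n"; }

class StackKernel
{
public:
    void configure(std::vector<Tensor> inputs, Tensor *output)
    {
        _inputs = std::move(inputs);
        _output = output;
    }

    // Deferred until just before execution, in case padding was added after configure().
    void prepare()
    {
        bool has_padding = _output->has_holes;
        for (const Tensor &t : _inputs)
        {
            has_padding = has_padding || t.has_holes;
        }
        _stack_fn = has_padding ? elementwise_stack : memcpy_stack;
    }

    void run() { _stack_fn(_inputs, *_output); }

private:
    std::vector<Tensor> _inputs;
    Tensor             *_output{nullptr};
    std::function<void(const std::vector<Tensor> &, Tensor &)> _stack_fn{};
};

int main()
{
    Tensor out{false};
    StackKernel k;
    k.configure({{false}, {false}}, &out);
    k.prepare();
    k.run(); // prints "memcpy path"
}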
-std::pair validate_and_configure_window(const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +std::pair validate_and_configure_window(const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { // Output tensor auto initialization if not yet initialized - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input, - starts, ends, strides, - begin_mask, end_mask, shrink_axis_mask); + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape( + *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape)); // Create window @@ -88,38 +95,49 @@ std::pair validate_and_configure_window(const ITensorInfo *input } } // namespace -NEStridedSliceKernel::NEStridedSliceKernel() - : _starts_abs(), _final_strides(), _shrink_mask() +NEStridedSliceKernel::NEStridedSliceKernel() : _starts_abs(), _final_strides(), _shrink_mask() { } -void NEStridedSliceKernel::configure(const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void NEStridedSliceKernel::configure(const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)); _shrink_mask = shrink_axis_mask; const TensorShape &input_shape = input->tensor_shape(); Coordinates ends_abs; - std::tie(_starts_abs, ends_abs, _final_strides) = arm_compute::helpers::tensor_transform::calculate_strided_slice_coords( - input_shape, - starts, ends, strides, - begin_mask, end_mask, shrink_axis_mask); + std::tie(_starts_abs, ends_abs, _final_strides) = + arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(input_shape, starts, ends, strides, + begin_mask, end_mask, shrink_axis_mask); // Configure kernel window - auto win_config = validate_and_configure_window(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + auto win_config = + validate_and_configure_window(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); INEKernel::configure(win_config.second); } -Status NEStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status NEStridedSliceKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, starts, ends, strides, begin_mask, 
end_mask, shrink_axis_mask)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), - starts, ends, strides, begin_mask, end_mask, shrink_axis_mask) - .first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), starts, ends, + strides, begin_mask, end_mask, shrink_axis_mask) + .first); return Status{}; } @@ -156,7 +174,7 @@ void NEStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, co size_t length_x = win.shape()[0]; - if(_final_strides[0] == 1 && !is_shrink_x) + if (_final_strides[0] == 1 && !is_shrink_x) { win.set(Window::DimX, Window::Dimension(0, 1, 1)); width_size = width_size * length_x; @@ -183,16 +201,17 @@ void NEStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, co uint8_t *cur_ptr; execute_window_loop( - win, [&](const Coordinates & id) - { - cur_ptr = input_base; - cur_ptr += (start_0 + (id[idx_x] * shrinked_stride_0)) * byte_increment_0; - cur_ptr += (start_1 + (id[idx_y] * shrinked_stride_1)) * byte_increment_1; - cur_ptr += (start_2 + (id[idx_z] * shrinked_stride_2)) * byte_increment_2; - cur_ptr += (start_3 + (id[idx_w] * shrinked_stride_3)) * byte_increment_3; - - std::copy_n(cur_ptr, width_size, output_it.ptr()); - }, - output_it); + win, + [&](const Coordinates &id) + { + cur_ptr = input_base; + cur_ptr += (start_0 + (id[idx_x] * shrinked_stride_0)) * byte_increment_0; + cur_ptr += (start_1 + (id[idx_y] * shrinked_stride_1)) * byte_increment_1; + cur_ptr += (start_2 + (id[idx_z] * shrinked_stride_2)) * byte_increment_2; + cur_ptr += (start_3 + (id[idx_w] * shrinked_stride_3)) * byte_increment_3; + + std::copy_n(cur_ptr, width_size, output_it.ptr()); + }, + output_it); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEStridedSliceKernel.h b/src/core/NEON/kernels/NEStridedSliceKernel.h index 9ce517417d21faafb5462c55a3c0ef7518df3cbc..a475f09a172419a60424bcb4e787a1c077ab763e 100644 --- a/src/core/NEON/kernels/NEStridedSliceKernel.h +++ b/src/core/NEON/kernels/NEStridedSliceKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NE_STRIDED_SLICE_KERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" #include @@ -68,9 +69,14 @@ public: * @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved. */ - void configure(const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask); + void configure(const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask); /** Static function to check if given info will lead to a valid configuration of @ref NEStridedSliceKernel * @@ -86,9 +92,14 @@ public: * @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved. 
*/ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NETileKernel.cpp b/src/core/NEON/kernels/NETileKernel.cpp index 94256dc12dc8f88e81ef2e6ca7e806e431b8797d..577ce5b69e3478809730628a53ccc1fe018281e3 100644 --- a/src/core/NEON/kernels/NETileKernel.cpp +++ b/src/core/NEON/kernels/NETileKernel.cpp @@ -27,9 +27,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -43,15 +44,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON(multiples.size() > 4); ARM_COMPUTE_RETURN_ERROR_ON(multiples.empty()); - ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e) - { - return e == 0; - })); + ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e) { return e == 0; })); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } @@ -59,8 +58,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c } } // namespace -NETileKernel::NETileKernel() - : _input(nullptr), _output(nullptr) +NETileKernel::NETileKernel() : _input(nullptr), _output(nullptr) { } @@ -95,8 +93,9 @@ void NETileKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - Window output_window{ window }; - output_window.set(Window::DimX, Window::Dimension(output_window.x().start(), output_window.x().end(), _input->info()->dimension(0))); + Window output_window{window}; + output_window.set(Window::DimX, Window::Dimension(output_window.x().start(), output_window.x().end(), + _input->info()->dimension(0))); Window out_slice = output_window.first_slice_window_1D(); const auto src_shape = _input->info()->tensor_shape(); @@ -104,17 +103,19 @@ void NETileKernel::run(const Window &window, const ThreadInfo &info) { Iterator output_it(_output, out_slice); - execute_window_loop(out_slice, [&](const Coordinates & id) - { - const size_t x = id.x(); - const size_t y = id.y(); - const size_t z = id.z(); - const size_t w = id[3]; - Coordinates input_coords{ x % src_shape[0], y % 
src_shape[1], z % src_shape[2], w % src_shape[3] }; - memcpy(output_it.ptr(), _input->ptr_to_element(input_coords), _input->info()->dimension(0) * _input->info()->element_size()); - }, - output_it); - } - while(output_window.slide_window_slice_1D(out_slice)); + execute_window_loop( + out_slice, + [&](const Coordinates &id) + { + const size_t x = id.x(); + const size_t y = id.y(); + const size_t z = id.z(); + const size_t w = id[3]; + Coordinates input_coords{x % src_shape[0], y % src_shape[1], z % src_shape[2], w % src_shape[3]}; + memcpy(output_it.ptr(), _input->ptr_to_element(input_coords), + _input->info()->dimension(0) * _input->info()->element_size()); + }, + output_it); + } while (output_window.slide_window_slice_1D(out_slice)); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp index efdaeeb1704900fe930efd529e928e73352ea961..3b444ae33398c2631e3e38d1190ad7089e152d35 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp @@ -32,6 +32,7 @@ #include "gemm_hybrid_indirect.hpp" #include "gemm_implementation.hpp" #include "gemm_interleaved.hpp" +#include "gemv_pretransposed.hpp" #include "kernels/a32_sgemm_8x6.hpp" #ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS @@ -42,6 +43,7 @@ #include "kernels/a64_hybrid_fp16_mla_6x32.hpp" #include "kernels/a64_sgemm_8x12.hpp" #ifdef ARM_COMPUTE_ENABLE_SME2 +#include "kernels/sme2_gemv_fp16fp32fp16_dot_16VL.hpp" #include "kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL.hpp" #include "kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL.hpp" #include "kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL.hpp" @@ -58,6 +60,13 @@ namespace arm_gemm { static const GemmImplementation<__fp16, __fp16> gemm_fp16_methods[] = { #ifdef ARM_COMPUTE_ENABLE_SVE #ifdef ARM_COMPUTE_ENABLE_SME2 +{ + GemmMethod::GEMM_HYBRID, + "sme2_gemv_fp16fp32fp16_dot_16VL", + [](const GemmArgs &args) { return args._ci->has_sme2() && args._Msize==1 && args._nbatches==1 && !args._indirect_input; }, + nullptr, + [](const GemmArgs &args) { return new GemvPretransposed(args); } +}, { GemmMethod::GEMM_INTERLEAVED, "sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL", diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp index 13f548e39ee3117e7ad083bfc62da91e93674fbc..362a3e30ea811c88b08a72ed87a98593ece01f6e 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp @@ -802,6 +802,13 @@ public: } } + Tr *result_ptr = this->_Cptr + (batch * this->_C_batch_stride) + (multi * this->_C_multi_stride); + + // If we are using an accumulation buffer and this isn't the last pass, don't pass a result pointer. + if (_accumulation_buffer && !last_pass) { + result_ptr = nullptr; + } + // Perform the kernel and merge step, either separately or together as required. 
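Aside on the gemm_interleaved change just above: when an accumulation buffer is in use, intermediate K passes no longer receive a result pointer, so only the final pass merges into the caller's C matrix. A short sketch of that "merge only on the last pass" pattern follows; the function and buffer names are hypothetical and stand in for the arm_gemm internals.

// Illustrative sketch: partial results accumulate in an internal buffer and
// only the pass given a non-null result pointer writes the output matrix.
#include <cstdio>
#include <vector>

void run_pass(std::vector<float> &acc, const std::vector<float> &partial, float *result_ptr)
{
    for (size_t i = 0; i < acc.size(); ++i)
    {
        acc[i] += partial[i];
    }
    if (result_ptr != nullptr) // only the last pass is asked to merge
    {
        for (size_t i = 0; i < acc.size(); ++i)
        {
            result_ptr[i] = acc[i];
        }
    }
}

int main()
{
    std::vector<float>       acc(4, 0.0f);
    std::vector<float>       c(4, 0.0f);
    const std::vector<float> partial{1.f, 2.f, 3.f, 4.f};
    const int                num_passes = 3;

    for (int pass = 0; pass < num_passes; ++pass)
    {
        const bool last_pass  = (pass == num_passes - 1);
        float     *result_ptr = last_pass ? c.data() : nullptr; // as in the change above
        run_pass(acc, partial, result_ptr);
    }
    std::printf("%g %g %g %g\n", c[0], c[1], c[2], c[3]); // 3 6 9 12
}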
kernel_and_merge::run( #ifdef CYCLE_PROFILING @@ -810,7 +817,7 @@ public: // Strategy and panel pointers strat, a_panel, b_ptr, this->_ldb, c_panel, // Result buffer pointers - this->_Cptr + (batch * this->_C_batch_stride) + (multi * this->_C_multi_stride), this->_ldc, + result_ptr, this->_ldc, // K size, and M/N ranges kern_k, start_row, end_row, start_x, end_x, // Only do bias on the first pass diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp16fp32fp16_dot_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp16fp32fp16_dot_16VL.hpp new file mode 100644 index 0000000000000000000000000000000000000000..50013e581c3bf4668ab3191c53bcce16e5be4314 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp16fp32fp16_dot_16VL.hpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once +#if defined(ARM_COMPUTE_ENABLE_SME2) +#include "../std_transforms_sme.hpp" + +#define ARGLIST \ + const __fp16 *, const __fp16 *, \ + __fp16 *, size_t, size_t, \ + const __fp16 *, Activation, bool + +namespace arm_gemm +{ +void sme2_gemv_fp16fp32fp16_dot_16VL( ARGLIST ); + +class cls_sme2_gemv_fp16fp32fp16_dot_16VL +{ +public: + typedef __fp16 operand_type; + typedef __fp16 result_type; + + typedef void (*kern_type)( ARGLIST ); + + static unsigned int out_width() + { + return sme::get_vector_length() * 16; + } + + static constexpr unsigned int k_unroll() + { + return 2; + } + + static constexpr bool supports_accumulate() + { + return false; + } + + static constexpr bool supports_bias() + { + return true; + } + + static constexpr bool supports_activation() + { + return true; + } + + + StdTransformsSME transforms = {}; + + + // Default to the generic kernel + kern_type kernel=sme2_gemv_fp16fp32fp16_dot_16VL; + cls_sme2_gemv_fp16fp32fp16_dot_16VL(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#undef ARGLIST + +#endif // defined(ARM_COMPUTE_ENABLE_SME2) diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp16fp32fp16_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp16fp32fp16_dot_16VL/generic.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1067a8548aa9e65bf218fe5af12594d299527619 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp16fp32fp16_dot_16VL/generic.cpp @@ -0,0 +1,592 @@ +/* + * Copyright (c) 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if defined(ARM_COMPUTE_ENABLE_SME2) + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include +#include + +namespace arm_gemm { + +void sme2_gemv_fp16fp32fp16_dot_16VL ( + const __fp16 *A_ptr, const __fp16 *B_ptr, __fp16 *output_ptr, + size_t N, size_t K, + const __fp16 *bias, Activation act, bool +) +{ + struct KernelArgs { + __fp16 maxval = static_cast<__fp16>(std::numeric_limits::infinity()); + __fp16 minval = - static_cast<__fp16>(std::numeric_limits::infinity()); + const __fp16 *B_ptr = {}; + size_t output_offset = {}; + unsigned int input_initial_col = {}; + } ka; + + unsigned long flags=0; + ka.B_ptr = B_ptr; + switch(act.type) { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + ka.maxval = static_cast<__fp16>(act.param1); + /* fall through */ + case Activation::Type::ReLU: + ka.minval = 0; + flags |= 0x2; + break; + } + __asm__ __volatile__( + "ptrue p8.b\n" + ".inst 0xd503477f // SMSTART ZA\n" + "mov x9, #0x0\n" + "cntw x28, ALL, MUL #4\n" + "mov x27, %x[B_ptr]\n" + "add x26, %x[N], x28\n" + "mov x25, %x[output_ptr]\n" + "sub x26, x26, #0x1\n" + "ptrue p1.b\n" + "udiv x26, x26, x28\n" + ".inst 0x25207811 // ptrue pn9.b\n" + "add x22, x26, #0x3\n" + "mov x21, #0x1\n" + "and x22, x22, #0xfffffffffffffffc\n" + "mul x22, x22, x28\n" + "mul x22, x22, %x[K]\n" + "lsl x22, x22, #0x1\n" + "1:" // RHS size check loop + "cmp x22, #0x200000\n" + "blt 2f\n" + "tbnz x22, #0, 3f\n" + "lsr x22, x22, #0x1\n" + "lsl x21, x21, #0x1\n" + "b 1b\n" + "2:" // RHS do prefetch + "lsl x20, x22, #0x26\n" + "sub x21, x21, #0x1\n" + "lsl x21, x21, #0x16\n" + "orr x22, x22, x20\n" + "orr x22, x22, x21\n" + ".inst 0xf8b64b7a // rprfm pldonce, x22, [x27]\n" + "3:" // RHS prefetch exit + "mov x24, %x[bias]\n" + "4:" // Column loop + "cmp x26, #0x4\n" + "bge 28f\n" + "cmp x26, #0x2\n" + "bgt 20f\n" + "beq 12f\n" + "mov x23, %x[A_ptr]\n" + "lsl x21, %x[K], #0x1\n" + "mov x20, %x[N]\n" + "mov x22, %x[K]\n" + ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n" + ".inst 0x257467f0 // whilelt p8.h, XZR, x20, VLx4\n" + "cbz x24, 5f\n" + ".inst 0xa040c700 // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n" + ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n" + "b 6f\n" + "5:" // Width 1: no bias + ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" + "6:" // Width 1: setup done + "cmp x22, #0x8\n" + "ble 8f\n" + "7:" // Width 1: 
Multiply loop: Main loop head + "whilelt p0.h, XZR, x22\n" + ".inst 0xa040a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27]\n" + "addvl x27, x27, #16\n" + "ld1rqh { z14.h }, p0/Z, [x23]\n" + "sub x22, x22, #0x8\n" + "add x23, x23, #0x10\n" + ".inst 0xa040a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27]\n" + "addvl x27, x27, #16\n" + "cmp x22, #0x8\n" + ".inst 0xa040a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27]\n" + "addvl x27, x27, #16\n" + ".inst 0xc15eb308 // fdot za.s[x9, 0], { z24.h-z27.h }, z14.h[0]\n" + ".inst 0xa040a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27]\n" + "addvl x27, x27, #16\n" + ".inst 0xc15eb608 // fdot za.s[x9, 0], { z16.h-z19.h }, z14.h[1]\n" + ".inst 0xc15eb808 // fdot za.s[x9, 0], { z0.h-z3.h }, z14.h[2]\n" + ".inst 0xc15ebf88 // fdot za.s[x9, 0], { z28.h-z31.h }, z14.h[3]\n" + "bgt 7b\n" + "8:" // Width 1: Multiply loop: Single iteration only + "whilelt p0.h, XZR, x22\n" + ".inst 0xa040a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27]\n" + "subs x22, x22, #0x2\n" + "ld1rqh { z7.h }, p0/Z, [x23]\n" + "add x23, x23, #0x10\n" + "addvl x27, x27, #16\n" + ".inst 0xc157b388 // fdot za.s[x9, 0], { z28.h-z31.h }, z7.h[0]\n" + "ble 9f\n" + ".inst 0xa040a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27]\n" + "subs x22, x22, #0x2\n" + "addvl x27, x27, #16\n" + ".inst 0xc157b508 // fdot za.s[x9, 0], { z8.h-z11.h }, z7.h[1]\n" + "ble 9f\n" + ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n" + "subs x22, x22, #0x2\n" + "addvl x27, x27, #16\n" + ".inst 0xc157b988 // fdot za.s[x9, 0], { z12.h-z15.h }, z7.h[2]\n" + "ble 9f\n" + ".inst 0xa040a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27]\n" + "addvl x27, x27, #16\n" + ".inst 0xc157bd08 // fdot za.s[x9, 0], { z8.h-z11.h }, z7.h[3]\n" + "9:" // Width 1: Multiply loop: multiply skip + "tbz %x[flags], #1, 10f\n" + ".inst 0xc0062c10 // mova { z16.d-z19.d }, za.d[x9, #0]\n" + "add x21, %x[args_ptr], %[offset_min]\n" + "add x20, %x[args_ptr], %[offset_max]\n" + "ld1rh { z2.h }, p1/Z, [x21]\n" + "ld1rh { z1.h }, p1/Z, [x20]\n" + ".inst 0xc120e210 // fcvt z16.h, { z16.s-z17.s }\n" + ".inst 0xc120e251 // fcvt z17.h, { z18.s-z19.s }\n" + ".inst 0xc161c050 // fclamp { z16.h-z17.h }, z2.h, z1.h\n" + ".inst 0xa0602330 // st1h { z16.h-z17.h }, p8, [x25]\n" + "addvl x25, x25, #2\n" + "b 11f\n" + "10:" // Width 1: No activation + ".inst 0xc0062c04 // mova { z4.d-z7.d }, za.d[x9, #0]\n" + ".inst 0xc120e09e // fcvt z30.h, { z4.s-z5.s }\n" + ".inst 0xc120e0df // fcvt z31.h, { z6.s-z7.s }\n" + ".inst 0xa060233e // st1h { z30.h-z31.h }, p8, [x25]\n" + "addvl x25, x25, #2\n" + "11:" // Width 1: Output done + "b 36f\n" + "12:" // Width 2 + "mov x23, %x[A_ptr]\n" + "lsl x21, %x[K], #0x1\n" + "sub x20, %x[N], x28\n" + "mov x22, %x[K]\n" + ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n" + ".inst 0x257467f0 // whilelt p8.h, XZR, x20, VLx4\n" + "cbz x24, 13f\n" + ".inst 0xa040c71c // ld1w { z28.s-z31.s }, pn9.b/Z, [x24]\n" + ".inst 0xa041c700 // ld1w { z0.s-z3.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n" + ".inst 0xc0042f80 // mova za.d[x9, #0], { z28.d-z31.d }\n" + ".inst 0xc0042c01 // mova za.d[x9, #1], { z0.d-z3.d }\n" + "b 14f\n" + "13:" // Width 2: no bias + ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" + "14:" // Width 2: setup done + "cmp x22, #0x8\n" + "ble 16f\n" + "15:" // Width 2: Multiply loop: Main loop head + "whilelt p0.h, XZR, x22\n" + ".inst 0xa040a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27]\n" + "sub x22, x22, #0x8\n" + "ld1rqh { z1.h }, p0/Z, [x23]\n" + "cmp x22, #0x8\n" + "add x23, x23, #0x10\n" + ".inst 
0xa041a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + "addvl x27, x27, #16\n" + ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n" + ".inst 0xc151b308 // fdot za.s[x9, 0], { z24.h-z27.h }, z1.h[0]\n" + ".inst 0xa041a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + "addvl x27, x27, #16\n" + ".inst 0xc151b289 // fdot za.s[x9, 1], { z20.h-z23.h }, z1.h[0]\n" + ".inst 0xa040a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27]\n" + ".inst 0xa041a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + "addvl x27, x27, #16\n" + ".inst 0xc151b488 // fdot za.s[x9, 0], { z4.h-z7.h }, z1.h[1]\n" + ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n" + ".inst 0xc151b589 // fdot za.s[x9, 1], { z12.h-z15.h }, z1.h[1]\n" + ".inst 0xa041a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + "addvl x27, x27, #16\n" + ".inst 0xc151bb88 // fdot za.s[x9, 0], { z28.h-z31.h }, z1.h[2]\n" + ".inst 0xc151b909 // fdot za.s[x9, 1], { z8.h-z11.h }, z1.h[2]\n" + ".inst 0xc151bc88 // fdot za.s[x9, 0], { z4.h-z7.h }, z1.h[3]\n" + ".inst 0xc151bd89 // fdot za.s[x9, 1], { z12.h-z15.h }, z1.h[3]\n" + "bgt 15b\n" + "16:" // Width 2: Multiply loop: Single iteration only + "whilelt p0.h, XZR, x22\n" + ".inst 0xa040a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27]\n" + "subs x22, x22, #0x2\n" + "ld1rqh { z7.h }, p0/Z, [x23]\n" + "add x23, x23, #0x10\n" + ".inst 0xa041a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + "addvl x27, x27, #16\n" + ".inst 0xc157b108 // fdot za.s[x9, 0], { z8.h-z11.h }, z7.h[0]\n" + ".inst 0xc157b309 // fdot za.s[x9, 1], { z24.h-z27.h }, z7.h[0]\n" + "ble 17f\n" + ".inst 0xa040a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27]\n" + "subs x22, x22, #0x2\n" + ".inst 0xa041a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + "addvl x27, x27, #16\n" + ".inst 0xc157b708 // fdot za.s[x9, 0], { z24.h-z27.h }, z7.h[1]\n" + ".inst 0xc157b689 // fdot za.s[x9, 1], { z20.h-z23.h }, z7.h[1]\n" + "ble 17f\n" + ".inst 0xa040a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27]\n" + "subs x22, x22, #0x2\n" + ".inst 0xa041a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + "addvl x27, x27, #16\n" + ".inst 0xc157b808 // fdot za.s[x9, 0], { z0.h-z3.h }, z7.h[2]\n" + ".inst 0xc157ba09 // fdot za.s[x9, 1], { z16.h-z19.h }, z7.h[2]\n" + "ble 17f\n" + ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n" + ".inst 0xa041a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + "addvl x27, x27, #16\n" + ".inst 0xc157bd88 // fdot za.s[x9, 0], { z12.h-z15.h }, z7.h[3]\n" + ".inst 0xc157bf89 // fdot za.s[x9, 1], { z28.h-z31.h }, z7.h[3]\n" + "17:" // Width 2: Multiply loop: multiply skip + "tbz %x[flags], #1, 18f\n" + ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n" + "add x21, %x[args_ptr], %[offset_min]\n" + "add x20, %x[args_ptr], %[offset_max]\n" + ".inst 0xc0062c30 // mova { z16.d-z19.d }, za.d[x9, #1]\n" + "ld1rh { z13.h }, p1/Z, [x21]\n" + "ld1rh { z0.h }, p1/Z, [x20]\n" + ".inst 0xc120e38e // fcvt z14.h, { z28.s-z29.s }\n" + ".inst 0xc120e3cf // fcvt z15.h, { z30.s-z31.s }\n" + ".inst 0xc120e218 // fcvt z24.h, { z16.s-z17.s }\n" + ".inst 0xc120e259 // fcvt z25.h, { z18.s-z19.s }\n" + ".inst 0xc160c1ae // fclamp { z14.h-z15.h }, z13.h, z0.h\n" + ".inst 0xc160c1b8 // fclamp { z24.h-z25.h }, z13.h, z0.h\n" + ".inst 0xa060272e // st1h { z14.h-z15.h }, pn9.b, [x25]\n" + ".inst 0xa0612338 // st1h { z24.h-z25.h }, p8, [x25, #0x2, MUL VL]\n" + "addvl x25, x25, #4\n" + "b 19f\n" + "18:" // Width 2: No activation 
+ ".inst 0xc0062c10 // mova { z16.d-z19.d }, za.d[x9, #0]\n" + ".inst 0xc0062c38 // mova { z24.d-z27.d }, za.d[x9, #1]\n" + ".inst 0xc120e205 // fcvt z5.h, { z16.s-z17.s }\n" + ".inst 0xc120e24d // fcvt z13.h, { z18.s-z19.s }\n" + ".inst 0xa1602725 // st1h { z5.h, z13.h }, pn9.b, [x25]\n" + ".inst 0xc120e316 // fcvt z22.h, { z24.s-z25.s }\n" + ".inst 0xc120e35e // fcvt z30.h, { z26.s-z27.s }\n" + ".inst 0xa1612336 // st1h { z22.h, z30.h }, p8, [x25, #0x2, MUL VL]\n" + "addvl x25, x25, #4\n" + "19:" // Width 2: Output done + "b 36f\n" + "20:" // Width 3 + "mov x20, #0x2\n" + "mov x23, %x[A_ptr]\n" + "lsl x21, %x[K], #0x1\n" + "msub x20, x28, x20, %x[N]\n" + "mov x22, %x[K]\n" + ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n" + ".inst 0x257467f0 // whilelt p8.h, XZR, x20, VLx4\n" + "cbz x24, 21f\n" + ".inst 0xa040c718 // ld1w { z24.s-z27.s }, pn9.b/Z, [x24]\n" + ".inst 0xa041c70c // ld1w { z12.s-z15.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n" + ".inst 0xa042c708 // ld1w { z8.s-z11.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n" + ".inst 0xc0042f00 // mova za.d[x9, #0], { z24.d-z27.d }\n" + ".inst 0xc0042d81 // mova za.d[x9, #1], { z12.d-z15.d }\n" + ".inst 0xc0042d02 // mova za.d[x9, #2], { z8.d-z11.d }\n" + "b 22f\n" + "21:" // Width 3: no bias + ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" + "22:" // Width 3: setup done + "cmp x22, #0x8\n" + "ble 24f\n" + "23:" // Width 3: Multiply loop: Main loop head + "whilelt p0.h, XZR, x22\n" + ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n" + "sub x22, x22, #0x8\n" + "ld1rqh { z7.h }, p0/Z, [x23]\n" + "cmp x22, #0x8\n" + "add x23, x23, #0x10\n" + ".inst 0xa041a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + ".inst 0xa042a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n" + "addvl x27, x27, #16\n" + ".inst 0xc157b188 // fdot za.s[x9, 0], { z12.h-z15.h }, z7.h[0]\n" + ".inst 0xa040a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27]\n" + ".inst 0xc157b309 // fdot za.s[x9, 1], { z24.h-z27.h }, z7.h[0]\n" + ".inst 0xa041a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + ".inst 0xc157b38a // fdot za.s[x9, 2], { z28.h-z31.h }, z7.h[0]\n" + ".inst 0xa042a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n" + "addvl x27, x27, #16\n" + ".inst 0xc157b408 // fdot za.s[x9, 0], { z0.h-z3.h }, z7.h[1]\n" + ".inst 0xa040a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27]\n" + ".inst 0xc157b609 // fdot za.s[x9, 1], { z16.h-z19.h }, z7.h[1]\n" + ".inst 0xa041a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + ".inst 0xc157b58a // fdot za.s[x9, 2], { z12.h-z15.h }, z7.h[1]\n" + ".inst 0xa042a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n" + "addvl x27, x27, #16\n" + ".inst 0xc157bb88 // fdot za.s[x9, 0], { z28.h-z31.h }, z7.h[2]\n" + ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n" + ".inst 0xc157b909 // fdot za.s[x9, 1], { z8.h-z11.h }, z7.h[2]\n" + ".inst 0xa041a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + ".inst 0xc157b80a // fdot za.s[x9, 2], { z0.h-z3.h }, z7.h[2]\n" + ".inst 0xa042a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n" + "addvl x27, x27, #16\n" + ".inst 0xc157bd88 // fdot za.s[x9, 0], { z12.h-z15.h }, z7.h[3]\n" + ".inst 0xc157be09 // fdot za.s[x9, 1], { z16.h-z19.h }, z7.h[3]\n" + ".inst 0xc157be8a // fdot za.s[x9, 2], { z20.h-z23.h }, z7.h[3]\n" + "bgt 23b\n" + "24:" // Width 3: Multiply loop: Single iteration only + "whilelt p0.h, XZR, x22\n" + ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, 
[x27]\n" + "subs x22, x22, #0x2\n" + "ld1rqh { z7.h }, p0/Z, [x23]\n" + "add x23, x23, #0x10\n" + ".inst 0xa041a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + ".inst 0xa042a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n" + "addvl x27, x27, #16\n" + ".inst 0xc157b188 // fdot za.s[x9, 0], { z12.h-z15.h }, z7.h[0]\n" + ".inst 0xc157b209 // fdot za.s[x9, 1], { z16.h-z19.h }, z7.h[0]\n" + ".inst 0xc157b10a // fdot za.s[x9, 2], { z8.h-z11.h }, z7.h[0]\n" + "ble 25f\n" + ".inst 0xa040a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27]\n" + "subs x22, x22, #0x2\n" + ".inst 0xa041a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + ".inst 0xa042a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n" + "addvl x27, x27, #16\n" + ".inst 0xc157b608 // fdot za.s[x9, 0], { z16.h-z19.h }, z7.h[1]\n" + ".inst 0xc157b409 // fdot za.s[x9, 1], { z0.h-z3.h }, z7.h[1]\n" + ".inst 0xc157b68a // fdot za.s[x9, 2], { z20.h-z23.h }, z7.h[1]\n" + "ble 25f\n" + ".inst 0xa040a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27]\n" + "subs x22, x22, #0x2\n" + ".inst 0xa041a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + ".inst 0xa042a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n" + "addvl x27, x27, #16\n" + ".inst 0xc157bb08 // fdot za.s[x9, 0], { z24.h-z27.h }, z7.h[2]\n" + ".inst 0xc157ba89 // fdot za.s[x9, 1], { z20.h-z23.h }, z7.h[2]\n" + ".inst 0xc157b90a // fdot za.s[x9, 2], { z8.h-z11.h }, z7.h[2]\n" + "ble 25f\n" + ".inst 0xa040a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27]\n" + ".inst 0xa041a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + ".inst 0xa042a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n" + "addvl x27, x27, #16\n" + ".inst 0xc157bc08 // fdot za.s[x9, 0], { z0.h-z3.h }, z7.h[3]\n" + ".inst 0xc157bf89 // fdot za.s[x9, 1], { z28.h-z31.h }, z7.h[3]\n" + ".inst 0xc157bd8a // fdot za.s[x9, 2], { z12.h-z15.h }, z7.h[3]\n" + "25:" // Width 3: Multiply loop: multiply skip + "tbz %x[flags], #1, 26f\n" + ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n" + "add x21, %x[args_ptr], %[offset_min]\n" + "add x20, %x[args_ptr], %[offset_max]\n" + ".inst 0xc0062c24 // mova { z4.d-z7.d }, za.d[x9, #1]\n" + "ld1rh { z17.h }, p1/Z, [x21]\n" + ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n" + "ld1rh { z16.h }, p1/Z, [x20]\n" + ".inst 0xc120e18c // fcvt z12.h, { z12.s-z13.s }\n" + ".inst 0xc120e1cd // fcvt z13.h, { z14.s-z15.s }\n" + ".inst 0xc120e092 // fcvt z18.h, { z4.s-z5.s }\n" + ".inst 0xc120e0d3 // fcvt z19.h, { z6.s-z7.s }\n" + ".inst 0xc170c22c // fclamp { z12.h-z13.h }, z17.h, z16.h\n" + ".inst 0xc170c232 // fclamp { z18.h-z19.h }, z17.h, z16.h\n" + ".inst 0xc120e00e // fcvt z14.h, { z0.s-z1.s }\n" + ".inst 0xc120e04f // fcvt z15.h, { z2.s-z3.s }\n" + ".inst 0xc170c22e // fclamp { z14.h-z15.h }, z17.h, z16.h\n" + ".inst 0xa060272c // st1h { z12.h-z13.h }, pn9.b, [x25]\n" + ".inst 0xa0612732 // st1h { z18.h-z19.h }, pn9.b, [x25, #0x2, MUL VL]\n" + ".inst 0xa062232e // st1h { z14.h-z15.h }, p8, [x25, #0x4, MUL VL]\n" + "addvl x25, x25, #6\n" + "b 27f\n" + "26:" // Width 3: No activation + ".inst 0xc0062c04 // mova { z4.d-z7.d }, za.d[x9, #0]\n" + ".inst 0xc0062c20 // mova { z0.d-z3.d }, za.d[x9, #1]\n" + ".inst 0xc0062c48 // mova { z8.d-z11.d }, za.d[x9, #2]\n" + ".inst 0xc120e091 // fcvt z17.h, { z4.s-z5.s }\n" + ".inst 0xc120e0d9 // fcvt z25.h, { z6.s-z7.s }\n" + ".inst 0xa1602731 // st1h { z17.h, z25.h }, pn9.b, [x25]\n" + ".inst 0xc120e012 // fcvt z18.h, { z0.s-z1.s }\n" + ".inst 0xc120e053 // 
fcvt z19.h, { z2.s-z3.s }\n" + ".inst 0xa0612732 // st1h { z18.h-z19.h }, pn9.b, [x25, #0x2, MUL VL]\n" + ".inst 0xc120e111 // fcvt z17.h, { z8.s-z9.s }\n" + ".inst 0xc120e159 // fcvt z25.h, { z10.s-z11.s }\n" + ".inst 0xa1622331 // st1h { z17.h, z25.h }, p8, [x25, #0x4, MUL VL]\n" + "addvl x25, x25, #6\n" + "27:" // Width 3: Output done + "b 36f\n" + "28:" // Width 4 + "mov x20, #0x3\n" + "mov x23, %x[A_ptr]\n" + "lsl x21, %x[K], #0x1\n" + "msub x20, x28, x20, %x[N]\n" + "mov x22, %x[K]\n" + ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n" + ".inst 0x257467f0 // whilelt p8.h, XZR, x20, VLx4\n" + "cbz x24, 29f\n" + ".inst 0xa040c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24]\n" + ".inst 0xa041c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n" + ".inst 0xa042c708 // ld1w { z8.s-z11.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n" + ".inst 0xa043c71c // ld1w { z28.s-z31.s }, pn9.b/Z, [x24, #0xc, MUL VL]\n" + ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n" + "addvl x24, x24, #16\n" + ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n" + ".inst 0xc0042d02 // mova za.d[x9, #2], { z8.d-z11.d }\n" + ".inst 0xc0042f83 // mova za.d[x9, #3], { z28.d-z31.d }\n" + "b 30f\n" + "29:" // Width 4: no bias + ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" + "30:" // Width 4: setup done + "cmp x22, #0x8\n" + "ble 32f\n" + "31:" // Width 4: Multiply loop: Main loop head + "whilelt p0.h, XZR, x22\n" + ".inst 0xa040a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27]\n" + "sub x22, x22, #0x8\n" + "ld1rqh { z7.h }, p0/Z, [x23]\n" + "cmp x22, #0x8\n" + "add x23, x23, #0x10\n" + ".inst 0xa041a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + ".inst 0xa042a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n" + ".inst 0xa043a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n" + ".inst 0xc157b288 // fdot za.s[x9, 0], { z20.h-z23.h }, z7.h[0]\n" + "addvl x27, x27, #16\n" + ".inst 0xc157b189 // fdot za.s[x9, 1], { z12.h-z15.h }, z7.h[0]\n" + ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n" + ".inst 0xc157b38a // fdot za.s[x9, 2], { z28.h-z31.h }, z7.h[0]\n" + ".inst 0xa041a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + ".inst 0xc157b10b // fdot za.s[x9, 3], { z8.h-z11.h }, z7.h[0]\n" + ".inst 0xa042a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n" + ".inst 0xa043a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n" + ".inst 0xc157b588 // fdot za.s[x9, 0], { z12.h-z15.h }, z7.h[1]\n" + "addvl x27, x27, #16\n" + ".inst 0xc157b409 // fdot za.s[x9, 1], { z0.h-z3.h }, z7.h[1]\n" + ".inst 0xa040a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27]\n" + ".inst 0xc157b50a // fdot za.s[x9, 2], { z8.h-z11.h }, z7.h[1]\n" + ".inst 0xa041a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + ".inst 0xc157b70b // fdot za.s[x9, 3], { z24.h-z27.h }, z7.h[1]\n" + ".inst 0xa042a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n" + ".inst 0xa043a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n" + ".inst 0xc157b808 // fdot za.s[x9, 0], { z0.h-z3.h }, z7.h[2]\n" + "addvl x27, x27, #16\n" + ".inst 0xc157b909 // fdot za.s[x9, 1], { z8.h-z11.h }, z7.h[2]\n" + ".inst 0xa040a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27]\n" + ".inst 0xc157b98a // fdot za.s[x9, 2], { z12.h-z15.h }, z7.h[2]\n" + ".inst 0xa041a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + ".inst 0xc157bb0b // fdot za.s[x9, 3], { z24.h-z27.h }, z7.h[2]\n" + ".inst 0xa042a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, 
#0x8, MUL VL]\n" + ".inst 0xa043a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n" + ".inst 0xc157bf88 // fdot za.s[x9, 0], { z28.h-z31.h }, z7.h[3]\n" + "addvl x27, x27, #16\n" + ".inst 0xc157bc09 // fdot za.s[x9, 1], { z0.h-z3.h }, z7.h[3]\n" + ".inst 0xc157bd0a // fdot za.s[x9, 2], { z8.h-z11.h }, z7.h[3]\n" + ".inst 0xc157bd8b // fdot za.s[x9, 3], { z12.h-z15.h }, z7.h[3]\n" + "bgt 31b\n" + "32:" // Width 4: Multiply loop: Single iteration only + "whilelt p0.h, XZR, x22\n" + ".inst 0xa040a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27]\n" + "subs x22, x22, #0x2\n" + "ld1rqh { z7.h }, p0/Z, [x23]\n" + "add x23, x23, #0x10\n" + ".inst 0xa041a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + ".inst 0xa042a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n" + ".inst 0xa043a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n" + ".inst 0xc157b108 // fdot za.s[x9, 0], { z8.h-z11.h }, z7.h[0]\n" + "addvl x27, x27, #16\n" + ".inst 0xc157b009 // fdot za.s[x9, 1], { z0.h-z3.h }, z7.h[0]\n" + ".inst 0xc157b20a // fdot za.s[x9, 2], { z16.h-z19.h }, z7.h[0]\n" + ".inst 0xc157b18b // fdot za.s[x9, 3], { z12.h-z15.h }, z7.h[0]\n" + "ble 33f\n" + ".inst 0xa040a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27]\n" + "subs x22, x22, #0x2\n" + ".inst 0xa041a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + ".inst 0xa042a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n" + ".inst 0xa043a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n" + ".inst 0xc157b508 // fdot za.s[x9, 0], { z8.h-z11.h }, z7.h[1]\n" + "addvl x27, x27, #16\n" + ".inst 0xc157b789 // fdot za.s[x9, 1], { z28.h-z31.h }, z7.h[1]\n" + ".inst 0xc157b40a // fdot za.s[x9, 2], { z0.h-z3.h }, z7.h[1]\n" + ".inst 0xc157b68b // fdot za.s[x9, 3], { z20.h-z23.h }, z7.h[1]\n" + "ble 33f\n" + ".inst 0xa040a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27]\n" + "subs x22, x22, #0x2\n" + ".inst 0xa041a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + ".inst 0xa042a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n" + ".inst 0xa043a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n" + ".inst 0xc157b908 // fdot za.s[x9, 0], { z8.h-z11.h }, z7.h[2]\n" + "addvl x27, x27, #16\n" + ".inst 0xc157b989 // fdot za.s[x9, 1], { z12.h-z15.h }, z7.h[2]\n" + ".inst 0xc157bb8a // fdot za.s[x9, 2], { z28.h-z31.h }, z7.h[2]\n" + ".inst 0xc157ba0b // fdot za.s[x9, 3], { z16.h-z19.h }, z7.h[2]\n" + "ble 33f\n" + ".inst 0xa040a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27]\n" + ".inst 0xa041a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + ".inst 0xa042a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n" + ".inst 0xa043a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n" + ".inst 0xc157bd08 // fdot za.s[x9, 0], { z8.h-z11.h }, z7.h[3]\n" + "addvl x27, x27, #16\n" + ".inst 0xc157bf89 // fdot za.s[x9, 1], { z28.h-z31.h }, z7.h[3]\n" + ".inst 0xc157bd8a // fdot za.s[x9, 2], { z12.h-z15.h }, z7.h[3]\n" + ".inst 0xc157be8b // fdot za.s[x9, 3], { z20.h-z23.h }, z7.h[3]\n" + "33:" // Width 4: Multiply loop: multiply skip + "tbz %x[flags], #1, 34f\n" + ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n" + "add x21, %x[args_ptr], %[offset_min]\n" + "add x20, %x[args_ptr], %[offset_max]\n" + ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n" + "ld1rh { z19.h }, p1/Z, [x21]\n" + ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n" + "ld1rh { z18.h }, p1/Z, [x20]\n" + ".inst 0xc0062c64 // mova { z4.d-z7.d }, za.d[x9, #3]\n" + 
".inst 0xc120e38a // fcvt z10.h, { z28.s-z29.s }\n" + ".inst 0xc120e3cb // fcvt z11.h, { z30.s-z31.s }\n" + ".inst 0xc120e18c // fcvt z12.h, { z12.s-z13.s }\n" + ".inst 0xc120e1cd // fcvt z13.h, { z14.s-z15.s }\n" + ".inst 0xc172c26a // fclamp { z10.h-z11.h }, z19.h, z18.h\n" + ".inst 0xc172c26c // fclamp { z12.h-z13.h }, z19.h, z18.h\n" + ".inst 0xc120e00e // fcvt z14.h, { z0.s-z1.s }\n" + ".inst 0xc120e04f // fcvt z15.h, { z2.s-z3.s }\n" + ".inst 0xc172c26e // fclamp { z14.h-z15.h }, z19.h, z18.h\n" + ".inst 0xc120e090 // fcvt z16.h, { z4.s-z5.s }\n" + ".inst 0xc120e0d1 // fcvt z17.h, { z6.s-z7.s }\n" + ".inst 0xc172c270 // fclamp { z16.h-z17.h }, z19.h, z18.h\n" + ".inst 0xa060272a // st1h { z10.h-z11.h }, pn9.b, [x25]\n" + ".inst 0xa061272c // st1h { z12.h-z13.h }, pn9.b, [x25, #0x2, MUL VL]\n" + ".inst 0xa062272e // st1h { z14.h-z15.h }, pn9.b, [x25, #0x4, MUL VL]\n" + ".inst 0xa0632330 // st1h { z16.h-z17.h }, p8, [x25, #0x6, MUL VL]\n" + "addvl x25, x25, #8\n" + "b 35f\n" + "34:" // Width 4: No activation + ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n" + ".inst 0xc0062c30 // mova { z16.d-z19.d }, za.d[x9, #1]\n" + ".inst 0xc0062c5c // mova { z28.d-z31.d }, za.d[x9, #2]\n" + ".inst 0xc0062c68 // mova { z8.d-z11.d }, za.d[x9, #3]\n" + ".inst 0xc120e187 // fcvt z7.h, { z12.s-z13.s }\n" + ".inst 0xc120e1cf // fcvt z15.h, { z14.s-z15.s }\n" + ".inst 0xa1602727 // st1h { z7.h, z15.h }, pn9.b, [x25]\n" + ".inst 0xc120e207 // fcvt z7.h, { z16.s-z17.s }\n" + ".inst 0xc120e24f // fcvt z15.h, { z18.s-z19.s }\n" + ".inst 0xa1612727 // st1h { z7.h, z15.h }, pn9.b, [x25, #0x2, MUL VL]\n" + ".inst 0xc120e38e // fcvt z14.h, { z28.s-z29.s }\n" + ".inst 0xc120e3cf // fcvt z15.h, { z30.s-z31.s }\n" + ".inst 0xa062272e // st1h { z14.h-z15.h }, pn9.b, [x25, #0x4, MUL VL]\n" + ".inst 0xc120e112 // fcvt z18.h, { z8.s-z9.s }\n" + ".inst 0xc120e15a // fcvt z26.h, { z10.s-z11.s }\n" + ".inst 0xa1632332 // st1h { z18.h, z26.h }, p8, [x25, #0x6, MUL VL]\n" + "addvl x25, x25, #8\n" + "35:" // Width 4: Output done + "subs x26, x26, #0x4\n" + "sub %x[N], %x[N], x28, LSL #2\n" + "bgt 4b\n" + "36:" // Exit + ".inst 0xd503467f // SMSTOP\n" + "ptrue p8.b\n" + : [N] "+&r" (N) + : [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [output_ptr] "r" (output_ptr) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm + +#endif // defined(ARM_COMPUTE_ENABLE_SME2) diff --git a/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp b/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp index bea455ca67542b611ba7dff5378b286148565b86..989bb17dfb583e5a848944a860b7f70c58eb3b43 100644 --- a/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp +++ b/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 Arm Limited. + * Copyright (c) 2017-2018,2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -274,7 +274,13 @@ void MergeResults<8, 6, false>(float *out, const float *in, const int ldout, con "VMIN.f32 q6, q6, %q[maxv]\n" "VMIN.f32 q7, q7, %q[maxv]\n" "VST1.32 {d12-d15}, [%[outptr3]]!\n" + : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), + [inptr] "+r" (inptr) + : [minv] "w" (minv), [maxv] "w" (maxv), [biasptr] "r" (biasptr) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory" + ); + __asm __volatile ( // Rows 4-5 "VLD1.32 {d8-d11}, [%[inptr]]!\n" "VLD1.32 {d12-d15}, [%[inptr]]!\n" @@ -296,7 +302,7 @@ void MergeResults<8, 6, false>(float *out, const float *in, const int ldout, con "VMIN.f32 q6, q6, %q[maxv]\n" "VMIN.f32 q7, q7, %q[maxv]\n" "VST1.32 {d12-d15}, [%[outptr5]]!\n" - : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), + : [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [inptr] "+r" (inptr) : [minv] "w" (minv), [maxv] "w" (maxv), [biasptr] "r" (biasptr) : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory" diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2.hpp index 9c6f5c83a1cc00cff29bc6e4a2655a9b3e743416..dac6b06f1eeda22e331120545afd444e1c4bc1c5 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2.hpp @@ -40,88 +40,88 @@ void sme_transpose_interleave_16VL_2x2(uint16_t *out, const uint16_t *in, size_t __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" - "ptrue p5.b\n" + "ptrue p6.b\n" "1:" // Main row loop: Head - "mov x24, %x[in]\n" - "add x23, x24, %x[in_stride]\n" + "mov x25, %x[in]\n" "cmp %x[height], #0x1\n" - "add %x[in], x23, %x[in_stride]\n" - "mov x22, %x[out]\n" - "csel x23, x23, %x[pad_row], GT\n" + "add x24, x25, %x[in_stride]\n" + "mov x23, %x[out]\n" + "add %x[in], x24, %x[in_stride]\n" + "csel x24, x24, %x[pad_row], GT\n" "sub %x[height], %x[height], #0x2\n" - "mov x21, %x[width]\n" + "mov x22, %x[width]\n" "2:" // Main row loop: Column loop - "mov x20, x21\n" - "whilelt p2.h, XZR, x20\n" - "ld1h { z17.h }, p2/Z, [x24]\n" - "dech x20\n" - "whilelt p1.h, XZR, x20\n" - "ld1h { z19.h }, p1/Z, [x24, #1, MUL VL]\n" - "dech x20\n" - "whilelt p0.h, XZR, x20\n" - "ld1h { z21.h }, p0/Z, [x24, #2, MUL VL]\n" - "dech x20\n" - "whilelt p4.h, XZR, x20\n" - "ld1h { z20.h }, p4/Z, [x24, #3, MUL VL]\n" - "dech x20\n" - "whilelt p3.h, XZR, x20\n" - "ld1h { z16.h }, p2/Z, [x23]\n" - "zip1 z0.h, z17.h, z16.h\n" - "dech x20\n" - "whilelt p2.h, XZR, x20\n" - "ld1h { z18.h }, p1/Z, [x23, #1, MUL VL]\n" - "zip2 z31.h, z17.h, z16.h\n" - "dech x20\n" - "whilelt p1.h, XZR, x20\n" - "ld1h { z17.h }, p0/Z, [x23, #2, MUL VL]\n" - "zip1 z30.h, z19.h, z18.h\n" - "dech x20\n" - "whilelt p0.h, XZR, x20\n" - "ld1h { z16.h }, p4/Z, [x23, #3, MUL VL]\n" - "zip2 z29.h, z19.h, z18.h\n" + "mov x21, x22\n" + "mov x20, x23\n" + "whilelt p1.h, XZR, x21\n" + "dech x21\n" + "whilelt p0.h, XZR, x21\n" + "dech x21\n" + "ld1h { z21.h }, p1/Z, [x25]\n" + "whilelt p5.h, XZR, x21\n" + "dech x21\n" + "ld1h { z20.h }, p0/Z, [x25, #1, MUL VL]\n" + "whilelt p4.h, XZR, x21\n" + "dech x21\n" + "ld1h { z25.h }, p5/Z, [x25, #2, MUL VL]\n" + "whilelt p3.h, XZR, x21\n" + "dech x21\n" + "ld1h { z24.h }, p4/Z, [x25, #3, MUL VL]\n" + "whilelt p2.h, XZR, x21\n" + "dech x21\n" + "ld1h { 
z19.h }, p1/Z, [x24]\n" + "whilelt p1.h, XZR, x21\n" + "dech x21\n" + "ld1h { z18.h }, p0/Z, [x24, #1, MUL VL]\n" + "whilelt p0.h, XZR, x21\n" + "ld1h { z17.h }, p5/Z, [x24, #2, MUL VL]\n" + "decw x22, ALL, MUL #16\n" + "ld1h { z16.h }, p4/Z, [x24, #3, MUL VL]\n" + "zip1 z23.h, z21.h, z19.h\n" + "zip2 z22.h, z21.h, z19.h\n" + "cmp x22, #0x0\n" + "ld1h { z21.h }, p3/Z, [x25, #4, MUL VL]\n" + "zip1 z31.h, z20.h, z18.h\n" + "zip2 z30.h, z20.h, z18.h\n" + "add x23, x23, %x[out_stride]\n" + "ld1h { z20.h }, p2/Z, [x25, #5, MUL VL]\n" + "zip1 z29.h, z25.h, z17.h\n" + "zip2 z28.h, z25.h, z17.h\n" + "ld1h { z27.h }, p1/Z, [x25, #6, MUL VL]\n" + "zip1 z26.h, z24.h, z16.h\n" + "zip2 z25.h, z24.h, z16.h\n" + "ld1h { z24.h }, p0/Z, [x25, #7, MUL VL]\n" + "addvl x25, x25, #8\n" "ld1h { z19.h }, p3/Z, [x24, #4, MUL VL]\n" - "mov x20, x22\n" - "decw x21, ALL, MUL #16\n" - "zip1 z28.h, z21.h, z17.h\n" "ld1h { z18.h }, p2/Z, [x24, #5, MUL VL]\n" - "zip2 z27.h, z21.h, z17.h\n" - "zip1 z26.h, z20.h, z16.h\n" - "cmp x21, #0x0\n" "ld1h { z17.h }, p1/Z, [x24, #6, MUL VL]\n" - "zip2 z25.h, z20.h, z16.h\n" - "add x22, x22, %x[out_stride]\n" - "ld1h { z24.h }, p0/Z, [x24, #7, MUL VL]\n" + "ld1h { z16.h }, p0/Z, [x24, #7, MUL VL]\n" + "st1h { z23.h }, p6, [x20]\n" "addvl x24, x24, #8\n" - "ld1h { z16.h }, p3/Z, [x23, #4, MUL VL]\n" - "zip1 z23.h, z19.h, z16.h\n" - "zip2 z22.h, z19.h, z16.h\n" - "ld1h { z16.h }, p2/Z, [x23, #5, MUL VL]\n" - "zip1 z21.h, z18.h, z16.h\n" - "zip2 z20.h, z18.h, z16.h\n" - "ld1h { z16.h }, p1/Z, [x23, #6, MUL VL]\n" - "zip1 z19.h, z17.h, z16.h\n" - "zip2 z18.h, z17.h, z16.h\n" - "ld1h { z16.h }, p0/Z, [x23, #7, MUL VL]\n" - "st1h { z0.h }, p5, [x20]\n" - "addvl x23, x23, #8\n" + "zip1 z23.h, z21.h, z19.h\n" + "st1h { z22.h }, p6, [x20, #1, MUL VL]\n" + "zip2 z22.h, z21.h, z19.h\n" + "zip1 z21.h, z20.h, z18.h\n" + "st1h { z31.h }, p6, [x20, #2, MUL VL]\n" + "zip2 z20.h, z20.h, z18.h\n" + "zip1 z19.h, z27.h, z17.h\n" + "st1h { z30.h }, p6, [x20, #3, MUL VL]\n" + "zip2 z18.h, z27.h, z17.h\n" "zip1 z17.h, z24.h, z16.h\n" - "st1h { z31.h }, p5, [x20, #1, MUL VL]\n" + "st1h { z29.h }, p6, [x20, #4, MUL VL]\n" "zip2 z16.h, z24.h, z16.h\n" - "st1h { z30.h }, p5, [x20, #2, MUL VL]\n" - "st1h { z29.h }, p5, [x20, #3, MUL VL]\n" - "st1h { z28.h }, p5, [x20, #4, MUL VL]\n" - "st1h { z27.h }, p5, [x20, #5, MUL VL]\n" - "st1h { z26.h }, p5, [x20, #6, MUL VL]\n" - "st1h { z25.h }, p5, [x20, #7, MUL VL]\n" + "st1h { z28.h }, p6, [x20, #5, MUL VL]\n" + "st1h { z26.h }, p6, [x20, #6, MUL VL]\n" + "st1h { z25.h }, p6, [x20, #7, MUL VL]\n" "addvl x20, x20, #16\n" - "st1h { z23.h }, p5, [x20, #-8, MUL VL]\n" - "st1h { z22.h }, p5, [x20, #-7, MUL VL]\n" - "st1h { z21.h }, p5, [x20, #-6, MUL VL]\n" - "st1h { z20.h }, p5, [x20, #-5, MUL VL]\n" - "st1h { z19.h }, p5, [x20, #-4, MUL VL]\n" - "st1h { z18.h }, p5, [x20, #-3, MUL VL]\n" - "st1h { z17.h }, p5, [x20, #-2, MUL VL]\n" - "st1h { z16.h }, p5, [x20, #-1, MUL VL]\n" + "st1h { z23.h }, p6, [x20, #-8, MUL VL]\n" + "st1h { z22.h }, p6, [x20, #-7, MUL VL]\n" + "st1h { z21.h }, p6, [x20, #-6, MUL VL]\n" + "st1h { z20.h }, p6, [x20, #-5, MUL VL]\n" + "st1h { z19.h }, p6, [x20, #-4, MUL VL]\n" + "st1h { z18.h }, p6, [x20, #-3, MUL VL]\n" + "st1h { z17.h }, p6, [x20, #-2, MUL VL]\n" + "st1h { z16.h }, p6, [x20, #-1, MUL VL]\n" "bgt 2b\n" "3:" // Main row loop: Column loop skip "cmp %x[height], #0x1\n" @@ -130,7 +130,7 @@ void sme_transpose_interleave_16VL_2x2(uint16_t *out, const uint16_t *in, size_t ".inst 0xd503467f // SMSTOP\n" : [height] "+&r" (height), [in] 
"+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) - : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } @@ -149,5 +149,18 @@ void Transform<16, 2, true, VLType::SME>( ); } +template<> +void Transform<16, 2, true, VLType::SME>( + __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax) +{ + sme_transpose_interleave_16VL_2x2( + reinterpret_cast(out), + reinterpret_cast(in + k0 * stride + x0), + (xmax-x0) * sizeof(__fp16) / 2, + stride * sizeof(__fp16), + (kmax-k0) + ); +} + #endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/assembly/depthwise.hpp b/src/core/NEON/kernels/assembly/depthwise.hpp index dbd47ccfa98886ea6d4bf09aaaae4a9c2aa241a8..13c2d314e47eb8f9bf74c797d2819578d059acb3 100644 --- a/src/core/NEON/kernels/assembly/depthwise.hpp +++ b/src/core/NEON/kernels/assembly/depthwise.hpp @@ -38,9 +38,8 @@ struct DepthwiseConfig DepthwiseMethod method = DepthwiseMethod::DEFAULT; std::string filter = ""; - DepthwiseConfig(DepthwiseMethod method) - : method(method) {}; - DepthwiseConfig() {}; + DepthwiseConfig(DepthwiseMethod method) : method(method){}; + DepthwiseConfig(){}; }; struct DepthwiseArgs @@ -63,18 +62,24 @@ struct DepthwiseArgs bool fast_mode = false; - DepthwiseArgs( - const CPUInfo *cpu_info, - unsigned int kernel_rows, unsigned int kernel_cols, - unsigned int stride_rows, unsigned int stride_cols, - unsigned int dilation_rows, unsigned int dilation_cols, - unsigned int n_batches, unsigned int input_rows, unsigned int input_cols, - unsigned int input_channels, - unsigned int output_rows, unsigned int output_cols, - unsigned int channel_multiplier, - PaddingValues padding, arm_gemm::Activation activation, - - const DepthwiseConfig *config) + DepthwiseArgs(const CPUInfo *cpu_info, + unsigned int kernel_rows, + unsigned int kernel_cols, + unsigned int stride_rows, + unsigned int stride_cols, + unsigned int dilation_rows, + unsigned int dilation_cols, + unsigned int n_batches, + unsigned int input_rows, + unsigned int input_cols, + unsigned int input_channels, + unsigned int output_rows, + unsigned int output_cols, + unsigned int channel_multiplier, + PaddingValues padding, + arm_gemm::Activation activation, + + const DepthwiseConfig *config) : cpu_info(cpu_info), kernel_rows(kernel_rows), kernel_cols(kernel_cols), @@ -95,20 +100,38 @@ struct DepthwiseArgs { } - DepthwiseArgs( - const CPUInfo *cpu_info, - unsigned int kernel_rows, unsigned int kernel_cols, - unsigned int stride_rows, unsigned int stride_cols, - unsigned int n_batches, unsigned int input_rows, unsigned int input_cols, - unsigned int input_channels, - unsigned int output_rows, unsigned int output_cols, - unsigned int channel_multiplier, - PaddingValues padding, arm_gemm::Activation activation, - const 
DepthwiseConfig *config) - : DepthwiseArgs(cpu_info, kernel_rows, kernel_cols, stride_rows, - stride_cols, 1, 1, n_batches, input_rows, input_cols, - input_channels, output_rows, output_cols, - channel_multiplier, padding, activation, config) + DepthwiseArgs(const CPUInfo *cpu_info, + unsigned int kernel_rows, + unsigned int kernel_cols, + unsigned int stride_rows, + unsigned int stride_cols, + unsigned int n_batches, + unsigned int input_rows, + unsigned int input_cols, + unsigned int input_channels, + unsigned int output_rows, + unsigned int output_cols, + unsigned int channel_multiplier, + PaddingValues padding, + arm_gemm::Activation activation, + const DepthwiseConfig *config) + : DepthwiseArgs(cpu_info, + kernel_rows, + kernel_cols, + stride_rows, + stride_cols, + 1, + 1, + n_batches, + input_rows, + input_cols, + input_channels, + output_rows, + output_cols, + channel_multiplier, + padding, + activation, + config) { } }; @@ -127,17 +150,18 @@ struct Tile { } - Tile() - : Tile(nullptr, 0, 0, 0) + Tile() : Tile(nullptr, 0, 0, 0) { } - void load_from( - const TInput *input, - const unsigned int ld_row, const unsigned int ld_col, - const unsigned int n_rows, const unsigned int n_cols, - const int input_i, const int input_j, - const unsigned int channel_multiplier) const + void load_from(const TInput *input, + const unsigned int ld_row, + const unsigned int ld_col, + const unsigned int n_rows, + const unsigned int n_cols, + const int input_i, + const int input_j, + const unsigned int channel_multiplier) const { const auto pad_top = input_i < 0 ? -input_i : 0; const auto pad_left = input_j < 0 ? -input_j : 0; @@ -145,18 +169,15 @@ struct Tile const auto padded_rows = std::min(n_rows - input_i, tile_rows) - pad_top; const auto padded_cols = std::min(n_cols - input_j, tile_cols) - pad_left; - if(padded_rows < tile_rows || padded_cols < tile_cols) + if (padded_rows < tile_rows || padded_cols < tile_cols) { memset(array, 0, tile_rows * tile_cols * tile_channels * sizeof(TInput)); } - do_premultiply( - (TInput *)input + std::max(input_i, 0) * ld_row + std::max(input_j, 0) * ld_col, - ld_row, ld_col, - array + pad_top * tile_cols * tile_channels + pad_left * tile_channels, - tile_cols * tile_channels, tile_channels, - padded_rows, padded_cols, tile_channels / channel_multiplier, - channel_multiplier); + do_premultiply((TInput *)input + std::max(input_i, 0) * ld_row + std::max(input_j, 0) * ld_col, ld_row, + ld_col, array + pad_top * tile_cols * tile_channels + pad_left * tile_channels, + tile_cols * tile_channels, tile_channels, padded_rows, padded_cols, + tile_channels / channel_multiplier, channel_multiplier); } }; @@ -168,9 +189,8 @@ protected: std::string m_name{}; public: - DepthwiseCommon(const DepthwiseArgs &args) - : m_args(args) {}; - DepthwiseCommon(DepthwiseCommon &) = delete; + DepthwiseCommon(const DepthwiseArgs &args) : m_args(args){}; + DepthwiseCommon(DepthwiseCommon &) = delete; DepthwiseCommon &operator=(DepthwiseCommon &) = delete; std::string name() const override @@ -181,19 +201,18 @@ public: void set_name(std::string name) { // Only allow the name to be set once - if(m_name.empty()) + if (m_name.empty()) { m_name = name; } } - void execute( - const void *const input, - const void *const parameters, - void *const output, - void *const working_space, - const unsigned int thread_id, - const unsigned int n_threads) const override final + void execute(const void *const input, + const void *const parameters, + void *const output, + void *const working_space, + const unsigned int 
thread_id, + const unsigned int n_threads) const override final { const size_t ld_input_col = m_args.input_channels; const size_t ld_input_row = ld_input_col * m_args.input_cols; @@ -202,56 +221,47 @@ public: const size_t ld_output_row = ld_output_col * m_args.output_cols; const size_t ld_output_batch = ld_output_row * m_args.output_rows; - execute( - input, ld_input_col, ld_input_row, ld_input_batch, - parameters, output, ld_output_col, ld_output_row, ld_output_batch, - working_space, thread_id, n_threads); + execute(input, ld_input_col, ld_input_row, ld_input_batch, parameters, output, ld_output_col, ld_output_row, + ld_output_batch, working_space, thread_id, n_threads); } - void execute( - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const void *const parameters, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *const working_space, - const unsigned int thread_id, - const unsigned int n_threads) const override final + void execute(const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *const parameters, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *const working_space, + const unsigned int thread_id, + const unsigned int n_threads) const override final { - execute( - m_args.n_batches, m_args.input_rows, m_args.input_cols, - m_args.input_channels, m_args.padding, - input, ld_input_col, ld_input_row, ld_input_batch, - parameters, - m_args.output_rows, m_args.output_cols, - output, ld_output_col, ld_output_row, ld_output_batch, - working_space, thread_id, n_threads); + execute(m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.input_channels, m_args.padding, input, + ld_input_col, ld_input_row, ld_input_batch, parameters, m_args.output_rows, m_args.output_cols, output, + ld_output_col, ld_output_row, ld_output_batch, working_space, thread_id, n_threads); } - void execute( - unsigned int batches, - unsigned int input_height, - unsigned int input_width, - unsigned int channels, - const PaddingValues &padding, - const void *input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const void *parameters, - unsigned int output_height, - unsigned int output_width, - void *output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int n_threads) const override final + void execute(unsigned int batches, + unsigned int input_height, + unsigned int input_width, + unsigned int channels, + const PaddingValues &padding, + const void *input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *parameters, + unsigned int output_height, + unsigned int output_width, + void *output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const override final { // Construct a new set of arguments to reflect that we might have been // passed different input/output tensors. 
Dilation is handled at this @@ -271,38 +281,33 @@ public: auto ld_output_col_d = ld_output_col * m_args.dilation_cols; auto ld_output_row_d = ld_output_row * m_args.dilation_rows; - for(size_t drow = 0; drow < m_args.dilation_rows; drow++) + for (size_t drow = 0; drow < m_args.dilation_rows; drow++) { size_t start_i; - std::tie(args.output_rows, args.input_rows, start_i, - args.padding.top, args.padding.bottom) = - get_reduced_view_for_dilation( - output_height, input_height, drow, m_args.dilation_rows, - m_args.kernel_rows, m_args.stride_rows, padding.top); + std::tie(args.output_rows, args.input_rows, start_i, args.padding.top, args.padding.bottom) = + get_reduced_view_for_dilation(output_height, input_height, drow, m_args.dilation_rows, + m_args.kernel_rows, m_args.stride_rows, padding.top); auto input_row = static_cast(input) + start_i * ld_input_row; auto output_row = static_cast(output) + drow * ld_output_row; - if(args.output_rows) + if (args.output_rows) { - for(size_t dcol = 0; dcol < m_args.dilation_cols; dcol++) + for (size_t dcol = 0; dcol < m_args.dilation_cols; dcol++) { size_t start_j; - std::tie(args.output_cols, args.input_cols, start_j, - args.padding.left, args.padding.right) = - get_reduced_view_for_dilation( - output_width, input_width, dcol, m_args.dilation_cols, - m_args.kernel_cols, m_args.stride_cols, padding.left); + std::tie(args.output_cols, args.input_cols, start_j, args.padding.left, args.padding.right) = + get_reduced_view_for_dilation(output_width, input_width, dcol, m_args.dilation_cols, + m_args.kernel_cols, m_args.stride_cols, padding.left); const TInput *input_col = input_row + start_j * ld_input_col; TOutput *output_col = output_row + dcol * ld_output_col; - if(args.output_cols) + if (args.output_cols) { - this->execute_internal( - args, input_col, ld_input_col_d, ld_input_row_d, ld_input_batch, parameters, - output_col, ld_output_col_d, ld_output_row_d, ld_output_batch, - working_space, thread_id, n_threads); + this->execute_internal(args, input_col, ld_input_col_d, ld_input_row_d, ld_input_batch, + parameters, output_col, ld_output_col_d, ld_output_row_d, + ld_output_batch, working_space, thread_id, n_threads); } } } @@ -310,20 +315,19 @@ public: } protected: - virtual void execute_internal( - const DepthwiseArgs &instance_args, - const void *input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const void *parameters, - void *output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int n_threads) const = 0; + virtual void execute_internal(const DepthwiseArgs &instance_args, + const void *input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *parameters, + void *output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; virtual bool uses_premultiply() const { diff --git a/src/core/NEON/kernels/assembly/depthwise_common.hpp b/src/core/NEON/kernels/assembly/depthwise_common.hpp index a5db793b3df54fef0e876e06b43105e3b6e8a3a1..5ff848e281748b49e9af372e8d6e493da9724488 100644 --- a/src/core/NEON/kernels/assembly/depthwise_common.hpp +++ b/src/core/NEON/kernels/assembly/depthwise_common.hpp @@ -49,11 +49,7 @@ struct KernelDescription bool is_default = false; uint64_t cycle_estimate = 0; - KernelDescription( - DepthwiseMethod method, - std::string name, - bool is_default, - uint64_t cycle_estimate) + 
KernelDescription(DepthwiseMethod method, std::string name, bool is_default, uint64_t cycle_estimate) : method(method), name(name), is_default(is_default), cycle_estimate(cycle_estimate) { } @@ -78,58 +74,51 @@ public: // pointer the bias vector (which may be nullptr in the case of no bias) and // a pointer to the array of weights (stored in HWIO order). virtual void pack_parameters( - void *buffer, - const void *biases, - const void *weights, - size_t ld_weight_col = 0, - size_t ld_weight_row = 0) = 0; + void *buffer, const void *biases, const void *weights, size_t ld_weight_col = 0, size_t ld_weight_row = 0) = 0; // Determine the amount of working space required virtual size_t get_working_size(unsigned int n_threads) const = 0; // Execute the convolution over the specified area of memory. - virtual void execute( - const void *input, // Pointer to input tensor - const void *parameters, // Packed parameters buffer - void *output, - void *working_space, - unsigned int thread_id, - unsigned int n_threads) const = 0; - - virtual void execute( - const void *input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const void *parameters, - void *output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int n_threads) const = 0; - - virtual void execute( - unsigned int batches, - unsigned int input_height, - unsigned int input_width, - unsigned int channels, - const PaddingValues &, - const void *input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const void *parameters, - unsigned int output_height, - unsigned int output_width, - void *output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int n_threads) const = 0; + virtual void execute(const void *input, // Pointer to input tensor + const void *parameters, // Packed parameters buffer + void *output, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; + + virtual void execute(const void *input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *parameters, + void *output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; + + virtual void execute(unsigned int batches, + unsigned int input_height, + unsigned int input_width, + unsigned int channels, + const PaddingValues &, + const void *input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *parameters, + unsigned int output_height, + unsigned int output_width, + void *output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; }; // To handle a dilation factor of D execute the kernel once for each d in @@ -145,12 +134,13 @@ public: // - Number of valid input pixels corresponding to `d` // - Offset of the first pixel corresponding to `d` // - Amount of padding in the view for `d` -std::tuple -get_reduced_view_for_dilation( - size_t out_size, size_t in_size, - size_t d, size_t dilation_factor, - size_t kernel_size, size_t stride, - size_t pad_before); +std::tuple get_reduced_view_for_dilation(size_t out_size, + size_t in_size, + size_t d, + size_t dilation_factor, + size_t kernel_size, + size_t stride, + size_t pad_before); } // namespace depthwise } // 
namespace arm_conv diff --git a/src/core/NEON/kernels/assembly/pool_common.hpp b/src/core/NEON/kernels/assembly/pool_common.hpp index f1f70cf1d65fdda9b98b8cad0aad54c101fe90c6..045f9f95d3aee744fccd92109fa51700b5529a3b 100644 --- a/src/core/NEON/kernels/assembly/pool_common.hpp +++ b/src/core/NEON/kernels/assembly/pool_common.hpp @@ -68,45 +68,42 @@ public: virtual size_t get_working_size(unsigned int num_threads) const = 0; // Execute pooling over the specified area of memory. - virtual void execute( - const void *const input, - void *const output, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const = 0; + virtual void execute(const void *const input, + void *const output, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const = 0; - virtual void execute( - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const = 0; + virtual void execute(const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const = 0; - virtual void execute( - unsigned int batches, - unsigned int height, - unsigned int width, - unsigned int channels, - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const PaddingValues &, - unsigned int output_height, - unsigned int output_width, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const = 0; + virtual void execute(unsigned int batches, + unsigned int height, + unsigned int width, + unsigned int channels, + const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const PaddingValues &, + unsigned int output_height, + unsigned int output_width, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const = 0; }; } // namespace pooling diff --git a/src/core/NEON/kernels/assembly/pooling.hpp b/src/core/NEON/kernels/assembly/pooling.hpp index e8db35c5930d8654837c637abf5aa26a6b36d469..89d594298ea0bb343e2e5ed6586e95daaa5a96f3 100644 --- a/src/core/NEON/kernels/assembly/pooling.hpp +++ b/src/core/NEON/kernels/assembly/pooling.hpp @@ -36,9 +36,8 @@ struct PoolingConfig PoolingMethod method = PoolingMethod::DEFAULT; std::string filter = ""; - PoolingConfig(PoolingMethod method) - : method(method) {}; - PoolingConfig() {}; + PoolingConfig(PoolingMethod method) : method(method){}; + PoolingConfig(){}; }; struct PoolingArgs @@ -57,30 +56,40 @@ struct PoolingArgs const PoolingConfig *config; - PoolingArgs( - const CPUInfo *cpu_info, - PoolingType pool_type, - const PoolingWindow &window, - const PoolingStride &stride, - bool exclude_padding, - unsigned int n_batches, - unsigned int input_rows, - unsigned int input_cols, - unsigned int n_channels, - unsigned int output_rows, - unsigned int output_cols, - const PaddingValues &padding, - const PoolingConfig *cfg) - : cpu_info(cpu_info), pool_type(pool_type), pool_window(window), pool_stride(stride), 
exclude_padding(exclude_padding), n_batches(n_batches), input_rows(input_rows), input_cols(input_cols), - n_channels(n_channels), output_rows(output_rows), output_cols(output_cols), padding(padding), config(cfg) + PoolingArgs(const CPUInfo *cpu_info, + PoolingType pool_type, + const PoolingWindow &window, + const PoolingStride &stride, + bool exclude_padding, + unsigned int n_batches, + unsigned int input_rows, + unsigned int input_cols, + unsigned int n_channels, + unsigned int output_rows, + unsigned int output_cols, + const PaddingValues &padding, + const PoolingConfig *cfg) + : cpu_info(cpu_info), + pool_type(pool_type), + pool_window(window), + pool_stride(stride), + exclude_padding(exclude_padding), + n_batches(n_batches), + input_rows(input_rows), + input_cols(input_cols), + n_channels(n_channels), + output_rows(output_rows), + output_cols(output_cols), + padding(padding), + config(cfg) { // If either of the pooling window dimensions are set to zero, meaning // "pool everything", then replace with the corresponding input dimension. - if(pool_window.rows == 0) + if (pool_window.rows == 0) { pool_window.rows = input_rows; } - if(pool_window.cols == 0) + if (pool_window.cols == 0) { pool_window.cols = input_cols; } @@ -100,10 +109,16 @@ struct Requantize32 int32_t per_layer_right_shift = 0; int32_t per_layer_mul = 0; - Requantize32(int32_t input_offset, int32_t output_offset, - int32_t per_layer_left_shift, int32_t per_layer_right_shift, + Requantize32(int32_t input_offset, + int32_t output_offset, + int32_t per_layer_left_shift, + int32_t per_layer_right_shift, int32_t per_layer_mul) - : input_offset(input_offset), output_offset(output_offset), per_layer_left_shift(per_layer_left_shift), per_layer_right_shift(per_layer_right_shift), per_layer_mul(per_layer_mul) + : input_offset(input_offset), + output_offset(output_offset), + per_layer_left_shift(per_layer_left_shift), + per_layer_right_shift(per_layer_right_shift), + per_layer_mul(per_layer_mul) { } }; @@ -115,105 +130,88 @@ protected: const PoolingArgs m_args; public: - PoolingCommon(const PoolingArgs &args) - : m_args(args) + PoolingCommon(const PoolingArgs &args) : m_args(args) { } - PoolingCommon(PoolingCommon &) = delete; + PoolingCommon(PoolingCommon &) = delete; PoolingCommon &operator=(PoolingCommon &) = delete; size_t get_working_size(unsigned int) const override = 0; // Execute pooling over the specified area of memory. 
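// Note on the pooling arguments above: a pool_window dimension of zero is
// replaced by the corresponding input dimension ("pool everything"), and the
// exclude_padding flag decides whether padded positions count towards an
// average pool's divisor. The scalar sketch below illustrates the latter under
// that usual interpretation; it is not the library's assembly-backed path, and
// the quantized variants additionally apply the Requantize32 offsets and
// multiplier, which are not shown here.
#include <vector>

static std::vector<float> avg_pool2d_reference(const std::vector<float> &in,
                                               int rows, int cols,
                                               int window, int stride, int pad,
                                               bool exclude_padding)
{
    const int out_rows = (rows + 2 * pad - window) / stride + 1;
    const int out_cols = (cols + 2 * pad - window) / stride + 1;
    std::vector<float> out(out_rows * out_cols, 0.0f);

    for (int oi = 0; oi < out_rows; oi++)
    {
        for (int oj = 0; oj < out_cols; oj++)
        {
            float sum   = 0.0f;
            int   valid = 0;
            for (int ki = 0; ki < window; ki++)
            {
                for (int kj = 0; kj < window; kj++)
                {
                    const int i = oi * stride + ki - pad;
                    const int j = oj * stride + kj - pad;
                    if (i >= 0 && j >= 0 && i < rows && j < cols)
                    {
                        sum += in[i * cols + j];
                        valid++;
                    }
                }
            }
            // exclude_padding: divide by the number of in-bounds contributions
            // rather than by the full window area.
            const int divisor = exclude_padding ? (valid > 0 ? valid : 1) : window * window;
            out[oi * out_cols + oj] = sum / static_cast<float>(divisor);
        }
    }
    return out;
}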
- void execute( - const void *const input, - void *const output, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const override + void execute(const void *const input, + void *const output, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const override { - this->execute( - input, - m_args.n_channels, - m_args.n_channels * m_args.input_cols, - m_args.n_channels * m_args.input_cols * m_args.input_rows, - output, - m_args.n_channels, - m_args.n_channels * m_args.output_cols, - m_args.n_channels * m_args.output_cols * m_args.output_rows, - working_space, - thread_id, num_threads); + this->execute(input, m_args.n_channels, m_args.n_channels * m_args.input_cols, + m_args.n_channels * m_args.input_cols * m_args.input_rows, output, m_args.n_channels, + m_args.n_channels * m_args.output_cols, + m_args.n_channels * m_args.output_cols * m_args.output_rows, working_space, thread_id, + num_threads); } - void execute( - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const override + void execute(const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const override { - this->execute( - m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.n_channels, - input, ld_input_col, ld_input_row, ld_input_batch, - m_args.padding, m_args.output_rows, m_args.output_cols, - output, ld_output_col, ld_output_row, ld_output_batch, - working_space, thread_id, num_threads); + this->execute(m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.n_channels, input, ld_input_col, + ld_input_row, ld_input_batch, m_args.padding, m_args.output_rows, m_args.output_cols, output, + ld_output_col, ld_output_row, ld_output_batch, working_space, thread_id, num_threads); } - void execute( - unsigned int batches, - unsigned int height, - unsigned int width, - unsigned int channels, - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const PaddingValues &padding, - unsigned int output_height, - unsigned int output_width, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const override + void execute(unsigned int batches, + unsigned int height, + unsigned int width, + unsigned int channels, + const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const PaddingValues &padding, + unsigned int output_height, + unsigned int output_width, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const override { - this->execute_internal( - batches, height, width, channels, padding, - input, ld_input_col, ld_input_row, ld_input_batch, - output_height, output_width, - output, ld_output_col, ld_output_row, ld_output_batch, - working_space, thread_id, num_threads); + this->execute_internal(batches, height, width, channels, padding, input, ld_input_col, ld_input_row, + ld_input_batch, output_height, output_width, output, 
ld_output_col, ld_output_row, + ld_output_batch, working_space, thread_id, num_threads); } protected: - virtual void execute_internal( - unsigned int batches, - unsigned int height, - unsigned int width, - unsigned int channels, - const PaddingValues &, - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - unsigned int output_height, - unsigned int output_width, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const = 0; + virtual void execute_internal(unsigned int batches, + unsigned int height, + unsigned int width, + unsigned int channels, + const PaddingValues &, + const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + unsigned int output_height, + unsigned int output_width, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const = 0; }; template diff --git a/src/core/NEON/kernels/assembly/premultiply.hpp b/src/core/NEON/kernels/assembly/premultiply.hpp index 16f26de38abd74dfa0945a17a662cd31db2a3c59..fb97cf8baf46af59ea9e73a02518ebb113ce8699 100644 --- a/src/core/NEON/kernels/assembly/premultiply.hpp +++ b/src/core/NEON/kernels/assembly/premultiply.hpp @@ -44,30 +44,27 @@ void do_premultiply(const T *in_ptr, const unsigned input_channels, const unsigned int channel_multiplier) { - if(sizeof(T) == 4 && channel_multiplier == 6) + if (sizeof(T) == 4 && channel_multiplier == 6) { - do_premultiply_float_6( - (const float *)in_ptr, ld_row, ld_col, - (float *)out_ptr, out_ld_row, out_ld_col, - tile_rows, tile_cols, - input_channels); + do_premultiply_float_6((const float *)in_ptr, ld_row, ld_col, (float *)out_ptr, out_ld_row, out_ld_col, + tile_rows, tile_cols, input_channels); } else { - for(unsigned int i = 0; i < tile_rows; i++) + for (unsigned int i = 0; i < tile_rows; i++) { const T *ip2 = in_ptr + i * ld_row; T *op2 = out_ptr + i * out_ld_row; - for(unsigned int j = 0; j < tile_cols; j++) + for (unsigned int j = 0; j < tile_cols; j++) { const T *ip = ip2; T *op = op2; - for(unsigned int c = 0; c < input_channels; c++) + for (unsigned int c = 0; c < input_channels; c++) { T val = *ip; ip++; - for(unsigned int r = 0; r < channel_multiplier; r++) + for (unsigned int r = 0; r < channel_multiplier; r++) { op[r] = val; } diff --git a/src/core/NEON/kernels/assembly/winograd.hpp b/src/core/NEON/kernels/assembly/winograd.hpp index 50290757ecffe823fc46ca032a1c143e6cb4fa1d..dbf95d23cd263601a7719df5e843f96c89e306cb 100644 --- a/src/core/NEON/kernels/assembly/winograd.hpp +++ b/src/core/NEON/kernels/assembly/winograd.hpp @@ -45,17 +45,24 @@ struct ConvolutionArgs Shape2D kernel_shape; arm_gemm::Activation activation; - ConvolutionArgs( - unsigned int n_batches, - const Shape2D &input_shape, - unsigned int n_input_channels, - unsigned int pad_top, unsigned int pad_left, - const Shape2D &output_shape, - unsigned int n_output_channels, - const Shape2D kernel_shape, - const arm_gemm::Activation &activation = {}) - : n_batches(n_batches), input_shape(input_shape), n_input_channels(n_input_channels), pad_top(pad_top), pad_left(pad_left), output_shape(output_shape), n_output_channels(n_output_channels), - kernel_shape(kernel_shape), activation(activation) + ConvolutionArgs(unsigned int n_batches, + const Shape2D &input_shape, + unsigned int n_input_channels, + unsigned int 
pad_top, + unsigned int pad_left, + const Shape2D &output_shape, + unsigned int n_output_channels, + const Shape2D kernel_shape, + const arm_gemm::Activation &activation = {}) + : n_batches(n_batches), + input_shape(input_shape), + n_input_channels(n_input_channels), + pad_top(pad_top), + pad_left(pad_left), + output_shape(output_shape), + n_output_channels(n_output_channels), + kernel_shape(kernel_shape), + activation(activation) { } }; @@ -105,23 +112,30 @@ public: virtual unsigned int get_transformed_tile_rows(void) const = 0; virtual unsigned int get_transformed_tile_cols(void) const = 0; - void execute( - const ConvolutionArgs &args, - const void *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel, - void *outptr, const WinogradDomainSpec &wds, - unsigned int thread_id, unsigned int n_threads) const + void execute(const ConvolutionArgs &args, + const void *inptr, + size_t ld_in_row, + size_t ld_in_col, + size_t ld_input_channel, + void *outptr, + const WinogradDomainSpec &wds, + unsigned int thread_id, + unsigned int n_threads) const { - this->execute( - args, inptr, ld_in_row, ld_in_col, ld_input_channel, - outptr, wds.weight_ld_matrix, wds.weight_ld_row, - thread_id, n_threads); + this->execute(args, inptr, ld_in_row, ld_in_col, ld_input_channel, outptr, wds.weight_ld_matrix, + wds.weight_ld_row, thread_id, n_threads); } - virtual void execute( - const ConvolutionArgs &args, - const void *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel, - void *outptr, size_t ld_out_matrix, size_t ld_out_row, - unsigned int thread_id, unsigned int n_threads) const = 0; + virtual void execute(const ConvolutionArgs &args, + const void *inptr, + size_t ld_in_row, + size_t ld_in_col, + size_t ld_input_channel, + void *outptr, + size_t ld_out_matrix, + size_t ld_out_row, + unsigned int thread_id, + unsigned int n_threads) const = 0; }; } // namespace weight_transform @@ -136,27 +150,35 @@ public: virtual unsigned int get_input_rows(void) const = 0; virtual unsigned int get_input_cols(void) const = 0; - virtual size_t get_working_space_size( - const ConvolutionArgs &args, - unsigned int n_threads) const = 0; - - void execute( - const ConvolutionArgs &args, - const void *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col, - void *outptr, const WinogradDomainSpec &wds, - void *working_space, unsigned int thread_id, unsigned int n_threads) const + virtual size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const = 0; + + void execute(const ConvolutionArgs &args, + const void *inptr, + size_t ld_in_batch, + size_t ld_in_row, + size_t ld_in_col, + void *outptr, + const WinogradDomainSpec &wds, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const { - this->execute( - args, inptr, ld_in_batch, ld_in_row, ld_in_col, - outptr, wds.input_ld_batch, wds.input_ld_matrix, wds.input_ld_row, - working_space, thread_id, n_threads); + this->execute(args, inptr, ld_in_batch, ld_in_row, ld_in_col, outptr, wds.input_ld_batch, wds.input_ld_matrix, + wds.input_ld_row, working_space, thread_id, n_threads); } - virtual void execute( - const ConvolutionArgs &args, - const void *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col, - void *outptr, size_t ld_out_batch, size_t ld_out_matrix, size_t ld_out_row, - void *working_space, unsigned int thread_id, unsigned int n_threads) const = 0; + virtual void execute(const ConvolutionArgs &args, + const void *inptr, + size_t ld_in_batch, + size_t ld_in_row, + size_t 
ld_in_col, + void *outptr, + size_t ld_out_batch, + size_t ld_out_matrix, + size_t ld_out_row, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; }; } // namespace input_transform @@ -177,31 +199,37 @@ public: virtual unsigned int get_kernel_rows(void) const = 0; virtual unsigned int get_kernel_cols(void) const = 0; - virtual size_t get_working_space_size( - const ConvolutionArgs &args, - unsigned int n_threads) const = 0; - - void execute( - const ConvolutionArgs &args, - const void *inptr, const WinogradDomainSpec &wds, - const void *bias, - void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col, - void *working_space, unsigned int thread_id, unsigned int n_threads) const + virtual size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const = 0; + + void execute(const ConvolutionArgs &args, + const void *inptr, + const WinogradDomainSpec &wds, + const void *bias, + void *outptr, + size_t ld_out_batch, + size_t ld_out_row, + size_t ld_out_col, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const { - this->execute( - args, - inptr, wds.output_ld_batch, wds.output_ld_matrix, wds.output_ld_row, - bias, - outptr, ld_out_batch, ld_out_row, ld_out_col, - working_space, thread_id, n_threads); + this->execute(args, inptr, wds.output_ld_batch, wds.output_ld_matrix, wds.output_ld_row, bias, outptr, + ld_out_batch, ld_out_row, ld_out_col, working_space, thread_id, n_threads); } - virtual void execute( - const ConvolutionArgs &args, - const void *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row, - const void *bias, - void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col, - void *working_space, unsigned int thread_id, unsigned int n_threads) const = 0; + virtual void execute(const ConvolutionArgs &args, + const void *inptr, + size_t ld_in_batch, + size_t ld_in_matrix, + size_t ld_in_row, + const void *bias, + void *outptr, + size_t ld_out_batch, + size_t ld_out_row, + size_t ld_out_col, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; }; } // namespace output_transform @@ -210,7 +238,7 @@ struct WinogradImpl { const output_transform::ITransform *output_transform = nullptr; const weight_transform::ITransform *weight_transform = nullptr; - const input_transform::ITransform *input_transform = nullptr; + const input_transform::ITransform *input_transform = nullptr; std::unique_ptr gemm_args; WinogradDomainSpec winograd_spec; }; @@ -220,15 +248,18 @@ struct WinogradImpl * Assigns to the pointers in the `dest` struct and returns true or false to * indicate whether the given problem can be executed or not. 
*/ -template -bool get_implementation( - WinogradImpl &dest, // Destination for the selected implementation - const CPUInfo *, - const ConvolutionArgs &, - int max_threads, - bool fast_mode, - const WinogradConfig *, - const arm_gemm::GemmConfig *); +template +bool get_implementation(WinogradImpl &dest, // Destination for the selected implementation + const CPUInfo *, + const ConvolutionArgs &, + int max_threads, + bool fast_mode, + const WinogradConfig *, + const arm_gemm::GemmConfig *); } // namespace winograd } // namespace arm_conv diff --git a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp index ed5254a0a4bec7a20795928ff2f107e0192b8a95..e3d9b670b36a7dd0abfd668cfa1a7942e8d006ac 100644 --- a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp +++ b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp @@ -24,8 +24,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/NEMath.h" + #include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h" +#include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" #include @@ -37,12 +38,26 @@ namespace arm_compute { namespace { -using BatchNomalizationPtr = void (*)(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window); +using BatchNomalizationPtr = void (*)(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window); template -void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window) +void batch_normalization(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window) { /** SIMD vector tag type. */ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; @@ -57,86 +72,99 @@ void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const Iterator input(src, win_collapsed); Iterator output(dst, win_collapsed); - const auto input_mean = reinterpret_cast(mean->ptr_to_element(Coordinates(0, 0))); - const auto input_var = reinterpret_cast(var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = (gamma != nullptr) ? reinterpret_cast(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = (beta != nullptr) ? reinterpret_cast(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_mean = reinterpret_cast(mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast(var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = + (gamma != nullptr) ? reinterpret_cast(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_beta = + (beta != nullptr) ? 
reinterpret_cast(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; T activation_functor(act_info); const auto epsilon_vec = wrapper::vdup_n(static_cast(epsilon), ExactTagType{}); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Perform core calculations using vector operations - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - // Conctruct vectors - const auto mean_vec = wrapper::vloadq(input_mean + x); - const auto var_vec = wrapper::vloadq(input_var + x); - const auto gamma_vec = (input_gamma != nullptr) ? wrapper::vloadq(input_gamma + x) : wrapper::vdup_n(static_cast(1.f), ExactTagType{}); - const auto beta_vec = (input_beta != nullptr) ? wrapper::vloadq(input_beta + x) : wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - - // Calculate denominator - const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); - - // Calculate x bar - const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec); - const auto x_bar = wrapper::vmul(numerator, denominator); - auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec); - - // Perform fused activation - if(act_info.enabled()) + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + // Perform core calculations using vector operations + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - activation_functor(res); + // Conctruct vectors + const auto mean_vec = wrapper::vloadq(input_mean + x); + const auto var_vec = wrapper::vloadq(input_var + x); + const auto gamma_vec = (input_gamma != nullptr) + ? wrapper::vloadq(input_gamma + x) + : wrapper::vdup_n(static_cast(1.f), ExactTagType{}); + const auto beta_vec = (input_beta != nullptr) + ? wrapper::vloadq(input_beta + x) + : wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + + // Calculate denominator + const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); + + // Calculate x bar + const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec); + const auto x_bar = wrapper::vmul(numerator, denominator); + auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec); + + // Perform fused activation + if (act_info.enabled()) + { + activation_functor(res); + } + + // Store results + wrapper::vstore(output_ptr + x, res); } - // Store results - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - // Conctruct vectors - const float16_t gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f; - const float16_t beta = (input_beta != nullptr) ? input_beta[x] : 0.f; - - const float16_t denominator = sqrt(input_var[x] + epsilon); - const float16_t numerator = input_ptr[x] - input_mean[x]; - const float16_t x_bar = numerator / denominator; - float16_t res = beta + x_bar * gamma; - - // Perform fused activation - if(act_info.enabled()) + // Compute left-over elements + for (; x < window_end_x; ++x) { - activation_functor(res); + // Conctruct vectors + const float16_t gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f; + const float16_t beta = (input_beta != nullptr) ? 
input_beta[x] : 0.f; + + const float16_t denominator = sqrt(input_var[x] + epsilon); + const float16_t numerator = input_ptr[x] - input_mean[x]; + const float16_t x_bar = numerator / denominator; + float16_t res = beta + x_bar * gamma; + + // Perform fused activation + if (act_info.enabled()) + { + activation_functor(res); + } + + // Store results + *reinterpret_cast(output_ptr + x) = res; } - - // Store results - *reinterpret_cast(output_ptr + x) = res; - } - }, - input, output); + }, + input, output); } // Fused Batched Normalization with activation functions -static std::map fused_map = -{ - { ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization> }, - { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization> }, - { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization> } -}; -} +static std::map fused_map = { + {ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization>}, + {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization>}, + {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization>}}; +} // namespace namespace cpu { -void fp16_neon_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window) +void fp16_neon_batch_normalization(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window) { - if(act_info.enabled()) + if (act_info.enabled()) { fused_map[act_info.activation()](src, dst, mean, var, beta, gamma, epsilon, act_info, window); } diff --git a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp index d6e22e1843c671cc924e302958320902516e8acf..4e1654ee6b55e054fe39dfff6730b18f3f798243 100644 --- a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp +++ b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp @@ -24,8 +24,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/NEMath.h" + #include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h" +#include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" #include @@ -36,12 +37,26 @@ namespace arm_compute { namespace { -using BatchNomalizationPtr = void (*)(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window); +using BatchNomalizationPtr = void (*)(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window); template -void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window) +void batch_normalization(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window) { /** SIMD vector tag type. 
*/ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; @@ -56,86 +71,99 @@ void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const Iterator input(src, win_collapsed); Iterator output(dst, win_collapsed); - const auto input_mean = reinterpret_cast(mean->ptr_to_element(Coordinates(0, 0))); - const auto input_var = reinterpret_cast(var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = (gamma != nullptr) ? reinterpret_cast(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = (beta != nullptr) ? reinterpret_cast(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_mean = reinterpret_cast(mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast(var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = + (gamma != nullptr) ? reinterpret_cast(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_beta = + (beta != nullptr) ? reinterpret_cast(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; T activation_functor(act_info); const auto epsilon_vec = wrapper::vdup_n(static_cast(epsilon), ExactTagType{}); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Perform core calculations using vector operations - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - // Conctruct vectors - const auto mean_vec = wrapper::vloadq(input_mean + x); - const auto var_vec = wrapper::vloadq(input_var + x); - const auto gamma_vec = (input_gamma != nullptr) ? wrapper::vloadq(input_gamma + x) : wrapper::vdup_n(static_cast(1.f), ExactTagType{}); - const auto beta_vec = (input_beta != nullptr) ? wrapper::vloadq(input_beta + x) : wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - - // Calculate denominator - const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); - - // Calculate x bar - const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec); - const auto x_bar = wrapper::vmul(numerator, denominator); - auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec); - - // Perform fused activation - if(act_info.enabled()) + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + // Perform core calculations using vector operations + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - activation_functor(res); + // Conctruct vectors + const auto mean_vec = wrapper::vloadq(input_mean + x); + const auto var_vec = wrapper::vloadq(input_var + x); + const auto gamma_vec = (input_gamma != nullptr) + ? wrapper::vloadq(input_gamma + x) + : wrapper::vdup_n(static_cast(1.f), ExactTagType{}); + const auto beta_vec = (input_beta != nullptr) + ? 
wrapper::vloadq(input_beta + x) + : wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + + // Calculate denominator + const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); + + // Calculate x bar + const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec); + const auto x_bar = wrapper::vmul(numerator, denominator); + auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec); + + // Perform fused activation + if (act_info.enabled()) + { + activation_functor(res); + } + + // Store results + wrapper::vstore(output_ptr + x, res); } - // Store results - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - // Conctruct vectors - const float gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f; - const float beta = (input_beta != nullptr) ? input_beta[x] : 0.f; - - const float denominator = sqrt(input_var[x] + epsilon); - const float numerator = input_ptr[x] - input_mean[x]; - const float x_bar = numerator / denominator; - float res = beta + x_bar * gamma; - - // Perform fused activation - if(act_info.enabled()) + // Compute left-over elements + for (; x < window_end_x; ++x) { - activation_functor(res); + // Conctruct vectors + const float gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f; + const float beta = (input_beta != nullptr) ? input_beta[x] : 0.f; + + const float denominator = sqrt(input_var[x] + epsilon); + const float numerator = input_ptr[x] - input_mean[x]; + const float x_bar = numerator / denominator; + float res = beta + x_bar * gamma; + + // Perform fused activation + if (act_info.enabled()) + { + activation_functor(res); + } + + // Store results + *reinterpret_cast(output_ptr + x) = res; } - - // Store results - *reinterpret_cast(output_ptr + x) = res; - } - }, - input, output); + }, + input, output); } // Fused Batched Normalization with activation functions -static std::map fused_map = -{ - { ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization> }, - { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization> }, - { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization> } -}; -} +static std::map fused_map = { + {ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization>}, + {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization>}, + {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization>}}; +} // namespace namespace cpu { -void fp32_neon_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window) +void fp32_neon_batch_normalization(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window) { - if(act_info.enabled()) + if (act_info.enabled()) { fused_map[act_info.activation()](src, dst, mean, var, beta, gamma, epsilon, act_info, window); } diff --git a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp index 98cd9aa7fe57517356b61cc4336e95bbc1fdbfe5..48caaa3e6329f74e5e7c9388ec4e3705db3d749f 100644 --- a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp +++ b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/ITensorPack.h" #include 
"arm_compute/core/Window.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/NEON/SVEMath.h" #include @@ -37,8 +38,15 @@ namespace arm_compute { namespace cpu { -void fp16_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window) +void fp16_sve_batch_normalization(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window) { const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); @@ -49,69 +57,74 @@ void fp16_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mea Iterator input(src, win_collapsed); Iterator output(dst, win_collapsed); - const auto input_mean = reinterpret_cast(mean->ptr_to_element(Coordinates(0, 0))); - const auto input_var = reinterpret_cast(var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = (gamma != nullptr) ? reinterpret_cast(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = (beta != nullptr) ? reinterpret_cast(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_mean = reinterpret_cast(mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast(var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = + (gamma != nullptr) ? reinterpret_cast(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_beta = + (beta != nullptr) ? reinterpret_cast(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; const auto epsilon_vec = svdup_n_f16(epsilon); const auto const_1 = svdup_n_f16(1.f); const auto const_0 = svdup_n_f16(0.f); const auto va = svdup_n_f16(act_info.a()); const auto vb = svdup_n_f16(act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - do + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - // Conctruct vectors - const auto mean_vec = svld1_f16(pg, input_mean + x); - const auto var_vec = svld1_f16(pg, input_var + x); - const auto gamma_vec = (input_gamma != nullptr) ? svld1_f16(pg, input_gamma + x) : const_1; - const auto beta_vec = (input_beta != nullptr) ? svld1_f16(pg, input_beta + x) : const_0; + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - // Calculate denominator - const auto tmp = svadd_f16_z(pg, var_vec, epsilon_vec); - auto denominator = svrsqrte_f16(tmp); - denominator = svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator); - denominator = svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator); + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do + { + // Conctruct vectors + const auto mean_vec = svld1_f16(pg, input_mean + x); + const auto var_vec = svld1_f16(pg, input_var + x); + const auto gamma_vec = (input_gamma != nullptr) ? svld1_f16(pg, input_gamma + x) : const_1; + const auto beta_vec = (input_beta != nullptr) ? 
svld1_f16(pg, input_beta + x) : const_0; - // Calculate x bar - const auto numerator = svsub_f16_z(pg, svld1_f16(pg, input_ptr + x), mean_vec); - const auto x_bar = svmul_f16_z(pg, numerator, denominator); - auto res = svmla_f16_z(pg, beta_vec, x_bar, gamma_vec); + // Calculate denominator + const auto tmp = svadd_f16_z(pg, var_vec, epsilon_vec); + auto denominator = svrsqrte_f16(tmp); + denominator = + svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator); + denominator = + svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator); - // Perform fused activation - if(act_info.enabled()) - { - if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) - { - res = svmax_f16_z(pg, const_0, res); - } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - res = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, res)); - } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + // Calculate x bar + const auto numerator = svsub_f16_z(pg, svld1_f16(pg, input_ptr + x), mean_vec); + const auto x_bar = svmul_f16_z(pg, numerator, denominator); + auto res = svmla_f16_z(pg, beta_vec, x_bar, gamma_vec); + + // Perform fused activation + if (act_info.enabled()) { - res = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, res)); + if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) + { + res = svmax_f16_z(pg, const_0, res); + } + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + res = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, res)); + } + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + res = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, res)); + } } - } - // Store results - svst1_f16(pg, output_ptr + x, res); + // Store results + svst1_f16(pg, output_ptr + x, res); - x += svcntw(); - pg = svwhilelt_b16(x, window_end_x); - } - while(svptest_any(svptrue_b16(), pg)); - }, - input, output); + x += svcntw(); + pg = svwhilelt_b16(x, window_end_x); + } while (svptest_any(svptrue_b16(), pg)); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp index 952ab320bfab9f71a67387e357bd2af53c0cc751..df4fbfe60717e28e561e85cc2839b47afca6f1d1 100644 --- a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp +++ b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/NEON/SVEMath.h" #include @@ -37,8 +38,15 @@ namespace arm_compute { namespace cpu { -void fp32_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window) +void fp32_sve_batch_normalization(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window) { const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); @@ -49,69 +57,74 @@ void fp32_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mea Iterator input(src, 
win_collapsed); Iterator output(dst, win_collapsed); - const auto input_mean = reinterpret_cast(mean->ptr_to_element(Coordinates(0, 0))); - const auto input_var = reinterpret_cast(var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = (gamma != nullptr) ? reinterpret_cast(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = (beta != nullptr) ? reinterpret_cast(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_mean = reinterpret_cast(mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast(var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = + (gamma != nullptr) ? reinterpret_cast(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_beta = + (beta != nullptr) ? reinterpret_cast(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; const auto epsilon_vec = svdup_n_f32(epsilon); const auto const_1 = svdup_n_f32(1.f); const auto const_0 = svdup_n_f32(0.f); const auto va = svdup_n_f32(act_info.a()); const auto vb = svdup_n_f32(act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b32(x, window_end_x); - do + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - // Conctruct vectors - const auto mean_vec = svld1_f32(pg, input_mean + x); - const auto var_vec = svld1_f32(pg, input_var + x); - const auto gamma_vec = (input_gamma != nullptr) ? svld1_f32(pg, input_gamma + x) : const_1; - const auto beta_vec = (input_beta != nullptr) ? svld1_f32(pg, input_beta + x) : const_0; + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - // Calculate denominator - const auto tmp = svadd_f32_z(pg, var_vec, epsilon_vec); - auto denominator = svrsqrte_f32(tmp); - denominator = svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator); - denominator = svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator); + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b32(x, window_end_x); + do + { + // Conctruct vectors + const auto mean_vec = svld1_f32(pg, input_mean + x); + const auto var_vec = svld1_f32(pg, input_var + x); + const auto gamma_vec = (input_gamma != nullptr) ? svld1_f32(pg, input_gamma + x) : const_1; + const auto beta_vec = (input_beta != nullptr) ? 
svld1_f32(pg, input_beta + x) : const_0; - // Calculate x bar - const auto numerator = svsub_f32_z(pg, svld1_f32(pg, input_ptr + x), mean_vec); - const auto x_bar = svmul_f32_z(pg, numerator, denominator); - auto res = svmla_f32_z(pg, beta_vec, x_bar, gamma_vec); + // Calculate denominator + const auto tmp = svadd_f32_z(pg, var_vec, epsilon_vec); + auto denominator = svrsqrte_f32(tmp); + denominator = + svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator); + denominator = + svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator); - // Perform fused activation - if(act_info.enabled()) - { - if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) - { - res = svmax_f32_z(pg, const_0, res); - } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - res = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, res)); - } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + // Calculate x bar + const auto numerator = svsub_f32_z(pg, svld1_f32(pg, input_ptr + x), mean_vec); + const auto x_bar = svmul_f32_z(pg, numerator, denominator); + auto res = svmla_f32_z(pg, beta_vec, x_bar, gamma_vec); + + // Perform fused activation + if (act_info.enabled()) { - res = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, res)); + if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) + { + res = svmax_f32_z(pg, const_0, res); + } + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + res = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, res)); + } + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + res = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, res)); + } } - } - // Store results - svst1_f32(pg, output_ptr + x, res); + // Store results + svst1_f32(pg, output_ptr + x, res); - x += svcntw(); - pg = svwhilelt_b32(x, window_end_x); - } - while(svptest_any(svptrue_b32(), pg)); - }, - input, output); + x += svcntw(); + pg = svwhilelt_b32(x, window_end_x); + } while (svptest_any(svptrue_b32(), pg)); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/core/NEON/kernels/batchnormalization/impl/list.h b/src/core/NEON/kernels/batchnormalization/impl/list.h index 8e0ea36f5adc3cb31f077a3be1a66fd4988304ed..cbf540bd71e83b5c9683c05a94be93153f2fb4b0 100644 --- a/src/core/NEON/kernels/batchnormalization/impl/list.h +++ b/src/core/NEON/kernels/batchnormalization/impl/list.h @@ -28,9 +28,9 @@ namespace arm_compute { namespace cpu { -#define DECLARE_BATCH_NORMALIZATION_KERNEL(func_name) \ - void func_name(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, \ - float epsilon, ActivationLayerInfo &act_info, const Window &window) +#define DECLARE_BATCH_NORMALIZATION_KERNEL(func_name) \ + void func_name(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, \ + const ITensor *gamma, float epsilon, ActivationLayerInfo &act_info, const Window &window) DECLARE_BATCH_NORMALIZATION_KERNEL(fp16_neon_batch_normalization); DECLARE_BATCH_NORMALIZATION_KERNEL(fp16_sve_batch_normalization); diff --git a/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h b/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h index 3900ea62cdddbf5e5dca98f4a70fb4fb19389dd0..95cdc8f2f9e73bbdc0c2ce556563ef016c1c103c 100644 --- 
a/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h +++ b/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_DETAIL_NEACTIVATION_FUNCTION_DETAIL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute @@ -158,8 +159,7 @@ struct logistic * * @param[in] act_info Activation layer information. */ - explicit logistic(ActivationLayerInfo act_info) - : vone(wrapper::vdup_n(static_cast(1), ExactTagType{})) + explicit logistic(ActivationLayerInfo act_info) : vone(wrapper::vdup_n(static_cast(1), ExactTagType{})) { ARM_COMPUTE_UNUSED(act_info); } @@ -198,8 +198,7 @@ struct relu * * @param[in] act_info Activation layer information. */ - explicit relu(ActivationLayerInfo act_info) - : vzero(wrapper::vdup_n(static_cast(0), ExactTagType{})) + explicit relu(ActivationLayerInfo act_info) : vzero(wrapper::vdup_n(static_cast(0), ExactTagType{})) { ARM_COMPUTE_UNUSED(act_info); } diff --git a/src/core/NEON/kernels/detail/NEColorConvertHelper.inl b/src/core/NEON/kernels/detail/NEColorConvertHelper.inl index ac196d9dbb2ec30bab3fc7ef3d0c6657680a96ab..50fff04cad244a1f57e989dd0cefbedab1876d4b 100644 --- a/src/core/NEON/kernels/detail/NEColorConvertHelper.inl +++ b/src/core/NEON/kernels/detail/NEColorConvertHelper.inl @@ -25,6 +25,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IMultiImage.h" #include "arm_compute/core/Utils.h" + #include "src/core/NEON/NEMath.h" #include @@ -50,8 +51,12 @@ constexpr float rgb2u8_red_coef = 0.2126f; constexpr float rgb2u8_green_coef = 0.7152f; constexpr float rgb2u8_blue_coef = 0.0722f; -inline float32x4_t rgb_to_greyscale_calculation(const float32x4_t &rcolor, const float32x4_t &gcolor, const float32x4_t &bcolor, - const float rcoef, const float gcoef, const float bcoef) +inline float32x4_t rgb_to_greyscale_calculation(const float32x4_t &rcolor, + const float32x4_t &gcolor, + const float32x4_t &bcolor, + const float rcoef, + const float gcoef, + const float bcoef) { float32x4_t greyscale = vmulq_n_f32(rcolor, rcoef); greyscale = vmlaq_n_f32(greyscale, gcolor, gcoef); @@ -86,8 +91,12 @@ inline void rgb_to_u8_conversion(const uint8x16x3_t &in, uint8x16_t &out) arm_compute::convert_float32x4x4_to_uint8x16(out_float32, out); } -inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &gvec, const float32x4_t &bvec, - float32x4_t &yvec, float32x4_t &uvec, float32x4_t &vvec) +inline void rgb_to_yuv_calculation(const float32x4_t &rvec, + const float32x4_t &gvec, + const float32x4_t &bvec, + float32x4_t &yvec, + float32x4_t &uvec, + float32x4_t &vvec) { /* Y'= 0.2126*R' + 0.7152*G' + 0.0722*B' @@ -110,8 +119,12 @@ inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &g vvec = vmlaq_n_f32(c128, vvec, rgb2yuv_bt709_cv); } -inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uvec_val, const float32x4_t &yyvec_val, - float32x4_t vvec_val, unsigned char *output_ptr, const bool alpha) +inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, + float32x4_t uvec_val, + const float32x4_t &yyvec_val, + float32x4_t vvec_val, + unsigned char *output_ptr, + const bool alpha) { float32x4x3_t rgb1, rgb2; @@ -126,8 +139,7 @@ inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uve // b = 1.8556f*f_u + 0.0000f*f_v; const auto red = vmulq_n_f32(vvec_val, red_coef_bt709); const auto blue = vmulq_n_f32(uvec_val, blue_coef_bt709); - const auto green = 
vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709), - vmulq_n_f32(vvec_val, green_coef2_bt709)); + const auto green = vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709), vmulq_n_f32(vvec_val, green_coef2_bt709)); // Compute the final r,g,b values using y1 for the first texel and y2 for the second one. // the result is stored in two float32x4x3_t which then are converted to one uint8x8x3_t @@ -144,7 +156,7 @@ inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uve uint8x8x3_t u8_rgb; arm_compute::convert_float32x4x3_to_uint8x8x3(rgb1, rgb2, u8_rgb); - if(!alpha) + if (!alpha) { vst3_lane_u8(&output_ptr[0], u8_rgb, 0); vst3_lane_u8(&output_ptr[3], u8_rgb, 4); @@ -177,7 +189,7 @@ inline uint8x16x3_t load_rgb(const unsigned char *const ptr, const bool alpha) { uint8x16x3_t rgb; - if(alpha) + if (alpha) { const auto tmp = vld4q_u8(ptr); rgb.val[0] = tmp.val[0]; @@ -206,12 +218,12 @@ inline void rgb_to_yuv_conversion(uint8x16x3_t &vec_top, uint8x16x3_t &vec_botto float32x4x4_t fyvec_top, fuvec_top, fvvec_top; float32x4x4_t fyvec_bottom, fuvec_bottom, fvvec_bottom; - for(auto i = 0; i < 4; ++i) + for (auto i = 0; i < 4; ++i) { - rgb_to_yuv_calculation(frvec_top.val[i], fgvec_top.val[i], fbvec_top.val[i], - fyvec_top.val[i], fuvec_top.val[i], fvvec_top.val[i]); - rgb_to_yuv_calculation(frvec_bottom.val[i], fgvec_bottom.val[i], fbvec_bottom.val[i], - fyvec_bottom.val[i], fuvec_bottom.val[i], fvvec_bottom.val[i]); + rgb_to_yuv_calculation(frvec_top.val[i], fgvec_top.val[i], fbvec_top.val[i], fyvec_top.val[i], fuvec_top.val[i], + fvvec_top.val[i]); + rgb_to_yuv_calculation(frvec_bottom.val[i], fgvec_bottom.val[i], fbvec_bottom.val[i], fyvec_bottom.val[i], + fuvec_bottom.val[i], fvvec_bottom.val[i]); } arm_compute::convert_float32x4x4_to_uint8x16(fyvec_top, vec_top.val[0]); @@ -222,9 +234,14 @@ inline void rgb_to_yuv_conversion(uint8x16x3_t &vec_top, uint8x16x3_t &vec_botto arm_compute::convert_float32x4x4_to_uint8x16(fvvec_bottom, vec_bottom.val[2]); } -inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top, - const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom, - unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom, +inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, + const uint8x16_t &gvec_top, + const uint8x16_t &bvec_top, + const uint8x16_t &rvec_bottom, + const uint8x16_t &gvec_bottom, + const uint8x16_t &bvec_bottom, + unsigned char *const __restrict out_y_top, + unsigned char *const __restrict out_y_bottom, unsigned char *const __restrict out_uv) { uint8x16x3_t vec_top, vec_bottom; @@ -252,9 +269,14 @@ inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec vst2_u8(out_uv, uvvec); } -inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top, - const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom, - unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom, +inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, + const uint8x16_t &gvec_top, + const uint8x16_t &bvec_top, + const uint8x16_t &rvec_bottom, + const uint8x16_t &gvec_bottom, + const uint8x16_t &bvec_bottom, + unsigned char *const __restrict out_y_top, + unsigned char *const __restrict out_y_bottom, unsigned char *const __restrict out_u, unsigned char *const __restrict out_v) { @@ -273,14 +295,16 @@ inline void 
store_rgb_to_iyuv(const uint8x16_t &rvec_top, const uint8x16_t &gvec const auto uvvec_top = vuzpq_u8(vec_top.val[1], vec_top.val[2]); const auto uvvec_bottom = vuzpq_u8(vec_bottom.val[1], vec_bottom.val[2]); - const auto uvvec = vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]), - vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1])); + const auto uvvec = + vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]), vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1])); vst1_u8(out_u, vget_low_u8(uvvec)); vst1_u8(out_v, vget_high_u8(uvvec)); } -inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, const uint8x16_t &bvec, +inline void store_rgb_to_yuv4(const uint8x16_t &rvec, + const uint8x16_t &gvec, + const uint8x16_t &bvec, unsigned char *const __restrict out_y, unsigned char *const __restrict out_u, unsigned char *const __restrict out_v) @@ -291,10 +315,9 @@ inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, co const float32x4x4_t fbvec = arm_compute::convert_uint8x16_to_float32x4x4(bvec); float32x4x4_t fyvec, fuvec, fvvec; - for(auto i = 0; i < 4; ++i) + for (auto i = 0; i < 4; ++i) { - rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i], - fyvec.val[i], fuvec.val[i], fvvec.val[i]); + rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i], fyvec.val[i], fuvec.val[i], fvvec.val[i]); } uint8x16_t yvec, uvec, vvec; @@ -307,7 +330,7 @@ inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, co vst1q_u8(out_v, vvec); } #endif /* DOXYGEN_SKIP_THIS */ -} +} // namespace namespace arm_compute { @@ -329,17 +352,19 @@ void colorconvert_rgb_to_rgbx(const void *__restrict input, void *__restrict out Iterator in(input_ptr, win); Iterator out(output_ptr, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta1 = vld3q_u8(in.ptr()); - uint8x16x4_t ta2; - ta2.val[0] = ta1.val[0]; - ta2.val[1] = ta1.val[1]; - ta2.val[2] = ta1.val[2]; - ta2.val[3] = vdupq_n_u8(255); - vst4q_u8(out.ptr(), ta2); - }, - in, out); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta1 = vld3q_u8(in.ptr()); + uint8x16x4_t ta2; + ta2.val[0] = ta1.val[0]; + ta2.val[1] = ta1.val[1]; + ta2.val[2] = ta1.val[2]; + ta2.val[3] = vdupq_n_u8(255); + vst4q_u8(out.ptr(), ta2); + }, + in, out); } /** Convert RGB to U8. @@ -360,14 +385,16 @@ void colorconvert_rgb_to_u8(const void *__restrict input, void *__restrict outpu Iterator in(input_ptr, win); Iterator out(output_ptr, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta1 = vld3q_u8(in.ptr()); - uint8x16_t ta2; - rgb_to_u8_conversion(ta1, ta2); - vst1q_u8(out.ptr(), ta2); - }, - in, out); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta1 = vld3q_u8(in.ptr()); + uint8x16_t ta2; + rgb_to_u8_conversion(ta1, ta2); + vst1q_u8(out.ptr(), ta2); + }, + in, out); } /** Convert RGBX to RGB. 
@@ -388,16 +415,18 @@ void colorconvert_rgbx_to_rgb(const void *input, void *output, const Window &win Iterator in(input_ptr, win); Iterator out(output_ptr, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta1 = vld4q_u8(in.ptr()); - uint8x16x3_t ta2; - ta2.val[0] = ta1.val[0]; - ta2.val[1] = ta1.val[1]; - ta2.val[2] = ta1.val[2]; - vst3q_u8(out.ptr(), ta2); - }, - in, out); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta1 = vld4q_u8(in.ptr()); + uint8x16x3_t ta2; + ta2.val[0] = ta1.val[0]; + ta2.val[1] = ta1.val[1]; + ta2.val[2] = ta1.val[2]; + vst3q_u8(out.ptr(), ta2); + }, + in, out); } /** Convert YUYV to RGB. @@ -422,26 +451,32 @@ void colorconvert_yuyv_to_rgb(const void *__restrict input, void *__restrict out Iterator in(input_ptr, win); Iterator out(output_ptr, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta = vld4q_u8(in.ptr()); - //ta.val[0] = Y0 Y2 Y4 Y6 ... - //ta.val[1] = U0 U2 U4 U6 ... - //ta.val[2] = Y1 Y3 Y5 Y7 ... - //ta.val[3] = V0 V2 V4 V7 ... - - // Convert the uint8x16x4_t to float32x4x4_t - const float32x4x4_t yvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[0 + shift]); - const float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[1 - shift]); - const float32x4x4_t yyvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[2 + shift]); - const float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[3 - shift]); - - yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha); - yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha); - yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha); - yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha); - }, - in, out); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta = vld4q_u8(in.ptr()); + //ta.val[0] = Y0 Y2 Y4 Y6 ... + //ta.val[1] = U0 U2 U4 U6 ... + //ta.val[2] = Y1 Y3 Y5 Y7 ... + //ta.val[3] = V0 V2 V4 V7 ... + + // Convert the uint8x16x4_t to float32x4x4_t + const float32x4x4_t yvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[0 + shift]); + const float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[1 - shift]); + const float32x4x4_t yyvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[2 + shift]); + const float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[3 - shift]); + + yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size, + alpha); + yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size, + alpha); + yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size, + alpha); + yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size, + alpha); + }, + in, out); } /** Convert NV12 to RGB. 
@@ -475,35 +510,45 @@ void colorconvert_nv12_to_rgb(const void *__restrict input, void *__restrict out Iterator in_uv(input_ptr->plane(1), win_uv); Iterator out(output_ptr, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_y_top = vld2q_u8(in_y.ptr()); - const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); - const auto ta_uv = vld2q_u8(in_uv.ptr()); - //ta_y.val[0] = Y0 Y2 Y4 Y6 ... - //ta_y.val[1] = Y1 Y3 Y5 Y7 ... - //ta_uv.val[0] = U0 U2 U4 U6 ... - //ta_uv.val[1] = V0 V2 V4 V6 ... - - // Convert the uint8x16x4_t to float32x4x4_t - float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]); - float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]); - float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]); - float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]); - float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift]); - float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift]); - - yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha); - - yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha); - }, - in_y, in_uv, out); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + const auto ta_uv = vld2q_u8(in_uv.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_uv.val[0] = U0 U2 U4 U6 ... + //ta_uv.val[1] = V0 V2 V4 V6 ... 
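For reference, the BT.709 maths that yuyv_to_rgb_calculation vectorises can be written out per pixel. The sketch below is illustrative only and is not part of the patch; it assumes full-range 8-bit samples with the usual +128 chroma bias, and uses the standard BT.709 coefficients that red_coef_bt709 and the related constants earlier in this file should hold. In NV12, each (U, V) pair of the half-resolution interleaved ta_uv plane is reused for a 2x2 block of luma taken from the top and bottom Y rows loaded here.

#include <cstdint>

// Illustrative scalar reference for full-range BT.709 YUV -> RGB (not part of the patch).
static inline uint8_t clamp_u8(float v)
{
    return static_cast<uint8_t>(v < 0.f ? 0.f : (v > 255.f ? 255.f : v));
}

static inline void yuv_to_rgb_bt709(uint8_t y, uint8_t u, uint8_t v, uint8_t &r, uint8_t &g, uint8_t &b)
{
    const float fy = static_cast<float>(y);
    const float fu = static_cast<float>(u) - 128.f; // centre chroma
    const float fv = static_cast<float>(v) - 128.f;

    r = clamp_u8(fy + 1.5748f * fv);
    g = clamp_u8(fy - 0.1873f * fu - 0.4681f * fv);
    b = clamp_u8(fy + 1.8556f * fu);
}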
+ + // Convert the uint8x16x4_t to float32x4x4_t + float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]); + float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]); + float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]); + float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]); + float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift]); + float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift]); + + yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], + out.ptr() + 0 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], + out.ptr() + 1 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], + out.ptr() + 2 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], + out.ptr() + 3 * element_size, alpha); + + yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], + out.ptr() + out_stride + 0 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], + out.ptr() + out_stride + 1 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], + out.ptr() + out_stride + 2 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], + out.ptr() + out_stride + 3 * element_size, alpha); + }, + in_y, in_uv, out); } /** Convert IYUV to RGB. @@ -537,59 +582,71 @@ void colorconvert_iyuv_to_rgb(const void *__restrict input, void *__restrict out Iterator in_v(input_ptr->plane(2), win_uv); Iterator out(output_ptr, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto *y_top_ptr = in_y.ptr(); - const auto *y_bottom_ptr = in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y(); - const auto *u_ptr = in_u.ptr(); - const auto *v_ptr = in_v.ptr(); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto *y_top_ptr = in_y.ptr(); + const auto *y_bottom_ptr = in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y(); + const auto *u_ptr = in_u.ptr(); + const auto *v_ptr = in_v.ptr(); // Work-around issue in gcc 9(>=) where vld2q might cause issues with register allocation #if defined(__arch64__) - const auto ta0_y_top = vld1q_u8(y_top_ptr); - const auto ta1_y_top = vld1q_u8(y_top_ptr + 16); - const auto ta0_y_bottom = vld1q_u8(y_bottom_ptr); - const auto ta1_y_bottom = vld1q_u8(y_bottom_ptr + 16); - const auto ta_u = vld1q_u8(u_ptr); - const auto ta_v = vld1q_u8(v_ptr); - - // Convert the uint8x16x4_t to float32x4x4_t - float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_top, ta1_y_top)); - float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_top, ta1_y_top)); - float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_bottom, ta1_y_bottom)); - float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_bottom, ta1_y_bottom)); - float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u); - float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v); + const auto ta0_y_top = vld1q_u8(y_top_ptr); + const auto 
ta1_y_top = vld1q_u8(y_top_ptr + 16); + const auto ta0_y_bottom = vld1q_u8(y_bottom_ptr); + const auto ta1_y_bottom = vld1q_u8(y_bottom_ptr + 16); + const auto ta_u = vld1q_u8(u_ptr); + const auto ta_v = vld1q_u8(v_ptr); + + // Convert the uint8x16x4_t to float32x4x4_t + float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_top, ta1_y_top)); + float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_top, ta1_y_top)); + float32x4x4_t yvec_bottom = + arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_bottom, ta1_y_bottom)); + float32x4x4_t yyvec_bottom = + arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_bottom, ta1_y_bottom)); + float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u); + float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v); #else /* defined(__arch64__) */ - const auto ta_y_top = vld2q_u8(y_top_ptr); - const auto ta_y_bottom = vld2q_u8(y_bottom_ptr); - const auto ta_u = vld1q_u8(u_ptr); - const auto ta_v = vld1q_u8(v_ptr); - //ta_y.val[0] = Y0 Y2 Y4 Y6 ... - //ta_y.val[1] = Y1 Y3 Y5 Y7 ... - //ta_u.val[0] = U0 U2 U4 U6 ... - //ta_v.val[0] = V0 V2 V4 V6 ... - - // Convert the uint8x16x4_t to float32x4x4_t - float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]); - float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]); - float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]); - float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]); - float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u); - float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v); + const auto ta_y_top = vld2q_u8(y_top_ptr); + const auto ta_y_bottom = vld2q_u8(y_bottom_ptr); + const auto ta_u = vld1q_u8(u_ptr); + const auto ta_v = vld1q_u8(v_ptr); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_u.val[0] = U0 U2 U4 U6 ... + //ta_v.val[0] = V0 V2 V4 V6 ... 
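The #if defined(__arch64__) branch above (note the spelling, versus the usual __aarch64__ predefine) works around the gcc 9 vld2q register-allocation issue by replacing one de-interleaving load with two plain vld1q_u8 loads followed by vuzp1q/vuzp2q. A minimal sketch of that equivalence, using only AArch64 NEON intrinsics and written as an illustration rather than taken from the patch:

#include <arm_neon.h>

// On AArch64, two contiguous vld1q_u8 loads plus vuzp1q/vuzp2q produce the
// same even/odd split as a single vld2q_u8.
void deinterleave_even_odd(const uint8_t *ptr, uint8x16_t &even, uint8x16_t &odd)
{
    const uint8x16_t lo = vld1q_u8(ptr);      // bytes 0..15
    const uint8x16_t hi = vld1q_u8(ptr + 16); // bytes 16..31

    even = vuzp1q_u8(lo, hi); // bytes 0,2,...,30  == vld2q_u8(ptr).val[0]
    odd  = vuzp2q_u8(lo, hi); // bytes 1,3,...,31  == vld2q_u8(ptr).val[1]
}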
+ + // Convert the uint8x16x4_t to float32x4x4_t + float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]); + float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]); + float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]); + float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]); + float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u); + float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v); #endif /* defined(__arch64__) */ - yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha); - - yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha); - }, - in_y, in_u, in_v, out); + yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], + out.ptr() + 0 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], + out.ptr() + 1 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], + out.ptr() + 2 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], + out.ptr() + 3 * element_size, alpha); + + yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], + out.ptr() + out_stride + 0 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], + out.ptr() + out_stride + 1 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], + out.ptr() + out_stride + 2 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], + out.ptr() + out_stride + 3 * element_size, alpha); + }, + in_y, in_u, in_v, out); } /** Convert YUYV to NV12. @@ -621,31 +678,33 @@ void colorconvert_yuyv_to_nv12(const void *__restrict input, void *__restrict ou Iterator out_y(output_ptr->plane(0), win); Iterator out_uv(output_ptr->plane(1), win_uv); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_top = vld4q_u8(in.ptr()); - const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y()); - //ta.val[0] = Y0 Y2 Y4 Y6 ... - //ta.val[1] = U0 U2 U4 U6 ... - //ta.val[2] = Y1 Y3 Y5 Y7 ... - //ta.val[3] = V0 V2 V4 V7 ... 
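// Illustrative sketch of the per-pixel math behind the yuyv_to_rgb_calculation()
// calls above: two luma samples share one (U, V) pair, and each is converted with
// the usual Y'CbCr -> RGB form R = Y + a*(V-128), G = Y + b*(U-128) + c*(V-128),
// B = Y + d*(U-128). The coefficients below are the common BT.709 values and are
// only illustrative; the library defines its own constants elsewhere in this file.
#include <algorithm>
#include <cstdint>

struct RgbPixel { uint8_t r, g, b; };

static inline uint8_t clamp_u8(float v)
{
    return static_cast<uint8_t>(std::min(255.0f, std::max(0.0f, v)));
}

static inline RgbPixel yuv_to_rgb(float y, float u, float v)
{
    const float du = u - 128.0f;
    const float dv = v - 128.0f;
    RgbPixel px;
    px.r = clamp_u8(y + 1.5748f * dv);
    px.g = clamp_u8(y - 0.1873f * du - 0.4681f * dv);
    px.b = clamp_u8(y + 1.8556f * du);
    return px;
}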
- - uint8x16x2_t yvec; - yvec.val[0] = ta_top.val[0 + shift]; - yvec.val[1] = ta_top.val[2 + shift]; - vst2q_u8(out_y.ptr(), yvec); - - uint8x16x2_t yyvec; - yyvec.val[0] = ta_bottom.val[0 + shift]; - yyvec.val[1] = ta_bottom.val[2 + shift]; - vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec); - - uint8x16x2_t uvvec; - uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]); - uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]); - vst2q_u8(out_uv.ptr(), uvvec); - }, - in, out_y, out_uv); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_top = vld4q_u8(in.ptr()); + const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y()); + //ta.val[0] = Y0 Y2 Y4 Y6 ... + //ta.val[1] = U0 U2 U4 U6 ... + //ta.val[2] = Y1 Y3 Y5 Y7 ... + //ta.val[3] = V0 V2 V4 V7 ... + + uint8x16x2_t yvec; + yvec.val[0] = ta_top.val[0 + shift]; + yvec.val[1] = ta_top.val[2 + shift]; + vst2q_u8(out_y.ptr(), yvec); + + uint8x16x2_t yyvec; + yyvec.val[0] = ta_bottom.val[0 + shift]; + yyvec.val[1] = ta_bottom.val[2 + shift]; + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec); + + uint8x16x2_t uvvec; + uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]); + uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]); + vst2q_u8(out_uv.ptr(), uvvec); + }, + in, out_y, out_uv); } /** Convert IYUV to NV12. @@ -676,23 +735,25 @@ void colorconvert_iyuv_to_nv12(const void *__restrict input, void *__restrict ou Iterator out_y(output_ptr->plane(0), win); Iterator out_uv(output_ptr->plane(1), win_uv); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_y_top = vld2q_u8(in_y.ptr()); - const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); - uint8x16x2_t ta_uv; - ta_uv.val[0] = vld1q_u8(in_u.ptr()); - ta_uv.val[1] = vld1q_u8(in_v.ptr()); - //ta_y.val[0] = Y0 Y2 Y4 Y6 ... - //ta_y.val[1] = Y1 Y3 Y5 Y7 ... - //ta_uv.val[0] = U0 U2 U4 U6 ... - //ta_uv.val[1] = V0 V2 V4 V6 ... - - vst2q_u8(out_y.ptr(), ta_y_top); - vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); - vst2q_u8(out_uv.ptr(), ta_uv); - }, - in_y, in_u, in_v, out_y, out_uv); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + uint8x16x2_t ta_uv; + ta_uv.val[0] = vld1q_u8(in_u.ptr()); + ta_uv.val[1] = vld1q_u8(in_v.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_uv.val[0] = U0 U2 U4 U6 ... + //ta_uv.val[1] = V0 V2 V4 V6 ... + + vst2q_u8(out_y.ptr(), ta_y_top); + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); + vst2q_u8(out_uv.ptr(), ta_uv); + }, + in_y, in_u, in_v, out_y, out_uv); } /** Convert NV12 to IYUV. @@ -726,22 +787,24 @@ void colorconvert_nv12_to_iyuv(const void *__restrict input, void *__restrict ou Iterator out_u(output_ptr->plane(1), win_uv); Iterator out_v(output_ptr->plane(2), win_uv); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_y_top = vld2q_u8(in_y.ptr()); - const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); - const auto ta_uv = vld2q_u8(in_uv.ptr()); - //ta_y.val[0] = Y0 Y2 Y4 Y6 ... - //ta_y.val[1] = Y1 Y3 Y5 Y7 ... - //ta_uv.val[0] = U0 U2 U4 U6 ... 
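// Illustrative sketch of the UV handling in the NV12 -> IYUV path above: the
// semi-planar UV plane is de-interleaved with vld2q_u8 and written out as two
// planar rows, with `shift` (0 for NV12, 1 for NV21) selecting which half holds
// U and which holds V. Standalone example, assumes <arm_neon.h>.
#include <arm_neon.h>
#include <cstdint>

static inline void split_uv_row(const uint8_t *uv, uint8_t *u, uint8_t *v, int shift /* 0: NV12, 1: NV21 */)
{
    // uv: U0 V0 U1 V1 ... (NV12) or V0 U0 V1 U1 ... (NV21); 32 bytes consumed.
    const uint8x16x2_t ta = vld2q_u8(uv); // ta.val[0] = even bytes, ta.val[1] = odd bytes
    vst1q_u8(u, ta.val[0 + shift]);       // 16 U samples
    vst1q_u8(v, ta.val[1 - shift]);       // 16 V samples
}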
- //ta_uv.val[1] = V0 V2 V4 V6 ... - - vst2q_u8(out_y.ptr(), ta_y_top); - vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); - vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]); - vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]); - }, - in_y, in_uv, out_y, out_u, out_v); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + const auto ta_uv = vld2q_u8(in_uv.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_uv.val[0] = U0 U2 U4 U6 ... + //ta_uv.val[1] = V0 V2 V4 V6 ... + + vst2q_u8(out_y.ptr(), ta_y_top); + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); + vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]); + vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]); + }, + in_y, in_uv, out_y, out_u, out_v); } /** Convert YUYV to IYUV. @@ -774,34 +837,36 @@ void colorconvert_yuyv_to_iyuv(const void *__restrict input, void *__restrict ou Iterator out_u(output_ptr->plane(1), win_uv); Iterator out_v(output_ptr->plane(2), win_uv); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_top = vld4q_u8(in.ptr()); - const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y()); - //ta.val[0] = Y0 Y2 Y4 Y6 ... - //ta.val[1] = U0 U2 U4 U6 ... - //ta.val[2] = Y1 Y3 Y5 Y7 ... - //ta.val[3] = V0 V2 V4 V7 ... - - uint8x16x2_t yvec; - yvec.val[0] = ta_top.val[0 + shift]; - yvec.val[1] = ta_top.val[2 + shift]; - vst2q_u8(out_y.ptr(), yvec); - - uint8x16x2_t yyvec; - yyvec.val[0] = ta_bottom.val[0 + shift]; - yyvec.val[1] = ta_bottom.val[2 + shift]; - vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec); - - uint8x16_t uvec; - uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]); - vst1q_u8(out_u.ptr(), uvec); - - uint8x16_t vvec; - vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]); - vst1q_u8(out_v.ptr(), vvec); - }, - in, out_y, out_u, out_v); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_top = vld4q_u8(in.ptr()); + const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y()); + //ta.val[0] = Y0 Y2 Y4 Y6 ... + //ta.val[1] = U0 U2 U4 U6 ... + //ta.val[2] = Y1 Y3 Y5 Y7 ... + //ta.val[3] = V0 V2 V4 V7 ... + + uint8x16x2_t yvec; + yvec.val[0] = ta_top.val[0 + shift]; + yvec.val[1] = ta_top.val[2 + shift]; + vst2q_u8(out_y.ptr(), yvec); + + uint8x16x2_t yyvec; + yyvec.val[0] = ta_bottom.val[0 + shift]; + yyvec.val[1] = ta_bottom.val[2 + shift]; + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec); + + uint8x16_t uvec; + uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]); + vst1q_u8(out_u.ptr(), uvec); + + uint8x16_t vvec; + vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]); + vst1q_u8(out_v.ptr(), vvec); + }, + in, out_y, out_u, out_v); } /** Convert NV12 to YUV4. @@ -835,32 +900,34 @@ void colorconvert_nv12_to_yuv4(const void *__restrict input, void *__restrict ou Iterator out_u(output_ptr->plane(1), win); Iterator out_v(output_ptr->plane(2), win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_y_top = vld2q_u8(in_y.ptr()); - const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); - const auto ta_uv = vld2q_u8(in_uv.ptr()); - //ta_y.val[0] = Y0 Y2 Y4 Y6 ... 
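// Illustrative sketch of the chroma downsampling used in the YUYV -> IYUV/NV12
// paths above: vhaddq_u8 is a halving add, (a + b) >> 1 computed without
// intermediate overflow, so averaging the U (or V) samples of two consecutive
// image rows turns 4:2:2 chroma into the vertically subsampled 4:2:0 layout.
// Standalone example, assumes <arm_neon.h>.
#include <arm_neon.h>
#include <cstdint>

static inline uint8x16_t average_chroma_rows(const uint8_t *u_row_top, const uint8_t *u_row_bottom)
{
    const uint8x16_t top    = vld1q_u8(u_row_top);
    const uint8x16_t bottom = vld1q_u8(u_row_bottom);
    return vhaddq_u8(top, bottom); // truncating average of 16 chroma samples
}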
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ... - //ta_uv.val[0] = U0 U2 U4 U6 ... - //ta_uv.val[1] = V0 V2 V4 V6 ... - - vst2q_u8(out_y.ptr(), ta_y_top); - vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); - - uint8x16x2_t uvec; - uvec.val[0] = ta_uv.val[0 + shift]; - uvec.val[1] = ta_uv.val[0 + shift]; - vst2q_u8(out_u.ptr(), uvec); - vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec); - - uint8x16x2_t vvec; - vvec.val[0] = ta_uv.val[1 - shift]; - vvec.val[1] = ta_uv.val[1 - shift]; - vst2q_u8(out_v.ptr(), vvec); - vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec); - }, - in_y, in_uv, out_y, out_u, out_v); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + const auto ta_uv = vld2q_u8(in_uv.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_uv.val[0] = U0 U2 U4 U6 ... + //ta_uv.val[1] = V0 V2 V4 V6 ... + + vst2q_u8(out_y.ptr(), ta_y_top); + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); + + uint8x16x2_t uvec; + uvec.val[0] = ta_uv.val[0 + shift]; + uvec.val[1] = ta_uv.val[0 + shift]; + vst2q_u8(out_u.ptr(), uvec); + vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec); + + uint8x16x2_t vvec; + vvec.val[0] = ta_uv.val[1 - shift]; + vvec.val[1] = ta_uv.val[1 - shift]; + vst2q_u8(out_v.ptr(), vvec); + vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec); + }, + in_y, in_uv, out_y, out_u, out_v); } /** Convert IYUV to YUV4. @@ -892,33 +959,35 @@ void colorconvert_iyuv_to_yuv4(const void *__restrict input, void *__restrict ou Iterator out_u(output_ptr->plane(1), win); Iterator out_v(output_ptr->plane(2), win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_y_top = vld2q_u8(in_y.ptr()); - const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); - const auto ta_u = vld1q_u8(in_u.ptr()); - const auto ta_v = vld1q_u8(in_v.ptr()); - //ta_y.val[0] = Y0 Y2 Y4 Y6 ... - //ta_y.val[1] = Y1 Y3 Y5 Y7 ... - //ta_u = U0 U2 U4 U6 ... - //ta_v = V0 V2 V4 V6 ... - - vst2q_u8(out_y.ptr(), ta_y_top); - vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); - - uint8x16x2_t uvec; - uvec.val[0] = ta_u; - uvec.val[1] = ta_u; - vst2q_u8(out_u.ptr(), uvec); - vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec); - - uint8x16x2_t vvec; - vvec.val[0] = ta_v; - vvec.val[1] = ta_v; - vst2q_u8(out_v.ptr(), vvec); - vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec); - }, - in_y, in_u, in_v, out_y, out_u, out_v); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + const auto ta_u = vld1q_u8(in_u.ptr()); + const auto ta_v = vld1q_u8(in_v.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_u = U0 U2 U4 U6 ... + //ta_v = V0 V2 V4 V6 ... 
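// Illustrative sketch of the chroma upsampling in the NV12/IYUV -> YUV4 paths
// above and below: storing the same vector in both fields of a uint8x16x2_t via
// vst2q_u8 duplicates every chroma sample horizontally (U0 U0 U1 U1 ...), and
// writing the same data to two consecutive output rows duplicates it vertically,
// turning 4:2:0 chroma into full-resolution 4:4:4. Assumes <arm_neon.h>.
#include <arm_neon.h>
#include <cstddef>
#include <cstdint>

static inline void upsample_chroma_row(const uint8_t *u_in, uint8_t *u_out, size_t out_stride)
{
    const uint8x16_t u = vld1q_u8(u_in); // 16 subsampled U values

    uint8x16x2_t dup;
    dup.val[0] = u;
    dup.val[1] = u;
    vst2q_u8(u_out, dup);              // row 2y:   U0 U0 U1 U1 ... (32 bytes)
    vst2q_u8(u_out + out_stride, dup); // row 2y+1: the same 32 bytes again
}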
+ + vst2q_u8(out_y.ptr(), ta_y_top); + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); + + uint8x16x2_t uvec; + uvec.val[0] = ta_u; + uvec.val[1] = ta_u; + vst2q_u8(out_u.ptr(), uvec); + vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec); + + uint8x16x2_t vvec; + vvec.val[0] = ta_v; + vvec.val[1] = ta_v; + vst2q_u8(out_v.ptr(), vvec); + vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec); + }, + in_y, in_u, in_v, out_y, out_u, out_v); } /** Convert RGB to NV12. @@ -948,20 +1017,21 @@ void colorconvert_rgb_to_nv12(const void *__restrict input, void *__restrict out Iterator out_y(output_ptr->plane(0), win); Iterator out_uv(output_ptr->plane(1), win_uv); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_rgb_top = load_rgb(in.ptr(), alpha); - const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha); - //ta_rgb.val[0] = R0 R1 R2 R3 ... - //ta_rgb.val[1] = G0 G1 G2 G3 ... - //ta_rgb.val[2] = B0 B1 B2 B3 ... - - store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], - ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], - out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), - out_uv.ptr()); - }, - in, out_y, out_uv); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_rgb_top = load_rgb(in.ptr(), alpha); + const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha); + //ta_rgb.val[0] = R0 R1 R2 R3 ... + //ta_rgb.val[1] = G0 G1 G2 G3 ... + //ta_rgb.val[2] = B0 B1 B2 B3 ... + + store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], ta_rgb_bottom.val[0], + ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], out_y.ptr(), + out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), out_uv.ptr()); + }, + in, out_y, out_uv); } /** Convert RGB to IYUV. @@ -992,20 +1062,22 @@ void colorconvert_rgb_to_iyuv(const void *__restrict input, void *__restrict out Iterator out_u(output_ptr->plane(1), win_uv); Iterator out_v(output_ptr->plane(2), win_uv); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_rgb_top = load_rgb(in.ptr(), alpha); - const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha); - //ta_rgb.val[0] = R0 R1 R2 R3 ... - //ta_rgb.val[1] = G0 G1 G2 G3 ... - //ta_rgb.val[2] = B0 B1 B2 B3 ... - - store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], - ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], - out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), - out_u.ptr(), out_v.ptr()); - }, - in, out_y, out_u, out_v); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_rgb_top = load_rgb(in.ptr(), alpha); + const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha); + //ta_rgb.val[0] = R0 R1 R2 R3 ... + //ta_rgb.val[1] = G0 G1 G2 G3 ... + //ta_rgb.val[2] = B0 B1 B2 B3 ... + + store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], ta_rgb_bottom.val[0], + ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], out_y.ptr(), + out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), out_u.ptr(), + out_v.ptr()); + }, + in, out_y, out_u, out_v); } /** Convert RGB to YUV4. 
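// Illustrative sketch of the math behind the store_rgb_to_nv12/iyuv/yuv4()
// helpers used above: every pixel gets a luma value, and for the subsampled
// formats one (U, V) pair is produced per 2x2 block by averaging. The
// coefficients are the common BT.709 values and are only illustrative; the
// library defines its own constants elsewhere in this file.
#include <cstdint>

struct Yuv { float y, u, v; };

static inline Yuv rgb_to_yuv(float r, float g, float b)
{
    Yuv out;
    out.y = 0.2126f * r + 0.7152f * g + 0.0722f * b;
    out.u = (b - out.y) / 1.8556f + 128.0f; // Cb
    out.v = (r - out.y) / 1.5748f + 128.0f; // Cr
    return out;
}

// One chroma pair per 2x2 block: average the four per-pixel (U, V) values.
static inline void average_chroma_2x2(const Yuv px[4], float &u_out, float &v_out)
{
    u_out = 0.25f * (px[0].u + px[1].u + px[2].u + px[3].u);
    v_out = 0.25f * (px[0].v + px[1].v + px[2].v + px[3].v);
}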
@@ -1030,16 +1102,17 @@ void colorconvert_rgb_to_yuv4(const void *__restrict input, void *__restrict out Iterator out_u(output_ptr->plane(1), win); Iterator out_v(output_ptr->plane(2), win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_rgb = load_rgb(in.ptr(), alpha); - //ta_rgb.val[0] = R0 R1 R2 R3 ... - //ta_rgb.val[1] = G0 G1 G2 G3 ... - //ta_rgb.val[2] = B0 B1 B2 B3 ... - - store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2], - out_y.ptr(), out_u.ptr(), out_v.ptr()); - }, - in, out_y, out_u, out_v); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_rgb = load_rgb(in.ptr(), alpha); + //ta_rgb.val[0] = R0 R1 R2 R3 ... + //ta_rgb.val[1] = G0 G1 G2 G3 ... + //ta_rgb.val[2] = B0 B1 B2 B3 ... + + store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2], out_y.ptr(), out_u.ptr(), out_v.ptr()); + }, + in, out_y, out_u, out_v); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h b/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h index 96defbc9c931d7d031122e66bd33dadefd152f2b..4b1eb079b25be61504d47675a0f2a686bdf50d8f 100644 --- a/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h +++ b/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h @@ -33,56 +33,32 @@ namespace detail { inline float32x4x3_t load_matrix_row(const float *ptr) { - const float32x4x3_t r = - { - { - vld1q_dup_f32(ptr), - vld1q_dup_f32(1 + ptr), - vld1q_dup_f32(2 + ptr) - } - }; + const float32x4x3_t r = {{vld1q_dup_f32(ptr), vld1q_dup_f32(1 + ptr), vld1q_dup_f32(2 + ptr)}}; return r; } template -float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2); +float32x4x2_t convolve_3x3(const float *in_top, + const float *in_mid, + const float *in_low, + const float32x4x3_t &m0, + const float32x4x3_t &m1, + const float32x4x3_t &m2); template <> -inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2) +inline float32x4x2_t convolve_3x3<1>(const float *in_top, + const float *in_mid, + const float *in_low, + const float32x4x3_t &m0, + const float32x4x3_t &m1, + const float32x4x3_t &m2) { - const float32x4x3_t vtop = - { - { - vld1q_f32(in_top), - vld1q_f32(in_top + 4), - vld1q_f32(in_top + 8) - } - }; - const float32x4x3_t vmid = - { - { - vld1q_f32(in_mid), - vld1q_f32(in_mid + 4), - vld1q_f32(in_mid + 8) - } - }; - const float32x4x3_t vlow = - { - { - vld1q_f32(in_low), - vld1q_f32(in_low + 4), - vld1q_f32(in_low + 8) - } - }; - float32x4x2_t out = - { - { - vmulq_f32(vtop.val[0], m0.val[0]), - vmulq_f32(vtop.val[1], m0.val[0]) - } - }; - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]); + const float32x4x3_t vtop = {{vld1q_f32(in_top), vld1q_f32(in_top + 4), vld1q_f32(in_top + 8)}}; + const float32x4x3_t vmid = {{vld1q_f32(in_mid), vld1q_f32(in_mid + 4), vld1q_f32(in_mid + 8)}}; + const float32x4x3_t vlow = {{vld1q_f32(in_low), vld1q_f32(in_low + 4), vld1q_f32(in_low + 8)}}; + float32x4x2_t out = {{vmulq_f32(vtop.val[0], m0.val[0]), vmulq_f32(vtop.val[1], m0.val[0])}}; + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]); out.val[0] = 
vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]); out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]); @@ -106,7 +82,12 @@ inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, c } template <> -inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2) +inline float32x4x2_t convolve_3x3<2>(const float *in_top, + const float *in_mid, + const float *in_low, + const float32x4x3_t &m0, + const float32x4x3_t &m1, + const float32x4x3_t &m2) { float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1); @@ -116,7 +97,12 @@ inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, c } template <> -inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2) +inline float32x4x2_t convolve_3x3<3>(const float *in_top, + const float *in_mid, + const float *in_low, + const float32x4x3_t &m0, + const float32x4x3_t &m1, + const float32x4x3_t &m2) { float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); @@ -165,6 +151,6 @@ int get_input_num_elems_processed<3>(unsigned int num_elems_written_per_iteratio { return num_elems_written_per_iteration * 3; } -} +} // namespace detail } // namespace arm_compute -#endif /* ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H */ diff --git a/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h b/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h index 7ba52a16b78d918d2b41d7b536b0f09251425157..fd1ee545974aa479cb9015ee03d63b4801d48235 100644 --- a/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h +++ b/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h @@ -45,14 +45,7 @@ namespace detail inline float32x4x3_t load_matrix_row(const float *ptr, int weights_offset = 0) { ARM_COMPUTE_UNUSED(weights_offset); - const float32x4x3_t r = - { - { - vld1q_dup_f32(ptr), - vld1q_dup_f32(1 + ptr), - vld1q_dup_f32(2 + ptr) - } - }; + const float32x4x3_t r = {{vld1q_dup_f32(ptr), vld1q_dup_f32(1 + ptr), vld1q_dup_f32(2 + ptr)}}; return r; } @@ -63,21 +56,16 @@ inline float32x4x3_t load_matrix_row(const float *ptr, int weights_offset = 0) * * @return The loaded matrix. 
*/ -template < typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same::value || std::is_same::value) > +template ::value || std::is_same::value)> inline int32x4x3_t load_matrix_row(const T *ptr, int weights_offset = 0) { const int32x4_t v_weights_offset = vdupq_n_s32(weights_offset); /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes: r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */ - int32x4x3_t r = - { - { - vaddq_s32(v_weights_offset, vdupq_n_s32(*ptr)), - vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 1))), - vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 2))) - } - }; + int32x4x3_t r = {{vaddq_s32(v_weights_offset, vdupq_n_s32(*ptr)), + vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 1))), + vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 2)))}}; return r; } @@ -245,36 +233,23 @@ inline void accumulate_results<3>(float16_t *buffer, const float16x8x2_t &values * @param[in] input_offset (Optional) Input quantization offset. * */ -inline float32x4_t single_convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low, - const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - const size_t dilation_x, int input_offset) +inline float32x4_t single_convolve_3x3_dilation(const float *in_top, + const float *in_mid, + const float *in_low, + const float32x4x3_t &m0, + const float32x4x3_t &m1, + const float32x4x3_t &m2, + const size_t dilation_x, + int input_offset) { ARM_COMPUTE_UNUSED(input_offset); - const float32x4x3_t vtop = - { - { - vld1q_f32(in_top), - vld1q_f32(in_top + dilation_x), - vld1q_f32(in_top + 2 * dilation_x) - } - }; - const float32x4x3_t vmid = - { - { - vld1q_f32(in_mid), - vld1q_f32(in_mid + dilation_x), - vld1q_f32(in_mid + 2 * dilation_x) - } - }; - const float32x4x3_t vlow = - { - { - vld1q_f32(in_low), - vld1q_f32(in_low + dilation_x), - vld1q_f32(in_low + 2 * dilation_x) - } - }; + const float32x4x3_t vtop = { + {vld1q_f32(in_top), vld1q_f32(in_top + dilation_x), vld1q_f32(in_top + 2 * dilation_x)}}; + const float32x4x3_t vmid = { + {vld1q_f32(in_mid), vld1q_f32(in_mid + dilation_x), vld1q_f32(in_mid + 2 * dilation_x)}}; + const float32x4x3_t vlow = { + {vld1q_f32(in_low), vld1q_f32(in_low + dilation_x), vld1q_f32(in_low + 2 * dilation_x)}}; float32x4_t out = vmulq_f32(vtop.val[0], m0.val[0]); out = vmlaq_f32(out, vtop.val[1], m0.val[1]); out = vmlaq_f32(out, vtop.val[2], m0.val[2]); @@ -303,26 +278,28 @@ inline float32x4_t single_convolve_3x3_dilation(const float *in_top, const float * @param[in] input_offset (Optional) Input quantization offset. 
* */ -inline float32x4x2_t convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low, - const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - const size_t dilation_x, unsigned int stridex, int input_offset = 0) +inline float32x4x2_t convolve_3x3_dilation(const float *in_top, + const float *in_mid, + const float *in_low, + const float32x4x3_t &m0, + const float32x4x3_t &m1, + const float32x4x3_t &m2, + const size_t dilation_x, + unsigned int stridex, + int input_offset = 0) { ARM_COMPUTE_ERROR_ON(stridex > 3); - float32x4x2_t out = - { - { - single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), - single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset) - } - }; + float32x4x2_t out = { + {single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), + single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)}}; - if(stridex == 2) + if (stridex == 2) { out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3); } - else if(stridex == 3) + else if (stridex == 3) { out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); } @@ -344,26 +321,32 @@ inline float32x4x2_t convolve_3x3_dilation(const float *in_top, const float *in_ * */ template -void convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, float *out_ptr, - const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - unsigned int stridex, int input_offset = 0); +void convolve_3x3(const float *in_top, + const float *in_mid, + const float *in_low, + float *out_ptr, + const float32x4x3_t &m0, + const float32x4x3_t &m1, + const float32x4x3_t &m2, + unsigned int stridex, + int input_offset = 0); template -inline void convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, float *out_ptr, - const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - unsigned int stridex, int input_offset) +inline void convolve_3x3(const float *in_top, + const float *in_mid, + const float *in_low, + float *out_ptr, + const float32x4x3_t &m0, + const float32x4x3_t &m1, + const float32x4x3_t &m2, + unsigned int stridex, + int input_offset) { ARM_COMPUTE_UNUSED(input_offset); ARM_COMPUTE_ERROR_ON(stridex > 3); - float32x4x2_t out = - { - { - vdupq_n_f32(0.f), - vdupq_n_f32(0.f) - } - }; - if(stridex == 2) + float32x4x2_t out = {{vdupq_n_f32(0.f), vdupq_n_f32(0.f)}}; + if (stridex == 2) { const float32x4x2_t vtop = vld2q_f32(in_top); const float32x4x2_t vmid = vld2q_f32(in_mid); @@ -389,32 +372,11 @@ inline void convolve_3x3(const float *in_top, const float *in_mid, const float * } else { - const float32x4x3_t vtop = - { - { - vld1q_f32(in_top), - vld1q_f32(in_top + 4), - vld1q_f32(in_top + 8) - } - }; - const float32x4x3_t vmid = - { - { - vld1q_f32(in_mid), - vld1q_f32(in_mid + 4), - vld1q_f32(in_mid + 8) - } - }; - const float32x4x3_t vlow = - { - { - vld1q_f32(in_low), - vld1q_f32(in_low + 4), - vld1q_f32(in_low + 8) - } - }; - out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]); - out.val[1] = vmulq_f32(vtop.val[1], m0.val[0]); + const float32x4x3_t vtop = {{vld1q_f32(in_top), vld1q_f32(in_top + 4), vld1q_f32(in_top + 8)}}; + const float32x4x3_t vmid = {{vld1q_f32(in_mid), vld1q_f32(in_mid + 4), 
vld1q_f32(in_mid + 8)}}; + const float32x4x3_t vlow = {{vld1q_f32(in_low), vld1q_f32(in_low + 4), vld1q_f32(in_low + 8)}}; + out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]); + out.val[1] = vmulq_f32(vtop.val[1], m0.val[0]); out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]); out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]); @@ -438,7 +400,7 @@ inline void convolve_3x3(const float *in_top, const float *in_mid, const float * out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]); out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]); - if(stridex == 3) + if (stridex == 3) { out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out); @@ -462,65 +424,43 @@ inline void convolve_3x3(const float *in_top, const float *in_mid, const float * * @param[in] input_offset Input quantization offset. * */ -template < typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same::value || std::is_same::value) > -inline int32x4_t single_convolve_3x3_dilation(const T *in_top, const T *in_mid, const T *in_low, - const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, - size_t dilation_x, int32_t input_offset) +template ::value || std::is_same::value)> +inline int32x4_t single_convolve_3x3_dilation(const T *in_top, + const T *in_mid, + const T *in_low, + const int32x4x3_t &m0, + const int32x4x3_t &m1, + const int32x4x3_t &m2, + size_t dilation_x, + int32_t input_offset) { using VectorType = typename std::conditional::value, uint8x8x3_t, int8x8x3_t>::type; using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t; const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{}); - const VectorType vtop = - { - { - wrapper::vload(in_top), - wrapper::vload(in_top + dilation_x), - wrapper::vload(in_top + 2 * dilation_x) - } - }; - const VectorType vmid = - { - { - wrapper::vload(in_mid), - wrapper::vload(in_mid + dilation_x), - wrapper::vload(in_mid + 2 * dilation_x) - } - }; - const VectorType vlow = - { - { - wrapper::vload(in_low), - wrapper::vload(in_low + dilation_x), - wrapper::vload(in_low + 2 * dilation_x) - } - }; - - const int32x4x3_t vtop_s32 = - { - { - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[2])))), - } - }; - const int32x4x3_t vmid_s32 = - { - { - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[2])))), - } - }; - const int32x4x3_t vlow_s32 = - { - { - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[2])))), - } - }; + const VectorType vtop = { + {wrapper::vload(in_top), wrapper::vload(in_top + dilation_x), wrapper::vload(in_top + 2 * dilation_x)}}; + const VectorType vmid = { + 
{wrapper::vload(in_mid), wrapper::vload(in_mid + dilation_x), wrapper::vload(in_mid + 2 * dilation_x)}}; + const VectorType vlow = { + {wrapper::vload(in_low), wrapper::vload(in_low + dilation_x), wrapper::vload(in_low + 2 * dilation_x)}}; + + const int32x4x3_t vtop_s32 = {{ + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[2])))), + }}; + const int32x4x3_t vmid_s32 = {{ + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[2])))), + }}; + const int32x4x3_t vlow_s32 = {{ + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[2])))), + }}; int32x4_t out = wrapper::vmul(vtop_s32.val[0], m0.val[0]); out = wrapper::vmla(out, vtop_s32.val[1], m0.val[1]); @@ -550,26 +490,29 @@ inline int32x4_t single_convolve_3x3_dilation(const T *in_top, const T *in_mid, * @param[in] input_offset Input quantization offset. * */ -template < typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same::value || std::is_same::value) > -inline int32x4x2_t convolve_3x3_dilation(const T *in_top, const T *in_mid, const T *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, - const size_t dilation_x, unsigned int stridex, int input_offset) +template ::value || std::is_same::value)> +inline int32x4x2_t convolve_3x3_dilation(const T *in_top, + const T *in_mid, + const T *in_low, + const int32x4x3_t &m0, + const int32x4x3_t &m1, + const int32x4x3_t &m2, + const size_t dilation_x, + unsigned int stridex, + int input_offset) { ARM_COMPUTE_ERROR_ON(stridex > 3); - int32x4x2_t out = - { - { - single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), - single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset) - } - }; + int32x4x2_t out = { + {single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), + single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)}}; - if(stridex == 2) + if (stridex == 2) { out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1); out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2); out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3); } - else if(stridex == 3) + else if (stridex == 3) { out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1); } @@ -589,10 +532,19 @@ inline int32x4x2_t convolve_3x3_dilation(const T *in_top, const T *in_mid, const * @param[in] input_offset Input quantization offset. 
* */ -template < bool accumulate, typename T1, typename T2, ARM_COMPUTE_REQUIRES_TA(std::is_same::value || std::is_same::value) > -void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ptr, - const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, - unsigned int stridex, int32_t input_offset) +template ::value || std::is_same::value)> +void convolve_3x3(const T1 *in_top, + const T1 *in_mid, + const T1 *in_low, + T2 *out_ptr, + const int32x4x3_t &m0, + const int32x4x3_t &m1, + const int32x4x3_t &m2, + unsigned int stridex, + int32_t input_offset) { ARM_COMPUTE_ERROR_ON(stridex > 3); using VectorType = typename std::conditional::value, uint8x8x2_t, int8x8x2_t>::type; @@ -600,60 +552,30 @@ void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{}); - const VectorType vtop = - { - { - wrapper::vload(in_top), - wrapper::vload(in_top + 8) - } - }; - const VectorType vmid = - { - { - wrapper::vload(in_mid), - wrapper::vload(in_mid + 8) - } - }; - const VectorType vlow = - { - { - wrapper::vload(in_low), - wrapper::vload(in_low + 8) - } - }; - - const int32x4x3_t vtop_s32 = - { - { - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vtop.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))), - } - }; - const int32x4x3_t vmid_s32 = - { - { - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vmid.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))), - } - }; - const int32x4x3_t vlow_s32 = - { - { - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vlow.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))), - } - }; - - int32x4x2_t out - { - { - wrapper::vdup_n(static_cast(0), OutputTagType{}), - wrapper::vdup_n(static_cast(0), OutputTagType{}), - } - }; + const VectorType vtop = {{wrapper::vload(in_top), wrapper::vload(in_top + 8)}}; + const VectorType vmid = {{wrapper::vload(in_mid), wrapper::vload(in_mid + 8)}}; + const VectorType vlow = {{wrapper::vload(in_low), wrapper::vload(in_low + 8)}}; + + const int32x4x3_t vtop_s32 = {{ + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vtop.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))), + }}; + const int32x4x3_t vmid_s32 = {{ + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vmid.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))), + }}; + const int32x4x3_t vlow_s32 = {{ + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))), + wrapper::vaddw(v_input_offset, 
wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vlow.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))), + }}; + + int32x4x2_t out{{ + wrapper::vdup_n(static_cast(0), OutputTagType{}), + wrapper::vdup_n(static_cast(0), OutputTagType{}), + }}; // 0 out.val[0] = wrapper::vmla(out.val[0], vtop_s32.val[0], m0.val[0]); @@ -681,11 +603,11 @@ void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vlow_s32.val[1], vlow_s32.val[2]), m2.val[1]); out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vlow_s32.val[1], vlow_s32.val[2]), m2.val[2]); - if(stridex == 1) + if (stridex == 1) { accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out); } - else if(stridex == 2) + else if (stridex == 2) { out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1); out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2); @@ -693,7 +615,7 @@ void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ accumulate ? accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out); } - else if(stridex == 3) + else if (stridex == 3) { out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1); accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out); @@ -712,14 +634,7 @@ inline float16x8x3_t load_matrix_row(const float16_t *ptr, int weights_offset = ARM_COMPUTE_UNUSED(weights_offset); /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes: r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */ - const float16x8x3_t r = - { - { - vld1q_dup_f16(ptr), - vld1q_dup_f16(1 + ptr), - vld1q_dup_f16(2 + ptr) - } - }; + const float16x8x3_t r = {{vld1q_dup_f16(ptr), vld1q_dup_f16(1 + ptr), vld1q_dup_f16(2 + ptr)}}; return r; } @@ -735,35 +650,22 @@ inline float16x8x3_t load_matrix_row(const float16_t *ptr, int weights_offset = * @param[in] input_offset (Optional)Input quantization offset. 
* */ -inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, - const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, - const size_t dilation_x, int input_offset = 0) +inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top, + const float16_t *in_mid, + const float16_t *in_low, + const float16x8x3_t &m0, + const float16x8x3_t &m1, + const float16x8x3_t &m2, + const size_t dilation_x, + int input_offset = 0) { ARM_COMPUTE_UNUSED(input_offset); - const float16x8x3_t vtop = - { - { - vld1q_f16(in_top), - vld1q_f16(in_top + dilation_x), - vld1q_f16(in_top + 2 * dilation_x) - } - }; - const float16x8x3_t vmid = - { - { - vld1q_f16(in_mid), - vld1q_f16(in_mid + dilation_x), - vld1q_f16(in_mid + 2 * dilation_x) - } - }; - const float16x8x3_t vlow = - { - { - vld1q_f16(in_low), - vld1q_f16(in_low + dilation_x), - vld1q_f16(in_low + 2 * dilation_x) - } - }; + const float16x8x3_t vtop = { + {vld1q_f16(in_top), vld1q_f16(in_top + dilation_x), vld1q_f16(in_top + 2 * dilation_x)}}; + const float16x8x3_t vmid = { + {vld1q_f16(in_mid), vld1q_f16(in_mid + dilation_x), vld1q_f16(in_mid + 2 * dilation_x)}}; + const float16x8x3_t vlow = { + {vld1q_f16(in_low), vld1q_f16(in_low + dilation_x), vld1q_f16(in_low + 2 * dilation_x)}}; float16x8_t out = vmulq_f16(vtop.val[0], m0.val[0]); out = vaddq_f16(out, vmulq_f16(vtop.val[1], m0.val[1])); out = vaddq_f16(out, vmulq_f16(vtop.val[2], m0.val[2])); @@ -792,19 +694,21 @@ inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top, const f * @param[in] input_offset (Optional) Input quantization offset. * */ -inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, - const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, - const size_t dilation_x, unsigned int stridex, int input_offset = 0) -{ - float16x8x2_t out = - { - { - single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), - single_convolve_3x3_dilation(in_top + 8, in_mid + 8, in_low + 8, m0, m1, m2, dilation_x, input_offset) - } - }; - - if(stridex == 2) +inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, + const float16_t *in_mid, + const float16_t *in_low, + const float16x8x3_t &m0, + const float16x8x3_t &m1, + const float16x8x3_t &m2, + const size_t dilation_x, + unsigned int stridex, + int input_offset = 0) +{ + float16x8x2_t out = { + {single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), + single_convolve_3x3_dilation(in_top + 8, in_mid + 8, in_low + 8, m0, m1, m2, dilation_x, input_offset)}}; + + if (stridex == 2) { out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1); out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 4), out.val[0], 2); @@ -814,7 +718,7 @@ inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float1 out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 4), out.val[0], 6); out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 6), out.val[0], 7); } - else if(stridex == 3) + else if (stridex == 3) { out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1); out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2); @@ -838,20 +742,20 @@ inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float1 * */ template -inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t 
*in_low, float16_t *out_ptr, - const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, - unsigned int stridex, int input_offset = 0) +inline void convolve_3x3(const float16_t *in_top, + const float16_t *in_mid, + const float16_t *in_low, + float16_t *out_ptr, + const float16x8x3_t &m0, + const float16x8x3_t &m1, + const float16x8x3_t &m2, + unsigned int stridex, + int input_offset = 0) { ARM_COMPUTE_UNUSED(input_offset); - float16x8x2_t out = - { - { - vdupq_n_f16(0), - vdupq_n_f16(0) - } - }; - if(stridex == 2) + float16x8x2_t out = {{vdupq_n_f16(0), vdupq_n_f16(0)}}; + if (stridex == 2) { const float16x8x2_t vtop = vld2q_f16(in_top); const float16x8x2_t vmid = vld2q_f16(in_mid); @@ -877,32 +781,11 @@ inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const } else { - const float16x8x3_t vtop = - { - { - vld1q_f16(in_top), - vld1q_f16(in_top + 8), - vld1q_f16(in_top + 16) - } - }; - const float16x8x3_t vmid = - { - { - vld1q_f16(in_mid), - vld1q_f16(in_mid + 8), - vld1q_f16(in_mid + 16) - } - }; - const float16x8x3_t vlow = - { - { - vld1q_f16(in_low), - vld1q_f16(in_low + 8), - vld1q_f16(in_low + 16) - } - }; - out.val[0] = vmulq_f16(vtop.val[0], m0.val[0]); - out.val[1] = vmulq_f16(vtop.val[1], m0.val[0]); + const float16x8x3_t vtop = {{vld1q_f16(in_top), vld1q_f16(in_top + 8), vld1q_f16(in_top + 16)}}; + const float16x8x3_t vmid = {{vld1q_f16(in_mid), vld1q_f16(in_mid + 8), vld1q_f16(in_mid + 16)}}; + const float16x8x3_t vlow = {{vld1q_f16(in_low), vld1q_f16(in_low + 8), vld1q_f16(in_low + 16)}}; + out.val[0] = vmulq_f16(vtop.val[0], m0.val[0]); + out.val[1] = vmulq_f16(vtop.val[1], m0.val[0]); out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 1), m0.val[1])); out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 2), m0.val[2])); @@ -921,7 +804,7 @@ inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 1), m2.val[1])); out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 2), m2.val[2])); - if(stridex == 3) + if (stridex == 3) { out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1); out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2); @@ -946,7 +829,7 @@ inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const */ inline int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration, unsigned int stridex) { - switch(stridex) + switch (stridex) { case 1: return num_elems_written_per_iteration; @@ -959,6 +842,6 @@ inline int get_input_num_elems_processed(unsigned int num_elems_written_per_iter return 0; } } -} +} // namespace detail } // namespace arm_compute #endif /* ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H */ diff --git a/src/core/NEON/wrapper/intrinsics/cvt.h b/src/core/NEON/wrapper/intrinsics/cvt.h index 1c77a9e9f08789b2e39628fa30cfe690a8a856ac..381de2284af965aac8378748b4876440728da170 100644 --- a/src/core/NEON/wrapper/intrinsics/cvt.h +++ b/src/core/NEON/wrapper/intrinsics/cvt.h @@ -30,12 +30,11 @@ namespace arm_compute { namespace wrapper { -#define VCVT_TO_F32_IMPL(ptype, vtype, prefix, postfix1, postfix2) \ - template \ - inline typename std::enable_if::value, float32x4_t>::type \ - vcvt(const vtype &a) \ - { \ - return prefix##_##postfix1##_##postfix2(a); \ +#define VCVT_TO_F32_IMPL(ptype, vtype, prefix, postfix1, postfix2) \ + template \ + inline typename 
std::enable_if::value, float32x4_t>::type vcvt(const vtype &a) \ + { \ + return prefix##_##postfix1##_##postfix2(a); \ } VCVT_TO_F32_IMPL(float32x4_t, uint32x4_t, vcvtq, f32, u32) @@ -46,12 +45,11 @@ VCVT_TO_F32_IMPL(float32x4_t, float16x4_t, vcvt, f32, f16) #undef VCVT_TO_F32_IMPL #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#define VCVT_TO_F16_IMPL(ptype, vtype, prefix, postfix1, postfix2) \ - template \ - inline typename std::enable_if::value, float16x4_t>::type \ - vcvt(const vtype &a) \ - { \ - return prefix##_##postfix1##_##postfix2(a); \ +#define VCVT_TO_F16_IMPL(ptype, vtype, prefix, postfix1, postfix2) \ + template \ + inline typename std::enable_if::value, float16x4_t>::type vcvt(const vtype &a) \ + { \ + return prefix##_##postfix1##_##postfix2(a); \ } VCVT_TO_F16_IMPL(float16x4_t, float32x4_t, vcvt, f16, f32) @@ -59,14 +57,14 @@ VCVT_TO_F16_IMPL(float16x4_t, float32x4_t, vcvt, f16, f32) #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC template -inline typename std::enable_if < std::is_same::value || std::is_same::value, uint32x4_t >::type +inline typename std::enable_if::value || std::is_same::value, uint32x4_t>::type vcvt(const float32x4_t &a) { return vcvtq_u32_f32(a); } template -inline typename std::enable_if < std::is_same::value || std::is_same::value, int32x4_t >::type +inline typename std::enable_if::value || std::is_same::value, int32x4_t>::type vcvt(const float32x4_t &a) { return vcvtq_s32_f32(a); @@ -74,15 +72,13 @@ vcvt(const float32x4_t &a) #ifdef __aarch64__ template -inline typename std::enable_if::value, uint32x4_t>::type -vcvta(const float32x4_t &a) +inline typename std::enable_if::value, uint32x4_t>::type vcvta(const float32x4_t &a) { return vcvtaq_u32_f32(a); } template -inline typename std::enable_if::value, int32x4_t>::type -vcvta(const float32x4_t &a) +inline typename std::enable_if::value, int32x4_t>::type vcvta(const float32x4_t &a) { return vcvtaq_s32_f32(a); } @@ -96,14 +92,13 @@ vcvta(const float32x4_t &a) */ inline void vcvt_bf16_f32(const float *inptr, uint16_t *outptr) { - __asm __volatile( - "ldp q0, q1, [%[inptr]]\n" - ".inst 0xea16800\n" // BFCVTN v0, v0 - ".inst 0x4ea16820\n" // BFCVTN2 v0, v1 - "str q0, [%[outptr]]\n" - : [inptr] "+r"(inptr) - : [outptr] "r"(outptr) - : "v0", "v1", "memory"); + __asm __volatile("ldp q0, q1, [%[inptr]]\n" + ".inst 0xea16800\n" // BFCVTN v0, v0 + ".inst 0x4ea16820\n" // BFCVTN2 v0, v1 + "str q0, [%[outptr]]\n" + : [inptr] "+r"(inptr) + : [outptr] "r"(outptr) + : "v0", "v1", "memory"); } #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ diff --git a/src/core/NEON/wrapper/intrinsics/div.h b/src/core/NEON/wrapper/intrinsics/div.h index 265f30d33b70b2a6e6daec772213003855867514..ece991a5b0575a300e5db761740c253845b25da8 100644 --- a/src/core/NEON/wrapper/intrinsics/div.h +++ b/src/core/NEON/wrapper/intrinsics/div.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_DIV_H #include "src/core/NEON/NEMath.h" + #include namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/erf.h b/src/core/NEON/wrapper/intrinsics/erf.h index e2207648e530cfa441e26f747f00baf5ec3c2347..0e34462b9623deb9e1c1c3518173f8c896974448 100644 --- a/src/core/NEON/wrapper/intrinsics/erf.h +++ b/src/core/NEON/wrapper/intrinsics/erf.h @@ -26,6 +26,7 @@ #define ARM_COMPUTE_WRAPPER_ERF_H #include "src/core/NEON/NEMath.h" + #include namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/exp.h b/src/core/NEON/wrapper/intrinsics/exp.h index c2a697096793bd69a15d88b20f45c6e704e5dfa9..f44577b926c074e8a58e9b82d117949deee8c1b8 100644 --- 
a/src/core/NEON/wrapper/intrinsics/exp.h +++ b/src/core/NEON/wrapper/intrinsics/exp.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_EXP_H #include "src/core/NEON/NEMath.h" + #include namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/getlane.h b/src/core/NEON/wrapper/intrinsics/getlane.h index 20527516122aab9073ddd3252c78ae4086bcd5b0..ae813bb2faae5fc0bb75235873cd0b1cbc7efce8 100644 --- a/src/core/NEON/wrapper/intrinsics/getlane.h +++ b/src/core/NEON/wrapper/intrinsics/getlane.h @@ -33,7 +33,7 @@ namespace wrapper #define VGETLANE_IMPL_8(stype, vtype, postfix) \ inline stype vgetlane(const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vget_lane_##postfix(vector, 0); \ @@ -59,7 +59,7 @@ namespace wrapper #define VGETLANE_IMPL_4(stype, vtype, postfix) \ inline stype vgetlane(const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vget_lane_##postfix(vector, 0); \ @@ -77,7 +77,7 @@ namespace wrapper #define VGETLANE_IMPL_2(stype, vtype, postfix) \ inline stype vgetlane(const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vget_lane_##postfix(vector, 0); \ @@ -102,7 +102,7 @@ VGETLANE_IMPL_4(float16_t, float16x4_t, f16) #define VGETQLANE_IMPL_16(stype, vtype, postfix) \ inline stype vgetlane(const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vgetq_lane_##postfix(vector, 0); \ @@ -144,7 +144,7 @@ VGETLANE_IMPL_4(float16_t, float16x4_t, f16) #define VGETQLANE_IMPL_8(stype, vtype, postfix) \ inline stype vgetlane(const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vgetq_lane_##postfix(vector, 0); \ @@ -170,7 +170,7 @@ VGETLANE_IMPL_4(float16_t, float16x4_t, f16) #define VGETQLANE_IMPL_4(stype, vtype, postfix) \ inline stype vgetlane(const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vgetq_lane_##postfix(vector, 0); \ @@ -188,7 +188,7 @@ VGETLANE_IMPL_4(float16_t, float16x4_t, f16) #define VGETQLANE_IMPL_2(stype, vtype, postfix) \ inline stype vgetlane(const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vgetq_lane_##postfix(vector, 0); \ diff --git a/src/core/NEON/wrapper/intrinsics/inv.h b/src/core/NEON/wrapper/intrinsics/inv.h index de398b04039074174fec26e847f748ce12704674..e443be679b77b28e0330e8fd6d19c3fee0c77104 100644 --- a/src/core/NEON/wrapper/intrinsics/inv.h +++ b/src/core/NEON/wrapper/intrinsics/inv.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_INV_H #include "src/core/NEON/NEMath.h" + #include namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/invsqrt.h b/src/core/NEON/wrapper/intrinsics/invsqrt.h index 2343efa8f84c3a53b1d867f72f000200dbf20127..257b445cc742a8334feb832cf9a389c104911b80 100644 --- a/src/core/NEON/wrapper/intrinsics/invsqrt.h +++ b/src/core/NEON/wrapper/intrinsics/invsqrt.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_INVSQRT_H #include "src/core/NEON/NEMath.h" + #include namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/log.h b/src/core/NEON/wrapper/intrinsics/log.h index 357a77ca782290e63c595a77916b91e36ddaed53..d091407edb2247309ca03c7068dbdc2a42cdeae3 100644 --- a/src/core/NEON/wrapper/intrinsics/log.h +++ b/src/core/NEON/wrapper/intrinsics/log.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_LOG_H #include "src/core/NEON/NEMath.h" + #include 
namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/pow.h b/src/core/NEON/wrapper/intrinsics/pow.h index 61f834ed23a59cecf19536fcd8187b5fb5e044be..dfd6ccc3589bfd896b91ec1a06cbf7bc870704d0 100644 --- a/src/core/NEON/wrapper/intrinsics/pow.h +++ b/src/core/NEON/wrapper/intrinsics/pow.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_POW_H #include "src/core/NEON/NEMath.h" + #include namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/qmov.h b/src/core/NEON/wrapper/intrinsics/qmov.h index 167f3cf43b92e302f9befd08b5a5c9fc70499c73..9a0a23a2415dfe010d33aa529c79563fe83cea55 100644 --- a/src/core/NEON/wrapper/intrinsics/qmov.h +++ b/src/core/NEON/wrapper/intrinsics/qmov.h @@ -31,15 +31,13 @@ namespace arm_compute namespace wrapper { template -inline typename std::enable_if::value, uint8x8_t>::type -vqmov(const int16x8_t &a) +inline typename std::enable_if::value, uint8x8_t>::type vqmov(const int16x8_t &a) { return vqmovun_s16(a); } template -inline typename std::enable_if::value, int8x8_t>::type -vqmov(const int16x8_t &a) +inline typename std::enable_if::value, int8x8_t>::type vqmov(const int16x8_t &a) { return vqmovn_s16(a); } diff --git a/src/core/NEON/wrapper/intrinsics/reinterpret.h b/src/core/NEON/wrapper/intrinsics/reinterpret.h index cf00a4aceb1d0b0d729eeb604689e9006cd12f66..c2c4f720d20e78facd58f8c710d8035d81dce72c 100644 --- a/src/core/NEON/wrapper/intrinsics/reinterpret.h +++ b/src/core/NEON/wrapper/intrinsics/reinterpret.h @@ -35,7 +35,7 @@ namespace wrapper { \ return prefix##_##postfix1##_##postfix2(a); \ } \ - \ + \ inline ptype vreinterpret(const ptype &a) \ { \ return a; \ diff --git a/src/core/NEON/wrapper/intrinsics/round.h b/src/core/NEON/wrapper/intrinsics/round.h index d23feb6b4227bc1756f86539e9a1052c7bf74edf..7789aab77010e65b658d8bdd0adb0caa694d603f 100644 --- a/src/core/NEON/wrapper/intrinsics/round.h +++ b/src/core/NEON/wrapper/intrinsics/round.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_ROUND_H #include "src/core/NEON/NEMath.h" + #include namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/setlane.h b/src/core/NEON/wrapper/intrinsics/setlane.h index 197eedacb5613c876126401f439f6c71f04a97ec..259b8eaf90e96d46c34bd04711323566c5577e02 100644 --- a/src/core/NEON/wrapper/intrinsics/setlane.h +++ b/src/core/NEON/wrapper/intrinsics/setlane.h @@ -33,7 +33,7 @@ namespace wrapper #define VSETLANE_IMPL_8(stype, atype, vtype, postfix) \ inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vset_lane_##postfix(value, vector, 0); \ @@ -59,7 +59,7 @@ namespace wrapper #define VSETLANE_IMPL_4(stype, atype, vtype, postfix) \ inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vset_lane_##postfix(value, vector, 0); \ @@ -77,7 +77,7 @@ namespace wrapper #define VSETLANE_IMPL_2(stype, atype, vtype, postfix) \ inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vset_lane_##postfix(value, vector, 0); \ @@ -102,7 +102,7 @@ VSETLANE_IMPL_4(float16x4_t, float16_t, float16x4_t, f16) #define VSETQLANE_IMPL_16(stype, atype, vtype, postfix) \ inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vsetq_lane_##postfix(value, vector, 0); \ @@ -144,7 +144,7 @@ 
VSETLANE_IMPL_4(float16x4_t, float16_t, float16x4_t, f16) #define VSETQLANE_IMPL_8(stype, atype, vtype, postfix) \ inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vsetq_lane_##postfix(value, vector, 0); \ @@ -170,7 +170,7 @@ VSETLANE_IMPL_4(float16x4_t, float16_t, float16x4_t, f16) #define VSETQLANE_IMPL_4(stype, atype, vtype, postfix) \ inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vsetq_lane_##postfix(value, vector, 0); \ diff --git a/src/core/NEON/wrapper/intrinsics/shr.h b/src/core/NEON/wrapper/intrinsics/shr.h index 73ca9c56c666333839f464b246ec80500a33ff63..6ccb9cdf926a7d8168779574de449d0aeb508b89 100644 --- a/src/core/NEON/wrapper/intrinsics/shr.h +++ b/src/core/NEON/wrapper/intrinsics/shr.h @@ -75,7 +75,7 @@ VQRSHRN_SCALAR_IMPL(uint32_t, uint64_t, vqrshrnd_n, u64) { \ return prefix_signed##_##postfix(a, b); \ } \ - \ + \ template \ inline typename std::enable_if::value && !std::is_signed::value, u##half_vtype>::type \ vqrshrn_ex(const vtype &a) \ @@ -128,7 +128,7 @@ VSHRQ_SCALAR_IMPL(int32_t, vshrd_n, s64) { \ return prefix_signed##_##postfix(a, b); \ } \ - \ + \ template \ inline typename std::enable_if::value && !std::is_signed::value, u##half_vtype>::type \ vqrshrn_ex(const vtype &a) \ diff --git a/src/core/NEON/wrapper/intrinsics/sin.h b/src/core/NEON/wrapper/intrinsics/sin.h index 03c2813a328ab3ee03087db1ce6a2a955b928205..d24fdfa816e1baa76c69fb9d461c97777840a609 100644 --- a/src/core/NEON/wrapper/intrinsics/sin.h +++ b/src/core/NEON/wrapper/intrinsics/sin.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_SIN_H #include "src/core/NEON/NEMath.h" + #include namespace arm_compute @@ -54,4 +55,4 @@ VSIN_IMPL_INT(int32x4_t, vsinq, s32) #undef vsub_IMPL } // namespace wrapper } // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_SUB_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_WRAPPER_SUB_H */ diff --git a/src/core/NEON/wrapper/intrinsics/svcnt.h b/src/core/NEON/wrapper/intrinsics/svcnt.h index e530e7c83f73ee810cf5d2f6bb045feb03910e20..c4652504b481e9045e1282b3a47b1c2ac7fe3fed 100644 --- a/src/core/NEON/wrapper/intrinsics/svcnt.h +++ b/src/core/NEON/wrapper/intrinsics/svcnt.h @@ -30,7 +30,7 @@ namespace arm_compute namespace wrapper { template -inline uint64_t svcnt_size(); +inline uint64_t svcnt_size(); template <> inline uint64_t svcnt_size<64>() @@ -65,4 +65,4 @@ inline uint64_t svcnt() } // namespace arm_compute #endif /* defined(__ARM_FEATURE_SVE) */ -#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCNT_H */ \ No newline at end of file +#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCNT_H */ diff --git a/src/core/NEON/wrapper/intrinsics/svcvt.h b/src/core/NEON/wrapper/intrinsics/svcvt.h index 746b004d7d251a8844ecc6196a7547c19379e061..00ef7b7eb38e3f5ab46e7ae09f860364a2173919 100644 --- a/src/core/NEON/wrapper/intrinsics/svcvt.h +++ b/src/core/NEON/wrapper/intrinsics/svcvt.h @@ -29,11 +29,12 @@ namespace arm_compute { namespace wrapper { -#define SVCVT_Z_TO_F32_IMPL(vtype) \ - template \ - inline typename std::enable_if::value, svfloat32_t>::type svcvt_z(svbool_t pg, const vtype &a) \ - { \ - return svcvt_f32_z(pg, a); \ +#define SVCVT_Z_TO_F32_IMPL(vtype) \ + template \ + inline typename std::enable_if::value, svfloat32_t>::type svcvt_z(svbool_t pg, \ + const vtype &a) \ + { \ + return svcvt_f32_z(pg, a); \ } SVCVT_Z_TO_F32_IMPL(svuint32_t) @@ -42,11 +43,12 @@ 
SVCVT_Z_TO_F32_IMPL(svfloat16_t) #undef SVCVT_Z_TO_F32_IMPL -#define SVCVT_Z_TO_F16_IMPL(vtype) \ - template \ - inline typename std::enable_if::value, svfloat16_t>::type svcvt_z(svbool_t pg, const vtype &a) \ - { \ - return svcvt_f16_z(pg, a); \ +#define SVCVT_Z_TO_F16_IMPL(vtype) \ + template \ + inline typename std::enable_if::value, svfloat16_t>::type svcvt_z(svbool_t pg, \ + const vtype &a) \ + { \ + return svcvt_f16_z(pg, a); \ } SVCVT_Z_TO_F16_IMPL(svuint32_t) @@ -55,11 +57,12 @@ SVCVT_Z_TO_F16_IMPL(svfloat32_t) #undef SVCVT_Z_TO_F16_IMPL -#define SVCVT_Z_TO_S32_IMPL(vtype) \ - template \ - inline typename std::enable_if::value, svint32_t>::type svcvt_z(svbool_t pg, const vtype &a) \ - { \ - return svcvt_s32_z(pg, a); \ +#define SVCVT_Z_TO_S32_IMPL(vtype) \ + template \ + inline typename std::enable_if::value, svint32_t>::type svcvt_z(svbool_t pg, \ + const vtype &a) \ + { \ + return svcvt_s32_z(pg, a); \ } SVCVT_Z_TO_S32_IMPL(svfloat16_t) @@ -71,4 +74,4 @@ SVCVT_Z_TO_S32_IMPL(svfloat32_t) } // namespace arm_compute #endif /* defined(__ARM_FEATURE_SVE) */ -#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCVT_H */ \ No newline at end of file +#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCVT_H */ diff --git a/src/core/NEON/wrapper/intrinsics/svexp.h b/src/core/NEON/wrapper/intrinsics/svexp.h index d6ce9a77d1529b106f1e84e68f891dc68c3a580a..1e8bce3960d8ef8c9c8bb2755f4aefc5f9af8a0a 100644 --- a/src/core/NEON/wrapper/intrinsics/svexp.h +++ b/src/core/NEON/wrapper/intrinsics/svexp.h @@ -26,6 +26,7 @@ #if defined(__ARM_FEATURE_SVE) #include "src/core/NEON/SVEMath.h" + #include namespace arm_compute @@ -46,4 +47,4 @@ SVEXP_IMPL(svfloat16_t, f16) } // namespace arm_compute #endif /* defined(__ARM_FEATURE_SVE) */ -#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVEXP_H */ \ No newline at end of file +#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVEXP_H */ diff --git a/src/core/NEON/wrapper/intrinsics/svlog.h b/src/core/NEON/wrapper/intrinsics/svlog.h index 5b505ae1e313a9c827cfa34c880ec6229da4d03f..b4630e20edbb5de10de41c467956b374d4a44397 100644 --- a/src/core/NEON/wrapper/intrinsics/svlog.h +++ b/src/core/NEON/wrapper/intrinsics/svlog.h @@ -25,6 +25,7 @@ #define SRC_CORE_NEON_WRAPPER_INTRINSICS_SVLOG_H #if defined(__ARM_FEATURE_SVE) #include "src/core/NEON/SVEMath.h" + #include namespace arm_compute @@ -44,4 +45,4 @@ SVLOG_IMPL(svfloat16_t, f16) } // namespace wrapper } // namespace arm_compute #endif /* defined(__ARM_FEATURE_SVE) */ -#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVLOG_H */ \ No newline at end of file +#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVLOG_H */ diff --git a/src/core/NEON/wrapper/intrinsics/svptrue.h b/src/core/NEON/wrapper/intrinsics/svptrue.h index 53407e5301cd05f4ccf00b6a1eb88387005a01d4..6ed00bccbf2e829c08b8cbb4bc21b0b04824bc07 100644 --- a/src/core/NEON/wrapper/intrinsics/svptrue.h +++ b/src/core/NEON/wrapper/intrinsics/svptrue.h @@ -30,7 +30,7 @@ namespace arm_compute namespace wrapper { template -inline svbool_t svptrue_size(); +inline svbool_t svptrue_size(); template <> inline svbool_t svptrue_size<64>() @@ -65,4 +65,4 @@ svbool_t svptrue() } // namespace arm_compute #endif /* defined(__ARM_FEATURE_SVE) */ -#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVPTRUE_H */ \ No newline at end of file +#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVPTRUE_H */ diff --git a/src/core/NEON/wrapper/intrinsics/svwhilelt.h b/src/core/NEON/wrapper/intrinsics/svwhilelt.h index ef58217dc4041fda2f8b8a19b3e479d8a0e474b9..f0f84a95088d6774af77188f8c3eba4238e8c4d7 100644 --- 
a/src/core/NEON/wrapper/intrinsics/svwhilelt.h +++ b/src/core/NEON/wrapper/intrinsics/svwhilelt.h @@ -32,7 +32,7 @@ namespace wrapper #define SVWHILELT_IMPL(type) \ template \ inline svbool_t svwhilelt_size(type a, type b); \ - \ + \ template <> \ inline svbool_t svwhilelt_size<64>(type a, type b) \ { \ @@ -70,4 +70,4 @@ inline svbool_t svwhilelt(IndexType a, IndexType b) } // namespace arm_compute #endif /* defined(__ARM_FEATURE_SVE) */ -#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVWHILELT_H */ \ No newline at end of file +#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVWHILELT_H */ diff --git a/src/core/NEON/wrapper/intrinsics/tanh.h b/src/core/NEON/wrapper/intrinsics/tanh.h index daeaf19997b393def3f7e881468669ab41c4f7fa..e74f0e86feae17940a5017d6499ebaf21b61bb59 100644 --- a/src/core/NEON/wrapper/intrinsics/tanh.h +++ b/src/core/NEON/wrapper/intrinsics/tanh.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_TANH_H #include "src/core/NEON/NEMath.h" + #include namespace arm_compute diff --git a/src/core/NEON/wrapper/scalar/add.h b/src/core/NEON/wrapper/scalar/add.h index 642d9261f3f482608134ac8ae94c4dea171edecb..2ec88869e38cb46bffa5085a8d70a80f463af1a5 100644 --- a/src/core/NEON/wrapper/scalar/add.h +++ b/src/core/NEON/wrapper/scalar/add.h @@ -32,22 +32,22 @@ namespace wrapper { inline uint8_t add_sat(const uint8_t &a, const uint8_t &b) { - const uint8x8_t va = { a, 0, 0, 0, 0, 0, 0, 0 }; - const uint8x8_t vb = { b, 0, 0, 0, 0, 0, 0, 0 }; + const uint8x8_t va = {a, 0, 0, 0, 0, 0, 0, 0}; + const uint8x8_t vb = {b, 0, 0, 0, 0, 0, 0, 0}; return vget_lane_u8(vqadd_u8(va, vb), 0); } inline int16_t add_sat(const int16_t &a, const int16_t &b) { - const int16x4_t va = { a, 0, 0, 0 }; - const int16x4_t vb = { b, 0, 0, 0 }; + const int16x4_t va = {a, 0, 0, 0}; + const int16x4_t vb = {b, 0, 0, 0}; return vget_lane_s16(vqadd_s16(va, vb), 0); } inline int32_t add_sat(const int32_t &a, const int32_t &b) { - const int32x2_t va = { a, 0 }; - const int32x2_t vb = { b, 0 }; + const int32x2_t va = {a, 0}; + const int32x2_t vb = {b, 0}; return vget_lane_s32(vqadd_s32(va, vb), 0); } diff --git a/src/core/NEON/wrapper/scalar/sub.h b/src/core/NEON/wrapper/scalar/sub.h index 1fe51d75fc8a2c1c8a122af3bf6ebfe75e53bbe5..00de7d867f6d44cd314cfb54a589b1b994d1163d 100644 --- a/src/core/NEON/wrapper/scalar/sub.h +++ b/src/core/NEON/wrapper/scalar/sub.h @@ -32,22 +32,22 @@ namespace wrapper { inline uint8_t sub_sat(const uint8_t &a, const uint8_t &b) { - const uint8x8_t va = { a, 0, 0, 0, 0, 0, 0, 0 }; - const uint8x8_t vb = { b, 0, 0, 0, 0, 0, 0, 0 }; + const uint8x8_t va = {a, 0, 0, 0, 0, 0, 0, 0}; + const uint8x8_t vb = {b, 0, 0, 0, 0, 0, 0, 0}; return vget_lane_u8(vqsub_u8(va, vb), 0); } inline int16_t sub_sat(const int16_t &a, const int16_t &b) { - const int16x4_t va = { a, 0, 0, 0 }; - const int16x4_t vb = { b, 0, 0, 0 }; + const int16x4_t va = {a, 0, 0, 0}; + const int16x4_t vb = {b, 0, 0, 0}; return vget_lane_s16(vqsub_s16(va, vb), 0); } inline int32_t sub_sat(const int32_t &a, const int32_t &b) { - const int32x2_t va = { a, 0 }; - const int32x2_t vb = { b, 0 }; + const int32x2_t va = {a, 0}; + const int32x2_t vb = {b, 0}; return vget_lane_s32(vqsub_s32(va, vb), 0); } diff --git a/src/core/NEON/wrapper/svtraits.h b/src/core/NEON/wrapper/svtraits.h index 5ccd0ba8f122f68f8baf6db86bb7ddce771833b6..330d27275217ad30a442e3523a5a21428ac94a00 100644 --- a/src/core/NEON/wrapper/svtraits.h +++ b/src/core/NEON/wrapper/svtraits.h @@ -25,6 +25,7 @@ #define SRC_CORE_NEON_WRAPPER_SVTRAITS_H #if defined(ARM_COMPUTE_ENABLE_SVE) #include 
"src/core/NEON/SVEMath.h" + #include namespace arm_compute diff --git a/src/core/NEON/wrapper/traits.h b/src/core/NEON/wrapper/traits.h index ebb64d9d76afa1abc3932d16c8f2de42db7029a6..1dac61af74382cf6f8576b64608fc20dc1b1ac36 100644 --- a/src/core/NEON/wrapper/traits.h +++ b/src/core/NEON/wrapper/traits.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,14 +21,23 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_WRAPPER_TRAITS_H -#define ARM_COMPUTE_WRAPPER_TRAITS_H +#ifndef ACL_SRC_CORE_NEON_WRAPPER_TRAITS_H +#define ACL_SRC_CORE_NEON_WRAPPER_TRAITS_H + +#include "arm_compute/core/CoreTypes.h" + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#include "src/cpu/CpuTypes.h" // required for float16_t +#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) #include -#if defined(ARM_COMPUTE_ENABLE_SVE) +#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FEATURE_SVE) #include -#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ +#endif /* defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FEATURE_SVE) */ + +#include +#include namespace arm_compute { @@ -116,13 +125,13 @@ template <> struct neon_bitvector{ using type = float #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#if defined(ARM_COMPUTE_ENABLE_SVE) +#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FEATURE_SVE) /** Create the appropriate SVE vector given its type */ template struct sve_vector; template <> struct sve_vector{ using scalar_type = uint8_t; using type = svuint8_t; }; template <> struct sve_vector{ using scalar_type = int8_t; using type = svint8_t; }; -#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ +#endif /* defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FEATURE_SVE) */ #endif /* DOXYGEN_SKIP_THIS */ @@ -151,4 +160,4 @@ using promote_t = typename promote::type; } // namespace traits } // namespace wrapper } // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_TRAITS_H */ +#endif // ACL_SRC_CORE_NEON_WRAPPER_TRAITS_H diff --git a/src/core/NEON/wrapper/wrapper.h b/src/core/NEON/wrapper/wrapper.h index e5467e98ff54b54964c5f6fb74af304f3e175229..f3f3c5d9e657a71921ddcc3269de558d3971ad20 100644 --- a/src/core/NEON/wrapper/wrapper.h +++ b/src/core/NEON/wrapper/wrapper.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,10 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_WRAPPER_H -#define ARM_COMPUTE_WRAPPER_H +#ifndef ACL_SRC_CORE_NEON_WRAPPER_WRAPPER_H +#define ACL_SRC_CORE_NEON_WRAPPER_WRAPPER_H + +#include "arm_compute/core/Error.h" // Traits #include "src/core/NEON/wrapper/traits.h" @@ -31,4 +33,4 @@ #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" #include "src/core/NEON/wrapper/scalar/scalar.h" -#endif /* ARM_COMPUTE_WRAPPER_H */ +#endif // ACL_SRC_CORE_NEON_WRAPPER_WRAPPER_H diff --git a/src/core/Rounding.cpp b/src/core/Rounding.cpp index 99858e2a98ca79ddb62e76e08565b0fcd6b34f5b..62ce33581558d6a49c1f038f873d940e2d7a7e4c 100644 --- a/src/core/Rounding.cpp +++ b/src/core/Rounding.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Rounding.h" #include "arm_compute/core/Error.h" + #include "support/ToolchainSupport.h" #include @@ -36,7 +37,7 @@ int arm_compute::round(float x, RoundingPolicy rounding_policy) { using namespace std; int rounded = 0; - switch(rounding_policy) + switch (rounding_policy) { case RoundingPolicy::TO_ZERO: { @@ -51,9 +52,7 @@ int arm_compute::round(float x, RoundingPolicy rounding_policy) case RoundingPolicy::TO_NEAREST_EVEN: { #ifdef __aarch64__ - asm("fcvtns %x[res], %s[value]" - : [res] "=r"(rounded) - : [value] "w"(x)); + asm("fcvtns %x[res], %s[value]" : [res] "=r"(rounded) : [value] "w"(x)); #else // __aarch64__ ARM_COMPUTE_ERROR("TO_NEAREST_EVEN rounding policy is not supported."); #endif // __aarch64__ diff --git a/src/core/Size2D.cpp b/src/core/Size2D.cpp index 6eb46e56afa568d1cd3c637dd7fa30f7ee0d9744..69b26515207280854e3733185ba316fd21dc1841 100644 --- a/src/core/Size2D.cpp +++ b/src/core/Size2D.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/core/Size2D.h" + #include "support/StringSupport.h" namespace arm_compute @@ -30,4 +31,4 @@ std::string Size2D::to_string() const { return support::cpp11::to_string(width) + std::string("x") + support::cpp11::to_string(height); } -} +} // namespace arm_compute diff --git a/src/core/Size3D.cpp b/src/core/Size3D.cpp index 3ee9fb8e5ce6c5778041c2c72e1faf703e13a600..b56a99acd7906ca15753664de362ec6df9f2194f 100644 --- a/src/core/Size3D.cpp +++ b/src/core/Size3D.cpp @@ -22,12 +22,14 @@ * SOFTWARE. 
*/ #include "arm_compute/core/Size3D.h" + #include "support/StringSupport.h" namespace arm_compute { std::string Size3D::to_string() const { - return support::cpp11::to_string(width) + std::string("x") + support::cpp11::to_string(height) + std::string("x") + support::cpp11::to_string(depth); + return support::cpp11::to_string(width) + std::string("x") + support::cpp11::to_string(height) + std::string("x") + + support::cpp11::to_string(depth); } -} \ No newline at end of file +} // namespace arm_compute diff --git a/src/core/SubTensorInfo.cpp b/src/core/SubTensorInfo.cpp index 723b6bc0165d3e80be2b181613d9625b82b97a3f..8012c3d7216969e416c2c2ec3d396eb67b60020d 100644 --- a/src/core/SubTensorInfo.cpp +++ b/src/core/SubTensorInfo.cpp @@ -42,10 +42,10 @@ namespace TensorShape extend_parent_shape(TensorShape parent_shape, TensorShape shape, Coordinates coords) { // Extend shape - for(unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i) + for (unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i) { int dimension_extend = coords[i] + static_cast(shape[i]); - if((dimension_extend > static_cast(parent_shape[i])) && (dimension_extend > 0)) + if ((dimension_extend > static_cast(parent_shape[i])) && (dimension_extend > 0)) { parent_shape.set(i, static_cast(dimension_extend)); } @@ -56,23 +56,35 @@ TensorShape extend_parent_shape(TensorShape parent_shape, TensorShape shape, Coo } // namespace SubTensorInfo::SubTensorInfo() - : _parent(nullptr), _tensor_shape(), _dims_state(), _coords(), _valid_region{ Coordinates(), _tensor_shape }, _extend_parent(false), _lock_paddings(false) + : _parent(nullptr), + _tensor_shape(), + _dims_state(), + _coords(), + _valid_region{Coordinates(), _tensor_shape}, + _extend_parent(false), + _lock_paddings(false) { } SubTensorInfo::SubTensorInfo(ITensorInfo *parent, TensorShape tensor_shape, Coordinates coords, bool extend_parent) - : _parent(parent), _tensor_shape(tensor_shape), _dims_state(), _coords(coords), _valid_region{ Coordinates(), _tensor_shape }, _extend_parent(extend_parent), _lock_paddings(false) + : _parent(parent), + _tensor_shape(tensor_shape), + _dims_state(), + _coords(coords), + _valid_region{Coordinates(), _tensor_shape}, + _extend_parent(extend_parent), + _lock_paddings(false) { ARM_COMPUTE_ERROR_ON(parent == nullptr); // Check if subtensor is valid if parent is configured - if(parent->tensor_shape().total_size() != 0 && !_extend_parent) + if (parent->tensor_shape().total_size() != 0 && !_extend_parent) { ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(parent->tensor_shape(), coords, tensor_shape); } // Initialize valid region - _valid_region = ValidRegion{ Coordinates(), _tensor_shape }; + _valid_region = ValidRegion{Coordinates(), _tensor_shape}; } std::unique_ptr SubTensorInfo::clone() const @@ -91,17 +103,17 @@ ITensorInfo &SubTensorInfo::set_tensor_shape(const TensorShape &shape) ARM_COMPUTE_ERROR_ON(_parent == nullptr); // Check if subtensor is valid if parent is configured - if(_parent->tensor_shape().total_size() != 0 && !_extend_parent) + if (_parent->tensor_shape().total_size() != 0 && !_extend_parent) { ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(_parent->tensor_shape(), _coords, shape); - _valid_region = ValidRegion{ _coords, shape }; + _valid_region = ValidRegion{_coords, shape}; } - else if(_extend_parent) // Extend parent shape, configure if specified + else if (_extend_parent) // Extend parent shape, configure if specified { ARM_COMPUTE_ERROR_ON((_parent->data_type() == DataType::UNKNOWN) && (_parent->format() == Format::UNKNOWN)); 
TensorShape parent_extended_shape = extend_parent_shape(_parent->tensor_shape(), shape, _coords); _parent->set_tensor_shape(parent_extended_shape); - _parent->set_valid_region(ValidRegion{ Coordinates(), parent_extended_shape }); + _parent->set_valid_region(ValidRegion{Coordinates(), parent_extended_shape}); } _tensor_shape = shape; return *this; @@ -133,11 +145,11 @@ bool SubTensorInfo::extend_padding(const PaddingSize &padding) ARM_COMPUTE_ERROR_ON(_parent->total_size() == 0); // Check that you do not extend padding on sub-tensors unless XY shape matches parent tensor - if(!_extend_parent && (padding.left || padding.right)) + if (!_extend_parent && (padding.left || padding.right)) { ARM_COMPUTE_ERROR_ON(_parent->tensor_shape().x() != tensor_shape().x()); } - if(!_extend_parent && (padding.top || padding.bottom)) + if (!_extend_parent && (padding.top || padding.bottom)) { ARM_COMPUTE_ERROR_ON(_parent->tensor_shape().y() != tensor_shape().y()); } @@ -153,7 +165,7 @@ int32_t SubTensorInfo::offset_element_in_bytes(const Coordinates &pos) const int32_t offset = offset_first_element_in_bytes(); const Strides &strides = strides_in_bytes(); - for(size_t i = 0; i < _tensor_shape.num_dimensions(); ++i) + for (size_t i = 0; i < _tensor_shape.num_dimensions(); ++i) { offset += pos[i] * strides[i]; } diff --git a/src/core/TensorInfo.cpp b/src/core/TensorInfo.cpp index 5905ba521564a3a6c00a08784caeddd60dac6dd0..31bddbde40d3215ad8041442137d50b51f238c68 100644 --- a/src/core/TensorInfo.cpp +++ b/src/core/TensorInfo.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" + #include "src/core/helpers/Utils.h" #include @@ -34,13 +35,26 @@ namespace arm_compute { TensorInfo::TensorInfo() - : _total_size(0), _offset_first_element_in_bytes(0), _strides_in_bytes(), _num_channels(0), _tensor_shape(), _dims_state(), _data_type(DataType::UNKNOWN), _format(Format::UNKNOWN), _is_resizable{ true }, - _valid_region{ Coordinates(), _tensor_shape }, _padding{ 0 }, _quantization_info(), _data_layout(DataLayout::NCHW), _are_values_constant(true), _id(invalid_tensor_id), _lock_paddings(false) -{ -} - -TensorInfo::TensorInfo(const ITensorInfo &info) - : TensorInfo() + : _total_size(0), + _offset_first_element_in_bytes(0), + _strides_in_bytes(), + _num_channels(0), + _tensor_shape(), + _dims_state(), + _data_type(DataType::UNKNOWN), + _format(Format::UNKNOWN), + _is_resizable{true}, + _valid_region{Coordinates(), _tensor_shape}, + _padding{0}, + _quantization_info(), + _data_layout(DataLayout::NCHW), + _are_values_constant(true), + _id(invalid_tensor_id), + _lock_paddings(false) +{ +} + +TensorInfo::TensorInfo(const ITensorInfo &info) : TensorInfo() { _total_size = info.total_size(); _offset_first_element_in_bytes = info.offset_first_element_in_bytes(); @@ -60,8 +74,7 @@ TensorInfo::TensorInfo(const ITensorInfo &info) _lock_paddings = info.lock_paddings(); } -TensorInfo::TensorInfo(const TensorInfo &info) - : TensorInfo() +TensorInfo::TensorInfo(const TensorInfo &info) : TensorInfo() { _total_size = info.total_size(); _offset_first_element_in_bytes = info.offset_first_element_in_bytes(); @@ -80,8 +93,7 @@ TensorInfo::TensorInfo(const TensorInfo &info) _id = info.id(); _lock_paddings = false; } -TensorInfo::TensorInfo(Format format) - : TensorInfo(TensorShape(), format) +TensorInfo::TensorInfo(Format format) : TensorInfo(TensorShape(), format) { } @@ -90,25 +102,25 @@ TensorInfo::TensorInfo(unsigned int width, unsigned int height, Format 
format) { } -TensorInfo::TensorInfo(const TensorShape &tensor_shape, Format format) - : TensorInfo() +TensorInfo::TensorInfo(const TensorShape &tensor_shape, Format format) : TensorInfo() { init(tensor_shape, format); } -TensorInfo::TensorInfo(size_t num_channels, DataType data_type) - : TensorInfo() +TensorInfo::TensorInfo(size_t num_channels, DataType data_type) : TensorInfo() { init(TensorShape(), num_channels, data_type); } -TensorInfo::TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type) - : TensorInfo() +TensorInfo::TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type) : TensorInfo() { init(tensor_shape, num_channels, data_type); } -TensorInfo::TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, QuantizationInfo quantization_info) +TensorInfo::TensorInfo(const TensorShape &tensor_shape, + size_t num_channels, + DataType data_type, + QuantizationInfo quantization_info) : TensorInfo() { init(tensor_shape, num_channels, data_type); @@ -137,9 +149,11 @@ void TensorInfo::init(const TensorShape &tensor_shape, Format format) _format = format; } -void TensorInfo::init(const TensorShape &tensor_shape, Format format, - const Strides &strides_in_bytes, size_t offset_first_element_in_bytes, - size_t total_size_in_bytes) +void TensorInfo::init(const TensorShape &tensor_shape, + Format format, + const Strides &strides_in_bytes, + size_t offset_first_element_in_bytes, + size_t total_size_in_bytes) { size_t num_channels = num_channels_from_format(format); const DataType type = data_type_from_format(format); @@ -165,9 +179,12 @@ void TensorInfo::init(const TensorShape &tensor_shape, size_t num_channels, Data set_tensor_shape(tensor_shape); } -void TensorInfo::init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, - const Strides &strides_in_bytes, size_t offset_first_element_in_bytes, - size_t total_size_in_bytes) +void TensorInfo::init(const TensorShape &tensor_shape, + size_t num_channels, + DataType data_type, + const Strides &strides_in_bytes, + size_t offset_first_element_in_bytes, + size_t total_size_in_bytes) { ARM_COMPUTE_ERROR_ON(num_channels == 0); @@ -179,7 +196,7 @@ void TensorInfo::init(const TensorShape &tensor_shape, size_t num_channels, Data _strides_in_bytes = strides_in_bytes; _total_size = total_size_in_bytes; - _valid_region = ValidRegion{ Coordinates(), _tensor_shape }; + _valid_region = ValidRegion{Coordinates(), _tensor_shape}; } size_t TensorInfo::init_auto_padding(const TensorShape &tensor_shape, Format format) @@ -202,7 +219,7 @@ size_t TensorInfo::init_auto_padding(const TensorShape &tensor_shape, size_t num _format = Format::UNKNOWN; _tensor_shape = tensor_shape; - _valid_region = ValidRegion{ Coordinates(), _tensor_shape }; + _valid_region = ValidRegion{Coordinates(), _tensor_shape}; auto_padding(); @@ -233,11 +250,11 @@ std::tuple TensorInfo::calculate_padding_requirements(c size_t required_total_size = 0; const size_t required_offset_first_element = padding.left * stride_x + padding.top * stride_y; - switch(_tensor_shape.num_dimensions()) + switch (_tensor_shape.num_dimensions()) { case 0: { - if(_tensor_shape.total_size() > 0) + if (_tensor_shape.total_size() > 0) { required_strides = Strides(stride_x, stride_x); required_total_size = stride_z; @@ -258,7 +275,8 @@ std::tuple TensorInfo::calculate_padding_requirements(c const unsigned int idx_last_dimension = _tensor_shape.num_dimensions() - 1; - required_total_size = 
static_cast(_tensor_shape[idx_last_dimension]) * required_strides[idx_last_dimension]; + required_total_size = + static_cast(_tensor_shape[idx_last_dimension]) * required_strides[idx_last_dimension]; break; } } @@ -284,25 +302,25 @@ bool TensorInfo::extend_padding(const PaddingSize &padding) bool updated = false; - if(padding.top > _padding.top) + if (padding.top > _padding.top) { _padding.top = padding.top; updated = true; } - if(padding.right > _padding.right) + if (padding.right > _padding.right) { _padding.right = padding.right; updated = true; } - if(padding.bottom > _padding.bottom) + if (padding.bottom > _padding.bottom) { _padding.bottom = padding.bottom; updated = true; } - if(padding.left > _padding.left) + if (padding.left > _padding.left) { _padding.left = padding.left; updated = true; @@ -336,7 +354,7 @@ ITensorInfo &TensorInfo::set_format(Format format) { _format = format; - if(_data_type == DataType::UNKNOWN) + if (_data_type == DataType::UNKNOWN) { _num_channels = num_channels_from_format(format); _data_type = data_type_from_format(format); @@ -355,19 +373,19 @@ ITensorInfo &TensorInfo::set_tensor_shape(const TensorShape &shape) _offset_first_element_in_bytes = 0; _strides_in_bytes = compute_strides(*this); - if(_tensor_shape.num_dimensions() == 0) + if (_tensor_shape.num_dimensions() == 0) { _total_size = _strides_in_bytes[0]; } else { const unsigned int idx_last_dimension = _tensor_shape.num_dimensions() - 1; - _total_size = static_cast(_tensor_shape[idx_last_dimension]) * _strides_in_bytes[idx_last_dimension]; + _total_size = static_cast(_tensor_shape[idx_last_dimension]) * _strides_in_bytes[idx_last_dimension]; } std::tie(_strides_in_bytes, _offset_first_element_in_bytes, _total_size) = calculate_padding_requirements(_padding); - _valid_region = ValidRegion{ Coordinates(), _tensor_shape }; + _valid_region = ValidRegion{Coordinates(), _tensor_shape}; return *this; } @@ -392,9 +410,10 @@ ITensorInfo &TensorInfo::set_data_layout(const DataLayout &data_layout) ITensorInfo &TensorInfo::reset_padding() { _padding = PaddingSize(); - if(((_format != Format::UNKNOWN) || (_data_type != DataType::UNKNOWN)) && _total_size != 0) + if (((_format != Format::UNKNOWN) || (_data_type != DataType::UNKNOWN)) && _total_size != 0) { - std::tie(_strides_in_bytes, _offset_first_element_in_bytes, _total_size) = calculate_padding_requirements(_padding); + std::tie(_strides_in_bytes, _offset_first_element_in_bytes, _total_size) = + calculate_padding_requirements(_padding); } return *this; } @@ -405,7 +424,7 @@ int32_t TensorInfo::offset_element_in_bytes(const Coordinates &pos) const int32_t offset = _offset_first_element_in_bytes; - for(size_t i = 0; i < _tensor_shape.num_dimensions(); ++i) + for (size_t i = 0; i < _tensor_shape.num_dimensions(); ++i) { offset += pos[i] * _strides_in_bytes[i]; } diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp index 0701ee7c9076b3e91ec283e8d1792753e0bf6896..90a7ac32c041fb418def9e1f2e2b4cf246be1d2d 100644 --- a/src/core/Utils.cpp +++ b/src/core/Utils.cpp @@ -49,7 +49,7 @@ std::string read_file(const std::string &filename, bool binary) fs.exceptions(std::ifstream::failbit | std::ifstream::badbit); std::ios_base::openmode mode = std::ios::in; - if(binary) + if (binary) { mode |= std::ios::binary; } @@ -66,7 +66,7 @@ std::string read_file(const std::string &filename, bool binary) out.assign(std::istreambuf_iterator(fs), std::istreambuf_iterator()); #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED } - catch(const std::ifstream::failure &e) + catch (const 
std::ifstream::failure &e) { ARM_COMPUTE_ERROR_VAR("Accessing %s: %s", filename.c_str(), e.what()); } @@ -77,32 +77,28 @@ std::string read_file(const std::string &filename, bool binary) const std::string &string_from_channel(Channel channel) { - static std::map channels_map = - { - { Channel::UNKNOWN, "UNKNOWN" }, - { Channel::R, "R" }, - { Channel::G, "G" }, - { Channel::B, "B" }, - { Channel::A, "A" }, - { Channel::Y, "Y" }, - { Channel::U, "U" }, - { Channel::V, "V" }, - { Channel::C0, "C0" }, - { Channel::C1, "C1" }, - { Channel::C2, "C2" }, - { Channel::C3, "C3" } - }; + static std::map channels_map = {{Channel::UNKNOWN, "UNKNOWN"}, + {Channel::R, "R"}, + {Channel::G, "G"}, + {Channel::B, "B"}, + {Channel::A, "A"}, + {Channel::Y, "Y"}, + {Channel::U, "U"}, + {Channel::V, "V"}, + {Channel::C0, "C0"}, + {Channel::C1, "C1"}, + {Channel::C2, "C2"}, + {Channel::C3, "C3"}}; return channels_map[channel]; } const std::string &string_from_border_mode(BorderMode border_mode) { - static std::map border_mode_map = - { - { BorderMode::UNDEFINED, "UNDEFINED" }, - { BorderMode::CONSTANT, "CONSTANT" }, - { BorderMode::REPLICATE, "REPLICATE" }, + static std::map border_mode_map = { + {BorderMode::UNDEFINED, "UNDEFINED"}, + {BorderMode::CONSTANT, "CONSTANT"}, + {BorderMode::REPLICATE, "REPLICATE"}, }; return border_mode_map[border_mode]; @@ -110,11 +106,10 @@ const std::string &string_from_border_mode(BorderMode border_mode) const std::string &string_from_norm_type(NormType type) { - static std::map norm_type_map = - { - { NormType::IN_MAP_1D, "IN_MAP_1D" }, - { NormType::IN_MAP_2D, "IN_MAP_2D" }, - { NormType::CROSS_MAP, "CROSS_MAP" }, + static std::map norm_type_map = { + {NormType::IN_MAP_1D, "IN_MAP_1D"}, + {NormType::IN_MAP_2D, "IN_MAP_2D"}, + {NormType::CROSS_MAP, "CROSS_MAP"}, }; return norm_type_map[type]; @@ -122,11 +117,10 @@ const std::string &string_from_norm_type(NormType type) const std::string &string_from_pooling_type(PoolingType type) { - static std::map pool_type_map = - { - { PoolingType::MAX, "MAX" }, - { PoolingType::AVG, "AVG" }, - { PoolingType::L2, "L2" }, + static std::map pool_type_map = { + {PoolingType::MAX, "MAX"}, + {PoolingType::AVG, "AVG"}, + {PoolingType::L2, "L2"}, }; return pool_type_map[type]; @@ -134,38 +128,36 @@ const std::string &string_from_pooling_type(PoolingType type) bool is_pool_region_entirely_outside_input(const PoolingLayerInfo &info) { - if(info.is_global_pooling || info.exclude_padding || info.pool_size.x() == 0 || info.pool_size.y() == 0) + if (info.is_global_pooling || info.exclude_padding || info.pool_size.x() == 0 || info.pool_size.y() == 0) { return false; } const auto ps = info.pad_stride_info; - const auto pool_le_padding_x = info.pool_size.x() <= std::max({ ps.pad_left(), ps.pad_right() }); - const auto pool_le_padding_y = info.pool_size.y() <= std::max({ ps.pad_top(), ps.pad_bottom() }); + const auto pool_le_padding_x = info.pool_size.x() <= std::max({ps.pad_left(), ps.pad_right()}); + const auto pool_le_padding_y = info.pool_size.y() <= std::max({ps.pad_top(), ps.pad_bottom()}); return pool_le_padding_x || pool_le_padding_y; } bool is_pool_3d_region_entirely_outside_input(const Pooling3dLayerInfo &info) { - if(info.is_global_pooling || info.pool_size.x() == 0 || info.pool_size.y() == 0 || info.pool_size.z() == 0) + if (info.is_global_pooling || info.pool_size.x() == 0 || info.pool_size.y() == 0 || info.pool_size.z() == 0) { return false; } const auto ps = info.padding; - const auto pool_le_padding_x = info.pool_size.x() <= std::max({ ps.left, 
ps.right }); - const auto pool_le_padding_y = info.pool_size.y() <= std::max({ ps.top, ps.bottom }); - const auto pool_le_padding_z = info.pool_size.z() <= std::max({ ps.front, ps.back }); + const auto pool_le_padding_x = info.pool_size.x() <= std::max({ps.left, ps.right}); + const auto pool_le_padding_y = info.pool_size.y() <= std::max({ps.top, ps.bottom}); + const auto pool_le_padding_z = info.pool_size.z() <= std::max({ps.front, ps.back}); return pool_le_padding_x || pool_le_padding_y || pool_le_padding_z; } const std::string &string_from_gemmlowp_output_stage(GEMMLowpOutputStageType output_stage) { - static std::map output_stage_map = - { - { GEMMLowpOutputStageType::NONE, "" }, - { GEMMLowpOutputStageType::QUANTIZE_DOWN, "quantize_down" }, - { GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, "quantize_down_fixedpoint" }, - { GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT, "quantize_down_float" } - }; + static std::map output_stage_map = { + {GEMMLowpOutputStageType::NONE, ""}, + {GEMMLowpOutputStageType::QUANTIZE_DOWN, "quantize_down"}, + {GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, "quantize_down_fixedpoint"}, + {GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT, "quantize_down_float"}}; return output_stage_map[output_stage]; } @@ -175,7 +167,7 @@ std::string string_from_pixel_value(const PixelValue &value, const DataType data std::stringstream ss; std::string converted_string; - switch(data_type) + switch (data_type) { case DataType::U8: case DataType::QASYMM8: @@ -223,11 +215,16 @@ std::string string_from_pixel_value(const PixelValue &value, const DataType data return converted_string; } -PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info, DataLayout data_layout, const Size2D &dilation, +PadStrideInfo calculate_same_pad(TensorShape input_shape, + TensorShape weights_shape, + PadStrideInfo conv_info, + DataLayout data_layout, + const Size2D &dilation, const DimensionRoundingType &rounding_type) { const auto &strides = conv_info.stride(); - ARM_COMPUTE_ERROR_ON_MSG((strides.first < 1 || strides.second < 1), "Stride values should be greater than or equal to 1."); + ARM_COMPUTE_ERROR_ON_MSG((strides.first < 1 || strides.second < 1), + "Stride values should be greater than or equal to 1."); const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); @@ -246,8 +243,9 @@ PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_sh const int real_weight_height = (kernel_height - 1) * dilation.y() + 1; // Calculate total pad - const int pad_width = std::max(0, static_cast((out_width - 1) * strides.first + real_weight_width - in_width)); - const int pad_height = std::max(0, static_cast((out_height - 1) * strides.second + real_weight_height - in_height)); + const int pad_width = std::max(0, static_cast((out_width - 1) * strides.first + real_weight_width - in_width)); + const int pad_height = + std::max(0, static_cast((out_height - 1) * strides.second + real_weight_height - in_height)); // Calculate individual paddings const unsigned int pad_left = pad_width / 2; @@ -265,8 +263,10 @@ PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_sh return same_info; } -std::pair deconvolution_output_dimensions(unsigned int in_width, unsigned int in_height, - unsigned int kernel_width, unsigned int kernel_height, +std::pair 
deconvolution_output_dimensions(unsigned int in_width, + unsigned int in_height, + unsigned int kernel_width, + unsigned int kernel_height, const PadStrideInfo &pad_stride_info) { const unsigned int pad_left = pad_stride_info.pad_left(); @@ -285,8 +285,10 @@ std::pair deconvolution_output_dimensions(unsigned i return std::make_pair(w, h); } -std::pair scaled_dimensions(int width, int height, - int kernel_width, int kernel_height, +std::pair scaled_dimensions(int width, + int height, + int kernel_width, + int kernel_height, const PadStrideInfo &pad_stride_info, const Size2D &dilation) { @@ -300,15 +302,25 @@ std::pair scaled_dimensions(int width, int height, const int stride_y = pad_stride_info.stride().second; int w = 0; int h = 0; - switch(pad_stride_info.round()) + switch (pad_stride_info.round()) { case DimensionRoundingType::FLOOR: - w = static_cast(std::floor((static_cast(width + pad_left + pad_right - (dilation_x * (kernel_width - 1) + 1)) / stride_x) + 1)); - h = static_cast(std::floor((static_cast(height + pad_top + pad_bottom - (dilation_y * (kernel_height - 1) + 1)) / stride_y) + 1)); + w = static_cast(std::floor( + (static_cast(width + pad_left + pad_right - (dilation_x * (kernel_width - 1) + 1)) / stride_x) + + 1)); + h = static_cast( + std::floor((static_cast(height + pad_top + pad_bottom - (dilation_y * (kernel_height - 1) + 1)) / + stride_y) + + 1)); break; case DimensionRoundingType::CEIL: - w = static_cast(std::ceil((static_cast(width + pad_left + pad_right - (dilation_x * (kernel_width - 1) + 1)) / stride_x) + 1)); - h = static_cast(std::ceil((static_cast(height + pad_top + pad_bottom - (dilation_y * (kernel_height - 1) + 1)) / stride_y) + 1)); + w = static_cast(std::ceil( + (static_cast(width + pad_left + pad_right - (dilation_x * (kernel_width - 1) + 1)) / stride_x) + + 1)); + h = static_cast( + std::ceil((static_cast(height + pad_top + pad_bottom - (dilation_y * (kernel_height - 1) + 1)) / + stride_y) + + 1)); break; default: ARM_COMPUTE_ERROR("Unsupported rounding type"); @@ -319,9 +331,8 @@ std::pair scaled_dimensions(int width, int height, return std::make_pair(static_cast(w), static_cast(h)); } -std::pair scaled_dimensions_signed(int width, int height, - int kernel_width, int kernel_height, - const PadStrideInfo &pad_stride_info) +std::pair scaled_dimensions_signed( + int width, int height, int kernel_width, int kernel_height, const PadStrideInfo &pad_stride_info) { const int pad_left = pad_stride_info.pad_left(); const int pad_top = pad_stride_info.pad_top(); @@ -331,15 +342,19 @@ std::pair scaled_dimensions_signed(int width, int height, const int stride_y = pad_stride_info.stride().second; int w = 0; int h = 0; - switch(pad_stride_info.round()) + switch (pad_stride_info.round()) { case DimensionRoundingType::FLOOR: - w = static_cast(std::floor((static_cast(width + pad_left + pad_right - kernel_width) / stride_x) + 1)); - h = static_cast(std::floor((static_cast(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1)); + w = static_cast( + std::floor((static_cast(width + pad_left + pad_right - kernel_width) / stride_x) + 1)); + h = static_cast( + std::floor((static_cast(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1)); break; case DimensionRoundingType::CEIL: - w = static_cast(std::ceil((static_cast(width + pad_left + pad_right - kernel_width) / stride_x) + 1)); - h = static_cast(std::ceil((static_cast(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1)); + w = static_cast( + std::ceil((static_cast(width + pad_left + pad_right - 
kernel_width) / stride_x) + 1)); + h = static_cast( + std::ceil((static_cast(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1)); break; default: ARM_COMPUTE_ERROR("Unsupported rounding type"); @@ -348,8 +363,12 @@ std::pair scaled_dimensions_signed(int width, int height, return std::make_pair(static_cast(w), static_cast(h)); } -std::tuple scaled_3d_dimensions_signed(int width, int height, int depth, - int kernel_width, int kernel_height, int kernel_depth, +std::tuple scaled_3d_dimensions_signed(int width, + int height, + int depth, + int kernel_width, + int kernel_height, + int kernel_depth, const Pooling3dLayerInfo &pool3d_info) { const int pad_left = pool3d_info.padding.left; @@ -365,17 +384,23 @@ std::tuple scaled_3d_dimensions_signed(int width, int height, int int h = 0; int d = 0; - switch(pool3d_info.round_type) + switch (pool3d_info.round_type) { case DimensionRoundingType::FLOOR: - w = static_cast(std::floor((static_cast(width + pad_left + pad_right - kernel_width) / stride_x) + 1)); - h = static_cast(std::floor((static_cast(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1)); - d = static_cast(std::floor((static_cast(depth + pad_front + pad_back - kernel_depth) / stride_z) + 1)); + w = static_cast( + std::floor((static_cast(width + pad_left + pad_right - kernel_width) / stride_x) + 1)); + h = static_cast( + std::floor((static_cast(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1)); + d = static_cast( + std::floor((static_cast(depth + pad_front + pad_back - kernel_depth) / stride_z) + 1)); break; case DimensionRoundingType::CEIL: - w = static_cast(std::ceil((static_cast(width + pad_left + pad_right - kernel_width) / stride_x) + 1)); - h = static_cast(std::ceil((static_cast(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1)); - d = static_cast(std::ceil((static_cast(depth + pad_front + pad_back - kernel_depth) / stride_z) + 1)); + w = static_cast( + std::ceil((static_cast(width + pad_left + pad_right - kernel_width) / stride_x) + 1)); + h = static_cast( + std::ceil((static_cast(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1)); + d = static_cast( + std::ceil((static_cast(depth + pad_front + pad_back - kernel_depth) / stride_z) + 1)); break; default: ARM_COMPUTE_ERROR("Unsupported rounding type"); @@ -390,7 +415,7 @@ bool needs_serialized_reduction(ReductionOperation op, DataType dt, unsigned int const bool is_quantized_type = is_data_type_quantized(dt); const bool is_first_dim = (axis == 0); - return !is_first_dim || is_min_max || is_quantized_type; + return !is_first_dim || (is_quantized_type && !is_min_max); } QuantizationInfo get_softmax_output_quantization_info(DataType input_type, bool is_log) @@ -400,9 +425,9 @@ QuantizationInfo get_softmax_output_quantization_info(DataType input_type, bool // * Softmax with QASYMM8_SIGNED: scale = 1/256, offset = -128 // * LogSoftmax with QASYMM8: scale = 1/256, offset = 0 // * LogSoftmax with QASYMM8_SIGNED: scale = 16/256, offset = 127 - if(is_data_type_quantized_asymmetric_signed(input_type)) + if (is_data_type_quantized_asymmetric_signed(input_type)) { - if(is_log) + if (is_log) { return QuantizationInfo(16.f / 256, 127); } @@ -414,17 +439,21 @@ QuantizationInfo get_softmax_output_quantization_info(DataType input_type, bool return QuantizationInfo(1.f / 256, 0); } -std::pair get_quantized_activation_min_max(const ActivationLayerInfo &act_info, DataType data_type, UniformQuantizationInfo oq_info) +std::pair get_quantized_activation_min_max(const ActivationLayerInfo 
&act_info, + DataType data_type, + UniformQuantizationInfo oq_info) { const bool is_qasymm8_signed = is_data_type_quantized_asymmetric_signed(data_type); const auto a = act_info.a(); const auto b = act_info.b(); - const int a_int = is_qasymm8_signed ? quantize_qasymm8_signed(a, oq_info) : quantize_qasymm8(a, oq_info); - const int b_int = is_qasymm8_signed ? quantize_qasymm8_signed(b, oq_info) : quantize_qasymm8(b, oq_info); - const auto type_max_value = std::get<1>(get_min_max(data_type)).get(); + const int a_int = is_qasymm8_signed ? quantize_qasymm8_signed(a, oq_info) : quantize_qasymm8(a, oq_info); + const int b_int = is_qasymm8_signed ? quantize_qasymm8_signed(b, oq_info) : quantize_qasymm8(b, oq_info); + const auto type_max_value = std::get<1>(get_min_max(data_type)).get(); - const int32_t min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? oq_info.offset : b_int; - const int32_t max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? type_max_value : a_int; + const int32_t min_activation = + act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? oq_info.offset : b_int; + const int32_t max_activation = + act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? type_max_value : a_int; return std::make_pair(min_activation, max_activation); } @@ -433,11 +462,11 @@ std::unordered_map get_padding_info(std::initi { std::unordered_map res; - for(const ITensor *tensor : tensors) + for (const ITensor *tensor : tensors) { - if(tensor) + if (tensor) { - res.insert({ tensor->info(), tensor->info()->padding() }); + res.insert({tensor->info(), tensor->info()->padding()}); } } @@ -448,11 +477,11 @@ std::unordered_map get_padding_info(std::initi { std::unordered_map res; - for(const ITensorInfo *info : infos) + for (const ITensorInfo *info : infos) { - if(info) + if (info) { - res.insert({ info, info->padding() }); + res.insert({info, info->padding()}); } } @@ -461,17 +490,20 @@ std::unordered_map get_padding_info(std::initi bool has_padding_changed(const std::unordered_map &padding_map) { - return std::find_if(padding_map.begin(), padding_map.end(), [](const std::pair &padding_info) - { - return (padding_info.first->padding() != padding_info.second); - }) - != padding_map.end(); + return std::find_if(padding_map.begin(), padding_map.end(), + [](const std::pair &padding_info) + { return (padding_info.first->padding() != padding_info.second); }) != padding_map.end(); } #ifdef ARM_COMPUTE_ASSERTS_ENABLED -void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n, int stream_width, const std::string &element_delim) +void print_consecutive_elements(std::ostream &s, + DataType dt, + const uint8_t *ptr, + unsigned int n, + int stream_width, + const std::string &element_delim) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::QASYMM8: @@ -481,36 +513,46 @@ void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr case DataType::QSYMM8: case DataType::QASYMM8_SIGNED: case DataType::QSYMM8_PER_CHANNEL: - print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, element_delim); + print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, + element_delim); break; case DataType::U16: case DataType::QASYMM16: - print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, element_delim); + print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, + 
element_delim); break; case DataType::S16: case DataType::QSYMM16: - print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, element_delim); + print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, + element_delim); break; case DataType::U32: - print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, element_delim); + print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, + element_delim); break; case DataType::S32: - print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, element_delim); + print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, + element_delim); break; case DataType::U64: - print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, element_delim); + print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, + element_delim); break; case DataType::S64: - print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, element_delim); + print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, + element_delim); break; case DataType::BFLOAT16: - print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, element_delim); + print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, + element_delim); break; case DataType::F16: - print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, element_delim); + print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, + element_delim); break; case DataType::F32: - print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, element_delim); + print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, + element_delim); break; default: ARM_COMPUTE_ERROR("Undefined element size for given data type"); @@ -519,7 +561,7 @@ void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr int max_consecutive_elements_display_width(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::QASYMM8: diff --git a/src/core/Validate.cpp b/src/core/Validate.cpp index 5a6486e11e8f0407f535f38c7b29cea5301fe64a..d8f796193ee16fad3e5d5c344b313187252cc775 100644 --- a/src/core/Validate.cpp +++ b/src/core/Validate.cpp @@ -23,13 +23,16 @@ */ #include "arm_compute/core/Validate.h" -arm_compute::Status arm_compute::error_on_mismatching_windows(const char *function, const char *file, const int line, - const arm_compute::Window &full, const arm_compute::Window &win) +arm_compute::Status arm_compute::error_on_mismatching_windows(const char *function, + const char *file, + const int line, + const arm_compute::Window &full, + const arm_compute::Window &win) { full.validate(); win.validate(); - for(size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i) + for (size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].start() != win[i].start(), function, file, line); ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].end() != win[i].end(), function, file, line); @@ -38,13 +41,16 @@ arm_compute::Status arm_compute::error_on_mismatching_windows(const char *functi return arm_compute::Status{}; } -arm_compute::Status arm_compute::error_on_invalid_subwindow(const char *function, const char *file, const int line, - const arm_compute::Window &full, const arm_compute::Window &sub) +arm_compute::Status arm_compute::error_on_invalid_subwindow(const char *function, + const 
char *file, + const int line, + const arm_compute::Window &full, + const arm_compute::Window &sub) { full.validate(); sub.validate(); - for(size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i) + for (size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].start() > sub[i].start(), function, file, line); ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].end() < sub[i].end(), function, file, line); @@ -54,8 +60,12 @@ arm_compute::Status arm_compute::error_on_invalid_subwindow(const char *function return arm_compute::Status{}; } -arm_compute::Status arm_compute::error_on_window_not_collapsable_at_dimension(const char *function, const char *file, const int line, - const arm_compute::Window &full, const arm_compute::Window &window, const int dim) +arm_compute::Status arm_compute::error_on_window_not_collapsable_at_dimension(const char *function, + const char *file, + const int line, + const arm_compute::Window &full, + const arm_compute::Window &window, + const int dim) { full.validate(); window.validate(); @@ -67,65 +77,73 @@ arm_compute::Status arm_compute::error_on_window_not_collapsable_at_dimension(co return arm_compute::Status{}; } -arm_compute::Status arm_compute::error_on_coordinates_dimensions_gte(const char *function, const char *file, const int line, - const arm_compute::Coordinates &pos, unsigned int max_dim) +arm_compute::Status arm_compute::error_on_coordinates_dimensions_gte( + const char *function, const char *file, const int line, const arm_compute::Coordinates &pos, unsigned int max_dim) { - for(unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i) + for (unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(pos[i] != 0, function, file, line); } return arm_compute::Status{}; } -arm_compute::Status arm_compute::error_on_window_dimensions_gte(const char *function, const char *file, const int line, - const arm_compute::Window &win, unsigned int max_dim) +arm_compute::Status arm_compute::error_on_window_dimensions_gte( + const char *function, const char *file, const int line, const arm_compute::Window &win, unsigned int max_dim) { - for(unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i) + for (unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i) { - ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR((win[i].start() != 0) || (win[i].end() != win[i].step()), - function, file, line, - "Maximum number of dimensions expected %u but dimension %u is not empty", max_dim, i); + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR( + (win[i].start() != 0) || (win[i].end() != win[i].step()), function, file, line, + "Maximum number of dimensions expected %u but dimension %u is not empty", max_dim, i); } return arm_compute::Status{}; } -arm_compute::Status arm_compute::error_on_tensor_not_2d(const char *function, const char *file, const int line, +arm_compute::Status arm_compute::error_on_tensor_not_2d(const char *function, + const char *file, + const int line, const arm_compute::ITensor *tensor) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line); ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor->info() == nullptr, function, file, line); - ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor->info()->num_dimensions() != 2, - function, file, line, - "Only 2D Tensors are supported by this kernel (%zu passed)", tensor->info()->num_dimensions()); + 
ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor->info()->num_dimensions() != 2, function, file, line, + "Only 2D Tensors are supported by this kernel (%zu passed)", + tensor->info()->num_dimensions()); return arm_compute::Status{}; } -arm_compute::Status arm_compute::error_on_tensor_not_2d(const char *function, const char *file, const int line, +arm_compute::Status arm_compute::error_on_tensor_not_2d(const char *function, + const char *file, + const int line, const arm_compute::ITensorInfo *tensor) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line); - ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor->num_dimensions() != 2, - function, file, line, - "Only 2D Tensors are supported by this kernel (%zu passed)", tensor->num_dimensions()); + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor->num_dimensions() != 2, function, file, line, + "Only 2D Tensors are supported by this kernel (%zu passed)", + tensor->num_dimensions()); return arm_compute::Status{}; } -arm_compute::Status arm_compute::error_on_channel_not_in_known_format(const char *function, const char *file, const int line, - arm_compute::Format fmt, arm_compute::Channel cn) +arm_compute::Status arm_compute::error_on_channel_not_in_known_format( + const char *function, const char *file, const int line, arm_compute::Format fmt, arm_compute::Channel cn) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(fmt == arm_compute::Format::UNKNOWN, function, file, line); ARM_COMPUTE_RETURN_ERROR_ON_LOC(cn == arm_compute::Channel::UNKNOWN, function, file, line); - switch(fmt) + switch (fmt) { case arm_compute::Format::RGB888: - arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R, arm_compute::Channel::G, arm_compute::Channel::B); + arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R, + arm_compute::Channel::G, arm_compute::Channel::B); break; case arm_compute::Format::RGBA8888: - arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R, arm_compute::Channel::G, arm_compute::Channel::B, arm_compute::Channel::A); + arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R, + arm_compute::Channel::G, arm_compute::Channel::B, + arm_compute::Channel::A); break; case arm_compute::Format::UV88: - arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::U, arm_compute::Channel::V); + arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::U, + arm_compute::Channel::V); break; case arm_compute::Format::IYUV: case arm_compute::Format::UYVY422: @@ -133,7 +151,8 @@ arm_compute::Status arm_compute::error_on_channel_not_in_known_format(const char case arm_compute::Format::NV12: case arm_compute::Format::NV21: case arm_compute::Format::YUV444: - arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::Y, arm_compute::Channel::U, arm_compute::Channel::V); + arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::Y, + arm_compute::Channel::U, arm_compute::Channel::V); break; default: ARM_COMPUTE_ERROR_LOC(function, file, line, "Not supported format."); @@ -141,21 +160,26 @@ arm_compute::Status arm_compute::error_on_channel_not_in_known_format(const char return arm_compute::Status{}; } -arm_compute::Status arm_compute::error_on_unconfigured_kernel(const char *function, const char *file, const int line, +arm_compute::Status arm_compute::error_on_unconfigured_kernel(const char *function, + const char *file, + const int line, 
const arm_compute::IKernel *kernel) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(kernel == nullptr, function, file, line); - ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(!kernel->is_window_configured(), - function, file, line, + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(!kernel->is_window_configured(), function, file, line, "This kernel hasn't been configured."); return arm_compute::Status{}; } -arm_compute::Status arm_compute::error_on_invalid_subtensor(const char *function, const char *file, const int line, - const TensorShape &parent_shape, const Coordinates &coords, const TensorShape &shape) +arm_compute::Status arm_compute::error_on_invalid_subtensor(const char *function, + const char *file, + const int line, + const TensorShape &parent_shape, + const Coordinates &coords, + const TensorShape &shape) { // Check dimensions - for(unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i) + for (unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i) { const bool invalid_idx = coords[i] >= static_cast<int>(parent_shape[i]); const bool out_of_bounds_size = coords[i] + static_cast<int>(shape[i]) > static_cast<int>(parent_shape[i]); @@ -164,15 +188,20 @@ arm_compute::Status arm_compute::error_on_invalid_subtensor(const char *function return arm_compute::Status{}; } -arm_compute::Status arm_compute::error_on_invalid_subtensor_valid_region(const char *function, const char *file, const int line, - const ValidRegion &parent_valid_region, const ValidRegion &valid_region) +arm_compute::Status arm_compute::error_on_invalid_subtensor_valid_region(const char *function, + const char *file, + const int line, + const ValidRegion &parent_valid_region, + const ValidRegion &valid_region) { // Check valid regions - for(unsigned int d = 0; d < TensorShape::num_max_dimensions; ++d) + for (unsigned int d = 0; d < TensorShape::num_max_dimensions; ++d) { ARM_COMPUTE_RETURN_ERROR_ON_LOC((parent_valid_region.anchor[d] > valid_region.anchor[d]), function, file, line); - ARM_COMPUTE_RETURN_ERROR_ON_LOC((parent_valid_region.anchor[d] + static_cast<int>(parent_valid_region.shape[d])) < (valid_region.anchor[d] + static_cast<int>(valid_region.shape[d])), - function, file, line); + ARM_COMPUTE_RETURN_ERROR_ON_LOC( + (parent_valid_region.anchor[d] + static_cast<int>(parent_valid_region.shape[d])) < + (valid_region.anchor[d] + static_cast<int>(valid_region.shape[d])), + function, file, line); } return arm_compute::Status{}; diff --git a/src/core/common/Macros.h b/src/core/common/Macros.h index d791154e5c4d4853d7247b7821e509638e7ed632..bc0ea299118ee066ee4906f4f862a2f5b5dbc7d6 100644 --- a/src/core/common/Macros.h +++ b/src/core/common/Macros.h @@ -25,9 +25,9 @@ #define ARM_COMPUTE_COMMON_MACROS_H #define ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(TypeName) \ - TypeName(const TypeName &) = delete; \ + TypeName(const TypeName &) = delete; \ TypeName &operator=(const TypeName &) = delete; \ TypeName(TypeName &&) = default; \ - TypeName &operator=(TypeName &&) = default + TypeName &operator=(TypeName &&) = default #endif /* ARM_COMPUTE_COMMON_MACROS_H */ diff --git a/src/core/common/Registrars.h b/src/core/common/Registrars.h index d6dc3449fcd975e0b066074e5ccffd86cf6cff69..686304b8d7451ba0c5cc955f27d9f2b748a45d90 100644 --- a/src/core/common/Registrars.h +++ b/src/core/common/Registrars.h @@ -46,7 +46,7 @@ #else /* !defined(ENABLE_FP16_KERNELS) */ #define REGISTER_FP16_NEON(func_name) nullptr -#define REGISTER_FP16_SVE(func_name) nullptr +#define REGISTER_FP16_SVE(func_name) nullptr #define REGISTER_FP16_SVE2(func_name) nullptr #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
&& defined(ENABLE_FP16_KERNELS) */ @@ -72,7 +72,7 @@ #else /* defined(ENABLE_FP32_KERNELS) */ #define REGISTER_FP32_NEON(func_name) nullptr -#define REGISTER_FP32_SVE(func_name) nullptr +#define REGISTER_FP32_SVE(func_name) nullptr #define REGISTER_FP32_SVE2(func_name) nullptr #endif /* defined(ENABLE_FP32_KERNELS) */ @@ -94,7 +94,7 @@ #else /* defined(ENABLE_QASYMM8_SIGNED_KERNELS) */ #define REGISTER_QASYMM8_SIGNED_NEON(func_name) nullptr -#define REGISTER_QASYMM8_SIGNED_SVE(func_name) nullptr +#define REGISTER_QASYMM8_SIGNED_SVE(func_name) nullptr #define REGISTER_QASYMM8_SIGNED_SVE2(func_name) nullptr #endif /* defined(ENABLE_QASYMM8_SIGNED_KERNELS) */ @@ -115,7 +115,7 @@ #else /* defined(ENABLE_QASYMM8_KERNELS) */ #define REGISTER_QASYMM8_NEON(func_name) nullptr -#define REGISTER_QASYMM8_SVE(func_name) nullptr +#define REGISTER_QASYMM8_SVE(func_name) nullptr #define REGISTER_QASYMM8_SVE2(func_name) nullptr #endif /* defined(ENABLE_QASYMM8_KERNELS) */ @@ -137,7 +137,7 @@ #else /* defined(ENABLE_QSYMM16_KERNELS) */ #define REGISTER_QSYMM16_NEON(func_name) nullptr -#define REGISTER_QSYMM16_SVE(func_name) nullptr +#define REGISTER_QSYMM16_SVE(func_name) nullptr #define REGISTER_QSYMM16_SVE2(func_name) nullptr #endif /* defined(ENABLE_QSYMM16_KERNELS) */ @@ -169,7 +169,7 @@ #else /* defined(ENABLE_INTEGER_KERNELS) */ #define REGISTER_INTEGER_NEON(func_name) nullptr -#define REGISTER_INTEGER_SVE(func_name) nullptr +#define REGISTER_INTEGER_SVE(func_name) nullptr #define REGISTER_INTEGER_SVE2(func_name) nullptr #endif /* defined(ENABLE_INTEGER_KERNELS) */ diff --git a/src/core/experimental/PostOpUtils.h b/src/core/experimental/PostOpUtils.h deleted file mode 100644 index 6217dcc3da745e8001161f21333ffc32c32d0f75..0000000000000000000000000000000000000000 --- a/src/core/experimental/PostOpUtils.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2021, 2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
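Worth noting about the Registrars.h hunks above: when a kernel family is compiled out, its REGISTER_* macro expands to nullptr instead of a function pointer, so dispatch tables can be built unconditionally and disabled entries simply skipped. The standalone sketch below illustrates that pattern; the dispatch loop, the kernel names and the local REGISTER_FP16_NEON definition are illustrative assumptions, not the library's selector code.

    #include <cstdio>

    using kernel_fn = void (*)();

    static void neon_fp32_kernel() { std::puts("running the fp32 NEON kernel"); }

    #ifdef ENABLE_FP16_KERNELS
    static void neon_fp16_kernel() { std::puts("running the fp16 NEON kernel"); }
    #define REGISTER_FP16_NEON(func_name) &(func_name)
    #else
    // Mirrors the fallback branch above: a compiled-out family registers as nullptr.
    #define REGISTER_FP16_NEON(func_name) nullptr
    #endif

    int main()
    {
        const kernel_fn candidates[] = {
            REGISTER_FP16_NEON(neon_fp16_kernel), // nullptr unless ENABLE_FP16_KERNELS is defined
            &neon_fp32_kernel,                    // always present in this sketch
        };

        for (kernel_fn fn : candidates)
        {
            if (fn != nullptr) // entries for disabled kernel families are skipped
            {
                fn();
                break;
            }
        }
        return 0;
    }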
- */ -#ifndef ARM_COMPUTE_EXPERIMENTAL_POSTOPUTILS -#define ARM_COMPUTE_EXPERIMENTAL_POSTOPUTILS - -#include "arm_compute/core/experimental/IPostOp.h" -#include "arm_compute/core/experimental/PostOps.h" - -#include "arm_compute/core/experimental/Types.h" -#include "support/Cast.h" - -#include - -/** (EXPERIMENTAL_POST_OPS) */ -namespace arm_compute -{ -namespace experimental -{ -/** Transform a PostOpList of type FromTensorT to one of type ToTensorT */ -template -PostOpList transform_post_op_list_arguments(const PostOpList &post_ops, std::function transform_arg) -{ - PostOpList transformed_post_ops; - for(const auto &post_op : post_ops.get_list()) - { - switch(post_op->type()) - { - case PostOpType::Activation: - { - const auto _post_op = utils::cast::polymorphic_downcast *>(post_op.get()); - transformed_post_ops.template push_back_op>(_post_op->_act_info); - break; - } - case PostOpType::Eltwise_Add: - { - const auto _post_op = utils::cast::polymorphic_downcast *>(post_op.get()); - transformed_post_ops.template push_back_op>(transform_arg(_post_op->_addend), _post_op->_prev_dst_pos, _post_op->_policy); - break; - } - case PostOpType::Eltwise_PRelu: - { - const auto _post_op = utils::cast::polymorphic_downcast *>(post_op.get()); - transformed_post_ops.template push_back_op>(transform_arg(_post_op->_alpha_param), _post_op->_prev_dst_pos, _post_op->_policy); - break; - } - default: - { - ARM_COMPUTE_ERROR("Unsupported PostOpType"); - } - } - } - return transformed_post_ops; -} - -/** Get post op argument TensorType from post op argument index in a flattened, ordered post op argument list */ -inline TensorType get_post_op_arg_type(size_t index) -{ - ARM_COMPUTE_ERROR_ON_MSG(static_cast(index) > EXPERIMENTAL_ACL_POST_OP_ARG_LAST - EXPERIMENTAL_ACL_POST_OP_ARG_FIRST, "Post Op argument index is out of range"); - return static_cast(EXPERIMENTAL_ACL_POST_OP_ARG_FIRST + static_cast(index)); -} - -/** Get a sequence of PostOp Types from PostOpList */ -template -PostOpTypeSequence get_post_op_sequence(const PostOpList &post_ops) -{ - PostOpTypeSequence post_op_sequence; - for(const auto &op : post_ops.get_list()) - { - post_op_sequence.push_back(op->type()); - } - return post_op_sequence; -} - -} // namespace experimental -} // namespace arm_compute -#endif //ARM_COMPUTE_EXPERIMENTAL_POSTOPUTILS diff --git a/src/core/helpers/AutoConfiguration.h b/src/core/helpers/AutoConfiguration.h index 8715dcd74b4d365fb62417c85752253420e0b398..9df2a769834620b1064d657abdc8e2000f8474ed 100644 --- a/src/core/helpers/AutoConfiguration.h +++ b/src/core/helpers/AutoConfiguration.h @@ -24,9 +24,9 @@ #ifndef SRC_CORE_HELPERS_AUTOCONFIGURATION_H #define SRC_CORE_HELPERS_AUTOCONFIGURATION_H -#include "arm_compute/core/utils/DataTypeUtils.h" #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/DataTypeUtils.h" namespace arm_compute { @@ -42,10 +42,11 @@ namespace arm_compute */ inline bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, - int num_channels, DataType data_type, - QuantizationInfo quantization_info = QuantizationInfo()) + int num_channels, + DataType data_type, + QuantizationInfo quantization_info = QuantizationInfo()) { - if(info.tensor_shape().total_size() == 0) + if (info.tensor_shape().total_size() == 0) { info.set_data_type(data_type); info.set_num_channels(num_channels); @@ -70,7 +71,7 @@ inline bool auto_init_if_empty(ITensorInfo &info, */ inline bool auto_init_if_empty(ITensorInfo &info_sink, const ITensorInfo &info_source) { - 
if(info_sink.tensor_shape().total_size() == 0) + if (info_sink.tensor_shape().total_size() == 0) { info_sink.set_data_type(info_source.data_type()); info_sink.set_num_channels(info_source.num_channels()); @@ -93,7 +94,7 @@ inline bool auto_init_if_empty(ITensorInfo &info_sink, const ITensorInfo &info_s */ inline bool set_shape_if_empty(ITensorInfo &info, const TensorShape &shape) { - if(info.tensor_shape().total_size() == 0) + if (info.tensor_shape().total_size() == 0) { info.set_tensor_shape(shape); return true; @@ -112,7 +113,7 @@ inline bool set_shape_if_empty(ITensorInfo &info, const TensorShape &shape) */ inline bool set_format_if_unknown(ITensorInfo &info, Format format) { - if(info.data_type() == DataType::UNKNOWN) + if (info.data_type() == DataType::UNKNOWN) { info.set_format(format); return true; @@ -131,7 +132,7 @@ inline bool set_format_if_unknown(ITensorInfo &info, Format format) */ inline bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type) { - if(info.data_type() == DataType::UNKNOWN) + if (info.data_type() == DataType::UNKNOWN) { info.set_data_type(data_type); return true; @@ -150,7 +151,7 @@ inline bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type) */ inline bool set_data_layout_if_unknown(ITensorInfo &info, DataLayout data_layout) { - if(info.data_layout() == DataLayout::UNKNOWN) + if (info.data_layout() == DataLayout::UNKNOWN) { info.set_data_layout(data_layout); return true; @@ -169,7 +170,7 @@ inline bool set_data_layout_if_unknown(ITensorInfo &info, DataLayout data_layout */ inline bool set_quantization_info_if_empty(ITensorInfo &info, QuantizationInfo quantization_info) { - if(info.quantization_info().empty() && (is_data_type_quantized_asymmetric(info.data_type()))) + if (info.quantization_info().empty() && (is_data_type_quantized_asymmetric(info.data_type()))) { info.set_quantization_info(quantization_info); return true; diff --git a/src/core/helpers/MemoryHelpers.h b/src/core/helpers/MemoryHelpers.h index a41052687bc82e83eae9fb8c81f1c8f1d2cad4f6..dd094b414c28491f7d932a70345f875bcbbaa526 100644 --- a/src/core/helpers/MemoryHelpers.h +++ b/src/core/helpers/MemoryHelpers.h @@ -24,9 +24,9 @@ #ifndef SRC_COMMON_MEMORY_HELPERS_H #define SRC_COMMON_MEMORY_HELPERS_H +#include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/experimental/Types.h" #include "arm_compute/runtime/MemoryGroup.h" #include @@ -43,18 +43,17 @@ inline int offset_int_vec(int offset) template struct WorkspaceDataElement { - int slot{ -1 }; - experimental::MemoryLifetime lifetime{ experimental::MemoryLifetime::Temporary }; - std::unique_ptr tensor{ nullptr }; + int slot{-1}; + experimental::MemoryLifetime lifetime{experimental::MemoryLifetime::Temporary}; + std::unique_ptr tensor{nullptr}; }; template using WorkspaceData = std::vector>; template -WorkspaceData manage_workspace(const experimental::MemoryRequirements &mem_reqs, - MemoryGroup &mgroup, - ITensorPack &run_pack) +WorkspaceData +manage_workspace(const experimental::MemoryRequirements &mem_reqs, MemoryGroup &mgroup, ITensorPack &run_pack) { ITensorPack dummy_pack = ITensorPack(); return manage_workspace(mem_reqs, mgroup, run_pack, dummy_pack); @@ -63,24 +62,26 @@ WorkspaceData manage_workspace(const experimental::MemoryRequirement template WorkspaceData manage_workspace(const experimental::MemoryRequirements &mem_reqs, MemoryGroup &mgroup, - ITensorPack &run_pack, ITensorPack &prep_pack) + ITensorPack 
&run_pack, + ITensorPack &prep_pack) { WorkspaceData workspace_memory; - for(const auto &req : mem_reqs) + for (const auto &req : mem_reqs) { - if(req.size == 0) + if (req.size == 0) { continue; } - const auto aux_info = TensorInfo{ TensorShape(req.size), 1, DataType::U8 }; - workspace_memory.emplace_back(WorkspaceDataElement { req.slot, req.lifetime, std::make_unique() }); + const auto aux_info = TensorInfo{TensorShape(req.size), 1, DataType::U8}; + workspace_memory.emplace_back( + WorkspaceDataElement{req.slot, req.lifetime, std::make_unique()}); auto aux_tensor = workspace_memory.back().tensor.get(); ARM_COMPUTE_ERROR_ON_NULLPTR(aux_tensor); aux_tensor->allocator()->init(aux_info, req.alignment); - if(req.lifetime == experimental::MemoryLifetime::Temporary) + if (req.lifetime == experimental::MemoryLifetime::Temporary) { mgroup.manage(aux_tensor); } @@ -91,7 +92,7 @@ WorkspaceData manage_workspace(const experimental::MemoryRequirement run_pack.add_tensor(req.slot, aux_tensor); } - for(auto &mem : workspace_memory) + for (auto &mem : workspace_memory) { auto tensor = mem.tensor.get(); tensor->allocator()->allocate(); @@ -103,31 +104,29 @@ WorkspaceData manage_workspace(const experimental::MemoryRequirement template void release_prepare_tensors(WorkspaceData &workspace, ITensorPack &prep_pack) { - workspace.erase(std::remove_if(workspace.begin(), - workspace.end(), - [&prep_pack](auto & wk) - { - const bool to_erase = wk.lifetime == experimental::MemoryLifetime::Prepare; - if(to_erase) - { - prep_pack.remove_tensor(wk.slot); - } - return to_erase; - }), - workspace.end()); + workspace.erase(std::remove_if(workspace.begin(), workspace.end(), + [&prep_pack](auto &wk) + { + const bool to_erase = wk.lifetime == experimental::MemoryLifetime::Prepare; + if (to_erase) + { + prep_pack.remove_tensor(wk.slot); + } + return to_erase; + }), + workspace.end()); } /** Utility function to release tensors with lifetime marked as Prepare */ template -void release_temporaries(const experimental::MemoryRequirements &mem_reqs, - WorkspaceData &workspace) +void release_temporaries(const experimental::MemoryRequirements &mem_reqs, WorkspaceData &workspace) { - for(auto &ws : workspace) + for (auto &ws : workspace) { const int slot = ws.slot; - for(auto &m : mem_reqs) + for (auto &m : mem_reqs) { - if(m.slot == slot && m.lifetime == experimental::MemoryLifetime::Prepare) + if (m.slot == slot && m.lifetime == experimental::MemoryLifetime::Prepare) { auto tensor = ws.tensor.get(); tensor->allocator()->free(); diff --git a/src/core/helpers/PoolingHelpers.h b/src/core/helpers/PoolingHelpers.h index 079629ee6a86d2287879e5ed5abfc78c22f3b9fd..9ef045f472e3a45c7d577c1c35758fdd6cf70772 100644 --- a/src/core/helpers/PoolingHelpers.h +++ b/src/core/helpers/PoolingHelpers.h @@ -33,8 +33,20 @@ namespace cpu namespace { -inline float calculate_avg_scale_pool3d(bool exclude_padding, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int pool_size_z, const int upper_bound_w, - const int upper_bound_h, const int upper_bound_d, const int pad_x, const int pad_y, const int pad_z, const int stride_x, const int stride_y, const int stride_z) +inline float calculate_avg_scale_pool3d(bool exclude_padding, + const Coordinates &id, + const int pool_size_x, + const int pool_size_y, + const int pool_size_z, + const int upper_bound_w, + const int upper_bound_h, + const int upper_bound_d, + const int pad_x, + const int pad_y, + const int pad_z, + const int stride_x, + const int stride_y, + const int stride_z) { // 
Based on NDHWC int start_x = id[1] * stride_x - pad_x; @@ -44,7 +56,7 @@ inline float calculate_avg_scale_pool3d(bool exclude_padding, const Coordinates const int end_x = std::min(start_x + pool_size_x, upper_bound_w); const int end_y = std::min(start_y + pool_size_y, upper_bound_h); const int end_z = std::min(start_z + pool_size_z, upper_bound_d); - if(exclude_padding) + if (exclude_padding) { start_x = std::max(0, start_x); start_y = std::max(0, start_y); @@ -53,8 +65,17 @@ inline float calculate_avg_scale_pool3d(bool exclude_padding, const Coordinates return 1.f / ((end_y - start_y) * (end_x - start_x) * (end_z - start_z)); } -inline float calculate_avg_scale_pool2d(bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h, - const int pad_x, const int pad_y, const int stride_x, const int stride_y) +inline float calculate_avg_scale_pool2d(bool exclude_padding, + DataLayout data_layout, + const Coordinates &id, + const int pool_size_x, + const int pool_size_y, + const int upper_bound_w, + const int upper_bound_h, + const int pad_x, + const int pad_y, + const int stride_x, + const int stride_y) { const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); @@ -64,7 +85,7 @@ inline float calculate_avg_scale_pool2d(bool exclude_padding, DataLayout data_la const int end_x = std::min(start_x + pool_size_x, upper_bound_w); const int end_y = std::min(start_y + pool_size_y, upper_bound_h); - if(exclude_padding) + if (exclude_padding) { start_x = std::max(0, start_x); start_y = std::max(0, start_y); @@ -117,17 +138,26 @@ inline float32x4_t vcvtq_f32_q32(int32x4_t values) } template -inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset); +inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc, + const float quant_rescale, + const float scale_pooling, + const int32_t new_offset); template <> -inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset) +inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, + const float quant_rescale, + const float scale_pooling, + const int32_t new_offset) { const float new_scale = quant_rescale / scale_pooling; return vquantize(acc, UniformQuantizationInfo(new_scale, new_offset)); } template <> -inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset) +inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, + const float quant_rescale, + const float scale_pooling, + const int32_t new_offset) { const float new_scale = quant_rescale / scale_pooling; return vquantize_signed(acc, UniformQuantizationInfo(new_scale, new_offset)); @@ -139,30 +169,24 @@ inline Tout vrequantize_pooling(Tin vec1, Tin vec2, const UniformQuantizationInf template <> inline uint8x16_t vrequantize_pooling(uint8x8_t vec1, uint8x8_t vec2, const UniformQuantizationInfo &requant_qinfo) { - const float32x4x4_t acc = - { - { - vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))), - vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))), - 
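The calculate_avg_scale_pool2d()/..._pool3d() helpers above return the reciprocal of the number of positions the pooling window actually covers, clamping the window start to the tensor when padding is excluded. A minimal scalar restatement of the 2D case (a hypothetical free function; explicit output coordinates replace the Coordinates/data-layout lookup):

    #include <algorithm>
    #include <cstdio>

    // Hypothetical scalar restatement of the 2D average-pooling scale:
    // the result is 1 / (area of the pooling window that is actually counted).
    static float avg_scale_pool2d(bool exclude_padding, int out_x, int out_y,
                                  int pool_size_x, int pool_size_y,
                                  int upper_bound_w, int upper_bound_h,
                                  int pad_x, int pad_y, int stride_x, int stride_y)
    {
        int start_x = out_x * stride_x - pad_x;
        int start_y = out_y * stride_y - pad_y;
        const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
        const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
        if (exclude_padding)
        {
            // Do not count padded positions: clamp the window start to the tensor.
            start_x = std::max(0, start_x);
            start_y = std::max(0, start_y);
        }
        return 1.f / ((end_y - start_y) * (end_x - start_x));
    }

    int main()
    {
        // 3x3 pooling, stride 1, padding 1, output position (0, 0) on a 5x5 plane:
        // only a 2x2 patch of real data falls inside the window.
        std::printf("%f\n", avg_scale_pool2d(true, 0, 0, 3, 3, 5, 5, 1, 1, 1, 1));  // 0.25  (average of 4 values)
        std::printf("%f\n", avg_scale_pool2d(false, 0, 0, 3, 3, 5, 5, 1, 1, 1, 1)); // ~0.111 (full 3x3 window counted)
        return 0;
    }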
vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))), - } - }; + const float32x4x4_t acc = {{ + vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))), + }}; return vquantize(acc, requant_qinfo); } template <> inline int8x16_t vrequantize_pooling(int8x8_t vec1, int8x8_t vec2, const UniformQuantizationInfo &requant_qinfo) { - const float32x4x4_t acc = - { - { - vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))), - vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))), - } - }; + const float32x4x4_t acc = {{ + vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))), + }}; return vquantize_signed(acc, requant_qinfo); } @@ -172,26 +196,20 @@ inline T vrequantize_pooling(T &vec, const UniformQuantizationInfo &requant_qinf template <> inline uint8x8_t vrequantize_pooling(uint8x8_t &vec, const UniformQuantizationInfo &requant_qinfo) { - const float32x4x2_t acc = - { - { - vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))), - } - }; + const float32x4x2_t acc = {{ + vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))), + }}; return vquantize(acc, requant_qinfo); } template <> inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo &requant_qinfo) { - const float32x4x2_t acc = - { - { - vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))), - } - }; + const float32x4x2_t acc = {{ + vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))), + }}; return vquantize_signed(acc, requant_qinfo); } @@ -199,4 +217,3 @@ inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo } // namespace cpu } // namespace arm_compute #endif /* SRC_CORE_HELPERS_POOLINGHELPERS_H */ - diff --git a/src/core/helpers/ScaleHelpers.h b/src/core/helpers/ScaleHelpers.h index e769bba782606582f0916161e00c6ca711572666..47605e73857369b9f19c1f5019186ebf67bf2f7b 100644 --- a/src/core/helpers/ScaleHelpers.h +++ b/src/core/helpers/ScaleHelpers.h @@ -50,8 +50,12 @@ namespace scale_helpers * * @return The bilinear interpolated pixel value */ -inline uint8_t delta_bilinear_c1_quantized(const uint8_t *pixel_ptr, size_t stride, float dx, float dy, - UniformQuantizationInfo iq_info, UniformQuantizationInfo oq_info) +inline uint8_t delta_bilinear_c1_quantized(const uint8_t *pixel_ptr, + size_t stride, + float dx, + float dy, + UniformQuantizationInfo iq_info, + UniformQuantizationInfo oq_info) { ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr); @@ -85,8 +89,12 @@ inline uint8_t delta_bilinear_c1_quantized(const uint8_t *pixel_ptr, size_t stri * * @return The bilinear interpolated pixel value */ -inline int8_t delta_bilinear_c1_quantized(const int8_t *pixel_ptr, size_t stride, float dx, float dy, - UniformQuantizationInfo iq_info, UniformQuantizationInfo oq_info) +inline int8_t delta_bilinear_c1_quantized(const int8_t *pixel_ptr, + size_t stride, + float dx, + float 
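vrequantize_pooling_with_scale() above folds the pooling factor into the output quantization scale (new_scale = quant_rescale / scale_pooling) before converting the widened float accumulators back to 8 bits. A single-lane scalar sketch, assuming the usual asymmetric rule q = round(v / scale) + offset; the helper name and the sample numbers are illustrative:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Scalar, single-lane counterpart of the vectorised requantization above
    // (illustrative; assumes q = round(v / scale) + offset with saturation to u8).
    static uint8_t requantize_pooled(float acc, float quant_rescale, float scale_pooling, int32_t new_offset)
    {
        const float new_scale = quant_rescale / scale_pooling; // same folding as the kernel
        const int32_t q = static_cast<int32_t>(std::lround(acc / new_scale)) + new_offset;
        return static_cast<uint8_t>(std::clamp<int32_t>(q, 0, 255));
    }

    int main()
    {
        // acc = 6.0 with rescale 0.5 and a 2x2 average (scale_pooling = 0.25):
        // new_scale = 2.0, so 6.0 quantizes to 3 and the offset 10 gives 13.
        std::printf("%u\n", static_cast<unsigned>(requantize_pooled(6.0f, 0.5f, 0.25f, 10))); // 13
        return 0;
    }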
dy, + UniformQuantizationInfo iq_info, + UniformQuantizationInfo oq_info) { ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr); @@ -122,9 +130,8 @@ inline int8_t delta_bilinear_c1_quantized(const int8_t *pixel_ptr, size_t stride * * @return The pixel at (x, y) using area interpolation. */ -inline uint8_t -pixel_area_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float wr, - float hr, int x, int y) +inline uint8_t pixel_area_c1u8_clamp( + const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float wr, float hr, int x, int y) { ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr); @@ -159,7 +166,7 @@ pixel_area_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t widt // Sum pixels in area int sum = 0; - for(int j = yi + y_from, je = yi + y_to; j <= je; ++j) + for (int j = yi + y_from, je = yi + y_to; j <= je; ++j) { const uint8_t *ptr = first_pixel_ptr + j * stride + xi + x_from; sum = std::accumulate(ptr, ptr + x_elements, sum); diff --git a/src/core/helpers/SoftmaxHelpers.cpp b/src/core/helpers/SoftmaxHelpers.cpp index 71b971af3179e6616b81570dd8a0e3fd0983816c..8184991ab5559b82c992070b0d37ec5406f63cca 100644 --- a/src/core/helpers/SoftmaxHelpers.cpp +++ b/src/core/helpers/SoftmaxHelpers.cpp @@ -29,7 +29,7 @@ namespace softmax_helpers { PermutationVector get_permutation_vector_from_softmax_axis(size_t axis) { - switch(axis) + switch (axis) { case 1: return PermutationVector(1U, 0U, 2U, 3U); diff --git a/tests/validation/reference/PostOps.h b/src/core/helpers/Utils.cpp similarity index 65% rename from tests/validation/reference/PostOps.h rename to src/core/helpers/Utils.cpp index 5fe0fe71f57b64744886907754277b59ee7e57da..f8895d8a3c221a1ee24aec4499c1a2ff18cbaf07 100644 --- a/tests/validation/reference/PostOps.h +++ b/src/core/helpers/Utils.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. +* Copyright (c) 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,27 +21,29 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_TEST_POSTOPS_H -#define ARM_COMPUTE_TEST_POSTOPS_H - -#include "arm_compute/core/experimental/IPostOp.h" -#include "tests/SimpleTensor.h" -#include "tests/validation/Helpers.h" +#include "src/core/helpers/Utils.h" namespace arm_compute { -namespace test -{ -namespace validation +bool has_holes(const ITensorInfo &info) { -namespace reference + return has_holes(info, info.num_dimensions() - 1); +} + +bool has_holes(const ITensorInfo &info, size_t dimension) { -/** (EXPERIMENTAL_POST_OPS) */ -template ::value, int>::type = 0> -SimpleTensor post_ops(const SimpleTensor &a, experimental::PostOpList> post_ops); + const auto &shape = info.tensor_shape(); + const auto &strides = info.strides_in_bytes(); + size_t squashed_bytes = info.element_size(); -} // namespace reference -} // namespace validation -} // namespace test + for (size_t dim = 0; dim <= dimension; ++dim) + { + if (strides[dim] != squashed_bytes) + { + return true; + } + squashed_bytes *= shape[dim]; + } + return false; +} } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_POSTOPS_H */ diff --git a/src/core/helpers/Utils.h b/src/core/helpers/Utils.h index 326dc962c710a128ee47613d1e25b9242f82e1b8..a17a78f7ee5b961fcddcb27ef78d4d3ea259b180 100644 --- a/src/core/helpers/Utils.h +++ b/src/core/helpers/Utils.h @@ -1,5 +1,5 @@ /* -* Copyright (c) 2020-2021 Arm Limited. +* Copyright (c) 2020-2021, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef SRC_CORE_HELPERS_UTILS_H -#define SRC_CORE_HELPERS_UTILS_H +#ifndef ACL_SRC_CORE_HELPERS_UTILS_H +#define ACL_SRC_CORE_HELPERS_UTILS_H #include "arm_compute/core/ITensorInfo.h" @@ -38,14 +38,14 @@ namespace arm_compute * calculated based on the tensor shape and the strides of lower dimensions. */ template -inline Strides compute_strides(const ITensorInfo &info, T stride_x, Ts &&... fixed_strides) +inline Strides compute_strides(const ITensorInfo &info, T stride_x, Ts &&...fixed_strides) { const TensorShape &shape = info.tensor_shape(); // Create strides object Strides strides(stride_x, fixed_strides...); - for(size_t i = 1 + sizeof...(Ts); i < info.num_dimensions(); ++i) + for (size_t i = 1 + sizeof...(Ts); i < info.num_dimensions(); ++i) { strides.set(i, shape[i - 1] * strides[i - 1]); } @@ -92,6 +92,29 @@ inline unsigned int get_next_power_two(unsigned int x) return x; } + +/** Check if the tensor has any holes. + * + * A hole is defined as any gap in the tensor between two consecutive values. This can be a result of extending + * the paddings or manipulating the strides of the tensor + * + * @param[in] info Tensor info object defining the shape of the input tensor. + * + * @note This function checks for holes in all dimensions. + * + */ +bool has_holes(const ITensorInfo &info); + +/** Check if the tensor has any holes. + * + * @param[in] info Tensor info object defining the shape of the input tensor. + * @param[in] dimension Highest dimension to check. + * + * @note This function checks for holes in all the dimensions upto and including the highest dimension. + * + */ +bool has_holes(const ITensorInfo &info, size_t dimension); + } // namespace arm_compute -#endif /* SRC_CORE_HELPERS_UTILS_H */ +#endif // ACL_SRC_CORE_HELPERS_UTILS_H diff --git a/src/core/helpers/WindowHelpers.cpp b/src/core/helpers/WindowHelpers.cpp index a4d46db352f05f21930c338692341388e301bd6e..30a55fcbc632b53345a8276120f949ac1d7ae273 100644 --- a/src/core/helpers/WindowHelpers.cpp +++ b/src/core/helpers/WindowHelpers.cpp @@ -25,9 +25,10 @@ namespace arm_compute { -Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size) +Window +calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size) { - if(!skip_border) + if (!skip_border) { border_size = BorderSize(0); } @@ -38,40 +39,47 @@ Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, Window window; window.set(0, Window::Dimension( - // Skip the border left of the image - anchor[0] + border_size.left, - // Skip the border right of the image - // Make sure the window width is a multiple of the step size - anchor[0] + border_size.left + ceil_to_multiple(std::max(0, static_cast(shape[0]) - static_cast(border_size.left) - static_cast(border_size.right)), steps[0]), - steps[0])); + // Skip the border left of the image + anchor[0] + border_size.left, + // Skip the border right of the image + // Make sure the window width is a multiple of the step size + anchor[0] + border_size.left + + ceil_to_multiple(std::max(0, static_cast(shape[0]) - static_cast(border_size.left) - + static_cast(border_size.right)), + steps[0]), + steps[0])); size_t n = 1; - if(anchor.num_dimensions() > 1) + if (anchor.num_dimensions() > 1) { - window.set(1, Window::Dimension( + window.set(1, + 
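The new has_holes() helper declared above reduces to one stride walk: up to the requested dimension, each stride must equal the byte count obtained by squashing all lower dimensions, otherwise padding or stride manipulation has opened a gap. A standalone sketch of the same check over plain vectors (hypothetical layout_has_holes(); ITensorInfo is replaced by explicit shape/stride arguments):

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Hypothetical standalone version of the hole check: strides are compared
    // against the running product of element size and lower-dimension extents.
    static bool layout_has_holes(const std::vector<std::size_t> &shape,
                                 const std::vector<std::size_t> &strides_in_bytes,
                                 std::size_t element_size,
                                 std::size_t dimension)
    {
        std::size_t squashed_bytes = element_size;
        for (std::size_t dim = 0; dim <= dimension; ++dim)
        {
            if (strides_in_bytes[dim] != squashed_bytes)
            {
                return true; // a gap: padding or manipulated strides
            }
            squashed_bytes *= shape[dim];
        }
        return false;
    }

    int main()
    {
        // 4x3 tensor of 4-byte elements: contiguous strides are {4, 16} bytes.
        std::printf("%d\n", layout_has_holes({4, 3}, {4, 16}, 4, 1)); // 0 -> no holes
        // Padding the row stride to 32 bytes leaves a gap after every row.
        std::printf("%d\n", layout_has_holes({4, 3}, {4, 32}, 4, 1)); // 1 -> holes
        return 0;
    }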
Window::Dimension( // Skip the border above the image anchor[1] + border_size.top, // Skip the border below the image - anchor[1] + border_size.top + ceil_to_multiple(std::max(0, static_cast(shape[1]) - static_cast(border_size.top) - static_cast(border_size.bottom)), steps[1]), + anchor[1] + border_size.top + + ceil_to_multiple(std::max(0, static_cast(shape[1]) - static_cast(border_size.top) - + static_cast(border_size.bottom)), + steps[1]), steps[1])); ++n; } - if(anchor.num_dimensions() > 2) + if (anchor.num_dimensions() > 2) { window.set(2, Window::Dimension(anchor[2], std::max(1, shape[2]), steps[2])); ++n; } - for(; n < anchor.num_dimensions(); ++n) + for (; n < anchor.num_dimensions(); ++n) { window.set(n, Window::Dimension(anchor[n], std::max(1, shape[n]))); } - for(; n < Coordinates::num_max_dimensions; ++n) + for (; n < Coordinates::num_max_dimensions; ++n) { window.set(n, Window::Dimension(0, 1)); } @@ -81,7 +89,7 @@ Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, Window calculate_max_window(const TensorShape &shape, const Steps &steps, bool skip_border, BorderSize border_size) { - if(!skip_border) + if (!skip_border) { border_size = BorderSize(0); } @@ -89,40 +97,46 @@ Window calculate_max_window(const TensorShape &shape, const Steps &steps, bool s Window window; window.set(0, Window::Dimension( - // Skip the border left of the image - border_size.left, - // Skip the border right of the image - // Make sure the window width is a multiple of the step size - border_size.left + ceil_to_multiple(std::max(0, static_cast(shape[0]) - static_cast(border_size.left) - static_cast(border_size.right)), steps[0]), - steps[0])); + // Skip the border left of the image + border_size.left, + // Skip the border right of the image + // Make sure the window width is a multiple of the step size + border_size.left + + ceil_to_multiple(std::max(0, static_cast(shape[0]) - static_cast(border_size.left) - + static_cast(border_size.right)), + steps[0]), + steps[0])); size_t n = 1; - if(shape.num_dimensions() > 1) + if (shape.num_dimensions() > 1) { window.set(1, Window::Dimension( - // Skip the border above the image - border_size.top, - // Skip the border below the image - border_size.top + ceil_to_multiple(std::max(0, static_cast(shape[1]) - static_cast(border_size.top) - static_cast(border_size.bottom)), steps[1]), - steps[1])); + // Skip the border above the image + border_size.top, + // Skip the border below the image + border_size.top + ceil_to_multiple(std::max(0, static_cast(shape[1]) - + static_cast(border_size.top) - + static_cast(border_size.bottom)), + steps[1]), + steps[1])); ++n; } - if(shape.num_dimensions() > 2) + if (shape.num_dimensions() > 2) { window.set(2, Window::Dimension(0, std::max(1, shape[2]), steps[2])); ++n; } - for(; n < shape.num_dimensions(); ++n) + for (; n < shape.num_dimensions(); ++n) { window.set(n, Window::Dimension(0, std::max(1, shape[n]))); } - for(; n < Coordinates::num_max_dimensions; ++n) + for (; n < Coordinates::num_max_dimensions; ++n) { window.set(n, Window::Dimension(0, 1)); } @@ -138,40 +152,42 @@ Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Step Window window; window.set(0, Window::Dimension( - // move the anchor to the start from the border - anchor[0] - border_size.left, - // move the anchor to include the right end border - // Make sure the window width is a multiple of the step size - anchor[0] - border_size.left + ceil_to_multiple(shape[0] + border_size.left + border_size.right, 
steps[0]), - steps[0])); + // move the anchor to the start from the border + anchor[0] - border_size.left, + // move the anchor to include the right end border + // Make sure the window width is a multiple of the step size + anchor[0] - border_size.left + + ceil_to_multiple(shape[0] + border_size.left + border_size.right, steps[0]), + steps[0])); size_t n = 1; - if(anchor.num_dimensions() > 1) + if (anchor.num_dimensions() > 1) { window.set(1, Window::Dimension( - // Include the border above the image - anchor[1] - border_size.top, - // Include the border below the image - anchor[1] - border_size.top + ceil_to_multiple(shape[1] + border_size.top + border_size.bottom, steps[1]), - steps[1])); + // Include the border above the image + anchor[1] - border_size.top, + // Include the border below the image + anchor[1] - border_size.top + + ceil_to_multiple(shape[1] + border_size.top + border_size.bottom, steps[1]), + steps[1])); ++n; } - if(anchor.num_dimensions() > 2) + if (anchor.num_dimensions() > 2) { window.set(2, Window::Dimension(0, std::max(1, shape[n]), steps[2])); ++n; } - for(; n < anchor.num_dimensions(); ++n) + for (; n < anchor.num_dimensions(); ++n) { window.set(n, Window::Dimension(anchor[n], std::max(1, shape[n]))); } - for(; n < Coordinates::num_max_dimensions; ++n) + for (; n < Coordinates::num_max_dimensions; ++n) { window.set(n, Window::Dimension(0, 1)); } @@ -179,9 +195,12 @@ Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Step return window; } -Window calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size) +Window calculate_max_window_horizontal(const ValidRegion &valid_region, + const Steps &steps, + bool skip_border, + BorderSize border_size) { - if(skip_border) + if (skip_border) { border_size.top = 0; border_size.bottom = 0; @@ -198,33 +217,35 @@ Window calculate_max_window_horizontal(const ValidRegion &valid_region, const St Window window; window.set(0, Window::Dimension( - // Skip the border left of the image - anchor[0] + border_size.left, - // Skip the border right of the image - // Make sure the window width is a multiple of the step size - anchor[0] + border_size.left + ceil_to_multiple(std::max(0, static_cast(shape[0]) - static_cast(border_size.left) - static_cast(border_size.right)), steps[0]), - steps[0])); + // Skip the border left of the image + anchor[0] + border_size.left, + // Skip the border right of the image + // Make sure the window width is a multiple of the step size + anchor[0] + border_size.left + + ceil_to_multiple(std::max(0, static_cast(shape[0]) - static_cast(border_size.left) - + static_cast(border_size.right)), + steps[0]), + steps[0])); size_t n = 1; - if(anchor.num_dimensions() > 1) + if (anchor.num_dimensions() > 1) { window.set(1, Window::Dimension( - // Skip the border above the image - anchor[1] - border_size.top, - // Skip the border below the image - anchor[1] + shape[1] + border_size.bottom, - 1)); + // Skip the border above the image + anchor[1] - border_size.top, + // Skip the border below the image + anchor[1] + shape[1] + border_size.bottom, 1)); ++n; } - for(; n < anchor.num_dimensions(); ++n) + for (; n < anchor.num_dimensions(); ++n) { window.set(n, Window::Dimension(anchor[n], std::max(1, shape[n]))); } - for(; n < Coordinates::num_max_dimensions; ++n) + for (; n < Coordinates::num_max_dimensions; ++n) { window.set(n, Window::Dimension(0, 1)); } @@ -247,9 +268,9 @@ std::pair calculate_squashed_or_max_window(const ITensorInfo 
&sr size_t squashed_bytes = src0.element_size(); // Try to squash the low dimensions together. - for(; dim < num_dimensions; ++dim) + for (; dim < num_dimensions; ++dim) { - if(shape0[dim] != shape1[dim] || strides0[dim] != squashed_bytes || strides1[dim] != squashed_bytes) + if (shape0[dim] != shape1[dim] || strides0[dim] != squashed_bytes || strides1[dim] != squashed_bytes) { break; } @@ -257,7 +278,7 @@ std::pair calculate_squashed_or_max_window(const ITensorInfo &sr squashed_bytes *= shape0[dim]; } - if(dim == num_dimensions) + if (dim == num_dimensions) { auto squashed_elements = squashed_bytes / src0.element_size(); @@ -266,7 +287,7 @@ std::pair calculate_squashed_or_max_window(const ITensorInfo &sr // The input tensors can be interpreted as 1D array. win.set(0, Window::Dimension(0, squashed_elements, 1)); - for(dim = 1; dim < Coordinates::num_max_dimensions; ++dim) + for (dim = 1; dim < Coordinates::num_max_dimensions; ++dim) { win.set(dim, Window::Dimension(0, 1, 1)); } @@ -274,7 +295,7 @@ std::pair calculate_squashed_or_max_window(const ITensorInfo &sr else { // Generates the max window. - for(dim = 0; dim < Coordinates::num_max_dimensions; ++dim) + for (dim = 0; dim < Coordinates::num_max_dimensions; ++dim) { win.set(dim, Window::Dimension(0, std::max(shape0[dim], shape1[dim]), 1)); } @@ -295,21 +316,21 @@ std::pair calculate_squashed_or_max_window(const ITensorInfo &sr size_t squashed_bytes = src.element_size(); // Try to squash the low dimensions together. - for(; dim < num_dimensions; ++dim) + for (; dim < num_dimensions; ++dim) { - if(strides[dim] != squashed_bytes) + if (strides[dim] != squashed_bytes) { break; } squashed_bytes *= shape[dim]; } - if(dim == num_dimensions) + if (dim == num_dimensions) { const auto squashed_elements = squashed_bytes / src.element_size(); split_dimension = Window::DimX; // The input tensor can be interpreted as 1D array. win.set(0, Window::Dimension(0, squashed_elements, 1)); - for(dim = 1; dim < Coordinates::num_max_dimensions; ++dim) + for (dim = 1; dim < Coordinates::num_max_dimensions; ++dim) { win.set(dim, Window::Dimension(0, 1, 1)); } @@ -317,7 +338,7 @@ std::pair calculate_squashed_or_max_window(const ITensorInfo &sr else { // Generate the max window. - for(dim = 0; dim < Coordinates::num_max_dimensions; ++dim) + for (dim = 0; dim < Coordinates::num_max_dimensions; ++dim) { win.set(dim, Window::Dimension(0, shape[dim], 1)); } diff --git a/src/core/helpers/WindowHelpers.h b/src/core/helpers/WindowHelpers.h index eccf7f2d18b81029f2c5fc942b71bd4c85cd9906..e404c18e8a8a8b613d04c8a44bbd10b900615e7f 100644 --- a/src/core/helpers/WindowHelpers.h +++ b/src/core/helpers/WindowHelpers.h @@ -43,21 +43,13 @@ namespace arm_compute * influence the returned value. */ template -bool update_window_and_padding(Window &win, Ts &&... patterns) +bool update_window_and_padding(Window &win, Ts &&...patterns) { bool window_changed = false; - utility::for_each([&](const IAccessWindow & w) - { - window_changed |= w.update_window_if_needed(win); - }, - patterns...); + utility::for_each([&](const IAccessWindow &w) { window_changed |= w.update_window_if_needed(win); }, patterns...); - utility::for_each([&](IAccessWindow & w) - { - w.update_padding_if_needed(win); - }, - patterns...); + utility::for_each([&](IAccessWindow &w) { w.update_padding_if_needed(win); }, patterns...); return window_changed; } @@ -69,18 +61,18 @@ bool update_window_and_padding(Window &win, Ts &&... patterns) * @return Intersection of all regions. 
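calculate_squashed_or_max_window() above applies the same contiguity test to a pair of tensors: while the shapes match and every stride equals the running squashed byte count, the two tensors can be walked as one flat 1D range; otherwise a per-dimension maximum window is generated. A simplified standalone restatement (hypothetical Extent type, rank limited to the shapes' own size, no split-dimension bookkeeping):

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    struct Extent { std::size_t begin, end, step; };

    // Squash-or-max decision, simplified: collapse to 1D only if both tensors
    // are contiguous and shape-compatible, otherwise use a per-dimension window.
    static std::vector<Extent> squashed_or_max_window(const std::vector<std::size_t> &shape0,
                                                      const std::vector<std::size_t> &strides0,
                                                      const std::vector<std::size_t> &shape1,
                                                      const std::vector<std::size_t> &strides1,
                                                      std::size_t element_size)
    {
        const std::size_t num_dims = shape0.size();
        std::size_t squashed_bytes = element_size;
        std::size_t dim = 0;
        for (; dim < num_dims; ++dim)
        {
            if (shape0[dim] != shape1[dim] || strides0[dim] != squashed_bytes || strides1[dim] != squashed_bytes)
            {
                break; // shapes diverge or a layout is not contiguous here
            }
            squashed_bytes *= shape0[dim];
        }

        std::vector<Extent> win(num_dims, Extent{0, 1, 1});
        if (dim == num_dims)
        {
            // Fully squashable: one flat 1D iteration over all elements.
            win[0] = Extent{0, squashed_bytes / element_size, 1};
        }
        else
        {
            // Fall back to iterating every dimension up to the larger extent.
            for (dim = 0; dim < num_dims; ++dim)
            {
                win[dim] = Extent{0, std::max(shape0[dim], shape1[dim]), 1};
            }
        }
        return win;
    }

    int main()
    {
        // Two contiguous 4x3 tensors of 4-byte elements collapse to 12 elements.
        auto w = squashed_or_max_window({4, 3}, {4, 16}, {4, 3}, {4, 16}, 4);
        std::printf("%zu\n", w[0].end); // 12
        // A padded row stride (32 bytes) breaks the squash; the max window is used.
        w = squashed_or_max_window({4, 3}, {4, 32}, {4, 3}, {4, 16}, 4);
        std::printf("%zu %zu\n", w[0].end, w[1].end); // 4 3
        return 0;
    }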
*/ template -ValidRegion intersect_valid_regions(const Ts &... regions) +ValidRegion intersect_valid_regions(const Ts &...regions) { - auto intersect = [](const ValidRegion & r1, const ValidRegion & r2) -> ValidRegion + auto intersect = [](const ValidRegion &r1, const ValidRegion &r2) -> ValidRegion { ValidRegion region; - for(size_t d = 0; d < std::min(r1.anchor.num_dimensions(), r2.anchor.num_dimensions()); ++d) + for (size_t d = 0; d < std::min(r1.anchor.num_dimensions(), r2.anchor.num_dimensions()); ++d) { region.anchor.set(d, std::max(r1.anchor[d], r2.anchor[d])); } - for(size_t d = 0; d < std::min(r1.shape.num_dimensions(), r2.shape.num_dimensions()); ++d) + for (size_t d = 0; d < std::min(r1.shape.num_dimensions(), r2.shape.num_dimensions()); ++d) { region.shape.set(d, std::min(r1.shape[d], r2.shape[d])); } @@ -101,7 +93,10 @@ ValidRegion intersect_valid_regions(const Ts &... regions) * * @return The maximum window the kernel can be executed on. */ -Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize()); +Window calculate_max_window(const ValidRegion &valid_region, + const Steps &steps = Steps(), + bool skip_border = false, + BorderSize border_size = BorderSize()); /** Calculate the maximum window for a given tensor shape and border setting * @@ -112,7 +107,10 @@ Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps * * @return The maximum window the kernel can be executed on. */ -Window calculate_max_window(const TensorShape &shape, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize()); +Window calculate_max_window(const TensorShape &shape, + const Steps &steps = Steps(), + bool skip_border = false, + BorderSize border_size = BorderSize()); /** Calculate the maximum window for a given tensor shape and border setting * @@ -123,7 +121,10 @@ Window calculate_max_window(const TensorShape &shape, const Steps &steps = Steps * * @return The maximum window the kernel can be executed on. */ -inline Window calculate_max_window(const ITensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize()) +inline Window calculate_max_window(const ITensorInfo &info, + const Steps &steps = Steps(), + bool skip_border = false, + BorderSize border_size = BorderSize()) { return calculate_max_window(info.tensor_shape(), steps, skip_border, border_size); } @@ -137,7 +138,10 @@ inline Window calculate_max_window(const ITensorInfo &info, const Steps &steps = * * @return The maximum window the kernel can be executed on. */ -Window calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize()); +Window calculate_max_window_horizontal(const ValidRegion &valid_region, + const Steps &steps = Steps(), + bool skip_border = false, + BorderSize border_size = BorderSize()); /** Calculate the maximum window used by a horizontal kernel for a given tensor shape and border setting * @@ -148,7 +152,10 @@ Window calculate_max_window_horizontal(const ValidRegion &valid_region, const St * * @return The maximum window the kernel can be executed on. 
*/ -inline Window calculate_max_window_horizontal(const ITensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize()) +inline Window calculate_max_window_horizontal(const ITensorInfo &info, + const Steps &steps = Steps(), + bool skip_border = false, + BorderSize border_size = BorderSize()) { return calculate_max_window_horizontal(info.valid_region(), steps, skip_border, border_size); } @@ -161,7 +168,9 @@ inline Window calculate_max_window_horizontal(const ITensorInfo &info, const Ste * * @return The maximum window the kernel can be executed on. */ -Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Steps &steps = Steps(), BorderSize border_size = BorderSize()); +Window calculate_max_enlarged_window(const ValidRegion &valid_region, + const Steps &steps = Steps(), + BorderSize border_size = BorderSize()); /** Calculate the maximum window for a given tensor shape and border setting. The window will also includes the border. * @@ -171,7 +180,9 @@ Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Step * * @return The maximum window the kernel can be executed on. */ -inline Window calculate_max_enlarged_window(const ITensorInfo &info, const Steps &steps = Steps(), BorderSize border_size = BorderSize()) +inline Window calculate_max_enlarged_window(const ITensorInfo &info, + const Steps &steps = Steps(), + BorderSize border_size = BorderSize()) { return calculate_max_enlarged_window(info.valid_region(), steps, border_size); } @@ -208,7 +219,7 @@ std::pair calculate_squashed_or_max_window(const ITensorInfo &sr * @return A pair of the shape and window */ template -std::pair compute_output_shape_and_window(const Shapes &... shapes) +std::pair compute_output_shape_and_window(const Shapes &...shapes) { const TensorShape out_shape = TensorShape::broadcast_shape(shapes...); return std::make_pair(out_shape, calculate_max_window(out_shape)); diff --git a/src/core/utils/ActivationFunctionUtils.cpp b/src/core/utils/ActivationFunctionUtils.cpp index 4854b8eb0bead6c5857c052f969b862ba795e3bb..017170a0c5f96d9ac4e76aa7ef024c5dfe54a4c4 100644 --- a/src/core/utils/ActivationFunctionUtils.cpp +++ b/src/core/utils/ActivationFunctionUtils.cpp @@ -28,26 +28,24 @@ namespace arm_compute { -const std::string &string_from_activation_func(const ActivationFunction& act) +const std::string &string_from_activation_func(const ActivationFunction &act) { - static std::map act_map = - { - { ActivationFunction::ABS, "ABS" }, - { ActivationFunction::LINEAR, "LINEAR" }, - { ActivationFunction::LOGISTIC, "LOGISTIC" }, - { ActivationFunction::RELU, "RELU" }, - { ActivationFunction::BOUNDED_RELU, "BRELU" }, - { ActivationFunction::LU_BOUNDED_RELU, "LU_BRELU" }, - { ActivationFunction::LEAKY_RELU, "LRELU" }, - { ActivationFunction::SOFT_RELU, "SRELU" }, - { ActivationFunction::ELU, "ELU" }, - { ActivationFunction::SQRT, "SQRT" }, - { ActivationFunction::SQUARE, "SQUARE" }, - { ActivationFunction::TANH, "TANH" }, - { ActivationFunction::IDENTITY, "IDENTITY" }, - { ActivationFunction::HARD_SWISH, "HARD_SWISH" }, - { ActivationFunction::SWISH, "SWISH" }, - { ActivationFunction::GELU, "GELU" } + static std::map act_map = {{ActivationFunction::ABS, "ABS"}, + {ActivationFunction::LINEAR, "LINEAR"}, + {ActivationFunction::LOGISTIC, "LOGISTIC"}, + {ActivationFunction::RELU, "RELU"}, + {ActivationFunction::BOUNDED_RELU, "BRELU"}, + {ActivationFunction::LU_BOUNDED_RELU, "LU_BRELU"}, + {ActivationFunction::LEAKY_RELU, "LRELU"}, + 
{ActivationFunction::SOFT_RELU, "SRELU"}, + {ActivationFunction::ELU, "ELU"}, + {ActivationFunction::SQRT, "SQRT"}, + {ActivationFunction::SQUARE, "SQUARE"}, + {ActivationFunction::TANH, "TANH"}, + {ActivationFunction::IDENTITY, "IDENTITY"}, + {ActivationFunction::HARD_SWISH, "HARD_SWISH"}, + {ActivationFunction::SWISH, "SWISH"}, + {ActivationFunction::GELU, "GELU"} }; diff --git a/src/core/utils/AssemblyUtils.cpp b/src/core/utils/AssemblyUtils.cpp index 6d483adc7fff3fc9e0f4d9085c89a9b33e0e1aec..d97ea42091dc566633681f9a05304151367fdc5e 100644 --- a/src/core/utils/AssemblyUtils.cpp +++ b/src/core/utils/AssemblyUtils.cpp @@ -34,12 +34,12 @@ arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act) arm_gemm::Activation gemm_act; // Early exit in case lower bound is other than 0, as it's not yet supported - if(act.b() != 0.f) + if (act.b() != 0.f) { return gemm_act; } - switch(act.activation()) + switch (act.activation()) { case ActivationLayerInfo::ActivationFunction::RELU: gemm_act.type = arm_gemm::Activation::Type::ReLU; @@ -63,17 +63,15 @@ arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act) arm_conv::PaddingValues map_to_arm_conv_padding(const PadStrideInfo &pad_stride_info) { - return arm_conv::PaddingValues{ pad_stride_info.pad_left(), - pad_stride_info.pad_top(), - pad_stride_info.pad_right(), - pad_stride_info.pad_bottom() }; + return arm_conv::PaddingValues{pad_stride_info.pad_left(), pad_stride_info.pad_top(), pad_stride_info.pad_right(), + pad_stride_info.pad_bottom()}; } arm_gemm::WeightFormat map_to_arm_gemm_weight_format(const arm_compute::WeightFormat &weight_format) { arm_gemm::WeightFormat gemm_weight_fromat; - switch(weight_format) + switch (weight_format) { case arm_compute::WeightFormat::UNSPECIFIED: gemm_weight_fromat = arm_gemm::WeightFormat::UNSPECIFIED; @@ -193,7 +191,7 @@ arm_compute::WeightFormat map_to_arm_compute_weight_format(const arm_gemm::Weigh { arm_compute::WeightFormat acl_weight_fromat; - switch(weight_format) + switch (weight_format) { case arm_gemm::WeightFormat::UNSPECIFIED: acl_weight_fromat = arm_compute::WeightFormat::UNSPECIFIED; diff --git a/src/core/utils/AssemblyUtils.h b/src/core/utils/AssemblyUtils.h index 60bad3b61836c24e799a6204833bd9065a06c564..7d0d37c4ef55b20e2a7b6d38335369312a691b47 100644 --- a/src/core/utils/AssemblyUtils.h +++ b/src/core/utils/AssemblyUtils.h @@ -25,6 +25,7 @@ #define UTILS_CORE_ASSEMBLY_UTILS_H #include "arm_compute/core/Types.h" + #include "src/core/NEON/kernels/assembly/common.hpp" #include "src/cpu/kernels/assembly/arm_gemm.hpp" @@ -65,6 +66,6 @@ arm_gemm::WeightFormat map_to_arm_gemm_weight_format(const arm_compute::WeightFo * @return Compute Library WeightFormat */ arm_compute::WeightFormat map_to_arm_compute_weight_format(const arm_gemm::WeightFormat &weight_format); -} // namespace assembly +} // namespace assembly_utils } // namespace arm_compute #endif /* UTILS_CORE_ASSEMBLY_UTILS_H */ diff --git a/src/core/utils/DataLayoutUtils.cpp b/src/core/utils/DataLayoutUtils.cpp index 4919b79a42320d6b8c60a9807b573d0c3487dc92..234bed71cb22c5eefcb77ad64c79eb7ff014f245 100644 --- a/src/core/utils/DataLayoutUtils.cpp +++ b/src/core/utils/DataLayoutUtils.cpp @@ -29,11 +29,10 @@ namespace arm_compute const std::string &string_from_data_layout(DataLayout dl) { - static std::map dl_map = - { - { DataLayout::UNKNOWN, "UNKNOWN" }, - { DataLayout::NCHW, "NCHW" }, - { DataLayout::NHWC, "NHWC" }, + static std::map dl_map = { + {DataLayout::UNKNOWN, "UNKNOWN"}, + {DataLayout::NCHW, 
"NCHW"}, + {DataLayout::NHWC, "NHWC"}, }; return dl_map[dl]; diff --git a/src/core/utils/DataTypeUtils.cpp b/src/core/utils/DataTypeUtils.cpp index 07999354d9716a02abfb04d03c33b7553ab3ed38..13943399875cb32cb110a6021ea2b02e1d51edc1 100644 --- a/src/core/utils/DataTypeUtils.cpp +++ b/src/core/utils/DataTypeUtils.cpp @@ -30,27 +30,26 @@ namespace arm_compute { const std::string &string_from_data_type(DataType dt) { - static std::map dt_map = - { - { DataType::UNKNOWN, "UNKNOWN" }, - { DataType::S8, "S8" }, - { DataType::U8, "U8" }, - { DataType::S16, "S16" }, - { DataType::U16, "U16" }, - { DataType::S32, "S32" }, - { DataType::U32, "U32" }, - { DataType::S64, "S64" }, - { DataType::U64, "U64" }, - { DataType::F16, "F16" }, - { DataType::F32, "F32" }, - { DataType::F64, "F64" }, - { DataType::SIZET, "SIZET" }, - { DataType::QSYMM8, "QSYMM8" }, - { DataType::QSYMM8_PER_CHANNEL, "QSYMM8_PER_CHANNEL" }, - { DataType::QASYMM8, "QASYMM8" }, - { DataType::QASYMM8_SIGNED, "QASYMM8_SIGNED" }, - { DataType::QSYMM16, "QSYMM16" }, - { DataType::QASYMM16, "QASYMM16" }, + static std::map dt_map = { + {DataType::UNKNOWN, "UNKNOWN"}, + {DataType::S8, "S8"}, + {DataType::U8, "U8"}, + {DataType::S16, "S16"}, + {DataType::U16, "U16"}, + {DataType::S32, "S32"}, + {DataType::U32, "U32"}, + {DataType::S64, "S64"}, + {DataType::U64, "U64"}, + {DataType::F16, "F16"}, + {DataType::F32, "F32"}, + {DataType::F64, "F64"}, + {DataType::SIZET, "SIZET"}, + {DataType::QSYMM8, "QSYMM8"}, + {DataType::QSYMM8_PER_CHANNEL, "QSYMM8_PER_CHANNEL"}, + {DataType::QASYMM8, "QASYMM8"}, + {DataType::QASYMM8_SIGNED, "QASYMM8_SIGNED"}, + {DataType::QSYMM16, "QSYMM16"}, + {DataType::QASYMM16, "QASYMM16"}, }; return dt_map[dt]; @@ -58,12 +57,11 @@ const std::string &string_from_data_type(DataType dt) DataType data_type_from_name(const std::string &name) { - static const std::map data_types = - { - { "f16", DataType::F16 }, - { "f32", DataType::F32 }, - { "qasymm8", DataType::QASYMM8 }, - { "qasymm8_signed", DataType::QASYMM8_SIGNED }, + static const std::map data_types = { + {"f16", DataType::F16}, + {"f32", DataType::F32}, + {"qasymm8", DataType::QASYMM8}, + {"qasymm8_signed", DataType::QASYMM8_SIGNED}, }; #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED @@ -74,7 +72,7 @@ DataType data_type_from_name(const std::string &name) #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED } - catch(const std::out_of_range &) + catch (const std::out_of_range &) { ARM_COMPUTE_ERROR_VAR("Invalid data type name: %s", name.c_str()); } diff --git a/src/core/utils/FormatUtils.cpp b/src/core/utils/FormatUtils.cpp index 05b649ded2029c530b99a03bae96d539a211440a..46f84553156941eb167b532962e355b87cb973c2 100644 --- a/src/core/utils/FormatUtils.cpp +++ b/src/core/utils/FormatUtils.cpp @@ -30,26 +30,16 @@ namespace arm_compute { const std::string &string_from_format(Format format) { - static std::map formats_map = - { - { Format::UNKNOWN, "UNKNOWN" }, - { Format::U8, "U8" }, - { Format::S16, "S16" }, - { Format::U16, "U16" }, - { Format::S32, "S32" }, - { Format::U32, "U32" }, - { Format::F16, "F16" }, - { Format::F32, "F32" }, - { Format::UV88, "UV88" }, - { Format::RGB888, "RGB888" }, - { Format::RGBA8888, "RGBA8888" }, - { Format::YUV444, "YUV444" }, - { Format::YUYV422, "YUYV422" }, - { Format::NV12, "NV12" }, - { Format::NV21, "NV21" }, - { Format::IYUV, "IYUV" }, - { Format::UYVY422, "UYVY422" } - }; + static std::map formats_map = { + {Format::UNKNOWN, "UNKNOWN"}, {Format::U8, "U8"}, + {Format::S16, "S16"}, {Format::U16, "U16"}, + {Format::S32, "S32"}, {Format::U32, "U32"}, + 
{Format::F16, "F16"}, {Format::F32, "F32"}, + {Format::UV88, "UV88"}, {Format::RGB888, "RGB888"}, + {Format::RGBA8888, "RGBA8888"}, {Format::YUV444, "YUV444"}, + {Format::YUYV422, "YUYV422"}, {Format::NV12, "NV12"}, + {Format::NV21, "NV21"}, {Format::IYUV, "IYUV"}, + {Format::UYVY422, "UYVY422"}}; return formats_map[format]; } diff --git a/src/core/utils/InterpolationPolicyUtils.cpp b/src/core/utils/InterpolationPolicyUtils.cpp index 2d6cabe85efe2dee73b5b342aa5d8743a943cd5f..276e7605442e1f3c3f8b03b794d629f8e3326d03 100644 --- a/src/core/utils/InterpolationPolicyUtils.cpp +++ b/src/core/utils/InterpolationPolicyUtils.cpp @@ -29,11 +29,10 @@ namespace arm_compute const std::string &string_from_interpolation_policy(InterpolationPolicy policy) { - static std::map interpolation_policy_map = - { - { InterpolationPolicy::AREA, "AREA" }, - { InterpolationPolicy::BILINEAR, "BILINEAR" }, - { InterpolationPolicy::NEAREST_NEIGHBOR, "NEAREST_NEIGHBOUR" }, + static std::map interpolation_policy_map = { + {InterpolationPolicy::AREA, "AREA"}, + {InterpolationPolicy::BILINEAR, "BILINEAR"}, + {InterpolationPolicy::NEAREST_NEIGHBOR, "NEAREST_NEIGHBOUR"}, }; return interpolation_policy_map[policy]; diff --git a/src/core/utils/Math.cpp b/src/core/utils/Math.cpp new file mode 100644 index 0000000000000000000000000000000000000000..270e65c16187375f51dc28a1c89bfe4e57604768 --- /dev/null +++ b/src/core/utils/Math.cpp @@ -0,0 +1,546 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/core/utils/Math.h" + +namespace arm_compute +{ + +const std::array, 513> erf_f32_lut = {{ + {0.0000000000f, 1.1283791671f}, // 0.0000000000 + {0.0088152829f, 1.1283102984f}, // 0.0078125000 + {0.0176294898f, 1.1281037175f}, // 0.0156250000 + {0.0264415450f, 1.1277595001f}, // 0.0234375000 + {0.0352503739f, 1.1272777722f}, // 0.0312500000 + {0.0440549026f, 1.1266587101f}, // 0.0390625000 + {0.0528540592f, 1.1259025402f}, // 0.0468750000 + {0.0616467734f, 1.1250095393f}, // 0.0546875000 + {0.0704319777f, 1.1239800336f}, // 0.0625000000 + {0.0792086070f, 1.1228143994f}, // 0.0703125000 + {0.0879755993f, 1.1215130622f}, // 0.0781250000 + {0.0967318964f, 1.1200764968f}, // 0.0859375000 + {0.1054764438f, 1.1185052270f}, // 0.0937500000 + {0.1142081913f, 1.1167998249f}, // 0.1015625000 + {0.1229260934f, 1.1149609111f}, // 0.1093750000 + {0.1316291096f, 1.1129891541f}, // 0.1171875000 + {0.1403162048f, 1.1108852696f}, // 0.1250000000 + {0.1489863498f, 1.1086500206f}, // 0.1328125000 + {0.1576385214f, 1.1062842165f}, // 0.1406250000 + {0.1662717029f, 1.1037887128f}, // 0.1484375000 + {0.1748848846f, 1.1011644107f}, // 0.1562500000 + {0.1834770639f, 1.0984122563f}, // 0.1640625000 + {0.1920472457f, 1.0955332401f}, // 0.1718750000 + {0.2005944431f, 1.0925283966f}, // 0.1796875000 + {0.2091176771f, 1.0893988035f}, // 0.1875000000 + {0.2176159774f, 1.0861455810f}, // 0.1953125000 + {0.2260883828f, 1.0827698913f}, // 0.2031250000 + {0.2345339412f, 1.0792729378f}, // 0.2109375000 + {0.2429517099f, 1.0756559646f}, // 0.2187500000 + {0.2513407564f, 1.0719202554f}, // 0.2265625000 + {0.2597001582f, 1.0680671328f}, // 0.2343750000 + {0.2680290031f, 1.0640979580f}, // 0.2421875000 + {0.2763263902f, 1.0600141294f}, // 0.2500000000 + {0.2845914291f, 1.0558170820f}, // 0.2578125000 + {0.2928232411f, 1.0515082867f}, // 0.2656250000 + {0.3010209590f, 1.0470892494f}, // 0.2734375000 + {0.3091837275f, 1.0425615098f}, // 0.2812500000 + {0.3173107036f, 1.0379266409f}, // 0.2890625000 + {0.3254010565f, 1.0331862480f}, // 0.2968750000 + {0.3334539681f, 1.0283419676f}, // 0.3046875000 + {0.3414686335f, 1.0233954666f}, // 0.3125000000 + {0.3494442605f, 1.0183484415f}, // 0.3203125000 + {0.3573800706f, 1.0132026170f}, // 0.3281250000 + {0.3652752987f, 1.0079597454f}, // 0.3359375000 + {0.3731291935f, 1.0026216055f}, // 0.3437500000 + {0.3809410179f, 0.9971900017f}, // 0.3515625000 + {0.3887100487f, 0.9916667625f}, // 0.3593750000 + {0.3964355772f, 0.9860537403f}, // 0.3671875000 + {0.4041169094f, 0.9803528095f}, // 0.3750000000 + {0.4117533659f, 0.9745658663f}, // 0.3828125000 + {0.4193442821f, 0.9686948267f}, // 0.3906250000 + {0.4268890086f, 0.9627416265f}, // 0.3984375000 + {0.4343869111f, 0.9567082195f}, // 0.4062500000 + {0.4418373708f, 0.9505965764f}, // 0.4140625000 + {0.4492397841f, 0.9444086845f}, // 0.4218750000 + {0.4565935631f, 0.9381465458f}, // 0.4296875000 + {0.4638981357f, 0.9318121761f}, // 0.4375000000 + {0.4711529456f, 0.9254076045f}, // 0.4453125000 + {0.4783574521f, 0.9189348715f}, // 0.4531250000 + {0.4855111308f, 0.9123960286f}, // 0.4609375000 + {0.4926134732f, 0.9057931368f}, // 0.4687500000 + {0.4996639871f, 0.8991282656f}, // 0.4765625000 + {0.5066621964f, 0.8924034924f}, // 0.4843750000 + {0.5136076411f, 0.8856209005f}, // 0.4921875000 + {0.5204998778f, 0.8787825789f}, // 0.5000000000 + {0.5273384792f, 0.8718906210f}, // 0.5078125000 + {0.5341230345f, 0.8649471234f}, // 0.5156250000 + {0.5408531493f, 0.8579541846f}, // 0.5234375000 + {0.5475284454f, 0.8509139049f}, // 
0.5312500000 + {0.5541485612f, 0.8438283842f}, // 0.5390625000 + {0.5607131516f, 0.8366997220f}, // 0.5468750000 + {0.5672218875f, 0.8295300154f}, // 0.5546875000 + {0.5736744566f, 0.8223213592f}, // 0.5625000000 + {0.5800705628f, 0.8150758439f}, // 0.5703125000 + {0.5864099261f, 0.8077955554f}, // 0.5781250000 + {0.5926922832f, 0.8004825735f}, // 0.5859375000 + {0.5989173866f, 0.7931389715f}, // 0.5937500000 + {0.6050850052f, 0.7857668149f}, // 0.6015625000 + {0.6111949241f, 0.7783681603f}, // 0.6093750000 + {0.6172469441f, 0.7709450550f}, // 0.6171875000 + {0.6232408822f, 0.7634995358f}, // 0.6250000000 + {0.6291765712f, 0.7560336278f}, // 0.6328125000 + {0.6350538598f, 0.7485493443f}, // 0.6406250000 + {0.6408726121f, 0.7410486852f}, // 0.6484375000 + {0.6466327080f, 0.7335336365f}, // 0.6562500000 + {0.6523340428f, 0.7260061695f}, // 0.6640625000 + {0.6579765272f, 0.7184682397f}, // 0.6718750000 + {0.6635600869f, 0.7109217866f}, // 0.6796875000 + {0.6690846629f, 0.7033687322f}, // 0.6875000000 + {0.6745502111f, 0.6958109807f}, // 0.6953125000 + {0.6799567021f, 0.6882504177f}, // 0.7031250000 + {0.6853041214f, 0.6806889096f}, // 0.7109375000 + {0.6905924687f, 0.6731283025f}, // 0.7187500000 + {0.6958217582f, 0.6655704219f}, // 0.7265625000 + {0.7009920183f, 0.6580170718f}, // 0.7343750000 + {0.7061032914f, 0.6504700344f}, // 0.7421875000 + {0.7111556337f, 0.6429310692f}, // 0.7500000000 + {0.7161491149f, 0.6354019123f}, // 0.7578125000 + {0.7210838185f, 0.6278842762f}, // 0.7656250000 + {0.7259598411f, 0.6203798491f}, // 0.7734375000 + {0.7307772924f, 0.6128902940f}, // 0.7812500000 + {0.7355362950f, 0.6054172488f}, // 0.7890625000 + {0.7402369841f, 0.5979623254f}, // 0.7968750000 + {0.7448795076f, 0.5905271095f}, // 0.8046875000 + {0.7494640256f, 0.5831131598f}, // 0.8125000000 + {0.7539907101f, 0.5757220079f}, // 0.8203125000 + {0.7584597452f, 0.5683551577f}, // 0.8281250000 + {0.7628713266f, 0.5610140853f}, // 0.8359375000 + {0.7672256612f, 0.5537002383f}, // 0.8437500000 + {0.7715229674f, 0.5464150355f}, // 0.8515625000 + {0.7757634744f, 0.5391598669f}, // 0.8593750000 + {0.7799474221f, 0.5319360931f}, // 0.8671875000 + {0.7840750611f, 0.5247450453f}, // 0.8750000000 + {0.7881466520f, 0.5175880246f}, // 0.8828125000 + {0.7921624659f, 0.5104663022f}, // 0.8906250000 + {0.7961227832f, 0.5033811191f}, // 0.8984375000 + {0.8000278942f, 0.4963336858f}, // 0.9062500000 + {0.8038780984f, 0.4893251822f}, // 0.9140625000 + {0.8076737045f, 0.4823567575f}, // 0.9218750000 + {0.8114150300f, 0.4754295299f}, // 0.9296875000 + {0.8151024010f, 0.4685445869f}, // 0.9375000000 + {0.8187361521f, 0.4617029846f}, // 0.9453125000 + {0.8223166257f, 0.4549057483f}, // 0.9531250000 + {0.8258441725f, 0.4481538720f}, // 0.9609375000 + {0.8293191506f, 0.4414483184f}, // 0.9687500000 + {0.8327419255f, 0.4347900193f}, // 0.9765625000 + {0.8361128701f, 0.4281798750f}, // 0.9843750000 + {0.8394323638f, 0.4216187550f}, // 0.9921875000 + {0.8427007929f, 0.4151074974f}, // 1.0000000000 + {0.8459185504f, 0.4086469096f}, // 1.0078125000 + {0.8490860349f, 0.4022377678f}, // 1.0156250000 + {0.8522036514f, 0.3958808176f}, // 1.0234375000 + {0.8552718104f, 0.3895767737f}, // 1.0312500000 + {0.8582909280f, 0.3833263203f}, // 1.0390625000 + {0.8612614255f, 0.3771301114f}, // 1.0468750000 + {0.8641837289f, 0.3709887705f}, // 1.0546875000 + {0.8670582694f, 0.3649028912f}, // 1.0625000000 + {0.8698854825f, 0.3588730371f}, // 1.0703125000 + {0.8726658079f, 0.3528997425f}, // 1.0781250000 + {0.8753996896f, 0.3469835119f}, // 
1.0859375000 + {0.8780875752f, 0.3411248209f}, // 1.0937500000 + {0.8807299159f, 0.3353241162f}, // 1.1015625000 + {0.8833271666f, 0.3295818158f}, // 1.1093750000 + {0.8858797849f, 0.3238983093f}, // 1.1171875000 + {0.8883882317f, 0.3182739585f}, // 1.1250000000 + {0.8908529704f, 0.3127090972f}, // 1.1328125000 + {0.8932744671f, 0.3072040319f}, // 1.1406250000 + {0.8956531899f, 0.3017590421f}, // 1.1484375000 + {0.8979896092f, 0.2963743805f}, // 1.1562500000 + {0.9002841973f, 0.2910502733f}, // 1.1640625000 + {0.9025374279f, 0.2857869208f}, // 1.1718750000 + {0.9047497766f, 0.2805844976f}, // 1.1796875000 + {0.9069217198f, 0.2754431531f}, // 1.1875000000 + {0.9090537352f, 0.2703630118f}, // 1.1953125000 + {0.9111463015f, 0.2653441734f}, // 1.2031250000 + {0.9131998978f, 0.2603867140f}, // 1.2109375000 + {0.9152150039f, 0.2554906858f}, // 1.2187500000 + {0.9171920998f, 0.2506561176f}, // 1.2265625000 + {0.9191316658f, 0.2458830155f}, // 1.2343750000 + {0.9210341819f, 0.2411713632f}, // 1.2421875000 + {0.9229001283f, 0.2365211224f}, // 1.2500000000 + {0.9247299843f, 0.2319322334f}, // 1.2578125000 + {0.9265242290f, 0.2274046151f}, // 1.2656250000 + {0.9282833407f, 0.2229381659f}, // 1.2734375000 + {0.9300077968f, 0.2185327643f}, // 1.2812500000 + {0.9316980737f, 0.2141882685f}, // 1.2890625000 + {0.9333546467f, 0.2099045180f}, // 1.2968750000 + {0.9349779895f, 0.2056813330f}, // 1.3046875000 + {0.9365685747f, 0.2015185157f}, // 1.3125000000 + {0.9381268730f, 0.1974158503f}, // 1.3203125000 + {0.9396533534f, 0.1933731034f}, // 1.3281250000 + {0.9411484831f, 0.1893900249f}, // 1.3359375000 + {0.9426127272f, 0.1854663482f}, // 1.3437500000 + {0.9440465488f, 0.1816017904f}, // 1.3515625000 + {0.9454504084f, 0.1777960534f}, // 1.3593750000 + {0.9468247645f, 0.1740488238f}, // 1.3671875000 + {0.9481700728f, 0.1703597737f}, // 1.3750000000 + {0.9494867865f, 0.1667285609f}, // 1.3828125000 + {0.9507753562f, 0.1631548298f}, // 1.3906250000 + {0.9520362295f, 0.1596382112f}, // 1.3984375000 + {0.9532698510f, 0.1561783236f}, // 1.4062500000 + {0.9544766625f, 0.1527747727f}, // 1.4140625000 + {0.9556571025f, 0.1494271527f}, // 1.4218750000 + {0.9568116063f, 0.1461350463f}, // 1.4296875000 + {0.9579406061f, 0.1428980254f}, // 1.4375000000 + {0.9590445303f, 0.1397156511f}, // 1.4453125000 + {0.9601238042f, 0.1365874749f}, // 1.4531250000 + {0.9611788495f, 0.1335130382f}, // 1.4609375000 + {0.9622100842f, 0.1304918737f}, // 1.4687500000 + {0.9632179226f, 0.1275235050f}, // 1.4765625000 + {0.9642027752f, 0.1246074475f}, // 1.4843750000 + {0.9651650489f, 0.1217432089f}, // 1.4921875000 + {0.9661051465f, 0.1189302892f}, // 1.5000000000 + {0.9670234670f, 0.1161681815f}, // 1.5078125000 + {0.9679204053f, 0.1134563721f}, // 1.5156250000 + {0.9687963524f, 0.1107943411f}, // 1.5234375000 + {0.9696516951f, 0.1081815630f}, // 1.5312500000 + {0.9704868162f, 0.1056175064f}, // 1.5390625000 + {0.9713020942f, 0.1031016352f}, // 1.5468750000 + {0.9720979033f, 0.1006334084f}, // 1.5546875000 + {0.9728746138f, 0.0982122808f}, // 1.5625000000 + {0.9736325914f, 0.0958377032f}, // 1.5703125000 + {0.9743721977f, 0.0935091227f}, // 1.5781250000 + {0.9750937898f, 0.0912259834f}, // 1.5859375000 + {0.9757977206f, 0.0889877264f}, // 1.5937500000 + {0.9764843385f, 0.0867937900f}, // 1.6015625000 + {0.9771539875f, 0.0846436106f}, // 1.6093750000 + {0.9778070074f, 0.0825366227f}, // 1.6171875000 + {0.9784437332f, 0.0804722590f}, // 1.6250000000 + {0.9790644959f, 0.0784499511f}, // 1.6328125000 + {0.9796696218f, 0.0764691297f}, // 
1.6406250000 + {0.9802594326f, 0.0745292246f}, // 1.6484375000 + {0.9808342460f, 0.0726296655f}, // 1.6562500000 + {0.9813943747f, 0.0707698819f}, // 1.6640625000 + {0.9819401275f, 0.0689493034f}, // 1.6718750000 + {0.9824718082f, 0.0671673602f}, // 1.6796875000 + {0.9829897166f, 0.0654234833f}, // 1.6875000000 + {0.9834941478f, 0.0637171046f}, // 1.6953125000 + {0.9839853925f, 0.0620476570f}, // 1.7031250000 + {0.9844637371f, 0.0604145752f}, // 1.7109375000 + {0.9849294635f, 0.0588172956f}, // 1.7187500000 + {0.9853828492f, 0.0572552562f}, // 1.7265625000 + {0.9858241672f, 0.0557278976f}, // 1.7343750000 + {0.9862536864f, 0.0542346624f}, // 1.7421875000 + {0.9866716712f, 0.0527749959f}, // 1.7500000000 + {0.9870783817f, 0.0513483463f}, // 1.7578125000 + {0.9874740737f, 0.0499541645f}, // 1.7656250000 + {0.9878589987f, 0.0485919049f}, // 1.7734375000 + {0.9882334039f, 0.0472610247f}, // 1.7812500000 + {0.9885975325f, 0.0459609852f}, // 1.7890625000 + {0.9889516232f, 0.0446912508f}, // 1.7968750000 + {0.9892959108f, 0.0434512901f}, // 1.8046875000 + {0.9896306258f, 0.0422405756f}, // 1.8125000000 + {0.9899559946f, 0.0410585838f}, // 1.8203125000 + {0.9902722396f, 0.0399047954f}, // 1.8281250000 + {0.9905795791f, 0.0387786956f}, // 1.8359375000 + {0.9908782275f, 0.0376797741f}, // 1.8437500000 + {0.9911683951f, 0.0366075252f}, // 1.8515625000 + {0.9914502882f, 0.0355614479f}, // 1.8593750000 + {0.9917241096f, 0.0345410460f}, // 1.8671875000 + {0.9919900577f, 0.0335458284f}, // 1.8750000000 + {0.9922483274f, 0.0325753089f}, // 1.8828125000 + {0.9924991099f, 0.0316290065f}, // 1.8906250000 + {0.9927425925f, 0.0307064452f}, // 1.8984375000 + {0.9929789587f, 0.0298071547f}, // 1.9062500000 + {0.9932083887f, 0.0289306696f}, // 1.9140625000 + {0.9934310586f, 0.0280765301f}, // 1.9218750000 + {0.9936471415f, 0.0272442821f}, // 1.9296875000 + {0.9938568064f, 0.0264334768f}, // 1.9375000000 + {0.9940602192f, 0.0256436709f}, // 1.9453125000 + {0.9942575423f, 0.0248744271f}, // 1.9531250000 + {0.9944489346f, 0.0241253134f}, // 1.9609375000 + {0.9946345516f, 0.0233959038f}, // 1.9687500000 + {0.9948145458f, 0.0226857778f}, // 1.9765625000 + {0.9949890661f, 0.0219945210f}, // 1.9843750000 + {0.9951582582f, 0.0213217245f}, // 1.9921875000 + {0.9953222650f, 0.0206669854f}, // 2.0000000000 + {0.9954812259f, 0.0200299065f}, // 2.0078125000 + {0.9956352773f, 0.0194100966f}, // 2.0156250000 + {0.9957845526f, 0.0188071704f}, // 2.0234375000 + {0.9959291823f, 0.0182207482f}, // 2.0312500000 + {0.9960692938f, 0.0176504563f}, // 2.0390625000 + {0.9962050117f, 0.0170959271f}, // 2.0468750000 + {0.9963364578f, 0.0165567984f}, // 2.0546875000 + {0.9964637509f, 0.0160327141f}, // 2.0625000000 + {0.9965870072f, 0.0155233240f}, // 2.0703125000 + {0.9967063402f, 0.0150282836f}, // 2.0781250000 + {0.9968218606f, 0.0145472542f}, // 2.0859375000 + {0.9969336766f, 0.0140799029f}, // 2.0937500000 + {0.9970418939f, 0.0136259025f}, // 2.1015625000 + {0.9971466153f, 0.0131849315f}, // 2.1093750000 + {0.9972479415f, 0.0127566743f}, // 2.1171875000 + {0.9973459706f, 0.0123408206f}, // 2.1250000000 + {0.9974407984f, 0.0119370661f}, // 2.1328125000 + {0.9975325180f, 0.0115451118f}, // 2.1406250000 + {0.9976212207f, 0.0111646644f}, // 2.1484375000 + {0.9977069951f, 0.0107954360f}, // 2.1562500000 + {0.9977899279f, 0.0104371443f}, // 2.1640625000 + {0.9978701033f, 0.0100895123f}, // 2.1718750000 + {0.9979476035f, 0.0097522684f}, // 2.1796875000 + {0.9980225088f, 0.0094251464f}, // 2.1875000000 + {0.9980948971f, 0.0091078852f}, // 
2.1953125000 + {0.9981648445f, 0.0088002291f}, // 2.2031250000 + {0.9982324251f, 0.0085019274f}, // 2.2109375000 + {0.9982977109f, 0.0082127346f}, // 2.2187500000 + {0.9983607721f, 0.0079324104f}, // 2.2265625000 + {0.9984216773f, 0.0076607192f}, // 2.2343750000 + {0.9984804928f, 0.0073974307f}, // 2.2421875000 + {0.9985372834f, 0.0071423190f}, // 2.2500000000 + {0.9985921122f, 0.0068951636f}, // 2.2578125000 + {0.9986450405f, 0.0066557482f}, // 2.2656250000 + {0.9986961279f, 0.0064238617f}, // 2.2734375000 + {0.9987454324f, 0.0061992973f}, // 2.2812500000 + {0.9987930105f, 0.0059818530f}, // 2.2890625000 + {0.9988389169f, 0.0057713311f}, // 2.2968750000 + {0.9988832050f, 0.0055675385f}, // 2.3046875000 + {0.9989259267f, 0.0053702865f}, // 2.3125000000 + {0.9989671323f, 0.0051793907f}, // 2.3203125000 + {0.9990068708f, 0.0049946708f}, // 2.3281250000 + {0.9990451897f, 0.0048159509f}, // 2.3359375000 + {0.9990821352f, 0.0046430592f}, // 2.3437500000 + {0.9991177522f, 0.0044758278f}, // 2.3515625000 + {0.9991520843f, 0.0043140931f}, // 2.3593750000 + {0.9991851738f, 0.0041576951f}, // 2.3671875000 + {0.9992170618f, 0.0040064779f}, // 2.3750000000 + {0.9992477881f, 0.0038602892f}, // 2.3828125000 + {0.9992773915f, 0.0037189807f}, // 2.3906250000 + {0.9993059095f, 0.0035824076f}, // 2.3984375000 + {0.9993333786f, 0.0034504286f}, // 2.4062500000 + {0.9993598341f, 0.0033229062f}, // 2.4140625000 + {0.9993853103f, 0.0031997062f}, // 2.4218750000 + {0.9994098404f, 0.0030806979f}, // 2.4296875000 + {0.9994334567f, 0.0029657539f}, // 2.4375000000 + {0.9994561906f, 0.0028547501f}, // 2.4453125000 + {0.9994780722f, 0.0027475655f}, // 2.4531250000 + {0.9994991309f, 0.0026440825f}, // 2.4609375000 + {0.9995193953f, 0.0025441865f}, // 2.4687500000 + {0.9995388929f, 0.0024477658f}, // 2.4765625000 + {0.9995576504f, 0.0023547119f}, // 2.4843750000 + {0.9995756937f, 0.0022649190f}, // 2.4921875000 + {0.9995930480f, 0.0021782842f}, // 2.5000000000 + {0.9996097374f, 0.0020947076f}, // 2.5078125000 + {0.9996257855f, 0.0020140918f}, // 2.5156250000 + {0.9996412150f, 0.0019363421f}, // 2.5234375000 + {0.9996560481f, 0.0018613666f}, // 2.5312500000 + {0.9996703059f, 0.0017890757f}, // 2.5390625000 + {0.9996840091f, 0.0017193826f}, // 2.5468750000 + {0.9996971778f, 0.0016522026f}, // 2.5546875000 + {0.9997098311f, 0.0015874537f}, // 2.5625000000 + {0.9997219879f, 0.0015250561f}, // 2.5703125000 + {0.9997336661f, 0.0014649323f}, // 2.5781250000 + {0.9997448832f, 0.0014070070f}, // 2.5859375000 + {0.9997556561f, 0.0013512073f}, // 2.5937500000 + {0.9997660011f, 0.0012974620f}, // 2.6015625000 + {0.9997759341f, 0.0012457025f}, // 2.6093750000 + {0.9997854702f, 0.0011958618f}, // 2.6171875000 + {0.9997946243f, 0.0011478751f}, // 2.6250000000 + {0.9998034104f, 0.0011016795f}, // 2.6328125000 + {0.9998118425f, 0.0010572140f}, // 2.6406250000 + {0.9998199338f, 0.0010144193f}, // 2.6484375000 + {0.9998276970f, 0.0009732381f}, // 2.6562500000 + {0.9998351447f, 0.0009336147f}, // 2.6640625000 + {0.9998422887f, 0.0008954951f}, // 2.6718750000 + {0.9998491406f, 0.0008588272f}, // 2.6796875000 + {0.9998557115f, 0.0008235601f}, // 2.6875000000 + {0.9998620122f, 0.0007896449f}, // 2.6953125000 + {0.9998680531f, 0.0007570339f}, // 2.7031250000 + {0.9998738441f, 0.0007256811f}, // 2.7109375000 + {0.9998793950f, 0.0006955419f}, // 2.7187500000 + {0.9998847150f, 0.0006665730f}, // 2.7265625000 + {0.9998898132f, 0.0006387328f}, // 2.7343750000 + {0.9998946981f, 0.0006119806f}, // 2.7421875000 + {0.9998993781f, 0.0005862772f}, // 
2.7500000000 + {0.9999038613f, 0.0005615849f}, // 2.7578125000 + {0.9999081554f, 0.0005378669f}, // 2.7656250000 + {0.9999122679f, 0.0005150877f}, // 2.7734375000 + {0.9999162060f, 0.0004932131f}, // 2.7812500000 + {0.9999199766f, 0.0004722097f}, // 2.7890625000 + {0.9999235864f, 0.0004520456f}, // 2.7968750000 + {0.9999270419f, 0.0004326897f}, // 2.8046875000 + {0.9999303492f, 0.0004141120f}, // 2.8125000000 + {0.9999335144f, 0.0003962836f}, // 2.8203125000 + {0.9999365431f, 0.0003791765f}, // 2.8281250000 + {0.9999394408f, 0.0003627636f}, // 2.8359375000 + {0.9999422130f, 0.0003470187f}, // 2.8437500000 + {0.9999448647f, 0.0003319167f}, // 2.8515625000 + {0.9999474008f, 0.0003174332f}, // 2.8593750000 + {0.9999498261f, 0.0003035446f}, // 2.8671875000 + {0.9999521452f, 0.0002902283f}, // 2.8750000000 + {0.9999543624f, 0.0002774622f}, // 2.8828125000 + {0.9999564819f, 0.0002652254f}, // 2.8906250000 + {0.9999585078f, 0.0002534972f}, // 2.8984375000 + {0.9999604441f, 0.0002422581f}, // 2.9062500000 + {0.9999622943f, 0.0002314890f}, // 2.9140625000 + {0.9999640622f, 0.0002211717f}, // 2.9218750000 + {0.9999657513f, 0.0002112884f}, // 2.9296875000 + {0.9999673647f, 0.0002018221f}, // 2.9375000000 + {0.9999689058f, 0.0001927564f}, // 2.9453125000 + {0.9999703775f, 0.0001840754f}, // 2.9531250000 + {0.9999717829f, 0.0001757640f}, // 2.9609375000 + {0.9999731248f, 0.0001678073f}, // 2.9687500000 + {0.9999744058f, 0.0001601913f}, // 2.9765625000 + {0.9999756286f, 0.0001529022f}, // 2.9843750000 + {0.9999767957f, 0.0001459270f}, // 2.9921875000 + {0.9999779095f, 0.0001392531f}, // 3.0000000000 + {0.9999789723f, 0.0001328681f}, // 3.0078125000 + {0.9999799863f, 0.0001267604f}, // 3.0156250000 + {0.9999809536f, 0.0001209187f}, // 3.0234375000 + {0.9999818763f, 0.0001153322f}, // 3.0312500000 + {0.9999827563f, 0.0001099903f}, // 3.0390625000 + {0.9999835955f, 0.0001048830f}, // 3.0468750000 + {0.9999843957f, 0.0001000007f}, // 3.0546875000 + {0.9999851586f, 0.0000953340f}, // 3.0625000000 + {0.9999858858f, 0.0000908740f}, // 3.0703125000 + {0.9999865790f, 0.0000866121f}, // 3.0781250000 + {0.9999872396f, 0.0000825400f}, // 3.0859375000 + {0.9999878692f, 0.0000786497f}, // 3.0937500000 + {0.9999884690f, 0.0000749336f}, // 3.1015625000 + {0.9999890404f, 0.0000713844f}, // 3.1093750000 + {0.9999895848f, 0.0000679950f}, // 3.1171875000 + {0.9999901033f, 0.0000647587f}, // 3.1250000000 + {0.9999905970f, 0.0000616688f}, // 3.1328125000 + {0.9999910672f, 0.0000587192f}, // 3.1406250000 + {0.9999915149f, 0.0000559039f}, // 3.1484375000 + {0.9999919410f, 0.0000532170f}, // 3.1562500000 + {0.9999923467f, 0.0000506531f}, // 3.1640625000 + {0.9999927328f, 0.0000482069f}, // 3.1718750000 + {0.9999931002f, 0.0000458732f}, // 3.1796875000 + {0.9999934498f, 0.0000436471f}, // 3.1875000000 + {0.9999937825f, 0.0000415240f}, // 3.1953125000 + {0.9999940989f, 0.0000394993f}, // 3.2031250000 + {0.9999943999f, 0.0000375688f}, // 3.2109375000 + {0.9999946862f, 0.0000357282f}, // 3.2187500000 + {0.9999949584f, 0.0000339737f}, // 3.2265625000 + {0.9999952172f, 0.0000323014f}, // 3.2343750000 + {0.9999954633f, 0.0000307077f}, // 3.2421875000 + {0.9999956972f, 0.0000291890f}, // 3.2500000000 + {0.9999959196f, 0.0000277421f}, // 3.2578125000 + {0.9999961309f, 0.0000263636f}, // 3.2656250000 + {0.9999963317f, 0.0000250506f}, // 3.2734375000 + {0.9999965224f, 0.0000238001f}, // 3.2812500000 + {0.9999967037f, 0.0000226093f}, // 3.2890625000 + {0.9999968759f, 0.0000214754f}, // 3.2968750000 + {0.9999970394f, 0.0000203959f}, // 
3.3046875000 + {0.9999971947f, 0.0000193683f}, // 3.3125000000 + {0.9999973421f, 0.0000183902f}, // 3.3203125000 + {0.9999974822f, 0.0000174594f}, // 3.3281250000 + {0.9999976151f, 0.0000165736f}, // 3.3359375000 + {0.9999977412f, 0.0000157309f}, // 3.3437500000 + {0.9999978610f, 0.0000149292f}, // 3.3515625000 + {0.9999979746f, 0.0000141667f}, // 3.3593750000 + {0.9999980824f, 0.0000134414f}, // 3.3671875000 + {0.9999981847f, 0.0000127517f}, // 3.3750000000 + {0.9999982818f, 0.0000120960f}, // 3.3828125000 + {0.9999983738f, 0.0000114725f}, // 3.3906250000 + {0.9999984611f, 0.0000108799f}, // 3.3984375000 + {0.9999985439f, 0.0000103166f}, // 3.4062500000 + {0.9999986224f, 0.0000097813f}, // 3.4140625000 + {0.9999986968f, 0.0000092726f}, // 3.4218750000 + {0.9999987673f, 0.0000087893f}, // 3.4296875000 + {0.9999988342f, 0.0000083302f}, // 3.4375000000 + {0.9999988975f, 0.0000078941f}, // 3.4453125000 + {0.9999989576f, 0.0000074799f}, // 3.4531250000 + {0.9999990145f, 0.0000070866f}, // 3.4609375000 + {0.9999990684f, 0.0000067131f}, // 3.4687500000 + {0.9999991194f, 0.0000063586f}, // 3.4765625000 + {0.9999991678f, 0.0000060220f}, // 3.4843750000 + {0.9999992135f, 0.0000057026f}, // 3.4921875000 + {0.9999992569f, 0.0000053994f}, // 3.5000000000 + {0.9999992980f, 0.0000051118f}, // 3.5078125000 + {0.9999993368f, 0.0000048388f}, // 3.5156250000 + {0.9999993736f, 0.0000045799f}, // 3.5234375000 + {0.9999994084f, 0.0000043343f}, // 3.5312500000 + {0.9999994414f, 0.0000041014f}, // 3.5390625000 + {0.9999994725f, 0.0000038805f}, // 3.5468750000 + {0.9999995020f, 0.0000036711f}, // 3.5546875000 + {0.9999995299f, 0.0000034725f}, // 3.5625000000 + {0.9999995563f, 0.0000032843f}, // 3.5703125000 + {0.9999995813f, 0.0000031059f}, // 3.5781250000 + {0.9999996049f, 0.0000029369f}, // 3.5859375000 + {0.9999996272f, 0.0000027767f}, // 3.5937500000 + {0.9999996483f, 0.0000026249f}, // 3.6015625000 + {0.9999996682f, 0.0000024811f}, // 3.6093750000 + {0.9999996870f, 0.0000023449f}, // 3.6171875000 + {0.9999997049f, 0.0000022159f}, // 3.6250000000 + {0.9999997217f, 0.0000020938f}, // 3.6328125000 + {0.9999997376f, 0.0000019781f}, // 3.6406250000 + {0.9999997526f, 0.0000018686f}, // 3.6484375000 + {0.9999997668f, 0.0000017650f}, // 3.6562500000 + {0.9999997802f, 0.0000016669f}, // 3.6640625000 + {0.9999997929f, 0.0000015740f}, // 3.6718750000 + {0.9999998048f, 0.0000014862f}, // 3.6796875000 + {0.9999998161f, 0.0000014030f}, // 3.6875000000 + {0.9999998267f, 0.0000013244f}, // 3.6953125000 + {0.9999998368f, 0.0000012500f}, // 3.7031250000 + {0.9999998463f, 0.0000011797f}, // 3.7109375000 + {0.9999998552f, 0.0000011131f}, // 3.7187500000 + {0.9999998637f, 0.0000010502f}, // 3.7265625000 + {0.9999998717f, 0.0000009908f}, // 3.7343750000 + {0.9999998792f, 0.0000009346f}, // 3.7421875000 + {0.9999998863f, 0.0000008814f}, // 3.7500000000 + {0.9999998930f, 0.0000008312f}, // 3.7578125000 + {0.9999998993f, 0.0000007838f}, // 3.7656250000 + {0.9999999052f, 0.0000007389f}, // 3.7734375000 + {0.9999999108f, 0.0000006966f}, // 3.7812500000 + {0.9999999161f, 0.0000006566f}, // 3.7890625000 + {0.9999999211f, 0.0000006188f}, // 3.7968750000 + {0.9999999258f, 0.0000005831f}, // 3.8046875000 + {0.9999999302f, 0.0000005494f}, // 3.8125000000 + {0.9999999344f, 0.0000005176f}, // 3.8203125000 + {0.9999999383f, 0.0000004876f}, // 3.8281250000 + {0.9999999420f, 0.0000004593f}, // 3.8359375000 + {0.9999999455f, 0.0000004325f}, // 3.8437500000 + {0.9999999488f, 0.0000004073f}, // 3.8515625000 + {0.9999999518f, 0.0000003835f}, // 
3.8593750000 + {0.9999999547f, 0.0000003610f}, // 3.8671875000 + {0.9999999575f, 0.0000003398f}, // 3.8750000000 + {0.9999999601f, 0.0000003198f}, // 3.8828125000 + {0.9999999625f, 0.0000003010f}, // 3.8906250000 + {0.9999999648f, 0.0000002832f}, // 3.8984375000 + {0.9999999669f, 0.0000002665f}, // 3.9062500000 + {0.9999999689f, 0.0000002507f}, // 3.9140625000 + {0.9999999708f, 0.0000002358f}, // 3.9218750000 + {0.9999999726f, 0.0000002218f}, // 3.9296875000 + {0.9999999743f, 0.0000002085f}, // 3.9375000000 + {0.9999999759f, 0.0000001961f}, // 3.9453125000 + {0.9999999774f, 0.0000001844f}, // 3.9531250000 + {0.9999999788f, 0.0000001733f}, // 3.9609375000 + {0.9999999801f, 0.0000001629f}, // 3.9687500000 + {0.9999999813f, 0.0000001531f}, // 3.9765625000 + {0.9999999825f, 0.0000001439f}, // 3.9843750000 + {0.9999999836f, 0.0000001352f}, // 3.9921875000 + {0.9999999846f, 0.0000001270f}, // 4.0000000000 +}}; + +} // namespace arm_compute diff --git a/src/core/utils/Math.h b/src/core/utils/Math.h new file mode 100644 index 0000000000000000000000000000000000000000..f006948f6921a6c91866e61d923416ddc0586818 --- /dev/null +++ b/src/core/utils/Math.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ACL_SRC_CORE_UTILS_MATH_H +#define ACL_SRC_CORE_UTILS_MATH_H + +#include + +namespace arm_compute +{ + +/** Lookup table for erf(x) calculation. */ +template +struct ErfLutEntry +{ + /** erf(x) */ + T value; + + /** 2 / sqrt(pi) * e^(-x^2) */ + T scale; +}; + +/** The lookup table for FP32 erf(x) calculation. */ +extern const std::array, 513> erf_f32_lut; + +} // namespace arm_compute + +#endif // ACL_SRC_CORE_UTILS_MATH_H diff --git a/src/core/utils/ScaleUtils.cpp b/src/core/utils/ScaleUtils.cpp index ee57a8e7a75fd32c48156b6eca10a47c740731a1..a92da39b6790ec3b7883ba0f77347318a2361a55 100644 --- a/src/core/utils/ScaleUtils.cpp +++ b/src/core/utils/ScaleUtils.cpp @@ -23,11 +23,12 @@ */ #include "src/core/utils/ScaleUtils.h" -#include "src/common/cpuinfo/CpuIsaInfo.h" #include "arm_compute/core/CPP/CPPTypes.h" #include "arm_compute/core/TensorInfo.h" +#include "src/common/cpuinfo/CpuIsaInfo.h" + float arm_compute::scale_utils::calculate_resize_ratio(size_t input_size, size_t output_size, bool align_corners) { const size_t offset = (align_corners && output_size > 1) ? 
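The new Math.h/Math.cpp pair above adds a 513-entry table that samples erf(x) on [0, 4] in steps of 1/128 (0.0078125); each ErfLutEntry stores the function value erf(x_i) in `value` and the derivative 2/sqrt(pi)*e^(-x_i^2) in `scale`. The kernels that consume the table are not part of this hunk, so the helper below (erf_lut_approx, a name invented here) is only a sketch of how such a table is typically used: pick the nearest sample and take a first-order Taylor step, relying on erf being odd and saturating towards +/-1 beyond the tabulated range.

#include <cmath>
#include <cstddef>

#include "src/core/utils/Math.h" // arm_compute::erf_f32_lut and ErfLutEntry<float> from this patch

// Hypothetical helper, not part of the patch: approximate erf(x) from the table.
inline float erf_lut_approx(float x)
{
    const float ax = std::fabs(x);
    if (ax >= 4.0f)
    {
        return std::copysign(1.0f, x); // the table ends at erf(4) ~= 0.99999998
    }
    const std::size_t i   = static_cast<std::size_t>(ax * 128.0f + 0.5f); // nearest sample, i <= 512
    const float       x_i = static_cast<float>(i) * 0.0078125f;           // x_i = i / 128
    const arm_compute::ErfLutEntry<float> &e = arm_compute::erf_f32_lut[i];
    // erf(x) ~= erf(x_i) + erf'(x_i) * (x - x_i), with |x - x_i| <= 1/256
    return std::copysign(e.value + e.scale * (ax - x_i), x);
}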
1 : 0; @@ -40,13 +41,15 @@ float arm_compute::scale_utils::calculate_resize_ratio(size_t input_size, size_t return static_cast(in) / static_cast(out); } -bool arm_compute::scale_utils::is_precomputation_required(DataLayout data_layout, DataType data_type, - InterpolationPolicy policy, BorderMode border_mode) +bool arm_compute::scale_utils::is_precomputation_required(DataLayout data_layout, + DataType data_type, + InterpolationPolicy policy, + BorderMode border_mode) { // Do not calculate precomputed weights and indices if kernel code doesn't use them - if(data_layout == DataLayout::NHWC) + if (data_layout == DataLayout::NHWC) { - switch(data_type) + switch (data_type) { case DataType::F32: case DataType::F16: @@ -62,4 +65,4 @@ bool arm_compute::scale_utils::is_precomputation_required(DataLayout data_layout } return true; -} \ No newline at end of file +} diff --git a/src/core/utils/ScaleUtils.h b/src/core/utils/ScaleUtils.h index 1484824a7f1229b528ff33512e5b15914777ebad..d8dddc8c70d75ba4a3f41b36baffe296e5a12c9f 100644 --- a/src/core/utils/ScaleUtils.h +++ b/src/core/utils/ScaleUtils.h @@ -60,8 +60,11 @@ inline bool is_align_corners_allowed_sampling_policy(SamplingPolicy sampling_pol * * @return True if precomputation is required */ -bool is_precomputation_required(DataLayout data_layout, DataType data_type, InterpolationPolicy policy, BorderMode border_mode); +bool is_precomputation_required(DataLayout data_layout, + DataType data_type, + InterpolationPolicy policy, + BorderMode border_mode); } // namespace scale_utils } // namespace arm_compute -#endif /* UTILS_CORE_SCALEUTILS_H */ \ No newline at end of file +#endif /* UTILS_CORE_SCALEUTILS_H */ diff --git a/src/core/utils/StringUtils.cpp b/src/core/utils/StringUtils.cpp index 6d05c9b64e164e323a0d447a6726099f5c835884..bcab0ce10c28efec2b20883b7ff575c8b8cbf0de 100644 --- a/src/core/utils/StringUtils.cpp +++ b/src/core/utils/StringUtils.cpp @@ -55,7 +55,7 @@ std::string float_to_string_with_full_precision(float val) ss.precision(std::numeric_limits::max_digits10); ss << val; - if(val != static_cast(val)) + if (val != static_cast(val)) { ss << "f"; } @@ -65,17 +65,11 @@ std::string float_to_string_with_full_precision(float val) std::string join(const std::vector strings, const std::string &sep) { - if(strings.empty()) + if (strings.empty()) { return ""; } - return std::accumulate( - std::next(strings.begin()), - strings.end(), - strings.at(0), - [&sep](const std::string & a, const std::string & b) - { - return a + sep + b; - }); -} + return std::accumulate(std::next(strings.begin()), strings.end(), strings.at(0), + [&sep](const std::string &a, const std::string &b) { return a + sep + b; }); } +} // namespace arm_compute diff --git a/src/core/utils/helpers/fft.cpp b/src/core/utils/helpers/fft.cpp index 64633c643d9a1d639a0108a74b18aeec86354b67..edc8d0eaccb82d678865703bb90f8c36518433ce 100644 --- a/src/core/utils/helpers/fft.cpp +++ b/src/core/utils/helpers/fft.cpp @@ -37,7 +37,7 @@ std::vector decompose_stages(unsigned int N, const std::set decompose_stages(unsigned int N, const std::set= factor) + if (0 == (res % factor) && res >= factor) { stages.push_back(factor); res /= factor; @@ -57,9 +57,9 @@ std::vector decompose_stages(unsigned int N, const std::set 1) + if (res > 1) { // Couldn't decompose with given factors stages.clear(); @@ -81,8 +81,9 @@ std::vector digit_reverse_indices(unsigned int N, const std::vecto std::vector idx_digit_reverse; // Early exit in case N and fft stages do not match - const float stages_prod = 
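calculate_resize_ratio above returns the input/output scaling factor and subtracts one sample from both extents when align_corners is requested (and the output has more than one sample), so that the first and last samples of input and output coincide. A minimal usage sketch, assuming the function is declared in ScaleUtils.h next to is_precomputation_required; the exact float comparisons are safe here because both ratios are exactly representable.

#include <cassert>

#include "src/core/utils/ScaleUtils.h" // declaration assumed to live here

void resize_ratio_example()
{
    using arm_compute::scale_utils::calculate_resize_ratio;

    // Downscaling a 10-sample row to 4 samples:
    assert(calculate_resize_ratio(10, 4, /*align_corners=*/false) == 2.5f); // 10 / 4
    assert(calculate_resize_ratio(10, 4, /*align_corners=*/true) == 3.0f);  // (10 - 1) / (4 - 1)
}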
std::accumulate(std::begin(fft_stages), std::end(fft_stages), 1, std::multiplies()); - if(stages_prod != N) + const float stages_prod = + std::accumulate(std::begin(fft_stages), std::end(fft_stages), 1, std::multiplies()); + if (stages_prod != N) { return idx_digit_reverse; } @@ -94,13 +95,13 @@ std::vector digit_reverse_indices(unsigned int N, const std::vecto unsigned int n_stages = fft_stages.size(); // Scan elements - for(unsigned int n = 0; n < N; ++n) + for (unsigned int n = 0; n < N; ++n) { unsigned int k = n; unsigned int Nx = fft_stages[0]; // Scan stages - for(unsigned int s = 1; s < n_stages; ++s) + for (unsigned int s = 1; s < n_stages; ++s) { // radix of stage i-th unsigned int Ny = fft_stages[s]; diff --git a/src/core/utils/helpers/float_ops.h b/src/core/utils/helpers/float_ops.h index a475a23b594a70e0f7a64b811696c67b6ee26c08..487496915a6fa35df864445c16bc7b9b33f1fd72 100644 --- a/src/core/utils/helpers/float_ops.h +++ b/src/core/utils/helpers/float_ops.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,12 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_UTILS_HELPERS_FLOAT_OPS_H -#define ARM_COMPUTE_UTILS_HELPERS_FLOAT_OPS_H +#ifndef ACL_SRC_CORE_UTILS_HELPERS_FLOAT_OPS_H +#define ACL_SRC_CORE_UTILS_HELPERS_FLOAT_OPS_H + +#include +#include +#include namespace arm_compute { @@ -36,8 +40,7 @@ union RawFloat * * @param[in] val Floating-point value */ - explicit RawFloat(float val) - : f32(val) + explicit RawFloat(float val) : f32(val) { } /** Extract sign of floating point number @@ -113,4 +116,4 @@ inline bool is_zero(float a, float epsilon = 0.00001f) } // namespace float_ops } // namespace helpers } // namespace arm_compute -#endif /* ARM_COMPUTE_UTILS_HELPERS_FLOAT_OPS_H */ +#endif // ACL_SRC_CORE_UTILS_HELPERS_FLOAT_OPS_H diff --git a/src/core/utils/helpers/tensor_info.h b/src/core/utils/helpers/tensor_info.h index 9279532e2a82c6dc9a758d00761b22877c3519b8..fd4745a453b57336ae58adde21f96ae20e8fed00 100644 --- a/src/core/utils/helpers/tensor_info.h +++ b/src/core/utils/helpers/tensor_info.h @@ -41,15 +41,17 @@ namespace tensor_info * @return True if tensors have mismatching quantization info else false. */ template -inline bool tensors_have_different_quantization_info(const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos) +inline bool tensors_have_different_quantization_info(const ITensorInfo *tensor_info_1, + const ITensorInfo *tensor_info_2, + Ts... tensor_infos) { const QuantizationInfo first_quantization_info = tensor_info_1->quantization_info(); - const std::array < const ITensorInfo *, 1 + sizeof...(Ts) > tensor_infos_array{ { tensor_info_2, std::forward(tensor_infos)... 
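decompose_stages above factorises the FFT length N into the supported radices and returns an empty vector when no such factorisation exists, and digit_reverse_indices builds the input permutation for the resulting mixed-radix FFT (the generalisation of radix-2 bit reversal). The standalone sketch below, with the invented name digit_reverse, shows the underlying idea; the library routine works on the stage list produced by decompose_stages and may order the digits differently.

#include <functional>
#include <numeric>
#include <vector>

// Simplified illustration of mixed-radix digit reversal: read the digits of n
// least-significant-first with respect to the stage radices, write them back
// most-significant-first.
std::vector<unsigned int> digit_reverse(unsigned int N, const std::vector<unsigned int> &stages)
{
    std::vector<unsigned int> idx;
    const unsigned int prod =
        std::accumulate(stages.begin(), stages.end(), 1u, std::multiplies<unsigned int>());
    if (prod != N)
    {
        return idx; // stages do not decompose N; mirrors the library's early exit
    }
    idx.reserve(N);
    for (unsigned int n = 0; n < N; ++n)
    {
        unsigned int k   = n;
        unsigned int rev = 0;
        for (unsigned int radix : stages)
        {
            rev = rev * radix + (k % radix);
            k /= radix;
        }
        idx.push_back(rev);
    }
    return idx;
}
// For uniform radix 2 this degenerates to bit reversal:
// digit_reverse(8, {2, 2, 2}) == {0, 4, 2, 6, 1, 5, 3, 7}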
} }; - return std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), [&](const ITensorInfo * tensor_info) - { - return tensor_info->quantization_info() != first_quantization_info; - }); + const std::array tensor_infos_array{ + {tensor_info_2, std::forward(tensor_infos)...}}; + return std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), + [&](const ITensorInfo *tensor_info) + { return tensor_info->quantization_info() != first_quantization_info; }); } } // namespace tensor_info } // namespace helpers diff --git a/src/core/utils/helpers/tensor_transform.cpp b/src/core/utils/helpers/tensor_transform.cpp index f2216995a9e8c8fc551e18542dea053de1f4901c..19d0badd74f9ed72f557741690636a1d55b3a560 100644 --- a/src/core/utils/helpers/tensor_transform.cpp +++ b/src/core/utils/helpers/tensor_transform.cpp @@ -36,10 +36,11 @@ int calculate_stride_on_index(int index, Coordinates strides) return index >= static_cast(strides.num_dimensions()) ? 1 : strides[index]; } -int calculate_start_on_index(TensorShape input_shape, int index, Coordinates starts, Coordinates strides, int32_t begin_mask) +int calculate_start_on_index( + TensorShape input_shape, int index, Coordinates starts, Coordinates strides, int32_t begin_mask) { // Early exit - if(index >= static_cast(starts.num_dimensions())) + if (index >= static_cast(starts.num_dimensions())) { return 0; } @@ -51,14 +52,14 @@ int calculate_start_on_index(TensorShape input_shape, int index, Coordinates sta int start = starts[index]; // Reset in case of begin mask present - if(arm_compute::helpers::bit_ops::is_bit_set(begin_mask, index)) + if (arm_compute::helpers::bit_ops::is_bit_set(begin_mask, index)) { start = stride > 0 ? std::numeric_limits::lowest() : std::numeric_limits::max(); } // Account negative start points const int dim_size = input_shape[index]; - if(start < 0) + if (start < 0) { start += dim_size; } @@ -69,12 +70,16 @@ int calculate_start_on_index(TensorShape input_shape, int index, Coordinates sta return start; } -int calculate_end_on_index(TensorShape input_shape, int index, int start_on_index, - Coordinates ends, Coordinates strides, - int32_t end_mask, int32_t shrink_axis_mask) +int calculate_end_on_index(TensorShape input_shape, + int index, + int start_on_index, + Coordinates ends, + Coordinates strides, + int32_t end_mask, + int32_t shrink_axis_mask) { // Early exit - if(index >= static_cast(ends.num_dimensions())) + if (index >= static_cast(ends.num_dimensions())) { return input_shape[index]; } @@ -86,9 +91,9 @@ int calculate_end_on_index(TensorShape input_shape, int index, int start_on_inde int stop = ends[index]; // Shrink dimension - if(shrink_axis) + if (shrink_axis) { - if(start_on_index == std::numeric_limits::max()) + if (start_on_index == std::numeric_limits::max()) { stop = start_on_index; } @@ -99,14 +104,14 @@ int calculate_end_on_index(TensorShape input_shape, int index, int start_on_inde } // Reset in case of begin mask present - if(arm_compute::helpers::bit_ops::is_bit_set(end_mask, index) && !shrink_axis) + if (arm_compute::helpers::bit_ops::is_bit_set(end_mask, index) && !shrink_axis) { stop = (stride > 0) ? 
std::numeric_limits::max() : std::numeric_limits::lowest(); } // Account negative end points const int dim_size = input_shape[index]; - if(stop < 0) + if (stop < 0) { stop += dim_size; } @@ -118,14 +123,18 @@ int calculate_end_on_index(TensorShape input_shape, int index, int start_on_inde } std::tuple calculate_strided_slice_coords(TensorShape input_shape, - Coordinates starts, Coordinates ends, Coordinates strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) + Coordinates starts, + Coordinates ends, + Coordinates strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { Coordinates starts_abs{}; Coordinates ends_abs{}; Coordinates final_strides{}; - for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i) + for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i) { const int start_i = calculate_start_on_index(input_shape, i, starts, strides, begin_mask); starts_abs.set(i, start_i); @@ -136,13 +145,19 @@ std::tuple calculate_strided_slice_coords return std::make_tuple(starts_abs, ends_abs, final_strides); } -TensorShape compute_strided_slice_output_shape(TensorShape input_shape, Coordinates starts, Coordinates ends, Coordinates strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask, bool return_unshrinked) +TensorShape compute_strided_slice_output_shape(TensorShape input_shape, + Coordinates starts, + Coordinates ends, + Coordinates strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask, + bool return_unshrinked) { unsigned int index = 0; TensorShape output_shape; - for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i) + for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i) { const int stride = calculate_stride_on_index(index, strides); const int start = calculate_start_on_index(input_shape, i, starts, strides, begin_mask); @@ -150,11 +165,11 @@ TensorShape compute_strided_slice_output_shape(TensorShape input_shape, Coordina const int range = end - start; const bool is_shrink = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, i); - if(return_unshrinked || !is_shrink) + if (return_unshrinked || !is_shrink) { - if((range == 0) || // Zero range - (range < 0 && stride >= 0) || // Negative range with positive stride - (range > 0 && stride <= 0)) // Positive range with negative stride + if ((range == 0) || // Zero range + (range < 0 && stride >= 0) || // Negative range with positive stride + (range > 0 && stride <= 0)) // Positive range with negative stride { output_shape.set(index, 0); return output_shape; @@ -173,9 +188,9 @@ int32_t construct_slice_end_mask(Coordinates ends) { // Create end mask int32_t end_mask = 0; - for(unsigned int i = 0; i < ends.num_dimensions(); ++i) + for (unsigned int i = 0; i < ends.num_dimensions(); ++i) { - if(ends[i] < 0) + if (ends[i] < 0) { end_mask |= 1 << i; } diff --git a/src/core/utils/io/FileHandler.cpp b/src/core/utils/io/FileHandler.cpp index 95fc2e3fa2851fa1df8759c67d75c67aaec94648..d1064932380d664937e0237e5de9d194bc61dabd 100644 --- a/src/core/utils/io/FileHandler.cpp +++ b/src/core/utils/io/FileHandler.cpp @@ -21,16 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
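The tensor_transform helpers above resolve strided-slice coordinates: negative starts and ends wrap around the dimension, a set bit in begin_mask/end_mask replaces the coordinate with the extreme value for the stride direction, shrink_axis_mask collapses a dimension to a single element, and construct_slice_end_mask marks every negative end as "slice to the end". The sketch below (strided_slice_extent_1d, a name invented here) re-derives the output extent of one dimension under the same wrap/clamp rules but without the masks; it is an illustration, not the library code.

#include <algorithm>
#include <cassert>

// Simplified, mask-free version of the per-dimension logic in tensor_transform.cpp.
int strided_slice_extent_1d(int dim_size, int start, int end, int stride)
{
    if (start < 0)
    {
        start += dim_size; // e.g. -7 on a dimension of 10 becomes 3
    }
    if (end < 0)
    {
        end += dim_size;
    }
    start = std::max(0, std::min(start, dim_size - 1));
    end   = std::max(0, std::min(end, dim_size));

    const int range = end - start;
    if (range == 0 || (range < 0 && stride >= 0) || (range > 0 && stride <= 0))
    {
        return 0; // the same degenerate cases the library checks for
    }
    return (range > 0) ? (range + stride - 1) / stride         // positive stride
                       : (-range + (-stride) - 1) / (-stride); // negative stride
}

void strided_slice_extent_example()
{
    assert(strided_slice_extent_1d(10, -7, 9, 2) == 3); // elements 3, 5, 7
    assert(strided_slice_extent_1d(10, 8, 2, -2) == 3); // elements 8, 6, 4
}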
*/ -#include - #include "arm_compute/core/utils/io/FileHandler.h" #include "arm_compute/core/Error.h" +#include + using namespace arm_compute::io; -FileHandler::FileHandler() - : _filestream(), _filename(" "), _mode() +FileHandler::FileHandler() : _filestream(), _filename(" "), _mode() { } diff --git a/src/core/utils/logging/FilePrinter.cpp b/src/core/utils/logging/FilePrinter.cpp index 55e78f9630726095a0087961d7584c97196a8350..7b4eead38db584bac50f09cf73fd3c9047b9693c 100644 --- a/src/core/utils/logging/FilePrinter.cpp +++ b/src/core/utils/logging/FilePrinter.cpp @@ -25,8 +25,7 @@ using namespace arm_compute::logging; -FilePrinter::FilePrinter(const std::string &filename) - : _handler() +FilePrinter::FilePrinter(const std::string &filename) : _handler() { _handler.open(filename, std::fstream::out | std::fstream::trunc); } @@ -34,4 +33,4 @@ FilePrinter::FilePrinter(const std::string &filename) void FilePrinter::print_internal(const std::string &msg) { _handler.stream() << msg << std::endl; -} \ No newline at end of file +} diff --git a/src/core/utils/logging/Helpers.cpp b/src/core/utils/logging/Helpers.cpp index c3df7f6207175ea1bd4936983d7a20c82315ed9c..14ad9105627c2b75d3c3fa0947829c5048237145 100644 --- a/src/core/utils/logging/Helpers.cpp +++ b/src/core/utils/logging/Helpers.cpp @@ -30,13 +30,12 @@ using namespace arm_compute::logging; const std::string &arm_compute::logging::string_from_log_level(LogLevel log_level) { - static std::map log_level_map = - { - { LogLevel::VERBOSE, "VERBOSE" }, - { LogLevel::INFO, "INFO" }, - { LogLevel::WARN, "WARN" }, - { LogLevel::OFF, "OFF" }, + static std::map log_level_map = { + {LogLevel::VERBOSE, "VERBOSE"}, + {LogLevel::INFO, "INFO"}, + {LogLevel::WARN, "WARN"}, + {LogLevel::OFF, "OFF"}, }; return log_level_map[log_level]; -} \ No newline at end of file +} diff --git a/src/core/utils/logging/Logger.cpp b/src/core/utils/logging/Logger.cpp index 70b5868da85ff050988b1e6409ddcdefbe64414f..d6681f817959c6d495da4be6039bcee220f45fc1 100644 --- a/src/core/utils/logging/Logger.cpp +++ b/src/core/utils/logging/Logger.cpp @@ -30,10 +30,7 @@ using namespace arm_compute::logging; Logger::Logger(std::string name, LogLevel log_level, std::shared_ptr printer) - : _name(std::move(name)), _log_level(log_level), _printers( -{ - std::move(printer) -}), _decorators() + : _name(std::move(name)), _log_level(log_level), _printers({std::move(printer)}), _decorators() { // Check printer ARM_COMPUTE_ERROR_ON(printer == nullptr); @@ -46,7 +43,7 @@ Logger::Logger(std::string name, LogLevel log_level, std::vectordecorate(msg); } @@ -148,7 +145,7 @@ std::string Logger::create_log_msg(const std::string &str, LogLevel log_level) void Logger::print_all(const std::string &msg) { - for(auto &p : _printers) + for (auto &p : _printers) { p->print(msg); } diff --git a/src/core/utils/logging/LoggerRegistry.cpp b/src/core/utils/logging/LoggerRegistry.cpp index c281d8863c105604c5d44512b325b556b07de7d7..17015d9ae913ee7ef406e9bdb89e124cd2198fc0 100644 --- a/src/core/utils/logging/LoggerRegistry.cpp +++ b/src/core/utils/logging/LoggerRegistry.cpp @@ -24,15 +24,15 @@ #include "arm_compute/core/utils/logging/LoggerRegistry.h" #include "arm_compute/core/Error.h" + #include "support/Mutex.h" using namespace arm_compute::logging; /** Reserved logger used by the library */ -std::set LoggerRegistry::_reserved_loggers = { "CORE", "RUNTIME", "GRAPH" }; +std::set LoggerRegistry::_reserved_loggers = {"CORE", "RUNTIME", "GRAPH"}; -LoggerRegistry::LoggerRegistry() - : _mtx(), _loggers() 
+LoggerRegistry::LoggerRegistry() : _mtx(), _loggers() { } @@ -42,10 +42,12 @@ LoggerRegistry &LoggerRegistry::get() return _instance; } -void LoggerRegistry::create_logger(const std::string &name, LogLevel log_level, const std::vector> &printers) +void LoggerRegistry::create_logger(const std::string &name, + LogLevel log_level, + const std::vector> &printers) { arm_compute::lock_guard lock(_mtx); - if((_loggers.find(name) == _loggers.end()) && (_reserved_loggers.find(name) == _reserved_loggers.end())) + if ((_loggers.find(name) == _loggers.end()) && (_reserved_loggers.find(name) == _reserved_loggers.end())) { _loggers[name] = std::make_shared(name, log_level, printers); } @@ -54,7 +56,7 @@ void LoggerRegistry::create_logger(const std::string &name, LogLevel log_level, void LoggerRegistry::remove_logger(const std::string &name) { arm_compute::lock_guard lock(_mtx); - if(_loggers.find(name) != _loggers.end()) + if (_loggers.find(name) != _loggers.end()) { _loggers.erase(name); } @@ -69,9 +71,9 @@ std::shared_ptr LoggerRegistry::logger(const std::string &name) void LoggerRegistry::create_reserved_loggers(LogLevel log_level, const std::vector> &printers) { arm_compute::lock_guard lock(_mtx); - for(const auto &r : _reserved_loggers) + for (const auto &r : _reserved_loggers) { - if(_loggers.find(r) == _loggers.end()) + if (_loggers.find(r) == _loggers.end()) { _loggers[r] = std::make_shared(r, log_level, printers); } diff --git a/src/core/utils/misc/MMappedFile.cpp b/src/core/utils/misc/MMappedFile.cpp index adae8a2bf081861976d14d1e8f7a8b9f9fd4b7dc..a467cb3320c62ce079c5c25d39c8c170496ccdbc 100644 --- a/src/core/utils/misc/MMappedFile.cpp +++ b/src/core/utils/misc/MMappedFile.cpp @@ -27,12 +27,11 @@ #include #include -#include - #include #include #include #include +#include #include namespace arm_compute @@ -53,7 +52,7 @@ std::pair get_file_size(const std::string &filename) { struct stat st; // NOLINT memset(&st, 0, sizeof(struct stat)); - if(stat(filename.c_str(), &st) == 0) + if (stat(filename.c_str(), &st) == 0) { return std::make_pair(st.st_size, true); } @@ -73,8 +72,7 @@ size_t get_page_size() } } // namespace -MMappedFile::MMappedFile() - : _filename(), _file_size(0), _map_size(0), _map_offset(0), _fp(nullptr), _data(nullptr) +MMappedFile::MMappedFile() : _filename(), _file_size(0), _map_size(0), _map_offset(0), _fp(nullptr), _data(nullptr) { } @@ -92,14 +90,14 @@ MMappedFile::~MMappedFile() bool MMappedFile::map(const std::string &filename, size_t size, size_t offset) { // Check if file is mapped - if(is_mapped()) + if (is_mapped()) { return false; } // Open file _fp = fopen(filename.c_str(), "a+be"); - if(_fp == nullptr) + if (_fp == nullptr) { return false; } @@ -107,26 +105,26 @@ bool MMappedFile::map(const std::string &filename, size_t size, size_t offset) // Extract file descriptor int fd = fileno(_fp); bool status = fd >= 0; - if(status) + if (status) { // Get file size std::tie(_file_size, status) = get_file_size(_filename); - if(status) + if (status) { // Map all file from offset if map size is 0 _map_size = (size == 0) ? 
_file_size : size; _map_offset = offset; // Check offset mapping - if((_map_offset > _file_size) || (_map_offset % get_page_size() != 0)) + if ((_map_offset > _file_size) || (_map_offset % get_page_size() != 0)) { status = false; } else { // Truncate to file size - if(_map_offset + _map_size > _file_size) + if (_map_offset + _map_size > _file_size) { _map_size = _file_size - _map_offset; } @@ -137,7 +135,7 @@ bool MMappedFile::map(const std::string &filename, size_t size, size_t offset) } } - if(!status) + if (!status) { fclose(_fp); } @@ -148,14 +146,14 @@ bool MMappedFile::map(const std::string &filename, size_t size, size_t offset) void MMappedFile::release() { // Unmap file - if(_data != nullptr) + if (_data != nullptr) { ::munmap(_data, _file_size); _data = nullptr; } // Close file - if(_fp != nullptr) + if (_fp != nullptr) { fclose(_fp); _fp = nullptr; diff --git a/src/core/utils/quantization/AsymmHelpers.cpp b/src/core/utils/quantization/AsymmHelpers.cpp index 086d63b968086278b3c241269d41ca0a546f79c6..f66d3e7064ba26d1810ae0b99a1bd186902c3ef5 100644 --- a/src/core/utils/quantization/AsymmHelpers.cpp +++ b/src/core/utils/quantization/AsymmHelpers.cpp @@ -22,8 +22,10 @@ * SOFTWARE. */ #include "arm_compute/core/utils/quantization/AsymmHelpers.h" + #include "arm_compute/core/Helpers.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/utils/quantization/AsymmHelpers.h" #include "support/ToolchainSupport.h" @@ -40,7 +42,7 @@ constexpr float epsilon = 0.00001f; Status calculate_quantized_multiplier(float multiplier, int32_t *quant_multiplier, int32_t *shift, bool ignore_epsilon) { - if(multiplier >= 1.f) + if (multiplier >= 1.f) { Status status = calculate_quantized_multiplier_greater_than_one(multiplier, quant_multiplier, shift); *shift *= -1; @@ -69,13 +71,13 @@ Status calculate_quantized_multiplier_less_than_one(float multiplier, *right_shift = -1 * shift_exp; auto q_fixed = static_cast(support::cpp11::round(q * fixed_point_one_Q0)); ARM_COMPUTE_RETURN_ERROR_ON(q_fixed > fixed_point_one_Q0); - if(q_fixed == fixed_point_one_Q0) + if (q_fixed == fixed_point_one_Q0) { q_fixed /= 2; --*right_shift; } - if(ignore_epsilon && *right_shift > 31) + if (ignore_epsilon && *right_shift > 31) { *right_shift = 0; q_fixed = 0; @@ -88,9 +90,8 @@ Status calculate_quantized_multiplier_less_than_one(float multiplier, return Status{}; } -Status calculate_quantized_multiplier_greater_than_one(float multiplier, - int32_t *quantized_multiplier, - int32_t *left_shift) +Status +calculate_quantized_multiplier_greater_than_one(float multiplier, int32_t *quantized_multiplier, int32_t *left_shift) { ARM_COMPUTE_RETURN_ERROR_ON(quantized_multiplier == nullptr); ARM_COMPUTE_RETURN_ERROR_ON(left_shift == nullptr); @@ -101,7 +102,7 @@ Status calculate_quantized_multiplier_greater_than_one(float multiplier, *left_shift = shift_exp; auto q_fixed = static_cast(support::cpp11::round(q * fixed_point_one_Q0)); ARM_COMPUTE_RETURN_ERROR_ON(q_fixed > fixed_point_one_Q0); - if(q_fixed == fixed_point_one_Q0) + if (q_fixed == fixed_point_one_Q0) { q_fixed /= 2; ++*left_shift; @@ -113,9 +114,9 @@ Status calculate_quantized_multiplier_greater_than_one(float multiplier, return Status{}; } -arm_compute::Status calculate_quantized_multipliers(const QuantizationInfo &iq_info, - const QuantizationInfo &wq_info, - const QuantizationInfo &oq_info, +arm_compute::Status calculate_quantized_multipliers(const QuantizationInfo &iq_info, + const QuantizationInfo &wq_info, + const QuantizationInfo &oq_info, 
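MMappedFile::map above rejects offsets that are not multiples of the page size, because mmap itself only accepts page-aligned file offsets, and it truncates the mapping so it never extends past the end of the file. The helper below is a hypothetical caller-side adjustment, not something this patch adds: round a requested byte offset down to a page boundary and keep the remainder to skip inside the mapping.

#include <cstddef>

#include <unistd.h> // sysconf(_SC_PAGESIZE)

struct AlignedOffset
{
    std::size_t aligned;   // page-aligned offset to pass to map()
    std::size_t remainder; // bytes to skip inside the mapped region
};

inline AlignedOffset align_to_page(std::size_t wanted_offset)
{
    const std::size_t page    = static_cast<std::size_t>(sysconf(_SC_PAGESIZE));
    const std::size_t aligned = (wanted_offset / page) * page;
    return {aligned, wanted_offset - aligned};
}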
GEMMLowpOutputStageInfo &stage_info) { ARM_COMPUTE_RETURN_ERROR_ON(iq_info.scale().empty()); @@ -133,7 +134,7 @@ arm_compute::Status calculate_quantized_multipliers(const QuantizationInfo &iq_i const float i_scale = iq_info.scale().at(0); const float o_scale = oq_info.scale().at(0); - for(unsigned int i = 0; i < size; ++i) + for (unsigned int i = 0; i < size; ++i) { const float multiplier = i_scale * w_scales[i] / o_scale; int32_t quant_multiplier = 0; @@ -154,7 +155,7 @@ std::pair get_min_max_values_from_quantized_data_type(DataType data_ty { int min_quant_val = 0; int max_quant_val = 0; - switch(data_type) + switch (data_type) { case DataType::QASYMM8: min_quant_val = std::numeric_limits::min(); @@ -179,7 +180,9 @@ std::pair get_min_max_values_from_quantized_data_type(DataType data_ty return std::make_pair(min_quant_val, max_quant_val); } -std::tuple get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info, const ActivationLayerInfo &act_info, DataType data_type) +std::tuple get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info, + const ActivationLayerInfo &act_info, + DataType data_type) { ARM_COMPUTE_ERROR_ON(data_type != DataType::QASYMM8 && data_type != DataType::QASYMM8_SIGNED); @@ -190,20 +193,23 @@ std::tuple get_quantized_asymmetric_output_min_max(const Quant const UniformQuantizationInfo q_unif = q_info.uniform(); - if(act_info.enabled()) + if (act_info.enabled()) { - switch(act_info.activation()) + switch (act_info.activation()) { case ActivationLayerInfo::ActivationFunction::RELU: type_min = q_unif.offset; break; case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: type_min = q_unif.offset; - type_max = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.a(), q_info) : quantize_qasymm8_signed(act_info.a(), q_info); + type_max = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.a(), q_info) + : quantize_qasymm8_signed(act_info.a(), q_info); break; case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - type_min = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.b(), q_info) : quantize_qasymm8_signed(act_info.b(), q_info); - type_max = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.a(), q_info) : quantize_qasymm8_signed(act_info.a(), q_info); + type_min = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.b(), q_info) + : quantize_qasymm8_signed(act_info.b(), q_info); + type_max = (data_type == DataType::QASYMM8) ? 
quantize_qasymm8(act_info.a(), q_info) + : quantize_qasymm8_signed(act_info.a(), q_info); break; default: ARM_COMPUTE_ERROR("Activation function not supported."); @@ -226,7 +232,7 @@ void compute_quantized_multipliers_and_shifts(const ITensorInfo *input, const unsigned int num_filters = wq_info.scale().size(); - for(unsigned int i = 0; i < num_filters; ++i) + for (unsigned int i = 0; i < num_filters; ++i) { int32_t output_multiplier = 0; int32_t output_shift = 0; @@ -267,11 +273,11 @@ int32_t multiply_by_quantized_multiplier(int32_t input, int32_t qmul, int32_t sh int32_t saturating_rounding_multiply_by_pow2(int32_t exponent, int32_t v) { - if(exponent == 0) + if (exponent == 0) { return v; } - else if(exponent < 0) + else if (exponent < 0) { return rounding_divide_by_pow2(v, -exponent); } @@ -291,11 +297,14 @@ int32_t saturating_rounding_multiply_by_pow2(int32_t exponent, int32_t v) } } -void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift, int32_t &output_inv_sqrt, int32_t &output_shift) +void get_invsqrt_quantized_multiplier_exp(int32_t input, + int32_t reverse_shift, + int32_t &output_inv_sqrt, + int32_t &output_shift) { ARM_COMPUTE_ERROR_ON(input < 0); - if(input <= 1) + if (input <= 1) { // dealing the inputs (0 and 1) separately to avoid overflow output_inv_sqrt = std::numeric_limits::max(); @@ -305,7 +314,7 @@ void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift, // prepare input for fixed point operation and compute shift value output_shift = 11; - while(input >= (1 << 29)) + while (input >= (1 << 29)) { input /= 4; ++output_shift; @@ -334,9 +343,7 @@ void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift, // multiplication of two fixed point numbers, defined for readability auto fixed_point_mul = [](FixedPointRawType a, FixedPointRawType b) -> FixedPointRawType - { - return saturating_rounding_doubling_highmul(a, b); - }; + { return saturating_rounding_doubling_highmul(a, b); }; // rescaling of fixed point to have dst_bit integer bits, defined for readability auto fixed_point_rescale = [](FixedPointRawType a, uint32_t src_bit, uint32_t dst_bit) -> FixedPointRawType @@ -347,17 +354,18 @@ void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift, // 5 iterations of Newton-Raphson method for inverse square root - 1.5 * x_n = input/2 * (x_n)^3 constexpr int32_t num_iteration = 5; - for(int32_t i = 0; i < num_iteration; ++i) + for (int32_t i = 0; i < num_iteration; ++i) { const auto x3 = fixed_point_rescale(fixed_point_mul(fixed_point_mul(x, x), x), 9, fixedpoint_position); - x = fixed_point_rescale(fixed_point_mul(fixedpoint_half_three, x) - fixed_point_mul(fixedpoint_half_input, x3), 6, fixedpoint_position); + x = fixed_point_rescale(fixed_point_mul(fixedpoint_half_three, x) - fixed_point_mul(fixedpoint_half_input, x3), + 6, fixedpoint_position); } // fixed point representation of sqrt(1/2) const FixedPoint0 fixedpoint_half_sqrt_2 = 1518500250; x = fixed_point_mul(fixedpoint_half_sqrt_2, x); output_inv_sqrt = x; - if(output_shift < 0) + if (output_shift < 0) { output_inv_sqrt <<= -output_shift; output_shift = 0; @@ -365,5 +373,5 @@ void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift, // convert right shift to left shift output_shift *= reverse_shift; } -} // quantization -} // arm_compute +} // namespace quantization +} // namespace arm_compute diff --git a/src/core/utils/quantization/AsymmHelpers.h b/src/core/utils/quantization/AsymmHelpers.h index 
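calculate_quantized_multiplier above splits a real-valued rescale factor into a 32-bit fixed-point multiplier and a power-of-two shift, so the integer kernels can rescale accumulators without floating point; the "greater than one" variant simply flips the shift direction. The q/shift_exp pair visible in the hunk comes from std::frexp in the surrounding code. The sketch below (decompose_multiplier, a name invented here) reproduces the core of the "less than one" case without the epsilon handling or the q == 2^31 saturation the library performs, so that multiplier ~= (qm / 2^31) * 2^-right_shift.

#include <cmath>
#include <cstdint>
#include <cstdio>

// Illustration only: frexp-based decomposition of a multiplier in (0, 1).
void decompose_multiplier(float multiplier, std::int32_t &qm, std::int32_t &right_shift)
{
    int          shift_exp = 0;
    const double q         = std::frexp(multiplier, &shift_exp); // q in [0.5, 1), multiplier = q * 2^shift_exp
    right_shift            = -shift_exp;                         // >= 0 for multiplier < 1
    qm                     = static_cast<std::int32_t>(std::lround(q * (1ll << 31)));
}

int main()
{
    std::int32_t qm = 0, right_shift = 0;
    decompose_multiplier(0.0015f, qm, right_shift); // e.g. input_scale * weight_scale / output_scale
    const double back = (static_cast<double>(qm) / (1ll << 31)) * std::ldexp(1.0, -right_shift);
    std::printf("qm=%d right_shift=%d reconstructed=%.10f\n", qm, right_shift, back);
    return 0;
}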
f9701095cb8e6910d5ba5adc808b82941f2eb54d..5dc607ce58d6cc7620a067fb8d0229b7891411f5 100644 --- a/src/core/utils/quantization/AsymmHelpers.h +++ b/src/core/utils/quantization/AsymmHelpers.h @@ -29,7 +29,8 @@ namespace arm_compute { -namespace quantization { +namespace quantization +{ /** Get minimum and maximum output of the activation function after quantization. * @@ -41,7 +42,9 @@ namespace quantization { * * @return The minimum and maximum output of the activation function after quantization. */ -std::tuple get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info, const ActivationLayerInfo &act_info, DataType data_type); +std::tuple get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info, + const ActivationLayerInfo &act_info, + DataType data_type); } // namespace quantization } // namespace arm_compute diff --git a/src/cpu/CpuContext.cpp b/src/cpu/CpuContext.cpp index 7c14891ef801ffc86dca8d7acb38ea8f232d7e71..b745af8229305c8adc4b3297215b5aab21cac80a 100644 --- a/src/cpu/CpuContext.cpp +++ b/src/cpu/CpuContext.cpp @@ -24,6 +24,7 @@ #include "src/cpu/CpuContext.h" #include "arm_compute/core/CPP/CPPTypes.h" + #include "src/cpu/CpuQueue.h" #include "src/cpu/CpuTensor.h" @@ -32,7 +33,7 @@ #include #if defined(_WIN64) -#define posix_memalign _aligned_realloc +#define posix_memalign _aligned_realloc #define posix_memalign_free _aligned_free #endif // defined(_WIN64) #endif // !defined(__APPLE__) && !defined(__OpenBSD__) @@ -66,7 +67,7 @@ void *default_aligned_allocate(void *user_data, size_t size, size_t alignment) size_t real_size = (rem) ? (size + alignment - rem) : size; ptr = memalign(alignment, real_size); #else /* defined(BARE_METAL) */ - if(posix_memalign(&ptr, alignment, size) != 0) + if (posix_memalign(&ptr, alignment, size) != 0) { // posix_memalign returns non-zero on failures, the return values will be // - EINVAL: wrong alignment @@ -81,17 +82,13 @@ void default_aligned_free(void *user_data, void *ptr) ARM_COMPUTE_UNUSED(user_data); free(ptr); } -static AclAllocator default_allocator = { &default_allocate, - &default_free, - &default_aligned_allocate, - &default_aligned_free, - nullptr - }; +static AclAllocator default_allocator = {&default_allocate, &default_free, &default_aligned_allocate, + &default_aligned_free, nullptr}; AllocatorWrapper populate_allocator(AclAllocator *external_allocator) { bool is_valid = (external_allocator != nullptr); - if(is_valid) + if (is_valid) { is_valid = is_valid && (external_allocator->alloc != nullptr); is_valid = is_valid && (external_allocator->free != nullptr); @@ -123,14 +120,13 @@ cpuinfo::CpuIsaInfo populate_capabilities_flags(AclTargetCapabilities external_c return isa_caps; } -CpuCapabilities populate_capabilities(AclTargetCapabilities external_caps, - int32_t max_threads) +CpuCapabilities populate_capabilities(AclTargetCapabilities external_caps, int32_t max_threads) { CpuCapabilities caps; // Populate capabilities with system information caps.cpu_info = cpuinfo::CpuInfo::build(); - if(external_caps != AclCpuCapabilitiesAuto) + if (external_caps != AclCpuCapabilitiesAuto) { cpuinfo::CpuIsaInfo isa = populate_capabilities_flags(external_caps); auto cpus = caps.cpu_info.cpus(); @@ -151,11 +147,9 @@ CpuCapabilities populate_capabilities(AclTargetCapabilities external_caps, } // namespace CpuContext::CpuContext(const AclContextOptions *options) - : IContext(Target::Cpu), - _allocator(default_allocator), - _caps(populate_capabilities(AclCpuCapabilitiesAuto, -1)) + : IContext(Target::Cpu), 
_allocator(default_allocator), _caps(populate_capabilities(AclCpuCapabilitiesAuto, -1)) { - if(options != nullptr) + if (options != nullptr) { _allocator = populate_allocator(options->allocator); _caps = populate_capabilities(options->capabilities, options->max_compute_units); @@ -175,7 +169,7 @@ AllocatorWrapper &CpuContext::allocator() ITensorV2 *CpuContext::create_tensor(const AclTensorDescriptor &desc, bool allocate) { CpuTensor *tensor = new CpuTensor(this, desc); - if(tensor != nullptr && allocate) + if (tensor != nullptr && allocate) { tensor->allocate(); } diff --git a/src/cpu/CpuContext.h b/src/cpu/CpuContext.h index da241ed0974d1b625903625b2005a1748e4f11ef..0c8ae49f49e07b909255b84884654d36ddf82aa7 100644 --- a/src/cpu/CpuContext.h +++ b/src/cpu/CpuContext.h @@ -25,8 +25,8 @@ #define SRC_CPU_CPUCONTEXT_H #include "src/common/AllocatorWrapper.h" -#include "src/common/IContext.h" #include "src/common/cpuinfo/CpuInfo.h" +#include "src/common/IContext.h" namespace arm_compute { @@ -36,7 +36,7 @@ namespace cpu struct CpuCapabilities { cpuinfo::CpuInfo cpu_info{}; - int32_t max_threads{ -1 }; + int32_t max_threads{-1}; }; /** CPU context implementation class */ @@ -60,9 +60,9 @@ public: AllocatorWrapper &allocator(); // Inherrited methods overridden - ITensorV2 *create_tensor(const AclTensorDescriptor &desc, bool allocate) override; - IQueue *create_queue(const AclQueueOptions *options) override; - std::tuple create_activation(const AclTensorDescriptor &src, + ITensorV2 *create_tensor(const AclTensorDescriptor &desc, bool allocate) override; + IQueue *create_queue(const AclQueueOptions *options) override; + std::tuple create_activation(const AclTensorDescriptor &src, const AclTensorDescriptor &dst, const AclActivationDescriptor &act, bool is_validate) override; @@ -74,4 +74,4 @@ private: } // namespace cpu } // namespace arm_compute -#endif /* SRC_CPU_CPUCONTEXT_H */ \ No newline at end of file +#endif /* SRC_CPU_CPUCONTEXT_H */ diff --git a/src/cpu/CpuQueue.cpp b/src/cpu/CpuQueue.cpp index 0f0097b3f4e8e9e55478ed96fc5abd0003713441..be781d67945807e7ef61c4e87748ea9c8e60c599 100644 --- a/src/cpu/CpuQueue.cpp +++ b/src/cpu/CpuQueue.cpp @@ -29,8 +29,7 @@ namespace arm_compute { namespace cpu { -CpuQueue::CpuQueue(IContext *ctx, const AclQueueOptions *options) - : IQueue(ctx) +CpuQueue::CpuQueue(IContext *ctx, const AclQueueOptions *options) : IQueue(ctx) { ARM_COMPUTE_UNUSED(options); } diff --git a/src/cpu/CpuQueue.h b/src/cpu/CpuQueue.h index 871a36c85bbe3fa3c5316d80fc06d4ea9bdcc9fd..b6a2be0e23e2c71d78173e38efde81a6208cc5aa 100644 --- a/src/cpu/CpuQueue.h +++ b/src/cpu/CpuQueue.h @@ -24,10 +24,10 @@ #ifndef SRC_CPU_CPUQUEUE_H #define SRC_CPU_CPUQUEUE_H -#include "src/common/IQueue.h" - #include "arm_compute/runtime/IScheduler.h" +#include "src/common/IQueue.h" + namespace arm_compute { namespace cpu diff --git a/src/cpu/CpuTensor.cpp b/src/cpu/CpuTensor.cpp index 6dd6d9c31b4b0b0912ef3e3fd9cd28d250cd503e..59082b5350908aa48d1c6211c75540456837d59f 100644 --- a/src/cpu/CpuTensor.cpp +++ b/src/cpu/CpuTensor.cpp @@ -29,8 +29,7 @@ namespace arm_compute { namespace cpu { -CpuTensor::CpuTensor(IContext *ctx, const AclTensorDescriptor &desc) - : ITensorV2(ctx), _legacy_tensor() +CpuTensor::CpuTensor(IContext *ctx, const AclTensorDescriptor &desc) : ITensorV2(ctx), _legacy_tensor() { ARM_COMPUTE_ASSERT((ctx != nullptr) && (ctx->type() == Target::Cpu)); _legacy_tensor = std::make_unique(); @@ -41,7 +40,7 @@ void *CpuTensor::map() { ARM_COMPUTE_ASSERT(_legacy_tensor.get() != nullptr); - 
if(_legacy_tensor == nullptr) + if (_legacy_tensor == nullptr) { ARM_COMPUTE_LOG_ERROR_ACL("[CpuTensor:map]: Backing tensor does not exist!"); return nullptr; diff --git a/src/cpu/CpuTensor.h b/src/cpu/CpuTensor.h index b078774c99079de959c7cae6e6046288fffda511..89931e1f94be5ef8fb4fc19fee98504a17aa3e1f 100644 --- a/src/cpu/CpuTensor.h +++ b/src/cpu/CpuTensor.h @@ -24,10 +24,10 @@ #ifndef SRC_CPU_CPUTENSOR_H #define SRC_CPU_CPUTENSOR_H -#include "src/common/ITensorV2.h" - #include "arm_compute/runtime/Tensor.h" +#include "src/common/ITensorV2.h" + namespace arm_compute { namespace cpu @@ -52,7 +52,7 @@ public: void *map() override; StatusCode unmap() override; arm_compute::ITensor *tensor() const override; - StatusCode import(void *handle, ImportMemoryType type) override; + StatusCode import(void *handle, ImportMemoryType type) override; private: std::unique_ptr _legacy_tensor; @@ -60,4 +60,4 @@ private: } // namespace cpu } // namespace arm_compute -#endif /* SRC_CPU_CPUTENSOR_H */ \ No newline at end of file +#endif /* SRC_CPU_CPUTENSOR_H */ diff --git a/src/cpu/CpuTypes.h b/src/cpu/CpuTypes.h index 0f7b9b655278dca9abcb7f083d32c57bea916e7d..8726bc470a2a94f917cc31d0c29f8969651bb802 100644 --- a/src/cpu/CpuTypes.h +++ b/src/cpu/CpuTypes.h @@ -31,6 +31,6 @@ namespace arm_compute typedef __fp16 float16_t; #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC typedef float float32_t; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_CPUTYPES */ diff --git a/src/cpu/ICpuKernel.h b/src/cpu/ICpuKernel.h index 8f4106240d37bf0b551055eb9293aeb4015b1d13..bcd0cb2c70f8d7cf79c33cee6de0f7ad6f08851c 100644 --- a/src/cpu/ICpuKernel.h +++ b/src/cpu/ICpuKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_ICPUKERNEL_H #include "arm_compute/core/CPP/ICPPKernel.h" + #include "src/cpu/kernels/CpuKernelSelectionTypes.h" namespace arm_compute @@ -34,7 +35,7 @@ namespace cpu enum class KernelSelectionType { Preferred, /**< Retrieve the best implementation available for the given Cpu ISA, ignoring the build flags */ - Supported /**< Retrieve the best implementation available for the given Cpu ISA that is supported by the current build */ + Supported /**< Retrieve the best implementation available for the given Cpu ISA that is supported by the current build */ }; template @@ -50,13 +51,15 @@ public: */ template - static const auto *get_implementation(const SelectorType &selector, KernelSelectionType selection_type = KernelSelectionType::Supported) + static const auto *get_implementation(const SelectorType &selector, + KernelSelectionType selection_type = KernelSelectionType::Supported) { - using kernel_type = typename std::remove_reference::type::value_type; + using kernel_type = + typename std::remove_reference::type::value_type; - for(const auto &uk : Derived::get_available_kernels()) + for (const auto &uk : Derived::get_available_kernels()) { - if(uk.is_selected(selector) && (selection_type == KernelSelectionType::Preferred || uk.ukernel != nullptr)) + if (uk.is_selected(selector) && (selection_type == KernelSelectionType::Preferred || uk.ukernel != nullptr)) { return &uk; } diff --git a/src/cpu/kernels/CpuActivationKernel.cpp b/src/cpu/kernels/CpuActivationKernel.cpp index f4bd4e6cadc59aa5934792f71e31ded7532ff69c..50bf672d3c61d2f1e54d9f1743e58b9cbb5d1c88 100644 --- a/src/cpu/kernels/CpuActivationKernel.cpp +++ b/src/cpu/kernels/CpuActivationKernel.cpp @@ -26,11 +26,11 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" + +#include 
"src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - -#include "src/core/common/Registrars.h" #include "src/cpu/kernels/activation/list.h" #include @@ -43,126 +43,126 @@ namespace kernels { namespace { -static const std::vector available_kernels = -{ +static const std::vector available_kernels = { #ifdef ARM_COMPUTE_ENABLE_SVE - { - "sve2_q8_activation_lut", - [](const ActivationDataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && data.cpumodel == CPUModel::A510 && data.isa.sve2; }, - REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_q8_activation_lut) - }, + {"sve2_q8_activation_lut", + [](const ActivationDataTypeISASelectorData &data) + { + return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && + data.cpumodel == CPUModel::A510 && data.isa.sve2; + }, + REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_q8_activation_lut)}, #endif // ARM_COMPUTE_ENABLE_SVE #ifdef __aarch64__ - { - // Neon LUT implementantion takes precedence - "neon_q8_activation_lut", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_Q8_NEON(arm_compute::cpu::neon_q8_activation_lut) - }, + {// Neon LUT implementantion takes precedence + "neon_q8_activation_lut", + [](const ActivationDataTypeISASelectorData &data) + { return data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED; }, + REGISTER_Q8_NEON(arm_compute::cpu::neon_q8_activation_lut)}, #endif // __aarch64__ - { - "sve2_qu8_activation", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8 && data.isa.sve2 && data.f != ActivationLayerInfo::ActivationFunction::GELU; }, - REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_qasymm8_activation) - }, - { - "sve2_qs8_activation", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && data.f != ActivationLayerInfo::ActivationFunction::GELU; }, - REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_qasymm8_signed_activation) - }, - { - "sve2_qs16_activation", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QSYMM16 && data.isa.sve2 && data.f != ActivationLayerInfo::ActivationFunction::GELU; }, - REGISTER_QSYMM16_SVE2(arm_compute::cpu::sve2_qsymm16_activation) - }, - { - "sve_fp16_activation", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && data.f != ActivationLayerInfo::ActivationFunction::GELU; }, - REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation) - }, - { - "sve_fp32_activation", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::F32 && data.isa.sve && data.f != ActivationLayerInfo::ActivationFunction::GELU; }, - REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_activation) - }, - { - "neon_fp16_activation", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_activation) - }, - { - "neon_fp32_activation", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_activation) - }, - { - "neon_qu8_activation", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8; }, - 
REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_activation) - }, - { - "neon_qs8_activation", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_activation) - }, - { - "neon_qs16_activation", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QSYMM16; }, - REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qsymm16_activation) - }, + {"sve2_qu8_activation", + [](const ActivationDataTypeISASelectorData &data) { + return data.dt == DataType::QASYMM8 && data.isa.sve2 && + data.f != ActivationLayerInfo::ActivationFunction::GELU; + }, + REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_qasymm8_activation)}, + {"sve2_qs8_activation", + [](const ActivationDataTypeISASelectorData &data) + { + return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && + data.f != ActivationLayerInfo::ActivationFunction::GELU; + }, + REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_qasymm8_signed_activation)}, + {"sve2_qs16_activation", + [](const ActivationDataTypeISASelectorData &data) { + return data.dt == DataType::QSYMM16 && data.isa.sve2 && + data.f != ActivationLayerInfo::ActivationFunction::GELU; + }, + REGISTER_QSYMM16_SVE2(arm_compute::cpu::sve2_qsymm16_activation)}, + {"sve_fp16_activation", + [](const ActivationDataTypeISASelectorData &data) + { + return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && + data.f != ActivationLayerInfo::ActivationFunction::GELU; + }, + REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation)}, + {"sve_fp32_activation", + [](const ActivationDataTypeISASelectorData &data) + { return data.dt == DataType::F32 && data.isa.sve && data.f != ActivationLayerInfo::ActivationFunction::GELU; }, + REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_activation)}, + {"neon_fp16_activation", + [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_activation)}, + {"neon_fp32_activation", [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_activation)}, + {"neon_qu8_activation", [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_activation)}, + {"neon_qs8_activation", + [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_activation)}, + {"neon_qs16_activation", [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::QSYMM16; }, + REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qsymm16_activation)}, }; /* Supported activation in the 8-bit integer domain */ -static const std::array qasymm8_activations = -{ - ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LOGISTIC, - ActivationLayerInfo::ActivationFunction::TANH, - ActivationLayerInfo::ActivationFunction::HARD_SWISH, - ActivationLayerInfo::ActivationFunction::LEAKY_RELU, - ActivationLayerInfo::ActivationFunction::GELU, +static const std::array qasymm8_activations = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + 
ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, ActivationLayerInfo::ActivationFunction::LOGISTIC, + ActivationLayerInfo::ActivationFunction::TANH, ActivationLayerInfo::ActivationFunction::HARD_SWISH, + ActivationLayerInfo::ActivationFunction::LEAKY_RELU, ActivationLayerInfo::ActivationFunction::GELU, }; /* Supported activation in the 16-bit integer domain */ -static const std::array qsymm16_activations = -{ - ActivationLayerInfo::ActivationFunction::LOGISTIC, - ActivationLayerInfo::ActivationFunction::TANH, - ActivationLayerInfo::ActivationFunction::HARD_SWISH, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU -}; +static const std::array qsymm16_activations = { + ActivationLayerInfo::ActivationFunction::LOGISTIC, ActivationLayerInfo::ActivationFunction::TANH, + ActivationLayerInfo::ActivationFunction::HARD_SWISH, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &activation_info) { ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::QSYMM16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::QSYMM16, DataType::F16, DataType::F32); - const auto *uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation() }); + const auto *uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{ + src->data_type(), CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - const DataType data_type = src->data_type(); - const QuantizationInfo &oq_info = (dst != nullptr) ? dst->quantization_info() : src->quantization_info(); - const ActivationLayerInfo::ActivationFunction f_act = activation_info.activation(); + const DataType data_type = src->data_type(); + const QuantizationInfo &oq_info = (dst != nullptr) ? 
dst->quantization_info() : src->quantization_info(); + const ActivationLayerInfo::ActivationFunction f_act = activation_info.activation(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_asymmetric(data_type) && (std::find(std::begin(qasymm8_activations), std::end(qasymm8_activations), f_act) == std::end(qasymm8_activations)), - "For QASYMM8 only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + is_data_type_quantized_asymmetric(data_type) && + (std::find(std::begin(qasymm8_activations), std::end(qasymm8_activations), f_act) == + std::end(qasymm8_activations)), + "For QASYMM8 only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_symmetric(data_type) && (std::find(std::begin(qsymm16_activations), std::end(qsymm16_activations), f_act) == std::end(qsymm16_activations)), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_symmetric(data_type) && + (std::find(std::begin(qsymm16_activations), std::end(qsymm16_activations), + f_act) == std::end(qsymm16_activations)), "For QSYMM16 only tanh and logistic are supported"); - ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) - && (oq_info != QuantizationInfo(1.f / 128.f, 128))); - ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - && (oq_info != QuantizationInfo(1.f / 256.f, 0))); - - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 0))); - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, -128))); - - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 32768.f, 0))); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 32768.f, 0))); + ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && + (f_act == ActivationLayerInfo::ActivationFunction::TANH) && + (oq_info != QuantizationInfo(1.f / 128.f, 128))); + ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && + (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && + (oq_info != QuantizationInfo(1.f / 256.f, 0))); + + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && + (f_act == ActivationLayerInfo::ActivationFunction::TANH) && + (oq_info != QuantizationInfo(1.f / 128.f, 0))); + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && + (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && + (oq_info != QuantizationInfo(1.f / 256.f, -128))); + + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && + (f_act == ActivationLayerInfo::ActivationFunction::TANH) && + (oq_info != QuantizationInfo(1.f / 32768.f, 0))); + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && + (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && + (oq_info != 
QuantizationInfo(1.f / 32768.f, 0))); // Checks performed when dst is configured - if((dst != nullptr) && (dst->total_size() != 0)) + if ((dst != nullptr) && (dst->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); @@ -176,7 +176,7 @@ std::pair validate_and_configure_window(const ITensorInfo *src, // Configure kernel window Window win = calculate_max_window(*src, Steps()); - if(dst != nullptr) + if (dst != nullptr) { // dst auto inizialitation if not yet initialized auto_init_if_empty(*dst, *src->clone()); @@ -185,14 +185,19 @@ std::pair validate_and_configure_window(const ITensorInfo *src, return std::make_pair(Status{}, win); } #ifdef __aarch64__ -void init_lut(ActivationLayerInfo::ActivationFunction act_func, DataType data_type, - const UniformQuantizationInfo &qi_in, const UniformQuantizationInfo &qi_out, - ActivationLayerInfo::LookupTable256 &lut, float a, float b) +void init_lut(ActivationLayerInfo::ActivationFunction act_func, + DataType data_type, + const UniformQuantizationInfo &qi_in, + const UniformQuantizationInfo &qi_out, + ActivationLayerInfo::LookupTable256 &lut, + float a, + float b) { - for(size_t i = 0; i < lut.size(); ++i) + for (size_t i = 0; i < lut.size(); ++i) { - float tmp_f = (data_type == DataType::QASYMM8) ? dequantize_qasymm8(i, qi_in) : dequantize_qasymm8_signed(i, qi_in); - switch(act_func) + float tmp_f = + (data_type == DataType::QASYMM8) ? dequantize_qasymm8(i, qi_in) : dequantize_qasymm8_signed(i, qi_in); + switch (act_func) { case ActivationLayerInfo::ActivationFunction::HARD_SWISH: tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); @@ -246,7 +251,8 @@ void init_lut(ActivationLayerInfo::ActivationFunction act_func, DataType data_ty tmp_f = 0; break; } - lut[i] = (data_type == DataType::QASYMM8) ? quantize_qasymm8(tmp_f, qi_out) : quantize_qasymm8_signed(tmp_f, qi_out); + lut[i] = + (data_type == DataType::QASYMM8) ? quantize_qasymm8(tmp_f, qi_out) : quantize_qasymm8_signed(tmp_f, qi_out); } } #endif // __aarch64__ @@ -258,8 +264,9 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, activation_info)); - const auto uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation() }); - if(dst != nullptr) + const auto uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{ + src->data_type(), CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation()}); + if (dst != nullptr) { // dst auto inizialitation if not yet initialized auto_init_if_empty(*dst, *src->clone()); @@ -271,11 +278,12 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac _name = std::string("CpuActivationKernel").append("/").append(uk->name); #ifdef __aarch64__ - if(src->data_type() == DataType::QASYMM8 || src->data_type() == DataType::QASYMM8_SIGNED) + if (src->data_type() == DataType::QASYMM8 || src->data_type() == DataType::QASYMM8_SIGNED) { ActivationLayerInfo::LookupTable256 tmp_lut; - init_lut(activation_info.activation(), src->data_type(), src->quantization_info().uniform(), (dst) ? 
dst->quantization_info().uniform() : src->quantization_info().uniform(), - tmp_lut, activation_info.a(), activation_info.b()); + init_lut(activation_info.activation(), src->data_type(), src->quantization_info().uniform(), + (dst) ? dst->quantization_info().uniform() : src->quantization_info().uniform(), tmp_lut, + activation_info.a(), activation_info.b()); activation_info.setLookupTable256(tmp_lut); } #endif // __aarch64__ @@ -288,11 +296,13 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac ICPPKernel::configure(win); } -Status CpuActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status +CpuActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, act_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), (dst != nullptr) ? dst->clone().get() : nullptr).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(src->clone().get(), (dst != nullptr) ? dst->clone().get() : nullptr).first); return Status{}; } @@ -302,7 +312,7 @@ size_t CpuActivationKernel::get_mws(const CPUInfo &platform, size_t thread_count ARM_COMPUTE_UNUSED(thread_count); ARM_COMPUTE_UNUSED(platform); - if(_split_dimension == Window::DimX) + if (_split_dimension == Window::DimX) { // Don't split the work load too small if the tensor has been reinterpreted as 1D. // This number is loosely chosen as threading overhead in each platform varies wildly. @@ -314,7 +324,7 @@ size_t CpuActivationKernel::get_mws(const CPUInfo &platform, size_t thread_count void CpuActivationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) { // Early exit on disabled activation - if(!_act_info.enabled()) + if (!_act_info.enabled()) { return; } diff --git a/src/cpu/kernels/CpuActivationKernel.h b/src/cpu/kernels/CpuActivationKernel.h index 804407653f36a5de1a6744387b5e60d1666b89a5..4bad9fb3e8175a0935348559a5fdad2b50dae954 100644 --- a/src/cpu/kernels/CpuActivationKernel.h +++ b/src/cpu/kernels/CpuActivationKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -38,7 +39,8 @@ namespace kernels class CpuActivationKernel : public ICpuKernel { private: - using ActivationKernelPtr = std::add_pointer::type; + using ActivationKernelPtr = + std::add_pointer::type; public: CpuActivationKernel() = default; @@ -71,7 +73,7 @@ public: size_t get_mws(const CPUInfo &platform, size_t thread_count) const override; // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; /** Get the preferred dimension in which the scheduler splits the work into multiple jobs. 
@@ -94,8 +96,8 @@ public: private: ActivationLayerInfo _act_info{}; - ActivationKernelPtr _run_method{ nullptr }; - size_t _split_dimension{ Window::DimY }; + ActivationKernelPtr _run_method{nullptr}; + size_t _split_dimension{Window::DimY}; std::string _name{}; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuAddKernel.cpp b/src/cpu/kernels/CpuAddKernel.cpp index 2983575cb6dd44db9ba157de98d778bf5687d5d8..a990aa4715185d807ca9c14e8ebb2097544cdc45 100644 --- a/src/cpu/kernels/CpuAddKernel.cpp +++ b/src/cpu/kernels/CpuAddKernel.cpp @@ -26,19 +26,21 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/add/list.h" + #include #if defined(ENABLE_FP32_KERNELS) namespace { - static constexpr size_t default_mws_N1_fp32_neon = 24536; - static constexpr size_t default_mws_V1_fp32_neon = 40510; -} +static constexpr size_t default_mws_N1_fp32_neon = 24536; +static constexpr size_t default_mws_V1_fp32_neon = 40510; +} // namespace #endif /* ENABLE_FP32_KERNELS */ namespace arm_compute @@ -49,152 +51,82 @@ namespace kernels { namespace { -static const std::vector available_kernels = -{ - { - "neon_qu8_add_fixedpoint", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::QASYMM8) && data.can_use_fixedpoint; - }, - REGISTER_FP32_NEON(arm_compute::cpu::add_q8_neon_fixedpoint) - }, - { - "neon_qs8_add_fixedpoint", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::QASYMM8_SIGNED) && data.can_use_fixedpoint; - }, - REGISTER_FP32_NEON(arm_compute::cpu::add_q8_neon_fixedpoint) - }, - { - "sve2_qu8_add", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::QASYMM8) && data.isa.sve2; - }, - REGISTER_QASYMM8_SVE2(arm_compute::cpu::add_qasymm8_sve2) - }, - { - "sve2_qs8_add", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; - }, - REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::add_qasymm8_signed_sve2) - }, - { - "sve2_qs16_add", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::QSYMM16) && data.isa.sve2; - }, - REGISTER_QSYMM16_SVE2(arm_compute::cpu::add_qsymm16_sve2) - }, - { - "sve_fp32_add", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::F32) && data.isa.sve; - }, - REGISTER_FP32_SVE(arm_compute::cpu::add_fp32_sve) - }, - { - "sve_fp16_add", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; - }, - REGISTER_FP16_SVE(arm_compute::cpu::add_fp16_sve) - }, - { - "sve_u8_add", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::U8) && data.isa.sve; - }, - REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_sve) - }, - { - "sve_s16_add", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::S16) && data.isa.sve; - }, - REGISTER_INTEGER_SVE(arm_compute::cpu::add_s16_sve) - }, - { - "sve_s32_add", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::S32) && data.isa.sve; - }, - REGISTER_INTEGER_SVE(arm_compute::cpu::add_s32_sve) - }, - { - "neon_fp32_add", - [](const 
CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(arm_compute::cpu::add_fp32_neon) - }, - { - "neon_fp16_add", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::F16) && data.isa.fp16; - }, - REGISTER_FP16_NEON(arm_compute::cpu::add_fp16_neon) - }, - { - "neon_u8_add", - [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::U8); }, - REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_neon) - }, - { - "neon_s16_add", - [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::S16); }, - REGISTER_INTEGER_NEON(arm_compute::cpu::add_s16_neon) - }, - { - "neon_s32_add", - [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::S32); }, - REGISTER_INTEGER_NEON(arm_compute::cpu::add_s32_neon) - }, - { - "neon_qu8_add", - [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::add_qasymm8_neon) - }, - { - "neon_qs8_add", - [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_qasymm8_signed_neon) - }, - { - "neon_qs16_add", - [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QSYMM16); }, - REGISTER_QSYMM16_NEON(arm_compute::cpu::add_qsymm16_neon) - } -}; - -Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy) +static const std::vector available_kernels = { + {"neon_qu8_add_fixedpoint", + [](const CpuAddKernelDataTypeISASelectorData &data) + { return (data.dt == DataType::QASYMM8) && data.can_use_fixedpoint; }, + REGISTER_FP32_NEON(arm_compute::cpu::add_q8_neon_fixedpoint)}, + {"neon_qs8_add_fixedpoint", + [](const CpuAddKernelDataTypeISASelectorData &data) + { return (data.dt == DataType::QASYMM8_SIGNED) && data.can_use_fixedpoint; }, + REGISTER_FP32_NEON(arm_compute::cpu::add_q8_neon_fixedpoint)}, + {"sve2_qu8_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve2; }, + REGISTER_QASYMM8_SVE2(arm_compute::cpu::add_qasymm8_sve2)}, + {"sve2_qs8_add", + [](const CpuAddKernelDataTypeISASelectorData &data) + { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; }, + REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::add_qasymm8_signed_sve2)}, + {"sve2_qs16_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QSYMM16) && data.isa.sve2; }, + REGISTER_QSYMM16_SVE2(arm_compute::cpu::add_qsymm16_sve2)}, + {"sve_fp32_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; }, + REGISTER_FP32_SVE(arm_compute::cpu::add_fp32_sve)}, + {"sve_fp16_add", + [](const CpuAddKernelDataTypeISASelectorData &data) + { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; }, + REGISTER_FP16_SVE(arm_compute::cpu::add_fp16_sve)}, + {"sve_u8_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::U8) && data.isa.sve; }, + REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_sve)}, + {"sve_s16_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S16) && data.isa.sve; }, + REGISTER_INTEGER_SVE(arm_compute::cpu::add_s16_sve)}, + {"sve_s32_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S32) && 
data.isa.sve; }, + REGISTER_INTEGER_SVE(arm_compute::cpu::add_s32_sve)}, + {"neon_fp32_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(arm_compute::cpu::add_fp32_neon)}, + {"neon_fp16_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::add_fp16_neon)}, + {"neon_u8_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::U8); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_neon)}, + {"neon_s16_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S16); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::add_s16_neon)}, + {"neon_s32_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S32); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::add_s32_neon)}, + {"neon_qu8_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::add_qasymm8_neon)}, + {"neon_qs8_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_qasymm8_signed_neon)}, + {"neon_qs16_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QSYMM16); }, + REGISTER_QSYMM16_NEON(arm_compute::cpu::add_qsymm16_neon)}}; + +Status +validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy) { ARM_COMPUTE_UNUSED(policy); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, DataType::F16, - DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, + DataType::F16, DataType::S32, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1); const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src0.tensor_shape().x() != src1.tensor_shape().x()) && ((src0.data_type() != src1.data_type()) || (src0.data_type() != dst.data_type()) - || (src1.data_type() != dst.data_type())), - "Broadcasting across width is supported on configurations where all tensors have the same data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (src0.tensor_shape().x() != src1.tensor_shape().x()) && + ((src0.data_type() != src1.data_type()) || (src0.data_type() != dst.data_type()) || + (src1.data_type() != dst.data_type())), + "Broadcasting across width is supported on configurations where all tensors have the same data type"); // Validate in case of configured dst - if(dst.total_size() > 0) + if (dst.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst); ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), @@ -202,8 +134,8 @@ Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, cons } const auto can_use_fixedpoint = add_q8_neon_fixedpoint_possible(&src0, &src1, &dst); - const auto uk = 
CpuAddKernel::get_implementation(CpuAddKernelDataTypeISASelectorData{ src0.data_type(), - CPUInfo::get().get_isa(), can_use_fixedpoint }); + const auto uk = CpuAddKernel::get_implementation( + CpuAddKernelDataTypeISASelectorData{src0.data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); return Status{}; @@ -215,9 +147,9 @@ void CpuAddKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, policy)); - const auto can_use_fixedpoint = add_q8_neon_fixedpoint_possible(src0, src1, dst); - const auto uk = CpuAddKernel::get_implementation(CpuAddKernelDataTypeISASelectorData{ src0->data_type(), - CPUInfo::get().get_isa(), can_use_fixedpoint }); + const auto can_use_fixedpoint = add_q8_neon_fixedpoint_possible(src0, src1, dst); + const auto uk = CpuAddKernel::get_implementation( + CpuAddKernelDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); @@ -237,7 +169,8 @@ void CpuAddKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I ICpuKernel::configure(win); } -Status CpuAddKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy) +Status +CpuAddKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); @@ -277,14 +210,14 @@ size_t CpuAddKernel::get_mws(const CPUInfo &platform, size_t thread_count) const ARM_COMPUTE_UNUSED(thread_count); #if defined(ENABLE_FP32_KERNELS) - if(this->_run_method == &add_fp32_neon) + if (this->_run_method == &add_fp32_neon) { size_t mws = ICPPKernel::default_mws; - if(platform.get_cpu_model() == CPUModel::N1) + if (platform.get_cpu_model() == CPUModel::N1) { mws = default_mws_N1_fp32_neon; } - else if(platform.get_cpu_model() == CPUModel::V1) + else if (platform.get_cpu_model() == CPUModel::V1) { mws = default_mws_V1_fp32_neon; } @@ -294,7 +227,7 @@ size_t CpuAddKernel::get_mws(const CPUInfo &platform, size_t thread_count) const } // tensor is 1D or was re-interpreted as 1D - if(this->window().shape().num_dimensions() == 1) + if (this->window().shape().num_dimensions() == 1) { return mws; } @@ -307,7 +240,7 @@ size_t CpuAddKernel::get_mws(const CPUInfo &platform, size_t thread_count) const return std::max(static_cast(1), mws); } } -#else /* ENABLE_FP32_KERNELS */ +#else /* ENABLE_FP32_KERNELS */ ARM_COMPUTE_UNUSED(platform); #endif /* ENABLE_FP32_KERNELS */ return ICPPKernel::default_mws; diff --git a/src/cpu/kernels/CpuAddKernel.h b/src/cpu/kernels/CpuAddKernel.h index 9921feabe2d34af66947b56430a9dc9544f30b8e..4adba8bb161e9999505559c0a7a2c7dcfd90f7bd 100644 --- a/src/cpu/kernels/CpuAddKernel.h +++ b/src/cpu/kernels/CpuAddKernel.h @@ -37,7 +37,8 @@ namespace kernels class CpuAddKernel : public ICpuKernel { private: - using AddKernelPtr = std::add_pointer::type; + using AddKernelPtr = std::add_pointer::type; public: struct AddKernel @@ -74,10 +75,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy); + static Status + validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const 
ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; /** Return minimum workload size of the relevant kernel @@ -98,9 +100,9 @@ public: private: ConvertPolicy _policy{}; - AddKernelPtr _run_method{ nullptr }; + AddKernelPtr _run_method{nullptr}; std::string _name{}; - size_t _split_dimension{ Window::DimY }; + size_t _split_dimension{Window::DimY}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuAddMulAddKernel.cpp b/src/cpu/kernels/CpuAddMulAddKernel.cpp index b84bdd54e965f173c869479bc5d88809363cceca..6a632e87022cfa2a33aca21551f667943bdc7bcb 100644 --- a/src/cpu/kernels/CpuAddMulAddKernel.cpp +++ b/src/cpu/kernels/CpuAddMulAddKernel.cpp @@ -27,8 +27,8 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/addmuladd/list.h" @@ -41,36 +41,28 @@ namespace kernels { namespace { -static const std::vector available_kernels = -{ +static const std::vector available_kernels = { #ifdef __aarch64__ - { - "neon_fp32_add_mul_add", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(arm_compute::cpu::add_mul_add_fp32_neon) - }, - { - "neon_fp16_add_mul_add", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16); }, - REGISTER_FP16_NEON(arm_compute::cpu::add_mul_add_fp16_neon) - }, - { - "neon_qasymm8_add_mul_add", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::add_mul_add_u8_neon) - }, - { - "neon_qasymm8_signed_add_mul_add", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_mul_add_s8_neon) - } + {"neon_fp32_add_mul_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(arm_compute::cpu::add_mul_add_fp32_neon)}, + {"neon_fp16_add_mul_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16); }, + REGISTER_FP16_NEON(arm_compute::cpu::add_mul_add_fp16_neon)}, + {"neon_qasymm8_add_mul_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::add_mul_add_u8_neon)}, + {"neon_qasymm8_signed_add_mul_add", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_mul_add_s8_neon)} #endif // __aarch64__ }; -Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const ITensorInfo *bn_add, - const ITensorInfo *add_output, const ITensorInfo *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status validate_arguments(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, bn_mul, bn_add, final_output); @@ -78,16 +70,16 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, using ActFunction = 
ActivationLayerInfo::ActivationFunction; const ActFunction act_func = act_info.activation(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - (act_func != ActFunction::BOUNDED_RELU && act_func != ActFunction::RELU && act_func != ActFunction::LU_BOUNDED_RELU && act_func != ActFunction::IDENTITY), - "Only RELU Family activations, or no activation, is supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_func != ActFunction::BOUNDED_RELU && act_func != ActFunction::RELU && + act_func != ActFunction::LU_BOUNDED_RELU && act_func != ActFunction::IDENTITY), + "Only RELU Family activations, or no activation, is supported"); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2); - if(is_data_type_quantized(input1->data_type())) + if (is_data_type_quantized(input1->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bn_mul, 1, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bn_add, 1, DataType::F32); @@ -101,39 +93,47 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2); // No broadcasting ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mul, bn_add); ARM_COMPUTE_RETURN_ERROR_ON_MSG(bn_mul->num_dimensions() != 1, "BatchNorm coefficients should be 1D array"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(bn_mul->tensor_shape()[0] != input1->tensor_shape()[0], "First dimensions of inputs and batchNorm coefs should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bn_mul->tensor_shape()[0] != input1->tensor_shape()[0], + "First dimensions of inputs and batchNorm coefs should match"); // Validate in case we have add layer's output (intermediate) initialized - if(add_output != nullptr && add_output->total_size() > 0) + if (add_output != nullptr && add_output->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, add_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, add_output); } // Validate in case final output has been initialized - if(final_output->total_size() > 0) + if (final_output->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, final_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, final_output); } - const auto uk = CpuAddMulAddKernel::get_implementation(DataTypeISASelectorData{ input1->data_type(), CPUInfo::get().get_isa() }); + const auto uk = CpuAddMulAddKernel::get_implementation( + DataTypeISASelectorData{input1->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); return Status{}; } } // namespace -void CpuAddMulAddKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const ITensorInfo *bn_add, - ITensorInfo *add_output, ITensorInfo *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +void CpuAddMulAddKernel::configure(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + ITensorInfo *add_output, + ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(bn_mul, bn_add, input2); ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, bn_add, bn_mul, final_output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1, input2, bn_mul, bn_add, add_output, final_output, 
policy, act_info)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info)); - const auto uk = CpuAddMulAddKernel::get_implementation(DataTypeISASelectorData{ input1->data_type(), CPUInfo::get().get_isa() }); + const auto uk = CpuAddMulAddKernel::get_implementation( + DataTypeISASelectorData{input1->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr); @@ -146,7 +146,7 @@ void CpuAddMulAddKernel::configure(const ITensorInfo *input1, const ITensorInfo set_shape_if_empty(*final_output, input1->tensor_shape()); set_data_type_if_unknown(*final_output, input1->data_type()); - if(add_output != nullptr) + if (add_output != nullptr) { set_shape_if_empty(*add_output, input1->tensor_shape()); set_data_type_if_unknown(*add_output, input1->data_type()); @@ -158,14 +158,19 @@ void CpuAddMulAddKernel::configure(const ITensorInfo *input1, const ITensorInfo ICpuKernel::configure(win); } -Status CpuAddMulAddKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const ITensorInfo *bn_add, - const ITensorInfo *add_output, const ITensorInfo *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status CpuAddMulAddKernel::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, bn_mul, bn_add, final_output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info)); return Status{}; } diff --git a/src/cpu/kernels/CpuAddMulAddKernel.h b/src/cpu/kernels/CpuAddMulAddKernel.h index 67ce6f029afef00aac5d8fa36f299d9d8de1ab6a..c5e31ec291bb651f3c052eaddbd511efdf3c1afe 100644 --- a/src/cpu/kernels/CpuAddMulAddKernel.h +++ b/src/cpu/kernels/CpuAddMulAddKernel.h @@ -26,6 +26,7 @@ #define SRC_CPU_KERNELS_CPUADDMULADDKERNEL #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -39,8 +40,15 @@ namespace kernels class CpuAddMulAddKernel : public ICpuKernel { private: - using AddMulAddKernelPtr = - std::add_pointer::type; + using AddMulAddKernelPtr = std::add_pointer::type; public: struct AddMulAddKernel @@ -57,23 +65,31 @@ public: * Similar to @ref NEAddMulAdd::configure() * */ - void configure(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const ITensorInfo *bn_add, - ITensorInfo *add_output, ITensorInfo *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info); + void configure(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + ITensorInfo *add_output, + ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuAddMulAddKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const ITensorInfo *bn_add, - const ITensorInfo *add_output, const ITensorInfo *final_output, - 
ConvertPolicy policy, const ActivationLayerInfo &act_info); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; static const std::vector &get_available_kernels(); @@ -81,7 +97,7 @@ public: private: ConvertPolicy _policy{}; ActivationLayerInfo _act_info{}; - AddMulAddKernelPtr _run_method{ nullptr }; + AddMulAddKernelPtr _run_method{nullptr}; std::string _name{}; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuCastKernel.cpp b/src/cpu/kernels/CpuCastKernel.cpp index 641dea40dcc82f67c52b960dc3839fbb1c83f4ac..05c7742b0310a4b2ecf1674958ff44656d0995a6 100644 --- a/src/cpu/kernels/CpuCastKernel.cpp +++ b/src/cpu/kernels/CpuCastKernel.cpp @@ -28,16 +28,16 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" + +#include "src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "src/core/NEON/NEFixedPoint.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/common/Registrars.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/SaturateCast.h" - #include "src/cpu/kernels/cast/list.h" +#include "support/SaturateCast.h" namespace arm_compute { @@ -47,117 +47,111 @@ namespace kernels { namespace { -static const std::vector available_kernels = -{ - { - "neon_qs8_cast", - [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::QASYMM8_SIGNED && data.dst_dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_qasymm8_signed_to_fp16_cast) - }, - { - "neon_qu8_cast", - [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::QASYMM8 && data.dst_dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast) - }, - { - "neon_u8_cast", - [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::U8 && data.dst_dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast) - }, - { - "neon_fp16_cast", - [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_to_other_dt_cast) - }, - { - "neon_fp32_to_fp16_cast", - [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::F32 && data.dst_dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp32_to_fp16_cast) - }, - { - "neon_fp32_to_bf16_cast", - [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::F32 && data.dst_dt == DataType::BFLOAT16 && data.isa.bf16; }, - REGISTER_BF16_NEON(arm_compute::cpu::neon_fp32_to_bfloat16_cast) - }, - { - "neon_s32_cast", - [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::S32 && data.dst_dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_s32_to_fp16_cast) - }, - { - 
"neon_bf16_cast", - [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::BFLOAT16 && data.dst_dt == DataType::F32 && data.isa.bf16; }, - REGISTER_BF16_NEON(arm_compute::cpu::neon_bfloat16_to_fp32_cast) - }, +static const std::vector available_kernels = { + {"neon_qs8_cast", + [](const CastDataTypeISASelectorData &data) + { return data.src_dt == DataType::QASYMM8_SIGNED && data.dst_dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_qasymm8_signed_to_fp16_cast)}, + {"neon_qu8_cast", + [](const CastDataTypeISASelectorData &data) + { return data.src_dt == DataType::QASYMM8 && data.dst_dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast)}, + {"neon_u8_cast", + [](const CastDataTypeISASelectorData &data) + { return data.src_dt == DataType::U8 && data.dst_dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast)}, + {"neon_fp16_cast", + [](const CastDataTypeISASelectorData &data) { return data.src_dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_to_other_dt_cast)}, + {"neon_fp32_to_fp16_cast", + [](const CastDataTypeISASelectorData &data) + { return data.src_dt == DataType::F32 && data.dst_dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp32_to_fp16_cast)}, + {"neon_s32_cast", + [](const CastDataTypeISASelectorData &data) + { return data.src_dt == DataType::S32 && data.dst_dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_s32_to_fp16_cast)}, }; Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy) { ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(dst); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(dst); ARM_COMPUTE_UNUSED(policy); ARM_COMPUTE_RETURN_ERROR_ON(src == dst); #ifdef __aarch64__ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, - DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::U8, DataType::S16, DataType::U16, DataType::F16, DataType::F32, DataType::S32, DataType::S64, DataType::U64); -#else // __aarch64__ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, - DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16, + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::U8, DataType::S16, DataType::U16, DataType::F16, + DataType::U32, DataType::S32, DataType::F32, DataType::S64); + +#else // __aarch64__ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::U8, DataType::S16, DataType::U16, DataType::F16, DataType::F32, DataType::S32); -#endif // __aarch64__ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, - DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::U8, DataType::S16, DataType::U16, DataType::F16, DataType::U32, DataType::S32, DataType::F32); 
+#endif // __aarch64__ - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8_SIGNED && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::S32 - && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8_SIGNED && + (dst->data_type() != DataType::S16 && dst->data_type() != DataType::S32 && + dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32), "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 - && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8 && + (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 && + dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && + dst->data_type() != DataType::F32), "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 - && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U8 && + (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 && + dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && + dst->data_type() != DataType::F32), "Only data_types supported [in] U8 -> [out] U16, S16, S32, F16, F32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U16 && (dst->data_type() != DataType::U8 && dst->data_type() != DataType::U32), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U16 && + (dst->data_type() != DataType::U8 && dst->data_type() != DataType::U32), "Only data_types supported [in] U16 -> [out] U8, U32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::U8 && dst->data_type() != DataType::S32), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S16 && + (dst->data_type() != DataType::QASYMM8_SIGNED && + dst->data_type() != DataType::U8 && dst->data_type() != DataType::S32), "Only data_types supported [in] S16 -> [out] U8, S32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::BFLOAT16 && dst->data_type() != DataType::F32, - "Only data_types supported [in] BFLOAT16 -> [out] F32"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8 - && dst->data_type() != DataType::U8 - && dst->data_type() != DataType::F32 && dst->data_type() != DataType::S32), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F16 && + (dst->data_type() != DataType::QASYMM8_SIGNED && + dst->data_type() != DataType::QASYMM8 && dst->data_type() != DataType::U8 && + dst->data_type() != DataType::F32 && dst->data_type() != DataType::S32), "Only data_types supported [in] F16 -> [out] QASYMM8, F32, S32, U8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8 - && dst->data_type() != DataType::F16 && 
dst->data_type() != DataType::BFLOAT16 - && dst->data_type() != DataType::S32 && dst->data_type() != DataType::U8), - "Only data_types supported [in] F32 -> [out] QASYMM8, BFLOAT16, F16, S32, U8"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8 - && dst->data_type() != DataType::F16 - && dst->data_type() != DataType::F32 && dst->data_type() != DataType::U8), - "Only data_types supported [in] S32 -> [out] QASYMM8, F16, F32, U8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F32 && + (dst->data_type() != DataType::QASYMM8_SIGNED && + dst->data_type() != DataType::QASYMM8 && dst->data_type() != DataType::F16 && + dst->data_type() != DataType::S32 && dst->data_type() != DataType::U8), + "Only data_types supported [in] F32 -> [out] QASYMM8, F16, S32, U8"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S32 && + (dst->data_type() != DataType::QASYMM8_SIGNED && + dst->data_type() != DataType::QASYMM8 && dst->data_type() != DataType::F16 && + dst->data_type() != DataType::F32 && dst->data_type() != DataType::U8 && + dst->data_type() != DataType::S64), + "Only data_types supported [in] S32 -> [out] QASYMM8, F16, F32, U8, S64"); #ifdef __aarch64__ - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S64 && dst->data_type() != DataType::F32, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S64 && dst->data_type() != DataType::F32, "Only data_types supported [in] S64 -> [out] F32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U64 && dst->data_type() != DataType::F32, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U64 && dst->data_type() != DataType::F32, "Only data_types supported [in] U64 -> [out] F32"); #endif // __aarch64__ // Validate in case of configured dst - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); } @@ -198,36 +192,32 @@ inline void internal_neon_convert(const T1 *src_ptr, T2 *dst_ptr) ARM_COMPUTE_UNUSED(dst_ptr); } +template <> +inline void internal_neon_convert(const int32_t *src_ptr, int64_t *dst_ptr) +{ + const int32x4x4_t texels = { + {vld1q_s32(src_ptr), vld1q_s32(src_ptr + 4), vld1q_s32(src_ptr + 8), vld1q_s32(src_ptr + 12)}}; + vst1q_s64(dst_ptr, vmovl_s32(vget_low_s32(texels.val[0]))); + vst1q_s64(dst_ptr + 2, vmovl_s32(vget_high_s32(texels.val[0]))); + vst1q_s64(dst_ptr + 4, vmovl_s32(vget_low_s32(texels.val[1]))); + vst1q_s64(dst_ptr + 6, vmovl_s32(vget_high_s32(texels.val[1]))); + vst1q_s64(dst_ptr + 8, vmovl_s32(vget_low_s32(texels.val[2]))); + vst1q_s64(dst_ptr + 10, vmovl_s32(vget_high_s32(texels.val[2]))); + vst1q_s64(dst_ptr + 12, vmovl_s32(vget_low_s32(texels.val[3]))); + vst1q_s64(dst_ptr + 14, vmovl_s32(vget_high_s32(texels.val[3]))); +} + template <> inline void internal_neon_convert(const int64_t *src_ptr, float *dst_ptr) { - const float64x2x4_t texels0 = - { - { - vcvtq_f64_s64(vld1q_s64(src_ptr)), - vcvtq_f64_s64(vld1q_s64(src_ptr + 2)), - vcvtq_f64_s64(vld1q_s64(src_ptr + 4)), - vcvtq_f64_s64(vld1q_s64(src_ptr + 6)) - } - }; - const float64x2x4_t texels1 = - { - { - vcvtq_f64_s64(vld1q_s64(src_ptr + 8)), - vcvtq_f64_s64(vld1q_s64(src_ptr + 10)), - vcvtq_f64_s64(vld1q_s64(src_ptr + 12)), - vcvtq_f64_s64(vld1q_s64(src_ptr + 14)) - } - }; - const float32x4x4_t texels = - { - { - vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])), - 
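// --- Illustrative aside, not part of the patch above ---
// A minimal sketch of the S32 -> S64 widening that the new
// internal_neon_convert<int32_t, int64_t> specialisation performs, assuming an
// aarch64 toolchain with <arm_neon.h>; the kernel applies the same vmovl_s32
// pattern to sixteen elements per iteration instead of the four shown here.
#include <arm_neon.h>
#include <cstdint>

inline void widen_s32_to_s64(const int32_t *src, int64_t *dst)
{
    const int32x4_t v = vld1q_s32(src);              // load 4 x s32
    vst1q_s64(dst, vmovl_s32(vget_low_s32(v)));      // widen lanes 0..1
    vst1q_s64(dst + 2, vmovl_s32(vget_high_s32(v))); // widen lanes 2..3
}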
vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])), - vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])), - vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3])) - } - }; + const float64x2x4_t texels0 = {{vcvtq_f64_s64(vld1q_s64(src_ptr)), vcvtq_f64_s64(vld1q_s64(src_ptr + 2)), + vcvtq_f64_s64(vld1q_s64(src_ptr + 4)), vcvtq_f64_s64(vld1q_s64(src_ptr + 6))}}; + const float64x2x4_t texels1 = {{vcvtq_f64_s64(vld1q_s64(src_ptr + 8)), vcvtq_f64_s64(vld1q_s64(src_ptr + 10)), + vcvtq_f64_s64(vld1q_s64(src_ptr + 12)), vcvtq_f64_s64(vld1q_s64(src_ptr + 14))}}; + const float32x4x4_t texels = {{vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])), + vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])), + vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])), + vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3]))}}; vst1q_f32(dst_ptr, texels.val[0]); vst1q_f32(dst_ptr + 4, texels.val[1]); vst1q_f32(dst_ptr + 8, texels.val[2]); @@ -237,34 +227,15 @@ inline void internal_neon_convert(const int64_t *src_ptr, float template <> inline void internal_neon_convert(const uint64_t *src_ptr, float *dst_ptr) { - const float64x2x4_t texels0 = - { - { - vcvtq_f64_u64(vld1q_u64(src_ptr)), - vcvtq_f64_u64(vld1q_u64(src_ptr + 2)), - vcvtq_f64_u64(vld1q_u64(src_ptr + 4)), - vcvtq_f64_u64(vld1q_u64(src_ptr + 6)) - } - }; - const float64x2x4_t texels1 = - { - { - vcvtq_f64_u64(vld1q_u64(src_ptr + 8)), - vcvtq_f64_u64(vld1q_u64(src_ptr + 10)), - vcvtq_f64_u64(vld1q_u64(src_ptr + 12)), - vcvtq_f64_u64(vld1q_u64(src_ptr + 14)) - } - }; + const float64x2x4_t texels0 = {{vcvtq_f64_u64(vld1q_u64(src_ptr)), vcvtq_f64_u64(vld1q_u64(src_ptr + 2)), + vcvtq_f64_u64(vld1q_u64(src_ptr + 4)), vcvtq_f64_u64(vld1q_u64(src_ptr + 6))}}; + const float64x2x4_t texels1 = {{vcvtq_f64_u64(vld1q_u64(src_ptr + 8)), vcvtq_f64_u64(vld1q_u64(src_ptr + 10)), + vcvtq_f64_u64(vld1q_u64(src_ptr + 12)), vcvtq_f64_u64(vld1q_u64(src_ptr + 14))}}; - const float32x4x4_t texels = - { - { - vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])), - vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])), - vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])), - vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3])) - } - }; + const float32x4x4_t texels = {{vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])), + vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])), + vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])), + vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3]))}}; vst1q_f32(dst_ptr, texels.val[0]); vst1q_f32(dst_ptr + 4, texels.val[1]); @@ -273,23 +244,26 @@ inline void internal_neon_convert(const uint64_t *src_ptr, floa } template -inline void convert64(Iterator &src, Iterator &dst, const Window &win, int window_start_x, int window_end_x, int window_step_x) +inline void +convert64(Iterator &src, Iterator &dst, const Window &win, int window_start_x, int window_end_x, int window_step_x) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - internal_neon_convert(src_ptr + x, dst_ptr + x); - } - for(; x < window_end_x; ++x) + execute_window_loop( + win, + 
[&](const Coordinates &) { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + internal_neon_convert(src_ptr + x, dst_ptr + x); + } + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); } } // namespace #endif // __aarch64__ @@ -311,21 +285,22 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src(_src, win); Iterator dst(_dst, win); - /*ukernel runs only when using fp16/bfloat16, so we validate it isn't a nullptr only before using it */ - const auto *uk = CpuCastKernel::get_implementation(CastDataTypeISASelectorData{ _src->info()->data_type(), _dst->info()->data_type(), CPUInfo::get().get_isa() }); + /*ukernel runs only when using fp16, so we validate it isn't a nullptr only before using it */ + const auto *uk = CpuCastKernel::get_implementation( + CastDataTypeISASelectorData{_src->info()->data_type(), _dst->info()->data_type(), CPUInfo::get().get_isa()}); - switch(_src->info()->data_type()) + switch (_src->info()->data_type()) { #ifdef __aarch64__ case DataType::U64: { - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::F32: { @@ -339,7 +314,7 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr } case DataType::S64: { - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::F32: { @@ -355,111 +330,102 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr case DataType::QASYMM8_SIGNED: { - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::S16: { /* Up-conversion QASYMM8_SIGNED -> S16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - int x = window_start_x; - - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + int x = window_start_x; - const int16x8x2_t texels = + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vmovl_s8(vget_low_s8(texels_s8)), - vmovl_s8(vget_high_s8(texels_s8)) - } - }; + const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); - vst1q_s16(dst_ptr + x, texels.val[0]); - vst1q_s16(dst_ptr + x + 8, texels.val[1]); - } + const int16x8x2_t texels = { + {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + vst1q_s16(dst_ptr + x, texels.val[0]); + vst1q_s16(dst_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::S32: { /* Up-conversion QASYMM8_SIGNED -> S32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = 
reinterpret_cast(dst.ptr()); - int x = window_start_x; - - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + int x = window_start_x; - const int16x8x2_t texels = + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vmovl_s8(vget_low_s8(texels_s8)), - vmovl_s8(vget_high_s8(texels_s8)) - } - }; + const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); - vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); - vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); - vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); - vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); - } + const int16x8x2_t texels = { + {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); + vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); + vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); + vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::F32: { /* Up-conversion QASYMM8_SIGNED -> F32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); - const int16x8x2_t texels = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vmovl_s8(vget_low_s8(texels_s8)), - vmovl_s8(vget_high_s8(texels_s8)) - } - }; - vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); - vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); - vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); - vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); + + const int16x8x2_t texels = { + {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}}; + vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); + vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); + vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); + vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::F16: @@ -478,111 +444,102 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const 
Thr case DataType::QASYMM8: case DataType::U8: { - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::S16: { /* Up-conversion U8 -> S16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); - const int16x8x2_t texels = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))) - } - }; + const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); - vst1q_s16(dst_ptr + x, texels.val[0]); - vst1q_s16(dst_ptr + x + 8, texels.val[1]); - } + const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + vst1q_s16(dst_ptr + x, texels.val[0]); + vst1q_s16(dst_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::S32: { /* Up-conversion U8 -> S32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); - const int16x8x2_t texels = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))) - } - }; + const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); - vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); - vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); - vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); - vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); - } + const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); + vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); + vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); + vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::F32: { /* Up-conversion U8 -> F32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = 
reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); - const int16x8x2_t texels = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))) - } - }; - vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); - vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); - vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); - vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + + const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}}; + vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); + vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); + vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); + vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::F16: @@ -595,35 +552,32 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr case DataType::U16: { /* Up-conversion U8 -> U16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); - const uint16x8x2_t texels = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vmovl_u8(vget_low_u8(texels_u8)), - vmovl_u8(vget_high_u8(texels_u8)) - } - }; + const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); - vst1q_u16(dst_ptr + x, texels.val[0]); - vst1q_u16(dst_ptr + x + 8, texels.val[1]); - } + const uint16x8x2_t texels = { + {vmovl_u8(vget_low_u8(texels_u8)), vmovl_u8(vget_high_u8(texels_u8))}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + vst1q_u16(dst_ptr + x, texels.val[0]); + vst1q_u16(dst_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); break; } default: @@ -633,177 +587,154 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr } case DataType::S16: { - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::QASYMM8_SIGNED: { /* 
Down-conversion S16 -> QASYMM8_SIGNED */ - if(ConvertPolicy::SATURATE == _policy) + if (ConvertPolicy::SATURATE == _policy) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int16x8x2_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s16(src_ptr + x), - vld1q_s16(src_ptr + x + 8) - } - }; + const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}}; - vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1]))); - } + vst1q_s8(dst_ptr + x, + vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1]))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); + } + }, + src, dst); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int16x8x2_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s16(src_ptr + x), - vld1q_s16(src_ptr + x + 8) - } - }; + const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}}; - vst1q_s8(dst_ptr + x, vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1]))); - } + vst1q_s8(dst_ptr + x, + vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1]))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); } break; } case DataType::U8: { /* Down-conversion S16 -> U8 */ - if(ConvertPolicy::SATURATE == _policy) + if (ConvertPolicy::SATURATE == _policy) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int16x8x2_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s16(src_ptr + x), - vld1q_s16(src_ptr + x + 8) - } - }; + const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}}; - vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1]))); - } + vst1q_u8(dst_ptr + x, + vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1]))); + } - // Compute left-over 
elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); + } + }, + src, dst); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int16x8x2_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s16(src_ptr + x), - vld1q_s16(src_ptr + x + 8) - } - }; - - vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])), - vmovn_u16(vreinterpretq_u16_s16(texels.val[1])))); - } + const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])), + vmovn_u16(vreinterpretq_u16_s16(texels.val[1])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); } break; } case DataType::S32: { /* Up-conversion S16 -> S32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int16x8x2_t texels = - { - { - vld1q_s16(src_ptr + x), - vld1q_s16(src_ptr + x + 8) - } - }; + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); - const int32x4x4_t texels_s32 = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vmovl_s16(vget_low_s16(texels.val[0])), - vmovl_s16(vget_high_s16(texels.val[0])), - vmovl_s16(vget_low_s16(texels.val[1])), - vmovl_s16(vget_high_s16(texels.val[1])) - } - }; + const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}}; - vst1q_s32(dst_ptr + x, texels_s32.val[0]); - vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]); - vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]); - vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]); - } + const int32x4x4_t texels_s32 = { + {vmovl_s16(vget_low_s16(texels.val[0])), vmovl_s16(vget_high_s16(texels.val[0])), + vmovl_s16(vget_low_s16(texels.val[1])), vmovl_s16(vget_high_s16(texels.val[1]))}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + vst1q_s32(dst_ptr + x, texels_s32.val[0]); + vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]); + vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]); + vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); break; } default: @@ -814,104 +745,92 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr case DataType::U16: { - 
switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::U8: { /* Down-conversion U16 -> U8 */ - if(ConvertPolicy::SATURATE == _policy) + if (ConvertPolicy::SATURATE == _policy) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint16x8x2_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_u16(src_ptr + x), - vld1q_u16(src_ptr + x + 8) - } - }; + const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}}; - vst1q_u8(dst_ptr + x, vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1]))); - } + vst1q_u8(dst_ptr + x, + vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1]))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); + } + }, + src, dst); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint16x8x2_t texels = - { - { - vld1q_u16(src_ptr + x), - vld1q_u16(src_ptr + x + 8) - } - }; + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); - vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1]))); - } + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } + vst1q_u8(dst_ptr + x, + vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1]))); + } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); } break; } case DataType::U32: { /* Up-conversion U16 -> U32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint16x8x2_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_u16(src_ptr + x), - vld1q_u16(src_ptr + x + 8) - } - }; - - vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0]))); - vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0]))); - vst1q_u32(dst_ptr + x + 8, vmovl_u16(vget_low_u16(texels.val[1]))); - vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1]))); - } - // Compute 
left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } + const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}}; - }, - src, dst); + vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0]))); + vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0]))); + vst1q_u32(dst_ptr + x + 8, vmovl_u16(vget_low_u16(texels.val[1]))); + vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1]))); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); break; } default: @@ -919,13 +838,6 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr } break; } - case DataType::BFLOAT16: - { - /* Up-conversion BFLOAT16 -> F32 */ - ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr); - uk->ukernel(_src, _dst, info, _policy, window); - break; - } case DataType::F16: { /* conversion F16 -> any data type */ @@ -934,7 +846,7 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr break; } case DataType::F32: - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::F16: { @@ -943,115 +855,113 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr uk->ukernel(_src, _dst, info, _policy, window); break; } - case DataType::BFLOAT16: - { - /* Down-conversion F32 -> BFLOAT16 */ - ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr); - uk->ukernel(_src, _dst, info, _policy, window); - break; - } case DataType::S32: { /* Conversion F32 -> S32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float32x4x4_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const float32x4x4_t texels = {{ vld1q_f32(src_ptr + x), vld1q_f32(src_ptr + x + 4), vld1q_f32(src_ptr + x + 8), vld1q_f32(src_ptr + x + 12), - } - }; + }}; - vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0])); - vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1])); - vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2])); - vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3])); - } + vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0])); + vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1])); + vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2])); + vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3])); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::QASYMM8: case DataType::U8: { /* Down-conversion F32 -> U8 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float32x4x4_t texels = + const auto src_ptr = 
reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const float32x4x4_t texels = {{ vld1q_f32(src_ptr + x), vld1q_f32(src_ptr + x + 4), vld1q_f32(src_ptr + x + 8), vld1q_f32(src_ptr + x + 12), - } - }; - - vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), vqmovun_s32(vcvtq_s32_f32(texels.val[1]))))); - vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), vqmovun_s32(vcvtq_s32_f32(texels.val[3]))))); - } + }}; + + vst1_u8(dst_ptr + x, + vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), + vqmovun_s32(vcvtq_s32_f32(texels.val[1]))))); + vst1_u8(dst_ptr + x + 8, + vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), + vqmovun_s32(vcvtq_s32_f32(texels.val[3]))))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::QASYMM8_SIGNED: { /* Down-conversion F32 -> QASYMM8_SIGNED */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float32x4x4_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const float32x4x4_t texels = {{ vld1q_f32(src_ptr + x), vld1q_f32(src_ptr + x + 4), vld1q_f32(src_ptr + x + 8), vld1q_f32(src_ptr + x + 12), - } - }; - - vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), vqmovn_s32(vcvtq_s32_f32(texels.val[1]))))); - vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), vqmovn_s32(vcvtq_s32_f32(texels.val[3]))))); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); + }}; + + vst1_s8(dst_ptr + x, + vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), + vqmovn_s32(vcvtq_s32_f32(texels.val[1]))))); + vst1_s8(dst_ptr + x + 8, + vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), + vqmovn_s32(vcvtq_s32_f32(texels.val[3]))))); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); + } + }, + src, dst); break; } @@ -1060,8 +970,15 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr } break; case DataType::S32: - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { +#if __aarch64__ + case DataType::S64: + { + convert64(src, dst, win, window_start_x, window_end_x, window_step_x); + break; + } +#endif // __aarch64__ case DataType::F16: { /* Down-conversion S32 -> F16 */ @@ -1072,104 +989,102 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr case DataType::F32: { /* Conversion S32 -> F32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = 
reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int32x4x4_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const int32x4x4_t texels = {{ vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4), vld1q_s32(src_ptr + x + 8), vld1q_s32(src_ptr + x + 12), - } - }; + }}; - vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0])); - vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1])); - vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2])); - vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3])); - } + vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0])); + vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1])); + vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2])); + vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3])); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::QASYMM8_SIGNED: { /* Down-conversion S32 -> QASYMM8_SIGNED */ - if(ConvertPolicy::SATURATE == _policy) + if (ConvertPolicy::SATURATE == _policy) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int32x4x4_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const int32x4x4_t texels = {{ vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4), vld1q_s32(src_ptr + x + 8), vld1q_s32(src_ptr + x + 12), - } - }; - vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), vqmovn_s32(texels.val[1])))); - vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), vqmovn_s32(texels.val[3])))); - } + }}; + vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), + vqmovn_s32(texels.val[1])))); + vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), + vqmovn_s32(texels.val[3])))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); + } + }, + src, dst); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int32x4x4_t texels = - { - { - vld1q_s32(src_ptr + x), - vld1q_s32(src_ptr + x + 4), - vld1q_s32(src_ptr + x + 8), - vld1q_s32(src_ptr + x + 12) - } - }; + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = 
reinterpret_cast(dst.ptr()); - vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), vmovn_s32(texels.val[1])))); - vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), vmovn_s32(texels.val[3])))); - } + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4), + vld1q_s32(src_ptr + x + 8), + vld1q_s32(src_ptr + x + 12)}}; + + vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), + vmovn_s32(texels.val[1])))); + vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), + vmovn_s32(texels.val[3])))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); } break; } @@ -1177,68 +1092,66 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr case DataType::U8: { /* Down-conversion S32 -> U8 */ - if(ConvertPolicy::SATURATE == _policy) + if (ConvertPolicy::SATURATE == _policy) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int32x4x4_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s32(src_ptr + x), - vld1q_s32(src_ptr + x + 4), - vld1q_s32(src_ptr + x + 8), - vld1q_s32(src_ptr + x + 12) - } - }; - vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1])))); - vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3])))); - } + const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4), + vld1q_s32(src_ptr + x + 8), + vld1q_s32(src_ptr + x + 12)}}; + vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), + vqmovun_s32(texels.val[1])))); + vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), + vqmovun_s32(texels.val[3])))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); + } + }, + src, dst); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int32x4x4_t texels = - { - { - vld1q_s32(src_ptr + x), - vld1q_s32(src_ptr + x + 4), - vld1q_s32(src_ptr + x + 8), - vld1q_s32(src_ptr + x + 12) - } - }; + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); - vst1_u8(dst_ptr + x, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), vmovn_u32(vreinterpretq_u32_s32(texels.val[1]))))); - vst1_u8(dst_ptr + x + 8, 
vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), vmovn_u32(vreinterpretq_u32_s32(texels.val[3]))))); - } + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4), + vld1q_s32(src_ptr + x + 8), + vld1q_s32(src_ptr + x + 12)}}; + + vst1_u8(dst_ptr + x, + vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), + vmovn_u32(vreinterpretq_u32_s32(texels.val[1]))))); + vst1_u8(dst_ptr + x + 8, + vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), + vmovn_u32(vreinterpretq_u32_s32(texels.val[3]))))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); } break; } diff --git a/src/cpu/kernels/CpuCastKernel.h b/src/cpu/kernels/CpuCastKernel.h index 76237368d87d195ffec154683aeb561b48f8a479..ddbfe1f03454df1f15e78c34ce742d4f6b2d4c3a 100644 --- a/src/cpu/kernels/CpuCastKernel.h +++ b/src/cpu/kernels/CpuCastKernel.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CPU_CAST_KERNEL_H -#define ARM_COMPUTE_CPU_CAST_KERNEL_H +#ifndef ACL_SRC_CPU_KERNELS_CPUCASTKERNEL_H +#define ACL_SRC_CPU_KERNELS_CPUCASTKERNEL_H #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -40,7 +40,8 @@ namespace kernels class CpuCastKernel : public ICpuKernel { private: - using CastKernelPtr = std::add_pointer::type; + using CastKernelPtr = + std::add_pointer::type; public: CpuCastKernel() = default; @@ -54,17 +55,17 @@ public: * - U8 -> U16, S16, S32, F32, F16 * - U16 -> U8, U32 * - S16 -> QASYMM8_SIGNED, U8, S32 - * - BFLOAT16 -> F32 * - F16 -> QASYMM8_SIGNED, QASYMM8, F32, S32, U8 * - S32 -> QASYMM8_SIGNED, QASYMM8, F16, F32, U8 * - S64 -> F32 - * - F32 -> QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8 + * - F32 -> QASYMM8_SIGNED, QASYMM8, F16, S32, U8 * - * @param[in] src The src tensor to convert. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/S32/S64/BFLOAT16/F16/F32. - * @param[out] dst The dst tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/U32/S32/BFLOAT16/F16/F32. + * @param[in] src The src tensor to convert. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/S32/S64/F16/F32. + * @param[out] dst The dst tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/U32/S32/S64/F16/F32. * @param[in] policy Conversion policy. 
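// --- Illustrative aside, not part of the patch above ---
// A hedged usage sketch of the cast path documented in CpuCastKernel.h, going
// through the public NECast runtime function (assumed API; check the headers
// of your Compute Library version). An S64 destination is expected to
// validate only on aarch64 builds, per the @note added in this patch.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/runtime/NEON/functions/NECast.h"
#include "arm_compute/runtime/Tensor.h"

void cast_s32_to_s64_sketch()
{
    using namespace arm_compute;

    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::S32));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::S64));

    // Validate before configuring; SATURATE matches the kernel's default policy.
    const Status st = NECast::validate(src.info(), dst.info(), ConvertPolicy::SATURATE);
    if (st.error_code() != ErrorCode::OK)
    {
        return; // e.g. S64 destination rejected on a 32-bit build
    }

    NECast cast;
    cast.configure(&src, &dst, ConvertPolicy::SATURATE);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    cast.run();
}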
* - * @deprecated Support for BFLOAT16 will be removed in 23.05 release + * @note S64 is only supported in aarch64 + * */ void configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); /** Static function to check if given info will lead to a valid configuration @@ -76,7 +77,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct CastKernel @@ -89,9 +90,9 @@ public: static const std::vector &get_available_kernels(); private: - ConvertPolicy _policy{ ConvertPolicy::SATURATE }; + ConvertPolicy _policy{ConvertPolicy::SATURATE}; }; } // namespace kernels } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_CAST_KERNEL_H */ +#endif // ACL_SRC_CPU_KERNELS_CPUCASTKERNEL_H diff --git a/src/cpu/kernels/CpuCol2ImKernel.cpp b/src/cpu/kernels/CpuCol2ImKernel.cpp index bf5a44d78b4e7c3d28582dad11e67691c35a9d9f..a52a1f58ea7e4841ccaa2a19eca2718b8a4a9e34 100644 --- a/src/cpu/kernels/CpuCol2ImKernel.cpp +++ b/src/cpu/kernels/CpuCol2ImKernel.cpp @@ -29,8 +29,9 @@ #include "arm_compute/core/Size2D.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -49,9 +50,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); // Validate configured output - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), compute_col2im_shape(*src, convolved_dims, false)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), + compute_col2im_shape(*src, convolved_dims, false)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); } @@ -106,13 +108,16 @@ void CpuCol2ImKernel::run_op(ITensorPack &tensors, const Window &window, const T Iterator in(src, window); Iterator out(dst, window_out); - execute_window_loop(window, [&](const Coordinates & id) - { - const int hidx = id.y(); - const int idx = id.x() * output_stride_z + (hidx / _convolved_dims.width) * output_stride_y + (hidx % _convolved_dims.width) * output_stride_x; - std::memcpy(out.ptr() + idx, in.ptr(), el_size); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int hidx = id.y(); + const int idx = id.x() * output_stride_z + (hidx / _convolved_dims.width) * output_stride_y + + (hidx % _convolved_dims.width) * output_stride_x; + std::memcpy(out.ptr() + idx, in.ptr(), el_size); + }, + in, out); } const char *CpuCol2ImKernel::name() const @@ -121,4 +126,4 @@ const char *CpuCol2ImKernel::name() const } } // namespace kernels } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuCol2ImKernel.h b/src/cpu/kernels/CpuCol2ImKernel.h index deafcc14df16bda103081a161915fdde136330a7..3e394ac914768e1b59b11a2067d9b96891d5ac98 100644 --- a/src/cpu/kernels/CpuCol2ImKernel.h +++ 
b/src/cpu/kernels/CpuCol2ImKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_COL2IM_KERNEL_H #include "arm_compute/core/Size2D.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -75,7 +76,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: diff --git a/src/cpu/kernels/CpuConcatenateBatchKernel.cpp b/src/cpu/kernels/CpuConcatenateBatchKernel.cpp index 29d40f0e526438f645e1e6ff1c0b1a1d42fc76a9..8c290173e8f0752efd198e2e0024e32c05b2181a 100644 --- a/src/cpu/kernels/CpuConcatenateBatchKernel.cpp +++ b/src/cpu/kernels/CpuConcatenateBatchKernel.cpp @@ -30,10 +30,11 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { @@ -50,13 +51,14 @@ void batch_concat(const ITensor *src, ITensor *dst, unsigned int batch_offset, c uint8_t *src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); // Offset dst - uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + batch_offset * dst->info()->strides_in_bytes()[3]; + uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + + batch_offset * dst->info()->strides_in_bytes()[3]; const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); const int window_step_x = 16 / dst->info()->element_size(); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); win.set(3, Window::Dimension(0, src->info()->tensor_shape()[3], 1)); @@ -66,66 +68,74 @@ void batch_concat(const ITensor *src, ITensor *dst, unsigned int batch_offset, c const DataType dt = src->info()->data_type(); const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); const UniformQuantizationInfo dst_qinfo = dst->info()->quantization_info().uniform(); - if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) + if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); - const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - wrapper::vstore(out_ptr, vquantize(vdequantize(wrapper::vloadq(in_ptr), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); + const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); + const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr, vquantize(vdequantize(wrapper::vloadq(in_ptr), src_qinfo), dst_qinfo)); + } + + // Compute left-over 
elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); } - else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) + else if (dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); - const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr, vquantize_signed(vdequantize(wrapper::vloadq(in_ptr), src_qinfo), dst_qinfo)); - } - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - *(out_ptr + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); + const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); + const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr, + vquantize_signed(vdequantize(wrapper::vloadq(in_ptr), src_qinfo), dst_qinfo)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = + quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); - const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - *(out_ptr + x) = *(in_ptr + x); - } - }, - src_it, dst_it); + const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); + const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = *(in_ptr + x); + } + }, + src_it, dst_it); } } @@ -154,7 +164,7 @@ void CpuConcatenateBatchKernel::configure(const ITensorInfo *src, unsigned int b _func = nullptr; _batch_offset = batch_offset; - switch(src->data_type()) + switch (src->data_type()) { case DataType::S8: case DataType::U8: @@ -196,9 +206,7 @@ void CpuConcatenateBatchKernel::run_op(ITensorPack &tensors, const Window &windo ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); ARM_COMPUTE_ERROR_ON(_func == nullptr); - (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC), - tensors.get_tensor(TensorType::ACL_DST), - _batch_offset, + (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC), tensors.get_tensor(TensorType::ACL_DST), _batch_offset, window); } diff --git a/src/cpu/kernels/CpuConcatenateBatchKernel.h b/src/cpu/kernels/CpuConcatenateBatchKernel.h index 0de68a5d64201892768936f40b985a6e2a12b4b0..52ea553a7d2c3459f6b1b32d6ac47ff3afd9375d 100644 --- a/src/cpu/kernels/CpuConcatenateBatchKernel.h +++ b/src/cpu/kernels/CpuConcatenateBatchKernel.h @@ -57,15 +57,15 @@ public: static Status validate(const 
ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: using BatchConcatFunction = void(const ITensor *, ITensor *, unsigned int, const Window &); private: - BatchConcatFunction *_func{ nullptr }; - unsigned int _batch_offset{ 0 }; + BatchConcatFunction *_func{nullptr}; + unsigned int _batch_offset{0}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuConcatenateDepthKernel.cpp b/src/cpu/kernels/CpuConcatenateDepthKernel.cpp index ebc5322aeed354a6e6e4c0ba3359cc6431f40845..c75e1e447727bcb6f06258254cd0f87fe0f6d2a4 100644 --- a/src/cpu/kernels/CpuConcatenateDepthKernel.cpp +++ b/src/cpu/kernels/CpuConcatenateDepthKernel.cpp @@ -30,11 +30,12 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/NEFixedPoint.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" #include @@ -53,13 +54,14 @@ void depth_concat(const ITensor *src, ITensor *dst, unsigned int depth_offset, c uint8_t *src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); // Offset destination - uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + depth_offset * dst->info()->strides_in_bytes()[2]; + uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + + depth_offset * dst->info()->strides_in_bytes()[2]; const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); const int window_step_x = 16 / dst->info()->element_size(); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); win.set(Window::DimZ, Window::Dimension(0, src->info()->tensor_shape().z(), 1)); @@ -69,64 +71,73 @@ void depth_concat(const ITensor *src, ITensor *dst, unsigned int depth_offset, c const DataType dt = src->info()->data_type(); const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); const UniformQuantizationInfo dst_qinfo = dst->info()->quantization_info().uniform(); - if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) + if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); - const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - wrapper::vstore(out_ptr + x, vquantize(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); + const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); + const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, + 
vquantize(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); } - else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) + else if (dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); - const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - wrapper::vstore(out_ptr + x, vquantize_signed(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); + const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); + const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, + vquantize_signed(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = + quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); - const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); - } - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - *(out_ptr + x) = *(in_ptr + x); - } - }, - src_it, dst_it); + const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); + const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = *(in_ptr + x); + } + }, + src_it, dst_it); } } @@ -134,7 +145,8 @@ Status validate_arguments(const ITensorInfo *input, unsigned int depth_offset, c { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions. 
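When the source and destination quantization infos differ, the concatenation kernels re-quantize element by element in the leftover loop: dequantize with the source parameters, then quantize with the destination parameters. A scalar sketch of that round trip, using a stand-in QuantParams struct rather than the library's UniformQuantizationInfo and a plain round-to-nearest:

#include <algorithm>
#include <cmath>
#include <cstdint>

struct QuantParams
{
    float   scale;
    int32_t offset;
};

uint8_t requantize_qasymm8(uint8_t in, const QuantParams &src, const QuantParams &dst)
{
    // Dequantize with the source parameters...
    const float real = src.scale * (static_cast<int32_t>(in) - src.offset);
    // ...then quantize with the destination parameters, saturating to [0, 255].
    const int32_t q = static_cast<int32_t>(std::lround(real / dst.scale)) + dst.offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}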
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimX) != output->dimension(Window::DimX)); @@ -154,7 +166,7 @@ void CpuConcatenateDepthKernel::configure(const ITensorInfo *src, unsigned int d _func = nullptr; _depth_offset = depth_offset; - switch(src->data_type()) + switch (src->data_type()) { case DataType::QASYMM8: _func = &depth_concat; @@ -192,9 +204,7 @@ void CpuConcatenateDepthKernel::run_op(ITensorPack &tensors, const Window &windo ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); ARM_COMPUTE_ERROR_ON(_func == nullptr); - (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC), - tensors.get_tensor(TensorType::ACL_DST), - _depth_offset, + (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC), tensors.get_tensor(TensorType::ACL_DST), _depth_offset, window); } diff --git a/src/cpu/kernels/CpuConcatenateDepthKernel.h b/src/cpu/kernels/CpuConcatenateDepthKernel.h index 5a0edb95bb25871eb2a3ca4cf61d165ed160cdb9..54de9aff46b86e7c5370056844be86360f115563 100644 --- a/src/cpu/kernels/CpuConcatenateDepthKernel.h +++ b/src/cpu/kernels/CpuConcatenateDepthKernel.h @@ -65,15 +65,15 @@ public: static Status validate(const ITensorInfo *src, unsigned int depth_offset, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: using DepthConcatFunction = void(const ITensor *, ITensor *, unsigned int, const Window &); private: - DepthConcatFunction *_func{ nullptr }; - unsigned int _depth_offset{ 0 }; + DepthConcatFunction *_func{nullptr}; + unsigned int _depth_offset{0}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuConcatenateHeightKernel.cpp b/src/cpu/kernels/CpuConcatenateHeightKernel.cpp index 47a2b44443f174bda38bd7f6ecfb7ffa1c8614c6..b6c11d948b055a6d19d92b166a3d191f7daf3024 100644 --- a/src/cpu/kernels/CpuConcatenateHeightKernel.cpp +++ b/src/cpu/kernels/CpuConcatenateHeightKernel.cpp @@ -30,10 +30,11 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" #include @@ -53,7 +54,7 @@ Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, co ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX)); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) + height_offset > dst->dimension(Window::DimY)); - for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = 2; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i)); } @@ -91,13 +92,14 @@ void CpuConcatenateHeightKernel::run_op(ITensorPack &tensors, const Window &wind auto dst = tensors.get_tensor(TensorType::ACL_DST); // 
Offset destination pointer to the correct position - uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + _height_offset * dst->info()->strides_in_bytes()[Window::DimY]; + uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + + _height_offset * dst->info()->strides_in_bytes()[Window::DimY]; const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()) * static_cast(dst->info()->element_size()); const int window_step_x = 16; - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); win.set(Window::DimY, Window::Dimension(0, src->info()->tensor_shape().y(), 1)); @@ -108,64 +110,74 @@ void CpuConcatenateHeightKernel::run_op(ITensorPack &tensors, const Window &wind const DataType dt = src->info()->data_type(); const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform(); const UniformQuantizationInfo &dst_qinfo = dst->info()->quantization_info().uniform(); - if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) + if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) { - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - vst1q_u8(dst_ptr + dst_it.offset() + x, vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo); - } - - }, - src_it, dst_it); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + vst1q_u8(dst_ptr + dst_it.offset() + x, + vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + dst_it.offset() + x) = + quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); } - else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) + else if (dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) { - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - vst1q_s8(reinterpret_cast(dst_ptr + dst_it.offset() + x), - vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast(src_it.ptr()) + x), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + vst1q_s8( + reinterpret_cast(dst_ptr + dst_it.offset() + x), + vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast(src_it.ptr()) + x), src_qinfo), + dst_qinfo)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + dst_it.offset() + x) = + quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = src_it.ptr(); - const auto out_ptr = dst_ptr + dst_it.offset(); - - int x = 
window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - *(out_ptr + x) = *(in_ptr + x); - } - }, - src_it, dst_it); + const auto in_ptr = src_it.ptr(); + const auto out_ptr = dst_ptr + dst_it.offset(); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = *(in_ptr + x); + } + }, + src_it, dst_it); } } diff --git a/src/cpu/kernels/CpuConcatenateHeightKernel.h b/src/cpu/kernels/CpuConcatenateHeightKernel.h index 74d5d0c2c38a8b9fb4fb1a678fca01bc3fad2e4c..df880c4878e5a040547677e5c92ebf9e2c515b7a 100644 --- a/src/cpu/kernels/CpuConcatenateHeightKernel.h +++ b/src/cpu/kernels/CpuConcatenateHeightKernel.h @@ -58,11 +58,11 @@ public: static Status validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: - unsigned int _height_offset{ 0 }; + unsigned int _height_offset{0}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuConcatenateWidthKernel.cpp b/src/cpu/kernels/CpuConcatenateWidthKernel.cpp index f00b37a01bfd6ce7ed72666ad08dc6213452efa2..f6100cccca9375ae6e05d883d6da004ef4a0451a 100644 --- a/src/cpu/kernels/CpuConcatenateWidthKernel.cpp +++ b/src/cpu/kernels/CpuConcatenateWidthKernel.cpp @@ -24,12 +24,12 @@ #include "src/cpu/kernels/CpuConcatenateWidthKernel.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/Steps.h" #include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Steps.h" #include "arm_compute/core/Validate.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" namespace arm_compute { @@ -47,7 +47,7 @@ Status validate_arguments(const ITensorInfo *src, unsigned int width_offset, con ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) + width_offset > dst->dimension(0)); - for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i)); } @@ -86,13 +86,14 @@ void CpuConcatenateWidthKernel::run_op(ITensorPack &tensors, const Window &windo auto dst = tensors.get_tensor(TensorType::ACL_DST); // Offset output pointer to the correct position - uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + _width_offset * dst->info()->strides_in_bytes()[0]; + uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + + _width_offset * dst->info()->strides_in_bytes()[0]; const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()) * static_cast(dst->info()->element_size()); constexpr int window_step_x = 16; - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); // Create iterators @@ -101,62 +102,73 @@ void 
CpuConcatenateWidthKernel::run_op(ITensorPack &tensors, const Window &windo const DataType dt = src->info()->data_type(); const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform(); const UniformQuantizationInfo &dst_qinfo = dst->info()->quantization_info().uniform(); - if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) + if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) { - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - vst1q_u8(dst_ptr + dst_it.offset() + x, vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + vst1q_u8(dst_ptr + dst_it.offset() + x, + vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + dst_it.offset() + x) = + quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); } - else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) + else if (dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) { - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - vst1q_s8(reinterpret_cast(dst_ptr + dst_it.offset() + x), - vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast(src_it.ptr() + x)), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + vst1q_s8( + reinterpret_cast(dst_ptr + dst_it.offset() + x), + vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast(src_it.ptr() + x)), src_qinfo), + dst_qinfo)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + dst_it.offset() + x) = + quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = src_it.ptr(); - const auto out_ptr = dst_ptr + dst_it.offset(); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - *(out_ptr + x) = *(in_ptr + x); - } - }, - src_it, dst_it); + const auto in_ptr = src_it.ptr(); + const auto out_ptr = dst_ptr + dst_it.offset(); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = *(in_ptr + x); + } + }, + src_it, dst_it); } } diff --git 
a/src/cpu/kernels/CpuConcatenateWidthKernel.h b/src/cpu/kernels/CpuConcatenateWidthKernel.h index 418bc51b33eb03ee806787fea1900aad3f4931e6..560e44e35ac3b847ea57b1b71b247e630429fd0b 100644 --- a/src/cpu/kernels/CpuConcatenateWidthKernel.h +++ b/src/cpu/kernels/CpuConcatenateWidthKernel.h @@ -58,11 +58,11 @@ public: static Status validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: - unsigned int _width_offset{ 0 }; + unsigned int _width_offset{0}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp index 08b39deef218f32c51e5d8f23fa9f949d1bf6bea..87703ec63176f6d747dbb60ed7227512f2a8532b 100644 --- a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp +++ b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Types.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -34,8 +35,10 @@ namespace cpu { namespace kernels { -void CpuConvertFullyConnectedWeightsKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_input_shape, - DataLayout data_layout) +void CpuConvertFullyConnectedWeightsKernel::configure(const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_input_shape, + DataLayout data_layout) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -43,7 +46,8 @@ void CpuConvertFullyConnectedWeightsKernel::configure(const ITensorInfo *src, IT // Output tensor auto initialisation if not yet initialized auto_init_if_empty(*dst, *src->clone()); - ARM_COMPUTE_ERROR_THROW_ON(CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_input_shape, data_layout)); + ARM_COMPUTE_ERROR_THROW_ON( + CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_input_shape, data_layout)); const DataLayout input_data_layout = (data_layout == DataLayout::NCHW) ? 
DataLayout::NHWC : DataLayout::NCHW; @@ -62,8 +66,10 @@ void CpuConvertFullyConnectedWeightsKernel::configure(const ITensorInfo *src, IT ICpuKernel::configure(win); } -Status CpuConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_input_shape, - DataLayout data_layout) +Status CpuConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_input_shape, + DataLayout data_layout) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); @@ -72,7 +78,7 @@ Status CpuConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, c ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN); // Checks performed when dst is configured - if((dst != nullptr) && (dst->total_size() != 0)) + if ((dst != nullptr) && (dst->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); @@ -97,11 +103,15 @@ void CpuConvertFullyConnectedWeightsKernel::run_op(ITensorPack &tensors, const W Iterator input(src, window); Iterator output(dst, window); - execute_window_loop(window, [&](const Coordinates & id) - { - memcpy(output.ptr() + id.x() * dst_stride_x + (id.y() % _factor1 * _factor2 + id.y() / _factor1) * dst_stride_y, input.ptr(), element_size); - }, - input); + execute_window_loop( + window, + [&](const Coordinates &id) + { + memcpy(output.ptr() + id.x() * dst_stride_x + + (id.y() % _factor1 * _factor2 + id.y() / _factor1) * dst_stride_y, + input.ptr(), element_size); + }, + input); } const char *CpuConvertFullyConnectedWeightsKernel::name() const diff --git a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h index 9a1393323b29462f15e06e6e22611a21213514e4..2253889e6935cd8125aa0f2694faaedde70be5ad 100644 --- a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h +++ b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h @@ -53,24 +53,32 @@ public: * @param[in] original_input_shape Shape of the original src tensor (the one entering fully connected layer). * @param[in] data_layout The data layout the weights have been trained in. 
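The run_op above permutes fully-connected weights between layouts by remapping each source row y to destination row (y % _factor1) * _factor2 + y / _factor1, while the x coordinate is copied through unchanged. A small sketch of that index remap on a flat array, where factor1 and factor2 stand in for the kernel's _factor1/_factor2 (per-plane element count and channel count, so factor1 * factor2 should equal num_rows):

#include <cstddef>
#include <vector>

std::vector<float> convert_fc_weight_rows(const std::vector<float> &src,
                                          size_t num_rows, size_t row_len,
                                          size_t factor1, size_t factor2)
{
    std::vector<float> dst(src.size());
    for (size_t y = 0; y < num_rows; ++y)
    {
        // Same formula as the kernel's destination offset computation.
        const size_t new_y = (y % factor1) * factor2 + y / factor1;
        for (size_t x = 0; x < row_len; ++x)
        {
            dst[new_y * row_len + x] = src[y * row_len + x];
        }
    }
    return dst;
}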
*/ - void configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_input_shape, DataLayout data_layout); + void configure(const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_input_shape, + DataLayout data_layout); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuConvertFullyConnectedWeightsKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_input_shape, DataLayout data_layout); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_input_shape, + DataLayout data_layout); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: - unsigned int _factor1{ 0 }; /* equals to the number of elements per original src plane if @p data_layout == NCHW; its number of channels otherwise */ - unsigned int _factor2{ 0 }; /* equals to the number of elements per original src plane if @p data_layout == NHWC; its number of channels otherwise */ + unsigned int _factor1{ + 0}; /* equals to the number of elements per original src plane if @p data_layout == NCHW; its number of channels otherwise */ + unsigned int _factor2{ + 0}; /* equals to the number of elements per original src plane if @p data_layout == NHWC; its number of channels otherwise */ }; } // namespace kernels } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H */ diff --git a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp index 1005d001ab1be083906d28b506ea486811488f98..745b1566c22cf8a198bc8c96e4f7866809222334 100644 --- a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp +++ b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp @@ -29,9 +29,10 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { @@ -47,7 +48,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); // Validate output if initialized - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst->tensor_shape()); @@ -60,11 +61,11 @@ std::pair validate_and_configure_window(const ITensorInfo *src, { // Output auto inizialitation if not yet initialized { - const bool is_input_signed = src->data_type() == DataType::QASYMM8_SIGNED; - const DataType dt = is_input_signed ? DataType::QASYMM8 : DataType::QASYMM8_SIGNED; - const UniformQuantizationInfo qinfo = src->quantization_info().uniform(); + const bool is_input_signed = src->data_type() == DataType::QASYMM8_SIGNED; + const DataType dt = is_input_signed ? 
DataType::QASYMM8 : DataType::QASYMM8_SIGNED; + const UniformQuantizationInfo qinfo = src->quantization_info().uniform(); const int offset_correction = is_input_signed ? -128 : 128; - const QuantizationInfo corrected_qinfo = QuantizationInfo(qinfo.scale, qinfo.offset + offset_correction); + const QuantizationInfo corrected_qinfo = QuantizationInfo(qinfo.scale, qinfo.offset + offset_correction); auto_init_if_empty(*dst, src->clone()->set_data_type(dt).set_quantization_info(corrected_qinfo)); } @@ -110,27 +111,29 @@ void CpuConvertQuantizedSignednessKernel::run_op(ITensorPack &tensors, const Win const uint8_t mask = 128; const auto vmask = wrapper::vdup_n(mask, wrapper::traits::vector_128_tag{}); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - wrapper::vstore(output_ptr + x, wrapper::veor(vin, vmask)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - const uint8_t in = *(reinterpret_cast(input_ptr + x)); - *(output_ptr + x) = in ^ mask; - } - }, - input, output); + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(input_ptr + x); + wrapper::vstore(output_ptr + x, wrapper::veor(vin, vmask)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const uint8_t in = *(reinterpret_cast(input_ptr + x)); + *(output_ptr + x) = in ^ mask; + } + }, + input, output); } const char *CpuConvertQuantizedSignednessKernel::name() const diff --git a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h index b5eaf654873d64c89b95349a3ac52d59d5cb9a32..e94d3d5ef24ccbfe565b9c32fd2499f0c426a6ed 100644 --- a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h +++ b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h @@ -54,7 +54,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuCopyKernel.cpp b/src/cpu/kernels/CpuCopyKernel.cpp index 3f0f3fe422fd5b9314a3fec7b5e37dad83be186a..1b693d7a3a4e5c07f06b0d2c7f06aea1dae9e847 100644 --- a/src/cpu/kernels/CpuCopyKernel.cpp +++ b/src/cpu/kernels/CpuCopyKernel.cpp @@ -27,9 +27,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -48,9 +49,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 4); // 
Validate destination if initialized - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_padded_shape(src->tensor_shape(), padding), dst->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + misc::shape_calculator::compute_padded_shape(src->tensor_shape(), padding), dst->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); } @@ -64,7 +66,8 @@ std::pair validate_and_configure_window(const ITensorInfo *src, return std::make_pair(Status{}, calculate_max_window(*dst)); } -std::pair validate_and_configure_window_with_padding(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding) +std::pair +validate_and_configure_window_with_padding(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding) { const TensorShape src_shape = src->tensor_shape(); const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(src_shape, padding); @@ -84,7 +87,7 @@ void CpuCopyKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Pa _padding = padding; std::pair win_config; - if(padding.empty()) + if (padding.empty()) { win_config = validate_and_configure_window(src, dst); } @@ -97,17 +100,20 @@ void CpuCopyKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Pa ICpuKernel::configure(win_config.second); } -Status CpuCopyKernel::validate(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst, const PaddingList &padding) +Status CpuCopyKernel::validate(const arm_compute::ITensorInfo *src, + const arm_compute::ITensorInfo *dst, + const PaddingList &padding) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, padding)); - if(padding.empty()) + if (padding.empty()) { ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first); } else { - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_with_padding(src->clone().get(), dst->clone().get(), padding).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_with_padding(src->clone().get(), dst->clone().get(), padding).first); } return Status{}; @@ -122,38 +128,41 @@ void CpuCopyKernel::run_op(ITensorPack &tensors, const Window &window, const Thr const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); auto dst = tensors.get_tensor(TensorType::ACL_DST); - if(_padding.empty()) + if (_padding.empty()) { - Window dst_window{ window }; - dst_window.set(Window::DimX, Window::Dimension(dst_window.x().start(), dst_window.x().end(), src->info()->dimension(0))); + Window dst_window{window}; + dst_window.set(Window::DimX, + Window::Dimension(dst_window.x().start(), dst_window.x().end(), src->info()->dimension(0))); Window out_slice = dst_window.first_slice_window_1D(); do { Iterator src_it(src, out_slice); Iterator dst_it(dst, out_slice); - execute_window_loop(out_slice, [&](const Coordinates &) - { - memcpy(dst_it.ptr(), src_it.ptr(), dst->info()->dimension(0) * dst->info()->element_size()); - }, - src_it, dst_it); - } - while(dst_window.slide_window_slice_1D(out_slice)); + execute_window_loop( + out_slice, + [&](const Coordinates &) + { memcpy(dst_it.ptr(), src_it.ptr(), dst->info()->dimension(0) * dst->info()->element_size()); }, + src_it, dst_it); + } while (dst_window.slide_window_slice_1D(out_slice)); } else { - Window src_window{ window }; - src_window.set(Window::DimX, Window::Dimension(0, window.x().end() - _padding[0].first, src->info()->dimension(0))); + Window 
src_window{window}; + src_window.set(Window::DimX, + Window::Dimension(0, window.x().end() - _padding[0].first, src->info()->dimension(0))); Iterator src_it(src, src_window); Iterator dst_it(dst, window); const size_t row_size_in_bytes = src->info()->dimension(0) * src->info()->element_size(); - execute_window_loop(window, [&](const Coordinates &) - { - auto dst_ptr = dst_it.ptr() + _padding[0].first * dst->info()->element_size(); - std::memcpy(dst_ptr, src_it.ptr(), row_size_in_bytes); - }, - src_it, dst_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + auto dst_ptr = dst_it.ptr() + _padding[0].first * dst->info()->element_size(); + std::memcpy(dst_ptr, src_it.ptr(), row_size_in_bytes); + }, + src_it, dst_it); } } diff --git a/src/cpu/kernels/CpuCopyKernel.h b/src/cpu/kernels/CpuCopyKernel.h index c9ef8eba7636e639fcae0746cd5cde2e3cd61b46..a05053f07eacd3a8f6269ef6edbc8a1aa648fc14 100644 --- a/src/cpu/kernels/CpuCopyKernel.h +++ b/src/cpu/kernels/CpuCopyKernel.h @@ -55,7 +55,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PaddingList &padding = PaddingList()); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: diff --git a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp index d6c56d2012ab5fe1cf1f95ab21c668f8908a192b..82e3a5ce008ea07d344267e3537be5cb008cf351 100644 --- a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp +++ b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp @@ -26,11 +26,12 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/wrapper/traits.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/traits.h" #include "src/cpu/kernels/depthwiseconv2d/list.h" namespace arm_compute @@ -41,72 +42,53 @@ namespace kernels { namespace { -static const std::vector available_kernels = -{ - { - "neon_qu8_deptwiseconv2dnative", - [](const DepthwiseConv2dNativeDataTypeISASelectorData & data) - { - return (data.weights_dt == DataType::QASYMM8); - }, - REGISTER_QASYMM8_NEON(neon_qu8_deptwiseconv2dnative) - }, - { - "neon_qs8_deptwiseconv2dnative", - [](const DepthwiseConv2dNativeDataTypeISASelectorData & data) - { - return (data.weights_dt == DataType::QASYMM8_SIGNED); - }, - REGISTER_QASYMM8_SIGNED_NEON(neon_qs8_deptwiseconv2dnative) - }, - { - "neon_fp16_deptwiseconv2dnative", - [](const DepthwiseConv2dNativeDataTypeISASelectorData & data) - { - return (data.weights_dt == DataType::F16 && data.isa.fp16); - }, - REGISTER_FP16_NEON(neon_fp16_deptwiseconv2dnative) - }, - { - "neon_fp32_deptwiseconv2dnative", - [](const DepthwiseConv2dNativeDataTypeISASelectorData & data) - { - return (data.weights_dt == DataType::F32); - }, - REGISTER_FP32_NEON(neon_fp32_deptwiseconv2dnative) - }, - { - "neon_qp8_qu8_deptwiseconv2dnative", - [](const DepthwiseConv2dNativeDataTypeISASelectorData & data) - { - return (data.weights_dt == DataType::QSYMM8_PER_CHANNEL && data.source_dt == DataType::QASYMM8); - }, - REGISTER_QASYMM8_NEON(neon_qp8_qu8_deptwiseconv2dnative) - }, - { - 
"neon_qp8_qs8_deptwiseconv2dnative", - [](const DepthwiseConv2dNativeDataTypeISASelectorData & data) - { - return (data.weights_dt == DataType::QSYMM8_PER_CHANNEL && data.source_dt != DataType::QASYMM8); - }, - REGISTER_QASYMM8_SIGNED_NEON(neon_qp8_qs8_deptwiseconv2dnative) - }, +static const std::vector available_kernels = { + {"neon_qu8_deptwiseconv2dnative", + [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) { return (data.weights_dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(neon_qu8_deptwiseconv2dnative)}, + {"neon_qs8_deptwiseconv2dnative", + [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) + { return (data.weights_dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(neon_qs8_deptwiseconv2dnative)}, + {"neon_fp16_deptwiseconv2dnative", + [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) + { return (data.weights_dt == DataType::F16 && data.isa.fp16); }, + REGISTER_FP16_NEON(neon_fp16_deptwiseconv2dnative)}, + {"neon_fp32_deptwiseconv2dnative", + [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) { return (data.weights_dt == DataType::F32); }, + REGISTER_FP32_NEON(neon_fp32_deptwiseconv2dnative)}, + {"neon_qp8_qu8_deptwiseconv2dnative", + [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) + { return (data.weights_dt == DataType::QSYMM8_PER_CHANNEL && data.source_dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(neon_qp8_qu8_deptwiseconv2dnative)}, + {"neon_qp8_qs8_deptwiseconv2dnative", + [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) + { return (data.weights_dt == DataType::QSYMM8_PER_CHANNEL && data.source_dt != DataType::QASYMM8); }, + REGISTER_QASYMM8_SIGNED_NEON(neon_qp8_qs8_deptwiseconv2dnative)}, }; -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(info.depth_multiplier == 0); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) > src->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right()); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) > src->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom()); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) > + src->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right()); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) > + src->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom()); ARM_COMPUTE_RETURN_ERROR_ON((src->dimension(0) * info.depth_multiplier) != weights->dimension(0)); ARM_COMPUTE_RETURN_ERROR_ON((info.dilation.x() < 1) || (info.dilation.y() < 
1)); - ARM_COMPUTE_RETURN_ERROR_ON((info.pad_stride_info.stride().first < 1) || (info.pad_stride_info.stride().second < 1)); + ARM_COMPUTE_RETURN_ERROR_ON((info.pad_stride_info.stride().first < 1) || + (info.pad_stride_info.stride().second < 1)); - if(is_data_type_quantized_per_channel(weights->data_type())) + if (is_data_type_quantized_per_channel(weights->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size()); @@ -116,12 +98,12 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); } - if(biases != nullptr) + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0)); - if(is_data_type_quantized_asymmetric(src->data_type())) + if (is_data_type_quantized_asymmetric(src->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -131,9 +113,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co } } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); + const TensorShape output_shape = + misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); } @@ -142,7 +125,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co } } // namespace -void CpuDepthwiseConv2dNativeKernel::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info) +void CpuDepthwiseConv2dNativeKernel::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, (biases != nullptr) ? 
biases : nullptr, dst, info)); @@ -151,18 +138,26 @@ void CpuDepthwiseConv2dNativeKernel::configure(const ITensorInfo *src, const ITe _conv_info = info; const auto uk = CpuDepthwiseConv2dNativeKernel::get_implementation( - DepthwiseConv2dNativeDataTypeISASelectorData{ weights->data_type(), src->data_type(), CPUInfo::get().get_isa() }); + DepthwiseConv2dNativeDataTypeISASelectorData{weights->data_type(), src->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); _func = uk->ukernel; const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); - auto_init_if_empty(*dst, src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(dst->quantization_info())); + auto_init_if_empty(*dst, src->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(output_shape) + .set_quantization_info(dst->quantization_info())); Window win = calculate_max_window(*dst, Steps()); ICpuKernel::configure(win); } -Status CpuDepthwiseConv2dNativeKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info) +Status CpuDepthwiseConv2dNativeKernel::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, info)); return Status{}; @@ -187,7 +182,8 @@ const char *CpuDepthwiseConv2dNativeKernel::name() const return "CpuDepthwiseConv2dNativeKernel"; } -const std::vector &CpuDepthwiseConv2dNativeKernel::get_available_kernels() +const std::vector & +CpuDepthwiseConv2dNativeKernel::get_available_kernels() { return available_kernels; } diff --git a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h index 9fabd0b01cf24a353830da9b8fcc5494df8d62b9..7e78f52e133ff9fe4666ad5a4c08a0044a2e31d7 100644 --- a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h +++ b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h @@ -26,6 +26,7 @@ #include "arm_compute/core/utils/misc/Traits.h" #include "arm_compute/function_info/ConvolutionInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" #include "support/AclRequires.h" @@ -44,8 +45,9 @@ namespace kernels class CpuDepthwiseConv2dNativeKernel : public ICpuKernel { private: - using DepthwiseConv2dNativeKernelPtr = - std::add_pointer::type; + using DepthwiseConv2dNativeKernelPtr = std::add_pointer:: + type; public: CpuDepthwiseConv2dNativeKernel() = default; @@ -64,17 +66,25 @@ public: * @param[in] info Depthwise convolution meta-data. 
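The reformatted available_kernels table above pairs each micro-kernel with a name and a selector predicate, and configure() takes the first entry whose predicate matches the data types and ISA at hand. A generic sketch of that first-match lookup, in the spirit of get_implementation() but with simplified stand-in types instead of DepthwiseConv2dNativeDataTypeISASelectorData:

#include <functional>
#include <string>
#include <vector>

enum class DataType { QASYMM8, QASYMM8_SIGNED, F16, F32 };

struct SelectorData
{
    DataType weights_dt;
    bool     fp16_supported;
};

using KernelFn = void (*)();

struct MicroKernel
{
    std::string                               name;
    std::function<bool(const SelectorData &)> is_selected;
    KernelFn                                  ukernel;
};

// Walk the table and return the first kernel whose selector accepts the data.
const MicroKernel *select_kernel(const std::vector<MicroKernel> &table, const SelectorData &data)
{
    for (const auto &k : table)
    {
        if (k.is_selected(data))
        {
            return &k;
        }
    }
    return nullptr;
}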
* */ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info); + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuDepthwiseConv2dNativeKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct DepthwiseConv2dNativeKernel { @@ -89,9 +99,9 @@ private: * * @param[in] window Region on which to execute the kernel. */ - DepthwiseConv2dNativeKernelPtr _func{ nullptr }; + DepthwiseConv2dNativeKernelPtr _func{nullptr}; ConvolutionInfo _conv_info{}; - bool _has_biases{ false }; + bool _has_biases{false}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuDequantizeKernel.cpp b/src/cpu/kernels/CpuDequantizeKernel.cpp index a2d24f9243aa3d2144a994248671b441c9b44932..d17128b5ac28a3242b1aaf71118b59d8bf9b0a35 100644 --- a/src/cpu/kernels/CpuDequantizeKernel.cpp +++ b/src/cpu/kernels/CpuDequantizeKernel.cpp @@ -28,12 +28,13 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/NESymm.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" #include @@ -48,9 +49,11 @@ namespace Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, + DataType::QSYMM16); - if(dst->tensor_shape().total_size() > 0) + if (dst->tensor_shape().total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32); @@ -124,28 +127,30 @@ void run_dequantization_qasymm8(const ITensor *input, ITensor *output, const Win Iterator in(input, win_collapsed); Iterator out(output, win_collapsed); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize(vin, scale, offset); + const auto in_ptr = 
reinterpret_cast(in.ptr()); + const auto out_ptr = reinterpret_cast(out.ptr()); - store_result(reinterpret_cast(out_ptr + x), vdeq); - } + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize(vin, scale, offset); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - auto val = *(in_ptr + x); - *(out_ptr + x) = static_cast(Qasymm8QuantizationHelper::dequantize(val, qinfo)); - } - }, - in, out); + store_result(reinterpret_cast(out_ptr + x), vdeq); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + auto val = *(in_ptr + x); + *(out_ptr + x) = static_cast(Qasymm8QuantizationHelper::dequantize(val, qinfo)); + } + }, + in, out); } template @@ -165,28 +170,30 @@ void run_dequantization_qsymm8_per_channel_nchw(const ITensor *input, ITensor *o Iterator in(input, win); Iterator out(output, win); - execute_window_loop(win, [&](const Coordinates & id) - { - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &id) { - const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize(vin, scale[id.z()]); + const auto in_ptr = reinterpret_cast(in.ptr()); + const auto out_ptr = reinterpret_cast(out.ptr()); - store_result(reinterpret_cast(out_ptr + x), vdeq); - } + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize(vin, scale[id.z()]); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int8_t val = *(in_ptr + x); - *(out_ptr + x) = static_cast(dequantize(val, scale[id.z()])); - } - }, - in, out); + store_result(reinterpret_cast(out_ptr + x), vdeq); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int8_t val = *(in_ptr + x); + *(out_ptr + x) = static_cast(dequantize(val, scale[id.z()])); + } + }, + in, out); } template @@ -206,37 +213,34 @@ void run_dequantization_qsymm8_per_channel_nhwc(const ITensor *input, ITensor *o Iterator in(input, win); Iterator out(output, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float32x4x4_t vscale = + const auto in_ptr = reinterpret_cast(in.ptr()); + const auto out_ptr = reinterpret_cast(out.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - scale[x + 0], scale[x + 1], scale[x + 2], scale[x + 3], - scale[x + 4], scale[x + 5], scale[x + 6], scale[x + 7], - scale[x + 8], scale[x + 9], scale[x + 10], scale[x + 11], - scale[x + 12], scale[x + 13], scale[x + 14], scale[x + 15] - } - }; - const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize(vin, vscale); - - store_result(reinterpret_cast(out_ptr + x), vdeq); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int8_t val = *(in_ptr + x); - *(out_ptr + x) = static_cast(dequantize(val, scale[x])); - } - }, - in, out); + const float32x4x4_t vscale = {{scale[x + 0], scale[x + 1], scale[x + 2], scale[x + 3], scale[x + 4], + scale[x 
+ 5], scale[x + 6], scale[x + 7], scale[x + 8], scale[x + 9], + scale[x + 10], scale[x + 11], scale[x + 12], scale[x + 13], + scale[x + 14], scale[x + 15]}}; + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize(vin, vscale); + + store_result(reinterpret_cast(out_ptr + x), vdeq); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int8_t val = *(in_ptr + x); + *(out_ptr + x) = static_cast(dequantize(val, scale[x])); + } + }, + in, out); } template @@ -257,28 +261,30 @@ void run_dequantization_qsymm8(const ITensor *input, ITensor *output, const Wind Iterator in(input, win_collapsed); Iterator out(output, win_collapsed); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize(vin, scale); + const auto in_ptr = reinterpret_cast(in.ptr()); + const auto out_ptr = reinterpret_cast(out.ptr()); - store_result(reinterpret_cast(out_ptr + x), vdeq); - } + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize(vin, scale); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int8_t val = *(in_ptr + x); - *(out_ptr + x) = static_cast(dequantize(val, scale)); - } - }, - in, out); + store_result(reinterpret_cast(out_ptr + x), vdeq); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int8_t val = *(in_ptr + x); + *(out_ptr + x) = static_cast(dequantize(val, scale)); + } + }, + in, out); } template @@ -299,34 +305,36 @@ void run_dequantization_qsymm16(const ITensor *input, ITensor *output, const Win Iterator in(input, win_collapsed); Iterator out(output, win_collapsed); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize_int16(vin, scale); + const auto in_ptr = reinterpret_cast(in.ptr()); + const auto out_ptr = reinterpret_cast(out.ptr()); - store_result(reinterpret_cast(out_ptr + x), vdeq); - } + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize_int16(vin, scale); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int16_t val = *(in_ptr + x); - *(out_ptr + x) = static_cast(dequantize_qsymm16(val, scale)); - } - }, - in, out); + store_result(reinterpret_cast(out_ptr + x), vdeq); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int16_t val = *(in_ptr + x); + *(out_ptr + x) = static_cast(dequantize_qsymm16(val, scale)); + } + }, + in, out); } template void run_dequantization_core(const ITensor *input, ITensor *output, const Window &window) { - switch(input->info()->data_type()) + switch (input->info()->data_type()) { case DataType::QASYMM8: run_dequantization_qasymm8(input, output, window); @@ -335,7 +343,9 @@ void 
run_dequantization_core(const ITensor *input, ITensor *output, const Window run_dequantization_qasymm8<T, int8_t>(input, output, window); break; case DataType::QSYMM8_PER_CHANNEL: - input->info()->data_layout() == DataLayout::NHWC ? run_dequantization_qsymm8_per_channel_nhwc<T>(input, output, window) : run_dequantization_qsymm8_per_channel_nchw<T>(input, output, window); + input->info()->data_layout() == DataLayout::NHWC + ? run_dequantization_qsymm8_per_channel_nhwc<T>(input, output, window) + : run_dequantization_qsymm8_per_channel_nchw<T>(input, output, window); break; case DataType::QSYMM8: run_dequantization_qsymm8<T>(input, output, window); @@ -377,7 +387,7 @@ void CpuDequantizeKernel::run_op(ITensorPack &tensors, const Window &window, con const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); auto dst = tensors.get_tensor(TensorType::ACL_DST); - switch(dst->info()->data_type()) + switch (dst->info()->data_type()) { case DataType::F32: run_dequantization_core<float>(src, dst, window); diff --git a/src/cpu/kernels/CpuDequantizeKernel.h b/src/cpu/kernels/CpuDequantizeKernel.h index cfa991dc74257af4e448cd8101e1134d734857a9..6ed58587c945785bcc794ab37c909ec7c3290e76 100644 --- a/src/cpu/kernels/CpuDequantizeKernel.h +++ b/src/cpu/kernels/CpuDequantizeKernel.h @@ -54,7 +54,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuDirectConv2dKernel.cpp b/src/cpu/kernels/CpuDirectConv2dKernel.cpp index a4cdddee5e59670eabb54706db6ccb1ec91e9cdb..4cb0fb1c40b4f78115f94e95bc1c6ec4c2998ac9 100644 --- a/src/cpu/kernels/CpuDirectConv2dKernel.cpp +++ b/src/cpu/kernels/CpuDirectConv2dKernel.cpp @@ -22,13 +22,14 @@ * SOFTWARE.
*/ #include "src/cpu/kernels/CpuDirectConv2dKernel.h" -#include "src/cpu/kernels/directconv2d/list.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/directconv2d/list.h" using namespace arm_compute::detail; @@ -38,26 +39,25 @@ namespace cpu { namespace kernels { -static const std::vector available_kernels = -{ - { - "neon_fp32_nhwc_directconv2d", - [](const DataTypeDataLayoutISASelectorData & data) { return data.dt == DataType::F32 && data.dl == DataLayout::NHWC; }, - REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nhwc_directconv2d) - }, - { - "neon_fp32_nchw_directconv2d", - [](const DataTypeDataLayoutISASelectorData & data) { return data.dt == DataType::F32 && data.dl == DataLayout::NCHW; }, - REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nchw_directconv2d) - }, - { - "neon_fp16_nchw_directconv2d", - [](const DataTypeDataLayoutISASelectorData & data) { return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::kernels::neon_fp16_nchw_directconv2d) - }, +static const std::vector available_kernels = { + {"neon_fp32_nhwc_directconv2d", + [](const DataTypeDataLayoutISASelectorData &data) + { return data.dt == DataType::F32 && data.dl == DataLayout::NHWC; }, + REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nhwc_directconv2d)}, + {"neon_fp32_nchw_directconv2d", + [](const DataTypeDataLayoutISASelectorData &data) + { return data.dt == DataType::F32 && data.dl == DataLayout::NCHW; }, + REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nchw_directconv2d)}, + {"neon_fp16_nchw_directconv2d", + [](const DataTypeDataLayoutISASelectorData &data) + { return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::kernels::neon_fp16_nchw_directconv2d)}, }; -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); @@ -76,7 +76,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::NHWC && src->data_type() != DataType::F32); ARM_COMPUTE_UNUSED(width_idx); // Checks performed when output is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); @@ -100,11 +100,15 @@ std::pair validate_and_configure_window(ITensorInfo *src, ITenso // Configure window without any padding win = calculate_max_window(*dst, Steps()); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } -void CpuDirectConv2dKernel::configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info) +void CpuDirectConv2dKernel::configure(ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *dst, + const PadStrideInfo &conv_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); @@ -129,12 +133,13 @@ void CpuDirectConv2dKernel::configure(ITensorInfo *src, ITensorInfo *weights, IT ICpuKernel::configure(win_config.second); } -Status CpuDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info) +Status CpuDirectConv2dKernel::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, dst, conv_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), - dst->clone().get()) - .first); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first); return Status{}; } @@ -149,7 +154,8 @@ void CpuDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, c auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); auto dst = tensors.get_tensor(TensorType::ACL_DST); - const auto *uk = CpuDirectConv2dKernel::get_implementation(DataTypeDataLayoutISASelectorData{ src->info()->data_type(), _data_layout, CPUInfo::get().get_isa() }); + const auto *uk = CpuDirectConv2dKernel::get_implementation( + DataTypeDataLayoutISASelectorData{src->info()->data_type(), _data_layout, CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON(uk == nullptr); uk->ukernel(window, src, weights, dst, _conv_info); diff --git a/src/cpu/kernels/CpuDirectConv2dKernel.h b/src/cpu/kernels/CpuDirectConv2dKernel.h index b9265dc630e0087ffa39d03cebd178a79c04116c..ad4caea19375f41826a479f6e5c030de3d1ddbc0 100644 --- a/src/cpu/kernels/CpuDirectConv2dKernel.h +++ b/src/cpu/kernels/CpuDirectConv2dKernel.h @@ -37,7 +37,8 @@ namespace kernels class CpuDirectConv2dKernel : public ICpuKernel { private: - using DirectConv2dKernel_Ptr = std::add_pointer::type; + using DirectConv2dKernel_Ptr = std::add_pointer::type; public: CpuDirectConv2dKernel() = default; @@ -64,10 +65,13 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct DirectConv2dKernel @@ -81,8 +85,8 @@ public: private: PadStrideInfo _conv_info{}; - unsigned int _kernel_size{ 0 }; - DataLayout _data_layout{ DataLayout::UNKNOWN }; + unsigned int _kernel_size{0}; + DataLayout _data_layout{DataLayout::UNKNOWN}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp index 93ad5e5ebacd3bb64de6ad0ea31bad5d61c67f31..d4af8bedafe5c67fef4d3386298c2db0d12ef14a 100644 --- a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp +++ 
b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp @@ -27,15 +27,16 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/Traits.h" + #include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/NEFixedPoint.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" #include #include @@ -49,7 +50,9 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, const DirectConvolutionLayerOutputStageKernelInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); @@ -57,22 +60,23 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::S32, DataType::F32); - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != src->dimension(get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL))); + ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != src->dimension(get_data_layout_dimension_index( + src->data_layout(), DataLayoutDimension::CHANNEL))); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); } - if(src->data_type() == DataType::S32) + if (src->data_type() == DataType::S32) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst == nullptr, "In-place computation not allowed for quantized output"); } // Checks performed when output is configured - if((dst != nullptr) && (dst->total_size() != 0)) + if ((dst != nullptr) && (dst->total_size() != 0)) { - if(is_data_type_float(src->data_type())) + if (is_data_type_float(src->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); } @@ -82,10 +86,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const } ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); } - else if(src->data_type() == DataType::S32) + else if (src->data_type() == DataType::S32) { // In case of quantized computation and unconfigured output, the output data type must be provided through DirectConvolutionLayerOutputStageKernelInfo - ARM_COMPUTE_RETURN_ERROR_ON((info.output_data_type != DataType::QASYMM8) && (info.output_data_type != DataType::QASYMM8_SIGNED)); + ARM_COMPUTE_RETURN_ERROR_ON((info.output_data_type != DataType::QASYMM8) && + (info.output_data_type != DataType::QASYMM8_SIGNED)); } return Status{}; @@ -93,8 +98,13 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const template typename std::enable_if::value, void>::type -output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) +output_stage_nchw(ITensor *src, + const ITensor *bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift) { const bool has_bias 
= bias != nullptr; /** SIMD vector tag type. */ @@ -113,50 +123,57 @@ output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITens Iterator in(src, win); Iterator out(dst, win); - execute_window_loop(win, [&](const Coordinates & id) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &id) { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()) + x; - auto v_in = wrapper::vloadq(in_ptr); - - // Accumulate bias - if(has_bias) + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const auto vb = wrapper::vdup_n(*reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{}); - v_in = wrapper::vadd(v_in, vb); - } + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast(in.ptr()) + x; + auto v_in = wrapper::vloadq(in_ptr); - const auto out_ptr = reinterpret_cast(out.ptr()) + x; - wrapper::vstore(out_ptr, v_in); - } + // Accumulate bias + if (has_bias) + { + const auto vb = wrapper::vdup_n( + *reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{}); + v_in = wrapper::vadd(v_in, vb); + } - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Get bias and pointer to input - auto s_in = *(reinterpret_cast(in.ptr()) + x); + const auto out_ptr = reinterpret_cast(out.ptr()) + x; + wrapper::vstore(out_ptr, v_in); + } - // Accumulate bias - if(has_bias) + // Left-overs loop + for (; x < window_end_x; ++x) { - const auto b = *reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))); - s_in += b; - } + // Get bias and pointer to input + auto s_in = *(reinterpret_cast(in.ptr()) + x); - *(reinterpret_cast(out.ptr()) + x) = s_in; - } + // Accumulate bias + if (has_bias) + { + const auto b = *reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))); + s_in += b; + } - }, - in, out); + *(reinterpret_cast(out.ptr()) + x) = s_in; + } + }, + in, out); } template typename std::enable_if::value, void>::type -output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) +output_stage_nhwc(ITensor *src, + const ITensor *bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift) { const bool has_bias = bias != nullptr; ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier); @@ -179,50 +196,59 @@ output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITens Iterator bi(bias, window_bias); Iterator out(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()); - auto v_in = wrapper::vloadq(in_ptr + x); - - // Accumulate bias - if(has_bias) + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; - v_in = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr)); - } + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast(in.ptr()); + auto v_in = wrapper::vloadq(in_ptr + x); - const auto out_ptr = reinterpret_cast(out.ptr()); - wrapper::vstore(out_ptr + x, v_in); - } + // Accumulate bias + if (has_bias) + { + const auto bias_ptr = 
reinterpret_cast(bi.ptr()) + x; + v_in = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr)); + } - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Get bias and pointer to input - auto s_in = *(reinterpret_cast(in.ptr()) + x); + const auto out_ptr = reinterpret_cast(out.ptr()); + wrapper::vstore(out_ptr + x, v_in); + } - // Accumulate bias - if(has_bias) + // Left-overs loop + for (; x < window_end_x; ++x) { - const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; - s_in += *bias_ptr; - } + // Get bias and pointer to input + auto s_in = *(reinterpret_cast(in.ptr()) + x); - const auto out_ptr = reinterpret_cast(out.ptr()); - *(out_ptr + x) = s_in; - } - }, - in, bi, out); + // Accumulate bias + if (has_bias) + { + const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; + s_in += *bias_ptr; + } + + const auto out_ptr = reinterpret_cast(out.ptr()); + *(out_ptr + x) = s_in; + } + }, + in, bi, out); } // Quantized case -template < typename TOut, typename std::enable_if < std::is_same::value || std::is_same::value, int >::type = 0 > -void output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) +template < + typename TOut, + typename std::enable_if::value || std::is_same::value, int>::type = 0> +void output_stage_nchw(ITensor *src, + const ITensor *bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift) { const bool has_bias = bias != nullptr; using VectorType = typename wrapper::traits::neon_bitvector_t; @@ -242,67 +268,63 @@ void output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, Iterator in(src, win); Iterator out(dst, win); - execute_window_loop(win, [&](const Coordinates & id) - { - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &id) { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()) + x; - int32x4x4_t v_in = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast(in.ptr()) + x; + int32x4x4_t v_in = {{wrapper::vloadq(in_ptr), wrapper::vloadq(in_ptr + 4), wrapper::vloadq(in_ptr + 8), + wrapper::vloadq(in_ptr + 12)}}; + + // Accumulate bias + if (has_bias) { - wrapper::vloadq(in_ptr), - wrapper::vloadq(in_ptr + 4), - wrapper::vloadq(in_ptr + 8), - wrapper::vloadq(in_ptr + 12) + const auto vb = wrapper::vdup_n( + *reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))), TagType{}); + v_in = {{wrapper::vadd(v_in.val[0], vb), wrapper::vadd(v_in.val[1], vb), + wrapper::vadd(v_in.val[2], vb), wrapper::vadd(v_in.val[3], vb)}}; } - }; - // Accumulate bias - if(has_bias) - { - const auto vb = wrapper::vdup_n(*reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))), TagType{}); - v_in = - { - { - wrapper::vadd(v_in.val[0], vb), - wrapper::vadd(v_in.val[1], vb), - wrapper::vadd(v_in.val[2], vb), - wrapper::vadd(v_in.val[3], vb) - } - }; + const auto out_ptr = reinterpret_cast(out.ptr()) + x; + wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, + result_offset_after_shift_s32, min, max, false)); } - const auto out_ptr = reinterpret_cast(out.ptr()) + x; - wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, - min, max, 
false)); - } + // Left-overs loop + for (; x < window_end_x; ++x) + { + // Get bias and pointer to input + int32_t s_in = *(reinterpret_cast(in.ptr()) + x); - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Get bias and pointer to input - int32_t s_in = *(reinterpret_cast(in.ptr()) + x); + // Accumulate bias + if (has_bias) + { + const auto b = *reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))); + s_in += b; + } - // Accumulate bias - if(has_bias) - { - const auto b = *reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))); - s_in += b; + const auto out_ptr = reinterpret_cast(out.ptr()) + x; + *out_ptr = + finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, + std::numeric_limits::lowest(), std::numeric_limits::max(), false); } - - const auto out_ptr = reinterpret_cast(out.ptr()) + x; - *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, - std::numeric_limits::lowest(), std::numeric_limits::max(), false); - } - }, - in, out); + }, + in, out); } -template < typename TOut, typename std::enable_if < std::is_same::value || std::is_same::value, int >::type = 0 > -void output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) +template < + typename TOut, + typename std::enable_if::value || std::is_same::value, int>::type = 0> +void output_stage_nhwc(ITensor *src, + const ITensor *bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift) { const bool has_bias = bias != nullptr; using VectorType = typename wrapper::traits::neon_bitvector_t; @@ -329,62 +351,65 @@ void output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, Iterator bi(bias, window_bias); Iterator out(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()) + x; - int32x4x4_t v_in = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast(in.ptr()) + x; + int32x4x4_t v_in = {{ + wrapper::vloadq(in_ptr), + wrapper::vloadq(in_ptr + 4), + wrapper::vloadq(in_ptr + 8), + wrapper::vloadq(in_ptr + 12), + }}; + + // Accumulate bias + if (has_bias) { - wrapper::vloadq(in_ptr), - wrapper::vloadq(in_ptr + 4), - wrapper::vloadq(in_ptr + 8), - wrapper::vloadq(in_ptr + 12), - } - }; + const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; - // Accumulate bias - if(has_bias) - { - const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; + wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr)); + wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4)); + wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8)); + wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12)); + } - wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr)); - wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4)); - wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8)); - wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12)); + const auto out_ptr = reinterpret_cast(out.ptr()) + x; + wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, + 
result_offset_after_shift_s32, min, max, false)); } - const auto out_ptr = reinterpret_cast(out.ptr()) + x; - wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false)); - } + // Left-overs loop + for (; x < window_end_x; ++x) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast(in.ptr()) + x; + int32_t s_in = *in_ptr; - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()) + x; - int32_t s_in = *in_ptr; + // Accumulate bias + if (has_bias) + { + const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; + s_in += *bias_ptr; + } - // Accumulate bias - if(has_bias) - { - const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; - s_in += *bias_ptr; + const auto out_ptr = reinterpret_cast(out.ptr()) + x; + *out_ptr = + finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, + std::numeric_limits::lowest(), std::numeric_limits::max(), false); } - - const auto out_ptr = reinterpret_cast(out.ptr()) + x; - *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, - std::numeric_limits::lowest(), std::numeric_limits::max(), false); - } - }, - in, bi, out); + }, + in, bi, out); } } // namespace -void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, +void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, const DirectConvolutionLayerOutputStageKernelInfo &info) { ARM_COMPUTE_UNUSED(bias); @@ -398,7 +423,7 @@ void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensor _result_offset_after_shift = info.result_offset_after_shift; // Auto-initialize output output if required - if(dst != nullptr) + if (dst != nullptr) { // Work out expected output data type const DataType output_dt = (src->data_type() == DataType::S32) ? info.output_data_type : DataType::S32; @@ -410,16 +435,17 @@ void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensor ICpuKernel::configure(win); - const bool is_qasymm8_signed = (dst != nullptr) ? is_data_type_quantized_asymmetric_signed(dst->data_type()) : false; + const bool is_qasymm8_signed = + (dst != nullptr) ? 
is_data_type_quantized_asymmetric_signed(dst->data_type()) : false; // Set appropriate function - if(src->data_layout() == DataLayout::NCHW) + if (src->data_layout() == DataLayout::NCHW) { - switch(src->data_type()) + switch (src->data_type()) { case DataType::S32: { - if(is_qasymm8_signed) + if (is_qasymm8_signed) { _func = &output_stage_nchw<int8_t>; } @@ -449,11 +475,11 @@ void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensor } else { - switch(src->data_type()) + switch (src->data_type()) { case DataType::S32: { - if(is_qasymm8_signed) + if (is_qasymm8_signed) { _func = &output_stage_nhwc<int8_t>; } @@ -483,7 +509,9 @@ void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensor } } -Status CpuDirectConv2dOutputStageKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, +Status CpuDirectConv2dOutputStageKernel::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, const DirectConvolutionLayerOutputStageKernelInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info)); diff --git a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h index d3ef17b7c9463d0bf1e5496e8097747f9782027b..ce84f49cf6cf4ad43992953b174aad3eb60704f6 100644 --- a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h +++ b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_DIRECT_CONV2D_OUTPUT_STAGE_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -55,29 +56,40 @@ public: * Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p src is S32 * @param[in] info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata */ - void configure(ITensorInfo *src, const ITensorInfo *bias = nullptr, ITensorInfo *dst = nullptr, - const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo()); + void + configure(ITensorInfo *src, + const ITensorInfo *bias = nullptr, + ITensorInfo *dst = nullptr, + const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuDirectConv2dOutputStageKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias = nullptr, const ITensorInfo *dst = nullptr, - const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo()); + static Status + validate(const ITensorInfo *src, + const ITensorInfo *bias = nullptr, + const ITensorInfo *dst = nullptr, + const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo()); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: - using OutputStageKernel = void(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift); + using OutputStageKernel = void(ITensor *src, + const ITensor *bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift); - OutputStageKernel *_func{ nullptr };
- int _result_fixedpoint_multiplier{ 0 }; - int _result_shift{ 0 }; - int _result_offset_after_shift{ 0 }; + OutputStageKernel *_func{nullptr}; + int _result_fixedpoint_multiplier{0}; + int _result_shift{0}; + int _result_offset_after_shift{0}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuDirectConv3dKernel.cpp b/src/cpu/kernels/CpuDirectConv3dKernel.cpp index 22c60cd994637968812382937292e8bf306a474c..b5b2aed1ba17a74a5e6ea2c715fa8b1e6114524e 100644 --- a/src/cpu/kernels/CpuDirectConv3dKernel.cpp +++ b/src/cpu/kernels/CpuDirectConv3dKernel.cpp @@ -29,12 +29,13 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/conv3d/neon/list.h" #include @@ -49,43 +50,37 @@ namespace kernels { namespace { -static const std::vector available_kernels = -{ +static const std::vector available_kernels = { #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_fp16_directconv3d", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc) - }, + {"neon_fp16_directconv3d", + [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc)}, #endif /* !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ - { - "neon_fp32_directconv3d", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc) - }, - { - "neon_qasymm8_directconv3d", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc) - }, - { - "neon_qasymm8_signed_directconv3d", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc) - } -}; - -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv_info) + {"neon_fp32_directconv3d", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc)}, + {"neon_qasymm8_directconv3d", [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc)}, + {"neon_qasymm8_signed_directconv3d", + [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc)}}; + +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_RETURN_ERROR_ON(src0->data_layout() != DataLayout::NDHWC); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src0, src1, dst); 
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation != Size3D(1U, 1U, 1U)); - const auto *uk = CpuDirectConv3dKernel::get_implementation(DataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa() }); + const auto *uk = + CpuDirectConv3dKernel::get_implementation(DataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); @@ -96,9 +91,9 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON(src1->num_dimensions() > 5); ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != src0->dimension(channel_idx)); - if(src2 != nullptr) + if (src2 != nullptr) { - if(is_data_type_quantized(src0->data_type())) + if (is_data_type_quantized(src0->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::S32); } @@ -106,14 +101,16 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->dimension(0) != src1->dimension(0), "Biases size and number of dst feature maps should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->dimension(0) != src1->dimension(0), + "Biases size and number of dst feature maps should match"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->num_dimensions() > 1, "Biases should be one dimensional"); } // Checks performed when output is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - TensorShape output_shape = misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv_info); + TensorShape output_shape = + misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv_info); DataType data_type = src0->data_type(); @@ -125,12 +122,17 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons } } // namespace -void CpuDirectConv3dKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv_info) +void CpuDirectConv3dKernel::configure(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + const Conv3dInfo &conv_info) { ARM_COMPUTE_UNUSED(src2); ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - const auto *uk = CpuDirectConv3dKernel::get_implementation(DataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa() }); + const auto *uk = + CpuDirectConv3dKernel::get_implementation(DataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); @@ -139,7 +141,8 @@ void CpuDirectConv3dKernel::configure(const ITensorInfo *src0, const ITensorInfo _name = std::string("CpuDirectConv3dKernel").append("/").append(uk->name); // Get convolved dimensions - TensorShape output_shape = misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv_info); + TensorShape output_shape = + misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv_info); DataType data_type = 
src0->data_type(); @@ -154,7 +157,11 @@ void CpuDirectConv3dKernel::configure(const ITensorInfo *src0, const ITensorInfo ICpuKernel::configure(win); } -Status CpuDirectConv3dKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv_info) +Status CpuDirectConv3dKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, conv_info)); @@ -188,4 +195,4 @@ const std::vector &CpuDirectConv3dKer } // namespace kernels } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuDirectConv3dKernel.h b/src/cpu/kernels/CpuDirectConv3dKernel.h index 688f368b9f1408bf30af7a5f7d3fce13bfb72349..8e6f5646791702f289de0e5e5c41914cb98fa34c 100644 --- a/src/cpu/kernels/CpuDirectConv3dKernel.h +++ b/src/cpu/kernels/CpuDirectConv3dKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_DIRECT_CONV3D_KERNEL_H #include "arm_compute/runtime/FunctionDescriptors.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -39,7 +40,8 @@ class CpuDirectConv3dKernel : public ICpuKernel { private: /* Template function for convolution 3d NDHWC */ - using DirectConv3dKernelPtr = std::add_pointer::type; + using DirectConv3dKernelPtr = std::add_pointer::type; public: CpuDirectConv3dKernel() = default; @@ -63,17 +65,25 @@ public: * @param[in] conv_info Contains padding, stride, acitvation information. * */ - void configure(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv_info); + void configure(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + const Conv3dInfo &conv_info); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuDirectConv3dKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv_info); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv_info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct DirectConv3dKernel @@ -87,7 +97,7 @@ public: private: Conv3dInfo _conv_info{}; - DirectConv3dKernelPtr _run_method{ nullptr }; + DirectConv3dKernelPtr _run_method{nullptr}; std::string _name{}; }; diff --git a/src/cpu/kernels/CpuElementwiseKernel.cpp b/src/cpu/kernels/CpuElementwiseKernel.cpp index a045855b1a989ac159bea2b58ca1d4683bdd3495..57a3f398220176d9d72da17738e30e9f58999904 100644 --- a/src/cpu/kernels/CpuElementwiseKernel.cpp +++ b/src/cpu/kernels/CpuElementwiseKernel.cpp @@ -24,8 +24,9 @@ #include "src/cpu/kernels/CpuElementwiseKernel.h" #include "arm_compute/core/Helpers.h" -#include "src/core/CPP/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/elementwise_binary/list.h" @@ -35,11 +36,11 @@ #if defined(ENABLE_FP32_KERNELS) namespace { - 
static constexpr size_t default_min_max_mws_N1_fp32_neon = 25308; - static constexpr size_t default_min_max_mws_V1_fp32_neon = 34772; - static constexpr size_t default_div_mws_N1_fp32_neon = 19043; - static constexpr size_t default_div_mws_V1_fp32_neon = 25511; -} +static constexpr size_t default_min_max_mws_N1_fp32_neon = 25308; +static constexpr size_t default_min_max_mws_V1_fp32_neon = 34772; +static constexpr size_t default_div_mws_N1_fp32_neon = 19043; +static constexpr size_t default_div_mws_V1_fp32_neon = 25511; +} // namespace #endif /* ENABLE_FP32_KERNELS */ namespace arm_compute @@ -50,255 +51,178 @@ namespace kernels { namespace { -template -const std::vector::ElementwiseKernel> available_kernels_arithmetic = -{ - { - "sve2_qu8_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8 && data.isa.sve2 && static_cast(data.op) == op; - }, - REGISTER_QASYMM8_SVE2(sve2_qasymm8_elementwise_binary) - }, - { - "sve2_qs8_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && static_cast(data.op) == op; - }, - REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_elementwise_binary) - }, - { - "sve_fp32_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::F32 && data.isa.sve && static_cast(data.op) == op; - }, - REGISTER_FP32_SVE(sve_fp32_elementwise_binary) - }, - { - "sve_s32_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::S32 && data.isa.sve && static_cast(data.op) == op; - }, - REGISTER_INTEGER_SVE(sve_s32_elementwise_binary) - }, - { - "sve_s16_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::S16 && data.isa.sve && static_cast(data.op) == op; - }, - REGISTER_INTEGER_SVE(sve_s16_elementwise_binary) - }, - { - "sve_fp16_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && static_cast(data.op) == op; - }, - REGISTER_FP16_SVE(sve_fp16_elementwise_binary) - }, - { - "neon_fp32_arithmetic", - - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::F32 && static_cast(data.op) == op; - }, - REGISTER_FP32_NEON(neon_fp32_elementwise_binary) - }, - { - "neon_s32_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::S32 && static_cast(data.op) == op; - }, - REGISTER_INTEGER_NEON(neon_s32_elementwise_binary) - }, - { - "neon_fp16_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::F16 && data.isa.fp16 && static_cast(data.op) == op; - }, - REGISTER_FP16_NEON(neon_fp16_elementwise_binary) - }, - { - "neon_s16_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::S16 && static_cast(data.op) == op; - }, - REGISTER_INTEGER_NEON(neon_s16_elementwise_binary) - }, - { - "neon_qu8_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8 && static_cast(data.op) == op; - }, - REGISTER_QASYMM8_NEON(neon_qasymm8_elementwise_binary) - }, - { - "neon_qs8_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8_SIGNED && static_cast(data.op) == op; - }, - REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_elementwise_binary) - }, +template +const std::vector::ElementwiseKernel> 
available_kernels_arithmetic = { + {"sve2_qu8_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::QASYMM8 && data.isa.sve2 && static_cast(data.op) == op; }, + REGISTER_QASYMM8_SVE2(sve2_qasymm8_elementwise_binary)}, + {"sve2_qs8_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) { + return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && static_cast(data.op) == op; + }, + REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_elementwise_binary)}, + {"sve_fp32_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::F32 && data.isa.sve && static_cast(data.op) == op; }, + REGISTER_FP32_SVE(sve_fp32_elementwise_binary)}, + {"sve_s32_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S32 && data.isa.sve && static_cast(data.op) == op; }, + REGISTER_INTEGER_SVE(sve_s32_elementwise_binary)}, + {"sve_s16_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S16 && data.isa.sve && static_cast(data.op) == op; }, + REGISTER_INTEGER_SVE(sve_s16_elementwise_binary)}, + {"sve_fp16_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { + return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && + static_cast(data.op) == op; + }, + REGISTER_FP16_SVE(sve_fp16_elementwise_binary)}, + {"neon_fp32_arithmetic", + + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::F32 && static_cast(data.op) == op; }, + REGISTER_FP32_NEON(neon_fp32_elementwise_binary)}, + {"neon_s32_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S32 && static_cast(data.op) == op; }, + REGISTER_INTEGER_NEON(neon_s32_elementwise_binary)}, + {"neon_fp16_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::F16 && data.isa.fp16 && static_cast(data.op) == op; }, + REGISTER_FP16_NEON(neon_fp16_elementwise_binary)}, + {"neon_s16_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S16 && static_cast(data.op) == op; }, + REGISTER_INTEGER_NEON(neon_s16_elementwise_binary)}, + {"neon_qu8_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::QASYMM8 && static_cast(data.op) == op; }, + REGISTER_QASYMM8_NEON(neon_qasymm8_elementwise_binary)}, + {"neon_qs8_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::QASYMM8_SIGNED && static_cast(data.op) == op; }, + REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_elementwise_binary)}, }; -template -const std::vector::ElementwiseKernel> available_kernels_comperison = -{ - { - "sve2_qu8_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8 && data.isa.sve2 && static_cast(data.op) == op; - }, - REGISTER_QASYMM8_SVE2(sve2_qasymm8_comparison_elementwise_binary) - }, - { - "sve2_qs8_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && static_cast(data.op) == op; - }, - REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_comparison_elementwise_binary) - }, - { - "sve_u8_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::U8 && data.isa.sve && static_cast(data.op) == op; - }, - REGISTER_INTEGER_SVE(sve_u8_comparison_elementwise_binary) - }, - { - "sve_fp32_comparison", 
- [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::F32 && data.isa.sve && static_cast(data.op) == op; - }, - REGISTER_FP32_SVE(sve_fp32_comparison_elementwise_binary) - }, - { - "sve_s16_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::S16 && data.isa.sve && static_cast(data.op) == op; - }, - REGISTER_INTEGER_SVE(sve_s16_comparison_elementwise_binary) - }, - { - "sve_s32_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::S32 && data.isa.sve && static_cast(data.op) == op; - }, - REGISTER_INTEGER_SVE(sve_s32_comparison_elementwise_binary) - }, - { - "sve_fp16_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && static_cast(data.op) == op; - }, - REGISTER_FP16_SVE(sve_fp16_comparison_elementwise_binary) - }, - { - "neon_u8_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::U8 && static_cast(data.op) == op; - }, - REGISTER_INTEGER_NEON(neon_u8_comparison_elementwise_binary) - }, - { - "neon_fp32_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::F32 && static_cast(data.op) == op; - }, - REGISTER_FP32_NEON(neon_fp32_comparison_elementwise_binary) - }, - { - "neon_s16_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::S16 && static_cast(data.op) == op; - }, - REGISTER_INTEGER_NEON(neon_s16_comparison_elementwise_binary) - }, - { - "neon_s32_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::S32 && static_cast(data.op) == op; - }, - REGISTER_INTEGER_NEON(neon_s32_comparison_elementwise_binary) - }, - { - "neon_qu8_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8 && static_cast(data.op) == op; - }, - REGISTER_QASYMM8_NEON(neon_qasymm8_comparison_elementwise_binary) - }, - { - "neon_qs8_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8_SIGNED && static_cast(data.op) == op; - }, - REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_comparison_elementwise_binary) - }, - { - "neon_fp16_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::F16 && data.isa.fp16 && static_cast(data.op) == op; - }, - REGISTER_FP16_NEON(neon_fp16_comparison_elementwise_binary) - }, +template +const std::vector::ElementwiseKernel> available_kernels_comperison = { + {"sve2_qu8_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::QASYMM8 && data.isa.sve2 && static_cast(data.op) == op; }, + REGISTER_QASYMM8_SVE2(sve2_qasymm8_comparison_elementwise_binary)}, + {"sve2_qs8_comparison", + [](const ElementwiseDataTypeISASelectorData &data) { + return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && static_cast(data.op) == op; + }, + REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_comparison_elementwise_binary)}, + {"sve_u8_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::U8 && data.isa.sve && static_cast(data.op) == op; }, + REGISTER_INTEGER_SVE(sve_u8_comparison_elementwise_binary)}, + {"sve_fp32_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::F32 && data.isa.sve && static_cast(data.op) == op; }, + 
REGISTER_FP32_SVE(sve_fp32_comparison_elementwise_binary)}, + {"sve_s16_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S16 && data.isa.sve && static_cast(data.op) == op; }, + REGISTER_INTEGER_SVE(sve_s16_comparison_elementwise_binary)}, + {"sve_s32_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S32 && data.isa.sve && static_cast(data.op) == op; }, + REGISTER_INTEGER_SVE(sve_s32_comparison_elementwise_binary)}, + {"sve_fp16_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { + return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && + static_cast(data.op) == op; + }, + REGISTER_FP16_SVE(sve_fp16_comparison_elementwise_binary)}, + {"neon_u8_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::U8 && static_cast(data.op) == op; }, + REGISTER_INTEGER_NEON(neon_u8_comparison_elementwise_binary)}, + {"neon_fp32_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::F32 && static_cast(data.op) == op; }, + REGISTER_FP32_NEON(neon_fp32_comparison_elementwise_binary)}, + {"neon_s16_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S16 && static_cast(data.op) == op; }, + REGISTER_INTEGER_NEON(neon_s16_comparison_elementwise_binary)}, + {"neon_s32_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S32 && static_cast(data.op) == op; }, + REGISTER_INTEGER_NEON(neon_s32_comparison_elementwise_binary)}, + {"neon_qu8_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::QASYMM8 && static_cast(data.op) == op; }, + REGISTER_QASYMM8_NEON(neon_qasymm8_comparison_elementwise_binary)}, + {"neon_qs8_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::QASYMM8_SIGNED && static_cast(data.op) == op; }, + REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_comparison_elementwise_binary)}, + {"neon_fp16_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::F16 && data.isa.fp16 && static_cast(data.op) == op; }, + REGISTER_FP16_NEON(neon_fp16_comparison_elementwise_binary)}, }; } // namespace -const std::vector::ElementwiseKernel> &CpuArithmeticKernel::get_available_kernels() +const std::vector::ElementwiseKernel> & +CpuArithmeticKernel::get_available_kernels() { static std::vector::ElementwiseKernel> available_kernels; - std::move(available_kernels_arithmetic.begin(), available_kernels_arithmetic.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_arithmetic.begin(), available_kernels_arithmetic.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_arithmetic.begin(), available_kernels_arithmetic.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_arithmetic.begin(), available_kernels_arithmetic.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_arithmetic.begin(), available_kernels_arithmetic.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_arithmetic.begin(), available_kernels_arithmetic.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_arithmetic.begin(), available_kernels_arithmetic.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_arithmetic.begin(), available_kernels_arithmetic.end(), 
std::back_inserter(available_kernels)); + std::move(available_kernels_arithmetic.begin(), + available_kernels_arithmetic.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_arithmetic.begin(), + available_kernels_arithmetic.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_arithmetic.begin(), + available_kernels_arithmetic.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_arithmetic.begin(), + available_kernels_arithmetic.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_arithmetic.begin(), + available_kernels_arithmetic.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_arithmetic.begin(), + available_kernels_arithmetic.end(), + std::back_inserter(available_kernels)); + std::move(available_kernels_arithmetic.begin(), + available_kernels_arithmetic.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_arithmetic.begin(), + available_kernels_arithmetic.end(), std::back_inserter(available_kernels)); return available_kernels; } -const std::vector::ElementwiseKernel> &CpuComparisonKernel::get_available_kernels() +const std::vector::ElementwiseKernel> & +CpuComparisonKernel::get_available_kernels() { static std::vector::ElementwiseKernel> available_kernels; - std::move(available_kernels_comperison.begin(), available_kernels_comperison.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_comperison.begin(), available_kernels_comperison.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_comperison.begin(), available_kernels_comperison.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_comperison.begin(), available_kernels_comperison.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_comperison.begin(), available_kernels_comperison.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_comperison.begin(), available_kernels_comperison.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_comperison.begin(), + available_kernels_comperison.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_comperison.begin(), + available_kernels_comperison.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_comperison.begin(), + available_kernels_comperison.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_comperison.begin(), + available_kernels_comperison.end(), + std::back_inserter(available_kernels)); + std::move(available_kernels_comperison.begin(), + available_kernels_comperison.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_comperison.begin(), + available_kernels_comperison.end(), + std::back_inserter(available_kernels)); return available_kernels; } template -Status CpuElementwiseKernel::validate_arguments_common(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) +Status CpuElementwiseKernel::validate_arguments_common(const ITensorInfo &src0, + const ITensorInfo &src1, + const ITensorInfo &dst) { ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1); @@ -308,7 +232,7 @@ Status CpuElementwiseKernel::validate_arguments_common(const ITensorInf ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); // Validate in case of configured dst - if(dst.total_size() > 0) + if 
(dst.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), "Wrong shape for output"); @@ -321,7 +245,8 @@ void CpuArithmeticKernel::configure_common(const ITensorInfo *src0, const ITenso { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - const auto *uk = CpuArithmeticKernel::get_implementation(ElementwiseDataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa(), static_cast(_op) }); + const auto *uk = CpuArithmeticKernel::get_implementation( + ElementwiseDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), static_cast(_op)}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); @@ -329,7 +254,7 @@ void CpuArithmeticKernel::configure_common(const ITensorInfo *src0, const ITenso _name = std::string("CpuArithmeticKernel").append("/").append(uk->name); // If any of shapes is dynamic, expect a configured window and dst at run-time. - if(src0->is_dynamic() || src1->is_dynamic()) + if (src0->is_dynamic() || src1->is_dynamic()) { return; } @@ -343,7 +268,8 @@ void CpuComparisonKernel::configure_common(const ITensorInfo *src0, const ITenso { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - const auto *uk = CpuComparisonKernel::get_implementation(ElementwiseDataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa(), static_cast(_op) }); + const auto *uk = CpuComparisonKernel::get_implementation( + ElementwiseDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), static_cast(_op)}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); @@ -351,7 +277,7 @@ void CpuComparisonKernel::configure_common(const ITensorInfo *src0, const ITenso _name = std::string("CpuComparisonKernel").append("/").append(uk->name); // If any of shapes is dynamic, expect a configured window and dst at run-time. 
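// By this point configure_common() has already resolved the micro-kernel from the
// {data type, CPU ISA, operation} triple and cached it in _run_method/_name, so a dynamic input
// shape only defers the window/dst configuration to run time. A minimal sketch of the same lookup,
// with the ComparisonOperation value chosen purely for illustration:
//
//   const auto *uk = CpuComparisonKernel::get_implementation(ElementwiseDataTypeISASelectorData{
//       src0->data_type(), CPUInfo::get().get_isa(), static_cast<int>(ComparisonOperation::Equal)});
//   // uk->ukernel is the function pointer that run_op() later invokes as _run_method(src0, src1, dst, window).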
- if(src0->is_dynamic() || src1->is_dynamic()) + if (src0->is_dynamic() || src1->is_dynamic()) { return; } @@ -373,8 +299,10 @@ void CpuElementwiseKernel::run_op(ITensorPack &tensors, const Window &w _run_method(src0, src1, dst, window); } -template void CpuElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info); -template void CpuElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info); +template void +CpuElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info); +template void +CpuElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info); template const char *CpuElementwiseKernel::name() const @@ -385,7 +313,10 @@ template const char *CpuElementwiseKernel::name() const; template const char *CpuElementwiseKernel::name() const; /** Arithmetic operators (min, max, squared_diff) */ -void CpuArithmeticKernel::configure(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +void CpuArithmeticKernel::configure(ArithmeticOperation op, + const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst)); _op = op; @@ -394,16 +325,20 @@ void CpuArithmeticKernel::configure(ArithmeticOperation op, const ITensorInfo *s Status CpuArithmeticKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S16, DataType::F16, DataType::S32, DataType::F32); // Validate in case of configured dst - if(dst.total_size() > 0) + if (dst.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst); } return validate_arguments_common(src0, src1, dst); } -Status CpuArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +Status CpuArithmeticKernel::validate(ArithmeticOperation op, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst) { ARM_COMPUTE_UNUSED(op); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); @@ -416,15 +351,15 @@ size_t CpuArithmeticKernel::get_mws(const CPUInfo &platform, size_t thread_count ARM_COMPUTE_UNUSED(thread_count); #if defined(ENABLE_FP32_KERNELS) - if(this->_run_method == &neon_fp32_elementwise_binary - || this->_run_method == &neon_fp32_elementwise_binary) + if (this->_run_method == &neon_fp32_elementwise_binary || + this->_run_method == &neon_fp32_elementwise_binary) { size_t mws = ICPPKernel::default_mws; - if(platform.get_cpu_model() == CPUModel::N1) + if (platform.get_cpu_model() == CPUModel::N1) { mws = default_min_max_mws_N1_fp32_neon; } - else if(platform.get_cpu_model() == CPUModel::V1) + else if (platform.get_cpu_model() == CPUModel::V1) { mws = default_min_max_mws_V1_fp32_neon; } @@ -434,7 +369,7 @@ size_t CpuArithmeticKernel::get_mws(const CPUInfo &platform, size_t thread_count } // tensor is 1D or was re-interpreted as 1D - if(this->window().shape().num_dimensions() == 1) + if (this->window().shape().num_dimensions() == 1) { return mws; } @@ -447,7 +382,7 @@ size_t CpuArithmeticKernel::get_mws(const CPUInfo &platform, size_t thread_count return std::max(static_cast(1), mws); 
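// Summary of the MWS (minimum workload size) heuristic above: the FP32 NEON binary kernel starts
// from a CPU-model-specific default (Neoverse N1 or V1, otherwise ICPPKernel::default_mws); a window
// that is 1D, or was re-interpreted as 1D, returns that value directly, while the multi-dimensional
// path returns its adjusted value clamped with std::max(static_cast<size_t>(1), mws) so that no
// thread is scheduled with an empty slice. The same pattern is repeated below for
// CpuDivisionKernel::get_mws().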
} } -#else /* ENABLE_FP32_KERNELS */ +#else /* ENABLE_FP32_KERNELS */ ARM_COMPUTE_UNUSED(platform); #endif /* ENABLE_FP32_KERNELS */ return ICPPKernel::default_mws; @@ -467,14 +402,14 @@ size_t CpuDivisionKernel::get_mws(const CPUInfo &platform, size_t thread_count) ARM_COMPUTE_UNUSED(thread_count); #if defined(ENABLE_FP32_KERNELS) - if(this->_run_method == &neon_fp32_elementwise_binary) + if (this->_run_method == &neon_fp32_elementwise_binary) { size_t mws = ICPPKernel::default_mws; - if(platform.get_cpu_model() == CPUModel::N1) + if (platform.get_cpu_model() == CPUModel::N1) { mws = default_div_mws_N1_fp32_neon; } - else if(platform.get_cpu_model() == CPUModel::V1) + else if (platform.get_cpu_model() == CPUModel::V1) { mws = default_div_mws_V1_fp32_neon; } @@ -484,7 +419,7 @@ size_t CpuDivisionKernel::get_mws(const CPUInfo &platform, size_t thread_count) } // tensor is 1D or was re-interpreted as 1D - if(this->window().shape().num_dimensions() == 1) + if (this->window().shape().num_dimensions() == 1) { return mws; } @@ -497,7 +432,7 @@ size_t CpuDivisionKernel::get_mws(const CPUInfo &platform, size_t thread_count) return std::max(static_cast(1), mws); } } -#else /* ENABLE_FP32_KERNELS */ +#else /* ENABLE_FP32_KERNELS */ ARM_COMPUTE_UNUSED(platform); #endif /* ENABLE_FP32_KERNELS */ return ICPPKernel::default_mws; @@ -538,7 +473,10 @@ Status CpuPowerKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1 } /** Comparison operators (equal, not equal, less than, greater than, less than or equal, greater than or equal) */ -void CpuComparisonKernel::configure(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +void CpuComparisonKernel::configure(ComparisonOperation op, + const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst)); _op = op; @@ -547,16 +485,21 @@ void CpuComparisonKernel::configure(ComparisonOperation op, const ITensorInfo *s Status CpuComparisonKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, + DataType::S32, DataType::F32); // Validate in case of configured dst - if(dst.total_size() > 0) + if (dst.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::U8); } return validate_arguments_common(src0, src1, dst); } -Status CpuComparisonKernel::validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +Status CpuComparisonKernel::validate(ComparisonOperation op, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst) { ARM_COMPUTE_UNUSED(op); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); diff --git a/src/cpu/kernels/CpuElementwiseKernel.h b/src/cpu/kernels/CpuElementwiseKernel.h index 634e38bf9f7b72c7cc3fa632966b1d69fc959e8e..1f3e613b80747ce75ed7328e56b3852bed6fc5be 100644 --- a/src/cpu/kernels/CpuElementwiseKernel.h +++ b/src/cpu/kernels/CpuElementwiseKernel.h @@ -43,7 +43,8 @@ template class CpuElementwiseKernel : public ICpuKernel { private: - using ElementwiseKernelPtr = std::add_pointer::type; + using ElementwiseKernelPtr = + 
std::add_pointer::type; public: CpuElementwiseKernel() = default; @@ -72,7 +73,7 @@ protected: static Status validate_arguments_common(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst); protected: - ElementwiseKernelPtr _run_method{ nullptr }; + ElementwiseKernelPtr _run_method{nullptr}; std::string _name{}; }; @@ -96,7 +97,8 @@ public: * * @return a status */ - static Status validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); + static Status + validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); static const std::vector::ElementwiseKernel> &get_available_kernels(); @@ -200,7 +202,8 @@ public: * * @return a status */ - static Status validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); + static Status + validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); static const std::vector::ElementwiseKernel> &get_available_kernels(); @@ -226,4 +229,4 @@ private: } // namespace kernels } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H */ diff --git a/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp index 04a7f15715efc356a4a7f56daf80979d919e8d74..88545ee756c41bd4c45014c7e69b0ddf9606d09e 100644 --- a/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp +++ b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp @@ -28,8 +28,9 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/elementwise_unary/list.h" @@ -59,12 +60,13 @@ std::unique_ptr q8_prepare_lut(ElementWiseUnary op, const ITensorInfo const auto dst_min_fp = (((is_signed) ? -128 : 0) - dst_qi.offset) * dst_qi.scale; const auto dst_max_fp = (((is_signed) ? 127 : 255) - dst_qi.offset) * dst_qi.scale; - for(int i = 0; i < 256; ++i) + for (int i = 0; i < 256; ++i) { - const auto in = (is_signed) ? dequantize_qasymm8_signed(static_cast(i), src_qi) : dequantize_qasymm8(i, src_qi); - float result = 0; + const auto in = + (is_signed) ? dequantize_qasymm8_signed(static_cast(i), src_qi) : dequantize_qasymm8(i, src_qi); + float result = 0; - switch(op) + switch (op) { case ElementWiseUnary::RSQRT: result = 1 / sqrt(in); @@ -100,7 +102,8 @@ std::unique_ptr q8_prepare_lut(ElementWiseUnary op, const ITensorInfo result = utility::clamp(result, dst_min_fp, dst_max_fp); - const auto out = (is_signed) ? static_cast(quantize_qasymm8_signed(result, dst_qi)) : quantize_qasymm8(result, dst_qi); + const auto out = (is_signed) ? 
static_cast(quantize_qasymm8_signed(result, dst_qi)) + : quantize_qasymm8(result, dst_qi); lut[i] = out; } @@ -109,97 +112,68 @@ std::unique_ptr q8_prepare_lut(ElementWiseUnary op, const ITensorInfo #endif // __aarch64__ -static const std::vector available_kernels = -{ +static const std::vector available_kernels = { { "sve_fp32_elementwise_unary", - [](const DataTypeISASelectorData & data) - { - return (data.dt == DataType::F32 && data.isa.sve); - }, + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32 && data.isa.sve); }, REGISTER_FP32_SVE(sve_fp32_elementwise_unary), nullptr, }, { "sve_fp16_elementwise_unary", - [](const DataTypeISASelectorData & data) - { - return (data.dt == DataType::F16 && data.isa.sve && data.isa.fp16); - }, + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16 && data.isa.sve && data.isa.fp16); }, REGISTER_FP16_SVE(sve_fp16_elementwise_unary), nullptr, }, { "sve_s32_elementwise_unary", - [](const DataTypeISASelectorData & data) - { - return (data.dt == DataType::S32 && data.isa.sve); - }, + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::S32 && data.isa.sve); }, REGISTER_INTEGER_SVE(sve_s32_elementwise_unary), nullptr, }, { "neon_fp32_elementwise_unary", - [](const DataTypeISASelectorData & data) - { - return data.dt == DataType::F32; - }, + [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; }, REGISTER_FP32_NEON(neon_fp32_elementwise_unary), nullptr, }, { "neon_fp16_elementwise_unary", - [](const DataTypeISASelectorData & data) - { - return data.dt == DataType::F16 && data.isa.fp16; - }, + [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; }, REGISTER_FP16_NEON(neon_fp16_elementwise_unary), nullptr, }, { "neon_s32_elementwise_unary", - [](const DataTypeISASelectorData & data) - { - return data.dt == DataType::S32; - }, + [](const DataTypeISASelectorData &data) { return data.dt == DataType::S32; }, REGISTER_INTEGER_NEON(neon_s32_elementwise_unary), nullptr, }, #ifdef __aarch64__ { "sve2_q8_elementwise_unary", - [](const DataTypeISASelectorData & data) - { - return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; - }, + [](const DataTypeISASelectorData &data) + { return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; }, REGISTER_QASYMM8_SVE2(sve2_q8_elementwise_unary), &q8_prepare_lut, }, { "neon_q8_elementwise_unary", - [](const DataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED; - }, + [](const DataTypeISASelectorData &data) + { return data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED; }, REGISTER_QASYMM8_NEON(neon_q8_elementwise_unary), &q8_prepare_lut, }, #else // __aarch64__ { "neon_qasymm8_signed_elementwise_unary", - [](const DataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8_SIGNED; - }, + [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; }, REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_elementwise_unary), nullptr, }, { "neon_qasymm8_elementwise_unary", - [](const DataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8; - }, + [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; }, REGISTER_QASYMM8_NEON(neon_qasymm8_elementwise_unary), nullptr, }, @@ -211,7 +185,8 @@ static const std::vector avai void CpuElementwiseUnaryKernel::configure(ElementWiseUnary op, const 
ITensorInfo &src, ITensorInfo &dst) { ARM_COMPUTE_ERROR_THROW_ON(validate(op, src, dst)); - const auto uk = CpuElementwiseUnaryKernel::get_implementation(DataTypeISASelectorData{ src.data_type(), CPUInfo::get().get_isa() }); + const auto uk = CpuElementwiseUnaryKernel::get_implementation( + DataTypeISASelectorData{src.data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); _op = op; @@ -219,12 +194,12 @@ void CpuElementwiseUnaryKernel::configure(ElementWiseUnary op, const ITensorInfo _name = std::string("CpuElementwiseUnaryKernel").append("/").append(uk->name); // If input shape is dynamic, expect a configured window and dst at run-time. - if(src.is_dynamic()) + if (src.is_dynamic()) { return; } - if(uk->prepare_func != nullptr) + if (uk->prepare_func != nullptr) { _lut = uk->prepare_func(op, &src, &dst); } @@ -238,28 +213,31 @@ Status CpuElementwiseUnaryKernel::validate(ElementWiseUnary op, const ITensorInf { ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src); - const auto *uk = CpuElementwiseUnaryKernel::get_implementation(DataTypeISASelectorData{ src.data_type(), CPUInfo::get().get_isa() }); + const auto *uk = CpuElementwiseUnaryKernel::get_implementation( + DataTypeISASelectorData{src.data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - switch(op) + switch (op) { case ElementWiseUnary::EXP: case ElementWiseUnary::RSQRT: case ElementWiseUnary::LOG: case ElementWiseUnary::ROUND: case ElementWiseUnary::SIN: - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, + DataType::QASYMM8, DataType::QASYMM8_SIGNED); break; case ElementWiseUnary::NEG: case ElementWiseUnary::ABS: - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32, + DataType::QASYMM8, DataType::QASYMM8_SIGNED); break; default: ARM_COMPUTE_ERROR("ElementWiseUnary operation not supported"); } // Validate in case of configured dst - if(dst.total_size() > 0) + if (dst.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst); } diff --git a/src/cpu/kernels/CpuElementwiseUnaryKernel.h b/src/cpu/kernels/CpuElementwiseUnaryKernel.h index 00188f0d492d728153be3e030085ce38e115d0dc..249909854e6f850ed6fc650603b3216916cfea43 100644 --- a/src/cpu/kernels/CpuElementwiseUnaryKernel.h +++ b/src/cpu/kernels/CpuElementwiseUnaryKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_KERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -42,8 +43,10 @@ namespace kernels class CpuElementwiseUnaryKernel : public ICpuKernel { private: - using ElementwiseUnaryUkernelPtr = std::add_pointer::type; - using ElementwiseUnaryPreparePtr = std::add_pointer(ElementWiseUnary op, const ITensorInfo *, const ITensorInfo *)>::type; + using ElementwiseUnaryUkernelPtr = + std::add_pointer::type; + using ElementwiseUnaryPreparePtr = std::add_pointer( + ElementWiseUnary op, const ITensorInfo *, const ITensorInfo *)>::type; public: CpuElementwiseUnaryKernel() = default; @@ -65,7 +68,7 @@ public: static Status validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo 
&dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct ElementwiseUnaryKernel @@ -80,7 +83,7 @@ public: private: ElementWiseUnary _op{}; - ElementwiseUnaryUkernelPtr _run_method{ nullptr }; + ElementwiseUnaryUkernelPtr _run_method{nullptr}; std::string _name{}; std::unique_ptr _lut{}; }; diff --git a/src/cpu/kernels/CpuFillKernel.cpp b/src/cpu/kernels/CpuFillKernel.cpp index f69de0082d200517d866d51c14ff81d0a933b2f3..754da97ae1c719f3b5daa2db51f5a5b88f997340 100644 --- a/src/cpu/kernels/CpuFillKernel.cpp +++ b/src/cpu/kernels/CpuFillKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -68,17 +69,18 @@ void CpuFillKernel::run_op(ITensorPack &tensors, const Window &window, const Thr collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator tensor_it(inout, collapsed); - execute_window_loop(collapsed, [&](const Coordinates &) - { - uint8_t *base_addr = start_valid_region + tensor_it.offset(); - // Set memory - for(int i = 0; i < window_width; ++i) + execute_window_loop( + collapsed, + [&](const Coordinates &) { - std::memcpy(base_addr + i * element_size, &_constant_value.value, element_size); - } - - }, - tensor_it); + uint8_t *base_addr = start_valid_region + tensor_it.offset(); + // Set memory + for (int i = 0; i < window_width; ++i) + { + std::memcpy(base_addr + i * element_size, &_constant_value.value, element_size); + } + }, + tensor_it); } const char *CpuFillKernel::name() const diff --git a/src/cpu/kernels/CpuFillKernel.h b/src/cpu/kernels/CpuFillKernel.h index ce41afc462019c2f982a6065a4c6526486931b36..7c200c9b59f8dbb6b593985dfa78c498a1cbf7e0 100644 --- a/src/cpu/kernels/CpuFillKernel.h +++ b/src/cpu/kernels/CpuFillKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_FILL_KERNEL_H #include "arm_compute/core/PixelValue.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -48,7 +49,7 @@ public: void configure(const ITensorInfo *tensor, const PixelValue &constant_value); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: diff --git a/src/cpu/kernels/CpuFloorKernel.cpp b/src/cpu/kernels/CpuFloorKernel.cpp index 65e390a81a8ca7df91b231035523b4dd14a06094..df7e6aad46538aa97dca8f0738feff1e45ad1eed 100644 --- a/src/cpu/kernels/CpuFloorKernel.cpp +++ b/src/cpu/kernels/CpuFloorKernel.cpp @@ -27,11 +27,11 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Validate.h" + +#include "src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - -#include "src/core/common/Registrars.h" #include "src/cpu/kernels/floor/list.h" namespace arm_compute @@ -42,29 +42,22 @@ namespace kernels { namespace { -static const std::vector available_kernels = -{ - { - "neon_fp16_floor", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_floor) - }, - { - 
"neon_fp32_floor", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_floor) - } -}; +static const std::vector available_kernels = { + {"neon_fp16_floor", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_floor)}, + {"neon_fp32_floor", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_floor)}}; Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - const auto *uk = CpuFloorKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() }); + const auto *uk = + CpuFloorKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); // Validate in case of configured output - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); @@ -81,7 +74,8 @@ void CpuFloorKernel::configure(const ITensorInfo *src, ITensorInfo *dst) auto_init_if_empty(*dst, src->tensor_shape(), 1, src->data_type()); - const auto *uk = CpuFloorKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() }); + const auto *uk = + CpuFloorKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); _run_method = uk->ukernel; @@ -122,17 +116,14 @@ void CpuFloorKernel::run_op(ITensorPack &tensors, const Window &window, const Th ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); const auto len = static_cast(window.x().end()) - static_cast(window.x().start()); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src_it(src, win); Iterator dst_it(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - _run_method(src_it.ptr(), dst_it.ptr(), len); - }, - src_it, dst_it); + execute_window_loop( + win, [&](const Coordinates &) { _run_method(src_it.ptr(), dst_it.ptr(), len); }, src_it, dst_it); } const char *CpuFloorKernel::name() const diff --git a/src/cpu/kernels/CpuFloorKernel.h b/src/cpu/kernels/CpuFloorKernel.h index 35ab534ca8f6ba8de6ddd9516d5088e5a6e297aa..57107d0532dc5a46b2aa80838c83a74f66fe9942 100644 --- a/src/cpu/kernels/CpuFloorKernel.h +++ b/src/cpu/kernels/CpuFloorKernel.h @@ -65,7 +65,7 @@ public: Window infer_window(const ITensorInfo *src, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct FloorKernel @@ -78,7 +78,7 @@ public: static const std::vector &get_available_kernels(); private: - FloorKernelPtr _run_method{ nullptr }; + FloorKernelPtr _run_method{nullptr}; std::string _name{}; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp index 9fbf2d54c6cdf0f10e0ae54cac8a110420d8406e..db433c99a855ac60b6fc81c630dd95804776a8a1 100644 --- a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp +++ b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp @@ -24,9 +24,10 @@ #include 
"src/cpu/kernels/CpuGemmInterleave4x4Kernel.h" #include "arm_compute/core/ITensor.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -60,7 +61,7 @@ Status CpuGemmInterleave4x4Kernel::validate(const ITensorInfo *src, const ITenso //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorShape dst_shape = compute_interleaved_shape(*src); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); @@ -111,35 +112,42 @@ void CpuGemmInterleave4x4Kernel::run_op(ITensorPack &tensors, const Window &wind Iterator in(src, win); Iterator out(dst, win_out); - execute_window_loop(win, [&](const Coordinates & id) - { - if(id.y() + 4 <= static_cast(in_height)) + execute_window_loop( + win, + [&](const Coordinates &id) { - for(size_t x = window_start_x; x < window_end_x; ++x) + if (id.y() + 4 <= static_cast(in_height)) { - std::memcpy(out.ptr() + (x * 4 + 0) * element_size, (in.ptr() + 0 * in_stride) + x * element_size, element_size); - std::memcpy(out.ptr() + (x * 4 + 1) * element_size, (in.ptr() + 1 * in_stride) + x * element_size, element_size); - std::memcpy(out.ptr() + (x * 4 + 2) * element_size, (in.ptr() + 2 * in_stride) + x * element_size, element_size); - std::memcpy(out.ptr() + (x * 4 + 3) * element_size, (in.ptr() + 3 * in_stride) + x * element_size, element_size); - } - } - else - { - for(size_t x = window_start_x; x < window_end_x; ++x) - { - size_t y = 0; - for(; y < partial_y; ++y) + for (size_t x = window_start_x; x < window_end_x; ++x) { - std::memcpy(out.ptr() + (x * 4 + y) * element_size, (in.ptr() + y * in_stride) + x * element_size, element_size); + std::memcpy(out.ptr() + (x * 4 + 0) * element_size, (in.ptr() + 0 * in_stride) + x * element_size, + element_size); + std::memcpy(out.ptr() + (x * 4 + 1) * element_size, (in.ptr() + 1 * in_stride) + x * element_size, + element_size); + std::memcpy(out.ptr() + (x * 4 + 2) * element_size, (in.ptr() + 2 * in_stride) + x * element_size, + element_size); + std::memcpy(out.ptr() + (x * 4 + 3) * element_size, (in.ptr() + 3 * in_stride) + x * element_size, + element_size); } - for(; y < 4; ++y) + } + else + { + for (size_t x = window_start_x; x < window_end_x; ++x) { - std::memset(out.ptr() + (x * 4 + y) * element_size, 0, element_size); + size_t y = 0; + for (; y < partial_y; ++y) + { + std::memcpy(out.ptr() + (x * 4 + y) * element_size, + (in.ptr() + y * in_stride) + x * element_size, element_size); + } + for (; y < 4; ++y) + { + std::memset(out.ptr() + (x * 4 + y) * element_size, 0, element_size); + } } } - } - }, - in, out); + }, + in, out); } const char *CpuGemmInterleave4x4Kernel::name() const diff --git a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h index 4fb6a52a8bd9780e878f70bd1163a6e0ae2ebee9..2ce34bc4bcf00107b861cf9bffa8d6fbde16dd1c 100644 --- a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h +++ b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h @@ -71,7 +71,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window 
&window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp index f8bef64066644cc6fd1ec8811dbd2ac26b13b259..a3ed2cd171ee6779c8e0f0c0636d29d49cd7a3b9 100644 --- a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp +++ b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -44,646 +45,494 @@ namespace kernels { namespace { -void inline vector_matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, int width_out, size_t stride_b, const Window &window) +void inline vector_matrix_multiply_u8(Iterator &ina, + Iterator &inb, + Iterator &out, + int width_a, + int width_b, + int width_out, + size_t stride_b, + const Window &window) { - execute_window_loop(window, [&](const Coordinates & id) - { - if(id.x() > width_b) - { - return; - } - - // Note: Since the input are all positives, we can use uint32_t - // Accumulators for the block 0 - uint32x4x4_t c0 = + execute_window_loop( + window, + [&](const Coordinates &id) { + if (id.x() > width_b) { - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0) + return; } - }; - - auto vec_a = reinterpret_cast(ina.ptr()); - auto matrix_b = reinterpret_cast(inb.ptr()); - auto vec_a_end_addr = vec_a + width_a; - - // This for loop performs 8 accumulations - for(; vec_a <= (vec_a_end_addr - 8);) - { - const uint8x8_t a00_u8 = vld1_u8(vec_a); - const uint8x16_t b00_u8 = vld1q_u8(matrix_b + 0 * stride_b); - const uint8x16_t b10_u8 = vld1q_u8(matrix_b + 1 * stride_b); - const uint8x16_t b20_u8 = vld1q_u8(matrix_b + 2 * stride_b); - const uint8x16_t b30_u8 = vld1q_u8(matrix_b + 3 * stride_b); - const uint8x16_t b40_u8 = vld1q_u8(matrix_b + 4 * stride_b); - const uint8x16_t b50_u8 = vld1q_u8(matrix_b + 5 * stride_b); - const uint8x16_t b60_u8 = vld1q_u8(matrix_b + 6 * stride_b); - const uint8x16_t b70_u8 = vld1q_u8(matrix_b + 7 * stride_b); - - // Convert a00_u8 to uint16_t and get the lower part - const uint16x4x2_t a00_u16 = - { - { - vget_low_u16(vmovl_u8(a00_u8)), - vget_high_u16(vmovl_u8(a00_u8)) - } - }; - - const uint16x4x4_t b00_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b00_u8))) - } - }; - - const uint16x4x4_t b10_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b10_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b10_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b10_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b10_u8))) - } - }; - - const uint16x4x4_t b20_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b20_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b20_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b20_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b20_u8))) - } - }; - const uint16x4x4_t b30_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b30_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b30_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b30_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b30_u8))) - } - }; + // Note: Since the input are all positives, we can use uint32_t + // Accumulators for the 
block 0 + uint32x4x4_t c0 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}}; + + auto vec_a = reinterpret_cast(ina.ptr()); + auto matrix_b = reinterpret_cast(inb.ptr()); + auto vec_a_end_addr = vec_a + width_a; + + // This for loop performs 8 accumulations + for (; vec_a <= (vec_a_end_addr - 8);) + { + const uint8x8_t a00_u8 = vld1_u8(vec_a); + const uint8x16_t b00_u8 = vld1q_u8(matrix_b + 0 * stride_b); + const uint8x16_t b10_u8 = vld1q_u8(matrix_b + 1 * stride_b); + const uint8x16_t b20_u8 = vld1q_u8(matrix_b + 2 * stride_b); + const uint8x16_t b30_u8 = vld1q_u8(matrix_b + 3 * stride_b); + const uint8x16_t b40_u8 = vld1q_u8(matrix_b + 4 * stride_b); + const uint8x16_t b50_u8 = vld1q_u8(matrix_b + 5 * stride_b); + const uint8x16_t b60_u8 = vld1q_u8(matrix_b + 6 * stride_b); + const uint8x16_t b70_u8 = vld1q_u8(matrix_b + 7 * stride_b); + + // Convert a00_u8 to uint16_t and get the lower part + const uint16x4x2_t a00_u16 = {{vget_low_u16(vmovl_u8(a00_u8)), vget_high_u16(vmovl_u8(a00_u8))}}; + + const uint16x4x4_t b00_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))}}; + + const uint16x4x4_t b10_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b10_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b10_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b10_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b10_u8)))}}; + + const uint16x4x4_t b20_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b20_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b20_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b20_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b20_u8)))}}; + + const uint16x4x4_t b30_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b30_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b30_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b30_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b30_u8)))}}; + + const uint16x4x4_t b40_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b40_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b40_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b40_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b40_u8)))}}; + + const uint16x4x4_t b50_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b50_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b50_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b50_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b50_u8)))}}; + + const uint16x4x4_t b60_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b60_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b60_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b60_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b60_u8)))}}; + + const uint16x4x4_t b70_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b70_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b70_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b70_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b70_u8)))}}; + + // Accumulate 0: + c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16.val[0], 0); + c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16.val[0], 0); + c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16.val[0], 0); + c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16.val[0], 0); + + // Accumulate 1: + c0.val[0] = vmlal_lane_u16(c0.val[0], b10_u16.val[0], a00_u16.val[0], 1); + c0.val[1] = vmlal_lane_u16(c0.val[1], b10_u16.val[1], a00_u16.val[0], 1); + c0.val[2] = vmlal_lane_u16(c0.val[2], b10_u16.val[2], a00_u16.val[0], 1); + c0.val[3] = vmlal_lane_u16(c0.val[3], b10_u16.val[3], a00_u16.val[0], 1); + + // Accumulate 2: + c0.val[0] = vmlal_lane_u16(c0.val[0], 
b20_u16.val[0], a00_u16.val[0], 2); + c0.val[1] = vmlal_lane_u16(c0.val[1], b20_u16.val[1], a00_u16.val[0], 2); + c0.val[2] = vmlal_lane_u16(c0.val[2], b20_u16.val[2], a00_u16.val[0], 2); + c0.val[3] = vmlal_lane_u16(c0.val[3], b20_u16.val[3], a00_u16.val[0], 2); + + // Accumulate 3: + c0.val[0] = vmlal_lane_u16(c0.val[0], b30_u16.val[0], a00_u16.val[0], 3); + c0.val[1] = vmlal_lane_u16(c0.val[1], b30_u16.val[1], a00_u16.val[0], 3); + c0.val[2] = vmlal_lane_u16(c0.val[2], b30_u16.val[2], a00_u16.val[0], 3); + c0.val[3] = vmlal_lane_u16(c0.val[3], b30_u16.val[3], a00_u16.val[0], 3); + + // Accumulate 4: + c0.val[0] = vmlal_lane_u16(c0.val[0], b40_u16.val[0], a00_u16.val[1], 0); + c0.val[1] = vmlal_lane_u16(c0.val[1], b40_u16.val[1], a00_u16.val[1], 0); + c0.val[2] = vmlal_lane_u16(c0.val[2], b40_u16.val[2], a00_u16.val[1], 0); + c0.val[3] = vmlal_lane_u16(c0.val[3], b40_u16.val[3], a00_u16.val[1], 0); + + // Accumulate 5: + c0.val[0] = vmlal_lane_u16(c0.val[0], b50_u16.val[0], a00_u16.val[1], 1); + c0.val[1] = vmlal_lane_u16(c0.val[1], b50_u16.val[1], a00_u16.val[1], 1); + c0.val[2] = vmlal_lane_u16(c0.val[2], b50_u16.val[2], a00_u16.val[1], 1); + c0.val[3] = vmlal_lane_u16(c0.val[3], b50_u16.val[3], a00_u16.val[1], 1); + + // Accumulate 6: + c0.val[0] = vmlal_lane_u16(c0.val[0], b60_u16.val[0], a00_u16.val[1], 2); + c0.val[1] = vmlal_lane_u16(c0.val[1], b60_u16.val[1], a00_u16.val[1], 2); + c0.val[2] = vmlal_lane_u16(c0.val[2], b60_u16.val[2], a00_u16.val[1], 2); + c0.val[3] = vmlal_lane_u16(c0.val[3], b60_u16.val[3], a00_u16.val[1], 2); + + // Accumulate 7: + c0.val[0] = vmlal_lane_u16(c0.val[0], b70_u16.val[0], a00_u16.val[1], 3); + c0.val[1] = vmlal_lane_u16(c0.val[1], b70_u16.val[1], a00_u16.val[1], 3); + c0.val[2] = vmlal_lane_u16(c0.val[2], b70_u16.val[2], a00_u16.val[1], 3); + c0.val[3] = vmlal_lane_u16(c0.val[3], b70_u16.val[3], a00_u16.val[1], 3); + + vec_a += 8; + matrix_b += 8 * stride_b; + } - const uint16x4x4_t b40_u16 = + // This for loop performs the left-over accumulations + for (; vec_a < vec_a_end_addr;) { - { - vget_low_u16(vmovl_u8(vget_low_u8(b40_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b40_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b40_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b40_u8))) - } - }; + const uint8x8_t a00_u8 = vld1_dup_u8(vec_a); + const uint8x16_t b00_u8 = vld1q_u8(matrix_b); - const uint16x4x4_t b50_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b50_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b50_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b50_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b50_u8))) - } - }; + const uint16x4x4_t b00_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))}}; - const uint16x4x4_t b60_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b60_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b60_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b60_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b60_u8))) - } - }; + // Convert a00_u8 to uint16_t and get the lower part + const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8)); - const uint16x4x4_t b70_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b70_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b70_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b70_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b70_u8))) - } - }; - - // Accumulate 0: - c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16.val[0], 0); - c0.val[1] = vmlal_lane_u16(c0.val[1], 
b00_u16.val[1], a00_u16.val[0], 0); - c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16.val[0], 0); - c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16.val[0], 0); - - // Accumulate 1: - c0.val[0] = vmlal_lane_u16(c0.val[0], b10_u16.val[0], a00_u16.val[0], 1); - c0.val[1] = vmlal_lane_u16(c0.val[1], b10_u16.val[1], a00_u16.val[0], 1); - c0.val[2] = vmlal_lane_u16(c0.val[2], b10_u16.val[2], a00_u16.val[0], 1); - c0.val[3] = vmlal_lane_u16(c0.val[3], b10_u16.val[3], a00_u16.val[0], 1); - - // Accumulate 2: - c0.val[0] = vmlal_lane_u16(c0.val[0], b20_u16.val[0], a00_u16.val[0], 2); - c0.val[1] = vmlal_lane_u16(c0.val[1], b20_u16.val[1], a00_u16.val[0], 2); - c0.val[2] = vmlal_lane_u16(c0.val[2], b20_u16.val[2], a00_u16.val[0], 2); - c0.val[3] = vmlal_lane_u16(c0.val[3], b20_u16.val[3], a00_u16.val[0], 2); - - // Accumulate 3: - c0.val[0] = vmlal_lane_u16(c0.val[0], b30_u16.val[0], a00_u16.val[0], 3); - c0.val[1] = vmlal_lane_u16(c0.val[1], b30_u16.val[1], a00_u16.val[0], 3); - c0.val[2] = vmlal_lane_u16(c0.val[2], b30_u16.val[2], a00_u16.val[0], 3); - c0.val[3] = vmlal_lane_u16(c0.val[3], b30_u16.val[3], a00_u16.val[0], 3); - - // Accumulate 4: - c0.val[0] = vmlal_lane_u16(c0.val[0], b40_u16.val[0], a00_u16.val[1], 0); - c0.val[1] = vmlal_lane_u16(c0.val[1], b40_u16.val[1], a00_u16.val[1], 0); - c0.val[2] = vmlal_lane_u16(c0.val[2], b40_u16.val[2], a00_u16.val[1], 0); - c0.val[3] = vmlal_lane_u16(c0.val[3], b40_u16.val[3], a00_u16.val[1], 0); - - // Accumulate 5: - c0.val[0] = vmlal_lane_u16(c0.val[0], b50_u16.val[0], a00_u16.val[1], 1); - c0.val[1] = vmlal_lane_u16(c0.val[1], b50_u16.val[1], a00_u16.val[1], 1); - c0.val[2] = vmlal_lane_u16(c0.val[2], b50_u16.val[2], a00_u16.val[1], 1); - c0.val[3] = vmlal_lane_u16(c0.val[3], b50_u16.val[3], a00_u16.val[1], 1); - - // Accumulate 6: - c0.val[0] = vmlal_lane_u16(c0.val[0], b60_u16.val[0], a00_u16.val[1], 2); - c0.val[1] = vmlal_lane_u16(c0.val[1], b60_u16.val[1], a00_u16.val[1], 2); - c0.val[2] = vmlal_lane_u16(c0.val[2], b60_u16.val[2], a00_u16.val[1], 2); - c0.val[3] = vmlal_lane_u16(c0.val[3], b60_u16.val[3], a00_u16.val[1], 2); - - // Accumulate 7: - c0.val[0] = vmlal_lane_u16(c0.val[0], b70_u16.val[0], a00_u16.val[1], 3); - c0.val[1] = vmlal_lane_u16(c0.val[1], b70_u16.val[1], a00_u16.val[1], 3); - c0.val[2] = vmlal_lane_u16(c0.val[2], b70_u16.val[2], a00_u16.val[1], 3); - c0.val[3] = vmlal_lane_u16(c0.val[3], b70_u16.val[3], a00_u16.val[1], 3); - - vec_a += 8; - matrix_b += 8 * stride_b; - } + // Accumulate 0: + c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0); + c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0); + c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0); + c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0); - // This for loop performs the left-over accumulations - for(; vec_a < vec_a_end_addr;) - { - const uint8x8_t a00_u8 = vld1_dup_u8(vec_a); - const uint8x16_t b00_u8 = vld1q_u8(matrix_b); + vec_a += 1; + matrix_b += stride_b; + } - const uint16x4x4_t b00_u16 = + auto vec_out = reinterpret_cast(out.ptr()); + if (id.x() < (width_out - 16)) { - { - vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b00_u8))) - } - }; - - // Convert a00_u8 to uint16_t and get the lower part - const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8)); - - // Accumulate 0: - c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], 
a00_u16, 0); - c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0); - c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0); - c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0); - - vec_a += 1; - matrix_b += stride_b; - } - - auto vec_out = reinterpret_cast(out.ptr()); - if(id.x() < (width_out - 16)) - { - vst1q_s32(vec_out + 0, vreinterpretq_s32_u32(c0.val[0])); - vst1q_s32(vec_out + 4, vreinterpretq_s32_u32(c0.val[1])); - vst1q_s32(vec_out + 8, vreinterpretq_s32_u32(c0.val[2])); - vst1q_s32(vec_out + 12, vreinterpretq_s32_u32(c0.val[3])); - } - else - { - auto left_over = width_out - id.x(); - for(auto k = 0; k < 4 && left_over; ++k) + vst1q_s32(vec_out + 0, vreinterpretq_s32_u32(c0.val[0])); + vst1q_s32(vec_out + 4, vreinterpretq_s32_u32(c0.val[1])); + vst1q_s32(vec_out + 8, vreinterpretq_s32_u32(c0.val[2])); + vst1q_s32(vec_out + 12, vreinterpretq_s32_u32(c0.val[3])); + } + else { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) + auto left_over = width_out - id.x(); + for (auto k = 0; k < 4 && left_over; ++k) { - *(vec_out + k * 4 + j) = c0.val[k][j]; + for (auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(vec_out + k * 4 + j) = c0.val[k][j]; + } } } - } - }, - ina, inb, out); + }, + ina, inb, out); } -void inline vector_matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, int width_out, size_t stride_b, const Window &window) +void inline vector_matrix_multiply_s8(Iterator &ina, + Iterator &inb, + Iterator &out, + int width_a, + int width_b, + int width_out, + size_t stride_b, + const Window &window) { - execute_window_loop(window, [&](const Coordinates & id) - { - if(id.x() > width_b) - { - return; - } - - // Accumulators for the block 0 - int32x4x4_t c0 = + execute_window_loop( + window, + [&](const Coordinates &id) { + if (id.x() > width_b) { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) + return; } - }; - - auto vec_a = reinterpret_cast(ina.ptr()); - auto matrix_b = reinterpret_cast(inb.ptr()); - auto vec_a_end_addr = vec_a + width_a; - - // This for loop performs 8 accumulations - for(; vec_a <= (vec_a_end_addr - 8);) - { - const int8x8_t a00_s8 = vld1_s8(vec_a); - const int8x16_t b00_s8 = vld1q_s8(matrix_b + 0 * stride_b); - const int8x16_t b10_s8 = vld1q_s8(matrix_b + 1 * stride_b); - const int8x16_t b20_s8 = vld1q_s8(matrix_b + 2 * stride_b); - const int8x16_t b30_s8 = vld1q_s8(matrix_b + 3 * stride_b); - const int8x16_t b40_s8 = vld1q_s8(matrix_b + 4 * stride_b); - const int8x16_t b50_s8 = vld1q_s8(matrix_b + 5 * stride_b); - const int8x16_t b60_s8 = vld1q_s8(matrix_b + 6 * stride_b); - const int8x16_t b70_s8 = vld1q_s8(matrix_b + 7 * stride_b); - - // Convert a00_s8 to int16_t and get the lower part - const int16x4x2_t a00_s16 = - { - { - vget_low_s16(vmovl_s8(a00_s8)), - vget_high_s16(vmovl_s8(a00_s8)) - } - }; - - const int16x4x4_t b00_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b00_s8))) - } - }; - - const int16x4x4_t b10_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b10_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b10_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b10_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b10_s8))) - } - }; - const int16x4x4_t b20_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b20_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b20_s8))), - 
vget_low_s16(vmovl_s8(vget_high_s8(b20_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b20_s8))) - } - }; + // Accumulators for the block 0 + int32x4x4_t c0 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}}; + + auto vec_a = reinterpret_cast(ina.ptr()); + auto matrix_b = reinterpret_cast(inb.ptr()); + auto vec_a_end_addr = vec_a + width_a; + + // This for loop performs 8 accumulations + for (; vec_a <= (vec_a_end_addr - 8);) + { + const int8x8_t a00_s8 = vld1_s8(vec_a); + const int8x16_t b00_s8 = vld1q_s8(matrix_b + 0 * stride_b); + const int8x16_t b10_s8 = vld1q_s8(matrix_b + 1 * stride_b); + const int8x16_t b20_s8 = vld1q_s8(matrix_b + 2 * stride_b); + const int8x16_t b30_s8 = vld1q_s8(matrix_b + 3 * stride_b); + const int8x16_t b40_s8 = vld1q_s8(matrix_b + 4 * stride_b); + const int8x16_t b50_s8 = vld1q_s8(matrix_b + 5 * stride_b); + const int8x16_t b60_s8 = vld1q_s8(matrix_b + 6 * stride_b); + const int8x16_t b70_s8 = vld1q_s8(matrix_b + 7 * stride_b); + + // Convert a00_s8 to int16_t and get the lower part + const int16x4x2_t a00_s16 = {{vget_low_s16(vmovl_s8(a00_s8)), vget_high_s16(vmovl_s8(a00_s8))}}; + + const int16x4x4_t b00_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))}}; + + const int16x4x4_t b10_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b10_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b10_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b10_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b10_s8)))}}; + + const int16x4x4_t b20_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b20_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b20_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b20_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b20_s8)))}}; + + const int16x4x4_t b30_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b30_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b30_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b30_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b30_s8)))}}; + + const int16x4x4_t b40_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b40_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b40_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b40_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b40_s8)))}}; + + const int16x4x4_t b50_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b50_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b50_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b50_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b50_s8)))}}; + + const int16x4x4_t b60_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b60_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b60_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b60_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b60_s8)))}}; + + const int16x4x4_t b70_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b70_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b70_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b70_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b70_s8)))}}; + + // Accumulate 0: + c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16.val[0], 0); + c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16.val[0], 0); + c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16.val[0], 0); + c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16.val[0], 0); + + // Accumulate 1: + c0.val[0] = vmlal_lane_s16(c0.val[0], b10_s16.val[0], a00_s16.val[0], 1); + c0.val[1] = vmlal_lane_s16(c0.val[1], b10_s16.val[1], a00_s16.val[0], 1); + c0.val[2] = vmlal_lane_s16(c0.val[2], b10_s16.val[2], a00_s16.val[0], 1); + c0.val[3] = 
vmlal_lane_s16(c0.val[3], b10_s16.val[3], a00_s16.val[0], 1); + + // Accumulate 2: + c0.val[0] = vmlal_lane_s16(c0.val[0], b20_s16.val[0], a00_s16.val[0], 2); + c0.val[1] = vmlal_lane_s16(c0.val[1], b20_s16.val[1], a00_s16.val[0], 2); + c0.val[2] = vmlal_lane_s16(c0.val[2], b20_s16.val[2], a00_s16.val[0], 2); + c0.val[3] = vmlal_lane_s16(c0.val[3], b20_s16.val[3], a00_s16.val[0], 2); + + // Accumulate 3: + c0.val[0] = vmlal_lane_s16(c0.val[0], b30_s16.val[0], a00_s16.val[0], 3); + c0.val[1] = vmlal_lane_s16(c0.val[1], b30_s16.val[1], a00_s16.val[0], 3); + c0.val[2] = vmlal_lane_s16(c0.val[2], b30_s16.val[2], a00_s16.val[0], 3); + c0.val[3] = vmlal_lane_s16(c0.val[3], b30_s16.val[3], a00_s16.val[0], 3); + + // Accumulate 4: + c0.val[0] = vmlal_lane_s16(c0.val[0], b40_s16.val[0], a00_s16.val[1], 0); + c0.val[1] = vmlal_lane_s16(c0.val[1], b40_s16.val[1], a00_s16.val[1], 0); + c0.val[2] = vmlal_lane_s16(c0.val[2], b40_s16.val[2], a00_s16.val[1], 0); + c0.val[3] = vmlal_lane_s16(c0.val[3], b40_s16.val[3], a00_s16.val[1], 0); + + // Accumulate 5: + c0.val[0] = vmlal_lane_s16(c0.val[0], b50_s16.val[0], a00_s16.val[1], 1); + c0.val[1] = vmlal_lane_s16(c0.val[1], b50_s16.val[1], a00_s16.val[1], 1); + c0.val[2] = vmlal_lane_s16(c0.val[2], b50_s16.val[2], a00_s16.val[1], 1); + c0.val[3] = vmlal_lane_s16(c0.val[3], b50_s16.val[3], a00_s16.val[1], 1); + + // Accumulate 6: + c0.val[0] = vmlal_lane_s16(c0.val[0], b60_s16.val[0], a00_s16.val[1], 2); + c0.val[1] = vmlal_lane_s16(c0.val[1], b60_s16.val[1], a00_s16.val[1], 2); + c0.val[2] = vmlal_lane_s16(c0.val[2], b60_s16.val[2], a00_s16.val[1], 2); + c0.val[3] = vmlal_lane_s16(c0.val[3], b60_s16.val[3], a00_s16.val[1], 2); + + // Accumulate 7: + c0.val[0] = vmlal_lane_s16(c0.val[0], b70_s16.val[0], a00_s16.val[1], 3); + c0.val[1] = vmlal_lane_s16(c0.val[1], b70_s16.val[1], a00_s16.val[1], 3); + c0.val[2] = vmlal_lane_s16(c0.val[2], b70_s16.val[2], a00_s16.val[1], 3); + c0.val[3] = vmlal_lane_s16(c0.val[3], b70_s16.val[3], a00_s16.val[1], 3); + + vec_a += 8; + matrix_b += 8 * stride_b; + } - const int16x4x4_t b30_s16 = + // This for loop performs the left-over accumulations + for (; vec_a < vec_a_end_addr;) { - { - vget_low_s16(vmovl_s8(vget_low_s8(b30_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b30_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b30_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b30_s8))) - } - }; + const int8x8_t a00_s8 = vld1_dup_s8(vec_a); + const int8x16_t b00_s8 = vld1q_s8(matrix_b); - const int16x4x4_t b40_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b40_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b40_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b40_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b40_s8))) - } - }; + const int16x4x4_t b00_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))}}; - const int16x4x4_t b50_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b50_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b50_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b50_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b50_s8))) - } - }; + // Convert a00_s8 to uint16_t and get the lower part + const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8)); - const int16x4x4_t b60_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b60_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b60_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b60_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b60_s8))) - } - }; - - const int16x4x4_t 
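The main loop above consumes eight LHS values at a time: each int8x8_t is widened to two int16x4_t halves, and vmlal_lane_s16 multiplies a whole row of the RHS by one lane of it. Below is a minimal standalone sketch of that intrinsic pattern (two K steps, four output columns), assuming an AArch64 target with <arm_neon.h>; it is an illustration, not library code.

// Two steps of the K loop updating one int32x4 accumulator, i.e. four adjacent
// output columns, using the widen-then-vmlal_lane pattern of the kernel above.
#include <arm_neon.h>
#include <cstdint>

void two_k_steps(const int8_t *a,      // at least 8 values of the LHS vector
                 const int8_t *b_row0, // RHS values of row k
                 const int8_t *b_row1, // RHS values of row k+1
                 int32_t      *acc4)   // four 32-bit accumulators
{
    int32x4_t c = vld1q_s32(acc4);

    // Widen 8 LHS bytes to 16 bits; lanes 0..3 live in the low half.
    const int16x4_t a_lo = vget_low_s16(vmovl_s8(vld1_s8(a)));

    // Widen the first 4 RHS values of each row and multiply by the matching LHS lane.
    const int16x4_t b0 = vget_low_s16(vmovl_s8(vget_low_s8(vld1q_s8(b_row0))));
    const int16x4_t b1 = vget_low_s16(vmovl_s8(vget_low_s8(vld1q_s8(b_row1))));

    c = vmlal_lane_s16(c, b0, a_lo, 0); // acc += a[k]   * b[k][0..3]
    c = vmlal_lane_s16(c, b1, a_lo, 1); // acc += a[k+1] * b[k+1][0..3]

    vst1q_s32(acc4, c);
}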
b70_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b70_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b70_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b70_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b70_s8))) - } - }; - - // Accumulate 0: - c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16.val[0], 0); - c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16.val[0], 0); - c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16.val[0], 0); - c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16.val[0], 0); - - // Accumulate 1: - c0.val[0] = vmlal_lane_s16(c0.val[0], b10_s16.val[0], a00_s16.val[0], 1); - c0.val[1] = vmlal_lane_s16(c0.val[1], b10_s16.val[1], a00_s16.val[0], 1); - c0.val[2] = vmlal_lane_s16(c0.val[2], b10_s16.val[2], a00_s16.val[0], 1); - c0.val[3] = vmlal_lane_s16(c0.val[3], b10_s16.val[3], a00_s16.val[0], 1); - - // Accumulate 2: - c0.val[0] = vmlal_lane_s16(c0.val[0], b20_s16.val[0], a00_s16.val[0], 2); - c0.val[1] = vmlal_lane_s16(c0.val[1], b20_s16.val[1], a00_s16.val[0], 2); - c0.val[2] = vmlal_lane_s16(c0.val[2], b20_s16.val[2], a00_s16.val[0], 2); - c0.val[3] = vmlal_lane_s16(c0.val[3], b20_s16.val[3], a00_s16.val[0], 2); - - // Accumulate 3: - c0.val[0] = vmlal_lane_s16(c0.val[0], b30_s16.val[0], a00_s16.val[0], 3); - c0.val[1] = vmlal_lane_s16(c0.val[1], b30_s16.val[1], a00_s16.val[0], 3); - c0.val[2] = vmlal_lane_s16(c0.val[2], b30_s16.val[2], a00_s16.val[0], 3); - c0.val[3] = vmlal_lane_s16(c0.val[3], b30_s16.val[3], a00_s16.val[0], 3); - - // Accumulate 4: - c0.val[0] = vmlal_lane_s16(c0.val[0], b40_s16.val[0], a00_s16.val[1], 0); - c0.val[1] = vmlal_lane_s16(c0.val[1], b40_s16.val[1], a00_s16.val[1], 0); - c0.val[2] = vmlal_lane_s16(c0.val[2], b40_s16.val[2], a00_s16.val[1], 0); - c0.val[3] = vmlal_lane_s16(c0.val[3], b40_s16.val[3], a00_s16.val[1], 0); - - // Accumulate 5: - c0.val[0] = vmlal_lane_s16(c0.val[0], b50_s16.val[0], a00_s16.val[1], 1); - c0.val[1] = vmlal_lane_s16(c0.val[1], b50_s16.val[1], a00_s16.val[1], 1); - c0.val[2] = vmlal_lane_s16(c0.val[2], b50_s16.val[2], a00_s16.val[1], 1); - c0.val[3] = vmlal_lane_s16(c0.val[3], b50_s16.val[3], a00_s16.val[1], 1); - - // Accumulate 6: - c0.val[0] = vmlal_lane_s16(c0.val[0], b60_s16.val[0], a00_s16.val[1], 2); - c0.val[1] = vmlal_lane_s16(c0.val[1], b60_s16.val[1], a00_s16.val[1], 2); - c0.val[2] = vmlal_lane_s16(c0.val[2], b60_s16.val[2], a00_s16.val[1], 2); - c0.val[3] = vmlal_lane_s16(c0.val[3], b60_s16.val[3], a00_s16.val[1], 2); - - // Accumulate 7: - c0.val[0] = vmlal_lane_s16(c0.val[0], b70_s16.val[0], a00_s16.val[1], 3); - c0.val[1] = vmlal_lane_s16(c0.val[1], b70_s16.val[1], a00_s16.val[1], 3); - c0.val[2] = vmlal_lane_s16(c0.val[2], b70_s16.val[2], a00_s16.val[1], 3); - c0.val[3] = vmlal_lane_s16(c0.val[3], b70_s16.val[3], a00_s16.val[1], 3); - - vec_a += 8; - matrix_b += 8 * stride_b; - } + // Accumulate 0: + c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0); + c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0); + c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0); + c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0); - // This for loop performs the left-over accumulations - for(; vec_a < vec_a_end_addr;) - { - const int8x8_t a00_s8 = vld1_dup_s8(vec_a); - const int8x16_t b00_s8 = vld1q_s8(matrix_b); + vec_a += 1; + matrix_b += stride_b; + } - const int16x4x4_t b00_s16 = + auto vec_out = reinterpret_cast(out.ptr()); + if (id.x() < (width_out - 16)) { - { - vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), - 
vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b00_s8))) - } - }; - - // Convert a00_s8 to uint16_t and get the lower part - const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8)); - - // Accumulate 0: - c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0); - c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0); - c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0); - c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0); - - vec_a += 1; - matrix_b += stride_b; - } - - auto vec_out = reinterpret_cast(out.ptr()); - if(id.x() < (width_out - 16)) - { - vst1q_s32(vec_out + 0, c0.val[0]); - vst1q_s32(vec_out + 4, c0.val[1]); - vst1q_s32(vec_out + 8, c0.val[2]); - vst1q_s32(vec_out + 12, c0.val[3]); - } - else - { - auto left_over = width_out - id.x(); - for(auto k = 0; k < 4 && left_over; ++k) + vst1q_s32(vec_out + 0, c0.val[0]); + vst1q_s32(vec_out + 4, c0.val[1]); + vst1q_s32(vec_out + 8, c0.val[2]); + vst1q_s32(vec_out + 12, c0.val[3]); + } + else { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) + auto left_over = width_out - id.x(); + for (auto k = 0; k < 4 && left_over; ++k) { - *(vec_out + k * 4 + j) = c0.val[k][j]; + for (auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(vec_out + k * 4 + j) = c0.val[k][j]; + } } } - } - }, - ina, inb, out); + }, + ina, inb, out); } -void inline matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window) +void inline matrix_multiply_u8( + Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window) { const auto width_out = static_cast(out_info.dimension(0)); const auto height_out = static_cast(out_info.dimension(1)); const size_t out_stride = out_info.strides_in_bytes()[1] / out_info.element_size(); - execute_window_loop(window, [&](const Coordinates & id) - { - const uint8_t *mtx_a0 = ina.ptr(); - const uint8_t *mtx_b0 = inb.ptr(); - - // Note: Since the input are all positives, we can use uint32_t - // Accumulators for the block 0 - uint32x4x4_t c0 = + execute_window_loop( + window, + [&](const Coordinates &id) { - { - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0) + const uint8_t *mtx_a0 = ina.ptr(); + const uint8_t *mtx_b0 = inb.ptr(); + + // Note: Since the input are all positives, we can use uint32_t + // Accumulators for the block 0 + uint32x4x4_t c0 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}}; + + // Accumulators for the block 1 + uint32x4x4_t c1 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}}; + + // Accumulators for the block 2 + uint32x4x4_t c2 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}}; + + // Accumulators for the block 3 + uint32x4x4_t c3 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}}; + + for (int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16) + { + const uint8x8_t a00_u8 = vld1_u8(mtx_a0); + const uint8x16_t b00_u8 = vld1q_u8(mtx_b0); + + // Convert a00_u8 to uint16_t and get the lower part + const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8)); + + // Convert b00_s8 to uint16_t + const uint16x4x4_t b00_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))}}; + + // 4x4 block 0 + c0.val[0] = 
vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0); + c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0); + c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0); + c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0); + + // 4x4 block 1 + c1.val[0] = vmlal_lane_u16(c1.val[0], b00_u16.val[0], a00_u16, 1); + c1.val[1] = vmlal_lane_u16(c1.val[1], b00_u16.val[1], a00_u16, 1); + c1.val[2] = vmlal_lane_u16(c1.val[2], b00_u16.val[2], a00_u16, 1); + c1.val[3] = vmlal_lane_u16(c1.val[3], b00_u16.val[3], a00_u16, 1); + + // 4x4 block 2 + c2.val[0] = vmlal_lane_u16(c2.val[0], b00_u16.val[0], a00_u16, 2); + c2.val[1] = vmlal_lane_u16(c2.val[1], b00_u16.val[1], a00_u16, 2); + c2.val[2] = vmlal_lane_u16(c2.val[2], b00_u16.val[2], a00_u16, 2); + c2.val[3] = vmlal_lane_u16(c2.val[3], b00_u16.val[3], a00_u16, 2); + + // 4x4 block 3 + c3.val[0] = vmlal_lane_u16(c3.val[0], b00_u16.val[0], a00_u16, 3); + c3.val[1] = vmlal_lane_u16(c3.val[1], b00_u16.val[1], a00_u16, 3); + c3.val[2] = vmlal_lane_u16(c3.val[2], b00_u16.val[2], a00_u16, 3); + c3.val[3] = vmlal_lane_u16(c3.val[3], b00_u16.val[3], a00_u16, 3); } - }; - // Accumulators for the block 1 - uint32x4x4_t c1 = - { - { - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0) - } - }; + auto mtx_out = reinterpret_cast(out.ptr()); - // Accumulators for the block 2 - uint32x4x4_t c2 = - { - { - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0) - } - }; - - // Accumulators for the block 3 - uint32x4x4_t c3 = - { - { - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0) - } - }; - - for(int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16) - { - const uint8x8_t a00_u8 = vld1_u8(mtx_a0); - const uint8x16_t b00_u8 = vld1q_u8(mtx_b0); - - // Convert a00_u8 to uint16_t and get the lower part - const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8)); - - // Convert b00_s8 to uint16_t - const uint16x4x4_t b00_u16 = + if (id.y() < height_out && id.x() < (width_out - 16)) { + vst1q_s32(mtx_out + 0 * out_stride + 0, vreinterpretq_s32_u32(c0.val[0])); + vst1q_s32(mtx_out + 0 * out_stride + 4, vreinterpretq_s32_u32(c0.val[1])); + vst1q_s32(mtx_out + 0 * out_stride + 8, vreinterpretq_s32_u32(c0.val[2])); + vst1q_s32(mtx_out + 0 * out_stride + 12, vreinterpretq_s32_u32(c0.val[3])); + if (id.y() + 1 < height_out) { - vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b00_u8))) - } - }; - - // 4x4 block 0 - c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0); - c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0); - c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0); - c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0); - - // 4x4 block 1 - c1.val[0] = vmlal_lane_u16(c1.val[0], b00_u16.val[0], a00_u16, 1); - c1.val[1] = vmlal_lane_u16(c1.val[1], b00_u16.val[1], a00_u16, 1); - c1.val[2] = vmlal_lane_u16(c1.val[2], b00_u16.val[2], a00_u16, 1); - c1.val[3] = vmlal_lane_u16(c1.val[3], b00_u16.val[3], a00_u16, 1); - - // 4x4 block 2 - c2.val[0] = vmlal_lane_u16(c2.val[0], b00_u16.val[0], a00_u16, 2); - c2.val[1] = vmlal_lane_u16(c2.val[1], b00_u16.val[1], a00_u16, 2); - c2.val[2] = vmlal_lane_u16(c2.val[2], b00_u16.val[2], a00_u16, 2); - c2.val[3] = vmlal_lane_u16(c2.val[3], b00_u16.val[3], a00_u16, 2); - - // 4x4 block 3 - c3.val[0] = vmlal_lane_u16(c3.val[0], b00_u16.val[0], a00_u16, 3); - 
c3.val[1] = vmlal_lane_u16(c3.val[1], b00_u16.val[1], a00_u16, 3); - c3.val[2] = vmlal_lane_u16(c3.val[2], b00_u16.val[2], a00_u16, 3); - c3.val[3] = vmlal_lane_u16(c3.val[3], b00_u16.val[3], a00_u16, 3); - } - - auto mtx_out = reinterpret_cast(out.ptr()); - - if(id.y() < height_out && id.x() < (width_out - 16)) - { - vst1q_s32(mtx_out + 0 * out_stride + 0, vreinterpretq_s32_u32(c0.val[0])); - vst1q_s32(mtx_out + 0 * out_stride + 4, vreinterpretq_s32_u32(c0.val[1])); - vst1q_s32(mtx_out + 0 * out_stride + 8, vreinterpretq_s32_u32(c0.val[2])); - vst1q_s32(mtx_out + 0 * out_stride + 12, vreinterpretq_s32_u32(c0.val[3])); - if(id.y() + 1 < height_out) - { - vst1q_s32(mtx_out + 1 * out_stride + 0, vreinterpretq_s32_u32(c1.val[0])); - vst1q_s32(mtx_out + 1 * out_stride + 4, vreinterpretq_s32_u32(c1.val[1])); - vst1q_s32(mtx_out + 1 * out_stride + 8, vreinterpretq_s32_u32(c1.val[2])); - vst1q_s32(mtx_out + 1 * out_stride + 12, vreinterpretq_s32_u32(c1.val[3])); - if(id.y() + 2 < height_out) - { - vst1q_s32(mtx_out + 2 * out_stride + 0, vreinterpretq_s32_u32(c2.val[0])); - vst1q_s32(mtx_out + 2 * out_stride + 4, vreinterpretq_s32_u32(c2.val[1])); - vst1q_s32(mtx_out + 2 * out_stride + 8, vreinterpretq_s32_u32(c2.val[2])); - vst1q_s32(mtx_out + 2 * out_stride + 12, vreinterpretq_s32_u32(c2.val[3])); - if(id.y() + 3 < height_out) + vst1q_s32(mtx_out + 1 * out_stride + 0, vreinterpretq_s32_u32(c1.val[0])); + vst1q_s32(mtx_out + 1 * out_stride + 4, vreinterpretq_s32_u32(c1.val[1])); + vst1q_s32(mtx_out + 1 * out_stride + 8, vreinterpretq_s32_u32(c1.val[2])); + vst1q_s32(mtx_out + 1 * out_stride + 12, vreinterpretq_s32_u32(c1.val[3])); + if (id.y() + 2 < height_out) { - vst1q_s32(mtx_out + 3 * out_stride + 0, vreinterpretq_s32_u32(c3.val[0])); - vst1q_s32(mtx_out + 3 * out_stride + 4, vreinterpretq_s32_u32(c3.val[1])); - vst1q_s32(mtx_out + 3 * out_stride + 8, vreinterpretq_s32_u32(c3.val[2])); - vst1q_s32(mtx_out + 3 * out_stride + 12, vreinterpretq_s32_u32(c3.val[3])); + vst1q_s32(mtx_out + 2 * out_stride + 0, vreinterpretq_s32_u32(c2.val[0])); + vst1q_s32(mtx_out + 2 * out_stride + 4, vreinterpretq_s32_u32(c2.val[1])); + vst1q_s32(mtx_out + 2 * out_stride + 8, vreinterpretq_s32_u32(c2.val[2])); + vst1q_s32(mtx_out + 2 * out_stride + 12, vreinterpretq_s32_u32(c2.val[3])); + if (id.y() + 3 < height_out) + { + vst1q_s32(mtx_out + 3 * out_stride + 0, vreinterpretq_s32_u32(c3.val[0])); + vst1q_s32(mtx_out + 3 * out_stride + 4, vreinterpretq_s32_u32(c3.val[1])); + vst1q_s32(mtx_out + 3 * out_stride + 8, vreinterpretq_s32_u32(c3.val[2])); + vst1q_s32(mtx_out + 3 * out_stride + 12, vreinterpretq_s32_u32(c3.val[3])); + } } } } - } - else - { - const auto left_over_value = width_out - id.x(); - auto left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) + else { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) + const auto left_over_value = width_out - id.x(); + auto left_over = left_over_value; + for (auto k = 0; k < 4 && left_over; ++k) { - *(mtx_out + k * 4 + j) = c0.val[k][j]; - } - } - if(id.y() + 1 < height_out) - { - left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) - { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) + for (auto j = 0; j < 4 && left_over; ++j, --left_over) { - *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j]; + *(mtx_out + k * 4 + j) = c0.val[k][j]; } } - if(id.y() + 2 < height_out) + if (id.y() + 1 < height_out) { left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) + for (auto k = 0; k < 4 && left_over; ++k) 
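matrix_multiply_u8 above relies on A having been interleaved 4x4 and B transposed 1xW, so each iteration reads 4 A values (one per output row) and 16 B values (one per output column) and updates a 4x16 tile of S32 accumulators c0..c3. A scalar model of one tile under that assumed reshaped layout is sketched below; k_steps corresponds to width_b / 16 in the kernel's loop and the plain pointers are illustrative, not the library's API.

// Scalar model of one 4x16 output tile for the interleaved/transposed layout
// described in the comment above (illustrative only).
#include <cstdint>

void tile_4x16_ref(const uint8_t *mtx_a,   // interleaved A: 4 values per accumulation step
                   const uint8_t *mtx_b,   // transposed B: 16 values per accumulation step
                   int32_t       *tile,    // 4 x 16 accumulators, row-major
                   int            k_steps) // number of accumulation steps
{
    for (int k = 0; k < k_steps; ++k, mtx_a += 4, mtx_b += 16)
    {
        for (int r = 0; r < 4; ++r) // output rows -> the c0..c3 blocks above
        {
            for (int j = 0; j < 16; ++j) // output columns -> the four uint32x4 per block
            {
                tile[r * 16 + j] += static_cast<int32_t>(mtx_a[r]) * static_cast<int32_t>(mtx_b[j]);
            }
        }
    }
}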
{ - for(auto j = 0; j < 4 && left_over; ++j, --left_over) + for (auto j = 0; j < 4 && left_over; ++j, --left_over) { - *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j]; + *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j]; } } - if(id.y() + 3 < height_out) + if (id.y() + 2 < height_out) { left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) + for (auto k = 0; k < 4 && left_over; ++k) + { + for (auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j]; + } + } + if (id.y() + 3 < height_out) { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) + left_over = left_over_value; + for (auto k = 0; k < 4 && left_over; ++k) { - *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j]; + for (auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j]; + } } } } } } - } - }, - ina, inb, out); + }, + ina, inb, out); } -void inline matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window) +void inline matrix_multiply_s8( + Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window) { const auto width_out = static_cast(out_info.dimension(0)); const auto height_out = static_cast(out_info.dimension(1)); @@ -691,182 +540,148 @@ void inline matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int // The implementation assumes that the matrix A and Matrix B have been reshaped respectively with CpuGemmInterleave4x4 and CpuGemmTranspose1xW // The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration // All the values needed for computing a single 4x4 block will be read from consecutive memory positions - execute_window_loop(window, [&](const Coordinates & id) - { - auto *mtx_a0 = reinterpret_cast(ina.ptr()); - auto *mtx_b0 = reinterpret_cast(inb.ptr()); - - // Note: Since the input are all positives, we can use uint32_t - // Accumulators for the block 0 - int32x4x4_t c0 = + execute_window_loop( + window, + [&](const Coordinates &id) { - { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) + auto *mtx_a0 = reinterpret_cast(ina.ptr()); + auto *mtx_b0 = reinterpret_cast(inb.ptr()); + + // Note: Since the input are all positives, we can use uint32_t + // Accumulators for the block 0 + int32x4x4_t c0 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}}; + + // Accumulators for the block 1 + int32x4x4_t c1 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}}; + + // Accumulators for the block 2 + int32x4x4_t c2 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}}; + + // Accumulators for the block 3 + int32x4x4_t c3 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}}; + + for (int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16) + { + const int8x8_t a00_s8 = vld1_s8(mtx_a0); + const int8x16_t b00_s8 = vld1q_s8(mtx_b0); + + // Convert a00_s8 to uint16_t and get the lower part + const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8)); + + // Convert b00_s8 to int16_t + const int16x4x4_t b00_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))}}; + + // 4x4 block 0 + c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], 
a00_s16, 0); + c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0); + c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0); + c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0); + + // 4x4 block 1 + c1.val[0] = vmlal_lane_s16(c1.val[0], b00_s16.val[0], a00_s16, 1); + c1.val[1] = vmlal_lane_s16(c1.val[1], b00_s16.val[1], a00_s16, 1); + c1.val[2] = vmlal_lane_s16(c1.val[2], b00_s16.val[2], a00_s16, 1); + c1.val[3] = vmlal_lane_s16(c1.val[3], b00_s16.val[3], a00_s16, 1); + + // 4x4 block 2 + c2.val[0] = vmlal_lane_s16(c2.val[0], b00_s16.val[0], a00_s16, 2); + c2.val[1] = vmlal_lane_s16(c2.val[1], b00_s16.val[1], a00_s16, 2); + c2.val[2] = vmlal_lane_s16(c2.val[2], b00_s16.val[2], a00_s16, 2); + c2.val[3] = vmlal_lane_s16(c2.val[3], b00_s16.val[3], a00_s16, 2); + + // 4x4 block 3 + c3.val[0] = vmlal_lane_s16(c3.val[0], b00_s16.val[0], a00_s16, 3); + c3.val[1] = vmlal_lane_s16(c3.val[1], b00_s16.val[1], a00_s16, 3); + c3.val[2] = vmlal_lane_s16(c3.val[2], b00_s16.val[2], a00_s16, 3); + c3.val[3] = vmlal_lane_s16(c3.val[3], b00_s16.val[3], a00_s16, 3); } - }; - - // Accumulators for the block 1 - int32x4x4_t c1 = - { - { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) - } - }; - - // Accumulators for the block 2 - int32x4x4_t c2 = - { - { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) - } - }; - - // Accumulators for the block 3 - int32x4x4_t c3 = - { - { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) - } - }; - - for(int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16) - { - const int8x8_t a00_s8 = vld1_s8(mtx_a0); - const int8x16_t b00_s8 = vld1q_s8(mtx_b0); - - // Convert a00_s8 to uint16_t and get the lower part - const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8)); - - // Convert b00_s8 to int16_t - const int16x4x4_t b00_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b00_s8))) - } - }; - - // 4x4 block 0 - c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0); - c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0); - c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0); - c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0); - - // 4x4 block 1 - c1.val[0] = vmlal_lane_s16(c1.val[0], b00_s16.val[0], a00_s16, 1); - c1.val[1] = vmlal_lane_s16(c1.val[1], b00_s16.val[1], a00_s16, 1); - c1.val[2] = vmlal_lane_s16(c1.val[2], b00_s16.val[2], a00_s16, 1); - c1.val[3] = vmlal_lane_s16(c1.val[3], b00_s16.val[3], a00_s16, 1); - - // 4x4 block 2 - c2.val[0] = vmlal_lane_s16(c2.val[0], b00_s16.val[0], a00_s16, 2); - c2.val[1] = vmlal_lane_s16(c2.val[1], b00_s16.val[1], a00_s16, 2); - c2.val[2] = vmlal_lane_s16(c2.val[2], b00_s16.val[2], a00_s16, 2); - c2.val[3] = vmlal_lane_s16(c2.val[3], b00_s16.val[3], a00_s16, 2); - - // 4x4 block 3 - c3.val[0] = vmlal_lane_s16(c3.val[0], b00_s16.val[0], a00_s16, 3); - c3.val[1] = vmlal_lane_s16(c3.val[1], b00_s16.val[1], a00_s16, 3); - c3.val[2] = vmlal_lane_s16(c3.val[2], b00_s16.val[2], a00_s16, 3); - c3.val[3] = vmlal_lane_s16(c3.val[3], b00_s16.val[3], a00_s16, 3); - } - auto mtx_out = reinterpret_cast(out.ptr()); - if(id.y() < height_out && id.x() < (width_out - 16)) - { - vst1q_s32(mtx_out + 0 * out_stride + 0, c0.val[0]); - vst1q_s32(mtx_out + 0 * out_stride + 4, c0.val[1]); - vst1q_s32(mtx_out + 0 * out_stride + 8, c0.val[2]); - 
vst1q_s32(mtx_out + 0 * out_stride + 12, c0.val[3]); - if(id.y() + 1 < height_out) - { - vst1q_s32(mtx_out + 1 * out_stride + 0, c1.val[0]); - vst1q_s32(mtx_out + 1 * out_stride + 4, c1.val[1]); - vst1q_s32(mtx_out + 1 * out_stride + 8, c1.val[2]); - vst1q_s32(mtx_out + 1 * out_stride + 12, c1.val[3]); - if(id.y() + 2 < height_out) - { - vst1q_s32(mtx_out + 2 * out_stride + 0, c2.val[0]); - vst1q_s32(mtx_out + 2 * out_stride + 4, c2.val[1]); - vst1q_s32(mtx_out + 2 * out_stride + 8, c2.val[2]); - vst1q_s32(mtx_out + 2 * out_stride + 12, c2.val[3]); - if(id.y() + 3 < height_out) + auto mtx_out = reinterpret_cast(out.ptr()); + if (id.y() < height_out && id.x() < (width_out - 16)) + { + vst1q_s32(mtx_out + 0 * out_stride + 0, c0.val[0]); + vst1q_s32(mtx_out + 0 * out_stride + 4, c0.val[1]); + vst1q_s32(mtx_out + 0 * out_stride + 8, c0.val[2]); + vst1q_s32(mtx_out + 0 * out_stride + 12, c0.val[3]); + if (id.y() + 1 < height_out) + { + vst1q_s32(mtx_out + 1 * out_stride + 0, c1.val[0]); + vst1q_s32(mtx_out + 1 * out_stride + 4, c1.val[1]); + vst1q_s32(mtx_out + 1 * out_stride + 8, c1.val[2]); + vst1q_s32(mtx_out + 1 * out_stride + 12, c1.val[3]); + if (id.y() + 2 < height_out) { - vst1q_s32(mtx_out + 3 * out_stride + 0, c3.val[0]); - vst1q_s32(mtx_out + 3 * out_stride + 4, c3.val[1]); - vst1q_s32(mtx_out + 3 * out_stride + 8, c3.val[2]); - vst1q_s32(mtx_out + 3 * out_stride + 12, c3.val[3]); + vst1q_s32(mtx_out + 2 * out_stride + 0, c2.val[0]); + vst1q_s32(mtx_out + 2 * out_stride + 4, c2.val[1]); + vst1q_s32(mtx_out + 2 * out_stride + 8, c2.val[2]); + vst1q_s32(mtx_out + 2 * out_stride + 12, c2.val[3]); + if (id.y() + 3 < height_out) + { + vst1q_s32(mtx_out + 3 * out_stride + 0, c3.val[0]); + vst1q_s32(mtx_out + 3 * out_stride + 4, c3.val[1]); + vst1q_s32(mtx_out + 3 * out_stride + 8, c3.val[2]); + vst1q_s32(mtx_out + 3 * out_stride + 12, c3.val[3]); + } } } } - } - else if(id.y() < height_out) - { - const auto left_over_value = width_out - id.x(); - auto left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) - { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) - { - *(mtx_out + k * 4 + j) = c0.val[k][j]; - } - } - if(id.y() + 1 < height_out) + else if (id.y() < height_out) { - left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) + const auto left_over_value = width_out - id.x(); + auto left_over = left_over_value; + for (auto k = 0; k < 4 && left_over; ++k) { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) + for (auto j = 0; j < 4 && left_over; ++j, --left_over) { - *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j]; + *(mtx_out + k * 4 + j) = c0.val[k][j]; } } - if(id.y() + 2 < height_out) + if (id.y() + 1 < height_out) { left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) + for (auto k = 0; k < 4 && left_over; ++k) { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) + for (auto j = 0; j < 4 && left_over; ++j, --left_over) { - *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j]; + *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j]; } } - if(id.y() + 3 < height_out) + if (id.y() + 2 < height_out) { left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) + for (auto k = 0; k < 4 && left_over; ++k) { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) + for (auto j = 0; j < 4 && left_over; ++j, --left_over) { - *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j]; + *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j]; + } + } + if (id.y() + 3 < height_out) + { + left_over = left_over_value; 
+ for (auto k = 0; k < 4 && left_over; ++k) + { + for (auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j]; + } } } } } } - } - - }, - ina, inb, out); + }, + ina, inb, out); } Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, DataType::U8); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::S8, DataType::U8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S8, DataType::U8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::S8, + DataType::U8); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); TensorShape in0_shape = src0->tensor_shape(); @@ -874,9 +689,10 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons TensorShape out_shape = dst->tensor_shape(); // Check vector-by-matrix case - if(out_shape[1] == 1) + if (out_shape[1] == 1) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[0] != in1_shape[1], "The number of input0's columns must be equal to input1's rows"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[0] != in1_shape[1], + "The number of input0's columns must be equal to input1's rows"); } else { @@ -884,8 +700,11 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons in1_shape.collapse(2); out_shape.collapse(2); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2], "Output tensor must have the same number of batches of input0 tensor"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], "Input1 tensor must have the same number of batches of input0 or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2], + "Output tensor must have the same number of batches of input0 tensor"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], + "Input1 tensor must have the same number of batches of input0 or the number of batches must be set to 1"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[0] % 16, "Input1's width must be a multiple of 16"); } @@ -909,20 +728,22 @@ void CpuGemmLowpMatrixMultiplyKernel::configure(const ITensorInfo *src0, const I Window win; // Check if the output tensor is a vector. 
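The constraints enforced by validate_arguments above can be exercised with a small shape sketch. This only illustrates the rules (quantized 8-bit inputs, S32 output, and, for the vector case, src0's width equal to src1's height); the kernel header lives under src/cpu and is library-internal, so treat the include and the Status check below as assumptions rather than a supported public API.

// Shape sketch for the vector-by-matrix case (assumes the TensorInfo(shape,
// num_channels, data_type) constructor and the internal kernel header; not a
// supported public entry point).
#include "arm_compute/core/TensorInfo.h"
#include "src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h"

bool gemmlowp_mm_shapes_ok()
{
    using namespace arm_compute;

    const TensorInfo src0(TensorShape(64U, 1U), 1, DataType::QASYMM8);  // 1 row  x 64 cols (K = 64)
    const TensorInfo src1(TensorShape(32U, 64U), 1, DataType::QASYMM8); // 64 rows x 32 cols (N = 32)
    const TensorInfo dst(TensorShape(32U, 1U), 1, DataType::S32);       // 1 row  x 32 cols, always S32

    const Status s = cpu::kernels::CpuGemmLowpMatrixMultiplyKernel::validate(&src0, &src1, &dst);
    return s.error_code() == ErrorCode::OK;
}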
If so,the kernel runs the vector-matrix multiplication - if((dst->dimension(1) == 1)) + if ((dst->dimension(1) == 1)) { // Configure kernel window win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x)); } else { - win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win = + calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); } ICpuKernel::configure(win); } -Status CpuGemmLowpMatrixMultiplyKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +Status +CpuGemmLowpMatrixMultiplyKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst)); return Status{}; @@ -939,12 +760,13 @@ void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window auto dst = tensors.get_tensor(TensorType::ACL_DST); // Check if the output tensor is a vector. If so,the kernel runs the vector-matrix multiplication path - if((dst->info()->dimension(1) == 1)) + if ((dst->info()->dimension(1) == 1)) { const auto width_matrix_a = static_cast(src0->info()->dimension(0)); const auto width_matrix_b = static_cast(src1->info()->dimension(0)); const auto width_out = static_cast(dst->info()->dimension(0)); - const auto in_b_stride = static_cast(src1->info()->strides_in_bytes()[1] / data_size_from_type(src1->info()->data_type())); + const auto in_b_stride = + static_cast(src1->info()->strides_in_bytes()[1] / data_size_from_type(src1->info()->data_type())); // The implementation computes 16 elements per iteration const int window_start_x = 16 * info.thread_id; @@ -963,7 +785,7 @@ void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window Window win_b; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(src1->info()->num_dimensions() >= 3) + if (src1->info()->num_dimensions() >= 3) { win_b = window; } @@ -974,18 +796,20 @@ void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window Iterator inb(src1, win_b); Iterator out(dst, win_out); - switch(src0->info()->data_type()) + switch (src0->info()->data_type()) { case DataType::S8: case DataType::QASYMM8_SIGNED: { - vector_matrix_multiply_s8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, window); + vector_matrix_multiply_s8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, + window); break; } case DataType::U8: case DataType::QASYMM8: { - vector_matrix_multiply_u8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, window); + vector_matrix_multiply_u8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, + window); break; } default: @@ -1009,7 +833,7 @@ void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window Window win_b; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(_slide_matrix_b) + if (_slide_matrix_b) { win_b = window; } @@ -1021,7 +845,7 @@ void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window Iterator inb(src1, win_b); Iterator out(dst, window); - switch(src0->info()->data_type()) + switch 
(src0->info()->data_type())
         {
             case DataType::S8:
             case DataType::QASYMM8_SIGNED:
@@ -1050,4 +874,4 @@ const char *CpuGemmLowpMatrixMultiplyKernel::name() const
 }
 } // namespace kernels
 } // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h
index 2cc789d6d998b63036474d524460753f7486ba2e..439ada1b478641c3c77e5879bf33720833910fda 100644
--- a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h
@@ -68,11 +68,11 @@ public:
     static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);

     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;

 private:
-    bool _slide_matrix_b{ true };
+    bool _slide_matrix_b{true};
 };
 } // namespace kernels
 } // namespace cpu
diff --git a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp
index 534076b97c4f059e4aaf999414a5f2cb925fdea6..9bd1eae663a94214ff23cf8d5b77989674604526 100644
--- a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp
+++ b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp
@@ -26,9 +26,10 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"

 namespace arm_compute
 {
@@ -38,37 +39,49 @@ namespace kernels
 {
 namespace
 {
-Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
+Status validate_arguments_matrix_a_reduction(const ITensorInfo                 *src,
+                                             const ITensorInfo                 *dst,
+                                             const GEMMLowpReductionKernelInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported");
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);

-    if(dst->total_size() > 0)
+    if (dst->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(1), "Output vector must have length equal to the number of rows of the input matrix");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            dst->dimension(0) != src->dimension(1),
+            "Output vector must have length equal to the number of rows of the input matrix");
     }

     return Status{};
 }
-Status validate_arguments_matrix_b_reduction(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
+Status validate_arguments_matrix_b_reduction(const ITensorInfo                 *src,
+                                             const ITensorInfo                 *dst,
+                                             const GEMMLowpReductionKernelInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported");
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8,
DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(0), "Output vector must have length equal to the number of columns of the input matrix"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + dst->dimension(0) != src->dimension(0), + "Output vector must have length equal to the number of columns of the input matrix"); } return Status{}; } } // namespace -void CpuGemmLowpMatrixAReductionKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) +void CpuGemmLowpMatrixAReductionKernel::configure(const ITensorInfo *src, + ITensorInfo *dst, + const GEMMLowpReductionKernelInfo &info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -77,7 +90,7 @@ void CpuGemmLowpMatrixAReductionKernel::configure(const ITensorInfo *src, ITenso _scalar = info.scalar; _mul_by_scalar = info.mul_by_scalar; - switch(src->data_type()) + switch (src->data_type()) { case DataType::QASYMM8: _func = &CpuGemmLowpMatrixAReductionKernel::run_internal; @@ -98,14 +111,18 @@ void CpuGemmLowpMatrixAReductionKernel::configure(const ITensorInfo *src, ITenso ICpuKernel::configure(win); } -Status CpuGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) +Status CpuGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const GEMMLowpReductionKernelInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(src, dst, info)); return Status{}; } template -void CpuGemmLowpMatrixAReductionKernel::run_internal(const ITensor *src, ITensor *dst, const arm_compute::Window &window) +void CpuGemmLowpMatrixAReductionKernel::run_internal(const ITensor *src, + ITensor *dst, + const arm_compute::Window &window) { // Intermediate and final accumulator types using TIAcc = wrapper::traits::promote_t; @@ -121,55 +138,58 @@ void CpuGemmLowpMatrixAReductionKernel::run_internal(const ITensor *src, ITensor Iterator in(src, win_input); Iterator out(dst, collapsed_window); - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - auto vsum_row = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); - TAcc sum_row = 0; + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + auto vsum_row = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); + TAcc sum_row = 0; - const T *matrix_a = reinterpret_cast((in.ptr() + id.x() * src->info()->strides_in_bytes()[1] + id.y() * src->info()->strides_in_bytes()[2])); + const T *matrix_a = reinterpret_cast( + (in.ptr() + id.x() * src->info()->strides_in_bytes()[1] + id.y() * src->info()->strides_in_bytes()[2])); #if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a)); + asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a)); #endif /* __arm__ */ - int i = 0; - // This for loop performs 16 accumulations - for(; i <= (_k - 16); i += 16) - { - const auto a0_d8 = wrapper::vloadq(matrix_a + i); + int i = 0; + // This for loop performs 16 accumulations + for (; i <= (_k - 16); i += 16) + { + const auto a0_d8 = wrapper::vloadq(matrix_a + i); - // Partial accumulations in U16 - const auto tmp_sum0 = 
wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8)); + // Partial accumulations in U16 + const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8)); - // Accumulate to U32 - vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0)); - } + // Accumulate to U32 + vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0)); + } - // This for loop performs the leftover accumulations - for(; i < _k; ++i) - { - sum_row += static_cast(matrix_a[i]); - } + // This for loop performs the leftover accumulations + for (; i < _k; ++i) + { + sum_row += static_cast(matrix_a[i]); + } #if defined(__aarch64__) - // Reduction operation available on 64 bit architectures only - sum_row += wrapper::vaddv(vsum_row); + // Reduction operation available on 64 bit architectures only + sum_row += wrapper::vaddv(vsum_row); #else // __aarch64__ - auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row)); - tmp = wrapper::vpadd(tmp, tmp); + auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row)); + tmp = wrapper::vpadd(tmp, tmp); - sum_row += wrapper::vgetlane(tmp, 0); + sum_row += wrapper::vgetlane(tmp, 0); #endif // __aarch64__ - // Multiply by scalar if necessary - if(_mul_by_scalar) - { - sum_row *= _scalar; - } + // Multiply by scalar if necessary + if (_mul_by_scalar) + { + sum_row *= _scalar; + } - *(reinterpret_cast(out.ptr())) = static_cast(sum_row); - }, - in, out); + *(reinterpret_cast(out.ptr())) = static_cast(sum_row); + }, + in, out); } void CpuGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) @@ -189,7 +209,9 @@ const char *CpuGemmLowpMatrixAReductionKernel::name() const return "CpuGemmLowpMatrixAReductionKernel"; } -void CpuGemmLowpMatrixBReductionKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) +void CpuGemmLowpMatrixBReductionKernel::configure(const ITensorInfo *src, + ITensorInfo *dst, + const GEMMLowpReductionKernelInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(src, dst, info)); @@ -201,7 +223,7 @@ void CpuGemmLowpMatrixBReductionKernel::configure(const ITensorInfo *src, ITenso // Configure kernel window constexpr unsigned int num_elems_processed_per_iteration = 16; - switch(src->data_type()) + switch (src->data_type()) { case DataType::QASYMM8: _func = &CpuGemmLowpMatrixBReductionKernel::run_internal; @@ -223,14 +245,19 @@ void CpuGemmLowpMatrixBReductionKernel::configure(const ITensorInfo *src, ITenso ICpuKernel::configure(win); } -Status CpuGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) +Status CpuGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const GEMMLowpReductionKernelInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(src, dst, info)); return Status{}; } template -void CpuGemmLowpMatrixBReductionKernel::run_internal(const ITensor *src, ITensor *dst, const Window &window, const ThreadInfo &info) +void CpuGemmLowpMatrixBReductionKernel::run_internal(const ITensor *src, + ITensor *dst, + const Window &window, + const ThreadInfo &info) { // Intermediate and final accumulator types using TIAcc = wrapper::traits::promote_t; @@ -258,121 +285,116 @@ void CpuGemmLowpMatrixBReductionKernel::run_internal(const ITensor *src, ITensor Iterator inb(src, win_in); Iterator 
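The matrix-A reduction above computes one sum per row of A: 16 bytes per step are widened u8 to u16 with vaddl, folded to u32 with vpaddl, and finally reduced horizontally with vaddv on AArch64, with a scalar loop for the leftover columns and an optional multiply by _scalar. The same pattern written with plain intrinsics instead of the wrapper:: helpers, as a standalone sketch of the unsigned variant only:

// One row sum of a u8 matrix row, mirroring the widening reduction used above
// (assumes an AArch64 target for vaddvq_u32; illustrative, not library code).
#include <arm_neon.h>
#include <cstdint>

int32_t row_sum_u8(const uint8_t *row, int k, int32_t scalar, bool mul_by_scalar)
{
    uint32x4_t vsum = vdupq_n_u32(0);
    int        i    = 0;
    for (; i <= k - 16; i += 16)
    {
        const uint8x16_t a = vld1q_u8(row + i);
        // u8 + u8 -> u16 partial sums, then pairwise widen to u32 and accumulate.
        const uint16x8_t tmp = vaddl_u8(vget_low_u8(a), vget_high_u8(a));
        vsum                 = vaddq_u32(vsum, vpaddlq_u16(tmp));
    }

    uint32_t sum = vaddvq_u32(vsum); // horizontal add, 64-bit ISA only
    for (; i < k; ++i)               // leftover columns
    {
        sum += row[i];
    }
    return mul_by_scalar ? static_cast<int32_t>(sum) * scalar : static_cast<int32_t>(sum);
}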
out(dst, win_out); - execute_window_loop(win_out, [&](const Coordinates & id) - { - if(id.x() > width_matrix_b) + execute_window_loop( + win_out, + [&](const Coordinates &id) { - return; - } + if (id.x() > width_matrix_b) + { + return; + } - // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation - typename wrapper::traits::neon_bitvector::type sum_col[4] = - { - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}) - }; + // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation + typename wrapper::traits::neon_bitvector::type sum_col[4] = { + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{})}; - const auto *matrix_b = reinterpret_cast(inb.ptr() + id.y() * src->info()->strides_in_bytes()[2]); + const auto *matrix_b = reinterpret_cast(inb.ptr() + id.y() * src->info()->strides_in_bytes()[2]); #if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b)); - asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride)); + asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b)); + asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride)); #endif /* __arm__ */ - int i = 0; - // This for loop performs 4 accumulations - for(; i <= (_k - 4); i += 4) - { - const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride); - const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride); - const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride); - const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride); + int i = 0; + // This for loop performs 4 accumulations + for (; i <= (_k - 4); i += 4) + { + const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride); + const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride); + const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride); + const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride); #if __arm__ - asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride)); - asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride)); - asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride)); - asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride)); + asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride)); + asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride)); + asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride)); + asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride)); #endif /* __arm__ */ - // Partial accumulation in 16bit - typename wrapper::traits::neon_bitvector::type tmp_sum[2] = - { - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}) - }; - - tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8)); - tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8)); - tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8)); - tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8)); - tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8)); - tmp_sum[1] = wrapper::vaddw(tmp_sum[1], 
wrapper::vgethigh(b1_u8)); - tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8)); - tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8)); - - // Accumulate to 32bit - sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0])); - sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0])); - sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1])); - sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1])); - - matrix_b += 4 * in_b_stride; - } - - // This for loop perfoms the leftover accumulations - for(; i < _k; ++i) - { - const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride); + // Partial accumulation in 16bit + typename wrapper::traits::neon_bitvector::type tmp_sum[2] = { + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{})}; + + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8)); + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8)); + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8)); + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8)); + + // Accumulate to 32bit + sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0])); + sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0])); + sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1])); + sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1])); + + matrix_b += 4 * in_b_stride; + } - // Convert S8 to S16 - const typename wrapper::traits::neon_bitvector::type b0_b16[2] + // This for loop perfoms the leftover accumulations + for (; i < _k; ++i) { - wrapper::vmovl(wrapper::vgetlow(b0_b8)), - wrapper::vmovl(wrapper::vgethigh(b0_b8)) - }; + const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride); - // Accumulate to 32bit - sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0])); - sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0])); - sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1])); - sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1])); + // Convert S8 to S16 + const typename wrapper::traits::neon_bitvector::type b0_b16[2]{ + wrapper::vmovl(wrapper::vgetlow(b0_b8)), wrapper::vmovl(wrapper::vgethigh(b0_b8))}; - matrix_b += in_b_stride; - } + // Accumulate to 32bit + sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0])); + sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0])); + sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1])); + sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1])); - // Multiply by scalar if necessary - if(_mul_by_scalar) - { - sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar); - sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar); - sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar); - sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar); - } - - auto vector_sum_col = reinterpret_cast(out.ptr()); - if(id.x() + 16 < width_matrix_b) - { - wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0])); - wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1])); - wrapper::vstore(vector_sum_col + 8, 
wrapper::vreinterpret(sum_col[2])); - wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3])); - } - else - { - auto left_over = width_matrix_b - id.x(); - for(auto k = 0; k < 4 && left_over; ++k) + matrix_b += in_b_stride; + } + + // Multiply by scalar if necessary + if (_mul_by_scalar) + { + sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar); + sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar); + sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar); + sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar); + } + + auto vector_sum_col = reinterpret_cast(out.ptr()); + if (id.x() + 16 < width_matrix_b) + { + wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0])); + wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1])); + wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2])); + wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3])); + } + else { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) + auto left_over = width_matrix_b - id.x(); + for (auto k = 0; k < 4 && left_over; ++k) { - *(vector_sum_col + k * 4 + j) = sum_col[k][j]; + for (auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(vector_sum_col + k * 4 + j) = sum_col[k][j]; + } } } - } - }, - inb, out); + }, + inb, out); } void CpuGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) @@ -393,4 +415,4 @@ const char *CpuGemmLowpMatrixBReductionKernel::name() const } } // namespace kernels } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h index e469629cdb3124e659d25d22e87e849b351dc611..20ef17e96d4560275677abbe6bda100ee7f1b626 100644 --- a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h +++ b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h @@ -66,7 +66,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: @@ -85,12 +85,14 @@ private: * @param[out] dst Output tensor * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). */ - using CpuGemmLowpMatrixAReductionKernelPtr = void (CpuGemmLowpMatrixAReductionKernel::*)(const ITensor *src, ITensor *dst, const Window &window); + using CpuGemmLowpMatrixAReductionKernelPtr = void (CpuGemmLowpMatrixAReductionKernel::*)(const ITensor *src, + ITensor *dst, + const Window &window); - CpuGemmLowpMatrixAReductionKernelPtr _func{ nullptr }; - int32_t _k{ 0 }; - int32_t _scalar{ 0 }; - bool _mul_by_scalar{ false }; + CpuGemmLowpMatrixAReductionKernelPtr _func{nullptr}; + int32_t _k{0}; + int32_t _scalar{0}; + bool _mul_by_scalar{false}; }; /** Kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B. 
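The matrix-B reduction just shown produces one S32 sum per column of B, 16 columns at a time with the same widening scheme and an element-wise tail. A scalar model, assuming a plain row-major layout (the signed variant is analogous):

// Column sums of a k x width u8 matrix, optionally scaled, as computed by the
// matrix-B reduction above (illustrative layout assumptions, not the windowed code).
#include <cstddef>
#include <cstdint>

void column_sums_ref(const uint8_t *matrix_b, // k rows x width columns, row stride = stride_b
                     int32_t       *sums,     // width accumulators
                     int            width,
                     int            k,
                     size_t         stride_b,
                     int32_t        scalar,
                     bool           mul_by_scalar)
{
    for (int j = 0; j < width; ++j)
    {
        int32_t acc = 0;
        for (int i = 0; i < k; ++i)
        {
            acc += matrix_b[i * stride_b + j];
        }
        sums[j] = mul_by_scalar ? acc * scalar : acc;
    }
}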
@@ -124,7 +126,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: @@ -144,12 +146,15 @@ private: * @param[out] dst Output tensor * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). */ - using CpuGemmLowpMatrixBReductionKernelPtr = void (CpuGemmLowpMatrixBReductionKernel::*)(const ITensor *src, ITensor *dst, const Window &window, const ThreadInfo &info); + using CpuGemmLowpMatrixBReductionKernelPtr = void (CpuGemmLowpMatrixBReductionKernel::*)(const ITensor *src, + ITensor *dst, + const Window &window, + const ThreadInfo &info); - CpuGemmLowpMatrixBReductionKernelPtr _func{ nullptr }; - int32_t _k{ 0 }; - int32_t _scalar{ 0 }; - bool _mul_by_scalar{ false }; + CpuGemmLowpMatrixBReductionKernelPtr _func{nullptr}; + int32_t _k{0}; + int32_t _scalar{0}; + bool _mul_by_scalar{false}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp index a65f1a33de740a40a63b00916004b079ca57e99f..e290783021274f927da432aa33db02df543f2c6b 100644 --- a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp +++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -44,32 +45,37 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, - int32_t a_offset, int32_t b_offset) +Status validate_arguments(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + int32_t a_offset, + int32_t b_offset) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) + if (a_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); } // If b_offset == 0, vector_sum_row can be a nullptr - if(b_offset != 0) + if (b_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = + mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != + (mm_result->dimension(1) * mm_result->dimension(2))); ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); TensorShape output_shape = mm_result->tensor_shape(); - if(output_shape.num_dimensions() > 1) 
+ if (output_shape.num_dimensions() > 1) { const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2; @@ -80,13 +86,15 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], "mm_result tensor must have the same number of batches of output tensor"); - if(a_offset != 0) + if (a_offset != 0) { TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); vector_sum_col_shape.collapse_from(1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && + vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of " + "vector_sum_row_shape or the number of batches must be set to 1"); } } } @@ -94,9 +102,15 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto return Status{}; } -void run_offset_contribution(const Window &window, - ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, - int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col, bool is_gemm3d) +void run_offset_contribution(const Window &window, + ITensor *mm_result, + const ITensor *vector_sum_col, + const ITensor *vector_sum_row, + int32_t a_offset, + int32_t b_offset, + int32_t k_offset, + bool slide_vector_sum_col, + bool is_gemm3d) { Window collapsed_window = window.collapse_if_possible(window, Window::DimZ); collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1)); @@ -112,7 +126,7 @@ void run_offset_contribution(const Window &window, const size_t sum_col_stride_y = (vector_sum_col != nullptr) ? (vector_sum_col->info()->strides_in_bytes().y()) : 0; Iterator mm_result_it(mm_result, collapsed_window); - if((a_offset != 0) && (b_offset != 0) && (vector_sum_col != nullptr) && (vector_sum_row != nullptr)) // true, true + if ((a_offset != 0) && (b_offset != 0) && (vector_sum_col != nullptr) && (vector_sum_row != nullptr)) // true, true { // Set window for vector_sum_col Window win_vector_sum_col(collapsed_window); @@ -131,95 +145,85 @@ void run_offset_contribution(const Window &window, const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); // Offset in case vector_sum_col is batched - const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0; - - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const size_t batch_offset_col = batch_id * (sum_col_stride_y ); - auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_offset_col + batch_id * vector_sum_col_batch_offset); - auto mm_result_ptr = reinterpret_cast(mm_result_it.ptr()); - - // Compute the leftover term due to b_offset. - int32_t b_offset_term_s32 = *(reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input); - b_offset_term_s32 *= b_offset; + const int vector_sum_col_batch_offset = + slide_vector_sum_col ? 
vector_sum_col->info()->strides_in_bytes().z() : 0; - const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) { - // Compute the leftover term due to a_offset. - int32x4x4_t a_offset_term_s32 = - { - { - vld1q_s32(vector_sum_col_ptr + x + 0), - vld1q_s32(vector_sum_col_ptr + x + 4), - vld1q_s32(vector_sum_col_ptr + x + 8), - vld1q_s32(vector_sum_col_ptr + x + 12) - } - }; - - a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); - a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); - a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); - a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); - - // Add a_offset_term_s32 and b_offset_term_s32 - int32x4x4_t offset_term_s32 = + const int batch_id = id.z() / depth_input; + const size_t batch_offset_col = batch_id * (sum_col_stride_y); + auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_offset_col + + batch_id * vector_sum_col_batch_offset); + auto mm_result_ptr = reinterpret_cast(mm_result_it.ptr()); + + // Compute the leftover term due to b_offset. + int32_t b_offset_term_s32 = + *(reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + + id.y() + (id.z() % depth_input) * height_input); + b_offset_term_s32 *= b_offset; + + const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset) - } - }; - - offset_term_s32.val[0] = vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32_vec)); - offset_term_s32.val[1] = vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32_vec)); - offset_term_s32.val[2] = vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32_vec)); - offset_term_s32.val[3] = vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32_vec)); - - int32x4x4_t in_s32 = + // Compute the leftover term due to a_offset. 
+ int32x4x4_t a_offset_term_s32 = { + {vld1q_s32(vector_sum_col_ptr + x + 0), vld1q_s32(vector_sum_col_ptr + x + 4), + vld1q_s32(vector_sum_col_ptr + x + 8), vld1q_s32(vector_sum_col_ptr + x + 12)}}; + + a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); + a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); + a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); + a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); + + // Add a_offset_term_s32 and b_offset_term_s32 + int32x4x4_t offset_term_s32 = { + {vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset)}}; + + offset_term_s32.val[0] = + vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32_vec)); + offset_term_s32.val[1] = + vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32_vec)); + offset_term_s32.val[2] = + vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32_vec)); + offset_term_s32.val[3] = + vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32_vec)); + + int32x4x4_t in_s32 = {{vld1q_s32(mm_result_ptr + x + 0), vld1q_s32(mm_result_ptr + x + 4), + vld1q_s32(mm_result_ptr + x + 8), vld1q_s32(mm_result_ptr + x + 12)}}; + + // Add the offset terms to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]); + + // Store the result with the offset contribution + vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); + vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); + vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); + vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); + } + + // Left-overs loop + for (; x < window_end_x; ++x) { - { - vld1q_s32(mm_result_ptr + x + 0), - vld1q_s32(mm_result_ptr + x + 4), - vld1q_s32(mm_result_ptr + x + 8), - vld1q_s32(mm_result_ptr + x + 12) - } - }; - - // Add the offset terms to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]); - in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]); - in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]); - - // Store the result with the offset contribution - vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); - vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); - vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); - vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); - } + // Compute the leftover term due to a_offset. + int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x); - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Compute the leftover term due to a_offset. 
- int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x); - - a_offset_term_s32 *= a_offset; + a_offset_term_s32 *= a_offset; - // Add the offset terms to GEMM's result - // Store the result with the offset contribution - mm_result_ptr[x] += k_offset + a_offset_term_s32 + b_offset_term_s32; - } - }, - vector_sum_col_it, vector_sum_row_it, mm_result_it); + // Add the offset terms to GEMM's result + // Store the result with the offset contribution + mm_result_ptr[x] += k_offset + a_offset_term_s32 + b_offset_term_s32; + } + }, + vector_sum_col_it, vector_sum_row_it, mm_result_it); } - else if((a_offset == 0) && (b_offset != 0) && (vector_sum_row != nullptr)) // false, true + else if ((a_offset == 0) && (b_offset != 0) && (vector_sum_row != nullptr)) // false, true { ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row); @@ -233,54 +237,51 @@ void run_offset_contribution(const Window &window, const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - auto mm_result_ptr = reinterpret_cast(mm_result_it.ptr()); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + auto mm_result_ptr = reinterpret_cast(mm_result_it.ptr()); - // Compute the leftover term due to b_offset. - int32_t b_offset_term_s32 = *(reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input); - b_offset_term_s32 *= b_offset; + // Compute the leftover term due to b_offset. + int32_t b_offset_term_s32 = + *(reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + + id.y() + (id.z() % depth_input) * height_input); + b_offset_term_s32 *= b_offset; - const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32); + const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - int32x4x4_t in_s32 = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s32(mm_result_ptr + x + 0), - vld1q_s32(mm_result_ptr + x + 4), - vld1q_s32(mm_result_ptr + x + 8), - vld1q_s32(mm_result_ptr + x + 12) - } - }; - - // Add the offset terms to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], b_offset_term_s32_vec); - in_s32.val[1] = vaddq_s32(in_s32.val[1], b_offset_term_s32_vec); - in_s32.val[2] = vaddq_s32(in_s32.val[2], b_offset_term_s32_vec); - in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32_vec); - - // Store the result with the offset contribution - vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); - vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); - vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); - vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); - } - - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Add the offset terms to GEMM's result - // Store the result with the offset contribution - mm_result_ptr[x] += b_offset_term_s32; - } - }, - vector_sum_row_it, mm_result_it); + int32x4x4_t in_s32 = {{vld1q_s32(mm_result_ptr + x + 0), vld1q_s32(mm_result_ptr + x + 4), + vld1q_s32(mm_result_ptr + x + 8), vld1q_s32(mm_result_ptr + x + 12)}}; + + // Add the offset terms to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], b_offset_term_s32_vec); + in_s32.val[1] = vaddq_s32(in_s32.val[1], b_offset_term_s32_vec); + in_s32.val[2] = vaddq_s32(in_s32.val[2], b_offset_term_s32_vec); + 
in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32_vec); + + // Store the result with the offset contribution + vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); + vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); + vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); + vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); + } + + // Left-overs loop + for (; x < window_end_x; ++x) + { + // Add the offset terms to GEMM's result + // Store the result with the offset contribution + mm_result_ptr[x] += b_offset_term_s32; + } + }, + vector_sum_row_it, mm_result_it); } - else if((a_offset != 0) && (b_offset == 0) && (vector_sum_col != nullptr)) // true, false + else if ((a_offset != 0) && (b_offset == 0) && (vector_sum_col != nullptr)) // true, false { // Set window for vector_sum_col Window win_vector_sum_col(collapsed_window); @@ -290,69 +291,62 @@ void run_offset_contribution(const Window &window, Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col); // Offset in case vector_sum_col is batched - const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0; - - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const size_t batch_offset_col = batch_id * (sum_col_stride_y ); // Value to offset vector_sum_col_ptr to allow for iteration of y values in tensor - auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_offset_col + batch_id * vector_sum_col_batch_offset); - auto mm_result_ptr = reinterpret_cast(mm_result_it.ptr()); + const int vector_sum_col_batch_offset = + slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0; - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) { - // Compute the leftover term due to a_offset. - int32x4x4_t a_offset_term_s32 = + const int batch_id = id.z() / depth_input; + const size_t batch_offset_col = + batch_id * + (sum_col_stride_y); // Value to offset vector_sum_col_ptr to allow for iteration of y values in tensor + auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_offset_col + + batch_id * vector_sum_col_batch_offset); + auto mm_result_ptr = reinterpret_cast(mm_result_it.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s32(vector_sum_col_ptr + x + 0), - vld1q_s32(vector_sum_col_ptr + x + 4), - vld1q_s32(vector_sum_col_ptr + x + 8), - vld1q_s32(vector_sum_col_ptr + x + 12) - } - }; - - a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); - a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); - a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); - a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); - - int32x4x4_t in_s32 = + // Compute the leftover term due to a_offset. 
+ int32x4x4_t a_offset_term_s32 = { + {vld1q_s32(vector_sum_col_ptr + x + 0), vld1q_s32(vector_sum_col_ptr + x + 4), + vld1q_s32(vector_sum_col_ptr + x + 8), vld1q_s32(vector_sum_col_ptr + x + 12)}}; + + a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); + a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); + a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); + a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); + + int32x4x4_t in_s32 = {{vld1q_s32(mm_result_ptr + x + 0), vld1q_s32(mm_result_ptr + x + 4), + vld1q_s32(mm_result_ptr + x + 8), vld1q_s32(mm_result_ptr + x + 12)}}; + + // Add the offset terms to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], a_offset_term_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], a_offset_term_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], a_offset_term_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]); + + // Store the result with the offset contribution + vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); + vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); + vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); + vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); + } + + // Left-overs loop + for (; x < window_end_x; ++x) { - { - vld1q_s32(mm_result_ptr + x + 0), - vld1q_s32(mm_result_ptr + x + 4), - vld1q_s32(mm_result_ptr + x + 8), - vld1q_s32(mm_result_ptr + x + 12) - } - }; - - // Add the offset terms to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], a_offset_term_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], a_offset_term_s32.val[1]); - in_s32.val[2] = vaddq_s32(in_s32.val[2], a_offset_term_s32.val[2]); - in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]); - - // Store the result with the offset contribution - vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); - vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); - vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); - vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); - } - - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Compute the leftover term due to a_offset. - const int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x); - - // Add the offset terms to GEMM's result - // Store the result with the offset contribution - mm_result_ptr[x] += a_offset_term_s32 * a_offset; - } - }, - vector_sum_col_it, mm_result_it); + // Compute the leftover term due to a_offset. 
+ const int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x); + + // Add the offset terms to GEMM's result + // Store the result with the offset contribution + mm_result_ptr[x] += a_offset_term_s32 * a_offset; + } + }, + vector_sum_col_it, mm_result_it); } else // false, false { @@ -362,7 +356,12 @@ void run_offset_contribution(const Window &window, } } // namespace -void CpuGemmLowpOffsetContributionKernel::configure(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset) +void CpuGemmLowpOffsetContributionKernel::configure(ITensorInfo *mm_result, + ITensorInfo *vector_sum_col, + ITensorInfo *vector_sum_row, + int32_t k, + int32_t a_offset, + int32_t b_offset) { // Perform validate step ARM_COMPUTE_UNUSED(vector_sum_row); @@ -374,7 +373,7 @@ void CpuGemmLowpOffsetContributionKernel::configure(ITensorInfo *mm_result, ITen _k_offset = a_offset * b_offset * k; // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) + if (a_offset != 0) { // Check if vector_sum_col_shape should be slidden or not // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1 @@ -387,8 +386,11 @@ void CpuGemmLowpOffsetContributionKernel::configure(ITensorInfo *mm_result, ITen ICpuKernel::configure(win); } -Status CpuGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, - int32_t a_offset, int32_t b_offset) +Status CpuGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + int32_t a_offset, + int32_t b_offset) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset)); return Status{}; @@ -405,11 +407,11 @@ void CpuGemmLowpOffsetContributionKernel::run_op(ITensorPack &tensors, const Win auto mm_result = tensors.get_tensor(TensorType::ACL_DST); // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = vector_sum_row != nullptr - && mm_result->info()->num_dimensions() > 1 - && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); + const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->info()->num_dimensions() > 1 && + mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); - run_offset_contribution(window, mm_result, vector_sum_col, vector_sum_row, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, reinterpret_as_3d); + run_offset_contribution(window, mm_result, vector_sum_col, vector_sum_row, _a_offset, _b_offset, _k_offset, + _slide_vector_sum_col, reinterpret_as_3d); } const char *CpuGemmLowpOffsetContributionKernel::name() const @@ -418,4 +420,4 @@ const char *CpuGemmLowpOffsetContributionKernel::name() const } } // namespace kernels } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h index 3514ca811d486e9d873edad4016389ffa7613b70..08b2d47529f5c50bde678da91201a486094a02cb 100644 --- a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h +++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h @@ -63,24 +63,33 @@ public: * @param[in] a_offset Offset to be added to each element of the matrix A. 
* @param[in] b_offset Offset to be added to each element of the matrix B. */ - void configure(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset); + void configure(ITensorInfo *mm_result, + ITensorInfo *vector_sum_col, + ITensorInfo *vector_sum_row, + int32_t k, + int32_t a_offset, + int32_t b_offset); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuGemmLowpOffsetContributionKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, int32_t a_offset, int32_t b_offset); + static Status validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + int32_t a_offset, + int32_t b_offset); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: - int32_t _a_offset{ 0 }; - int32_t _b_offset{ 0 }; - int32_t _k_offset{ 0 }; - bool _slide_vector_sum_col{ true }; + int32_t _a_offset{0}; + int32_t _b_offset{0}; + int32_t _k_offset{0}; + bool _slide_vector_sum_col{true}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp index 190487eced81e036206c89ce4e9481aba9e1d9d0..d0088423987923d8fcd8c236dab245992be10a02 100644 --- a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp +++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp @@ -31,10 +31,11 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" #include @@ -48,80 +49,38 @@ namespace { inline int32x4x4_t load_results_input(const Iterator &mm_result_it, int32_t x) { - return - { - { - vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + x + 12) - } - }; + return {{vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + x + 0), + vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + x + 4), + vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + x + 8), + vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + x + 12)}}; } inline int32x4x4_t load(const int32_t *ptr, int32_t x) { - return - { - { - vld1q_s32(ptr + x + 0), - vld1q_s32(ptr + x + 4), - vld1q_s32(ptr + x + 8), - vld1q_s32(ptr + x + 12) - } - }; + return {{vld1q_s32(ptr + x + 0), vld1q_s32(ptr + x + 4), vld1q_s32(ptr + x + 8), vld1q_s32(ptr + x + 12)}}; } inline int32x4x4_t add_s32(int32x4x4_t a, int32x4_t b) { - return - { - { - vaddq_s32(a.val[0], b), - vaddq_s32(a.val[1], b), - vaddq_s32(a.val[2], b), - vaddq_s32(a.val[3], b) - } - }; + return {{vaddq_s32(a.val[0], b), vaddq_s32(a.val[1], b), vaddq_s32(a.val[2], b), vaddq_s32(a.val[3], b)}}; } inline int32x4x4_t add_s32(int32x4x4_t a, int32x4x4_t b) { - return - { - { - vaddq_s32(a.val[0], b.val[0]), - vaddq_s32(a.val[1], 
b.val[1]), - vaddq_s32(a.val[2], b.val[2]), - vaddq_s32(a.val[3], b.val[3]) - } - }; + return {{vaddq_s32(a.val[0], b.val[0]), vaddq_s32(a.val[1], b.val[1]), vaddq_s32(a.val[2], b.val[2]), + vaddq_s32(a.val[3], b.val[3])}}; } inline int32x4x4_t mul_s32(int32x4x4_t &a, int32_t mul_scalar) { - return - { - { - vmulq_n_s32(a.val[0], mul_scalar), - vmulq_n_s32(a.val[1], mul_scalar), - vmulq_n_s32(a.val[2], mul_scalar), - vmulq_n_s32(a.val[3], mul_scalar) - } - }; + return {{vmulq_n_s32(a.val[0], mul_scalar), vmulq_n_s32(a.val[1], mul_scalar), vmulq_n_s32(a.val[2], mul_scalar), + vmulq_n_s32(a.val[3], mul_scalar)}}; } inline int32x4x4_t mul_s32(int32x4x4_t &a, const int32_t *multilpier) { - return - { - { - vmulq_s32(a.val[0], vld1q_s32(multilpier)), - vmulq_s32(a.val[1], vld1q_s32(multilpier + 4)), - vmulq_s32(a.val[2], vld1q_s32(multilpier + 8)), - vmulq_s32(a.val[3], vld1q_s32(multilpier + 12)) - } - }; + return {{vmulq_s32(a.val[0], vld1q_s32(multilpier)), vmulq_s32(a.val[1], vld1q_s32(multilpier + 4)), + vmulq_s32(a.val[2], vld1q_s32(multilpier + 8)), vmulq_s32(a.val[3], vld1q_s32(multilpier + 12))}}; } inline int32x4x4_t get_a_offset(const int32_t *vector_sum_col_ptr, int32_t a_offset, int32_t x) @@ -144,18 +103,11 @@ inline int32x4_t get_b_offset(const int32_t *vector_sum_row_ptr, int32_t b_offse inline int32x4x4_t get_k_offset(int32_t k_offset) { - return - { - { - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset) - } - }; + return {{vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset)}}; } -inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8, bool is_bounded_relu) +inline uint8x16_t finalize_quantization_floating_point( + int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8, bool is_bounded_relu) { const static int32x4_t zero_s32 = vdupq_n_s32(0); @@ -172,18 +124,13 @@ inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int3 in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; + const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}}; // Convert S16 to U8 uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1])); - if(is_bounded_relu) + if (is_bounded_relu) { out_u8 = vmaxq_u8(out_u8, min_u8); out_u8 = vminq_u8(out_u8, max_u8); @@ -192,7 +139,8 @@ inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int3 return out_u8; } -inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu) +inline int8x16_t finalize_quantization_floating_point( + int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu) { const static int32x4_t zero_s32 = vdupq_n_s32(0); @@ -209,18 +157,13 @@ inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32 in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - 
vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; + const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}}; // Convert S16 to S8 int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); - if(is_bounded_relu) + if (is_bounded_relu) { out_s8 = vmaxq_s8(out_s8, min_s8); out_s8 = vminq_s8(out_s8, max_s8); @@ -229,7 +172,8 @@ inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32 return out_s8; } -inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu) +inline int8x16_t finalize_quantization_floating_point( + int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu) { const static int32x4_t zero_s32 = vdupq_n_s32(0); @@ -246,18 +190,13 @@ inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32 in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; + const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}}; // Convert S16 to S8 int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); - if(is_bounded_relu) + if (is_bounded_relu) { out_s8 = vmaxq_s8(out_s8, min_s8); out_s8 = vminq_s8(out_s8, max_s8); @@ -305,81 +244,103 @@ inline Iterator get_bias_it(const Window &window, const ITensor *bias) } template -inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr, const int32_t *vector_sum_row_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it, - const int32x4_t result_offset_s32, const int32x4_t result_shift_s32, - typename VT::vtype min_vec, typename VT::vtype max_vec, - int32_t a_offset, int32_t b_offset, int32_t k_offset, - int32_t multiplier, int32_t shift, int32_t offset, int32_t min_bound, int32_t max_bound, - int window_step_x, int window_start_x, int window_end_x, bool has_a_offset, bool has_b_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point) +inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr, + const int32_t *vector_sum_row_ptr, + const int32_t *bias_ptr, + Iterator mm_result_it, + Iterator out_it, + const int32x4_t result_offset_s32, + const int32x4_t result_shift_s32, + typename VT::vtype min_vec, + typename VT::vtype max_vec, + int32_t a_offset, + int32_t b_offset, + int32_t k_offset, + int32_t multiplier, + int32_t shift, + int32_t offset, + int32_t min_bound, + int32_t max_bound, + int window_step_x, + int window_start_x, + int window_end_x, + bool has_a_offset, + bool has_b_offset, + bool has_bias, + bool is_bounded_relu, + bool is_fixed_point) { - int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 }; - if(!is_fixed_point) + int32x4x4_t offset_term_s32 = {0, 0, 0, 0}; + if (!is_fixed_point) { // Combine quantization offset with other offsets. 
offset_term_s32 = add_s32(offset_term_s32, result_offset_s32); } - if(has_a_offset && has_b_offset) + if (has_a_offset && has_b_offset) { offset_term_s32 = add_s32(offset_term_s32, get_k_offset(k_offset)); } - if(has_b_offset) + if (has_b_offset) { offset_term_s32 = add_s32(offset_term_s32, get_b_offset(vector_sum_row_ptr, b_offset)); } int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { int32x4x4_t in_s32 = load_results_input(mm_result_it, x); - if(has_a_offset) + if (has_a_offset) { in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x)); } - if(has_bias) + if (has_bias) { in_s32 = add_s32(in_s32, load(bias_ptr, x)); } - if(!is_fixed_point || has_b_offset) + if (!is_fixed_point || has_b_offset) { in_s32 = add_s32(in_s32, offset_term_s32); } - if(!is_fixed_point) + if (!is_fixed_point) { in_s32 = mul_s32(in_s32, multiplier); } - if(is_fixed_point) + if (is_fixed_point) { - wrapper::vstore(reinterpret_cast(out_it.ptr() + x), - finalize_quantization(in_s32, multiplier, shift, result_offset_s32, min_vec, max_vec, is_bounded_relu)); + wrapper::vstore( + reinterpret_cast(out_it.ptr() + x), + finalize_quantization(in_s32, multiplier, shift, result_offset_s32, min_vec, max_vec, is_bounded_relu)); } else { - wrapper::vstore(reinterpret_cast(out_it.ptr() + x), - finalize_quantization_floating_point(in_s32, result_shift_s32, min_vec, max_vec, is_bounded_relu)); + wrapper::vstore( + reinterpret_cast(out_it.ptr() + x), + finalize_quantization_floating_point(in_s32, result_shift_s32, min_vec, max_vec, is_bounded_relu)); } } // Compute left-over elements - for(; x < window_end_x; ++x) + for (; x < window_end_x; ++x) { - int32_t in_value = *(reinterpret_cast(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0); + int32_t in_value = + *(reinterpret_cast(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0); - if(has_a_offset) + if (has_a_offset) { in_value += (*(vector_sum_col_ptr + x) * a_offset); } - if(has_bias) + if (has_bias) { in_value += *(bias_ptr + x); } - if(is_fixed_point) + if (is_fixed_point) { // Finalize and store the result - *reinterpret_cast(out_it.ptr() + x) = finalize_quantization(in_value, multiplier, shift, offset, - static_cast(min_bound), - static_cast(max_bound), is_bounded_relu); + *reinterpret_cast(out_it.ptr() + x) = + finalize_quantization(in_value, multiplier, shift, offset, static_cast(min_bound), + static_cast(max_bound), is_bounded_relu); } else { @@ -387,75 +348,100 @@ inline void run_offset_contribution_output_stage_window(const int32_t *vector_su in_value = (in_value * multiplier) >> shift; // Bound and store the result - if(is_bounded_relu) + if (is_bounded_relu) { - in_value = static_cast(std::max(min_bound, std::min(max_bound, in_value))); + in_value = static_cast( + std::max(min_bound, std::min(max_bound, in_value))); } - *reinterpret_cast(out_it.ptr() + x) = static_cast(std::max(static_cast(std::numeric_limits::lowest()), - std::min(static_cast(std::numeric_limits::max()), in_value))); + *reinterpret_cast(out_it.ptr() + x) = + static_cast(std::max( + static_cast(std::numeric_limits::lowest()), + std::min(static_cast(std::numeric_limits::max()), in_value))); } } } -inline void run_offset_contribution_output_stage_window_symm(const int32_t *vector_sum_col_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it, - const int32_t *result_multipliers, const int32_t *result_shifts, - const int32x4_t 
result_offset, int8x16_t min_s8, int8x16_t max_s8, - int32_t a_offset, int32_t offset, int32_t min_bound, int32_t max_bound, - int window_step_x, int window_start_x, int window_end_x, bool has_a_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point) +inline void run_offset_contribution_output_stage_window_symm(const int32_t *vector_sum_col_ptr, + const int32_t *bias_ptr, + Iterator mm_result_it, + Iterator out_it, + const int32_t *result_multipliers, + const int32_t *result_shifts, + const int32x4_t result_offset, + int8x16_t min_s8, + int8x16_t max_s8, + int32_t a_offset, + int32_t offset, + int32_t min_bound, + int32_t max_bound, + int window_step_x, + int window_start_x, + int window_end_x, + bool has_a_offset, + bool has_bias, + bool is_bounded_relu, + bool is_fixed_point) { - int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 }; - if(!is_fixed_point) + int32x4x4_t offset_term_s32 = {0, 0, 0, 0}; + if (!is_fixed_point) { // Combine quantization offset with other offsets. offset_term_s32 = add_s32(offset_term_s32, result_offset); } int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { int32x4x4_t in_s32 = load_results_input(mm_result_it, x); - if(has_a_offset) + if (has_a_offset) { in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x)); } - if(has_bias) + if (has_bias) { in_s32 = add_s32(in_s32, load(bias_ptr, x)); } - if(!is_fixed_point) + if (!is_fixed_point) { in_s32 = add_s32(in_s32, offset_term_s32); in_s32 = mul_s32(in_s32, result_multipliers + x); } - if(is_fixed_point) + if (is_fixed_point) { - vst1q_s8(reinterpret_cast(out_it.ptr() + x), finalize_quantization_symm(in_s32, load(result_multipliers, x), load(result_shifts, x), result_offset, min_s8, max_s8, is_bounded_relu)); + vst1q_s8(reinterpret_cast(out_it.ptr() + x), + finalize_quantization_symm(in_s32, load(result_multipliers, x), load(result_shifts, x), + result_offset, min_s8, max_s8, is_bounded_relu)); } else { - vst1q_s8(reinterpret_cast(out_it.ptr() + x), finalize_quantization_floating_point(in_s32, load(result_shifts, x), min_s8, max_s8, is_bounded_relu)); + vst1q_s8( + reinterpret_cast(out_it.ptr() + x), + finalize_quantization_floating_point(in_s32, load(result_shifts, x), min_s8, max_s8, is_bounded_relu)); } } // Compute left-over elements - for(; x < window_end_x; ++x) + for (; x < window_end_x; ++x) { - int32_t in_value = *(reinterpret_cast(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0); + int32_t in_value = + *(reinterpret_cast(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0); - if(has_a_offset) + if (has_a_offset) { in_value += (*(vector_sum_col_ptr + x) * a_offset); } - if(has_bias) + if (has_bias) { in_value += *(bias_ptr + x); } - if(is_fixed_point) + if (is_fixed_point) { // Finalize and store the result - *(out_it.ptr() + x) = finalize_quantization(in_value, result_multipliers[x], result_shifts[x], offset, static_cast(min_bound), static_cast(max_bound), is_bounded_relu); + *(out_it.ptr() + x) = + finalize_quantization(in_value, result_multipliers[x], result_shifts[x], offset, + static_cast(min_bound), static_cast(max_bound), is_bounded_relu); } else { @@ -463,7 +449,7 @@ inline void run_offset_contribution_output_stage_window_symm(const int32_t *vect in_value = (in_value * result_multipliers[x]) >> (-result_shifts[x]); // Bound and store the result - if(is_bounded_relu) + if (is_bounded_relu) { in_value = static_cast(std::max(min_bound, 
std::min(max_bound, in_value))); } @@ -473,10 +459,20 @@ inline void run_offset_contribution_output_stage_window_symm(const int32_t *vect } template -void run_offset_contribution_output_stage(const Window &window, - const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, - int32_t a_offset, int32_t b_offset, int32_t k_offset, bool is_vector_sum_col_batched, - GEMMLowpOutputStageInfo output_stage, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point) +void run_offset_contribution_output_stage(const Window &window, + const ITensor *mm_result, + const ITensor *vector_sum_col, + const ITensor *vector_sum_row, + const ITensor *bias, + ITensor *output, + int32_t a_offset, + int32_t b_offset, + int32_t k_offset, + bool is_vector_sum_col_batched, + GEMMLowpOutputStageInfo output_stage, + bool is_gemm3d, + bool is_bounded_relu, + bool is_fixed_point) { // Semantics of XYZW Explained for each tensor // @@ -516,7 +512,7 @@ void run_offset_contribution_output_stage(const Window &window, Iterator mm_result_it(mm_result, win); Iterator out_it(output, win); - if((a_offset != 0) && (b_offset != 0)) + if ((a_offset != 0) && (b_offset != 0)) { ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col); ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row); @@ -527,45 +523,52 @@ void run_offset_contribution_output_stage(const Window &window, const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); // Offset in case vector_sum_col is batched in y dimension - const int vector_sum_col_stride_batch = is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0; + const int vector_sum_col_stride_batch = + is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0; - if(bias != nullptr) + if (bias != nullptr) { Iterator bias_it = get_bias_it(collapsed_window, bias); - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); - const auto vector_sum_row_ptr = reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) - + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast(bias_it.ptr()), - mm_result_it, - out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, true, true, true, is_bounded_relu, is_fixed_point); - }, - vector_sum_col_it, vector_sum_row_it, bias_it, mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast( + vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); + const auto vector_sum_row_ptr = + reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + + id.y() + (id.z() % depth_input) * height_input; + run_offset_contribution_output_stage_window( + vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast(bias_it.ptr()), + mm_result_it, out_it, result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset, + k_offset, multiplier, shift, offset, min_bound, max_bound, window_step_x, window_start_x, + window_end_x, true, true, true, is_bounded_relu, is_fixed_point); + }, + vector_sum_col_it, 
vector_sum_row_it, bias_it, mm_result_it, out_it); } else { - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); - const auto vector_sum_row_ptr = reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) - + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, true, true, false, is_bounded_relu, is_fixed_point); - }, - vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast( + vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); + const auto vector_sum_row_ptr = + reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + + id.y() + (id.z() % depth_input) * height_input; + run_offset_contribution_output_stage_window( + vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, result_offset_s32, + result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset, multiplier, shift, offset, + min_bound, max_bound, window_step_x, window_start_x, window_end_x, true, true, false, + is_bounded_relu, is_fixed_point); + }, + vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it); } } - else if((a_offset == 0) && (b_offset != 0)) + else if ((a_offset == 0) && (b_offset != 0)) { ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row); @@ -573,114 +576,139 @@ void run_offset_contribution_output_stage(const Window &window, const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); - if(bias != nullptr) + if (bias != nullptr) { Iterator bias_it = get_bias_it(collapsed_window, bias); - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_row_ptr = reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) - + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window(nullptr, vector_sum_row_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, - out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, false, true, true, is_bounded_relu, is_fixed_point); - }, - vector_sum_row_it, bias_it, mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_row_ptr = + reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + + id.y() + (id.z() % depth_input) * height_input; + run_offset_contribution_output_stage_window( + nullptr, vector_sum_row_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, + out_it, result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, window_step_x, window_start_x, window_end_x, + false, true, true, is_bounded_relu, is_fixed_point); + }, + vector_sum_row_it, bias_it, mm_result_it, out_it); } else { - 
execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_row_ptr = reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) - + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, false, true, false, is_bounded_relu, is_fixed_point); - }, - vector_sum_row_it, mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_row_ptr = + reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + + id.y() + (id.z() % depth_input) * height_input; + run_offset_contribution_output_stage_window( + nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, false, true, false, is_bounded_relu, + is_fixed_point); + }, + vector_sum_row_it, mm_result_it, out_it); } } - else if((a_offset != 0) && (b_offset == 0)) + else if ((a_offset != 0) && (b_offset == 0)) { ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col); Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col); // Offset in case vector_sum_col is batched in y dimension - const int vector_sum_col_stride_batch = is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0; + const int vector_sum_col_stride_batch = + is_vector_sum_col_batched ? 
vector_sum_col->info()->strides_in_bytes().y() : 0; - if(bias != nullptr) + if (bias != nullptr) { Iterator bias_it = get_bias_it(collapsed_window, bias); - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); - run_offset_contribution_output_stage_window(vector_sum_col_ptr, nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, - out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, true, false, true, is_bounded_relu, is_fixed_point); - }, - vector_sum_col_it, bias_it, mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast( + vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); + run_offset_contribution_output_stage_window( + vector_sum_col_ptr, nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, + out_it, result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, window_step_x, window_start_x, window_end_x, + true, false, true, is_bounded_relu, is_fixed_point); + }, + vector_sum_col_it, bias_it, mm_result_it, out_it); } else { - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); - run_offset_contribution_output_stage_window(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, true, false, false, is_bounded_relu, is_fixed_point); - }, - vector_sum_col_it, mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast( + vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); + run_offset_contribution_output_stage_window( + vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, true, false, false, is_bounded_relu, + is_fixed_point); + }, + vector_sum_col_it, mm_result_it, out_it); } } else { - if(bias != nullptr) + if (bias != nullptr) { Iterator bias_it = get_bias_it(collapsed_window, bias); - execute_window_loop(collapsed_window, [&](const Coordinates &) - { - run_offset_contribution_output_stage_window(nullptr, nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, false, false, true, is_bounded_relu, is_fixed_point); - }, - bias_it, mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &) + { + run_offset_contribution_output_stage_window( + nullptr, nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, + 
result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset, multiplier, + shift, offset, min_bound, max_bound, window_step_x, window_start_x, window_end_x, false, false, + true, is_bounded_relu, is_fixed_point); + }, + bias_it, mm_result_it, out_it); } else { - execute_window_loop(collapsed_window, [&](const Coordinates &) - { - run_offset_contribution_output_stage_window(nullptr, nullptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, false, false, false, is_bounded_relu, is_fixed_point); - }, - mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &) + { + run_offset_contribution_output_stage_window( + nullptr, nullptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32, min_vec, + max_vec, a_offset, b_offset, k_offset, multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, false, false, false, is_bounded_relu, + is_fixed_point); + }, + mm_result_it, out_it); } return; } } -void run_offset_contribution_output_stage_symm(const Window &window, - const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, - int32_t a_offset, int32_t b_offset, int32_t k_offset, bool is_vector_sum_col_batched, - GEMMLowpOutputStageInfo output_stage, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point) +void run_offset_contribution_output_stage_symm(const Window &window, + const ITensor *mm_result, + const ITensor *vector_sum_col, + const ITensor *vector_sum_row, + const ITensor *bias, + ITensor *output, + int32_t a_offset, + int32_t b_offset, + int32_t k_offset, + bool is_vector_sum_col_batched, + GEMMLowpOutputStageInfo output_stage, + bool is_gemm3d, + bool is_bounded_relu, + bool is_fixed_point) { ARM_COMPUTE_UNUSED(vector_sum_row, b_offset, k_offset); @@ -690,8 +718,8 @@ void run_offset_contribution_output_stage_symm(const Window &window, const int32_t min_bound = output_stage.gemmlowp_min_bound; const int32_t max_bound = output_stage.gemmlowp_max_bound; - const int32_t *result_multipliers = output_stage.gemmlowp_multipliers.data(); - const int32_t *result_shifts = output_stage.gemmlowp_shifts.data(); + const int32_t *result_multipliers = output_stage.gemmlowp_multipliers.data(); + const int32_t *result_shifts = output_stage.gemmlowp_shifts.data(); const int32x4_t result_offset_s32 = vdupq_n_s32(offset); const int8x16_t min_s8 = vdupq_n_s8(static_cast(min_bound)); const int8x16_t max_s8 = vdupq_n_s8(static_cast(max_bound)); @@ -708,88 +736,105 @@ void run_offset_contribution_output_stage_symm(const Window &window, Iterator mm_result_it(mm_result, win); Iterator out_it(output, win); - if(a_offset != 0) + if (a_offset != 0) { ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col); Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col); // Offset in case vector_sum_col is batched in y dimension - const int vector_sum_col_stride_batch = is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0; + const int vector_sum_col_stride_batch = + is_vector_sum_col_batched ? 
vector_sum_col->info()->strides_in_bytes().y() : 0; - if(bias != nullptr) + if (bias != nullptr) { Iterator bias_it = get_bias_it(collapsed_window, bias); - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); - run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, - result_multipliers, result_shifts, - result_offset_s32, min_s8, max_s8, - a_offset, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, true, true, is_bounded_relu, is_fixed_point); - }, - vector_sum_col_it, bias_it, mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast( + vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); + run_offset_contribution_output_stage_window_symm( + vector_sum_col_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, + result_multipliers, result_shifts, result_offset_s32, min_s8, max_s8, a_offset, offset, + min_bound, max_bound, window_step_x, window_start_x, window_end_x, true, true, is_bounded_relu, + is_fixed_point); + }, + vector_sum_col_it, bias_it, mm_result_it, out_it); } else { - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); - run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, nullptr, mm_result_it, out_it, - result_multipliers, result_shifts, - result_offset_s32, min_s8, max_s8, - a_offset, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, true, false, is_bounded_relu, is_fixed_point); - }, - vector_sum_col_it, mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast( + vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); + run_offset_contribution_output_stage_window_symm( + vector_sum_col_ptr, nullptr, mm_result_it, out_it, result_multipliers, result_shifts, + result_offset_s32, min_s8, max_s8, a_offset, offset, min_bound, max_bound, window_step_x, + window_start_x, window_end_x, true, false, is_bounded_relu, is_fixed_point); + }, + vector_sum_col_it, mm_result_it, out_it); } } else { - if(bias != nullptr) + if (bias != nullptr) { Iterator bias_it = get_bias_it(collapsed_window, bias); - execute_window_loop(collapsed_window, [&](const Coordinates &) - { - run_offset_contribution_output_stage_window_symm(nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, - result_multipliers, result_shifts, - result_offset_s32, min_s8, max_s8, - a_offset, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, false, true, is_bounded_relu, is_fixed_point); - }, - bias_it, mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &) + { + run_offset_contribution_output_stage_window_symm( + nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, + result_multipliers, result_shifts, result_offset_s32, min_s8, max_s8, a_offset, offset, + min_bound, max_bound, window_step_x, window_start_x, window_end_x, false, true, is_bounded_relu, 
+ is_fixed_point); + }, + bias_it, mm_result_it, out_it); } else { - execute_window_loop(collapsed_window, [&](const Coordinates &) - { - run_offset_contribution_output_stage_window_symm(nullptr, nullptr, mm_result_it, out_it, - result_multipliers, result_shifts, - result_offset_s32, min_s8, max_s8, - a_offset, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, false, false, is_bounded_relu, is_fixed_point); - }, - mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &) + { + run_offset_contribution_output_stage_window_symm( + nullptr, nullptr, mm_result_it, out_it, result_multipliers, result_shifts, result_offset_s32, + min_s8, max_s8, a_offset, offset, min_bound, max_bound, window_step_x, window_start_x, + window_end_x, false, false, is_bounded_relu, is_fixed_point); + }, + mm_result_it, out_it); } return; } } -Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, - int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage) +Status validate_arguments(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *output, + int32_t a_offset, + int32_t b_offset, + GEMMLowpOutputStageInfo output_stage) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); - if(output->data_type() != DataType::QASYMM8) + if (output->data_type() != DataType::QASYMM8) { - ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) > 1 && output_stage.gemmlowp_multipliers.size() > 1 && b_offset != 0); + ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) > 1 && output_stage.gemmlowp_multipliers.size() > 1 && + b_offset != 0); } ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN && output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN && + output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); @@ -797,7 +842,7 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto } // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) + if (a_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); @@ -805,19 +850,21 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto } // If b_offset == 0, vector_sum_row can be a nullptr - if(b_offset != 0) + if (b_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = + mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != 
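// The reinterpret_as_3d test just above detects that the GEMM result has been reinterpreted
// as a 3-D tensor, i.e. its M dimension was split across dimensions 1 and 2; in that case the
// row-sum vector is checked against dimension(1) * dimension(2) and the batch index becomes 3
// instead of 2. A hypothetical standalone form of the same predicate, for reference:
#include <cstddef>

inline bool reinterpret_as_3d_ref(std::size_t mm_num_dimensions,
                                  std::size_t mm_dim_y,
                                  std::size_t vector_sum_row_dim_x)
{
    return mm_num_dimensions > 1 && mm_dim_y != vector_sum_row_dim_x;
}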
(mm_result->dimension(1) * mm_result->dimension(2))); + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != + (mm_result->dimension(1) * mm_result->dimension(2))); ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); TensorShape output_shape = output->tensor_shape(); - if(output_shape.num_dimensions() > 1) + if (output_shape.num_dimensions() > 1) { const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2; @@ -828,13 +875,15 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], "mm_result tensor must have the same number of batches of output tensor"); - if(a_offset != 0) + if (a_offset != 0) { TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); vector_sum_col_shape.collapse_from(1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && + vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of " + "vector_sum_row_shape or the number of batches must be set to 1"); } } @@ -842,7 +891,7 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_row->num_dimensions() > 3); } - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output); @@ -852,15 +901,21 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto } } // namespace -void CpuGemmLowpOffsetContributionOutputStageKernel::configure(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, - const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, - int32_t k, int32_t a_offset, int32_t b_offset, +void CpuGemmLowpOffsetContributionOutputStageKernel::configure(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + ITensorInfo *dst, + int32_t k, + int32_t a_offset, + int32_t b_offset, GEMMLowpOutputStageInfo output_stage) { ARM_COMPUTE_UNUSED(vector_sum_row, bias); // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage)); _a_offset = a_offset; _b_offset = b_offset; @@ -868,7 +923,7 @@ void CpuGemmLowpOffsetContributionOutputStageKernel::configure(const ITensorInfo _output_stage = output_stage; // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) + if (a_offset != 0) { // Check if vector_sum_col_shape should be slidden or not // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1 @@ -888,16 +943,24 @@ void CpuGemmLowpOffsetContributionOutputStageKernel::configure(const ITensorInfo ICpuKernel::configure(win); } -Status 
CpuGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, - const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, - int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage) +Status CpuGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *output, + int32_t a_offset, + int32_t b_offset, + GEMMLowpOutputStageInfo output_stage) { ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage)); return Status{}; } -void CpuGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +void CpuGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, + const Window &window, + const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); @@ -912,14 +975,14 @@ void CpuGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors PixelValue type_min{}; PixelValue type_max{}; std::tie(type_min, type_max) = get_min_max(dst->info()->data_type()); - int32_t type_min_int = type_min.get(); - int32_t type_max_int = type_max.get(); + int32_t type_min_int = type_min.get(); + int32_t type_max_int = type_max.get(); - const bool reinterpret_as_3d = vector_sum_row != nullptr - && mm_result->info()->num_dimensions() > 1 - && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); + const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->info()->num_dimensions() > 1 && + mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); - const bool is_bounded_relu = !(_output_stage.gemmlowp_min_bound <= type_min_int && _output_stage.gemmlowp_max_bound >= type_max_int); + const bool is_bounded_relu = + !(_output_stage.gemmlowp_min_bound <= type_min_int && _output_stage.gemmlowp_max_bound >= type_max_int); // Check if we need to perform fixed point requantization const bool is_fixed_point = _output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN; @@ -930,22 +993,25 @@ void CpuGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors // Check if symmetric per-channel execution const bool is_symm = _output_stage.is_quantized_per_channel; - if(is_symm) + if (is_symm) { - run_offset_contribution_output_stage_symm(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _is_vector_sum_col_batched, _output_stage, - reinterpret_as_3d, is_bounded_relu, is_fixed_point); + run_offset_contribution_output_stage_symm(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, + _a_offset, _b_offset, _k_offset, _is_vector_sum_col_batched, + _output_stage, reinterpret_as_3d, is_bounded_relu, is_fixed_point); } else { - if(is_signed) + if (is_signed) { - run_offset_contribution_output_stage(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _is_vector_sum_col_batched, _output_stage, - reinterpret_as_3d, is_bounded_relu, is_fixed_point); + run_offset_contribution_output_stage( + window, mm_result, vector_sum_col, 
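// run_op() above selects the template instantiation from three booleans: is_bounded_relu
// (clamp only if the requested bounds are tighter than what the destination type can hold),
// is_fixed_point (every stage type except the plain QUANTIZE_DOWN), and the signedness of the
// destination. A standalone sketch of the clamp decision, with std::numeric_limits standing in
// for the library's get_min_max():
#include <cstdint>
#include <limits>

template <typename T> // T: uint8_t for QASYMM8, int8_t for QASYMM8_SIGNED
inline bool needs_bounded_relu(int32_t requested_min, int32_t requested_max)
{
    const auto type_min = static_cast<int32_t>(std::numeric_limits<T>::lowest());
    const auto type_max = static_cast<int32_t>(std::numeric_limits<T>::max());
    // No explicit clamp is needed when the requested bounds already cover the full type range.
    return !(requested_min <= type_min && requested_max >= type_max);
}
// e.g. needs_bounded_relu<uint8_t>(0, 255) == false, needs_bounded_relu<int8_t>(-100, 100) == true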
vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, + _is_vector_sum_col_batched, _output_stage, reinterpret_as_3d, is_bounded_relu, is_fixed_point); } else { - run_offset_contribution_output_stage(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _is_vector_sum_col_batched, _output_stage, - reinterpret_as_3d, is_bounded_relu, is_fixed_point); + run_offset_contribution_output_stage( + window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, + _is_vector_sum_col_batched, _output_stage, reinterpret_as_3d, is_bounded_relu, is_fixed_point); } } } diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h index 3cb99faee8f0164281389cdf9110f5205dacccf4..af477d4756c73d55004bd8516ae9e66acf15d06d 100644 --- a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h +++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_OUTPUTSTAGE_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -85,7 +86,13 @@ public: * @param[in] b_offset Offset to be added to each element of the matrix B. * @param[in] output_stage GEMMLowp output stage info, providing the type of quantization and the necessary parameters. */ - void configure(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, int32_t k, int32_t a_offset, + void configure(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + ITensorInfo *dst, + int32_t k, + int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage); /** Static function to check if given info will lead to a valid configuration @@ -94,21 +101,26 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, int32_t a_offset, + static Status validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *dst, + int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: /** Function to use for the particular tensors passed to configure() */ - int32_t _a_offset{ 0 }; - int32_t _b_offset{ 0 }; - int32_t _k_offset{ 0 }; - bool _is_vector_sum_col_batched{ true }; - GEMMLowpOutputStageInfo _output_stage{ GEMMLowpOutputStageInfo() }; + int32_t _a_offset{0}; + int32_t _b_offset{0}; + int32_t _k_offset{0}; + bool _is_vector_sum_col_batched{true}; + GEMMLowpOutputStageInfo _output_stage{GEMMLowpOutputStageInfo()}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp index 3023d93113bef35b279987d15567f116c901a9c3..eefc29470059c8f411103de90d05b5500c7d17d2 100644 --- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp +++ 
b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp @@ -28,13 +28,14 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" + #include "src/core/AccessWindowStatic.h" -#include "src/core/NEON/wrapper/wrapper.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include @@ -46,26 +47,35 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) - || output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound); + ARM_COMPUTE_RETURN_ERROR_ON( + output_stage->gemmlowp_max_bound > + std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))); + ARM_COMPUTE_RETURN_ERROR_ON( + output_stage->gemmlowp_min_bound < + std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) || + output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound); // Check biases if exist - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - if(dst->data_type() != output_stage->output_data_type && (output_stage->output_data_type == DataType::QASYMM8 || output_stage->output_data_type == DataType::QASYMM8_SIGNED)) + if (dst->data_type() != output_stage->output_data_type && + (output_stage->output_data_type == DataType::QASYMM8 || + output_stage->output_data_type == DataType::QASYMM8_SIGNED)) { ARM_COMPUTE_RETURN_ERROR_MSG("Mismatching data types"); } @@ -92,24 +102,26 @@ inline void scale_input(int32x4x4_t &in_s32, int32x4_t result_offset_s32, int32_ } template -inline typename std::enable_if::value, - typename wrapper::traits::neon_vector::type>::type - convert_to_8bit(const int16x8x2_t in_s16) +inline + typename std::enable_if::value, typename wrapper::traits::neon_vector::type>::type + convert_to_8bit(const int16x8x2_t in_s16) { return wrapper::vcombine(wrapper::vqmovun(in_s16.val[0]), wrapper::vqmovun(in_s16.val[1])); } template -inline typename std::enable_if::value, - typename wrapper::traits::neon_vector::type>::type - convert_to_8bit(const int16x8x2_t in_s16) +inline typename std::enable_if::value, typename wrapper::traits::neon_vector::type>::type +convert_to_8bit(const int16x8x2_t in_s16) { return wrapper::vcombine(wrapper::vqmovn(in_s16.val[0]), wrapper::vqmovn(in_s16.val[1])); } template -inline typename 
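// For reference, the per-element computation this kernel performs matches the scalar leftover
// loops further down: add bias and offset, multiply by an integer multiplier, shift right,
// clamp, narrow. A sketch with illustrative parameter names, shown for an unsigned destination
// (the QASYMM8_SIGNED case only changes the final cast):
#include <algorithm>
#include <cstdint>

inline uint8_t quantize_down_scale_ref(int32_t acc,        // S32 GEMM result
                                       int32_t bias,       // 0 if no bias tensor is given
                                       int32_t offset,     // output_stage.gemmlowp_offset
                                       int32_t multiplier, // output_stage.gemmlowp_multiplier
                                       int32_t shift,      // output_stage.gemmlowp_shift
                                       int32_t clamp_min,
                                       int32_t clamp_max)
{
    int32_t v = ((acc + bias + offset) * multiplier) >> shift;
    v         = std::max(clamp_min, std::min(clamp_max, v));
    return static_cast<uint8_t>(v);
}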
wrapper::traits::neon_vector::type finalize_quantization(int32x4x4_t &in_s32, int32x4_t result_shift_s32, typename wrapper::traits::neon_vector::type min, - typename wrapper::traits::neon_vector::type max) +inline typename wrapper::traits::neon_vector::type +finalize_quantization(int32x4x4_t &in_s32, + int32x4_t result_shift_s32, + typename wrapper::traits::neon_vector::type min, + typename wrapper::traits::neon_vector::type max) { // Shift final result (negative value shift right) in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32); @@ -118,13 +130,8 @@ inline typename wrapper::traits::neon_vector::type finalize_quantization( in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32); // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; + const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}}; // Convert S16 to S8 or U8 typename wrapper::traits::neon_vector::type out = convert_to_8bit(in_s16); @@ -137,7 +144,10 @@ inline typename wrapper::traits::neon_vector::type finalize_quantization( } // namespace template -void CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window) +void CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal(const ITensor *src, + const ITensor *bias, + ITensor *dst, + const Window &window) { using VectorType = typename wrapper::traits::neon_vector::type; @@ -159,107 +169,105 @@ void CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal(const ITensor *src, c Iterator in(src, win); Iterator out(dst, win); - if(bias != nullptr) + if (bias != nullptr) { Window win_biases; win_biases.set(Window::DimX, Window::Dimension(0, 1, 1)); win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); Iterator bias_i(bias, win_biases); - execute_window_loop(win, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - int32x4x4_t in_s32 = + // Compute 16 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 12) - } - }; - - const int32x4x4_t bias_s32 = + int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 12)}}; + + const int32x4x4_t bias_s32 = { + {vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 0), + vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 4), + vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 8), + vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 12)}}; + + // Add the bias to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); + + // Add the offset terms to GEMM's result and multiply by result_mult_int + scale_input(in_s32, result_offset_s32, 
_output_stage->gemmlowp_multiplier); + + wrapper::vstore(reinterpret_cast(out.ptr() + x), + finalize_quantization(in_s32, result_shift_s32, min, max)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) { - { - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 12) - } - }; - - // Add the bias to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); - in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); - in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); - - // Add the offset terms to GEMM's result and multiply by result_mult_int - scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier); - - wrapper::vstore(reinterpret_cast(out.ptr() + x), finalize_quantization(in_s32, result_shift_s32, min, max)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int bias_value = *(reinterpret_cast(bias_i.ptr()) + x); - int in_value = *(reinterpret_cast(in.ptr()) + x); - - // Quantize - in_value = ((in_value + bias_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >> _output_stage->gemmlowp_shift; - - // Store the result - *(out.ptr() + x) = static_cast(utility::clamp(in_value, clamp_min, clamp_max)); - } - }, - in, bias_i, out); + const int bias_value = *(reinterpret_cast(bias_i.ptr()) + x); + int in_value = *(reinterpret_cast(in.ptr()) + x); + + // Quantize + in_value = ((in_value + bias_value + _output_stage->gemmlowp_offset) * + _output_stage->gemmlowp_multiplier) >> + _output_stage->gemmlowp_shift; + + // Store the result + *(out.ptr() + x) = static_cast(utility::clamp(in_value, clamp_min, clamp_max)); + } + }, + in, bias_i, out); } else { - execute_window_loop(win, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - int32x4x4_t in_s32 = + // Compute 16 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 12) - } - }; - - // Add the offset terms to GEMM's result and multiply by result_mult_int - scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier); - - wrapper::vstore(reinterpret_cast(out.ptr() + x), finalize_quantization(in_s32, result_shift_s32, min, max)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int in_value = *(reinterpret_cast(in.ptr()) + x); + int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 12)}}; + + // Add the offset terms to GEMM's result and multiply by result_mult_int + scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier); + + wrapper::vstore(reinterpret_cast(out.ptr() + x), + finalize_quantization(in_s32, result_shift_s32, min, max)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int in_value = *(reinterpret_cast(in.ptr()) + x); - // Quantize - in_value = 
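// The loops above follow the library's usual shape: 16 S32 values per iteration as four
// int32x4_t registers, then a scalar tail for whatever is left of the row. A reduced,
// self-contained sketch of that pattern (NEON only, one register pair, bias omitted;
// an illustrative function, not the kernel itself):
#if defined(__ARM_NEON)
#include <arm_neon.h>

#include <algorithm>
#include <cstdint>

void quantize_down_row_sketch(const int32_t *in, uint8_t *out, int len,
                              int32_t offset, int32_t multiplier, int32_t shift,
                              int32_t clamp_min, int32_t clamp_max)
{
    const int32x4_t offset_v = vdupq_n_s32(offset);
    const int32x4_t shift_v  = vdupq_n_s32(-shift); // vshlq_s32 with a negative amount shifts right
    const int16x8_t min_s16  = vdupq_n_s16(static_cast<int16_t>(clamp_min));
    const int16x8_t max_s16  = vdupq_n_s16(static_cast<int16_t>(clamp_max));

    int x = 0;
    for (; x <= len - 8; x += 8) // vectorized body
    {
        int32x4_t v0 = vaddq_s32(vld1q_s32(in + x + 0), offset_v);
        int32x4_t v1 = vaddq_s32(vld1q_s32(in + x + 4), offset_v);
        v0           = vshlq_s32(vmulq_n_s32(v0, multiplier), shift_v);
        v1           = vshlq_s32(vmulq_n_s32(v1, multiplier), shift_v);

        // Narrow S32 -> S16 with saturation, clamp, then narrow to U8 and store 8 results.
        int16x8_t s16 = vcombine_s16(vqmovn_s32(v0), vqmovn_s32(v1));
        s16           = vminq_s16(vmaxq_s16(s16, min_s16), max_s16);
        vst1_u8(out + x, vqmovun_s16(s16));
    }

    for (; x < len; ++x) // scalar leftovers, same arithmetic
    {
        const int32_t v = ((in[x] + offset) * multiplier) >> shift;
        out[x]          = static_cast<uint8_t>(std::max(clamp_min, std::min(clamp_max, v)));
    }
}
#endif // __ARM_NEON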
((in_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >> _output_stage->gemmlowp_shift; + // Quantize + in_value = ((in_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >> + _output_stage->gemmlowp_shift; - // Store the result - *(out.ptr() + x) = static_cast(utility::clamp(in_value, clamp_min, clamp_max)); - } - }, - in, out); + // Store the result + *(out.ptr() + x) = static_cast(utility::clamp(in_value, clamp_min, clamp_max)); + } + }, + in, out); } } -void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) +void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage) { ARM_COMPUTE_UNUSED(bias); // Perform validate step @@ -268,10 +276,7 @@ void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src, ITenso // Output auto inizialitation if not yet initialized auto_init_if_empty(*dst, src->clone()->set_data_type(output_stage->output_data_type)); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, - bias, - dst, - output_stage)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, output_stage)); _output_stage = output_stage; @@ -281,14 +286,17 @@ void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src, ITenso ICpuKernel::configure(win); // Check if we need to clamp the result using min and max - _is_bounded_relu = ((_output_stage->gemmlowp_min_bound != _output_stage->gemmlowp_max_bound) - && !(_output_stage->gemmlowp_min_bound == std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) - && _output_stage->gemmlowp_max_bound == std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)))); - if(_output_stage->output_data_type == DataType::QASYMM8) + _is_bounded_relu = + ((_output_stage->gemmlowp_min_bound != _output_stage->gemmlowp_max_bound) && + !(_output_stage->gemmlowp_min_bound == + std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) && + _output_stage->gemmlowp_max_bound == + std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)))); + if (_output_stage->output_data_type == DataType::QASYMM8) { _func = &CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal; } - else if(_output_stage->output_data_type == DataType::QASYMM8_SIGNED) + else if (_output_stage->output_data_type == DataType::QASYMM8_SIGNED) { _func = &CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal; } @@ -298,7 +306,10 @@ void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src, ITenso } } -Status CpuGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) +Status CpuGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, output_stage)); return Status{}; @@ -323,4 +334,4 @@ const char *CpuGemmLowpQuantizeDownInt32ScaleKernel::name() const } } // namespace kernels } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h 
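// configure() above decides _is_bounded_relu once so the per-element code can skip the clamp;
// the rule here is slightly different from the fixed-point kernels: the bounds are honoured
// only when min != max and they do not already span the whole output type. Equivalent
// standalone check, with std::numeric_limits standing in for
// get_min_max_values_from_quantized_data_type():
#include <cstdint>
#include <limits>

template <typename T> // T = uint8_t for QASYMM8, int8_t for QASYMM8_SIGNED
inline bool scale_stage_is_bounded_relu(int32_t min_bound, int32_t max_bound)
{
    const bool has_range  = (min_bound != max_bound);
    const bool full_range = (min_bound == static_cast<int32_t>(std::numeric_limits<T>::lowest())) &&
                            (max_bound == static_cast<int32_t>(std::numeric_limits<T>::max()));
    return has_range && !full_range;
}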
b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h index c7813edcd7869b8af71accc7ea79ed6aaec92139..33e296b251773a95f4863693d82e6bcf5b7e3de5 100644 --- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -71,10 +72,13 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage); + static Status validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: @@ -95,11 +99,14 @@ private: * @param[out] dst Output tensor info * @param[in] window Region on which to execute the kernel. */ - using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ScaleKernel::*)(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); + using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ScaleKernel::*)(const ITensor *src, + const ITensor *bias, + ITensor *dst, + const Window &window); - QuantizeDownFunctionPtr _func{ nullptr }; - const GEMMLowpOutputStageInfo *_output_stage{ nullptr }; - bool _is_bounded_relu{ false }; + QuantizeDownFunctionPtr _func{nullptr}; + const GEMMLowpOutputStageInfo *_output_stage{nullptr}; + bool _is_bounded_relu{false}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp index 53ca991889393397f4e24f2f6e15e7548a1f6b63..a5c09c9977478674ca3f1143ee0084d987dec72e 100644 --- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp @@ -29,12 +29,13 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/NESymm.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NESymm.h" #include @@ -53,14 +54,14 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ARM_COMPUTE_RETURN_ERROR_ON(min > max); // Check biases if exist - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QSYMM16); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src); @@ -71,7 +72,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const } // namespace template -void 
CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window) +void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal(const ITensor *src, + const ITensor *bias, + ITensor *dst, + const Window &window) { const int16x8_t min_s16 = vdupq_n_s16(static_cast(_min)); const int16x8_t max_s16 = vdupq_n_s16(static_cast(_max)); @@ -88,92 +92,92 @@ void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal(co Iterator in(src, win_collapsed); Iterator out(dst, win_collapsed); - if(bias != nullptr) + if (bias != nullptr) { Window win_biases; win_biases.set(Window::DimX, Window::Dimension(0, 1, 1)); win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); Iterator bias_i(bias, win_biases); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - int32x4x2_t in_s32 = + // Compute 16 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 4) - } - }; + int32x4x2_t in_s32 = {{vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 4)}}; - const int32x4x2_t bias_s32 = - { - { - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 4) - } - }; + const int32x4x2_t bias_s32 = {{vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 0), + vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 4)}}; - // Add the bias to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); + // Add the bias to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); - vst1q_s16(reinterpret_cast(out.ptr()) + x, finalize_quantization_int16(in_s32, _result_fixedpoint_multiplier, _result_shift, min_s16, max_s16)); - } + vst1q_s16(reinterpret_cast(out.ptr()) + x, + finalize_quantization_int16(in_s32, _result_fixedpoint_multiplier, + _result_shift, min_s16, max_s16)); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int32_t bias_value = *(reinterpret_cast(bias_i.ptr()) + x); - int32_t in_value = *(reinterpret_cast(in.ptr()) + x); - - // Add bias - in_value += bias_value; - // Finalize and store the result - *(reinterpret_cast(out.ptr()) + x) = finalize_quantization_int16(in_value, _result_fixedpoint_multiplier, _result_shift, static_cast(_min), - static_cast(_max)); - } - }, - in, out, bias_i); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const int32_t bias_value = *(reinterpret_cast(bias_i.ptr()) + x); + int32_t in_value = *(reinterpret_cast(in.ptr()) + x); + + // Add bias + in_value += bias_value; + // Finalize and store the result + *(reinterpret_cast(out.ptr()) + x) = finalize_quantization_int16( + in_value, _result_fixedpoint_multiplier, _result_shift, static_cast(_min), + static_cast(_max)); + } + }, + in, out, bias_i); } else { - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + 
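// finalize_quantization_int16() applies the usual gemmlowp fixed-point recipe: a saturating
// rounding doubling high multiply by the Q0.31 multiplier, a rounding divide by 2^shift, then
// saturation to the int16 range. A scalar reference of that recipe (a sketch written from the
// gemmlowp conventions, not the library's NESymm implementation; result_shift >= 0 assumed):
#include <algorithm>
#include <cstdint>
#include <limits>

inline int32_t sat_rounding_doubling_high_mul(int32_t a, int32_t b)
{
    if (a == std::numeric_limits<int32_t>::min() && b == std::numeric_limits<int32_t>::min())
    {
        return std::numeric_limits<int32_t>::max(); // the one overflowing combination
    }
    const int64_t ab    = static_cast<int64_t>(a) * b;
    const int64_t nudge = ab >= 0 ? (1ll << 30) : (1 - (1ll << 30));
    return static_cast<int32_t>((ab + nudge) / (1ll << 31));
}

inline int32_t rounding_divide_by_pow2(int32_t x, int exponent)
{
    const int32_t mask      = (1 << exponent) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
    return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
}

inline int16_t quantize_down_int16_ref(int32_t acc, int32_t fixedpoint_multiplier, int result_shift)
{
    const int32_t v = rounding_divide_by_pow2(sat_rounding_doubling_high_mul(acc, fixedpoint_multiplier),
                                              result_shift);
    return static_cast<int16_t>(std::max<int32_t>(-32768, std::min<int32_t>(32767, v)));
}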
[&](const Coordinates &) { - int32x4x2_t in_s32 = + // Compute 16 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 4) - } - }; + int32x4x2_t in_s32 = {{vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 4)}}; - vst1q_s16(reinterpret_cast(out.ptr()) + x, finalize_quantization_int16(in_s32, _result_fixedpoint_multiplier, _result_shift, min_s16, max_s16)); - } + vst1q_s16(reinterpret_cast(out.ptr()) + x, + finalize_quantization_int16(in_s32, _result_fixedpoint_multiplier, + _result_shift, min_s16, max_s16)); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int32_t in_value = *(reinterpret_cast(in.ptr()) + x); - ARM_COMPUTE_UNUSED(in_value); - // Finalize and store the result - *(reinterpret_cast(out.ptr()) + x) = finalize_quantization_int16(in_value, _result_fixedpoint_multiplier, _result_shift, static_cast(_min), - static_cast(_max)); - } - }, - in, out); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const int32_t in_value = *(reinterpret_cast(in.ptr()) + x); + ARM_COMPUTE_UNUSED(in_value); + // Finalize and store the result + *(reinterpret_cast(out.ptr()) + x) = finalize_quantization_int16( + in_value, _result_fixedpoint_multiplier, _result_shift, static_cast(_min), + static_cast(_max)); + } + }, + in, out); } } -void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, - int min, int max) +void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + int result_fixedpoint_multiplier, + int result_shift, + int min, + int max) { // Perform validate step ARM_COMPUTE_UNUSED(bias, dst); @@ -193,18 +197,21 @@ void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure(ITens // Check if we need to clamp the result using min and max const bool is_bounded_relu = !(min <= -32768 && max >= 32767); - _func = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal : - &CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal; + _func = is_bounded_relu ? 
&CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal + : &CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal; } -Status CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) +Status CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate( + const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max)); return Status{}; } -void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_op(ITensorPack &tensors, + const Window &window, + const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h index 681d0996958798f4613d91bcf704e40dd7844963..925788b68069cfbb088cada377285b0e522c967b 100644 --- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT16_SCALEBYFIXEDPOINT_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -48,7 +49,8 @@ namespace kernels * -# Clamp the resulting int32 values to the [-32768, 32767] range and cast to QSYMM16. * */ -class CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel : public ICpuKernel +class CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel + : public ICpuKernel { public: CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel() = default; @@ -65,17 +67,24 @@ public: * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16. * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0. */ - void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, int min = 0, int max = 0); + void configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + int result_fixedpoint_multiplier, + int result_shift, + int min = 0, + int max = 0); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0); + static Status + validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: @@ -97,13 +106,13 @@ private: * @param[in] window Region on which to execute the kernel. 
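// The pattern shared by these kernels: configure() binds a member-function pointer (the
// QuantizeDownFunctionPtr declared below) to one template instantiation of the processing
// loop, so run_op() never branches on the clamp per element. A stripped-down sketch of that
// dispatch (illustrative class, not the library's):
class QuantizeDownDispatchSketch
{
public:
    void configure(bool is_bounded_relu)
    {
        _func = is_bounded_relu ? &QuantizeDownDispatchSketch::run_internal<true>
                                : &QuantizeDownDispatchSketch::run_internal<false>;
    }

    void run() { (this->*_func)(); }

private:
    template <bool is_bounded_relu>
    void run_internal()
    {
        // The real kernels instantiate their whole vector/scalar loop here; the compile-time
        // flag lets the compiler drop the min/max clamp in the unbounded variant.
        _last_was_bounded = is_bounded_relu;
    }

    using FuncPtr = void (QuantizeDownDispatchSketch::*)();
    FuncPtr _func{nullptr};
    bool    _last_was_bounded{false};
};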
*/ using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::*)( - const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); + const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); - QuantizeDownFunctionPtr _func{ nullptr }; - int _result_fixedpoint_multiplier{ 0 }; - int _result_shift{ 0 }; - int _min{ 0 }; - int _max{ 0 }; + QuantizeDownFunctionPtr _func{nullptr}; + int _result_fixedpoint_multiplier{0}; + int _result_shift{0}; + int _min{0}; + int _max{0}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp index 27214dcb5a688f8c64f213cea0773725de905196..0e5809707328cef0d5ae7e18291478b6f6119e1c 100644 --- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp @@ -29,12 +29,13 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/NEAsymm.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" #include @@ -53,14 +54,14 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ARM_COMPUTE_RETURN_ERROR_ON(min > max); // Check biases if exist - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src); @@ -71,7 +72,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const } // namespace template -void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window) +void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal(const ITensor *src, + const ITensor *bias, + ITensor *dst, + const Window &window) { const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(_result_offset_after_shift); const int8x16_t min_s8 = vdupq_n_s8(static_cast(_min)); @@ -88,102 +92,102 @@ void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal(con Iterator in(src, win_collapsed); Iterator out(dst, win_collapsed); - if(bias != nullptr) + if (bias != nullptr) { Window win_biases; win_biases.set(Window::DimX, Window::Dimension(0, 1, 1)); win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); Iterator bias_i(bias, win_biases); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - int32x4x4_t in_s32 = + // Compute 16 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - 
vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 12) - } - }; - - const int32x4x4_t bias_s32 = + int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 12)}}; + + const int32x4x4_t bias_s32 = { + {vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 0), + vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 4), + vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 8), + vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 12)}}; + + // Add the bias to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); + + vst1q_s8(reinterpret_cast(out.ptr() + x), + finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, + result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) { - { - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 12) - } - }; - - // Add the bias to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); - in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); - in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); - - vst1q_s8(reinterpret_cast(out.ptr() + x), - finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int32_t bias_value = *(reinterpret_cast(bias_i.ptr()) + x); - int32_t in_value = *(reinterpret_cast(in.ptr()) + x); - - // Add bias - in_value += bias_value; - // Finalize and store the result - *reinterpret_cast(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, - static_cast(_min), static_cast(_max), is_bounded_relu); - } - }, - in, out, bias_i); + const int32_t bias_value = *(reinterpret_cast(bias_i.ptr()) + x); + int32_t in_value = *(reinterpret_cast(in.ptr()) + x); + + // Add bias + in_value += bias_value; + // Finalize and store the result + *reinterpret_cast(out.ptr() + x) = finalize_quantization( + in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, + static_cast(_min), static_cast(_max), is_bounded_relu); + } + }, + in, out, bias_i); } else { - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - int32x4x4_t in_s32 = + // Compute 16 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 12) - } - }; - - 
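// The scalar tail above for QASYMM8_SIGNED follows the same recipe as the QSYMM16 kernel, with
// one extra step: result_offset_after_shift is added before the clamp, and saturation is to the
// int8 range (tightened further to [min, max] when bounded ReLU is active). A scalar sketch,
// assuming the sat_rounding_doubling_high_mul() / rounding_divide_by_pow2() helpers sketched
// for the QSYMM16 kernel earlier:
#include <algorithm>
#include <cstdint>

int32_t sat_rounding_doubling_high_mul(int32_t a, int32_t b); // defined in the earlier sketch
int32_t rounding_divide_by_pow2(int32_t x, int exponent);     // defined in the earlier sketch

inline int8_t quantize_down_int8_ref(int32_t acc, int32_t bias, int32_t fixedpoint_multiplier,
                                     int result_shift, int32_t result_offset_after_shift,
                                     int32_t min_bound, int32_t max_bound, bool is_bounded_relu)
{
    int32_t v = rounding_divide_by_pow2(sat_rounding_doubling_high_mul(acc + bias, fixedpoint_multiplier),
                                        result_shift);
    v += result_offset_after_shift;
    v = std::max<int32_t>(-128, std::min<int32_t>(127, v));
    if (is_bounded_relu)
    {
        v = std::max(min_bound, std::min(max_bound, v));
    }
    return static_cast<int8_t>(v);
}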
vst1q_s8(reinterpret_cast(out.ptr() + x), - finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int32_t in_value = *(reinterpret_cast(in.ptr()) + x); - - // Finalize and store the result - *reinterpret_cast(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, - static_cast(_min), static_cast(_max), is_bounded_relu); - } - }, - in, out); + int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 12)}}; + + vst1q_s8(reinterpret_cast(out.ptr() + x), + finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, + result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const int32_t in_value = *(reinterpret_cast(in.ptr()) + x); + + // Finalize and store the result + *reinterpret_cast(out.ptr() + x) = finalize_quantization( + in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, + static_cast(_min), static_cast(_max), is_bounded_relu); + } + }, + in, out); } } -void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, - int result_offset_after_shift, int min, int max) +void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift, + int min, + int max) { ARM_COMPUTE_UNUSED(bias); // Perform validate step @@ -205,18 +209,21 @@ void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure(ITenso // Check if we need to clamp the result using min and max const bool is_bounded_relu = !(min <= -128 && max >= 127); - _func = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal : - &CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal; + _func = is_bounded_relu ? 
&CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal + : &CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal; } -Status CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) +Status CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate( + const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, min, max)); return Status{}; } -void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_op(ITensorPack &tensors, + const Window &window, + const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h index 3e615b935e2d42240e7c70b73c6b05d317668c23..6a67ba4f1998a8fb31e51792a4d732b7c32a2bd6 100644 --- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT8_SCALEBYFIXEDPOINT_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -49,7 +50,8 @@ namespace kernels * -# Clamp the resulting int32 values to the [-128..127] range and cast to QASYMM8_SIGNED. * */ -class CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel : public ICpuKernel +class CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel + : public ICpuKernel { public: CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel() = default; @@ -67,17 +69,25 @@ public: * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED, * Along with @p min, this value can be used to implement "rectified linear unit" activation functions */ - void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0); + void configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift, + int min = 0, + int max = 0); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0); + static Status + validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: @@ -99,14 +109,14 @@ private: * @param[in] window Region on which to execute the kernel. 
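// A minimal configuration sketch for the kernel declared above. Shapes, the multiplier and the
// offsets are made-up example values; the TensorInfo/Status usage mirrors how the library's
// functions drive these kernels, but this is an illustration rather than documented API
// guidance:
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h"

void configure_int8_output_stage_sketch()
{
    using namespace arm_compute;

    TensorInfo src(TensorShape(128U, 32U), 1, DataType::S32);
    TensorInfo bias(TensorShape(128U), 1, DataType::S32);
    TensorInfo dst(TensorShape(128U, 32U), 1, DataType::QASYMM8_SIGNED);

    cpu::kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel kernel;

    const Status status = cpu::kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(
        &src, &bias, &dst, /* min */ -128, /* max */ 127);
    if (status.error_code() == ErrorCode::OK)
    {
        // Example stage parameters: a multiplier of 1 << 30 scales by ~0.5, result_shift divides
        // by 2^shift, and result_offset_after_shift plays the role of the destination zero point.
        kernel.configure(&src, &bias, &dst, 1 << 30, /* result_shift */ 1,
                         /* result_offset_after_shift */ 10, /* min */ -128, /* max */ 127);
    }
}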
*/ using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::*)( - const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); + const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); - QuantizeDownFunctionPtr _func{ nullptr }; - int _result_fixedpoint_multiplier{ 0 }; - int _result_shift{ 0 }; - int _result_offset_after_shift{ 0 }; - int _min{ 0 }; - int _max{ 0 }; + QuantizeDownFunctionPtr _func{nullptr}; + int _result_fixedpoint_multiplier{0}; + int _result_shift{0}; + int _result_offset_after_shift{0}; + int _min{0}; + int _max{0}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp index e49fd2911570472ffc21113a30feab15e3134ec8..e3dd2240ca69fe74047c63cdb9def7a493103b99 100644 --- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp @@ -29,12 +29,13 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/NEAsymm.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" #include @@ -53,14 +54,14 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ARM_COMPUTE_RETURN_ERROR_ON(min > max); // Check biases if exist - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src); @@ -71,7 +72,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const } // namespace template -void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window) +void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal(const ITensor *src, + const ITensor *bias, + ITensor *dst, + const Window &window) { const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(_result_offset_after_shift); const uint8x16_t min_u8 = vdupq_n_u8(static_cast(_min)); @@ -89,98 +93,102 @@ void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal(co Iterator in(src, win_collapsed); Iterator out(dst, win_collapsed); - if(bias != nullptr) + if (bias != nullptr) { Window win_biases; win_biases.set(Window::DimX, Window::Dimension(0, 1, 1)); win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); Iterator bias_i(bias, win_biases); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - int32x4x4_t in_s32 = + // Compute 16 elements per iteration + int x = window_start_x; + for (; x <= 
(window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 12) - } - }; - - const int32x4x4_t bias_s32 = + int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 12)}}; + + const int32x4x4_t bias_s32 = { + {vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 0), + vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 4), + vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 8), + vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 12)}}; + + // Add the bias to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); + + vst1q_u8(out.ptr() + x, + finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, + result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) { - { - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 12) - } - }; - - // Add the bias to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); - in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); - in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); - - vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int32_t bias_value = *(reinterpret_cast(bias_i.ptr()) + x); - int32_t in_value = *(reinterpret_cast(in.ptr()) + x); - - // Add bias - in_value += bias_value; - // Finalize and store the result - *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast(_min), static_cast(_max), is_bounded_relu); - } - }, - in, out, bias_i); + const int32_t bias_value = *(reinterpret_cast(bias_i.ptr()) + x); + int32_t in_value = *(reinterpret_cast(in.ptr()) + x); + + // Add bias + in_value += bias_value; + // Finalize and store the result + *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, + _result_offset_after_shift, static_cast(_min), + static_cast(_max), is_bounded_relu); + } + }, + in, out, bias_i); } else { - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - int32x4x4_t in_s32 = + // Compute 16 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 12) - } - }; - - vst1q_u8(out.ptr() + x, 
finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int32_t in_value = *(reinterpret_cast(in.ptr()) + x); - - // Finalize and store the result - *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast(_min), static_cast(_max), is_bounded_relu); - } - }, - in, out); + int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 12)}}; + + vst1q_u8(out.ptr() + x, + finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, + result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const int32_t in_value = *(reinterpret_cast(in.ptr()) + x); + + // Finalize and store the result + *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, + _result_offset_after_shift, static_cast(_min), + static_cast(_max), is_bounded_relu); + } + }, + in, out); } } -void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, - int result_offset_after_shift, int min, int max) +void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift, + int min, + int max) { ARM_COMPUTE_UNUSED(bias); // Perform validate step @@ -202,18 +210,21 @@ void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(ITens // Check if we need to clamp the result using min and max const bool is_bounded_relu = !(min <= 0 && max >= 255); - _func = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal : - &CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal; + _func = is_bounded_relu ? 
&CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal + : &CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal; } -Status CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) +Status CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate( + const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, min, max)); return Status{}; } -void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_op(ITensorPack &tensors, + const Window &window, + const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); @@ -233,4 +244,4 @@ const char *CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::name() c } } // namespace kernels } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h index b773fdfdcfe17d61c2aedc8d97376c90dd15b11a..45bd742a70f319eac05ec0a868c716d05e6c9b70 100644 --- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOUINT8_SCALEBYFIXEDPOINT_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -49,7 +50,8 @@ namespace kernels * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. 
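/*
 * A minimal scalar sketch of the quantize-down step that the kernel above vectorises
 * with NEON (add bias, fixed-point multiply, rounding shift, add offset, clamp,
 * narrow to uint8). The int64 rounding below is only an approximation of the
 * saturating doubling-high-multiply / rounding-shift pair used by the library;
 * the function name and the handling of non-positive shifts are illustrative,
 * not the library's implementation.
 */
#include <algorithm>
#include <cstdint>

static uint8_t quantize_down_scalar(int32_t acc, int32_t bias, int32_t multiplier,
                                    int32_t shift, int32_t offset_after_shift,
                                    int32_t min, int32_t max)
{
    int64_t v = static_cast<int64_t>(acc) + bias;                // add the (optional) bias column
    v = (v * multiplier + (int64_t{1} << 30)) >> 31;             // fixed-point multiply (approx. doubling high mul)
    if (shift > 0)
    {
        v = (v + (int64_t{1} << (shift - 1))) >> shift;          // rounding right shift; shift <= 0 left as-is here
    }
    v += offset_after_shift;                                     // re-centre on the QASYMM8 zero point
    v = std::clamp<int64_t>(v, min, max);                        // bounded ReLU when (min, max) != (0, 255)
    return static_cast<uint8_t>(std::clamp<int64_t>(v, 0, 255)); // saturate and narrow to uint8
}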
* */ -class CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel : public ICpuKernel +class CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel + : public ICpuKernel { public: CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel() = default; @@ -67,17 +69,25 @@ public: * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, * Along with @p min, this value can be used to implement "rectified linear unit" activation functions */ - void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0); + void configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift, + int min = 0, + int max = 0); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0); + static Status + validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: @@ -93,14 +103,14 @@ private: * @param[in] window Region on which to execute the kernel. */ using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::*)( - const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); + const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); - QuantizeDownFunctionPtr _func{ nullptr }; - int _result_fixedpoint_multiplier{ 0 }; - int _result_shift{ 0 }; - int _result_offset_after_shift{ 0 }; - int _min{ 0 }; - int _max{ 0 }; + QuantizeDownFunctionPtr _func{nullptr}; + int _result_fixedpoint_multiplier{0}; + int _result_shift{0}; + int _result_offset_after_shift{0}; + int _min{0}; + int _max{0}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp index 6399ebbef4400d51cdb8861f545e7f777174b763..fb1b70b91fd725315fbbb321828b4246610fc768 100644 --- a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp +++ b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp @@ -26,11 +26,12 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEFixedPoint.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEFixedPoint.h" #include "src/cpu/kernels/gemm_matrix_add/list.h" namespace arm_compute { @@ -40,24 +41,12 @@ namespace kernels { namespace { -static const std::vector available_kernels = -{ - { - "neon_fp32_gemm_matrix_add", - [](const DataTypeISASelectorData & data) - { - return (data.dt == DataType::F32); - }, - REGISTER_FP32_NEON(neon_fp32_gemm_matrix_add) - }, - { - "neon_fp16_gemm_matrix_add", - [](const DataTypeISASelectorData & data) - { - return (data.dt == DataType::F16) && 
data.isa.fp16; - }, - REGISTER_FP16_NEON(neon_fp16_gemm_matrix_add) - }, +static const std::vector available_kernels = { + {"neon_fp32_gemm_matrix_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(neon_fp32_gemm_matrix_add)}, + {"neon_fp16_gemm_matrix_add", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(neon_fp16_gemm_matrix_add)}, }; } // namespace @@ -71,7 +60,8 @@ void CpuGemmMatrixAdditionKernel::configure(const ITensorInfo *src, ITensorInfo ARM_COMPUTE_ERROR_THROW_ON(CpuGemmMatrixAdditionKernel::validate(src, dst, beta)); _beta = beta; - const auto uk = CpuGemmMatrixAdditionKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() }); + const auto uk = CpuGemmMatrixAdditionKernel::get_implementation( + DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); _func = uk->ukernel; // Configure kernel window @@ -87,7 +77,7 @@ Status CpuGemmMatrixAdditionKernel::validate(const ITensorInfo *src, const ITens ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); @@ -105,7 +95,7 @@ void CpuGemmMatrixAdditionKernel::run_op(ITensorPack &tensors, const Window &win const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - if(_beta != 0.0f) + if (_beta != 0.0f) { (*_func)(src, dst, window, _beta); } @@ -116,7 +106,8 @@ const char *CpuGemmMatrixAdditionKernel::name() const return "CpuGemmMatrixAdditionKernel"; } -const std::vector &CpuGemmMatrixAdditionKernel::get_available_kernels() +const std::vector & +CpuGemmMatrixAdditionKernel::get_available_kernels() { return available_kernels; } diff --git a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h index cbc5b530871bb00ab3d8caa778be1a6a9ee34043..5e12f1dcbd20b07ca6e12d1c6872b57dbef3f042 100644 --- a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h +++ b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h @@ -75,7 +75,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; static const std::vector &get_available_kernels(); @@ -89,8 +89,8 @@ private: * @param[in] beta Weight of matrix C */ /** Matrix addition function to use for the particular tensor types passed to configure() */ - GemmMatrixAddKernelPtr _func{ nullptr }; - float _beta{ 0.f }; + GemmMatrixAddKernelPtr _func{nullptr}; + float _beta{0.f}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp index 03b372efd45075c1bc4a8a641198cf0fa08b17b8..beccd94844f6669e6033b54f829dcbe2f8c91898 100644 --- a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp +++ b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp @@ -26,10 +26,11 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include 
"arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CPP/Validate.h" +#include "arm_compute/core/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/gemm_matrix_mul/list.h" @@ -42,27 +43,20 @@ namespace kernels { namespace { -static const std::vector available_kernels = -{ - { - "neon_fp32_gemm_matrix_mul", - [](const DataTypeISASelectorData & data) - { - return (data.dt == DataType::F32); - }, - REGISTER_FP32_NEON(neon_fp32_gemm_matrix_mul) - }, - { - "neon_fp16_gemm_matrix_mul", - [](const DataTypeISASelectorData & data) - { - return (data.dt == DataType::F16) && data.isa.fp16; - }, - REGISTER_FP16_NEON(neon_fp16_gemm_matrix_mul) - }, +static const std::vector available_kernels = { + {"neon_fp32_gemm_matrix_mul", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(neon_fp32_gemm_matrix_mul)}, + {"neon_fp16_gemm_matrix_mul", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(neon_fp16_gemm_matrix_mul)}, }; -inline Status validate_arguments(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info) +inline Status validate_arguments(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + float alpha, + bool is_interleaved, + const GEMMReshapeInfo &reshape_info) { ARM_COMPUTE_UNUSED(alpha); @@ -70,11 +64,11 @@ inline Status validate_arguments(const ITensorInfo *lhs, const ITensorInfo *rhs, ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs, dst); - if(!is_interleaved) + if (!is_interleaved) { ARM_COMPUTE_RETURN_ERROR_ON(lhs->dimension(0) != rhs->dimension(1)); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(rhs->dimension(0) != dst->dimension(0)); ARM_COMPUTE_RETURN_ERROR_ON(lhs->dimension(1) != dst->dimension(1)); @@ -90,28 +84,31 @@ inline Status validate_arguments(const ITensorInfo *lhs, const ITensorInfo *rhs, const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height(); /* Interleave */ - TensorShape tensor_shape0{ lhs->tensor_shape() }; + TensorShape tensor_shape0{lhs->tensor_shape()}; tensor_shape0.set(0, k); tensor_shape0.set(1, m); const TensorInfo tensor_info0 = lhs->clone()->set_tensor_shape(tensor_shape0); - const TensorInfo tensor_info_reshaped0 = lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_interleaved_shape(tensor_info0, mult_interleave4x4_height)); + const TensorInfo tensor_info_reshaped0 = lhs->clone()->set_tensor_shape( + misc::shape_calculator::compute_interleaved_shape(tensor_info0, mult_interleave4x4_height)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lhs, &tensor_info_reshaped0); - if(n != 0) /* Transpose */ + if (n != 0) /* Transpose */ { - TensorShape tensor_shape1{ rhs->tensor_shape() }; + TensorShape tensor_shape1{rhs->tensor_shape()}; tensor_shape1.set(0, n); tensor_shape1.set(1, k); - const TensorInfo tensor_info1 = rhs->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped1 = rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transpose1xW_with_element_size_shape(tensor_info1, 
mult_transpose1xW_width)); + const TensorInfo tensor_info1 = rhs->clone()->set_tensor_shape(tensor_shape1); + const TensorInfo tensor_info_reshaped1 = + rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transpose1xW_with_element_size_shape( + tensor_info1, mult_transpose1xW_width)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(rhs, &tensor_info_reshaped1); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - if(n != 0) + if (n != 0) { ARM_COMPUTE_RETURN_ERROR_ON(dst->dimension(0) != static_cast(n)); } @@ -125,12 +122,17 @@ inline Status validate_arguments(const ITensorInfo *lhs, const ITensorInfo *rhs, } // namespace -void CpuGemmMatrixMultiplyKernel::configure(const ITensorInfo *lhs, const ITensorInfo *rhs, ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info) +void CpuGemmMatrixMultiplyKernel::configure(const ITensorInfo *lhs, + const ITensorInfo *rhs, + ITensorInfo *dst, + float alpha, + bool is_interleaved, + const GEMMReshapeInfo &reshape_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); // dst tensor auto inizialitation if not yet initialized - TensorShape tensor_shape{ lhs->tensor_shape() }; + TensorShape tensor_shape{lhs->tensor_shape()}; tensor_shape.set(0, is_interleaved ? reshape_info.n() : rhs->dimension(0)); tensor_shape.set(1, is_interleaved ? reshape_info.m() : lhs->dimension(1)); @@ -146,7 +148,7 @@ void CpuGemmMatrixMultiplyKernel::configure(const ITensorInfo *lhs, const ITenso // Check if the dst tensor is a vector. If so,the kernel runs the vector-matrix multiplication const bool is_dst_vector = (dst->dimension(1) == 1); - if(is_dst_vector) + if (is_dst_vector) { const unsigned int num_elems_processed_per_iteration_x = (lhs->data_type() == DataType::F32) ? 16 : 32; @@ -157,17 +159,23 @@ void CpuGemmMatrixMultiplyKernel::configure(const ITensorInfo *lhs, const ITenso constexpr unsigned int num_elems_processed_per_iteration_x = 8; constexpr unsigned int num_elems_processed_per_iteration_y = 4; - win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win = + calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); } - const auto uk = CpuGemmMatrixMultiplyKernel::get_implementation(DataTypeISASelectorData{ lhs->data_type(), CPUInfo::get().get_isa() }); + const auto uk = CpuGemmMatrixMultiplyKernel::get_implementation( + DataTypeISASelectorData{lhs->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); _func = uk->ukernel; ICPPKernel::configure(win); } -Status CpuGemmMatrixMultiplyKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, float alpha, bool is_interleaved, +Status CpuGemmMatrixMultiplyKernel::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + float alpha, + bool is_interleaved, const GEMMReshapeInfo &reshape_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(lhs, rhs, dst, alpha, is_interleaved, reshape_info)); @@ -195,7 +203,8 @@ const char *CpuGemmMatrixMultiplyKernel::name() const return "CpuGemmMatrixMultiplyKernel"; } -const std::vector &CpuGemmMatrixMultiplyKernel::get_available_kernels() +const std::vector & +CpuGemmMatrixMultiplyKernel::get_available_kernels() { return available_kernels; } diff --git a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h index a7dfec87bd534b7faf2951e32fe110328b76d3f4..765fcb8275877ec62c62c498eec2b7692fb22be7 
100644 --- a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h +++ b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h @@ -42,7 +42,8 @@ namespace kernels class CpuGemmMatrixMultiplyKernel : public ICpuKernel { private: - using GemmMatrixMulKernelPtr = std::add_pointer::type; + using GemmMatrixMulKernelPtr = std::add_pointer::type; public: struct GemmMatrixMulKernel @@ -67,17 +68,27 @@ public: * @param[in] is_interleaved (Optional) True if lhs and rhs have been reshaped respectively using @ref CpuGemmInterleave4x4Kernel and @ref CpuGemmTranspose1xWKernel * @param[in] reshape_info (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how @p lhs and @p rhs have been reshaped */ - void configure(const ITensorInfo *lhs, const ITensorInfo *rhs, ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo()); + void configure(const ITensorInfo *lhs, + const ITensorInfo *rhs, + ITensorInfo *dst, + float alpha, + bool is_interleaved, + const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmMatrixMultiplyKernel * * Similar to @ref CpuGemmMatrixMultiplyKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info); + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + float alpha, + bool is_interleaved, + const GEMMReshapeInfo &reshape_info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; static const std::vector &get_available_kernels(); @@ -94,8 +105,8 @@ private: */ /** Matrix multiply function to use for the particular tensor types passed to configure() */ - GemmMatrixMulKernelPtr _func{ nullptr }; - float _alpha{ 1.f }; + GemmMatrixMulKernelPtr _func{nullptr}; + float _alpha{1.f}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp b/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp index 62d5d5f5e9daceb2431969c17621d0a9c8ea0350..c47746bc4b00891180c886dc7c0519dc2fed0465 100644 --- a/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp +++ b/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp @@ -24,9 +24,10 @@ #include "src/cpu/kernels/CpuGemmTranspose1xWKernel.h" #include "arm_compute/core/ITensor.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -63,9 +64,10 @@ Status CpuGemmTranspose1xWKernel::validate(const ITensorInfo *src, const ITensor ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. 
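/*
 * The available_kernels tables reformatted in the GEMM addition and multiply kernels
 * above (and again for CpuMaxUnpoolingLayerKernel further down) all follow one
 * registry pattern: a name, a selector predicate and a micro-kernel function pointer,
 * with configure() caching the ukernel of the first matching entry. The types below
 * are simplified stand-ins, not the library's real selector structs or signatures.
 */
#include <functional>
#include <string>
#include <vector>

namespace sketch
{
struct SelectorData
{
    bool is_f32;
    bool has_fp16;
};

struct MicroKernel
{
    std::string                               name;
    std::function<bool(const SelectorData &)> is_selected;
    void (*ukernel)(const float *, float *, int);
};

inline void neon_fp32_stub(const float *, float *, int)
{
}

static const std::vector<MicroKernel> available_kernels = {
    {"neon_fp32_stub", [](const SelectorData &d) { return d.is_f32; }, &neon_fp32_stub},
};

// configure() keeps the ukernel pointer of the first entry whose predicate matches
// the data type / ISA description; run_op() later calls it through that pointer.
inline const MicroKernel *get_implementation(const SelectorData &data)
{
    for (const auto &k : available_kernels)
    {
        if (k.is_selected(data))
        {
            return &k;
        }
    }
    return nullptr;
}
} // namespace sketch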
- if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), compute_transpose1xW_with_element_size_shape(*src)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), + compute_transpose1xW_with_element_size_shape(*src)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); } @@ -107,25 +109,28 @@ void CpuGemmTranspose1xWKernel::run_op(ITensorPack &tensors, const Window &windo const size_t out_stride = dst->info()->strides_in_bytes()[1]; const size_t vector_size = 16 / element_size; - execute_window_loop(window, [&](const Coordinates & id) - { - const uint8_t *in_ptr = in.ptr(); - uint8_t *const out_ptr = out.ptr() + (id.y() * vector_size) * element_size + (id.x() / vector_size) * out_stride; - - for(size_t k = 0; k < vector_size; ++k) + execute_window_loop( + window, + [&](const Coordinates &id) { - // If the src width is not multiple of W, we fill the reference with 0s - if((id.x() + k) >= in_width) - { - std::memset(out_ptr + k * element_size, 0, element_size); - } - else + const uint8_t *in_ptr = in.ptr(); + uint8_t *const out_ptr = + out.ptr() + (id.y() * vector_size) * element_size + (id.x() / vector_size) * out_stride; + + for (size_t k = 0; k < vector_size; ++k) { - std::memcpy(out_ptr + k * element_size, in_ptr + k * element_size, element_size); + // If the src width is not multiple of W, we fill the reference with 0s + if ((id.x() + k) >= in_width) + { + std::memset(out_ptr + k * element_size, 0, element_size); + } + else + { + std::memcpy(out_ptr + k * element_size, in_ptr + k * element_size, element_size); + } } - } - }, - in, out); + }, + in, out); } const char *CpuGemmTranspose1xWKernel::name() const diff --git a/src/cpu/kernels/CpuGemmTranspose1xWKernel.h b/src/cpu/kernels/CpuGemmTranspose1xWKernel.h index 0ca92641b7b78dba775f946b4b87bbc2b33bf682..4b834b2cc6e1e5a55b5c01359d4031ffb6aede0d 100644 --- a/src/cpu/kernels/CpuGemmTranspose1xWKernel.h +++ b/src/cpu/kernels/CpuGemmTranspose1xWKernel.h @@ -88,7 +88,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuIm2ColKernel.cpp b/src/cpu/kernels/CpuIm2ColKernel.cpp index 9ac291549bd1f3a65a0dc4f170eaebac84cefcd7..39ba764c78d44f148176c43ae0de34c7c64a7864 100644 --- a/src/cpu/kernels/CpuIm2ColKernel.cpp +++ b/src/cpu/kernels/CpuIm2ColKernel.cpp @@ -29,12 +29,14 @@ #include "arm_compute/core/Size2D.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" + #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - -#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/cpu/kernels/directconv2d/impl.h" +#include "src/cpu/kernels/directconv2d/list.h" #include #include @@ -49,396 +51,249 @@ namespace cpu { namespace kernels { -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, const Size2D &dilation, unsigned 
int num_groups, unsigned int input_pad_right) +void run_im2col_fp32_pad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) { - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(input->data_type()) && has_bias); - ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1)); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Number of groups greater than one are not supported on Neon"); + arm_compute::cpu::kernels::run_im2col(src, dst, window, data_layout, conv_info, convolved_dims, + kernel_dims, dilation, input_pad_right, has_bias); +} - // Since there's no implicit padding added, check the total input spatial dimensions (with conv paddings) are big enough for the kernel dimensions - const unsigned int width_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - const unsigned total_width = input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right(); - const unsigned total_height = input->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom(); - ARM_COMPUTE_RETURN_ERROR_ON((total_width < kernel_dims.width) || (total_height < kernel_dims.height)); +void run_im2col_fp32_nopad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col(src, dst, window, data_layout, conv_info, convolved_dims, + kernel_dims, dilation, input_pad_right, has_bias); +} - if(output->total_size() > 0) - { - TensorInfo expected_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false, num_groups, input_pad_right)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); - } +#if defined(ARM_COMPUTE_ENABLE_BF16) +void run_im2col_bf16_pad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +} - return Status{}; +void run_im2col_bf16_nopad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); } +#endif /* 
defined(ARM_COMPUTE_ENABLE_BF16) */ -template -inline void linearize_volume_nchw(const uint8_t *const in_ptr, - T *out_ptr, - bool has_bias, - int top_left_x, - int top_left_y, - int kernel_width, - int kernel_height, - int kernel_depth, - int input_w, - int input_h, - int input_stride_x, - int input_stride_y, - int input_stride_z, - int pad_value, - int dilation_x, - int dilation_y) +void run_im2col_int8_nopad_nhwc(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) { - const int kernel_size2 = kernel_width * kernel_height; - const int x_e = top_left_x + kernel_width * dilation_x; - const int y_e = top_left_y + kernel_height * dilation_y; - - // Linearize volume - int d = 0; - // This for loop linearize a volume with 3 slices. This allows: - // 1) to reduce the iterations of the outer for loop "d" - // 2) to have an optimized im2col for the first convolution layer where usually we have 3 IFMs - for(; d <= (kernel_depth - 3); d += 3) - { - for(int y = top_left_y; y < y_e; y += dilation_y) - { - if((y < 0 || y >= input_h) && has_pads) - { - // All the values will be the offset (will be zeros when not quantized) - for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) - { - *(out_ptr + 0 * kernel_size2) = pad_value; - *(out_ptr + 1 * kernel_size2) = pad_value; - *(out_ptr + 2 * kernel_size2) = pad_value; - } - } - else - { - for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) - { - if((x < 0 || x >= input_w) && has_pads) - { - *(out_ptr + 0 * kernel_size2) = pad_value; - *(out_ptr + 1 * kernel_size2) = pad_value; - *(out_ptr + 2 * kernel_size2) = pad_value; - } - else - { - *(out_ptr + 0 * kernel_size2) = *(reinterpret_cast(in_ptr + ((d + 0) * input_stride_z + y * input_stride_y + x * input_stride_x))); - *(out_ptr + 1 * kernel_size2) = *(reinterpret_cast(in_ptr + ((d + 1) * input_stride_z + y * input_stride_y + x * input_stride_x))); - *(out_ptr + 2 * kernel_size2) = *(reinterpret_cast(in_ptr + ((d + 2) * input_stride_z + y * input_stride_y + x * input_stride_x))); - } - } - } - } - out_ptr += 2 * kernel_size2; - } + arm_compute::cpu::kernels::run_im2col( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +} - // Left over - for(; d < kernel_depth; d++) - { - for(int y = top_left_y; y < y_e; y += dilation_y) - { - if((y < 0 || y >= input_h) && has_pads) - { - // All the values will be the offset (will be zeros when not quantized) - memset(static_cast(out_ptr), pad_value, kernel_width * sizeof(T)); - out_ptr += kernel_width; - } - else - { - for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) - { - if((x < 0 || x >= input_w) && has_pads) - { - *out_ptr = pad_value; - } - else - { - *out_ptr = *(reinterpret_cast(in_ptr + (d * input_stride_z + y * input_stride_y + x * input_stride_x))); - } - } - } - } - } +void run_im2col_uint8_nopad_nhwc(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +} - // Append 1 if the convolution layer has biases - if(has_bias) - { - *out_ptr = 
static_cast(1); - } +void run_im2col_qasymm8_pad_nhwc(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); } -template -inline void linearize_volume_nhwc(const uint8_t *const in_ptr, - T *out_ptr, - bool has_bias, - int start_x, - int start_y, - int kernel_width, - int kernel_height, - int input_w, - int input_h, - int input_c, - int input_stride_y, - int input_stride_z, - int pad_value, - int dilation_x, - int dilation_y) +void internal_run_im2col_fp16_pad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) { - const int end_x = start_x + kernel_width * dilation_x; - const int end_y = start_y + kernel_height * dilation_y; - const int pad_quant = kernel_width * input_c; - const int element_size = static_cast(sizeof(T)); - if((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) && (input_stride_y == input_c * element_size)) - { - for(int y = start_y; y < end_y; y += dilation_y) - { - //optimized for no dilation and no boundary pixels - memcpy(out_ptr, reinterpret_cast(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * element_size); - out_ptr += input_c * kernel_width; - } - } - else - { - for(int y = start_y; y < end_y; y += dilation_y) - { - if(y < 0 || y >= input_h) - { - memset(static_cast(out_ptr), pad_value, pad_quant * element_size); - out_ptr += pad_quant; - } - else if(dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != input_c * element_size) - { - for(int x = start_x; x < end_x; x += dilation_x) - { - if(x < 0 || x >= input_w) - { - memset(static_cast(out_ptr), pad_value, input_c * element_size); - out_ptr += input_c; - } - else - { - memcpy(out_ptr, reinterpret_cast(in_ptr + (y * input_stride_z + x * input_stride_y)), input_c * element_size); - out_ptr += input_c; - } - } - } - else - { - //optimized for no dilation and no boundary pixels - memcpy(out_ptr, reinterpret_cast(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * element_size); - out_ptr += input_c * kernel_width; - } - } - } - // Append 1 if the convolution layer has biases - if(has_bias) - { - *out_ptr = static_cast(1); - } +/* + Note that when building with the option data_type_support=fp32 the fp16.cpp files won't be compiled and the linker + would fail with the error undefined arm_compute::cpu::kernels::run_im2col_fp16_pad. + To avoid this problem we only call to the actual fp16 kernel if ENABLE_FP16_KERNELS is defined. 
+*/ +#if defined(ENABLE_FP16_KERNELS) + arm_compute::cpu::kernels::run_im2col_fp16_pad(src, dst, window, data_layout, conv_info, convolved_dims, + kernel_dims, dilation, input_pad_right, has_bias); +#else // defined(ENABLE_FP16_KERNELS) + ARM_COMPUTE_UNUSED(src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, + has_bias); +#endif // defined(ENABLE_FP16_KERNELS) } -template -inline void linearize_volume_nhwc(const uint8_t *const in_ptr, - T *out_ptr, - bool has_bias, - int start_x, - int start_y, - int kernel_width, - int kernel_height, - int input_w, - int input_h, - int input_c, - int input_stride_y, - int input_stride_z, - int pad_value, - int dilation_x, - int dilation_y, - int pad_right) +void internal_run_im2col_fp16_nopad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) { - const int end_x = start_x + kernel_width * dilation_x; - const int end_y = start_y + kernel_height * dilation_y; - const int pad_quant = kernel_width * (input_c + pad_right); - const int element_size = static_cast(sizeof(T)); - const int channel_chunk_size = input_c * element_size; +#if defined(ENABLE_FP16_KERNELS) + arm_compute::cpu::kernels::run_im2col_fp16_nopad(src, dst, window, data_layout, conv_info, convolved_dims, + kernel_dims, dilation, input_pad_right, has_bias); +#else // defined(ENABLE_FP16_KERNELS) + ARM_COMPUTE_UNUSED(src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, + has_bias); +#endif // defined(ENABLE_FP16_KERNELS) +} - if((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) && (input_stride_y == channel_chunk_size)) - { - for(int y = start_y; y < end_y; y += dilation_y) - { - const uint8_t *offset_ptr = in_ptr + (y * input_stride_z + start_x * input_stride_y); - for(int e = 0; e < kernel_width; e++) - { - memcpy(out_ptr, reinterpret_cast(offset_ptr + e * channel_chunk_size), channel_chunk_size); - out_ptr += input_c + pad_right; - } - } - } - else - { - for(int y = start_y; y < end_y; y += dilation_y) - { - if(y < 0 || y >= input_h) - { - memset(static_cast(out_ptr), pad_value, pad_quant * element_size); - out_ptr += pad_quant; - } - else if(dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != channel_chunk_size) - { - for(int x = start_x; x < end_x; x += dilation_x) - { - if(x < 0 || x >= input_w) - { - memset(static_cast(out_ptr), pad_value, (input_c + pad_right) * element_size); - out_ptr += input_c + pad_right; - } - else - { - memcpy(out_ptr, reinterpret_cast(in_ptr + (y * input_stride_z + x * input_stride_y)), channel_chunk_size); - out_ptr += input_c + pad_right; - } - } - } - else - { - const uint8_t *offset_ptr = in_ptr + (y * input_stride_z + start_x * input_stride_y); - for(int e = 0; e < kernel_width; e++) - { - memcpy(out_ptr, reinterpret_cast(offset_ptr + e * channel_chunk_size), channel_chunk_size); - out_ptr += input_c + pad_right; - } - } - } - } - // Append 1 if the convolution layer has biases - if(has_bias) - { - *out_ptr = static_cast(1); - } +void internal_run_im2col_fp16_nchw_pad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ 
+#if defined(ENABLE_FP16_KERNELS) + arm_compute::cpu::kernels::run_im2col_fp16_nchw_pad(src, dst, window, data_layout, conv_info, convolved_dims, + kernel_dims, dilation, input_pad_right, has_bias); +#else // defined(ENABLE_FP16_KERNELS) + ARM_COMPUTE_UNUSED(src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, + has_bias); +#endif // defined(ENABLE_FP16_KERNELS) } -} // namespace +void internal_run_im2col_fp16_nchw_nopad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ +#if defined(ENABLE_FP16_KERNELS) + arm_compute::cpu::kernels::run_im2col_fp16_nchw_nopad(src, dst, window, data_layout, conv_info, convolved_dims, + kernel_dims, dilation, input_pad_right, has_bias); +#else // defined(ENABLE_FP16_KERNELS) + ARM_COMPUTE_UNUSED(src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, + has_bias); +#endif // defined(ENABLE_FP16_KERNELS) +} -template -void CpuIm2ColKernel::run_im2col(const ITensor *src, ITensor *dst, const Window &window) +namespace { - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation, + unsigned int num_groups, + unsigned int input_pad_right) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::BFLOAT16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(input->data_type()) && has_bias); + ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Number of groups greater than one are not supported on Neon"); - const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); + // Since there's no implicit padding added, check the total input spatial dimensions (with conv paddings) are big enough for the kernel dimensions + const unsigned int width_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); + const unsigned total_width = input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right(); + const unsigned total_height = input->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom(); + ARM_COMPUTE_RETURN_ERROR_ON((total_width < kernel_dims.width) || (total_height < kernel_dims.height)); - const int input_w = src->info()->dimension(width_idx); - const int input_h = src->info()->dimension(height_idx); - const int input_c = src->info()->dimension(channel_idx); - const int input_stride_x = src->info()->strides_in_bytes().x(); - const int input_stride_y = src->info()->strides_in_bytes().y(); - const int input_stride_z = 
src->info()->strides_in_bytes().z(); - const int pad_left = _conv_info.pad_left(); - const int pad_top = _conv_info.pad_top(); - const int stride_x = _conv_info.stride().first; - const int stride_y = _conv_info.stride().second; - const int pad_value = is_data_type_quantized(src->info()->data_type()) ? src->info()->quantization_info().uniform().offset : 0; - - Window window_in_out(window); - // The first three dimensions of the input and output are increased by the inner loops - window_in_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - window_in_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_in_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - // Create iterators - Iterator in(src, window_in_out); - Iterator out(dst, window_in_out); - - execute_window_loop( - window, [&](const Coordinates & id) + if (output->total_size() > 0) { - const int start_w = id[width_idx] * stride_x - pad_left; - const int start_h = id[height_idx] * stride_y - pad_top; - - // Get pointers - const uint8_t *const input_ptr = in.ptr(); - auto output_ptr = reinterpret_cast(out.ptr() + (id[width_idx] + id[height_idx] * _convolved_dims.first) * dst->info()->strides_in_bytes().y()); + TensorInfo expected_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape( + input, kernel_dims, conv_info, has_bias, dilation, false, num_groups, input_pad_right)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); + } - // Linearize volume - if(is_nchw) - { - linearize_volume_nchw(input_ptr, - output_ptr, - _has_bias, - start_w, - start_h, - _kernel_width, - _kernel_height, - input_c, - input_w, - input_h, - input_stride_x, - input_stride_y, - input_stride_z, - pad_value, - _dilation.x(), - _dilation.y()); - } - else - { - if(_input_pad_right > 0) - { - linearize_volume_nhwc(input_ptr, - output_ptr, - _has_bias, - start_w, - start_h, - _kernel_width, - _kernel_height, - input_w, - input_h, - input_c, - input_stride_y, - input_stride_z, - pad_value, - _dilation.x(), - _dilation.y(), - _input_pad_right); - } - else - { - linearize_volume_nhwc(input_ptr, - output_ptr, - _has_bias, - start_w, - start_h, - _kernel_width, - _kernel_height, - input_w, - input_h, - input_c, - input_stride_y, - input_stride_z, - pad_value, - _dilation.x(), - _dilation.y()); - } - } - }, - in, out); + return Status{}; } +} // namespace -void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, const Size2D &dilation, unsigned int num_groups, unsigned int input_pad_right) +void CpuIm2ColKernel::configure(const ITensorInfo *src, + ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation, + unsigned int num_groups, + unsigned int input_pad_right) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right)); ARM_COMPUTE_UNUSED(num_groups); _data_layout = src->data_layout(); @@ -451,31 +306,29 @@ void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const _kernel_height = kernel_dims.height; _input_pad_right = input_pad_right; _dilation = dilation; 
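/*
 * The removed linearize_volume_* templates and the new run_im2col_* free functions
 * above all implement im2col: each output row gathers one convolution tap across the
 * spatial output positions so the convolution becomes a plain matrix multiply. A
 * minimal single-image NCHW version, ignoring dilation, NHWC layout, per-element
 * right padding, quantized pad values and the trailing bias column, is sketched
 * below; it is illustrative only, not the library's code.
 */
#include <cstddef>
#include <vector>

std::vector<float> im2col_nchw(const std::vector<float> &in, int C, int H, int W,
                               int kH, int kW, int stride, int pad)
{
    const int out_h = (H + 2 * pad - kH) / stride + 1;
    const int out_w = (W + 2 * pad - kW) / stride + 1;
    std::vector<float> out(static_cast<std::size_t>(C) * kH * kW * out_h * out_w, 0.f);

    for (int c = 0; c < C; ++c)
    {
        for (int ky = 0; ky < kH; ++ky)
        {
            for (int kx = 0; kx < kW; ++kx)
            {
                const int row = (c * kH + ky) * kW + kx; // one output row per (channel, ky, kx) tap
                for (int oy = 0; oy < out_h; ++oy)
                {
                    for (int ox = 0; ox < out_w; ++ox)
                    {
                        const int iy = oy * stride - pad + ky;
                        const int ix = ox * stride - pad + kx;
                        // Out-of-bounds taps take the pad value (0 here; the quantized
                        // kernels use the zero-point offset instead).
                        float v = 0.f;
                        if (iy >= 0 && iy < H && ix >= 0 && ix < W)
                        {
                            v = in[static_cast<std::size_t>(c * H + iy) * W + ix];
                        }
                        out[static_cast<std::size_t>(row) * out_h * out_w +
                            static_cast<std::size_t>(oy) * out_w + ox] = v;
                    }
                }
            }
        }
    }
    return out;
}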
- _convolved_dims = scaled_dimensions(src->dimension(width_idx), dst->dimension(height_idx), - _kernel_width, _kernel_height, - _conv_info, _dilation); + _convolved_dims = scaled_dimensions(src->dimension(width_idx), dst->dimension(height_idx), _kernel_width, + _kernel_height, _conv_info, _dilation); _has_bias = has_bias; - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { - switch(src->data_type()) + switch (src->data_type()) { case DataType::F32: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; + _func = (!conv_info.has_padding()) ? &run_im2col_fp32_nchw_nopad : &run_im2col_fp32_nchw_pad; + break; + case DataType::F16: + _func = (!conv_info.has_padding()) ? &internal_run_im2col_fp16_nchw_nopad + : &internal_run_im2col_fp16_nchw_pad; break; #if defined(ARM_COMPUTE_ENABLE_BF16) case DataType::BFLOAT16: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; + _func = (!conv_info.has_padding()) ? &run_im2col_bf16_nchw_nopad : &run_im2col_bf16_nchw_pad; break; #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ case DataType::QASYMM8_SIGNED: case DataType::QASYMM8: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; + _func = (!conv_info.has_padding()) ? &run_im2col_qasymm8_nchw_nopad : &run_im2col_qasymm8_nchw_pad; break; default: ARM_COMPUTE_ERROR("Data type not supported"); @@ -484,26 +337,24 @@ void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const } else { - switch(src->data_type()) + switch (src->data_type()) { case DataType::F32: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; + _func = (!conv_info.has_padding()) ? &run_im2col_fp32_nopad : &run_im2col_fp32_pad; + break; + case DataType::F16: + _func = (!conv_info.has_padding()) ? &internal_run_im2col_fp16_nopad : &internal_run_im2col_fp16_pad; break; #if defined(ARM_COMPUTE_ENABLE_BF16) case DataType::BFLOAT16: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; + _func = (!conv_info.has_padding()) ? &run_im2col_bf16_nopad : &run_im2col_bf16_pad; break; #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ case DataType::QASYMM8: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; + _func = (!conv_info.has_padding()) ? &run_im2col_uint8_nopad_nhwc : &run_im2col_qasymm8_pad_nhwc; break; case DataType::QASYMM8_SIGNED: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; + _func = (!conv_info.has_padding()) ? 
&run_im2col_int8_nopad_nhwc : &run_im2col_qasymm8_pad_nhwc; break; default: ARM_COMPUTE_ERROR("Data type not supported"); @@ -512,11 +363,13 @@ void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const } // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, false, num_groups, _input_pad_right))); + auto_init_if_empty( + *dst, src->clone()->set_tensor_shape(compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, + false, num_groups, _input_pad_right))); - std::pair convolved_dims = scaled_dimensions(src->dimension(width_idx), src->dimension(height_idx), - kernel_dims.width, kernel_dims.height, - conv_info, dilation); + std::pair convolved_dims = + scaled_dimensions(src->dimension(width_idx), src->dimension(height_idx), kernel_dims.width, kernel_dims.height, + conv_info, dilation); Window win = calculate_max_window(*src, Steps()); win.set(width_idx, Window::Dimension(0, convolved_dims.first, 1)); @@ -526,10 +379,17 @@ void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const ICpuKernel::configure(win); } -Status CpuIm2ColKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, const Size2D &dilation, unsigned int num_groups, unsigned int input_pad_right) +Status CpuIm2ColKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation, + unsigned int num_groups, + unsigned int input_pad_right) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right)); return Status{}; } @@ -542,7 +402,8 @@ void CpuIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, const T auto src = tensors.get_const_tensor(TensorType::ACL_SRC); auto dst = tensors.get_tensor(TensorType::ACL_DST); - (this->*_func)(src, dst, window); + _func(src, dst, window, _data_layout, _conv_info, _convolved_dims, Size2D(_kernel_width, _kernel_height), _dilation, + _input_pad_right, _has_bias); } const char *CpuIm2ColKernel::name() const diff --git a/src/cpu/kernels/CpuIm2ColKernel.h b/src/cpu/kernels/CpuIm2ColKernel.h index d133f8dc2d000707c09728c5398321e22d78dadd..ae7162cccf927c2b1ea155506735a2a18aaa0a14 100644 --- a/src/cpu/kernels/CpuIm2ColKernel.h +++ b/src/cpu/kernels/CpuIm2ColKernel.h @@ -21,10 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CPU_IM2COL_KERNEL_H -#define ARM_COMPUTE_CPU_IM2COL_KERNEL_H +#ifndef ACL_SRC_CPU_KERNELS_CPUIM2COLKERNEL_H +#define ACL_SRC_CPU_KERNELS_CPUIM2COLKERNEL_H #include "arm_compute/core/Size2D.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -78,16 +79,28 @@ public: * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. 
num_groups != 1 is not supported * @param[in] input_pad_right (Optional) When fast-math is selected, per element padding for the im2col matrix may be necessary */ - void configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1, unsigned int input_pad_right = 0); + void configure(const ITensorInfo *src, + ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation = Size2D(1U, 1U), + unsigned int num_groups = 1, + unsigned int input_pad_right = 0); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuIm2ColKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1, unsigned int input_pad_right = 0); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation = Size2D(1U, 1U), + unsigned int num_groups = 1, + unsigned int input_pad_right = 0); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; @@ -102,32 +115,32 @@ public: size_t get_mws(const CPUInfo &platform, size_t thread_count) const override; private: - /** Template function to run im2col - * - * @param[in] src The input tensor info - * @param[out] dst The output tensor info - * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). - */ - template - void run_im2col(const ITensor *src, ITensor *dst, const Window &window); - /** Common signature for all the specialised im2col functions * * @param[in] window Region on which to execute the kernel. 
*/ - using Im2ColFunctionPtr = void (CpuIm2ColKernel::*)(const ITensor *src, ITensor *dst, const Window &window); + using Im2ColFunctionPtr = void (*)(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias); - Im2ColFunctionPtr _func{ nullptr }; + Im2ColFunctionPtr _func{nullptr}; std::pair _convolved_dims{}; PadStrideInfo _conv_info{}; - unsigned int _kernel_width{ 0 }; - unsigned int _kernel_height{ 0 }; - unsigned int _input_pad_right{ 0 }; - bool _has_bias{ false }; - Size2D _dilation{ 1U, 1U }; - DataLayout _data_layout{ DataLayout::UNKNOWN }; + unsigned int _kernel_width{0}; + unsigned int _kernel_height{0}; + unsigned int _input_pad_right{0}; + bool _has_bias{false}; + Size2D _dilation{1U, 1U}; + DataLayout _data_layout{DataLayout::UNKNOWN}; }; } // namespace kernels } // namespace cpu } // namespace arm_compute -#endif /*ARM_COMPUTE_CPU_IM2COL_KERNEL_H */ +#endif // ACL_SRC_CPU_KERNELS_CPUIM2COLKERNEL_H diff --git a/src/cpu/kernels/CpuKernelSelectionTypes.h b/src/cpu/kernels/CpuKernelSelectionTypes.h index 39adc9af7cb01907d0cfdd8cb3091ed0dc4b9ceb..b7daa4d583ac8125761f3bfca186469a7fce6acb 100644 --- a/src/cpu/kernels/CpuKernelSelectionTypes.h +++ b/src/cpu/kernels/CpuKernelSelectionTypes.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_KERNEL_SELECTION_TYPES_H #include "arm_compute/core/Types.h" + #include "src/common/cpuinfo/CpuIsaInfo.h" namespace arm_compute @@ -78,10 +79,10 @@ struct DepthwiseConv2dNativeDataTypeISASelectorData struct ActivationDataTypeISASelectorData { - DataType dt; - const CPUModel &cpumodel; - const cpuinfo::CpuIsaInfo &isa; - const ActivationFunction f; + DataType dt; + const CPUModel &cpumodel; + const cpuinfo::CpuIsaInfo &isa; + const ActivationFunction f; }; struct CpuAddKernelDataTypeISASelectorData @@ -99,15 +100,19 @@ struct ScaleKernelDataTypeISASelectorData }; // Selector pointer types -using DataTypeISASelectorPtr = std::add_pointer::type; -using DataTypeDataLayoutSelectorPtr = std::add_pointer::type; -using PoolDataTypeISASelectorPtr = std::add_pointer::type; -using ElementwiseDataTypeISASelectorPtr = std::add_pointer::type; -using DepthwiseConv2dNativeDataTypeISASelectorPtr = std::add_pointer::type; -using CastDataTypeISASelectorDataPtr = std::add_pointer::type; -using ActivationDataTypeISASelectorDataPtr = std::add_pointer::type; -using CpuAddKernelDataTypeISASelectorDataPtr = std::add_pointer::type; -using ScaleKernelDataTypeISASelectorDataPtr = std::add_pointer::type; +using DataTypeISASelectorPtr = std::add_pointer::type; +using DataTypeDataLayoutSelectorPtr = std::add_pointer::type; +using PoolDataTypeISASelectorPtr = std::add_pointer::type; +using ElementwiseDataTypeISASelectorPtr = std::add_pointer::type; +using DepthwiseConv2dNativeDataTypeISASelectorPtr = + std::add_pointer::type; +using CastDataTypeISASelectorDataPtr = std::add_pointer::type; +using ActivationDataTypeISASelectorDataPtr = + std::add_pointer::type; +using CpuAddKernelDataTypeISASelectorDataPtr = + std::add_pointer::type; +using ScaleKernelDataTypeISASelectorDataPtr = + std::add_pointer::type; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp index 7d077c75bf857179a37b9bca7800a001a43836ef..bcaa76b99b210f282ae95466e0eaa424e5982e7f 100644 --- a/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp 
+++ b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp @@ -24,11 +24,12 @@ #include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CPP/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/maxunpool/list.h" @@ -43,50 +44,43 @@ using namespace misc::shape_calculator; namespace { -static const std::vector available_kernels = -{ - { - "neon_fp32_maxunpooling", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(neon_fp32_maxunpooling) - }, - { - "neon_fp16_maxunpooling", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(neon_fp16_maxunpooling) - }, - { - "neon_qu8_maxunpooling", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON(neon_qs8_maxunpooling) - }, - { - "neon_qs8_maxunpooling", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON(neon_qu8_maxunpooling) - }, +static const std::vector available_kernels = { + {"neon_fp32_maxunpooling", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(neon_fp32_maxunpooling)}, + {"neon_fp16_maxunpooling", + [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(neon_fp16_maxunpooling)}, + {"neon_qu8_maxunpooling", [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_NEON(neon_qs8_maxunpooling)}, + {"neon_qs8_maxunpooling", [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; }, + REGISTER_QASYMM8_SIGNED_NEON(neon_qu8_maxunpooling)}, }; -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *indices, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, indices, dst); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, indices); - int pool_stride_x = 0; - int pool_stride_y = 0; - PoolingType pool_type = pool_info.pool_type; - const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; + int pool_stride_x = 0; + int pool_stride_y = 0; + PoolingType pool_type = pool_info.pool_type; + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); - const int pool_size_x = pool_info.pool_size.width; - const int pool_size_y = pool_info.pool_size.height; + const int pool_size_x = pool_info.pool_size.width; + const int pool_size_y = 
pool_info.pool_size.height; const Size2D pool_size(pool_size_x, pool_size_y); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, + "Pooling indices only supported for MAX pooling method"); ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2"); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); @@ -96,13 +90,17 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *indices, co } } // namespace -void CpuMaxUnpoolingLayerKernel::configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info) +void CpuMaxUnpoolingLayerKernel::configure(const ITensorInfo *src, + const ITensorInfo *indices, + ITensorInfo *dst, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, indices); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, indices, dst, pool_info)); ARM_COMPUTE_UNUSED(indices); - const auto uk = CpuMaxUnpoolingLayerKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() }); + const auto uk = CpuMaxUnpoolingLayerKernel::get_implementation( + DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); _run_method = uk->ukernel; @@ -113,7 +111,10 @@ void CpuMaxUnpoolingLayerKernel::configure(const ITensorInfo *src, const ITensor ICpuKernel::configure(window); } -Status CpuMaxUnpoolingLayerKernel::validate(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info) +Status CpuMaxUnpoolingLayerKernel::validate(const ITensorInfo *src, + const ITensorInfo *indices, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, indices, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, indices, dst, pool_info)); diff --git a/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h index d0c13471c8e8ad2a63dc5ba45bb7bd2c1664ff06..5a641a2bea541b7bb6d9069e2f77d59f46525ae8 100644 --- a/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h +++ b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h @@ -37,7 +37,8 @@ namespace kernels class CpuMaxUnpoolingLayerKernel : public ICpuKernel { private: - using MaxUnpoolingUKernelPtr = std::add_pointer::type; + using MaxUnpoolingUKernelPtr = std::add_pointer::type; public: /** Default constructor */ @@ -56,7 +57,8 @@ public: * @param[out] dst Destination tensor. Data types supported: Same as @p src * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. */ - void configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info); + void + configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration of @ref CpuMaxUnpoolingLayerKernel * * @param[in] src Source tensor to permute. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. 
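For reference, the kernel selection used by available_kernels above is a first-match lookup over (name, selector, ukernel) entries. The sketch below is a simplified, self-contained illustration of that pattern, not the library's actual ICpuKernel::get_implementation; the MicroKernel and select_ukernel names are illustrative only.

#include <functional>
#include <vector>

// Simplified stand-ins for the library types (illustrative only).
enum class DataType { F32, F16, QASYMM8, QASYMM8_SIGNED };
struct SelectorData { DataType dt; bool fp16_isa; };
using UKernelFn = void (*)();

struct MicroKernel
{
    const char                               *name;        // e.g. "neon_fp32_maxunpooling"
    std::function<bool(const SelectorData &)> is_selected; // the lambda stored in the table
    UKernelFn                                 ukernel;     // the registered micro-kernel
};

// First matching entry wins, mirroring how configure() resolves the kernel it stores.
const MicroKernel *select_ukernel(const std::vector<MicroKernel> &table, const SelectorData &data)
{
    for (const auto &k : table)
    {
        if (k.is_selected(data))
            return &k;
    }
    return nullptr; // treated as a configuration error by the caller
}

At configure() time the selected entry's ukernel is kept in a member such as _run_method and later invoked from run_op() with the tensors and execution window.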
@@ -66,7 +68,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *indices, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; @@ -83,7 +88,7 @@ public: const char *name() const override; private: - MaxUnpoolingUKernelPtr _run_method{ nullptr }; + MaxUnpoolingUKernelPtr _run_method{nullptr}; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuMulKernel.cpp b/src/cpu/kernels/CpuMulKernel.cpp index b73d2bdf73e0ac9c151d1528c076766bbd82603a..ba086e3ac641051411a3051b5c47c4d1f40824fb 100644 --- a/src/cpu/kernels/CpuMulKernel.cpp +++ b/src/cpu/kernels/CpuMulKernel.cpp @@ -25,23 +25,24 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" + #include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/NESymm.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" #include namespace { #if defined(ENABLE_FP32_KERNELS) - static constexpr size_t default_mws_N1_fp32_neon = 22447; - static constexpr size_t default_mws_V1_fp32_neon = 38982; +static constexpr size_t default_mws_N1_fp32_neon = 22447; +static constexpr size_t default_mws_V1_fp32_neon = 38982; #endif /* ENABLE_FP32_KERNELS */ - static constexpr size_t default_mws_other_platforms_1d_tensor = 10240; -} +static constexpr size_t default_mws_other_platforms_1d_tensor = 10240; +} // namespace namespace arm_compute { namespace cpu @@ -54,29 +55,38 @@ const float scale255_constant = 1.f / 255.f; const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant); const float32x4_t positive_round_f32q = vdupq_n_f32(0.5f); -inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy) +inline Status validate_arguments(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy) { ARM_COMPUTE_UNUSED(overflow_policy); ARM_COMPUTE_UNUSED(rounding_policy); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, + DataType::QSYMM16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, + DataType::QSYMM16, DataType::F16, DataType::F32); 
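The scale checks a little further down in validate_arguments() accept only scale close to 1/255 (handled by a dedicated rounding path) or scale equal to 1/2^n with 0 <= n <= 15, detected through frexp. A minimal standalone sketch of that test follows; the is_supported_scale helper name is illustrative and not part of the kernel.

#include <cmath>

// Returns true when the scale can be implemented either by the 1/255 rounding
// path or by an integer right shift of at most 15 bits.
bool is_supported_scale(float scale)
{
    if (std::abs(scale - 1.f / 255.f) < 0.00001f)
        return true;
    int         exponent = 0;
    const float mantissa = std::frexp(scale, &exponent);
    // For scale == 1/2^n, frexp() returns mantissa 0.5 and exponent 1 - n,
    // so 0 <= n <= 15 maps onto -14 <= exponent <= 1.
    return mantissa == 0.5f && exponent >= -14 && exponent <= 1;
}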
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, DataType::S32, DataType::F16, DataType::F32); - if(is_data_type_quantized(src1->data_type()) || is_data_type_quantized(src2->data_type())) + if (is_data_type_quantized(src1->data_type()) || is_data_type_quantized(src2->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP, "ConvertPolicy cannot be WRAP if datatype is quantized"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP, + "ConvertPolicy cannot be WRAP if datatype is quantized"); } - if(dst->total_size() > 0) + if (dst->total_size() > 0) { const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), + "Wrong shape for dst"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); // clang-format off ARM_COMPUTE_RETURN_ERROR_ON_MSG( @@ -88,13 +98,17 @@ inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src !(src1->data_type() == DataType::QSYMM16 && src2->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32) , "Invalid data type combination"); // clang-format on - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S16 && dst->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 dst"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S16 && dst->data_type() == DataType::S32 && + scale != 1.f, + "Unsupported scale for QSYMM16 inputs and S32 dst"); } - if(std::abs(scale - scale255_constant) < 0.00001f) + if (std::abs(scale - scale255_constant) < 0.00001f) { - ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == DataType::S32 && dst->data_type() == DataType::S32, + ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && + rounding_policy != RoundingPolicy::TO_NEAREST_EVEN); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == DataType::S32 && + dst->data_type() == DataType::S32, "Scale == 1/255 is not supported if input and dst are of data type S32"); } else @@ -107,7 +121,8 @@ inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15 // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= 14 // Moreover, it will be negative as we deal with 1/2^n - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)), "Scale value not supported (Should be 1/(2^n) or 1/255"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)), + "Scale value not supported (Should be 1/(2^n) or 1/255"); } return Status{}; @@ -168,9 +183,9 @@ void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != 
src2->info()->tensor_shape().x(); const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform(); - const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset }; + const UniformQuantizationInfo tmp_qua_info = {output_qua_info.scale / scale, output_qua_info.offset}; - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -178,7 +193,7 @@ void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1; const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); // Clear X Dimension on execution window as we handle manually non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); @@ -190,52 +205,52 @@ void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor using ExactTagType = typename wrapper::traits::neon_vector::tag_type; execute_window_loop( - win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - const auto broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); - // Dequantize inputs - const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo); - const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo); + const auto broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); - const float32x4x4_t out_f32x4x4 = + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), - vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), - vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), - vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), - }; - - // Quantize dst - const auto result = vquantize(out_f32x4x4, tmp_qua_info); - wrapper::vstore(output_ptr + x, result); - } + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + + // Dequantize inputs + const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo); + const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo); + + const float32x4x4_t out_f32x4x4 = { + vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), + vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), + vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), + vmulq_f32(in1_f32x4x4.val[3], 
in2_f32x4x4.val[3]), + }; + + // Quantize dst + const auto result = vquantize(out_f32x4x4, tmp_qua_info); + wrapper::vstore(output_ptr + x, result); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - // Dequantize inputs - const T src1 = *(non_broadcast_input_ptr + x); - const float tmp_in1 = Qasymm8QuantizationHelper::dequantize(src1, non_broadcast_qinfo); - const float tmp_in2 = Qasymm8QuantizationHelper::dequantize(broadcast_value, broadcast_qinfo); - const float tmp_f = tmp_in1 * tmp_in2; - - // Quantize dst - const auto tmp_qua = Qasymm8QuantizationHelper::quantize(tmp_f, tmp_qua_info); - *(output_ptr + x) = tmp_qua; - } - }, - broadcast_input, non_broadcast_input, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + // Dequantize inputs + const T src1 = *(non_broadcast_input_ptr + x); + const float tmp_in1 = Qasymm8QuantizationHelper::dequantize(src1, non_broadcast_qinfo); + const float tmp_in2 = Qasymm8QuantizationHelper::dequantize(broadcast_value, broadcast_qinfo); + const float tmp_f = tmp_in1 * tmp_in2; + + // Quantize dst + const auto tmp_qua = Qasymm8QuantizationHelper::quantize(tmp_f, tmp_qua_info); + *(output_ptr + x) = tmp_qua; + } + }, + broadcast_input, non_broadcast_input, dst); } else { @@ -251,56 +266,59 @@ void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor Iterator dst(out, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const auto input1_q = wrapper::vloadq(input1_ptr + x); - const auto input2_q = wrapper::vloadq(input2_ptr + x); - - // Dequantize inputs - const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info); - const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); - const float32x4x4_t out_f32x4x4 = + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), - vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), - vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), - vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), - }; - - // Quantize dst - const auto result = vquantize(out_f32x4x4, tmp_qua_info); - wrapper::vstore(output_ptr + x, result); - } + const auto input1_q = wrapper::vloadq(input1_ptr + x); + const auto input2_q = wrapper::vloadq(input2_ptr + x); + + // Dequantize inputs + const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info); + const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info); + + const float32x4x4_t out_f32x4x4 = { + vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), + vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), + vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), + vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), + }; + + // Quantize dst + const auto result = vquantize(out_f32x4x4, tmp_qua_info); + wrapper::vstore(output_ptr + x, result); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - // Dequantize inputs 
- const T src1 = *(input1_ptr + x); - const T src2 = *(input2_ptr + x); - const float tmp_in1 = Qasymm8QuantizationHelper::dequantize(src1, input1_qua_info); - const float tmp_in2 = Qasymm8QuantizationHelper::dequantize(src2, input2_qua_info); - const float tmp_f = tmp_in1 * tmp_in2; - - // Quantize dst - const auto tmp_qua = Qasymm8QuantizationHelper::quantize(tmp_f, tmp_qua_info); - *(output_ptr + x) = tmp_qua; - } - }, - input1, input2, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + // Dequantize inputs + const T src1 = *(input1_ptr + x); + const T src2 = *(input2_ptr + x); + const float tmp_in1 = Qasymm8QuantizationHelper::dequantize(src1, input1_qua_info); + const float tmp_in2 = Qasymm8QuantizationHelper::dequantize(src2, input2_qua_info); + const float tmp_f = tmp_in1 * tmp_in2; + + // Quantize dst + const auto tmp_qua = Qasymm8QuantizationHelper::quantize(tmp_f, tmp_qua_info); + *(output_ptr + x) = tmp_qua; + } + }, + input1, input2, dst); } } -bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, float scale) +bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + float scale) { const auto iq0 = src0->quantization_info().uniform(); const auto iq1 = src1->quantization_info().uniform(); @@ -308,7 +326,7 @@ bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo const auto multiplier = ((iq0.scale * iq1.scale) / oq.scale) * scale; - if(multiplier < -8191.f || multiplier > 8191.f) + if (multiplier < -8191.f || multiplier > 8191.f) { //The multiplier cannot be stored as a 14.18 signed fixed-point number return false; @@ -318,7 +336,7 @@ bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo const auto max_result = multiplier * (256) * (256) + offset_out; - if(max_result > 8191.f) + if (max_result > 8191.f) { //It might not be possible to store the result as a 14.18 signed fixed-point number. return false; @@ -366,7 +384,7 @@ void mul_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *d const auto out_offset_14p18 = static_cast(out_offset * two_pwr18i); const auto multiplier_14p18 = static_cast(multiplier * two_pwr18f); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { // Prefix: a = non-broadcast, b = broadcast. 
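The mul_q8_neon_fixedpoint_possible() check above amounts to verifying that both the combined multiplier and the worst-case result fit a signed Q14.18 value, whose integer part is limited to roughly +-(2^13 - 1) = 8191. The standalone sketch below mirrors that reasoning; the fits_q14_18 and q8_fixedpoint_possible names are illustrative only.

// A signed 14.18 fixed-point number keeps 18 fractional bits in an int32_t,
// leaving about +-8191 for the integer part.
bool fits_q14_18(float v)
{
    return v >= -8191.f && v <= 8191.f;
}

bool q8_fixedpoint_possible(float in0_scale, float in1_scale, float out_scale, float scale, float out_offset)
{
    const float multiplier = ((in0_scale * in1_scale) / out_scale) * scale;
    if (!fits_q14_18(multiplier))
    {
        return false;
    }
    // 256 * 256 bounds the magnitude of the offset-removed 8-bit product.
    const float max_result = multiplier * 256.f * 256.f + out_offset;
    return max_result <= 8191.f;
}

When the test passes, the multiplier is encoded as int32_t(multiplier * 2^18) and applied with integer multiply-accumulate instructions, which is what the 14p18-suffixed variables in the kernel body represent.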
@@ -392,78 +410,76 @@ void mul_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *d Iterator out_it(dst, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto a_ptr = reinterpret_cast(a_input_it.ptr()); - const auto b_ptr = reinterpret_cast(b_input_it.ptr()); - const auto out_ptr = reinterpret_cast(out_it.ptr()); - - const auto b_val = *b_ptr; - const auto b_offseted_32p0 = static_cast(b_val - b_offset_16p0); - const auto b_voffseted_32p0 = wrapper::vdup_n(b_offseted_32p0, wrapper::traits::vector_128_tag()); + win, + [&](const Coordinates &) + { + const auto a_ptr = reinterpret_cast(a_input_it.ptr()); + const auto b_ptr = reinterpret_cast(b_input_it.ptr()); + const auto out_ptr = reinterpret_cast(out_it.ptr()); - const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag()); - const auto voffsetout_14p18 = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag()); + const auto b_val = *b_ptr; + const auto b_offseted_32p0 = static_cast(b_val - b_offset_16p0); + const auto b_voffseted_32p0 = wrapper::vdup_n(b_offseted_32p0, wrapper::traits::vector_128_tag()); - int x = window_start_x; + const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag()); + const auto voffsetout_14p18 = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag()); - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Load the inputs. - const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x); - - // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness. - const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0))); - const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0))); - - const auto voffseted_32p0_00 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_0), a_voffset_16p0); - const auto voffseted_32p0_01 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_0), a_voffset_16p0); - const auto voffseted_32p0_10 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_1), a_voffset_16p0); - const auto voffseted_32p0_11 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_1), a_voffset_16p0); - - const auto vinnermul_32p0_00 = wrapper::vmul(voffseted_32p0_00, b_voffseted_32p0); - const auto vinnermul_32p0_01 = wrapper::vmul(voffseted_32p0_01, b_voffseted_32p0); - const auto vinnermul_32p0_10 = wrapper::vmul(voffseted_32p0_10, b_voffseted_32p0); - const auto vinnermul_32p0_11 = wrapper::vmul(voffseted_32p0_11, b_voffseted_32p0); - - const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18); - const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18); - const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18); - const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18); - - // These shift rights are to revert the multiplication by twopwr18. Hard limit of a maximum shift by 8 requires multiple shift instructions to achieve this. 
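            // Undoing the Q14.18 scaling needs an arithmetic shift right by 18, but the
            // rounding/narrowing shift intrinsics used here accept an immediate of at
            // most 8. The shift is therefore split as 18 = 8 + 8 + 2: a plain
            // vshrq_n<8>, a saturating rounding narrow by 8 (vqrshrn_ex<8>), and a
            // final saturating rounding narrow by 2 (vqrshrn<2>) that also brings the
            // result down to the 8-bit output lane width.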
- const auto vout_15p1_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00)); - const auto vout_15p1_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01)); - const auto vout_15p1_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10)); - const auto vout_15p1_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11)); - - const auto vout_15p1_0 = wrapper::vcombine( - vout_15p1_00, - vout_15p1_01); - - const auto vout_15p1_1 = wrapper::vcombine( - vout_15p1_10, - vout_15p1_11); - const auto out_ptr = reinterpret_cast(out_it.ptr()); + int x = window_start_x; - const auto vout_8p0 = wrapper::vcombine( - wrapper::vqrshrn<2>(vout_15p1_0), - wrapper::vqrshrn<2>(vout_15p1_1)); - wrapper::vstore(out_ptr + x, vout_8p0); - } + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Load the inputs. + const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x); + + // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness. + const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0))); + const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0))); + + const auto voffseted_32p0_00 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_0), a_voffset_16p0); + const auto voffseted_32p0_01 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_0), a_voffset_16p0); + const auto voffseted_32p0_10 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_1), a_voffset_16p0); + const auto voffseted_32p0_11 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_1), a_voffset_16p0); + + const auto vinnermul_32p0_00 = wrapper::vmul(voffseted_32p0_00, b_voffseted_32p0); + const auto vinnermul_32p0_01 = wrapper::vmul(voffseted_32p0_01, b_voffseted_32p0); + const auto vinnermul_32p0_10 = wrapper::vmul(voffseted_32p0_10, b_voffseted_32p0); + const auto vinnermul_32p0_11 = wrapper::vmul(voffseted_32p0_11, b_voffseted_32p0); + + const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18); + const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18); + const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18); + const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18); + + // These shift rights are to revert the multiplication by twopwr18. Hard limit of a maximum shift by 8 requires multiple shift instructions to achieve this. + const auto vout_15p1_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00)); + const auto vout_15p1_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01)); + const auto vout_15p1_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10)); + const auto vout_15p1_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11)); + + const auto vout_15p1_0 = wrapper::vcombine(vout_15p1_00, vout_15p1_01); + + const auto vout_15p1_1 = wrapper::vcombine(vout_15p1_10, vout_15p1_11); + const auto out_ptr = reinterpret_cast(out_it.ptr()); + + const auto vout_8p0 = + wrapper::vcombine(wrapper::vqrshrn<2>(vout_15p1_0), wrapper::vqrshrn<2>(vout_15p1_1)); + wrapper::vstore(out_ptr + x, vout_8p0); + } - //Process the left-over elements. - for(; x < window_end_x; ++x) - { + //Process the left-over elements. 
+ for (; x < window_end_x; ++x) + { #ifdef __aarch64__ - out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>((multiplier_14p18 * (int32_t(a_ptr[x]) - a_offset_16p0) * (int32_t( - b_val) - b_offset_16p0)) + out_offset_14p18))); + out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>( + (multiplier_14p18 * (int32_t(a_ptr[x]) - a_offset_16p0) * (int32_t(b_val) - b_offset_16p0)) + + out_offset_14p18))); #else //__aarch64__ - out_ptr[x] = utility::clamp(support::cpp11::lround(multiplier * ((float(a_ptr[x]) - a_offset) * (float(b_val) - b_offset)) + float(out_offset))); + out_ptr[x] = utility::clamp(support::cpp11::lround( + multiplier * ((float(a_ptr[x]) - a_offset) * (float(b_val) - b_offset)) + float(out_offset))); #endif //__aarch64__ - } - }, - a_input_it, b_input_it, out_it); + } + }, + a_input_it, b_input_it, out_it); } else { @@ -481,82 +497,83 @@ void mul_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *d Iterator out_it(dst, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto in0_ptr = reinterpret_cast(in0_it.ptr()); - const auto in1_ptr = reinterpret_cast(in1_it.ptr()); - const auto out_ptr = reinterpret_cast(out_it.ptr()); + win, + [&](const Coordinates &) + { + const auto in0_ptr = reinterpret_cast(in0_it.ptr()); + const auto in1_ptr = reinterpret_cast(in1_it.ptr()); + const auto out_ptr = reinterpret_cast(out_it.ptr()); - int x = window_start_x; + int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Load the inputs. - const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x); - const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x); - - // Widen the input elements to signed 16-bit regardless of the input signedness. 
- const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0))); - const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0))); - const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0))); - const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0))); - - const auto voffseted0_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_0), voffset0_16p0); - const auto voffseted0_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_0), voffset0_16p0); - const auto voffseted0_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_1), voffset0_16p0); - const auto voffseted0_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_1), voffset0_16p0); - - const auto voffseted1_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_0), voffset1_16p0); - const auto voffseted1_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_0), voffset1_16p0); - const auto voffseted1_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_1), voffset1_16p0); - const auto voffseted1_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_1), voffset1_16p0); - - const auto vinnermul_32p0_00 = wrapper::vmul(voffseted0_32p0_00, voffseted1_32p0_00); - const auto vinnermul_32p0_01 = wrapper::vmul(voffseted0_32p0_01, voffseted1_32p0_01); - const auto vinnermul_32p0_10 = wrapper::vmul(voffseted0_32p0_10, voffseted1_32p0_10); - const auto vinnermul_32p0_11 = wrapper::vmul(voffseted0_32p0_11, voffseted1_32p0_11); - - const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18); - const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18); - const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18); - const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18); - - // These shift rights are to revert the multiplication by twopwr18. Hard limit of a maximum shift by 8 requires multiple shift instructions to achieve this. - const auto vout_14p2_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00)); - const auto vout_14p2_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01)); - const auto vout_14p2_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10)); - const auto vout_14p2_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11)); - - const auto vout_14p2_0 = wrapper::vcombine( - vout_14p2_00, - vout_14p2_01); - - const auto vout_14p2_1 = wrapper::vcombine( - vout_14p2_10, - vout_14p2_11); - - const auto vout_8p0 = wrapper::vcombine( - wrapper::vqrshrn<2>(vout_14p2_0), - wrapper::vqrshrn<2>(vout_14p2_1)); - wrapper::vstore(out_ptr + x, vout_8p0); - } + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Load the inputs. + const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x); + const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x); + + // Widen the input elements to signed 16-bit regardless of the input signedness. 
+ const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0))); + const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0))); + const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0))); + const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0))); + + const auto voffseted0_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_0), voffset0_16p0); + const auto voffseted0_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_0), voffset0_16p0); + const auto voffseted0_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_1), voffset0_16p0); + const auto voffseted0_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_1), voffset0_16p0); + + const auto voffseted1_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_0), voffset1_16p0); + const auto voffseted1_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_0), voffset1_16p0); + const auto voffseted1_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_1), voffset1_16p0); + const auto voffseted1_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_1), voffset1_16p0); + + const auto vinnermul_32p0_00 = wrapper::vmul(voffseted0_32p0_00, voffseted1_32p0_00); + const auto vinnermul_32p0_01 = wrapper::vmul(voffseted0_32p0_01, voffseted1_32p0_01); + const auto vinnermul_32p0_10 = wrapper::vmul(voffseted0_32p0_10, voffseted1_32p0_10); + const auto vinnermul_32p0_11 = wrapper::vmul(voffseted0_32p0_11, voffseted1_32p0_11); + + const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18); + const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18); + const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18); + const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18); + + // These shift rights are to revert the multiplication by twopwr18. Hard limit of a maximum shift by 8 requires multiple shift instructions to achieve this. + const auto vout_14p2_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00)); + const auto vout_14p2_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01)); + const auto vout_14p2_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10)); + const auto vout_14p2_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11)); + + const auto vout_14p2_0 = wrapper::vcombine(vout_14p2_00, vout_14p2_01); + + const auto vout_14p2_1 = wrapper::vcombine(vout_14p2_10, vout_14p2_11); + + const auto vout_8p0 = + wrapper::vcombine(wrapper::vqrshrn<2>(vout_14p2_0), wrapper::vqrshrn<2>(vout_14p2_1)); + wrapper::vstore(out_ptr + x, vout_8p0); + } - //Process the left-over elements. - for(; x < window_end_x; ++x) - { + //Process the left-over elements. 
+ for (; x < window_end_x; ++x) + { #ifdef __aarch64__ - out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>((multiplier_14p18 * (int32_t(in0_ptr[x]) - in0_offset_16p0) * (int32_t( - in1_ptr[x]) - in1_offset_16p0)) + out_offset_14p18))); + out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>( + wrapper::vshrq_n<8>((multiplier_14p18 * (int32_t(in0_ptr[x]) - in0_offset_16p0) * + (int32_t(in1_ptr[x]) - in1_offset_16p0)) + + out_offset_14p18))); #else //__aarch64__ - out_ptr[x] = utility::clamp(support::cpp11::lround(multiplier * ((float(in0_ptr[x]) - in0_offset) * (float(in1_ptr[x]) - in1_offset)) + float(out_offset))); + out_ptr[x] = utility::clamp(support::cpp11::lround( + multiplier * ((float(in0_ptr[x]) - in0_offset) * (float(in1_ptr[x]) - in1_offset)) + + float(out_offset))); #endif //__aarch64__ - } - }, - in0_it, in1_it, out_it); + } + }, + in0_it, in1_it, out_it); } } -void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) +void mul_saturate_QSYMM16_QSYMM16_QSYMM16( + const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) { const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform(); const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform(); @@ -580,66 +597,61 @@ void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *src1, const ITensor *sr const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); - const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset }; + const UniformQuantizationInfo tmp_qua_info = {output_qua_info.scale / scale, output_qua_info.offset}; execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const qsymm16x8x2_t input1_q = + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const qsymm16x8x2_t input1_q = {{ vld1q_s16(input1_ptr + x), vld1q_s16(input1_ptr + x + 8), - } - }; - const qsymm16x8x2_t input2_q = - { - { + }}; + const qsymm16x8x2_t input2_q = {{ vld1q_s16(input2_ptr + x), vld1q_s16(input2_ptr + x + 8), - } - }; + }}; - // Dequantize inputs - const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info); - const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info); + // Dequantize inputs + const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info); + const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info); - const float32x4x4_t out_f32x4x4 = - { - vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), - vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), - vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), - vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), - }; - - const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info); - 
vst1q_s16(output_ptr + x, result.val[0]); - vst1q_s16(output_ptr + x + 8, result.val[1]); - } + const float32x4x4_t out_f32x4x4 = { + vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), + vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), + vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), + vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), + }; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - // Dequantize inputs - float tmp_in1 = static_cast(*(input1_ptr + x)) * input1_qua_info.scale; - float tmp_in2 = static_cast(*(input2_ptr + x)) * input2_qua_info.scale; - float tmp_f = tmp_in1 * tmp_in2; - - // Quantize dst, lrintf() has same rounding mode as vcombine_s16 - int32_t tmp = lrintf(tmp_f / tmp_qua_info.scale); - qsymm16_t tmp_qua = static_cast(tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp); - *(output_ptr + x) = tmp_qua; - } - }, - input1, input2, dst); + const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info); + vst1q_s16(output_ptr + x, result.val[0]); + vst1q_s16(output_ptr + x + 8, result.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + // Dequantize inputs + float tmp_in1 = static_cast(*(input1_ptr + x)) * input1_qua_info.scale; + float tmp_in2 = static_cast(*(input2_ptr + x)) * input2_qua_info.scale; + float tmp_f = tmp_in1 * tmp_in2; + + // Quantize dst, lrintf() has same rounding mode as vcombine_s16 + int32_t tmp = lrintf(tmp_f / tmp_qua_info.scale); + qsymm16_t tmp_qua = + static_cast(tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp); + *(output_ptr + x) = tmp_qua; + } + }, + input1, input2, dst); } void mul_QSYMM16_QSYMM16_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int scale) @@ -665,74 +677,60 @@ void mul_QSYMM16_QSYMM16_S32(const ITensor *src1, const ITensor *src2, ITensor * const auto window_end_x = static_cast(window.x().end()); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const qsymm16x8x2_t input1_q = + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const qsymm16x8x2_t input1_q = {{ vld1q_s16(input1_ptr + x), vld1q_s16(input1_ptr + x + 8), - } - }; - const qsymm16x8x2_t input2_q = - { - { + }}; + const qsymm16x8x2_t input2_q = {{ vld1q_s16(input2_ptr + x), vld1q_s16(input2_ptr + x + 8), - } - }; + }}; - const int32x4x4_t in1_s32 = - { - { + const int32x4x4_t in1_s32 = {{ vmovl_s16(vget_low_s16(input1_q.val[0])), vmovl_s16(vget_high_s16(input1_q.val[0])), vmovl_s16(vget_low_s16(input1_q.val[1])), vmovl_s16(vget_high_s16(input1_q.val[1])), - } - }; - const int32x4x4_t in2_s32 = - { - { + }}; + const int32x4x4_t in2_s32 = {{ vmovl_s16(vget_low_s16(input2_q.val[0])), vmovl_s16(vget_high_s16(input2_q.val[0])), vmovl_s16(vget_low_s16(input2_q.val[1])), vmovl_s16(vget_high_s16(input2_q.val[1])), - } - }; + }}; - const int32x4x4_t result = - { - { + const int32x4x4_t result = {{ vmulq_s32(in1_s32.val[0], 
in2_s32.val[0]), vmulq_s32(in1_s32.val[1], in2_s32.val[1]), vmulq_s32(in1_s32.val[2], in2_s32.val[2]), vmulq_s32(in1_s32.val[3], in2_s32.val[3]), - } - }; + }}; - vst1q_s32(output_ptr + x, result.val[0]); - vst1q_s32(output_ptr + x + 4, result.val[1]); - vst1q_s32(output_ptr + x + 8, result.val[2]); - vst1q_s32(output_ptr + x + 12, result.val[3]); - } + vst1q_s32(output_ptr + x, result.val[0]); + vst1q_s32(output_ptr + x + 4, result.val[1]); + vst1q_s32(output_ptr + x + 8, result.val[2]); + vst1q_s32(output_ptr + x + 12, result.val[3]); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); - *(output_ptr + x) = tmp; - } - }, - input1, input2, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); + *(output_ptr + x) = tmp; + } + }, + input1, input2, dst); } template @@ -757,79 +755,80 @@ void mul_U8_U8_U8(const ITensor *src1, const ITensor *src2, ITensor *out, const const auto window_end_x = static_cast(window.x().end()); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x); - const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x); + const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x); - uint16x8_t tmp1_high = vmovl_u8(vget_high_u8(ta1)); - const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2)); - uint16x8_t tmp1_low = vmovl_u8(vget_low_u8(ta1)); - const uint16x8_t tmp2_low = vmovl_u8(vget_low_u8(ta2)); + uint16x8_t tmp1_high = vmovl_u8(vget_high_u8(ta1)); + const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2)); + uint16x8_t tmp1_low = vmovl_u8(vget_low_u8(ta1)); + const uint16x8_t tmp2_low = vmovl_u8(vget_low_u8(ta2)); - tmp1_high = vmulq_u16(tmp1_high, tmp2_high); - tmp1_low = vmulq_u16(tmp1_low, tmp2_low); + tmp1_high = vmulq_u16(tmp1_high, tmp2_high); + tmp1_low = vmulq_u16(tmp1_low, tmp2_low); - if(is_scale255) - { - tmp1_high = scale255_U16_U16(tmp1_high); - tmp1_low = scale255_U16_U16(tmp1_low); - } - else - { - const int16x8_t vn = vdupq_n_s16(-n); + if (is_scale255) + { + tmp1_high = scale255_U16_U16(tmp1_high); + tmp1_low = scale255_U16_U16(tmp1_low); + } + else + { + const int16x8_t vn = vdupq_n_s16(-n); - if(is_sat) + if (is_sat) + { + tmp1_high = vqshlq_u16(tmp1_high, vn); + tmp1_low = vqshlq_u16(tmp1_low, vn); + } + else + { + tmp1_high = vshlq_u16(tmp1_high, vn); + tmp1_low = vshlq_u16(tmp1_low, vn); + } + } + if (is_sat) { - tmp1_high = vqshlq_u16(tmp1_high, vn); - tmp1_low = vqshlq_u16(tmp1_low, vn); + vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high))); } else { - tmp1_high = vshlq_u16(tmp1_high, vn); - tmp1_low = vshlq_u16(tmp1_low, vn); + vst1q_u8(output_ptr + x, 
vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high))); } } - if(is_sat) - { - vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high))); - } - else - { - vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high))); - } - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - uint16_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); - - if(is_scale255) - { - float tmp_f = static_cast(tmp) * scale255_constant; - tmp = static_cast(tmp_f + 0.5f); - } - else - { - tmp >>= n; - } - if(is_sat && tmp > 255) + // Compute left-over elements + for (; x < window_end_x; ++x) { - tmp = 255; + uint16_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); + + if (is_scale255) + { + float tmp_f = static_cast(tmp) * scale255_constant; + tmp = static_cast(tmp_f + 0.5f); + } + else + { + tmp >>= n; + } + if (is_sat && tmp > 255) + { + tmp = 255; + } + *(output_ptr + x) = static_cast(tmp); } - *(output_ptr + x) = static_cast(tmp); - } - }, - input1, input2, dst); + }, + input1, input2, dst); } template @@ -843,7 +842,7 @@ inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t & tmp1_high = vmulq_s32(tmp1_high, tmp2_high); tmp1_low = vmulq_s32(tmp1_low, tmp2_low); - if(is_scale255) + if (is_scale255) { tmp1_high = scale255_S32_S32(tmp1_high); tmp1_low = scale255_S32_S32(tmp1_low); @@ -863,7 +862,7 @@ inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t & const int32x4_t sign_low_s = vreinterpretq_s32_u32(sign_low); const int32x4_t convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s); const int32x4_t convert_low = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s); - if(is_sat) + if (is_sat) { tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn); tmp1_low = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn); @@ -875,7 +874,7 @@ inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t & } } - if(is_sat) + if (is_sat) { return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high)); } @@ -888,15 +887,10 @@ inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t & template inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &src1, const int16x8x2_t &src2, int n) { - const int16x8x2_t result = - { - { - // First 8 elements - mul_S16_S16_S16_n_loop(src1.val[0], src2.val[0], n), - // Second 8 elements - mul_S16_S16_S16_n_loop(src1.val[1], src2.val[1], n) - } - }; + const int16x8x2_t result = {{// First 8 elements + mul_S16_S16_S16_n_loop(src1.val[0], src2.val[0], n), + // Second 8 elements + mul_S16_S16_S16_n_loop(src1.val[1], src2.val[1], n)}}; return result; } @@ -923,67 +917,62 @@ void mul_S16_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, con const auto window_end_x = static_cast(window.x().end()); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const int16x8x2_t ta1 = - { - { - vld1q_s16(input1_ptr + x), - vld1q_s16(input1_ptr + x + 8), - } - }; - const int16x8x2_t ta2 = - { - { - vld1q_s16(input2_ptr + x), - vld1q_s16(input2_ptr + x + 8), - } - }; - const int16x8x2_t result = mul_S16_S16_S16_n_k(ta1, ta2, n); - - 
vst1q_s16(output_ptr + x, result.val[0]); - vst1q_s16(output_ptr + x + 8, result.val[1]); - } + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8x2_t ta1 = {{ + vld1q_s16(input1_ptr + x), + vld1q_s16(input1_ptr + x + 8), + }}; + const int16x8x2_t ta2 = {{ + vld1q_s16(input2_ptr + x), + vld1q_s16(input2_ptr + x + 8), + }}; + const int16x8x2_t result = mul_S16_S16_S16_n_k(ta1, ta2, n); + + vst1q_s16(output_ptr + x, result.val[0]); + vst1q_s16(output_ptr + x + 8, result.val[1]); + } - if(is_scale255) + // Compute left-over elements + for (; x < window_end_x; ++x) { - float tmp_f = static_cast(tmp) * scale255_constant; + int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); - tmp = static_cast(tmp_f + 0.5f); - } - else - { - if(tmp >= 0) + if (is_scale255) { - tmp >>= n; + float tmp_f = static_cast(tmp) * scale255_constant; + + tmp = static_cast(tmp_f + 0.5f); } else { - uint32_t mask = (1u << n) - 1; - tmp = (tmp + static_cast(mask)) >> n; + if (tmp >= 0) + { + tmp >>= n; + } + else + { + uint32_t mask = (1u << n) - 1; + tmp = (tmp + static_cast(mask)) >> n; + } } + if (is_sat) + { + tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp); + } + *(output_ptr + x) = static_cast(tmp); } - if(is_sat) - { - tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp); - } - *(output_ptr + x) = static_cast(tmp); - } - }, - input1, input2, dst); + }, + input1, input2, dst); } template @@ -1012,7 +1001,7 @@ inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &src1, const int32x4_t & const uint64x2_t sign_2 = vshrq_n_u64(tmp_2_u, 63); const int64x2_t sign_2_s = vreinterpretq_s64_u64(sign_2); const int64x2_t convert_2 = vsubq_s64(vshlq_s64(sign_2_s, vnl), sign_2_s); - if(is_sat) + if (is_sat) { tmp_1 = vqshlq_s64(vaddq_s64(tmp_1, convert_1), vn); tmp_2 = vqshlq_s64(vaddq_s64(tmp_2, convert_2), vn); @@ -1029,15 +1018,10 @@ inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &src1, const int32x4_t & template inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &src1, const int32x4x2_t &src2, int n) { - const int32x4x2_t result = - { - { - // First 4 elements - mul_S32_S32_S32_n_loop(src1.val[0], src2.val[0], n), - // Second 4 elements - mul_S32_S32_S32_n_loop(src1.val[1], src2.val[1], n) - } - }; + const int32x4x2_t result = {{// First 4 elements + mul_S32_S32_S32_n_loop(src1.val[0], src2.val[0], n), + // Second 4 elements + mul_S32_S32_S32_n_loop(src1.val[1], src2.val[1], n)}}; return result; } @@ -1058,7 +1042,7 @@ void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, con const auto window_end_x = static_cast(window.x().end()); const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? 
input2_win : input1_win; @@ -1074,60 +1058,56 @@ void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, con Iterator dst(out, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); - const int32_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = vdupq_n_s32(broadcast_value); + const int32_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const auto broadcast_value_vec = vdupq_n_s32(broadcast_value); - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int32x4x2_t broadcast_v = - { - { - broadcast_value_vec, - broadcast_value_vec, - } - }; - const int32x4x2_t non_broadcast_v = + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const int32x4x2_t broadcast_v = {{ + broadcast_value_vec, + broadcast_value_vec, + }}; + const int32x4x2_t non_broadcast_v = {{ vld1q_s32(non_broadcast_input_ptr + x), vld1q_s32(non_broadcast_input_ptr + x + 4), - } - }; - const int32x4x2_t result = mul_S32_S32_S32_n_k(broadcast_v, non_broadcast_v, n); - - vst1q_s32(output_ptr + x, result.val[0]); - vst1q_s32(output_ptr + x + 4, result.val[1]); - } + }}; + const int32x4x2_t result = mul_S32_S32_S32_n_k(broadcast_v, non_broadcast_v, n); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int64_t tmp = static_cast(broadcast_value) * static_cast(*(non_broadcast_input_ptr + x)); - - if(tmp >= 0) - { - tmp >>= n; - } - else - { - uint64_t mask = ((uint64_t)1u << n) - 1; - tmp = (tmp + static_cast(mask)) >> n; + vst1q_s32(output_ptr + x, result.val[0]); + vst1q_s32(output_ptr + x + 4, result.val[1]); } - if(is_sat) + + // Compute left-over elements + for (; x < window_end_x; ++x) { - tmp = utility::clamp(tmp); + int64_t tmp = + static_cast(broadcast_value) * static_cast(*(non_broadcast_input_ptr + x)); + + if (tmp >= 0) + { + tmp >>= n; + } + else + { + uint64_t mask = ((uint64_t)1u << n) - 1; + tmp = (tmp + static_cast(mask)) >> n; + } + if (is_sat) + { + tmp = utility::clamp(tmp); + } + *(output_ptr + x) = static_cast(tmp); } - *(output_ptr + x) = static_cast(tmp); - } - }, - broadcast_input, non_broadcast_input, dst); + }, + broadcast_input, non_broadcast_input, dst); } else { @@ -1140,58 +1120,53 @@ void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, con Iterator dst(out, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const int32x4x2_t ta1 = + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - 
window_step_x); x += window_step_x) + { + const int32x4x2_t ta1 = {{ + vld1q_s32(input1_ptr + x), + vld1q_s32(input1_ptr + x + 4), + }}; + const int32x4x2_t ta2 = {{ + vld1q_s32(input2_ptr + x), + vld1q_s32(input2_ptr + x + 4), + }}; + const int32x4x2_t result = mul_S32_S32_S32_n_k(ta1, ta2, n); + + vst1q_s32(output_ptr + x, result.val[0]); + vst1q_s32(output_ptr + x + 4, result.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) { + int64_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); + + if (tmp >= 0) { - vld1q_s32(input1_ptr + x), - vld1q_s32(input1_ptr + x + 4), + tmp >>= n; } - }; - const int32x4x2_t ta2 = - { + else { - vld1q_s32(input2_ptr + x), - vld1q_s32(input2_ptr + x + 4), + uint64_t mask = ((uint64_t)1u << n) - 1; + tmp = (tmp + static_cast(mask)) >> n; } - }; - const int32x4x2_t result = mul_S32_S32_S32_n_k(ta1, ta2, n); - - vst1q_s32(output_ptr + x, result.val[0]); - vst1q_s32(output_ptr + x + 4, result.val[1]); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int64_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); - - if(tmp >= 0) - { - tmp >>= n; - } - else - { - uint64_t mask = ((uint64_t)1u << n) - 1; - tmp = (tmp + static_cast(mask)) >> n; - } - if(is_sat) - { - tmp = utility::clamp(tmp); + if (is_sat) + { + tmp = utility::clamp(tmp); + } + *(output_ptr + x) = static_cast(tmp); } - *(output_ptr + x) = static_cast(tmp); - } - }, - input1, input2, dst); + }, + input1, input2, dst); } } @@ -1212,7 +1187,7 @@ void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, con using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? 
input2_win : input1_win; @@ -1228,32 +1203,33 @@ void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, con Iterator dst(out, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); - const float broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); - const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); + const float broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); + const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); - auto res = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec); - wrapper::vstore(output_ptr + x, res); - } + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + auto res = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec); + wrapper::vstore(output_ptr + x, res); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto non_broadcast_v = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; - } - }, - broadcast_input, non_broadcast_input, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; + } + }, + broadcast_input, non_broadcast_input, dst); } else { @@ -1266,32 +1242,33 @@ void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, con Iterator dst(out, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const auto ta1 = wrapper::vloadq(input1_ptr + x); - const auto ta2 = wrapper::vloadq(input2_ptr + x); - const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); - const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec); - wrapper::vstore(output_ptr + x, res); - } + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto ta1 = *(input1_ptr + x); - const auto ta2 = *(input2_ptr + x); - *(output_ptr + x) = ta1 * ta2 * scale; - } - }, - input1, input2, dst); + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { 
+ const auto ta1 = wrapper::vloadq(input1_ptr + x); + const auto ta2 = wrapper::vloadq(input2_ptr + x); + const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); + const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto ta1 = *(input1_ptr + x); + const auto ta2 = *(input2_ptr + x); + *(output_ptr + x) = ta1 * ta2 * scale; + } + }, + input1, input2, dst); } } @@ -1312,7 +1289,7 @@ void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -1328,48 +1305,49 @@ void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, Iterator dst(out, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); - const float broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const float broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq(non_broadcast_input_ptr + 2 * x); - float32x4_t b = vdupq_n_f32(broadcast_value); + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(non_broadcast_input_ptr + 2 * x); + float32x4_t b = vdupq_n_f32(broadcast_value); - const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f }; - const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); - const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); - const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{}); - const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{}); + const float32x4_t mask = {-1.0f, 1.0f, -1.0f, 1.0f}; + const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); + const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); + const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{}); + const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{}); - const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10); - const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11); + const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10); + const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11); - float32x4_t res = wrapper::vmul(tmp0, b); - b = wrapper::vmul(b, mask); + float32x4_t res = wrapper::vmul(tmp0, b); + b = wrapper::vmul(b, mask); - res = wrapper::vmla(res, tmp1, b); - wrapper::vstore(output_ptr + 2 * x, res); - } + res = wrapper::vmla(res, tmp1, b); + wrapper::vstore(output_ptr + 2 * x, res); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x); - const auto 
non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1); - auto res1 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1); - auto res2 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0); - *(output_ptr + 2 * x) = res1; - *(output_ptr + 2 * x + 1) = res2; - } - }, - broadcast_input, non_broadcast_input, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x); + const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1); + auto res1 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1); + auto res2 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0); + *(output_ptr + 2 * x) = res1; + *(output_ptr + 2 * x + 1) = res2; + } + }, + broadcast_input, non_broadcast_input, dst); } else { @@ -1382,51 +1360,52 @@ void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, Iterator dst(out, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x); - float32x4_t b = wrapper::vloadq(input2_ptr + 2 * x); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); - const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f }; - const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); - const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); - const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{}); - const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{}); + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x); + float32x4_t b = wrapper::vloadq(input2_ptr + 2 * x); - const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10); - const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11); + const float32x4_t mask = {-1.0f, 1.0f, -1.0f, 1.0f}; + const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); + const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); + const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{}); + const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{}); - float32x4_t res = wrapper::vmul(tmp0, b); + const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10); + const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11); - b = wrapper::vrev64(b); - b = wrapper::vmul(b, mask); + float32x4_t res = wrapper::vmul(tmp0, b); - res = wrapper::vmla(res, tmp1, b); - wrapper::vstore(output_ptr + 2 * x, res); - } + b = wrapper::vrev64(b); + b = wrapper::vmul(b, mask); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto a0 = *(input1_ptr + 2 * x); - const auto a1 = *(input1_ptr + 2 * x + 1); - const auto b0 = *(input2_ptr + 2 * x); - const auto b1 = *(input2_ptr + 2 * x + 1); - auto res1 = a0 * b0 - a1 * b1; - auto res2 = a0 * b1 
+ a1 * b0; - *(output_ptr + 2 * x) = res1; - *(output_ptr + 2 * x + 1) = res2; - } - }, - input1, input2, dst); + res = wrapper::vmla(res, tmp1, b); + wrapper::vstore(output_ptr + 2 * x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto a0 = *(input1_ptr + 2 * x); + const auto a1 = *(input1_ptr + 2 * x + 1); + const auto b0 = *(input2_ptr + 2 * x); + const auto b1 = *(input2_ptr + 2 * x + 1); + auto res1 = a0 * b0 - a1 * b1; + auto res2 = a0 * b1 + a1 * b0; + *(output_ptr + 2 * x) = res1; + *(output_ptr + 2 * x + 1) = res2; + } + }, + input1, input2, dst); } } @@ -1444,7 +1423,7 @@ void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, con const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -1457,48 +1436,40 @@ void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, con Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator dst(out, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - const auto broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const float16x8x2_t broadcast_value_vec = + win, + [&](const Coordinates &) { - { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + const auto broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const float16x8x2_t broadcast_value_vec = {{ vdupq_n_f16(broadcast_value), vdupq_n_f16(broadcast_value), - } - }; - const auto scale_vec = vdupq_n_f16(scale); - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float16x8x2_t non_broadcast_v = + }}; + const auto scale_vec = vdupq_n_f16(scale); + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const float16x8x2_t non_broadcast_v = {{ vld1q_f16(non_broadcast_input_ptr + x), vld1q_f16(non_broadcast_input_ptr + x + 8), - } - }; - const float16x8x2_t result = + }}; + const float16x8x2_t result = {{ + vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec), + vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec), + }}; + vst1q_f16(output_ptr + x, result.val[0]); + vst1q_f16(output_ptr + x + 8, result.val[1]); + } + // Compute left-over elements + for (; x < window_end_x; ++x) { - { - vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec), - vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec), - } - }; - vst1q_f16(output_ptr + x, result.val[0]); - vst1q_f16(output_ptr + x + 8, result.val[1]); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto non_broadcast_v = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; - } - }, - broadcast_input, non_broadcast_input, dst); + const auto non_broadcast_v = 
*(non_broadcast_input_ptr + x); + *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; + } + }, + broadcast_input, non_broadcast_input, dst); } else { @@ -1508,49 +1479,41 @@ void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, con Iterator input2(src2, input2_win); Iterator dst(out, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float16x8x2_t ta1 = + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_f16(input1_ptr + x), - vld1q_f16(input1_ptr + x + 8), - } - }; - const float16x8x2_t ta2 = - { - { - vld1q_f16(input2_ptr + x), - vld1q_f16(input2_ptr + x + 8), - } - }; - const float16x8_t scale_vec = vdupq_n_f16(scale); - const float16x8x2_t result = + const float16x8x2_t ta1 = {{ + vld1q_f16(input1_ptr + x), + vld1q_f16(input1_ptr + x + 8), + }}; + const float16x8x2_t ta2 = {{ + vld1q_f16(input2_ptr + x), + vld1q_f16(input2_ptr + x + 8), + }}; + const float16x8_t scale_vec = vdupq_n_f16(scale); + const float16x8x2_t result = {{ + vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec), + vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec), + }}; + vst1q_f16(output_ptr + x, result.val[0]); + vst1q_f16(output_ptr + x + 8, result.val[1]); + } + // Compute left-over elements + for (; x < window_end_x; ++x) { - { - vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec), - vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec), - } - }; - vst1q_f16(output_ptr + x, result.val[0]); - vst1q_f16(output_ptr + x + 8, result.val[1]); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto ta1 = *(input1_ptr + x); - const auto ta2 = *(input2_ptr + x); - *(output_ptr + x) = ta1 * ta2 * scale; - } - }, - input1, input2, dst); + const auto ta1 = *(input1_ptr + x); + const auto ta2 = *(input2_ptr + x); + *(output_ptr + x) = ta1 * ta2 * scale; + } + }, + input1, input2, dst); } } #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ @@ -1577,81 +1540,82 @@ void mul_U8_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const const auto window_end_x = static_cast(window.x().end()); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const uint8x16_t bv = wrapper::vloadq(input2_ptr + x); - const uint8x16_t av = wrapper::vloadq(input1_ptr + x); - - uint16x8_t tmp_low = vmovl_u8(vget_low_u8(av)); - uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av)); - tmp_low = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv))); - tmp_high = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv))); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = 
reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); - if(is_scale255) - { - tmp_low = scale255_U16_U16(tmp_low); - tmp_high = scale255_U16_U16(tmp_high); - } - else + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const int16x8_t vn = vdupq_n_s16(-n); + const uint8x16_t bv = wrapper::vloadq(input2_ptr + x); + const uint8x16_t av = wrapper::vloadq(input1_ptr + x); - if(is_sat) + uint16x8_t tmp_low = vmovl_u8(vget_low_u8(av)); + uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av)); + tmp_low = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv))); + tmp_high = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv))); + + if (is_scale255) { - tmp_low = vqshlq_u16(tmp_low, vn); - tmp_high = vqshlq_u16(tmp_high, vn); + tmp_low = scale255_U16_U16(tmp_low); + tmp_high = scale255_U16_U16(tmp_high); } else { - tmp_low = vshlq_u16(tmp_low, vn); - tmp_high = vshlq_u16(tmp_high, vn); + const int16x8_t vn = vdupq_n_s16(-n); + + if (is_sat) + { + tmp_low = vqshlq_u16(tmp_low, vn); + tmp_high = vqshlq_u16(tmp_high, vn); + } + else + { + tmp_low = vshlq_u16(tmp_low, vn); + tmp_high = vshlq_u16(tmp_high, vn); + } } - } - if(is_sat) - { - static const uint16x8_t max = vdupq_n_u16(SHRT_MAX); + if (is_sat) + { + static const uint16x8_t max = vdupq_n_u16(SHRT_MAX); - tmp_low = vminq_u16(tmp_low, max); - tmp_high = vminq_u16(tmp_high, max); + tmp_low = vminq_u16(tmp_low, max); + tmp_high = vminq_u16(tmp_high, max); + } + + vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low)); + vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high)); } - vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low)); - vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high)); - } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); + if (is_scale255) + { + float tmp_f = static_cast(tmp) * scale255_constant; + tmp = static_cast(tmp_f + 0.5f); + } + else + { + tmp >>= n; + } - if(is_scale255) - { - float tmp_f = static_cast(tmp) * scale255_constant; - tmp = static_cast(tmp_f + 0.5f); - } - else - { - tmp >>= n; - } + if (is_sat) + { + tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp; + } - if(is_sat) - { - tmp = (tmp > SHRT_MAX) ? 
SHRT_MAX : tmp; + *(output_ptr + x) = static_cast(tmp); } - - *(output_ptr + x) = static_cast(tmp); - } - }, - input1, input2, dst); + }, + input1, input2, dst); } template @@ -1676,75 +1640,65 @@ void mul_S16_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, cons const auto window_end_x = static_cast(window.x().end()); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const int16x8x2_t ta1 = - { - { - vld1q_s16(input1_ptr + x), - vld1q_s16(input1_ptr + x + 8), - } - }; - const uint8x8x2_t ta2u = + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const int16x8x2_t ta1 = {{ + vld1q_s16(input1_ptr + x), + vld1q_s16(input1_ptr + x + 8), + }}; + const uint8x8x2_t ta2u = {{ vld1_u8(input2_ptr + x), vld1_u8(input2_ptr + x + 8), - } - }; - const int16x8x2_t ta2 = - { - { - vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])), - vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1])) - } - }; - - const int16x8x2_t result = mul_S16_S16_S16_n_k(ta1, ta2, n); + }}; + const int16x8x2_t ta2 = { + {vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])), vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))}}; - vst1q_s16(output_ptr + x, result.val[0]); - vst1q_s16(output_ptr + x + 8, result.val[1]); - } + const int16x8x2_t result = mul_S16_S16_S16_n_k(ta1, ta2, n); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); + vst1q_s16(output_ptr + x, result.val[0]); + vst1q_s16(output_ptr + x + 8, result.val[1]); + } - if(is_scale255) + // Compute left-over elements + for (; x < window_end_x; ++x) { - float tmp_f = static_cast(tmp) * scale255_constant; + int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); - tmp = static_cast(tmp_f + 0.5f); - } - else - { - if(tmp >= 0) + if (is_scale255) { - tmp >>= n; + float tmp_f = static_cast(tmp) * scale255_constant; + + tmp = static_cast(tmp_f + 0.5f); } else { - uint32_t mask = (1u << n) - 1; - tmp = (tmp + static_cast(mask)) >> n; + if (tmp >= 0) + { + tmp >>= n; + } + else + { + uint32_t mask = (1u << n) - 1; + tmp = (tmp + static_cast(mask)) >> n; + } + } + if (is_sat) + { + tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp); } + *(output_ptr + x) = static_cast(tmp); } - if(is_sat) - { - tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? 
SHRT_MIN : tmp); - } - *(output_ptr + x) = static_cast(tmp); - } - }, - input1, input2, dst); + }, + input1, input2, dst); } template @@ -1755,7 +1709,12 @@ void mul_U8_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, cons } } // namespace -void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy) +void CpuMulKernel::configure(ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy) { ARM_COMPUTE_UNUSED(rounding_policy); ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); @@ -1775,7 +1734,7 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo * bool is_scale_255 = false; // Check and validate scaling factor - if(std::abs(scale - scale255_constant) < 0.00001f) + if (std::abs(scale - scale255_constant) < 0.00001f) { is_scale_255 = true; } @@ -1795,12 +1754,12 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo * const DataType dt_output = dst->data_type(); const bool is_sat = (overflow_policy == ConvertPolicy::SATURATE); - switch(dt_input1) + switch (dt_input1) { case DataType::QASYMM8: - if(dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8) + if (dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8) { - if(mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale)) + if (mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale)) { _func_quantized = &mul_q8_neon_fixedpoint; } @@ -1811,9 +1770,9 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo * } break; case DataType::QASYMM8_SIGNED: - if(dt_input2 == DataType::QASYMM8_SIGNED) + if (dt_input2 == DataType::QASYMM8_SIGNED) { - if(mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale)) + if (mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale)) { _func_quantized = &mul_q8_neon_fixedpoint; } @@ -1824,19 +1783,19 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo * } break; case DataType::QSYMM16: - if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16) + if (dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16) { _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16; } - else if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32) + else if (dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32) { _func_int = &mul_QSYMM16_QSYMM16_S32; } break; case DataType::S16: - if(DataType::U8 == dt_input2 && DataType::S16 == dt_output) + if (DataType::U8 == dt_input2 && DataType::S16 == dt_output) { - if(is_scale_255) + if (is_scale_255) { _func_int = is_sat ? &mul_S16_U8_S16 : &mul_S16_U8_S16; } @@ -1845,9 +1804,9 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo * _func_int = is_sat ? &mul_S16_U8_S16 : &mul_S16_U8_S16; } } - if(DataType::S16 == dt_input2 && DataType::S16 == dt_output) + if (DataType::S16 == dt_input2 && DataType::S16 == dt_output) { - if(is_scale_255) + if (is_scale_255) { _func_int = is_sat ? &mul_S16_S16_S16 : &mul_S16_S16_S16; } @@ -1858,15 +1817,15 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo * } break; case DataType::S32: - if(DataType::S32 == dt_input2 && DataType::S32 == dt_output) + if (DataType::S32 == dt_input2 && DataType::S32 == dt_output) { _func_int = is_sat ? 
&mul_S32_S32_S32 : &mul_S32_S32_S32; } break; case DataType::U8: - if(DataType::U8 == dt_input2 && DataType::U8 == dt_output) + if (DataType::U8 == dt_input2 && DataType::U8 == dt_output) { - if(is_scale_255) + if (is_scale_255) { _func_int = is_sat ? &mul_U8_U8_U8 : &mul_U8_U8_U8; } @@ -1875,9 +1834,9 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo * _func_int = is_sat ? &mul_U8_U8_U8 : &mul_U8_U8_U8; } } - else if(DataType::U8 == dt_input2 && DataType::S16 == dt_output) + else if (DataType::U8 == dt_input2 && DataType::S16 == dt_output) { - if(is_scale_255) + if (is_scale_255) { _func_int = is_sat ? &mul_U8_U8_S16 : &mul_U8_U8_S16; } @@ -1886,9 +1845,9 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo * _func_int = is_sat ? &mul_U8_U8_S16 : &mul_U8_U8_S16; } } - else if(DataType::S16 == dt_input2 && DataType::S16 == dt_output) + else if (DataType::S16 == dt_input2 && DataType::S16 == dt_output) { - if(is_scale_255) + if (is_scale_255) { _func_int = is_sat ? &mul_U8_S16_S16 : &mul_U8_S16_S16; } @@ -1922,20 +1881,20 @@ size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const ARM_COMPUTE_UNUSED(thread_count); #if defined(ENABLE_FP32_KERNELS) - if(this->_func_float == &mul_F32_F32_F32) + if (this->_func_float == &mul_F32_F32_F32) { size_t mws = ICPPKernel::default_mws; - if(platform.get_cpu_model() == CPUModel::N1) + if (platform.get_cpu_model() == CPUModel::N1) { mws = default_mws_N1_fp32_neon; } - else if(platform.get_cpu_model() == CPUModel::V1) + else if (platform.get_cpu_model() == CPUModel::V1) { mws = default_mws_V1_fp32_neon; } else { - if(_split_dimension == Window::DimX) + if (_split_dimension == Window::DimX) { // Don't split the work load too small if the tensor has been reinterpreted as 1D. // This number is loosely chosen as threading overhead in each platform varies wildly. @@ -1945,7 +1904,7 @@ size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const } // tensor is 1D or was re-interpreted as 1D - if(this->window().shape().num_dimensions() == 1) + if (this->window().shape().num_dimensions() == 1) { return mws; } @@ -1958,10 +1917,10 @@ size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const return std::max(static_cast(1), mws); } } -#else /* ENABLE_FP32_KERNELS */ +#else /* ENABLE_FP32_KERNELS */ ARM_COMPUTE_UNUSED(platform); #endif /* ENABLE_FP32_KERNELS */ - if(_split_dimension == Window::DimX) + if (_split_dimension == Window::DimX) { // Don't split the work load too small if the tensor has been reinterpreted as 1D. // This number is loosely chosen as threading overhead in each platform varies wildly. 
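
For orientation while reviewing the multiply hunks above, the following is a minimal scalar sketch of the arithmetic the left-over-element loops implement: multiply, then scale either by roughly 1/255 (adding 0.5 and truncating, as the kernel does) or by 1/2^n via an arithmetic shift (the +mask adjustment makes negative products truncate toward zero like positive ones), and finally saturate when the overflow policy asks for it. The helper name and signature below are illustrative only and are not part of this patch.

#include <algorithm>
#include <climits>
#include <cstdint>

// Scalar reference for the S16 * S16 -> S16 path on left-over elements.
// Illustrative sketch; the patch's NEON code applies the same steps on
// whole vectors (vqshlq/vshlq, vqmovn/vmovn) instead of per element.
inline int16_t mul_scale_s16_ref(int16_t a, int16_t b, int n, bool is_scale255, bool is_sat)
{
    int32_t tmp = static_cast<int32_t>(a) * static_cast<int32_t>(b);

    if (is_scale255)
    {
        // Mirrors the kernel: scale by 1/255, add 0.5 and truncate.
        tmp = static_cast<int32_t>(static_cast<float>(tmp) * (1.f / 255.f) + 0.5f);
    }
    else if (tmp >= 0)
    {
        tmp >>= n;
    }
    else
    {
        // Adding (2^n - 1) before the shift makes negative products
        // truncate toward zero, matching the positive branch.
        const int32_t mask = static_cast<int32_t>((1u << n) - 1);
        tmp = (tmp + mask) >> n;
    }

    if (is_sat)
    {
        tmp = std::min<int32_t>(std::max<int32_t>(tmp, SHRT_MIN), SHRT_MAX);
    }
    return static_cast<int16_t>(tmp);
}

The templated kernel variants selected in configure() above bake the scale255 and saturation choices in as compile-time parameters instead of branching per element at run time.
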
@@ -1970,8 +1929,12 @@ size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const return default_mws; } -Status CpuMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, - RoundingPolicy rounding_policy) +Status CpuMulKernel::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy)); @@ -1989,11 +1952,11 @@ void CpuMulKernel::run_op(ITensorPack &tensors, const Window &window, const Thre auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1); auto dst = tensors.get_tensor(TensorType::ACL_DST); - if(_func_quantized != nullptr) + if (_func_quantized != nullptr) { (*_func_quantized)(src1, src2, dst, window, _scale); } - else if(_func_int != nullptr) + else if (_func_int != nullptr) { (*_func_int)(src1, src2, dst, window, _scale_exponent); } @@ -2021,10 +1984,11 @@ Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *sr ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); // Validate in case of configured dst - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), + "Wrong shape for dst"); } return Status{}; diff --git a/src/cpu/kernels/CpuMulKernel.h b/src/cpu/kernels/CpuMulKernel.h index 9e4a37110b5f7e76c75ce4aa6158c0b3036f5410..7eaf28750761b9a43828854906c1f0ac5a0459cb 100644 --- a/src/cpu/kernels/CpuMulKernel.h +++ b/src/cpu/kernels/CpuMulKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_MUL_KERNEL_H #include "arm_compute/core/Rounding.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -68,17 +69,27 @@ public: * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype * @param[in] rounding_policy Rounding policy. 
*/ - void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); + void configure(ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuMulKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy); // Inherited methods overridden - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; /** Return minimum workload size of the relevant kernel @@ -108,7 +119,8 @@ private: * @param[in] window Region on which to execute the kernel * @param[in] scale Integer scale factor. */ - using MulFunctionInt = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, int scale); + using MulFunctionInt = + void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, int scale); /** Common signature for all the specialised multiplication functions with float scaling factor * * @param[in] src1 Src1 tensor object. @@ -117,7 +129,8 @@ private: * @param[in] window Region on which to execute the kernel * @param[in] scale Float scale factor. */ - using MulFunctionFloat = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale); + using MulFunctionFloat = + void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale); /** Common signature for all the specialised QASYMM8 multiplication functions with float scaling factor * * @param[in] src1 Src1 tensor object. @@ -127,14 +140,15 @@ private: * @param[in] scale Float scale factor. * */ - using MulFunctionQuantized = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale); + using MulFunctionQuantized = + void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale); - MulFunctionFloat *_func_float{ nullptr }; - MulFunctionInt *_func_int{ nullptr }; - MulFunctionQuantized *_func_quantized{ nullptr }; - float _scale{ 0 }; - int _scale_exponent{ 0 }; - size_t _split_dimension{ Window::DimY }; + MulFunctionFloat *_func_float{nullptr}; + MulFunctionInt *_func_int{nullptr}; + MulFunctionQuantized *_func_quantized{nullptr}; + float _scale{0}; + int _scale_exponent{0}; + size_t _split_dimension{Window::DimY}; }; /** Interface for the complex pixelwise multiplication kernel. 
*/ @@ -159,7 +173,7 @@ public: static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuPermuteKernel.cpp b/src/cpu/kernels/CpuPermuteKernel.cpp index d65e01103224e08da662ec426e52d5343499ef51..b444a25ff737c948f9d595b09de820f59bb10d1c 100644 --- a/src/cpu/kernels/CpuPermuteKernel.cpp +++ b/src/cpu/kernels/CpuPermuteKernel.cpp @@ -28,8 +28,9 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -48,56 +49,31 @@ namespace { inline bool is_permutation_supported(const PermutationVector &v) { - static const std::array permutations2 = - { - { - PermutationVector(0U, 1U), - PermutationVector(1U, 0U), - } - }; - static const std::array permutations3 = - { - { - PermutationVector(2U, 0U, 1U), - PermutationVector(1U, 2U, 0U), - PermutationVector(0U, 1U, 2U), - PermutationVector(0U, 2U, 1U), - PermutationVector(1U, 0U, 2U), - PermutationVector(2U, 1U, 0U), - } - }; - static const std::array permutations4 = - { - { - PermutationVector(0U, 1U, 2U, 3U), - PermutationVector(1U, 0U, 2U, 3U), - PermutationVector(2U, 0U, 1U, 3U), - PermutationVector(0U, 2U, 1U, 3U), - PermutationVector(1U, 2U, 0U, 3U), - PermutationVector(2U, 1U, 0U, 3U), - PermutationVector(2U, 1U, 3U, 0U), - PermutationVector(1U, 2U, 3U, 0U), - PermutationVector(3U, 2U, 1U, 0U), - PermutationVector(2U, 3U, 1U, 0U), - PermutationVector(1U, 3U, 2U, 0U), - PermutationVector(3U, 1U, 2U, 0U), - PermutationVector(3U, 0U, 2U, 1U), - PermutationVector(0U, 3U, 2U, 1U), - PermutationVector(2U, 3U, 0U, 1U), - PermutationVector(3U, 2U, 0U, 1U), - PermutationVector(0U, 2U, 3U, 1U), - PermutationVector(2U, 0U, 3U, 1U), - PermutationVector(1U, 0U, 3U, 2U), - PermutationVector(0U, 1U, 3U, 2U), - PermutationVector(3U, 1U, 0U, 2U), - PermutationVector(1U, 3U, 0U, 2U), - PermutationVector(0U, 3U, 1U, 2U), - PermutationVector(3U, 0U, 1U, 2U) - } - }; + static const std::array permutations2 = {{ + PermutationVector(0U, 1U), + PermutationVector(1U, 0U), + }}; + static const std::array permutations3 = {{ + PermutationVector(2U, 0U, 1U), + PermutationVector(1U, 2U, 0U), + PermutationVector(0U, 1U, 2U), + PermutationVector(0U, 2U, 1U), + PermutationVector(1U, 0U, 2U), + PermutationVector(2U, 1U, 0U), + }}; + static const std::array permutations4 = { + {PermutationVector(0U, 1U, 2U, 3U), PermutationVector(1U, 0U, 2U, 3U), PermutationVector(2U, 0U, 1U, 3U), + PermutationVector(0U, 2U, 1U, 3U), PermutationVector(1U, 2U, 0U, 3U), PermutationVector(2U, 1U, 0U, 3U), + PermutationVector(2U, 1U, 3U, 0U), PermutationVector(1U, 2U, 3U, 0U), PermutationVector(3U, 2U, 1U, 0U), + PermutationVector(2U, 3U, 1U, 0U), PermutationVector(1U, 3U, 2U, 0U), PermutationVector(3U, 1U, 2U, 0U), + PermutationVector(3U, 0U, 2U, 1U), PermutationVector(0U, 3U, 2U, 1U), PermutationVector(2U, 3U, 0U, 1U), + PermutationVector(3U, 2U, 0U, 1U), PermutationVector(0U, 2U, 3U, 1U), PermutationVector(2U, 0U, 3U, 1U), + PermutationVector(1U, 0U, 3U, 2U), 
PermutationVector(0U, 1U, 3U, 2U), PermutationVector(3U, 1U, 0U, 2U), + PermutationVector(1U, 3U, 0U, 2U), PermutationVector(0U, 3U, 1U, 2U), PermutationVector(3U, 0U, 1U, 2U)}}; - return (permutations2.end() != std::find(permutations2.begin(), permutations2.end(), v)) || (permutations3.end() != std::find(permutations3.begin(), permutations3.end(), v)) - || (permutations4.end() != std::find(permutations4.begin(), permutations4.end(), v)); + return (permutations2.end() != std::find(permutations2.begin(), permutations2.end(), v)) || + (permutations3.end() != std::find(permutations3.begin(), permutations3.end(), v)) || + (permutations4.end() != std::find(permutations4.begin(), permutations4.end(), v)); } Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm) @@ -108,7 +84,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm); // Validate configured destination - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); @@ -128,18 +104,22 @@ void run_permute(const Window &window, const ITensor *src, const ITensor *dst, c // we only support these two configs in src/core/NEON/kernels/convolution/common/shims.hpp, for all others // we have to fall back to C++ - if((src_layout == DataLayout::NCHW && perm == PermutationVector{ 2U, 0U, 1U }) || (src_layout == DataLayout::NHWC && perm == PermutationVector{ 1U, 2U, 0U })) + if ((src_layout == DataLayout::NCHW && perm == PermutationVector{2U, 0U, 1U}) || + (src_layout == DataLayout::NHWC && perm == PermutationVector{1U, 2U, 0U})) { - window_src.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), window.x().end() - window.x().start())); - window_src.set(Window::DimY, Window::Dimension(window.y().start(), window.y().end(), window.y().end() - window.y().start())); - window_src.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), window.z().end() - window.z().start())); + window_src.set(Window::DimX, + Window::Dimension(window.x().start(), window.x().end(), window.x().end() - window.x().start())); + window_src.set(Window::DimY, + Window::Dimension(window.y().start(), window.y().end(), window.y().end() - window.y().start())); + window_src.set(Window::DimZ, + Window::Dimension(window.z().start(), window.z().end(), window.z().end() - window.z().start())); window_src.set(3, Window::Dimension(window[3].start(), window[3].end(), window[3].end() - window[3].start())); } // Destination window Window window_dst(window); const Window::Dimension zero_window = Window::Dimension(0, 0, 0); - for(size_t d = 0; d <= dst->info()->num_dimensions(); ++d) + for (size_t d = 0; d <= dst->info()->num_dimensions(); ++d) { window_dst.set(d, zero_window); } @@ -157,7 +137,7 @@ void run_permute(const Window &window, const ITensor *src, const ITensor *dst, c int n_channels = 0; int n_batches = 0; - switch(src_layout) + switch (src_layout) { case DataLayout::NCHW: { @@ -189,38 +169,42 @@ void run_permute(const Window &window, const ITensor *src, const ITensor *dst, c } // CHW -> HWC - if(src_layout == DataLayout::NCHW && perm == PermutationVector{ 2U, 0U, 1U }) + if (src_layout == DataLayout::NCHW && perm == PermutationVector{2U, 0U, 1U}) { const int out_channel_stride = dst->info()->strides_in_bytes().x() / 
sizeof(T); const int out_col_stride = dst->info()->strides_in_bytes().y() / sizeof(T); const int out_row_stride = dst->info()->strides_in_bytes().z() / sizeof(T); const int out_batch_stride = dst->info()->strides_in_bytes()[3] / sizeof(T); - execute_window_loop(window_src, [&](const Coordinates & id) - { - const int idx = id[0] * out_col_stride + id[1] * out_row_stride + id[2] * out_channel_stride; - reorder::nchw_to_nhwc(reinterpret_cast(src_it.ptr()), reinterpret_cast(dst_it.ptr()) + idx, - n_batches, n_channels, n_rows, n_cols, - in_batch_stride, in_channel_stride, in_row_stride, - out_batch_stride, out_row_stride, out_col_stride); - }, - src_it, dst_it); + execute_window_loop( + window_src, + [&](const Coordinates &id) + { + const int idx = id[0] * out_col_stride + id[1] * out_row_stride + id[2] * out_channel_stride; + reorder::nchw_to_nhwc(reinterpret_cast(src_it.ptr()), + reinterpret_cast(dst_it.ptr()) + idx, n_batches, n_channels, n_rows, n_cols, + in_batch_stride, in_channel_stride, in_row_stride, out_batch_stride, + out_row_stride, out_col_stride); + }, + src_it, dst_it); } // HWC -> CHW - else if(src_layout == DataLayout::NHWC && perm == PermutationVector{ 1U, 2U, 0U }) + else if (src_layout == DataLayout::NHWC && perm == PermutationVector{1U, 2U, 0U}) { const int out_col_stride = dst->info()->strides_in_bytes().x() / sizeof(T); const int out_row_stride = dst->info()->strides_in_bytes().y() / sizeof(T); const int out_channel_stride = dst->info()->strides_in_bytes().z() / sizeof(T); const int out_batch_stride = dst->info()->strides_in_bytes()[3] / sizeof(T); - execute_window_loop(window_src, [&](const Coordinates & id) - { - const int idx = id[0] * out_channel_stride + id[1] * out_col_stride + id[2] * out_row_stride; - reorder::nhwc_to_nchw(reinterpret_cast(src_it.ptr()), reinterpret_cast(dst_it.ptr()) + idx, - n_batches, n_rows, n_cols, n_channels, - in_batch_stride, in_row_stride, in_col_stride, - out_batch_stride, out_channel_stride, out_row_stride); - }, - src_it, dst_it); + execute_window_loop( + window_src, + [&](const Coordinates &id) + { + const int idx = id[0] * out_channel_stride + id[1] * out_col_stride + id[2] * out_row_stride; + reorder::nhwc_to_nchw(reinterpret_cast(src_it.ptr()), + reinterpret_cast(dst_it.ptr()) + idx, n_batches, n_rows, n_cols, n_channels, + in_batch_stride, in_row_stride, in_col_stride, out_batch_stride, + out_channel_stride, out_row_stride); + }, + src_it, dst_it); } else { @@ -230,12 +214,15 @@ void run_permute(const Window &window, const ITensor *src, const ITensor *dst, c Strides perm_strides = strides; permute_strides(perm_strides, perm); const int perm_stride_3 = src->info()->num_dimensions() >= 4 ? 
perm_strides[3] : 0; - execute_window_loop(window, [&](const Coordinates & id) - { - const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_stride_3; - *(reinterpret_cast(dst_it.ptr() + idx)) = *(reinterpret_cast(src_it.ptr())); - }, - src_it, dst_it); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int idx = + id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_stride_3; + *(reinterpret_cast(dst_it.ptr() + idx)) = *(reinterpret_cast(src_it.ptr())); + }, + src_it, dst_it); } } } // namespace @@ -275,7 +262,7 @@ void CpuPermuteKernel::run_op(ITensorPack &tensors, const Window &window, const const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); auto dst = tensors.get_tensor(TensorType::ACL_DST); - switch(src->info()->element_size()) + switch (src->info()->element_size()) { case 1: run_permute(window, src, dst, _perm); diff --git a/src/cpu/kernels/CpuPermuteKernel.h b/src/cpu/kernels/CpuPermuteKernel.h index 9e1b93318e089821130cbff5f37d47bda4178a76..0cb2faf22394a2956686f1ca98eddbd189f806c2 100644 --- a/src/cpu/kernels/CpuPermuteKernel.h +++ b/src/cpu/kernels/CpuPermuteKernel.h @@ -57,7 +57,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: diff --git a/src/cpu/kernels/CpuPool2dKernel.cpp b/src/cpu/kernels/CpuPool2dKernel.cpp index d72a41cbbe3aaffad176505060a43868820063f7..9308d860d1fa14e6f1badf2ef891df0ff133b10b 100644 --- a/src/cpu/kernels/CpuPool2dKernel.cpp +++ b/src/cpu/kernels/CpuPool2dKernel.cpp @@ -25,15 +25,17 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CPP/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" -#include "src/cpu/kernels/pool2d/neon/list.h" #include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/pool2d/neon/list.h" + #include namespace arm_compute @@ -46,99 +48,111 @@ namespace { using namespace misc::shape_calculator; -static const std::vector available_kernels = -{ - { - "neon_qu8_nhwc_poolMxN", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8)); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_qasymm8_neon_nhwc) - }, - { - "neon_qs8_nhwc_poolMxN", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8_SIGNED)); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_qasymm8_signed_neon_nhwc) - }, - { - "neon_f16_nhwc_poolMxN", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F16)) && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nhwc) - }, - { - "neon_fp32_nhwc_poolMxN", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F32)); }, - 
REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nhwc) - }, +static const std::vector available_kernels = { + {"neon_qu8_nhwc_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8)); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_qasymm8_neon_nhwc)}, + {"neon_qs8_nhwc_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8_SIGNED)); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_qasymm8_signed_neon_nhwc)}, + {"neon_f16_nhwc_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F16)) && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nhwc)}, + {"neon_fp32_nhwc_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F32)); }, + REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nhwc)}, #if defined(ENABLE_NCHW_KERNELS) - { - "neon_qu8_nchw_pool2", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw) - }, - { - "neon_qu8_nchw_pool3", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw) - }, - { - "neon_qu8_nchw_poolMxN", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8)); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw) - }, - { - "neon_qs8_nchw_pool2", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw) - }, - { - "neon_qs8_nchw_pool3", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw) - }, - { - "neon_qs8_nchw_poolMxN", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED)); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw) - }, - { - "neon_fp16_nchw_pool2", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); }, - REGISTER_FP16_NEON(arm_compute::cpu::pooling2_fp16_neon_nchw) - }, - { - "neon_fp16_nchw_pool3", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); }, - 
REGISTER_FP16_NEON(arm_compute::cpu::pooling3_fp16_neon_nchw) - }, - { - "neon_fp16_nchw_poolMxN", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16)); }, - REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nchw) - }, - { - "neon_fp32_nchw_pool2", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); }, - REGISTER_FP32_NEON(arm_compute::cpu::pooling2_fp32_neon_nchw) - }, - { - "neon_fp32_nchw_pool3", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); }, - REGISTER_FP32_NEON(arm_compute::cpu::pooling3_fp32_neon_nchw) - }, - { - "neon_fp32_nchw_pool7", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 7)); }, - REGISTER_FP32_NEON(arm_compute::cpu::pooling7_fp32_neon_nchw) - }, - { - "neon_fp32_nchw_poolMxN", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32)); }, - REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nchw) - }, + {"neon_qu8_nchw_pool2", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); + }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw)}, + {"neon_qu8_nchw_pool3", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); + }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw)}, + {"neon_qu8_nchw_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8)); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw)}, + {"neon_qs8_nchw_pool2", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); + }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw)}, + {"neon_qs8_nchw_pool3", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); + }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw)}, + {"neon_qs8_nchw_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED)); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw)}, + {"neon_fp16_nchw_pool2", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); + }, + 
REGISTER_FP16_NEON(arm_compute::cpu::pooling2_fp16_neon_nchw)}, + {"neon_fp16_nchw_pool3", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); + }, + REGISTER_FP16_NEON(arm_compute::cpu::pooling3_fp16_neon_nchw)}, + {"neon_fp16_nchw_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16)); }, + REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nchw)}, + {"neon_fp32_nchw_pool2", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); + }, + REGISTER_FP32_NEON(arm_compute::cpu::pooling2_fp32_neon_nchw)}, + {"neon_fp32_nchw_pool3", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); + }, + REGISTER_FP32_NEON(arm_compute::cpu::pooling3_fp32_neon_nchw)}, + {"neon_fp32_nchw_pool7", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 7)); + }, + REGISTER_FP32_NEON(arm_compute::cpu::pooling7_fp32_neon_nchw)}, + {"neon_fp32_nchw_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32)); }, + REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nchw)}, #endif /* defined(ENABLE_NCHW_KERNELS) */ }; -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, - const ITensorInfo *indices, Size2D pool_size) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices, + Size2D pool_size) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(pool_size.x() == 0); @@ -150,65 +164,78 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const int output_height = 0; PoolingType pool_type = pool_info.pool_type; const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; - const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? 
src->data_layout() : pool_info.data_layout; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) - && (is_pool_region_entirely_outside_input(pool_info)), - "Pooling region that is entirely outside input tensor is unsupported for non-float types"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (!is_data_type_float(src->data_type())) && (is_pool_region_entirely_outside_input(pool_info)), + "Pooling region that is entirely outside input tensor is unsupported for non-float types"); - std::tie(output_width, output_height) = scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], - pool_size.x(), pool_size.y(), pool_info.pad_stride_info); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), "Calculated output dimension size is invalid"); + std::tie(output_width, output_height) = + scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], pool_size.x(), + pool_size.y(), pool_info.pad_stride_info); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), + "Calculated output dimension size is invalid"); TensorInfo out_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, dst->data_type())); std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - if(indices) + if (indices) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32, DataType::F16); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, + "Pooling indices only supported for MAX pooling method"); } - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(src->data_type())); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(src->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding() - && (src->data_layout() == DataLayout::NHWC), - "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + is_data_type_quantized(src->data_type()) && !pool_info.exclude_padding && + (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding() && + (src->data_layout() == DataLayout::NHWC), + "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types"); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info); - if(indices) + if (indices) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((pool_size != Size2D(2, 2)) && !pool_info.use_kernel_indices), "Pooling indices returning source tensor coordinates is only supported for pool size 2x2"); - 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.use_kernel_indices && (src->data_layout() != DataLayout::NHWC), "Pooling kernel indices only supported for NHWC"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + ((pool_size != Size2D(2, 2)) && !pool_info.use_kernel_indices), + "Pooling indices returning source tensor coordinates is only supported for pool size 2x2"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.use_kernel_indices && (src->data_layout() != DataLayout::NHWC), + "Pooling kernel indices only supported for NHWC"); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &out_info); } } - const auto *uk = CpuPool2dKernel::get_implementation(PoolDataTypeISASelectorData{ src->data_type(), src->data_layout(), pool_stride_x, pool_size, CPUInfo::get().get_isa() }); + const auto *uk = CpuPool2dKernel::get_implementation(PoolDataTypeISASelectorData{ + src->data_type(), src->data_layout(), pool_stride_x, pool_size, CPUInfo::get().get_isa()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, ITensorInfo *indices, const PoolingLayerInfo &pool_info, - unsigned int &num_elems_processed_per_iteration, - int pool_size_x, int pool_size_y) +std::pair validate_and_configure_window(ITensorInfo *src, + ITensorInfo *dst, + ITensorInfo *indices, + const PoolingLayerInfo &pool_info, + unsigned int &num_elems_processed_per_iteration, + int pool_size_x, + int pool_size_y) { // dst auto inizialitation if not yet initialized auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, pool_info))); - if(indices) + if (indices) { // Indices auto inizialitation if not yet initialized - auto_init_if_empty(*indices, (src->clone()->set_tensor_shape(compute_pool_shape(*src, - pool_info))) - .set_data_type(DataType::U32) /* we store the offset to the element */); + auto_init_if_empty(*indices, (src->clone()->set_tensor_shape(compute_pool_shape(*src, pool_info))) + .set_data_type(DataType::U32) /* we store the offset to the element */); } const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; @@ -219,20 +246,20 @@ std::pair validate_and_configure_window(ITensorInfo *src, ITenso const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); - const bool is_square = pool_size_x == pool_size_y; - const unsigned int pooled_w = dst->dimension(idx_width); - const unsigned int pooled_h = dst->dimension(idx_height); + const bool is_square = pool_size_x == pool_size_y; + const unsigned int pooled_w = dst->dimension(idx_width); + const unsigned int pooled_h = dst->dimension(idx_height); //If it's not squared and optimized will be executed the MxN num_elems_processed_per_iteration = 1; - if(is_square) + if (is_square) { - switch(src->data_type()) + switch (src->data_type()) { case DataType::QASYMM8: case DataType::QASYMM8_SIGNED: - switch(pool_size_x) + switch (pool_size_x) { case 2: num_elems_processed_per_iteration = (pool_stride_x == 2) ? 
8 : 15; @@ -261,18 +288,22 @@ std::pair validate_and_configure_window(ITensorInfo *src, ITenso bool window_changed = false; Window win{}; // Upper limit for the number of right/bottom border elements that are accessed - TensorShape dst_shape{ src->tensor_shape() }; + TensorShape dst_shape{src->tensor_shape()}; dst_shape.set(0, pooled_w); dst_shape.set(1, pooled_h); TensorInfo dst_info(src->clone()->set_tensor_shape(dst_shape)); win = calculate_max_window(dst_info, Steps(num_elems_processed_per_iteration)); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace -void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices) +void CpuPool2dKernel::configure(ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + ITensorInfo *indices) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; @@ -284,14 +315,15 @@ void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const Poolin const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); // Update pool size in case of global pooling - const Size2D pool_size( - is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width, - is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height); + const Size2D pool_size(is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width, + is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height); // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices, pool_size)); - const auto *uk = CpuPool2dKernel::get_implementation(PoolDataTypeISASelectorData{ src->data_type(), src->data_layout(), (int)pad_stride_info.stride().first, pool_size, CPUInfo::get().get_isa() }); + const auto *uk = CpuPool2dKernel::get_implementation( + PoolDataTypeISASelectorData{src->data_type(), src->data_layout(), (int)pad_stride_info.stride().first, + pool_size, CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON(uk == nullptr); // Set instance variables @@ -302,7 +334,7 @@ void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const Poolin _run_method = uk->ukernel; _name = std::string("CpuPool2dKernel").append("/").append(uk->name); - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { // Configure kernel window Window win = calculate_max_window(*dst, Steps()); @@ -311,14 +343,17 @@ void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const Poolin else { // Configure kernel window - auto win_config = validate_and_configure_window(src, dst, indices, pool_info, _num_elems_processed_per_iteration, - pool_size.x(), pool_size.y()); + auto win_config = validate_and_configure_window( + src, dst, indices, pool_info, _num_elems_processed_per_iteration, pool_size.x(), pool_size.y()); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICpuKernel::configure(win_config.second); } } -Status CpuPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +Status CpuPool2dKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) { 
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); @@ -336,9 +371,10 @@ Status CpuPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices, Size2D(pool_size_x, pool_size_y))); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), - (indices) ? indices->clone().get() : nullptr, pool_info, num_elems_processed_per_iteration, - pool_size_x, pool_size_y) - .first); + (indices) ? indices->clone().get() : nullptr, pool_info, + num_elems_processed_per_iteration, pool_size_x, + pool_size_y) + .first); return Status{}; } @@ -359,19 +395,20 @@ void CpuPool2dKernel::run_op(ITensorPack &tensors, const Window &window, const T const unsigned int pool_size = _pool_info.pool_size.width; Window window_src(window); - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { // Set step for src in x and y direction for the src unsigned int window_x_inc = 0; - switch(src->info()->data_type()) + switch (src->info()->data_type()) { case DataType::QASYMM8: case DataType::QASYMM8_SIGNED: { window_x_inc = pool_stride_x; - if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3) + if ((pool_size == 2 || pool_size == 3) && pool_stride_x < 3) { - window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration; + window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 + : _num_elems_processed_per_iteration; } break; } @@ -387,8 +424,10 @@ void CpuPool2dKernel::run_op(ITensorPack &tensors, const Window &window, const T ARM_COMPUTE_ERROR("Not supported"); } } - window_src.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc)); - window_src.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y)); + window_src.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, + window.x().end() * pool_stride_x, window_x_inc)); + window_src.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, + window.y().end() * pool_stride_y, pool_stride_y)); } else { diff --git a/src/cpu/kernels/CpuPool2dKernel.h b/src/cpu/kernels/CpuPool2dKernel.h index c952ea839d1c3d3eaa2c8db45170d3600ee276fe..859de8cc5fb28fbce743e13a83e68a63234aac6d 100644 --- a/src/cpu/kernels/CpuPool2dKernel.h +++ b/src/cpu/kernels/CpuPool2dKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_POOL2D_KERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -38,7 +39,8 @@ namespace kernels class CpuPool2dKernel : public ICpuKernel { private: - using PoolingKernelPtr = std::add_pointer::type; + using PoolingKernelPtr = std::add_pointer::type; public: CpuPool2dKernel() = default; @@ -52,17 +54,21 @@ public: * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. 
*/ - void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); + void + configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuPool2dKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices = nullptr); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct PoolingKernel @@ -76,11 +82,11 @@ public: private: PoolingLayerInfo _pool_info{}; - DataLayout _data_layout{ DataLayout::UNKNOWN }; - unsigned int _num_elems_processed_per_iteration{ 0 }; + DataLayout _data_layout{DataLayout::UNKNOWN}; + unsigned int _num_elems_processed_per_iteration{0}; Size2D _pool_size{}; int _pool_stride_x{}; - PoolingKernelPtr _run_method{ nullptr }; + PoolingKernelPtr _run_method{nullptr}; std::string _name{}; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuPool3dKernel.cpp b/src/cpu/kernels/CpuPool3dKernel.cpp index 4504f3f7c9dafe61f3de4d1eedb4fe4c4c4ae9bd..8b484d4e0b81013b15642c64d9ef2bb4a97dd57f 100644 --- a/src/cpu/kernels/CpuPool3dKernel.cpp +++ b/src/cpu/kernels/CpuPool3dKernel.cpp @@ -25,8 +25,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CPP/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/pool3d/list.h" @@ -41,39 +42,28 @@ namespace { using namespace misc::shape_calculator; -static const std::vector available_kernels = -{ - { - "neon_qu8_ndhwc_poolMxNxD", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_q8_pool3d) - }, - { - "neon_qs8_ndhwc_poolMxNxD", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_q8_signed_pool3d) - }, - { - "neon_fp16_ndhwc_poolMxNxD", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16 && data.isa.fp16); }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_pool3d) - }, - { - "neon_fp32_ndhwc_poolMxNxD", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_pool3d) - } -}; +static const std::vector available_kernels = { + {"neon_qu8_ndhwc_poolMxNxD", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_q8_pool3d)}, + {"neon_qs8_ndhwc_poolMxNxD", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_q8_signed_pool3d)}, + {"neon_fp16_ndhwc_poolMxNxD", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16 && data.isa.fp16); }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_pool3d)}, + 
{"neon_fp32_ndhwc_poolMxNxD", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_pool3d)}}; Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NDHWC, "Only NDHWC layout supported"); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) && (!pool_info.exclude_padding - && (pool_info.pool_type == PoolingType::AVG)), + ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) && + (!pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG)), "Exclude padding is unsupported for non-float types for Avg op"); const auto data_layout = src->data_layout(); @@ -97,21 +87,26 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const int output_height = 0; int output_depth = 0; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_3d_region_entirely_outside_input(pool_info), "Pooling region that is entirely outside input tensor is unsupported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_3d_region_entirely_outside_input(pool_info), + "Pooling region that is entirely outside input tensor is unsupported"); - std::tie(output_width, output_height, output_depth) = scaled_3d_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], src->tensor_shape()[idx_depth], - pool_size_x, pool_size_y, pool_size_z, pool_info); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1), "Calculated output dimension size is invalid"); + std::tie(output_width, output_height, output_depth) = + scaled_3d_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], + src->tensor_shape()[idx_depth], pool_size_x, pool_size_y, pool_size_z, pool_info); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1), + "Calculated output dimension size is invalid"); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); - TensorInfo out_info(TensorInfo(compute_pool3d_shape(src->tensor_shape(), pool_info), 1, dst->data_type(), DataLayout::NDHWC)); + TensorInfo out_info( + TensorInfo(compute_pool3d_shape(src->tensor_shape(), pool_info), 1, dst->data_type(), DataLayout::NDHWC)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info); } - const auto *uk = CpuPool3dKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() }); + const auto *uk = + CpuPool3dKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); return Status{}; @@ -136,12 +131,12 @@ void CpuPool3dKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const // Update pool size in case of global pooling const bool is_global_pooling = pool_info.is_global_pooling; - const Size3D pool_size( - is_global_pooling ? 
src->dimension(idx_width) : pool_info.pool_size.width, - is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height, - is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth); + const Size3D pool_size(is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width, + is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height, + is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth); - const auto *uk = CpuPool3dKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() }); + const auto *uk = + CpuPool3dKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON(uk == nullptr); // Set instance variables @@ -188,4 +183,4 @@ const std::vector &CpuPool3dKernel::get_availa } // namespace kernels } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuPool3dKernel.h b/src/cpu/kernels/CpuPool3dKernel.h index 437f2af7e431a0cdbf627d32bb9322db18987c3f..bd1ff61046d915c1f8382000b334deec12243e18 100644 --- a/src/cpu/kernels/CpuPool3dKernel.h +++ b/src/cpu/kernels/CpuPool3dKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_POOL3D_KERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -39,7 +40,8 @@ class CpuPool3dKernel : public ICpuKernel { private: /* Template function for Pooling 3D NDHWC */ - using Pooling3dKernelPtr = std::add_pointer::type; + using Pooling3dKernelPtr = + std::add_pointer::type; public: CpuPool3dKernel() = default; @@ -68,7 +70,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct Pooling3dKernel @@ -82,11 +84,11 @@ public: private: Pooling3dLayerInfo _pool_info{}; - Pooling3dKernelPtr _run_method{ nullptr }; + Pooling3dKernelPtr _run_method{nullptr}; std::string _name{}; }; } // namespace kernels } // namespace cpu } // namespace arm_compute -#endif /*ARM_COMPUTE_CPU_POOL3D_KERNEL_H */ \ No newline at end of file +#endif /*ARM_COMPUTE_CPU_POOL3D_KERNEL_H */ diff --git a/src/cpu/kernels/CpuQuantizeKernel.cpp b/src/cpu/kernels/CpuQuantizeKernel.cpp index 9700c623186e47e78411377cda8989e2dfc85222..5dde6808373b0912fc944317f66a57063c634eca 100644 --- a/src/cpu/kernels/CpuQuantizeKernel.cpp +++ b/src/cpu/kernels/CpuQuantizeKernel.cpp @@ -28,13 +28,13 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "src/core/CPP/Validate.h" #include #include @@ -53,9 +53,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, 
DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QSYMM8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QSYMM8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::QASYMM16); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); return Status{}; @@ -71,19 +73,15 @@ inline float32x4x4_t load_value(const T *input_ptr) template <> inline float32x4x4_t load_value(const float *input_ptr) { - return { wrapper::vloadq(input_ptr), - wrapper::vloadq(input_ptr + 4), - wrapper::vloadq(input_ptr + 8), - wrapper::vloadq(input_ptr + 12) }; + return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4), wrapper::vloadq(input_ptr + 8), + wrapper::vloadq(input_ptr + 12)}; } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC template <> inline float32x4x4_t load_value(const float16_t *input_ptr) { - return { vcvt_f32_f16(wrapper::vload(input_ptr)), - vcvt_f32_f16(wrapper::vload(input_ptr + 4)), - vcvt_f32_f16(wrapper::vload(input_ptr + 8)), - vcvt_f32_f16(wrapper::vload(input_ptr + 12)) }; + return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)), + vcvt_f32_f16(wrapper::vload(input_ptr + 8)), vcvt_f32_f16(wrapper::vload(input_ptr + 12))}; } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC @@ -113,26 +111,25 @@ void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - static const std::map quant_map = - { - { "op_QASYMM8_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_QASYMM8_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_QASYMM8_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16 }, + static const std::map quant_map = { + {"op_QASYMM8_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8}, + {"op_QASYMM8_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8}, + {"op_QASYMM8_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16}, - { "op_QASYMM8_SIGNED_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_QASYMM8_SIGNED_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_QASYMM8_SIGNED_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16 }, + {"op_QASYMM8_SIGNED_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8}, + {"op_QASYMM8_SIGNED_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8}, + {"op_QASYMM8_SIGNED_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16}, - { "op_F32_QSYMM8", &CpuQuantizeKernel::run_quantize_qsymm8 }, + {"op_F32_QSYMM8", &CpuQuantizeKernel::run_quantize_qsymm8}, - { "op_F32_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_F32_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_F32_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16 }, + {"op_F32_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8}, + {"op_F32_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8}, + {"op_F32_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16}, #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { "op_F16_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_F16_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_F16_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16 }, 
+ {"op_F16_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8}, + {"op_F16_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8}, + {"op_F16_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16}, #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/ }; @@ -142,7 +139,7 @@ void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) auto it = quant_map.find(function_to_call); - if(it == quant_map.end()) + if (it == quant_map.end()) { ARM_COMPUTE_ERROR("Unsupported combination of input and output data types"); } @@ -167,7 +164,7 @@ void CpuQuantizeKernel::run_quantize_qsymm8(const ITensor *src, ITensor *dst, co const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); - if(is_data_type_quantized_asymmetric(src->info()->data_type())) + if (is_data_type_quantized_asymmetric(src->info()->data_type())) { uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); } @@ -177,22 +174,24 @@ void CpuQuantizeKernel::run_quantize_qsymm8(const ITensor *src, ITensor *dst, co Iterator input(src, win_collapsed); Iterator output(dst, win_collapsed); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - auto input_ptr = reinterpret_cast(input.ptr()); - auto output_ptr = reinterpret_cast(output.ptr()); - int x = window_start_x; - for(; x <= (window_end_x - window_step); x += window_step) - { - wrapper::vstore(&output_ptr[x], vquantize_qasymm8(load_value(&input_ptr[x]), uqinfo)); - } - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - output_ptr[x] = quantize_qsymm8(input_ptr[x], dst->info()->quantization_info()); - } - }, - input, output); + auto input_ptr = reinterpret_cast(input.ptr()); + auto output_ptr = reinterpret_cast(output.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + wrapper::vstore(&output_ptr[x], vquantize_qasymm8(load_value(&input_ptr[x]), uqinfo)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + output_ptr[x] = quantize_qsymm8(input_ptr[x], dst->info()->quantization_info()); + } + }, + input, output); } template @@ -203,7 +202,7 @@ void CpuQuantizeKernel::run_quantize_qasymm8(const ITensor *src, ITensor *dst, c const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); - if(is_data_type_quantized_asymmetric(src->info()->data_type())) + if (is_data_type_quantized_asymmetric(src->info()->data_type())) { uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); } @@ -219,23 +218,25 @@ void CpuQuantizeKernel::run_quantize_qasymm8(const ITensor *src, ITensor *dst, c Iterator input(src, win_collapsed); Iterator output(dst, win_collapsed); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - auto input_ptr = reinterpret_cast(input.ptr()); - auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step); x += window_step) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - wrapper::vstore(&output_ptr[x], vquantize_qasymm8(load_value(&input_ptr[x]), uqinfo)); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - output_ptr[x] = Qasymm8QuantizationHelper::quantize(input_ptr[x], uqinfo, rounding_policy); - } - }, - input, output); + auto input_ptr = reinterpret_cast(input.ptr()); + auto 
output_ptr = reinterpret_cast(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + wrapper::vstore(&output_ptr[x], vquantize_qasymm8(load_value(&input_ptr[x]), uqinfo)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + output_ptr[x] = Qasymm8QuantizationHelper::quantize(input_ptr[x], uqinfo, rounding_policy); + } + }, + input, output); } template @@ -246,7 +247,7 @@ void CpuQuantizeKernel::run_quantize_qasymm16(const ITensor *src, ITensor *dst, const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); - if(is_data_type_quantized_asymmetric(src->info()->data_type())) + if (is_data_type_quantized_asymmetric(src->info()->data_type())) { uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); } @@ -262,25 +263,27 @@ void CpuQuantizeKernel::run_quantize_qasymm16(const ITensor *src, ITensor *dst, Iterator input(src, win_collapsed); Iterator output(dst, win_collapsed); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - auto input_ptr = reinterpret_cast(input.ptr()); - auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step); x += window_step) - { - uint16x8x2_t tmp = vquantize_qasymm16(load_value(&input_ptr[x]), uqinfo); - vst1q_u16(&output_ptr[x], tmp.val[0]); - vst1q_u16(&output_ptr[x + 8], tmp.val[1]); - } - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - output_ptr[x] = quantize_qasymm16(input_ptr[x], uqinfo, rounding_policy); - } - }, - input, output); + auto input_ptr = reinterpret_cast(input.ptr()); + auto output_ptr = reinterpret_cast(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + uint16x8x2_t tmp = vquantize_qasymm16(load_value(&input_ptr[x]), uqinfo); + vst1q_u16(&output_ptr[x], tmp.val[0]); + vst1q_u16(&output_ptr[x + 8], tmp.val[1]); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + output_ptr[x] = quantize_qasymm16(input_ptr[x], uqinfo, rounding_policy); + } + }, + input, output); } void CpuQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) diff --git a/src/cpu/kernels/CpuQuantizeKernel.h b/src/cpu/kernels/CpuQuantizeKernel.h index 2bc8105a11888fe07523f9f2146475435f78de35..d6714136da49669fee770031d7da805bc708a8cf 100644 --- a/src/cpu/kernels/CpuQuantizeKernel.h +++ b/src/cpu/kernels/CpuQuantizeKernel.h @@ -59,7 +59,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: @@ -67,7 +67,9 @@ private: * * @param[in] window Region on which to execute the kernel. */ - using QuantizeFunctionExecutorPtr = void (CpuQuantizeKernel::*)(const ITensor *src, ITensor *dst, const Window &window); + using QuantizeFunctionExecutorPtr = void (CpuQuantizeKernel::*)(const ITensor *src, + ITensor *dst, + const Window &window); /** Function to apply QASYMM8 or QASYMM8_SIGNED quantization on a tensor. * * @param[in] window Region on which to execute the kernel. 
@@ -84,7 +86,7 @@ private: template void run_quantize_qsymm8(const ITensor *src, ITensor *dst, const Window &window); - QuantizeFunctionExecutorPtr _func{ nullptr }; + QuantizeFunctionExecutorPtr _func{nullptr}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuReshapeKernel.cpp b/src/cpu/kernels/CpuReshapeKernel.cpp index 068ff07efa3fe2827a94b4c7118ae9ae5a2592b2..241e58fbce9957633d5267a732669f5ecf28855c 100644 --- a/src/cpu/kernels/CpuReshapeKernel.cpp +++ b/src/cpu/kernels/CpuReshapeKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,10 +29,10 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/INEKernel.h" -#include "src/core/helpers/AutoConfiguration.h" + +#include "src/core/helpers/Utils.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/INEKernel.h" #include @@ -51,7 +51,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - if(dst->tensor_shape().total_size() != 0) + if (dst->tensor_shape().total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); @@ -62,20 +62,111 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) } template -inline void reshape_tensor(const Window &window, const ITensor *src, ITensor *dst) +void reshape_tensor_per_element(const Window &window, const ITensor *src, ITensor *dst) +{ + const TensorShape &src_shape = src->info()->tensor_shape(); + const TensorShape &dst_shape = dst->info()->tensor_shape(); + + Iterator dst_it(dst, window); + + execute_window_loop( + window, + [&](const Coordinates &dst_coord) + { + Coordinates src_coord = index2coords(src_shape, coords2index(dst_shape, dst_coord)); + const auto output_ptr = dst->ptr_to_element(dst_coord); + const auto input_ptr = src->ptr_to_element(src_coord); + + *reinterpret_cast(output_ptr) = *reinterpret_cast(input_ptr); + }, + dst_it); +} + +void reshape_tensor_per_element_selector(const Window &window, const ITensor *src, ITensor *dst) +{ + switch (src->info()->data_type()) + { + case DataType::U8: + case DataType::S8: + case DataType::QSYMM8: + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + case DataType::QSYMM8_PER_CHANNEL: + reshape_tensor_per_element(window, src, dst); + break; + case DataType::U16: + case DataType::S16: + case DataType::F16: + reshape_tensor_per_element(window, src, dst); + break; + case DataType::U32: + case DataType::S32: + case DataType::F32: + reshape_tensor_per_element(window, src, dst); + break; + case DataType::U64: + case DataType::S64: + case DataType::F64: + reshape_tensor_per_element(window, src, dst); + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type!"); + } +} + +void reshape_tensor_per_row(const Window &window, const ITensor *src, ITensor *dst) { const TensorShape &src_shape = src->info()->tensor_shape(); const TensorShape &dst_shape = dst->info()->tensor_shape(); + Coordinates src_coord{}; Coordinates dst_coord{}; + const auto element_size = dst->info()->element_size(); + const auto window_start_x = static_cast(window.x().start()); + const 
auto window_end_x = static_cast(window.x().end()); + const auto src_row_size = static_cast(src_shape[0]); + const auto row_size_in_bytes = src_row_size * element_size; + + auto output_ptr = dst->ptr_to_element(dst_coord); + auto input_ptr = src->ptr_to_element(src_coord); + + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator dst_it(dst, win); + execute_window_loop( + win, + [&](Coordinates &id) + { + dst_coord = id; + + for (int x = window_start_x; x < window_end_x; x += src_row_size) + { + src_coord = index2coords(src_shape, coords2index(dst_shape, dst_coord)); + output_ptr = dst->ptr_to_element(dst_coord); + input_ptr = src->ptr_to_element(src_coord); + + std::memcpy(output_ptr, input_ptr, row_size_in_bytes); + + dst_coord.increment(Window::DimX, src_row_size); + } + }, + dst_it); +} + +void reshape_tensor_per_window(const Window &window, const ITensor *src, ITensor *dst) +{ Iterator src_it(src, window); + Iterator dst_it(dst, window); - execute_window_loop(window, [&](const Coordinates & id) - { - dst_coord = index2coords(dst_shape, coords2index(src_shape, id)); - *reinterpret_cast(dst->ptr_to_element(dst_coord)) = *reinterpret_cast(src_it.ptr()); - }, - src_it); + const size_t element_size = dst->info()->element_size(); + const auto window_size = window.x().end() - window.x().start(); + const auto window_size_in_bytes = window_size * element_size; + + const auto input_ptr = src_it.ptr(); + const auto output_ptr = dst_it.ptr(); + + std::memcpy(output_ptr, input_ptr, window_size_in_bytes); } } // namespace @@ -83,10 +174,11 @@ void CpuReshapeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - ARM_COMPUTE_UNUSED(dst); + ARM_COMPUTE_UNUSED(src); + _reshape_tensor_fn = reshape_tensor_per_element_selector; // Configure kernel window - Window win = calculate_max_window(*src); + Window win = calculate_max_window(*dst); ICpuKernel::configure(win); } @@ -94,7 +186,6 @@ void CpuReshapeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) Status CpuReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst)); - return Status{}; } @@ -106,28 +197,7 @@ void CpuReshapeKernel::run_op(ITensorPack &tensors, const Window &window, const const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); auto dst = tensors.get_tensor(TensorType::ACL_DST); - - switch(src->info()->data_type()) - { - case DataType::U8: - case DataType::S8: - case DataType::QASYMM8: - case DataType::QASYMM8_SIGNED: - reshape_tensor(window, src, dst); - break; - case DataType::U16: - case DataType::S16: - case DataType::F16: - reshape_tensor(window, src, dst); - break; - case DataType::U32: - case DataType::S32: - case DataType::F32: - reshape_tensor(window, src, dst); - break; - default: - ARM_COMPUTE_ERROR("Unsupported data type!"); - } + _reshape_tensor_fn(window, src, dst); } const char *CpuReshapeKernel::name() const @@ -143,6 +213,58 @@ size_t CpuReshapeKernel::get_mws(const CPUInfo &platform, size_t thread_count) c return ICPPKernel::default_mws; } +void CpuReshapeKernel::prepare(ITensorPack &tensors) +{ + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + const ITensorInfo *src_info = src->info(); + const ITensorInfo *dst_info = dst->info(); + + // Calculate kernel window based on the padding info + Window win; + + const bool 
src_has_holes = has_holes(*src_info, src_info->num_dimensions() - 1); + const bool dst_has_holes = has_holes(*dst_info, dst_info->num_dimensions() - 1); + const bool src_has_holes_in_x = has_holes(*src_info, Window::DimX); + const bool dst_has_holes_in_x = has_holes(*dst_info, Window::DimX); + const auto src_row_size = static_cast(src_info->tensor_shape()[0]); + const auto dst_row_size = static_cast(dst_info->tensor_shape()[0]); + + if (!src_has_holes && !dst_has_holes) + { + std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*dst_info); + /* + Copy the tensor per window. If the src and dst tensors + are contiguous memory allocations without any holes or + padding, then the tensor is squashed to a 1D window and + we can use a single memcpy call to copy the whole + window in the reshape_tensor_per_window fn + */ + _reshape_tensor_fn = reshape_tensor_per_window; + } + else + { + win = calculate_max_window(*dst_info); + /* + Copy tensor row by row if src and dst have no holes in X + dim and they have the same number of elements in their rows + */ + if (!src_has_holes_in_x && !dst_has_holes_in_x && (src_row_size == dst_row_size)) + { + _reshape_tensor_fn = reshape_tensor_per_row; + } + else + { + /* + Fall back to the element-wise copy + */ + _reshape_tensor_fn = reshape_tensor_per_element_selector; + } + } + + ICPPKernel::configure(win); +} } // namespace kernels } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/CpuReshapeKernel.h b/src/cpu/kernels/CpuReshapeKernel.h index 17302c673154068729df47f133a580f82a5cd132..ce566fd9e2d6fe169a3425a25ffba0d5be6c4cf9 100644 --- a/src/cpu/kernels/CpuReshapeKernel.h +++ b/src/cpu/kernels/CpuReshapeKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -55,9 +55,16 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; + /** Prepare the reshape kernel for execution (only executed once) by calculating the max or squashed window and selecting the _reshape_tensor_fn based on the presence of holes + * + * @param[in] tensors Pack of input and output tensors + * + */ + void prepare(ITensorPack &tensors); + /** Return minimum workload size of the relevant kernel * * @param[in] platform The CPU platform used to create the context. @@ -66,6 +73,20 @@ public: * @return[out] small_network_mws Minimum workload size for requested configuration. */ size_t get_mws(const CPUInfo &platform, size_t thread_count) const override; + + /** Get the preferred dimension in which the scheduler splits the work into multiple jobs. + * + * @return The split dimension.
+ */ + size_t get_split_dimension() const + { + return _split_dimension; + } + +private: + size_t _split_dimension{Window::DimY}; + + std::function _reshape_tensor_fn{}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuScaleKernel.cpp b/src/cpu/kernels/CpuScaleKernel.cpp index 332304599f628eb65b27e07e24fd7c8985b02f7c..7cf8916e9b6bd7e99107e0258399c5639083feff 100644 --- a/src/cpu/kernels/CpuScaleKernel.cpp +++ b/src/cpu/kernels/CpuScaleKernel.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/utils/InterpolationPolicyUtils.h" #include "arm_compute/core/Window.h" + #include "src/core/common/Registrars.h" #include "src/core/helpers/ScaleHelpers.h" #include "src/core/helpers/WindowHelpers.h" @@ -40,108 +41,315 @@ namespace arm_compute { namespace cpu { -namespace kernels -{ -namespace -{ -static const std::vector available_kernels = + +#ifdef ENABLE_NCHW_KERNELS +void scale_area_nchw_u8(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { - { - "sve_fp16_scale", - [](const ScaleKernelDataTypeISASelectorData & data) - { - return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && data.interpolation_policy != InterpolationPolicy::BILINEAR; - }, - REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_scale) - }, - { - "sve_fp32_scale", - [](const ScaleKernelDataTypeISASelectorData & data) - { - return data.dt == DataType::F32 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; - }, - REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_scale) - }, - { - "sve_qu8_scale", - [](const ScaleKernelDataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; - }, - REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_scale) - }, - { - "sve_qs8_scale", - [](const ScaleKernelDataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; - }, - REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_scale) - }, - { - "sve_u8_scale", - [](const ScaleKernelDataTypeISASelectorData & data) - { - return data.dt == DataType::U8 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; - }, - REGISTER_INTEGER_SVE(arm_compute::cpu::u8_sve_scale) - }, - { - "sve_s16_scale", - [](const ScaleKernelDataTypeISASelectorData & data) + ARM_COMPUTE_UNUSED(dx, dy, offsets, policy, border_mode, constant_border_value, sampling_offset, align_corners); + using namespace scale_helpers; + + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::U8); + + // Don't increment in width/height/channels for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Iterator src_i(src, win_in); + Iterator dst_i(dst, window); + + const auto wr = + scale_utils::calculate_resize_ratio(src->info()->dimension(0), dst->info()->dimension(0), align_corners); + const auto hr = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); + const 
auto w = src->info()->dimension(0); + const auto h = src->info()->dimension(1); + const size_t in_stride = src->info()->strides_in_bytes()[1]; + + execute_window_loop( + window, + [&](const Coordinates &id) { - return data.dt == DataType::S16 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; + const auto in_ptr = reinterpret_cast(src_i.ptr()); + + uint8x8_t tmp0 = vdup_n_u8(0); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x(), id.y()), tmp0, 0); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 1, id.y()), tmp0, 1); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 2, id.y()), tmp0, 2); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 3, id.y()), tmp0, 3); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 4, id.y()), tmp0, 4); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 5, id.y()), tmp0, 5); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 6, id.y()), tmp0, 6); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 7, id.y()), tmp0, 7); + + uint8x8_t tmp1 = vdup_n_u8(0); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 8, id.y()), tmp1, 0); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 9, id.y()), tmp1, 1); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 10, id.y()), tmp1, 2); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 11, id.y()), tmp1, 3); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 12, id.y()), tmp1, 4); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 13, id.y()), tmp1, 5); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 14, id.y()), tmp1, 6); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 15, id.y()), tmp1, 7); + + vst1q_u8(dst_i.ptr(), vcombine_u8(tmp0, tmp1)); }, - REGISTER_INTEGER_SVE(arm_compute::cpu::s16_sve_scale) - }, - { - "neon_fp16_scale", - [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::common_neon_scale) - }, - { - "neon_fp32_scale", - [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::common_neon_scale) - }, - { - "neon_qu8_scale", - [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_scale) - }, - { - "neon_qs8_scale", - [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_scale) - }, + src_i, dst_i); +} + +template +void scale_bilinear_qasymm_nchw(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + ARM_COMPUTE_UNUSED(policy); + // Get data layout and width/height indices + const int idx_width = get_data_layout_dimension_index(DataLayout::NCHW, 
DataLayoutDimension::WIDTH);
+    const int idx_height = get_data_layout_dimension_index(DataLayout::NCHW, DataLayoutDimension::HEIGHT);
+
+    // Compute the ratio between source height and destination height
+    const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height),
+                                                        dst->info()->dimension(idx_height), align_corners);
+    Window win_off;
+    win_off.set(Window::DimX, Window::Dimension(0, 0, 0));
+    win_off.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+    // Don't increment in X and Y direction for the input tensor
+    // A pointer to the start of this plane is needed as base for the precomputed offsets
+    Window win_in(window);
+    win_in.set(idx_width, Window::Dimension(0, 0, 0));
+    win_in.set(idx_height, Window::Dimension(0, 0, 0));
+
+    for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
     {
-        "neon_u8_scale",
-        [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::U8; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_scale)
-    },
+        win_off.set(d, Window::Dimension(0, 0, 0));
+    }
+
+    Iterator src_i(src, win_in);
+    Iterator dst_i(dst, window);
+
+    const int32_t in_dim_w = src->info()->dimension(idx_width);
+    const int32_t in_dim_h = src->info()->dimension(idx_height);
+    const int32_t stride_w = src->info()->strides_in_bytes()[idx_width];
+    const int32_t stride_h = src->info()->strides_in_bytes()[idx_height];
+
+    const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform();
+    const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
+
+    if (border_mode == BorderMode::CONSTANT)
     {
-        "neon_s8_scale",
-        [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::S8; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_scale)
-    },
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
+#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        using ConstType = T;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        const T const_border_value = static_cast<T>(constant_border_value.get<ConstType>());
+        execute_window_loop(
+            window,
+            [&](const Coordinates &id)
+            {
+                const int32_t index_h = std::floor((id[idx_height] + sampling_offset) * hr - sampling_offset);
+                const int32_t index_w = *(reinterpret_cast<const int32_t *>(
+                    offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+                const auto dx_val =
+                    *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+                const auto dy_val =
+                    *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+                const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
+
+                const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h)
+                                     ? (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h))
+                                     : const_border_value;
+                const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h)
+                                     ? (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h))
+                                     : const_border_value;
+                const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1)
+                                     ? (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h))
+                                     : const_border_value;
+                const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1)
+                                     ?
(*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) + : const_border_value; + + const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); + const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); + const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); + const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); + *reinterpret_cast(dst_i.ptr()) = Qasymm8QuantizationHelper::quantize( + scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); + }, + src_i, dst_i); + } + else if (border_mode == BorderMode::REPLICATE) + { + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int index_h = std::floor((id[idx_height] + sampling_offset) * hr - sampling_offset); + const int32_t index_w = *(reinterpret_cast( + offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto dx_val = + *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto dy_val = + *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto pixel_row_ptr = reinterpret_cast(src_i.ptr()); + + auto clamped_w = utility::clamp(index_w, 0, in_dim_w - 1); + auto clamped_w1 = utility::clamp(index_w + 1, 0, in_dim_w - 1); + auto clamped_h = utility::clamp(index_h, 0, in_dim_h - 1); + auto clamped_h1 = utility::clamp(index_h + 1, 0, in_dim_h - 1); + + const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h); + const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h); + const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h); + const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h); + + const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); + const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); + const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); + const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); + *reinterpret_cast(dst_i.ptr()) = Qasymm8QuantizationHelper::quantize( + scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); + }, + src_i, dst_i); + } + else { - "neon_s16_scale", - [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::S16; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_scale) - }, + ARM_COMPUTE_ERROR("Not implemented"); + } +} + +/** function to perform scale using bilinear interpolation on the given window */ +template +void scale_bilinear_nchw(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + ARM_COMPUTE_UNUSED(policy); + arm_compute::cpu::scale_bilinear_nchw(src, dst, dx, dy, offsets, border_mode, constant_border_value, + sampling_offset, align_corners, window); +} + +template +void scale_nearest_nchw(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + ARM_COMPUTE_UNUSED(policy, border_mode); + arm_compute::cpu::scale_nearest_nchw(src, dst, dx, dy, offsets, constant_border_value, sampling_offset, + align_corners, 
window); +} + +#endif // ENABLE_NCHW_KERNELS + +namespace kernels +{ +namespace +{ +static const std::vector available_kernels = { + {"sve_fp16_scale", + [](const ScaleKernelDataTypeISASelectorData &data) + { + return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && + data.interpolation_policy != InterpolationPolicy::BILINEAR; + }, + REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_scale)}, + {"sve_fp32_scale", + [](const ScaleKernelDataTypeISASelectorData &data) + { return data.dt == DataType::F32 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; }, + REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_scale)}, + {"sve_qu8_scale", + [](const ScaleKernelDataTypeISASelectorData &data) { + return data.dt == DataType::QASYMM8 && data.isa.sve && + data.interpolation_policy != InterpolationPolicy::BILINEAR; + }, + REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_scale)}, + {"sve_qs8_scale", + [](const ScaleKernelDataTypeISASelectorData &data) + { + return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve && + data.interpolation_policy != InterpolationPolicy::BILINEAR; + }, + REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_scale)}, + {"sve_u8_scale", + [](const ScaleKernelDataTypeISASelectorData &data) + { return data.dt == DataType::U8 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; }, + REGISTER_INTEGER_SVE(arm_compute::cpu::u8_sve_scale)}, + {"sve_s16_scale", + [](const ScaleKernelDataTypeISASelectorData &data) + { return data.dt == DataType::S16 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; }, + REGISTER_INTEGER_SVE(arm_compute::cpu::s16_sve_scale)}, + {"neon_fp16_scale", + [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::fp16_common_neon_scale)}, + {"neon_fp32_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::common_neon_scale)}, + {"neon_qu8_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_scale)}, + {"neon_qs8_scale", + [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_scale)}, + {"neon_u8_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::U8; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_scale)}, + {"neon_s8_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::S8; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_scale)}, + {"neon_s16_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::S16; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_scale)}, }; -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, - const ITensorInfo *offsets, ITensorInfo *dst, const ScaleKernelInfo &info) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *dx, + const ITensorInfo *dy, + const ITensorInfo *offsets, + ITensorInfo *dst, + const ScaleKernelInfo &info) { - const auto *uk = CpuScaleKernel::get_implementation(ScaleKernelDataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa(), info.interpolation_policy }); + const auto *uk = CpuScaleKernel::get_implementation( + 
ScaleKernelDataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa(), info.interpolation_policy}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(dst == src); - ARM_COMPUTE_RETURN_ERROR_ON(src->num_channels()!=1); - ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && info.sampling_policy != SamplingPolicy::TOP_LEFT); + ARM_COMPUTE_RETURN_ERROR_ON(src->num_channels() != 1); + ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && + info.sampling_policy != SamplingPolicy::TOP_LEFT); ARM_COMPUTE_UNUSED(info.constant_border_value); ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.use_padding, "Padding is not supported"); @@ -153,27 +361,30 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dx, const I ARM_COMPUTE_RETURN_ERROR_ON(output_width == 0); ARM_COMPUTE_RETURN_ERROR_ON(output_height == 0); - ARM_COMPUTE_RETURN_ERROR_ON((src->data_type() == DataType::S8) && (data_layout != DataLayout::NHWC || info.interpolation_policy != InterpolationPolicy::BILINEAR - || info.border_mode != BorderMode::REPLICATE)); + ARM_COMPUTE_RETURN_ERROR_ON((src->data_type() == DataType::S8) && + (data_layout != DataLayout::NHWC || + info.interpolation_policy != InterpolationPolicy::BILINEAR || + info.border_mode != BorderMode::REPLICATE)); - if(info.interpolation_policy == InterpolationPolicy::NEAREST_NEIGHBOR && offsets != nullptr) + if (info.interpolation_policy == InterpolationPolicy::NEAREST_NEIGHBOR && offsets != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32); } - if(info.interpolation_policy == InterpolationPolicy::BILINEAR && offsets != nullptr) + if (info.interpolation_policy == InterpolationPolicy::BILINEAR && offsets != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32); - if(dx != nullptr && dy != nullptr) + if (dx != nullptr && dy != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dx, 1, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dy, 1, DataType::F32); } } - ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy)); + ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && + !scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy)); - if(info.interpolation_policy == InterpolationPolicy::AREA) + if (info.interpolation_policy == InterpolationPolicy::AREA) { ARM_COMPUTE_RETURN_ERROR_ON(data_layout != DataLayout::NCHW); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::U8); @@ -183,24 +394,28 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dx, const I } } // namespace -void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, - ITensorInfo *dst, const ScaleKernelInfo &info) +void CpuScaleKernel::configure(const ITensorInfo *src, + const ITensorInfo *dx, + const ITensorInfo *dy, + const ITensorInfo *offsets, + ITensorInfo *dst, + const ScaleKernelInfo &info) { ARM_COMPUTE_UNUSED(dx, dy, offsets); ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, - dx, - dy, - offsets, - dst, - info)); - - const auto *uk = CpuScaleKernel::get_implementation(ScaleKernelDataTypeISASelectorData{ src->data_type(), 
CPUInfo::get().get_isa(), info.interpolation_policy }); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dx, dy, offsets, dst, info)); + + const auto *uk = CpuScaleKernel::get_implementation( + ScaleKernelDataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa(), info.interpolation_policy}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); _run_method = uk->ukernel; - _name = std::string("CpuScaleKernel").append("/").append(uk->name).append("_").append(string_from_interpolation_policy(info.interpolation_policy)); + _name = std::string("CpuScaleKernel") + .append("/") + .append(uk->name) + .append("_") + .append(string_from_interpolation_policy(info.interpolation_policy)); // Get data layout and width/height indices _data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout; @@ -212,19 +427,22 @@ void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, co _constant_border_value = info.constant_border_value; _align_corners = info.align_corners; - if(info.sampling_policy == SamplingPolicy::CENTER) + if (info.sampling_policy == SamplingPolicy::CENTER) { _sampling_offset = 0.5f; } // Compute the ratio between source width/height and destination width/height - const auto wr = scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), _align_corners); - const auto hr = scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), _align_corners); + const auto wr = + scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), _align_corners); + const auto hr = + scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), _align_corners); // Area interpolation behaves as Nearest Neighbour in case of up-sampling - _policy = (_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : _policy; + _policy = (_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? 
InterpolationPolicy::NEAREST_NEIGHBOR
+                  : _policy;
-    if(_border_mode == BorderMode::UNDEFINED)
+    if (_border_mode == BorderMode::UNDEFINED)
     {
         _border_mode = BorderMode::CONSTANT;
         _constant_border_value = PixelValue();
@@ -232,41 +450,33 @@ void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, co
 #ifdef ENABLE_NCHW_KERNELS
     // Configure scale function to run
-    if(_data_layout == DataLayout::NCHW)
+    if (_data_layout == DataLayout::NCHW)
     {
         std::string function_to_call("scale_");
         function_to_call += string_from_data_type(src->data_type()) + "_";
         function_to_call += string_from_data_layout(_data_layout) + "_";
         function_to_call += string_from_interpolation_policy(_policy);
-        static std::map<std::string, ScaleFunctionPtr> map_function =
-        {
-            { "scale_U8_NCHW_AREA_CONSTANT", &CpuScaleKernel::scale_area_nchw_u8 },
-
-            { "scale_U8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<uint8_t> },
-            { "scale_U8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<uint8_t> },
-
-            { "scale_QASYMM8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm<uint8_t> },
-            { "scale_QASYMM8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<uint8_t> },
-
-            { "scale_QASYMM8_SIGNED_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm<int8_t> },
-            { "scale_QASYMM8_SIGNED_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<int8_t> },
-
-            { "scale_S16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<int16_t> },
-            { "scale_S16_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<int16_t> },
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-            { "scale_F16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<float16_t> },
-            { "scale_F16_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<float16_t> },
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-            { "scale_F32_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<float> },
-            { "scale_F32_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<float> },
+        const static std::map<std::string, ScaleKernelPtr> map_nchw_function = {
+            {"scale_U8_NCHW_AREA_CONSTANT", &arm_compute::cpu::scale_area_nchw_u8},
+            {"scale_U8_NCHW_BILINEAR", &arm_compute::cpu::scale_bilinear_nchw<uint8_t>},
+            {"scale_U8_NCHW_NEAREST_NEIGHBOUR", &arm_compute::cpu::scale_nearest_nchw<uint8_t>},
+            {"scale_QASYMM8_NCHW_BILINEAR", &arm_compute::cpu::scale_bilinear_qasymm_nchw<uint8_t>},
+            {"scale_QASYMM8_NCHW_NEAREST_NEIGHBOUR", &arm_compute::cpu::scale_nearest_nchw<uint8_t>},
+            {"scale_QASYMM8_SIGNED_NCHW_BILINEAR", &arm_compute::cpu::scale_bilinear_qasymm_nchw<int8_t>},
+            {"scale_QASYMM8_SIGNED_NCHW_NEAREST_NEIGHBOUR", &arm_compute::cpu::scale_nearest_nchw<int8_t>},
+            {"scale_S16_NCHW_BILINEAR", &arm_compute::cpu::scale_bilinear_nchw<int16_t>},
+            {"scale_S16_NCHW_NEAREST_NEIGHBOUR", &arm_compute::cpu::scale_nearest_nchw<int16_t>},
+            {"scale_F16_NCHW_BILINEAR", REGISTER_FP16_NEON(arm_compute::cpu::fp16_bilinear_neon_scale_nchw)},
+            {"scale_F16_NCHW_NEAREST_NEIGHBOUR", REGISTER_FP16_NEON(arm_compute::cpu::fp16_nearest_neon_scale_nchw)},
+            {"scale_F32_NCHW_BILINEAR", &arm_compute::cpu::scale_bilinear_nchw<float>},
+            {"scale_F32_NCHW_NEAREST_NEIGHBOUR", &arm_compute::cpu::scale_nearest_nchw<float>},
         };
-        auto it = map_function.find(function_to_call);
-        if(it != map_function.end())
+        auto it = map_nchw_function.find(function_to_call);
+        if (it != map_nchw_function.end())
         {
-            _func = it->second;
+            _nchw_func = it->second;
         }
     }
 #endif // ENABLE_NCHW_KERNELS
@@ -276,299 +486,12 @@ void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, co
     ICpuKernel::configure(win);
 }
-#ifdef ENABLE_NCHW_KERNELS
-template <typename T>
-void CpuScaleKernel::scale_nearest_nchw(const ITensor *src, ITensor *dst, 
const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window) -{ - ARM_COMPUTE_UNUSED(dx, dy); - const size_t in_stride_x = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners); - - // Don't increment in X and Y direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - - // Set offsets window - Window win_off; - win_off.set(Window::DimX, window[Window::DimX]); - win_off.set(Window::DimY, window[Window::DimY]); - for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) - { - win_off.set(d, Window::Dimension(0, 0, 0)); - } - - // Create iterators - Iterator src_i(src, win_in); - Iterator dst_i(dst, window); - Iterator offsets_i(offsets, win_off); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offsets_ptr = reinterpret_cast(offsets_i.ptr()); - const auto in_yi = static_cast(_align_corners ? utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor(( - id.y() + _sampling_offset) - * hr)); - const int32_t offset_row = in_yi * in_stride_x; - *reinterpret_cast(dst_i.ptr()) = *(reinterpret_cast(src_i.ptr()) + offsets_ptr[0] + offset_row); - }, - src_i, offsets_i, dst_i); -} - -template -void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window) -{ - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners); - Window win_off; - win_off.set(Window::DimX, window.x()); - win_off.set(Window::DimY, window.y()); - - // Don't increment in X and Y direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - - for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) - { - win_off.set(d, Window::Dimension(0, 0, 0)); - } - - Iterator src_i(src, win_in); - Iterator dst_i(dst, window); - Iterator offsets_i(offsets, win_off); - Iterator dx_i(dx, win_off); - Iterator dy_i(dy, win_off); - - const int32_t in_dim_w = src->info()->dimension(0); - const int32_t in_dim_h = src->info()->dimension(1); - const int32_t in_stride_w = in_dim_w + src->info()->padding().left + src->info()->padding().right; - - if(_border_mode == BorderMode::CONSTANT) - { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - using ConstType = typename std::conditional::value, half, T>::type; -#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - using ConstType = T; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - const T const_border_value = static_cast(_constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const int32_t index_h = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset); - const auto index_w = *(reinterpret_cast(offsets_i.ptr())); - const auto dx_val = *(reinterpret_cast(dx_i.ptr())); - const auto dy_val = 
*(reinterpret_cast(dy_i.ptr())); - const auto pixel_row_ptr = reinterpret_cast(src_i.ptr()); - - const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? (*(pixel_row_ptr + index_w + index_h * in_stride_w)) : const_border_value; - const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w)) : const_border_value; - const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h - && index_h < in_dim_h - 1) ? - (*(pixel_row_ptr + index_w + index_h * in_stride_w + in_stride_w)) : - const_border_value; - const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h - && index_h < in_dim_h - 1) ? - (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w + in_stride_w)) : - const_border_value; - - *reinterpret_cast(dst_i.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - src_i, offsets_i, dx_i, dy_i, dst_i); - } - else if(_border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const int index_h = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset); - const auto index_w = *(reinterpret_cast(offsets_i.ptr())); - const auto dx_val = *(reinterpret_cast(dx_i.ptr())); - const auto dy_val = *(reinterpret_cast(dy_i.ptr())); - const auto pixel_row_ptr = reinterpret_cast(src_i.ptr()); - - auto clamped_x = utility::clamp(index_w, 0, in_dim_w - 1); - auto clamped_x1 = utility::clamp(index_w + 1, 0, in_dim_w - 1); - auto clamped_y = utility::clamp(index_h, 0, in_dim_h - 1); - auto clamped_y1 = utility::clamp(index_h + 1, 0, in_dim_h - 1); - - const auto a00 = *(pixel_row_ptr + clamped_x + clamped_y * in_stride_w); - const auto a01 = *(pixel_row_ptr + clamped_x1 + clamped_y * in_stride_w); - const auto a10 = *(pixel_row_ptr + clamped_x + clamped_y1 * in_stride_w); - const auto a11 = *(pixel_row_ptr + clamped_x1 + clamped_y1 * in_stride_w); - - *reinterpret_cast(dst_i.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - src_i, offsets_i, dx_i, dy_i, dst_i); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} - -void CpuScaleKernel::scale_area_nchw_u8(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window) -{ - ARM_COMPUTE_UNUSED(dx, dy, offsets); - using namespace scale_helpers; - - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::U8); - - // Don't increment in width/height/channels for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Iterator src_i(src, win_in); - Iterator dst_i(dst, window); - - const auto wr = scale_utils::calculate_resize_ratio(src->info()->dimension(0), dst->info()->dimension(0), _align_corners); - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners); - const auto w = src->info()->dimension(0); - const auto h = src->info()->dimension(1); - const size_t in_stride = src->info()->strides_in_bytes()[1]; - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto in_ptr = reinterpret_cast(src_i.ptr()); - - uint8x8_t tmp0 = vdup_n_u8(0); - tmp0 = 
vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x(), id.y()), tmp0, 0); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 1, id.y()), tmp0, 1); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 2, id.y()), tmp0, 2); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 3, id.y()), tmp0, 3); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 4, id.y()), tmp0, 4); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 5, id.y()), tmp0, 5); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 6, id.y()), tmp0, 6); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 7, id.y()), tmp0, 7); - - uint8x8_t tmp1 = vdup_n_u8(0); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 8, id.y()), tmp1, 0); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 9, id.y()), tmp1, 1); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 10, id.y()), tmp1, 2); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 11, id.y()), tmp1, 3); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 12, id.y()), tmp1, 4); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 13, id.y()), tmp1, 5); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 14, id.y()), tmp1, 6); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 15, id.y()), tmp1, 7); - - vst1q_u8(dst_i.ptr(), vcombine_u8(tmp0, tmp1)); - }, - src_i, dst_i); -} - -template -void CpuScaleKernel::scale_bilinear_qasymm(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window) -{ - // Get data layout and width/height indices - const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), _align_corners); - Window win_off; - win_off.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_off.set(Window::DimY, Window::Dimension(0, 0, 0)); - - // Don't increment in X and Y direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(idx_width, Window::Dimension(0, 0, 0)); - win_in.set(idx_height, Window::Dimension(0, 0, 0)); - - for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) - { - win_off.set(d, Window::Dimension(0, 0, 0)); - } - - Iterator src_i(src, win_in); - Iterator dst_i(dst, window); - - const int32_t in_dim_w = src->info()->dimension(idx_width); - const int32_t in_dim_h = src->info()->dimension(idx_height); - const int32_t stride_w = src->info()->strides_in_bytes()[idx_width]; - const int32_t stride_h = src->info()->strides_in_bytes()[idx_height]; - - const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - - if(_border_mode == 
BorderMode::CONSTANT) - { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - using ConstType = typename std::conditional::value, half, T>::type; -#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - using ConstType = T; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - const T const_border_value = static_cast(_constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const int32_t index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset); - const int32_t index_w = *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto pixel_row_ptr = reinterpret_cast(src_i.ptr()); - - const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? - (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) : - const_border_value; - const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? - (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) : - const_border_value; - const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ? - (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) : - const_border_value; - const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ? - (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) : - const_border_value; - - const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); - *reinterpret_cast(dst_i.ptr()) = Qasymm8QuantizationHelper::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - src_i, dst_i); - } - else if(_border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const int index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset); - const int32_t index_w = *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto pixel_row_ptr = reinterpret_cast(src_i.ptr()); - - auto clamped_w = utility::clamp(index_w, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(index_w + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(index_h, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(index_h + 1, 0, in_dim_h - 1); - - const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h); - const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h); - const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h); - const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h); - - const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); - const float inp10 = 
Qasymm8QuantizationHelper::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); - *reinterpret_cast(dst_i.ptr()) = Qasymm8QuantizationHelper::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - src_i, dst_i); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} -#endif // ENABLE_NCHW_KERNELS - -Status CpuScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy, - const ITensorInfo *offsets, ITensorInfo *output, const ScaleKernelInfo &info) +Status CpuScaleKernel::validate(const ITensorInfo *input, + const ITensorInfo *dx, + const ITensorInfo *dy, + const ITensorInfo *offsets, + ITensorInfo *output, + const ScaleKernelInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, dx, dy, offsets, output, info)); return Status{}; @@ -579,7 +502,7 @@ void CpuScaleKernel::run_op(ITensorPack &tensors, const Window &window, const Th ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr && _data_layout == DataLayout::NCHW); + ARM_COMPUTE_ERROR_ON(_nchw_func == nullptr && _data_layout == DataLayout::NCHW); ARM_COMPUTE_ERROR_ON(_run_method == nullptr && _data_layout == DataLayout::NHWC); const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); @@ -588,13 +511,15 @@ void CpuScaleKernel::run_op(ITensorPack &tensors, const Window &window, const Th const auto dy = tensors.get_const_tensor(TensorType::ACL_INT_1); const auto offsets = tensors.get_const_tensor(TensorType::ACL_INT_2); - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { - (this->*_func)(src, dst, dx, dy, offsets, window); + _nchw_func(src, dst, offsets, dx, dy, _policy, _border_mode, _constant_border_value, _sampling_offset, + _align_corners, window); } else { - _run_method(src, dst, offsets, dx, dy, _policy, _border_mode, _constant_border_value, _sampling_offset, _align_corners, window); + _run_method(src, dst, offsets, dx, dy, _policy, _border_mode, _constant_border_value, _sampling_offset, + _align_corners, window); } } diff --git a/src/cpu/kernels/CpuScaleKernel.h b/src/cpu/kernels/CpuScaleKernel.h index 8102142fc3dbfa3e48395e5ef436fcae1f86d166..f2cad5e899f5202f2c185a3ed8a5d78db9fc9539 100644 --- a/src/cpu/kernels/CpuScaleKernel.h +++ b/src/cpu/kernels/CpuScaleKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2022 Arm Limited. + * Copyright (c) 2016-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,10 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CPU_SCALEKERNEL_H -#define ARM_COMPUTE_CPU_SCALEKERNEL_H +#ifndef ACL_SRC_CPU_KERNELS_CPUSCALEKERNEL_H +#define ACL_SRC_CPU_KERNELS_CPUSCALEKERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -39,9 +40,17 @@ class CpuScaleKernel : public ICpuKernel { private: /** Scale function to use for the particular function to use */ - using ScaleFunctionPtr = void (CpuScaleKernel::*)(const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *, const Window &window); - using ScaleKernelPtr = std::add_pointer::type; + using ScaleKernelPtr = std::add_pointer::type; public: CpuScaleKernel() = default; @@ -59,7 +68,11 @@ public: * @param[out] dst Destination tensor info. Data types supported: Same as @p input. 
All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. * @param[in] info @ref ScaleKernelInfo to use for configuration */ - void configure(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *dst, + void configure(const ITensorInfo *src, + const ITensorInfo *dx, + const ITensorInfo *dy, + const ITensorInfo *offsets, + ITensorInfo *dst, const ScaleKernelInfo &info); /** Static function to check if given info will lead to a valid configuration * @@ -67,11 +80,15 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *dst, + static Status validate(const ITensorInfo *src, + const ITensorInfo *dx, + const ITensorInfo *dy, + const ITensorInfo *offsets, + ITensorInfo *dst, const ScaleKernelInfo &info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct ScaleKernel @@ -84,36 +101,17 @@ public: static const std::vector &get_available_kernels(); private: -#ifdef ENABLE_NCHW_KERNELS - /** function to perform scale using area interpolation on the given window - * - * @note Used only in case down-sampling. - */ - void scale_area_nchw_u8(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window); - - /** function to perform scale using bilinear interpolation on the given window */ - template - void scale_bilinear_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window); - /** function to perform scale using bilinear interpolation on the given window */ - template - void scale_bilinear_qasymm(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window); - - /** function to perform scale using nearest neighbour on the given window */ - template - void scale_nearest_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window); -#endif // ENABLE_NCHW_KERNELS - - ScaleFunctionPtr _func{ nullptr }; + ScaleKernelPtr _nchw_func{nullptr}; InterpolationPolicy _policy{}; BorderMode _border_mode{}; PixelValue _constant_border_value{}; - float _sampling_offset{ 0 }; - bool _align_corners{ false }; - DataLayout _data_layout{ DataLayout::UNKNOWN }; - ScaleKernelPtr _run_method{ nullptr }; + float _sampling_offset{0}; + bool _align_corners{false}; + DataLayout _data_layout{DataLayout::UNKNOWN}; + ScaleKernelPtr _run_method{nullptr}; std::string _name{}; }; } // namespace kernels } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_SCALEKERNEL_H */ +#endif // ACL_SRC_CPU_KERNELS_CPUSCALEKERNEL_H diff --git a/src/cpu/kernels/CpuSoftmaxKernel.cpp b/src/cpu/kernels/CpuSoftmaxKernel.cpp index e06ab9917c8567f0d685f29ba2b72da38f16d981..ce144351f8b32324f0765eae38dc6d4c390441b6 100644 --- a/src/cpu/kernels/CpuSoftmaxKernel.cpp +++ b/src/cpu/kernels/CpuSoftmaxKernel.cpp @@ -30,11 +30,11 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + +#include "src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" #include 
"src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - -#include "src/core/common/Registrars.h" #include "src/cpu/kernels/softmax/list.h" namespace arm_compute @@ -46,61 +46,44 @@ namespace kernels namespace { /* Softmax Logits 1D Max - identifying the max value of 1D Logits */ -static const std::vector available_kernels_max_logits = -{ - { - "sve_fp32_logits_1d_max", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32) && data.isa.sve; }, - REGISTER_FP32_SVE(sve_fp32_logits) - }, - { - "sve_fp16_logits_1d_max", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; }, - REGISTER_FP16_SVE(sve_fp16_logits) - }, - { - "sve_qu8_logits_1d_max", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8) && data.isa.sve; }, - REGISTER_QASYMM8_SVE(sve_qasymm8_logits) - }, - { - "sve_qs8_logits_1d_max", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve; }, - REGISTER_QASYMM8_SIGNED_SVE(sve_qasymm8_signed_logits) - }, - { - "neon_fp32_logits_1d_max", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(neon_fp32_logits) - }, - { - "neon_fp16_logits_1d_max", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.fp16; }, - REGISTER_FP16_NEON(neon_fp16_logits) - }, - { - "neon_qu8_logits_1d_max", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(neon_qasymm8_logits) - }, - { - "neon_qs8_logits_1d_max", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_singed_logits) - }, +static const std::vector available_kernels_max_logits = { + {"sve_fp32_logits_1d_max", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; }, + REGISTER_FP32_SVE(sve_fp32_logits)}, + {"sve_fp16_logits_1d_max", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; }, + REGISTER_FP16_SVE(sve_fp16_logits)}, + {"sve_qu8_logits_1d_max", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve; }, + REGISTER_QASYMM8_SVE(sve_qasymm8_logits)}, + {"sve_qs8_logits_1d_max", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve; }, + REGISTER_QASYMM8_SIGNED_SVE(sve_qasymm8_signed_logits)}, + {"neon_fp32_logits_1d_max", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(neon_fp32_logits)}, + {"neon_fp16_logits_1d_max", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(neon_fp16_logits)}, + {"neon_qu8_logits_1d_max", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(neon_qasymm8_logits)}, + {"neon_qs8_logits_1d_max", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_singed_logits)}, }; Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output) { ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); // Validate in case of configured output - if(output.total_size() != 0) + if (output.total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(), TensorShape(input.tensor_shape()).set(0, 1)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(), + TensorShape(input.tensor_shape()).set(0, 1)); } return Status{}; @@ -121,7 +104,7 @@ void CpuLogits1DMaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst) // Output auto initialization if not yet initialized auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info()); - const auto *uk = get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() }); + const auto *uk = get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); _run_method = uk->ukernel; @@ -158,60 +141,46 @@ const char *CpuLogits1DMaxKernel::name() const } /* Softmax Logits 1D - computation for QASYMM8 with pre-computed max. */ -template -static const std::vector::SoftmaxLogits1DKernel> available_kernels_logits = -{ - { - "sve2_qu8_softmax_logits_1d", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8) && data.isa.sve2; }, - REGISTER_QASYMM8_SVE2(sve2_qasymm8_softmax) - }, - { - "sve2_qs8_softmax_logits_1d", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; }, - REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_softmax) - }, - { - "sve_fp32_softmax_logits_1d", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32) && data.isa.sve; }, - REGISTER_FP32_SVE(sve_fp32_softmax) - }, - { - "sve_fp16_softmax_logits_1d", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; }, - REGISTER_FP16_SVE(sve_fp16_softmax) - }, - - { - "neon_fp32_softmax_logits_1d", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(neon_fp32_softmax) - }, - { - "neon_fp16_softmax_logits_1d", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.fp16; }, - REGISTER_FP16_NEON(neon_fp16_softmax) - }, - { - "neon_qu8_softmax_logits_1d", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax) - }, - { - "neon_qs8_softmax_logits_1d", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax) - }, +template +static const std::vector::SoftmaxLogits1DKernel> available_kernels_logits = { + {"sve2_qu8_softmax_logits_1d", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve2; }, + REGISTER_QASYMM8_SVE2(sve2_qasymm8_softmax)}, + {"sve2_qs8_softmax_logits_1d", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; }, + REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_softmax)}, + {"sve_fp32_softmax_logits_1d", + [](const DataTypeISASelectorData &data) { return (data.dt == 
DataType::F32) && data.isa.sve; }, + REGISTER_FP32_SVE(sve_fp32_softmax)}, + {"sve_fp16_softmax_logits_1d", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; }, + REGISTER_FP16_SVE(sve_fp16_softmax)}, + + {"neon_fp32_softmax_logits_1d", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(neon_fp32_softmax)}, + {"neon_fp16_softmax_logits_1d", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(neon_fp16_softmax)}, + {"neon_qu8_softmax_logits_1d", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax)}, + {"neon_qs8_softmax_logits_1d", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax)}, }; namespace { -Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorInfo &max, - const ITensorInfo &dst, const float beta, const ITensorInfo &tmp, bool is_log) +Status validate_arguments_logits_softmax(const ITensorInfo &src, + const ITensorInfo &max, + const ITensorInfo &dst, + const float beta, + const ITensorInfo &tmp, + bool is_log) { ARM_COMPUTE_UNUSED(beta); // Check input ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type()); @@ -221,16 +190,18 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorIn ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&src, &max); // Check output if configured - if(dst.total_size() != 0) + if (dst.total_size() != 0) { - const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src.data_type(), is_log) : dst.quantization_info(); + const QuantizationInfo output_quantization = + is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src.data_type(), is_log) + : dst.quantization_info(); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst); ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() != output_quantization); } // Check tmp if configured - if(tmp.total_size() != 0) + if (tmp.total_size() != 0) { const DataType tmp_data_type = is_quantized_asymmetric ? 
DataType::F32 : src.data_type(); ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != tmp_data_type); @@ -243,14 +214,16 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorIn } } // namespace -template -const std::vector::SoftmaxLogits1DKernel> &CpuLogits1DSoftmaxKernel::get_available_kernels() +template +const std::vector::SoftmaxLogits1DKernel> & +CpuLogits1DSoftmaxKernel::get_available_kernels() { return available_kernels_logits; } template -void CpuLogits1DSoftmaxKernel::configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp) +void CpuLogits1DSoftmaxKernel::configure( + const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG)); @@ -259,17 +232,21 @@ void CpuLogits1DSoftmaxKernel::configure(const ITensorInfo *src, const I const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type()); // Output auto initialization if not yet initialized - const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), IS_LOG) : dst->quantization_info(); + const QuantizationInfo output_quantization = + is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), IS_LOG) + : dst->quantization_info(); auto_init_if_empty(*dst, TensorInfo(*src).set_quantization_info(output_quantization).reset_padding()); // Tmp auto initialization if not yet initialized const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type(); auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding()); - const auto *uk = CpuLogits1DSoftmaxKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() }); + const auto *uk = CpuLogits1DSoftmaxKernel::get_implementation( + DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - std::string kernel_name = IS_LOG ? std::string("CpuLogits1DLogSoftmaxKernel") : std::string("CpuLogits1DSoftmaxKernel"); + std::string kernel_name = + IS_LOG ? 
std::string("CpuLogits1DLogSoftmaxKernel") : std::string("CpuLogits1DSoftmaxKernel"); _beta = beta; _run_method = uk->ukernel; @@ -282,8 +259,8 @@ void CpuLogits1DSoftmaxKernel::configure(const ITensorInfo *src, const I } template -Status CpuLogits1DSoftmaxKernel::validate(const ITensorInfo *src, const ITensorInfo *max, - const ITensorInfo *dst, const float beta, const ITensorInfo *tmp) +Status CpuLogits1DSoftmaxKernel::validate( + const ITensorInfo *src, const ITensorInfo *max, const ITensorInfo *dst, const float beta, const ITensorInfo *tmp) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG)); @@ -305,7 +282,7 @@ void CpuLogits1DSoftmaxKernel::run_op(ITensorPack &tensors, const Window auto tmp = tensors.get_tensor(TensorType::ACL_DST_1); const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x(); - const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration; + const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration; ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread)); @@ -314,7 +291,7 @@ void CpuLogits1DSoftmaxKernel::run_op(ITensorPack &tensors, const Window } template -const char *CpuLogits1DSoftmaxKernel::name() const +const char *CpuLogits1DSoftmaxKernel::name() const { return _name.c_str(); } diff --git a/src/cpu/kernels/CpuSoftmaxKernel.h b/src/cpu/kernels/CpuSoftmaxKernel.h index 59f43bd1d2f89a67360267581bd7b28cca3ea4e3..5d288179fdac3a3b52cf2994a4e515db000d37fd 100644 --- a/src/cpu/kernels/CpuSoftmaxKernel.h +++ b/src/cpu/kernels/CpuSoftmaxKernel.h @@ -57,7 +57,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct SoftmaxLogits1DMaxKernel @@ -70,7 +70,7 @@ public: static const std::vector &get_available_kernels(); private: - SoftmaxLogits1DMaxKernelPtr _run_method{ nullptr }; + SoftmaxLogits1DMaxKernelPtr _run_method{nullptr}; std::string _name{}; }; @@ -79,7 +79,8 @@ template class CpuLogits1DSoftmaxKernel : public ICpuKernel> { private: - using SoftmaxLogits1DKernelPtr = std::add_pointer::type; + using SoftmaxLogits1DKernelPtr = std::add_pointer::type; public: CpuLogits1DSoftmaxKernel() = default; @@ -95,18 +96,22 @@ public: * * @param tmp Auxiliary tensor info. Must be type F32 and same shape as the input. 
*/ - void configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp); + void + configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuLogits1DSoftmaxKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *max, - const ITensorInfo *dst, const float beta, const ITensorInfo *tmp); + static Status validate(const ITensorInfo *src, + const ITensorInfo *max, + const ITensorInfo *dst, + const float beta, + const ITensorInfo *tmp); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct SoftmaxLogits1DKernel @@ -119,8 +124,8 @@ public: static const std::vector &get_available_kernels(); private: - float _beta{ 1.0f }; - SoftmaxLogits1DKernelPtr _run_method{ nullptr }; + float _beta{1.0f}; + SoftmaxLogits1DKernelPtr _run_method{nullptr}; std::string _name{}; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuSubKernel.cpp b/src/cpu/kernels/CpuSubKernel.cpp index 875d613dcac87e379289a4822c1d0f2b8ed779f3..c8706ff651891b49716e4aa68d591844e5a3216a 100644 --- a/src/cpu/kernels/CpuSubKernel.cpp +++ b/src/cpu/kernels/CpuSubKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,11 +25,13 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/add/generic/neon/impl.h" +#include "src/cpu/kernels/sub/neon/impl.h" #include "src/cpu/kernels/sub/neon/list.h" #if defined(ENABLE_FP32_KERNELS) @@ -51,70 +53,48 @@ namespace using CpuSubKernelDataTypeISASelectorData = CpuAddKernelDataTypeISASelectorData; using CpuSubKernelDataTypeISASelectorDataPtr = CpuAddKernelDataTypeISASelectorDataPtr; -static const std::vector available_kernels = -{ - { - "neon_fp32_sub", - [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(arm_compute::cpu::sub_same_neon) - }, - { - "neon_fp16_sub", - [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::sub_same_neon) - }, - { - "neon_u8_sub", - [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::U8); }, - REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon) - }, - { - "neon_s16_sub", - [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::S16); }, - REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon) - }, - { - "neon_s32_sub", - [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::S32); }, - REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon) - }, - { - "neon_qu8_sub_fixedpoint", - [](const CpuSubKernelDataTypeISASelectorData & data) { return ((data.dt == DataType::QASYMM8) && data.can_use_fixedpoint); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon_fixedpoint) - }, - { - "neon_qs8_sub_fixedpoint", - [](const 
CpuSubKernelDataTypeISASelectorData & data) { return ((data.dt == DataType::QASYMM8_SIGNED) && data.can_use_fixedpoint); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon_fixedpoint) - }, - { - "neon_qu8_sub", - [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon) - }, - { - "neon_qs8_sub", - [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon) - }, - { - "neon_qs16_sub", - [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QSYMM16); }, - REGISTER_QSYMM16_NEON(arm_compute::cpu::sub_qsymm16_neon) - }, +static const std::vector available_kernels = { + {"neon_fp32_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(arm_compute::cpu::sub_same_neon)}, + {"neon_fp16_sub", + [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::sub_same_neon_fp16)}, + {"neon_u8_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::U8); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon)}, + {"neon_s16_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S16); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon)}, + {"neon_s32_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S32); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon)}, + {"neon_qu8_sub_fixedpoint", + [](const CpuSubKernelDataTypeISASelectorData &data) + { return ((data.dt == DataType::QASYMM8) && data.can_use_fixedpoint); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon_fixedpoint)}, + {"neon_qs8_sub_fixedpoint", + [](const CpuSubKernelDataTypeISASelectorData &data) + { return ((data.dt == DataType::QASYMM8_SIGNED) && data.can_use_fixedpoint); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon_fixedpoint)}, + {"neon_qu8_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon)}, + {"neon_qs8_sub", + [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon)}, + {"neon_qs16_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QSYMM16); }, + REGISTER_QSYMM16_NEON(arm_compute::cpu::sub_qsymm16_neon)}, }; -inline Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy) +inline Status +validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy) { ARM_COMPUTE_UNUSED(policy); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16, - DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, + DataType::S32, DataType::F16, DataType::F32); 
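The available_kernels table reformatted in the hunk above follows the library's usual micro-kernel selection pattern: each entry pairs a readable name, a selector predicate over the data type and ISA flags, and a function pointer wrapped in a REGISTER_* macro, and get_implementation() returns the first entry whose predicate matches. The following is a minimal standalone sketch of that pattern; every name in it (SelectorData, SubImpl, select_impl, the dummy kernels) is illustrative only and is not the library's actual API.

// Minimal sketch of the selector-table pattern used by available_kernels above.
// All names here are illustrative, not the library's real types.
#include <cstdio>
#include <functional>
#include <vector>

enum class DataType
{
    F32,
    F16,
    QASYMM8
};

struct SelectorData
{
    DataType dt;
    bool     fp16_supported;
};

struct SubImpl
{
    const char                               *name;        // used for logging/profiling
    std::function<bool(const SelectorData &)> is_selected; // predicate over data type / ISA
    void (*ukernel)();                                     // micro-kernel to run
};

void sub_fp32() { std::puts("running fp32 sub kernel"); }
void sub_fp16() { std::puts("running fp16 sub kernel"); }
void sub_q8()   { std::puts("running qasymm8 sub kernel"); }

// The table is scanned in order, so more specialised kernels are listed
// before generic fallbacks and the first matching entry wins.
static const std::vector<SubImpl> impls = {
    {"sketch_fp16_sub", [](const SelectorData &d) { return d.dt == DataType::F16 && d.fp16_supported; }, sub_fp16},
    {"sketch_fp32_sub", [](const SelectorData &d) { return d.dt == DataType::F32; }, sub_fp32},
    {"sketch_q8_sub", [](const SelectorData &d) { return d.dt == DataType::QASYMM8; }, sub_q8},
};

const SubImpl *select_impl(const SelectorData &d)
{
    for (const auto &impl : impls)
    {
        if (impl.is_selected(d))
        {
            return &impl;
        }
    }
    return nullptr; // mirrors the uk == nullptr check in validate_arguments()
}

int main()
{
    const SubImpl *uk = select_impl({DataType::F16, /* fp16_supported = */ true});
    if (uk != nullptr)
    {
        std::printf("selected %s\n", uk->name);
        uk->ukernel();
    }
    return 0;
}

In the real kernels the REGISTER_FP16_NEON / REGISTER_QASYMM8_NEON wrappers compile the pointer down to nullptr when the corresponding data-type support is disabled at build time, which is why validate_arguments() in this file re-runs the selection and rejects a null uk->ukernel.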
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1); const auto can_use_fixedpoint = sub_q8_neon_fixedpoint_possible(&src0, &src1, &dst); - const auto uk = CpuSubKernel::get_implementation(CpuSubKernelDataTypeISASelectorData{ src0.data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint }); + const auto uk = CpuSubKernel::get_implementation( + CpuSubKernelDataTypeISASelectorData{src0.data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); @@ -125,7 +105,7 @@ inline Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src "Convert policy cannot be WRAP if datatype is quantized"); // Validate in case of configured dst - if(dst.total_size() > 0) + if (dst.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst); ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), @@ -147,7 +127,8 @@ void CpuSubKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I set_data_type_if_unknown(*dst, src0->data_type()); const auto can_use_fixedpoint = sub_q8_neon_fixedpoint_possible(src0, src1, dst); - const auto uk = CpuSubKernel::get_implementation(CpuSubKernelDataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint }); + const auto uk = CpuSubKernel::get_implementation( + CpuSubKernelDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); @@ -167,14 +148,14 @@ size_t CpuSubKernel::get_mws(const CPUInfo &platform, size_t thread_count) const ARM_COMPUTE_UNUSED(thread_count); #if defined(ENABLE_FP32_KERNELS) - if(this->_run_method == &sub_same_neon) + if (this->_run_method == &sub_same_neon) { size_t mws = ICPPKernel::default_mws; - if(platform.get_cpu_model() == CPUModel::N1) + if (platform.get_cpu_model() == CPUModel::N1) { mws = default_mws_N1_fp32_neon; } - else if(platform.get_cpu_model() == CPUModel::V1) + else if (platform.get_cpu_model() == CPUModel::V1) { mws = default_mws_V1_fp32_neon; } @@ -184,7 +165,7 @@ size_t CpuSubKernel::get_mws(const CPUInfo &platform, size_t thread_count) const } // tensor is 1D or was re-interpreted as 1D - if(this->window().shape().num_dimensions() == 1) + if (this->window().shape().num_dimensions() == 1) { return mws; } @@ -203,7 +184,8 @@ size_t CpuSubKernel::get_mws(const CPUInfo &platform, size_t thread_count) const return ICPPKernel::default_mws; } -Status CpuSubKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy) +Status +CpuSubKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst, policy)); diff --git a/src/cpu/kernels/CpuSubKernel.h b/src/cpu/kernels/CpuSubKernel.h index cd209d1837d51f424ec8b95d589e53eb7968a7b9..5fa0dc411ae85f9beb029be1ef89cd0f9b753ddf 100644 --- a/src/cpu/kernels/CpuSubKernel.h +++ b/src/cpu/kernels/CpuSubKernel.h @@ -37,7 +37,8 @@ namespace kernels class CpuSubKernel : public ICpuKernel { private: - using SubKernelPtr = std::add_pointer::type; + using SubKernelPtr = std::add_pointer::type; using CpuSubKernelDataTypeISASelectorDataPtr = CpuAddKernelDataTypeISASelectorDataPtr; public: @@ -68,7 +69,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo 
*dst, ConvertPolicy policy); + static Status + validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; @@ -99,9 +101,9 @@ public: private: ConvertPolicy _policy{}; - SubKernelPtr _run_method{ nullptr }; + SubKernelPtr _run_method{nullptr}; std::string _name{}; - size_t _split_dimension{ Window::DimY }; + size_t _split_dimension{Window::DimY}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuTransposeKernel.cpp b/src/cpu/kernels/CpuTransposeKernel.cpp index b2cebc4230679133d75b6392414fa69e48c3c114..0f762ba0417f8cf2478be355b8c9f00e072c44e8 100644 --- a/src/cpu/kernels/CpuTransposeKernel.cpp +++ b/src/cpu/kernels/CpuTransposeKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -28,8 +28,9 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -45,7 +46,7 @@ namespace { unsigned int num_elems_processed(size_t element_size) { - switch(element_size) + switch (element_size) { case 1: return 8; @@ -81,10 +82,10 @@ void transpose_8bit_elements(const ITensor *in, ITensor *out, const Window &wind Window window_in(window); window_in.set(Window::DimX, Window::Dimension(0, 1, 1)); - if(left_over_loop_y) + if (left_over_loop_y) { // Check if window_end_y_multiple_of is greater than window_start_y - if(window_end_y_multiple_of > window_start_y) + if (window_end_y_multiple_of > window_start_y) { window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y)); } @@ -101,87 +102,121 @@ void transpose_8bit_elements(const ITensor *in, ITensor *out, const Window &wind Iterator output(out, window_out); // Run the SIMD path if and only if the input is not a row-vector - if(in->info()->dimension(1) != 1) + if (in->info()->dimension(1) != 1) { Iterator input(in, window_in); - execute_window_loop(window_in, [&](const Coordinates & id) - { - // Compute 8x8 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + window_in, + [&](const Coordinates &id) { - const uint8x8_t row0 = vld1_u8(reinterpret_cast(input.ptr() + x + 0 * input_stride_in_bytes)); - const uint8x8_t row1 = vld1_u8(reinterpret_cast(input.ptr() + x + 1 * input_stride_in_bytes)); - const uint8x8_t row2 = vld1_u8(reinterpret_cast(input.ptr() + x + 2 * input_stride_in_bytes)); - const uint8x8_t row3 = vld1_u8(reinterpret_cast(input.ptr() + x + 3 * input_stride_in_bytes)); - const uint8x8_t row4 = vld1_u8(reinterpret_cast(input.ptr() + x + 4 * input_stride_in_bytes)); - const uint8x8_t row5 = vld1_u8(reinterpret_cast(input.ptr() + x + 5 * input_stride_in_bytes)); - const uint8x8_t row6 = vld1_u8(reinterpret_cast(input.ptr() + x + 6 * input_stride_in_bytes)); - const uint8x8_t row7 = vld1_u8(reinterpret_cast(input.ptr() + x + 7 * input_stride_in_bytes)); - - // Transpose 2x2 - const uint8x8x2_t k0_u8 = vtrn_u8(row0, row1); - const uint8x8x2_t k1_u8 = vtrn_u8(row2, row3); - const uint8x8x2_t k2_u8 = vtrn_u8(row4, row5); - const uint8x8x2_t k3_u8 = 
vtrn_u8(row6, row7); - - // Transpose 4x4 - const uint16x4x2_t k0_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[0]), vreinterpret_u16_u8(k1_u8.val[0])); - const uint16x4x2_t k1_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[1]), vreinterpret_u16_u8(k1_u8.val[1])); - const uint16x4x2_t k2_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[0]), vreinterpret_u16_u8(k3_u8.val[0])); - const uint16x4x2_t k3_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[1]), vreinterpret_u16_u8(k3_u8.val[1])); - - // Transpose 8x8 - const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k2_u16.val[0])); - const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k2_u16.val[1])); - const uint32x2x2_t k2_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[0]), vreinterpret_u32_u16(k3_u16.val[0])); - const uint32x2x2_t k3_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[1]), vreinterpret_u32_u16(k3_u16.val[1])); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes; - - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[0]))); - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[0]))); - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[0]))); - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[0]))); - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[1]))); - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[1]))); - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[1]))); - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[1]))); - } - - // Compute left-over elements along the x dimension (1x8) - for(; x < window_end_x; ++x) - { - const uint8_t val0 = *(input.ptr() + x + 0 * input_stride_in_bytes); - const uint8_t val1 = *(input.ptr() + x + 1 * input_stride_in_bytes); - const uint8_t val2 = *(input.ptr() + x + 2 * input_stride_in_bytes); - const uint8_t val3 = *(input.ptr() + x + 3 * input_stride_in_bytes); - const uint8_t val4 = *(input.ptr() + x + 4 * input_stride_in_bytes); - const uint8_t val5 = *(input.ptr() + x + 5 * input_stride_in_bytes); - const uint8_t val6 = *(input.ptr() + x + 6 * input_stride_in_bytes); - const uint8_t val7 = *(input.ptr() + x + 7 * input_stride_in_bytes); - - uint8x8_t result = vdup_n_u8(0); - result = vset_lane_u8(val0, result, 0); - result = vset_lane_u8(val1, result, 1); - result = vset_lane_u8(val2, result, 2); - result = vset_lane_u8(val3, result, 3); - result = vset_lane_u8(val4, result, 4); - result = vset_lane_u8(val5, result, 5); - result = vset_lane_u8(val6, result, 6); - result = vset_lane_u8(val7, result, 7); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes; - - vst1_u8(output.ptr() + dst_offset_in_bytes, result); - } - }, - input, 
output); + // Compute 8x8 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x8_t row0 = + vld1_u8(reinterpret_cast(input.ptr() + x + 0 * input_stride_in_bytes)); + const uint8x8_t row1 = + vld1_u8(reinterpret_cast(input.ptr() + x + 1 * input_stride_in_bytes)); + const uint8x8_t row2 = + vld1_u8(reinterpret_cast(input.ptr() + x + 2 * input_stride_in_bytes)); + const uint8x8_t row3 = + vld1_u8(reinterpret_cast(input.ptr() + x + 3 * input_stride_in_bytes)); + const uint8x8_t row4 = + vld1_u8(reinterpret_cast(input.ptr() + x + 4 * input_stride_in_bytes)); + const uint8x8_t row5 = + vld1_u8(reinterpret_cast(input.ptr() + x + 5 * input_stride_in_bytes)); + const uint8x8_t row6 = + vld1_u8(reinterpret_cast(input.ptr() + x + 6 * input_stride_in_bytes)); + const uint8x8_t row7 = + vld1_u8(reinterpret_cast(input.ptr() + x + 7 * input_stride_in_bytes)); + + // Transpose 2x2 + const uint8x8x2_t k0_u8 = vtrn_u8(row0, row1); + const uint8x8x2_t k1_u8 = vtrn_u8(row2, row3); + const uint8x8x2_t k2_u8 = vtrn_u8(row4, row5); + const uint8x8x2_t k3_u8 = vtrn_u8(row6, row7); + + // Transpose 4x4 + const uint16x4x2_t k0_u16 = + vtrn_u16(vreinterpret_u16_u8(k0_u8.val[0]), vreinterpret_u16_u8(k1_u8.val[0])); + const uint16x4x2_t k1_u16 = + vtrn_u16(vreinterpret_u16_u8(k0_u8.val[1]), vreinterpret_u16_u8(k1_u8.val[1])); + const uint16x4x2_t k2_u16 = + vtrn_u16(vreinterpret_u16_u8(k2_u8.val[0]), vreinterpret_u16_u8(k3_u8.val[0])); + const uint16x4x2_t k3_u16 = + vtrn_u16(vreinterpret_u16_u8(k2_u8.val[1]), vreinterpret_u16_u8(k3_u8.val[1])); + + // Transpose 8x8 + const uint32x2x2_t k0_u32 = + vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k2_u16.val[0])); + const uint32x2x2_t k1_u32 = + vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k2_u16.val[1])); + const uint32x2x2_t k2_u32 = + vtrn_u32(vreinterpret_u32_u16(k1_u16.val[0]), vreinterpret_u32_u16(k3_u16.val[0])); + const uint32x2x2_t k3_u32 = + vtrn_u32(vreinterpret_u32_u16(k1_u16.val[1]), vreinterpret_u32_u16(k3_u16.val[1])); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes; + + vst1_u8( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), + vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[0]))); + vst1_u8( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), + vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[0]))); + vst1_u8( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), + vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[0]))); + vst1_u8( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), + vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[0]))); + vst1_u8( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), + vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[1]))); + vst1_u8( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), + vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[1]))); + vst1_u8( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), + vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[1]))); + vst1_u8( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), + vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[1]))); + } + + // Compute left-over elements along the x dimension 
(1x8) + for (; x < window_end_x; ++x) + { + const uint8_t val0 = *(input.ptr() + x + 0 * input_stride_in_bytes); + const uint8_t val1 = *(input.ptr() + x + 1 * input_stride_in_bytes); + const uint8_t val2 = *(input.ptr() + x + 2 * input_stride_in_bytes); + const uint8_t val3 = *(input.ptr() + x + 3 * input_stride_in_bytes); + const uint8_t val4 = *(input.ptr() + x + 4 * input_stride_in_bytes); + const uint8_t val5 = *(input.ptr() + x + 5 * input_stride_in_bytes); + const uint8_t val6 = *(input.ptr() + x + 6 * input_stride_in_bytes); + const uint8_t val7 = *(input.ptr() + x + 7 * input_stride_in_bytes); + + uint8x8_t result = vdup_n_u8(0); + result = vset_lane_u8(val0, result, 0); + result = vset_lane_u8(val1, result, 1); + result = vset_lane_u8(val2, result, 2); + result = vset_lane_u8(val3, result, 3); + result = vset_lane_u8(val4, result, 4); + result = vset_lane_u8(val5, result, 5); + result = vset_lane_u8(val6, result, 6); + result = vset_lane_u8(val7, result, 7); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes; + + vst1_u8(output.ptr() + dst_offset_in_bytes, result); + } + }, + input, output); } - if(left_over_loop_y) + if (left_over_loop_y) { window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1)); window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1)); @@ -190,16 +225,18 @@ void transpose_8bit_elements(const ITensor *in, ITensor *out, const Window &wind Iterator output(out, window_out); // Compute left-over elements along the y dimension (1x1) - execute_window_loop(window_in, [&](const Coordinates & id) - { - const uint8_t val0 = *input.ptr(); + execute_window_loop( + window_in, + [&](const Coordinates &id) + { + const uint8_t val0 = *input.ptr(); - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + id.x() * output_stride_in_bytes; + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + id.x() * output_stride_in_bytes; - *(output.ptr() + dst_offset_in_bytes) = val0; - }, - input, output); + *(output.ptr() + dst_offset_in_bytes) = val0; + }, + input, output); } } @@ -220,10 +257,10 @@ void transpose_16bit_elements(const ITensor *in, ITensor *out, const Window &win Window window_in(window); window_in.set(Window::DimX, Window::Dimension(0, 1, 1)); - if(left_over_loop_y) + if (left_over_loop_y) { // Check if window_end_y_multiple_of is greater than window_start_y - if(window_end_y_multiple_of > window_start_y) + if (window_end_y_multiple_of > window_start_y) { window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y)); } @@ -240,61 +277,77 @@ void transpose_16bit_elements(const ITensor *in, ITensor *out, const Window &win Iterator output(out, window_out); // Run the SIMD path if and only if the input is not a row-vector - if(in->info()->dimension(1) != 1) + if (in->info()->dimension(1) != 1) { Iterator input(in, window_in); - execute_window_loop(window_in, [&](const Coordinates & id) - { - // Compute 4x4 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + window_in, + [&](const Coordinates &id) { - const uint16x4_t row0 = vld1_u16(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); - const uint16x4_t row1 = vld1_u16(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); - const uint16x4_t row2 = 
vld1_u16(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); - const uint16x4_t row3 = vld1_u16(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); - - // Transpose 2x2 - const uint16x4x2_t k0_u16 = vtrn_u16(row0, row1); - const uint16x4x2_t k1_u16 = vtrn_u16(row2, row3); - - // Transpose 4x4 - const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k1_u16.val[0])); - const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k1_u16.val[1])); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes; - - vst1_u16(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[0])); - vst1_u16(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[0])); - vst1_u16(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[1])); - vst1_u16(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[1])); - } - - // Compute left-over elements (1x4) - for(; x < window_end_x; ++x) - { - const uint16_t val0 = *(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); - const uint16_t val1 = *(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); - const uint16_t val2 = *(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); - const uint16_t val3 = *(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); - - uint16x4_t result = vdup_n_u16(0); - result = vset_lane_u16(val0, result, 0); - result = vset_lane_u16(val1, result, 1); - result = vset_lane_u16(val2, result, 2); - result = vset_lane_u16(val3, result, 3); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes; - - vst1_u16(reinterpret_cast(output.ptr() + dst_offset_in_bytes), result); - } - }, - input, output); + // Compute 4x4 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint16x4_t row0 = + vld1_u16(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); + const uint16x4_t row1 = + vld1_u16(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); + const uint16x4_t row2 = + vld1_u16(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); + const uint16x4_t row3 = + vld1_u16(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); + + // Transpose 2x2 + const uint16x4x2_t k0_u16 = vtrn_u16(row0, row1); + const uint16x4x2_t k1_u16 = vtrn_u16(row2, row3); + + // Transpose 4x4 + const uint32x2x2_t k0_u32 = + vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k1_u16.val[0])); + const uint32x2x2_t k1_u32 = + vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k1_u16.val[1])); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes; + + vst1_u16( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), + vreinterpret_u16_u32(k0_u32.val[0])); + vst1_u16( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), + vreinterpret_u16_u32(k1_u32.val[0])); + vst1_u16( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), + vreinterpret_u16_u32(k0_u32.val[1])); + 
vst1_u16( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), + vreinterpret_u16_u32(k1_u32.val[1])); + } + + // Compute left-over elements (1x4) + for (; x < window_end_x; ++x) + { + const uint16_t val0 = *(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); + const uint16_t val1 = *(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); + const uint16_t val2 = *(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); + const uint16_t val3 = *(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); + + uint16x4_t result = vdup_n_u16(0); + result = vset_lane_u16(val0, result, 0); + result = vset_lane_u16(val1, result, 1); + result = vset_lane_u16(val2, result, 2); + result = vset_lane_u16(val3, result, 3); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes; + + vst1_u16(reinterpret_cast(output.ptr() + dst_offset_in_bytes), result); + } + }, + input, output); } - if(left_over_loop_y) + if (left_over_loop_y) { window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1)); window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1)); @@ -303,16 +356,18 @@ void transpose_16bit_elements(const ITensor *in, ITensor *out, const Window &win Iterator output(out, window_out); // Compute left-over elements along the y dimension (1x1) - execute_window_loop(window_in, [&](const Coordinates & id) - { - const uint16_t val0 = *(reinterpret_cast(input.ptr())); + execute_window_loop( + window_in, + [&](const Coordinates &id) + { + const uint16_t val0 = *(reinterpret_cast(input.ptr())); - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + id.x() * output_stride_in_bytes; + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + id.x() * output_stride_in_bytes; - *(reinterpret_cast(output.ptr() + dst_offset_in_bytes)) = val0; - }, - input, output); + *(reinterpret_cast(output.ptr() + dst_offset_in_bytes)) = val0; + }, + input, output); } } @@ -347,10 +402,10 @@ void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &win Window window_in(window); window_in.set(Window::DimX, Window::Dimension(0, 1, 1)); - if(left_over_loop_y) + if (left_over_loop_y) { // Check if window_end_y_multiple_of is greater than window_start_y - if(window_end_y_multiple_of > window_start_y) + if (window_end_y_multiple_of > window_start_y) { window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y)); } @@ -367,102 +422,160 @@ void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &win Iterator output(out, window_out); // Run the SIMD path if and only if the input is not a row-vector - if(in->info()->dimension(1) != 1) + if (in->info()->dimension(1) != 1) { Iterator input(in, window_in); - execute_window_loop(window_in, [&](const Coordinates & id) - { - // Compute 8x8 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Load - const uint32x4x2_t row0 = vld1q_u32_x2_(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); - const uint32x4x2_t row1 = vld1q_u32_x2_(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); - const uint32x4x2_t row2 = vld1q_u32_x2_(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); - const uint32x4x2_t row3 = 
vld1q_u32_x2_(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); - const uint32x4x2_t row4 = vld1q_u32_x2_(reinterpret_cast(input.ptr() + 4 * input_stride_in_bytes) + x); - const uint32x4x2_t row5 = vld1q_u32_x2_(reinterpret_cast(input.ptr() + 5 * input_stride_in_bytes) + x); - const uint32x4x2_t row6 = vld1q_u32_x2_(reinterpret_cast(input.ptr() + 6 * input_stride_in_bytes) + x); - const uint32x4x2_t row7 = vld1q_u32_x2_(reinterpret_cast(input.ptr() + 7 * input_stride_in_bytes) + x); - - // Transpose 2x4 - const uint32x4x2_t k0_u32 = {vtrn1q_u32(row0.val[0], row1.val[0]), vtrn2q_u32(row0.val[0], row1.val[0])}; - const uint32x4x2_t k1_u32 = {vtrn1q_u32(row0.val[1], row1.val[1]), vtrn2q_u32(row0.val[1], row1.val[1])}; - const uint32x4x2_t k2_u32 = {vtrn1q_u32(row2.val[0], row3.val[0]), vtrn2q_u32(row2.val[0], row3.val[0])}; - const uint32x4x2_t k3_u32 = {vtrn1q_u32(row2.val[1], row3.val[1]), vtrn2q_u32(row2.val[1], row3.val[1])}; - const uint32x4x2_t k4_u32 = {vtrn1q_u32(row4.val[0], row5.val[0]), vtrn2q_u32(row4.val[0], row5.val[0])}; - const uint32x4x2_t k5_u32 = {vtrn1q_u32(row4.val[1], row5.val[1]), vtrn2q_u32(row4.val[1], row5.val[1])}; - const uint32x4x2_t k6_u32 = {vtrn1q_u32(row6.val[0], row7.val[0]), vtrn2q_u32(row6.val[0], row7.val[0])}; - const uint32x4x2_t k7_u32 = {vtrn1q_u32(row6.val[1], row7.val[1]), vtrn2q_u32(row6.val[1], row7.val[1])}; - - // Transpose 2x2 - const uint64x2x2_t k0_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0]))}; - const uint64x2x2_t k1_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1])), vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1]))}; - const uint64x2x2_t k2_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0]))}; - const uint64x2x2_t k3_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1])), vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1]))}; - const uint64x2x2_t k4_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), vreinterpretq_u64_u32(k6_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), vreinterpretq_u64_u32(k6_u32.val[0]))}; - const uint64x2x2_t k5_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1])), vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1]))}; - const uint64x2x2_t k6_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0]))}; - const uint64x2x2_t k7_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1])), vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1]))}; - - // Swap blocks - const uint32x4x2_t col0 = {vreinterpretq_u32_u64(k0_u64.val[0]), vreinterpretq_u32_u64(k4_u64.val[0])}; - const uint32x4x2_t col1 = {vreinterpretq_u32_u64(k1_u64.val[0]), vreinterpretq_u32_u64(k5_u64.val[0])}; - const uint32x4x2_t col2 = {vreinterpretq_u32_u64(k0_u64.val[1]), vreinterpretq_u32_u64(k4_u64.val[1])}; - const uint32x4x2_t col3 = {vreinterpretq_u32_u64(k1_u64.val[1]), vreinterpretq_u32_u64(k5_u64.val[1])}; - const uint32x4x2_t col4 = 
{vreinterpretq_u32_u64(k2_u64.val[0]), vreinterpretq_u32_u64(k6_u64.val[0])}; - const uint32x4x2_t col5 = {vreinterpretq_u32_u64(k3_u64.val[0]), vreinterpretq_u32_u64(k7_u64.val[0])}; - const uint32x4x2_t col6 = {vreinterpretq_u32_u64(k2_u64.val[1]), vreinterpretq_u32_u64(k6_u64.val[1])}; - const uint32x4x2_t col7 = {vreinterpretq_u32_u64(k3_u64.val[1]), vreinterpretq_u32_u64(k7_u64.val[1])}; - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; - - // Store - vst1q_u32_x2_(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), col0); - vst1q_u32_x2_(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), col1); - vst1q_u32_x2_(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), col2); - vst1q_u32_x2_(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), col3); - vst1q_u32_x2_(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), col4); - vst1q_u32_x2_(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), col5); - vst1q_u32_x2_(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), col6); - vst1q_u32_x2_(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), col7); - } - - // Compute left-over elements (8x1) - for(; x < window_end_x; ++x) + execute_window_loop( + window_in, + [&](const Coordinates &id) { - const uint32_t val0 = *(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); - const uint32_t val1 = *(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); - const uint32_t val2 = *(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); - const uint32_t val3 = *(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); - const uint32_t val4 = *(reinterpret_cast(input.ptr() + 4 * input_stride_in_bytes) + x); - const uint32_t val5 = *(reinterpret_cast(input.ptr() + 5 * input_stride_in_bytes) + x); - const uint32_t val6 = *(reinterpret_cast(input.ptr() + 6 * input_stride_in_bytes) + x); - const uint32_t val7 = *(reinterpret_cast(input.ptr() + 7 * input_stride_in_bytes) + x); - - uint32x4_t result0 = vdupq_n_u32(0); - uint32x4_t result1 = vdupq_n_u32(0); - result0 = vsetq_lane_u32(val0, result0, 0); - result0 = vsetq_lane_u32(val1, result0, 1); - result0 = vsetq_lane_u32(val2, result0, 2); - result0 = vsetq_lane_u32(val3, result0, 3); - result1 = vsetq_lane_u32(val4, result1, 0); - result1 = vsetq_lane_u32(val5, result1, 1); - result1 = vsetq_lane_u32(val6, result1, 2); - result1 = vsetq_lane_u32(val7, result1, 3); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; - - vst1q_u32_x2_(reinterpret_cast(output.ptr() + dst_offset_in_bytes), {result0, result1}); - } - }, - input, output); + // Compute 8x8 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Load + const uint32x4x2_t row0 = + vld1q_u32_x2_(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); + const uint32x4x2_t row1 = + vld1q_u32_x2_(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); + const uint32x4x2_t row2 = + vld1q_u32_x2_(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); + const uint32x4x2_t row3 = + vld1q_u32_x2_(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); + const uint32x4x2_t row4 = + 
vld1q_u32_x2_(reinterpret_cast(input.ptr() + 4 * input_stride_in_bytes) + x); + const uint32x4x2_t row5 = + vld1q_u32_x2_(reinterpret_cast(input.ptr() + 5 * input_stride_in_bytes) + x); + const uint32x4x2_t row6 = + vld1q_u32_x2_(reinterpret_cast(input.ptr() + 6 * input_stride_in_bytes) + x); + const uint32x4x2_t row7 = + vld1q_u32_x2_(reinterpret_cast(input.ptr() + 7 * input_stride_in_bytes) + x); + + // Transpose 2x4 + const uint32x4x2_t k0_u32 = {vtrn1q_u32(row0.val[0], row1.val[0]), + vtrn2q_u32(row0.val[0], row1.val[0])}; + const uint32x4x2_t k1_u32 = {vtrn1q_u32(row0.val[1], row1.val[1]), + vtrn2q_u32(row0.val[1], row1.val[1])}; + const uint32x4x2_t k2_u32 = {vtrn1q_u32(row2.val[0], row3.val[0]), + vtrn2q_u32(row2.val[0], row3.val[0])}; + const uint32x4x2_t k3_u32 = {vtrn1q_u32(row2.val[1], row3.val[1]), + vtrn2q_u32(row2.val[1], row3.val[1])}; + const uint32x4x2_t k4_u32 = {vtrn1q_u32(row4.val[0], row5.val[0]), + vtrn2q_u32(row4.val[0], row5.val[0])}; + const uint32x4x2_t k5_u32 = {vtrn1q_u32(row4.val[1], row5.val[1]), + vtrn2q_u32(row4.val[1], row5.val[1])}; + const uint32x4x2_t k6_u32 = {vtrn1q_u32(row6.val[0], row7.val[0]), + vtrn2q_u32(row6.val[0], row7.val[0])}; + const uint32x4x2_t k7_u32 = {vtrn1q_u32(row6.val[1], row7.val[1]), + vtrn2q_u32(row6.val[1], row7.val[1])}; + + // Transpose 2x2 + const uint64x2x2_t k0_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0])), + vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0]))}; + const uint64x2x2_t k1_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1])), + vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1]))}; + const uint64x2x2_t k2_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0])), + vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0]))}; + const uint64x2x2_t k3_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1])), + vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1]))}; + const uint64x2x2_t k4_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), vreinterpretq_u64_u32(k6_u32.val[0])), + vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), vreinterpretq_u64_u32(k6_u32.val[0]))}; + const uint64x2x2_t k5_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1])), + vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1]))}; + const uint64x2x2_t k6_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0])), + vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0]))}; + const uint64x2x2_t k7_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1])), + vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1]))}; + + // Swap blocks + const uint32x4x2_t col0 = {vreinterpretq_u32_u64(k0_u64.val[0]), + vreinterpretq_u32_u64(k4_u64.val[0])}; + const uint32x4x2_t col1 = {vreinterpretq_u32_u64(k1_u64.val[0]), + vreinterpretq_u32_u64(k5_u64.val[0])}; + const uint32x4x2_t col2 = {vreinterpretq_u32_u64(k0_u64.val[1]), + vreinterpretq_u32_u64(k4_u64.val[1])}; + const uint32x4x2_t col3 = {vreinterpretq_u32_u64(k1_u64.val[1]), + vreinterpretq_u32_u64(k5_u64.val[1])}; + const uint32x4x2_t col4 = {vreinterpretq_u32_u64(k2_u64.val[0]), + 
vreinterpretq_u32_u64(k6_u64.val[0])}; + const uint32x4x2_t col5 = {vreinterpretq_u32_u64(k3_u64.val[0]), + vreinterpretq_u32_u64(k7_u64.val[0])}; + const uint32x4x2_t col6 = {vreinterpretq_u32_u64(k2_u64.val[1]), + vreinterpretq_u32_u64(k6_u64.val[1])}; + const uint32x4x2_t col7 = {vreinterpretq_u32_u64(k3_u64.val[1]), + vreinterpretq_u32_u64(k7_u64.val[1])}; + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; + + // Store + vst1q_u32_x2_( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), + col0); + vst1q_u32_x2_( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), + col1); + vst1q_u32_x2_( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), + col2); + vst1q_u32_x2_( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), + col3); + vst1q_u32_x2_( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), + col4); + vst1q_u32_x2_( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), + col5); + vst1q_u32_x2_( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), + col6); + vst1q_u32_x2_( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), + col7); + } + + // Compute left-over elements (8x1) + for (; x < window_end_x; ++x) + { + const uint32_t val0 = *(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); + const uint32_t val1 = *(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); + const uint32_t val2 = *(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); + const uint32_t val3 = *(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); + const uint32_t val4 = *(reinterpret_cast(input.ptr() + 4 * input_stride_in_bytes) + x); + const uint32_t val5 = *(reinterpret_cast(input.ptr() + 5 * input_stride_in_bytes) + x); + const uint32_t val6 = *(reinterpret_cast(input.ptr() + 6 * input_stride_in_bytes) + x); + const uint32_t val7 = *(reinterpret_cast(input.ptr() + 7 * input_stride_in_bytes) + x); + + uint32x4_t result0 = vdupq_n_u32(0); + uint32x4_t result1 = vdupq_n_u32(0); + result0 = vsetq_lane_u32(val0, result0, 0); + result0 = vsetq_lane_u32(val1, result0, 1); + result0 = vsetq_lane_u32(val2, result0, 2); + result0 = vsetq_lane_u32(val3, result0, 3); + result1 = vsetq_lane_u32(val4, result1, 0); + result1 = vsetq_lane_u32(val5, result1, 1); + result1 = vsetq_lane_u32(val6, result1, 2); + result1 = vsetq_lane_u32(val7, result1, 3); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; + + vst1q_u32_x2_(reinterpret_cast(output.ptr() + dst_offset_in_bytes), {result0, result1}); + } + }, + input, output); } - if(left_over_loop_y) + if (left_over_loop_y) { window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1)); window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1)); @@ -471,40 +584,42 @@ void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &win Iterator output(out, window_out); // Compute left-over elements along the y dimension (1x1) - execute_window_loop(window_in, [&](const Coordinates & id) - { - const uint32_t val0 = *(reinterpret_cast(input.ptr())); + execute_window_loop( + window_in, + [&](const Coordinates &id) + { + const uint32_t val0 = 
*(reinterpret_cast(input.ptr())); - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes; + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes; - *(reinterpret_cast(output.ptr() + dst_offset_in_bytes)) = val0; - }, - input, output); + *(reinterpret_cast(output.ptr() + dst_offset_in_bytes)) = val0; + }, + input, output); } } #else // __aarch64__ void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &window) { - const int window_step_x = 4; - const int window_step_y = 4; - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_start_y = window.y().start(); - const int window_end_y = std::min(window.y().end(), static_cast(in->info()->dimension(1))); - const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y; - const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1]; - const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1]; + const int window_step_x = 4; + const int window_step_y = 4; + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_start_y = window.y().start(); + const int window_end_y = std::min(window.y().end(), static_cast(in->info()->dimension(1))); + const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y; + const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1]; + const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1]; // Check if we need a left-over loop for the y dimension bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0); Window window_in(window); window_in.set(Window::DimX, Window::Dimension(0, 1, 1)); - if(left_over_loop_y) + if (left_over_loop_y) { // Check if window_end_y_multiple_of is greater than window_start_y - if(window_end_y_multiple_of > window_start_y) + if (window_end_y_multiple_of > window_start_y) { window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y)); } @@ -521,60 +636,74 @@ void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &win Iterator output(out, window_out); // Run the SIMD path if and only if the input is not a row-vector - if(in->info()->dimension(1) != 1) + if (in->info()->dimension(1) != 1) { Iterator input(in, window_in); - execute_window_loop(window_in, [&](const Coordinates & id) - { - // Compute 4x4 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint32x4_t row0 = vld1q_u32(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); - const uint32x4_t row1 = vld1q_u32(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); - const uint32x4_t row2 = vld1q_u32(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); - const uint32x4_t row3 = vld1q_u32(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); - - // Transpose 2x2 - const uint32x2x2_t k0_u32 = vtrn_u32(vget_low_u32(row0), vget_low_u32(row1)); - const uint32x2x2_t k1_u32 = vtrn_u32(vget_high_u32(row2), vget_high_u32(row3)); - const uint32x2x2_t k2_u32 = vtrn_u32(vget_high_u32(row0), vget_high_u32(row1)); - const uint32x2x2_t k3_u32 = vtrn_u32(vget_low_u32(row2), vget_low_u32(row3)); - - // Compute destination address - const size_t 
dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; - - // Swap block 01 with block 10 and store - vst1q_u32(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vcombine_u32(k0_u32.val[0], k3_u32.val[0])); - vst1q_u32(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vcombine_u32(k0_u32.val[1], k3_u32.val[1])); - vst1q_u32(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vcombine_u32(k2_u32.val[0], k1_u32.val[0])); - vst1q_u32(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vcombine_u32(k2_u32.val[1], k1_u32.val[1])); - } - - // Compute left-over elements (1x4) - for(; x < window_end_x; ++x) + execute_window_loop( + window_in, + [&](const Coordinates &id) { - const uint32_t val0 = *(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); - const uint32_t val1 = *(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); - const uint32_t val2 = *(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); - const uint32_t val3 = *(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); - - uint32x4_t result = vdupq_n_u32(0); - result = vsetq_lane_u32(val0, result, 0); - result = vsetq_lane_u32(val1, result, 1); - result = vsetq_lane_u32(val2, result, 2); - result = vsetq_lane_u32(val3, result, 3); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; - - vst1q_u32(reinterpret_cast(output.ptr() + dst_offset_in_bytes), result); - } - }, - input, output); + // Compute 4x4 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint32x4_t row0 = + vld1q_u32(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); + const uint32x4_t row1 = + vld1q_u32(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); + const uint32x4_t row2 = + vld1q_u32(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); + const uint32x4_t row3 = + vld1q_u32(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); + + // Transpose 2x2 + const uint32x2x2_t k0_u32 = vtrn_u32(vget_low_u32(row0), vget_low_u32(row1)); + const uint32x2x2_t k1_u32 = vtrn_u32(vget_high_u32(row2), vget_high_u32(row3)); + const uint32x2x2_t k2_u32 = vtrn_u32(vget_high_u32(row0), vget_high_u32(row1)); + const uint32x2x2_t k3_u32 = vtrn_u32(vget_low_u32(row2), vget_low_u32(row3)); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; + + // Swap block 01 with block 10 and store + vst1q_u32( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), + vcombine_u32(k0_u32.val[0], k3_u32.val[0])); + vst1q_u32( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), + vcombine_u32(k0_u32.val[1], k3_u32.val[1])); + vst1q_u32( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), + vcombine_u32(k2_u32.val[0], k1_u32.val[0])); + vst1q_u32( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), + vcombine_u32(k2_u32.val[1], k1_u32.val[1])); + } + + // Compute left-over elements (1x4) + for (; x < window_end_x; ++x) + { + const uint32_t val0 = *(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); + const uint32_t val1 = *(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); + const uint32_t 
val2 = *(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); + const uint32_t val3 = *(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); + + uint32x4_t result = vdupq_n_u32(0); + result = vsetq_lane_u32(val0, result, 0); + result = vsetq_lane_u32(val1, result, 1); + result = vsetq_lane_u32(val2, result, 2); + result = vsetq_lane_u32(val3, result, 3); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; + + vst1q_u32(reinterpret_cast(output.ptr() + dst_offset_in_bytes), result); + } + }, + input, output); } - if(left_over_loop_y) + if (left_over_loop_y) { window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1)); window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1)); @@ -583,16 +712,18 @@ void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &win Iterator output(out, window_out); // Compute left-over elements along the y dimension (1x1) - execute_window_loop(window_in, [&](const Coordinates & id) - { - const uint32_t val0 = *(reinterpret_cast(input.ptr())); + execute_window_loop( + window_in, + [&](const Coordinates &id) + { + const uint32_t val0 = *(reinterpret_cast(input.ptr())); - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes; + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes; - *(reinterpret_cast(output.ptr() + dst_offset_in_bytes)) = val0; - }, - input, output); + *(reinterpret_cast(output.ptr() + dst_offset_in_bytes)) = val0; + }, + input, output); } } #endif // __aarch64__ @@ -606,6 +737,9 @@ void CpuTransposeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src); auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape)); + // Explicitly set the tensor shape to preserve dimensions + dst->set_tensor_shape(dst_shape); + // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst)); @@ -616,7 +750,8 @@ void CpuTransposeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) const unsigned int num_elems_processed_per_iteration_y = num_elems_processed(src->element_size()); // Configure kernel window - Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + Window win = + calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); // The CpuTranspose doesn't need padding so update_window_and_padding() can be skipped Coordinates coord; @@ -637,7 +772,7 @@ Status CpuTransposeKernel::validate(const ITensorInfo *src, const ITensorInfo *d "Element size not supported"); // Validate configured destination - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src); @@ -658,7 +793,7 @@ void CpuTransposeKernel::run_op(ITensorPack &tensors, const Window &window, cons const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); auto dst = tensors.get_tensor(TensorType::ACL_DST); - switch(src->info()->element_size()) + switch (src->info()->element_size()) { case 1: transpose_8bit_elements(src, dst, window); diff --git a/src/cpu/kernels/CpuTransposeKernel.h b/src/cpu/kernels/CpuTransposeKernel.h index 
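The transpose_8bit_elements, transpose_16bit_elements and transpose_32bit_elements functions reformatted above all compute the same mapping over a window, namely that source element (x, y) lands at destination element (y, x): a NEON block transpose (vtrn / vtrn1q / vtrn2q on 8x8 or 4x4 tiles) handles the bulk of the tensor, and scalar loops handle the left-over columns and rows. As a reference for what those kernels produce, here is a plain scalar sketch of a byte-strided 2-D transpose; all names are illustrative and the stride handling only mirrors the kernels' *_stride_in_bytes bookkeeping in spirit.

// Scalar reference for the strided 2-D transpose that the NEON kernels above
// implement in 8x8 / 4x4 blocks. Strides are expressed in bytes.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

template <typename T>
void transpose_reference(const uint8_t *src, uint8_t *dst, size_t width, size_t height,
                         size_t src_row_stride_bytes, size_t dst_row_stride_bytes)
{
    for (size_t y = 0; y < height; ++y)
    {
        for (size_t x = 0; x < width; ++x)
        {
            T val;
            std::memcpy(&val, src + y * src_row_stride_bytes + x * sizeof(T), sizeof(T));
            // Element (x, y) of the source lands at (y, x) of the destination.
            std::memcpy(dst + x * dst_row_stride_bytes + y * sizeof(T), &val, sizeof(T));
        }
    }
}

int main()
{
    const size_t          w   = 3, h = 2;
    std::vector<uint32_t> src = {1, 2, 3, 4, 5, 6}; // 2 rows x 3 columns
    std::vector<uint32_t> dst(w * h, 0);            // becomes 3 rows x 2 columns

    transpose_reference<uint32_t>(reinterpret_cast<const uint8_t *>(src.data()),
                                  reinterpret_cast<uint8_t *>(dst.data()), w, h,
                                  w * sizeof(uint32_t),  // source row stride
                                  h * sizeof(uint32_t)); // destination row stride

    for (size_t row = 0; row < w; ++row)
    {
        for (size_t col = 0; col < h; ++col)
        {
            std::printf("%u ", static_cast<unsigned>(dst[row * h + col]));
        }
        std::printf("\n"); // prints: 1 4 / 2 5 / 3 6
    }
    return 0;
}

The NEON paths amortise this per-element work by loading 8 (or 4) whole rows at a time and interleaving them with vtrn-style instructions, which is why each kernel above has a blocked main loop followed by per-element left-over loops along x and y.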
cb85daeb40f1b4de9ac09286517cb6bd1f91d21f..e79a405677cdbd8adbb863117e85a39a3d238735 100644 --- a/src/cpu/kernels/CpuTransposeKernel.h +++ b/src/cpu/kernels/CpuTransposeKernel.h @@ -54,7 +54,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuWeightsReshapeKernel.cpp b/src/cpu/kernels/CpuWeightsReshapeKernel.cpp index 2ccc9779954f4956d2f8d1ac95a31f00f0219482..297ba63826fe39c68a62d9fefa12d0d750abba92 100644 --- a/src/cpu/kernels/CpuWeightsReshapeKernel.cpp +++ b/src/cpu/kernels/CpuWeightsReshapeKernel.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -38,7 +39,7 @@ namespace { TensorShape get_output_shape(const ITensorInfo *src, bool has_bias) { - TensorShape output_shape{ src->tensor_shape() }; + TensorShape output_shape{src->tensor_shape()}; output_shape.collapse(3); const size_t tmp_dim = output_shape[0]; @@ -54,20 +55,22 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *biases, con //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - if(biases != nullptr) + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(src->data_type())); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases); ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 4) && (biases->num_dimensions() != 1)); ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->num_dimensions() != 2)); ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 4) && (biases->dimension(0) != src->tensor_shape()[3])); - ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->dimension(0) != src->tensor_shape()[3] || biases->dimension(1) != src->tensor_shape()[4])); + ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->dimension(0) != src->tensor_shape()[3] || + biases->dimension(1) != src->tensor_shape()[4])); } // Checks performed when output is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), get_output_shape(src, biases != nullptr)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), + get_output_shape(src, biases != nullptr)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); } @@ -84,9 +87,7 @@ void CpuWeightsReshapeKernel::configure(const ITensorInfo *src, const ITensorInf auto_init_if_empty(*dst, src->clone()->set_tensor_shape(get_output_shape(src, (biases != nullptr)))); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, - biases, - dst)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, biases, dst)); // Configure kernel Window window = calculate_max_window(*src, Steps()); @@ -122,44 +123,47 @@ void CpuWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window, // Create iterators Iterator in(src, window); - execute_window_loop(window, [&](const Coordinates & 
id) - { - // Get column index - const int kernel_idx = id[3]; - const int kernel_idz = id[4]; - - // Setup pointers - const uint8_t *tmp_input_ptr = in.ptr(); - uint8_t *tmp_output_ptr = dst->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz)); - const uint8_t *curr_input_row_ptr = tmp_input_ptr; - const uint8_t *curr_input_depth_ptr = tmp_input_ptr; - - // Linearize volume - for(unsigned int d = 0; d < kernel_depth; ++d) + execute_window_loop( + window, + [&](const Coordinates &id) { - for(unsigned int j = 0; j < kernel_size_y; ++j) + // Get column index + const int kernel_idx = id[3]; + const int kernel_idz = id[4]; + + // Setup pointers + const uint8_t *tmp_input_ptr = in.ptr(); + uint8_t *tmp_output_ptr = dst->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz)); + const uint8_t *curr_input_row_ptr = tmp_input_ptr; + const uint8_t *curr_input_depth_ptr = tmp_input_ptr; + + // Linearize volume + for (unsigned int d = 0; d < kernel_depth; ++d) { - for(unsigned int i = 0; i < kernel_size_x; ++i) + for (unsigned int j = 0; j < kernel_size_y; ++j) { - std::memcpy(tmp_output_ptr, tmp_input_ptr, src->info()->element_size()); - tmp_input_ptr += input_stride_x; - tmp_output_ptr += output_stride_y; + for (unsigned int i = 0; i < kernel_size_x; ++i) + { + std::memcpy(tmp_output_ptr, tmp_input_ptr, src->info()->element_size()); + tmp_input_ptr += input_stride_x; + tmp_output_ptr += output_stride_y; + } + curr_input_row_ptr += input_stride_y; + tmp_input_ptr = curr_input_row_ptr; } - curr_input_row_ptr += input_stride_y; - tmp_input_ptr = curr_input_row_ptr; + curr_input_depth_ptr += input_stride_z; + curr_input_row_ptr = curr_input_depth_ptr; + tmp_input_ptr = curr_input_depth_ptr; } - curr_input_depth_ptr += input_stride_z; - curr_input_row_ptr = curr_input_depth_ptr; - tmp_input_ptr = curr_input_depth_ptr; - } - // Add bias - if(biases != nullptr) - { - std::memcpy(tmp_output_ptr, biases->ptr_to_element(Coordinates(kernel_idx, kernel_idz)), src->info()->element_size()); - } - }, - in); + // Add bias + if (biases != nullptr) + { + std::memcpy(tmp_output_ptr, biases->ptr_to_element(Coordinates(kernel_idx, kernel_idz)), + src->info()->element_size()); + } + }, + in); } const char *CpuWeightsReshapeKernel::name() const { @@ -167,4 +171,4 @@ const char *CpuWeightsReshapeKernel::name() const } } // namespace kernels } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuWeightsReshapeKernel.h b/src/cpu/kernels/CpuWeightsReshapeKernel.h index 1a260edc9668e27392e4ab4a2e70b2addad48e36..9310b3c78446f30118acf72470aee2d82f8cd858 100644 --- a/src/cpu/kernels/CpuWeightsReshapeKernel.h +++ b/src/cpu/kernels/CpuWeightsReshapeKernel.h @@ -82,7 +82,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuWinogradConv2dKernel.cpp b/src/cpu/kernels/CpuWinogradConv2dKernel.cpp index 818d87811900bf389b20fe88de0baa9e373d3cf0..52e3f2549c629d5fcb8a7650f5f2fa308dd0b926 100644 --- a/src/cpu/kernels/CpuWinogradConv2dKernel.cpp +++ b/src/cpu/kernels/CpuWinogradConv2dKernel.cpp @@ -28,8 +28,10 @@ namespace arm_compute { namespace cpu { 
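The "Linearize volume" loop in CpuWeightsReshapeKernel::run_op, reindented above, copies one kx x ky x depth filter per window iteration into a single column of the destination matrix and, when a bias tensor is supplied, appends the bias value as that column's last element. The sketch below is a compact scalar illustration of that reshaping, using a plain row-major std::vector layout and hypothetical names rather than the library's strided tensors and iterators.

// Sketch of the weights-reshape ("linearize volume") step: each filter of shape
// [kx, ky, depth] becomes one column of a 2-D matrix and the optional bias is
// appended as the final row. Hypothetical layout, not the library's API.
#include <cstddef>
#include <cstdio>
#include <vector>

std::vector<float> reshape_weights(const std::vector<float> &w, // [num_kernels][depth][ky][kx], row-major
                                   const std::vector<float> *bias, int kx, int ky, int depth, int num_kernels)
{
    const int          col_len = kx * ky * depth + (bias != nullptr ? 1 : 0);
    std::vector<float> out(static_cast<std::size_t>(col_len) * num_kernels, 0.0f);

    for (int n = 0; n < num_kernels; ++n)
    {
        int row = 0;
        for (int d = 0; d < depth; ++d)
        {
            for (int y = 0; y < ky; ++y)
            {
                for (int x = 0; x < kx; ++x)
                {
                    // out is [col_len][num_kernels] row-major: element (row, n).
                    out[static_cast<std::size_t>(row) * num_kernels + n] =
                        w[((static_cast<std::size_t>(n) * depth + d) * ky + y) * kx + x];
                    ++row;
                }
            }
        }
        if (bias != nullptr)
        {
            out[static_cast<std::size_t>(row) * num_kernels + n] = (*bias)[n];
        }
    }
    return out;
}

int main()
{
    // Two 2x2x1 filters plus their biases.
    const std::vector<float> w    = {1, 2, 3, 4, 5, 6, 7, 8};
    const std::vector<float> bias = {0.5f, -0.5f};
    const std::vector<float> out  = reshape_weights(w, &bias, 2, 2, 1, 2);

    // 5 rows (4 weights + bias) x 2 columns (one per filter).
    for (int r = 0; r < 5; ++r)
    {
        std::printf("%g %g\n", out[r * 2 + 0], out[r * 2 + 1]);
    }
    return 0;
}

Laid out this way, a GEMM between the reshaped weights and an im2col'd input computes the convolution, which is what lets a GEMM-based convolution fold the bias into the same matrix multiply.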
-CpuWinogradConv2dTransformInputKernel::CpuWinogradConv2dTransformInputKernel(arm_conv::winograd::WinogradImpl &w_impl, arm_conv::ConvolutionArgs &_c_args, uint32_t nthreads) - : _winograd_impl{ w_impl }, _conv_args{ _c_args }, _nthreads{ nthreads } +CpuWinogradConv2dTransformInputKernel::CpuWinogradConv2dTransformInputKernel(arm_conv::winograd::WinogradImpl &w_impl, + arm_conv::ConvolutionArgs &_c_args, + uint32_t nthreads) + : _winograd_impl{w_impl}, _conv_args{_c_args}, _nthreads{nthreads} { } @@ -49,24 +51,20 @@ void CpuWinogradConv2dTransformInputKernel::run_op(ITensorPack &tensors, const W const size_t input_row_stride = src_strides[height_idx] / element_size_in_bytes; const size_t input_col_stride = src_strides[width_idx] / element_size_in_bytes; const size_t input_batch_stride = src_strides[batch_idx] / element_size_in_bytes; - const auto input_nhwc_ptr = reinterpret_cast(input_nhwc->buffer() + input_nhwc->info()->offset_first_element_in_bytes()); - auto win_transf_ptr = reinterpret_cast(winograd_input_transform->buffer() + winograd_input_transform->info()->offset_first_element_in_bytes()); + const auto input_nhwc_ptr = + reinterpret_cast(input_nhwc->buffer() + input_nhwc->info()->offset_first_element_in_bytes()); + auto win_transf_ptr = reinterpret_cast(winograd_input_transform->buffer() + + winograd_input_transform->info()->offset_first_element_in_bytes()); - _winograd_impl.input_transform->execute( - _conv_args, - input_nhwc_ptr, - input_batch_stride, - input_row_stride, - input_col_stride, - win_transf_ptr, - _winograd_impl.winograd_spec, - workspace->buffer(), - info.thread_id, - _nthreads); + _winograd_impl.input_transform->execute(_conv_args, input_nhwc_ptr, input_batch_stride, input_row_stride, + input_col_stride, win_transf_ptr, _winograd_impl.winograd_spec, + workspace->buffer(), info.thread_id, _nthreads); } -CpuWinogradConv2dTransformOutputKernel::CpuWinogradConv2dTransformOutputKernel(arm_conv::winograd::WinogradImpl &w_impl, arm_conv::ConvolutionArgs &_c_args, uint32_t nthreads) - : _winograd_impl{ w_impl }, _conv_args{ _c_args }, _nthreads{ nthreads } +CpuWinogradConv2dTransformOutputKernel::CpuWinogradConv2dTransformOutputKernel(arm_conv::winograd::WinogradImpl &w_impl, + arm_conv::ConvolutionArgs &_c_args, + uint32_t nthreads) + : _winograd_impl{w_impl}, _conv_args{_c_args}, _nthreads{nthreads} { } @@ -88,28 +86,21 @@ void CpuWinogradConv2dTransformOutputKernel::run_op(ITensorPack &tensors, const const size_t out_row_stride = dst_strides[height_idx] / element_size_in_bytes; const size_t out_col_stride = dst_strides[width_idx] / element_size_in_bytes; const size_t out_batch_stride = dst_strides[batch_idx] / element_size_in_bytes; - const auto wout_transf_ptr = reinterpret_cast(winograd_output_transform->buffer() + winograd_output_transform->info()->offset_first_element_in_bytes()); - auto dst_nhwc_ptr = reinterpret_cast(dst_nhwc->buffer() + dst_nhwc->info()->offset_first_element_in_bytes()); - void *biases_data_ptr = nullptr; - if(biases != nullptr) + const auto wout_transf_ptr = reinterpret_cast( + winograd_output_transform->buffer() + winograd_output_transform->info()->offset_first_element_in_bytes()); + auto dst_nhwc_ptr = + reinterpret_cast(dst_nhwc->buffer() + dst_nhwc->info()->offset_first_element_in_bytes()); + void *biases_data_ptr = nullptr; + if (biases != nullptr) { biases_data_ptr = reinterpret_cast(biases->buffer() + biases->info()->offset_first_element_in_bytes()); } // Output transform - _winograd_impl.output_transform->execute( - _conv_args, - 
wout_transf_ptr, - _winograd_impl.winograd_spec, - biases_data_ptr, - dst_nhwc_ptr, - out_batch_stride, - out_row_stride, - out_col_stride, - workspace->buffer(), - info.thread_id, - _nthreads); + _winograd_impl.output_transform->execute(_conv_args, wout_transf_ptr, _winograd_impl.winograd_spec, biases_data_ptr, + dst_nhwc_ptr, out_batch_stride, out_row_stride, out_col_stride, + workspace->buffer(), info.thread_id, _nthreads); } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuWinogradConv2dKernel.h b/src/cpu/kernels/CpuWinogradConv2dKernel.h index 0170dcae220026059112f82d93f5fa6e81a9fe48..8a3b745e8584b7d3ce7b732207562b1d98938755 100644 --- a/src/cpu/kernels/CpuWinogradConv2dKernel.h +++ b/src/cpu/kernels/CpuWinogradConv2dKernel.h @@ -30,6 +30,7 @@ #include "arm_compute/core/Steps.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/NEON/kernels/assembly/winograd.hpp" #include "src/core/NEON/kernels/convolution/common/tensor.hpp" #include "src/cpu/ICpuKernel.h" @@ -53,7 +54,9 @@ public: /** Prevent instances of this class from being moved it contains references.*/ CpuWinogradConv2dTransformInputKernel &operator=(CpuWinogradConv2dTransformInputKernel &&) = delete; - CpuWinogradConv2dTransformInputKernel(arm_conv::winograd::WinogradImpl &w_impl, arm_conv::ConvolutionArgs &_c_args, uint32_t nthreads); + CpuWinogradConv2dTransformInputKernel(arm_conv::winograd::WinogradImpl &w_impl, + arm_conv::ConvolutionArgs &_c_args, + uint32_t nthreads); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; @@ -83,7 +86,9 @@ public: /** Prevent instances of this class from being moved it contains references.*/ CpuWinogradConv2dTransformOutputKernel &operator=(CpuWinogradConv2dTransformOutputKernel &&) = delete; - CpuWinogradConv2dTransformOutputKernel(arm_conv::winograd::WinogradImpl &w_impl, arm_conv::ConvolutionArgs &_c_args, uint32_t nthreads); + CpuWinogradConv2dTransformOutputKernel(arm_conv::winograd::WinogradImpl &w_impl, + arm_conv::ConvolutionArgs &_c_args, + uint32_t nthreads); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; @@ -95,7 +100,7 @@ public: private: arm_conv::winograd::WinogradImpl &_winograd_impl; - const arm_conv::ConvolutionArgs &_conv_args; + const arm_conv::ConvolutionArgs &_conv_args; uint32_t _nthreads; }; diff --git a/src/cpu/kernels/activation/generic/neon/fp16.cpp b/src/cpu/kernels/activation/generic/neon/fp16.cpp index e51b5b3423356f7668f528e80538d16c3b4a01f0..ddc6dc24cd569bf2dccdaafd5035f345362beb0a 100644 --- a/src/cpu/kernels/activation/generic/neon/fp16.cpp +++ b/src/cpu/kernels/activation/generic/neon/fp16.cpp @@ -31,7 +31,7 @@ namespace cpu { namespace { -constexpr ActFpImplParams Fp16Params = { static_cast(1e-7), 8 }; +constexpr ActFpImplParams Fp16Params = {static_cast(1e-7), 8}; } // namespace void neon_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) @@ -40,4 +40,4 @@ void neon_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLaye } } // namespace cpu } // namespace arm_compute -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ \ No newline at end of file +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git 
a/src/cpu/kernels/activation/generic/neon/fp32.cpp b/src/cpu/kernels/activation/generic/neon/fp32.cpp index 2a3b8a0bfdd7845c345e02d14d3588da681aacf4..e558f8c73ebdad54006f6f275a69319e7107b6f1 100644 --- a/src/cpu/kernels/activation/generic/neon/fp32.cpp +++ b/src/cpu/kernels/activation/generic/neon/fp32.cpp @@ -29,7 +29,7 @@ namespace cpu { namespace { -constexpr ActFpImplParams Fp32Params = { static_cast(1e-24), 4 }; +constexpr ActFpImplParams Fp32Params = {static_cast(1e-24), 4}; } // namespace void neon_fp32_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) { diff --git a/src/cpu/kernels/activation/generic/neon/impl.h b/src/cpu/kernels/activation/generic/neon/impl.h index 05885d84769e997119aa125b3a7fba0bd4ea68a4..afeb6f7f3dfa3b9128cf52c4f8dab4879a0b5426 100644 --- a/src/cpu/kernels/activation/generic/neon/impl.h +++ b/src/cpu/kernels/activation/generic/neon/impl.h @@ -24,6 +24,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Window.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { @@ -56,10 +57,14 @@ inline float16x8_t mask_float_vector(const float16x8_t &in, const uint16x8_t &ma #endif /* __aarch64__ */ template -void fp_neon_activation_impl(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +void fp_neon_activation_impl(const ITensor *src, + ITensor *dst, + const ActivationLayerInfo &act_info, + const Window &window) { /** SIMD vector tag type. */ - using ExactTagType = typename arm_compute::wrapper::traits::neon_bitvector_tag_t; + using ExactTagType = + typename arm_compute::wrapper::traits::neon_bitvector_tag_t; constexpr int window_step_x = P.step_x; const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); @@ -72,12 +77,12 @@ void fp_neon_activation_impl(const ITensor *src, ITensor *dst, const ActivationL // to prevent NAN values caused by zeros in inputs to SQRT. // In case of aarh64, we call vsqrt directly, so we don't use delta. 
#ifndef __aarch64__ - const auto delta = wrapper::vdup_n(static_cast(P.delta), ExactTagType {}); + const auto delta = wrapper::vdup_n(static_cast(P.delta), ExactTagType{}); #else /* #ifndef __aarch64__ */ - const auto const_inv_2 = wrapper::vdup_n(static_cast(0.5f), ExactTagType {}); + const auto const_inv_2 = wrapper::vdup_n(static_cast(0.5f), ExactTagType{}); const auto const_inv_sqrt_2 = wrapper::vdup_n(static_cast(0.70710678118f), ExactTagType{}); #endif /* __aarch64__ */ - const auto const_1 = wrapper::vdup_n(static_cast(1.f), ExactTagType {}); + const auto const_1 = wrapper::vdup_n(static_cast(1.f), ExactTagType{}); const auto const_0 = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); const auto const_6 = wrapper::vdup_n(static_cast(6.f), ExactTagType{}); const auto const_3 = wrapper::vdup_n(static_cast(3.f), ExactTagType{}); @@ -88,143 +93,154 @@ void fp_neon_activation_impl(const ITensor *src, ITensor *dst, const ActivationL const auto vb = wrapper::vdup_n(static_cast(act_info.b()), ExactTagType{}); const auto a = static_cast(act_info.a()); const auto b = static_cast(act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - wrapper::traits::neon_bitvector_t tmp; - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - const auto vin = wrapper::vloadq(input_ptr + x); - switch(act) + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + wrapper::traits::neon_bitvector_t tmp; + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - case ActivationLayerInfo::ActivationFunction::ABS: - tmp = wrapper::vabs(vin); - break; - case ActivationLayerInfo::ActivationFunction::LINEAR: - tmp = wrapper::vmla(vb, va, vin); - break; - case ActivationLayerInfo::ActivationFunction::LOGISTIC: - tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin)))); - break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp = wrapper::vmax(const_0, vin); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - tmp = wrapper::vmin(va, wrapper::vmax(vb, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: - tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin)); - break; - case ActivationLayerInfo::ActivationFunction::SOFT_RELU: - tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin, wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin)))); - break; - case ActivationLayerInfo::ActivationFunction::ELU: - tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1))); - break; - case ActivationLayerInfo::ActivationFunction::SQRT: + const auto vin = wrapper::vloadq(input_ptr + x); + switch (act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = wrapper::vabs(vin); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = wrapper::vmla(vb, va, vin); + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = wrapper::vinv(wrapper::vadd(const_1, 
wrapper::vexpq(wrapper::vneg(vin)))); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = wrapper::vmax(const_0, vin); + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = wrapper::vmin(va, wrapper::vmax(vb, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin)); + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin, + wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin)))); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, + wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1))); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: #ifdef __aarch64__ - tmp = wrapper::vsqrt(vin); + tmp = wrapper::vsqrt(vin); #else /* __aarch64__ */ { const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0.f, ExactTagType{})); - tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask)))); - tmp = mask_float_vector(tmp, wrapper::vnot(bitmask)); + tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask)))); + tmp = mask_float_vector(tmp, wrapper::vnot(bitmask)); } #endif /* __aarch64__ */ - break; - case ActivationLayerInfo::ActivationFunction::SQUARE: - tmp = wrapper::vmul(vin, vin); - break; - case ActivationLayerInfo::ActivationFunction::TANH: - tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin))); - break; - case ActivationLayerInfo::ActivationFunction::IDENTITY: - tmp = vin; - break; - case ActivationLayerInfo::ActivationFunction::HARD_SWISH: - tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_6, wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3))))); - break; - case ActivationLayerInfo::ActivationFunction::SWISH: - tmp = wrapper::vmul(vin, wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(wrapper::vmul(va, vin)))))); - break; + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = wrapper::vmul(vin, vin); + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin))); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = vin; + break; + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = wrapper::vmul( + vin, + wrapper::vmul(const_inv_6, + wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3))))); + break; + case ActivationLayerInfo::ActivationFunction::SWISH: + tmp = wrapper::vmul(vin, wrapper::vinv(wrapper::vadd( + const_1, wrapper::vexpq(wrapper::vneg(wrapper::vmul(va, vin)))))); + break; #ifdef __aarch64__ - case ActivationLayerInfo::ActivationFunction::GELU: - tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_2, wrapper::vadd(const_1, wrapper::verf(wrapper::vmul(vin, const_inv_sqrt_2))))); - break; + case ActivationLayerInfo::ActivationFunction::GELU: + tmp = wrapper::vmul( + vin, + wrapper::vmul(const_inv_2, + wrapper::vadd(const_1, wrapper::verf(wrapper::vmul(vin, const_inv_sqrt_2))))); + break; #endif /* __aarch64__ */ - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + wrapper::vstore(output_ptr + x, tmp); } - 
wrapper::vstore(output_ptr + x, tmp); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const T in = *(reinterpret_cast(input_ptr + x)); - T tmp; - switch(act) + // Compute left-over elements + for (; x < window_end_x; ++x) { - case ActivationLayerInfo::ActivationFunction::ABS: - tmp = std::abs(in); - break; - case ActivationLayerInfo::ActivationFunction::LINEAR: - tmp = a * in + b; - break; - case ActivationLayerInfo::ActivationFunction::LOGISTIC: - tmp = static_cast(1) / (static_cast(1) + std::exp(-in)); - break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp = std::max(static_cast(0), in); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - tmp = std::min(a, std::max(static_cast(0), in)); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - tmp = std::min(a, std::max(b, in)); - break; - case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: - tmp = (in > 0) ? in : a * in; - break; - case ActivationLayerInfo::ActivationFunction::SOFT_RELU: - tmp = (in > soft_relu_thresh) ? in : std::log(static_cast(1) + std::exp(in)); - break; - case ActivationLayerInfo::ActivationFunction::ELU: - tmp = (in >= 0) ? in : a * (std::exp(in) - 1); - break; - case ActivationLayerInfo::ActivationFunction::SQRT: - tmp = std::sqrt(in); - break; - case ActivationLayerInfo::ActivationFunction::SQUARE: - tmp = in * in; - break; - case ActivationLayerInfo::ActivationFunction::TANH: - tmp = a * std::tanh(b * in); - break; - case ActivationLayerInfo::ActivationFunction::IDENTITY: - tmp = in; - break; - case ActivationLayerInfo::ActivationFunction::HARD_SWISH: - tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f); - break; - case ActivationLayerInfo::ActivationFunction::SWISH: - tmp = in / (static_cast(1) + std::exp(-a * in)); - break; - case ActivationLayerInfo::ActivationFunction::GELU: - tmp = in * static_cast(0.5f * (1.0f + erff(static_cast(in) / 1.41421356237f))); - break; - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); + const T in = *(reinterpret_cast(input_ptr + x)); + T tmp; + switch (act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = std::abs(in); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = a * in + b; + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = static_cast(1) / (static_cast(1) + std::exp(-in)); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = std::max(static_cast(0), in); + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = std::min(a, std::max(static_cast(0), in)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = std::min(a, std::max(b, in)); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = (in > 0) ? in : a * in; + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = (in > soft_relu_thresh) ? in : std::log(static_cast(1) + std::exp(in)); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = (in >= 0) ? 
in : a * (std::exp(in) - 1); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: + tmp = std::sqrt(in); + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = in * in; + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = a * std::tanh(b * in); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = in; + break; + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f); + break; + case ActivationLayerInfo::ActivationFunction::SWISH: + tmp = in / (static_cast(1) + std::exp(-a * in)); + break; + case ActivationLayerInfo::ActivationFunction::GELU: + tmp = in * static_cast(0.5f * (1.0f + erff(static_cast(in) / 1.41421356237f))); + break; + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + *(output_ptr + x) = tmp; } - *(output_ptr + x) = tmp; - } - }, - input, output); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/neon/lut.cpp b/src/cpu/kernels/activation/generic/neon/lut.cpp index c973e964e4f93664c218a20acab190d2848f346a..f289c80d4be05b683831e2b941d3e05ef0960364 100644 --- a/src/cpu/kernels/activation/generic/neon/lut.cpp +++ b/src/cpu/kernels/activation/generic/neon/lut.cpp @@ -24,6 +24,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/cpu/kernels/lut/list.h" namespace arm_compute @@ -33,19 +34,22 @@ namespace cpu #ifdef __aarch64__ void neon_q8_activation_lut(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) { - ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::QASYMM8 && src->info()->data_type() != DataType::QASYMM8_SIGNED); + ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::QASYMM8 && + src->info()->data_type() != DataType::QASYMM8_SIGNED); const auto window_end_x = window.x().end(); Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator input(src, win_collapsed); Iterator output(dst, win_collapsed); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - auto output_ptr = reinterpret_cast(output.ptr()); - lut_u8_neon(act_info.lut().data(), 1u, window_end_x, &input_ptr, &output_ptr); - }, - input, output); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + auto output_ptr = reinterpret_cast(output.ptr()); + lut_u8_neon(act_info.lut().data(), 1u, window_end_x, &input_ptr, &output_ptr); + }, + input, output); } #endif // __aarch64__ } // namespace cpu diff --git a/src/cpu/kernels/activation/generic/neon/qasymm8.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp index e7c146e46f1be02a68bac3f038427a1125446d4e..1451301ea20ce4c893a7dd4db39e7d2a84c2d38e 100644 --- a/src/cpu/kernels/activation/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Window.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" @@ -38,7 +39,10 @@ namespace arm_compute { namespace cpu { -void neon_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const 
Window &window) +void neon_qasymm8_activation(const ITensor *src, + ITensor *dst, + const ActivationLayerInfo &act_info, + const Window &window) { constexpr int window_step_x = 16; const auto window_start_x = static_cast(window.x().start()); @@ -85,206 +89,222 @@ void neon_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationL float32x4_t vs = vdupq_n_f32(s); float32x4_t vo = vdupq_n_f32(o); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - wrapper::traits::neon_bitvector_t tmp; + wrapper::traits::neon_bitvector_t tmp; - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - // Perform activation - tmp = vmaxq_u8(vconst_0, vin); - // Re-quantize to new output space - tmp = vmlaq_qasymm8(tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - // Perform activation - tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin)); - // Re-quantize to new output space - tmp = vmlaq_qasymm8(tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - // Perform activation - tmp = vminq_u8(va, vmaxq_u8(vb, vin)); - // Re-quantize to new output space - tmp = vmlaq_qasymm8(tmp, vs, vo); - } + const auto vin = wrapper::vloadq(input_ptr + x); + if (act == ActivationLayerInfo::ActivationFunction::RELU) + { + // Perform activation + tmp = vmaxq_u8(vconst_0, vin); + // Re-quantize to new output space + tmp = vmlaq_qasymm8(tmp, vs, vo); + } + else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + // Perform activation + tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin)); + // Re-quantize to new output space + tmp = vmlaq_qasymm8(tmp, vs, vo); + } + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + // Perform activation + tmp = vminq_u8(va, vmaxq_u8(vb, vin)); + // Re-quantize to new output space + tmp = vmlaq_qasymm8(tmp, vs, vo); + } #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead. 
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = + else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) { - { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = {{ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))), wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))), - } - }; - // Re-quantize to new output space - tmp = vquantize(tmp_dep, qi_out); - } + }}; + // Re-quantize to new output space + tmp = vquantize(tmp_dep, qi_out); + } #endif // __aarch64__ - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = + else if (act == ActivationLayerInfo::ActivationFunction::TANH) { - { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = {{ wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))), wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))), - } - }; - // Re-quantize to new output space - tmp = vquantize(tmp_dep, qi_out); - } + }}; + // Re-quantize to new output space + tmp = vquantize(tmp_dep, qi_out); + } #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead. 
- else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = + else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) { - { - wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))), - wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))), - wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))), - wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))), - } - }; - // Re-quantize to new output space - tmp = vquantize(tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - const auto vin_deq = vdequantize(vin, qi_in); - - const uint32x4x4_t pos_mask = + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = {{ + wrapper::vmul( + vin_deq.val[0], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))), + wrapper::vmul( + vin_deq.val[1], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))), + wrapper::vmul( + vin_deq.val[2], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))), + wrapper::vmul( + vin_deq.val[3], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))), + }}; + // Re-quantize to new output space + tmp = vquantize(tmp_dep, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) { - { + const auto vin_deq = vdequantize(vin, qi_in); + + const uint32x4x4_t pos_mask = {{ wrapper::vcgt(vin_deq.val[0], vconst_0_f32), wrapper::vcgt(vin_deq.val[1], vconst_0_f32), wrapper::vcgt(vin_deq.val[2], vconst_0_f32), wrapper::vcgt(vin_deq.val[3], vconst_0_f32), - } - }; + }}; - const float32x4x4_t tmp_dep = - { - { + const float32x4x4_t tmp_dep = {{ wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])), wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])), wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])), wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])), - } - }; + }}; - tmp = vquantize(tmp_dep, qi_out); - } + tmp = vquantize(tmp_dep, qi_out); + } #else // #ifndef __aarch64__ - else if (act == ActivationLayerInfo::ActivationFunction::GELU) - { - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = + else if (act == ActivationLayerInfo::ActivationFunction::GELU) { - { - wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[0], const_inv_sqrt_2))))), - wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[1], const_inv_sqrt_2))))), - 
wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[2], const_inv_sqrt_2))))), - wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[3], const_inv_sqrt_2))))), - } - }; - // Re-quantize to new output space - tmp = vquantize(tmp_dep, qi_out); - } + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = {{ + wrapper::vmul(vin_deq.val[0], + wrapper::vmul(const_inv_2, + wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul( + vin_deq.val[0], const_inv_sqrt_2))))), + wrapper::vmul(vin_deq.val[1], + wrapper::vmul(const_inv_2, + wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul( + vin_deq.val[1], const_inv_sqrt_2))))), + wrapper::vmul(vin_deq.val[2], + wrapper::vmul(const_inv_2, + wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul( + vin_deq.val[2], const_inv_sqrt_2))))), + wrapper::vmul(vin_deq.val[3], + wrapper::vmul(const_inv_2, + wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul( + vin_deq.val[3], const_inv_sqrt_2))))), + }}; + // Re-quantize to new output space + tmp = vquantize(tmp_dep, qi_out); + } #endif // __aarch64__ - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + wrapper::vstore(output_ptr + x, tmp); } - wrapper::vstore(output_ptr + x, tmp); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - qasymm8_t in = *(reinterpret_cast(input_ptr + x)); - qasymm8_t tmp = 0; - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - tmp = std::max(const_0, in); - tmp = utility::clamp(support::cpp11::lround(tmp * s + o)); - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + // Compute left-over elements + for (; x < window_end_x; ++x) { - tmp = std::min(a, std::max(const_0, in)); - tmp = utility::clamp(support::cpp11::lround(tmp * s + o)); - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - tmp = std::min(a, std::max(b, in)); - tmp = utility::clamp(support::cpp11::lround(tmp * s + o)); - } + qasymm8_t in = *(reinterpret_cast(input_ptr + x)); + qasymm8_t tmp = 0; + if (act == ActivationLayerInfo::ActivationFunction::RELU) + { + tmp = std::max(const_0, in); + tmp = utility::clamp(support::cpp11::lround(tmp * s + o)); + } + else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + tmp = std::min(a, std::max(const_0, in)); + tmp = utility::clamp(support::cpp11::lround(tmp * s + o)); + } + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + tmp = std::min(a, std::max(b, in)); + tmp = utility::clamp(support::cpp11::lround(tmp * s + o)); + } #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead. 
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp_f = 1.f / (1.f + std::exp(-tmp_f)); - tmp = quantize_qasymm8(tmp_f, qi_out); - } + else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + float tmp_f = dequantize_qasymm8(in, qi_in); + tmp_f = 1.f / (1.f + std::exp(-tmp_f)); + tmp = quantize_qasymm8(tmp_f, qi_out); + } #endif // __aarch64__ - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); - tmp = quantize_qasymm8(tmp_f, qi_out); - } + else if (act == ActivationLayerInfo::ActivationFunction::TANH) + { + float tmp_f = dequantize_qasymm8(in, qi_in); + tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); + tmp = quantize_qasymm8(tmp_f, qi_out); + } #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead. - else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); - tmp = quantize_qasymm8(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32; - tmp = quantize_qasymm8(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::GELU) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp = tmp_f * 0.5f * (1.0f + std::erff(in / 1.41421356237f)); - tmp = quantize_qasymm8(tmp_f, qi_out); - } + else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) + { + float tmp_f = dequantize_qasymm8(in, qi_in); + tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); + tmp = quantize_qasymm8(tmp_f, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + float tmp_f = dequantize_qasymm8(in, qi_in); + tmp_f = tmp_f > 0 ? 
tmp_f : tmp_f * a_f32; + tmp = quantize_qasymm8(tmp_f, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::GELU) + { + float tmp_f = dequantize_qasymm8(in, qi_in); + tmp = tmp_f * 0.5f * (1.0f + std::erff(in / 1.41421356237f)); + tmp = quantize_qasymm8(tmp_f, qi_out); + } #endif // __aarch64__ - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + *(output_ptr + x) = tmp; } - *(output_ptr + x) = tmp; - } - }, - input, output); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp index 52c396459b116eae34a4f6747779a22b0121535a..a2f588245ac620a5afb14ab2d453e91a1d88cc5c 100644 --- a/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp @@ -24,6 +24,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Window.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" @@ -36,7 +37,10 @@ namespace arm_compute { namespace cpu { -void neon_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +void neon_qasymm8_signed_activation(const ITensor *src, + ITensor *dst, + const ActivationLayerInfo &act_info, + const Window &window) { constexpr int window_step_x = 16; const auto window_start_x = static_cast(window.x().start()); @@ -76,191 +80,195 @@ void neon_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const Acti float32x4_t vs = vdupq_n_f32(s); float32x4_t vo = vdupq_n_f32(o); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - wrapper::traits::neon_bitvector_t tmp; + wrapper::traits::neon_bitvector_t tmp; - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - // Perform activation - tmp = vmaxq_s8(vconst_0, vin); - // Re-quantize to new output space - tmp = vmlaq_qasymm8_signed(tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - // Perform activation - tmp = vminq_s8(va, vmaxq_s8(vconst_0, vin)); - // Re-quantize to new output space - tmp = vmlaq_qasymm8_signed(tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - // Perform activation - tmp = vminq_s8(va, vmaxq_s8(vb, vin)); - // Re-quantize to new output space - tmp = vmlaq_qasymm8_signed(tmp, vs, vo); - } + const auto vin = wrapper::vloadq(input_ptr + x); + if (act == ActivationLayerInfo::ActivationFunction::RELU) + { + // Perform activation + tmp = vmaxq_s8(vconst_0, vin); + // Re-quantize to new output space + tmp = vmlaq_qasymm8_signed(tmp, vs, vo); + } + else if (act == 
ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + // Perform activation + tmp = vminq_s8(va, vmaxq_s8(vconst_0, vin)); + // Re-quantize to new output space + tmp = vmlaq_qasymm8_signed(tmp, vs, vo); + } + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + // Perform activation + tmp = vminq_s8(va, vmaxq_s8(vb, vin)); + // Re-quantize to new output space + tmp = vmlaq_qasymm8_signed(tmp, vs, vo); + } #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead. - else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = + else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) { - { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = {{ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))), wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))), - } - }; - // Re-quantize to new output space - tmp = vquantize_signed(tmp_dep, qi_out); - } + }}; + // Re-quantize to new output space + tmp = vquantize_signed(tmp_dep, qi_out); + } #endif // __aarch64__ - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = + else if (act == ActivationLayerInfo::ActivationFunction::TANH) { - { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = {{ wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))), wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))), - } - }; - // Re-quantize to new output space - tmp = vquantize_signed(tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = + }}; + // Re-quantize to new output space + tmp = vquantize_signed(tmp_dep, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) { - { - wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))), - wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))), - wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))), - wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))), - } - }; - // Re-quantize to new output space - tmp = vquantize_signed(tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - const auto vin_deq = vdequantize(vin, qi_in); + // De-quantize + 
const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = {{ + wrapper::vmul( + vin_deq.val[0], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))), + wrapper::vmul( + vin_deq.val[1], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))), + wrapper::vmul( + vin_deq.val[2], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))), + wrapper::vmul( + vin_deq.val[3], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))), + }}; + // Re-quantize to new output space + tmp = vquantize_signed(tmp_dep, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + const auto vin_deq = vdequantize(vin, qi_in); #ifdef __aarch64__ - const uint32x4x4_t pos_mask = - { - { + const uint32x4x4_t pos_mask = {{ wrapper::vcgtz(vin_deq.val[0]), wrapper::vcgtz(vin_deq.val[1]), wrapper::vcgtz(vin_deq.val[2]), wrapper::vcgtz(vin_deq.val[3]), - } - }; + }}; #else // __aarch64__ - const uint32x4x4_t pos_mask = - { - { + const uint32x4x4_t pos_mask = {{ wrapper::vcgt(vin_deq.val[0], vconst_0_f32), wrapper::vcgt(vin_deq.val[1], vconst_0_f32), wrapper::vcgt(vin_deq.val[2], vconst_0_f32), wrapper::vcgt(vin_deq.val[3], vconst_0_f32), - } - }; + }}; #endif // __aarch64__ - const float32x4x4_t tmp_dep = - { - { + const float32x4x4_t tmp_dep = {{ wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])), wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])), wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])), wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])), - } - }; + }}; - tmp = vquantize_signed(tmp_dep, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); + tmp = vquantize_signed(tmp_dep, qi_out); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + wrapper::vstore(output_ptr + x, tmp); } - wrapper::vstore(output_ptr + x, tmp); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - qasymm8_signed_t in = *(reinterpret_cast(input_ptr + x)); - qasymm8_signed_t tmp = 0; - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - tmp = std::max(const_0, in); - tmp = utility::clamp(support::cpp11::lround(tmp * s + o)); - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - tmp = std::min(a, std::max(const_0, in)); - tmp = utility::clamp(support::cpp11::lround(tmp * s + o)); - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + // Compute left-over elements + for (; x < window_end_x; ++x) { - tmp = std::min(a, std::max(b, in)); - tmp = utility::clamp(support::cpp11::lround(tmp * s + o)); - } + qasymm8_signed_t in = *(reinterpret_cast(input_ptr + x)); + qasymm8_signed_t tmp = 0; + if (act == ActivationLayerInfo::ActivationFunction::RELU) + { + tmp = std::max(const_0, in); + tmp = utility::clamp(support::cpp11::lround(tmp * s + o)); + } + else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + tmp = std::min(a, std::max(const_0, in)); + tmp = utility::clamp(support::cpp11::lround(tmp * s + o)); + } + else if (act == 
ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + tmp = std::min(a, std::max(b, in)); + tmp = utility::clamp(support::cpp11::lround(tmp * s + o)); + } #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead. - else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - float tmp_f = dequantize_qasymm8_signed(in, qi_in); - tmp_f = 1.f / (1.f + std::exp(-tmp_f)); - tmp = quantize_qasymm8_signed(tmp_f, qi_out); - } + else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + float tmp_f = dequantize_qasymm8_signed(in, qi_in); + tmp_f = 1.f / (1.f + std::exp(-tmp_f)); + tmp = quantize_qasymm8_signed(tmp_f, qi_out); + } #endif // __aarch64__ - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - float tmp_f = dequantize_qasymm8_signed(in, qi_in); - tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); - tmp = quantize_qasymm8_signed(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - float tmp_f = dequantize_qasymm8_signed(in, qi_in); - tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); - tmp = quantize_qasymm8_signed(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - float tmp_f = dequantize_qasymm8_signed(in, qi_in); - tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32; - tmp = quantize_qasymm8_signed(tmp_f, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); + else if (act == ActivationLayerInfo::ActivationFunction::TANH) + { + float tmp_f = dequantize_qasymm8_signed(in, qi_in); + tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); + tmp = quantize_qasymm8_signed(tmp_f, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) + { + float tmp_f = dequantize_qasymm8_signed(in, qi_in); + tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); + tmp = quantize_qasymm8_signed(tmp_f, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + float tmp_f = dequantize_qasymm8_signed(in, qi_in); + tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32; + tmp = quantize_qasymm8_signed(tmp_f, qi_out); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + *(output_ptr + x) = tmp; } - *(output_ptr + x) = tmp; - } - }, - input, output); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/neon/qsymm16.cpp b/src/cpu/kernels/activation/generic/neon/qsymm16.cpp index 2aea6cba3c27e9a268e5259ccb6545f70be7f329..891646ea00e4f59370c6103417e537176b05614c 100644 --- a/src/cpu/kernels/activation/generic/neon/qsymm16.cpp +++ b/src/cpu/kernels/activation/generic/neon/qsymm16.cpp @@ -21,11 +21,12 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ +#include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/experimental/Types.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/NEON/NEMath.h" #include "src/core/NEON/NESymm.h" #include "src/core/NEON/wrapper/wrapper.h" @@ -38,7 +39,10 @@ namespace arm_compute { namespace cpu { -void neon_qsymm16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +void neon_qsymm16_activation(const ITensor *src, + ITensor *dst, + const ActivationLayerInfo &act_info, + const Window &window) { constexpr int window_step_x = 8; const auto window_start_x = static_cast(window.x().start()); @@ -59,103 +63,94 @@ void neon_qsymm16_activation(const ITensor *src, ITensor *dst, const ActivationL const float a_f32 = act_info.a(); const float b_f32 = act_info.b(); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - wrapper::traits::neon_bitvector_t tmp; - ARM_COMPUTE_UNUSED(tmp); + wrapper::traits::neon_bitvector_t tmp; + ARM_COMPUTE_UNUSED(tmp); - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - // De-quantize - const auto vin_deq = vdequantize_int16(vin, qi_in.scale); - // Perform activation - const float32x4x2_t tmp_dep = + const auto vin = wrapper::vloadq(input_ptr + x); + if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) { - { + // De-quantize + const auto vin_deq = vdequantize_int16(vin, qi_in.scale); + // Perform activation + const float32x4x2_t tmp_dep = {{ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), - } - }; - // Re-quantize to new output space - tmp = vquantize_int16(tmp_dep, qi_out.scale); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = vdequantize_int16(vin, qi_in.scale); - // Perform activation - const float32x4x2_t tmp_dep = + }}; + // Re-quantize to new output space + tmp = vquantize_int16(tmp_dep, qi_out.scale); + } + else if (act == ActivationLayerInfo::ActivationFunction::TANH) { - { + // De-quantize + const auto vin_deq = vdequantize_int16(vin, qi_in.scale); + // Perform activation + const float32x4x2_t tmp_dep = {{ wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), - } - }; - // Re-quantize to new output space - tmp = vquantize_int16(tmp_dep, qi_out.scale); - } + }}; + // Re-quantize to new output space + tmp = vquantize_int16(tmp_dep, qi_out.scale); + } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - // De-quantize - const auto vin_deq = vdequantize_int16(vin, qi_in.scale); - // Perform 
activation - const float32x4x2_t tmp_dep = + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) { - { - wrapper::vmin(va_f32, wrapper::vmax(vb_f32, vin_deq.val[0])), - wrapper::vmin(va_f32, wrapper::vmax(vb_f32, vin_deq.val[1])) - } - }; - // Re-quantize to new output space - tmp = vquantize_int16(tmp_dep, qi_out.scale); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); + // De-quantize + const auto vin_deq = vdequantize_int16(vin, qi_in.scale); + // Perform activation + const float32x4x2_t tmp_dep = {{wrapper::vmin(va_f32, wrapper::vmax(vb_f32, vin_deq.val[0])), + wrapper::vmin(va_f32, wrapper::vmax(vb_f32, vin_deq.val[1]))}}; + // Re-quantize to new output space + tmp = vquantize_int16(tmp_dep, qi_out.scale); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + wrapper::vstore(output_ptr + x, tmp); } - wrapper::vstore(output_ptr + x, tmp); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - qsymm16_t in = *(reinterpret_cast(input_ptr + x)); - qsymm16_t tmp = 0; - if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - float tmp_f = dequantize_qsymm16(in, qi_in.scale); - tmp_f = 1.f / (1.f + std::exp(-tmp_f)); - tmp = quantize_qsymm16(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) + // Compute left-over elements + for (; x < window_end_x; ++x) { - float tmp_f = dequantize_qsymm16(in, qi_in.scale); - tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); - tmp = quantize_qsymm16(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - float tmp_f = dequantize_qsymm16(in, qi_in.scale); - tmp_f = std::min(a_f32, std::max(b_f32, tmp_f)); - tmp = quantize_qsymm16(tmp_f, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); + qsymm16_t in = *(reinterpret_cast(input_ptr + x)); + qsymm16_t tmp = 0; + if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + float tmp_f = dequantize_qsymm16(in, qi_in.scale); + tmp_f = 1.f / (1.f + std::exp(-tmp_f)); + tmp = quantize_qsymm16(tmp_f, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::TANH) + { + float tmp_f = dequantize_qsymm16(in, qi_in.scale); + tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); + tmp = quantize_qsymm16(tmp_f, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + float tmp_f = dequantize_qsymm16(in, qi_in.scale); + tmp_f = std::min(a_f32, std::max(b_f32, tmp_f)); + tmp = quantize_qsymm16(tmp_f, qi_out); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + *(output_ptr + x) = tmp; } - *(output_ptr + x) = tmp; - } - }, - input, output); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/sve/fp16.cpp b/src/cpu/kernels/activation/generic/sve/fp16.cpp index 4757c60d8f80cd62bec73620a9e5462e1c1317f9..97399e01e00035b72ef21a1fe0f1b4cf8108c44a 100644 --- a/src/cpu/kernels/activation/generic/sve/fp16.cpp +++ b/src/cpu/kernels/activation/generic/sve/fp16.cpp @@ -29,11 +29,11 @@ #include "arm_compute/core/Window.h" #include "arm_compute/function_info/ActivationLayerInfo.h" -#include -#include - #include "src/core/NEON/SVEMath.h" + #include +#include +#include namespace arm_compute { @@ -59,77 +59,87 @@ void sve_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLayer const auto va = svdup_n_f16(act_info.a()); const auto vb = svdup_n_f16(act_info.b()); - execute_window_loop(win_collapsed, 
[&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - svfloat16_t tmp; + svfloat16_t tmp; - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - do - { - const auto vin = svld1_f16(pg, input_ptr + x); - switch(act) + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do { - case ActivationLayerInfo::ActivationFunction::ABS: - tmp = svabs_f16_z(pg, vin); - break; - case ActivationLayerInfo::ActivationFunction::LINEAR: - tmp = svmla_f16_z(pg, vb, va, vin); - break; - case ActivationLayerInfo::ActivationFunction::LOGISTIC: - tmp = svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, vin)))); - break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp = svmax_f16_z(pg, const_0, vin); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: - tmp = svadd_f16_z(pg, svmul_f16_z(pg, svmin_f16_z(pg, vin, const_0), va), svmax_f16_z(pg, vin, const_0)); - break; - case ActivationLayerInfo::ActivationFunction::SOFT_RELU: - tmp = svlog_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, vin))); - break; - case ActivationLayerInfo::ActivationFunction::ELU: - tmp = svsel_f16(svcmpgt_f16(pg, vin, const_0), vin, svmul_f16_z(pg, va, svsub_f16_z(pg, svexp_f16_z(pg, vin), const_1))); - break; - case ActivationLayerInfo::ActivationFunction::SQRT: - tmp = svsqrt_f16_z(pg, vin); - break; - case ActivationLayerInfo::ActivationFunction::SQUARE: - tmp = svmul_f16_z(pg, vin, vin); - break; - case ActivationLayerInfo::ActivationFunction::TANH: - tmp = svmul_f16_z(pg, va, svtanh_f16_z(pg, svmul_f16_z(pg, vb, vin))); - break; - case ActivationLayerInfo::ActivationFunction::IDENTITY: - tmp = vin; - break; - case ActivationLayerInfo::ActivationFunction::HARD_SWISH: - tmp = svmul_f16_z(pg, vin, svmul_f16_z(pg, const_inv_6, svmin_f16_z(pg, const_6, svmax_f16_z(pg, const_0, svadd_f16_z(pg, vin, const_3))))); - break; - case ActivationLayerInfo::ActivationFunction::SWISH: - tmp = svmul_f16_z(pg, vin, svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, svmul_f16_z(pg, va, vin)))))); - break; - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - svst1_f16(pg, output_ptr + x, tmp); + const auto vin = svld1_f16(pg, input_ptr + x); + switch (act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = svabs_f16_z(pg, vin); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = svmla_f16_z(pg, vb, va, vin); + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, vin)))); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = svmax_f16_z(pg, const_0, vin); + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, vin)); + break; + case 
ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = svadd_f16_z(pg, svmul_f16_z(pg, svmin_f16_z(pg, vin, const_0), va), + svmax_f16_z(pg, vin, const_0)); + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = svlog_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, vin))); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = svsel_f16(svcmpgt_f16(pg, vin, const_0), vin, + svmul_f16_z(pg, va, svsub_f16_z(pg, svexp_f16_z(pg, vin), const_1))); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: + tmp = svsqrt_f16_z(pg, vin); + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = svmul_f16_z(pg, vin, vin); + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = svmul_f16_z(pg, va, svtanh_f16_z(pg, svmul_f16_z(pg, vb, vin))); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = vin; + break; + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = svmul_f16_z( + pg, vin, + svmul_f16_z( + pg, const_inv_6, + svmin_f16_z(pg, const_6, svmax_f16_z(pg, const_0, svadd_f16_z(pg, vin, const_3))))); + break; + case ActivationLayerInfo::ActivationFunction::SWISH: + tmp = svmul_f16_z( + pg, vin, + svinv_f16_z(pg, svadd_f16_z(pg, const_1, + svexp_f16_z(pg, svneg_f16_z(pg, svmul_f16_z(pg, va, vin)))))); + break; + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + svst1_f16(pg, output_ptr + x, tmp); - x += svcnth(); - pg = svwhilelt_b16(x, window_end_x); + x += svcnth(); + pg = svwhilelt_b16(x, window_end_x); - } - while(svptest_any(svptrue_b16(), pg)); - }, - input, output); + } while (svptest_any(svptrue_b16(), pg)); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/sve/fp32.cpp b/src/cpu/kernels/activation/generic/sve/fp32.cpp index 87f04c255ac8205812d8b20ddcd2799c1dbe2e6c..d1b075d52cb2498872782329a8f825f572c881c8 100644 --- a/src/cpu/kernels/activation/generic/sve/fp32.cpp +++ b/src/cpu/kernels/activation/generic/sve/fp32.cpp @@ -26,13 +26,13 @@ #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/NEON/SVEMath.h" +#include #include #include -#include - namespace arm_compute { namespace cpu @@ -58,78 +58,89 @@ void sve_fp32_activation(const ITensor *src, ITensor *dst, const ActivationLayer const auto va = svdup_n_f32(act_info.a()); const auto vb = svdup_n_f32(act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - svfloat32_t tmp; + svfloat32_t tmp; - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b32(x, window_end_x); - do - { - const auto vin = svld1_f32(pg, input_ptr + x); - switch(act) + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b32(x, window_end_x); + do { - case ActivationLayerInfo::ActivationFunction::ABS: - tmp = svabs_f32_z(pg, vin); - break; - case ActivationLayerInfo::ActivationFunction::LINEAR: - tmp = svmla_f32_z(pg, vb, va, vin); - break; - case ActivationLayerInfo::ActivationFunction::LOGISTIC: - tmp = svinv_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, 
svneg_f32_z(pg, vin)))); - break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp = svmax_f32_z(pg, const_0, vin); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: - tmp = svadd_f32_z(pg, svmul_f32_z(pg, svmin_f32_z(pg, vin, const_0), va), svmax_f32_z(pg, vin, const_0)); - break; - case ActivationLayerInfo::ActivationFunction::SOFT_RELU: - tmp = svsel_f32(svcmpgt_f32(pg, vin, soft_relu_thresh), vin, svlog_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, vin)))); - break; - case ActivationLayerInfo::ActivationFunction::ELU: - tmp = svsel_f32(svcmpgt_f32(pg, vin, const_0), vin, svmul_f32_z(pg, va, svsub_f32_z(pg, svexp_f32_z(pg, vin), const_1))); - break; - case ActivationLayerInfo::ActivationFunction::SQRT: - tmp = svsqrt_f32_z(pg, vin); - break; - case ActivationLayerInfo::ActivationFunction::SQUARE: - tmp = svmul_f32_z(pg, vin, vin); - break; - case ActivationLayerInfo::ActivationFunction::TANH: - tmp = svmul_f32_z(pg, va, svtanh_f32_z(pg, svmul_f32_z(pg, vb, vin))); - break; - case ActivationLayerInfo::ActivationFunction::IDENTITY: - tmp = vin; - break; - case ActivationLayerInfo::ActivationFunction::HARD_SWISH: - tmp = svmul_f32_z(pg, vin, svmul_f32_z(pg, const_inv_6, svmin_f32_z(pg, const_6, svmax_f32_z(pg, const_0, svadd_f32_z(pg, vin, const_3))))); - break; - case ActivationLayerInfo::ActivationFunction::SWISH: - tmp = svmul_f32_z(pg, vin, svinv_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, svneg_f32_z(pg, svmul_f32_z(pg, va, vin)))))); - break; - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - svst1_f32(pg, output_ptr + x, tmp); + const auto vin = svld1_f32(pg, input_ptr + x); + switch (act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = svabs_f32_z(pg, vin); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = svmla_f32_z(pg, vb, va, vin); + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = svinv_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, svneg_f32_z(pg, vin)))); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = svmax_f32_z(pg, const_0, vin); + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = svadd_f32_z(pg, svmul_f32_z(pg, svmin_f32_z(pg, vin, const_0), va), + svmax_f32_z(pg, vin, const_0)); + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = svsel_f32(svcmpgt_f32(pg, vin, soft_relu_thresh), vin, + svlog_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, vin)))); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = svsel_f32(svcmpgt_f32(pg, vin, const_0), vin, + svmul_f32_z(pg, va, svsub_f32_z(pg, svexp_f32_z(pg, vin), const_1))); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: + tmp = svsqrt_f32_z(pg, vin); + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = svmul_f32_z(pg, vin, vin); + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = svmul_f32_z(pg, va, svtanh_f32_z(pg, svmul_f32_z(pg, vb, vin))); + 
break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = vin; + break; + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = svmul_f32_z( + pg, vin, + svmul_f32_z( + pg, const_inv_6, + svmin_f32_z(pg, const_6, svmax_f32_z(pg, const_0, svadd_f32_z(pg, vin, const_3))))); + break; + case ActivationLayerInfo::ActivationFunction::SWISH: + tmp = svmul_f32_z( + pg, vin, + svinv_f32_z(pg, svadd_f32_z(pg, const_1, + svexp_f32_z(pg, svneg_f32_z(pg, svmul_f32_z(pg, va, vin)))))); + break; + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + svst1_f32(pg, output_ptr + x, tmp); - x += svcntw(); - pg = svwhilelt_b32(x, window_end_x); + x += svcntw(); + pg = svwhilelt_b32(x, window_end_x); - } - while(svptest_any(svptrue_b32(), pg)); - }, - input, output); + } while (svptest_any(svptrue_b32(), pg)); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/sve2/lut.cpp b/src/cpu/kernels/activation/generic/sve2/lut.cpp index d65de8d64988798e720bbfbdb00d06da62677f20..2ed667debf9df1a968907256d49555703cb7317a 100644 --- a/src/cpu/kernels/activation/generic/sve2/lut.cpp +++ b/src/cpu/kernels/activation/generic/sve2/lut.cpp @@ -24,6 +24,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/cpu/kernels/lut/list.h" namespace arm_compute @@ -33,19 +34,22 @@ namespace cpu #ifdef __aarch64__ void sve2_q8_activation_lut(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) { - ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::QASYMM8 && src->info()->data_type() != DataType::QASYMM8_SIGNED); + ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::QASYMM8 && + src->info()->data_type() != DataType::QASYMM8_SIGNED); const auto window_end_x = window.x().end(); Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator input(src, win_collapsed); Iterator output(dst, win_collapsed); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = input.ptr(); - auto output_ptr = output.ptr(); - lut_u8_sve2(act_info.lut().data(), 1u, window_end_x, &input_ptr, &output_ptr); - }, - input, output); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = input.ptr(); + auto output_ptr = output.ptr(); + lut_u8_sve2(act_info.lut().data(), 1u, window_end_x, &input_ptr, &output_ptr); + }, + input, output); } #endif // __aarch64__ } // namespace cpu diff --git a/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp b/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp index bc9bc7aa3cb352fe3d884cca81ade63fc423c32c..7efa9e4b72700080a9f3f4e6a9f5639c13b4318a 100644 --- a/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp +++ b/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp @@ -26,18 +26,21 @@ #include "arm_compute/core/Window.h" #include "arm_compute/function_info/ActivationLayerInfo.h" -#include -#include - #include "src/core/NEON/SVEAsymm.h" #include "src/core/NEON/SVEMath.h" + #include +#include +#include namespace arm_compute { namespace cpu { -void sve2_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +void sve2_qasymm8_activation(const ITensor *src, + ITensor *dst, + const ActivationLayerInfo &act_info, + const Window &window) { const auto window_start_x = static_cast(window.x().start()); 
const auto window_end_x = static_cast(window.x().end()); @@ -61,7 +64,7 @@ void sve2_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationL // Initialise scale/offset for re-quantization bool requant = true; - if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset) + if (qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset) { requant = false; } @@ -78,139 +81,160 @@ void sve2_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationL const auto vo_s32 = svdup_n_s32(o_s32); // Initialise scale/offset for re-quantization for leaky relu - int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8), - arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8), + arm_compute::RoundingPolicy::TO_NEAREST_EVEN); const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32); const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - svuint8_t tmp; + svuint8_t tmp; - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do - { - const auto vin = svld1_u8(pg, input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - // Perform activation - tmp = svmax_u8_z(pg, vconst_0, vin); - // Re-quantize to new output space - tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp; - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - // Perform activation - tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vconst_0, vin)); - // Re-quantize to new output space - tmp = requant ? 
svmla_qasymm8_z(pg, tmp, vs, vo) : tmp; - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - // Perform activation - tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vb, vin)); - // Re-quantize to new output space - tmp = svmla_qasymm8_z(pg, tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = svcreate4_f32(svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3)))))); - - // Re-quantize to new output space - tmp = svquantize_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = svcreate4_f32(svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32)))); - - // Re-quantize to new output space - tmp = svquantize_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - svbool_t p0, p1, p2, p3; - svint32x4_t tmp_dep; - - // Expand to int32 - const svint32x4_t vin_s32 = svcreate4_s32( - svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(vin))), - svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(vin))), - svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(vin))), - svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(vin)))); - - // Compare elements to input offset - if(qi_in.scale >= 0) + const auto vin = svld1_u8(pg, input_ptr + x); + if (act == ActivationLayerInfo::ActivationFunction::RELU) { - p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); - p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); - p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); - p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + // Perform activation + tmp = svmax_u8_z(pg, vconst_0, vin); + // Re-quantize to new output space + tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp; } - else + else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) { - p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); - p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); - p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); - p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + // Perform activation + tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vconst_0, vin)); + // Re-quantize to new output space + tmp = requant ? 
svmla_qasymm8_z(pg, tmp, vs, vo) : tmp; } - - // Multiply negative elements and requantize if necessary - if(requant) + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + // Perform activation + tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vb, vin)); + // Re-quantize to new output space + tmp = svmla_qasymm8_z(pg, tmp, vs, vo); + } + else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = svcreate4_f32( + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))), + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))), + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))), + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3)))))); + + // Re-quantize to new output space + tmp = svquantize_z(pg, tmp_dep, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::TANH) { - tmp_dep = svcreate4_s32( - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8)); + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = svcreate4_f32( + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32)))); + + // Re-quantize to new output space + tmp = svquantize_z(pg, tmp_dep, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + svbool_t p0, p1, p2, p3; + svint32x4_t tmp_dep; + + // Expand to int32 + const svint32x4_t vin_s32 = svcreate4_s32(svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(vin))), + svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(vin))), + svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(vin))), + svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(vin)))); + + // Compare elements to input offset + if (qi_in.scale >= 0) + { + p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); + p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); + p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); + p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + } + else + { + p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); + p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); + p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); + p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + } + + // Multiply negative elements and requantize if necessary + if (requant) + { + tmp_dep = svcreate4_s32( + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), + 
svsel(p0, vs_leaky_s32, vs_s32)), + 8), + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), + svsel(p1, vs_leaky_s32, vs_s32)), + 8), + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), + svsel(p2, vs_leaky_s32, vs_s32)), + 8), + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), + svsel(p3, vs_leaky_s32, vs_s32)), + 8)); + } + else + { + tmp_dep = svcreate4_s32( + svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8)); + } + + // Convert uint32 vectors to uint16 vectors (with saturation) + const auto v_low_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1)); + const auto v_high_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3)); + + // convert uint16 vectors to uint8 vectors (with saturation) + tmp = svqxtnt_u16(svqxtnb_u16(v_low_u16), v_high_u16); } else { - tmp_dep = svcreate4_s32( - svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8)); + ARM_COMPUTE_ERROR("Unsupported activation function"); } - // Convert uint32 vectors to uint16 vectors (with saturation) - const auto v_low_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1)); - const auto v_high_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3)); - - // convert uint16 vectors to uint8 vectors (with saturation) - tmp = svqxtnt_u16(svqxtnb_u16(v_low_u16), v_high_u16); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - - svst1_u8(pg, output_ptr + x, tmp); - - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); + svst1_u8(pg, output_ptr + x, tmp); - } - while(svptest_any(svptrue_b8(), pg)); + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); - }, - input, output); + } while (svptest_any(svptrue_b8(), pg)); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp index d20684f54dc0662d827393866a9a242893f8c253..e4667522ddb70177f38251070dce30ef6a1730dd 100644 --- a/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp +++ b/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp @@ -24,20 +24,23 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Window.h" #include "arm_compute/function_info/ActivationLayerInfo.h" -#include "src/core/NEON/wrapper/wrapper.h" - -#include -#include #include "src/core/NEON/SVEAsymm.h" #include "src/core/NEON/SVEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" + #include +#include +#include namespace arm_compute { namespace cpu { -void sve2_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +void sve2_qasymm8_signed_activation(const ITensor *src, + ITensor *dst, + const 
ActivationLayerInfo &act_info, + const Window &window) { const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); @@ -65,7 +68,7 @@ void sve2_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const Acti // Initialise scale/offset for re-quantization bool requant = true; - if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset) + if (qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset) { requant = false; } @@ -82,151 +85,190 @@ void sve2_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const Acti const auto vo_s32 = svdup_n_s32(o_s32); // Initialise scale/offset for re-quantization for leaky relu - int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8), - arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8), + arm_compute::RoundingPolicy::TO_NEAREST_EVEN); const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32); const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - svint8_t tmp; + svint8_t tmp; - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do - { - const auto vin = svld1_s8(pg, input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - // Perform activation - tmp = svmax_s8_z(pg, vconst_0, vin); - // Re-quantize to new output space - tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - // Perform activation - tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vconst_0, vin)); - // Re-quantize to new output space - tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - // Perform activation - tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vb, vin)); - // Re-quantize to new output space - tmp = requant ? 
svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; - } - else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = svcreate4_f32( - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3)))))); - // Re-quantize to new output space - tmp = svquantize_signed_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = svcreate4_f32( - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32)))); - // Re-quantize to new output space - tmp = svquantize_signed_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = svcreate4_f32( - svmul_f32_z(pg, svget4_f32(vin_deq, 0), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 0), const_3_f32))))), - svmul_f32_z(pg, svget4_f32(vin_deq, 1), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 1), const_3_f32))))), - svmul_f32_z(pg, svget4_f32(vin_deq, 2), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 2), const_3_f32))))), - svmul_f32_z(pg, svget4_f32(vin_deq, 3), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 3), const_3_f32)))))); - // Re-quantize to new output space - tmp = svquantize_signed_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do { - svbool_t p0, p1, p2, p3; - svint32x4_t tmp_dep; - - // Expand to int32 - const svint32x4_t vin_s32 = svcreate4_s32( - svmovlb_s32(svmovlb_s16(vin)), - svmovlt_s32(svmovlb_s16(vin)), - svmovlb_s32(svmovlt_s16(vin)), - svmovlt_s32(svmovlt_s16(vin))); - - // Compare elements to input offset - if(qi_in.scale >= 0) + const auto vin = svld1_s8(pg, input_ptr + x); + if (act == ActivationLayerInfo::ActivationFunction::RELU) { - p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); - p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); - p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); - p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + // Perform activation + tmp = svmax_s8_z(pg, vconst_0, vin); + // Re-quantize to new output space + tmp = requant ? 
svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; } - else + else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) { - p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); - p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); - p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); - p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + // Perform activation + tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vconst_0, vin)); + // Re-quantize to new output space + tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; } - - // Multiply negative elements and requantize if necessary - if(requant) + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) { - tmp_dep = svcreate4_s32( - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8)); + // Perform activation + tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vb, vin)); + // Re-quantize to new output space + tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; } - else + else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = svcreate4_f32( + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))), + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))), + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))), + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3)))))); + // Re-quantize to new output space + tmp = svquantize_signed_z(pg, tmp_dep, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::TANH) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = svcreate4_f32( + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32)))); + // Re-quantize to new output space + tmp = svquantize_signed_z(pg, tmp_dep, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) { - tmp_dep = svcreate4_s32( - svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8)); + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = svcreate4_f32( + svmul_f32_z(pg, 
svget4_f32(vin_deq, 0), + svmul_f32_z(pg, const_inv_6_f32, + svmin_f32_z(pg, const_6_f32, + svmax_f32_z(pg, const_0_f32, + svadd_f32_z(pg, svget4_f32(vin_deq, 0), + const_3_f32))))), + svmul_f32_z(pg, svget4_f32(vin_deq, 1), + svmul_f32_z(pg, const_inv_6_f32, + svmin_f32_z(pg, const_6_f32, + svmax_f32_z(pg, const_0_f32, + svadd_f32_z(pg, svget4_f32(vin_deq, 1), + const_3_f32))))), + svmul_f32_z(pg, svget4_f32(vin_deq, 2), + svmul_f32_z(pg, const_inv_6_f32, + svmin_f32_z(pg, const_6_f32, + svmax_f32_z(pg, const_0_f32, + svadd_f32_z(pg, svget4_f32(vin_deq, 2), + const_3_f32))))), + svmul_f32_z(pg, svget4_f32(vin_deq, 3), + svmul_f32_z(pg, const_inv_6_f32, + svmin_f32_z(pg, const_6_f32, + svmax_f32_z(pg, const_0_f32, + svadd_f32_z(pg, svget4_f32(vin_deq, 3), + const_3_f32)))))); + // Re-quantize to new output space + tmp = svquantize_signed_z(pg, tmp_dep, qi_out); } + else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + svbool_t p0, p1, p2, p3; + svint32x4_t tmp_dep; - // Convert uint32 vectors to uint16 vectors (with saturation) - const auto v_low_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1)); - const auto v_high_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3)); + // Expand to int32 + const svint32x4_t vin_s32 = + svcreate4_s32(svmovlb_s32(svmovlb_s16(vin)), svmovlt_s32(svmovlb_s16(vin)), + svmovlb_s32(svmovlt_s16(vin)), svmovlt_s32(svmovlt_s16(vin))); - // convert uint16 vectors to uint8 vectors (with saturation) - tmp = svqxtnt_s16(svqxtnb_s16(v_low_s16), v_high_s16); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } + // Compare elements to input offset + if (qi_in.scale >= 0) + { + p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); + p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); + p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); + p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + } + else + { + p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); + p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); + p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); + p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + } + + // Multiply negative elements and requantize if necessary + if (requant) + { + tmp_dep = svcreate4_s32( + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), + svsel(p0, vs_leaky_s32, vs_s32)), + 8), + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), + svsel(p1, vs_leaky_s32, vs_s32)), + 8), + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), + svsel(p2, vs_leaky_s32, vs_s32)), + 8), + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), + svsel(p3, vs_leaky_s32, vs_s32)), + 8)); + } + else + { + tmp_dep = svcreate4_s32( + svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8)); + } + + // Convert uint32 vectors to uint16 vectors (with saturation) + const auto v_low_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1)); + const auto v_high_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 
3)); + + // convert uint16 vectors to uint8 vectors (with saturation) + tmp = svqxtnt_s16(svqxtnb_s16(v_low_s16), v_high_s16); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } - svst1_s8(pg, output_ptr + x, tmp); + svst1_s8(pg, output_ptr + x, tmp); - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(svptrue_b8(), pg)); - }, - input, output); + } while (svptest_any(svptrue_b8(), pg)); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp b/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp index 5154fac8a7ef2d03e9a91aa8651322602b8932ea..f9558933071bc27161d38689d6fc93b316cb1527 100644 --- a/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp +++ b/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp @@ -21,24 +21,27 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/experimental/Types.h" #include "arm_compute/function_info/ActivationLayerInfo.h" -#include -#include - #include "src/core/NEON/SVEMath.h" #include "src/core/NEON/SVESymm.h" + #include +#include +#include namespace arm_compute { namespace cpu { -void sve2_qsymm16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +void sve2_qsymm16_activation(const ITensor *src, + ITensor *dst, + const ActivationLayerInfo &act_info, + const Window &window) { const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); @@ -56,62 +59,70 @@ void sve2_qsymm16_activation(const ITensor *src, ITensor *dst, const ActivationL const auto va_f32 = svdup_n_f32(act_info.a()); const auto vb_f32 = svdup_n_f32(act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - svint16_t tmp; + svint16_t tmp; - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - do - { - const auto vin = svld1_s16(pg, input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); - // Perform activation - const svfloat32x2_t tmp_dep = svcreate2_f32(svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 0))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 1)))))); - // Re-quantize to new output space - tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); - // Perform activation - const svfloat32x2_t tmp_dep = svcreate2_f32(svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 0), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 1), vb_f32)))); - // Re-quantize to new output space - tmp = 
svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - // De-quantize - auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); - // Perform activation - const svfloat32x2_t tmp_dep = svcreate2_f32(svmin_f32_z(pg, va_f32, svmax_f32_z(pg, vb_f32, svget2_f32(vin_deq, 0))), - svmin_f32_z(pg, va_f32, svmax_f32_z(pg, vb_f32, svget2_f32(vin_deq, 1)))); - // Re-quantize to new output space - tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); - } - else + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } + const auto vin = svld1_s16(pg, input_ptr + x); + if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + // De-quantize + auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); + // Perform activation + const svfloat32x2_t tmp_dep = svcreate2_f32( + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 0))))), + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 1)))))); + // Re-quantize to new output space + tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); + } + else if (act == ActivationLayerInfo::ActivationFunction::TANH) + { + // De-quantize + auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); + // Perform activation + const svfloat32x2_t tmp_dep = svcreate2_f32( + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 0), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 1), vb_f32)))); + // Re-quantize to new output space + tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); + } + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + // De-quantize + auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); + // Perform activation + const svfloat32x2_t tmp_dep = + svcreate2_f32(svmin_f32_z(pg, va_f32, svmax_f32_z(pg, vb_f32, svget2_f32(vin_deq, 0))), + svmin_f32_z(pg, va_f32, svmax_f32_z(pg, vb_f32, svget2_f32(vin_deq, 1)))); + // Re-quantize to new output space + tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } - svst1_s16(pg, output_ptr + x, tmp); + svst1_s16(pg, output_ptr + x, tmp); - x += svcnth(); - pg = svwhilelt_b16(x, window_end_x); + x += svcnth(); + pg = svwhilelt_b16(x, window_end_x); - } - while(svptest_any(svptrue_b16(), pg)); - }, - input, output); + } while (svptest_any(svptrue_b16(), pg)); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/neon/fp16.cpp b/src/cpu/kernels/add/generic/neon/fp16.cpp index fca7b2cd9fdaf2187e2a0e7d88ba928cdf45a106..e7679c14e3d6125a9df65546ab82adf1fe012f66 100644 --- a/src/cpu/kernels/add/generic/neon/fp16.cpp +++ b/src/cpu/kernels/add/generic/neon/fp16.cpp @@ -30,10 +30,11 @@ namespace arm_compute { namespace cpu { -void add_fp16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_fp16_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { return add_same_neon(src0, src1, dst, policy, window); } -} +} // namespace cpu } // namespace arm_compute #endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git 
a/src/cpu/kernels/add/generic/neon/fp32.cpp b/src/cpu/kernels/add/generic/neon/fp32.cpp index 1f599b1968624eb52217fc72170af72daa9512c8..11a970bef4f5f10391fe700670f53c439f554d3a 100644 --- a/src/cpu/kernels/add/generic/neon/fp32.cpp +++ b/src/cpu/kernels/add/generic/neon/fp32.cpp @@ -28,9 +28,10 @@ namespace arm_compute { namespace cpu { -void add_fp32_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_fp32_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { return add_same_neon(src0, src1, dst, policy, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/neon/impl.cpp b/src/cpu/kernels/add/generic/neon/impl.cpp index a1734d7dd61e20ce75ac1358b00d78451e2c844c..34938cc4c47b2bf5d9b867d99f208af84c4877e9 100644 --- a/src/cpu/kernels/add/generic/neon/impl.cpp +++ b/src/cpu/kernels/add/generic/neon/impl.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022 Arm Limited. + * Copyright (c) 2020-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,113 +23,15 @@ */ #include "src/cpu/kernels/add/generic/neon/impl.h" + #include "arm_compute/core/Helpers.h" #include "arm_compute/core/utils/misc/Traits.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { namespace cpu { -template -void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) -{ - /** SIMD vector tag type. */ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - constexpr int window_step_x = 16 / sizeof(ScalarType); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src1 : src0; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(dst, win); - - execute_window_loop( - win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const ScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); - const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) : wrapper::vadd(broadcast_value_vec, non_broadcast_v); - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto non_broadcast_v = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(broadcast_value, non_broadcast_v) : broadcast_value + non_broadcast_v; - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src0, input1_win); - Iterator input2(src1, input2_win); - Iterator output(dst, win); - - execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto val1 = wrapper::vloadq(input1_ptr + x); - const auto val2 = wrapper::vloadq(input2_ptr + x); - const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2); - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto val1 = *(input1_ptr + x); - const auto val2 = *(input2_ptr + x); - *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? 
wrapper::add_sat(val1, val2) : val1 + val2; - } - }, - input1, input2, output); - } -} - bool sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) { return add_sub_q8_neon_fixedpoint_possible(src0, src1, dst, false); @@ -140,7 +42,10 @@ bool add_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo return add_sub_q8_neon_fixedpoint_possible(src0, src1, dst, true); } -bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, bool is_addition) +bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + bool is_addition) { const auto iq0 = src0->quantization_info().uniform(); const auto iq1 = src1->quantization_info().uniform(); @@ -149,7 +54,7 @@ bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorI const auto scale0 = iq0.scale / oq.scale; const auto scale1 = iq1.scale / oq.scale; - if(scale0 < -15.f || scale0 > 15.f || scale1 < -15.f || scale1 > 15.f) + if (scale0 < -15.f || scale0 > 15.f || scale1 < -15.f || scale1 > 15.f) { // The scale factor cannot be stored as 5.11 signed fixed-point number. return false; @@ -157,9 +62,10 @@ bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorI const auto offset = float(oq.offset) - scale0 * float(iq0.offset) - scale1 * float(iq1.offset); - const auto max_acc = is_addition ? ((std::abs(scale0) + std::abs(scale1)) * 256.f + std::abs(offset)) : ((std::abs(scale0) - std::abs(scale1)) * 256.f + std::abs(offset)); + const auto max_acc = is_addition ? ((std::abs(scale0) + std::abs(scale1)) * 256.f + std::abs(offset)) + : ((std::abs(scale0) - std::abs(scale1)) * 256.f + std::abs(offset)); - if(max_acc > 1048575.f) // 2^20 - 1 + if (max_acc > 1048575.f) // 2^20 - 1 { // It might not be possible to store the result as 21.11 signed fixed-point number. return false; @@ -169,13 +75,19 @@ bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorI } template -void add_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_q8_neon_fixedpoint( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { add_sub_q8_neon_fixedpoint(src0, src1, dst, policy, window, true /*is_addition*/); } template -void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition) +void add_sub_q8_neon_fixedpoint(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition) { ARM_COMPUTE_UNUSED(policy); @@ -203,7 +115,7 @@ void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITenso const auto oq_info = dst->info()->quantization_info().uniform(); const auto in0_scale = iq0_info.scale / oq_info.scale; const auto in1_scale = is_addition ? 
(iq1_info.scale / oq_info.scale) : (-(iq1_info.scale / oq_info.scale)); - const auto offset = float(oq_info.offset) - in0_scale * float(iq0_info.offset) - in1_scale * float(iq1_info.offset); + const auto offset = float(oq_info.offset) - in0_scale * float(iq0_info.offset) - in1_scale * float(iq1_info.offset); constexpr float _2pow11 = 2048; const auto in0_scale_5p11 = static_cast(support::cpp11::lround(in0_scale * _2pow11)); @@ -212,7 +124,7 @@ void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITenso constexpr uint8_t shift_amount_remainder = 3; - if(is_broadcast_across_x) + if (is_broadcast_across_x) { // Prefix: a = non-broadcast, b = broadcast. @@ -238,68 +150,75 @@ void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITenso Iterator out_it(dst, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto a_ptr = reinterpret_cast(a_input_it.ptr()); - const auto b_ptr = reinterpret_cast(b_input_it.ptr()); - const auto out_ptr = reinterpret_cast(out_it.ptr()); - - const auto b_val = *b_ptr; - const auto b_scaled = b_scale * b_val; - const auto b_scaled_21p11 = static_cast(support::cpp11::lround(b_scaled * _2pow11)); - const auto b_scaled_offseted_21p11 = b_scaled_21p11 + offset_21p11; - const auto b_vscaled_offseted_21p11 = wrapper::vdup_n(b_scaled_offseted_21p11, wrapper::traits::vector_128_tag()); + win, + [&](const Coordinates &) + { + const auto a_ptr = reinterpret_cast(a_input_it.ptr()); + const auto b_ptr = reinterpret_cast(b_input_it.ptr()); + const auto out_ptr = reinterpret_cast(out_it.ptr()); + + const auto b_val = *b_ptr; + const auto b_scaled = b_scale * b_val; + const auto b_scaled_21p11 = static_cast(support::cpp11::lround(b_scaled * _2pow11)); + const auto b_scaled_offseted_21p11 = b_scaled_21p11 + offset_21p11; + const auto b_vscaled_offseted_21p11 = + wrapper::vdup_n(b_scaled_offseted_21p11, wrapper::traits::vector_128_tag()); #ifndef __aarch64__ - const auto b_scaled_offseted = b_scaled + offset; + const auto b_scaled_offseted = b_scaled + offset; #endif // __aarch64__ - int x = window_start_x; - - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Load the input. - const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x); - - // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness. - const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0))); - const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0))); - - // Multiply the non-broadcast elements by the scale factor, add the scaled broadcast elements and the offset. - // Widen and store the result in 32-bit integer. - const auto vout_21p11_00 = wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgetlow(a_vin_16p0_0), a_vscale_5p11); - const auto vout_21p11_01 = wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgethigh(a_vin_16p0_0), a_vscale_5p11); - const auto vout_21p11_10 = wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgetlow(a_vin_16p0_1), a_vscale_5p11); - const auto vout_21p11_11 = wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgethigh(a_vin_16p0_1), a_vscale_5p11); - - // Remove 3 bits of the fractional part, round, narrow to 16-bit and saturate the result. 
- const auto vout_8p8_0 = wrapper::vcombine( - wrapper::vqrshrn_ex(vout_21p11_00), - wrapper::vqrshrn_ex(vout_21p11_01)); - const auto vout_8p8_1 = wrapper::vcombine( - wrapper::vqrshrn_ex(vout_21p11_10), - wrapper::vqrshrn_ex(vout_21p11_11)); - - // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result. - const auto vout_8p0 = wrapper::vcombine( - wrapper::vqrshrn<8>(vout_8p8_0), - wrapper::vqrshrn<8>(vout_8p8_1)); - - // Store the result. - wrapper::vstore(out_ptr + x, vout_8p0); - } - - // Process the left-over elements. - for(; x < window_end_x; ++x) - { + int x = window_start_x; + + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Load the input. + const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x); + + // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness. + const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0))); + const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0))); + + // Multiply the non-broadcast elements by the scale factor, add the scaled broadcast elements and the offset. + // Widen and store the result in 32-bit integer. + const auto vout_21p11_00 = + wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgetlow(a_vin_16p0_0), a_vscale_5p11); + const auto vout_21p11_01 = + wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgethigh(a_vin_16p0_0), a_vscale_5p11); + const auto vout_21p11_10 = + wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgetlow(a_vin_16p0_1), a_vscale_5p11); + const auto vout_21p11_11 = + wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgethigh(a_vin_16p0_1), a_vscale_5p11); + + // Remove 3 bits of the fractional part, round, narrow to 16-bit and saturate the result. + const auto vout_8p8_0 = + wrapper::vcombine(wrapper::vqrshrn_ex(vout_21p11_00), + wrapper::vqrshrn_ex(vout_21p11_01)); + const auto vout_8p8_1 = + wrapper::vcombine(wrapper::vqrshrn_ex(vout_21p11_10), + wrapper::vqrshrn_ex(vout_21p11_11)); + + // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result. + const auto vout_8p0 = + wrapper::vcombine(wrapper::vqrshrn<8>(vout_8p8_0), wrapper::vqrshrn<8>(vout_8p8_1)); + + // Store the result. + wrapper::vstore(out_ptr + x, vout_8p0); + } + + // Process the left-over elements. + for (; x < window_end_x; ++x) + { #ifdef __aarch64__ - out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex(int32_t(a_ptr[x]) * a_scale_5p11 + b_scaled_offseted_21p11)); + out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex( + int32_t(a_ptr[x]) * a_scale_5p11 + b_scaled_offseted_21p11)); #else // __aarch64__ - out_ptr[x] = utility::clamp(support::cpp11::lround(float(a_ptr[x]) * a_scale + b_scaled_offseted)); + out_ptr[x] = utility::clamp( + support::cpp11::lround(float(a_ptr[x]) * a_scale + b_scaled_offseted)); #endif // __aarch64__ - } - }, - b_input_it, a_input_it, out_it); + } + }, + b_input_it, a_input_it, out_it); } else { @@ -316,70 +235,85 @@ void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITenso Iterator out_it(dst, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto in0_ptr = reinterpret_cast(in0_it.ptr()); - const auto in1_ptr = reinterpret_cast(in1_it.ptr()); - const auto out_ptr = reinterpret_cast(out_it.ptr()); - - int x = window_start_x; - - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Load the inputs. 
- const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x); - const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x); - - // Widen the input elements to signed 16-bit regardless of the input signedness. - const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0))); - const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0))); - const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0))); - const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0))); - - // Multiply the input elements by the scale factor and add the offset. - // Widen and store the result in 32-bit integer. - const auto vscaled0_offseted_21p11_00 = wrapper::vmlal(voffset_21p11, wrapper::vgetlow(vin0_16p0_0), vscale0_5p11); - const auto vscaled0_offseted_21p11_01 = wrapper::vmlal(voffset_21p11, wrapper::vgethigh(vin0_16p0_0), vscale0_5p11); - const auto vscaled0_offseted_21p11_10 = wrapper::vmlal(voffset_21p11, wrapper::vgetlow(vin0_16p0_1), vscale0_5p11); - const auto vscaled0_offseted_21p11_11 = wrapper::vmlal(voffset_21p11, wrapper::vgethigh(vin0_16p0_1), vscale0_5p11); - - const auto vout_21p11_00 = wrapper::vmlal(vscaled0_offseted_21p11_00, wrapper::vgetlow(vin1_16p0_0), vscale1_5p11); - const auto vout_21p11_01 = wrapper::vmlal(vscaled0_offseted_21p11_01, wrapper::vgethigh(vin1_16p0_0), vscale1_5p11); - const auto vout_21p11_10 = wrapper::vmlal(vscaled0_offseted_21p11_10, wrapper::vgetlow(vin1_16p0_1), vscale1_5p11); - const auto vout_21p11_11 = wrapper::vmlal(vscaled0_offseted_21p11_11, wrapper::vgethigh(vin1_16p0_1), vscale1_5p11); - - // Remove 3 bits of the fractional part, round, narrow to 16-bit and saturate the result. - const auto vout_8p8_0 = wrapper::vcombine( - wrapper::vqrshrn_ex(vout_21p11_00), - wrapper::vqrshrn_ex(vout_21p11_01)); - const auto vout_8p8_1 = wrapper::vcombine( - wrapper::vqrshrn_ex(vout_21p11_10), - wrapper::vqrshrn_ex(vout_21p11_11)); - - // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result. - const auto vout_8p0 = wrapper::vcombine( - wrapper::vqrshrn<8>(vout_8p8_0), - wrapper::vqrshrn<8>(vout_8p8_1)); - - // Store the result. - wrapper::vstore(out_ptr + x, vout_8p0); - } - - // Process the left-over elements. - for(; x < window_end_x; ++x) + win, + [&](const Coordinates &) { + const auto in0_ptr = reinterpret_cast(in0_it.ptr()); + const auto in1_ptr = reinterpret_cast(in1_it.ptr()); + const auto out_ptr = reinterpret_cast(out_it.ptr()); + + int x = window_start_x; + + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Load the inputs. + const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x); + const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x); + + // Widen the input elements to signed 16-bit regardless of the input signedness. + const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0))); + const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0))); + const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0))); + const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0))); + + // Multiply the input elements by the scale factor and add the offset. + // Widen and store the result in 32-bit integer. 
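// Scalar model of what each lane computes on this non-broadcast path (the function
// name below is invented for illustration; the kernel's own left-over loop performs
// the same steps through wrapper::vqrshrn): widen the inputs, multiply by the Q5.11
// scales, accumulate onto the Q21.11 offset, then round and saturate back to 8 bits
// in two narrowing steps. Arithmetic right shifts are assumed, matching the NEON
// rounding-shift behaviour; an unsigned output would clamp to [0, 255] instead.

#include <algorithm>
#include <cstdint>

inline int8_t add_sub_q8_fixedpoint_scalar(
    int8_t in0, int8_t in1, int16_t scale0_5p11, int16_t scale1_5p11, int32_t offset_21p11)
{
    // Accumulate in Q21.11; an 8-bit input times a Q5.11 scale fits comfortably in 32 bits.
    const int32_t acc_21p11 = int32_t(in0) * scale0_5p11 + int32_t(in1) * scale1_5p11 + offset_21p11;
    // Rounding shift right by 3 with int16 saturation (Q21.11 -> Q8.8).
    const int32_t q8p8 = std::clamp<int32_t>((acc_21p11 + (1 << 2)) >> 3, INT16_MIN, INT16_MAX);
    // Rounding shift right by 8 with int8 saturation (Q8.8 -> Q8.0).
    return static_cast<int8_t>(std::clamp<int32_t>((q8p8 + (1 << 7)) >> 8, INT8_MIN, INT8_MAX));
}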
+ const auto vscaled0_offseted_21p11_00 = + wrapper::vmlal(voffset_21p11, wrapper::vgetlow(vin0_16p0_0), vscale0_5p11); + const auto vscaled0_offseted_21p11_01 = + wrapper::vmlal(voffset_21p11, wrapper::vgethigh(vin0_16p0_0), vscale0_5p11); + const auto vscaled0_offseted_21p11_10 = + wrapper::vmlal(voffset_21p11, wrapper::vgetlow(vin0_16p0_1), vscale0_5p11); + const auto vscaled0_offseted_21p11_11 = + wrapper::vmlal(voffset_21p11, wrapper::vgethigh(vin0_16p0_1), vscale0_5p11); + + const auto vout_21p11_00 = + wrapper::vmlal(vscaled0_offseted_21p11_00, wrapper::vgetlow(vin1_16p0_0), vscale1_5p11); + const auto vout_21p11_01 = + wrapper::vmlal(vscaled0_offseted_21p11_01, wrapper::vgethigh(vin1_16p0_0), vscale1_5p11); + const auto vout_21p11_10 = + wrapper::vmlal(vscaled0_offseted_21p11_10, wrapper::vgetlow(vin1_16p0_1), vscale1_5p11); + const auto vout_21p11_11 = + wrapper::vmlal(vscaled0_offseted_21p11_11, wrapper::vgethigh(vin1_16p0_1), vscale1_5p11); + + // Remove 3 bits of the fractional part, round, narrow to 16-bit and saturate the result. + const auto vout_8p8_0 = + wrapper::vcombine(wrapper::vqrshrn_ex(vout_21p11_00), + wrapper::vqrshrn_ex(vout_21p11_01)); + const auto vout_8p8_1 = + wrapper::vcombine(wrapper::vqrshrn_ex(vout_21p11_10), + wrapper::vqrshrn_ex(vout_21p11_11)); + + // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result. + const auto vout_8p0 = + wrapper::vcombine(wrapper::vqrshrn<8>(vout_8p8_0), wrapper::vqrshrn<8>(vout_8p8_1)); + + // Store the result. + wrapper::vstore(out_ptr + x, vout_8p0); + } + + // Process the left-over elements. + for (; x < window_end_x; ++x) + { #ifdef __aarch64__ - out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex(int32_t(in0_ptr[x]) * in0_scale_5p11 + int32_t(in1_ptr[x]) * in1_scale_5p11 + offset_21p11)); + out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex( + int32_t(in0_ptr[x]) * in0_scale_5p11 + int32_t(in1_ptr[x]) * in1_scale_5p11 + offset_21p11)); #else // __aarch64__ - out_ptr[x] = utility::clamp(support::cpp11::lround(float(in0_ptr[x]) * in0_scale + float(in1_ptr[x]) * in1_scale + offset)); + out_ptr[x] = utility::clamp( + support::cpp11::lround(float(in0_ptr[x]) * in0_scale + float(in1_ptr[x]) * in1_scale + offset)); #endif // __aarch64__ - } - }, - in0_it, in1_it, out_it); + } + }, + in0_it, in1_it, out_it); } } -void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition) +void add_sub_qasymm8_neon(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition) { ARM_COMPUTE_UNUSED(policy); @@ -404,7 +338,7 @@ void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst const auto scale2 = is_addition ? (iq2_info.scale / oq_info.scale) : (-(iq2_info.scale / oq_info.scale)); const auto offset = float(oq_info.offset) - scale1 * float(iq1_info.offset) - scale2 * float(iq2_info.offset); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? 
input2_win : input1_win; @@ -424,63 +358,64 @@ void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst Iterator output(dst, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = non_broadcast_input.ptr(); - const auto output_ptr = output.ptr(); - - const auto broadcast_value = *broadcast_input.ptr(); - const auto bf = vdupq_n_f32(float(broadcast_value) * scale2 + offset); - const auto bfs = float(broadcast_value) * bf_scale + offset; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const uint8x16_t a = vld1q_u8(non_broadcast_input_ptr + x); + const auto non_broadcast_input_ptr = non_broadcast_input.ptr(); + const auto output_ptr = output.ptr(); - const auto a_u16_0 = vmovl_u8(vget_low_u8(a)); - const auto a_u16_1 = vmovl_u8(vget_high_u8(a)); + const auto broadcast_value = *broadcast_input.ptr(); + const auto bf = vdupq_n_f32(float(broadcast_value) * scale2 + offset); + const auto bfs = float(broadcast_value) * bf_scale + offset; - const auto af_0 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_0))), vscale1); - const auto af_1 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_0))), vscale1); - const auto af_2 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_1))), vscale1); - const auto af_3 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_1))), vscale1); + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t a = vld1q_u8(non_broadcast_input_ptr + x); - int32x4_t rf_0{}; - int32x4_t rf_1{}; - int32x4_t rf_2{}; - int32x4_t rf_3{}; + const auto a_u16_0 = vmovl_u8(vget_low_u8(a)); + const auto a_u16_1 = vmovl_u8(vget_high_u8(a)); + + const auto af_0 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_0))), vscale1); + const auto af_1 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_0))), vscale1); + const auto af_2 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_1))), vscale1); + const auto af_3 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_1))), vscale1); + + int32x4_t rf_0{}; + int32x4_t rf_1{}; + int32x4_t rf_2{}; + int32x4_t rf_3{}; #ifdef __aarch64__ - rf_0 = vcvtnq_s32_f32(af_0); - rf_1 = vcvtnq_s32_f32(af_1); - rf_2 = vcvtnq_s32_f32(af_2); - rf_3 = vcvtnq_s32_f32(af_3); + rf_0 = vcvtnq_s32_f32(af_0); + rf_1 = vcvtnq_s32_f32(af_1); + rf_2 = vcvtnq_s32_f32(af_2); + rf_3 = vcvtnq_s32_f32(af_3); #else //__aarch64__ - rf_0 = vcvtq_s32_f32(af_0); - rf_1 = vcvtq_s32_f32(af_1); - rf_2 = vcvtq_s32_f32(af_2); - rf_3 = vcvtq_s32_f32(af_3); + rf_0 = vcvtq_s32_f32(af_0); + rf_1 = vcvtq_s32_f32(af_1); + rf_2 = vcvtq_s32_f32(af_2); + rf_3 = vcvtq_s32_f32(af_3); #endif //__aarch64__ - const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); - const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); - vst1q_u8(output_ptr + x, vcombine_u8(pa, pb)); - } + const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); + const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); + vst1q_u8(output_ptr + x, vcombine_u8(pa, pb)); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto result = float(non_broadcast_input_ptr[x]) * af_scale + bfs; + // Compute left-over elements + for (; x < window_end_x; ++x) + { + 
const auto result = float(non_broadcast_input_ptr[x]) * af_scale + bfs; #ifdef __aarch64__ - output_ptr[x] = utility::clamp(support::cpp11::lround(result)); + output_ptr[x] = utility::clamp(support::cpp11::lround(result)); #else // __aarch64__ - output_ptr[x] = utility::clamp(support::cpp11::trunc(result)); + output_ptr[x] = utility::clamp(support::cpp11::trunc(result)); #endif // __aarch64__ - } - }, - broadcast_input, non_broadcast_input, output); + } + }, + broadcast_input, non_broadcast_input, output); } else { @@ -497,72 +432,78 @@ void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst const auto voffset = vdupq_n_f32(offset); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = input1.ptr(); - const auto input2_ptr = input2.ptr(); - const auto output_ptr = output.ptr(); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const uint8x16_t a = vld1q_u8(input1_ptr + x); - const uint8x16_t b = vld1q_u8(input2_ptr + x); - - const auto a_u16_0 = vmovl_u8(vget_low_u8(a)); - const auto a_u16_1 = vmovl_u8(vget_high_u8(a)); - const auto b_u16_0 = vmovl_u8(vget_low_u8(b)); - const auto b_u16_1 = vmovl_u8(vget_high_u8(b)); - - const auto af_0 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_0))), vscale1); - const auto af_1 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_0))), vscale1); - const auto af_2 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_1))), vscale1); - const auto af_3 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_1))), vscale1); - - const auto bf_0 = vmlaq_f32(af_0, vcvtq_f32_u32(vmovl_u16(vget_low_u16(b_u16_0))), vscale2); - const auto bf_1 = vmlaq_f32(af_1, vcvtq_f32_u32(vmovl_u16(vget_high_u16(b_u16_0))), vscale2); - const auto bf_2 = vmlaq_f32(af_2, vcvtq_f32_u32(vmovl_u16(vget_low_u16(b_u16_1))), vscale2); - const auto bf_3 = vmlaq_f32(af_3, vcvtq_f32_u32(vmovl_u16(vget_high_u16(b_u16_1))), vscale2); - - int32x4_t rf_0{}; - int32x4_t rf_1{}; - int32x4_t rf_2{}; - int32x4_t rf_3{}; + const auto input1_ptr = input1.ptr(); + const auto input2_ptr = input2.ptr(); + const auto output_ptr = output.ptr(); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t a = vld1q_u8(input1_ptr + x); + const uint8x16_t b = vld1q_u8(input2_ptr + x); + + const auto a_u16_0 = vmovl_u8(vget_low_u8(a)); + const auto a_u16_1 = vmovl_u8(vget_high_u8(a)); + const auto b_u16_0 = vmovl_u8(vget_low_u8(b)); + const auto b_u16_1 = vmovl_u8(vget_high_u8(b)); + + const auto af_0 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_0))), vscale1); + const auto af_1 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_0))), vscale1); + const auto af_2 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_1))), vscale1); + const auto af_3 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_1))), vscale1); + + const auto bf_0 = vmlaq_f32(af_0, vcvtq_f32_u32(vmovl_u16(vget_low_u16(b_u16_0))), vscale2); + const auto bf_1 = vmlaq_f32(af_1, vcvtq_f32_u32(vmovl_u16(vget_high_u16(b_u16_0))), vscale2); + const auto bf_2 = vmlaq_f32(af_2, vcvtq_f32_u32(vmovl_u16(vget_low_u16(b_u16_1))), vscale2); + const auto bf_3 = vmlaq_f32(af_3, vcvtq_f32_u32(vmovl_u16(vget_high_u16(b_u16_1))), vscale2); + + int32x4_t rf_0{}; + int32x4_t rf_1{}; + 
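// The QASYMM8 path works in float: with scale1 = in1_scale / out_scale,
// scale2 = +/- in2_scale / out_scale and
// offset = out_offset - scale1 * in1_offset - scale2 * in2_offset,
// the quantized result is clamp(round(a * scale1 + b * scale2 + offset)), i.e.
// dequantize, add or subtract, and requantize folded into one expression. A scalar
// model of that identity (the struct and function names are invented for
// illustration and are not library types):

#include <algorithm>
#include <cmath>
#include <cstdint>

struct UniformQ
{
    float   scale;
    int32_t offset;
};

inline uint8_t add_sub_qasymm8_scalar(uint8_t a, uint8_t b, UniformQ qa, UniformQ qb, UniformQ qo, bool is_addition)
{
    const float scale1 = qa.scale / qo.scale;
    const float scale2 = is_addition ? (qb.scale / qo.scale) : -(qb.scale / qo.scale);
    const float offset = float(qo.offset) - scale1 * float(qa.offset) - scale2 * float(qb.offset);
    const float result = float(a) * scale1 + float(b) * scale2 + offset;
    // The AArch64 kernel rounds to nearest here; the 32-bit path truncates instead.
    return static_cast<uint8_t>(std::clamp<long>(std::lround(result), 0L, 255L));
}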
int32x4_t rf_2{}; + int32x4_t rf_3{}; #ifdef __aarch64__ - rf_0 = vcvtnq_s32_f32(bf_0); - rf_1 = vcvtnq_s32_f32(bf_1); - rf_2 = vcvtnq_s32_f32(bf_2); - rf_3 = vcvtnq_s32_f32(bf_3); + rf_0 = vcvtnq_s32_f32(bf_0); + rf_1 = vcvtnq_s32_f32(bf_1); + rf_2 = vcvtnq_s32_f32(bf_2); + rf_3 = vcvtnq_s32_f32(bf_3); #else //__aarch64__ - rf_0 = vcvtq_s32_f32(bf_0); - rf_1 = vcvtq_s32_f32(bf_1); - rf_2 = vcvtq_s32_f32(bf_2); - rf_3 = vcvtq_s32_f32(bf_3); + rf_0 = vcvtq_s32_f32(bf_0); + rf_1 = vcvtq_s32_f32(bf_1); + rf_2 = vcvtq_s32_f32(bf_2); + rf_3 = vcvtq_s32_f32(bf_3); #endif //__aarch64__ - const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); - const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); - vst1q_u8(output_ptr + x, vcombine_u8(pa, pb)); - } + const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); + const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); + vst1q_u8(output_ptr + x, vcombine_u8(pa, pb)); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto result = float(input1_ptr[x]) * scale1 + float(input2_ptr[x]) * scale2 + offset; + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto result = float(input1_ptr[x]) * scale1 + float(input2_ptr[x]) * scale2 + offset; #ifdef __aarch64__ - output_ptr[x] = utility::clamp(support::cpp11::lround(result)); + output_ptr[x] = utility::clamp(support::cpp11::lround(result)); #else // __aarch64__ - output_ptr[x] = utility::clamp(support::cpp11::trunc(result)); + output_ptr[x] = utility::clamp(support::cpp11::trunc(result)); #endif // __aarch64__ - } - }, - input1, input2, output); + } + }, + input1, input2, output); } } -void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition) +void add_sub_qasymm8_signed_neon(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition) { ARM_COMPUTE_UNUSED(policy); @@ -587,7 +528,7 @@ void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITens const auto scale2 = is_addition ? (iq2_info.scale / oq_info.scale) : (-(iq2_info.scale / oq_info.scale)); const auto offset = float(oq_info.offset) - scale1 * float(iq1_info.offset) - scale2 * float(iq2_info.offset); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? 
input2_win : input1_win; @@ -607,63 +548,64 @@ void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITens Iterator output(dst, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const auto broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto bf = vdupq_n_f32(float(broadcast_value) * scale2 + offset); - const auto bfs = float(broadcast_value) * bf_scale + offset; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const int8x16_t a = vld1q_s8(non_broadcast_input_ptr + x); + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + const auto broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const auto bf = vdupq_n_f32(float(broadcast_value) * scale2 + offset); + const auto bfs = float(broadcast_value) * bf_scale + offset; + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int8x16_t a = vld1q_s8(non_broadcast_input_ptr + x); - const auto a_s16_0 = vmovl_s8(vget_low_s8(a)); - const auto a_s16_1 = vmovl_s8(vget_high_s8(a)); + const auto a_s16_0 = vmovl_s8(vget_low_s8(a)); + const auto a_s16_1 = vmovl_s8(vget_high_s8(a)); - const auto af_0 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_0))), vscale1); - const auto af_1 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_0))), vscale1); - const auto af_2 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_1))), vscale1); - const auto af_3 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_1))), vscale1); + const auto af_0 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_0))), vscale1); + const auto af_1 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_0))), vscale1); + const auto af_2 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_1))), vscale1); + const auto af_3 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_1))), vscale1); - int32x4_t rf_0{}; - int32x4_t rf_1{}; - int32x4_t rf_2{}; - int32x4_t rf_3{}; + int32x4_t rf_0{}; + int32x4_t rf_1{}; + int32x4_t rf_2{}; + int32x4_t rf_3{}; #ifdef __aarch64__ - rf_0 = vcvtnq_s32_f32(af_0); - rf_1 = vcvtnq_s32_f32(af_1); - rf_2 = vcvtnq_s32_f32(af_2); - rf_3 = vcvtnq_s32_f32(af_3); + rf_0 = vcvtnq_s32_f32(af_0); + rf_1 = vcvtnq_s32_f32(af_1); + rf_2 = vcvtnq_s32_f32(af_2); + rf_3 = vcvtnq_s32_f32(af_3); #else //__aarch64__ - rf_0 = vcvtq_s32_f32(af_0); - rf_1 = vcvtq_s32_f32(af_1); - rf_2 = vcvtq_s32_f32(af_2); - rf_3 = vcvtq_s32_f32(af_3); + rf_0 = vcvtq_s32_f32(af_0); + rf_1 = vcvtq_s32_f32(af_1); + rf_2 = vcvtq_s32_f32(af_2); + rf_3 = vcvtq_s32_f32(af_3); #endif //__aarch64__ - const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); - const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); - vst1q_s8(output_ptr + x, vcombine_s8(pa, pb)); - } + const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); + const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); + vst1q_s8(output_ptr + x, vcombine_s8(pa, pb)); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto result = 
float(non_broadcast_input_ptr[x]) * af_scale + bfs; + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto result = float(non_broadcast_input_ptr[x]) * af_scale + bfs; #ifdef __aarch64__ - output_ptr[x] = utility::clamp(support::cpp11::lround(result)); + output_ptr[x] = utility::clamp(support::cpp11::lround(result)); #else // __aarch64__ - output_ptr[x] = utility::clamp(support::cpp11::trunc(result)); + output_ptr[x] = utility::clamp(support::cpp11::trunc(result)); #endif // __aarch64__ - } - }, - broadcast_input, non_broadcast_input, output); + } + }, + broadcast_input, non_broadcast_input, output); } else { @@ -680,88 +622,102 @@ void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITens const auto voffset = vdupq_n_f32(offset); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const int8x16_t a = vld1q_s8(input1_ptr + x); - const int8x16_t b = vld1q_s8(input2_ptr + x); - - const auto a_s16_0 = vmovl_s8(vget_low_s8(a)); - const auto a_s16_1 = vmovl_s8(vget_high_s8(a)); - const auto b_s16_0 = vmovl_s8(vget_low_s8(b)); - const auto b_s16_1 = vmovl_s8(vget_high_s8(b)); - - const auto af_0 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_0))), vscale1); - const auto af_1 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_0))), vscale1); - const auto af_2 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_1))), vscale1); - const auto af_3 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_1))), vscale1); - - const auto bf_0 = vmlaq_f32(af_0, vcvtq_f32_s32(vmovl_s16(vget_low_s16(b_s16_0))), vscale2); - const auto bf_1 = vmlaq_f32(af_1, vcvtq_f32_s32(vmovl_s16(vget_high_s16(b_s16_0))), vscale2); - const auto bf_2 = vmlaq_f32(af_2, vcvtq_f32_s32(vmovl_s16(vget_low_s16(b_s16_1))), vscale2); - const auto bf_3 = vmlaq_f32(af_3, vcvtq_f32_s32(vmovl_s16(vget_high_s16(b_s16_1))), vscale2); - - int32x4_t rf_0{}; - int32x4_t rf_1{}; - int32x4_t rf_2{}; - int32x4_t rf_3{}; + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int8x16_t a = vld1q_s8(input1_ptr + x); + const int8x16_t b = vld1q_s8(input2_ptr + x); + + const auto a_s16_0 = vmovl_s8(vget_low_s8(a)); + const auto a_s16_1 = vmovl_s8(vget_high_s8(a)); + const auto b_s16_0 = vmovl_s8(vget_low_s8(b)); + const auto b_s16_1 = vmovl_s8(vget_high_s8(b)); + + const auto af_0 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_0))), vscale1); + const auto af_1 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_0))), vscale1); + const auto af_2 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_1))), vscale1); + const auto af_3 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_1))), vscale1); + + const auto bf_0 = vmlaq_f32(af_0, vcvtq_f32_s32(vmovl_s16(vget_low_s16(b_s16_0))), vscale2); + const auto bf_1 = vmlaq_f32(af_1, vcvtq_f32_s32(vmovl_s16(vget_high_s16(b_s16_0))), vscale2); + const auto 
bf_2 = vmlaq_f32(af_2, vcvtq_f32_s32(vmovl_s16(vget_low_s16(b_s16_1))), vscale2); + const auto bf_3 = vmlaq_f32(af_3, vcvtq_f32_s32(vmovl_s16(vget_high_s16(b_s16_1))), vscale2); + + int32x4_t rf_0{}; + int32x4_t rf_1{}; + int32x4_t rf_2{}; + int32x4_t rf_3{}; #ifdef __aarch64__ - rf_0 = vcvtnq_s32_f32(bf_0); - rf_1 = vcvtnq_s32_f32(bf_1); - rf_2 = vcvtnq_s32_f32(bf_2); - rf_3 = vcvtnq_s32_f32(bf_3); + rf_0 = vcvtnq_s32_f32(bf_0); + rf_1 = vcvtnq_s32_f32(bf_1); + rf_2 = vcvtnq_s32_f32(bf_2); + rf_3 = vcvtnq_s32_f32(bf_3); #else //__aarch64__ - rf_0 = vcvtq_s32_f32(bf_0); - rf_1 = vcvtq_s32_f32(bf_1); - rf_2 = vcvtq_s32_f32(bf_2); - rf_3 = vcvtq_s32_f32(bf_3); + rf_0 = vcvtq_s32_f32(bf_0); + rf_1 = vcvtq_s32_f32(bf_1); + rf_2 = vcvtq_s32_f32(bf_2); + rf_3 = vcvtq_s32_f32(bf_3); #endif //__aarch64__ - const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); - const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); - vst1q_s8(output_ptr + x, vcombine_s8(pa, pb)); - } + const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); + const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); + vst1q_s8(output_ptr + x, vcombine_s8(pa, pb)); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto result = float(input1_ptr[x]) * scale1 + float(input2_ptr[x]) * scale2 + offset; + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto result = float(input1_ptr[x]) * scale1 + float(input2_ptr[x]) * scale2 + offset; #ifdef __aarch64__ - output_ptr[x] = utility::clamp(support::cpp11::lround(result)); + output_ptr[x] = utility::clamp(support::cpp11::lround(result)); #else // __aarch64__ - output_ptr[x] = utility::clamp(support::cpp11::trunc(result)); + output_ptr[x] = utility::clamp(support::cpp11::trunc(result)); #endif // __aarch64__ - } - }, - input1, input2, output); + } + }, + input1, input2, output); } } -template void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); -template void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); -template void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); -template void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); - -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -template void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); -#endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ - -template void add_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); -template void add_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); - -template void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition); -template void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition); - -void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const 
ConvertPolicy &policy, const Window &window, bool is_addition); -void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition); +template void add_q8_neon_fixedpoint( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_q8_neon_fixedpoint( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); + +template void add_sub_q8_neon_fixedpoint(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition); +template void add_sub_q8_neon_fixedpoint(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition); + +void add_sub_qasymm8_neon(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition); +void add_sub_qasymm8_signed_neon(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/neon/impl.h b/src/cpu/kernels/add/generic/neon/impl.h index d544ef5728de5d3a404994b536817907d14a2df5..faa99baffebc191a41b5ad5e4fceac4d51fc6759 100644 --- a/src/cpu/kernels/add/generic/neon/impl.h +++ b/src/cpu/kernels/add/generic/neon/impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,31 +23,161 @@ */ #ifndef SRC_CORE_NEON_KERNELS_ADD_IMPL_H #define SRC_CORE_NEON_KERNELS_ADD_IMPL_H +#include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" #include "arm_compute/core/Window.h" + +#include "src/core/NEON/wrapper/wrapper.h" + namespace arm_compute { namespace cpu { template -void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +void add_same_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + /** SIMD vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + constexpr int window_step_x = 16 / sizeof(ScalarType); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); + + if (is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src1 : src0; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + const ScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + const auto res = (policy == ConvertPolicy::SATURATE) + ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) + : wrapper::vadd(broadcast_value_vec, non_broadcast_v); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) + ? wrapper::add_sat(broadcast_value, non_broadcast_v) + : broadcast_value + non_broadcast_v; + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto val1 = wrapper::vloadq(input1_ptr + x); + const auto val2 = wrapper::vloadq(input2_ptr + x); + const auto res = + (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto val1 = *(input1_ptr + x); + const auto val2 = *(input2_ptr + x); + *(output_ptr + x) = + (policy == ConvertPolicy::SATURATE) ? 
wrapper::add_sat(val1, val2) : val1 + val2; + } + }, + input1, input2, output); + } +} bool add_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); bool sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); -bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, bool is_addition); +bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + bool is_addition); -void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition); +void add_sub_qasymm8_neon(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition); -void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition); +void add_sub_qasymm8_signed_neon(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition); template -void add_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +void add_q8_neon_fixedpoint( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); template -void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition); +void add_sub_q8_neon_fixedpoint(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition); } // namespace cpu } // namespace arm_compute -#endif // SRC_CORE_NEON_KERNELS_ADD_IMPL_H \ No newline at end of file +#endif // SRC_CORE_NEON_KERNELS_ADD_IMPL_H diff --git a/src/cpu/kernels/add/generic/neon/integer.cpp b/src/cpu/kernels/add/generic/neon/integer.cpp index 5698d6d552ee5cfddb2671d52cc43c8891de0f2d..f0bcebc9d29eef3bb5a4733385895fe7d49e2c43 100644 --- a/src/cpu/kernels/add/generic/neon/integer.cpp +++ b/src/cpu/kernels/add/generic/neon/integer.cpp @@ -28,19 +28,22 @@ namespace arm_compute { namespace cpu { -void add_u8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_u8_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { return add_same_neon(src0, src1, dst, policy, window); } -void add_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_s16_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { return add_same_neon(src0, src1, dst, policy, window); } -void add_s32_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_s32_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { return add_same_neon(src0, src1, dst, policy, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/neon/qasymm8.cpp b/src/cpu/kernels/add/generic/neon/qasymm8.cpp index 
69cca956c8682fdd98cf27b41fe562beb866c855..8195d229d99f9c55f10a30fcdafafed9c99da4e2 100644 --- a/src/cpu/kernels/add/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/add/generic/neon/qasymm8.cpp @@ -23,15 +23,17 @@ */ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" + #include "src/cpu/kernels/add/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void add_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_qasymm8_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { add_sub_qasymm8_neon(src0, src1, dst, policy, window, true /*is_addition*/); } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp index dfdf8fe85b3c864d6a8814da34d5c91112ab329c..7e23096239e5df5434c441bf5cc484ec1ac730b1 100644 --- a/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp @@ -23,15 +23,17 @@ */ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" + #include "src/cpu/kernels/add/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void add_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_qasymm8_signed_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { add_sub_qasymm8_signed_neon(src0, src1, dst, policy, window, true /*is_addition*/); } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/neon/qsymm16.cpp b/src/cpu/kernels/add/generic/neon/qsymm16.cpp index e76e408d6e43058c4b404d14f3c53e485cc526b6..ac2de0557a2a7054810218e42763dbae1e9dfbf2 100644 --- a/src/cpu/kernels/add/generic/neon/qsymm16.cpp +++ b/src/cpu/kernels/add/generic/neon/qsymm16.cpp @@ -25,14 +25,16 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" namespace arm_compute { namespace cpu { -void add_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_qsymm16_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { ARM_COMPUTE_UNUSED(policy); @@ -57,7 +59,7 @@ void add_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -65,7 +67,7 @@ void add_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src1 : src0; const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); // Clear X Dimension on execution window as we handle manually non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); @@ -74,48 +76,50 @@ void add_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const int16_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value); - - const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2); - const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2); - const float bfs = static_cast(broadcast_value) * broadcast_qinfo.scale; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x); - const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1); - const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1); - - int32x4_t rf_0{}; - int32x4_t rf_1{}; + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + const int16_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value); + + const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2); + const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2); + const float bfs = static_cast(broadcast_value) * broadcast_qinfo.scale; + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x); + const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1); + const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1); + + int32x4_t rf_0{}; + int32x4_t rf_1{}; #ifdef __aarch64__ - rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); + rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); + rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); #else //__aarch64__ - rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); + rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); + rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); #endif //__aarch64__ - const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)); - vst1q_s16(output_ptr + x, pa); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float 
afs = static_cast(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale; - *(output_ptr + x) = quantize_qsymm16((afs + bfs), oq_info); - } - }, - broadcast_input, non_broadcast_input, output); + const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)); + vst1q_s16(output_ptr + x, pa); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const float afs = static_cast(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale; + *(output_ptr + x) = quantize_qsymm16((afs + bfs), oq_info); + } + }, + broadcast_input, non_broadcast_input, output); } else { @@ -127,48 +131,50 @@ void add_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co Iterator input2(src1, input2_win); Iterator output(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int16x8_t a = vld1q_s16(input1_ptr + x); - const int16x8_t b = vld1q_s16(input2_ptr + x); - - const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1); - const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1); - const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2); - const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2); - - int32x4_t rf_0{}; - int32x4_t rf_1{}; + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8_t a = vld1q_s16(input1_ptr + x); + const int16x8_t b = vld1q_s16(input2_ptr + x); + + const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1); + const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1); + const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2); + const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2); + + int32x4_t rf_0{}; + int32x4_t rf_1{}; #ifdef __aarch64__ - rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); + rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); + rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); #else //__aarch64__ - rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); + rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); + rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); #endif //__aarch64__ - const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)); - vst1q_s16(output_ptr + x, pa); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast((*(input1_ptr + x))) * iq1_info.scale; - const float bfs = static_cast((*(input2_ptr + x))) * iq2_info.scale; - *(output_ptr + x) = quantize_qsymm16((afs + bfs), dst->info()->quantization_info()); - } - }, - input1, input2, output); + const int16x8_t pa = 
vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)); + vst1q_s16(output_ptr + x, pa); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const float afs = static_cast((*(input1_ptr + x))) * iq1_info.scale; + const float bfs = static_cast((*(input2_ptr + x))) * iq2_info.scale; + *(output_ptr + x) = quantize_qsymm16((afs + bfs), dst->info()->quantization_info()); + } + }, + input1, input2, output); } } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/sve/fp16.cpp b/src/cpu/kernels/add/generic/sve/fp16.cpp index 581f3abded3001663f5e6fd6f822a8e664d130f1..01dfe6c44bebefee5e0541d7ed685415bb2a3639 100644 --- a/src/cpu/kernels/add/generic/sve/fp16.cpp +++ b/src/cpu/kernels/add/generic/sve/fp16.cpp @@ -31,10 +31,11 @@ namespace arm_compute { namespace cpu { -void add_fp16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_fp16_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { return add_same_sve(src0, src1, dst, policy, window); } -} +} // namespace cpu } // namespace arm_compute #endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/add/generic/sve/fp32.cpp b/src/cpu/kernels/add/generic/sve/fp32.cpp index b37799113a4d71f956fdd74efa3aa94eabfa1c1d..56771a5411040bdf8a338cf0c4b5cb483e184382 100644 --- a/src/cpu/kernels/add/generic/sve/fp32.cpp +++ b/src/cpu/kernels/add/generic/sve/fp32.cpp @@ -24,15 +24,17 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" + #include "src/cpu/kernels/add/generic/sve/impl.h" namespace arm_compute { namespace cpu { -void add_fp32_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_fp32_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { return add_same_sve(src0, src1, dst, policy, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/sve/impl.cpp b/src/cpu/kernels/add/generic/sve/impl.cpp index e8606436fd02f7d3ed3e7888467cb4a2bc0e9b83..ca850fcef463c1b2e64f08c108cfcb9148bf37e1 100644 --- a/src/cpu/kernels/add/generic/sve/impl.cpp +++ b/src/cpu/kernels/add/generic/sve/impl.cpp @@ -23,17 +23,21 @@ */ #include "src/cpu/kernels/add/generic/sve/impl.h" + #include "arm_compute/core/Helpers.h" #include "arm_compute/core/utils/misc/Traits.h" + #include "src/core/NEON/SVEMath.h" #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include namespace arm_compute { namespace cpu { template -void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_same_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { const auto all_true_pg = wrapper::svptrue(); const auto window_start_x = static_cast(window.x().start()); @@ -53,7 +57,7 @@ void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape())); Iterator output(dst, window); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? 
input2_win : input1_win; @@ -68,28 +72,30 @@ void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - const ScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::svdup_n(broadcast_value); + const ScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::svdup_n(broadcast_value); - int x = window_start_x; - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do - { - const auto non_broadcast_v = svld1(pg, non_broadcast_input_ptr + x); - auto res = is_sat ? wrapper::svqadd(broadcast_value_vec, non_broadcast_v) : svadd_z(pg, broadcast_value_vec, non_broadcast_v); - svst1(pg, output_ptr + x, res); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); + int x = window_start_x; + svbool_t pg = wrapper::svwhilelt(x, window_end_x); + do + { + const auto non_broadcast_v = svld1(pg, non_broadcast_input_ptr + x); + auto res = is_sat ? wrapper::svqadd(broadcast_value_vec, non_broadcast_v) + : svadd_z(pg, broadcast_value_vec, non_broadcast_v); + svst1(pg, output_ptr + x, res); + + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); } else { @@ -101,35 +107,41 @@ void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const Iterator input2(src1, input2_win); Iterator output(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &) { - const auto val1 = svld1(pg, input1_ptr + x); - const auto val2 = svld1(pg, input2_ptr + x); - const auto res = is_sat ? wrapper::svqadd(val1, val2) : svadd_z(pg, val1, val2); - svst1(pg, output_ptr + x, res); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + int x = window_start_x; + svbool_t pg = wrapper::svwhilelt(x, window_end_x); + do + { + const auto val1 = svld1(pg, input1_ptr + x); + const auto val2 = svld1(pg, input2_ptr + x); + const auto res = is_sat ? 
wrapper::svqadd(val1, val2) : svadd_z(pg, val1, val2); + svst1(pg, output_ptr + x, res); + + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input1, input2, output); } } -template void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); -template void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); -template void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); -template void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_same_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_same_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_same_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_same_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -template void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_same_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); #endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/sve/impl.h b/src/cpu/kernels/add/generic/sve/impl.h index 0136f142465f4bfcdc9b495f02dbc2e8c6e16177..6a95d66826580e6f72dc0f3af63979e3c12e6bd4 100644 --- a/src/cpu/kernels/add/generic/sve/impl.h +++ b/src/cpu/kernels/add/generic/sve/impl.h @@ -33,7 +33,8 @@ namespace arm_compute namespace cpu { template -void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +void add_same_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); } // namespace cpu } // namespace arm_compute #endif // SRC_CORE_SVE_KERNELS_ADD_IMPL_H diff --git a/src/cpu/kernels/add/generic/sve/integer.cpp b/src/cpu/kernels/add/generic/sve/integer.cpp index 3642dccd7b7305534f13c03b399062f6c17333a3..4d17f2adbd4e51c0784f9f2d715b5af2608fcd6b 100644 --- a/src/cpu/kernels/add/generic/sve/integer.cpp +++ b/src/cpu/kernels/add/generic/sve/integer.cpp @@ -24,25 +24,29 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" + #include "src/cpu/kernels/add/generic/sve/impl.h" namespace arm_compute { namespace cpu { -void add_u8_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_u8_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { return add_same_sve(src0, src1, dst, policy, window); } -void add_s16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_s16_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window 
&window) { return add_same_sve(src0, src1, dst, policy, window); } -void add_s32_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_s32_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { return add_same_sve(src0, src1, dst, policy, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/sve2/qasymm8.cpp b/src/cpu/kernels/add/generic/sve2/qasymm8.cpp index 1dec214aa0719077d100fced9157e0ec56007a3a..40add9d51b1c3bc7591443f8f29eecae2dfee8f6 100644 --- a/src/cpu/kernels/add/generic/sve2/qasymm8.cpp +++ b/src/cpu/kernels/add/generic/sve2/qasymm8.cpp @@ -26,15 +26,18 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" + #include "src/core/NEON/SVEMath.h" #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include namespace arm_compute { namespace cpu { -void add_qasymm8_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_qasymm8_sve2( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { ARM_COMPUTE_UNUSED(policy); @@ -58,7 +61,7 @@ void add_qasymm8_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, co const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale); const auto voffseto = svdup_n_f32(oq_info.offset); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -78,48 +81,89 @@ void add_qasymm8_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, co Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - const uint8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const svuint8_t broadcast_value_vec = svdup_n_u8(broadcast_value); + const uint8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const svuint8_t broadcast_value_vec = svdup_n_u8(broadcast_value); - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); - const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(broadcast_value_vec))), voffset2)), vscale2); - const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(broadcast_value_vec))), voffset2)), vscale2); - const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(broadcast_value_vec))), voffset2)), vscale2); - const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(broadcast_value_vec))), voffset2)), vscale2); + const auto bf_0 = svmul_f32_z( + pg, + svcvt_f32_s32_z( + pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(broadcast_value_vec))), + 
voffset2)), + vscale2); + const auto bf_1 = svmul_f32_z( + pg, + svcvt_f32_s32_z( + pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(broadcast_value_vec))), + voffset2)), + vscale2); + const auto bf_2 = svmul_f32_z( + pg, + svcvt_f32_s32_z( + pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(broadcast_value_vec))), + voffset2)), + vscale2); + const auto bf_3 = svmul_f32_z( + pg, + svcvt_f32_s32_z( + pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(broadcast_value_vec))), + voffset2)), + vscale2); - do - { - const svuint8_t a = svld1_u8(pg, non_broadcast_input_ptr + x); - - const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), vscale1); - const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), vscale1); - const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), vscale1); - const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), vscale1); - - const auto rf_0 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); - const auto rf_1 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); - const auto rf_2 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); - const auto rf_3 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); - - const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1); - const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3); - - const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb); - svst1_u8(pg, output_ptr + x, res); - - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); + do + { + const svuint8_t a = svld1_u8(pg, non_broadcast_input_ptr + x); + + const auto af_0 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), + vscale1); + const auto af_1 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), + vscale1); + const auto af_2 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), + vscale1); + const auto af_3 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), + vscale1); + + const auto rf_0 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); + const auto rf_1 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); + const auto rf_2 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); + const auto rf_3 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); + + const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1); + const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3); + + const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb); + svst1_u8(pg, output_ptr + x, res); + + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); } else { @@ -136,45 +180,82 @@ void 
add_qasymm8_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, co const auto voffset1 = svdup_n_s32(iq1_info.offset); const auto voffset2 = svdup_n_s32(iq2_info.offset); - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &) { - const auto a = svld1_u8(pg, input1_ptr + x); - const auto b = svld1_u8(pg, input2_ptr + x); - const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), vscale1); - const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), vscale1); - const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), vscale1); - const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), vscale1); - - const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(b))), voffset2)), vscale2); - const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(b))), voffset2)), vscale2); - const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(b))), voffset2)), vscale2); - const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(b))), voffset2)), vscale2); - - const auto rf_0 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); - const auto rf_1 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); - const auto rf_2 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); - const auto rf_3 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); - - const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1); - const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3); - const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb); - - svst1_u8(pg, output_ptr + x, res); - - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + const auto a = svld1_u8(pg, input1_ptr + x); + const auto b = svld1_u8(pg, input2_ptr + x); + const auto af_0 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), + vscale1); + const auto af_1 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), + vscale1); + const auto af_2 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), + vscale1); + const auto af_3 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), + 
vscale1); + + const auto bf_0 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(b))), voffset2)), + vscale2); + const auto bf_1 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(b))), voffset2)), + vscale2); + const auto bf_2 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(b))), voffset2)), + vscale2); + const auto bf_3 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(b))), voffset2)), + vscale2); + + const auto rf_0 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); + const auto rf_1 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); + const auto rf_2 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); + const auto rf_3 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); + + const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1); + const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3); + const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb); + + svst1_u8(pg, output_ptr + x, res); + + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input1, input2, output); } } } // namespace cpu diff --git a/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp index dae8899753d4c9198bc562346c114a4b5ff44150..2e585115e1aea79e4350c3ae805c4e6b59cc3400 100644 --- a/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp +++ b/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp @@ -26,15 +26,18 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" + #include "src/core/NEON/SVEMath.h" #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include namespace arm_compute { namespace cpu { -void add_qasymm8_signed_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_qasymm8_signed_sve2( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { ARM_COMPUTE_UNUSED(policy); @@ -57,7 +60,7 @@ void add_qasymm8_signed_sve2(const ITensor *src0, const ITensor *src1, ITensor * const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale); const auto voffseto = svdup_n_f32(oq_info.offset); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? 
input2_win : input1_win; @@ -78,46 +81,63 @@ void add_qasymm8_signed_sve2(const ITensor *src0, const ITensor *src1, ITensor * Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const int8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = svdup_n_s8(broadcast_value); - - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), vscale2); - const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), vscale2); - const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), vscale2); - const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), vscale2); - - do + execute_window_loop( + win, + [&](const Coordinates &) { - const auto a = svld1_s8(pg, non_broadcast_input_ptr + x); - const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1); - const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1); - const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1); - const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1); - - const auto rf_0 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); - const auto rf_1 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); - const auto rf_2 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); - const auto rf_3 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); - - const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); - const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3); - const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb); - - svst1_s8(pg, output_ptr + x, res); - - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + const int8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const auto broadcast_value_vec = svdup_n_s8(broadcast_value); + + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + const auto bf_0 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), + vscale2); + const auto bf_1 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), + vscale2); + const auto bf_2 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), + vscale2); + const auto bf_3 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), + 
vscale2); + + do + { + const auto a = svld1_s8(pg, non_broadcast_input_ptr + x); + const auto af_0 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1); + const auto af_1 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1); + const auto af_2 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1); + const auto af_3 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1); + + const auto rf_0 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); + const auto rf_1 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); + const auto rf_2 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); + const auto rf_3 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); + + const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); + const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3); + const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb); + + svst1_s8(pg, output_ptr + x, res); + + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); } else { @@ -134,46 +154,59 @@ void add_qasymm8_signed_sve2(const ITensor *src0, const ITensor *src1, ITensor * const auto voffset1 = svdup_n_s32(iq1_info.offset); const auto voffset2 = svdup_n_s32(iq2_info.offset); - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &) { - const auto a = svld1_s8(pg, input1_ptr + x); - const auto b = svld1_s8(pg, input2_ptr + x); - - const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1); - const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1); - const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1); - const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1); - - const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(b)), voffset2)), vscale2); - const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(b)), voffset2)), vscale2); - const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(b)), voffset2)), vscale2); - const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(b)), voffset2)), vscale2); - - const auto rf_0 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); - const auto rf_1 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); - const auto rf_2 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); - const auto rf_3 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); - - const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); - 
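The SVE2 QASYMM8/QASYMM8_SIGNED kernels above widen each 8-bit lane, dequantize both inputs with their own scale and offset, add in fp32, then requantize and saturate into the destination's quantization domain. A minimal scalar sketch of that per-lane arithmetic follows (plain C++; the helper name is illustrative and the float-to-int rounding is simplified relative to the svcvt-based vector path):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Reference for one QASYMM8_SIGNED lane: dequantize both inputs, add in fp32,
// then requantize into the output's (scale, offset) domain with saturation.
// Rounding here is round-to-nearest; the vector kernels convert with svcvt.
inline int8_t add_qasymm8_signed_ref(int8_t a, int8_t b,
                                     float scale1, int32_t offset1,
                                     float scale2, int32_t offset2,
                                     float scale_out, int32_t offset_out)
{
    const float af  = (static_cast<int32_t>(a) - offset1) * scale1; // dequantize input 1
    const float bf  = (static_cast<int32_t>(b) - offset2) * scale2; // dequantize input 2
    const float sum = af + bf;                                      // add in fp32
    const int32_t q = static_cast<int32_t>(std::lround(sum / scale_out)) + offset_out;
    return static_cast<int8_t>(std::clamp<int32_t>(q, -128, 127));  // saturate to 8 bits
}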
const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3); - const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb); - - svst1_s8(pg, output_ptr + x, res); - - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(svptrue_b8(), pg)); - }, - input1, input2, output); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + const auto a = svld1_s8(pg, input1_ptr + x); + const auto b = svld1_s8(pg, input2_ptr + x); + + const auto af_0 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1); + const auto af_1 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1); + const auto af_2 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1); + const auto af_3 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1); + + const auto bf_0 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(b)), voffset2)), vscale2); + const auto bf_1 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(b)), voffset2)), vscale2); + const auto bf_2 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(b)), voffset2)), vscale2); + const auto bf_3 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(b)), voffset2)), vscale2); + + const auto rf_0 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); + const auto rf_1 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); + const auto rf_2 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); + const auto rf_3 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); + + const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); + const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3); + const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb); + + svst1_s8(pg, output_ptr + x, res); + + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); + } while (svptest_any(svptrue_b8(), pg)); + }, + input1, input2, output); } } } // namespace cpu diff --git a/src/cpu/kernels/add/generic/sve2/qsymm16.cpp b/src/cpu/kernels/add/generic/sve2/qsymm16.cpp index 8c48ded9429a8d18f7cfe3b4f9d329cce3b9a22c..17a42c213853cb8088f61a96edb84fd824af0eb4 100644 --- a/src/cpu/kernels/add/generic/sve2/qsymm16.cpp +++ b/src/cpu/kernels/add/generic/sve2/qsymm16.cpp @@ -26,15 +26,18 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" + #include "src/core/NEON/SVEMath.h" #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include namespace arm_compute { namespace cpu { -void add_qsymm16_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_qsymm16_sve2( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { ARM_COMPUTE_UNUSED(policy); @@ -59,7 +62,7 @@ void add_qsymm16_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, co const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale); const auto all_true_pg = svptrue_b16(); - 
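Every SVE/SVE2 kernel in this patch, including the QSYMM16 one below, drives its inner loop with the same predication pattern: build a governing predicate with svwhilelt, process a full or partial vector under it, advance by the vector length, and stop once svptest_any reports no active lanes. A minimal fp32 sketch of that control flow (assumes an SVE-enabled toolchain; the function name is illustrative):

#include <cstdint>

#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>

// Minimal predicated SVE loop: adds two float arrays of arbitrary length.
// The predicate `pg` covers a full vector in the steady state and only the
// remaining lanes on the final iteration, so no scalar tail loop is needed.
void add_f32_sve_sketch(const float *a, const float *b, float *dst, int64_t n)
{
    int64_t  x  = 0;
    svbool_t pg = svwhilelt_b32_s64(x, n);
    do
    {
        const svfloat32_t va = svld1_f32(pg, a + x);
        const svfloat32_t vb = svld1_f32(pg, b + x);
        svst1_f32(pg, dst + x, svadd_f32_z(pg, va, vb));

        x += static_cast<int64_t>(svcntw());  // advance by the number of 32-bit lanes
        pg = svwhilelt_b32_s64(x, n);         // re-predicate for the next, possibly partial, chunk
    } while (svptest_any(svptrue_b32(), pg));
}
#endif // __ARM_FEATURE_SVE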
if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -74,39 +77,40 @@ void add_qsymm16_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, co Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - const int16_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = svdup_n_s16(broadcast_value); + const int16_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const auto broadcast_value_vec = svdup_n_s16(broadcast_value); - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); - const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(broadcast_value_vec)), vscale2); - const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(broadcast_value_vec)), vscale2); + const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(broadcast_value_vec)), vscale2); + const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(broadcast_value_vec)), vscale2); - do - { - const auto a = svld1_s16(pg, non_broadcast_input_ptr + x); - const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1); - const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1); + do + { + const auto a = svld1_s16(pg, non_broadcast_input_ptr + x); + const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1); + const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1); - const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); - const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); + const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); + const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); - const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); + const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); - svst1_s16(pg, output_ptr + x, res); + svst1_s16(pg, output_ptr + x, res); - x += svcnth(); - pg = svwhilelt_b16(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); + x += svcnth(); + pg = svwhilelt_b16(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); } else { @@ -118,37 +122,38 @@ void add_qsymm16_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, co Iterator input2(src1, input2_win); Iterator output(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &) { - auto a = 
svld1_s16(pg, input1_ptr + x); - auto b = svld1_s16(pg, input2_ptr + x); - - const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1); - const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1); - - const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(b)), vscale2); - const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(b)), vscale2); - - const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); - const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); - - const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); - svst1_s16(pg, output_ptr + x, res); - - x += svcnth(); - pg = svwhilelt_b16(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do + { + auto a = svld1_s16(pg, input1_ptr + x); + auto b = svld1_s16(pg, input2_ptr + x); + + const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1); + const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1); + + const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(b)), vscale2); + const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(b)), vscale2); + + const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); + const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); + + const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); + svst1_s16(pg, output_ptr + x, res); + + x += svcnth(); + pg = svwhilelt_b16(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input1, input2, output); } } } // namespace cpu diff --git a/src/cpu/kernels/add/list.h b/src/cpu/kernels/add/list.h index 7cdb70fd9eec7222d3271258a259d81fa7ee0614..1040c39a41279c5eaaed04b5d5df8a03ec6be764 100644 --- a/src/cpu/kernels/add/list.h +++ b/src/cpu/kernels/add/list.h @@ -31,8 +31,9 @@ namespace arm_compute { namespace cpu { -#define DECLARE_ADD_KERNEL(func_name) \ - void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +#define DECLARE_ADD_KERNEL(func_name) \ + void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, \ + const Window &window) DECLARE_ADD_KERNEL(add_qasymm8_neon); DECLARE_ADD_KERNEL(add_qasymm8_signed_neon); @@ -55,4 +56,4 @@ DECLARE_ADD_KERNEL(add_qsymm16_sve2); } // namespace cpu } // namespace arm_compute -#endif // SRC_CORE_KERNELS_ADD_LIST_H \ No newline at end of file +#endif // SRC_CORE_KERNELS_ADD_LIST_H diff --git a/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp b/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp index d8e5f694a844e5fb5f192511bb77bbeb93a76f9c..b4b81aa78b4714b716b98347f6d16ec09b402dbc 100644 --- a/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp +++ b/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Window.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/cpu/CpuTypes.h" #include @@ -38,16 +39,20 @@ namespace { using arm_compute::float16_t; -void a64_add_bn_clamp_direct_fp16_2x32( - float16_t *out, size_t out_stride, - float16_t *out_direct, 
size_t out_direct_stride, - const float16_t *in0, size_t in0_stride, - const float16_t *in1, size_t in1_stride, - const float16_t *bn_mul, - const float16_t *bn_add, - const float16_t minval, - const float16_t maxval, - size_t width, size_t height) +void a64_add_bn_clamp_direct_fp16_2x32(float16_t *out, + size_t out_stride, + float16_t *out_direct, + size_t out_direct_stride, + const float16_t *in0, + size_t in0_stride, + const float16_t *in1, + size_t in1_stride, + const float16_t *bn_mul, + const float16_t *bn_add, + const float16_t minval, + const float16_t maxval, + size_t width, + size_t height) { struct KernelArgs { @@ -858,9 +863,14 @@ void a64_add_bn_clamp_direct_fp16_2x32( "subs x20, x20, #0x2\n" "bgt 8b\n" "58:" // odd columns skip - : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width) - : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); + : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), + [out_direct] "+&r"(out_direct), [width] "+&r"(width) + : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), + [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), + [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", + "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); } } // namespace @@ -869,8 +879,15 @@ namespace arm_compute { namespace cpu { -void add_mul_add_fp16_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add, - ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window) +void add_mul_add_fp16_neon(const ITensor *input1, + const ITensor *input2, + const ITensor *bn_mul, + const ITensor *bn_add, + ITensor *add_output, + ITensor *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info, + const Window &window) { ARM_COMPUTE_UNUSED(policy); @@ -882,16 +899,16 @@ void add_mul_add_fp16_neon(const ITensor *input1, const ITensor *input2, const I float16_t minval = std::numeric_limits::lowest(); float16_t maxval = std::numeric_limits::max(); - if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) + if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) { minval = static_cast(0.f); } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) { minval = static_cast(0.f); maxval = static_cast(act_info.a()); } - else 
if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) { minval = static_cast(act_info.b()); maxval = static_cast(act_info.a()); @@ -909,42 +926,37 @@ void add_mul_add_fp16_neon(const ITensor *input1, const ITensor *input2, const I const size_t width = window.num_iterations(0); const size_t height = window.num_iterations(1); - if(add_output != nullptr) + if (add_output != nullptr) { Iterator add_out_it(add_output, window); execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_fp16_2x32( - reinterpret_cast(out_it.ptr()), out_stride, - reinterpret_cast(add_out_it.ptr()), out_direct_stride, - reinterpret_cast(in1_it.ptr()), in0_stride, - reinterpret_cast(in2_it.ptr()), in1_stride, - reinterpret_cast(bn_mul->buffer()), - reinterpret_cast(bn_add->buffer()), - minval, - maxval, - width, height); - }, - in1_it, in2_it, add_out_it, out_it); + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_fp16_2x32(reinterpret_cast(out_it.ptr()), out_stride, + reinterpret_cast(add_out_it.ptr()), out_direct_stride, + reinterpret_cast(in1_it.ptr()), in0_stride, + reinterpret_cast(in2_it.ptr()), in1_stride, + reinterpret_cast(bn_mul->buffer()), + reinterpret_cast(bn_add->buffer()), minval, maxval, + width, height); + }, + in1_it, in2_it, add_out_it, out_it); } else { execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_fp16_2x32( - reinterpret_cast(out_it.ptr()), out_stride, - nullptr, out_direct_stride, - reinterpret_cast(in1_it.ptr()), in0_stride, - reinterpret_cast(in2_it.ptr()), in1_stride, - reinterpret_cast(bn_mul->buffer()), - reinterpret_cast(bn_add->buffer()), - minval, - maxval, - width, height); - }, - in1_it, in2_it, out_it); + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_fp16_2x32(reinterpret_cast(out_it.ptr()), out_stride, nullptr, + out_direct_stride, reinterpret_cast(in1_it.ptr()), + in0_stride, reinterpret_cast(in2_it.ptr()), in1_stride, + reinterpret_cast(bn_mul->buffer()), + reinterpret_cast(bn_add->buffer()), minval, maxval, + width, height); + }, + in1_it, in2_it, out_it); } } } // namespace cpu diff --git a/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp b/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp index b0c487ec56a5e6b34495beaa7934ed9f6cf8fdf5..f0444b6acd3048c033ec4c334e87e731d830eab2 100644 --- a/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp +++ b/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp @@ -35,16 +35,20 @@ #ifdef __aarch64__ namespace { -void a64_add_bn_clamp_direct_fp32_2x16( - float *out, size_t out_stride, - float *out_direct, size_t out_direct_stride, - const float *in0, size_t in0_stride, - const float *in1, size_t in1_stride, - const float *bn_mul, - const float *bn_add, - const float minval, - const float maxval, - size_t width, size_t height) +void a64_add_bn_clamp_direct_fp32_2x16(float *out, + size_t out_stride, + float *out_direct, + size_t out_direct_stride, + const float *in0, + size_t in0_stride, + const float *in1, + size_t in1_stride, + const float *bn_mul, + const float *bn_add, + const float minval, + const float maxval, + size_t width, + size_t height) { struct KernelArgs { @@ -631,18 +635,30 @@ void a64_add_bn_clamp_direct_fp32_2x16( "subs x20, x20, #0x2\n" "bgt 8b\n" "34:" // odd columns skip - : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] 
"+&r"(width) - : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); -} + : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), + [out_direct] "+&r"(out_direct), [width] "+&r"(width) + : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), + [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), + [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", + "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); } +} // namespace namespace arm_compute { namespace cpu { -void add_mul_add_fp32_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add, - ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window) +void add_mul_add_fp32_neon(const ITensor *input1, + const ITensor *input2, + const ITensor *bn_mul, + const ITensor *bn_add, + ITensor *add_output, + ITensor *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info, + const Window &window) { ARM_COMPUTE_UNUSED(policy); @@ -654,16 +670,16 @@ void add_mul_add_fp32_neon(const ITensor *input1, const ITensor *input2, const I float minval = std::numeric_limits::lowest(); float maxval = std::numeric_limits::max(); - if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) + if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) { minval = 0.f; } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) { minval = 0.f; maxval = act_info.a(); } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) { minval = act_info.b(); maxval = act_info.a(); @@ -681,42 +697,34 @@ void add_mul_add_fp32_neon(const ITensor *input1, const ITensor *input2, const I const size_t width = window.num_iterations(0); const size_t height = window.num_iterations(1); - if(add_output != nullptr) + if (add_output != nullptr) { Iterator add_out_it(add_output, window); execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_fp32_2x16( - reinterpret_cast(out_it.ptr()), out_stride, - reinterpret_cast(add_out_it.ptr()), out_direct_stride, - reinterpret_cast(in1_it.ptr()), in0_stride, - reinterpret_cast(in2_it.ptr()), in1_stride, - reinterpret_cast(bn_mul->buffer()), - reinterpret_cast(bn_add->buffer()), - minval, - maxval, - width, height); - }, - in1_it, in2_it, add_out_it, out_it); + win, + 
[&](const Coordinates &) + { + a64_add_bn_clamp_direct_fp32_2x16( + reinterpret_cast(out_it.ptr()), out_stride, reinterpret_cast(add_out_it.ptr()), + out_direct_stride, reinterpret_cast(in1_it.ptr()), in0_stride, + reinterpret_cast(in2_it.ptr()), in1_stride, reinterpret_cast(bn_mul->buffer()), + reinterpret_cast(bn_add->buffer()), minval, maxval, width, height); + }, + in1_it, in2_it, add_out_it, out_it); } else { execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_fp32_2x16( - reinterpret_cast(out_it.ptr()), out_stride, - nullptr, out_direct_stride, - reinterpret_cast(in1_it.ptr()), in0_stride, - reinterpret_cast(in2_it.ptr()), in1_stride, - reinterpret_cast(bn_mul->buffer()), - reinterpret_cast(bn_add->buffer()), - minval, - maxval, - width, height); - }, - in1_it, in2_it, out_it); + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_fp32_2x16( + reinterpret_cast(out_it.ptr()), out_stride, nullptr, out_direct_stride, + reinterpret_cast(in1_it.ptr()), in0_stride, reinterpret_cast(in2_it.ptr()), + in1_stride, reinterpret_cast(bn_mul->buffer()), + reinterpret_cast(bn_add->buffer()), minval, maxval, width, height); + }, + in1_it, in2_it, out_it); } } } // namespace cpu diff --git a/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp b/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp index f7448a6717e426de855d41b4eb22290c873ce52d..035805c944b46c76d53bb700d59a297da850fdd0 100644 --- a/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp @@ -36,22 +36,30 @@ #ifdef __aarch64__ namespace { -void a64_add_bn_clamp_direct_u8_fp32_2x16( - uint8_t *out, size_t out_stride, - uint8_t *out_direct, size_t out_direct_stride, - const uint8_t *in0, size_t in0_stride, - const uint8_t *in1, size_t in1_stride, - const float *bn_mul, - const float *bn_add, - const uint8_t minval, - const uint8_t maxval, - int32_t out_zeropt, float out_scale, - int32_t out_direct_zeropt, float out_direct_scale, - int32_t in0_zeropt, float in0_scale, - int32_t in1_zeropt, float in1_scale, - size_t width, size_t height) +void a64_add_bn_clamp_direct_u8_fp32_2x16(uint8_t *out, + size_t out_stride, + uint8_t *out_direct, + size_t out_direct_stride, + const uint8_t *in0, + size_t in0_stride, + const uint8_t *in1, + size_t in1_stride, + const float *bn_mul, + const float *bn_add, + const uint8_t minval, + const uint8_t maxval, + int32_t out_zeropt, + float out_scale, + int32_t out_direct_zeropt, + float out_direct_scale, + int32_t in0_zeropt, + float in0_scale, + int32_t in1_zeropt, + float in1_scale, + size_t width, + size_t height) { - float scales[4] = { in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale }; + float scales[4] = {in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale}; struct KernelArgs { const float *scales; @@ -709,9 +717,19 @@ void a64_add_bn_clamp_direct_u8_fp32_2x16( "subs x23, x23, #0x2\n" "bgt 6b\n" "32:" // odd columns skip - : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width) - : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)), [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)), 
[offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)), [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); + : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), + [out_direct] "+&r"(out_direct), [width] "+&r"(width) + : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), + [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)), + [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)), + [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), + [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)), + [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)), + [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride), + [out_stride] "r"(out_stride) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", + "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); } } // namespace @@ -720,8 +738,15 @@ namespace arm_compute { namespace cpu { -void add_mul_add_u8_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add, - ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window) +void add_mul_add_u8_neon(const ITensor *input1, + const ITensor *input2, + const ITensor *bn_mul, + const ITensor *bn_add, + ITensor *add_output, + ITensor *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info, + const Window &window) { ARM_COMPUTE_UNUSED(policy); @@ -739,24 +764,25 @@ void add_mul_add_u8_neon(const ITensor *input1, const ITensor *input2, const ITe uint8_t maxval = std::numeric_limits::max(); const UniformQuantizationInfo final_output_qinfo = final_output_info->quantization_info().uniform(); - if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) + if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) { minval = quantize_qasymm8(0.f, final_output_qinfo); } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) { minval = quantize_qasymm8(0.f, final_output_qinfo); maxval = quantize_qasymm8(act_info.a(), final_output_qinfo); } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) { minval = quantize_qasymm8(act_info.b(), final_output_qinfo); maxval = quantize_qasymm8(act_info.a(), final_output_qinfo); } - const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform(); - const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform(); - const UniformQuantizationInfo add_output_qinfo = (add_output != nullptr) ? 
add_output_info->quantization_info().uniform() : UniformQuantizationInfo(); + const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform(); + const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform(); + const UniformQuantizationInfo add_output_qinfo = + (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo(); const int32_t in1_offset = in1_qinfo.offset; const int32_t in2_offset = in2_qinfo.offset; @@ -783,50 +809,35 @@ void add_mul_add_u8_neon(const ITensor *input1, const ITensor *input2, const ITe const size_t width = window.num_iterations(0); const size_t height = window.num_iterations(1); - if(add_output != nullptr) + if (add_output != nullptr) { Iterator add_out_it(add_output, window); execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_u8_fp32_2x16( - reinterpret_cast(out_it.ptr()), out_stride, - reinterpret_cast(add_out_it.ptr()), out_direct_stride, - reinterpret_cast(in1_it.ptr()), in0_stride, - reinterpret_cast(in2_it.ptr()), in1_stride, - bn_mul_buffer, - bn_add_buffer, - minval, - maxval, - out_offset, out_scale, - out_direct_offset, out_direct_scale, - in1_offset, in1_scale, - in2_offset, in2_scale, - width, height); - }, - in1_it, in2_it, add_out_it, out_it); + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_u8_fp32_2x16( + reinterpret_cast(out_it.ptr()), out_stride, + reinterpret_cast(add_out_it.ptr()), out_direct_stride, + reinterpret_cast(in1_it.ptr()), in0_stride, reinterpret_cast(in2_it.ptr()), + in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, out_offset, out_scale, out_direct_offset, + out_direct_scale, in1_offset, in1_scale, in2_offset, in2_scale, width, height); + }, + in1_it, in2_it, add_out_it, out_it); } else { execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_u8_fp32_2x16( - reinterpret_cast(out_it.ptr()), out_stride, - nullptr, out_direct_stride, - reinterpret_cast(in1_it.ptr()), in0_stride, - reinterpret_cast(in2_it.ptr()), in1_stride, - bn_mul_buffer, - bn_add_buffer, - minval, - maxval, - out_offset, out_scale, - out_direct_offset, out_direct_scale, - in1_offset, in1_scale, - in2_offset, in2_scale, - width, height); - }, - in1_it, in2_it, out_it); + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_u8_fp32_2x16( + reinterpret_cast(out_it.ptr()), out_stride, nullptr, out_direct_stride, + reinterpret_cast(in1_it.ptr()), in0_stride, reinterpret_cast(in2_it.ptr()), + in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, out_offset, out_scale, out_direct_offset, + out_direct_scale, in1_offset, in1_scale, in2_offset, in2_scale, width, height); + }, + in1_it, in2_it, out_it); } } } // namespace cpu diff --git a/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp index 1ae2cb76a99f0632d2a5763ba04527b82ab4a441..e1a45b467b7f6be39cec7bcc2b7d882ef2fe348f 100644 --- a/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp @@ -36,22 +36,30 @@ #ifdef __aarch64__ namespace { -void a64_add_bn_clamp_direct_s8_fp32_2x16( - int8_t *out, size_t out_stride, - int8_t *out_direct, size_t out_direct_stride, - const int8_t *in0, size_t in0_stride, - const int8_t *in1, size_t in1_stride, - const float *bn_mul, - const float *bn_add, - const int8_t minval, - const int8_t maxval, - int32_t out_zeropt, float out_scale, - int32_t out_direct_zeropt, float 
out_direct_scale, - int32_t in0_zeropt, float in0_scale, - int32_t in1_zeropt, float in1_scale, - size_t width, size_t height) +void a64_add_bn_clamp_direct_s8_fp32_2x16(int8_t *out, + size_t out_stride, + int8_t *out_direct, + size_t out_direct_stride, + const int8_t *in0, + size_t in0_stride, + const int8_t *in1, + size_t in1_stride, + const float *bn_mul, + const float *bn_add, + const int8_t minval, + const int8_t maxval, + int32_t out_zeropt, + float out_scale, + int32_t out_direct_zeropt, + float out_direct_scale, + int32_t in0_zeropt, + float in0_scale, + int32_t in1_zeropt, + float in1_scale, + size_t width, + size_t height) { - float scales[4] = { in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale }; + float scales[4] = {in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale}; struct KernelArgs { const float *scales; @@ -709,9 +717,19 @@ void a64_add_bn_clamp_direct_s8_fp32_2x16( "subs x23, x23, #0x2\n" "bgt 6b\n" "32:" // odd columns skip - : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width) - : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)), [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)), [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)), [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); + : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), + [out_direct] "+&r"(out_direct), [width] "+&r"(width) + : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), + [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)), + [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)), + [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), + [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)), + [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)), + [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride), + [out_stride] "r"(out_stride) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", + "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); } } // namespace @@ -720,8 +738,15 @@ namespace arm_compute { namespace cpu { -void add_mul_add_s8_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add, - ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window) +void add_mul_add_s8_neon(const ITensor *input1, + const ITensor *input2, + const ITensor 
*bn_mul, + const ITensor *bn_add, + ITensor *add_output, + ITensor *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info, + const Window &window) { ARM_COMPUTE_UNUSED(policy); @@ -739,24 +764,25 @@ void add_mul_add_s8_neon(const ITensor *input1, const ITensor *input2, const ITe int8_t maxval = std::numeric_limits::max(); const UniformQuantizationInfo final_output_qinfo = final_output_info->quantization_info().uniform(); - if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) + if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) { minval = quantize_qasymm8_signed(0.f, final_output_qinfo); } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) { minval = quantize_qasymm8_signed(0.f, final_output_qinfo); maxval = quantize_qasymm8_signed(act_info.a(), final_output_qinfo); } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) { minval = quantize_qasymm8_signed(act_info.b(), final_output_qinfo); maxval = quantize_qasymm8_signed(act_info.a(), final_output_qinfo); } - const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform(); - const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform(); - const UniformQuantizationInfo add_output_qinfo = (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo(); + const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform(); + const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform(); + const UniformQuantizationInfo add_output_qinfo = + (add_output != nullptr) ? 
add_output_info->quantization_info().uniform() : UniformQuantizationInfo(); const int32_t in1_offset = in1_qinfo.offset; const int32_t in2_offset = in2_qinfo.offset; @@ -783,50 +809,35 @@ void add_mul_add_s8_neon(const ITensor *input1, const ITensor *input2, const ITe const size_t width = window.num_iterations(0); const size_t height = window.num_iterations(1); - if(add_output != nullptr) + if (add_output != nullptr) { Iterator add_out_it(add_output, window); execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_s8_fp32_2x16( - reinterpret_cast(out_it.ptr()), out_stride, - reinterpret_cast(add_out_it.ptr()), out_direct_stride, - reinterpret_cast(in1_it.ptr()), in0_stride, - reinterpret_cast(in2_it.ptr()), in1_stride, - bn_mul_buffer, - bn_add_buffer, - minval, - maxval, - out_offset, out_scale, - out_direct_offset, out_direct_scale, - in1_offset, in1_scale, - in2_offset, in2_scale, - width, height); - }, - in1_it, in2_it, add_out_it, out_it); + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_s8_fp32_2x16( + reinterpret_cast(out_it.ptr()), out_stride, reinterpret_cast(add_out_it.ptr()), + out_direct_stride, reinterpret_cast(in1_it.ptr()), in0_stride, + reinterpret_cast(in2_it.ptr()), in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, + out_offset, out_scale, out_direct_offset, out_direct_scale, in1_offset, in1_scale, in2_offset, + in2_scale, width, height); + }, + in1_it, in2_it, add_out_it, out_it); } else { execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_s8_fp32_2x16( - reinterpret_cast(out_it.ptr()), out_stride, - nullptr, out_direct_stride, - reinterpret_cast(in1_it.ptr()), in0_stride, - reinterpret_cast(in2_it.ptr()), in1_stride, - bn_mul_buffer, - bn_add_buffer, - minval, - maxval, - out_offset, out_scale, - out_direct_offset, out_direct_scale, - in1_offset, in1_scale, - in2_offset, in2_scale, - width, height); - }, - in1_it, in2_it, out_it); + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_s8_fp32_2x16( + reinterpret_cast(out_it.ptr()), out_stride, nullptr, out_direct_stride, + reinterpret_cast(in1_it.ptr()), in0_stride, reinterpret_cast(in2_it.ptr()), + in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, out_offset, out_scale, out_direct_offset, + out_direct_scale, in1_offset, in1_scale, in2_offset, in2_scale, width, height); + }, + in1_it, in2_it, out_it); } } } // namespace cpu diff --git a/src/cpu/kernels/addmuladd/list.h b/src/cpu/kernels/addmuladd/list.h index a7c22c06d8834d0cb79116431fb30d29d3ac1e78..568003a916904ed2ab3aba61f6928e53c412de75 100644 --- a/src/cpu/kernels/addmuladd/list.h +++ b/src/cpu/kernels/addmuladd/list.h @@ -32,9 +32,10 @@ namespace arm_compute { namespace cpu { -#define DECLARE_ADD_MUL_ADD_KERNEL(func_name) \ +#define DECLARE_ADD_MUL_ADD_KERNEL(func_name) \ void func_name(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add, \ - ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window) + ITensor *add_output, ITensor *final_output, ConvertPolicy policy, \ + const ActivationLayerInfo &act_info, const Window &window) DECLARE_ADD_MUL_ADD_KERNEL(add_mul_add_fp32_neon); DECLARE_ADD_MUL_ADD_KERNEL(add_mul_add_fp16_neon); diff --git a/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h index 10bf8e4ff7345df38012ec3212c8b852fd9fa27f..6e8f32ef4742cb8d84941ccf1a60f1ed93985637 100644 --- 
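The clamp bounds handed to that kernel are just the ReLU-family activation limits re-expressed in the final output's quantized domain via quantize_qasymm8_signed. A minimal standalone sketch of the same mapping, with a simplified quantize_s8 standing in for the library helper and made-up scale/offset values:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <limits>

// Simplified stand-in for quantize_qasymm8_signed: affine int8 quantization,
// q = clamp(round(x / scale) + offset, -128, 127).
static int8_t quantize_s8(float x, float scale, int32_t offset)
{
    const int32_t q = static_cast<int32_t>(std::lround(x / scale)) + offset;
    return static_cast<int8_t>(std::clamp(q, -128, 127));
}

int main()
{
    // Illustrative output quantization parameters (scale, zero point).
    const float   out_scale  = 0.05f;
    const int32_t out_offset = -10;

    // No activation: the full int8 range is allowed.
    int8_t minval = std::numeric_limits<int8_t>::lowest();
    int8_t maxval = std::numeric_limits<int8_t>::max();

    // BOUNDED_RELU with upper limit a() = 6: both bounds are the activation
    // limits expressed in the quantized domain. Plain RELU would only set
    // minval; LU_BOUNDED_RELU would quantize act_info.b() for the lower bound.
    minval = quantize_s8(0.f, out_scale, out_offset);
    maxval = quantize_s8(6.f, out_scale, out_offset);

    std::printf("clamp outputs to [%d, %d]\n", minval, maxval);
    return 0;
}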
a/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h +++ b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h @@ -26,6 +26,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" + #include "src/core/NEON/INEKernel.h" #include "src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp" @@ -57,13 +58,12 @@ class CpuGemmAssemblyWrapperKernel final : public INEKernel public: /** Constructor */ - CpuGemmAssemblyWrapperKernel() - : _kernel(nullptr), _name("CpuGemmAssemblyWrapperKernel") + CpuGemmAssemblyWrapperKernel() : _kernel(nullptr), _name("CpuGemmAssemblyWrapperKernel") { } - CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &) = delete; - CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &&) = default; + CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &) = delete; + CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &&) = default; CpuGemmAssemblyWrapperKernel &operator=(CpuGemmAssemblyWrapperKernel &) = delete; const char *name() const override @@ -110,7 +110,7 @@ public: INEKernel::configure(win); - if(!kernel_name_tag.empty()) + if (!kernel_name_tag.empty()) { _name += "/" + kernel_name_tag; } @@ -132,7 +132,7 @@ public: private: arm_gemm::GemmCommon *_kernel; - std::string _name; + std::string _name; }; } // namespace kernel } // namespace cpu diff --git a/src/cpu/kernels/assembly/arm_gemm.hpp b/src/cpu/kernels/assembly/arm_gemm.hpp index 4c127b4ec341cafb800f49160edf0879cdf6432c..9a913c5c588656d9715c44a9372887b7656ccfc9 100644 --- a/src/cpu/kernels/assembly/arm_gemm.hpp +++ b/src/cpu/kernels/assembly/arm_gemm.hpp @@ -23,13 +23,12 @@ */ #pragma once +#include "arm_gemm_local.hpp" +#include "gemm_common.hpp" #include #include #include -#include "arm_gemm_local.hpp" -#include "gemm_common.hpp" - namespace arm_gemm { enum class GemmMethod @@ -111,8 +110,7 @@ struct GemmConfig unsigned int outer_block_size = 0; WeightFormat weight_format = WeightFormat::ANY; - GemmConfig(GemmMethod method) - : method(method) + GemmConfig(GemmMethod method) : method(method) { } GemmConfig() @@ -133,8 +131,7 @@ struct Activation float param1; float param2; - Activation(Type type = Type::None, float p1 = 0.0f, float p2 = 0.0f) - : type(type), param1(p1), param2(p2) + Activation(Type type = Type::None, float p1 = 0.0f, float p2 = 0.0f) : type(type), param1(p1), param2(p2) { } }; @@ -156,12 +153,32 @@ public: bool _fast_mode; const GemmConfig *_cfg; - GemmArgs(const CPUInfo *ci, unsigned int M, unsigned int N, - unsigned int K, unsigned int Ksections, unsigned int nbatches, - unsigned int nmulti, bool indirect_input, Activation act, const int maxthreads, - bool fixed_format = false, bool fast_mode = false, const GemmConfig *cfg = nullptr) - : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _Ksections(Ksections), _nbatches(nbatches), _nmulti(nmulti), _indirect_input(indirect_input), _act(act), _maxthreads(maxthreads), - _fixed_format(fixed_format), _fast_mode(fast_mode), _cfg(cfg) + GemmArgs(const CPUInfo *ci, + unsigned int M, + unsigned int N, + unsigned int K, + unsigned int Ksections, + unsigned int nbatches, + unsigned int nmulti, + bool indirect_input, + Activation act, + const int maxthreads, + bool fixed_format = false, + bool fast_mode = false, + const GemmConfig *cfg = nullptr) + : _ci(ci), + _Msize(M), + _Nsize(N), + _Ksize(K), + _Ksections(Ksections), + _nbatches(nbatches), + _nmulti(nmulti), + _indirect_input(indirect_input), + _act(act), + _maxthreads(maxthreads), + _fixed_format(fixed_format), + _fast_mode(fast_mode), + _cfg(cfg) { } }; @@ 
-187,23 +204,51 @@ public: Requantize32() = default; // Constructor for per-tensor quantization - Requantize32(const int32_t *bias, size_t bias_multi_stride, - int32_t a_offset, int32_t b_offset, int32_t c_offset, - int32_t requant_shift, int32_t requant_mul, int32_t minv, int32_t maxv) - : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(false), per_layer_left_shift(std::max(requant_shift, 0)), - per_layer_right_shift(std::min(requant_shift, 0)), per_layer_mul(requant_mul), minval(minv), maxval(maxv) + Requantize32(const int32_t *bias, + size_t bias_multi_stride, + int32_t a_offset, + int32_t b_offset, + int32_t c_offset, + int32_t requant_shift, + int32_t requant_mul, + int32_t minv, + int32_t maxv) + : bias(bias), + bias_multi_stride(bias_multi_stride), + a_offset(a_offset), + b_offset(b_offset), + c_offset(c_offset), + per_channel_requant(false), + per_layer_left_shift(std::max(requant_shift, 0)), + per_layer_right_shift(std::min(requant_shift, 0)), + per_layer_mul(requant_mul), + minval(minv), + maxval(maxv) { } // Constructor for per-channel quantization - Requantize32(const int32_t *bias, size_t bias_multi_stride, - int32_t a_offset, int32_t b_offset, int32_t c_offset, + Requantize32(const int32_t *bias, + size_t bias_multi_stride, + int32_t a_offset, + int32_t b_offset, + int32_t c_offset, const int32_t *requant_left_shifts, const int32_t *requant_right_shifts, const int32_t *requant_muls, - int32_t minv, int32_t maxv) - : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(true), per_channel_left_shifts(requant_left_shifts), - per_channel_right_shifts(requant_right_shifts), per_channel_muls(requant_muls), minval(minv), maxval(maxv) + int32_t minv, + int32_t maxv) + : bias(bias), + bias_multi_stride(bias_multi_stride), + a_offset(a_offset), + b_offset(b_offset), + c_offset(c_offset), + per_channel_requant(true), + per_channel_left_shifts(requant_left_shifts), + per_channel_right_shifts(requant_right_shifts), + per_channel_muls(requant_muls), + minval(minv), + maxval(maxv) { } }; diff --git a/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp b/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp index 718fcd1fb45615c7915ae2e39312034ab74b1a50..0672e899b6f7672463b1cad0fef1dbe2dd990acd 100644 --- a/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp +++ b/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp @@ -27,7 +27,6 @@ #include "arm_compute/core/Window.h" #include "ndrange.hpp" - #include /* This file contains mapping between integral types used in arm_compute and arm_gemm @@ -38,8 +37,7 @@ namespace arm_gemm { //we want to unify the maximum number of dimensions used beween arm_gemm and arm compute library -constexpr std::size_t ndrange_max = - arm_compute::Dimensions::num_max_dimensions; +constexpr std::size_t ndrange_max = arm_compute::Dimensions::num_max_dimensions; using ndrange_t = NDRange; using ndcoord_t = NDCoordinate; @@ -56,7 +54,7 @@ inline arm_compute::Window to_window(const ndrange_t &ndr) { arm_compute::Window win; - for(unsigned int i = 0; i != ndrange_max; ++i) + for (unsigned int i = 0; i != ndrange_max; ++i) { //populate the window with the dimensions of the NDRange win.set(i, arm_compute::Window::Dimension(0, ndr.get_size(i))); @@ -75,7 +73,7 @@ inline arm_compute::Window to_window(const ndcoord_t &ndc) { arm_compute::Window win; - for(unsigned int i = 0; i != ndrange_max; ++i) + for (unsigned int i = 0; 
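The per-tensor Requantize32 constructor splits the single signed requant_shift into a non-negative left shift (std::max(shift, 0)) and a non-positive right shift (std::min(shift, 0)). Below is a scalar sketch of how such a multiplier/shift pair is typically applied; the fixed-point format of the multiplier (Q0.31 here) and the round-to-nearest behaviour are assumptions made for illustration, not a statement of arm_gemm's exact arithmetic.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// The per-tensor constructor splits one signed shift exactly like this.
struct RequantParams
{
    int32_t left_shift;  // >= 0
    int32_t right_shift; // <= 0
    int32_t mul;
};

static RequantParams split_shift(int32_t requant_shift, int32_t requant_mul)
{
    return {std::max(requant_shift, 0), std::min(requant_shift, 0), requant_mul};
}

// Illustrative application of the parameters to one accumulator.
static int32_t requantize_one(int64_t acc, const RequantParams &p,
                              int32_t c_offset, int32_t minv, int32_t maxv)
{
    acc <<= p.left_shift;                                    // x * 2^left_shift
    int64_t prod = (acc * p.mul + (int64_t{1} << 30)) >> 31; // Q0.31 multiply (assumed format)
    const int32_t rs = -p.right_shift;                       // right_shift is stored as <= 0
    if (rs > 0)
    {
        prod = (prod + (int64_t{1} << (rs - 1))) >> rs;      // rounding right shift
    }
    prod += c_offset;
    return static_cast<int32_t>(std::clamp<int64_t>(prod, minv, maxv));
}

int main()
{
    const RequantParams p = split_shift(-3, 1 << 29); // multiplier of roughly 0.25 in Q0.31
    std::printf("%d\n", requantize_one(1000, p, 5, -128, 127));
    return 0;
}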
i != ndrange_max; ++i) { const auto start = ndc.get_position(i); const auto size = ndc.get_size(i); @@ -98,15 +96,12 @@ inline arm_compute::Window to_window(const ndcoord_t &ndc) */ inline ndrange_t to_ndrange(const arm_compute::Window &win) { - return - { - static_cast(win[0].end() - win[0].start()), - static_cast(win[1].end() - win[1].start()), - static_cast(win[2].end() - win[2].start()), - static_cast(win[3].end() - win[3].start()), - static_cast(win[4].end() - win[4].start()), - static_cast(win[5].end() - win[5].start()) - }; + return {static_cast(win[0].end() - win[0].start()), + static_cast(win[1].end() - win[1].start()), + static_cast(win[2].end() - win[2].start()), + static_cast(win[3].end() - win[3].start()), + static_cast(win[4].end() - win[4].start()), + static_cast(win[5].end() - win[5].start())}; } /** Convert an `arm_compute::Window` to an `arm_gemm::NDCoord` of the same max dimensions @@ -116,15 +111,12 @@ inline ndrange_t to_ndrange(const arm_compute::Window &win) */ inline ndcoord_t to_ndcoord(const arm_compute::Window &win) { - return - { - { static_cast(win[0].start()), static_cast(win[0].end() - win[0].start()) }, - { static_cast(win[1].start()), static_cast(win[1].end() - win[1].start()) }, - { static_cast(win[2].start()), static_cast(win[2].end() - win[2].start()) }, - { static_cast(win[3].start()), static_cast(win[3].end() - win[3].start()) }, - { static_cast(win[4].start()), static_cast(win[4].end() - win[4].start()) }, - { static_cast(win[5].start()), static_cast(win[5].end() - win[5].start()) } - }; + return {{static_cast(win[0].start()), static_cast(win[0].end() - win[0].start())}, + {static_cast(win[1].start()), static_cast(win[1].end() - win[1].start())}, + {static_cast(win[2].start()), static_cast(win[2].end() - win[2].start())}, + {static_cast(win[3].start()), static_cast(win[3].end() - win[3].start())}, + {static_cast(win[4].start()), static_cast(win[4].end() - win[4].start())}, + {static_cast(win[5].start()), static_cast(win[5].end() - win[5].start())}}; } } //namespace arm_gemm diff --git a/src/cpu/kernels/assembly/gemm_common.hpp b/src/cpu/kernels/assembly/gemm_common.hpp index 834cd1061e4f132536a02fab9c648f7866727e54..6fe9f13f02cd796fb790577bb5aaa995e239daf9 100644 --- a/src/cpu/kernels/assembly/gemm_common.hpp +++ b/src/cpu/kernels/assembly/gemm_common.hpp @@ -25,7 +25,6 @@ #include "convolution_parameters.hpp" #include "ndrange.hpp" - #include namespace arm_gemm @@ -51,10 +50,19 @@ public: * appropriately typed pointers. If B is pretransposed (see below) then * the settings for B here are ignored. 
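These conversions all hinge on one small correspondence: an arm_compute Window dimension stores a half-open [start, end) interval, while the arm_gemm side stores a (position, size) pair, so size = end - start and end = position + size. A self-contained sketch of that round trip, using simplified stand-in structs rather than the real Window and NDCoordinate classes:

#include <array>
#include <cstddef>
#include <cstdio>

// Simplified mirrors of the two descriptions being converted between.
struct WinDim
{
    int start;
    int end; // half-open [start, end)
};

struct NdDim
{
    unsigned int position;
    unsigned int size;
};

template <std::size_t N>
std::array<NdDim, N> to_ndcoord(const std::array<WinDim, N> &win)
{
    std::array<NdDim, N> out{};
    for (std::size_t i = 0; i < N; ++i)
    {
        out[i] = {static_cast<unsigned int>(win[i].start),
                  static_cast<unsigned int>(win[i].end - win[i].start)};
    }
    return out;
}

template <std::size_t N>
std::array<WinDim, N> to_window(const std::array<NdDim, N> &ndc)
{
    std::array<WinDim, N> out{};
    for (std::size_t i = 0; i < N; ++i)
    {
        out[i] = {static_cast<int>(ndc[i].position),
                  static_cast<int>(ndc[i].position + ndc[i].size)};
    }
    return out;
}

int main()
{
    const std::array<WinDim, 3> win = {{{0, 16}, {4, 12}, {0, 1}}};
    const auto ndc  = to_ndcoord(win);
    const auto back = to_window(ndc); // round-trips to the same [start, end) ranges
    std::printf("dim1: pos=%u size=%u -> [%d, %d)\n", ndc[1].position, ndc[1].size, back[1].start, back[1].end);
    return 0;
}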
*/ - virtual void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride, - const void *B, const int ldb, /* batches share B */ const int B_multi_stride, - void *C, const int ldc, const int C_batch_stride, const int C_multi_stride, - const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) = 0; + virtual void set_arrays_generic(const void *A, + const int lda, + const int A_batch_stride, + const int A_multi_stride, + const void *B, + const int ldb, + /* batches share B */ const int B_multi_stride, + void *C, + const int ldc, + const int C_batch_stride, + const int C_multi_stride, + const void *bias, + /* no row or batch stride needed */ const int bias_multi_stride) = 0; /** @returns an ndrange containing ranges of the compute space which can be * broken up and parallelised over @@ -73,7 +81,7 @@ public: * This has an empty default implementation, as GEMMs which don't care * about thread count can safely ignore this. */ - virtual void set_nthreads(int) {}; + virtual void set_nthreads(int){}; /* Whether this GEMM can be dynamically scheduled or not. */ virtual bool supports_dynamic_scheduling() const @@ -95,7 +103,7 @@ public: return 0; } /* Provide working space buffer - the void * passed in must remain allocated for the duration of any execute calls. */ - virtual void set_working_space(void *) {}; + virtual void set_working_space(void *){}; /*** "Pretransposed" interface (optional) ***/ /* Is this object set up for pretranspose? If so, pretranspose_array() needs to be called before execute(); */ @@ -122,7 +130,8 @@ public: /* The "real" version of this depends on the templated operand type (see below). */ virtual void pretranspose_B_array_generic(void *, const void *, const int, const int) = 0; /* Threaded version with window start/end parameters */ - virtual void pretranspose_B_array_part_generic(void *, const void *, const int, const int, const size_t, const size_t) = 0; + virtual void + pretranspose_B_array_part_generic(void *, const void *, const int, const int, const size_t, const size_t) = 0; /* Set pretransposed data - the void * passed in must previously have been passed to pretranspose_B_array() for the same or a similar GEMM. */ virtual void set_pretransposed_B_data(void *) @@ -186,10 +195,19 @@ protected: public: /* Pass in the pointers to the arrays to be operated on and their * strides (templated version with appropriate types). */ - virtual void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride, - const To *B, const int ldb, /* batches share B */ const int B_multi_stride, - Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride, - const Tr *bias, /* no row or batch stride needed */ const int bias_multi_stride) + virtual void set_arrays(const To *A, + const int lda, + const int A_batch_stride, + const int A_multi_stride, + const To *B, + const int ldb, + /* batches share B */ const int B_multi_stride, + Tr *C, + const int ldc, + const int C_batch_stride, + const int C_multi_stride, + const Tr *bias, + /* no row or batch stride needed */ const int bias_multi_stride) { _Aptr = A; _lda = lda; @@ -207,25 +225,33 @@ public: } /* Implementation of the void * overload which casts its arguments to the appropriate type. 
*/ - void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride, - const void *B, const int ldb, /* batches share B */ const int B_multi_stride, - void *C, const int ldc, const int C_batch_stride, const int C_multi_stride, - const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) override + void set_arrays_generic(const void *A, + const int lda, + const int A_batch_stride, + const int A_multi_stride, + const void *B, + const int ldb, + /* batches share B */ const int B_multi_stride, + void *C, + const int ldc, + const int C_batch_stride, + const int C_multi_stride, + const void *bias, + /* no row or batch stride needed */ const int bias_multi_stride) override { - set_arrays(static_cast(A), lda, A_batch_stride, A_multi_stride, - static_cast(B), ldb, B_multi_stride, - static_cast(C), ldc, C_batch_stride, C_multi_stride, + set_arrays(static_cast(A), lda, A_batch_stride, A_multi_stride, static_cast(B), ldb, + B_multi_stride, static_cast(C), ldc, C_batch_stride, C_multi_stride, static_cast(bias), bias_multi_stride); } /*** "Pretransposed" interface ***/ /* Compute col sums over all columns */ - virtual void requantize_bias(void *, const To *, const int, const int) {}; + virtual void requantize_bias(void *, const To *, const int, const int){}; /* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */ /* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */ - virtual void pretranspose_B_array(void *, const To *, const int, const int) {}; + virtual void pretranspose_B_array(void *, const To *, const int, const int){}; /* Implementation of the void * overload which casts its arguments to the appropriate type. */ void pretranspose_B_array_generic(void *out, const void *in, const int row_stride, const int multi_stride) override @@ -237,12 +263,14 @@ public: * The fallback/backwards compatible version of the threaded interface exposes a window size of 1 and * just calls the non-threaded functions to do the work. This is valid as with window size of 1 the only * legal values for start and end are 0 and 1 respectively. 
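set_arrays_generic is the type-erased half of a common pattern: the base interface takes void pointers, and the typed template implements it by casting to the operand types it was instantiated with, then forwarding to the typed overload. A minimal sketch of that shape (not the library's full GemmCommon interface) is below.

#include <cstdio>

// The untyped base interface exposes a void* entry point...
class OpBase
{
public:
    virtual ~OpBase() = default;
    virtual void set_arrays_generic(const void *a, void *c) = 0;
};

// ...and the typed template implements it by casting to its operand types and
// forwarding, the same shape as set_arrays_generic forwarding to set_arrays.
template <typename To, typename Tr>
class Op : public OpBase
{
public:
    void set_arrays(const To *a, Tr *c)
    {
        _a = a;
        _c = c;
    }

    void set_arrays_generic(const void *a, void *c) override
    {
        set_arrays(static_cast<const To *>(a), static_cast<Tr *>(c));
    }

private:
    const To *_a = nullptr;
    Tr       *_c = nullptr;
};

int main()
{
    const float in[4]  = {1.f, 2.f, 3.f, 4.f};
    float       out[4] = {};

    Op<float, float> op;
    OpBase &erased = op;
    erased.set_arrays_generic(in, out); // callers only need the untyped interface
    std::printf("arrays bound\n");
    return 0;
}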
*/ - virtual void pretranspose_B_array_part(void *out, const To *in, const int row_stride, const int multi_stride, size_t, size_t) + virtual void + pretranspose_B_array_part(void *out, const To *in, const int row_stride, const int multi_stride, size_t, size_t) { pretranspose_B_array(out, in, row_stride, multi_stride); }; - void pretranspose_B_array_part_generic(void *out, const void *in, const int row_stride, const int multi_stride, size_t start, size_t end) override + void pretranspose_B_array_part_generic( + void *out, const void *in, const int row_stride, const int multi_stride, size_t start, size_t end) override { pretranspose_B_array_part(out, static_cast(in), row_stride, multi_stride, start, end); } diff --git a/src/cpu/kernels/assembly/ndrange.hpp b/src/cpu/kernels/assembly/ndrange.hpp index 1c8261aef74cf4b9dbb87eac9f65fceb396d325f..baccdc0d88bac21ae8572203327f80030f3626bf 100644 --- a/src/cpu/kernels/assembly/ndrange.hpp +++ b/src/cpu/kernels/assembly/ndrange.hpp @@ -45,8 +45,7 @@ private: unsigned int m_end = 0; public: - NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e) - : m_parent(p), m_pos(s), m_end(e) + NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e) : m_parent(p), m_pos(s), m_end(e) { } @@ -59,12 +58,12 @@ private: { unsigned int r = m_pos; - if(d < (D - 1)) + if (d < (D - 1)) { r %= m_parent.m_totalsizes[d]; } - if(d > 0) + if (d > 0) { r /= m_parent.m_totalsizes[d - 1]; } @@ -98,9 +97,9 @@ private: { unsigned int t = 1; - for(unsigned int i = 0; i < D; i++) + for (unsigned int i = 0; i < D; i++) { - if(m_sizes[i] == 0) + if (m_sizes[i] == 0) { m_sizes[i] = 1; } @@ -116,14 +115,12 @@ public: NDRange(const NDRange &rhs) = default; template - NDRange(T... ts) - : m_sizes{ ts... } + NDRange(T... ts) : m_sizes{ts...} { set_totalsizes(); } - NDRange(const std::array &n) - : m_sizes(n) + NDRange(const std::array &n) : m_sizes(n) { set_totalsizes(); } @@ -163,7 +160,7 @@ public: std::array sizes{}; std::size_t i = 0; - for(auto &p : list) + for (auto &p : list) { m_positions[i] = p.first; sizes[i++] = p.second; diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp index 56614790591c2a8a00122b4bd76d5c0e740f7019..dbdec5fb5010123748c77e9654d8a0fe86c2c7a4 100644 --- a/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp +++ b/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp @@ -29,7 +29,11 @@ namespace arm_compute { namespace cpu { -void neon_fp16_boundingboxtransform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window) +void neon_fp16_boundingboxtransform(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + BoundingBoxTransformInfo bbinfo, + const Window &window) { return bounding_box_transform(boxes, pred_boxes, deltas, bbinfo, window); } diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp index 34ff9224d5e09067c56a2b6882864bbbd8b46309..0224b3406a6c10581876ceae34db1d015d5d8dfa 100644 --- a/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp +++ b/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp @@ -26,7 +26,11 @@ namespace arm_compute { namespace cpu { -void neon_fp32_boundingboxtransform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window) +void neon_fp32_boundingboxtransform(const ITensor *boxes, + ITensor 
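NDRangeIterator recovers per-dimension coordinates from a single flat position using the cumulative sizes prepared in set_totalsizes (where zero-sized dimensions count as one): take the remainder with the cumulative size of dimension d, then divide by the cumulative size of the previous dimension. A small standalone mirror of that decode, with made-up sizes:

#include <array>
#include <cstdio>

// Mirrors how NDRange precomputes cumulative sizes and how its iterator's
// dim(d) decodes a flat position into per-dimension coordinates.
template <unsigned int D>
struct MiniNDRange
{
    std::array<unsigned int, D> sizes;
    std::array<unsigned int, D> totalsizes;

    explicit MiniNDRange(const std::array<unsigned int, D> &s) : sizes(s), totalsizes{}
    {
        unsigned int t = 1;
        for (unsigned int i = 0; i < D; ++i)
        {
            if (sizes[i] == 0)
            {
                sizes[i] = 1; // zero-sized dimensions count as 1, as in set_totalsizes()
            }
            t *= sizes[i];
            totalsizes[i] = t;
        }
    }

    unsigned int dim(unsigned int pos, unsigned int d) const
    {
        unsigned int r = pos;
        if (d < (D - 1))
        {
            r %= totalsizes[d];
        }
        if (d > 0)
        {
            r /= totalsizes[d - 1];
        }
        return r;
    }
};

int main()
{
    const MiniNDRange<3> r({{4, 3, 2}}); // 24 positions in total
    const unsigned int pos = 17;
    // 17 = 1 + 1*4 + 1*12, so this prints (1, 1, 1).
    std::printf("pos %u -> (%u, %u, %u)\n", pos, r.dim(pos, 0), r.dim(pos, 1), r.dim(pos, 2));
    return 0;
}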
*pred_boxes, + const ITensor *deltas, + BoundingBoxTransformInfo bbinfo, + const Window &window) { return bounding_box_transform(boxes, pred_boxes, deltas, bbinfo, window); } diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp index d74a8a712d8f9afee19823ee3eaa8fab689d9d7d..5a2939b587b3388aab98c9fd6160fa6fa0e89a62 100644 --- a/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp +++ b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp @@ -29,7 +29,11 @@ namespace arm_compute { namespace cpu { -void bounding_box_transform_qsymm16(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window) +void bounding_box_transform_qsymm16(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + BoundingBoxTransformInfo bbinfo, + const Window &window) { const size_t num_classes = deltas->info()->tensor_shape()[0] >> 2; @@ -41,7 +45,8 @@ void bounding_box_transform_qsymm16(const ITensor *boxes, ITensor *pred_boxes, c const auto scale_before = bbinfo.scale(); const auto offset = (bbinfo.correct_transform_coords() ? 1.f : 0.f); - auto pred_ptr = reinterpret_cast(pred_boxes->buffer() + pred_boxes->info()->offset_first_element_in_bytes()); + auto pred_ptr = + reinterpret_cast(pred_boxes->buffer() + pred_boxes->info()->offset_first_element_in_bytes()); auto delta_ptr = reinterpret_cast(deltas->buffer() + deltas->info()->offset_first_element_in_bytes()); const auto boxes_qinfo = boxes->info()->quantization_info().uniform(); @@ -49,101 +54,49 @@ void bounding_box_transform_qsymm16(const ITensor *boxes, ITensor *pred_boxes, c const auto pred_qinfo = pred_boxes->info()->quantization_info().uniform(); Iterator box_it(boxes, window); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto ptr = reinterpret_cast(box_it.ptr()); - const auto b0 = dequantize_qasymm16(*ptr, boxes_qinfo); - const auto b1 = dequantize_qasymm16(*(ptr + 1), boxes_qinfo); - const auto b2 = dequantize_qasymm16(*(ptr + 2), boxes_qinfo); - const auto b3 = dequantize_qasymm16(*(ptr + 3), boxes_qinfo); - const float width = (b2 / scale_before) - (b0 / scale_before) + 1.f; - const float height = (b3 / scale_before) - (b1 / scale_before) + 1.f; - const float ctr_x = (b0 / scale_before) + 0.5f * width; - const float ctr_y = (b1 / scale_before) + 0.5f * height; - for(size_t j = 0; j < num_classes; ++j) + execute_window_loop( + window, + [&](const Coordinates &id) { - // Extract deltas - const size_t delta_id = id.y() * deltas_width + 4u * j; - const float dx = dequantize_qasymm8(delta_ptr[delta_id], deltas_qinfo) / bbinfo.weights()[0]; - const float dy = dequantize_qasymm8(delta_ptr[delta_id + 1], deltas_qinfo) / bbinfo.weights()[1]; - float dw = dequantize_qasymm8(delta_ptr[delta_id + 2], deltas_qinfo) / bbinfo.weights()[2]; - float dh = dequantize_qasymm8(delta_ptr[delta_id + 3], deltas_qinfo) / bbinfo.weights()[3]; - // Clip dw and dh - dw = std::min(dw, bbinfo.bbox_xform_clip()); - dh = std::min(dh, bbinfo.bbox_xform_clip()); - // Determine the predictions - const float pred_ctr_x = dx * width + ctr_x; - const float pred_ctr_y = dy * height + ctr_y; - const float pred_w = std::exp(dw) * width; - const float pred_h = std::exp(dh) * height; - // Store the prediction into the output tensor - pred_ptr[delta_id] = quantize_qasymm16(scale_after * utility::clamp(pred_ctr_x - 0.5f * pred_w, 0.f, img_w - 1.f), pred_qinfo); - pred_ptr[delta_id + 1] = 
quantize_qasymm16(scale_after * utility::clamp(pred_ctr_y - 0.5f * pred_h, 0.f, img_h - 1.f), pred_qinfo); - pred_ptr[delta_id + 2] = quantize_qasymm16(scale_after * utility::clamp(pred_ctr_x + 0.5f * pred_w - offset, 0.f, img_w - 1.f), pred_qinfo); - pred_ptr[delta_id + 3] = quantize_qasymm16(scale_after * utility::clamp(pred_ctr_y + 0.5f * pred_h - offset, 0.f, img_h - 1.f), pred_qinfo); - } - }, - box_it); + const auto ptr = reinterpret_cast(box_it.ptr()); + const auto b0 = dequantize_qasymm16(*ptr, boxes_qinfo); + const auto b1 = dequantize_qasymm16(*(ptr + 1), boxes_qinfo); + const auto b2 = dequantize_qasymm16(*(ptr + 2), boxes_qinfo); + const auto b3 = dequantize_qasymm16(*(ptr + 3), boxes_qinfo); + const float width = (b2 / scale_before) - (b0 / scale_before) + 1.f; + const float height = (b3 / scale_before) - (b1 / scale_before) + 1.f; + const float ctr_x = (b0 / scale_before) + 0.5f * width; + const float ctr_y = (b1 / scale_before) + 0.5f * height; + for (size_t j = 0; j < num_classes; ++j) + { + // Extract deltas + const size_t delta_id = id.y() * deltas_width + 4u * j; + const float dx = dequantize_qasymm8(delta_ptr[delta_id], deltas_qinfo) / bbinfo.weights()[0]; + const float dy = dequantize_qasymm8(delta_ptr[delta_id + 1], deltas_qinfo) / bbinfo.weights()[1]; + float dw = dequantize_qasymm8(delta_ptr[delta_id + 2], deltas_qinfo) / bbinfo.weights()[2]; + float dh = dequantize_qasymm8(delta_ptr[delta_id + 3], deltas_qinfo) / bbinfo.weights()[3]; + // Clip dw and dh + dw = std::min(dw, bbinfo.bbox_xform_clip()); + dh = std::min(dh, bbinfo.bbox_xform_clip()); + // Determine the predictions + const float pred_ctr_x = dx * width + ctr_x; + const float pred_ctr_y = dy * height + ctr_y; + const float pred_w = std::exp(dw) * width; + const float pred_h = std::exp(dh) * height; + // Store the prediction into the output tensor + pred_ptr[delta_id] = quantize_qasymm16( + scale_after * utility::clamp(pred_ctr_x - 0.5f * pred_w, 0.f, img_w - 1.f), pred_qinfo); + pred_ptr[delta_id + 1] = quantize_qasymm16( + scale_after * utility::clamp(pred_ctr_y - 0.5f * pred_h, 0.f, img_h - 1.f), pred_qinfo); + pred_ptr[delta_id + 2] = quantize_qasymm16( + scale_after * utility::clamp(pred_ctr_x + 0.5f * pred_w - offset, 0.f, img_w - 1.f), + pred_qinfo); + pred_ptr[delta_id + 3] = quantize_qasymm16( + scale_after * utility::clamp(pred_ctr_y + 0.5f * pred_h - offset, 0.f, img_h - 1.f), + pred_qinfo); + } + }, + box_it); } - -template -void bounding_box_transform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window) -{ - const size_t num_classes = deltas->info()->tensor_shape()[0] >> 2; - const size_t deltas_width = deltas->info()->tensor_shape()[0]; - const int img_h = std::floor(bbinfo.img_height() / bbinfo.scale() + 0.5f); - const int img_w = std::floor(bbinfo.img_width() / bbinfo.scale() + 0.5f); - - const auto scale_after = (bbinfo.apply_scale() ? T(bbinfo.scale()) : T(1)); - const auto scale_before = T(bbinfo.scale()); - ARM_COMPUTE_ERROR_ON(scale_before <= 0); - const auto offset = (bbinfo.correct_transform_coords() ? 
T(1.f) : T(0.f)); - - auto pred_ptr = reinterpret_cast(pred_boxes->buffer() + pred_boxes->info()->offset_first_element_in_bytes()); - auto delta_ptr = reinterpret_cast(deltas->buffer() + deltas->info()->offset_first_element_in_bytes()); - - Iterator box_it(boxes, window); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto ptr = reinterpret_cast(box_it.ptr()); - const auto b0 = *ptr; - const auto b1 = *(ptr + 1); - const auto b2 = *(ptr + 2); - const auto b3 = *(ptr + 3); - const T width = (b2 / scale_before) - (b0 / scale_before) + T(1.f); - const T height = (b3 / scale_before) - (b1 / scale_before) + T(1.f); - const T ctr_x = (b0 / scale_before) + T(0.5f) * width; - const T ctr_y = (b1 / scale_before) + T(0.5f) * height; - for(size_t j = 0; j < num_classes; ++j) - { - // Extract deltas - const size_t delta_id = id.y() * deltas_width + 4u * j; - const T dx = delta_ptr[delta_id] / T(bbinfo.weights()[0]); - const T dy = delta_ptr[delta_id + 1] / T(bbinfo.weights()[1]); - T dw = delta_ptr[delta_id + 2] / T(bbinfo.weights()[2]); - T dh = delta_ptr[delta_id + 3] / T(bbinfo.weights()[3]); - // Clip dw and dh - dw = std::min(dw, T(bbinfo.bbox_xform_clip())); - dh = std::min(dh, T(bbinfo.bbox_xform_clip())); - // Determine the predictions - const T pred_ctr_x = dx * width + ctr_x; - const T pred_ctr_y = dy * height + ctr_y; - const T pred_w = std::exp(dw) * width; - const T pred_h = std::exp(dh) * height; - // Store the prediction into the output tensor - pred_ptr[delta_id] = scale_after * utility::clamp(pred_ctr_x - T(0.5f) * pred_w, T(0), T(img_w - 1)); - pred_ptr[delta_id + 1] = scale_after * utility::clamp(pred_ctr_y - T(0.5f) * pred_h, T(0), T(img_h - 1)); - pred_ptr[delta_id + 2] = scale_after * utility::clamp(pred_ctr_x + T(0.5f) * pred_w - offset, T(0), T(img_w - 1)); - pred_ptr[delta_id + 3] = scale_after * utility::clamp(pred_ctr_y + T(0.5f) * pred_h - offset, T(0), T(img_h - 1)); - } - }, - box_it); -} - -template void bounding_box_transform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window); - -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -template void bounding_box_transform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window); -#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h index d9ff694ae53746a06e08fca700f5ed948b76340c..d8013c62274c65b357ab6a74c9179dd4cc71f412 100644 --- a/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h +++ b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -30,9 +30,73 @@ namespace arm_compute namespace cpu { template -void bounding_box_transform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window); +void bounding_box_transform(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + BoundingBoxTransformInfo bbinfo, + const Window &window) +{ + const size_t num_classes = deltas->info()->tensor_shape()[0] >> 2; + const size_t deltas_width = deltas->info()->tensor_shape()[0]; + const int img_h = std::floor(bbinfo.img_height() / bbinfo.scale() + 0.5f); + const int img_w = std::floor(bbinfo.img_width() / bbinfo.scale() + 0.5f); + + const auto scale_after = (bbinfo.apply_scale() ? T(bbinfo.scale()) : T(1)); + const auto scale_before = T(bbinfo.scale()); + ARM_COMPUTE_ERROR_ON(scale_before <= 0); + const auto offset = (bbinfo.correct_transform_coords() ? T(1.f) : T(0.f)); + + auto pred_ptr = reinterpret_cast(pred_boxes->buffer() + pred_boxes->info()->offset_first_element_in_bytes()); + auto delta_ptr = reinterpret_cast(deltas->buffer() + deltas->info()->offset_first_element_in_bytes()); + + Iterator box_it(boxes, window); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto ptr = reinterpret_cast(box_it.ptr()); + const auto b0 = *ptr; + const auto b1 = *(ptr + 1); + const auto b2 = *(ptr + 2); + const auto b3 = *(ptr + 3); + const T width = (b2 / scale_before) - (b0 / scale_before) + T(1.f); + const T height = (b3 / scale_before) - (b1 / scale_before) + T(1.f); + const T ctr_x = (b0 / scale_before) + T(0.5f) * width; + const T ctr_y = (b1 / scale_before) + T(0.5f) * height; + for (size_t j = 0; j < num_classes; ++j) + { + // Extract deltas + const size_t delta_id = id.y() * deltas_width + 4u * j; + const T dx = delta_ptr[delta_id] / T(bbinfo.weights()[0]); + const T dy = delta_ptr[delta_id + 1] / T(bbinfo.weights()[1]); + T dw = delta_ptr[delta_id + 2] / T(bbinfo.weights()[2]); + T dh = delta_ptr[delta_id + 3] / T(bbinfo.weights()[3]); + // Clip dw and dh + dw = std::min(dw, T(bbinfo.bbox_xform_clip())); + dh = std::min(dh, T(bbinfo.bbox_xform_clip())); + // Determine the predictions + const T pred_ctr_x = dx * width + ctr_x; + const T pred_ctr_y = dy * height + ctr_y; + const T pred_w = std::exp(dw) * width; + const T pred_h = std::exp(dh) * height; + // Store the prediction into the output tensor + pred_ptr[delta_id] = scale_after * utility::clamp(pred_ctr_x - T(0.5f) * pred_w, T(0), T(img_w - 1)); + pred_ptr[delta_id + 1] = + scale_after * utility::clamp(pred_ctr_y - T(0.5f) * pred_h, T(0), T(img_h - 1)); + pred_ptr[delta_id + 2] = + scale_after * utility::clamp(pred_ctr_x + T(0.5f) * pred_w - offset, T(0), T(img_w - 1)); + pred_ptr[delta_id + 3] = + scale_after * utility::clamp(pred_ctr_y + T(0.5f) * pred_h - offset, T(0), T(img_h - 1)); + } + }, + box_it); +} -void bounding_box_transform_qsymm16(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window); +void bounding_box_transform_qsymm16(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + BoundingBoxTransformInfo bbinfo, + const Window &window); } // namespace cpu } // namespace arm_compute #endif //define SRC_CORE_SVE_KERNELS_BOUNDINGBOXTRANFORM_IMPL_H diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp index 
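The templated bounding_box_transform body moved into this header performs, per class, the usual anchor-delta decode: rebuild centre and size from the corner box, scale the deltas by the per-coordinate weights, clip the exponential growth terms at bbox_xform_clip, and clamp the result to the image. A single-box float walk-through of the same arithmetic, with made-up numbers in main():

#include <algorithm>
#include <cmath>
#include <cstdio>

// Parameter names mirror the kernel; the values in main() are illustrative.
struct Box
{
    float x1, y1, x2, y2;
};

static Box transform_one_box(const Box &b, const float delta[4], const float weights[4],
                             float scale_before, float scale_after, float clip,
                             float img_w, float img_h, float offset)
{
    // Decode the corner box into centre/size in the unscaled image.
    const float width  = (b.x2 / scale_before) - (b.x1 / scale_before) + 1.f;
    const float height = (b.y2 / scale_before) - (b.y1 / scale_before) + 1.f;
    const float ctr_x  = (b.x1 / scale_before) + 0.5f * width;
    const float ctr_y  = (b.y1 / scale_before) + 0.5f * height;

    // Weighted deltas, with the exp() growth terms clipped.
    const float dx = delta[0] / weights[0];
    const float dy = delta[1] / weights[1];
    const float dw = std::min(delta[2] / weights[2], clip);
    const float dh = std::min(delta[3] / weights[3], clip);

    const float pred_ctr_x = dx * width + ctr_x;
    const float pred_ctr_y = dy * height + ctr_y;
    const float pred_w     = std::exp(dw) * width;
    const float pred_h     = std::exp(dh) * height;

    // Clamp the predicted corners to the image and apply the output scale.
    const auto clampx = [&](float v) { return scale_after * std::clamp(v, 0.f, img_w - 1.f); };
    const auto clampy = [&](float v) { return scale_after * std::clamp(v, 0.f, img_h - 1.f); };

    return {clampx(pred_ctr_x - 0.5f * pred_w), clampy(pred_ctr_y - 0.5f * pred_h),
            clampx(pred_ctr_x + 0.5f * pred_w - offset), clampy(pred_ctr_y + 0.5f * pred_h - offset)};
}

int main()
{
    const Box   box        = {10.f, 20.f, 50.f, 80.f};
    const float delta[4]   = {0.1f, -0.2f, 0.05f, 0.0f};
    const float weights[4] = {10.f, 10.f, 5.f, 5.f};
    const Box   out = transform_one_box(box, delta, weights, 1.f, 1.f, 4.135f, 640.f, 480.f, 1.f);
    std::printf("(%.1f, %.1f, %.1f, %.1f)\n", out.x1, out.y1, out.x2, out.y2);
    return 0;
}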
b27c187df3749747d66f4e50c311b700a5af3508..64ef8151951d4903e195c03674f550601364e2a1 100644 --- a/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp +++ b/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp @@ -26,7 +26,11 @@ namespace arm_compute { namespace cpu { -void neon_qu16_boundingboxtransform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window) +void neon_qu16_boundingboxtransform(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + BoundingBoxTransformInfo bbinfo, + const Window &window) { return bounding_box_transform_qsymm16(boxes, pred_boxes, deltas, bbinfo, window); } diff --git a/src/cpu/kernels/boundingboxtransform/list.h b/src/cpu/kernels/boundingboxtransform/list.h index 8f06acc8a6d8db6efdb72a392b618b90d7f39b69..4da725a257b29fef4d729f093b94fc304e2d384c 100644 --- a/src/cpu/kernels/boundingboxtransform/list.h +++ b/src/cpu/kernels/boundingboxtransform/list.h @@ -27,8 +27,9 @@ namespace arm_compute { namespace cpu { -#define DECLARE_BOUNDINGBOXTRANFORM_KERNEL(func_name) \ - void func_name(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window) +#define DECLARE_BOUNDINGBOXTRANFORM_KERNEL(func_name) \ + void func_name(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, \ + const Window &window) DECLARE_BOUNDINGBOXTRANFORM_KERNEL(neon_fp32_boundingboxtransform); DECLARE_BOUNDINGBOXTRANFORM_KERNEL(neon_fp16_boundingboxtransform); DECLARE_BOUNDINGBOXTRANFORM_KERNEL(neon_qu16_boundingboxtransform); diff --git a/src/cpu/kernels/cast/generic/neon/bfloat16.cpp b/src/cpu/kernels/cast/generic/neon/bfloat16.cpp deleted file mode 100644 index 91c15be279470e4eb1d75b95ecabe1c932750b09..0000000000000000000000000000000000000000 --- a/src/cpu/kernels/cast/generic/neon/bfloat16.cpp +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2016-2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#if defined(ARM_COMPUTE_ENABLE_BF16) - -#include "arm_compute/core/TensorInfo.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/cpu/kernels/CpuCastKernel.h" -#include "src/cpu/kernels/cast/list.h" -#include "support/SaturateCast.h" - -namespace arm_compute -{ -namespace cpu -{ -void neon_fp32_to_bfloat16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_UNUSED(_policy); - - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const int window_step_x = 16; - - ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - ARM_COMPUTE_ERROR_ON(_src == _dst); - - ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - - Window win{ window }; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator src(_src, win); - Iterator dst(_dst, win); - - /* Down-conversion F32 -> BFLOAT16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vcvt_bf16_f32(reinterpret_cast(src.ptr()), - reinterpret_cast(dst.ptr())); - wrapper::vcvt_bf16_f32(reinterpret_cast(src.ptr()) + 8, - reinterpret_cast(dst.ptr()) + 8); - } - - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = *(src_ptr + x); - } - }, - src, dst); -} - -void neon_bfloat16_to_fp32_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_UNUSED(_policy); - - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const int window_step_x = 16; - - ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - ARM_COMPUTE_ERROR_ON(_src == _dst); - - ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - - Window win{ window }; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator src(_src, win); - Iterator dst(_dst, win); - switch(_dst->info()->data_type()) - { - case DataType::F32: - { - /* Up-conversion BFLOAT16 -> F32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint16x8x2_t texels = - { - { - vld1q_u16(reinterpret_cast(src.ptr())), - vld1q_u16(reinterpret_cast(src.ptr()) + 8) - } - }; - - vst1q_f32(reinterpret_cast(dst.ptr()), - vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[0])), 16))); - vst1q_f32(reinterpret_cast(dst.ptr()) + 4, - vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[0])), 16))); - vst1q_f32(reinterpret_cast(dst.ptr()) + 8, - vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[1])), 16))); - vst1q_f32(reinterpret_cast(dst.ptr()) + 12, - vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[1])), 16))); - } - - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = float(*(src_ptr + x)); - } - }, - src, dst); - break; - } - default: - ARM_COMPUTE_ERROR("dst data type unsupported"); - } -} - -} // namespace cpu -} // namespace arm_compute - -#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ diff --git a/src/cpu/kernels/cast/generic/neon/fp16.cpp b/src/cpu/kernels/cast/generic/neon/fp16.cpp index 
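The deleted bfloat16 path is still worth decoding: it widens bf16 to f32 by shifting each 16-bit value into the top half of a 32-bit word (vshlq_n_u32(vmovl_u16(...), 16)) and reinterpreting the bits as float, because bfloat16 is simply the upper 16 bits of an IEEE-754 binary32. The scalar equivalent is below; the down-conversion here truncates, whereas the removed vcvt_bf16_f32 path may round, so treat that direction as an assumption.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Widen bfloat16 to float: place the 16 bits in the upper half of a 32-bit
// word and reinterpret, the scalar form of the removed NEON shift trick.
static float bf16_to_f32(uint16_t v)
{
    const uint32_t bits = static_cast<uint32_t>(v) << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

// Truncating down-conversion for illustration (no rounding).
static uint16_t f32_to_bf16_truncate(float f)
{
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    return static_cast<uint16_t>(bits >> 16);
}

int main()
{
    const uint16_t one_bf16 = 0x3F80; // 1.0f in bfloat16
    std::printf("%f -> 0x%04X\n", bf16_to_f32(one_bf16),
                static_cast<unsigned>(f32_to_bf16_truncate(3.5f)));
    return 0;
}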
385ca1898d3e372abd8e7540ffec7c75d21cada1..2897f4b2420be76ecd776549940570d666a4bd4e 100644 --- a/src/cpu/kernels/cast/generic/neon/fp16.cpp +++ b/src/cpu/kernels/cast/generic/neon/fp16.cpp @@ -23,9 +23,11 @@ */ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +#include "arm_compute/core/CPP/CPPTypes.h" #include "arm_compute/core/TensorInfo.h" -#include "src/cpu/kernels/CpuCastKernel.h" + #include "src/cpu/kernels/cast/list.h" +#include "src/cpu/kernels/CpuCastKernel.h" #include "support/SaturateCast.h" #include "arm_neon.h" @@ -34,7 +36,8 @@ namespace arm_compute { namespace cpu { -void neon_qasymm8_signed_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) +void neon_qasymm8_signed_to_fp16_cast( + const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_UNUSED(_policy); @@ -48,42 +51,39 @@ void neon_qasymm8_signed_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src(_src, win); Iterator dst(_dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - int x = window_start_x; - - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + int x = window_start_x; - const int16x8x2_t texels = + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vmovl_s8(vget_low_s8(texels_s8)), - vmovl_s8(vget_high_s8(texels_s8)) - } - }; - vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0])); - vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); - } + const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + const int16x8x2_t texels = {{vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}}; + vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0])); + vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); } -void neon_s32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) +void neon_s32_to_fp16_cast( + const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_UNUSED(_policy); @@ -97,44 +97,41 @@ void neon_s32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src(_src, win); Iterator dst(_dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const 
float32x4x4_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vcvtq_f32_s32(vld1q_s32(src_ptr + x)), - vcvtq_f32_s32(vld1q_s32(src_ptr + x + 4)), - vcvtq_f32_s32(vld1q_s32(src_ptr + x + 8)), - vcvtq_f32_s32(vld1q_s32(src_ptr + x + 12)) - } - }; - - vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1]))); - vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3]))); - } + const float32x4x4_t texels = { + {vcvtq_f32_s32(vld1q_s32(src_ptr + x)), vcvtq_f32_s32(vld1q_s32(src_ptr + x + 4)), + vcvtq_f32_s32(vld1q_s32(src_ptr + x + 8)), vcvtq_f32_s32(vld1q_s32(src_ptr + x + 12))}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1]))); + vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); } -void neon_fp32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) +void neon_fp32_to_fp16_cast( + const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_UNUSED(_policy); @@ -148,44 +145,40 @@ void neon_fp32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src(_src, win); Iterator dst(_dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float32x4x4_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_f32(src_ptr + x), - vld1q_f32(src_ptr + x + 4), - vld1q_f32(src_ptr + x + 8), - vld1q_f32(src_ptr + x + 12) - } - }; - - vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1]))); - vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3]))); - } + const float32x4x4_t texels = {{vld1q_f32(src_ptr + x), vld1q_f32(src_ptr + x + 4), + vld1q_f32(src_ptr + x + 8), vld1q_f32(src_ptr + x + 12)}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1]))); + vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); } -void neon_fp16_to_other_dt_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, 
const Window &window) +void neon_fp16_to_other_dt_cast( + const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_UNUSED(_policy); @@ -199,142 +192,133 @@ void neon_fp16_to_other_dt_cast(const ITensor *_src, ITensor *_dst, const Thread ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src(_src, win); Iterator dst(_dst, win); - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::QASYMM8_SIGNED: { /* Down-conversion F16 -> QASYMM8_SIGNED (Always saturating) */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float16x8x2_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const float16x8x2_t texels = {{ vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8), - } - }; + }}; - vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])), vqmovn_s16(vcvtq_s16_f16(texels.val[1])))); - } + vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])), + vqmovn_s16(vcvtq_s16_f16(texels.val[1])))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::QASYMM8: case DataType::U8: { /* Down-conversion F16 -> QASYMM8/U8 (Always saturating) */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float16x8x2_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const float16x8x2_t texels = {{ vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8), - } - }; + }}; - vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])), vqmovun_s16(vcvtq_s16_f16(texels.val[1])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } + vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])), + vqmovun_s16(vcvtq_s16_f16(texels.val[1])))); + } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::F32: { /* Up-conversion F16 -> F32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + 
execute_window_loop( + win, + [&](const Coordinates &) { - const float16x8x2_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_f16(src_ptr + x), - vld1q_f16(src_ptr + x + 8) - } - }; - vst1q_f32(dst_ptr + x, vcvt_f32_f16(vget_low_f16(texels.val[0]))); - vst1q_f32(dst_ptr + x + 4, vcvt_f32_f16(vget_high_f16(texels.val[0]))); - vst1q_f32(dst_ptr + x + 8, vcvt_f32_f16(vget_low_f16(texels.val[1]))); - vst1q_f32(dst_ptr + x + 12, vcvt_f32_f16(vget_high_f16(texels.val[1]))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + const float16x8x2_t texels = {{vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8)}}; + vst1q_f32(dst_ptr + x, vcvt_f32_f16(vget_low_f16(texels.val[0]))); + vst1q_f32(dst_ptr + x + 4, vcvt_f32_f16(vget_high_f16(texels.val[0]))); + vst1q_f32(dst_ptr + x + 8, vcvt_f32_f16(vget_low_f16(texels.val[1]))); + vst1q_f32(dst_ptr + x + 12, vcvt_f32_f16(vget_high_f16(texels.val[1]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::S32: { /* Up-conversion F16 -> S32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float16x8x2_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_f16(src_ptr + x), - vld1q_f16(src_ptr + x + 8) - } - }; - - vst1q_s32(dst_ptr + x, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])))); - vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])))); - vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])))); - vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + const float16x8x2_t texels = {{vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8)}}; + + vst1q_s32(dst_ptr + x, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])))); + vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])))); + vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])))); + vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); break; } default: @@ -342,7 +326,8 @@ void neon_fp16_to_other_dt_cast(const ITensor *_src, ITensor *_dst, const Thread } } -void neon_u8_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) +void neon_u8_to_fp16_cast( + const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_UNUSED(_policy); @@ -356,40 +341,37 @@ void 
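Every cast kernel in this file shares one loop shape: a main loop that advances x in vector-width steps (16 elements) and converts a full register per iteration with NEON loads, converts and stores, followed by a scalar loop for the leftover elements that do not fill a vector. A portable sketch of that strip-mining pattern, with the intrinsic body replaced by plain scalar code:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Strip-mined conversion loop: process full blocks of `step` elements first
// (where the real kernels use NEON intrinsics), then finish the ragged tail
// one element at a time.
static void cast_s8_to_f32(const int8_t *src, float *dst, int start, int end)
{
    const int step = 16; // one 128-bit register of int8 per iteration in the real kernel

    int x = start;
    for (; x <= end - step; x += step)
    {
        for (int k = 0; k < step; ++k) // stand-in for load + widen + convert + store
        {
            dst[x + k] = static_cast<float>(src[x + k]);
        }
    }

    // Compute left-over elements
    for (; x < end; ++x)
    {
        dst[x] = static_cast<float>(src[x]);
    }
}

int main()
{
    std::vector<int8_t> src(37); // deliberately not a multiple of 16
    std::vector<float>  dst(src.size());
    for (std::size_t i = 0; i < src.size(); ++i)
    {
        src[i] = static_cast<int8_t>(static_cast<int>(i) - 18);
    }
    cast_s8_to_f32(src.data(), dst.data(), 0, static_cast<int>(src.size()));
    std::printf("dst[36] = %.1f\n", dst[36]);
    return 0;
}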
neon_u8_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo & ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src(_src, win); Iterator dst(_dst, win); /* Up-conversion U8 -> F16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); - const int16x8x2_t texels = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))) - } - }; - vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0])); - vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); - } + const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}}; + vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0])); + vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); return; } diff --git a/src/cpu/kernels/cast/list.h b/src/cpu/kernels/cast/list.h index ffd82d5bf3a6e1bcb402004655f48d24b92d08e7..5e634fc1707f44876b449d20bbae47eaf2798e08 100644 --- a/src/cpu/kernels/cast/list.h +++ b/src/cpu/kernels/cast/list.h @@ -27,8 +27,9 @@ namespace arm_compute { namespace cpu { -#define DECLARE_CAST_KERNEL(func_name) \ - void func_name(const ITensor *_src, ITensor *_dst, const ThreadInfo &tensor, ConvertPolicy _policy, const Window &window) +#define DECLARE_CAST_KERNEL(func_name) \ + void func_name(const ITensor *_src, ITensor *_dst, const ThreadInfo &tensor, ConvertPolicy _policy, \ + const Window &window) DECLARE_CAST_KERNEL(neon_fp32_to_fp16_cast); DECLARE_CAST_KERNEL(neon_u8_to_fp16_cast); @@ -41,4 +42,4 @@ DECLARE_CAST_KERNEL(neon_bfloat16_to_fp32_cast); #undef DECLARE_CAST_KERNEL } // namespace cpu } // namespace arm_compute -#endif //SRC_CORE_NEON_KERNELS_CAST_LIST_H \ No newline at end of file +#endif //SRC_CORE_NEON_KERNELS_CAST_LIST_H diff --git a/src/cpu/kernels/conv3d/neon/list.h b/src/cpu/kernels/conv3d/neon/list.h index 3bfa124dc3ba6995a95137b2f897ab8b3e257622..082c60be29d2ade8c286cb2323eaa59ca8b71c2e 100644 --- a/src/cpu/kernels/conv3d/neon/list.h +++ b/src/cpu/kernels/conv3d/neon/list.h @@ -27,8 +27,9 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" #include "arm_compute/runtime/FunctionDescriptors.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/conv3d/neon/quantized.h" namespace arm_compute @@ -36,7 +37,12 @@ namespace arm_compute namespace cpu { template -void directconv3d_float_neon_ndhwc(const ITensor *src0, const ITensor *src1, const ITensor *src2, ITensor *dst, const Conv3dInfo 
&conv_info, const Window &window) +void directconv3d_float_neon_ndhwc(const ITensor *src0, + const ITensor *src1, + const ITensor *src2, + ITensor *dst, + const Conv3dInfo &conv_info, + const Window &window) { const ITensor *src = src0; const ITensor *weights = src1; @@ -88,91 +94,104 @@ void directconv3d_float_neon_ndhwc(const ITensor *src0, const ITensor *src1, con Iterator wei(weights, window_w); const T *biases_ptr = nullptr; - if(biases != nullptr) + if (biases != nullptr) { biases_ptr = reinterpret_cast(biases->buffer() + biases->info()->offset_first_element_in_bytes()); } - execute_window_loop(window_out, [&](const Coordinates & id) - { - // We are computing the theoretical input starting points - const int in_w_start_t = static_cast(id.y()) * conv_stride_w - conv_pad_left; - const int in_h_start_t = static_cast(id.z()) * conv_stride_h - conv_pad_top; - const int in_d_start_t = static_cast(id[3]) * conv_stride_d - conv_pad_front; - const int in_w_end_t = in_w_start_t + kernel_dim_w; - const int in_h_end_t = in_h_start_t + kernel_dim_h; - const int in_d_end_t = in_d_start_t + kernel_dim_d; - - // We are computing the valid initial and ending input points by checking the borders - const int in_w_start = std::max(in_w_start_t, 0); - const int in_h_start = std::max(in_h_start_t, 0); - const int in_d_start = std::max(in_d_start_t, 0); - const int in_w_end = std::min(in_w_end_t, input_dim_w); - const int in_h_end = std::min(in_h_end_t, input_dim_h); - const int in_d_end = std::min(in_d_end_t, input_dim_d); - - // We use the input points to select the valid weight points to use - const int wei_w_start = in_w_start - in_w_start_t; - const int wei_h_start = in_h_start - in_h_start_t; - const int wei_d_start = in_d_start - in_d_start_t; - const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end); - const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); - const int wei_d_end = kernel_dim_d - (in_d_end_t - in_d_end); - - const int index_c_out_end = weights->info()->dimension(0); - const int index_c_in_end = weights->info()->dimension(1); - const T *const in_ptr_start = reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[4] * input_stride_n; - - execute_window_loop(window_w, [&](const Coordinates & id_w) + execute_window_loop( + window_out, + [&](const Coordinates &id) { - /* + // We are computing the theoretical input starting points + const int in_w_start_t = static_cast(id.y()) * conv_stride_w - conv_pad_left; + const int in_h_start_t = static_cast(id.z()) * conv_stride_h - conv_pad_top; + const int in_d_start_t = static_cast(id[3]) * conv_stride_d - conv_pad_front; + const int in_w_end_t = in_w_start_t + kernel_dim_w; + const int in_h_end_t = in_h_start_t + kernel_dim_h; + const int in_d_end_t = in_d_start_t + kernel_dim_d; + + // We are computing the valid initial and ending input points by checking the borders + const int in_w_start = std::max(in_w_start_t, 0); + const int in_h_start = std::max(in_h_start_t, 0); + const int in_d_start = std::max(in_d_start_t, 0); + const int in_w_end = std::min(in_w_end_t, input_dim_w); + const int in_h_end = std::min(in_h_end_t, input_dim_h); + const int in_d_end = std::min(in_d_end_t, input_dim_d); + + // We use the input points to select the valid weight points to use + const int wei_w_start = in_w_start - in_w_start_t; + const int wei_h_start = in_h_start - in_h_start_t; + const int wei_d_start = in_d_start - in_d_start_t; + const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end); + const int wei_h_end = 
kernel_dim_h - (in_h_end_t - in_h_end); + const int wei_d_end = kernel_dim_d - (in_d_end_t - in_d_end); + + const int index_c_out_end = weights->info()->dimension(0); + const int index_c_in_end = weights->info()->dimension(1); + const T *const in_ptr_start = + reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) + + id[4] * input_stride_n; + + execute_window_loop( + window_w, + [&](const Coordinates &id_w) + { + /* * This is the loop in the weights, and it goes along OFM (output feature map) */ - const auto weights_ptr_start = reinterpret_cast(wei.ptr()); - T out_temp = static_cast(0); - T *out_ptr = reinterpret_cast(out.ptr()); - for(int index_wei_d = wei_d_start, index_in_d = in_d_start; index_wei_d < wei_d_end; ++index_wei_d, ++index_in_d) - { - const auto in_ptr_d = in_ptr_start + index_in_d * input_stride_d; - const auto weights_ptr_d = weights_ptr_start + index_wei_d * kernel_stride_d; - for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h) - { - const T *const in_ptr_row = in_ptr_d + index_in_h * input_stride_h; - const T *const weights_ptr_row = weights_ptr_d + index_wei_h * kernel_stride_h; - for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w) + const auto weights_ptr_start = reinterpret_cast(wei.ptr()); + T out_temp = static_cast(0); + T *out_ptr = reinterpret_cast(out.ptr()); + for (int index_wei_d = wei_d_start, index_in_d = in_d_start; index_wei_d < wei_d_end; + ++index_wei_d, ++index_in_d) { - const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w; - const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w; - int index_c_in = 0; - vector_type out_temp_vec = wrapper::vdup_n(static_cast(0), tag_type()); - vector_type w_vec = wrapper::vdup_n(static_cast(0), tag_type()); - for(; index_c_in <= index_c_in_end - num_elems_read_per_iteration; - index_c_in += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration) + const auto in_ptr_d = in_ptr_start + index_in_d * input_stride_d; + const auto weights_ptr_d = weights_ptr_start + index_wei_d * kernel_stride_d; + for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; + ++index_wei_h, ++index_in_h) { - const auto src_vec = wrapper::vloadq(in_ptr_mover); - //Load Cin weights - for(int k = 0; k < num_elems_read_per_iteration; ++k, weights_ptr_mover += index_c_out_end) + const T *const in_ptr_row = in_ptr_d + index_in_h * input_stride_h; + const T *const weights_ptr_row = weights_ptr_d + index_wei_h * kernel_stride_h; + for (int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; + ++index_wei_w, ++index_in_w) { - w_vec = wrapper::vsetlane(*weights_ptr_mover, w_vec, k); + const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w; + const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w; + int index_c_in = 0; + vector_type out_temp_vec = wrapper::vdup_n(static_cast(0), tag_type()); + vector_type w_vec = wrapper::vdup_n(static_cast(0), tag_type()); + for (; index_c_in <= index_c_in_end - num_elems_read_per_iteration; + index_c_in += num_elems_read_per_iteration, + in_ptr_mover += num_elems_read_per_iteration) + { + const auto src_vec = wrapper::vloadq(in_ptr_mover); + //Load Cin weights + for (int k = 0; k < num_elems_read_per_iteration; + ++k, weights_ptr_mover += index_c_out_end) + { + w_vec = wrapper::vsetlane(*weights_ptr_mover, w_vec, k); + } + out_temp_vec = 
wrapper::vmla(out_temp_vec, w_vec, src_vec); + } + out_temp += vreduce(out_temp_vec); + for (; index_c_in < index_c_in_end; + ++index_c_in, ++in_ptr_mover, weights_ptr_mover += index_c_out_end) + { + const auto src_val = *(in_ptr_mover); + const auto w_val = *(weights_ptr_mover); + out_temp += src_val * w_val; + } } - out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); - } - out_temp += vreduce(out_temp_vec); - for(; index_c_in < index_c_in_end; ++index_c_in, ++in_ptr_mover, weights_ptr_mover += index_c_out_end) - { - const auto src_val = *(in_ptr_mover); - const auto w_val = *(weights_ptr_mover); - out_temp += src_val * w_val; } } - } - } - *(reinterpret_cast(out_ptr + id_w[0])) = (biases_ptr != nullptr) ? out_temp + biases_ptr[id_w[0]] : out_temp; + *(reinterpret_cast(out_ptr + id_w[0])) = + (biases_ptr != nullptr) ? out_temp + biases_ptr[id_w[0]] : out_temp; + }, + wei); }, - wei); - }, - out); + out); } } // namespace cpu } // namespace arm_compute -#endif // SRC_CORE_NEON_KERNELS_CONV3D_LIST_H \ No newline at end of file +#endif // SRC_CORE_NEON_KERNELS_CONV3D_LIST_H diff --git a/src/cpu/kernels/conv3d/neon/quantized.h b/src/cpu/kernels/conv3d/neon/quantized.h index a8165b494485d830eb6fd2658eca2bd3a666fb56..f0fc9b5a71a9c5b24dfe2c5f1f882eb818264e27 100644 --- a/src/cpu/kernels/conv3d/neon/quantized.h +++ b/src/cpu/kernels/conv3d/neon/quantized.h @@ -28,16 +28,22 @@ #include "arm_compute/core/utils/misc/Traits.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/FunctionDescriptors.h" + +#include "src/core/helpers/WindowHelpers.h" #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/WindowHelpers.h" namespace arm_compute { namespace cpu { template -void directconv3d_quantized_neon_ndhwc(const ITensor *src0, const ITensor *src1, const ITensor *src2, ITensor *dst, const Conv3dInfo &conv_info, const Window &window) +void directconv3d_quantized_neon_ndhwc(const ITensor *src0, + const ITensor *src1, + const ITensor *src2, + ITensor *dst, + const Conv3dInfo &conv_info, + const Window &window) { const ITensor *src = src0; const ITensor *weights = src1; @@ -104,153 +110,166 @@ void directconv3d_quantized_neon_ndhwc(const ITensor *src0, const ITensor *src1, Iterator wei(weights, window_w); const int32_t *biases_ptr = nullptr; - if(biases != nullptr) + if (biases != nullptr) { biases_ptr = reinterpret_cast(biases->buffer() + biases->info()->offset_first_element_in_bytes()); } - execute_window_loop(window_out, [&](const Coordinates & id) - { - // We are computing the theoretical input starting points - const int in_w_start_t = static_cast(id.y()) * conv_stride_w - conv_pad_left; - const int in_h_start_t = static_cast(id.z()) * conv_stride_h - conv_pad_top; - const int in_d_start_t = static_cast(id[3]) * conv_stride_d - conv_pad_front; - const int in_w_end_t = in_w_start_t + kernel_dim_w; - const int in_h_end_t = in_h_start_t + kernel_dim_h; - const int in_d_end_t = in_d_start_t + kernel_dim_d; + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // We are computing the theoretical input starting points + const int in_w_start_t = static_cast(id.y()) * conv_stride_w - conv_pad_left; + const int in_h_start_t = static_cast(id.z()) * conv_stride_h - conv_pad_top; + const int in_d_start_t = static_cast(id[3]) * conv_stride_d - conv_pad_front; + const int in_w_end_t = in_w_start_t + kernel_dim_w; + const int in_h_end_t = in_h_start_t + kernel_dim_h; + const int in_d_end_t = 
in_d_start_t + kernel_dim_d; - // We are computing the valid initial and ending input points by checking the borders - const int in_w_start = std::max(in_w_start_t, 0); - const int in_h_start = std::max(in_h_start_t, 0); - const int in_d_start = std::max(in_d_start_t, 0); - const int in_w_end = std::min(in_w_end_t, input_dim_w); - const int in_h_end = std::min(in_h_end_t, input_dim_h); - const int in_d_end = std::min(in_d_end_t, input_dim_d); + // We are computing the valid initial and ending input points by checking the borders + const int in_w_start = std::max(in_w_start_t, 0); + const int in_h_start = std::max(in_h_start_t, 0); + const int in_d_start = std::max(in_d_start_t, 0); + const int in_w_end = std::min(in_w_end_t, input_dim_w); + const int in_h_end = std::min(in_h_end_t, input_dim_h); + const int in_d_end = std::min(in_d_end_t, input_dim_d); - // We use the input points to select the valid weight points to use - const int wei_w_start = in_w_start - in_w_start_t; - const int wei_h_start = in_h_start - in_h_start_t; - const int wei_d_start = in_d_start - in_d_start_t; - const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end); - const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); - const int wei_d_end = kernel_dim_d - (in_d_end_t - in_d_end); + // We use the input points to select the valid weight points to use + const int wei_w_start = in_w_start - in_w_start_t; + const int wei_h_start = in_h_start - in_h_start_t; + const int wei_d_start = in_d_start - in_d_start_t; + const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end); + const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); + const int wei_d_end = kernel_dim_d - (in_d_end_t - in_d_end); - const int index_c_out_end = weights->info()->dimension(0); - const int index_c_in_end = weights->info()->dimension(1); - const T *const in_ptr_start = reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[4] * input_stride_n; + const int index_c_out_end = weights->info()->dimension(0); + const int index_c_in_end = weights->info()->dimension(1); + const T *const in_ptr_start = + reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) + + id[4] * input_stride_n; - execute_window_loop(window_w, [&](const Coordinates & id_w) - { - /* + execute_window_loop( + window_w, + [&](const Coordinates &id_w) + { + /* * This is the loop in the weights, and it goes along OFM (output feature map) */ - const auto weights_ptr_start = reinterpret_cast(wei.ptr()); - int32_t acc = static_cast(0); - T *out_ptr = reinterpret_cast(out.ptr()); - for(int index_wei_d = wei_d_start, index_in_d = in_d_start; index_wei_d < wei_d_end; ++index_wei_d, ++index_in_d) - { - const auto in_ptr_d = in_ptr_start + index_in_d * input_stride_d; - const auto weights_ptr_d = weights_ptr_start + index_wei_d * kernel_stride_d; - for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h) - { - const T *const in_ptr_row = in_ptr_d + index_in_h * input_stride_h; - const T *const weights_ptr_row = weights_ptr_d + index_wei_h * kernel_stride_h; - for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w) + const auto weights_ptr_start = reinterpret_cast(wei.ptr()); + int32_t acc = static_cast(0); + T *out_ptr = reinterpret_cast(out.ptr()); + for (int index_wei_d = wei_d_start, index_in_d = in_d_start; index_wei_d < wei_d_end; + ++index_wei_d, ++index_in_d) { - const T *in_ptr_mover = in_ptr_row + 
index_in_w * input_stride_w; - const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w; - int index_c_in = 0; - vector_type w_vec = wrapper::vdup_n(static_cast(0), tag_type()); - - q32x4_t acc_q32_0 = wrapper::vdup_n(static_cast(0), tag_type()); - q32x4_t acc_q32_1 = wrapper::vdup_n(static_cast(0), tag_type()); - q32x4_t acc_q32_2 = wrapper::vdup_n(static_cast(0), tag_type()); - q32x4_t acc_q32_3 = wrapper::vdup_n(static_cast(0), tag_type()); - - for(; index_c_in <= index_c_in_end - num_elems_read_per_iteration; - index_c_in += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration) + const auto in_ptr_d = in_ptr_start + index_in_d * input_stride_d; + const auto weights_ptr_d = weights_ptr_start + index_wei_d * kernel_stride_d; + for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; + ++index_wei_h, ++index_in_h) { - const auto src_vec = wrapper::vloadq(in_ptr_mover); - //Load Cin weights - for(int k = 0; k < num_elems_read_per_iteration; ++k, weights_ptr_mover += index_c_out_end) + const T *const in_ptr_row = in_ptr_d + index_in_h * input_stride_h; + const T *const weights_ptr_row = weights_ptr_d + index_wei_h * kernel_stride_h; + for (int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; + ++index_wei_w, ++index_in_w) { - w_vec = wrapper::vsetlane(*weights_ptr_mover, w_vec, k); - } - q32x4_t src_q32_0 = wrapper::vdup_n(static_cast(input_offset), tag_type()); - q32x4_t src_q32_1 = wrapper::vdup_n(static_cast(input_offset), tag_type()); - q32x4_t src_q32_2 = wrapper::vdup_n(static_cast(input_offset), tag_type()); - q32x4_t src_q32_3 = wrapper::vdup_n(static_cast(input_offset), tag_type()); + const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w; + const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w; + int index_c_in = 0; + vector_type w_vec = wrapper::vdup_n(static_cast(0), tag_type()); - q32x4_t wei_q32_0 = wrapper::vdup_n(static_cast(weights_offset), tag_type()); - q32x4_t wei_q32_1 = wrapper::vdup_n(static_cast(weights_offset), tag_type()); - q32x4_t wei_q32_2 = wrapper::vdup_n(static_cast(weights_offset), tag_type()); - q32x4_t wei_q32_3 = wrapper::vdup_n(static_cast(weights_offset), tag_type()); + q32x4_t acc_q32_0 = wrapper::vdup_n(static_cast(0), tag_type()); + q32x4_t acc_q32_1 = wrapper::vdup_n(static_cast(0), tag_type()); + q32x4_t acc_q32_2 = wrapper::vdup_n(static_cast(0), tag_type()); + q32x4_t acc_q32_3 = wrapper::vdup_n(static_cast(0), tag_type()); - const auto src_q16_0 = wrapper::vmovl(wrapper::vgetlow(src_vec)); - const auto src_q16_1 = wrapper::vmovl(wrapper::vgethigh(src_vec)); - const auto wei_q16_0 = wrapper::vmovl(wrapper::vgetlow(w_vec)); - const auto wei_q16_1 = wrapper::vmovl(wrapper::vgethigh(w_vec)); + for (; index_c_in <= index_c_in_end - num_elems_read_per_iteration; + index_c_in += num_elems_read_per_iteration, + in_ptr_mover += num_elems_read_per_iteration) + { + const auto src_vec = wrapper::vloadq(in_ptr_mover); + //Load Cin weights + for (int k = 0; k < num_elems_read_per_iteration; + ++k, weights_ptr_mover += index_c_out_end) + { + w_vec = wrapper::vsetlane(*weights_ptr_mover, w_vec, k); + } + q32x4_t src_q32_0 = wrapper::vdup_n(static_cast(input_offset), tag_type()); + q32x4_t src_q32_1 = wrapper::vdup_n(static_cast(input_offset), tag_type()); + q32x4_t src_q32_2 = wrapper::vdup_n(static_cast(input_offset), tag_type()); + q32x4_t src_q32_3 = wrapper::vdup_n(static_cast(input_offset), tag_type()); - src_q32_0 = 
wrapper::vadd(src_q32_0, wrapper::vmovl(wrapper::vgetlow(src_q16_0))); - src_q32_1 = wrapper::vadd(src_q32_1, wrapper::vmovl(wrapper::vgethigh(src_q16_0))); - src_q32_2 = wrapper::vadd(src_q32_2, wrapper::vmovl(wrapper::vgetlow(src_q16_1))); - src_q32_3 = wrapper::vadd(src_q32_3, wrapper::vmovl(wrapper::vgethigh(src_q16_1))); + q32x4_t wei_q32_0 = wrapper::vdup_n(static_cast(weights_offset), tag_type()); + q32x4_t wei_q32_1 = wrapper::vdup_n(static_cast(weights_offset), tag_type()); + q32x4_t wei_q32_2 = wrapper::vdup_n(static_cast(weights_offset), tag_type()); + q32x4_t wei_q32_3 = wrapper::vdup_n(static_cast(weights_offset), tag_type()); - wei_q32_0 = wrapper::vadd(wei_q32_0, wrapper::vmovl(wrapper::vgetlow(wei_q16_0))); - wei_q32_1 = wrapper::vadd(wei_q32_1, wrapper::vmovl(wrapper::vgethigh(wei_q16_0))); - wei_q32_2 = wrapper::vadd(wei_q32_2, wrapper::vmovl(wrapper::vgetlow(wei_q16_1))); - wei_q32_3 = wrapper::vadd(wei_q32_3, wrapper::vmovl(wrapper::vgethigh(wei_q16_1))); + const auto src_q16_0 = wrapper::vmovl(wrapper::vgetlow(src_vec)); + const auto src_q16_1 = wrapper::vmovl(wrapper::vgethigh(src_vec)); + const auto wei_q16_0 = wrapper::vmovl(wrapper::vgetlow(w_vec)); + const auto wei_q16_1 = wrapper::vmovl(wrapper::vgethigh(w_vec)); - acc_q32_0 = wrapper::vmla(acc_q32_0, wei_q32_0, src_q32_0); - acc_q32_1 = wrapper::vmla(acc_q32_1, wei_q32_1, src_q32_1); - acc_q32_2 = wrapper::vmla(acc_q32_2, wei_q32_2, src_q32_2); - acc_q32_3 = wrapper::vmla(acc_q32_3, wei_q32_3, src_q32_3); - } + src_q32_0 = wrapper::vadd(src_q32_0, wrapper::vmovl(wrapper::vgetlow(src_q16_0))); + src_q32_1 = wrapper::vadd(src_q32_1, wrapper::vmovl(wrapper::vgethigh(src_q16_0))); + src_q32_2 = wrapper::vadd(src_q32_2, wrapper::vmovl(wrapper::vgetlow(src_q16_1))); + src_q32_3 = wrapper::vadd(src_q32_3, wrapper::vmovl(wrapper::vgethigh(src_q16_1))); + + wei_q32_0 = wrapper::vadd(wei_q32_0, wrapper::vmovl(wrapper::vgetlow(wei_q16_0))); + wei_q32_1 = wrapper::vadd(wei_q32_1, wrapper::vmovl(wrapper::vgethigh(wei_q16_0))); + wei_q32_2 = wrapper::vadd(wei_q32_2, wrapper::vmovl(wrapper::vgetlow(wei_q16_1))); + wei_q32_3 = wrapper::vadd(wei_q32_3, wrapper::vmovl(wrapper::vgethigh(wei_q16_1))); + + acc_q32_0 = wrapper::vmla(acc_q32_0, wei_q32_0, src_q32_0); + acc_q32_1 = wrapper::vmla(acc_q32_1, wei_q32_1, src_q32_1); + acc_q32_2 = wrapper::vmla(acc_q32_2, wei_q32_2, src_q32_2); + acc_q32_3 = wrapper::vmla(acc_q32_3, wei_q32_3, src_q32_3); + } #if defined(__aarch64__) - acc += wrapper::vaddv(acc_q32_0); - acc += wrapper::vaddv(acc_q32_1); - acc += wrapper::vaddv(acc_q32_2); - acc += wrapper::vaddv(acc_q32_3); + acc += wrapper::vaddv(acc_q32_0); + acc += wrapper::vaddv(acc_q32_1); + acc += wrapper::vaddv(acc_q32_2); + acc += wrapper::vaddv(acc_q32_3); #else // __aarch64__ - auto temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_0), wrapper::vgetlow(acc_q32_0)); - temp = wrapper::vpadd(temp, temp); - acc += wrapper::vgetlane(temp, 0); + auto temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_0), wrapper::vgetlow(acc_q32_0)); + temp = wrapper::vpadd(temp, temp); + acc += wrapper::vgetlane(temp, 0); - temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_1), wrapper::vgetlow(acc_q32_1)); - temp = wrapper::vpadd(temp, temp); - acc += wrapper::vgetlane(temp, 0); + temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_1), wrapper::vgetlow(acc_q32_1)); + temp = wrapper::vpadd(temp, temp); + acc += wrapper::vgetlane(temp, 0); - temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_2), wrapper::vgetlow(acc_q32_2)); - temp = wrapper::vpadd(temp, temp); - acc 
+= wrapper::vgetlane(temp, 0); + temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_2), wrapper::vgetlow(acc_q32_2)); + temp = wrapper::vpadd(temp, temp); + acc += wrapper::vgetlane(temp, 0); - temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_3), wrapper::vgetlow(acc_q32_3)); - temp = wrapper::vpadd(temp, temp); - acc += wrapper::vgetlane(temp, 0); + temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_3), wrapper::vgetlow(acc_q32_3)); + temp = wrapper::vpadd(temp, temp); + acc += wrapper::vgetlane(temp, 0); #endif // __aarch64__ - for(; index_c_in < index_c_in_end; ++index_c_in, ++in_ptr_mover, weights_ptr_mover += index_c_out_end) - { - const auto src_val = *(in_ptr_mover) + input_offset; - const auto w_val = *(weights_ptr_mover) + weights_offset; - acc += src_val * w_val; + for (; index_c_in < index_c_in_end; + ++index_c_in, ++in_ptr_mover, weights_ptr_mover += index_c_out_end) + { + const auto src_val = *(in_ptr_mover) + input_offset; + const auto w_val = *(weights_ptr_mover) + weights_offset; + acc += src_val * w_val; + } + } } } - } - } - if(biases) - { - acc += *reinterpret_cast(biases_ptr + id_w[0]); - } + if (biases) + { + acc += *reinterpret_cast(biases_ptr + id_w[0]); + } - T out_val = finalize_quantization(acc, output_multiplier, output_shift, output_offset, T(0), T(0), false); - *(reinterpret_cast(out_ptr + id_w[0])) = out_val; + T out_val = + finalize_quantization(acc, output_multiplier, output_shift, output_offset, T(0), T(0), false); + *(reinterpret_cast(out_ptr + id_w[0])) = out_val; + }, + wei); }, - wei); - }, - out); + out); } } // namespace cpu } // namespace arm_compute -#endif // SRC_CORE_NEON_KERNELS_CONV3D_QUANTIZED_H \ No newline at end of file +#endif // SRC_CORE_NEON_KERNELS_CONV3D_QUANTIZED_H diff --git a/src/cpu/kernels/crop/generic/neon/crop_helper.h b/src/cpu/kernels/crop/generic/neon/crop_helper.h index 1fe8e11e9809e9b766e52acf220b94f4a140492e..8fb7ad208726bd7a24223e8552749e3323f2bf50 100644 --- a/src/cpu/kernels/crop/generic/neon/crop_helper.h +++ b/src/cpu/kernels/crop/generic/neon/crop_helper.h @@ -80,7 +80,7 @@ inline float32x4_t load_as_f32(uint8_t *ptr) { return vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(wrapper::vload(ptr))))); } -} +} // namespace cpu } // namespace arm_compute -#endif //SRC_CORE_NEON_KERNELS_CROP_CROP_HELPER_H \ No newline at end of file +#endif //SRC_CORE_NEON_KERNELS_CROP_CROP_HELPER_H diff --git a/src/cpu/kernels/crop/generic/neon/fp16.cpp b/src/cpu/kernels/crop/generic/neon/fp16.cpp index 218ebba1910d5a225c4377e3be20c77a84e3ce8a..3739c9d4e06f90744b02e7edeb7ace60f29f6c37 100644 --- a/src/cpu/kernels/crop/generic/neon/fp16.cpp +++ b/src/cpu/kernels/crop/generic/neon/fp16.cpp @@ -29,12 +29,19 @@ namespace arm_compute { namespace cpu { -void fp16_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped) +void fp16_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) { - return in_bounds_crop_window(input, output, output_ptr, input_offset, - window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped); -} + return in_bounds_crop_window(input, output, output_ptr, input_offset, window_step_x, output_width_start, + 
output_width_limit, input_has_single_channel, is_width_flipped); } +} // namespace cpu } // namespace arm_compute #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/crop/generic/neon/fp32.cpp b/src/cpu/kernels/crop/generic/neon/fp32.cpp index 16d0218fce091e4f05a328e5ecdbf03e3915ab21..f665c3652c6def9ac8e712241f17a3e8c647c953 100644 --- a/src/cpu/kernels/crop/generic/neon/fp32.cpp +++ b/src/cpu/kernels/crop/generic/neon/fp32.cpp @@ -28,11 +28,18 @@ namespace arm_compute { namespace cpu { -void fp32_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped) +void fp32_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) { - return in_bounds_crop_window(input, output, output_ptr, input_offset, - window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped); -} + return in_bounds_crop_window(input, output, output_ptr, input_offset, window_step_x, output_width_start, + output_width_limit, input_has_single_channel, is_width_flipped); } +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/crop/generic/neon/impl.cpp b/src/cpu/kernels/crop/generic/neon/impl.cpp deleted file mode 100644 index 95ab8049405ee6c14c0bc228a624be0683de5665..0000000000000000000000000000000000000000 --- a/src/cpu/kernels/crop/generic/neon/impl.cpp +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
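The quantized direct conv3d loop above widens each input and weight value to int32, folds in the input and weights quantization offsets, and accumulates into an int32 before finalize_quantization produces the output. A scalar sketch of that accumulation, matching the leftover-element path of the kernel (the helper name and signature below are illustrative, not the library's):

#include <cstdint>
#include <cstddef>

// Offset-aware accumulation: each value is shifted by its quantization offset
// before the multiply, and the products are summed into an int32 accumulator.
inline int32_t accumulate_quantized(const uint8_t *src,
                                    const uint8_t *weights,
                                    size_t         count,
                                    int32_t        input_offset,
                                    int32_t        weights_offset)
{
    int32_t acc = 0;
    for (size_t i = 0; i < count; ++i)
    {
        const int32_t src_val = static_cast<int32_t>(src[i]) + input_offset;
        const int32_t w_val   = static_cast<int32_t>(weights[i]) + weights_offset;
        acc += src_val * w_val;
    }
    return acc;
}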
- */ -#include "src/cpu/kernels/crop/generic/neon/impl.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/common/Registrars.h" -#include "src/cpu/kernels/crop/generic/neon/crop_helper.h" - -namespace arm_compute -{ -namespace cpu -{ -template -void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped) -{ - // Reverse elements if width flipped. - if(is_width_flipped) - { - // Collapse first dimension if possible. - if(input_has_single_channel) - { - int32_t x = output_width_start; - Coordinates negative_offset(input_offset); - negative_offset.set(1, negative_offset[1] - window_step_x + 1); - for(; x <= output_width_limit - window_step_x; x += window_step_x, negative_offset[1] -= window_step_x) - { - auto in = load_as_f32(reinterpret_cast(input->ptr_to_element(negative_offset))); - - in = wrapper::vrev64(in); - in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in)); - - wrapper::vstore(output_ptr + x, in); - } - input_offset[1] = negative_offset[1] + window_step_x - 1; - for(; x < output_width_limit; ++x, --input_offset[1]) - { - *(output_ptr + x) = static_cast(*reinterpret_cast(input->ptr_to_element(input_offset))); - } - } - else - { - for(int32_t x = output_width_start; x < output_width_limit; ++x, --input_offset[1]) - { - input_offset.set(0, 0); - int32_t c = 0; - for(; c <= static_cast(input->info()->dimension(0)) - window_step_x; c += window_step_x, input_offset[0] += window_step_x) - { - auto in = load_as_f32(reinterpret_cast(input->ptr_to_element(input_offset))); - wrapper::vstore(output_ptr + x * output->info()->dimension(0) + c, in); - } - for(; c < static_cast(input->info()->dimension(0)); ++c, ++input_offset[0]) - { - *(output_ptr + x * output->info()->dimension(0) + c) = static_cast(*reinterpret_cast(input->ptr_to_element(input_offset))); - } - } - } - } - else - { - // Use memcpy if the elements don't need converting to float. 
- if(std::is_same::value) - { - memcpy(static_cast(output_ptr + output_width_start * output->info()->dimension(0)), - reinterpret_cast(input->ptr_to_element(input_offset)), - (output_width_limit - output_width_start) * output->info()->dimension(0) * output->info()->element_size()); - } - else - { - int32_t x = 0; - int32_t limit = (output_width_limit - output_width_start) * static_cast(output->info()->dimension(0)); - float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0); - for(; x <= limit - window_step_x; x += window_step_x, input_offset[0] += window_step_x) - { - auto in = load_as_f32(reinterpret_cast(input->ptr_to_element(input_offset))); - wrapper::vstore(output_start_ptr + x, in); - } - for(; x < limit; ++x, ++input_offset[0]) - { - *(output_start_ptr + x) = static_cast(*reinterpret_cast(input->ptr_to_element(input_offset))); - } - } - } -} - -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -template void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped); -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ -template void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped); -template void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped); -template void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped); -template void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped); -template void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped); -template void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped); -template void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped); -} -} // namespace arm_compute diff --git a/src/cpu/kernels/crop/generic/neon/impl.h b/src/cpu/kernels/crop/generic/neon/impl.h index 50f889705a29810423b407c4c204715d197026d9..b90ba9ddbfbe1a326d0a3b415fcda9652beedbc3 100644 --- a/src/cpu/kernels/crop/generic/neon/impl.h +++ b/src/cpu/kernels/crop/generic/neon/impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. 
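When is_width_flipped is set, in_bounds_crop_window reverses each loaded vector with wrapper::vrev64 followed by a swap of the two halves. A standalone sketch of that reversal using raw NEON intrinsics instead of the library's wrapper layer (the helper name is illustrative):

#include <arm_neon.h>

// Reverse a float32x4_t so that {a,b,c,d} becomes {d,c,b,a}.
static inline float32x4_t reverse_f32x4(float32x4_t v)
{
    // vrev64q swaps elements inside each 64-bit half: {a,b,c,d} -> {b,a,d,c}.
    const float32x4_t half_reversed = vrev64q_f32(v);
    // Swapping the two halves completes the reversal: {b,a,d,c} -> {d,c,b,a}.
    return vcombine_f32(vget_high_f32(half_reversed), vget_low_f32(half_reversed));
}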
+ * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,16 +26,99 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/common/Registrars.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/crop/generic/neon/crop_helper.h" namespace arm_compute { namespace cpu { template -void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped); +void in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) +{ + // Reverse elements if width flipped. + if (is_width_flipped) + { + // Collapse first dimension if possible. + if (input_has_single_channel) + { + int32_t x = output_width_start; + Coordinates negative_offset(input_offset); + negative_offset.set(1, negative_offset[1] - window_step_x + 1); + for (; x <= output_width_limit - window_step_x; x += window_step_x, negative_offset[1] -= window_step_x) + { + auto in = load_as_f32(reinterpret_cast(input->ptr_to_element(negative_offset))); + + in = wrapper::vrev64(in); + in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in)); + + wrapper::vstore(output_ptr + x, in); + } + input_offset[1] = negative_offset[1] + window_step_x - 1; + for (; x < output_width_limit; ++x, --input_offset[1]) + { + *(output_ptr + x) = static_cast(*reinterpret_cast(input->ptr_to_element(input_offset))); + } + } + else + { + for (int32_t x = output_width_start; x < output_width_limit; ++x, --input_offset[1]) + { + input_offset.set(0, 0); + int32_t c = 0; + for (; c <= static_cast(input->info()->dimension(0)) - window_step_x; + c += window_step_x, input_offset[0] += window_step_x) + { + auto in = load_as_f32(reinterpret_cast(input->ptr_to_element(input_offset))); + wrapper::vstore(output_ptr + x * output->info()->dimension(0) + c, in); + } + for (; c < static_cast(input->info()->dimension(0)); ++c, ++input_offset[0]) + { + *(output_ptr + x * output->info()->dimension(0) + c) = + static_cast(*reinterpret_cast(input->ptr_to_element(input_offset))); + } + } + } + } + else + { + // Use memcpy if the elements don't need converting to float. 
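For the non-flipped case, the kernel copies whole rows with memcpy when the element type is already float and otherwise converts element by element (vectorised through load_as_f32 in the real code). A reduced sketch of that dispatch; it uses C++17 if constexpr where the kernel relies on a runtime std::is_same check, and the helper name is made up:

#include <cstdint>
#include <cstring>
#include <type_traits>

// Copy-vs-convert dispatch: float rows are copied byte-for-byte, other
// element types are widened to float one element at a time.
template <typename T>
void copy_row_as_f32(const T *src, float *dst, int32_t count)
{
    if constexpr (std::is_same_v<T, float>)
    {
        std::memcpy(dst, src, count * sizeof(float)); // no conversion needed
    }
    else
    {
        for (int32_t i = 0; i < count; ++i)
        {
            dst[i] = static_cast<float>(src[i]); // scalar widening conversion
        }
    }
}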
+ if (std::is_same::value) + { + memcpy(static_cast(output_ptr + output_width_start * output->info()->dimension(0)), + reinterpret_cast(input->ptr_to_element(input_offset)), + (output_width_limit - output_width_start) * output->info()->dimension(0) * + output->info()->element_size()); + } + else + { + int32_t x = 0; + int32_t limit = + (output_width_limit - output_width_start) * static_cast(output->info()->dimension(0)); + float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0); + for (; x <= limit - window_step_x; x += window_step_x, input_offset[0] += window_step_x) + { + auto in = load_as_f32(reinterpret_cast(input->ptr_to_element(input_offset))); + wrapper::vstore(output_start_ptr + x, in); + } + for (; x < limit; ++x, ++input_offset[0]) + { + *(output_start_ptr + x) = + static_cast(*reinterpret_cast(input->ptr_to_element(input_offset))); + } + } + } +} } // namespace cpu } // namespace arm_compute #endif //SRC_CORE_NEON_KERNELS_CROP_IMPL_H diff --git a/src/cpu/kernels/crop/generic/neon/integer.cpp b/src/cpu/kernels/crop/generic/neon/integer.cpp index ebf2c1fbd3e4575006ca445155ad65f5d22fe8a9..602434f54fe9387124065cb2055ad369ec4e5826 100644 --- a/src/cpu/kernels/crop/generic/neon/integer.cpp +++ b/src/cpu/kernels/crop/generic/neon/integer.cpp @@ -29,46 +29,88 @@ namespace arm_compute { namespace cpu { -void u8_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped) +void u8_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) { - return in_bounds_crop_window(input, output, output_ptr, input_offset, - window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped); + return in_bounds_crop_window(input, output, output_ptr, input_offset, window_step_x, output_width_start, + output_width_limit, input_has_single_channel, is_width_flipped); } -void u16_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped) +void u16_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) { - return in_bounds_crop_window(input, output, output_ptr, input_offset, - window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped); + return in_bounds_crop_window(input, output, output_ptr, input_offset, window_step_x, output_width_start, + output_width_limit, input_has_single_channel, is_width_flipped); } -void u32_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped) +void u32_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t 
output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) { - return in_bounds_crop_window(input, output, output_ptr, input_offset, - window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped); + return in_bounds_crop_window(input, output, output_ptr, input_offset, window_step_x, output_width_start, + output_width_limit, input_has_single_channel, is_width_flipped); } -void s8_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped) +void s8_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) { - return in_bounds_crop_window(input, output, output_ptr, input_offset, - window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped); + return in_bounds_crop_window(input, output, output_ptr, input_offset, window_step_x, output_width_start, + output_width_limit, input_has_single_channel, is_width_flipped); } -void s16_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped) +void s16_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) { - return in_bounds_crop_window(input, output, output_ptr, input_offset, - window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped); + return in_bounds_crop_window(input, output, output_ptr, input_offset, window_step_x, output_width_start, + output_width_limit, input_has_single_channel, is_width_flipped); } -void s32_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped) +void s32_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) { - return in_bounds_crop_window(input, output, output_ptr, input_offset, - window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped); -} + return in_bounds_crop_window(input, output, output_ptr, input_offset, window_step_x, output_width_start, + output_width_limit, input_has_single_channel, is_width_flipped); } +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/crop/list.h b/src/cpu/kernels/crop/list.h index a6b83215aed292d09fff917dfff337e6afbc3095..9cb7726203a734c289933d0494f77c40b94749a2 100644 --- a/src/cpu/kernels/crop/list.h +++ b/src/cpu/kernels/crop/list.h @@ -26,8 +26,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include 
"src/core/common/Registrars.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/crop/generic/neon/impl.h" namespace arm_compute @@ -36,7 +37,8 @@ namespace cpu { #define DECLARE_CROP_KERNEL(func_name) \ void func_name(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, \ - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped) + int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, \ + bool input_has_single_channel, bool is_width_flipped) DECLARE_CROP_KERNEL(fp16_in_bounds_crop_window); DECLARE_CROP_KERNEL(fp32_in_bounds_crop_window); diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp index e85a1664ea7b2e3c4742b55587ab807d907265ea..293e606d811d026d18a1aff1a104a7db4a1aa264 100644 --- a/src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp +++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp @@ -29,11 +29,16 @@ namespace arm_compute { namespace cpu { -void neon_fp16_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info) +void neon_fp16_deptwiseconv2dnative(const ITensor *src, + const ITensor *weights, + const ITensor *bias, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) { return run_depthwise_float(src, weights, bias, dst, window, has_biases, info); } -} +} // namespace cpu } // namespace arm_compute #endif //__ARM_FEATURE_FP16_VECTOR_ARITHMETIC diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp index b2333a33341090e6f568e1c14b2a0057366fdd76..c6fa4790b7b61f0cae38041b8b5a7ba2584dc8e5 100644 --- a/src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp +++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp @@ -26,10 +26,15 @@ namespace arm_compute { namespace cpu { -void neon_fp32_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info) +void neon_fp32_deptwiseconv2dnative(const ITensor *src, + const ITensor *weights, + const ITensor *bias, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) { return run_depthwise_float(src, weights, bias, dst, window, has_biases, info); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp index f12825477140ab74fd7a001c2b999f38b6197747..d08e9739682648e1f86fe76a3bff6d1e3759e4f2 100644 --- a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp +++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp @@ -22,71 +22,16 @@ * SOFTWARE. 
*/ #include "src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h" + #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/function_info/ConvolutionInfo.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { namespace cpu { -namespace -{ -constexpr auto data_layout = DataLayout::NHWC; -const size_t width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); -const size_t height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); -const size_t channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - -constexpr auto dim_manual_loop = Window::Dimension(0, 0, 0); -constexpr auto dim_single_unit_step = Window::Dimension(0, 1, 1); -constexpr size_t vector_size = 8; - -struct DepthwiseConvolutionRunInfo -{ - const size_t num_read_elements_per_iteration; - const uint32_t x_start; - const uint32_t x_end; - const uint32_t x_step; - const uint32_t x_leftover_start; - const size_t input_stride_y; - const size_t input_stride_z; - const size_t input_max_offset; - const size_t weights_width; - const size_t weights_height; - const size_t weights_stride_y; - const size_t weights_stride_z; - const size_t conv_stride_x; - const size_t conv_stride_y; - const size_t conv_pad_left; - const size_t conv_pad_top; - const size_t input_height; - const size_t input_width; - const size_t input_depth; - - DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1) // NOLINT - : num_read_elements_per_iteration((depth_multiplier == 1 ? (vector_size / element_size_from_data_type(input.data_type())) : 1)), - x_start(w.x().start()), - x_end(w.x().end()), - x_step(static_cast(num_read_elements_per_iteration * depth_multiplier)), - x_leftover_start(std::max(static_cast(w.x().end() + 1) - static_cast(x_step), int32_t(0))), - input_stride_y(input.strides_in_bytes().y()), - input_stride_z(input.strides_in_bytes().z()), - input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()), - weights_width(weights.dimension(width_idx)), - weights_height(weights.dimension(height_idx)), - weights_stride_y(weights.strides_in_bytes().y()), - weights_stride_z(weights.strides_in_bytes().z()), - conv_stride_x(conv_info.stride().first), - conv_stride_y(conv_info.stride().second), - conv_pad_left(conv_info.pad_left()), - conv_pad_top(conv_info.pad_top()), - input_height(input.dimension(height_idx)), - input_width(input.dimension(width_idx)), - input_depth(input.dimension(channel_idx)) - { - } -}; - inline int32x4_t saturating_doubling_high_mul(const int32x4_t &a, const int32_t &b) { return vqrdmulhq_n_s32(a, b); @@ -119,216 +64,19 @@ inline int32_t rounding_divide_by_exp2(const int32_t &x, const int exponent) return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0); } -inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, uint32_t h, const DepthwiseConvolutionRunInfo &run_info, const Size2D &dilation) -{ - const int32_t current_h = base_h + h * dilation.y(); - const bool is_valid_h = current_h >= 0 && current_h < static_cast(run_info.input_height); - - const int32_t current_w = base_w + w * dilation.x(); - const bool is_valid_w = current_w >= 0 && current_w < static_cast(run_info.input_width); - - return is_valid_h && is_valid_w; -} - -template -void depthwise_loop_multiplier1_fp(const 
ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, - const Size2D &dilation, const Window &window, bool has_biases) -{ - constexpr auto element_per_vector = vector_size / sizeof(T); - using VectorType = typename wrapper::traits::neon_vector::type; - using TagType = typename wrapper::traits::neon_vector::tag_type; - - const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window); - - const VectorType zero_vector = wrapper::vdup_n(static_cast(0), TagType{}); - - Window execution_window = window; - execution_window.set(Window::DimX, dim_single_unit_step); - - Window win_input = window; - win_input.set(Window::DimX, dim_manual_loop); - win_input.set(Window::DimY, dim_manual_loop); - win_input.set(Window::DimZ, dim_manual_loop); - - Window win_weights = win_input; - win_weights.set(Window::DimW, dim_manual_loop); - - Window win_output = window; - win_output.set(Window::DimX, dim_manual_loop); - - Iterator input_it(src, win_input); - Iterator weights_it(weights, win_weights); - Iterator output_it(dst, win_output); - Iterator biases_it{}; - - if(has_biases) - { - biases_it = Iterator(biases, win_weights); - } - - execute_window_loop(execution_window, [&](const Coordinates & id) - { - const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - - auto const base_weights_ptr = weights_it.ptr(); - uint32_t x = run_info.x_start; - - for(; x < run_info.x_leftover_start; x += run_info.x_step) - { - VectorType acc = zero_vector; - auto weights_ptr = base_weights_ptr; - int64_t input_offset = base_input_offset; - - for(uint32_t h = 0; h < run_info.weights_height; ++h) - { - int64_t offs = input_offset + x * sizeof(T); - for(uint32_t w = 0; w < run_info.weights_width; ++w) - { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_vals = is_valid_region ? - wrapper::vload(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : - zero_vector; - const auto weights_vals = wrapper::vload(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); - acc = wrapper::vmla(acc, weights_vals, input_vals); - - offs += dilation.x() * run_info.input_stride_y; - } - - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - if(has_biases) - { - const auto biases_vals = wrapper::vload(reinterpret_cast(biases_it.ptr()) + x); - acc = wrapper::vadd(acc, biases_vals); - } - - wrapper::vstore(reinterpret_cast(output_it.ptr()) + x, acc); - } - - for(; x < run_info.x_end; ++x) - { - auto acc_scalar = T{ 0 }; - auto weights_ptr = base_weights_ptr; - int64_t input_offset = base_input_offset; - - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int64_t offs = input_offset + x * sizeof(T); - for(size_t w = 0; w < run_info.weights_width; ++w) - { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_vals = is_valid_region ? 
*reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset)) : 0; - const auto weights_vals = *(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); - - acc_scalar += (input_vals * weights_vals); - - offs += dilation.x() * run_info.input_stride_y; - } - - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - if(has_biases) - { - const auto biases_vals = *(reinterpret_cast(biases_it.ptr()) + x); - acc_scalar += biases_vals; - } - *(reinterpret_cast(output_it.ptr()) + x) = acc_scalar; - } - }, - input_it, weights_it, biases_it, output_it); -} - -template -void depthwise_loop_generic_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, - const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases) +namespace { - const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); - - Window execution_window = window; - execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); - - Window win_input = execution_window; - win_input.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); - win_input.set(Window::DimY, dim_manual_loop); - win_input.set(Window::DimZ, dim_manual_loop); - - Window win_weights = window; - win_weights.set_dimension_step(Window::DimX, run_info.x_step); - win_weights.set(Window::DimY, dim_manual_loop); - win_weights.set(Window::DimZ, dim_manual_loop); - win_weights.set(Window::DimW, dim_manual_loop); - - Window win_output = window; - win_output.set_dimension_step(Window::DimX, run_info.x_step); - - Iterator input_it(src, win_input); - Iterator weights_it(weights, win_weights); - Iterator output_it(dst, win_output); - Iterator biases_it{}; - - if(has_biases) - { - biases_it = Iterator(biases, win_weights); - } - - execute_window_loop(execution_window, [&](const Coordinates & id) - { - std::vector acc(depth_multiplier, static_cast(0)); - - const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - - auto weights_ptr = weights_it.ptr(); - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int offs = input_offset; - for(size_t w = 0; w < run_info.weights_width; ++w) - { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_val = is_valid_region ? 
*(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : T(0); - - for(size_t m = 0; m < depth_multiplier; ++m) - { - const auto weights_val = *(reinterpret_cast(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); - acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m)); - } - - offs += dilation.x() * run_info.input_stride_y; - } - - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - if(has_biases) - { - for(size_t m = 0; m < depth_multiplier; ++m) - { - const auto biases_val = *(reinterpret_cast(biases_it.ptr() + m * sizeof(T))); - *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val; - } - } - else - { - for(size_t m = 0; m < depth_multiplier; ++m) - { - *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = acc.at(m); - } - } - }, - input_it, weights_it, biases_it, output_it); -} - template -void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, - const Size2D &dilation, std::vector output_multiplier, std::vector output_shift, const Window &window, bool has_biases) // NOLINT +void depthwise_loop_multiplier1_quantized(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const PadStrideInfo &conv_info, + const Size2D &dilation, + std::vector output_multiplier, + std::vector output_shift, + const Window &window, + bool has_biases) // NOLINT { ARM_COMPUTE_UNUSED(output_multiplier, output_shift); constexpr auto element_per_vector = vector_size / sizeof(T); @@ -337,7 +85,8 @@ void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *wei using AccType = int32_t; using AccArrayType = std::array; - const auto out_of_bound_value = PixelValue(static_cast(0), src->info()->data_type(), src->info()->quantization_info()).get(); + const auto out_of_bound_value = + PixelValue(static_cast(0), src->info()->data_type(), src->info()->quantization_info()).get(); const auto out_of_bound_vector = wrapper::vdup_n(static_cast(out_of_bound_value), TagType{}); const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window); @@ -366,152 +115,175 @@ void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *wei Iterator output_it(dst, win_output); Iterator biases_it{}; - if(has_biases) + if (has_biases) { biases_it = Iterator(biases, win_weights); } - execute_window_loop(execution_window, [&](const Coordinates & id) - { - const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - auto const base_weights_ptr = weights_it.ptr(); - size_t x = run_info.x_start; - - for(; x < run_info.x_leftover_start; x += run_info.x_step) + execute_window_loop( + execution_window, + [&](const Coordinates &id) { - AccArrayType acc{}; - AccArrayType in_sum{}; - AccArrayType we_sum{}; - - auto weights_ptr = base_weights_ptr; - auto input_offset = base_input_offset; + const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + auto const base_weights_ptr = 
weights_it.ptr(); + size_t x = run_info.x_start; - for(size_t h = 0; h < run_info.weights_height; ++h) + for (; x < run_info.x_leftover_start; x += run_info.x_step) { - int64_t offs = input_offset + x * sizeof(T); - for(size_t w = 0; w < run_info.weights_width; ++w) - { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_vals = is_valid_region ? - wrapper::vload(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : - out_of_bound_vector; - const auto weights_vals = wrapper::vload(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); + AccArrayType acc{}; + AccArrayType in_sum{}; + AccArrayType we_sum{}; - for(size_t i = 0; i < element_per_vector; ++i) + auto weights_ptr = base_weights_ptr; + auto input_offset = base_input_offset; + + for (size_t h = 0; h < run_info.weights_height; ++h) + { + int64_t offs = input_offset + x * sizeof(T); + for (size_t w = 0; w < run_info.weights_width; ++w) { - acc.at(i) += input_vals[i] * weights_vals[i]; - in_sum.at(i) += input_vals[i]; - we_sum.at(i) += weights_vals[i]; + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_vals = + is_valid_region + ? wrapper::vload(reinterpret_cast( + input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) + : out_of_bound_vector; + const auto weights_vals = + wrapper::vload(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); + + for (size_t i = 0; i < element_per_vector; ++i) + { + acc.at(i) += input_vals[i] * weights_vals[i]; + in_sum.at(i) += input_vals[i]; + we_sum.at(i) += weights_vals[i]; + } + + offs += dilation.x() * run_info.input_stride_y; } - offs += dilation.x() * run_info.input_stride_y; + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; } - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } + VectorType out_vals = wrapper::vdup_n(static_cast(0), TagType{}); + for (size_t i = 0; i < element_per_vector; ++i) + { + acc.at(i) -= in_sum.at(i) * weights_qoffset; + acc.at(i) -= we_sum.at(i) * input_qoffset; + acc.at(i) += k_offset; - VectorType out_vals = wrapper::vdup_n(static_cast(0), TagType{}); - for(size_t i = 0; i < element_per_vector; ++i) - { - acc.at(i) -= in_sum.at(i) * weights_qoffset; - acc.at(i) -= we_sum.at(i) * input_qoffset; - acc.at(i) += k_offset; + if (has_biases) + { + acc.at(i) += *(reinterpret_cast(biases_it.ptr() + i * sizeof(int32_t)) + x); + } - if(has_biases) - { - acc.at(i) += *(reinterpret_cast(biases_it.ptr() + i * sizeof(int32_t)) + x); + const int32_t out_mul = output_multiplier.at(x + i); + const int32_t out_shift = output_shift.at(x + i); + if (out_shift < 0) + { + acc.at(i) = + saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset; + } + else + { + acc.at(i) = + rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + + output_qoffset; + } + out_vals[i] = static_cast(utility::clamp(acc.at(i))); } - const int32_t out_mul = output_multiplier.at(x + i); - const int32_t out_shift = output_shift.at(x + i); - if(out_shift < 0) - { - acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset; - } - else - { - acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset; - } - out_vals[i] = 
static_cast(utility::clamp(acc.at(i))); + wrapper::vstore(reinterpret_cast(output_it.ptr()) + x, out_vals); } - wrapper::vstore(reinterpret_cast(output_it.ptr()) + x, out_vals); - } - - // left-over - for(; x < run_info.x_end; ++x) - { - AccType acc = 0; - AccType in_sum = 0; - AccType we_sum = 0; + // left-over + for (; x < run_info.x_end; ++x) + { + AccType acc = 0; + AccType in_sum = 0; + AccType we_sum = 0; - auto weights_ptr = base_weights_ptr; - auto input_offset = base_input_offset; + auto weights_ptr = base_weights_ptr; + auto input_offset = base_input_offset; - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int64_t offs = input_offset + x * sizeof(T); - for(size_t w = 0; w < run_info.weights_width; ++w) + for (size_t h = 0; h < run_info.weights_height; ++h) { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_val = is_valid_region ? - *reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset)) : - out_of_bound_value; - const auto weights_val = *(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); - - acc += input_val * weights_val; - in_sum += input_val; - we_sum += weights_val; + int64_t offs = input_offset + x * sizeof(T); + for (size_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_val = + is_valid_region + ? *reinterpret_cast(input_it.ptr() + + std::min(static_cast(offs), run_info.input_max_offset)) + : out_of_bound_value; + const auto weights_val = + *(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); + + acc += input_val * weights_val; + in_sum += input_val; + we_sum += weights_val; + + offs += dilation.x() * run_info.input_stride_y; + } - offs += dilation.x() * run_info.input_stride_y; + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; } - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } + T out_vals{0}; - T out_vals{ 0 }; + acc -= in_sum * weights_qoffset; + acc -= we_sum * input_qoffset; + acc += k_offset; - acc -= in_sum * weights_qoffset; - acc -= we_sum * input_qoffset; - acc += k_offset; + if (has_biases) + { + acc += *(reinterpret_cast(biases_it.ptr()) + x); + } - if(has_biases) - { - acc += *(reinterpret_cast(biases_it.ptr()) + x); - } + const int32_t out_mul = output_multiplier.at(x); + const int32_t out_shift = output_shift.at(x); - const int32_t out_mul = output_multiplier.at(x); - const int32_t out_shift = output_shift.at(x); + if (out_shift < 0) + { + acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset; + } + else + { + acc = + rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset; + } - if(out_shift < 0) - { - acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset; - } - else - { - acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset; + out_vals = static_cast(utility::clamp(acc)); + *(reinterpret_cast(output_it.ptr()) + x) = out_vals; } - - out_vals = static_cast(utility::clamp(acc)); - *(reinterpret_cast(output_it.ptr()) + x) = out_vals; - } - }, - input_it, weights_it, biases_it, output_it); + }, + input_it, weights_it, biases_it, output_it); } template -void depthwise_loop_generic_quantized(const ITensor *src, const ITensor *weights, const 
ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, - const Size2D &dilation, unsigned int depth_multiplier, std::vector output_multiplier, std::vector output_shift, const Window &window, bool has_biases) // NOLINT +void depthwise_loop_generic_quantized(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier, + std::vector output_multiplier, + std::vector output_shift, + const Window &window, + bool has_biases) // NOLINT { using AccType = int32_t; - const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); + const auto run_info = + DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); - const auto out_of_bound_value = PixelValue(static_cast(0), src->info()->data_type(), src->info()->quantization_info()).get(); + const auto out_of_bound_value = + PixelValue(static_cast(0), src->info()->data_type(), src->info()->quantization_info()).get(); const int32_t input_qoffset = src->info()->quantization_info().uniform().offset; const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset; @@ -539,76 +311,93 @@ void depthwise_loop_generic_quantized(const ITensor *src, const ITensor *weights Iterator output_it(dst, win_output); Iterator biases_it{}; - if(has_biases) + if (has_biases) { biases_it = Iterator(biases, win_weights); } - execute_window_loop(execution_window, [&](const Coordinates & id) - { - std::vector acc(depth_multiplier, 0); - std::vector we_sum(depth_multiplier, 0); - AccType in_sum = 0; + execute_window_loop( + execution_window, + [&](const Coordinates &id) + { + std::vector acc(depth_multiplier, 0); + std::vector we_sum(depth_multiplier, 0); + AccType in_sum = 0; - const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - auto weights_ptr = weights_it.ptr(); - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int offs = input_offset; - for(size_t w = 0; w < run_info.weights_width; ++w) + auto weights_ptr = weights_it.ptr(); + for (size_t h = 0; h < run_info.weights_height; ++h) { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_val = is_valid_region ? *(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : out_of_bound_value; - - for(size_t m = 0; m < depth_multiplier; ++m) + int offs = input_offset; + for (size_t w = 0; w < run_info.weights_width; ++w) { - const auto weights_val = *(reinterpret_cast(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); - acc.at(m) += input_val * weights_val; - - we_sum.at(m) += weights_val; - } + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_val = + is_valid_region ? 
*(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), + run_info.input_max_offset))) + : out_of_bound_value; - offs += dilation.x() * run_info.input_stride_y; - in_sum += input_val; - } + for (size_t m = 0; m < depth_multiplier; ++m) + { + const auto weights_val = + *(reinterpret_cast(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); + acc.at(m) += input_val * weights_val; - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } + we_sum.at(m) += weights_val; + } - for(size_t m = 0; m < depth_multiplier; ++m) - { - acc.at(m) -= in_sum * weights_qoffset; - acc.at(m) -= we_sum.at(m) * input_qoffset; - acc.at(m) += k_offset; + offs += dilation.x() * run_info.input_stride_y; + in_sum += input_val; + } - if(has_biases) - { - acc.at(m) += *(reinterpret_cast(biases_it.ptr() + m * sizeof(int32_t))); + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; } - const int32_t out_mul = output_multiplier.at(id.x() * depth_multiplier + m); - const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m); - if(out_shift < 0) - { - acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset; - } - else + for (size_t m = 0; m < depth_multiplier; ++m) { - acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset; + acc.at(m) -= in_sum * weights_qoffset; + acc.at(m) -= we_sum.at(m) * input_qoffset; + acc.at(m) += k_offset; + + if (has_biases) + { + acc.at(m) += *(reinterpret_cast(biases_it.ptr() + m * sizeof(int32_t))); + } + + const int32_t out_mul = output_multiplier.at(id.x() * depth_multiplier + m); + const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m); + if (out_shift < 0) + { + acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset; + } + else + { + acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + + output_qoffset; + } + *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = + static_cast(utility::clamp(acc.at(m))); } - *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = static_cast(utility::clamp(acc.at(m))); - } - }, - input_it, weights_it, biases_it, output_it); + }, + input_it, weights_it, biases_it, output_it); } template -void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, - const Size2D &dilation, unsigned int depth_multiplier, std::vector output_multiplier, std::vector output_shift, const Window &window, bool has_biases) // NOLINT +void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier, + std::vector output_multiplier, + std::vector output_shift, + const Window &window, + bool has_biases) // NOLINT { constexpr int half_vec = vector_size / 2; @@ -617,11 +406,15 @@ void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor using AccVectorTagType = typename wrapper::traits::neon_vector::tag_type; using TagType = typename wrapper::traits::neon_vector::tag_type; - const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); + const auto run_info = + DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), 
conv_info, window, depth_multiplier); - const auto input_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast(src->info()->quantization_info().uniform().offset), TagType{}))); - const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast(weights->info()->quantization_info().uniform().offset), TagType{}))); - const auto output_qoffset_vec = wrapper::vdup_n(dst->info()->quantization_info().uniform().offset, arm_compute::wrapper::traits::vector_128_tag{}); + const auto input_qoffset_vec = wrapper::vreinterpret( + wrapper::vmovl(wrapper::vdup_n(static_cast(src->info()->quantization_info().uniform().offset), TagType{}))); + const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl( + wrapper::vdup_n(static_cast(weights->info()->quantization_info().uniform().offset), TagType{}))); + const auto output_qoffset_vec = wrapper::vdup_n(dst->info()->quantization_info().uniform().offset, + arm_compute::wrapper::traits::vector_128_tag{}); const auto lower = wrapper::vdup_n(static_cast(std::numeric_limits::lowest()), AccVectorTagType{}); const auto upper = wrapper::vdup_n(static_cast(std::numeric_limits::max()), AccVectorTagType{}); @@ -651,7 +444,7 @@ void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor Iterator output_it(dst, win_output); Iterator biases_it{}; - if(has_biases) + if (has_biases) { biases_it = Iterator(biases, win_weights); } @@ -659,118 +452,117 @@ void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor std::vector acc0(depth_multiplier / vector_size); std::vector acc1(depth_multiplier / vector_size); - execute_window_loop(execution_window, [&](const Coordinates & id) - { - std::fill(begin(acc0), end(acc0), zero); - std::fill(begin(acc1), end(acc1), zero); + execute_window_loop( + execution_window, + [&](const Coordinates &id) + { + std::fill(begin(acc0), end(acc0), zero); + std::fill(begin(acc1), end(acc1), zero); - const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - auto weights_ptr = weights_it.ptr(); - for(size_t h = 0; h < run_info.weights_height; ++h) - { - const int32_t current_h = input_z + h * dilation.y(); - if(current_h >= 0 && current_h < static_cast(run_info.input_height)) + auto weights_ptr = weights_it.ptr(); + for (size_t h = 0; h < run_info.weights_height; ++h) { - int offs = input_offset; - for(size_t w = 0; w < run_info.weights_width; ++w) + const int32_t current_h = input_z + h * dilation.y(); + if (current_h >= 0 && current_h < static_cast(run_info.input_height)) { - const int32_t current_w = input_y + w * dilation.x(); - if(current_w >= 0 && current_w < static_cast(run_info.input_width)) + int offs = input_offset; + for (size_t w = 0; w < run_info.weights_width; ++w) { - const auto input_8x8 = wrapper::vdup_n(*(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))), TagType{}); - const auto input_s16x8 = wrapper::vreinterpret(wrapper::vmovl(input_8x8)); - const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec); - - 
for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i) + const int32_t current_w = input_y + w * dilation.x(); + if (current_w >= 0 && current_w < static_cast(run_info.input_width)) { - const auto weights_8x8 = wrapper::vload(reinterpret_cast(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); - const auto weights_s16x8 = wrapper::vreinterpret(wrapper::vmovl(weights_8x8)); - const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec); - - acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs), wrapper::vgetlow(weights_no_offs)); - acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs), wrapper::vgethigh(weights_no_offs)); + const auto input_8x8 = wrapper::vdup_n( + *(reinterpret_cast( + input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))), + TagType{}); + const auto input_s16x8 = wrapper::vreinterpret(wrapper::vmovl(input_8x8)); + const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec); + + for (size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i) + { + const auto weights_8x8 = wrapper::vload(reinterpret_cast( + weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); + const auto weights_s16x8 = wrapper::vreinterpret(wrapper::vmovl(weights_8x8)); + const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec); + + acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs), + wrapper::vgetlow(weights_no_offs)); + acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs), + wrapper::vgethigh(weights_no_offs)); + } } - } - offs += dilation.x() * run_info.input_stride_y; + offs += dilation.x() * run_info.input_stride_y; + } } - } - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } - for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i) - { - if(has_biases) + for (size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i) { - const auto bias_val0 = wrapper::vloadq(reinterpret_cast(biases_it.ptr() + m * sizeof(int32_t))); - const auto bias_val1 = wrapper::vloadq(reinterpret_cast(biases_it.ptr() + (m + half_vec) * sizeof(int32_t))); + if (has_biases) + { + const auto bias_val0 = + wrapper::vloadq(reinterpret_cast(biases_it.ptr() + m * sizeof(int32_t))); + const auto bias_val1 = wrapper::vloadq( + reinterpret_cast(biases_it.ptr() + (m + half_vec) * sizeof(int32_t))); - acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0); - acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1); - } + acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0); + acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1); + } - if(out_shift < 0) - { - acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec); - acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec); - } - else - { - acc0.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift), output_qoffset_vec); - acc1.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift), output_qoffset_vec); - } + if (out_shift < 0) + { + acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul), + output_qoffset_vec); + acc1.at(i) = 
wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul), + output_qoffset_vec); + } + else + { + acc0.at(i) = wrapper::vadd( + rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift), + output_qoffset_vec); + acc1.at(i) = wrapper::vadd( + rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift), + output_qoffset_vec); + } - acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), lower), upper); - acc1.at(i) = wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper); + acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), lower), upper); + acc1.at(i) = wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper); - const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)), - wrapper::vmovn(acc1.at(i))); + const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)), wrapper::vmovn(acc1.at(i))); - if(std::is_same::value) - { - wrapper::vstore(reinterpret_cast(output_it.ptr() + m * sizeof(uint8_t)), wrapper::vqmovn(vreinterpretq_u16_s16(out_val))); - } - else - { - wrapper::vstore(reinterpret_cast(output_it.ptr() + m * sizeof(int8_t)), wrapper::vqmovn(out_val)); + if (std::is_same::value) + { + wrapper::vstore(reinterpret_cast(output_it.ptr() + m * sizeof(uint8_t)), + wrapper::vqmovn(vreinterpretq_u16_s16(out_val))); + } + else + { + wrapper::vstore(reinterpret_cast(output_it.ptr() + m * sizeof(int8_t)), + wrapper::vqmovn(out_val)); + } } - } - }, - input_it, weights_it, biases_it, output_it); + }, + input_it, weights_it, biases_it, output_it); } } // namespace -template -void run_depthwise_float(const ITensor *src, const ITensor *weights, const ITensor *biases, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info) -{ - PadStrideInfo conv_info = info.pad_stride_info; - unsigned int depth_multiplier = info.depth_multiplier; - Size2D dilation = info.dilation; - - if(depth_multiplier == 1) - { - depthwise_loop_multiplier1_fp(src, weights, biases, dst, conv_info, dilation, window, has_biases); - } - else - { - depthwise_loop_generic_fp(src, weights, biases, dst, conv_info, dilation, depth_multiplier, window, has_biases); - } -} -template void run_depthwise_float(const ITensor *src, const ITensor *weights, const ITensor *biases, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info); -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template void run_depthwise_float(const ITensor *src, const ITensor *weights, const ITensor *biases, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info); -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC template -void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, const ITensor *biases, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info) +void run_depthwise_quanitized8bit(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) { PadStrideInfo conv_info = info.pad_stride_info; unsigned int depth_multiplier = info.depth_multiplier; @@ -782,15 +574,15 @@ void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, co const auto output_scale = dst->info()->quantization_info().uniform().scale; auto weights_scale = weights->info()->quantization_info().scale(); - if(!is_data_type_quantized_per_channel(weights->info()->data_type())) + if (!is_data_type_quantized_per_channel(weights->info()->data_type())) { - for(size_t i = 1; i 
< weights->info()->dimension(channel_idx); ++i) + for (size_t i = 1; i < weights->info()->dimension(channel_idx); ++i) { weights_scale.push_back(weights_scale.front()); } } - for(const auto &s : weights_scale) + for (const auto &s : weights_scale) { int32_t out_mult = 0; int32_t out_shift = 0; @@ -801,30 +593,49 @@ void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, co output_shift.push_back(out_shift); } - if(depth_multiplier == 1) + if (depth_multiplier == 1) { - depthwise_loop_multiplier1_quantized(src, weights, biases, dst, conv_info, dilation, output_multiplier, output_shift, window, has_biases); + depthwise_loop_multiplier1_quantized(src, weights, biases, dst, conv_info, dilation, output_multiplier, + output_shift, window, has_biases); } else { const bool is_pow2 = ((depth_multiplier & (depth_multiplier - 1)) == 0); const bool is_quantized_per_tensor = !(is_data_type_quantized_per_channel(weights->info()->data_type())); - if(is_pow2 && is_quantized_per_tensor && depth_multiplier >= 8) + if (is_pow2 && is_quantized_per_tensor && depth_multiplier >= 8) { - depthwise_loop_pow2_quantized_per_tensor(src, weights, biases, dst, conv_info, dilation, depth_multiplier, output_multiplier, output_shift, window, has_biases); + depthwise_loop_pow2_quantized_per_tensor(src, weights, biases, dst, conv_info, dilation, + depth_multiplier, output_multiplier, output_shift, window, + has_biases); } else { - depthwise_loop_generic_quantized(src, weights, biases, dst, conv_info, dilation, depth_multiplier, output_multiplier, output_shift, window, has_biases); + depthwise_loop_generic_quantized(src, weights, biases, dst, conv_info, dilation, depth_multiplier, + output_multiplier, output_shift, window, has_biases); } } } -template void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, const ITensor *biases, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info); -template void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, const ITensor *biases, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info); -template void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, const ITensor *biases, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info); +template void run_depthwise_quanitized8bit(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info); +template void run_depthwise_quanitized8bit(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info); +template void run_depthwise_quanitized8bit(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h index 1f01ce43d9fbdd9e8db3c93c3e8cb4891741ea58..3fa5c58c3cb29755e5767da32c4e542265f309ec 100644 --- a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h +++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h @@ -24,19 +24,350 @@ #ifndef SRC_CORE_KERNELS_DEPTWISECONV2DNATIVE_IMPL_H #define SRC_CORE_KERNELS_DEPTWISECONV2DNATIVE_IMPL_H #include "arm_compute/core/Helpers.h" + +#include "src/core/NEON/wrapper/wrapper.h" + 
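For orientation: every quantized depthwise loop above (the multiplier-1, generic, and pow2 per-tensor variants) feeds its int32 accumulator through saturating_doubling_high_mul and rounding_divide_by_exp2 before clamping to the 8-bit output range. The scalar reference below is an illustrative sketch of that gemmlowp-style requantization step; it is not part of the patch, and the *_ref helper names are made up for the example.

// Illustrative scalar sketch of the fixed-point requantization used by the
// quantized depthwise loops. Not part of the patch; *_ref names are hypothetical.
#include <algorithm>
#include <cstdint>
#include <limits>

// (a * b * 2) >> 32 with round-to-nearest; saturates the single overflow case.
inline int32_t saturating_doubling_high_mul_ref(int32_t a, int32_t b)
{
    if (a == std::numeric_limits<int32_t>::min() && b == std::numeric_limits<int32_t>::min())
    {
        return std::numeric_limits<int32_t>::max();
    }
    const int64_t ab    = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    const int64_t nudge = (ab >= 0) ? (int64_t(1) << 30) : (1 - (int64_t(1) << 30));
    return static_cast<int32_t>((ab + nudge) / (int64_t(1) << 31));
}

// x / 2^exponent with round-to-nearest, i.e. a rounding right shift.
inline int32_t rounding_divide_by_exp2_ref(int32_t x, int exponent)
{
    const int32_t mask      = (int32_t(1) << exponent) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
    return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
}

// Requantize one accumulator to unsigned 8 bits, mirroring the per-channel
// (out_mul, out_shift) handling in the loops above.
inline uint8_t requantize_u8_ref(int32_t acc, int32_t out_mul, int32_t out_shift, int32_t out_offset)
{
    const int32_t scaled =
        (out_shift < 0)
            ? saturating_doubling_high_mul_ref(acc * (1 << (-out_shift)), out_mul)
            : rounding_divide_by_exp2_ref(saturating_doubling_high_mul_ref(acc, out_mul), out_shift);
    return static_cast<uint8_t>(std::min(255, std::max(0, scaled + out_offset)));
}

The negative out_shift branch pre-multiplies the accumulator by 2^(-out_shift) because the effective rescale factor is greater than one in that case; otherwise the rounding right shift is applied after the high multiply.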
namespace arm_compute { struct ConvolutionInfo; namespace cpu { +constexpr auto data_layout = DataLayout::NHWC; +const size_t width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); +const size_t height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); +const size_t channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + +constexpr auto dim_manual_loop = Window::Dimension(0, 0, 0); +constexpr auto dim_single_unit_step = Window::Dimension(0, 1, 1); +constexpr size_t vector_size = 8; + +struct DepthwiseConvolutionRunInfo +{ + const size_t num_read_elements_per_iteration; + const uint32_t x_start; + const uint32_t x_end; + const uint32_t x_step; + const uint32_t x_leftover_start; + const size_t input_stride_y; + const size_t input_stride_z; + const size_t input_max_offset; + const size_t weights_width; + const size_t weights_height; + const size_t weights_stride_y; + const size_t weights_stride_z; + const size_t conv_stride_x; + const size_t conv_stride_y; + const size_t conv_pad_left; + const size_t conv_pad_top; + const size_t input_height; + const size_t input_width; + const size_t input_depth; + + DepthwiseConvolutionRunInfo(const ITensorInfo &input, + const ITensorInfo &weights, + const PadStrideInfo &conv_info, + const Window &w, + uint32_t depth_multiplier = 1) // NOLINT + : num_read_elements_per_iteration( + (depth_multiplier == 1 ? (vector_size / element_size_from_data_type(input.data_type())) : 1)), + x_start(w.x().start()), + x_end(w.x().end()), + x_step(static_cast(num_read_elements_per_iteration * depth_multiplier)), + x_leftover_start(std::max(static_cast(w.x().end() + 1) - static_cast(x_step), int32_t(0))), + input_stride_y(input.strides_in_bytes().y()), + input_stride_z(input.strides_in_bytes().z()), + input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - + (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()), + weights_width(weights.dimension(width_idx)), + weights_height(weights.dimension(height_idx)), + weights_stride_y(weights.strides_in_bytes().y()), + weights_stride_z(weights.strides_in_bytes().z()), + conv_stride_x(conv_info.stride().first), + conv_stride_y(conv_info.stride().second), + conv_pad_left(conv_info.pad_left()), + conv_pad_top(conv_info.pad_top()), + input_height(input.dimension(height_idx)), + input_width(input.dimension(width_idx)), + input_depth(input.dimension(channel_idx)) + { + } +}; + +inline bool is_valid_input_region(int32_t base_w, + uint32_t base_h, + uint32_t w, + uint32_t h, + const DepthwiseConvolutionRunInfo &run_info, + const Size2D &dilation) +{ + const int32_t current_h = base_h + h * dilation.y(); + const bool is_valid_h = current_h >= 0 && current_h < static_cast(run_info.input_height); + + const int32_t current_w = base_w + w * dilation.x(); + const bool is_valid_w = current_w >= 0 && current_w < static_cast(run_info.input_width); + + return is_valid_h && is_valid_w; +} + +template +void depthwise_loop_multiplier1_fp(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const PadStrideInfo &conv_info, + const Size2D &dilation, + const Window &window, + bool has_biases) +{ + constexpr auto element_per_vector = vector_size / sizeof(T); + using VectorType = typename wrapper::traits::neon_vector::type; + using TagType = typename wrapper::traits::neon_vector::tag_type; + + const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, 
window); + + const VectorType zero_vector = wrapper::vdup_n(static_cast(0), TagType{}); + + Window execution_window = window; + execution_window.set(Window::DimX, dim_single_unit_step); + + Window win_input = window; + win_input.set(Window::DimX, dim_manual_loop); + win_input.set(Window::DimY, dim_manual_loop); + win_input.set(Window::DimZ, dim_manual_loop); + + Window win_weights = win_input; + win_weights.set(Window::DimW, dim_manual_loop); + + Window win_output = window; + win_output.set(Window::DimX, dim_manual_loop); + + Iterator input_it(src, win_input); + Iterator weights_it(weights, win_weights); + Iterator output_it(dst, win_output); + Iterator biases_it{}; + + if (has_biases) + { + biases_it = Iterator(biases, win_weights); + } + + execute_window_loop( + execution_window, + [&](const Coordinates &id) + { + const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + + auto const base_weights_ptr = weights_it.ptr(); + uint32_t x = run_info.x_start; + + for (; x < run_info.x_leftover_start; x += run_info.x_step) + { + VectorType acc = zero_vector; + auto weights_ptr = base_weights_ptr; + int64_t input_offset = base_input_offset; + + for (uint32_t h = 0; h < run_info.weights_height; ++h) + { + int64_t offs = input_offset + x * sizeof(T); + for (uint32_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_vals = + is_valid_region + ? wrapper::vload(reinterpret_cast( + input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) + : zero_vector; + const auto weights_vals = + wrapper::vload(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); + acc = wrapper::vmla(acc, weights_vals, input_vals); + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + if (has_biases) + { + const auto biases_vals = wrapper::vload(reinterpret_cast(biases_it.ptr()) + x); + acc = wrapper::vadd(acc, biases_vals); + } + + wrapper::vstore(reinterpret_cast(output_it.ptr()) + x, acc); + } + + for (; x < run_info.x_end; ++x) + { + auto acc_scalar = T{0}; + auto weights_ptr = base_weights_ptr; + int64_t input_offset = base_input_offset; + + for (size_t h = 0; h < run_info.weights_height; ++h) + { + int64_t offs = input_offset + x * sizeof(T); + for (size_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_vals = + is_valid_region + ? 
*reinterpret_cast(input_it.ptr() + + std::min(static_cast(offs), run_info.input_max_offset)) + : 0; + const auto weights_vals = + *(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); + + acc_scalar += (input_vals * weights_vals); + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + if (has_biases) + { + const auto biases_vals = *(reinterpret_cast(biases_it.ptr()) + x); + acc_scalar += biases_vals; + } + *(reinterpret_cast(output_it.ptr()) + x) = acc_scalar; + } + }, + input_it, weights_it, biases_it, output_it); +} + +template +void depthwise_loop_generic_fp(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier, + const Window &window, + bool has_biases) +{ + const auto run_info = + DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); + + Window execution_window = window; + execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); + + Window win_input = execution_window; + win_input.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); + win_input.set(Window::DimY, dim_manual_loop); + win_input.set(Window::DimZ, dim_manual_loop); + + Window win_weights = window; + win_weights.set_dimension_step(Window::DimX, run_info.x_step); + win_weights.set(Window::DimY, dim_manual_loop); + win_weights.set(Window::DimZ, dim_manual_loop); + win_weights.set(Window::DimW, dim_manual_loop); + + Window win_output = window; + win_output.set_dimension_step(Window::DimX, run_info.x_step); + + Iterator input_it(src, win_input); + Iterator weights_it(weights, win_weights); + Iterator output_it(dst, win_output); + Iterator biases_it{}; + + if (has_biases) + { + biases_it = Iterator(biases, win_weights); + } + + execute_window_loop( + execution_window, + [&](const Coordinates &id) + { + std::vector acc(depth_multiplier, static_cast(0)); + + const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + + auto weights_ptr = weights_it.ptr(); + for (size_t h = 0; h < run_info.weights_height; ++h) + { + int offs = input_offset; + for (size_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_val = + is_valid_region ? 
*(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), + run_info.input_max_offset))) + : T(0); + + for (size_t m = 0; m < depth_multiplier; ++m) + { + const auto weights_val = + *(reinterpret_cast(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); + acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m)); + } + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + if (has_biases) + { + for (size_t m = 0; m < depth_multiplier; ++m) + { + const auto biases_val = *(reinterpret_cast(biases_it.ptr() + m * sizeof(T))); + *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val; + } + } + else + { + for (size_t m = 0; m < depth_multiplier; ++m) + { + *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = acc.at(m); + } + } + }, + input_it, weights_it, biases_it, output_it); +} + template -void run_depthwise_float(const ITensor *src, const ITensor *weights, const ITensor *biases, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info); +void run_depthwise_float(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) +{ + PadStrideInfo conv_info = info.pad_stride_info; + unsigned int depth_multiplier = info.depth_multiplier; + Size2D dilation = info.dilation; + + if (depth_multiplier == 1) + { + depthwise_loop_multiplier1_fp(src, weights, biases, dst, conv_info, dilation, window, has_biases); + } + else + { + depthwise_loop_generic_fp(src, weights, biases, dst, conv_info, dilation, depth_multiplier, window, + has_biases); + } +} template -void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, const ITensor *biases, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info); +void run_depthwise_quanitized8bit(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp index 1bf7ad70076f93def830f975fe0df3f2da023bed..d32847c1e8ee3d9bbfc69bbf3313e05af0ea237f 100644 --- a/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp @@ -26,16 +26,26 @@ namespace arm_compute { namespace cpu { -void neon_qu8_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info) +void neon_qu8_deptwiseconv2dnative(const ITensor *src, + const ITensor *weights, + const ITensor *bias, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) { return run_depthwise_quanitized8bit(src, weights, bias, dst, window, has_biases, info); } -void neon_qp8_qu8_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info) +void neon_qp8_qu8_deptwiseconv2dnative(const ITensor *src, + const ITensor *weights, + const ITensor *bias, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) { return run_depthwise_quanitized8bit(src, weights, bias, dst, window, has_biases, info); } -} +} // 
namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp index 58f7536064fa89203e09fa5d86649f479488976b..682fad0bda763f0dec71f01582ccf5bf7e6c04ea 100644 --- a/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp @@ -26,16 +26,26 @@ namespace arm_compute { namespace cpu { -void neon_qs8_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info) +void neon_qs8_deptwiseconv2dnative(const ITensor *src, + const ITensor *weights, + const ITensor *bias, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) { return run_depthwise_quanitized8bit(src, weights, bias, dst, window, has_biases, info); } -void neon_qp8_qs8_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info) +void neon_qp8_qs8_deptwiseconv2dnative(const ITensor *src, + const ITensor *weights, + const ITensor *bias, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) { return run_depthwise_quanitized8bit(src, weights, bias, dst, window, has_biases, info); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/depthwiseconv2d/list.h b/src/cpu/kernels/depthwiseconv2d/list.h index 44f055d6a9df110cd3f385bc0e12b9a78e1d1ee2..cf80608f4fe0da4a64271f7cc2d7d69327799e60 100644 --- a/src/cpu/kernels/depthwiseconv2d/list.h +++ b/src/cpu/kernels/depthwiseconv2d/list.h @@ -27,9 +27,9 @@ namespace arm_compute { namespace cpu { -#define DECLARE_DEPTHWISECONV2D_KERNEL(func_name) \ - void func_name(const ITensor *src, const ITensor *weights, const ITensor *bias, \ - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info) +#define DECLARE_DEPTHWISECONV2D_KERNEL(func_name) \ + void func_name(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, \ + const Window &window, bool has_biases, const ConvolutionInfo &info) DECLARE_DEPTHWISECONV2D_KERNEL(neon_qu8_deptwiseconv2dnative); DECLARE_DEPTHWISECONV2D_KERNEL(neon_qs8_deptwiseconv2dnative); DECLARE_DEPTHWISECONV2D_KERNEL(neon_fp16_deptwiseconv2dnative); diff --git a/src/cpu/kernels/directconv2d/impl.h b/src/cpu/kernels/directconv2d/impl.h new file mode 100644 index 0000000000000000000000000000000000000000..d3965326cdf57117279ad3dcc898ef2b8e9af290 --- /dev/null +++ b/src/cpu/kernels/directconv2d/impl.h @@ -0,0 +1,389 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_DIRECTCONV2D_IMPL_H +#define ACL_SRC_CPU_KERNELS_DIRECTCONV2D_IMPL_H + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +template +void linearize_volume_nchw(const uint8_t *const in_ptr, + T *out_ptr, + bool has_bias, + int top_left_x, + int top_left_y, + int kernel_width, + int kernel_height, + int kernel_depth, + int input_w, + int input_h, + int input_stride_x, + int input_stride_y, + int input_stride_z, + int pad_value, + int dilation_x, + int dilation_y) +{ + const int kernel_size2 = kernel_width * kernel_height; + const int x_e = top_left_x + kernel_width * dilation_x; + const int y_e = top_left_y + kernel_height * dilation_y; + + // Linearize volume + int d = 0; + // This for loop linearize a volume with 3 slices. This allows: + // 1) to reduce the iterations of the outer for loop "d" + // 2) to have an optimized im2col for the first convolution layer where usually we have 3 IFMs + for (; d <= (kernel_depth - 3); d += 3) + { + for (int y = top_left_y; y < y_e; y += dilation_y) + { + if ((y < 0 || y >= input_h) && has_pads) + { + // All the values will be the offset (will be zeros when not quantized) + for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) + { + *(out_ptr + 0 * kernel_size2) = pad_value; + *(out_ptr + 1 * kernel_size2) = pad_value; + *(out_ptr + 2 * kernel_size2) = pad_value; + } + } + else + { + for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) + { + if ((x < 0 || x >= input_w) && has_pads) + { + *(out_ptr + 0 * kernel_size2) = pad_value; + *(out_ptr + 1 * kernel_size2) = pad_value; + *(out_ptr + 2 * kernel_size2) = pad_value; + } + else + { + *(out_ptr + 0 * kernel_size2) = *(reinterpret_cast( + in_ptr + ((d + 0) * input_stride_z + y * input_stride_y + x * input_stride_x))); + *(out_ptr + 1 * kernel_size2) = *(reinterpret_cast( + in_ptr + ((d + 1) * input_stride_z + y * input_stride_y + x * input_stride_x))); + *(out_ptr + 2 * kernel_size2) = *(reinterpret_cast( + in_ptr + ((d + 2) * input_stride_z + y * input_stride_y + x * input_stride_x))); + } + } + } + } + out_ptr += 2 * kernel_size2; + } + + // Left over + for (; d < kernel_depth; d++) + { + for (int y = top_left_y; y < y_e; y += dilation_y) + { + if ((y < 0 || y >= input_h) && has_pads) + { + // All the values will be the offset (will be zeros when not quantized) + memset(static_cast(out_ptr), pad_value, kernel_width * sizeof(T)); + out_ptr += kernel_width; + } + else + { + for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) + { + if ((x < 0 || x >= input_w) && has_pads) + { + *out_ptr = pad_value; + } + else + { + *out_ptr = *(reinterpret_cast( 
+ in_ptr + (d * input_stride_z + y * input_stride_y + x * input_stride_x))); + } + } + } + } + } + + // Append 1 if the convolution layer has biases + if (has_bias) + { + *out_ptr = static_cast(1); + } +} + +template +void linearize_volume_nhwc(const uint8_t *const in_ptr, + T *out_ptr, + bool has_bias, + int start_x, + int start_y, + int kernel_width, + int kernel_height, + int input_w, + int input_h, + int input_c, + int input_stride_y, + int input_stride_z, + int pad_value, + int dilation_x, + int dilation_y) +{ + const int end_x = start_x + kernel_width * dilation_x; + const int end_y = start_y + kernel_height * dilation_y; + const int pad_quant = kernel_width * input_c; + const int element_size = static_cast(sizeof(T)); + if ((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) && + (input_stride_y == input_c * element_size)) + { + for (int y = start_y; y < end_y; y += dilation_y) + { + //optimized for no dilation and no boundary pixels + memcpy(out_ptr, reinterpret_cast(in_ptr + (y * input_stride_z + start_x * input_stride_y)), + input_c * kernel_width * element_size); + out_ptr += input_c * kernel_width; + } + } + else + { + for (int y = start_y; y < end_y; y += dilation_y) + { + if (y < 0 || y >= input_h) + { + memset(static_cast(out_ptr), pad_value, pad_quant * element_size); + out_ptr += pad_quant; + } + else if (dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != input_c * element_size) + { + for (int x = start_x; x < end_x; x += dilation_x) + { + if (x < 0 || x >= input_w) + { + memset(static_cast(out_ptr), pad_value, input_c * element_size); + out_ptr += input_c; + } + else + { + memcpy(out_ptr, reinterpret_cast(in_ptr + (y * input_stride_z + x * input_stride_y)), + input_c * element_size); + out_ptr += input_c; + } + } + } + else + { + //optimized for no dilation and no boundary pixels + memcpy(out_ptr, reinterpret_cast(in_ptr + (y * input_stride_z + start_x * input_stride_y)), + input_c * kernel_width * element_size); + out_ptr += input_c * kernel_width; + } + } + } + // Append 1 if the convolution layer has biases + if (has_bias) + { + *out_ptr = static_cast(1); + } +} + +template +void linearize_volume_nhwc(const uint8_t *const in_ptr, + T *out_ptr, + bool has_bias, + int start_x, + int start_y, + int kernel_width, + int kernel_height, + int input_w, + int input_h, + int input_c, + int input_stride_y, + int input_stride_z, + int pad_value, + int dilation_x, + int dilation_y, + int pad_right) +{ + const int end_x = start_x + kernel_width * dilation_x; + const int end_y = start_y + kernel_height * dilation_y; + const int pad_quant = kernel_width * (input_c + pad_right); + const int element_size = static_cast(sizeof(T)); + const int channel_chunk_size = input_c * element_size; + + if ((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) && + (input_stride_y == channel_chunk_size)) + { + for (int y = start_y; y < end_y; y += dilation_y) + { + const uint8_t *offset_ptr = in_ptr + (y * input_stride_z + start_x * input_stride_y); + for (int e = 0; e < kernel_width; e++) + { + memcpy(out_ptr, reinterpret_cast(offset_ptr + e * channel_chunk_size), channel_chunk_size); + out_ptr += input_c + pad_right; + } + } + } + else + { + for (int y = start_y; y < end_y; y += dilation_y) + { + if (y < 0 || y >= input_h) + { + memset(static_cast(out_ptr), pad_value, pad_quant * element_size); + out_ptr += pad_quant; + } + else if (dilation_x > 1 || start_x < 0 || end_x >= input_w || 
input_stride_y != channel_chunk_size) + { + for (int x = start_x; x < end_x; x += dilation_x) + { + if (x < 0 || x >= input_w) + { + memset(static_cast(out_ptr), pad_value, (input_c + pad_right) * element_size); + out_ptr += input_c + pad_right; + } + else + { + memcpy(out_ptr, reinterpret_cast(in_ptr + (y * input_stride_z + x * input_stride_y)), + channel_chunk_size); + out_ptr += input_c + pad_right; + } + } + } + else + { + const uint8_t *offset_ptr = in_ptr + (y * input_stride_z + start_x * input_stride_y); + for (int e = 0; e < kernel_width; e++) + { + memcpy(out_ptr, reinterpret_cast(offset_ptr + e * channel_chunk_size), + channel_chunk_size); + out_ptr += input_c + pad_right; + } + } + } + } + // Append 1 if the convolution layer has biases + if (has_bias) + { + *out_ptr = static_cast(1); + } +} + +template +void run_im2col(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + const int input_w = src->info()->dimension(width_idx); + const int input_h = src->info()->dimension(height_idx); + const int input_c = src->info()->dimension(channel_idx); + const int input_stride_x = src->info()->strides_in_bytes().x(); + const int input_stride_y = src->info()->strides_in_bytes().y(); + const int input_stride_z = src->info()->strides_in_bytes().z(); + const int pad_left = conv_info.pad_left(); + const int pad_top = conv_info.pad_top(); + const int stride_x = conv_info.stride().first; + const int stride_y = conv_info.stride().second; + const int pad_value = + is_data_type_quantized(src->info()->data_type()) ? 
src->info()->quantization_info().uniform().offset : 0; + + const auto kernel_width = kernel_dims.width; + const auto kernel_height = kernel_dims.height; + + Window window_in_out(window); + // The first three dimensions of the input and output are increased by the inner loops + window_in_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + window_in_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + window_in_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + // Create iterators + Iterator in(src, window_in_out); + Iterator out(dst, window_in_out); + + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int start_w = id[width_idx] * stride_x - pad_left; + const int start_h = id[height_idx] * stride_y - pad_top; + + // Get pointers + const uint8_t *const input_ptr = in.ptr(); + auto output_ptr = + reinterpret_cast(out.ptr() + (id[width_idx] + id[height_idx] * convolved_dims.first) * + dst->info()->strides_in_bytes().y()); + + // Linearize volume + if (is_nchw) + { + linearize_volume_nchw( + input_ptr, output_ptr, has_bias, start_w, start_h, kernel_width, kernel_height, input_c, input_w, + input_h, input_stride_x, input_stride_y, input_stride_z, pad_value, dilation.x(), dilation.y()); + } + else + { + if (input_pad_right > 0) + { + linearize_volume_nhwc(input_ptr, output_ptr, has_bias, start_w, start_h, kernel_width, + kernel_height, input_w, input_h, input_c, input_stride_y, + input_stride_z, pad_value, dilation.x(), dilation.y(), + input_pad_right); + } + else + { + linearize_volume_nhwc(input_ptr, output_ptr, has_bias, start_w, start_h, kernel_width, + kernel_height, input_w, input_h, input_c, input_stride_y, + input_stride_z, pad_value, dilation.x(), dilation.y()); + } + } + }, + in, out); +} + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_DIRECTCONV2D_IMPL_H diff --git a/src/cpu/kernels/directconv2d/list.h b/src/cpu/kernels/directconv2d/list.h index 9a0472643da08026e579d410f0d02cadbaf8d842..e3ff46b1481a62cec337bdef260d45c00a100310 100644 --- a/src/cpu/kernels/directconv2d/list.h +++ b/src/cpu/kernels/directconv2d/list.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,27 +21,57 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
 */
-#ifndef SRC_CORE_NEON_KERNELS_CONV2D_LIST_H
-#define SRC_CORE_NEON_KERNELS_CONV2D_LIST_H
+#ifndef ACL_SRC_CPU_KERNELS_DIRECTCONV2D_LIST_H
+#define ACL_SRC_CPU_KERNELS_DIRECTCONV2D_LIST_H
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
 
 #include "src/core/common/Registrars.h"
 
+#include <cstdint>
+
 namespace arm_compute
 {
 namespace cpu
 {
 namespace kernels
 {
-#define DECLARE_DIRECT_CONV2D_KERNEL(func_name) \
-    void func_name(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+#define DECLARE_DIRECT_CONV2D_KERNEL(func_name)                                                     \
+    void func_name(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, \
+                   const PadStrideInfo &conv_info)
 
 DECLARE_DIRECT_CONV2D_KERNEL(neon_fp32_nhwc_directconv2d);
 DECLARE_DIRECT_CONV2D_KERNEL(neon_fp16_nchw_directconv2d);
 DECLARE_DIRECT_CONV2D_KERNEL(neon_fp32_nchw_directconv2d);
 
+#define DECLARE_IM2COL_KERNEL(func_name)                                                                  \
+    void func_name(const ITensor *src, ITensor *dst, const Window &window, DataLayout data_layout,       \
+                   const PadStrideInfo &conv_info, std::pair<unsigned int, unsigned int> convolved_dims, \
+                   const Size2D &kernel_dims, const Size2D &dilation, uint32_t input_pad_right, bool has_bias)
+
+DECLARE_IM2COL_KERNEL(run_im2col_fp32_nchw_pad);
+DECLARE_IM2COL_KERNEL(run_im2col_fp32_nchw_nopad);
+DECLARE_IM2COL_KERNEL(run_im2col_fp16_nchw_pad);
+DECLARE_IM2COL_KERNEL(run_im2col_fp16_nchw_nopad);
+DECLARE_IM2COL_KERNEL(run_im2col_bf16_nchw_pad);
+DECLARE_IM2COL_KERNEL(run_im2col_bf16_nchw_nopad);
+DECLARE_IM2COL_KERNEL(run_im2col_qasymm8_nchw_pad);
+DECLARE_IM2COL_KERNEL(run_im2col_qasymm8_nchw_nopad);
+
+DECLARE_IM2COL_KERNEL(run_im2col_fp32_pad);
+DECLARE_IM2COL_KERNEL(run_im2col_fp32_nopad);
+DECLARE_IM2COL_KERNEL(run_im2col_fp16_pad);
+DECLARE_IM2COL_KERNEL(run_im2col_fp16_nopad);
+DECLARE_IM2COL_KERNEL(run_im2col_bf16_pad);
+DECLARE_IM2COL_KERNEL(run_im2col_bf16_nopad);
+
 #undef DECLARE_DIRECT_CONV2D_KERNEL
+#undef DECLARE_IM2COL_KERNEL
 } // namespace kernels
 } // namespace cpu
 } // namespace arm_compute
-#endif //SRC_CORE_NEON_KERNELS_CONV2D_LIST_H
+#endif // ACL_SRC_CPU_KERNELS_DIRECTCONV2D_LIST_H
diff --git a/src/cpu/kernels/directconv2d/nchw/all.cpp b/src/cpu/kernels/directconv2d/nchw/all.cpp
index a719fa50d6a72d76dacb14546e23039d8b541ad6..84f5eeff5a73e6bc7075abe109dbf3db526960b6 100644
--- a/src/cpu/kernels/directconv2d/nchw/all.cpp
+++ b/src/cpu/kernels/directconv2d/nchw/all.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -22,18 +22,20 @@
 * SOFTWARE.
*/ -#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h" - -#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" -#include "src/core/NEON/wrapper/wrapper.h" - #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/directconv2d/impl.h" +#include "src/cpu/kernels/directconv2d/list.h" +#include "src/cpu/kernels/directconv2d/nchw/impl.h" +#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h" #include @@ -43,136 +45,102 @@ namespace cpu { namespace kernels { -template -void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); - -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -void neon_fp16_nchw_directconv2d(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +void neon_fp32_nchw_directconv2d( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) { - convolve_nchw(window, src, weights, dst, conv_info); + convolve_nchw(window, src, weights, dst, conv_info); } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ -void neon_fp32_nchw_directconv2d(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +void run_im2col_fp32_nchw_pad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) { - convolve_nchw(window, src, weights, dst, conv_info); + arm_compute::cpu::kernels::run_im2col(src, dst, window, data_layout, conv_info, convolved_dims, + kernel_dims, dilation, input_pad_right, has_bias); } -template -void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +void run_im2col_fp32_nchw_nopad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) { - ARM_COMPUTE_UNUSED(conv_info); - - // Declare useful types - using vtype = wrapper::traits::neon_bitvector; - using vector_type = typename vtype::type; - using tag_type = typename vtype::tag_type; - - // Scalar quantities - const int element_size = src->info()->element_size(); - const int input_stride_w = src->info()->strides_in_bytes()[0] / element_size; - const int input_stride_h = src->info()->strides_in_bytes()[1] / element_size; - const int input_stride_c = src->info()->strides_in_bytes()[2] / element_size; - const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size; - - const int input_dim_w = src->info()->dimension(0); - const int input_dim_h = src->info()->dimension(1); - - const int output_stride_c = dst->info()->strides_in_bytes()[2]; - - const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().x() / element_size; - const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().y() / element_size; 
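The scalar quantities at the start of convolve_nchw are obtained by dividing strides_in_bytes() by the element size, so that the later pointer arithmetic can be done directly on typed pointers instead of raw bytes. A small self-contained example of that conversion for a packed NCHW float tensor follows; the shape and numbers are invented purely for illustration.

#include <cstdio>

int main()
{
    const int W = 7, H = 5, C = 3;
    const int element_size = static_cast<int>(sizeof(float));

    // Packed byte strides, innermost dimension first (x = W, y = H, z = C, w = batch).
    const int stride_bytes_x = element_size;
    const int stride_bytes_y = W * stride_bytes_x;
    const int stride_bytes_z = H * stride_bytes_y;
    const int stride_bytes_n = C * stride_bytes_z;

    // Dividing by the element size yields strides usable with T* arithmetic,
    // like input_stride_w/h/c/n in the kernel.
    std::printf("element strides: w=%d h=%d c=%d n=%d\n",
                stride_bytes_x / element_size, stride_bytes_y / element_size,
                stride_bytes_z / element_size, stride_bytes_n / element_size);
    return 0;
}

With padded (non-packed) tensors the byte strides are larger than the shape alone would suggest, which is why the kernel reads them from the tensor info rather than recomputing them from the dimensions.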
- const unsigned int kernel_stride_c = weights->info()->strides_in_bytes().z() / element_size; - - const int kernel_dim_w = weights->info()->dimension(0); - const int kernel_dim_h = weights->info()->dimension(1); - - const int conv_pad_top = conv_info.pad_top(); - const int conv_pad_left = conv_info.pad_left(); - const int conv_stride_w = std::get<0>(conv_info.stride()); - const int conv_stride_h = std::get<1>(conv_info.stride()); - - // Setup input window for the output iterator - Window window_out = window; - window_out.set(Window::DimZ, Window::Dimension(0, 1, 1)); - - // Setup input window for the weights iterator - Window window_w = calculate_max_window(*weights->info(), Steps()); - window_w.set(Window::DimX, Window::Dimension(0, 1, 1)); - window_w.set(Window::DimY, Window::Dimension(0, 1, 1)); - window_w.set(Window::DimZ, Window::Dimension(0, 1, 1)); - - Iterator out(dst, window_out); - Iterator wei(weights, window_w); - - constexpr int num_elems_read_per_iteration = 16 / sizeof(T); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - // We are computing the theoretical starting input starting points - const int in_w_start_t = static_cast(id.x()) * conv_stride_w - conv_pad_left; - const int in_h_start_t = static_cast(id.y()) * conv_stride_h - conv_pad_top; - const int in_w_end_t = in_w_start_t + kernel_dim_w; - const int in_h_end_t = in_h_start_t + kernel_dim_h; - - // We are computing the valid initial and ending input points by checking the borders - const int in_w_start = std::max(in_w_start_t, 0); - const int in_h_start = std::max(in_h_start_t, 0); - const int in_w_end = std::min(in_w_end_t, input_dim_w); - const int in_h_end = std::min(in_h_end_t, input_dim_h); - - // We use the input points to select the valid weight points to use - const int wei_w_start = in_w_start - in_w_start_t; - const int wei_h_start = in_h_start - in_h_start_t; - const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); - - const int index_c_end = weights->info()->dimension(2); - const T *const in_ptr_start = reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n; - execute_window_loop(window_w, [&](const Coordinates & id_w) - { - const T *const weights_ptr_start = reinterpret_cast(wei.ptr()); - uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; - T out_temp = static_cast(0); - - for(int index_wei_c = 0, index_in_c = 0; index_wei_c < index_c_end; ++index_wei_c, ++index_in_c) - { - const T *const in_ptr_row_0 = in_ptr_start + index_in_c * input_stride_c; - const T *const weights_ptr_row_0 = weights_ptr_start + index_wei_c * kernel_stride_c; - for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h) - { - const T *in_ptr_row = in_ptr_row_0 + index_in_h * input_stride_h; - const T *weights_ptr_row = weights_ptr_row_0 + index_wei_h * kernel_stride_h; - int index_w = in_w_start; - int index_wei_w = wei_w_start; - vector_type out_temp_vec = wrapper::vdup_n(static_cast(0), tag_type()); - for(; index_w <= ((in_w_end - num_elems_read_per_iteration)); index_w += num_elems_read_per_iteration, index_wei_w += num_elems_read_per_iteration) - { - const auto src_vec = wrapper::vloadq(in_ptr_row + index_w * input_stride_w); - const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wei_w * kernel_stride_w); - out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); - } - out_temp += vreduce(out_temp_vec); - for(; index_w < in_w_end; ++index_w, ++index_wei_w) - { - const auto src_val = 
*(in_ptr_row + index_w * input_stride_w); - const auto w_val = *(weights_ptr_row + index_wei_w * kernel_stride_w); - out_temp += src_val * w_val; - } - } - } - *(reinterpret_cast(out_ptr)) = out_temp; + arm_compute::cpu::kernels::run_im2col(src, dst, window, data_layout, conv_info, convolved_dims, + kernel_dims, dilation, input_pad_right, has_bias); +} - }, - wei); - }, - out); +void run_im2col_qasymm8_nchw_pad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ +void run_im2col_qasymm8_nchw_nopad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +} +#if defined(ARM_COMPUTE_ENABLE_BF16) +void run_im2col_bf16_nchw_pad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +} -template void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); +void run_im2col_bf16_nchw_nopad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +} +#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/directconv2d/nchw/fp16.cpp b/src/cpu/kernels/directconv2d/nchw/fp16.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a9cab42f56b0b3119a610d4bee29a1a2cfa76860 --- /dev/null +++ b/src/cpu/kernels/directconv2d/nchw/fp16.cpp @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/kernels/directconv2d/impl.h" +#include "src/cpu/kernels/directconv2d/nchw/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +void neon_fp16_nchw_directconv2d( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +{ + convolve_nchw(window, src, weights, dst, conv_info); +} +#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +void run_im2col_fp16_nchw_pad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + arm_compute::cpu::kernels::run_im2col( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +#else // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + ARM_COMPUTE_UNUSED(src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, + has_bias); +#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +} + +void run_im2col_fp16_nchw_nopad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + arm_compute::cpu::kernels::run_im2col( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +#else // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + ARM_COMPUTE_UNUSED(src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, + has_bias); +#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +} + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/directconv2d/nchw/impl.h b/src/cpu/kernels/directconv2d/nchw/impl.h new file mode 100644 index 
0000000000000000000000000000000000000000..6a5b175d9834fac417470509dbb181160f3710d7 --- /dev/null +++ b/src/cpu/kernels/directconv2d/nchw/impl.h @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_DIRECTCONV2D_NCHW_IMPL_H +#define ACL_SRC_CPU_KERNELS_DIRECTCONV2D_NCHW_IMPL_H + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +template +void convolve_nchw( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_UNUSED(conv_info); + + // Declare useful types + using vtype = wrapper::traits::neon_bitvector; + using vector_type = typename vtype::type; + using tag_type = typename vtype::tag_type; + + // Scalar quantities + const int element_size = src->info()->element_size(); + const int input_stride_w = src->info()->strides_in_bytes()[0] / element_size; + const int input_stride_h = src->info()->strides_in_bytes()[1] / element_size; + const int input_stride_c = src->info()->strides_in_bytes()[2] / element_size; + const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size; + + const int input_dim_w = src->info()->dimension(0); + const int input_dim_h = src->info()->dimension(1); + + const int output_stride_c = dst->info()->strides_in_bytes()[2]; + + const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().x() / element_size; + const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().y() / element_size; + const unsigned int kernel_stride_c = weights->info()->strides_in_bytes().z() / element_size; + + const int kernel_dim_w = weights->info()->dimension(0); + const int kernel_dim_h = weights->info()->dimension(1); + + const int conv_pad_top = conv_info.pad_top(); + const int conv_pad_left = conv_info.pad_left(); + const int conv_stride_w = std::get<0>(conv_info.stride()); + const int conv_stride_h = std::get<1>(conv_info.stride()); + + // Setup input window for the output iterator + Window window_out = 
window; + window_out.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + // Setup input window for the weights iterator + Window window_w = calculate_max_window(*weights->info(), Steps()); + window_w.set(Window::DimX, Window::Dimension(0, 1, 1)); + window_w.set(Window::DimY, Window::Dimension(0, 1, 1)); + window_w.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + Iterator out(dst, window_out); + Iterator wei(weights, window_w); + + constexpr int num_elems_read_per_iteration = 16 / sizeof(T); + + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // We are computing the theoretical starting input starting points + const int in_w_start_t = static_cast<int>(id.x()) * conv_stride_w - conv_pad_left; + const int in_h_start_t = static_cast<int>(id.y()) * conv_stride_h - conv_pad_top; + const int in_w_end_t = in_w_start_t + kernel_dim_w; + const int in_h_end_t = in_h_start_t + kernel_dim_h; + + // We are computing the valid initial and ending input points by checking the borders + const int in_w_start = std::max(in_w_start_t, 0); + const int in_h_start = std::max(in_h_start_t, 0); + const int in_w_end = std::min(in_w_end_t, input_dim_w); + const int in_h_end = std::min(in_h_end_t, input_dim_h); + + // We use the input points to select the valid weight points to use + const int wei_w_start = in_w_start - in_w_start_t; + const int wei_h_start = in_h_start - in_h_start_t; + const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); + + const int index_c_end = weights->info()->dimension(2); + const T *const in_ptr_start = + reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + + id[3] * input_stride_n; + execute_window_loop( + window_w, + [&](const Coordinates &id_w) + { + const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr()); + uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; + T out_temp = static_cast<T>(0); + + for (int index_wei_c = 0, index_in_c = 0; index_wei_c < index_c_end; ++index_wei_c, ++index_in_c) + { + const T *const in_ptr_row_0 = in_ptr_start + index_in_c * input_stride_c; + const T *const weights_ptr_row_0 = weights_ptr_start + index_wei_c * kernel_stride_c; + for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; + ++index_wei_h, ++index_in_h) + { + const T *in_ptr_row = in_ptr_row_0 + index_in_h * input_stride_h; + const T *weights_ptr_row = weights_ptr_row_0 + index_wei_h * kernel_stride_h; + int index_w = in_w_start; + int index_wei_w = wei_w_start; + vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type()); + for (; index_w <= ((in_w_end - num_elems_read_per_iteration)); + index_w += num_elems_read_per_iteration, index_wei_w += num_elems_read_per_iteration) + { + const auto src_vec = wrapper::vloadq(in_ptr_row + index_w * input_stride_w); + const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wei_w * kernel_stride_w); + out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); + } + out_temp += vreduce(out_temp_vec); + for (; index_w < in_w_end; ++index_w, ++index_wei_w) + { + const auto src_val = *(in_ptr_row + index_w * input_stride_w); + const auto w_val = *(weights_ptr_row + index_wei_w * kernel_stride_w); + out_temp += src_val * w_val; + } + } + } + *(reinterpret_cast<T *>(out_ptr)) = out_temp; + }, + wei); + }, + out); +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_DIRECTCONV2D_NCHW_IMPL_H diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/fp16.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/fp16.cpp new file mode
100644 index 0000000000000000000000000000000000000000..f78601544f5398d47f82b78803f867df78406920 --- /dev/null +++ b/src/cpu/kernels/directconv2d/nhwc/neon/fp16.cpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/kernels/directconv2d/impl.h" +#include "src/cpu/kernels/directconv2d/nchw/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +void run_im2col_fp16_pad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + arm_compute::cpu::kernels::run_im2col( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +#else // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + ARM_COMPUTE_UNUSED(src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, + has_bias); +#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +} + +void run_im2col_fp16_nopad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + arm_compute::cpu::kernels::run_im2col( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +#else // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + ARM_COMPUTE_UNUSED(src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, + has_bias); +#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp index 9982431de54b8691e920e321dca4bf2e4a10bde8..17d92122480605d85e78221c12bd39cc5ceaa6e0 100644 --- a/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp +++ 
b/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -30,10 +30,11 @@ namespace cpu { namespace kernels { -void neon_fp32_nhwc_directconv2d(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +void neon_fp32_nhwc_directconv2d( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) { convolve_nhwc(window, src, weights, dst, conv_info); } } // namespace kernels } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp index 500ad1b420183ae81a96fde553567d2a78ebdd0b..f235167e28222ac19052f81eaf027c5076e5a561 100644 --- a/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp +++ b/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp @@ -24,16 +24,16 @@ #include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h" -#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" -#include "src/core/NEON/wrapper/wrapper.h" - #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" +#include "src/core/NEON/wrapper/wrapper.h" #include @@ -49,12 +49,14 @@ namespace { bool have_zero_x_internal_padding(ITensorInfo *src, const ITensorInfo *weights) { - return (src->padding().left == 0 && weights->padding().left == 0 && src->padding().right == 0 && weights->padding().right == 0); -} + return (src->padding().left == 0 && weights->padding().left == 0 && src->padding().right == 0 && + weights->padding().right == 0); } +} // namespace template -void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +void convolve_nhwc( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) { // Declare useful types using vtype = wrapper::traits::neon_bitvector; @@ -97,7 +99,7 @@ void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weig constexpr int num_elems_read_per_iteration = 16 / sizeof(T); // nhwc optimized - if(have_zero_x_internal_padding(src->info(), weights->info())) + if (have_zero_x_internal_padding(src->info(), weights->info())) { // This function assumes that input and weights have not padding in channel @@ -114,138 +116,154 @@ void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weig * multiplication works on the correct input/weight elements. */ execute_window_loop( - window_out, [&](const Coordinates & id) - { - /* + window_out, + [&](const Coordinates &id) + { + /* * In here we create theoretical indexes which then we validate for both * inputs and weights. * As a reminder, this loop take each output point in NHW, C is treated * in the weights loop. 
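Both the NCHW and NHWC kernels in this patch handle borders the way the comments above describe: compute the theoretical input span for an output point, clamp it to the tensor extent, and shift the weight indices by however much was clipped off, so padded taps are skipped instead of read. A standalone sketch of that index math for the width dimension follows; the sizes are invented for illustration.

#include <algorithm>
#include <cstdio>

int main()
{
    const int input_dim_w = 8, kernel_dim_w = 3, conv_stride_w = 1, conv_pad_left = 1;

    for (int out_x = 0; out_x < 3; ++out_x)
    {
        const int in_w_start_t = out_x * conv_stride_w - conv_pad_left; // may be negative
        const int in_w_end_t   = in_w_start_t + kernel_dim_w;           // may run past the input

        // Clamp to the valid input region.
        const int in_w_start = std::max(in_w_start_t, 0);
        const int in_w_end   = std::min(in_w_end_t, input_dim_w);

        // Offset the weight range by the clipped amount so weights and inputs stay aligned.
        const int wei_w_start = in_w_start - in_w_start_t;
        const int wei_w_end   = kernel_dim_w - (in_w_end_t - in_w_end);

        std::printf("out_x=%d input [%d,%d) weights [%d,%d)\n", out_x, in_w_start, in_w_end,
                    wei_w_start, wei_w_end);
    }
    return 0;
}

For out_x = 0 with one column of left padding this prints input [0,2) and weights [1,3), so the first weight column is skipped rather than multiplied against padding.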
*/ - // We are computing the theoretical starting input starting points - const int in_w_start_t = static_cast(id.y()) * conv_stride_w - conv_pad_left; - const int in_h_start_t = static_cast(id.z()) * conv_stride_h - conv_pad_top; - const int in_w_end_t = in_w_start_t + kernel_dim_w; - const int in_h_end_t = in_h_start_t + kernel_dim_h; - - // We are computing the valid initial and ending input points by checking the borders - const int in_w_start = std::max(in_w_start_t, 0); - const int in_h_start = std::max(in_h_start_t, 0); - const int in_w_end = std::min(in_w_end_t, input_dim_w); - const int in_h_end = std::min(in_h_end_t, input_dim_h); - - // We use the input points to select the valid weight points to use - const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w; - const int index_h_start = in_h_start - in_h_start_t; - const int index_wc_end = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w; - const int index_h_end = kernel_dim_h - (in_h_end_t - in_h_end); - - execute_window_loop( - window_w, [&](const Coordinates & id_w) - { - /* + // We are computing the theoretical starting input starting points + const int in_w_start_t = static_cast(id.y()) * conv_stride_w - conv_pad_left; + const int in_h_start_t = static_cast(id.z()) * conv_stride_h - conv_pad_top; + const int in_w_end_t = in_w_start_t + kernel_dim_w; + const int in_h_end_t = in_h_start_t + kernel_dim_h; + + // We are computing the valid initial and ending input points by checking the borders + const int in_w_start = std::max(in_w_start_t, 0); + const int in_h_start = std::max(in_h_start_t, 0); + const int in_w_end = std::min(in_w_end_t, input_dim_w); + const int in_h_end = std::min(in_h_end_t, input_dim_h); + + // We use the input points to select the valid weight points to use + const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w; + const int index_h_start = in_h_start - in_h_start_t; + const int index_wc_end = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w; + const int index_h_end = kernel_dim_h - (in_h_end_t - in_h_end); + + execute_window_loop( + window_w, + [&](const Coordinates &id_w) + { + /* * This is the loop in the weights, and it goes along N (the batches) * As a reminder, the batches of the weights are translated into the * channels of the output */ - const T *in_ptr_row = reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) - + id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h; - const T *weights_ptr_row = reinterpret_cast(wei.ptr()) + index_h_start * kernel_stride_h; - uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; - - T out_temp = static_cast(0); - for(int index_h = index_h_start; index_h < index_h_end; ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h) - { - const T *in_ptr_mover = in_ptr_row; - int index_wc = index_wc_start; - vector_type out_temp_vec = wrapper::vdup_n(static_cast(0), tag_type()); - for(; index_wc <= index_wc_end - num_elems_read_per_iteration; index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration) - { - const auto src_vec = wrapper::vloadq(in_ptr_mover); - const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wc); - out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); - } - out_temp += vreduce(out_temp_vec); - for(; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover) - { - const auto src_val = *(in_ptr_mover); - const auto w_val = *(weights_ptr_row + index_wc); - out_temp += src_val * w_val; - 
} - } - *(reinterpret_cast(out_ptr)) = out_temp; + const T *in_ptr_row = + reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) + + id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h; + const T *weights_ptr_row = + reinterpret_cast(wei.ptr()) + index_h_start * kernel_stride_h; + uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; + + T out_temp = static_cast(0); + for (int index_h = index_h_start; index_h < index_h_end; + ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h) + { + const T *in_ptr_mover = in_ptr_row; + int index_wc = index_wc_start; + vector_type out_temp_vec = wrapper::vdup_n(static_cast(0), tag_type()); + for (; index_wc <= index_wc_end - num_elems_read_per_iteration; + index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration) + { + const auto src_vec = wrapper::vloadq(in_ptr_mover); + const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wc); + out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); + } + out_temp += vreduce(out_temp_vec); + for (; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover) + { + const auto src_val = *(in_ptr_mover); + const auto w_val = *(weights_ptr_row + index_wc); + out_temp += src_val * w_val; + } + } + *(reinterpret_cast(out_ptr)) = out_temp; + }, + wei); }, - wei); - }, - out); + out); } else // nhwc non optimized { execute_window_loop( - window_out, [&](const Coordinates & id) - { - // We are computing the theoretical starting input starting points - const int in_w_start_t = static_cast(id.y()) * conv_stride_w - conv_pad_left; - const int in_h_start_t = static_cast(id.z()) * conv_stride_h - conv_pad_top; - const int in_w_end_t = in_w_start_t + kernel_dim_w; - const int in_h_end_t = in_h_start_t + kernel_dim_h; - - // We are computing the valid initial and ending input points by checking the borders - const int in_w_start = std::max(in_w_start_t, 0); - const int in_h_start = std::max(in_h_start_t, 0); - const int in_w_end = std::min(in_w_end_t, input_dim_w); - const int in_h_end = std::min(in_h_end_t, input_dim_h); - - // We use the input points to select the valid weight points to use - const int wei_w_start = in_w_start - in_w_start_t; - const int wei_h_start = in_h_start - in_h_start_t; - const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end); - const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); - - const int index_c_end = weights->info()->dimension(0); - const T *const in_ptr_start = reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n; - - execute_window_loop( - window_w, [&](const Coordinates & id_w) + window_out, + [&](const Coordinates &id) { - const T *const weights_ptr_start = reinterpret_cast(wei.ptr()); - uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; - - T out_temp = static_cast(0); - for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h) - { - const T *const in_ptr_row = in_ptr_start + index_in_h * input_stride_h; - const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h; - for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w) + // We are computing the theoretical starting input starting points + const int in_w_start_t = static_cast(id.y()) * conv_stride_w - conv_pad_left; + const int in_h_start_t = static_cast(id.z()) * conv_stride_h - conv_pad_top; + const int in_w_end_t = 
in_w_start_t + kernel_dim_w; + const int in_h_end_t = in_h_start_t + kernel_dim_h; + + // We are computing the valid initial and ending input points by checking the borders + const int in_w_start = std::max(in_w_start_t, 0); + const int in_h_start = std::max(in_h_start_t, 0); + const int in_w_end = std::min(in_w_end_t, input_dim_w); + const int in_h_end = std::min(in_h_end_t, input_dim_h); + + // We use the input points to select the valid weight points to use + const int wei_w_start = in_w_start - in_w_start_t; + const int wei_h_start = in_h_start - in_h_start_t; + const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end); + const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); + + const int index_c_end = weights->info()->dimension(0); + const T *const in_ptr_start = + reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) + + id[3] * input_stride_n; + + execute_window_loop( + window_w, + [&](const Coordinates &id_w) { - const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w; - const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w; - int index_c = 0; - vector_type out_temp_vec = wrapper::vdup_n(static_cast(0), tag_type()); - for(; index_c <= index_c_end - num_elems_read_per_iteration; index_c += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration, weights_ptr_mover += num_elems_read_per_iteration) - { - const auto src_vec = wrapper::vloadq(in_ptr_mover); - const auto w_vec = wrapper::vloadq(weights_ptr_mover); - out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); - } - out_temp += vreduce(out_temp_vec); - for(; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover) + const T *const weights_ptr_start = reinterpret_cast(wei.ptr()); + uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; + + T out_temp = static_cast(0); + for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; + ++index_wei_h, ++index_in_h) { - const auto src_val = *(in_ptr_mover); - const auto w_val = *(weights_ptr_mover); - out_temp += src_val * w_val; + const T *const in_ptr_row = in_ptr_start + index_in_h * input_stride_h; + const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h; + for (int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; + ++index_wei_w, ++index_in_w) + { + const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w; + const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w; + int index_c = 0; + vector_type out_temp_vec = wrapper::vdup_n(static_cast(0), tag_type()); + for (; index_c <= index_c_end - num_elems_read_per_iteration; + index_c += num_elems_read_per_iteration, + in_ptr_mover += num_elems_read_per_iteration, + weights_ptr_mover += num_elems_read_per_iteration) + { + const auto src_vec = wrapper::vloadq(in_ptr_mover); + const auto w_vec = wrapper::vloadq(weights_ptr_mover); + out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); + } + out_temp += vreduce(out_temp_vec); + for (; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover) + { + const auto src_val = *(in_ptr_mover); + const auto w_val = *(weights_ptr_mover); + out_temp += src_val * w_val; + } + } } - } - } - *(reinterpret_cast(out_ptr)) = out_temp; + *(reinterpret_cast(out_ptr)) = out_temp; + }, + wei); }, - wei); - }, - out); + out); } } -template void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); +template 
void convolve_nhwc( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/impl.h b/src/cpu/kernels/directconv2d/nhwc/neon/impl.h index 3b26fcdf2954be4677b2be14f2dc8bd77790b78a..efb9ce8e2a678576c91288307a6f0de422bf0079 100644 --- a/src/cpu/kernels/directconv2d/nhwc/neon/impl.h +++ b/src/cpu/kernels/directconv2d/nhwc/neon/impl.h @@ -26,6 +26,7 @@ #define SRC_CORE_NEON_KERNELS_CONV2D_IMPL_H #include "arm_compute/core/ITensor.h" + #include "src/core/helpers/WindowHelpers.h" namespace arm_compute @@ -35,7 +36,8 @@ namespace cpu namespace kernels { template -void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); +void convolve_nhwc( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); } // namespace kernels } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/qasymm8.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/qasymm8.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4c6fbec63a26759092146d26c9a49e6230718a08 --- /dev/null +++ b/src/cpu/kernels/directconv2d/nhwc/neon/qasymm8.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
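The inner loops of convolve_nchw and convolve_nhwc share one accumulation shape: a 128-bit vector main loop built from wrapper::vloadq and wrapper::vmla, a horizontal reduction of the partial vector (vreduce), and a scalar tail for the leftover elements. The stand-in below shows that structure with plain C++ in place of the NEON wrappers so it runs on any host; it is a sketch of the pattern, not the kernel itself.

#include <array>
#include <cstdio>

int main()
{
    constexpr int num_elems_per_iteration = 4; // 16 bytes / sizeof(float)
    const float src[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
    const float wei[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
    const int   len     = 10;

    std::array<float, num_elems_per_iteration> acc{}; // plays the role of out_temp_vec
    float out_temp = 0.f;

    int i = 0;
    for (; i <= len - num_elems_per_iteration; i += num_elems_per_iteration)
    {
        for (int l = 0; l < num_elems_per_iteration; ++l) // one vmla on a 128-bit register
        {
            acc[l] += src[i + l] * wei[i + l];
        }
    }
    for (float lane : acc) // vreduce: fold the lanes into the scalar accumulator
    {
        out_temp += lane;
    }
    for (; i < len; ++i) // scalar leftover loop
    {
        out_temp += src[i] * wei[i];
    }
    std::printf("dot = %.1f\n", out_temp); // 55.0
    return 0;
}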
+ */ + +#include "src/cpu/kernels/directconv2d/impl.h" +#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +void run_im2col_qasymm8_pad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +} + +void run_im2col_qasymm8_nopad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp index 6091ef215e1f45493509c8b4f8a8da7fa7182721..9b4375f17cdeab223b9e1161bdd3ce9a12ecf949 100644 --- a/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp @@ -23,6 +23,7 @@ */ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h" namespace arm_compute @@ -35,14 +36,38 @@ void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITenso return elementwise_arithm_op>(in1, in2, out, window); } -template void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, 
+ ITensor *out, + const Window &window); +template void neon_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) @@ -50,12 +75,30 @@ void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor * return elementwise_comp_op_16(in1, in2, out, window); } -template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -} +template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +} // namespace cpu } // namespace arm_compute #endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp index 2d8fec91c55e5069cb8fa375c6806fdedc1a2137..53ccd89dccc9904c7e82eb0d2f967cd76a5d6928 100644 --- a/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
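The reflowed lines above are explicit template instantiations: the elementwise kernel is written once as a template over the operation, and one instantiation per ArithmeticOperation or ComparisonOperation is emitted so every operation gets its own specialised symbol in the library. A miniature, self-contained version of that pattern follows; the enum, kernel and signature are simplified stand-ins, not the library's.

#include <algorithm>
#include <cstdio>

enum class OpStub { ADD, SUB, MAX };

template <OpStub op>
void elementwise_stub(const float *a, const float *b, float *out, int n)
{
    for (int i = 0; i < n; ++i)
    {
        if (op == OpStub::ADD)      out[i] = a[i] + b[i];
        else if (op == OpStub::SUB) out[i] = a[i] - b[i];
        else                        out[i] = std::max(a[i], b[i]);
    }
}

// Explicit instantiations, mirroring the 'template void ...' lines in the diff.
template void elementwise_stub<OpStub::ADD>(const float *, const float *, float *, int);
template void elementwise_stub<OpStub::SUB>(const float *, const float *, float *, int);
template void elementwise_stub<OpStub::MAX>(const float *, const float *, float *, int);

int main()
{
    const float a[3] = {1, 2, 3}, b[3] = {3, 2, 1};
    float       out[3];
    elementwise_stub<OpStub::MAX>(a, b, out, 3);
    std::printf("%.0f %.0f %.0f\n", out[0], out[1], out[2]); // 3 2 3
    return 0;
}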
*/ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h" namespace arm_compute @@ -34,25 +35,67 @@ void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITenso return elementwise_arithm_op>(in1, in2, out, window); } -template void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template void neon_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { return elementwise_comp_op_32(in1, in2, out, window); } -template void neon_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -} +template void neon_fp32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void 
neon_fp32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/impl.h b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h index 98b154e8fdf2bff649b2bbad11c17344bd3ae8b2..98f7e8b9493a2be0cf668fb2be1ecb3e755105c8 100644 --- a/src/cpu/kernels/elementwise_binary/generic/neon/impl.h +++ b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h @@ -39,7 +39,7 @@ typename VectorType::type elementwise_arithm_op(const typename VectorType::type vec_type res = wrapper::vdup_n(static_cast(0), tag_type{}); - switch(op) + switch (op) { case ArithmeticOperation::MAX: res = wrapper::vmax(a, b); @@ -71,7 +71,9 @@ typename VectorType::type elementwise_arithm_op(const typename VectorType::type } template -typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorType::type &a, const ScalarType &broadcast_value, const bool reorder) +typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorType::type &a, + const ScalarType &broadcast_value, + const bool reorder) { using tag_type = typename VectorType::tag_type; using vec_type = typename VectorType::type; @@ -81,10 +83,15 @@ typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorT } template -void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &), - int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool), - int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *)) +void elementwise_op( + const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window, + OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &), + int (*broadcast_func)( + int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool), + int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *)) { // Create input windows Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); @@ -99,7 +106,7 @@ void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const const auto window_end_x = static_cast(window.x().end()); const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? 
input2_win : input1_win; @@ -114,20 +121,26 @@ void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const InputScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_value, output_ptr, !is_broadcast_input_2); - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - const auto a = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, !is_broadcast_input_2 ? a : broadcast_value); - } - }, - broadcast_input, non_broadcast_input, output); + auto output_ptr = reinterpret_cast(output.ptr()); + const auto non_broadcast_input_ptr = + reinterpret_cast(non_broadcast_input.ptr()); + const InputScalarType broadcast_value = + *reinterpret_cast(broadcast_input.ptr()); + + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, + broadcast_value, output_ptr, !is_broadcast_input_2); + for (; x < window_end_x; ++x) + { + const auto a = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, + !is_broadcast_input_2 ? a : broadcast_value); + } + }, + broadcast_input, non_broadcast_input, output); } else { @@ -139,21 +152,23 @@ void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Iterator input2(in2, input2_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr); - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - const auto a = *(input1_ptr + x); - const auto b = *(input2_ptr + x); - *(output_ptr + x) = (*scalar_func)(a, b); - } - }, - input1, input2, output); + auto output_ptr = reinterpret_cast(output.ptr()); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr); + for (; x < window_end_x; ++x) + { + const auto a = *(input1_ptr + x); + const auto b = *(input2_ptr + x); + *(output_ptr + x) = (*scalar_func)(a, b); + } + }, + input1, input2, output); } } @@ -162,7 +177,7 @@ inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const Scalar { auto res = ScalarType(0); - switch(op) + switch (op) { case ArithmeticOperation::MAX: res = std::max(a, b); @@ -183,10 +198,10 @@ inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const Scalar case ArithmeticOperation::DIV: { res = a / b; - if(std::is_integral::value) + if (std::is_integral::value) { res = (b == 0) ? 
0 : res; - if(static_cast(a) % static_cast(b) != 0 && ((a < 0) != (b < 0))) + if (static_cast(a) % static_cast(b) != 0 && ((a < 0) != (b < 0))) { --res; } @@ -205,43 +220,56 @@ inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const Scalar } template <> -inline int32x4_t elementwise_arithm_op>(const int32x4_t &a, const int32x4_t &b) +inline int32x4_t +elementwise_arithm_op>(const int32x4_t &a, + const int32x4_t &b) { return vcvtq_s32_f32(vfloorq_f32(wrapper::vdiv(vcvtq_f32_s32(a), vcvtq_f32_s32(b)))); } template <> -inline float32x4_t elementwise_arithm_op>(const float32x4_t &a, const float32x4_t &b) +inline float32x4_t +elementwise_arithm_op>(const float32x4_t &a, + const float32x4_t &b) { return wrapper::vdiv(a, b); } template <> -inline float32x4_t elementwise_arithm_op>(const float32x4_t &a, const float32x4_t &b) +inline float32x4_t +elementwise_arithm_op>(const float32x4_t &a, + const float32x4_t &b) { return wrapper::vpow(a, b); } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC template <> -inline float16x8_t elementwise_arithm_op>(const float16x8_t &a, const float16x8_t &b) +inline float16x8_t elementwise_arithm_op>( + const float16x8_t &a, const float16x8_t &b) { return wrapper::vdiv(a, b); } template <> -inline float16x8_t elementwise_arithm_op>(const float16x8_t &a, const float16x8_t &b) +inline float16x8_t +elementwise_arithm_op>( + const float16x8_t &a, const float16x8_t &b) { return wrapper::vpow(a, b); } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC template -inline int elementwise_arithm_op_loop(int window_start_x, int window_end_x, int window_step_x, - const ScalarType *input1_ptr, const ScalarType *input2_ptr, ScalarType *output_ptr) +inline int elementwise_arithm_op_loop(int window_start_x, + int window_end_x, + int window_step_x, + const ScalarType *input1_ptr, + const ScalarType *input2_ptr, + ScalarType *output_ptr) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { const auto a = wrapper::vloadq(input1_ptr + x); const auto b = wrapper::vloadq(input2_ptr + x); @@ -251,14 +279,20 @@ inline int elementwise_arithm_op_loop(int window_start_x, int window_end_x, int } template -inline int elementwise_arithm_op_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, - const ScalarType *non_broadcast_input_ptr, const ScalarType &broadcast_value, ScalarType *output_ptr, const bool reorder) +inline int elementwise_arithm_op_broadcast_loop(int window_start_x, + int window_end_x, + int window_step_x, + const ScalarType *non_broadcast_input_ptr, + const ScalarType &broadcast_value, + ScalarType *output_ptr, + const bool reorder) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { const auto a = wrapper::vloadq((non_broadcast_input_ptr + x)); - wrapper::vstore(output_ptr + x, elementwise_arithm_op_broadcast(a, broadcast_value, reorder)); + wrapper::vstore(output_ptr + x, + elementwise_arithm_op_broadcast(a, broadcast_value, reorder)); } return x; } @@ -268,10 +302,10 @@ void elementwise_arithm_op(const ITensor *in1, const ITensor *in2, ITensor *out, { using scalar_type = typename VectorType::scalar_type; - elementwise_op(in1, in2, out, window, - &elementwise_arithm_op_scalar, - &elementwise_arithm_op_broadcast_loop, - &elementwise_arithm_op_loop); + elementwise_op( + in1, in2, out, window, &elementwise_arithm_op_scalar, + 
&elementwise_arithm_op_broadcast_loop, + &elementwise_arithm_op_loop); } template @@ -279,7 +313,7 @@ inline uint8_t elementwise_comp_op_scalar(const InputScalarType &a, const InputS { bool res = false; - switch(op) + switch (op) { case ComparisonOperation::Equal: res = (a == b); @@ -308,9 +342,9 @@ inline uint8_t elementwise_comp_op_scalar(const InputScalarType &a, const InputS template inline OutputVectorType elementwise_comp_op(const InputVectorType &a, const InputVectorType &b) { - OutputVectorType res = { 0, 0, 0, 0 }; + OutputVectorType res = {0, 0, 0, 0}; - switch(op) + switch (op) { case ComparisonOperation::Equal: res = wrapper::vceq(a, b); @@ -338,53 +372,75 @@ inline OutputVectorType elementwise_comp_op(const InputVectorType &a, const Inpu } template -inline OutputVectorType elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder) +inline OutputVectorType +elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder) { InputVectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag()); - return elementwise_comp_op(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector); + return elementwise_comp_op(reorder ? broadcast_vector : a, + reorder ? a : broadcast_vector); } template -inline int elementwise_comp_op_broadcast_8_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) +inline int elementwise_comp_op_broadcast_8_loop(int window_start_x, + int window_end_x, + int window_step_x, + const InputScalarType *non_broadcast_input_ptr, + const InputScalarType &broadcast_value, + uint8_t *output_ptr, + const bool reorder) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const auto a = elementwise_comp_op_broadcast(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); + const auto a = elementwise_comp_op_broadcast( + wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); wrapper::vstore(output_ptr + x, a); } return x; } template -inline int elementwise_comp_op_broadcast_16_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) +inline int elementwise_comp_op_broadcast_16_loop(int window_start_x, + int window_end_x, + int window_step_x, + const InputScalarType *non_broadcast_input_ptr, + const InputScalarType &broadcast_value, + uint8_t *output_ptr, + const bool reorder) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const auto a = elementwise_comp_op_broadcast(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); + const auto a = elementwise_comp_op_broadcast( + wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); wrapper::vstore(output_ptr + x, wrapper::vmovn(a)); } return x; } template -inline int elementwise_comp_op_broadcast_32_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) +inline int 
elementwise_comp_op_broadcast_32_loop(int window_start_x, + int window_end_x, + int window_step_x, + const InputScalarType *non_broadcast_input_ptr, + const InputScalarType &broadcast_value, + uint8_t *output_ptr, + const bool reorder) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const auto a = elementwise_comp_op_broadcast(wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder); - const auto b = elementwise_comp_op_broadcast(wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder); + const auto a = elementwise_comp_op_broadcast( + wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder); + const auto b = elementwise_comp_op_broadcast( + wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder); wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(a), wrapper::vmovn(b)))); } - if(x <= window_end_x - 4) + if (x <= window_end_x - 4) { - const auto a = elementwise_comp_op_broadcast(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); - for(int i = 0; i < 4; i++) + const auto a = elementwise_comp_op_broadcast( + wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); + for (int i = 0; i < 4; i++) { *(output_ptr + x + i) = wrapper::vgetlane(a, i); } @@ -394,11 +450,15 @@ inline int elementwise_comp_op_broadcast_32_loop(int window_start_x, int window_ } template -inline int elementwise_comp_op_8_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) +inline int elementwise_comp_op_8_loop(int window_start_x, + int window_end_x, + int window_step_x, + const InputScalarType *input1_ptr, + const InputScalarType *input2_ptr, + uint8_t *output_ptr) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { const auto a = wrapper::vloadq(input1_ptr + x); const auto b = wrapper::vloadq(input2_ptr + x); @@ -409,11 +469,15 @@ inline int elementwise_comp_op_8_loop(int window_start_x, int window_end_x, int } template -inline int elementwise_comp_op_16_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) +inline int elementwise_comp_op_16_loop(int window_start_x, + int window_end_x, + int window_step_x, + const InputScalarType *input1_ptr, + const InputScalarType *input2_ptr, + uint8_t *output_ptr) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { const auto a = wrapper::vloadq(input1_ptr + x); const auto b = wrapper::vloadq(input2_ptr + x); @@ -424,11 +488,15 @@ inline int elementwise_comp_op_16_loop(int window_start_x, int window_end_x, int } template -inline int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) +inline int elementwise_comp_op_32_loop(int window_start_x, + int window_end_x, + int window_step_x, + const InputScalarType *input1_ptr, + const InputScalarType *input2_ptr, + uint8_t *output_ptr) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - 
window_step_x); x += window_step_x) { auto a = wrapper::vloadq(input1_ptr + x); auto b = wrapper::vloadq(input2_ptr + x); @@ -438,12 +506,12 @@ inline int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int const auto res2 = elementwise_comp_op(a, b); wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(res), wrapper::vmovn(res2)))); } - if(x <= window_end_x - 4) + if (x <= window_end_x - 4) { const auto a = wrapper::vloadq(input1_ptr + x); const auto b = wrapper::vloadq(input2_ptr + x); const auto res = elementwise_comp_op(a, b); - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { *(output_ptr + x + i) = wrapper::vgetlane(res, i); } @@ -455,57 +523,59 @@ inline int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int template void elementwise_comp_op_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { - elementwise_op(in1, in2, out, window, - &elementwise_comp_op_scalar, - &elementwise_comp_op_broadcast_8_loop, - &elementwise_comp_op_8_loop); + elementwise_op( + in1, in2, out, window, &elementwise_comp_op_scalar, + &elementwise_comp_op_broadcast_8_loop, + &elementwise_comp_op_8_loop); } template void elementwise_comp_op_16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { - elementwise_op(in1, in2, out, window, - &elementwise_comp_op_scalar, - &elementwise_comp_op_broadcast_16_loop, - &elementwise_comp_op_16_loop); + elementwise_op( + in1, in2, out, window, &elementwise_comp_op_scalar, + &elementwise_comp_op_broadcast_16_loop, + &elementwise_comp_op_16_loop); } template void elementwise_comp_op_32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { - elementwise_op(in1, in2, out, window, - &elementwise_comp_op_scalar, - &elementwise_comp_op_broadcast_32_loop, - &elementwise_comp_op_32_loop); + elementwise_op( + in1, in2, out, window, &elementwise_comp_op_scalar, + &elementwise_comp_op_broadcast_32_loop, + &elementwise_comp_op_32_loop); } inline float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale) { - qasymm8x16_t x = vld1q_u8(input1_ptr); - const float32x4x4_t out = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale), - } - }; + qasymm8x16_t x = vld1q_u8(input1_ptr); + const float32x4x4_t out = {{ + vmulq_f32( + vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)), + scale), + vmulq_f32( + vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)), + scale), + vmulq_f32( + vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)), + scale), + vmulq_f32(vcvtq_f32_s32( + vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)), + scale), + }}; return out; } inline float32x4x4_t load_quantized_signed(const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale) { - qasymm8x16_signed_t x = vld1q_s8(input1_ptr); - const 
float32x4x4_t out = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale), - } - }; + qasymm8x16_signed_t x = vld1q_s8(input1_ptr); + const float32x4x4_t out = {{ + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale), + }}; return out; } @@ -523,17 +593,15 @@ inline void store_quantized(uint8_t *output_ptr, const int32x4x4_t &out) vst1q_u8(output_ptr, vcombine_u8(pa, pb)); } -inline void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale) +inline void +store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale) { - int32x4x4_t out = - { - { - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)), - } - }; + int32x4x4_t out = {{ + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)), + }}; store_quantized(output_ptr, out); } @@ -544,17 +612,17 @@ inline void store_quantized_signed(int8_t *output_ptr, const int32x4x4_t &out) vst1q_s8(output_ptr, vcombine_s8(pa, pb)); } -inline void store_quantized_signed(int8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale) +inline void store_quantized_signed(int8_t *output_ptr, + const float32x4x4_t &rf, + const float32x4_t &offset, + const float32x4_t &invscale) { - int32x4x4_t out = - { - { - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)), - } - }; + int32x4x4_t out = {{ + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)), + }}; store_quantized_signed(output_ptr, out); } @@ -565,7 +633,8 @@ inline uint8_t elementwise_arithm_op_quantized_scalar(const float &a, const floa } template -inline int8_t elementwise_arithm_op_quantized_signed_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo) +inline int8_t +elementwise_arithm_op_quantized_signed_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo) { return quantize_qasymm8_signed(elementwise_arithm_op_scalar(a, b), qinfo); } @@ -574,15 +643,12 @@ template float32x4x4_t elementwise_arithm_op(const float32x4x4_t &a, const float32x4x4_t &b) { using neon_vector_float = wrapper::traits::neon_vector; - 
float32x4x4_t out = - { - { - elementwise_arithm_op(a.val[0], b.val[0]), - elementwise_arithm_op(a.val[1], b.val[1]), - elementwise_arithm_op(a.val[2], b.val[2]), - elementwise_arithm_op(a.val[3], b.val[3]), - } - }; + float32x4x4_t out = {{ + elementwise_arithm_op(a.val[0], b.val[0]), + elementwise_arithm_op(a.val[1], b.val[1]), + elementwise_arithm_op(a.val[2], b.val[2]), + elementwise_arithm_op(a.val[3], b.val[3]), + }}; return out; } @@ -596,26 +662,29 @@ inline uint8_t elementwise_comp_op_quantized_scalar(const float &a, const float template inline uint32x4x4_t elementwise_comp_op(const float32x4x4_t &a, const float32x4x4_t &b) { - uint32x4x4_t out = - { - { - elementwise_comp_op(a.val[0], b.val[0]), - elementwise_comp_op(a.val[1], b.val[1]), - elementwise_comp_op(a.val[2], b.val[2]), - elementwise_comp_op(a.val[3], b.val[3]) - } - }; + uint32x4x4_t out = {{elementwise_comp_op(a.val[0], b.val[0]), + elementwise_comp_op(a.val[1], b.val[1]), + elementwise_comp_op(a.val[2], b.val[2]), + elementwise_comp_op(a.val[3], b.val[3])}}; return out; } template -inline int elementwise_arithm_op_quantized_loop(int window_start_x, int window_end_x, int window_step_x, - const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr, - int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, - float32x4_t voffseto, float32x4_t invvscaleo) +inline int elementwise_arithm_op_quantized_loop(int window_start_x, + int window_end_x, + int window_step_x, + const uint8_t *input1_ptr, + const uint8_t *input2_ptr, + uint8_t *output_ptr, + int32x4_t voffset1, + int32x4_t voffset2, + float32x4_t vscale1, + float32x4_t vscale2, + float32x4_t voffseto, + float32x4_t invvscaleo) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { // Get inputs and compute output const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1); @@ -627,13 +696,21 @@ inline int elementwise_arithm_op_quantized_loop(int window_start_x, int window_e } template -inline int elementwise_arithm_op_quantized_singed_loop(int window_start_x, int window_end_x, int window_step_x, - const int8_t *input1_ptr, const int8_t *input2_ptr, int8_t *output_ptr, - int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, - float32x4_t voffseto, float32x4_t invvscaleo) +inline int elementwise_arithm_op_quantized_singed_loop(int window_start_x, + int window_end_x, + int window_step_x, + const int8_t *input1_ptr, + const int8_t *input2_ptr, + int8_t *output_ptr, + int32x4_t voffset1, + int32x4_t voffset2, + float32x4_t vscale1, + float32x4_t vscale2, + float32x4_t voffseto, + float32x4_t invvscaleo) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { // Get inputs and compute output const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1); @@ -645,45 +722,71 @@ inline int elementwise_arithm_op_quantized_singed_loop(int window_start_x, int w } template -inline int elementwise_arithm_op_quantized_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, - const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, - int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, - float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) +inline int elementwise_arithm_op_quantized_broadcast_loop(int 
window_start_x, + int window_end_x, + int window_step_x, + const uint8_t *non_broadcast_input_ptr, + float32x4x4_t broadcast_vector, + uint8_t *output_ptr, + int32x4_t voffset_non_broadcast, + float32x4_t vscale_non_broadcast, + float32x4_t voffseto, + float32x4_t invvscaleo, + bool reorder) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); - const float32x4x4_t rf = elementwise_arithm_op(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); + const float32x4x4_t af = + load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); + const float32x4x4_t rf = + elementwise_arithm_op(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); store_quantized(output_ptr + x, rf, voffseto, invvscaleo); } return x; } template -inline int elementwise_arithm_op_quantized_signed_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, - const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, int8_t *output_ptr, - int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, - float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) +inline int elementwise_arithm_op_quantized_signed_broadcast_loop(int window_start_x, + int window_end_x, + int window_step_x, + const int8_t *non_broadcast_input_ptr, + float32x4x4_t broadcast_vector, + int8_t *output_ptr, + int32x4_t voffset_non_broadcast, + float32x4_t vscale_non_broadcast, + float32x4_t voffseto, + float32x4_t invvscaleo, + bool reorder) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const float32x4x4_t af = load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); - const float32x4x4_t rf = elementwise_arithm_op(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); + const float32x4x4_t af = + load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); + const float32x4x4_t rf = + elementwise_arithm_op(reorder ? broadcast_vector : af, reorder ? 
af : broadcast_vector); store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo); } return x; } template -inline int elementwise_comp_op_quantized_loop(int window_start_x, int window_end_x, int window_step_x, - const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr, - int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, - float32x4_t voffseto, float32x4_t invvscaleo) +inline int elementwise_comp_op_quantized_loop(int window_start_x, + int window_end_x, + int window_step_x, + const uint8_t *input1_ptr, + const uint8_t *input2_ptr, + uint8_t *output_ptr, + int32x4_t voffset1, + int32x4_t voffset2, + float32x4_t vscale1, + float32x4_t vscale2, + float32x4_t voffseto, + float32x4_t invvscaleo) { ARM_COMPUTE_UNUSED(voffseto, invvscaleo); int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1); const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2); @@ -694,14 +797,22 @@ inline int elementwise_comp_op_quantized_loop(int window_start_x, int window_end } template -inline int elementwise_comp_op_quantized_signed_loop(int window_start_x, int window_end_x, int window_step_x, - const int8_t *input1_ptr, const int8_t *input2_ptr, uint8_t *output_ptr, - int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, - float32x4_t voffseto, float32x4_t invvscaleo) +inline int elementwise_comp_op_quantized_signed_loop(int window_start_x, + int window_end_x, + int window_step_x, + const int8_t *input1_ptr, + const int8_t *input2_ptr, + uint8_t *output_ptr, + int32x4_t voffset1, + int32x4_t voffset2, + float32x4_t vscale1, + float32x4_t vscale2, + float32x4_t voffseto, + float32x4_t invvscaleo) { ARM_COMPUTE_UNUSED(voffseto, invvscaleo); int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1); const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2); @@ -712,46 +823,85 @@ inline int elementwise_comp_op_quantized_signed_loop(int window_start_x, int win } template -inline int elementwise_comp_op_quantized_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, - const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, - int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, - float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) +inline int elementwise_comp_op_quantized_broadcast_loop(int window_start_x, + int window_end_x, + int window_step_x, + const uint8_t *non_broadcast_input_ptr, + float32x4x4_t broadcast_vector, + uint8_t *output_ptr, + int32x4_t voffset_non_broadcast, + float32x4_t vscale_non_broadcast, + float32x4_t voffseto, + float32x4_t invvscaleo, + bool reorder) { ARM_COMPUTE_UNUSED(voffseto, invvscaleo); int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); - const uint32x4x4_t rf = elementwise_comp_op(reorder ? broadcast_vector : af, reorder ? 
af : broadcast_vector); + const float32x4x4_t af = + load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); + const uint32x4x4_t rf = + elementwise_comp_op(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); store_quantized(output_ptr + x, rf); } return x; } template -inline int elementwise_comp_op_quantized_signed_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, - const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, - int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, - float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) +inline int elementwise_comp_op_quantized_signed_broadcast_loop(int window_start_x, + int window_end_x, + int window_step_x, + const int8_t *non_broadcast_input_ptr, + float32x4x4_t broadcast_vector, + uint8_t *output_ptr, + int32x4_t voffset_non_broadcast, + float32x4_t vscale_non_broadcast, + float32x4_t voffseto, + float32x4_t invvscaleo, + bool reorder) { ARM_COMPUTE_UNUSED(voffseto, invvscaleo); int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const float32x4x4_t af = load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); - const uint32x4x4_t rf = elementwise_comp_op(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); + const float32x4x4_t af = + load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); + const uint32x4x4_t rf = + elementwise_comp_op(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); store_quantized(output_ptr + x, rf); } return x; } -inline void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, +inline void elementwise_op_quantized(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window, uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), - int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, - float32x4_t, float32x4_t, const bool), - int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *, - int32x4_t, int32x4_t, float32x4_t, float32x4_t, - float32x4_t, float32x4_t)) + int (*broadcast_func)(int, + int, + int, + const uint8_t *, + float32x4x4_t, + uint8_t *, + int32x4_t, + float32x4_t, + float32x4_t, + float32x4_t, + const bool), + int (*neon_func)(int, + int, + int, + const uint8_t *, + const uint8_t *, + uint8_t *, + int32x4_t, + int32x4_t, + float32x4_t, + float32x4_t, + float32x4_t, + float32x4_t)) { // Create input windows Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); @@ -772,7 +922,7 @@ inline void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITe const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset + 0.5f); const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { // Select the broadcast input on the X axis const bool is_broadcast_input_2 = input2_win.x().step() == 0; @@ -794,24 +944,28 @@ inline void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITe Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = 
reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - const uint8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_u8(broadcast_value), broadcast_qinfo); + const uint8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_u8(broadcast_value), broadcast_qinfo); - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr, - voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2); - for(; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); - const float bfs = dequantize_qasymm8(broadcast_value, broadcast_qinfo); - *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo); - } - }, - broadcast_input, non_broadcast_input, output); + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, + broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast, + voffseto, invvscaleo, !is_broadcast_input_2); + for (; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); + const float bfs = dequantize_qasymm8(broadcast_value, broadcast_qinfo); + *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, + !is_broadcast_input_2 ? 
afs : bfs, output_qinfo); + } + }, + broadcast_input, non_broadcast_input, output); } else { @@ -834,32 +988,56 @@ inline void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITe Iterator input2(in2, input2_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2, - vscale1, vscale2, voffseto, invvscaleo); - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float afs = dequantize_qasymm8(*(input1_ptr + x), input1_qinfo); - const float bfs = dequantize_qasymm8(*(input2_ptr + x), input2_qinfo); - *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); - } - }, - input1, input2, output); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, + voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo); + for (; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8(*(input1_ptr + x), input1_qinfo); + const float bfs = dequantize_qasymm8(*(input2_ptr + x), input2_qinfo); + *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); + } + }, + input1, input2, output); } } -inline void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), - int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, - float32x4_t, float32x4_t, const bool), - int (*neon_func)(int, int, int, const int8_t *, const int8_t *, uint8_t *, - int32x4_t, int32x4_t, float32x4_t, float32x4_t, - float32x4_t, float32x4_t)) +inline void +elementwise_comp_quantized_signed(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window, + uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), + int (*broadcast_func)(int, + int, + int, + const int8_t *, + float32x4x4_t, + uint8_t *, + int32x4_t, + float32x4_t, + float32x4_t, + float32x4_t, + const bool), + int (*neon_func)(int, + int, + int, + const int8_t *, + const int8_t *, + uint8_t *, + int32x4_t, + int32x4_t, + float32x4_t, + float32x4_t, + float32x4_t, + float32x4_t)) { // Create input windows Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); @@ -879,7 +1057,7 @@ inline void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset); const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { // Select the broadcast input on the X axis const bool is_broadcast_input_2 = input2_win.x().step() == 0; @@ -901,24 +1079,28 @@ inline void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = 
reinterpret_cast(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - const int8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo); + const int8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo); - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr, - voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2); - for(; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); - const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo); - *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo); - } - }, - broadcast_input, non_broadcast_input, output); + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, + broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast, + voffseto, invvscaleo, !is_broadcast_input_2); + for (; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); + const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo); + *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, + !is_broadcast_input_2 ? afs : bfs, output_qinfo); + } + }, + broadcast_input, non_broadcast_input, output); } else { @@ -941,32 +1123,56 @@ inline void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor Iterator input2(in2, input2_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2, - vscale1, vscale2, voffseto, invvscaleo); - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo); - const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo); - *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); - } - }, - input1, input2, output); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, + voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo); + for (; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo); + const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo); + *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); + } + }, + input1, input2, output); } } -inline void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - int8_t (*scalar_func)(const float &, const 
float &, UniformQuantizationInfo), - int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, int8_t *, int32x4_t, float32x4_t, - float32x4_t, float32x4_t, const bool), - int (*neon_func)(int, int, int, const int8_t *, const int8_t *, int8_t *, - int32x4_t, int32x4_t, float32x4_t, float32x4_t, - float32x4_t, float32x4_t)) +inline void +elementwise_op_quantized_signed(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window, + int8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), + int (*broadcast_func)(int, + int, + int, + const int8_t *, + float32x4x4_t, + int8_t *, + int32x4_t, + float32x4_t, + float32x4_t, + float32x4_t, + const bool), + int (*neon_func)(int, + int, + int, + const int8_t *, + const int8_t *, + int8_t *, + int32x4_t, + int32x4_t, + float32x4_t, + float32x4_t, + float32x4_t, + float32x4_t)) { // Create input windows Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); @@ -986,7 +1192,7 @@ inline void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *i const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset); const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { // Select the broadcast input on the X axis const bool is_broadcast_input_2 = input2_win.x().step() == 0; @@ -1008,24 +1214,28 @@ inline void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *i Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - const int8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo); + const int8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo); - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr, - voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2); - for(; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); - const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo); - *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo); - } - }, - broadcast_input, non_broadcast_input, output); + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, + broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast, + voffseto, invvscaleo, !is_broadcast_input_2); + for (; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); + const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo); + *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, + !is_broadcast_input_2 ? 
afs : bfs, output_qinfo); + } + }, + broadcast_input, non_broadcast_input, output); } else { @@ -1048,22 +1258,24 @@ inline void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *i Iterator input2(in2, input2_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2, - vscale1, vscale2, voffseto, invvscaleo); - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo); - const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo); - *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); - } - }, - input1, input2, output); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, + voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo); + for (; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo); + const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo); + *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); + } + }, + input1, input2, output); } } diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp index c5c528d3f3709d317121533511c0a4591f4424cd..09ad13d5ebdb9fbfc0cd92e4f248ed96ff8e62db 100644 --- a/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
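[Editorial note, not part of the patch] The impl.h hunks above are a pure clang-format pass: a space after control keywords, one parameter per line for long signatures, and reflowed lambdas and brace-init lists. The quantized element-wise helpers they touch all share one structure: widen the 8-bit lanes, dequantize to float32x4_t blocks as (q - offset) * scale, apply the arithmetic or comparison op in float, then requantize and narrow on the store. Below is a minimal, self-contained sketch of that round trip for eight unsigned QASYMM8 lanes; the function name, the single shared quantization and the eight-lane width are assumptions of the sketch, not library code.

#include <arm_neon.h>

// Sketch: dequantize eight QASYMM8 lanes of each input, add in float, requantize.
static inline void qasymm8_add8_sketch(const uint8_t *a, const uint8_t *b, uint8_t *dst,
                                       int32_t offset, float scale)
{
    const int32x4_t   voff  = vdupq_n_s32(offset);
    const float32x4_t vsc   = vdupq_n_f32(scale);
    const float32x4_t vinv  = vdupq_n_f32(1.f / scale);
    const float32x4_t vofff = vdupq_n_f32((float)offset + 0.5f); // +0.5f rounds the truncating convert on the store path

    // u8 -> u16 -> two s32x4 halves per input.
    const uint16x8_t wa   = vmovl_u8(vld1_u8(a));
    const uint16x8_t wb   = vmovl_u8(vld1_u8(b));
    const int32x4_t  a_lo = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(wa)));
    const int32x4_t  a_hi = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(wa)));
    const int32x4_t  b_lo = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(wb)));
    const int32x4_t  b_hi = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(wb)));

    // Dequantize each half as (q - offset) * scale, then add in float.
    const float32x4_t r_lo = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vsubq_s32(a_lo, voff)), vsc),
                                       vmulq_f32(vcvtq_f32_s32(vsubq_s32(b_lo, voff)), vsc));
    const float32x4_t r_hi = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vsubq_s32(a_hi, voff)), vsc),
                                       vmulq_f32(vcvtq_f32_s32(vsubq_s32(b_hi, voff)), vsc));

    // Requantize as r / scale + offset and narrow back to u8 with saturation.
    const int32x4_t q_lo = vcvtq_s32_f32(vmlaq_f32(vofff, r_lo, vinv));
    const int32x4_t q_hi = vcvtq_s32_f32(vmlaq_f32(vofff, r_hi, vinv));
    vst1_u8(dst, vqmovun_s16(vcombine_s16(vqmovn_s32(q_lo), vqmovn_s32(q_hi))));
}

The library's load_quantized / store_quantized helpers above do the same thing sixteen lanes at a time with per-input quantization info; this patch changes only their formatting.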
*/ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h" namespace arm_compute { @@ -33,63 +34,165 @@ void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor return elementwise_arithm_op>(in1, in2, out, window); } -template void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { return elementwise_arithm_op>(in1, in2, out, window); } -template void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void 
neon_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template void neon_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { return elementwise_comp_op_8(in1, in2, out, window); } -template void neon_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_u8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_u8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_u8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_u8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_u8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_u8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template void neon_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { return elementwise_comp_op_16(in1, in2, out, window); } -template void neon_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_comparison_elementwise_binary(const 
ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template void neon_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { return elementwise_comp_op_32(in1, in2, out, window); } -template void neon_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -} +template void neon_s32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp index fa8e08745a50dcfc938b4205fd87b48573abf50c..d891f70644f3501a05417460e3fd5c6f93b87ea6 100644 --- a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
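[Editorial note, not part of the patch] The integer.cpp hunks above and the qasymm8.cpp / qasymm8_signed.cpp hunks below are likewise formatting-only: a blank line is inserted after the Helpers.h include, integer.cpp's closing brace gains a '// namespace cpu' comment, and every single-line explicit template instantiation is reflowed to one parameter per line. The set of instantiated symbols does not change. The self-contained sketch below, with dummy stand-ins for the library's ITensor and Window types, shows the before/after shape of one such instantiation.

// Sketch only: dummy stand-ins, not Compute Library declarations.
struct ITensor;
struct Window;
enum class ArithmeticOperation
{
    ADD,
    SUB
};

template <ArithmeticOperation op>
void neon_elementwise_stub(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
    // A real kernel would dispatch on 'op'; this stub exists only so that the
    // explicit instantiation below is well-formed.
    (void)in1;
    (void)in2;
    (void)out;
    (void)window;
}

// Single-line layout of the kind the patch removes (kept as a comment for comparison):
//   template void neon_elementwise_stub<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);

// One-parameter-per-line layout of the kind the patch introduces:
template void neon_elementwise_stub<ArithmeticOperation::ADD>(const ITensor *in1,
                                                              const ITensor *in2,
                                                              ITensor       *out,
                                                              const Window  &window);

Because these are explicit instantiation definitions, the reflow is purely cosmetic; behaviour and the emitted symbols are unchanged.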
*/ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h" namespace arm_compute { @@ -33,27 +34,72 @@ void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITe return elementwise_arithm_op_quantized(in1, in2, out, window); } -template void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template -void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window) { return elementwise_comp_op_quantized(in1, in2, out, window); } -template void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, + 
const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp index abfdf93b75c709e3be6fc329e75687c1c8569d62..b1f8e018f58a0bd72937747e2e188520bed9cb72 100644 --- a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h" namespace arm_compute @@ -34,27 +35,70 @@ void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *i return elementwise_arithm_op_quantized_signed(in1, in2, out, window); } -template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template 
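// Illustrative sketch (not ACL code, not part of this diff): the translation units in
// this change define each kernel template once and then emit one explicit instantiation
// per operation, so other translation units can link against the specialisations without
// seeing the template body. The enum and function below are hypothetical stand-ins.
#include <cstddef>

enum class Op
{
    Add,
    Sub
};

template <Op op>
void apply(const float *in1, const float *in2, float *out, std::size_t n)
{
    for (std::size_t i = 0; i < n; ++i)
    {
        out[i] = (op == Op::Add) ? in1[i] + in2[i] : in1[i] - in2[i];
    }
}

// Explicit instantiations: one per supported operation, the same shape that
// clang-format reflows across four lines in the kernels above.
template void apply<Op::Add>(const float *, const float *, float *, std::size_t);
template void apply<Op::Sub>(const float *, const float *, float *, std::size_t);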
void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template -void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window) { return elementwise_comp_op_quantized_signed(in1, in2, out, window); } -template void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary( + const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp index 85224351df112e3b0c4d336103523624cc3948b3..600c7f1c05a4b2c6799a956a7c7929a8bccb634b 100644 --- a/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp @@ -25,6 +25,7 @@ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h" namespace arm_compute { @@ -36,14 +37,38 @@ void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor return elementwise_arithmetic_op(in1, in2, out, op, window); } -template void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_elementwise_binary(const ITensor *in1, const 
ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) @@ -51,14 +76,32 @@ void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *i return elementwise_comparison_op(in1, in2, out, op, window); } -template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); } // namespace cpu } // namespace arm_compute -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ \ No newline at end of file +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && 
defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp index 2b479f76f1b4bc269ee8e62c3542c1318f39364b..832a96688313004cf4c6a9444ba032154dfadb04 100644 --- a/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h" namespace arm_compute { @@ -34,26 +35,68 @@ void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor return elementwise_arithmetic_op(in1, in2, out, op, window); } -template void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template void sve_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { return elementwise_comparison_op(in1, in2, out, op, window); } -template void sve_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void 
sve_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp index c0515f2abc8067c251fac30d2adf00a6ce766f75..fa48407e9b729c46a4cade9d58ffb7a3fc34a636 100644 --- a/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp @@ -23,7 +23,9 @@ */ #include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h" + #include "src/core/NEON/SVEMath.h" + #include namespace arm_compute @@ -33,7 +35,8 @@ namespace cpu using namespace arm_compute::wrapper; template -void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window) +void elementwise_arithmetic_op( + const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window) { using VectorType = typename sve_vector::type; @@ -51,7 +54,7 @@ void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor * const auto window_end_x = static_cast(window.x().end()); const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? 
input2_win : input1_win; @@ -66,37 +69,40 @@ void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor * Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const ScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_vector = svdup_n(broadcast_value); - - int x = window_start_x; - - svbool_t pg = svwhilelt(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &) { - const auto non_broadcast_vector = svld1(pg, non_broadcast_input_ptr + x); - VectorType res{}; + auto output_ptr = reinterpret_cast(output.ptr()); + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const ScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const auto broadcast_vector = svdup_n(broadcast_value); - if(is_broadcast_input_2) - { - res = elementwise_arithmetic_op::type>(pg, non_broadcast_vector, broadcast_vector, op); - } - else + int x = window_start_x; + + svbool_t pg = svwhilelt(x, window_end_x); + do { - res = elementwise_arithmetic_op::type>(pg, broadcast_vector, non_broadcast_vector, op); - } - svst1(pg, output_ptr + x, res); - - x += svcnt(); - pg = svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); + const auto non_broadcast_vector = svld1(pg, non_broadcast_input_ptr + x); + VectorType res{}; + + if (is_broadcast_input_2) + { + res = elementwise_arithmetic_op::type>(pg, non_broadcast_vector, + broadcast_vector, op); + } + else + { + res = elementwise_arithmetic_op::type>( + pg, broadcast_vector, non_broadcast_vector, op); + } + svst1(pg, output_ptr + x, res); + + x += svcnt(); + pg = svwhilelt(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); } else { @@ -108,39 +114,46 @@ void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor * Iterator input2(in2, input2_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast(output.ptr()); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); - int x = window_start_x; + int x = window_start_x; - svbool_t pg = svwhilelt(x, window_end_x); - do - { - const auto in1 = svld1(pg, input1_ptr + x); - const auto in2 = svld1(pg, input2_ptr + x); - const auto res = elementwise_arithmetic_op::type>(pg, in1, in2, op); - svst1(pg, output_ptr + x, res); - - x += svcnt(); - pg = svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); + svbool_t pg = svwhilelt(x, window_end_x); + do + { + const auto in1 = svld1(pg, input1_ptr + x); + const auto in2 = svld1(pg, input2_ptr + x); + const auto res = elementwise_arithmetic_op::type>(pg, in1, in2, op); + svst1(pg, output_ptr + x, res); + + x += svcnt(); + pg = svwhilelt(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input1, input2, output); } } -template void elementwise_arithmetic_op(const 
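// Illustrative sketch (not ACL code, not part of this diff): a minimal standalone version
// of the predicated do/while pattern used throughout these SVE kernels. svwhilelt builds a
// predicate covering the remaining elements, the body touches only the active lanes, and
// svptest_any decides whether a tail iteration is still needed. Plain float addition stands
// in for the templated op.
#include <arm_sve.h>
#include <cstdint>

void add_f32_sve(const float *in1, const float *in2, float *out, int64_t n)
{
    int64_t  x  = 0;
    svbool_t pg = svwhilelt_b32(x, n); // active lanes for this iteration
    do
    {
        const svfloat32_t a = svld1_f32(pg, in1 + x); // inactive lanes are not accessed
        const svfloat32_t b = svld1_f32(pg, in2 + x);
        svst1_f32(pg, out + x, svadd_f32_z(pg, a, b));

        x += svcntw();            // advance by the hardware vector length, in floats
        pg = svwhilelt_b32(x, n); // predicate shrinks for the final partial vector
    } while (svptest_any(svptrue_b32(), pg));
}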
ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window); +template void elementwise_arithmetic_op( + const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window); +template void elementwise_arithmetic_op( + const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window); +template void elementwise_arithmetic_op( + const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window); +template void elementwise_arithmetic_op( + const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window); template -void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window) +void elementwise_comparison_op( + const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window) { - static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width"); + static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), + "input data type's width should be equal to or greater than output data type's width"); using OutputVectorType = typename sve_vector::type; const auto all_true_pg = svptrue(); @@ -157,7 +170,7 @@ void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor * const auto window_end_x = static_cast(window.x().end()); const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? 
input2_win : input1_win; @@ -172,37 +185,44 @@ void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor * Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const InputScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_vector = svdup_n(broadcast_value); + execute_window_loop( + win, + [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast(output.ptr()); + const auto non_broadcast_input_ptr = + reinterpret_cast(non_broadcast_input.ptr()); + const InputScalarType broadcast_value = + *reinterpret_cast(broadcast_input.ptr()); + const auto broadcast_vector = svdup_n(broadcast_value); - int x = window_start_x; + int x = window_start_x; - svbool_t pg = svwhilelt(x, window_end_x); - do - { - const auto non_broadcast_vector = svld1(pg, non_broadcast_input_ptr + x); - const svbool_t output_pg = narrow_to_byte_predicate(pg); - OutputVectorType res{}; - if(is_broadcast_input_2) - { - res = elementwise_comparison_op::type, typename sve_vector::type>(pg, non_broadcast_vector, broadcast_vector, op); - } - else + svbool_t pg = svwhilelt(x, window_end_x); + do { - res = elementwise_comparison_op::type, typename sve_vector::type>(pg, broadcast_vector, non_broadcast_vector, op); - } - svst1(output_pg, output_ptr + x, res); - - x += svcnt(); - pg = svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); + const auto non_broadcast_vector = svld1(pg, non_broadcast_input_ptr + x); + const svbool_t output_pg = narrow_to_byte_predicate(pg); + OutputVectorType res{}; + if (is_broadcast_input_2) + { + res = elementwise_comparison_op::type, + typename sve_vector::type>( + pg, non_broadcast_vector, broadcast_vector, op); + } + else + { + res = elementwise_comparison_op::type, + typename sve_vector::type>( + pg, broadcast_vector, non_broadcast_vector, op); + } + svst1(output_pg, output_ptr + x, res); + + x += svcnt(); + pg = svwhilelt(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); } else { @@ -214,37 +234,45 @@ void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor * Iterator input2(in2, input2_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast(output.ptr()); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); - int x = window_start_x; + int x = window_start_x; - svbool_t pg = svwhilelt(x, window_end_x); - do - { - const auto in1 = svld1(pg, input1_ptr + x); - const auto in2 = svld1(pg, input2_ptr + x); - const auto res = elementwise_comparison_op::type, typename sve_vector::type>(pg, in1, in2, op); - const svbool_t output_pg = narrow_to_byte_predicate(pg); - svst1(output_pg, output_ptr + x, res); - - x += svcnt(); - pg = svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); + svbool_t pg = svwhilelt(x, window_end_x); + do + { + const auto 
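// Illustrative sketch (not ACL code, not part of this diff): the broadcast path above
// handles the case where one input has extent 1 along x (its window step is 0), so a
// single scalar is read per row and combined with every element of the other input.
// A simplified scalar version of that idea:
#include <cstddef>

void add_broadcast_x(const float *non_broadcast, float broadcast_value, float *out, std::size_t n)
{
    for (std::size_t x = 0; x < n; ++x)
    {
        // the broadcast operand is loaded once and reused for every position along x
        out[x] = non_broadcast[x] + broadcast_value;
    }
}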
in1 = svld1(pg, input1_ptr + x); + const auto in2 = svld1(pg, input2_ptr + x); + const auto res = + elementwise_comparison_op::type, + typename sve_vector::type>(pg, in1, in2, op); + const svbool_t output_pg = narrow_to_byte_predicate(pg); + svst1(output_pg, output_ptr + x, res); + + x += svcnt(); + pg = svwhilelt(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input1, input2, output); } } -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); +template void elementwise_comparison_op( + const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); +template void elementwise_comparison_op( + const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); +template void elementwise_comparison_op( + const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); +template void elementwise_comparison_op( + const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); +template void elementwise_comparison_op( + const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); template <> svint32_t elementwise_pow(svbool_t &pg, const svint32_t &a, const svint32_t &b) diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/impl.h b/src/cpu/kernels/elementwise_binary/generic/sve/impl.h index 860c50a1e048558271d504842c3a56a7e7e1484f..4c61b9f3157cad96fffd9e11f31cd61d1c645476 100644 --- a/src/cpu/kernels/elementwise_binary/generic/sve/impl.h +++ b/src/cpu/kernels/elementwise_binary/generic/sve/impl.h @@ -25,6 +25,7 @@ #define SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H #include "arm_compute/core/Helpers.h" + #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" #include "src/core/NEON/wrapper/svtraits.h" @@ -51,7 +52,7 @@ svbool_t narrow_to_byte_predicate(svbool_t pg) { const auto all_false = svpfalse(); - switch(bytewidth) + switch (bytewidth) { case 8: pg = svuzp1_b32(pg, all_false); @@ -74,7 +75,7 @@ VectorType elementwise_arithmetic_op(svbool_t &pg, const VectorType &a, const Ve using ScalarType = typename wrapper::sve_scalar::type; VectorType res{}; - switch(op) + switch (op) { case ArithmeticOperation::MAX: res = svmax_z(pg, a, b); @@ -114,11 +115,12 @@ VectorType elementwise_arithmetic_op(svbool_t &pg, const VectorType &a, const Ve } template -OutputVectorType elementwise_comparison_op(svbool_t &pg, const InputVectorType &a, const InputVectorType &b, ComparisonOperation op) +OutputVectorType +elementwise_comparison_op(svbool_t &pg, const InputVectorType &a, const InputVectorType &b, ComparisonOperation op) { svbool_t selection_vector{}; - switch(op) + switch (op) { case ComparisonOperation::Equal: selection_vector = svcmpeq(pg, a, b); @@ -154,10 +156,12 @@ OutputVectorType 
elementwise_comparison_op(svbool_t &pg, const InputVectorType & } template -void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window); +void elementwise_arithmetic_op( + const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window); template -void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window); +void elementwise_comparison_op( + const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window); } // namespace cpu } // namespace arm_compute #endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H */ diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp index c313fc6e0453a23ab8751bc34d9a3a6babc4254a..f7714ff7e9239acab457dcea464ece6ddf077ada 100644 --- a/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h" namespace arm_compute { @@ -33,64 +34,166 @@ void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor { return elementwise_arithmetic_op(in1, in2, out, op, window); } -template void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { return elementwise_arithmetic_op(in1, in2, out, op, window); } -template void 
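// Illustrative sketch (not ACL code, not part of this diff): elementwise_arithmetic_op in
// impl.h maps the requested operation to an intrinsic with a switch over the op enum. A
// minimal standalone version of that dispatch for FP32 vectors; BinOp is a hypothetical
// stand-in for ArithmeticOperation.
#include <arm_sve.h>

enum class BinOp
{
    Max,
    Min,
    Add,
    Sub
};

svfloat32_t apply_binop(svbool_t pg, svfloat32_t a, svfloat32_t b, BinOp op)
{
    switch (op)
    {
        case BinOp::Max:
            return svmax_f32_z(pg, a, b);
        case BinOp::Min:
            return svmin_f32_z(pg, a, b);
        case BinOp::Add:
            return svadd_f32_z(pg, a, b);
        default:
            return svsub_f32_z(pg, a, b);
    }
}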
sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template void sve_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { return elementwise_comparison_op(in1, in2, out, op, window); } -template void sve_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_u8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_u8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_u8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_u8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_u8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + 
const Window &window); +template void sve_u8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template void sve_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { return elementwise_comparison_op(in1, in2, out, op, window); } -template void sve_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template void sve_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { return elementwise_comparison_op(in1, in2, out, op, window); } -template void sve_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_comparison_elementwise_binary(const ITensor *in1, + const ITensor 
*in2, + ITensor *out, + const Window &window); +template void sve_s32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h b/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h index 41e0ac77dbc5396d18e573828b1fd602b8b81c13..7c6015d37934f198242ca87e68cc2c024db87827 100644 --- a/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h +++ b/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h @@ -35,19 +35,14 @@ inline svfloat32x4_t load_quantized(const int8_t *ptr, svbool_t pg, const svint3 { auto x = svld1(pg, ptr); - const auto widened = svcreate4( - svmovlb(svmovlb(x)), - svmovlt(svmovlb(x)), - svmovlb(svmovlt(x)), - svmovlt(svmovlt(x))); + const auto widened = svcreate4(svmovlb(svmovlb(x)), svmovlt(svmovlb(x)), svmovlb(svmovlt(x)), svmovlt(svmovlt(x))); pg = svptrue_b8(); - return svcreate4( - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 0), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 1), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 2), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 3), offset)), scale)); + return svcreate4(svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 0), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 1), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 2), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 3), offset)), scale)); } inline svfloat32x4_t load_quantized(const uint8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale) @@ -56,28 +51,24 @@ inline svfloat32x4_t load_quantized(const uint8_t *ptr, svbool_t pg, const svint //vprint(x); - const auto widened = svcreate4( - svmovlb(svmovlb(x)), - svmovlt(svmovlb(x)), - svmovlb(svmovlt(x)), - svmovlt(svmovlt(x))); + const auto widened = svcreate4(svmovlb(svmovlb(x)), svmovlt(svmovlb(x)), svmovlb(svmovlt(x)), svmovlt(svmovlt(x))); pg = svptrue_b8(); - return svcreate4( - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 0)), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 1)), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 2)), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 3)), offset)), scale)); + return svcreate4(svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 0)), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 1)), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 2)), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 3)), offset)), scale)); } -inline void store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale) +inline void +store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale) { - const auto quantized = svcreate4( - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), 
inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset)); + const auto quantized = + svcreate4(svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset)); const auto narrowed_bottom = svqxtunt(svqxtunb(svget4(quantized, 0)), svget4(quantized, 1)); const auto narrowed_top = svqxtunt(svqxtunb(svget4(quantized, 2)), svget4(quantized, 3)); @@ -85,13 +76,14 @@ inline void store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svst1(pg, ptr, narrowed); } -inline void store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale) +inline void +store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale) { - const auto quantized = svcreate4( - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset)); + const auto quantized = + svcreate4(svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset)); const auto narrowed_bottom = svqxtnt(svqxtnb(svget4(quantized, 0)), svget4(quantized, 1)); const auto narrowed_top = svqxtnt(svqxtnb(svget4(quantized, 2)), svget4(quantized, 3)); @@ -101,7 +93,8 @@ inline void store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const } template -void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window) +void elementwise_arithmetic_quantized_op( + const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window) { const auto all_true_pg = wrapper::svptrue(); @@ -120,7 +113,7 @@ void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, const auto output_voffset = svdup_n(out->info()->quantization_info().uniform().offset); const auto output_vscale = svdup_n(1.f / out->info()->quantization_info().uniform().scale); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -128,8 +121,10 @@ void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; - const auto non_broadcast_qinfo = is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info(); - const auto broadcast_qinfo = is_broadcast_input_2 ? 
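// Illustrative sketch (not ACL code, not part of this diff): load_quantized/store_quantized
// above widen a vector of quantized bytes into four FP32 vectors, applying (q - offset) * scale
// on load and round(v * inv_scale) + offset with saturating narrowing on store. The scalar
// equivalent of that per-element maths, simplified to per-tensor uniform quantization:
#include <algorithm>
#include <cmath>
#include <cstdint>

inline float dequantize_u8(uint8_t q, int32_t offset, float scale)
{
    return static_cast<float>(static_cast<int32_t>(q) - offset) * scale;
}

inline uint8_t quantize_u8(float v, int32_t offset, float inv_scale)
{
    // round to nearest, add the zero-point, then saturate to the U8 range
    const int32_t q = static_cast<int32_t>(std::lround(v * inv_scale)) + offset;
    return static_cast<uint8_t>(std::clamp(q, 0, 255));
}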
in2->info()->quantization_info() : in1->info()->quantization_info(); + const auto non_broadcast_qinfo = + is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info(); + const auto broadcast_qinfo = + is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info(); const auto non_broadcast_voffset = svdup_n(non_broadcast_qinfo.uniform().offset); const auto non_broadcast_vscale = svdup_n(non_broadcast_qinfo.uniform().scale); @@ -141,48 +136,52 @@ void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const ScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const float broadcast_value_f = Qasymm8QuantizationHelper::dequantize(broadcast_value, broadcast_qinfo); - const auto in2 = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f)); - - int x = window_start_x; - - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &) { - const auto in1 = load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale); - - svfloat32x4_t result{}; - - if(!is_broadcast_input_2) + auto output_ptr = reinterpret_cast(output.ptr()); + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const ScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const float broadcast_value_f = + Qasymm8QuantizationHelper::dequantize(broadcast_value, broadcast_qinfo); + const auto in2 = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), + svdup_n(broadcast_value_f), svdup_n(broadcast_value_f)); + + int x = window_start_x; + + svbool_t pg = wrapper::svwhilelt(x, window_end_x); + do { - result = svcreate4( - elementwise_arithmetic_op(pg, svget4(in2, 0), svget4(in1, 0), op), - elementwise_arithmetic_op(pg, svget4(in2, 1), svget4(in1, 1), op), - elementwise_arithmetic_op(pg, svget4(in2, 2), svget4(in1, 2), op), - elementwise_arithmetic_op(pg, svget4(in2, 3), svget4(in1, 3), op)); - } - else - { - result = svcreate4( - elementwise_arithmetic_op(pg, svget4(in1, 0), svget4(in2, 0), op), - elementwise_arithmetic_op(pg, svget4(in1, 1), svget4(in2, 1), op), - elementwise_arithmetic_op(pg, svget4(in1, 2), svget4(in2, 2), op), - elementwise_arithmetic_op(pg, svget4(in1, 3), svget4(in2, 3), op)); - } - - store_quantized(output_ptr + x, pg, result, output_voffset, output_vscale); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); + const auto in1 = + load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale); + + svfloat32x4_t result{}; + + if (!is_broadcast_input_2) + { + result = + svcreate4(elementwise_arithmetic_op(pg, svget4(in2, 0), svget4(in1, 0), op), + elementwise_arithmetic_op(pg, svget4(in2, 1), svget4(in1, 1), op), + elementwise_arithmetic_op(pg, svget4(in2, 2), svget4(in1, 2), op), + elementwise_arithmetic_op(pg, svget4(in2, 3), svget4(in1, 3), op)); + } + else + { + result = + svcreate4(elementwise_arithmetic_op(pg, svget4(in1, 0), svget4(in2, 0), op), + 
elementwise_arithmetic_op(pg, svget4(in1, 1), svget4(in2, 1), op), + elementwise_arithmetic_op(pg, svget4(in1, 2), svget4(in2, 2), op), + elementwise_arithmetic_op(pg, svget4(in1, 3), svget4(in2, 3), op)); + } + + store_quantized(output_ptr + x, pg, result, output_voffset, output_vscale); + + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); } else { @@ -200,41 +199,44 @@ void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, const auto in2_voffset = svdup_n(in2->info()->quantization_info().uniform().offset); const auto in2_vscale = svdup_n(in2->info()->quantization_info().uniform().scale); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast(output.ptr()); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); - int x = window_start_x; + int x = window_start_x; - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do - { - const auto in1 = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale); - const auto in2 = load_quantized(input2_ptr + x, pg, in2_voffset, in2_vscale); - - const auto result = svcreate4( - elementwise_arithmetic_op(pg, svget4(in1, 0), svget4(in2, 0), op), - elementwise_arithmetic_op(pg, svget4(in1, 1), svget4(in2, 1), op), - elementwise_arithmetic_op(pg, svget4(in1, 2), svget4(in2, 2), op), - elementwise_arithmetic_op(pg, svget4(in1, 3), svget4(in2, 3), op)); - - store_quantized(output_ptr + x, pg, result, output_voffset, output_vscale); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); + svbool_t pg = wrapper::svwhilelt(x, window_end_x); + do + { + const auto in1 = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale); + const auto in2 = load_quantized(input2_ptr + x, pg, in2_voffset, in2_vscale); + + const auto result = + svcreate4(elementwise_arithmetic_op(pg, svget4(in1, 0), svget4(in2, 0), op), + elementwise_arithmetic_op(pg, svget4(in1, 1), svget4(in2, 1), op), + elementwise_arithmetic_op(pg, svget4(in1, 2), svget4(in2, 2), op), + elementwise_arithmetic_op(pg, svget4(in1, 3), svget4(in2, 3), op)); + + store_quantized(output_ptr + x, pg, result, output_voffset, output_vscale); + + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input1, input2, output); } } template -void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window) +void elementwise_comparison_quantized_op( + const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window) { - static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width"); + static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), + "input data type's width should be equal to or greater than output data type's width"); using OutputVectorType = typename wrapper::traits::sve_vector::type; const auto all_true_pg = wrapper::svptrue(); @@ -251,7 +253,7 @@ void 
elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, const auto window_end_x = static_cast(window.x().end()); const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -259,8 +261,10 @@ void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; - const auto non_broadcast_qinfo = is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info(); - const auto broadcast_qinfo = is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info(); + const auto non_broadcast_qinfo = + is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info(); + const auto broadcast_qinfo = + is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info(); const auto non_broadcast_voffset = svdup_n(non_broadcast_qinfo.uniform().offset); const auto non_broadcast_vscale = svdup_n(non_broadcast_qinfo.uniform().scale); @@ -272,51 +276,63 @@ void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const InputScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const float broadcast_value_f = Qasymm8QuantizationHelper::dequantize(broadcast_value, broadcast_qinfo); - const auto in2 = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f)); - - int x = window_start_x; - - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &) { - const auto in1 = load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale); - - svuint8x4_t result{}; - - if(!is_broadcast_input_2) + auto output_ptr = reinterpret_cast(output.ptr()); + const auto non_broadcast_input_ptr = + reinterpret_cast(non_broadcast_input.ptr()); + const InputScalarType broadcast_value = + *reinterpret_cast(broadcast_input.ptr()); + const float broadcast_value_f = + Qasymm8QuantizationHelper::dequantize(broadcast_value, broadcast_qinfo); + const auto in2 = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), + svdup_n(broadcast_value_f), svdup_n(broadcast_value_f)); + + int x = window_start_x; + + svbool_t pg = wrapper::svwhilelt(x, window_end_x); + do { - result = svcreate4( - elementwise_comparison_op(pg, svget4(in2, 0), svget4(in1, 0), op), - elementwise_comparison_op(pg, svget4(in2, 1), svget4(in1, 1), op), - elementwise_comparison_op(pg, svget4(in2, 2), svget4(in1, 2), op), - elementwise_comparison_op(pg, svget4(in2, 3), svget4(in1, 3), op)); - } - else - { - result = svcreate4( - elementwise_comparison_op(pg, svget4(in1, 0), svget4(in2, 0), op), - elementwise_comparison_op(pg, svget4(in1, 1), svget4(in2, 1), op), - elementwise_comparison_op(pg, svget4(in1, 2), svget4(in2, 2), op), - elementwise_comparison_op(pg, svget4(in1, 3), svget4(in2, 3), 
op)); - } - - const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); - const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); - const auto zipped = svzip1(zipped_bottom, zipped_top); - svst1(pg, output_ptr + x, zipped); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); + const auto in1 = + load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale); + + svuint8x4_t result{}; + + if (!is_broadcast_input_2) + { + result = svcreate4(elementwise_comparison_op(pg, svget4(in2, 0), + svget4(in1, 0), op), + elementwise_comparison_op(pg, svget4(in2, 1), + svget4(in1, 1), op), + elementwise_comparison_op(pg, svget4(in2, 2), + svget4(in1, 2), op), + elementwise_comparison_op( + pg, svget4(in2, 3), svget4(in1, 3), op)); + } + else + { + result = svcreate4(elementwise_comparison_op(pg, svget4(in1, 0), + svget4(in2, 0), op), + elementwise_comparison_op(pg, svget4(in1, 1), + svget4(in2, 1), op), + elementwise_comparison_op(pg, svget4(in1, 2), + svget4(in2, 2), op), + elementwise_comparison_op( + pg, svget4(in1, 3), svget4(in2, 3), op)); + } + + const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); + const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); + const auto zipped = svzip1(zipped_bottom, zipped_top); + svst1(pg, output_ptr + x, zipped); + + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); } else { @@ -334,39 +350,44 @@ void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, const auto in2_voffset = svdup_n(in2->info()->quantization_info().uniform().offset); const auto in2_vscale = svdup_n(in2->info()->quantization_info().uniform().scale); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast(output.ptr()); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); - int x = window_start_x; + int x = window_start_x; - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do - { - const auto in1 = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale); - const auto in2 = load_quantized(input2_ptr + x, pg, in2_voffset, in2_vscale); - const auto result = svcreate4( - elementwise_comparison_op(pg, svget4(in1, 0), svget4(in2, 0), op), - elementwise_comparison_op(pg, svget4(in1, 1), svget4(in2, 1), op), - elementwise_comparison_op(pg, svget4(in1, 2), svget4(in2, 2), op), - elementwise_comparison_op(pg, svget4(in1, 3), svget4(in2, 3), op)); - - const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); - const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); - const auto zipped = svzip1(zipped_bottom, zipped_top); - svst1(pg, output_ptr + x, zipped); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); + svbool_t pg = wrapper::svwhilelt(x, window_end_x); + do + { + const auto in1 = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale); + const auto in2 = load_quantized(input2_ptr + x, pg, in2_voffset, 
in2_vscale); + const auto result = + svcreate4(elementwise_comparison_op(pg, svget4(in1, 0), + svget4(in2, 0), op), + elementwise_comparison_op(pg, svget4(in1, 1), + svget4(in2, 1), op), + elementwise_comparison_op(pg, svget4(in1, 2), + svget4(in2, 2), op), + elementwise_comparison_op(pg, svget4(in1, 3), + svget4(in2, 3), op)); + + const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); + const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); + const auto zipped = svzip1(zipped_bottom, zipped_top); + svst1(pg, output_ptr + x, zipped); + + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input1, input2, output); } } } // namespace cpu } // namespace arm_compute -#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H */ \ No newline at end of file +#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H */ diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp index 7435bb4f2978ddab0ff0b4c55a2a6c8110116c92..5cc66642d7701a630d235ec49fca6bffab621306 100644 --- a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_binary/generic/sve2/impl.h" namespace arm_compute { @@ -34,27 +35,72 @@ void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITe return elementwise_arithmetic_quantized_op(in1, in2, out, op, window); } -template void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void 
sve2_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template -void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window) { return elementwise_comparison_quantized_op(in1, in2, out, op, window); } -template void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp index 1027a1eed09097afbde32bd69c3ca733c996c13b..165e0c05fa8c3cb9f83a400f8ffb307bcae855c6 100644 --- a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_binary/generic/sve2/impl.h" namespace arm_compute { @@ -34,27 +35,70 @@ void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *i return elementwise_arithmetic_quantized_op(in1, in2, out, op, window); } -template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void 
sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template -void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window) { return elementwise_comparison_quantized_op(in1, in2, out, op, window); } -template void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary( + const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window 
&window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp index b2833c24819f36719297b967d3ab49360bc02e36..2588db024dc3866c1f84be443d85306af1ef691a 100644 --- a/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp @@ -23,17 +23,19 @@ */ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void neon_fp16_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +void neon_fp16_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) { ARM_COMPUTE_UNUSED(lut); return elementwise_op<__fp16>(in, out, window, op); } -} +} // namespace cpu } // namespace arm_compute #endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp index 6566821eca74fbc5814e0bc066baaa5eea789224..936a2e588a33cbb7f5468555c8b7742f905fa2ba 100644 --- a/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp @@ -22,16 +22,18 @@ * SOFTWARE. */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void neon_fp32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +void neon_fp32_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) { ARM_COMPUTE_UNUSED(lut); return elementwise_op(in, out, window, op); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/impl.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/impl.cpp deleted file mode 100644 index 1b449340bab49ed0f1c94d6d6100f3ae7ccfc7f9..0000000000000000000000000000000000000000 --- a/src/cpu/kernels/elementwise_unary/generic/neon/impl.cpp +++ /dev/null @@ -1,286 +0,0 @@ -/* - * Copyright (c) 2018-2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h" -#include "arm_compute/core/Helpers.h" -#include "src/core/NEON/NEAsymm.h" - -namespace arm_compute -{ -namespace cpu -{ -template -inline ScalarType elementwise_op_scalar_imp(ElementWiseUnary op, const ScalarType &a) -{ - switch(op) - { - case ElementWiseUnary::RSQRT: - return 1 / sqrt(a); - case ElementWiseUnary::EXP: - return std::exp(a); - case ElementWiseUnary::NEG: - return -a; - case ElementWiseUnary::LOG: - return std::log(a); - case ElementWiseUnary::ABS: - return std::abs(a); - case ElementWiseUnary::ROUND: - return support::cpp11::nearbyint(a); - case ElementWiseUnary::SIN: - return std::sin(a); - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } -} - -template -inline VectorType elementwise_op_imp(ElementWiseUnary op, const VectorType &a) -{ - switch(op) - { - case ElementWiseUnary::RSQRT: - return wrapper::vinvsqrt(a); - case ElementWiseUnary::EXP: - return wrapper::vexpq(a); - case ElementWiseUnary::NEG: - return wrapper::vneg(a); - case ElementWiseUnary::LOG: - return wrapper::vlog(a); - case ElementWiseUnary::ABS: - return wrapper::vabs(a); - case ElementWiseUnary::ROUND: - return wrapper::vround(a); - case ElementWiseUnary::SIN: - return wrapper::vsin(a); - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } -} - -template -void elementwise_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) -{ - const int window_step_x = 16 / sizeof(ScalarType); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(in, win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input_ptr = reinterpret_cast(input.ptr()); - - int x = window_start_x; - for(; x <= window_end_x - window_step_x; x += window_step_x) - { - wrapper::vstore(output_ptr + x, elementwise_op_imp(op, wrapper::vloadq(input_ptr + x))); - } - for(; x < window_end_x; ++x) - { - *(output_ptr + x) = elementwise_op_scalar_imp(op, *(input_ptr + x)); - } - }, - input, output); -} -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -template void elementwise_op<__fp16>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); -#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -template void elementwise_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); -template void elementwise_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); - -template <> -void elementwise_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) -{ - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const UniformQuantizationInfo qi_in = in->info()->quantization_info().uniform(); - const UniformQuantizationInfo qi_out = out->info()->quantization_info().uniform(); - const auto min_clamped_value = vdupq_n_f32((-128 - qi_out.offset) * qi_out.scale); - const auto 
max_clamped_value = vdupq_n_f32((127 - qi_out.offset) * qi_out.scale); - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(in, win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - int8x16_t vout; - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto vconst_0_f32 = vdupq_n_f32(0); - auto clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value; - - int x = window_start_x; - for(; x <= window_end_x - window_step_x; x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - - // Perform activation - float32x4x4_t vtmp_deq = - { - { - elementwise_op_imp(op, vin_deq.val[0]), - elementwise_op_imp(op, vin_deq.val[1]), - elementwise_op_imp(op, vin_deq.val[2]), - elementwise_op_imp(op, vin_deq.val[3]), - } - }; - - if((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT)) - { - vtmp_deq.val[0] = vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]); - vtmp_deq.val[1] = vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]); - vtmp_deq.val[2] = vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]); - vtmp_deq.val[3] = vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]); - } - - // Re-quantize to new output space - vout = vquantize_signed(vtmp_deq, qi_out); - wrapper::vstore(output_ptr + x, vout); - } - for(; x < window_end_x; ++x) - { - qasymm8_signed_t in = *(reinterpret_cast(input_ptr + x)); - qasymm8_signed_t tmp = 0; - float tmp_f = dequantize_qasymm8_signed(in, qi_in); - if(tmp_f <= 0.0) - { - if(op == ElementWiseUnary::LOG) - { - tmp_f = (-128 - qi_out.offset) * qi_out.scale; - } - else if(op == ElementWiseUnary::RSQRT) - { - tmp_f = (127 - qi_out.offset) * qi_out.scale; - } - else - { - tmp_f = elementwise_op_scalar_imp(op, tmp_f); - } - } - else - { - tmp_f = elementwise_op_scalar_imp(op, tmp_f); - } - tmp = quantize_qasymm8_signed(tmp_f, qi_out, RoundingPolicy::TO_ZERO); // Set rounding policy TO_ZERO to be compatible with vquantize_signed() used above that follow same policy for armv7a. - // For aarch64 LUT is used and rounding to nearest is used - *(output_ptr + x) = tmp; - } - }, - input, output); -} -template <> -void elementwise_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) -{ - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const UniformQuantizationInfo qi_in = in->info()->quantization_info().uniform(); - const UniformQuantizationInfo qi_out = out->info()->quantization_info().uniform(); - const auto vconst_0_f32 = vdupq_n_f32(0); - const auto min_clamped_value = vdupq_n_f32((0 - qi_out.offset) * qi_out.scale); - const auto max_clamped_value = vdupq_n_f32((255 - qi_out.offset) * qi_out.scale); - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(in, win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - uint8x16_t vout; - auto clamped_value = (op == ElementWiseUnary::LOG) ? 
min_clamped_value : max_clamped_value; - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input_ptr = reinterpret_cast(input.ptr()); - int x = window_start_x; - for(; x <= window_end_x - window_step_x; x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - - // Perform activation - float32x4x4_t vtmp_deq = - { - { - elementwise_op_imp(op, vin_deq.val[0]), - elementwise_op_imp(op, vin_deq.val[1]), - elementwise_op_imp(op, vin_deq.val[2]), - elementwise_op_imp(op, vin_deq.val[3]), - } - }; - if((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT)) - { - vtmp_deq.val[0] = vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]); - vtmp_deq.val[1] = vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]); - vtmp_deq.val[2] = vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]); - vtmp_deq.val[3] = vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]); - } - - // Re-quantize to new output space - vout = vquantize(vtmp_deq, qi_out); - wrapper::vstore(output_ptr + x, vout); - } - for(; x < window_end_x; ++x) - { - qasymm8_t in = *(reinterpret_cast(input_ptr + x)); - qasymm8_t tmp = 0; - float tmp_f = dequantize_qasymm8(in, qi_in); - if(tmp_f <= 0.0) - { - if(op == ElementWiseUnary::LOG) - { - tmp_f = (0 - qi_out.offset) * qi_out.scale; - } - else if(op == ElementWiseUnary::RSQRT) - { - tmp_f = (255 - qi_out.offset) * qi_out.scale; - } - else - { - tmp_f = elementwise_op_scalar_imp(op, tmp_f); - } - } - else - { - tmp_f = elementwise_op_scalar_imp(op, tmp_f); - } - tmp = quantize_qasymm8(tmp_f, qi_out, RoundingPolicy::TO_ZERO); - *(output_ptr + x) = tmp; - } - }, - input, output); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/impl.h b/src/cpu/kernels/elementwise_unary/generic/neon/impl.h index 66b8b5fe456e5f0019284413a5c5717807811b7a..d54d3984cbd9fd5d3c5820ea3f398618fccdb3bc 100644 --- a/src/cpu/kernels/elementwise_unary/generic/neon/impl.h +++ b/src/cpu/kernels/elementwise_unary/generic/neon/impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022 Arm Limited. + * Copyright (c) 2018-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,6 +26,8 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Types.h" + +#include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" namespace arm_compute @@ -33,9 +35,266 @@ namespace arm_compute namespace cpu { template -void elementwise_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); +inline ScalarType elementwise_op_scalar_imp(ElementWiseUnary op, const ScalarType &a) +{ + switch (op) + { + case ElementWiseUnary::RSQRT: + return 1 / sqrt(a); + case ElementWiseUnary::EXP: + return std::exp(a); + case ElementWiseUnary::NEG: + return -a; + case ElementWiseUnary::LOG: + return std::log(a); + case ElementWiseUnary::ABS: + return std::abs(a); + case ElementWiseUnary::ROUND: + return support::cpp11::nearbyint(a); + case ElementWiseUnary::SIN: + return std::sin(a); + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } +} + +template +inline VectorType elementwise_op_imp(ElementWiseUnary op, const VectorType &a) +{ + switch (op) + { + case ElementWiseUnary::RSQRT: + return wrapper::vinvsqrt(a); + case ElementWiseUnary::EXP: + return wrapper::vexpq(a); + case ElementWiseUnary::NEG: + return wrapper::vneg(a); + case ElementWiseUnary::LOG: + return wrapper::vlog(a); + case ElementWiseUnary::ABS: + return wrapper::vabs(a); + case ElementWiseUnary::ROUND: + return wrapper::vround(a); + case ElementWiseUnary::SIN: + return wrapper::vsin(a); + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } +} + +template +inline void elementwise_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) +{ + const int window_step_x = 16 / sizeof(ScalarType); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(in, win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast(output.ptr()); + const auto input_ptr = reinterpret_cast(input.ptr()); + + int x = window_start_x; + for (; x <= window_end_x - window_step_x; x += window_step_x) + { + wrapper::vstore(output_ptr + x, elementwise_op_imp(op, wrapper::vloadq(input_ptr + x))); + } + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = elementwise_op_scalar_imp(op, *(input_ptr + x)); + } + }, + input, output); +} + +template <> +inline void elementwise_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) +{ + const int window_step_x = 16; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const UniformQuantizationInfo qi_in = in->info()->quantization_info().uniform(); + const UniformQuantizationInfo qi_out = out->info()->quantization_info().uniform(); + const auto min_clamped_value = vdupq_n_f32((-128 - qi_out.offset) * qi_out.scale); + const auto max_clamped_value = vdupq_n_f32((127 - qi_out.offset) * qi_out.scale); + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(in, win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + int8x16_t vout; + auto output_ptr = reinterpret_cast(output.ptr()); + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto vconst_0_f32 = vdupq_n_f32(0); + auto clamped_value = (op == ElementWiseUnary::LOG) ? 
min_clamped_value : max_clamped_value; + + int x = window_start_x; + for (; x <= window_end_x - window_step_x; x += window_step_x) + { + const auto vin = wrapper::vloadq(input_ptr + x); + + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + + // Perform activation + float32x4x4_t vtmp_deq = {{ + elementwise_op_imp(op, vin_deq.val[0]), + elementwise_op_imp(op, vin_deq.val[1]), + elementwise_op_imp(op, vin_deq.val[2]), + elementwise_op_imp(op, vin_deq.val[3]), + }}; + + if ((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT)) + { + vtmp_deq.val[0] = + vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]); + vtmp_deq.val[1] = + vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]); + vtmp_deq.val[2] = + vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]); + vtmp_deq.val[3] = + vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]); + } + + // Re-quantize to new output space + vout = vquantize_signed(vtmp_deq, qi_out); + wrapper::vstore(output_ptr + x, vout); + } + for (; x < window_end_x; ++x) + { + qasymm8_signed_t in = *(reinterpret_cast(input_ptr + x)); + qasymm8_signed_t tmp = 0; + float tmp_f = dequantize_qasymm8_signed(in, qi_in); + if (tmp_f <= 0.0) + { + if (op == ElementWiseUnary::LOG) + { + tmp_f = (-128 - qi_out.offset) * qi_out.scale; + } + else if (op == ElementWiseUnary::RSQRT) + { + tmp_f = (127 - qi_out.offset) * qi_out.scale; + } + else + { + tmp_f = elementwise_op_scalar_imp(op, tmp_f); + } + } + else + { + tmp_f = elementwise_op_scalar_imp(op, tmp_f); + } + tmp = quantize_qasymm8_signed( + tmp_f, qi_out, + RoundingPolicy:: + TO_ZERO); // Set rounding policy TO_ZERO to be compatible with vquantize_signed() used above that follow same policy for armv7a. + // For aarch64 LUT is used and rounding to nearest is used + *(output_ptr + x) = tmp; + } + }, + input, output); +} +template <> +inline void elementwise_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) +{ + const int window_step_x = 16; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const UniformQuantizationInfo qi_in = in->info()->quantization_info().uniform(); + const UniformQuantizationInfo qi_out = out->info()->quantization_info().uniform(); + const auto vconst_0_f32 = vdupq_n_f32(0); + const auto min_clamped_value = vdupq_n_f32((0 - qi_out.offset) * qi_out.scale); + const auto max_clamped_value = vdupq_n_f32((255 - qi_out.offset) * qi_out.scale); + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(in, win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + uint8x16_t vout; + auto clamped_value = (op == ElementWiseUnary::LOG) ? 
min_clamped_value : max_clamped_value; + auto output_ptr = reinterpret_cast(output.ptr()); + const auto input_ptr = reinterpret_cast(input.ptr()); + int x = window_start_x; + for (; x <= window_end_x - window_step_x; x += window_step_x) + { + const auto vin = wrapper::vloadq(input_ptr + x); + + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + + // Perform activation + float32x4x4_t vtmp_deq = {{ + elementwise_op_imp(op, vin_deq.val[0]), + elementwise_op_imp(op, vin_deq.val[1]), + elementwise_op_imp(op, vin_deq.val[2]), + elementwise_op_imp(op, vin_deq.val[3]), + }}; + if ((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT)) + { + vtmp_deq.val[0] = + vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]); + vtmp_deq.val[1] = + vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]); + vtmp_deq.val[2] = + vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]); + vtmp_deq.val[3] = + vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]); + } + + // Re-quantize to new output space + vout = vquantize(vtmp_deq, qi_out); + wrapper::vstore(output_ptr + x, vout); + } + for (; x < window_end_x; ++x) + { + qasymm8_t in = *(reinterpret_cast(input_ptr + x)); + qasymm8_t tmp = 0; + float tmp_f = dequantize_qasymm8(in, qi_in); + if (tmp_f <= 0.0) + { + if (op == ElementWiseUnary::LOG) + { + tmp_f = (0 - qi_out.offset) * qi_out.scale; + } + else if (op == ElementWiseUnary::RSQRT) + { + tmp_f = (255 - qi_out.offset) * qi_out.scale; + } + else + { + tmp_f = elementwise_op_scalar_imp(op, tmp_f); + } + } + else + { + tmp_f = elementwise_op_scalar_imp(op, tmp_f); + } + tmp = quantize_qasymm8(tmp_f, qi_out, RoundingPolicy::TO_ZERO); + *(output_ptr + x) = tmp; + } + }, + input, output); +} } // namespace cpu } // namespace arm_compute -#endif // SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H \ No newline at end of file +#endif // SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp index dfe5e30035076f73451bc1432f5fc96c17b0a7c2..d4daad4ca65c6115ab43ccdc1add535af5a6e7a4 100644 --- a/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp @@ -22,16 +22,18 @@ * SOFTWARE. 
*/ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void neon_s32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +void neon_s32_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) { ARM_COMPUTE_UNUSED(lut); return elementwise_op(in, out, window, op); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp index 08bb7f28b6b9c5f96cd91d7f024a533c5d152cfb..38cb61d0ff3f79de42ffad989c49202a7785b56b 100644 --- a/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/lut/list.h" namespace arm_compute @@ -32,24 +33,28 @@ namespace cpu #ifdef __aarch64__ -void neon_q8_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +void neon_q8_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) { ARM_COMPUTE_UNUSED(op); - auto win = window; + auto win = window; const auto window_end_x = window.x().end(); win.set(0, Window::Dimension(0, 1, 1)); Iterator src_it(in, win); Iterator dst_it(out, win); - execute_window_loop(win, [&](const Coordinates &) { - const auto src_ptr = src_it.ptr(); - auto dst_ptr = dst_it.ptr(); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = src_it.ptr(); + auto dst_ptr = dst_it.ptr(); - lut_u8_neon(lut, 1, window_end_x, &src_ptr, &dst_ptr); - }, - src_it, dst_it); + lut_u8_neon(lut, 1, window_end_x, &src_ptr, &dst_ptr); + }, + src_it, dst_it); } #endif // __aarch64__ diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp index d987f7747b9cb2ca08aa4f163dd337df674da097..3e4b88eb476cab2591bd936e88faa7746c710473 100644 --- a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Window.h" + #include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h" namespace arm_compute @@ -31,7 +32,8 @@ namespace cpu { #ifndef __aarch64__ // Fallback function to be used for armv7a, for aarch64 LUT is used -void neon_qasymm8_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +void neon_qasymm8_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) { ARM_COMPUTE_UNUSED(lut); return elementwise_op(in, out, window, op); diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp index e00970a1e0154653228a4ed665da9ca354a51c34..a5f4b053e3bed2c785f5a2fdc6f997cb58d3d19a 100644 --- a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Window.h" + #include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h" namespace arm_compute @@ -31,7 +32,8 @@ namespace cpu { #ifndef __aarch64__ // Fallback function to be used for armv7a, for 
aarch64 LUT is used -void neon_qasymm8_signed_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +void neon_qasymm8_signed_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) { ARM_COMPUTE_UNUSED(lut); return elementwise_op(in, out, window, op); diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp index a883309b2ecb2ee55d945388b45b68c8808b2228..22ff43c5d956f26067bb627f3ccd0891fd72914a 100644 --- a/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp @@ -23,6 +23,7 @@ */ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) #include "arm_compute/core/Helpers.h" + #include "src/cpu/CpuTypes.h" #include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h" @@ -30,11 +31,12 @@ namespace arm_compute { namespace cpu { -void sve_fp16_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +void sve_fp16_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) { ARM_COMPUTE_UNUSED(lut); return elementwise_sve_op(in, out, window, op); } -} +} // namespace cpu } // namespace arm_compute #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp index b21ed8ddbc9d5b99852206b457c793bcd97e1fe5..394bd47adf4df2fb95a25ae89b2d08271910692d 100644 --- a/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/CpuTypes.h" #include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h" @@ -30,10 +31,11 @@ namespace arm_compute { namespace cpu { -void sve_fp32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +void sve_fp32_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) { ARM_COMPUTE_UNUSED(lut); return elementwise_sve_op(in, out, window, op); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp index a948862906b5c4ca4cc430c777b75c9529756834..5af534d9e7935a77ba9418fa8482757ffcbdb9c7 100644 --- a/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp @@ -24,6 +24,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/utils/misc/Traits.h" + #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" namespace arm_compute @@ -31,9 +32,10 @@ namespace arm_compute namespace cpu { template -inline typename std::enable_if::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) +inline typename std::enable_if::value, VectorType>::type +elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) { - switch(op) + switch (op) { case ElementWiseUnary::RSQRT: return svinvsqrt(pg, a); @@ -55,9 +57,10 @@ inline typename std::enable_if::val } template -inline typename std::enable_if::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, 
ElementWiseUnary op, const VectorType &a) +inline typename std::enable_if::value, VectorType>::type +elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) { - switch(op) + switch (op) { case ElementWiseUnary::NEG: return svneg_z(pg, a); @@ -81,23 +84,24 @@ void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, E Iterator input(in, win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input_ptr = reinterpret_cast(input.ptr()); - int x = window_start_x; - - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &) { - const auto vin = svld1(pg, input_ptr + x); - svst1(pg, output_ptr + x, elementwise_op_sve_imp(pg, op, vin)); - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input, output); + auto output_ptr = reinterpret_cast(output.ptr()); + const auto input_ptr = reinterpret_cast(input.ptr()); + int x = window_start_x; + + svbool_t pg = wrapper::svwhilelt(x, window_end_x); + do + { + const auto vin = svld1(pg, input_ptr + x); + svst1(pg, output_ptr + x, elementwise_op_sve_imp(pg, op, vin)); + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input, output); } template void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp index 068c3f7cdab9522ef31d1c5705767b04a7c5a42d..e27fe5a87f51ffc7b62988ec6619026cbd72316c 100644 --- a/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp @@ -23,16 +23,18 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h" namespace arm_compute { namespace cpu { -void sve_s32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +void sve_s32_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) { ARM_COMPUTE_UNUSED(lut); return elementwise_sve_op(in, out, window, op); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp b/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp index 7e32f5013226369490ec2da482dc7d7e10bd1dee..4e4582debb40c8faa6c0bd88e9e55210d1bfa8da 100644 --- a/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp @@ -23,13 +23,15 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/lut/list.h" namespace arm_compute { namespace cpu { -void sve2_q8_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +void sve2_q8_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) { ARM_COMPUTE_UNUSED(op); @@ -40,14 +42,16 @@ void sve2_q8_elementwise_unary(const ITensor *in, ITensor *out, const Window &wi Iterator src_it(in, win); Iterator dst_it(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = src_it.ptr(); - auto dst_ptr = dst_it.ptr(); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto 
src_ptr = src_it.ptr(); + auto dst_ptr = dst_it.ptr(); - lut_u8_sve2(lut, 1, window_end_x, &src_ptr, &dst_ptr); - }, - src_it, dst_it); + lut_u8_sve2(lut, 1, window_end_x, &src_ptr, &dst_ptr); + }, + src_it, dst_it); } } // namespace cpu diff --git a/src/cpu/kernels/floor/list.h b/src/cpu/kernels/floor/list.h index 4367e0ffc908f22179b5ed89b9a51a852cc7072c..5ac78df3243c1faf0f01e9ae6cdaa00a127523aa 100644 --- a/src/cpu/kernels/floor/list.h +++ b/src/cpu/kernels/floor/list.h @@ -28,8 +28,7 @@ namespace arm_compute { namespace cpu { -#define DECLARE_FLOOR_KERNEL(func_name) \ - void func_name(const void *src, void *dst, int len) +#define DECLARE_FLOOR_KERNEL(func_name) void func_name(const void *src, void *dst, int len) DECLARE_FLOOR_KERNEL(fp16_neon_floor); DECLARE_FLOOR_KERNEL(fp32_neon_floor); diff --git a/src/cpu/kernels/floor/neon/fp16.cpp b/src/cpu/kernels/floor/neon/fp16.cpp index f362676a36fa5770c2bdb241ece5c0e707928338..f47690277db84ae265a31ce953437980baaaaa34 100644 --- a/src/cpu/kernels/floor/neon/fp16.cpp +++ b/src/cpu/kernels/floor/neon/fp16.cpp @@ -45,14 +45,14 @@ void fp16_neon_floor(const void *src, void *dst, int len) auto psrc = static_cast(src); auto pdst = static_cast<__fp16 *>(dst); - for(; len >= step; len -= step) + for (; len >= step; len -= step) { vst1q_f16(pdst, vfloorq_f16(vld1q_f16(psrc))); psrc += step; pdst += step; } - for(; len > 0; --len) + for (; len > 0; --len) { *pdst = std::floor(*psrc); ++psrc; diff --git a/src/cpu/kernels/floor/neon/fp32.cpp b/src/cpu/kernels/floor/neon/fp32.cpp index f5efb2e8494e7292e8f1ac8c6a6f009c48b8bf3e..a86e24d3c33f75a8d2d4dd6a24b7a521bd95e1e5 100644 --- a/src/cpu/kernels/floor/neon/fp32.cpp +++ b/src/cpu/kernels/floor/neon/fp32.cpp @@ -43,14 +43,14 @@ void fp32_neon_floor(const void *src, void *dst, int len) auto psrc = static_cast(src); auto pdst = static_cast(dst); - for(; len >= step; len -= step) + for (; len >= step; len -= step) { vst1q_f32(pdst, vfloorq_f32(vld1q_f32(psrc))); psrc += step; pdst += step; } - for(; len > 0; --len) + for (; len > 0; --len) { *pdst = std::floor(*psrc); ++pdst; diff --git a/src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp b/src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp index a29ee762fc4b7644b4d3696f71fd98f0c70f8e7f..8f47ecba8ff8f7501c2bd0e69d1aa0aa6d7860fa 100644 --- a/src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp +++ b/src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -29,11 +29,34 @@ namespace arm_compute { namespace cpu { -void fused_batch_normalization_conv_f16(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) +void fused_batch_normalization_conv_f16(const ITensor *conv_weights, + const ITensor *conv_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) { - return fused_batch_normalization_conv(conv_weights, conv_bias, fused_weights, fused_bias, - bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window); + return fused_batch_normalization_conv(conv_weights, conv_bias, fused_weights, fused_bias, bn_mean, + bn_var, bn_beta, bn_gamma, epsilon, window); +} + +void fused_batch_normalization_dwc_nchw_f16(const ITensor *dwc_weights, + const ITensor *dwc_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) +{ + return fused_batch_normalization_dwc_nchw(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean, + bn_var, bn_beta, bn_gamma, epsilon, window); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp b/src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp index 076e97651d55c276a95f003a23588addca967039..3ca5b6977a828361345d551f7b6f0bbdf427ef00 100644 --- a/src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp +++ b/src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp @@ -28,11 +28,19 @@ namespace arm_compute { namespace cpu { -void fused_batch_normalization_conv_f32(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) +void fused_batch_normalization_conv_f32(const ITensor *conv_weights, + const ITensor *conv_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) { - return fused_batch_normalization_conv(conv_weights, conv_bias, fused_weights, fused_bias, - bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window); + return fused_batch_normalization_conv(conv_weights, conv_bias, fused_weights, fused_bias, bn_mean, + bn_var, bn_beta, bn_gamma, epsilon, window); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/fuse_batch_normalization/generic/impl.cpp b/src/cpu/kernels/fuse_batch_normalization/generic/impl.cpp deleted file mode 100644 index 3c6a2069ee31b8c9487e64b88246f841b6962b73..0000000000000000000000000000000000000000 --- a/src/cpu/kernels/fuse_batch_normalization/generic/impl.cpp +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) 2018-2022 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/cpu/kernels/fuse_batch_normalization/generic/impl.h" - -namespace arm_compute -{ -namespace cpu -{ -template -void fused_batch_normalization_conv(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) -{ - using ScalarType = T; - const int size = 16 / conv_weights->info()->element_size(); - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - - const bool run_in_place_weights = (fused_weights == nullptr) || (fused_weights == conv_weights); - const bool run_in_place_bias = (fused_bias == nullptr) || (conv_bias != nullptr && fused_bias == conv_bias); - - // Set build options - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = size; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Iterator conv_w_in(conv_weights, win); - Iterator conv_w_out(run_in_place_weights ? conv_weights : fused_weights, win); - - const auto conv_bias_in = (conv_bias != nullptr ? reinterpret_cast(conv_bias->ptr_to_element(Coordinates(0, 0))) : nullptr); - auto conv_bias_out = (run_in_place_bias ? conv_bias_in : reinterpret_cast(fused_bias->ptr_to_element(Coordinates(0, 0)))); - - const auto input_mean = reinterpret_cast(bn_mean->ptr_to_element(Coordinates(0, 0))); - const auto input_var = reinterpret_cast(bn_var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = (bn_gamma != nullptr) ? reinterpret_cast(bn_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = (bn_beta != nullptr) ? 
reinterpret_cast(bn_beta->ptr_to_element(Coordinates(0, 0))) : nullptr; - - auto mean_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); - auto var_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); - auto gamma_vec = wrapper::vdup_n(ScalarType(1), ExactTagType{}); - auto beta_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); - auto rvar_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); - const auto epsilon_vec = wrapper::vdup_n(ScalarType(epsilon), ExactTagType{}); - - auto mean = ScalarType(0.0); - auto var = ScalarType(0.0); - auto gamma = ScalarType(1.0); - auto beta = ScalarType(0.0); - auto conv_bias_in_scalar = ScalarType(0.0); - execute_window_loop(win, [&](const Coordinates & id) - { - var = input_var[id[3]]; - if(input_gamma != nullptr) - { - gamma = input_gamma[id[3]]; - } - - if((id[0] == 0) && (id[1] == 0) && (id[2] == 0)) - { - if(input_beta != nullptr) - { - beta = input_beta[id[3]]; - beta_vec = wrapper::vdup_n(beta, ExactTagType{}); - } - - // Construct vectors - mean = input_mean[id[3]]; - mean_vec = wrapper::vdup_n(mean, ExactTagType{}); - - if(conv_bias_in != nullptr) - { - conv_bias_in_scalar = conv_bias_in[id[3]]; - } - auto conv_bias_tmp_scalar = (conv_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon)); - conv_bias_out[id[3]] = (conv_bias_tmp_scalar * gamma) + beta; - } - - int x = window_start_x; - auto conv_w_in_ptr = reinterpret_cast(conv_w_in.ptr()); - auto conv_w_out_ptr = reinterpret_cast(conv_w_out.ptr()); - var_vec = wrapper::vdup_n(var, ExactTagType{}); - gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); - rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); - - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - auto wn = wrapper::vloadq(conv_w_in_ptr + x); - wn = wrapper::vmul(wn, rvar_vec); - wn = wrapper::vmul(wn, gamma_vec); - - // Store results - wrapper::vstore(conv_w_out_ptr + x, wn); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(conv_w_out_ptr + x) = *(conv_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma; - } - }, - conv_w_in, conv_w_out); -} - -template void fused_batch_normalization_conv(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window); - -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -template void fused_batch_normalization_conv(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window); -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ - -} // namespace cpu -} // namespace arm_compute diff --git a/src/cpu/kernels/fuse_batch_normalization/generic/impl.h b/src/cpu/kernels/fuse_batch_normalization/generic/impl.h index 979ea13842e4815591bfc4cc4e12b91d44fcd875..d807148e3784d68f3150b80b7a68ec6b6d749c49 100644 --- a/src/cpu/kernels/fuse_batch_normalization/generic/impl.h +++ b/src/cpu/kernels/fuse_batch_normalization/generic/impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,10 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef SRC_CORE_NEON_KERNELS_FUSE_BATCH_NORMALIZATION_GENERIC_IMPL_H -#define SRC_CORE_NEON_KERNELS_FUSE_BATCH_NORMALIZATION_GENERIC_IMPL_H +#ifndef ACL_SRC_CPU_KERNELS_FUSE_BATCH_NORMALIZATION_GENERIC_IMPL_H +#define ACL_SRC_CPU_KERNELS_FUSE_BATCH_NORMALIZATION_GENERIC_IMPL_H #include "arm_compute/core/Helpers.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute @@ -32,8 +33,231 @@ namespace arm_compute namespace cpu { template -void fused_batch_normalization_conv(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window); +void fused_batch_normalization_conv(const ITensor *conv_weights, + const ITensor *conv_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) +{ + using ScalarType = T; + const int size = 16 / conv_weights->info()->element_size(); + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; + + const bool run_in_place_weights = (fused_weights == nullptr) || (fused_weights == conv_weights); + const bool run_in_place_bias = (fused_bias == nullptr) || (conv_bias != nullptr && fused_bias == conv_bias); + + // Set build options + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = size; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + Iterator conv_w_in(conv_weights, win); + Iterator conv_w_out(run_in_place_weights ? conv_weights : fused_weights, win); + + const auto conv_bias_in = + (conv_bias != nullptr ? reinterpret_cast(conv_bias->ptr_to_element(Coordinates(0, 0))) : nullptr); + auto conv_bias_out = + (run_in_place_bias ? conv_bias_in + : reinterpret_cast(fused_bias->ptr_to_element(Coordinates(0, 0)))); + + const auto input_mean = reinterpret_cast(bn_mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast(bn_var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = (bn_gamma != nullptr) + ? reinterpret_cast(bn_gamma->ptr_to_element(Coordinates(0, 0))) + : nullptr; + const auto input_beta = (bn_beta != nullptr) + ? 
reinterpret_cast(bn_beta->ptr_to_element(Coordinates(0, 0))) + : nullptr; + + auto mean_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); + auto var_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); + auto gamma_vec = wrapper::vdup_n(ScalarType(1), ExactTagType{}); + auto beta_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); + auto rvar_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); + const auto epsilon_vec = wrapper::vdup_n(ScalarType(epsilon), ExactTagType{}); + + auto mean = ScalarType(0.0); + auto var = ScalarType(0.0); + auto gamma = ScalarType(1.0); + auto beta = ScalarType(0.0); + auto conv_bias_in_scalar = ScalarType(0.0); + execute_window_loop( + win, + [&](const Coordinates &id) + { + var = input_var[id[3]]; + if (input_gamma != nullptr) + { + gamma = input_gamma[id[3]]; + } + + if ((id[0] == 0) && (id[1] == 0) && (id[2] == 0)) + { + if (input_beta != nullptr) + { + beta = input_beta[id[3]]; + beta_vec = wrapper::vdup_n(beta, ExactTagType{}); + } + + // Construct vectors + mean = input_mean[id[3]]; + mean_vec = wrapper::vdup_n(mean, ExactTagType{}); + + if (conv_bias_in != nullptr) + { + conv_bias_in_scalar = conv_bias_in[id[3]]; + } + auto conv_bias_tmp_scalar = (conv_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon)); + conv_bias_out[id[3]] = (conv_bias_tmp_scalar * gamma) + beta; + } + + int x = window_start_x; + auto conv_w_in_ptr = reinterpret_cast(conv_w_in.ptr()); + auto conv_w_out_ptr = reinterpret_cast(conv_w_out.ptr()); + var_vec = wrapper::vdup_n(var, ExactTagType{}); + gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); + rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); + + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + auto wn = wrapper::vloadq(conv_w_in_ptr + x); + wn = wrapper::vmul(wn, rvar_vec); + wn = wrapper::vmul(wn, gamma_vec); + + // Store results + wrapper::vstore(conv_w_out_ptr + x, wn); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(conv_w_out_ptr + x) = *(conv_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma; + } + }, + conv_w_in, conv_w_out); } +template +void fused_batch_normalization_dwc_nchw(const ITensor *dwc_weights, + const ITensor *dwc_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) +{ + using ScalarType = T; + const int size = 16 / dwc_weights->info()->element_size(); + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; + + const bool run_in_place_weights = (fused_weights == nullptr) || (fused_weights == dwc_weights); + const bool run_in_place_bias = (fused_bias == nullptr) || (dwc_bias != nullptr && fused_bias == dwc_bias); + + // Set build options + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = size; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + Iterator dwc_w_in(dwc_weights, win); + Iterator dwc_w_out(run_in_place_weights ? dwc_weights : fused_weights, win); + + const auto dwc_bias_in = + (dwc_bias != nullptr ? reinterpret_cast(dwc_bias->ptr_to_element(Coordinates(0, 0))) : nullptr); + auto dwc_bias_out = + (run_in_place_bias ? 
dwc_bias_in + : reinterpret_cast(fused_bias->ptr_to_element(Coordinates(0, 0)))); + + const auto input_mean = reinterpret_cast(bn_mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast(bn_var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = (bn_gamma != nullptr) + ? reinterpret_cast(bn_gamma->ptr_to_element(Coordinates(0, 0))) + : nullptr; + const auto input_beta = (bn_beta != nullptr) + ? reinterpret_cast(bn_beta->ptr_to_element(Coordinates(0, 0))) + : nullptr; + + auto mean_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); + auto var_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); + auto gamma_vec = wrapper::vdup_n(ScalarType(1), ExactTagType{}); + auto beta_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); + auto rvar_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); + const auto epsilon_vec = wrapper::vdup_n(ScalarType(epsilon), ExactTagType{}); + + auto mean = ScalarType(0.0); + auto var = ScalarType(0.0); + auto gamma = ScalarType(1.0); + auto beta = ScalarType(0.0); + auto dwc_bias_in_scalar = ScalarType(0.0); + execute_window_loop( + win, + [&](const Coordinates &id) + { + var = input_var[id[2]]; + if (input_gamma != nullptr) + { + gamma = input_gamma[id[2]]; + } + + if (id[1] == 0) + { + mean = input_mean[id[2]]; + + // Construct vectors + mean_vec = wrapper::vdup_n(mean, ExactTagType{}); + if (input_beta != nullptr) + { + beta = input_beta[id[2]]; + beta_vec = wrapper::vdup_n(beta, ExactTagType{}); + } + + if (dwc_bias_in != nullptr) + { + dwc_bias_in_scalar = dwc_bias_in[id[2]]; + } + + auto dwc_bias_tmp_scalar = (dwc_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon)); + dwc_bias_out[id[2]] = (dwc_bias_tmp_scalar * gamma) + beta; + } + + int x = window_start_x; + auto dwc_w_in_ptr = reinterpret_cast(dwc_w_in.ptr()); + auto dwc_w_out_ptr = reinterpret_cast(dwc_w_out.ptr()); + var_vec = wrapper::vdup_n(var, ExactTagType{}); + gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); + rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); + + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + auto wn = wrapper::vloadq(dwc_w_in_ptr + x); + wn = wrapper::vmul(wn, rvar_vec); + wn = wrapper::vmul(wn, gamma_vec); + + // Store results + wrapper::vstore(dwc_w_out_ptr + x, wn); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dwc_w_out_ptr + x) = *(dwc_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma; + } + }, + dwc_w_in, dwc_w_out); } -#endif //SRC_CORE_NEON_KERNELS_FUSE_BATCH_NORMALIZATION_GENERIC_IMPL_H \ No newline at end of file + +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_FUSE_BATCH_NORMALIZATION_GENERIC_IMPL_H diff --git a/src/cpu/kernels/fuse_batch_normalization/list.h b/src/cpu/kernels/fuse_batch_normalization/list.h index e25b1e5fed38365d58812a10da8a7ec9190bedd7..a03dd74f786677764969254127358c00a7a2367b 100644 --- a/src/cpu/kernels/fuse_batch_normalization/list.h +++ b/src/cpu/kernels/fuse_batch_normalization/list.h @@ -30,15 +30,18 @@ namespace cpu { #define DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL(func_name) \ void func_name(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias, \ - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) + const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, \ + float epsilon, const Window &window) #define 
DECLARE_FUSE_BATCH_NORMALIZE_DWC_NCHW_CONV_KERNEL(func_name) \ void func_name(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, \ - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) + const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, \ + float epsilon, const Window &window) #define DECLARE_FUSE_BATCH_NORMALIZE_DWC_NHWC_CONV_KERNEL(func_name) \ void func_name(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, \ - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) + const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, \ + float epsilon, const Window &window) DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL(fused_batch_normalization_conv_f16); DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL(fused_batch_normalization_conv_f32); @@ -50,7 +53,7 @@ DECLARE_FUSE_BATCH_NORMALIZE_DWC_NCHW_CONV_KERNEL(fused_batch_normalization_dwc_ #undef DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL #undef DECLARE_FUSE_BATCH_NORMALIZE_DWC_NCHW_CONV_KERNEL #undef DECLARE_FUSE_BATCH_NORMALIZE_DWC_NHWC_CONV_KERNEL -} -} +} // namespace cpu +} // namespace arm_compute -#endif // \ No newline at end of file +#endif // diff --git a/src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp b/src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp index 1e3be8792ded5c46cee1884de30ebddfe630ca47..25580e1bec941a7b5b987a66a28b29816176ecc7 100644 --- a/src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp +++ b/src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022 Arm Limited. + * Copyright (c) 2018-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -28,118 +28,20 @@ namespace arm_compute { namespace cpu { -template -void fused_batch_normalization_dwc_nchw(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) +void fused_batch_normalization_dwc_nchw_f32(const ITensor *dwc_weights, + const ITensor *dwc_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) { - using ScalarType = T; - const int size = 16 / dwc_weights->info()->element_size(); - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - - const bool run_in_place_weights = (fused_weights == nullptr) || (fused_weights == dwc_weights); - const bool run_in_place_bias = (fused_bias == nullptr) || (dwc_bias != nullptr && fused_bias == dwc_bias); - - // Set build options - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = size; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Iterator dwc_w_in(dwc_weights, win); - Iterator dwc_w_out(run_in_place_weights ? dwc_weights : fused_weights, win); - - const auto dwc_bias_in = (dwc_bias != nullptr ? reinterpret_cast(dwc_bias->ptr_to_element(Coordinates(0, 0))) : nullptr); - auto dwc_bias_out = (run_in_place_bias ? 
dwc_bias_in : reinterpret_cast(fused_bias->ptr_to_element(Coordinates(0, 0)))); - - const auto input_mean = reinterpret_cast(bn_mean->ptr_to_element(Coordinates(0, 0))); - const auto input_var = reinterpret_cast(bn_var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = (bn_gamma != nullptr) ? reinterpret_cast(bn_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = (bn_beta != nullptr) ? reinterpret_cast(bn_beta->ptr_to_element(Coordinates(0, 0))) : nullptr; - - auto mean_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); - auto var_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); - auto gamma_vec = wrapper::vdup_n(ScalarType(1), ExactTagType{}); - auto beta_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); - auto rvar_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); - const auto epsilon_vec = wrapper::vdup_n(ScalarType(epsilon), ExactTagType{}); - - auto mean = ScalarType(0.0); - auto var = ScalarType(0.0); - auto gamma = ScalarType(1.0); - auto beta = ScalarType(0.0); - auto dwc_bias_in_scalar = ScalarType(0.0); - execute_window_loop(win, [&](const Coordinates & id) - { - var = input_var[id[2]]; - if(input_gamma != nullptr) - { - gamma = input_gamma[id[2]]; - } - - if(id[1] == 0) - { - mean = input_mean[id[2]]; - - // Construct vectors - mean_vec = wrapper::vdup_n(mean, ExactTagType{}); - if(input_beta != nullptr) - { - beta = input_beta[id[2]]; - beta_vec = wrapper::vdup_n(beta, ExactTagType{}); - } - - if(dwc_bias_in != nullptr) - { - dwc_bias_in_scalar = dwc_bias_in[id[2]]; - } - - auto dwc_bias_tmp_scalar = (dwc_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon)); - dwc_bias_out[id[2]] = (dwc_bias_tmp_scalar * gamma) + beta; - } - - int x = window_start_x; - auto dwc_w_in_ptr = reinterpret_cast(dwc_w_in.ptr()); - auto dwc_w_out_ptr = reinterpret_cast(dwc_w_out.ptr()); - var_vec = wrapper::vdup_n(var, ExactTagType{}); - gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); - rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); - - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - auto wn = wrapper::vloadq(dwc_w_in_ptr + x); - wn = wrapper::vmul(wn, rvar_vec); - wn = wrapper::vmul(wn, gamma_vec); - - // Store results - wrapper::vstore(dwc_w_out_ptr + x, wn); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dwc_w_out_ptr + x) = *(dwc_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma; - } - }, - dwc_w_in, dwc_w_out); -} - -void fused_batch_normalization_dwc_nchw_f32(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) -{ - return fused_batch_normalization_dwc_nchw(dwc_weights, dwc_bias, fused_weights, fused_bias, - bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window); -} - -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -void fused_batch_normalization_dwc_nchw_f16(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) -{ - return fused_batch_normalization_dwc_nchw(dwc_weights, dwc_bias, fused_weights, fused_bias, - bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window); + return fused_batch_normalization_dwc_nchw(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean, 
+ bn_var, bn_beta, bn_gamma, epsilon, window); } -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp index 275211ff3844c72b9fbe64f46e456ce8fb48a5bf..1d88d3b4946704481a73b0554b2718193ec502ea 100644 --- a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp +++ b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp @@ -30,11 +30,19 @@ namespace arm_compute { namespace cpu { -void fused_batch_normalization_dwc_nhwc_f16(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) +void fused_batch_normalization_dwc_nhwc_f16(const ITensor *dwc_weights, + const ITensor *dwc_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) { - return fused_batch_normalization_dwc_nhwc(dwc_weights, dwc_bias, fused_weights, fused_bias, - bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window); + return fused_batch_normalization_dwc_nhwc(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean, + bn_var, bn_beta, bn_gamma, epsilon, window); } } // namespace cpu diff --git a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp index 67169c53259050edb11ffe75e9a57bab5fe294b5..1f336bb1962e0eb6c07841ae201c64257c0a4cb3 100644 --- a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp +++ b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp @@ -29,11 +29,19 @@ namespace arm_compute { namespace cpu { -void fused_batch_normalization_dwc_nhwc_f32(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) +void fused_batch_normalization_dwc_nhwc_f32(const ITensor *dwc_weights, + const ITensor *dwc_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) { - return fused_batch_normalization_dwc_nhwc(dwc_weights, dwc_bias, fused_weights, fused_bias, - bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window); + return fused_batch_normalization_dwc_nhwc(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean, + bn_var, bn_beta, bn_gamma, epsilon, window); } } // namespace cpu diff --git a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.cpp b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.cpp deleted file mode 100644 index e33af4ebbeaf273da9c92e71dd102b438cf6af6b..0000000000000000000000000000000000000000 --- a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.cpp +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright (c) 2018-2022 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h" - -namespace arm_compute -{ -namespace cpu -{ -template -void fused_batch_normalization_dwc_nhwc(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) -{ - using ScalarType = T; - const int size = 16 / dwc_weights->info()->element_size(); - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - - const bool run_in_place_weights = (fused_weights == nullptr) || (fused_weights == dwc_weights); - const bool run_in_place_bias = (fused_bias == nullptr) || (dwc_bias != nullptr && fused_bias == dwc_bias); - - // Set build options - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = size; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Iterator dwc_w_in(dwc_weights, win); - Iterator dwc_w_out(run_in_place_weights ? dwc_weights : fused_weights, win); - - const auto dwc_bias_in = (dwc_bias != nullptr ? reinterpret_cast(dwc_bias->ptr_to_element(Coordinates(0, 0))) : nullptr); - auto dwc_bias_out = (run_in_place_bias ? dwc_bias_in : reinterpret_cast(fused_bias->ptr_to_element(Coordinates(0, 0)))); - - const auto input_mean = reinterpret_cast(bn_mean->ptr_to_element(Coordinates(0, 0))); - const auto input_var = reinterpret_cast(bn_var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = (bn_gamma != nullptr) ? reinterpret_cast(bn_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = (bn_beta != nullptr) ? 
reinterpret_cast(bn_beta->ptr_to_element(Coordinates(0, 0))) : nullptr; - - auto mean_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); - auto var_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); - auto gamma_vec = wrapper::vdup_n(ScalarType(1), ExactTagType{}); - auto beta_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); - auto rvar_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); - auto dwc_bias_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); - const auto epsilon_vec = wrapper::vdup_n(ScalarType(epsilon), ExactTagType{}); - - auto gamma = ScalarType(1.0); - auto beta = ScalarType(0.0); - auto dwc_bias_in_scalar = ScalarType(0); - - execute_window_loop(win, [&](const Coordinates & id) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - var_vec = wrapper::vloadq(input_var + x); - if(input_gamma != nullptr) - { - gamma_vec = wrapper::vloadq(input_gamma + x); - } - - if((id[2] == 0) && (id[1] == 0)) - { - mean_vec = wrapper::vloadq(input_mean + x); - - // Construct vectors - if(input_beta != nullptr) - { - beta_vec = wrapper::vloadq(input_beta + x); - } - - if(dwc_bias_in != nullptr) - { - dwc_bias_vec = wrapper::vloadq(dwc_bias_in + x); - } - - auto dwc_bias_tmp_vec = wrapper::vmul(wrapper::vsub(dwc_bias_vec, mean_vec), wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec))); - dwc_bias_tmp_vec = wrapper::vadd(wrapper::vmul(dwc_bias_tmp_vec, gamma_vec), beta_vec); - wrapper::vstore(dwc_bias_out + x, dwc_bias_tmp_vec); - } - - auto dwc_w_in_ptr = reinterpret_cast(dwc_w_in.ptr()); - auto dwc_w_out_ptr = reinterpret_cast(dwc_w_out.ptr()); - - auto wn = wrapper::vloadq(dwc_w_in_ptr + x); - rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); - wn = wrapper::vmul(wn, rvar_vec); - wn = wrapper::vmul(wn, gamma_vec); - - // Store results - wrapper::vstore(dwc_w_out_ptr + x, wn); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - auto var = input_var[x]; - if(input_gamma != nullptr) - { - gamma = input_gamma[x]; - } - - if(id[2] == 0 && id[1] == 0) - { - auto mean = input_mean[x]; - if(input_beta != nullptr) - { - beta = input_beta[x]; - } - if(dwc_bias_in != nullptr) - { - dwc_bias_in_scalar = dwc_bias_in[x]; - } - - auto dwc_bias_tmp_scalar = (dwc_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon)); - dwc_bias_out[x] = (dwc_bias_tmp_scalar * gamma) + beta; - } - - const auto dwc_w_in_ptr = reinterpret_cast(dwc_w_in.ptr()); - auto dwc_w_out_ptr = reinterpret_cast(dwc_w_out.ptr()); - - *(dwc_w_out_ptr + x) = *(dwc_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma; - } - }, - dwc_w_in, dwc_w_out); -} - -template void fused_batch_normalization_dwc_nhwc(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window); - -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -template void fused_batch_normalization_dwc_nhwc(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window); -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ - -} // namespace cpu -} // namespace arm_compute diff --git a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h 
b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h index 3b813132b1ce1311e022f71d1302afe710539e52..5b74a7aef690574b00bb211ce86c1773d124f3c4 100644 --- a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h +++ b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2018-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,6 +25,7 @@ #define SRC_CORE_NEON_KERNELS_FUSE_BATCH_NORMALIZATION_IMPL_H #include "arm_compute/core/Helpers.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute @@ -32,9 +33,141 @@ namespace arm_compute namespace cpu { template -void fused_batch_normalization_dwc_nhwc(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window); +void fused_batch_normalization_dwc_nhwc(const ITensor *dwc_weights, + const ITensor *dwc_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) +{ + using ScalarType = T; + const int size = 16 / dwc_weights->info()->element_size(); + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; + + const bool run_in_place_weights = (fused_weights == nullptr) || (fused_weights == dwc_weights); + const bool run_in_place_bias = (fused_bias == nullptr) || (dwc_bias != nullptr && fused_bias == dwc_bias); + + // Set build options + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = size; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + Iterator dwc_w_in(dwc_weights, win); + Iterator dwc_w_out(run_in_place_weights ? dwc_weights : fused_weights, win); + + const auto dwc_bias_in = + (dwc_bias != nullptr ? reinterpret_cast(dwc_bias->ptr_to_element(Coordinates(0, 0))) : nullptr); + auto dwc_bias_out = + (run_in_place_bias ? dwc_bias_in + : reinterpret_cast(fused_bias->ptr_to_element(Coordinates(0, 0)))); + + const auto input_mean = reinterpret_cast(bn_mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast(bn_var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = (bn_gamma != nullptr) + ? reinterpret_cast(bn_gamma->ptr_to_element(Coordinates(0, 0))) + : nullptr; + const auto input_beta = (bn_beta != nullptr) + ? 
reinterpret_cast(bn_beta->ptr_to_element(Coordinates(0, 0))) + : nullptr; + + auto mean_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); + auto var_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); + auto gamma_vec = wrapper::vdup_n(ScalarType(1), ExactTagType{}); + auto beta_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); + auto rvar_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); + auto dwc_bias_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); + const auto epsilon_vec = wrapper::vdup_n(ScalarType(epsilon), ExactTagType{}); + + auto gamma = ScalarType(1.0); + auto beta = ScalarType(0.0); + auto dwc_bias_in_scalar = ScalarType(0); + + execute_window_loop( + win, + [&](const Coordinates &id) + { + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + var_vec = wrapper::vloadq(input_var + x); + if (input_gamma != nullptr) + { + gamma_vec = wrapper::vloadq(input_gamma + x); + } + + if ((id[2] == 0) && (id[1] == 0)) + { + mean_vec = wrapper::vloadq(input_mean + x); + + // Construct vectors + if (input_beta != nullptr) + { + beta_vec = wrapper::vloadq(input_beta + x); + } + + if (dwc_bias_in != nullptr) + { + dwc_bias_vec = wrapper::vloadq(dwc_bias_in + x); + } + + auto dwc_bias_tmp_vec = wrapper::vmul(wrapper::vsub(dwc_bias_vec, mean_vec), + wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec))); + dwc_bias_tmp_vec = wrapper::vadd(wrapper::vmul(dwc_bias_tmp_vec, gamma_vec), beta_vec); + wrapper::vstore(dwc_bias_out + x, dwc_bias_tmp_vec); + } + + auto dwc_w_in_ptr = reinterpret_cast(dwc_w_in.ptr()); + auto dwc_w_out_ptr = reinterpret_cast(dwc_w_out.ptr()); + + auto wn = wrapper::vloadq(dwc_w_in_ptr + x); + rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); + wn = wrapper::vmul(wn, rvar_vec); + wn = wrapper::vmul(wn, gamma_vec); + + // Store results + wrapper::vstore(dwc_w_out_ptr + x, wn); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + auto var = input_var[x]; + if (input_gamma != nullptr) + { + gamma = input_gamma[x]; + } + + if (id[2] == 0 && id[1] == 0) + { + auto mean = input_mean[x]; + if (input_beta != nullptr) + { + beta = input_beta[x]; + } + if (dwc_bias_in != nullptr) + { + dwc_bias_in_scalar = dwc_bias_in[x]; + } + + auto dwc_bias_tmp_scalar = (dwc_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon)); + dwc_bias_out[x] = (dwc_bias_tmp_scalar * gamma) + beta; + } + + const auto dwc_w_in_ptr = reinterpret_cast(dwc_w_in.ptr()); + auto dwc_w_out_ptr = reinterpret_cast(dwc_w_out.ptr()); + *(dwc_w_out_ptr + x) = *(dwc_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma; + } + }, + dwc_w_in, dwc_w_out); +} } // namespace cpu } // namespace arm_compute #endif //SRC_CORE_NEON_KERNELS_FUSE_BATCH_NORMALIZATION_IMPL_H diff --git a/src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp b/src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp index 2d61b72078232778257613b95f1986c67bdcf50d..4d7507a5da959b4776c93f4569fba96ceb8bd920 100644 --- a/src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp +++ b/src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. 
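In the NHWC template just above, the channel dimension is the innermost (x) axis, so the batch-norm statistics are loaded as vectors (wrapper::vloadq(input_var + x) and friends) and every weight element at offset x is normalised with the statistics of channel x; the NCHW variants instead broadcast one scalar per id[2]/id[3] plane. The leftover loop reduces to roughly this scalar form, assuming plain pointers for illustration:

    #include <cmath>

    // NHWC leftover path: channel index == innermost offset x. gamma may be null (treated as 1).
    inline void fuse_weights_nhwc_tail(const float *w_in, float *w_out,
                                       const float *var, const float *gamma,
                                       float eps, int start_x, int end_x)
    {
        for (int x = start_x; x < end_x; ++x)
        {
            const float g = (gamma != nullptr) ? gamma[x] : 1.f;
            w_out[x] = w_in[x] / std::sqrt(var[x] + eps) * g;
        }
    }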
* * SPDX-License-Identifier: MIT * @@ -25,10 +25,57 @@ #include "src/cpu/kernels/gemm_matrix_add/generic/neon/impl.h" +#include + namespace arm_compute { namespace cpu { +namespace +{ +void matrix_addition_f16(const ITensor *src, ITensor *dst, const Window &window, float beta) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + const float16x8_t beta_f16 = vdupq_n_f16(beta); + + constexpr int window_step_x = 16; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + Window win = window.collapse_if_possible(window, Window::DimZ); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win); + Iterator out(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast(in.ptr()); + const auto out_ptr = reinterpret_cast(out.ptr()); + + int x = window_start_x; + for (; x < (window_end_x - window_step_x); x += window_step_x) + { + float16x8x2_t alpha_ab = vld2q_f16(out_ptr + x); + const float16x8x2_t c = vld2q_f16(in_ptr + x); + // Multiply matrix C by its weight and accumulate + alpha_ab.val[0] = vaddq_f16(alpha_ab.val[0], vmulq_f16(c.val[0], beta_f16)); + alpha_ab.val[1] = vaddq_f16(alpha_ab.val[1], vmulq_f16(c.val[1], beta_f16)); + + vst2q_f16(out_ptr + x, alpha_ab); + } + + // Left-over loop + for (; x < window_end_x; ++x) + { + *(out_ptr + x) += *(in_ptr + x) * static_cast(beta); + } + }, + in, out); +} +} // namespace void neon_fp16_gemm_matrix_add(const ITensor *src, ITensor *dst, const Window &window, float beta) { return matrix_addition_f16(src, dst, window, beta); diff --git a/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp b/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp index 675ed1bbcbf8a426a3a2f903f23347d6b699590e..47de0f3928289ef496afc3e02fe722397d43859d 100644 --- a/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp +++ b/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2022 Arm Limited. + * Copyright (c) 2016-2023 Arm Limited. 
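matrix_addition_f16 above, like the f32 version that follows, implements the beta term of a GEMM: dst += beta * C, vectorised with vld2q_f16 (two 8-lane registers, i.e. 16 half-precision values per iteration). A scalar reference of the same accumulation, with assumed plain-array parameters rather than ITensor:

    #include <cstddef>

    // dst[i] += beta * c[i] — the beta*C contribution added on top of an existing alpha*A*B result.
    void matrix_addition_ref(const float *c, float *dst, std::size_t n, float beta)
    {
        for (std::size_t i = 0; i < n; ++i)
        {
            dst[i] += c[i] * beta;
        }
    }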
* * SPDX-License-Identifier: MIT * @@ -23,6 +23,7 @@ */ #include "src/cpu/kernels/gemm_matrix_add/generic/neon/impl.h" + #include namespace arm_compute @@ -44,76 +45,35 @@ void matrix_addition_f32(const ITensor *src, ITensor *dst, const Window &window, Iterator in(src, win); Iterator out(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()); - - int x = window_start_x; - for(; x < (window_end_x - window_step_x); x += window_step_x) - { - float32x4x4_t alpha_ab = vld4q_f32(out_ptr + x); - const float32x4x4_t c = vld4q_f32(in_ptr + x); - - // Multiply matrix C by its weight and accumulate - alpha_ab.val[0] = vmlaq_f32(alpha_ab.val[0], c.val[0], beta_f32); - alpha_ab.val[1] = vmlaq_f32(alpha_ab.val[1], c.val[1], beta_f32); - alpha_ab.val[2] = vmlaq_f32(alpha_ab.val[2], c.val[2], beta_f32); - alpha_ab.val[3] = vmlaq_f32(alpha_ab.val[3], c.val[3], beta_f32); - - vst4q_f32(out_ptr + x, alpha_ab); - } - - // Left-over loop - for(; x < window_end_x; ++x) - { - *(out_ptr + x) += *(in_ptr + x) * beta; - } - }, - in, out); -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -void matrix_addition_f16(const ITensor *src, ITensor *dst, const Window &window, float beta) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - const float16x8_t beta_f16 = vdupq_n_f16(beta); - - constexpr int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win = window.collapse_if_possible(window, Window::DimZ); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win); - Iterator out(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()); - - int x = window_start_x; - for(; x < (window_end_x - window_step_x); x += window_step_x) - { - float16x8x2_t alpha_ab = vld2q_f16(out_ptr + x); - const float16x8x2_t c = vld2q_f16(in_ptr + x); - // Multiply matrix C by its weight and accumulate - alpha_ab.val[0] = vaddq_f16(alpha_ab.val[0], vmulq_f16(c.val[0], beta_f16)); - alpha_ab.val[1] = vaddq_f16(alpha_ab.val[1], vmulq_f16(c.val[1], beta_f16)); - - vst2q_f16(out_ptr + x, alpha_ab); - } - - // Left-over loop - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - *(out_ptr + x) += *(in_ptr + x) * static_cast(beta); - } - }, - in, out); + const auto in_ptr = reinterpret_cast(in.ptr()); + const auto out_ptr = reinterpret_cast(out.ptr()); + + int x = window_start_x; + for (; x < (window_end_x - window_step_x); x += window_step_x) + { + float32x4x4_t alpha_ab = vld4q_f32(out_ptr + x); + const float32x4x4_t c = vld4q_f32(in_ptr + x); + + // Multiply matrix C by its weight and accumulate + alpha_ab.val[0] = vmlaq_f32(alpha_ab.val[0], c.val[0], beta_f32); + alpha_ab.val[1] = vmlaq_f32(alpha_ab.val[1], c.val[1], beta_f32); + alpha_ab.val[2] = vmlaq_f32(alpha_ab.val[2], c.val[2], beta_f32); + alpha_ab.val[3] = vmlaq_f32(alpha_ab.val[3], c.val[3], beta_f32); + + vst4q_f32(out_ptr + x, alpha_ab); + } + + // Left-over loop + for (; x < window_end_x; ++x) + { + *(out_ptr + x) += *(in_ptr + x) * beta; + } + }, + in, out); } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.h b/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.h index 
ff35f28b112a5102b99576d8948364d769a6c5ff..26ac99b4832ffeb3040d2c51ef270d5a11df53c1 100644 --- a/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.h +++ b/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -30,10 +30,6 @@ namespace arm_compute { namespace cpu { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -void matrix_addition_f16(const ITensor *src, ITensor *dst, const Window &window, float beta); -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - void matrix_addition_f32(const ITensor *src, ITensor *dst, const Window &window, float beta); } // namespace cpu diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp index 1bd5a57fab8c689957559b8d0f0584e6c527f564..60fda511e3a4165efc81c3bd97033b384ca32762 100644 --- a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp +++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,16 +23,396 @@ */ #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#include "src/core/utils/helpers/float_ops.h" #include "src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h" +#include + namespace arm_compute { namespace cpu { -void neon_fp16_gemm_matrix_mul(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha, const bool is_dst_vector) +void vector_matrix_multiply_f16( + const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) +{ + const auto width_matrix_b = static_cast(dst->info()->dimension(0)); + const auto in_b_stride = static_cast(rhs->info()->strides_in_bytes()[1] / rhs->info()->element_size()); + const auto num_elems_vec_a = static_cast(lhs->info()->dimension(0)); + + // The implementation computes 32 elements per iteration + const int window_start_x = 32 * info.thread_id; + const int window_step_x = 32 * info.num_threads; + const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x; + ARM_COMPUTE_ERROR_ON_MSG((window_end_x - window_start_x) % window_step_x, + " (window_end_x - window_start_x) must be multiple of window_step_x"); + + Window win_out(window); + win_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_out.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Window win_a(window); + win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_a.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Window win_b; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + if (rhs->info()->num_dimensions() >= 3) + { + win_b = window; + } + win_b.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_b.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Iterator ina(lhs, win_a); + Iterator inb(rhs, win_b); + Iterator out(dst, win_out); + + const bool multiply_alpha = !(helpers::float_ops::is_one(alpha)); + + const float16x8_t alpha_f16 = vdupq_n_f16(alpha); + + execute_window_loop( + win_out, + [&](const Coordinates &) + { + int x = window_start_x; + // Here we don't check for x lower equal than (window_end_x - window_step_x) because of + // window_end_x is computed above which may cause out-of-bound writes to the 
dst. + for (; x < (window_end_x - window_step_x); x += window_step_x) + { + if (x > width_matrix_b) + { + return; + } + + auto matrix_b = reinterpret_cast(inb.ptr()) + x; + + float16x8_t acc0 = vdupq_n_f16(0.f); + float16x8_t acc1 = vdupq_n_f16(0.f); + float16x8_t acc2 = vdupq_n_f16(0.f); + float16x8_t acc3 = vdupq_n_f16(0.f); + + auto vec_a = reinterpret_cast(ina.ptr()); + const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a; + for (; vec_a <= (vec_a_end_addr - 4);) + { + const float16x4_t a0l = vld1_f16(vec_a); + + float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); + float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); + float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); + float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); + float16x8_t b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride); + float16x8_t b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride); + float16x8_t b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride); + float16x8_t b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride); + + acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 0)); + acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 0)); + acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 0)); + acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 0)); + acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 1)); + acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 1)); + acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 1)); + acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 1)); + + matrix_b += 2 * in_b_stride; + + b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); + b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); + b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); + b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); + b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride); + b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride); + b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride); + b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride); + + acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 2)); + acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 2)); + acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 2)); + acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 2)); + acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 3)); + acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 3)); + acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 3)); + acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 3)); + + vec_a += 4; + matrix_b += 2 * in_b_stride; + } + + for (; vec_a < vec_a_end_addr; ++vec_a) + { + const float16_t a0 = *vec_a; + const float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); + const float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); + const float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); + const float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); + + acc0 = vaddq_f16(acc0, vmulq_n_f16(b00, a0)); + acc1 = vaddq_f16(acc1, vmulq_n_f16(b01, a0)); + acc2 = vaddq_f16(acc2, vmulq_n_f16(b02, a0)); + acc3 = vaddq_f16(acc3, vmulq_n_f16(b03, a0)); + + matrix_b += in_b_stride; + } + + // Multiply by the weight of matrix product (alpha) + if (multiply_alpha) + { + acc0 = vmulq_f16(acc0, alpha_f16); + acc1 = vmulq_f16(acc1, alpha_f16); + acc2 = vmulq_f16(acc2, alpha_f16); + acc3 = vmulq_f16(acc3, alpha_f16); + } + + auto vec_out = reinterpret_cast(out.ptr()) + x; + + vst1q_f16(vec_out + 0, acc0); + vst1q_f16(vec_out + 8, acc1); + vst1q_f16(vec_out + 16, acc2); + vst1q_f16(vec_out + 24, acc3); + } + + for (; x < window_end_x; ++x) + { + if (x > width_matrix_b) + { + return; + } + + auto matrix_b = reinterpret_cast(inb.ptr()) + 
x; + + float16x4_t vacc = vdup_n_f16(0.f); + + auto vec_a = reinterpret_cast(ina.ptr()); + const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a; + for (; vec_a <= (vec_a_end_addr - 4); vec_a += 4) + { + const float16x4_t a0l = vld1_f16(vec_a); + + const float16x4_t b_col = { + *(matrix_b + 0 * in_b_stride), + *(matrix_b + 1 * in_b_stride), + *(matrix_b + 2 * in_b_stride), + *(matrix_b + 3 * in_b_stride), + }; + + vacc = vadd_f16(vacc, vmul_f16(a0l, b_col)); + + matrix_b += 4 * in_b_stride; + } + + float16_t acc = + vget_lane_f16(vacc, 0) + vget_lane_f16(vacc, 1) + vget_lane_f16(vacc, 2) + vget_lane_f16(vacc, 3); + + for (; vec_a < vec_a_end_addr; ++vec_a) + { + const float16_t a0 = *vec_a; + const float16_t b00 = *matrix_b; + + acc += b00 * a0; + + matrix_b += in_b_stride; + } + + // Multiply by the weight of matrix product (alpha) + if (multiply_alpha) + { + acc *= static_cast(alpha); + } + + auto vec_out = reinterpret_cast(out.ptr()) + x; + + *(vec_out) = acc; + } + }, + ina, inb, out); +} + +void matrix_matrix_multiply_f16( + const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) +{ + ARM_COMPUTE_UNUSED(info); + const int out_width = static_cast(dst->info()->dimension(0)); + const int out_height = static_cast(dst->info()->dimension(1)); + const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type()); + const size_t out_stride = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type()); + const int num_elems_matrix_b_x = rhs->info()->dimension(0); + + // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the dst matrix + Window win_a(window); + win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1)); + + Window win_b; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + if (rhs->info()->num_dimensions() >= 3) + { + win_b = window; + } + // Set step_x and step_y for matrix B. Scale by a factor of 8 the X range as the input transposed matrix A has 8 times less the cols of the dst matrix + win_b.set(Window::DimX, Window::Dimension(window.x().start() / 8, window.x().end() / 8, in_b_stride)); + win_b.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator ina(lhs, win_a); + Iterator inb(rhs, win_b); + Iterator out(dst, window); + + const bool multiply_alpha = !(helpers::float_ops::is_one(alpha)); + + const float16x8_t alpha_f16 = vdupq_n_f16(alpha); + + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto *mtx_a0 = reinterpret_cast(ina.ptr()); + const auto *mtx_b0 = reinterpret_cast(inb.ptr()); + auto *mtx_out = reinterpret_cast(out.ptr()); + float16x8x4_t c = {{vdupq_n_f16(0.f), vdupq_n_f16(0.f), vdupq_n_f16(0.f), vdupq_n_f16(0.f)}}; + + /* + This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values) + |a00 a01 a02 a03 | a04 a05 a06 a07| + |a10 a11 a12 a13 | a14 a15 a16 a17| + |a20 a21 a22 a23 | a24 a25 a26 a27| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 | a40 a50 a60 a70 | ... + |a30 a31 a32 a33 | a34 a35 a36 a37| | a04 a14 a24 a34 || a05 a15 a25 a35 || a06 a15 a26 a36 || a07 a17 a27 a37 | a44 a54 a64 a74 | ... 
+ |a40 a41 a42 a43 | a44 a45 a46 a47| + |a50 a51 a52 a53 | a54 a55 a56 a57| + |a60 a61 a62 a63 | a64 a65 a66 a67| + |a70 a71 a72 a73 | a74 a75 a76 a77| + + After this operation, the dst matrix will have the following shape: [ height * 4, width / 4 ] + + B Matrix has been transposed as shown below + + |b00 b01 b02 b03 b04 b05 b06 b07| + |b10 b11 b12 b13 b14 b15 b16 b17| + |b20 b21 b22 b23 b24 b25 b26 b27| + |b30 b31 b32 b33 b34 b35 b36 b37| + -------------------> + + |b00 b01 b02 b03 b04 b05 b06 b07||b10 b11 b12 b13 b14 b15 b16 b17||b20 b21 b22 b23 b24 b25 b26 b27||b30 b31 b32 b33 b34 b35 b36 b37| + + c.val[0][0] = a00*b00 + a01*b10 + a02*b20 + a03*b30 + c.val[0][1] = a00*b01 + a01*b11 + a02*b21 + a03*b31 + + The size of the dst tensor's XY-plane must be the following shape [ width * 8, height / 8 ]. All other dimensions must have the same size. + */ + const float16_t *mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x; + + for (; mtx_b0 <= (mtx_b0_end_addr - 32);) + + { + const float16x8_t p00 = vld1q_f16(mtx_a0); + const float16x8_t p02 = vld1q_f16(mtx_a0 + 8); + + const float16x8_t q00 = vld1q_f16(mtx_b0); + const float16x8_t q02 = vld1q_f16(mtx_b0 + 8); + const float16x8_t q04 = vld1q_f16(mtx_b0 + 16); + const float16x8_t q06 = vld1q_f16(mtx_b0 + 24); + + c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vgetq_lane_f16(p00, 0))); + c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vgetq_lane_f16(p00, 1))); + c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vgetq_lane_f16(p00, 2))); + c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vgetq_lane_f16(p00, 3))); + + c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q02, vgetq_lane_f16(p00, 4))); + c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q02, vgetq_lane_f16(p00, 5))); + c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q02, vgetq_lane_f16(p00, 6))); + c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q02, vgetq_lane_f16(p00, 7))); + + c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q04, vgetq_lane_f16(p02, 0))); + c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q04, vgetq_lane_f16(p02, 1))); + c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q04, vgetq_lane_f16(p02, 2))); + c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q04, vgetq_lane_f16(p02, 3))); + + c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q06, vgetq_lane_f16(p02, 4))); + c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q06, vgetq_lane_f16(p02, 5))); + c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q06, vgetq_lane_f16(p02, 6))); + c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q06, vgetq_lane_f16(p02, 7))); + + mtx_a0 += 16; + mtx_b0 += 32; + } + + for (; mtx_b0 < mtx_b0_end_addr;) + + { + const float16x4_t p00 = vld1_f16(mtx_a0); + const float16x8_t q00 = vld1q_f16(mtx_b0); + + c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vget_lane_f16(p00, 0))); + c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vget_lane_f16(p00, 1))); + c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vget_lane_f16(p00, 2))); + c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vget_lane_f16(p00, 3))); + + mtx_a0 += 4; + mtx_b0 += 8; + } + + if (multiply_alpha) + { + c.val[0] = vmulq_f16(c.val[0], alpha_f16); + c.val[1] = vmulq_f16(c.val[1], alpha_f16); + c.val[2] = vmulq_f16(c.val[2], alpha_f16); + c.val[3] = vmulq_f16(c.val[3], alpha_f16); + } + + if (id.x() < (out_width - 8)) + { + vst1q_f16(mtx_out, c.val[0]); + if (id.y() + 1 < out_height) + { + vst1q_f16(mtx_out + 1 * out_stride, c.val[1]); + if (id.y() + 2 < out_height) + { + vst1q_f16(mtx_out + 2 * out_stride, c.val[2]); + if (id.y() + 3 < out_height) + { + vst1q_f16(mtx_out + 3 * out_stride, c.val[3]); + } + } + } + } 
+ else + { + // Left-over columns + const int columns_left = out_width - id.x(); + for (int x = 0; x < columns_left; ++x) + { + *(mtx_out + x) = c.val[0][x]; + if (id.y() + 1 < out_height) + { + *(mtx_out + x + 1 * out_stride) = c.val[1][x]; + if (id.y() + 2 < out_height) + { + *(mtx_out + x + 2 * out_stride) = c.val[2][x]; + if (id.y() + 3 < out_height) + { + *(mtx_out + x + 3 * out_stride) = c.val[3][x]; + } + } + } + } + } + }, + ina, inb, out); +} + +void neon_fp16_gemm_matrix_mul(const ITensor *lhs, + const ITensor *rhs, + ITensor *dst, + const Window &window, + const ThreadInfo &info, + float alpha, + const bool is_dst_vector) { - return (is_dst_vector) ? vector_matrix_multiply_f16(lhs, rhs, dst, window, info, alpha) : matrix_matrix_multiply_f16(lhs, rhs, dst, window, info, alpha); + return (is_dst_vector) ? vector_matrix_multiply_f16(lhs, rhs, dst, window, info, alpha) + : matrix_matrix_multiply_f16(lhs, rhs, dst, window, info, alpha); } -} // namespce cpu +} // namespace cpu } // namespace arm_compute #endif //__ARM_FEATURE_FP16_VECTOR_ARITHMETIC diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp index 9c1f6f3c0ff035cf85b6b0bdbaa4b612cf7ab309..e12a312280d13730094ac965ed30130ab3a991ab 100644 --- a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp +++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp @@ -28,9 +28,16 @@ namespace arm_compute { namespace cpu { -void neon_fp32_gemm_matrix_mul(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha, const bool is_dst_vector) +void neon_fp32_gemm_matrix_mul(const ITensor *lhs, + const ITensor *rhs, + ITensor *dst, + const Window &window, + const ThreadInfo &info, + float alpha, + const bool is_dst_vector) { - return (is_dst_vector) ? vector_matrix_multiply_f32(lhs, rhs, dst, window, info, alpha) : matrix_matrix_multiply_f32(lhs, rhs, dst, window, info, alpha); + return (is_dst_vector) ? vector_matrix_multiply_f32(lhs, rhs, dst, window, info, alpha) + : matrix_matrix_multiply_f32(lhs, rhs, dst, window, info, alpha); } -} // namespce cpu -} // namespace arm_compute \ No newline at end of file +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp index 300dc3ffc7ace05ef722b79989beba6ecd5a8c4c..404d070a3718fa1e85bc4cf468b5a8138a10f309 100644 --- a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp +++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. 
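neon_fp16_gemm_matrix_mul above selects vector_matrix_multiply_f16 when the destination is a single row and matrix_matrix_multiply_f16 otherwise; the blocked path expects A interleaved in 4x4 blocks and B transposed into 8-element strips, as the in-code comment sketches. For orientation, a naive reference of what both paths compute — alpha * A * B, with the beta * C term handled separately by the matrix-addition kernels — using assumed row-major float arrays and no blocking:

    #include <cstddef>

    // dst(MxN) = alpha * lhs(MxK) * rhs(KxN); plain row-major, no interleaving or transposition.
    void gemm_matrix_mul_ref(const float *lhs, const float *rhs, float *dst,
                             std::size_t M, std::size_t N, std::size_t K, float alpha)
    {
        for (std::size_t m = 0; m < M; ++m)
        {
            for (std::size_t n = 0; n < N; ++n)
            {
                float acc = 0.f;
                for (std::size_t k = 0; k < K; ++k)
                {
                    acc += lhs[m * K + k] * rhs[k * N + n];
                }
                dst[m * N + n] = alpha * acc;
            }
        }
    }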
* * SPDX-License-Identifier: MIT * @@ -23,6 +23,7 @@ */ #include "src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h" + #include "src/core/utils/helpers/float_ops.h" #include @@ -31,206 +32,12 @@ namespace arm_compute { namespace cpu { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -void vector_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) +void vector_matrix_multiply_f32( + const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) { - const auto width_matrix_b = static_cast(dst->info()->dimension(0)); - const auto in_b_stride = static_cast(rhs->info()->strides_in_bytes()[1] / rhs->info()->element_size()); - const auto num_elems_vec_a = static_cast(lhs->info()->dimension(0)); - - // The implementation computes 32 elements per iteration - const int window_start_x = 32 * info.thread_id; - const int window_step_x = 32 * info.num_threads; - const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x; - ARM_COMPUTE_ERROR_ON_MSG((window_end_x - window_start_x) % window_step_x, " (window_end_x - window_start_x) must be multiple of window_step_x"); - - Window win_out(window); - win_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - win_out.set(Window::DimY, Window::Dimension(0, 1, 1)); - - Window win_a(window); - win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_a.set(Window::DimY, Window::Dimension(0, 0, 0)); - - Window win_b; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(rhs->info()->num_dimensions() >= 3) - { - win_b = window; - } - win_b.set(Window::DimX, Window::Dimension(0, 1, 1)); - win_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - - Iterator ina(lhs, win_a); - Iterator inb(rhs, win_b); - Iterator out(dst, win_out); - - const bool multiply_alpha = !(helpers::float_ops::is_one(alpha)); - - const float16x8_t alpha_f16 = vdupq_n_f16(alpha); - - execute_window_loop(win_out, [&](const Coordinates &) - { - int x = window_start_x; - // Here we don't check for x lower equal than (window_end_x - window_step_x) because of - // window_end_x is computed above which may cause out-of-bound writes to the dst. 
- for(; x < (window_end_x - window_step_x); x += window_step_x) - { - if(x > width_matrix_b) - { - return; - } - - auto matrix_b = reinterpret_cast(inb.ptr()) + x; - - float16x8_t acc0 = vdupq_n_f16(0.f); - float16x8_t acc1 = vdupq_n_f16(0.f); - float16x8_t acc2 = vdupq_n_f16(0.f); - float16x8_t acc3 = vdupq_n_f16(0.f); - - auto vec_a = reinterpret_cast(ina.ptr()); - const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a; - for(; vec_a <= (vec_a_end_addr - 4);) - { - const float16x4_t a0l = vld1_f16(vec_a); - - float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); - float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); - float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); - float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); - float16x8_t b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride); - float16x8_t b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride); - float16x8_t b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride); - float16x8_t b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride); - - acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 0)); - acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 0)); - acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 0)); - acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 0)); - acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 1)); - acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 1)); - acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 1)); - acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 1)); - - matrix_b += 2 * in_b_stride; - - b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); - b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); - b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); - b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); - b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride); - b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride); - b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride); - b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride); - - acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 2)); - acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 2)); - acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 2)); - acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 2)); - acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 3)); - acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 3)); - acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 3)); - acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 3)); - - vec_a += 4; - matrix_b += 2 * in_b_stride; - } - - for(; vec_a < vec_a_end_addr; ++vec_a) - { - const float16_t a0 = *vec_a; - const float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); - const float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); - const float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); - const float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); - - acc0 = vaddq_f16(acc0, vmulq_n_f16(b00, a0)); - acc1 = vaddq_f16(acc1, vmulq_n_f16(b01, a0)); - acc2 = vaddq_f16(acc2, vmulq_n_f16(b02, a0)); - acc3 = vaddq_f16(acc3, vmulq_n_f16(b03, a0)); - - matrix_b += in_b_stride; - } - - // Multiply by the weight of matrix product (alpha) - if(multiply_alpha) - { - acc0 = vmulq_f16(acc0, alpha_f16); - acc1 = vmulq_f16(acc1, alpha_f16); - acc2 = vmulq_f16(acc2, alpha_f16); - acc3 = vmulq_f16(acc3, alpha_f16); - } - - auto vec_out = reinterpret_cast(out.ptr()) + x; - - vst1q_f16(vec_out + 0, acc0); - vst1q_f16(vec_out + 8, acc1); - vst1q_f16(vec_out + 16, acc2); - vst1q_f16(vec_out + 24, acc3); - } - - for(; x < window_end_x; ++x) - { - if(x > width_matrix_b) - { - return; - } - - auto matrix_b = reinterpret_cast(inb.ptr()) + x; - - 
float16x4_t vacc = vdup_n_f16(0.f); - - auto vec_a = reinterpret_cast(ina.ptr()); - const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a; - for(; vec_a <= (vec_a_end_addr - 4); vec_a += 4) - { - const float16x4_t a0l = vld1_f16(vec_a); - - const float16x4_t b_col = - { - *(matrix_b + 0 * in_b_stride), - *(matrix_b + 1 * in_b_stride), - *(matrix_b + 2 * in_b_stride), - *(matrix_b + 3 * in_b_stride), - }; - - vacc = vadd_f16(vacc, vmul_f16(a0l, b_col)); - - matrix_b += 4 * in_b_stride; - } - - float16_t acc = vget_lane_f16(vacc, 0) + vget_lane_f16(vacc, 1) + vget_lane_f16(vacc, 2) + vget_lane_f16(vacc, 3); - - for(; vec_a < vec_a_end_addr; ++vec_a) - { - const float16_t a0 = *vec_a; - const float16_t b00 = *matrix_b; - - acc += b00 * a0; - - matrix_b += in_b_stride; - } - - // Multiply by the weight of matrix product (alpha) - if(multiply_alpha) - { - acc *= static_cast(alpha); - } - - auto vec_out = reinterpret_cast(out.ptr()) + x; - - *(vec_out) = acc; - } - }, - ina, inb, out); -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -void vector_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) -{ - const auto width_matrix_b = static_cast(dst->info()->dimension(0)); - const auto in_b_stride = static_cast(rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type())); + const auto width_matrix_b = static_cast(dst->info()->dimension(0)); + const auto in_b_stride = + static_cast(rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type())); const auto num_elems_vec_a = static_cast(lhs->info()->dimension(0)); // The implementation computes 16 elements per iteration @@ -250,7 +57,7 @@ void vector_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor Window win_b; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(rhs->info()->num_dimensions() >= 3) + if (rhs->info()->num_dimensions() >= 3) { win_b = window; } @@ -265,209 +72,220 @@ void vector_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor const float32x4_t alpha_f32 = vdupq_n_f32(alpha); - execute_window_loop(win_out, [&](const Coordinates &) - { - int x = window_start_x; - // Here we don't check for x lower equal than (window_end_x - window_step_x) because of - // window_end_x is computed above which may cause out-of-bound writes to the dst. - for(; x < (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_out, + [&](const Coordinates &) { - if(x > width_matrix_b) + int x = window_start_x; + // Here we don't check for x lower equal than (window_end_x - window_step_x) because of + // window_end_x is computed above which may cause out-of-bound writes to the dst. 
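// --- Illustrative note (not part of the patch): a scalar reference for what
// vector_matrix_multiply_f32 computes per output column, i.e.
//     dst[x] = alpha * sum_k lhs[k] * rhs[k][x],
// with rhs rows in_b_stride elements apart (in_b_stride can exceed N because of padding) and
// the alpha multiply skipped when alpha == 1, matching the helpers::float_ops::is_one()
// check in the kernel.
#include <cstddef>

void vector_matrix_multiply_f32_ref(const float *lhs,         // 1 x K vector
                                    const float *rhs,         // K x N matrix, row stride in_b_stride
                                    float       *dst,         // 1 x N vector
                                    std::size_t  K,
                                    std::size_t  N,
                                    std::size_t  in_b_stride,
                                    float        alpha)
{
    const bool multiply_alpha = (alpha != 1.f);
    for (std::size_t x = 0; x < N; ++x)
    {
        float acc = 0.f;
        for (std::size_t k = 0; k < K; ++k)
        {
            acc += lhs[k] * rhs[k * in_b_stride + x];
        }
        dst[x] = multiply_alpha ? alpha * acc : acc;
    }
}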
+ for (; x < (window_end_x - window_step_x); x += window_step_x) { - return; - } + if (x > width_matrix_b) + { + return; + } - float32x4_t acc0 = vdupq_n_f32(0.f); - float32x4_t acc1 = vdupq_n_f32(0.f); - float32x4_t acc2 = vdupq_n_f32(0.f); - float32x4_t acc3 = vdupq_n_f32(0.f); + float32x4_t acc0 = vdupq_n_f32(0.f); + float32x4_t acc1 = vdupq_n_f32(0.f); + float32x4_t acc2 = vdupq_n_f32(0.f); + float32x4_t acc3 = vdupq_n_f32(0.f); - auto vec_a = reinterpret_cast(ina.ptr()); - auto matrix_b = reinterpret_cast(inb.ptr()) + x; + auto vec_a = reinterpret_cast(ina.ptr()); + auto matrix_b = reinterpret_cast(inb.ptr()) + x; #if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b + in_b_stride))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b + in_b_stride))); #endif /* __arm__ */ - auto vec_a_end_addr = vec_a + num_elems_vec_a; - for(; vec_a <= (vec_a_end_addr - 4);) - { - float32x2_t a0l = vld1_f32(vec_a); + auto vec_a_end_addr = vec_a + num_elems_vec_a; + for (; vec_a <= (vec_a_end_addr - 4);) + { + float32x2_t a0l = vld1_f32(vec_a); - float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); - float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); - float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); - float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); + float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); + float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); + float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); + float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); - float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride); - float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride); - float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride); - float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride); + float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride); + float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride); + float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride); + float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride); #if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 1 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 2 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 3 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 4 * in_b_stride))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); + asm volatile( + "PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 1 * in_b_stride))); + asm volatile( + "PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 2 * in_b_stride))); + asm volatile( + "PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 3 * in_b_stride))); + asm volatile( + "PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 4 * in_b_stride))); #endif /* __arm__ */ - acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0); - acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0); - acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0); - acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0); + acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0); + acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0); + acc2 = vmlaq_lane_f32(acc2, 
b02, a0l, 0); + acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0); - acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1); - acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1); - acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1); - acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1); + acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1); + acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1); + acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1); + acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1); - vec_a += 2; - matrix_b += 2 * in_b_stride; + vec_a += 2; + matrix_b += 2 * in_b_stride; - a0l = vld1_f32(vec_a); + a0l = vld1_f32(vec_a); - b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); - b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); - b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); - b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); + b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); + b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); + b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); + b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); - b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride); - b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride); - b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride); - b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride); + b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride); + b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride); + b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride); + b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride); - acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0); - acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0); - acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0); - acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0); + acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0); + acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0); + acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0); + acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0); - acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1); - acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1); - acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1); - acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1); + acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1); + acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1); + acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1); + acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1); - vec_a += 2; - matrix_b += 2 * in_b_stride; - } + vec_a += 2; + matrix_b += 2 * in_b_stride; + } - for(; vec_a < vec_a_end_addr; ++vec_a) - { - const float a0 = *vec_a; + for (; vec_a < vec_a_end_addr; ++vec_a) + { + const float a0 = *vec_a; - const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); - const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); - const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); - const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); + const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); + const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); + const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); + const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); - acc0 = vmlaq_n_f32(acc0, b00, a0); - acc1 = vmlaq_n_f32(acc1, b01, a0); - acc2 = vmlaq_n_f32(acc2, b02, a0); - acc3 = vmlaq_n_f32(acc3, b03, a0); + acc0 = vmlaq_n_f32(acc0, b00, a0); + acc1 = vmlaq_n_f32(acc1, b01, a0); + acc2 = vmlaq_n_f32(acc2, b02, a0); + acc3 = vmlaq_n_f32(acc3, b03, a0); - matrix_b += in_b_stride; - } + matrix_b += in_b_stride; + } - // Multiply by the weight of matrix product (alpha) - if(multiply_alpha) - { - acc0 = vmulq_f32(acc0, alpha_f32); - acc1 = vmulq_f32(acc1, alpha_f32); - acc2 = vmulq_f32(acc2, alpha_f32); - acc3 = vmulq_f32(acc3, alpha_f32); - } + // Multiply by the weight of matrix product (alpha) + if (multiply_alpha) + { + acc0 = 
vmulq_f32(acc0, alpha_f32); + acc1 = vmulq_f32(acc1, alpha_f32); + acc2 = vmulq_f32(acc2, alpha_f32); + acc3 = vmulq_f32(acc3, alpha_f32); + } - const auto vec_out = reinterpret_cast(out.ptr()) + x; + const auto vec_out = reinterpret_cast(out.ptr()) + x; - vst1q_f32(vec_out + 0, acc0); - vst1q_f32(vec_out + 4, acc1); - vst1q_f32(vec_out + 8, acc2); - vst1q_f32(vec_out + 12, acc3); - } + vst1q_f32(vec_out + 0, acc0); + vst1q_f32(vec_out + 4, acc1); + vst1q_f32(vec_out + 8, acc2); + vst1q_f32(vec_out + 12, acc3); + } - // Left-over loop - for(; x < window_end_x; ++x) - { - if(x > width_matrix_b) + // Left-over loop + for (; x < window_end_x; ++x) { - return; - } + if (x > width_matrix_b) + { + return; + } - float32x4_t vacc = vdupq_n_f32(0.f); + float32x4_t vacc = vdupq_n_f32(0.f); - auto vec_a = reinterpret_cast(ina.ptr()); - auto matrix_b = reinterpret_cast(inb.ptr()) + x; + auto vec_a = reinterpret_cast(ina.ptr()); + auto matrix_b = reinterpret_cast(inb.ptr()) + x; #if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b + in_b_stride))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b + in_b_stride))); #endif /* __arm__ */ - auto vec_a_end_addr = vec_a + num_elems_vec_a; - for(; vec_a <= (vec_a_end_addr - 4); vec_a += 4) - { - const float32x4_t a0l = vld1q_f32(vec_a); - - const float32x4_t b_col = + auto vec_a_end_addr = vec_a + num_elems_vec_a; + for (; vec_a <= (vec_a_end_addr - 4); vec_a += 4) { - *(matrix_b + 0 * in_b_stride), - *(matrix_b + 1 * in_b_stride), - *(matrix_b + 2 * in_b_stride), - *(matrix_b + 3 * in_b_stride), - }; + const float32x4_t a0l = vld1q_f32(vec_a); + + const float32x4_t b_col = { + *(matrix_b + 0 * in_b_stride), + *(matrix_b + 1 * in_b_stride), + *(matrix_b + 2 * in_b_stride), + *(matrix_b + 3 * in_b_stride), + }; #if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 1 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 2 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 3 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 4 * in_b_stride))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); + asm volatile( + "PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 1 * in_b_stride))); + asm volatile( + "PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 2 * in_b_stride))); + asm volatile( + "PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 3 * in_b_stride))); + asm volatile( + "PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 4 * in_b_stride))); #endif /* __arm__ */ - vacc = vmlaq_f32(vacc, b_col, a0l); + vacc = vmlaq_f32(vacc, b_col, a0l); - matrix_b += 4 * in_b_stride; - } + matrix_b += 4 * in_b_stride; + } - float acc = vgetq_lane_f32(vacc, 0) + vgetq_lane_f32(vacc, 1) + vgetq_lane_f32(vacc, 2) + vgetq_lane_f32(vacc, 3); + float acc = vgetq_lane_f32(vacc, 0) + vgetq_lane_f32(vacc, 1) + vgetq_lane_f32(vacc, 2) + + vgetq_lane_f32(vacc, 3); - for(; vec_a < vec_a_end_addr; ++vec_a) - { - const float a0 = *vec_a; + for (; vec_a < vec_a_end_addr; ++vec_a) + { + const float a0 = *vec_a; - const float b00 = *matrix_b; + const float b00 = 
*matrix_b; - acc += b00 * a0; + acc += b00 * a0; - matrix_b += in_b_stride; - } + matrix_b += in_b_stride; + } - // Multiply by the weight of matrix product (alpha) - if(multiply_alpha) - { - acc *= alpha; - } + // Multiply by the weight of matrix product (alpha) + if (multiply_alpha) + { + acc *= alpha; + } - const auto vec_out = reinterpret_cast(out.ptr()) + x; + const auto vec_out = reinterpret_cast(out.ptr()) + x; - *vec_out = acc; - } - }, - ina, inb, out); + *vec_out = acc; + } + }, + ina, inb, out); } -void matrix_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) +void matrix_matrix_multiply_f32( + const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) { ARM_COMPUTE_UNUSED(info); - const int out_width = static_cast(dst->info()->dimension(0)); - const int out_height = static_cast(dst->info()->dimension(1)); - const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type()); - const size_t out_stride1 = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type()); - const size_t out_stride2 = out_stride1 * 2; - const size_t out_stride3 = out_stride1 * 3; + const int out_width = static_cast(dst->info()->dimension(0)); + const int out_height = static_cast(dst->info()->dimension(1)); + const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type()); + const size_t out_stride1 = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type()); + const size_t out_stride2 = out_stride1 * 2; + const size_t out_stride3 = out_stride1 * 3; const int num_elems_matrix_b_x = rhs->info()->dimension(0); // Set step_x and step_y for matrix A. 
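// --- Illustrative note (not part of the patch): matrix_matrix_multiply_f32 evaluates
// dst = alpha * (A x B). The reference below, written for plain row-major inputs, shows the
// per-element arithmetic; the kernel itself reads A in the pre-interleaved layout produced by
// CpuGemmInterleave4x4 and B in the pre-transposed layout produced by CpuGemmTranspose1xW,
// and produces a 4-row by 8-column tile of dst per window step.
#include <cstddef>

void matrix_matrix_multiply_f32_ref(const float *a,   // M x K, row-major
                                    const float *b,   // K x N, row-major
                                    float       *dst, // M x N, row-major
                                    std::size_t  M,
                                    std::size_t  K,
                                    std::size_t  N,
                                    float        alpha)
{
    for (std::size_t m = 0; m < M; ++m)
    {
        for (std::size_t n = 0; n < N; ++n)
        {
            float acc = 0.f;
            for (std::size_t k = 0; k < K; ++k)
            {
                acc += a[m * K + k] * b[k * N + n];
            }
            dst[m * N + n] = alpha * acc; // the kernel skips this multiply when alpha == 1
        }
    }
}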
Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the dst matrix @@ -478,7 +296,7 @@ void matrix_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor Window win_b; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(rhs->info()->num_dimensions() >= 3) + if (rhs->info()->num_dimensions() >= 3) { win_b = window; } @@ -498,519 +316,341 @@ void matrix_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor // The implementation assumes that the matrix A and Matrix B have been reshaped respectively with CpuGemmInterleave4x4 and CpuGemmTranspose1xW // The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration // All the values needed for computing a single 4x4 block will be read from consecutive memory positions - execute_window_loop(window, [&](const Coordinates & id) - { - auto mtx_a0 = reinterpret_cast(ina.ptr()); - auto mtx_b0 = reinterpret_cast(inb.ptr()); - auto mtx_b1 = mtx_b0 + in_b_stride; + execute_window_loop( + window, + [&](const Coordinates &id) + { + auto mtx_a0 = reinterpret_cast(ina.ptr()); + auto mtx_b0 = reinterpret_cast(inb.ptr()); + auto mtx_b1 = mtx_b0 + in_b_stride; - float32x4_t acc00 = vdupq_n_f32(0.f); - float32x4_t acc10 = vdupq_n_f32(0.f); - float32x4_t acc20 = vdupq_n_f32(0.f); - float32x4_t acc30 = vdupq_n_f32(0.f); + float32x4_t acc00 = vdupq_n_f32(0.f); + float32x4_t acc10 = vdupq_n_f32(0.f); + float32x4_t acc20 = vdupq_n_f32(0.f); + float32x4_t acc30 = vdupq_n_f32(0.f); - float32x4_t acc01 = vdupq_n_f32(0.f); - float32x4_t acc11 = vdupq_n_f32(0.f); - float32x4_t acc21 = vdupq_n_f32(0.f); - float32x4_t acc31 = vdupq_n_f32(0.f); + float32x4_t acc01 = vdupq_n_f32(0.f); + float32x4_t acc11 = vdupq_n_f32(0.f); + float32x4_t acc21 = vdupq_n_f32(0.f); + float32x4_t acc31 = vdupq_n_f32(0.f); #if __arm__ - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(mtx_a0))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(mtx_b0))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(mtx_b1))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(mtx_a0))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(mtx_b0))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(mtx_b1))); #endif /* __arm__ */ - auto mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x; - for(; mtx_b0 <= (mtx_b0_end_addr - 32);) - { - float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0); - float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1); - float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2); - float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3); + auto mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x; + for (; mtx_b0 <= (mtx_b0_end_addr - 32);) + { + float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0); + float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1); + float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2); + float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3); - float32x4_t b00 = vld1q_f32(mtx_b0); - float32x4_t b10 = vld1q_f32(mtx_b1); - float32x4_t b01 = vld1q_f32(mtx_b0 + 4); - float32x4_t b11 = vld1q_f32(mtx_b1 + 4); + float32x4_t b00 = vld1q_f32(mtx_b0); + float32x4_t b10 = vld1q_f32(mtx_b1); + float32x4_t b01 = vld1q_f32(mtx_b0 + 4); + float32x4_t b11 = vld1q_f32(mtx_b1 + 4); #if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_a0))); - asm volatile("PLD [%0, #128*4]" 
::"r"(reinterpret_cast(mtx_b0))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_b1))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_a0))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_b0))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_b1))); #endif /* __arm__ */ - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b00, a0); - acc10 = vmlaq_f32(acc10, b00, a1); - acc20 = vmlaq_f32(acc20, b00, a2); - acc30 = vmlaq_f32(acc30, b00, a3); - - float32x4_t a4 = vld1q_dup_f32(mtx_a0 + 4); - float32x4_t a5 = vld1q_dup_f32(mtx_a0 + 5); - float32x4_t a6 = vld1q_dup_f32(mtx_a0 + 6); - float32x4_t a7 = vld1q_dup_f32(mtx_a0 + 7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b10, a0); - acc11 = vmlaq_f32(acc11, b10, a1); - acc21 = vmlaq_f32(acc21, b10, a2); - acc31 = vmlaq_f32(acc31, b10, a3); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b01, a4); - acc10 = vmlaq_f32(acc10, b01, a5); - acc20 = vmlaq_f32(acc20, b01, a6); - acc30 = vmlaq_f32(acc30, b01, a7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b11, a4); - acc11 = vmlaq_f32(acc11, b11, a5); - acc21 = vmlaq_f32(acc21, b11, a6); - acc31 = vmlaq_f32(acc31, b11, a7); - - mtx_a0 += 8; - mtx_b0 += 8; - mtx_b1 += 8; - - a0 = vld1q_dup_f32(mtx_a0 + 0); - a1 = vld1q_dup_f32(mtx_a0 + 1); - a2 = vld1q_dup_f32(mtx_a0 + 2); - a3 = vld1q_dup_f32(mtx_a0 + 3); - - b00 = vld1q_f32(mtx_b0); - b10 = vld1q_f32(mtx_b1); - b01 = vld1q_f32(mtx_b0 + 4); - b11 = vld1q_f32(mtx_b1 + 4); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b00, a0); - acc10 = vmlaq_f32(acc10, b00, a1); - acc20 = vmlaq_f32(acc20, b00, a2); - acc30 = vmlaq_f32(acc30, b00, a3); - - a4 = vld1q_dup_f32(mtx_a0 + 4); - a5 = vld1q_dup_f32(mtx_a0 + 5); - a6 = vld1q_dup_f32(mtx_a0 + 6); - a7 = vld1q_dup_f32(mtx_a0 + 7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b10, a0); - acc11 = vmlaq_f32(acc11, b10, a1); - acc21 = vmlaq_f32(acc21, b10, a2); - acc31 = vmlaq_f32(acc31, b10, a3); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b01, a4); - acc10 = vmlaq_f32(acc10, b01, a5); - acc20 = vmlaq_f32(acc20, b01, a6); - acc30 = vmlaq_f32(acc30, b01, a7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b11, a4); - acc11 = vmlaq_f32(acc11, b11, a5); - acc21 = vmlaq_f32(acc21, b11, a6); - acc31 = vmlaq_f32(acc31, b11, a7); - - mtx_a0 += 8; - mtx_b0 += 8; - mtx_b1 += 8; - - a0 = vld1q_dup_f32(mtx_a0 + 0); - a1 = vld1q_dup_f32(mtx_a0 + 1); - a2 = vld1q_dup_f32(mtx_a0 + 2); - a3 = vld1q_dup_f32(mtx_a0 + 3); - b00 = vld1q_f32(mtx_b0); - b10 = vld1q_f32(mtx_b1); - b01 = vld1q_f32(mtx_b0 + 4); - b11 = vld1q_f32(mtx_b1 + 4); + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + float32x4_t a4 = vld1q_dup_f32(mtx_a0 + 4); + float32x4_t a5 = vld1q_dup_f32(mtx_a0 + 5); + float32x4_t a6 = vld1q_dup_f32(mtx_a0 + 6); + float32x4_t a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b11, a4); + acc11 = vmlaq_f32(acc11, b11, a5); + acc21 = vmlaq_f32(acc21, b11, a6); + acc31 = vmlaq_f32(acc31, b11, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + mtx_b1 += 8; + + a0 = vld1q_dup_f32(mtx_a0 + 0); + a1 = vld1q_dup_f32(mtx_a0 + 
1); + a2 = vld1q_dup_f32(mtx_a0 + 2); + a3 = vld1q_dup_f32(mtx_a0 + 3); + + b00 = vld1q_f32(mtx_b0); + b10 = vld1q_f32(mtx_b1); + b01 = vld1q_f32(mtx_b0 + 4); + b11 = vld1q_f32(mtx_b1 + 4); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + a4 = vld1q_dup_f32(mtx_a0 + 4); + a5 = vld1q_dup_f32(mtx_a0 + 5); + a6 = vld1q_dup_f32(mtx_a0 + 6); + a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b11, a4); + acc11 = vmlaq_f32(acc11, b11, a5); + acc21 = vmlaq_f32(acc21, b11, a6); + acc31 = vmlaq_f32(acc31, b11, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + mtx_b1 += 8; + + a0 = vld1q_dup_f32(mtx_a0 + 0); + a1 = vld1q_dup_f32(mtx_a0 + 1); + a2 = vld1q_dup_f32(mtx_a0 + 2); + a3 = vld1q_dup_f32(mtx_a0 + 3); + b00 = vld1q_f32(mtx_b0); + b10 = vld1q_f32(mtx_b1); + b01 = vld1q_f32(mtx_b0 + 4); + b11 = vld1q_f32(mtx_b1 + 4); #if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_a0))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_b0))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_b1))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_a0))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_b0))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_b1))); #endif /* __arm__ */ - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b00, a0); - acc10 = vmlaq_f32(acc10, b00, a1); - acc20 = vmlaq_f32(acc20, b00, a2); - acc30 = vmlaq_f32(acc30, b00, a3); - - a4 = vld1q_dup_f32(mtx_a0 + 4); - a5 = vld1q_dup_f32(mtx_a0 + 5); - a6 = vld1q_dup_f32(mtx_a0 + 6); - a7 = vld1q_dup_f32(mtx_a0 + 7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b10, a0); - acc11 = vmlaq_f32(acc11, b10, a1); - acc21 = vmlaq_f32(acc21, b10, a2); - acc31 = vmlaq_f32(acc31, b10, a3); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b01, a4); - acc10 = vmlaq_f32(acc10, b01, a5); - acc20 = vmlaq_f32(acc20, b01, a6); - acc30 = vmlaq_f32(acc30, b01, a7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b11, a4); - acc11 = vmlaq_f32(acc11, b11, a5); - acc21 = vmlaq_f32(acc21, b11, a6); - acc31 = vmlaq_f32(acc31, b11, a7); - - mtx_a0 += 8; - mtx_b0 += 8; - mtx_b1 += 8; - - a0 = vld1q_dup_f32(mtx_a0 + 0); - a1 = vld1q_dup_f32(mtx_a0 + 1); - a2 = vld1q_dup_f32(mtx_a0 + 2); - a3 = vld1q_dup_f32(mtx_a0 + 3); - b00 = vld1q_f32(mtx_b0); - b10 = vld1q_f32(mtx_b1); - b01 = vld1q_f32(mtx_b0 + 4); - b11 = vld1q_f32(mtx_b1 + 4); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b00, a0); - acc10 = vmlaq_f32(acc10, b00, a1); - acc20 = vmlaq_f32(acc20, b00, a2); - acc30 = vmlaq_f32(acc30, b00, a3); - - a4 = vld1q_dup_f32(mtx_a0 + 4); - a5 = vld1q_dup_f32(mtx_a0 + 5); - a6 = vld1q_dup_f32(mtx_a0 + 6); - a7 = vld1q_dup_f32(mtx_a0 + 7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b10, a0); - acc11 = vmlaq_f32(acc11, b10, a1); - acc21 = vmlaq_f32(acc21, b10, a2); - acc31 = vmlaq_f32(acc31, b10, a3); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b01, a4); - acc10 = vmlaq_f32(acc10, b01, a5); - acc20 = vmlaq_f32(acc20, b01, a6); - acc30 = vmlaq_f32(acc30, b01, a7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b11, a4); - acc11 = 
vmlaq_f32(acc11, b11, a5); - acc21 = vmlaq_f32(acc21, b11, a6); - acc31 = vmlaq_f32(acc31, b11, a7); - - mtx_a0 += 8; - mtx_b0 += 8; - mtx_b1 += 8; - } - - for(; mtx_b0 < mtx_b0_end_addr;) - { - float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0); - float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1); - float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2); - float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3); - float32x4_t b00 = vld1q_f32(mtx_b0); - float32x4_t b10 = vld1q_f32(mtx_b1); + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + a4 = vld1q_dup_f32(mtx_a0 + 4); + a5 = vld1q_dup_f32(mtx_a0 + 5); + a6 = vld1q_dup_f32(mtx_a0 + 6); + a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b11, a4); + acc11 = vmlaq_f32(acc11, b11, a5); + acc21 = vmlaq_f32(acc21, b11, a6); + acc31 = vmlaq_f32(acc31, b11, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + mtx_b1 += 8; + + a0 = vld1q_dup_f32(mtx_a0 + 0); + a1 = vld1q_dup_f32(mtx_a0 + 1); + a2 = vld1q_dup_f32(mtx_a0 + 2); + a3 = vld1q_dup_f32(mtx_a0 + 3); + b00 = vld1q_f32(mtx_b0); + b10 = vld1q_f32(mtx_b1); + b01 = vld1q_f32(mtx_b0 + 4); + b11 = vld1q_f32(mtx_b1 + 4); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + a4 = vld1q_dup_f32(mtx_a0 + 4); + a5 = vld1q_dup_f32(mtx_a0 + 5); + a6 = vld1q_dup_f32(mtx_a0 + 6); + a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b11, a4); + acc11 = vmlaq_f32(acc11, b11, a5); + acc21 = vmlaq_f32(acc21, b11, a6); + acc31 = vmlaq_f32(acc31, b11, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + mtx_b1 += 8; + } + + for (; mtx_b0 < mtx_b0_end_addr;) + { + float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0); + float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1); + float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2); + float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3); + float32x4_t b00 = vld1q_f32(mtx_b0); + float32x4_t b10 = vld1q_f32(mtx_b1); #if __arm__ - asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast(mtx_a0))); - asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast(mtx_b0))); - asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast(mtx_b1))); + asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast(mtx_a0))); + asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast(mtx_b0))); + asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast(mtx_b1))); #endif /* __arm__ */ - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b00, a0); - acc10 = vmlaq_f32(acc10, b00, a1); - acc20 = vmlaq_f32(acc20, b00, a2); - acc30 = vmlaq_f32(acc30, b00, a3); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b10, a0); - acc11 = vmlaq_f32(acc11, b10, a1); - acc21 = vmlaq_f32(acc21, b10, a2); - acc31 = vmlaq_f32(acc31, b10, a3); - - mtx_a0 += 4; - mtx_b0 += 4; - mtx_b1 += 4; - } - - // 
Multiply by the weight of matrix product (alpha) - if(multiply_alpha) - { - acc00 = vmulq_f32(acc00, alpha_f32); - acc10 = vmulq_f32(acc10, alpha_f32); - acc20 = vmulq_f32(acc20, alpha_f32); - acc30 = vmulq_f32(acc30, alpha_f32); - acc01 = vmulq_f32(acc01, alpha_f32); - acc11 = vmulq_f32(acc11, alpha_f32); - acc21 = vmulq_f32(acc21, alpha_f32); - acc31 = vmulq_f32(acc31, alpha_f32); - } - - const auto mtx_out0 = reinterpret_cast(out.ptr()); - const auto mtx_out1 = mtx_out0 + 4; - - if(id.x() < (out_width - 8)) - { - vst1q_f32(mtx_out0, acc00); - vst1q_f32(mtx_out1, acc01); - if(id.y() + 1 < out_height) - { - vst1q_f32(mtx_out0 + out_stride1, acc10); - vst1q_f32(mtx_out1 + out_stride1, acc11); - if(id.y() + 2 < out_height) - { - vst1q_f32(mtx_out0 + out_stride2, acc20); - vst1q_f32(mtx_out1 + out_stride2, acc21); - if(id.y() + 3 < out_height) - { - vst1q_f32(mtx_out0 + out_stride3, acc30); - vst1q_f32(mtx_out1 + out_stride3, acc31); - } - } + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + mtx_a0 += 4; + mtx_b0 += 4; + mtx_b1 += 4; } - } - else if(id.x() < (out_width - 4)) - { - vst1q_f32(mtx_out0, acc00); - if(id.y() + 1 < out_height) + + // Multiply by the weight of matrix product (alpha) + if (multiply_alpha) { - vst1q_f32(mtx_out0 + out_stride1, acc10); - if(id.y() + 2 < out_height) - { - vst1q_f32(mtx_out0 + out_stride2, acc20); - if(id.y() + 3 < out_height) - { - vst1q_f32(mtx_out0 + out_stride3, acc30); - } - } + acc00 = vmulq_f32(acc00, alpha_f32); + acc10 = vmulq_f32(acc10, alpha_f32); + acc20 = vmulq_f32(acc20, alpha_f32); + acc30 = vmulq_f32(acc30, alpha_f32); + acc01 = vmulq_f32(acc01, alpha_f32); + acc11 = vmulq_f32(acc11, alpha_f32); + acc21 = vmulq_f32(acc21, alpha_f32); + acc31 = vmulq_f32(acc31, alpha_f32); } - // Left-over columns - const int columns_left = out_width - id.x() - 4; - for(auto x = 0; x < columns_left; ++x) + + const auto mtx_out0 = reinterpret_cast(out.ptr()); + const auto mtx_out1 = mtx_out0 + 4; + + if (id.x() < (out_width - 8)) { - *(mtx_out1 + x) = acc01[x]; - if(id.y() + 1 < out_height) + vst1q_f32(mtx_out0, acc00); + vst1q_f32(mtx_out1, acc01); + if (id.y() + 1 < out_height) { - *(mtx_out1 + x + out_stride1) = acc11[x]; - if(id.y() + 2 < out_height) + vst1q_f32(mtx_out0 + out_stride1, acc10); + vst1q_f32(mtx_out1 + out_stride1, acc11); + if (id.y() + 2 < out_height) { - *(mtx_out1 + x + out_stride2) = acc21[x]; - if(id.y() + 3 < out_height) + vst1q_f32(mtx_out0 + out_stride2, acc20); + vst1q_f32(mtx_out1 + out_stride2, acc21); + if (id.y() + 3 < out_height) { - *(mtx_out1 + x + out_stride3) = acc31[x]; + vst1q_f32(mtx_out0 + out_stride3, acc30); + vst1q_f32(mtx_out1 + out_stride3, acc31); } } } } - } - else - { - // Left-over columns - const int columns_left = out_width - id.x(); - for(int x = 0; x < columns_left; ++x) + else if (id.x() < (out_width - 4)) { - *(mtx_out0 + x) = acc00[x]; - if(id.y() + 1 < out_height) + vst1q_f32(mtx_out0, acc00); + if (id.y() + 1 < out_height) { - *(mtx_out0 + x + out_stride1) = acc10[x]; - if(id.y() + 2 < out_height) + vst1q_f32(mtx_out0 + out_stride1, acc10); + if (id.y() + 2 < out_height) { - *(mtx_out0 + x + out_stride2) = acc20[x]; - if(id.y() + 3 < out_height) + vst1q_f32(mtx_out0 + out_stride2, acc20); + if (id.y() + 3 < out_height) 
{ - *(mtx_out0 + x + out_stride3) = acc30[x]; + vst1q_f32(mtx_out0 + out_stride3, acc30); } } } - } - } - }, - ina, inb, out); -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -void matrix_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) -{ - ARM_COMPUTE_UNUSED(info); - const int out_width = static_cast(dst->info()->dimension(0)); - const int out_height = static_cast(dst->info()->dimension(1)); - const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type()); - const size_t out_stride = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type()); - const int num_elems_matrix_b_x = rhs->info()->dimension(0); - - // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the dst matrix - Window win_a(window); - win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1)); - - Window win_b; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(rhs->info()->num_dimensions() >= 3) - { - win_b = window; - } - // Set step_x and step_y for matrix B. Scale by a factor of 8 the X range as the input transposed matrix A has 8 times less the cols of the dst matrix - win_b.set(Window::DimX, Window::Dimension(window.x().start() / 8, window.x().end() / 8, in_b_stride)); - win_b.set(Window::DimY, Window::Dimension(0, 0, 0)); - - Iterator ina(lhs, win_a); - Iterator inb(rhs, win_b); - Iterator out(dst, window); - - const bool multiply_alpha = !(helpers::float_ops::is_one(alpha)); - - const float16x8_t alpha_f16 = vdupq_n_f16(alpha); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto *mtx_a0 = reinterpret_cast(ina.ptr()); - const auto *mtx_b0 = reinterpret_cast(inb.ptr()); - auto *mtx_out = reinterpret_cast(out.ptr()); - float16x8x4_t c = - { - { - vdupq_n_f16(0.f), - vdupq_n_f16(0.f), - vdupq_n_f16(0.f), - vdupq_n_f16(0.f) - } - }; - - /* - This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values) - |a00 a01 a02 a03 | a04 a05 a06 a07| - |a10 a11 a12 a13 | a14 a15 a16 a17| - |a20 a21 a22 a23 | a24 a25 a26 a27| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 | a40 a50 a60 a70 | ... - |a30 a31 a32 a33 | a34 a35 a36 a37| | a04 a14 a24 a34 || a05 a15 a25 a35 || a06 a15 a26 a36 || a07 a17 a27 a37 | a44 a54 a64 a74 | ... - |a40 a41 a42 a43 | a44 a45 a46 a47| - |a50 a51 a52 a53 | a54 a55 a56 a57| - |a60 a61 a62 a63 | a64 a65 a66 a67| - |a70 a71 a72 a73 | a74 a75 a76 a77| - - After this operation, the dst matrix will have the following shape: [ height * 4, width / 4 ] - - B Matrix has been transposed as shown below - - |b00 b01 b02 b03 b04 b05 b06 b07| - |b10 b11 b12 b13 b14 b15 b16 b17| - |b20 b21 b22 b23 b24 b25 b26 b27| - |b30 b31 b32 b33 b34 b35 b36 b37| - -------------------> - - |b00 b01 b02 b03 b04 b05 b06 b07||b10 b11 b12 b13 b14 b15 b16 b17||b20 b21 b22 b23 b24 b25 b26 b27||b30 b31 b32 b33 b34 b35 b36 b37| - - c.val[0][0] = a00*b00 + a01*b10 + a02*b20 + a03*b30 - c.val[0][1] = a00*b01 + a01*b11 + a02*b21 + a03*b31 - - The size of the dst tensor's XY-plane must be the following shape [ width * 8, height / 8 ]. 
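// --- Illustrative note (not part of the patch): the interleaving described in the diagram
// above makes the four values of each A column consecutive in memory, so a 4x4 block of the
// product can be accumulated from sequential loads. The sketch below (not the
// CpuGemmInterleave4x4 kernel) interleaves a single group of four rows; the tiling of the full
// matrix and of the transposed B follows the diagram.
#include <cstddef>
#include <vector>

std::vector<float> interleave_four_rows(const float *a, std::size_t lda, std::size_t cols)
{
    // 'a' points at the first of four consecutive rows, each 'lda' elements apart.
    std::vector<float> out;
    out.reserve(4 * cols);
    for (std::size_t c = 0; c < cols; ++c)
    {
        out.push_back(a[0 * lda + c]); // a0c
        out.push_back(a[1 * lda + c]); // a1c
        out.push_back(a[2 * lda + c]); // a2c
        out.push_back(a[3 * lda + c]); // a3c
    }
    return out; // a00 a10 a20 a30 | a01 a11 a21 a31 | ...
}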
All other dimensions must have the same size. - */ - const float16_t *mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x; - - for(; mtx_b0 <= (mtx_b0_end_addr - 32);) - - { - const float16x8_t p00 = vld1q_f16(mtx_a0); - const float16x8_t p02 = vld1q_f16(mtx_a0 + 8); - - const float16x8_t q00 = vld1q_f16(mtx_b0); - const float16x8_t q02 = vld1q_f16(mtx_b0 + 8); - const float16x8_t q04 = vld1q_f16(mtx_b0 + 16); - const float16x8_t q06 = vld1q_f16(mtx_b0 + 24); - - c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vgetq_lane_f16(p00, 0))); - c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vgetq_lane_f16(p00, 1))); - c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vgetq_lane_f16(p00, 2))); - c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vgetq_lane_f16(p00, 3))); - - c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q02, vgetq_lane_f16(p00, 4))); - c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q02, vgetq_lane_f16(p00, 5))); - c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q02, vgetq_lane_f16(p00, 6))); - c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q02, vgetq_lane_f16(p00, 7))); - - c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q04, vgetq_lane_f16(p02, 0))); - c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q04, vgetq_lane_f16(p02, 1))); - c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q04, vgetq_lane_f16(p02, 2))); - c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q04, vgetq_lane_f16(p02, 3))); - - c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q06, vgetq_lane_f16(p02, 4))); - c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q06, vgetq_lane_f16(p02, 5))); - c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q06, vgetq_lane_f16(p02, 6))); - c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q06, vgetq_lane_f16(p02, 7))); - - mtx_a0 += 16; - mtx_b0 += 32; - } - - for(; mtx_b0 < mtx_b0_end_addr;) - - { - const float16x4_t p00 = vld1_f16(mtx_a0); - const float16x8_t q00 = vld1q_f16(mtx_b0); - - c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vget_lane_f16(p00, 0))); - c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vget_lane_f16(p00, 1))); - c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vget_lane_f16(p00, 2))); - c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vget_lane_f16(p00, 3))); - - mtx_a0 += 4; - mtx_b0 += 8; - } - - if(multiply_alpha) - { - c.val[0] = vmulq_f16(c.val[0], alpha_f16); - c.val[1] = vmulq_f16(c.val[1], alpha_f16); - c.val[2] = vmulq_f16(c.val[2], alpha_f16); - c.val[3] = vmulq_f16(c.val[3], alpha_f16); - } - - if(id.x() < (out_width - 8)) - { - vst1q_f16(mtx_out, c.val[0]); - if(id.y() + 1 < out_height) - { - vst1q_f16(mtx_out + 1 * out_stride, c.val[1]); - if(id.y() + 2 < out_height) + // Left-over columns + const int columns_left = out_width - id.x() - 4; + for (auto x = 0; x < columns_left; ++x) { - vst1q_f16(mtx_out + 2 * out_stride, c.val[2]); - if(id.y() + 3 < out_height) + *(mtx_out1 + x) = acc01[x]; + if (id.y() + 1 < out_height) { - vst1q_f16(mtx_out + 3 * out_stride, c.val[3]); + *(mtx_out1 + x + out_stride1) = acc11[x]; + if (id.y() + 2 < out_height) + { + *(mtx_out1 + x + out_stride2) = acc21[x]; + if (id.y() + 3 < out_height) + { + *(mtx_out1 + x + out_stride3) = acc31[x]; + } + } } } } - } - else - { - // Left-over columns - const int columns_left = out_width - id.x(); - for(int x = 0; x < columns_left; ++x) + else { - *(mtx_out + x) = c.val[0][x]; - if(id.y() + 1 < out_height) + // Left-over columns + const int columns_left = out_width - id.x(); + for (int x = 0; x < columns_left; ++x) { - *(mtx_out + x + 1 * out_stride) = c.val[1][x]; - if(id.y() + 2 < out_height) + *(mtx_out0 + x) = acc00[x]; + if (id.y() + 1 < 
out_height) { - *(mtx_out + x + 2 * out_stride) = c.val[2][x]; - if(id.y() + 3 < out_height) + *(mtx_out0 + x + out_stride1) = acc10[x]; + if (id.y() + 2 < out_height) { - *(mtx_out + x + 3 * out_stride) = c.val[3][x]; + *(mtx_out0 + x + out_stride2) = acc20[x]; + if (id.y() + 3 < out_height) + { + *(mtx_out0 + x + out_stride3) = acc30[x]; + } } } } } - } - }, - ina, inb, out); + }, + ina, inb, out); } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h index 6bf865a624dcaa257af207d30df56ba580f7cf04..74ea4c2b1799467b99ac2d284a84b7b7b50d850c 100644 --- a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h +++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,22 +24,18 @@ #ifndef SRC_CORE_KERNELS_GEMMMATRIXMUL_IMPL_H #define SRC_CORE_KERNELS_GEMMMATRIXMUL_IMPL_H #include "arm_compute/core/Helpers.h" + #include "src/core/CPP/Validate.h" namespace arm_compute { namespace cpu { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -void vector_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha); - -void matrix_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha); - -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -void vector_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha); +void vector_matrix_multiply_f32( + const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha); -void matrix_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha); +void matrix_matrix_multiply_f32( + const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/gemm_matrix_mul/list.h b/src/cpu/kernels/gemm_matrix_mul/list.h index 9cdb58ae067f9685235754648e88280c6026aa9e..15b23b1d81653cc8c9f9ffe3fb4ca58a3558478f 100644 --- a/src/cpu/kernels/gemm_matrix_mul/list.h +++ b/src/cpu/kernels/gemm_matrix_mul/list.h @@ -27,8 +27,9 @@ namespace arm_compute { namespace cpu { -#define DECLARE_GEMMMATRIXMUL_KERNEL(func_name) \ - void func_name(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha, const bool is_dst_vector) +#define DECLARE_GEMMMATRIXMUL_KERNEL(func_name) \ + void func_name(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, \ + float alpha, const bool is_dst_vector) DECLARE_GEMMMATRIXMUL_KERNEL(neon_fp32_gemm_matrix_mul); DECLARE_GEMMMATRIXMUL_KERNEL(neon_fp16_gemm_matrix_mul); #undef DECLARE_GEMMMATRIXMUL_KERNEL diff --git a/src/cpu/kernels/genproposals/generic/neon/fp16.cpp b/src/cpu/kernels/genproposals/generic/neon/fp16.cpp index d4e469b691bc5bcdec536fd52ce4a10f0544c139..4ed7e54f1c2ebbc3bd09496f9a09994459181b25 100644 --- a/src/cpu/kernels/genproposals/generic/neon/fp16.cpp +++ b/src/cpu/kernels/genproposals/generic/neon/fp16.cpp @@ -27,10 +27,13 @@ namespace arm_compute { namespace cpu { -void 
neon_fp16_computeallanchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window) +void neon_fp16_computeallanchors(const ITensor *anchors, + ITensor *all_anchors, + ComputeAnchorsInfo anchors_info, + const Window &window) { return compute_all_anchors(anchors, all_anchors, anchors_info, window); } -} +} // namespace cpu } // namespace arm_compute #endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) diff --git a/src/cpu/kernels/genproposals/generic/neon/fp32.cpp b/src/cpu/kernels/genproposals/generic/neon/fp32.cpp index 09aa6ecec40b981f737221d079d02158cc527d67..f15cd63bb262963510d55ac353ccb7591d58ec53 100644 --- a/src/cpu/kernels/genproposals/generic/neon/fp32.cpp +++ b/src/cpu/kernels/genproposals/generic/neon/fp32.cpp @@ -26,9 +26,12 @@ namespace arm_compute { namespace cpu { -void neon_fp32_computeallanchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window) +void neon_fp32_computeallanchors(const ITensor *anchors, + ITensor *all_anchors, + ComputeAnchorsInfo anchors_info, + const Window &window) { return compute_all_anchors(anchors, all_anchors, anchors_info, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/genproposals/generic/neon/impl.cpp b/src/cpu/kernels/genproposals/generic/neon/impl.cpp index 824e85adaca9e500eb0a8653268682d054e28bdf..8cb76f3afb994a051ff32b4a9999b1cbd36bb13d 100644 --- a/src/cpu/kernels/genproposals/generic/neon/impl.cpp +++ b/src/cpu/kernels/genproposals/generic/neon/impl.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022 Arm Limited. + * Copyright (c) 2019-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -28,41 +28,10 @@ class ITensor; class Window; namespace cpu { -template -void compute_all_anchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window) -{ - Iterator all_anchors_it(all_anchors, window); - Iterator anchors_it(all_anchors, window); - - const size_t num_anchors = anchors->info()->dimension(1); - const T stride = 1.f / anchors_info.spatial_scale(); - const size_t feat_width = anchors_info.feat_width(); - - execute_window_loop(window, [&](const Coordinates & id) - { - const size_t anchor_offset = id.y() % num_anchors; - - const auto out_anchor_ptr = reinterpret_cast(all_anchors_it.ptr()); - const auto anchor_ptr = reinterpret_cast(anchors->ptr_to_element(Coordinates(0, anchor_offset))); - - const size_t shift_idy = id.y() / num_anchors; - const T shiftx = (shift_idy % feat_width) * stride; - const T shifty = (shift_idy / feat_width) * stride; - - *out_anchor_ptr = *anchor_ptr + shiftx; - *(out_anchor_ptr + 1) = *(1 + anchor_ptr) + shifty; - *(out_anchor_ptr + 2) = *(2 + anchor_ptr) + shiftx; - *(out_anchor_ptr + 3) = *(3 + anchor_ptr) + shifty; - }, - all_anchors_it); -} - -template void compute_all_anchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window); -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -template void compute_all_anchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window); -#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) - -void compute_all_anchors_qasymm16(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window) +void compute_all_anchors_qasymm16(const ITensor *anchors, + ITensor *all_anchors, + 
ComputeAnchorsInfo anchors_info, + const Window &window) { Iterator all_anchors_it(all_anchors, window); Iterator anchors_it(all_anchors, window); @@ -73,28 +42,30 @@ void compute_all_anchors_qasymm16(const ITensor *anchors, ITensor *all_anchors, const UniformQuantizationInfo qinfo = anchors->info()->quantization_info().uniform(); - execute_window_loop(window, [&](const Coordinates & id) - { - const size_t anchor_offset = id.y() % num_anchors; + execute_window_loop( + window, + [&](const Coordinates &id) + { + const size_t anchor_offset = id.y() % num_anchors; - const auto out_anchor_ptr = reinterpret_cast(all_anchors_it.ptr()); - const auto anchor_ptr = reinterpret_cast(anchors->ptr_to_element(Coordinates(0, anchor_offset))); + const auto out_anchor_ptr = reinterpret_cast(all_anchors_it.ptr()); + const auto anchor_ptr = reinterpret_cast(anchors->ptr_to_element(Coordinates(0, anchor_offset))); - const size_t shift_idy = id.y() / num_anchors; - const float shiftx = (shift_idy % feat_width) * stride; - const float shifty = (shift_idy / feat_width) * stride; + const size_t shift_idy = id.y() / num_anchors; + const float shiftx = (shift_idy % feat_width) * stride; + const float shifty = (shift_idy / feat_width) * stride; - const float new_anchor_x1 = dequantize_qsymm16(*anchor_ptr, qinfo.scale) + shiftx; - const float new_anchor_y1 = dequantize_qsymm16(*(1 + anchor_ptr), qinfo.scale) + shifty; - const float new_anchor_x2 = dequantize_qsymm16(*(2 + anchor_ptr), qinfo.scale) + shiftx; - const float new_anchor_y2 = dequantize_qsymm16(*(3 + anchor_ptr), qinfo.scale) + shifty; + const float new_anchor_x1 = dequantize_qsymm16(*anchor_ptr, qinfo.scale) + shiftx; + const float new_anchor_y1 = dequantize_qsymm16(*(1 + anchor_ptr), qinfo.scale) + shifty; + const float new_anchor_x2 = dequantize_qsymm16(*(2 + anchor_ptr), qinfo.scale) + shiftx; + const float new_anchor_y2 = dequantize_qsymm16(*(3 + anchor_ptr), qinfo.scale) + shifty; - *out_anchor_ptr = quantize_qsymm16(new_anchor_x1, qinfo.scale); - *(out_anchor_ptr + 1) = quantize_qsymm16(new_anchor_y1, qinfo.scale); - *(out_anchor_ptr + 2) = quantize_qsymm16(new_anchor_x2, qinfo.scale); - *(out_anchor_ptr + 3) = quantize_qsymm16(new_anchor_y2, qinfo.scale); - }, - all_anchors_it); + *out_anchor_ptr = quantize_qsymm16(new_anchor_x1, qinfo.scale); + *(out_anchor_ptr + 1) = quantize_qsymm16(new_anchor_y1, qinfo.scale); + *(out_anchor_ptr + 2) = quantize_qsymm16(new_anchor_x2, qinfo.scale); + *(out_anchor_ptr + 3) = quantize_qsymm16(new_anchor_y2, qinfo.scale); + }, + all_anchors_it); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/genproposals/generic/neon/impl.h b/src/cpu/kernels/genproposals/generic/neon/impl.h index 88f5e520206ac8c0458fee42b19352276b394cd4..3317bcfbe6d153bf157bc8ad95d4c62a49713d50 100644 --- a/src/cpu/kernels/genproposals/generic/neon/impl.h +++ b/src/cpu/kernels/genproposals/generic/neon/impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. 
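// --- Illustrative note (not part of the patch): compute_all_anchors(_qasymm16) above
// replicates each base anchor across the feature map. In plain scalar form, output row i takes
// anchor (i % num_anchors) and translates it by the cell position derived from i / num_anchors,
// with stride = 1 / spatial_scale; the qasymm16 variant applies the same arithmetic between a
// dequantize and a requantize step.
#include <cstddef>

struct Box
{
    float x1, y1, x2, y2;
};

Box shifted_anchor(const Box  *anchors,       // num_anchors base anchors
                   std::size_t num_anchors,
                   std::size_t feat_width,    // feature-map width in cells
                   float       spatial_scale,
                   std::size_t i)             // output row index
{
    const float       stride  = 1.f / spatial_scale;
    const Box        &base    = anchors[i % num_anchors];
    const std::size_t cell    = i / num_anchors;
    const float       shift_x = static_cast<float>(cell % feat_width) * stride;
    const float       shift_y = static_cast<float>(cell / feat_width) * stride;

    return {base.x1 + shift_x, base.y1 + shift_y, base.x2 + shift_x, base.y2 + shift_y};
}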
* * SPDX-License-Identifier: MIT * @@ -24,18 +24,52 @@ #ifndef SRC_CORE_SVE_KERNELS_NEGENERATEPROPOSALSLAYERKERNEL_IMPL_H #define SRC_CORE_SVE_KERNELS_NEGENERATEPROPOSALSLAYERKERNEL_IMPL_H #include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" #include "arm_compute/core/Window.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { -class ITensor; -class Window; namespace cpu { template -void compute_all_anchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window); +void compute_all_anchors(const ITensor *anchors, + ITensor *all_anchors, + ComputeAnchorsInfo anchors_info, + const Window &window) +{ + Iterator all_anchors_it(all_anchors, window); + Iterator anchors_it(all_anchors, window); + + const size_t num_anchors = anchors->info()->dimension(1); + const T stride = 1.f / anchors_info.spatial_scale(); + const size_t feat_width = anchors_info.feat_width(); + + execute_window_loop( + window, + [&](const Coordinates &id) + { + const size_t anchor_offset = id.y() % num_anchors; + + const auto out_anchor_ptr = reinterpret_cast(all_anchors_it.ptr()); + const auto anchor_ptr = reinterpret_cast(anchors->ptr_to_element(Coordinates(0, anchor_offset))); + + const size_t shift_idy = id.y() / num_anchors; + const T shiftx = (shift_idy % feat_width) * stride; + const T shifty = (shift_idy / feat_width) * stride; + + *out_anchor_ptr = *anchor_ptr + shiftx; + *(out_anchor_ptr + 1) = *(1 + anchor_ptr) + shifty; + *(out_anchor_ptr + 2) = *(2 + anchor_ptr) + shiftx; + *(out_anchor_ptr + 3) = *(3 + anchor_ptr) + shifty; + }, + all_anchors_it); +} -void compute_all_anchors_qasymm16(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window); +void compute_all_anchors_qasymm16(const ITensor *anchors, + ITensor *all_anchors, + ComputeAnchorsInfo anchors_info, + const Window &window); } // namespace cpu } // namespace arm_compute #endif //define SRC_CORE_SVE_KERNELS_NEGENERATEPROPOSALSLAYERKERNEL_IMPL_H diff --git a/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp b/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp index cfb5a41d6edb3d7831a5011278f623f2f459281c..7182d0b27d087f77ef97e6b521dedd60c02a771f 100644 --- a/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp +++ b/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp @@ -26,9 +26,12 @@ namespace arm_compute { namespace cpu { -void neon_qu16_computeallanchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window) +void neon_qu16_computeallanchors(const ITensor *anchors, + ITensor *all_anchors, + ComputeAnchorsInfo anchors_info, + const Window &window) { return compute_all_anchors_qasymm16(anchors, all_anchors, anchors_info, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp b/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp index e9fcc84b354402f5be79cda3eb837cab821dec19..44418c0bb965264dabc67abf85abc3defd7df795 100644 --- a/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp +++ b/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,20 +22,176 @@ * SOFTWARE. 
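// --- Illustrative note (not part of the patch): the genproposals impl.h change above moves the
// templated compute_all_anchors definition into the header, so each per-type translation unit
// (fp32.cpp, fp16.cpp) instantiates it implicitly and the explicit
// "template void compute_all_anchors<...>" instantiations previously kept in impl.cpp are no
// longer needed. A condensed version of that pattern, with placeholder names:
#include <cstddef>

// impl.h (sketch): the definition is visible to every includer.
template <typename T>
void compute_all_anchors_sketch(const T *in, T *out, std::size_t n)
{
    for (std::size_t i = 0; i < n; ++i)
    {
        out[i] = in[i]; // stand-in for the real per-anchor arithmetic
    }
}

// fp32.cpp (sketch): using the template here instantiates it for float automatically.
void neon_fp32_computeallanchors_sketch(const float *in, float *out, std::size_t n)
{
    compute_all_anchors_sketch<float>(in, out, n);
}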
*/ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +#include "arm_compute/core/Helpers.h" + +#include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/instancenorm/generic/neon/impl.h" + namespace arm_compute { namespace cpu { -void neon_fp16_instancenorm(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, const Window &window) +namespace { - if(use_mixed_precision) +template +void vector_float_sum_fp16(AccType &result, AccType &result_square, const InputType &inputs) +{ + result = wrapper::vadd(result, inputs); + result_square = wrapper::vadd(result_square, wrapper::vmul(inputs, inputs)); +} + +template +InputType vector_float_norm_fp16(const InputType &inputs, + const AccType &vec_mean, + const AccType &vec_multip, + const AccType &vec_beta) +{ + return wrapper::vadd(wrapper::vmul(wrapper::vsub(inputs, vec_mean), vec_multip), vec_beta); +} + +template <> +inline void vector_float_sum_fp16(float32x4_t &result, float32x4_t &result_square, const float16x8_t &inputs) +{ + vector_float_sum_fp16(result, result_square, wrapper::vcvt(wrapper::vgetlow(inputs))); + vector_float_sum_fp16(result, result_square, wrapper::vcvt(wrapper::vgethigh(inputs))); +} +template <> +inline float16x8_t vector_float_norm_fp16(const float16x8_t &inputs, + const float32x4_t &vec_mean, + const float32x4_t &vec_multip, + const float32x4_t &vec_beta) +{ + const auto input_low = wrapper::vcvt(wrapper::vgetlow(inputs)); + const auto input_high = wrapper::vcvt(wrapper::vgethigh(inputs)); + const auto result_low = wrapper::vcvt(vector_float_norm_fp16(input_low, vec_mean, vec_multip, vec_beta)); + const auto result_high = + wrapper::vcvt(vector_float_norm_fp16(input_high, vec_mean, vec_multip, vec_beta)); + float16x8_t result = wrapper::vcombine(result_low, result_high); + + return result; +} + +template +void instance_normalization_nchw_fp16( + const ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window) +{ + /** SIMD vector tag type. 
*/ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; + + // Clear X/Y dimensions on execution window as we handle the planes manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + win.set(Window::DimY, Window::Dimension(0, 1, 1)); + + constexpr int window_step_x = 16 / sizeof(float16_t); + const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1); + + Iterator input_it(input, win); + execute_window_loop( + win, + [&](const Coordinates &id) + { + Window win_plane = window; + win_plane.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1)); + win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1)); + + Iterator input_plane_it(input, win_plane); + Iterator output_plane_it(output, win_plane); + + auto sum_h_w = static_cast(0.f); + auto sum_squares_h_w = static_cast(0.f); + + execute_window_loop( + win_plane, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input_plane_it.ptr()); + + auto vec_sum_h_w = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + + // Compute S elements per iteration + int x = window.x().start(); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + auto vec_input_val = wrapper::vloadq(input_ptr + x); + vector_float_sum_fp16(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val); + } + + auto vec2_sum_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w)); + auto vec2_sum_squares_h_w = + wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w)); + + vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w); + vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w); + + sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0); + sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0); + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + const auto value = static_cast(*(input_ptr + x)); + sum_h_w += value; + sum_squares_h_w += value * value; + } + }, + input_plane_it, output_plane_it); + + const auto mean_h_w = sum_h_w / elements_plane; + const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w; + + const auto multip_h_w = gamma / std::sqrt(var_h_w + epsilon); + const auto vec_mean_h_w = wrapper::vdup_n(static_cast(mean_h_w), ExactTagType{}); + const auto vec_multip_h_w = wrapper::vdup_n(static_cast(multip_h_w), ExactTagType{}); + const auto vec_beta = wrapper::vdup_n(static_cast(beta), ExactTagType{}); + + execute_window_loop( + win_plane, + [&](const Coordinates &) + { + auto input_ptr = reinterpret_cast(input_plane_it.ptr()); + auto output_ptr = reinterpret_cast(output_plane_it.ptr()); + + // Compute S elements per iteration + int x = window.x().start(); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + const auto vec_val = wrapper::vloadq(input_ptr + x); + const auto normalized_vec = + vector_float_norm_fp16(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta); + wrapper::vstore(output_ptr + x, normalized_vec); + } + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + const auto val = static_cast(*(input_ptr + x)); + *(output_ptr + x) = static_cast((val - mean_h_w) * multip_h_w + beta); + } + }, + input_plane_it, output_plane_it); + }, + input_it); +} +} // namespace + +void neon_fp16_instancenorm(ITensor *input, + ITensor *output, 
+    float gamma,
+    float beta,
+    float epsilon,
+    bool use_mixed_precision,
+    const Window &window)
+{
+    if (use_mixed_precision)
     {
-        return instance_normalization_nchw(input, output, gamma, beta, epsilon, window);
+        return instance_normalization_nchw_fp16(input, output, gamma, beta, epsilon, window);
     }
     else
     {
-        return instance_normalization_nchw(input, output, gamma, beta, epsilon, window);
+        return instance_normalization_nchw_fp16(input, output, gamma, beta, epsilon, window);
     }
 }
 } // namespace cpu
diff --git a/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp b/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp
index 061dd9585c76a33c3cbaadd069adb7372d7eb912..e1ca05518d52f28908605c75f8b33107b8e86e3e 100644
--- a/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp
@@ -26,7 +26,13 @@ namespace arm_compute
 {
 namespace cpu
 {
-void neon_fp32_instancenorm(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, const Window &window)
+void neon_fp32_instancenorm(ITensor *input,
+    ITensor *output,
+    float gamma,
+    float beta,
+    float epsilon,
+    bool use_mixed_precision,
+    const Window &window)
 {
     ARM_COMPUTE_UNUSED(use_mixed_precision);
     return instance_normalization_nchw(input, output, gamma, beta, epsilon, window);
diff --git a/src/cpu/kernels/instancenorm/generic/neon/impl.cpp b/src/cpu/kernels/instancenorm/generic/neon/impl.cpp
index e35cf9760841439da33ac59c33973b8610d23f1d..515079e1b54a1c308399e5474d76c78276c1b8a2 100644
--- a/src/cpu/kernels/instancenorm/generic/neon/impl.cpp
+++ b/src/cpu/kernels/instancenorm/generic/neon/impl.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,6 +22,7 @@
  * SOFTWARE.
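// --- Editor's note: illustrative sketch, not part of the patch. ---
// neon_fp16_instancenorm() above selects an FP32 accumulator when
// use_mixed_precision is set and an FP16 accumulator otherwise. The scalar
// shape of that trade-off, with illustrative names only, is roughly:
#include <cstddef>

// Sum and sum-of-squares over a buffer. Choosing AccType = float reduces
// rounding error when InType is a 16-bit float, at the cost of a conversion
// per element.
template <typename AccType, typename InType>
void accumulate_plane(const InType *data, std::size_t n, AccType &sum, AccType &sum_sq)
{
    sum    = AccType(0);
    sum_sq = AccType(0);
    for (std::size_t i = 0; i < n; ++i)
    {
        const AccType v = static_cast<AccType>(data[i]);
        sum += v;
        sum_sq += v * v;
    }
}
// --- end editor's note ---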
*/ #include "src/cpu/kernels/instancenorm/generic/neon/impl.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute @@ -37,37 +38,16 @@ void vector_float_sum(AccType &result, AccType &result_square, const InputType & result_square = wrapper::vadd(result_square, wrapper::vmul(inputs, inputs)); } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template <> -inline void vector_float_sum(float32x4_t &result, float32x4_t &result_square, const float16x8_t &inputs) -{ - vector_float_sum(result, result_square, wrapper::vcvt(wrapper::vgetlow(inputs))); - vector_float_sum(result, result_square, wrapper::vcvt(wrapper::vgethigh(inputs))); -} -template <> -inline float16x8_t vector_float_norm(const float16x8_t &inputs, const float32x4_t &vec_mean, const float32x4_t &vec_multip, const float32x4_t &vec_beta) -{ - const auto input_low = wrapper::vcvt(wrapper::vgetlow(inputs)); - const auto input_high = wrapper::vcvt(wrapper::vgethigh(inputs)); - const auto result_low = wrapper::vcvt(vector_float_norm(input_low, vec_mean, vec_multip, vec_beta)); - const auto result_high = wrapper::vcvt(vector_float_norm(input_high, vec_mean, vec_multip, vec_beta)); - float16x8_t result = wrapper::vcombine(result_low, result_high); - - return result; -} -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - template -InputType vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta) +InputType +vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta) { return wrapper::vadd(wrapper::vmul(wrapper::vsub(inputs, vec_mean), vec_multip), vec_beta); } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC template -void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window) +void instance_normalization_nchw( + ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window) { /** SIMD vector tag type. 
*/ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; @@ -81,92 +61,96 @@ void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, f const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1); Iterator input_it(input, win); - execute_window_loop(win, [&](const Coordinates & id) - { - Window win_plane = window; - win_plane.set(Window::DimX, Window::Dimension(0, 1, 1)); - win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1)); - win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1)); - - Iterator input_plane_it(input, win_plane); - Iterator output_plane_it(output, win_plane); - - auto sum_h_w = static_cast(0.f); - auto sum_squares_h_w = static_cast(0.f); - - execute_window_loop(win_plane, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input_plane_it.ptr()); - - auto vec_sum_h_w = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - - // Compute S elements per iteration - int x = window.x().start(); - for(; x <= (window.x().end() - window_step_x); x += window_step_x) - { - auto vec_input_val = wrapper::vloadq(input_ptr + x); - vector_float_sum(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val); - } - - auto vec2_sum_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w)); - auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w)); - - vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w); - vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w); - - sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0); - sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0); - - // Compute left-over elements - for(; x < window.x().end(); ++x) - { - const auto value = static_cast(*(input_ptr + x)); - sum_h_w += value; - sum_squares_h_w += value * value; - } - }, - input_plane_it, output_plane_it); - - const auto mean_h_w = sum_h_w / elements_plane; - const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w; - - const auto multip_h_w = gamma / std::sqrt(var_h_w + epsilon); - const auto vec_mean_h_w = wrapper::vdup_n(static_cast(mean_h_w), ExactTagType{}); - const auto vec_multip_h_w = wrapper::vdup_n(static_cast(multip_h_w), ExactTagType{}); - const auto vec_beta = wrapper::vdup_n(static_cast(beta), ExactTagType{}); - - execute_window_loop(win_plane, [&](const Coordinates &) + execute_window_loop( + win, + [&](const Coordinates &id) { - auto input_ptr = reinterpret_cast(input_plane_it.ptr()); - auto output_ptr = reinterpret_cast(output_plane_it.ptr()); - - // Compute S elements per iteration - int x = window.x().start(); - //auto vec_val = wrapper::vdup_n(static_cast(0.0f), ExactTagType{}); - for(; x <= (window.x().end() - window_step_x); x += window_step_x) - { - const auto vec_val = wrapper::vloadq(input_ptr + x); - const auto normalized_vec = vector_float_norm(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta); - wrapper::vstore(output_ptr + x, normalized_vec); - } - - // Compute left-over elements - for(; x < window.x().end(); ++x) - { - const auto val = static_cast(*(input_ptr + x)); - *(output_ptr + x) = static_cast((val - mean_h_w) * multip_h_w + beta); - } + Window win_plane = window; + win_plane.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1)); + win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1)); + + 
Iterator input_plane_it(input, win_plane); + Iterator output_plane_it(output, win_plane); + + auto sum_h_w = static_cast(0.f); + auto sum_squares_h_w = static_cast(0.f); + + execute_window_loop( + win_plane, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input_plane_it.ptr()); + + auto vec_sum_h_w = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + + // Compute S elements per iteration + int x = window.x().start(); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + auto vec_input_val = wrapper::vloadq(input_ptr + x); + vector_float_sum(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val); + } + + auto vec2_sum_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w)); + auto vec2_sum_squares_h_w = + wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w)); + + vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w); + vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w); + + sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0); + sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0); + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + const auto value = static_cast(*(input_ptr + x)); + sum_h_w += value; + sum_squares_h_w += value * value; + } + }, + input_plane_it, output_plane_it); + + const auto mean_h_w = sum_h_w / elements_plane; + const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w; + + const auto multip_h_w = gamma / std::sqrt(var_h_w + epsilon); + const auto vec_mean_h_w = wrapper::vdup_n(static_cast(mean_h_w), ExactTagType{}); + const auto vec_multip_h_w = wrapper::vdup_n(static_cast(multip_h_w), ExactTagType{}); + const auto vec_beta = wrapper::vdup_n(static_cast(beta), ExactTagType{}); + + execute_window_loop( + win_plane, + [&](const Coordinates &) + { + auto input_ptr = reinterpret_cast(input_plane_it.ptr()); + auto output_ptr = reinterpret_cast(output_plane_it.ptr()); + + // Compute S elements per iteration + int x = window.x().start(); + //auto vec_val = wrapper::vdup_n(static_cast(0.0f), ExactTagType{}); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + const auto vec_val = wrapper::vloadq(input_ptr + x); + const auto normalized_vec = vector_float_norm(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta); + wrapper::vstore(output_ptr + x, normalized_vec); + } + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + const auto val = static_cast(*(input_ptr + x)); + *(output_ptr + x) = static_cast((val - mean_h_w) * multip_h_w + beta); + } + }, + input_plane_it, output_plane_it); }, - input_plane_it, output_plane_it); - }, - input_it); + input_it); } -template void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window); -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -template void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window); -template void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window); -#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +template void instance_normalization_nchw( + ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window); } 
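// --- Editor's note: illustrative sketch, not part of the patch. ---
// Per (channel, batch) plane, the NEON loops above compute the same result as
// this scalar reference: mean and variance from a running sum and sum of
// squares, then out = (in - mean) * gamma / sqrt(var + epsilon) + beta.
#include <cmath>
#include <cstddef>

void instance_norm_plane_ref(const float *in, float *out, std::size_t elements,
                             float gamma, float beta, float epsilon)
{
    float sum = 0.f, sum_sq = 0.f;
    for (std::size_t i = 0; i < elements; ++i)
    {
        sum += in[i];
        sum_sq += in[i] * in[i];
    }
    const float mean   = sum / elements;                   // mean_h_w
    const float var    = sum_sq / elements - mean * mean;  // var_h_w
    const float multip = gamma / std::sqrt(var + epsilon); // multip_h_w
    for (std::size_t i = 0; i < elements; ++i)
    {
        out[i] = (in[i] - mean) * multip + beta;
    }
}
// --- end editor's note ---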
// namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/instancenorm/generic/neon/impl.h b/src/cpu/kernels/instancenorm/generic/neon/impl.h index 1d413a9bcd8240f9708357f9caeec933b126d0ca..e1cc7487f7adc6260296f235780f26c0db7711f5 100644 --- a/src/cpu/kernels/instancenorm/generic/neon/impl.h +++ b/src/cpu/kernels/instancenorm/generic/neon/impl.h @@ -32,22 +32,15 @@ namespace arm_compute namespace cpu { template -void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window); +void instance_normalization_nchw( + ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window); template void vector_float_sum(AccType &result, AccType &result_square, const InputType &inputs); template -InputType vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta); - -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -template <> -inline void vector_float_sum(float32x4_t &result, float32x4_t &result_square, const float16x8_t &inputs); - -template <> -inline float16x8_t vector_float_norm(const float16x8_t &inputs, const float32x4_t &vec_mean, const float32x4_t &vec_multip, const float32x4_t &vec_beta); -#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) - +InputType +vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta); } // namespace cpu } // namespace arm_compute #endif //define SRC_CORE_SVE_KERNELS_INSTANCENORM_IMPL_H diff --git a/src/cpu/kernels/instancenorm/list.h b/src/cpu/kernels/instancenorm/list.h index 54f1d3213f195afa159b3becc7489321cdebe926..51b496c41d2d72c5916e7e14514eaf0d688f6aea 100644 --- a/src/cpu/kernels/instancenorm/list.h +++ b/src/cpu/kernels/instancenorm/list.h @@ -27,8 +27,9 @@ namespace arm_compute { namespace cpu { -#define DECLARE_INSTANCENORM_KERNEL(func_name) \ - void func_name(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, const Window &window) +#define DECLARE_INSTANCENORM_KERNEL(func_name) \ + void func_name(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, \ + const Window &window) DECLARE_INSTANCENORM_KERNEL(neon_fp32_instancenorm); DECLARE_INSTANCENORM_KERNEL(neon_fp16_instancenorm); #undef DECLARE_INSTANCENORM_KERNEL diff --git a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp index b503a8b734856b8aaf808b8a3b08dfeaa270b962..32d9ca4eac7684dfa434dfd6f514e32a16eb64bb 100644 --- a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp +++ b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp @@ -24,18 +24,17 @@ #include "src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" -#include "src/core/utils/AssemblyUtils.h" - #include "src/core/NEON/kernels/assembly/depthwise.hpp" +#include "src/core/utils/AssemblyUtils.h" #include "depthwise_common.hpp" - #include namespace arm_compute @@ 
-54,9 +53,13 @@ constexpr unsigned int idx_channels = 0; constexpr unsigned int idx_batches = 3; template -void create_arm_dwc(const ITensorInfo *src, const ITensorInfo *weights, ITensorInfo *dst, - const ConvolutionInfo &info, const CPUInfo &cpu_info, - std::unique_ptr &kernel, std::string &_name) +void create_arm_dwc(const ITensorInfo *src, + const ITensorInfo *weights, + ITensorInfo *dst, + const ConvolutionInfo &info, + const CPUInfo &cpu_info, + std::unique_ptr &kernel, + std::string &_name) { unsigned int stride_cols{}; unsigned int stride_rows{}; @@ -79,13 +82,13 @@ void create_arm_dwc(const ITensorInfo *src, const ITensorInfo *weights, ITensorI const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info); - arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols, dilation_rows, dilation_cols, - n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier, - padding, activation, nullptr); + arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols, + dilation_rows, dilation_cols, n_batches, src_rows, src_cols, n_channels, + dst_rows, dst_cols, info.depth_multiplier, padding, activation, nullptr); // Configure assembly pooling kernel auto dwc_kernel_asm = arm_conv::depthwise::depthwise(args); - if(dwc_kernel_asm == nullptr) + if (dwc_kernel_asm == nullptr) { // Configuration not supported: Leave function unconfigured: return; @@ -96,11 +99,16 @@ void create_arm_dwc(const ITensorInfo *src, const ITensorInfo *weights, ITensorI } template -void create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, ITensorInfo *dst, - const ConvolutionInfo &info, const CPUInfo &cpu_info, +void create_arm_dwc_quant(const ITensorInfo *src, + const ITensorInfo *weights, + ITensorInfo *dst, + const ConvolutionInfo &info, + const CPUInfo &cpu_info, std::unique_ptr &kernel, - std::vector &multipliers, std::vector &right_shifts, std::vector &left_shifts, - std::string &_name) + std::vector &multipliers, + std::vector &right_shifts, + std::vector &left_shifts, + std::string &_name) { unsigned int stride_cols{}; unsigned int stride_rows{}; @@ -123,9 +131,9 @@ void create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, IT const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info); - arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols, dilation_rows, dilation_cols, - n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier, - padding, activation, nullptr); + arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols, + dilation_rows, dilation_cols, n_batches, src_rows, src_cols, n_channels, + dst_rows, dst_cols, info.depth_multiplier, padding, activation, nullptr); const auto src_qinfo = src->quantization_info().uniform(); const auto weights_qinfo = weights->quantization_info(); @@ -135,64 +143,50 @@ void create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, IT multipliers.resize(num_filters); std::vector dst_shifts(num_filters); - quantization::compute_quantized_multipliers_and_shifts(src, - weights, - dst, - multipliers.data(), - dst_shifts.data()); + quantization::compute_quantized_multipliers_and_shifts(src, weights, dst, multipliers.data(), dst_shifts.data()); // Quantize activation bounds int32_t min_activation = std::numeric_limits::lowest(); 
int32_t max_activation = std::numeric_limits::max(); - if(info.act_info.enabled()) + if (info.act_info.enabled()) { - std::tie(min_activation, max_activation) = get_quantized_activation_min_max(info.act_info, src->data_type(), dst_qinfo); + std::tie(min_activation, max_activation) = + get_quantized_activation_min_max(info.act_info, src->data_type(), dst_qinfo); } // Set quantization parameters for assembly kernels arm_gemm::Requantize32 requant_args{}; - if(is_data_type_quantized_per_channel(weights->data_type())) + if (is_data_type_quantized_per_channel(weights->data_type())) { left_shifts.resize(num_filters); right_shifts.resize(num_filters); bool need_left_shift = false; // Select more optimized path if left shift is not needed - for(unsigned int i = 0; i < num_filters; ++i) + for (unsigned int i = 0; i < num_filters; ++i) { left_shifts[i] = std::max(-dst_shifts[i], static_cast(0)); right_shifts[i] = std::min(-dst_shifts[i], static_cast(0)); - if(dst_shifts[i] < 0 && !need_left_shift) + if (dst_shifts[i] < 0 && !need_left_shift) { need_left_shift = true; } } - requant_args = arm_gemm::Requantize32(nullptr, - 0, - src_qinfo.offset, - weights_qinfo.uniform().offset, - dst_qinfo.offset, - (need_left_shift) ? left_shifts.data() : nullptr, - right_shifts.data(), - multipliers.data(), - static_cast(min_activation), - static_cast(max_activation)); + requant_args = arm_gemm::Requantize32(nullptr, 0, src_qinfo.offset, weights_qinfo.uniform().offset, + dst_qinfo.offset, (need_left_shift) ? left_shifts.data() : nullptr, + right_shifts.data(), multipliers.data(), + static_cast(min_activation), static_cast(max_activation)); } else { - requant_args = arm_gemm::Requantize32(nullptr, - 0, - src_qinfo.offset, - weights_qinfo.uniform().offset, - dst_qinfo.offset, - -dst_shifts[0], - multipliers[0], - static_cast(min_activation), - static_cast(max_activation)); + requant_args = arm_gemm::Requantize32(nullptr, 0, src_qinfo.offset, weights_qinfo.uniform().offset, + dst_qinfo.offset, -dst_shifts[0], multipliers[0], + static_cast(min_activation), static_cast(max_activation)); } // Configure assembly pooling kernel with requantization - auto dwc_kernel_asm = arm_conv::depthwise::depthwise(args, requant_args); - if(dwc_kernel_asm == nullptr) + auto dwc_kernel_asm = + arm_conv::depthwise::depthwise(args, requant_args); + if (dwc_kernel_asm == nullptr) { // Configuration not supported: Leave function unconfigured: return; @@ -203,18 +197,18 @@ void create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, IT } // namespace CpuDepthwiseConv2dAssemblyWrapperKernel::CpuDepthwiseConv2dAssemblyWrapperKernel() - : _kernel_asm(nullptr), - _multipliers(), - _left_shifts(), - _right_shifts(), - _name() + : _kernel_asm(nullptr), _multipliers(), _left_shifts(), _right_shifts(), _name() { } CpuDepthwiseConv2dAssemblyWrapperKernel::~CpuDepthwiseConv2dAssemblyWrapperKernel() = default; -void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *, ITensorInfo *dst, - const ConvolutionInfo &info, const CPUInfo &cpu_info) +void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *, + ITensorInfo *dst, + const ConvolutionInfo &info, + const CPUInfo &cpu_info) { ARM_COMPUTE_UNUSED(cpu_info); ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); @@ -225,24 +219,30 @@ void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src, _name = 
"CpuDepthwiseConv2dAssemblyWrapperKernel"; std::string asm_kernel_name(""); #if defined(__aarch64__) - switch(src->data_type()) + switch (src->data_type()) { case DataType::QASYMM8: - if(is_data_type_quantized_per_channel(weights->data_type())) + if (is_data_type_quantized_per_channel(weights->data_type())) { - create_arm_dwc_quant(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts, asm_kernel_name); + create_arm_dwc_quant(src, weights, dst, info, cpu_info, _kernel_asm, + _multipliers, _right_shifts, _left_shifts, + asm_kernel_name); } else { - create_arm_dwc_quant(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts, asm_kernel_name); + create_arm_dwc_quant(src, weights, dst, info, cpu_info, _kernel_asm, + _multipliers, _right_shifts, _left_shifts, + asm_kernel_name); } break; case DataType::QASYMM8_SIGNED: - create_arm_dwc_quant(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts, asm_kernel_name); + create_arm_dwc_quant(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, + _right_shifts, _left_shifts, asm_kernel_name); break; #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) case DataType::F16: - create_arm_dwc(src, weights, dst, info, cpu_info, _kernel_asm, asm_kernel_name); + create_arm_dwc(src, weights, dst, info, cpu_info, _kernel_asm, + asm_kernel_name); break; #endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) case DataType::F32: @@ -255,13 +255,17 @@ void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src, Window win = calculate_max_window(*dst, Steps()); ICpuKernel::configure(win); - if(_kernel_asm != nullptr) + if (_kernel_asm != nullptr) { _name += "/" + asm_kernel_name; } } -Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info) +Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const ConvolutionInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); @@ -269,10 +273,12 @@ Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src, ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels"); #endif // !defined(__aarch64__) ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC, "Only NHWC is supported by assembly kernels"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC, + "Only NHWC is supported by assembly kernels"); - if(is_data_type_quantized_per_channel(weights->data_type())) + if (is_data_type_quantized_per_channel(weights->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size()); @@ -282,12 +288,12 @@ Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); } - if(bias != nullptr) + if (bias != nullptr) { 
ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(0)); - if(is_data_type_quantized(src->data_type())) + if (is_data_type_quantized(src->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); } @@ -297,7 +303,7 @@ Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src, } } - if(dst->total_size() > 0) + if (dst->total_size() > 0) { const TensorShape dst_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); @@ -305,17 +311,15 @@ Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src, } // Assembly kernels cannot work with padding greater than the kernel. - const auto &padding = info.pad_stride_info; - const auto &dilation = info.dilation; + const auto &padding = info.pad_stride_info; + const auto &dilation = info.dilation; const auto &wei_shape = weights->tensor_shape(); const auto dilated_wei_w = wei_shape[1] + (wei_shape[1] - 1) * (dilation.x() - 1); const auto dilated_wei_h = wei_shape[2] + (wei_shape[2] - 1) * (dilation.y() - 1); - ARM_COMPUTE_RETURN_ERROR_ON( - padding.pad_left() >= dilated_wei_w || padding.pad_right() >= dilated_wei_w || - padding.pad_top() >= dilated_wei_h || padding.pad_bottom() >= dilated_wei_h - ); + ARM_COMPUTE_RETURN_ERROR_ON(padding.pad_left() >= dilated_wei_w || padding.pad_right() >= dilated_wei_w || + padding.pad_top() >= dilated_wei_h || padding.pad_bottom() >= dilated_wei_h); return Status{}; } @@ -351,13 +355,12 @@ void CpuDepthwiseConv2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom); const size_t ld_dst_batch = ld_dst_row * dst_shape[2]; - _kernel_asm->execute(src_ptr, ld_src_col, ld_src_row, ld_src_batch, - parameters_ptr, - dst_ptr, ld_dst_col, ld_dst_row, ld_dst_batch, - working_space, info.thread_id, info.num_threads); + _kernel_asm->execute(src_ptr, ld_src_col, ld_src_row, ld_src_batch, parameters_ptr, dst_ptr, ld_dst_col, ld_dst_row, + ld_dst_batch, working_space, info.thread_id, info.num_threads); } -void CpuDepthwiseConv2dAssemblyWrapperKernel::pack_parameters(void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weight_row) +void CpuDepthwiseConv2dAssemblyWrapperKernel::pack_parameters( + void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weight_row) { _kernel_asm->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weight_row); } diff --git a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h index f61cb1b09c4572eeffb35630ebf1ecad6f8339b4..fadaefb99950b0c467e5475fd75692022bbec708 100644 --- a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h +++ b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_WRAPPER_KERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" #include "src/cpu/kernels/CpuKernelSelectionTypes.h" @@ -35,8 +36,8 @@ namespace depthwise { // Forward declarations class IDepthwiseCommon; -} // depthwise -} // arm_conv +} // namespace depthwise +} // namespace arm_conv namespace arm_compute { @@ -66,7 +67,12 @@ public: 
* @param[in] info Depthwise convolution layer meta-data. * @param[in] cpu_info CPU information needed to select the most appropriate kernel. */ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const ConvolutionInfo &info, const CPUInfo &cpu_info); + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *dst, + const ConvolutionInfo &info, + const CPUInfo &cpu_info); /** Indicates whether or not this function can be used to process the given parameters. * @@ -74,10 +80,14 @@ public: * * @return a status. */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const ConvolutionInfo &info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; /** Pack bias and weights in a storage space for the assembly kernel @@ -88,7 +98,8 @@ public: * @param[in] ld_weights_col Columns displacement for the weights tensor. * @param[in] ld_weights_row Rows displacement for the weights tensor. */ - void pack_parameters(void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weights_row); + void pack_parameters( + void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weights_row); /** Get the amount of storage space required for the rearranged weights and bias. * diff --git a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp index 10ff4183c08d14bb0fbc676c25d5be717ffc4dc0..a161c800fd36695252a4f6554d40da4df3711e0d 100644 --- a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp +++ b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp @@ -22,14 +22,16 @@ * SOFTWARE. 
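// --- Editor's note: illustrative sketch, not part of the patch. ---
// The padding check in CpuDepthwiseConv2dAssemblyWrapperKernel::validate()
// above compares each pad against the dilated kernel extent. For a kernel of
// size k with dilation d the taps sit d elements apart, so the extent is:
constexpr unsigned int dilated_extent(unsigned int k, unsigned int d)
{
    return k + (k - 1) * (d - 1);
}
static_assert(dilated_extent(3, 1) == 3, "dilation 1 leaves the extent unchanged");
static_assert(dilated_extent(3, 2) == 5, "a dilated 3-tap kernel spans 5 elements");
// --- end editor's note ---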
*/ #include "src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h" + #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CPP/Validate.h" -#include "src/core/NEON/INEKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/INEKernel.h" #include @@ -41,7 +43,10 @@ namespace kernels { using namespace arm_compute::misc::shape_calculator; -void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info) +void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &info, + const CPUInfo &cpu_info) { ARM_COMPUTE_UNUSED(cpu_info); ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -52,10 +57,10 @@ void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorIn #if defined(__aarch64__) const bool requantize = src->quantization_info() != dst->quantization_info(); - switch(src->data_type()) + switch (src->data_type()) { case DataType::QASYMM8: - if(requantize) + if (requantize) { create_arm_pooling_requant(src, dst, info, cpu_info); } @@ -65,7 +70,7 @@ void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorIn } break; case DataType::QASYMM8_SIGNED: - if(requantize) + if (requantize) { create_arm_pooling_requant(src, dst, info, cpu_info); } @@ -91,7 +96,8 @@ void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorIn INEKernel::configure(win); } -Status CpuPool2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info) +Status +CpuPool2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); @@ -99,43 +105,52 @@ Status CpuPool2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const IT ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels"); #endif /* __aarch64__ */ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src->data_layout() != DataLayout::NHWC) || (info.data_layout != DataLayout::NHWC), "Only NHWC is supported by assembly kernels"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src->data_layout() != DataLayout::NHWC) || (info.data_layout != DataLayout::NHWC), + "Only NHWC is supported by assembly kernels"); ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.pool_type != PoolingType::AVG) && (info.pool_type != PoolingType::MAX), "Only AVG and MAX pooling are supported by assembly kernels"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_region_entirely_outside_input(info), "Pooling region that is entirely outside input tensor is unsupported by assembly kernels"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + is_pool_region_entirely_outside_input(info), + "Pooling region that is entirely outside input tensor is unsupported by assembly kernels"); - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); const auto 
src_qinfo = src->quantization_info().uniform(); const auto dst_qinfo = dst->quantization_info().uniform(); - if(src_qinfo != dst_qinfo) + if (src_qinfo != dst_qinfo) { const float multiplier = src_qinfo.scale / dst_qinfo.scale; int32_t dst_multiplier{}; int32_t dst_shift{}; - ARM_COMPUTE_RETURN_ERROR_ON(quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift)); + ARM_COMPUTE_RETURN_ERROR_ON( + quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift)); } else { - if(src->data_type() == DataType::QASYMM8) + if (src->data_type() == DataType::QASYMM8) { const bool has_padding = info.pad_stride_info.has_padding(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !info.exclude_padding && has_padding, + "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info"); } } } else { - if(src->data_type() == DataType::QASYMM8) + if (src->data_type() == DataType::QASYMM8) { // If dst is not configured, the quantization info are the same const bool has_padding = info.pad_stride_info.has_padding(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !info.exclude_padding && has_padding, + "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info"); } } return Status{}; @@ -154,9 +169,10 @@ void CpuPool2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window & ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); ITensor *workspace = tensors.get_tensor(TensorType::ACL_INT_0); - const auto in_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); - auto out_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes(); - auto working_space = (workspace == nullptr) ? nullptr : workspace->buffer() + workspace->info()->offset_first_element_in_bytes(); + const auto in_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); + auto out_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes(); + auto working_space = + (workspace == nullptr) ? nullptr : workspace->buffer() + workspace->info()->offset_first_element_in_bytes(); const auto src_shape = src->info()->tensor_shape(); const auto dst_shape = dst->info()->tensor_shape(); @@ -170,8 +186,7 @@ void CpuPool2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window & const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom); const size_t ld_dst_batch = ld_dst_row * dst_shape[2]; - _kernel_asm->execute(in_ptr, ld_src_col, ld_src_row, ld_src_batch, - out_ptr, ld_dst_col, ld_dst_row, ld_dst_batch, + _kernel_asm->execute(in_ptr, ld_src_col, ld_src_row, ld_src_batch, out_ptr, ld_dst_col, ld_dst_row, ld_dst_batch, working_space, info.thread_id, info.num_threads); } @@ -186,9 +201,14 @@ bool CpuPool2dAssemblyWrapperKernel::is_configured() const } template -void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info) +void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &info, + const CPUInfo &cpu_info) { - const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? 
arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX; + const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) + ? arm_conv::pooling::PoolingType::AVERAGE + : arm_conv::pooling::PoolingType::MAX; arm_conv::pooling::PoolingWindow window{}; window.cols = static_cast(info.pool_size.x()); @@ -197,7 +217,8 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, arm_conv::pooling::PoolingStride stride{}; std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride(); - const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() }; + const arm_conv::pooling::PaddingValues padding{info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), + info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom()}; constexpr unsigned int idx_width = 1; constexpr unsigned int idx_height = 2; @@ -211,11 +232,12 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, const unsigned int dst_rows = dst->dimension(idx_height); const unsigned int dst_cols = dst->dimension(idx_width); - arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr); + arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, + src_cols, n_channels, dst_rows, dst_cols, padding, nullptr); // Configure assembly pooling kernel auto pooling_kernel_asm = arm_conv::pooling::pooling(args); - if(pooling_kernel_asm == nullptr) + if (pooling_kernel_asm == nullptr) { // Configuration not supported: Leave function unconfigured: return; @@ -225,9 +247,14 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, } template -void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info) +void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &info, + const CPUInfo &cpu_info) { - const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX; + const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) + ? 
arm_conv::pooling::PoolingType::AVERAGE + : arm_conv::pooling::PoolingType::MAX; arm_conv::pooling::PoolingWindow window{}; window.cols = static_cast(info.pool_size.x()); @@ -236,7 +263,8 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInf arm_conv::pooling::PoolingStride stride{}; std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride(); - const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() }; + const arm_conv::pooling::PaddingValues padding{info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), + info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom()}; constexpr unsigned int idx_width = 1; constexpr unsigned int idx_height = 2; @@ -250,7 +278,8 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInf const unsigned int dst_rows = dst->dimension(idx_height); const unsigned int dst_cols = dst->dimension(idx_width); - arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr); + arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, + src_cols, n_channels, dst_rows, dst_cols, padding, nullptr); const auto src_qinfo = src->quantization_info().uniform(); const auto dst_qinfo = dst->quantization_info().uniform(); @@ -260,15 +289,15 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInf int32_t dst_shift{}; quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift); - const arm_conv::pooling::Requantize32 requant_args(src_qinfo.offset, - dst_qinfo.offset, + const arm_conv::pooling::Requantize32 requant_args(src_qinfo.offset, dst_qinfo.offset, dst_shift, // left shift 0, // right shift dst_multiplier); // Configure assembly pooling kernel with requantization - auto pooling_kernel_asm = arm_conv::pooling::pooling(args, requant_args); - if(pooling_kernel_asm == nullptr) + auto pooling_kernel_asm = + arm_conv::pooling::pooling(args, requant_args); + if (pooling_kernel_asm == nullptr) { // Configuration not supported: Leave function unconfigured: return; diff --git a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h index 8713d5c54d4a1c0d83084a3f76f65ce20b27578c..b4ff1e6f2db10a04bf64981229a071ca1bad511b 100644 --- a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h +++ b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h @@ -25,8 +25,9 @@ #define ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H #include "arm_compute/core/Types.h" -#include "src/core/NEON/kernels/assembly/pooling.hpp" + #include "src/core/common/Macros.h" +#include "src/core/NEON/kernels/assembly/pooling.hpp" #include "src/cpu/ICpuKernel.h" #include "src/cpu/kernels/CpuKernelSelectionTypes.h" @@ -101,7 +102,8 @@ private: * @param[in] info Pooling layer meta-data. */ template - void create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info); + void + create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info); /** Helper function to create the assembly kernel with requantization support * @@ -110,9 +112,12 @@ private: * @param[in] info Pooling layer meta-data. 
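// --- Editor's note: illustrative sketch, not part of the patch. ---
// create_arm_pooling_requant() above turns the float rescale factor
// src_scale / dst_scale into an integer multiplier and shift via
// quantization::calculate_quantized_multiplier(). The generic fixed-point
// idea behind that call (not the library's exact rounding or sign
// conventions) looks like this:
#include <cmath>
#include <cstdint>

// Decompose scale so that  scale ~= multiplier * 2^-31 * 2^-shift,
// with multiplier a Q0.31 value in [2^30, 2^31).
void quantize_scale(double scale, int32_t &multiplier, int &shift)
{
    int exponent   = 0;
    const double q = std::frexp(scale, &exponent); // q in [0.5, 1)
    int64_t q_fixed = static_cast<int64_t>(std::llround(q * (1LL << 31)));
    if (q_fixed == (1LL << 31)) // rounding pushed q up to exactly 1.0
    {
        q_fixed /= 2;
        ++exponent;
    }
    multiplier = static_cast<int32_t>(q_fixed);
    shift      = -exponent; // a positive value means shift right
}
// --- end editor's note ---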
*/ template - void create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info); + void create_arm_pooling_requant(const ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &info, + const CPUInfo &cpu_info); - std::unique_ptr _kernel_asm{ nullptr }; + std::unique_ptr _kernel_asm{nullptr}; /** Return minimum workload size of the relevant kernel * diff --git a/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp b/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp index 661c3d7f46717d825618f38f9c20fdac578a0a8a..6c6527de0638a97ca7a8daad1748837c897ded53 100644 --- a/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp +++ b/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp @@ -32,13 +32,15 @@ namespace arm_compute { namespace cpu { -void neon_fp16_l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis) +void neon_fp16_l2_normalize_x( + const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis) { ARM_COMPUTE_UNUSED(unused_axis); return l2_normalize_x(in, sum, out, epsilon, window); } -void neon_fp16_l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis) +void neon_fp16_l2_normalize_yz( + const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis) { return l2_normalize_yz(in, sum, out, epsilon, window, axis); } diff --git a/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp b/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp index be32bdc4fa817cc1db9c0e520d8db72e52ea5ed2..520877068c694396fb811fdfda0775b7da09e172 100644 --- a/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp +++ b/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp @@ -22,21 +22,23 @@ * SOFTWARE. */ -#include "src/cpu/kernels/l2normlayer/generic/neon/impl.h" - #include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/l2normlayer/generic/neon/impl.h" + namespace arm_compute { namespace cpu { -void neon_fp32_l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis) +void neon_fp32_l2_normalize_x( + const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis) { ARM_COMPUTE_UNUSED(unused_axis); return l2_normalize_x(in, sum, out, epsilon, window); } -void neon_fp32_l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis) +void neon_fp32_l2_normalize_yz( + const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis) { return l2_normalize_yz(in, sum, out, epsilon, window, axis); } diff --git a/src/cpu/kernels/l2normlayer/generic/neon/impl.cpp b/src/cpu/kernels/l2normlayer/generic/neon/impl.cpp deleted file mode 100644 index 288653770256a16fc5c17c9c030c79f9628f88a5..0000000000000000000000000000000000000000 --- a/src/cpu/kernels/l2normlayer/generic/neon/impl.cpp +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright (c) 2017-2022 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/cpu/kernels/l2normlayer/generic/neon/impl.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/common/Registrars.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -template -void l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window) -{ - using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - - const int window_step_x = 16 / data_size_from_type(in->info()->data_type()); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input_it(in, win_collapsed); - Iterator sum_it(sum, win_collapsed); - Iterator output_it(out, win_collapsed); - - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(input_it.ptr()); - const auto out_ptr = reinterpret_cast(output_it.ptr()); - - const T sum_value = *reinterpret_cast(sum_it.ptr()); - const T norm_value = static_cast(1.f) / std::sqrt(std::max(sum_value, static_cast(epsilon))); - const auto vec_norm_value = wrapper::vdup_n(norm_value, ExactTagType{}); - - // Compute elements over vector steps - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - out_ptr[x] = in_ptr[x] * norm_value; - } - }, - input_it, sum_it, output_it); -} - -template -void l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis) -{ - using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - - const int window_step_x = 16 / data_size_from_type(in->info()->data_type()); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Window window_sum(win); - window_sum.set(axis, Window::Dimension(0, 0, 0)); - - Iterator input_it(in, win); - Iterator sum_it(sum, window_sum); - Iterator output_it(out, win); - 
- const auto vec_eps = wrapper::vdup_n(static_cast(epsilon), ExactTagType{}); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(input_it.ptr()); - const auto sum_ptr = reinterpret_cast(sum_it.ptr()); - const auto out_ptr = reinterpret_cast(output_it.ptr()); - - // Compute elements over vector steps - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vec_norm_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr + x), vec_eps)); - wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const T norm_value = static_cast(1.f) / std::sqrt(std::max(sum_ptr[x], static_cast(epsilon))); - out_ptr[x] = in_ptr[x] * norm_value; - } - }, - input_it, sum_it, output_it); -} - -template void l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis); -template void l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window); - -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -template void l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis); -template void l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window); -#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -} // namespace cpu -} // namespace arm_compute diff --git a/src/cpu/kernels/l2normlayer/generic/neon/impl.h b/src/cpu/kernels/l2normlayer/generic/neon/impl.h index 98391fb3fd05e4f5d558fe6bda4dca19b29a3c2f..6bd19299b77372b354dfaf641e56aaf96c07b309 100644 --- a/src/cpu/kernels/l2normlayer/generic/neon/impl.h +++ b/src/cpu/kernels/l2normlayer/generic/neon/impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. 
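// --- Editor's note: illustrative sketch, not part of the patch. ---
// The l2_normalize_x / l2_normalize_yz templates moved into impl.h below scale
// each element by the inverse square root of a precomputed sum of squares,
// clamped below by epsilon. A scalar reference of that step:
#include <algorithm>
#include <cmath>
#include <cstddef>

void l2_normalize_row_ref(const float *in, float sum_of_squares, float *out,
                          std::size_t len, float epsilon)
{
    const float norm = 1.f / std::sqrt(std::max(sum_of_squares, epsilon));
    for (std::size_t i = 0; i < len; ++i)
    {
        out[i] = in[i] * norm;
    }
}
// --- end editor's note ---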
* * SPDX-License-Identifier: MIT * @@ -24,21 +24,108 @@ #ifndef SRC_CORE_NEON_KERNELS_L2NORMLAYER_LIST_H #define SRC_CORE_NEON_KERNELS_L2NORMLAYER_LIST_H +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" + +#include "src/core/common/Registrars.h" +#include "src/core/NEON/wrapper/wrapper.h" + #include namespace arm_compute { -class ITensor; -class Window; - namespace cpu { template -void l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window); +void l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window) +{ + using ExactTagType = typename wrapper::traits::neon_vector::tag_type; + + const int window_step_x = 16 / data_size_from_type(in->info()->data_type()); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input_it(in, win_collapsed); + Iterator sum_it(sum, win_collapsed); + Iterator output_it(out, win_collapsed); + + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast(input_it.ptr()); + const auto out_ptr = reinterpret_cast(output_it.ptr()); + + const T sum_value = *reinterpret_cast(sum_it.ptr()); + const T norm_value = static_cast(1.f) / std::sqrt(std::max(sum_value, static_cast(epsilon))); + const auto vec_norm_value = wrapper::vdup_n(norm_value, ExactTagType{}); + + // Compute elements over vector steps + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + out_ptr[x] = in_ptr[x] * norm_value; + } + }, + input_it, sum_it, output_it); +} template -void l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis); +void l2_normalize_yz( + const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis) +{ + using ExactTagType = typename wrapper::traits::neon_vector::tag_type; + + const int window_step_x = 16 / data_size_from_type(in->info()->data_type()); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Window window_sum(win); + window_sum.set(axis, Window::Dimension(0, 0, 0)); + + Iterator input_it(in, win); + Iterator sum_it(sum, window_sum); + Iterator output_it(out, win); + + const auto vec_eps = wrapper::vdup_n(static_cast(epsilon), ExactTagType{}); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast(input_it.ptr()); + const auto sum_ptr = reinterpret_cast(sum_it.ptr()); + const auto out_ptr = reinterpret_cast(output_it.ptr()); + + // Compute elements over vector steps + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vec_norm_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr + x), vec_eps)); + wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const T norm_value = static_cast(1.f) / 
std::sqrt(std::max(sum_ptr[x], static_cast(epsilon))); + out_ptr[x] = in_ptr[x] * norm_value; + } + }, + input_it, sum_it, output_it); +} } // namespace cpu } // namespace arm_compute #endif //SRC_CORE_NEON_KERNELS_L2NORMLAYER_LIST_H diff --git a/src/cpu/kernels/l2normlayer/list.h b/src/cpu/kernels/l2normlayer/list.h index 2bad7f54f5c297540dbed059c4ea37ff920d9ba6..e2a879d06ecf9daab4723c9bb11f1d20de2969a0 100644 --- a/src/cpu/kernels/l2normlayer/list.h +++ b/src/cpu/kernels/l2normlayer/list.h @@ -27,8 +27,9 @@ namespace arm_compute { namespace cpu { -#define DECLARE_L2NORMLAYER_KERNEL(func_name) \ - void func_name(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis) +#define DECLARE_L2NORMLAYER_KERNEL(func_name) \ + void func_name(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, \ + size_t axis) DECLARE_L2NORMLAYER_KERNEL(neon_fp16_l2_normalize_x); DECLARE_L2NORMLAYER_KERNEL(neon_fp16_l2_normalize_yz); diff --git a/src/cpu/kernels/lut/generic/neon/u8.cpp b/src/cpu/kernels/lut/generic/neon/u8.cpp index 8ab647bfee2390fb4112428a372c0771a21afa3f..5516f5b33db18f5ac1358eb8774fbfec478b3fd9 100644 --- a/src/cpu/kernels/lut/generic/neon/u8.cpp +++ b/src/cpu/kernels/lut/generic/neon/u8.cpp @@ -32,376 +32,374 @@ namespace cpu #ifdef __aarch64__ void lut_u8_neon( - const uint8_t *table, - size_t num_strings, - size_t string_length, - const uint8_t *const *input, - uint8_t *const *output) + const uint8_t *table, size_t num_strings, size_t string_length, const uint8_t *const *input, uint8_t *const *output) { - __asm__ __volatile__( - "ldr q16, [%x[table], #0x0]\n" - "ldr q17, [%x[table], #0x10]\n" - "mov x23, #0x0\n" - "ldr q18, [%x[table], #0x20]\n" - "ldr q19, [%x[table], #0x30]\n" - "ldr q20, [%x[table], #0x40]\n" - "ldr q21, [%x[table], #0x50]\n" - "ldr q22, [%x[table], #0x60]\n" - "ldr q23, [%x[table], #0x70]\n" - "ldr q24, [%x[table], #0x80]\n" - "ldr q25, [%x[table], #0x90]\n" - "ldr q26, [%x[table], #0xa0]\n" - "ldr q27, [%x[table], #0xb0]\n" - "ldr q28, [%x[table], #0xc0]\n" - "ldr q29, [%x[table], #0xd0]\n" - "ldr q30, [%x[table], #0xe0]\n" - "ldr q31, [%x[table], #0xf0]\n" - "1:" // string loop - "ldr x22, [%x[input], x23, LSL #0x3]\n" - "ldr x21, [%x[output], x23, LSL #0x3]\n" - "movi v11.16b, #0x40\n" - "movi v10.16b, #0x80\n" - "movi v9.16b, #0xc0\n" - "mov x20, %x[string_length]\n" - "2:" // 4 rounds: width loop - "cmp x20, #0x30\n" - "bge 27f\n" - "tbz x20, #5, 10f\n" - "ld1 { v8.16b }, [x22], #0x10\n" - "ld1 { v13.16b }, [x22], #0x10\n" - "tbz x20, #3, 6f\n" - "ldr d12, [x22], #0x8\n" - "tbz x20, #2, 4f\n" - "ld1 { v12.s }[2], [x22], #0x4\n" - "tbz x20, #1, 3f\n" - "ld1 { v12.h }[6], [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v12.b }[14], [x22]\n" - "b 26f\n" - "3:" // 4 rounds: Partial load: partial_1_44 - "tbz x20, #0, 26f\n" - "ld1 { v12.b }[12], [x22]\n" - "b 26f\n" - "4:" // 4 rounds: Partial load: partial_2_40 - "tbz x20, #1, 5f\n" - "ld1 { v12.h }[4], [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v12.b }[10], [x22]\n" - "b 26f\n" - "5:" // 4 rounds: Partial load: partial_1_40 - "tbz x20, #0, 26f\n" - "ld1 { v12.b }[8], [x22]\n" - "b 26f\n" - "6:" // 4 rounds: Partial load: partial_4_32 - "tbz x20, #2, 8f\n" - "ldr s12, [x22], #0x4\n" - "tbz x20, #1, 7f\n" - "ld1 { v12.h }[2], [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v12.b }[6], [x22]\n" - "b 26f\n" - "7:" // 4 rounds: Partial load: partial_1_36 - "tbz x20, #0, 26f\n" - "ld1 { v12.b }[4], [x22]\n" - "b 26f\n" - "8:" // 4 rounds: 
Partial load: partial_2_32 - "tbz x20, #1, 9f\n" - "ldr h12, [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v12.b }[2], [x22]\n" - "b 26f\n" - "9:" // 4 rounds: Partial load: partial_1_32 - "tbz x20, #0, 26f\n" - "ldr b12, [x22, #0x0]\n" - "b 26f\n" - "10:" // 4 rounds: Partial load: partial_16_0 - "tbz x20, #4, 18f\n" - "ld1 { v8.16b }, [x22], #0x10\n" - "tbz x20, #3, 14f\n" - "ldr d13, [x22], #0x8\n" - "tbz x20, #2, 12f\n" - "ld1 { v13.s }[2], [x22], #0x4\n" - "tbz x20, #1, 11f\n" - "ld1 { v13.h }[6], [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v13.b }[14], [x22]\n" - "b 26f\n" - "11:" // 4 rounds: Partial load: partial_1_28 - "tbz x20, #0, 26f\n" - "ld1 { v13.b }[12], [x22]\n" - "b 26f\n" - "12:" // 4 rounds: Partial load: partial_2_24 - "tbz x20, #1, 13f\n" - "ld1 { v13.h }[4], [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v13.b }[10], [x22]\n" - "b 26f\n" - "13:" // 4 rounds: Partial load: partial_1_24 - "tbz x20, #0, 26f\n" - "ld1 { v13.b }[8], [x22]\n" - "b 26f\n" - "14:" // 4 rounds: Partial load: partial_4_16 - "tbz x20, #2, 16f\n" - "ldr s13, [x22], #0x4\n" - "tbz x20, #1, 15f\n" - "ld1 { v13.h }[2], [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v13.b }[6], [x22]\n" - "b 26f\n" - "15:" // 4 rounds: Partial load: partial_1_20 - "tbz x20, #0, 26f\n" - "ld1 { v13.b }[4], [x22]\n" - "b 26f\n" - "16:" // 4 rounds: Partial load: partial_2_16 - "tbz x20, #1, 17f\n" - "ldr h13, [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v13.b }[2], [x22]\n" - "b 26f\n" - "17:" // 4 rounds: Partial load: partial_1_16 - "tbz x20, #0, 26f\n" - "ldr b13, [x22, #0x0]\n" - "b 26f\n" - "18:" // 4 rounds: Partial load: partial_8_0 - "tbz x20, #3, 22f\n" - "ldr d8, [x22], #0x8\n" - "tbz x20, #2, 20f\n" - "ld1 { v8.s }[2], [x22], #0x4\n" - "tbz x20, #1, 19f\n" - "ld1 { v8.h }[6], [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v8.b }[14], [x22]\n" - "b 26f\n" - "19:" // 4 rounds: Partial load: partial_1_12 - "tbz x20, #0, 26f\n" - "ld1 { v8.b }[12], [x22]\n" - "b 26f\n" - "20:" // 4 rounds: Partial load: partial_2_8 - "tbz x20, #1, 21f\n" - "ld1 { v8.h }[4], [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v8.b }[10], [x22]\n" - "b 26f\n" - "21:" // 4 rounds: Partial load: partial_1_8 - "tbz x20, #0, 26f\n" - "ld1 { v8.b }[8], [x22]\n" - "b 26f\n" - "22:" // 4 rounds: Partial load: partial_4_0 - "tbz x20, #2, 24f\n" - "ldr s8, [x22], #0x4\n" - "tbz x20, #1, 23f\n" - "ld1 { v8.h }[2], [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v8.b }[6], [x22]\n" - "b 26f\n" - "23:" // 4 rounds: Partial load: partial_1_4 - "tbz x20, #0, 26f\n" - "ld1 { v8.b }[4], [x22]\n" - "b 26f\n" - "24:" // 4 rounds: Partial load: partial_2_0 - "tbz x20, #1, 25f\n" - "ldr h8, [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v8.b }[2], [x22]\n" - "b 26f\n" - "25:" // 4 rounds: Partial load: partial_1_0 - "ldr b8, [x22, #0x0]\n" - "26:" // 4 rounds: Partial load: Done - "b 28f\n" - "27:" // 4 rounds: Full load - "ldr q8, [x22, #0x0]\n" - "ldr q13, [x22, #0x10]\n" - "ldr q12, [x22, #0x20]\n" - "add x22, x22, #0x30\n" - "28:" // 4 rounds: Load done - "sub v0.16b, v8.16b, v11.16b\n" - "sub v7.16b, v8.16b, v10.16b\n" - "tbl v0.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b\n" - "sub v6.16b, v8.16b, v9.16b\n" - "sub v5.16b, v13.16b, v11.16b\n" - "tbl v8.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v8.16b\n" - "sub v4.16b, v13.16b, v10.16b\n" - "sub v3.16b, v13.16b, v9.16b\n" - "tbl v7.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v7.16b\n" - "sub v2.16b, v12.16b, v11.16b\n" - "sub v1.16b, v12.16b, v10.16b\n" - "tbl v6.16b, { v28.16b, 
v29.16b, v30.16b, v31.16b }, v6.16b\n" - "tbl v13.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v13.16b\n" - "tbl v5.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v5.16b\n" - "orr v8.16b, v8.16b, v0.16b\n" - "sub v0.16b, v12.16b, v9.16b\n" - "tbl v4.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v4.16b\n" - "tbl v3.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v3.16b\n" - "tbl v12.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v12.16b\n" - "tbl v2.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v2.16b\n" - "orr v7.16b, v7.16b, v6.16b\n" - "tbl v1.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v1.16b\n" - "tbl v0.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v0.16b\n" - "orr v13.16b, v13.16b, v5.16b\n" - "orr v4.16b, v4.16b, v3.16b\n" - "orr v12.16b, v12.16b, v2.16b\n" - "cmp x20, #0x30\n" - "orr v1.16b, v1.16b, v0.16b\n" - "orr v8.16b, v8.16b, v7.16b\n" - "orr v13.16b, v13.16b, v4.16b\n" - "orr v12.16b, v12.16b, v1.16b\n" - "bge 53f\n" - "tbz x20, #5, 36f\n" - "st1 { v8.16b }, [x21], #0x10\n" - "st1 { v13.16b }, [x21], #0x10\n" - "tbz x20, #3, 32f\n" - "str d12, [x21], #0x8\n" - "tbz x20, #2, 30f\n" - "st1 { v12.s }[2], [x21], #0x4\n" - "tbz x20, #1, 29f\n" - "st1 { v12.h }[6], [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v12.b }[14], [x21]\n" - "b 52f\n" - "29:" // 4 rounds: Partial writeback: partial_1_44 - "tbz x20, #0, 52f\n" - "st1 { v12.b }[12], [x21]\n" - "b 52f\n" - "30:" // 4 rounds: Partial writeback: partial_2_40 - "tbz x20, #1, 31f\n" - "st1 { v12.h }[4], [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v12.b }[10], [x21]\n" - "b 52f\n" - "31:" // 4 rounds: Partial writeback: partial_1_40 - "tbz x20, #0, 52f\n" - "st1 { v12.b }[8], [x21]\n" - "b 52f\n" - "32:" // 4 rounds: Partial writeback: partial_4_32 - "tbz x20, #2, 34f\n" - "str s12, [x21], #0x4\n" - "tbz x20, #1, 33f\n" - "st1 { v12.h }[2], [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v12.b }[6], [x21]\n" - "b 52f\n" - "33:" // 4 rounds: Partial writeback: partial_1_36 - "tbz x20, #0, 52f\n" - "st1 { v12.b }[4], [x21]\n" - "b 52f\n" - "34:" // 4 rounds: Partial writeback: partial_2_32 - "tbz x20, #1, 35f\n" - "str h12, [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v12.b }[2], [x21]\n" - "b 52f\n" - "35:" // 4 rounds: Partial writeback: partial_1_32 - "tbz x20, #0, 52f\n" - "str b12, [x21, #0x0]\n" - "b 52f\n" - "36:" // 4 rounds: Partial writeback: partial_16_0 - "tbz x20, #4, 44f\n" - "st1 { v8.16b }, [x21], #0x10\n" - "tbz x20, #3, 40f\n" - "str d13, [x21], #0x8\n" - "tbz x20, #2, 38f\n" - "st1 { v13.s }[2], [x21], #0x4\n" - "tbz x20, #1, 37f\n" - "st1 { v13.h }[6], [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v13.b }[14], [x21]\n" - "b 52f\n" - "37:" // 4 rounds: Partial writeback: partial_1_28 - "tbz x20, #0, 52f\n" - "st1 { v13.b }[12], [x21]\n" - "b 52f\n" - "38:" // 4 rounds: Partial writeback: partial_2_24 - "tbz x20, #1, 39f\n" - "st1 { v13.h }[4], [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v13.b }[10], [x21]\n" - "b 52f\n" - "39:" // 4 rounds: Partial writeback: partial_1_24 - "tbz x20, #0, 52f\n" - "st1 { v13.b }[8], [x21]\n" - "b 52f\n" - "40:" // 4 rounds: Partial writeback: partial_4_16 - "tbz x20, #2, 42f\n" - "str s13, [x21], #0x4\n" - "tbz x20, #1, 41f\n" - "st1 { v13.h }[2], [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v13.b }[6], [x21]\n" - "b 52f\n" - "41:" // 4 rounds: Partial writeback: partial_1_20 - "tbz x20, #0, 52f\n" - "st1 { v13.b }[4], [x21]\n" - "b 52f\n" - "42:" // 4 rounds: Partial writeback: partial_2_16 - "tbz x20, #1, 43f\n" - "str h13, [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v13.b 
}[2], [x21]\n" - "b 52f\n" - "43:" // 4 rounds: Partial writeback: partial_1_16 - "tbz x20, #0, 52f\n" - "str b13, [x21, #0x0]\n" - "b 52f\n" - "44:" // 4 rounds: Partial writeback: partial_8_0 - "tbz x20, #3, 48f\n" - "str d8, [x21], #0x8\n" - "tbz x20, #2, 46f\n" - "st1 { v8.s }[2], [x21], #0x4\n" - "tbz x20, #1, 45f\n" - "st1 { v8.h }[6], [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v8.b }[14], [x21]\n" - "b 52f\n" - "45:" // 4 rounds: Partial writeback: partial_1_12 - "tbz x20, #0, 52f\n" - "st1 { v8.b }[12], [x21]\n" - "b 52f\n" - "46:" // 4 rounds: Partial writeback: partial_2_8 - "tbz x20, #1, 47f\n" - "st1 { v8.h }[4], [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v8.b }[10], [x21]\n" - "b 52f\n" - "47:" // 4 rounds: Partial writeback: partial_1_8 - "tbz x20, #0, 52f\n" - "st1 { v8.b }[8], [x21]\n" - "b 52f\n" - "48:" // 4 rounds: Partial writeback: partial_4_0 - "tbz x20, #2, 50f\n" - "str s8, [x21], #0x4\n" - "tbz x20, #1, 49f\n" - "st1 { v8.h }[2], [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v8.b }[6], [x21]\n" - "b 52f\n" - "49:" // 4 rounds: Partial writeback: partial_1_4 - "tbz x20, #0, 52f\n" - "st1 { v8.b }[4], [x21]\n" - "b 52f\n" - "50:" // 4 rounds: Partial writeback: partial_2_0 - "tbz x20, #1, 51f\n" - "str h8, [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v8.b }[2], [x21]\n" - "b 52f\n" - "51:" // 4 rounds: Partial writeback: partial_1_0 - "str b8, [x21, #0x0]\n" - "52:" // 4 rounds: Partial writeback: Done - "b 54f\n" - "53:" // 4 rounds: Full writeback - "str q8, [x21, #0x0]\n" - "str q13, [x21, #0x10]\n" - "str q12, [x21, #0x20]\n" - "add x21, x21, #0x30\n" - "54:" // 4 rounds: Writeback done - "subs x20, x20, #0x30\n" - "bgt 2b\n" - "add x23, x23, #0x1\n" - "cmp x23, %x[num_strings]\n" - "bne 1b\n" - : - : [input] "r"(input), [num_strings] "r"(num_strings), [output] "r"(output), [string_length] "r"(string_length), [table] "r"(table) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"); + __asm__ __volatile__("ldr q16, [%x[table], #0x0]\n" + "ldr q17, [%x[table], #0x10]\n" + "mov x23, #0x0\n" + "ldr q18, [%x[table], #0x20]\n" + "ldr q19, [%x[table], #0x30]\n" + "ldr q20, [%x[table], #0x40]\n" + "ldr q21, [%x[table], #0x50]\n" + "ldr q22, [%x[table], #0x60]\n" + "ldr q23, [%x[table], #0x70]\n" + "ldr q24, [%x[table], #0x80]\n" + "ldr q25, [%x[table], #0x90]\n" + "ldr q26, [%x[table], #0xa0]\n" + "ldr q27, [%x[table], #0xb0]\n" + "ldr q28, [%x[table], #0xc0]\n" + "ldr q29, [%x[table], #0xd0]\n" + "ldr q30, [%x[table], #0xe0]\n" + "ldr q31, [%x[table], #0xf0]\n" + "1:" // string loop + "ldr x22, [%x[input], x23, LSL #0x3]\n" + "ldr x21, [%x[output], x23, LSL #0x3]\n" + "movi v11.16b, #0x40\n" + "movi v10.16b, #0x80\n" + "movi v9.16b, #0xc0\n" + "mov x20, %x[string_length]\n" + "2:" // 4 rounds: width loop + "cmp x20, #0x30\n" + "bge 27f\n" + "tbz x20, #5, 10f\n" + "ld1 { v8.16b }, [x22], #0x10\n" + "ld1 { v13.16b }, [x22], #0x10\n" + "tbz x20, #3, 6f\n" + "ldr d12, [x22], #0x8\n" + "tbz x20, #2, 4f\n" + "ld1 { v12.s }[2], [x22], #0x4\n" + "tbz x20, #1, 3f\n" + "ld1 { v12.h }[6], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v12.b }[14], [x22]\n" + "b 26f\n" + "3:" // 4 rounds: Partial load: partial_1_44 + "tbz x20, #0, 26f\n" + "ld1 { v12.b }[12], [x22]\n" + "b 26f\n" + "4:" // 4 rounds: Partial load: partial_2_40 + "tbz x20, #1, 5f\n" + "ld1 { v12.h }[4], [x22], 
#0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v12.b }[10], [x22]\n" + "b 26f\n" + "5:" // 4 rounds: Partial load: partial_1_40 + "tbz x20, #0, 26f\n" + "ld1 { v12.b }[8], [x22]\n" + "b 26f\n" + "6:" // 4 rounds: Partial load: partial_4_32 + "tbz x20, #2, 8f\n" + "ldr s12, [x22], #0x4\n" + "tbz x20, #1, 7f\n" + "ld1 { v12.h }[2], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v12.b }[6], [x22]\n" + "b 26f\n" + "7:" // 4 rounds: Partial load: partial_1_36 + "tbz x20, #0, 26f\n" + "ld1 { v12.b }[4], [x22]\n" + "b 26f\n" + "8:" // 4 rounds: Partial load: partial_2_32 + "tbz x20, #1, 9f\n" + "ldr h12, [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v12.b }[2], [x22]\n" + "b 26f\n" + "9:" // 4 rounds: Partial load: partial_1_32 + "tbz x20, #0, 26f\n" + "ldr b12, [x22, #0x0]\n" + "b 26f\n" + "10:" // 4 rounds: Partial load: partial_16_0 + "tbz x20, #4, 18f\n" + "ld1 { v8.16b }, [x22], #0x10\n" + "tbz x20, #3, 14f\n" + "ldr d13, [x22], #0x8\n" + "tbz x20, #2, 12f\n" + "ld1 { v13.s }[2], [x22], #0x4\n" + "tbz x20, #1, 11f\n" + "ld1 { v13.h }[6], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v13.b }[14], [x22]\n" + "b 26f\n" + "11:" // 4 rounds: Partial load: partial_1_28 + "tbz x20, #0, 26f\n" + "ld1 { v13.b }[12], [x22]\n" + "b 26f\n" + "12:" // 4 rounds: Partial load: partial_2_24 + "tbz x20, #1, 13f\n" + "ld1 { v13.h }[4], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v13.b }[10], [x22]\n" + "b 26f\n" + "13:" // 4 rounds: Partial load: partial_1_24 + "tbz x20, #0, 26f\n" + "ld1 { v13.b }[8], [x22]\n" + "b 26f\n" + "14:" // 4 rounds: Partial load: partial_4_16 + "tbz x20, #2, 16f\n" + "ldr s13, [x22], #0x4\n" + "tbz x20, #1, 15f\n" + "ld1 { v13.h }[2], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v13.b }[6], [x22]\n" + "b 26f\n" + "15:" // 4 rounds: Partial load: partial_1_20 + "tbz x20, #0, 26f\n" + "ld1 { v13.b }[4], [x22]\n" + "b 26f\n" + "16:" // 4 rounds: Partial load: partial_2_16 + "tbz x20, #1, 17f\n" + "ldr h13, [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v13.b }[2], [x22]\n" + "b 26f\n" + "17:" // 4 rounds: Partial load: partial_1_16 + "tbz x20, #0, 26f\n" + "ldr b13, [x22, #0x0]\n" + "b 26f\n" + "18:" // 4 rounds: Partial load: partial_8_0 + "tbz x20, #3, 22f\n" + "ldr d8, [x22], #0x8\n" + "tbz x20, #2, 20f\n" + "ld1 { v8.s }[2], [x22], #0x4\n" + "tbz x20, #1, 19f\n" + "ld1 { v8.h }[6], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v8.b }[14], [x22]\n" + "b 26f\n" + "19:" // 4 rounds: Partial load: partial_1_12 + "tbz x20, #0, 26f\n" + "ld1 { v8.b }[12], [x22]\n" + "b 26f\n" + "20:" // 4 rounds: Partial load: partial_2_8 + "tbz x20, #1, 21f\n" + "ld1 { v8.h }[4], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v8.b }[10], [x22]\n" + "b 26f\n" + "21:" // 4 rounds: Partial load: partial_1_8 + "tbz x20, #0, 26f\n" + "ld1 { v8.b }[8], [x22]\n" + "b 26f\n" + "22:" // 4 rounds: Partial load: partial_4_0 + "tbz x20, #2, 24f\n" + "ldr s8, [x22], #0x4\n" + "tbz x20, #1, 23f\n" + "ld1 { v8.h }[2], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v8.b }[6], [x22]\n" + "b 26f\n" + "23:" // 4 rounds: Partial load: partial_1_4 + "tbz x20, #0, 26f\n" + "ld1 { v8.b }[4], [x22]\n" + "b 26f\n" + "24:" // 4 rounds: Partial load: partial_2_0 + "tbz x20, #1, 25f\n" + "ldr h8, [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v8.b }[2], [x22]\n" + "b 26f\n" + "25:" // 4 rounds: Partial load: partial_1_0 + "ldr b8, [x22, #0x0]\n" + "26:" // 4 rounds: Partial load: Done + "b 28f\n" + "27:" // 4 rounds: Full load + "ldr q8, [x22, #0x0]\n" + "ldr q13, [x22, #0x10]\n" + "ldr q12, [x22, #0x20]\n" + "add x22, x22, 
#0x30\n" + "28:" // 4 rounds: Load done + "sub v0.16b, v8.16b, v11.16b\n" + "sub v7.16b, v8.16b, v10.16b\n" + "tbl v0.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b\n" + "sub v6.16b, v8.16b, v9.16b\n" + "sub v5.16b, v13.16b, v11.16b\n" + "tbl v8.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v8.16b\n" + "sub v4.16b, v13.16b, v10.16b\n" + "sub v3.16b, v13.16b, v9.16b\n" + "tbl v7.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v7.16b\n" + "sub v2.16b, v12.16b, v11.16b\n" + "sub v1.16b, v12.16b, v10.16b\n" + "tbl v6.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v6.16b\n" + "tbl v13.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v13.16b\n" + "tbl v5.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v5.16b\n" + "orr v8.16b, v8.16b, v0.16b\n" + "sub v0.16b, v12.16b, v9.16b\n" + "tbl v4.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v4.16b\n" + "tbl v3.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v3.16b\n" + "tbl v12.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v12.16b\n" + "tbl v2.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v2.16b\n" + "orr v7.16b, v7.16b, v6.16b\n" + "tbl v1.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v1.16b\n" + "tbl v0.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v0.16b\n" + "orr v13.16b, v13.16b, v5.16b\n" + "orr v4.16b, v4.16b, v3.16b\n" + "orr v12.16b, v12.16b, v2.16b\n" + "cmp x20, #0x30\n" + "orr v1.16b, v1.16b, v0.16b\n" + "orr v8.16b, v8.16b, v7.16b\n" + "orr v13.16b, v13.16b, v4.16b\n" + "orr v12.16b, v12.16b, v1.16b\n" + "bge 53f\n" + "tbz x20, #5, 36f\n" + "st1 { v8.16b }, [x21], #0x10\n" + "st1 { v13.16b }, [x21], #0x10\n" + "tbz x20, #3, 32f\n" + "str d12, [x21], #0x8\n" + "tbz x20, #2, 30f\n" + "st1 { v12.s }[2], [x21], #0x4\n" + "tbz x20, #1, 29f\n" + "st1 { v12.h }[6], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v12.b }[14], [x21]\n" + "b 52f\n" + "29:" // 4 rounds: Partial writeback: partial_1_44 + "tbz x20, #0, 52f\n" + "st1 { v12.b }[12], [x21]\n" + "b 52f\n" + "30:" // 4 rounds: Partial writeback: partial_2_40 + "tbz x20, #1, 31f\n" + "st1 { v12.h }[4], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v12.b }[10], [x21]\n" + "b 52f\n" + "31:" // 4 rounds: Partial writeback: partial_1_40 + "tbz x20, #0, 52f\n" + "st1 { v12.b }[8], [x21]\n" + "b 52f\n" + "32:" // 4 rounds: Partial writeback: partial_4_32 + "tbz x20, #2, 34f\n" + "str s12, [x21], #0x4\n" + "tbz x20, #1, 33f\n" + "st1 { v12.h }[2], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v12.b }[6], [x21]\n" + "b 52f\n" + "33:" // 4 rounds: Partial writeback: partial_1_36 + "tbz x20, #0, 52f\n" + "st1 { v12.b }[4], [x21]\n" + "b 52f\n" + "34:" // 4 rounds: Partial writeback: partial_2_32 + "tbz x20, #1, 35f\n" + "str h12, [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v12.b }[2], [x21]\n" + "b 52f\n" + "35:" // 4 rounds: Partial writeback: partial_1_32 + "tbz x20, #0, 52f\n" + "str b12, [x21, #0x0]\n" + "b 52f\n" + "36:" // 4 rounds: Partial writeback: partial_16_0 + "tbz x20, #4, 44f\n" + "st1 { v8.16b }, [x21], #0x10\n" + "tbz x20, #3, 40f\n" + "str d13, [x21], #0x8\n" + "tbz x20, #2, 38f\n" + "st1 { v13.s }[2], [x21], #0x4\n" + "tbz x20, #1, 37f\n" + "st1 { v13.h }[6], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v13.b }[14], [x21]\n" + "b 52f\n" + "37:" // 4 rounds: Partial writeback: partial_1_28 + "tbz x20, #0, 52f\n" + "st1 { v13.b }[12], [x21]\n" + "b 52f\n" + "38:" // 4 rounds: Partial writeback: partial_2_24 + "tbz x20, #1, 39f\n" + "st1 { v13.h }[4], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v13.b }[10], [x21]\n" + "b 52f\n" + "39:" // 4 rounds: Partial writeback: partial_1_24 + "tbz x20, #0, 
52f\n" + "st1 { v13.b }[8], [x21]\n" + "b 52f\n" + "40:" // 4 rounds: Partial writeback: partial_4_16 + "tbz x20, #2, 42f\n" + "str s13, [x21], #0x4\n" + "tbz x20, #1, 41f\n" + "st1 { v13.h }[2], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v13.b }[6], [x21]\n" + "b 52f\n" + "41:" // 4 rounds: Partial writeback: partial_1_20 + "tbz x20, #0, 52f\n" + "st1 { v13.b }[4], [x21]\n" + "b 52f\n" + "42:" // 4 rounds: Partial writeback: partial_2_16 + "tbz x20, #1, 43f\n" + "str h13, [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v13.b }[2], [x21]\n" + "b 52f\n" + "43:" // 4 rounds: Partial writeback: partial_1_16 + "tbz x20, #0, 52f\n" + "str b13, [x21, #0x0]\n" + "b 52f\n" + "44:" // 4 rounds: Partial writeback: partial_8_0 + "tbz x20, #3, 48f\n" + "str d8, [x21], #0x8\n" + "tbz x20, #2, 46f\n" + "st1 { v8.s }[2], [x21], #0x4\n" + "tbz x20, #1, 45f\n" + "st1 { v8.h }[6], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v8.b }[14], [x21]\n" + "b 52f\n" + "45:" // 4 rounds: Partial writeback: partial_1_12 + "tbz x20, #0, 52f\n" + "st1 { v8.b }[12], [x21]\n" + "b 52f\n" + "46:" // 4 rounds: Partial writeback: partial_2_8 + "tbz x20, #1, 47f\n" + "st1 { v8.h }[4], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v8.b }[10], [x21]\n" + "b 52f\n" + "47:" // 4 rounds: Partial writeback: partial_1_8 + "tbz x20, #0, 52f\n" + "st1 { v8.b }[8], [x21]\n" + "b 52f\n" + "48:" // 4 rounds: Partial writeback: partial_4_0 + "tbz x20, #2, 50f\n" + "str s8, [x21], #0x4\n" + "tbz x20, #1, 49f\n" + "st1 { v8.h }[2], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v8.b }[6], [x21]\n" + "b 52f\n" + "49:" // 4 rounds: Partial writeback: partial_1_4 + "tbz x20, #0, 52f\n" + "st1 { v8.b }[4], [x21]\n" + "b 52f\n" + "50:" // 4 rounds: Partial writeback: partial_2_0 + "tbz x20, #1, 51f\n" + "str h8, [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v8.b }[2], [x21]\n" + "b 52f\n" + "51:" // 4 rounds: Partial writeback: partial_1_0 + "str b8, [x21, #0x0]\n" + "52:" // 4 rounds: Partial writeback: Done + "b 54f\n" + "53:" // 4 rounds: Full writeback + "str q8, [x21, #0x0]\n" + "str q13, [x21, #0x10]\n" + "str q12, [x21, #0x20]\n" + "add x21, x21, #0x30\n" + "54:" // 4 rounds: Writeback done + "subs x20, x20, #0x30\n" + "bgt 2b\n" + "add x23, x23, #0x1\n" + "cmp x23, %x[num_strings]\n" + "bne 1b\n" + : + : [input] "r"(input), [num_strings] "r"(num_strings), [output] "r"(output), + [string_length] "r"(string_length), [table] "r"(table) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"); } #endif // __aarch64__ diff --git a/src/cpu/kernels/lut/generic/sve2/u8.cpp b/src/cpu/kernels/lut/generic/sve2/u8.cpp index b80d75326ecfc4370e2402c943f5269fe4591c5b..ee8572703ecc0b0e8f7291fcafe08dc8dc483785 100644 --- a/src/cpu/kernels/lut/generic/sve2/u8.cpp +++ b/src/cpu/kernels/lut/generic/sve2/u8.cpp @@ -32,11 +32,7 @@ namespace arm_compute namespace cpu { void lut_u8_sve2( - const uint8_t *table, - size_t num_strings, - size_t string_length, - const uint8_t *const *input, - uint8_t *const *output) + const uint8_t *table, size_t num_strings, size_t string_length, const uint8_t *const *input, uint8_t *const *output) { __asm__ __volatile__( "ptrue p0.b\n" @@ -636,7 +632,9 @@ void lut_u8_sve2( "bne 2b\n" : [table] "+&r"(table) : [input] "r"(input), [num_strings] "r"(num_strings), [output] "r"(output), [string_length] "r"(string_length) - : "cc", 
"memory", "p0", "p1", "p2", "p3", "p4", "p5", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1", + "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", + "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); } } // namespace cpu diff --git a/src/cpu/kernels/lut/list.h b/src/cpu/kernels/lut/list.h index 7a2afc692745edbabfcdff64a94a2d47b02afae6..da90346267b060cf4722694a19cf9c3ff4912c0a 100644 --- a/src/cpu/kernels/lut/list.h +++ b/src/cpu/kernels/lut/list.h @@ -34,13 +34,9 @@ namespace cpu { #ifdef __aarch64__ -#define DECLARE_LUT_KERNEL(func_name) \ - void func_name( \ - const uint8_t *table, \ - size_t num_strings, \ - size_t string_length, \ - const uint8_t *const *input, \ - uint8_t *const *output) +#define DECLARE_LUT_KERNEL(func_name) \ + void func_name(const uint8_t *table, size_t num_strings, size_t string_length, const uint8_t *const *input, \ + uint8_t *const *output) DECLARE_LUT_KERNEL(lut_u8_neon); DECLARE_LUT_KERNEL(lut_u8_sve2); diff --git a/src/cpu/kernels/maxunpool/generic/neon/impl.cpp b/src/cpu/kernels/maxunpool/generic/neon/impl.cpp deleted file mode 100644 index 77e3b8594a06b1d82b0e0c168f537cd0911e0d1b..0000000000000000000000000000000000000000 --- a/src/cpu/kernels/maxunpool/generic/neon/impl.cpp +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2020-2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/cpu/kernels/maxunpool/generic/neon/impl.h" -namespace arm_compute -{ -class ITensor; -class Window; -namespace cpu -{ -template -void max_unpooling(const ITensor *input, const ITensor *indices, ITensor *output, const Window &window) -{ - Iterator input_itr(input, window); - Iterator indices_itr(indices, window); - auto out_ptr = reinterpret_cast(output->buffer()); - const int out_stride_w = static_cast(output->info()->strides_in_bytes()[3]); - execute_window_loop(window, [&](const Coordinates & id) - { - auto vindices = reinterpret_cast(indices_itr.ptr()); - auto vinput = reinterpret_cast(input_itr.ptr()); - out_ptr[id[3] * out_stride_w / sizeof(T) + *vindices] = *vinput; - }, - input_itr, indices_itr); -} -template void max_unpooling(const ITensor *input, const ITensor *indices, ITensor *output, const Window &window); -template void max_unpooling(const ITensor *input, const ITensor *indices, ITensor *output, const Window &window); -template void max_unpooling(const ITensor *input, const ITensor *indices, ITensor *output, const Window &window); - -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -template void max_unpooling(const ITensor *input, const ITensor *indices, ITensor *output, const Window &window); -#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -} // namespace cpu -} // namespace arm_compute diff --git a/src/cpu/kernels/maxunpool/generic/neon/impl.h b/src/cpu/kernels/maxunpool/generic/neon/impl.h index 3fea9cfcf3a59661f9a85d6977c11edd5732333e..73a5b86a2ffb3f5f81d32671f6d1d10bd63d63b5 100644 --- a/src/cpu/kernels/maxunpool/generic/neon/impl.h +++ b/src/cpu/kernels/maxunpool/generic/neon/impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,19 +21,33 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
 */
-#ifndef SRC_CORE_SVE_KERNELS_MAXUNPOOLING_IMPL_H
-#define SRC_CORE_SVE_KERNELS_MAXUNPOOLING_IMPL_H
+#ifndef ACL_SRC_CPU_KERNELS_MAXUNPOOL_GENERIC_NEON_IMPL_H
+#define ACL_SRC_CPU_KERNELS_MAXUNPOOL_GENERIC_NEON_IMPL_H
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Window.h"
+
 #include "src/core/NEON/wrapper/wrapper.h"
 
 namespace arm_compute
 {
-class ITensor;
-class Window;
 namespace cpu
 {
 template <typename T>
-void max_unpooling(const ITensor *input, const ITensor *indices, ITensor *output, const Window &window);
+void max_unpooling(const ITensor *input, const ITensor *indices, ITensor *output, const Window &window)
+{
+    Iterator input_itr(input, window);
+    Iterator indices_itr(indices, window);
+    auto out_ptr = reinterpret_cast<T *>(output->buffer());
+    const int out_stride_w = static_cast<int>(output->info()->strides_in_bytes()[3]);
+    execute_window_loop(
+        window,
+        [&](const Coordinates &id)
+        {
+            auto vindices = reinterpret_cast<uint32_t *>(indices_itr.ptr());
+            auto vinput = reinterpret_cast<T *>(input_itr.ptr());
+            out_ptr[id[3] * out_stride_w / sizeof(T) + *vindices] = *vinput;
+        },
+        input_itr, indices_itr);
+}
 } // namespace cpu
 } // namespace arm_compute
-#endif //define SRC_CORE_SVE_KERNELS_MAXUNPOOLING_IMPL_H
+#endif // ACL_SRC_CPU_KERNELS_MAXUNPOOL_GENERIC_NEON_IMPL_H
diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp
index 47bf64ae578c75c2aa1cc764b50000190cfc5221..6470f391e21c3b5a3401cc3d9446641de878c1b9 100644
--- a/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp
@@ -22,6 +22,8 @@
  * SOFTWARE.
  */
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "src/core/NEON/wrapper/wrapper.h"
 #include "src/cpu/CpuTypes.h"
 #include "src/cpu/kernels/meanstddevnorm/generic/neon/impl.h"
 
@@ -29,6 +31,82 @@ namespace arm_compute
 {
 namespace cpu
 {
+template <>
+void mean_stddev_normalization<float16_t, 8>(ITensor *input, ITensor *output, float epsilon, const Window &window)
+{
+    // Set build options
+    Window win = window;
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    const int window_step_x = 8;
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x = static_cast<int>(window.x().end());
+
+    Iterator input_itr(input, win);
+    Iterator output_itr(output, win);
+
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
+        {
+            int x = window_start_x;
+            auto in_ptr = reinterpret_cast<float16_t *>(input_itr.ptr());
+            auto out_ptr = reinterpret_cast<float16_t *>(output_itr.ptr());
+
+            float16x8_t sum_vec = vdupq_n_f16(static_cast<float16_t>(0.0f));
+            float32x4_t sum_sq_vec = vdupq_n_f32(0.0f);
+
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                float16x8_t data = vld1q_f16(in_ptr + x);
+                sum_vec = vaddq_f16(sum_vec, data);
+                float32x4_t dl = vcvt_f32_f16(vget_low_f16(data));
+                float32x4_t dh = vcvt_f32_f16(vget_high_f16(data));
+                sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dl, dl));
+                sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dh, dh));
+            }
+
+            float16x4_t sum_carry_res = vpadd_f16(vget_high_f16(sum_vec), vget_low_f16(sum_vec));
+            sum_carry_res = vpadd_f16(sum_carry_res, sum_carry_res);
+            sum_carry_res = vpadd_f16(sum_carry_res, sum_carry_res);
+
+            float32x4_t sum_sq_carry_res = vpaddq_f32(sum_sq_vec, sum_sq_vec);
+            sum_sq_carry_res = vpaddq_f32(sum_sq_carry_res, sum_sq_carry_res);
+
+            float16_t sum = vget_lane_f16(sum_carry_res, 0);
+            float sum_sq = vgetq_lane_f32(sum_sq_carry_res, 0);
+
+            // Compute left-over elements
+            for (; x <
window_end_x; ++x) + { + float16_t data = *(in_ptr + x); + sum += data; + float fdata = static_cast(data); + sum_sq += fdata * fdata; + } + + float16_t mean = sum / input->info()->dimension(0); + float var = (sum_sq / input->info()->dimension(0)) - (mean * mean); + float16_t stddev_inv = static_cast(1.f / sqrt(var + epsilon)); + + float16x8_t mean_vec = vdupq_n_f16(mean); + float16x8_t stddev_inv_vec = vdupq_n_f16(stddev_inv); + + for (x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x) + { + float16x8_t data = vld1q_f16(in_ptr + x); + float16x8_t res = vmulq_f16(vsubq_f16(data, mean_vec), stddev_inv_vec); + // Store results + vst1q_f16(out_ptr + x, res); + } + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv; + } + }, + input_itr, output_itr); +} + void neon_fp16_meanstddevnorm(ITensor *input, ITensor *output, float epsilon, const Window &window) { return mean_stddev_normalization(input, output, epsilon, window); diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp index 0d00acdd0c66269523936a36bd84a524c3dfa38e..11f6294a35b6bededf08fc6e2714ce10855ccf11 100644 --- a/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp +++ b/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022 Arm Limited. + * Copyright (c) 2019-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,6 +23,7 @@ */ #include "src/cpu/kernels/meanstddevnorm/generic/neon/impl.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute @@ -45,138 +46,63 @@ void mean_stddev_normalization(ITensor *input, ITensor *output, float epsilon, c Iterator input_itr(input, win); Iterator output_itr(output, win); - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - auto in_ptr = reinterpret_cast(input_itr.ptr()); - auto out_ptr = reinterpret_cast(output_itr.ptr()); - - auto sum_vec = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - auto sum_sq_vec = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - auto data = wrapper::vloadq(in_ptr + x); - sum_vec = wrapper::vadd(sum_vec, data); - sum_sq_vec = wrapper::vadd(sum_sq_vec, wrapper::vmul(data, data)); - } - - auto sum_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_vec), wrapper::vgetlow(sum_vec)); - auto sum_sq_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_sq_vec), wrapper::vgetlow(sum_sq_vec)); - for(int i = 0; i < size / 4; ++i) - { - sum_carry_res = wrapper::vpadd(sum_carry_res, sum_carry_res); - sum_sq_carry_res = wrapper::vpadd(sum_sq_carry_res, sum_sq_carry_res); - } - - auto sum = wrapper::vgetlane(sum_carry_res, 0); - auto sum_sq = wrapper::vgetlane(sum_sq_carry_res, 0); - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - ScalarType data = *(in_ptr + x); - sum += data; - sum_sq += data * data; - } - - ScalarType mean = sum / input->info()->dimension(0); - ScalarType var = (sum_sq / input->info()->dimension(0)) - (mean * mean); - ScalarType stddev_inv = 1.f / sqrt(var + epsilon); - - auto mean_vec = wrapper::vdup_n(mean, ExactTagType{}); - auto stddev_inv_vec = wrapper::vdup_n(stddev_inv, ExactTagType{}); - for(x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x) - { - auto data = wrapper::vloadq(in_ptr + x); - auto res = wrapper::vmul(wrapper::vsub(data, mean_vec), stddev_inv_vec); - // Store results - wrapper::vstore(out_ptr + x, res); - } - 
for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv; - } - }, - input_itr, output_itr); + int x = window_start_x; + auto in_ptr = reinterpret_cast(input_itr.ptr()); + auto out_ptr = reinterpret_cast(output_itr.ptr()); + + auto sum_vec = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + auto sum_sq_vec = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + auto data = wrapper::vloadq(in_ptr + x); + sum_vec = wrapper::vadd(sum_vec, data); + sum_sq_vec = wrapper::vadd(sum_sq_vec, wrapper::vmul(data, data)); + } + + auto sum_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_vec), wrapper::vgetlow(sum_vec)); + auto sum_sq_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_sq_vec), wrapper::vgetlow(sum_sq_vec)); + for (int i = 0; i < size / 4; ++i) + { + sum_carry_res = wrapper::vpadd(sum_carry_res, sum_carry_res); + sum_sq_carry_res = wrapper::vpadd(sum_sq_carry_res, sum_sq_carry_res); + } + + auto sum = wrapper::vgetlane(sum_carry_res, 0); + auto sum_sq = wrapper::vgetlane(sum_sq_carry_res, 0); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + ScalarType data = *(in_ptr + x); + sum += data; + sum_sq += data * data; + } + + ScalarType mean = sum / input->info()->dimension(0); + ScalarType var = (sum_sq / input->info()->dimension(0)) - (mean * mean); + ScalarType stddev_inv = 1.f / sqrt(var + epsilon); + + auto mean_vec = wrapper::vdup_n(mean, ExactTagType{}); + auto stddev_inv_vec = wrapper::vdup_n(stddev_inv, ExactTagType{}); + for (x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x) + { + auto data = wrapper::vloadq(in_ptr + x); + auto res = wrapper::vmul(wrapper::vsub(data, mean_vec), stddev_inv_vec); + // Store results + wrapper::vstore(out_ptr + x, res); + } + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv; + } + }, + input_itr, output_itr); } template void mean_stddev_normalization(ITensor *input, ITensor *output, float epsilon, const Window &window); - -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -template <> -void mean_stddev_normalization(ITensor *input, ITensor *output, float epsilon, const Window &window) -{ - // Set build options - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 8; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Iterator input_itr(input, win); - Iterator output_itr(output, win); - - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - auto in_ptr = reinterpret_cast(input_itr.ptr()); - auto out_ptr = reinterpret_cast(output_itr.ptr()); - - float16x8_t sum_vec = vdupq_n_f16(static_cast(0.0f)); - float32x4_t sum_sq_vec = vdupq_n_f32(0.0f); - - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - float16x8_t data = vld1q_f16(in_ptr + x); - sum_vec = vaddq_f16(sum_vec, data); - float32x4_t dl = vcvt_f32_f16(vget_low_f16(data)); - float32x4_t dh = vcvt_f32_f16(vget_high_f16(data)); - sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dl, dl)); - sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dh, dh)); - } - - float16x4_t sum_carry_res = vpadd_f16(vget_high_f16(sum_vec), vget_low_f16(sum_vec)); - sum_carry_res = vpadd_f16(sum_carry_res, sum_carry_res); - sum_carry_res = vpadd_f16(sum_carry_res, sum_carry_res); - - 
float32x4_t sum_sq_carry_res = vpaddq_f32(sum_sq_vec, sum_sq_vec); - sum_sq_carry_res = vpaddq_f32(sum_sq_carry_res, sum_sq_carry_res); - - float16_t sum = vget_lane_f16(sum_carry_res, 0); - float sum_sq = vgetq_lane_f32(sum_sq_carry_res, 0); - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - float16_t data = *(in_ptr + x); - sum += data; - float fdata = static_cast(data); - sum_sq += fdata * fdata; - } - - float16_t mean = sum / input->info()->dimension(0); - float var = (sum_sq / input->info()->dimension(0)) - (mean * mean); - float16_t stddev_inv = static_cast(1.f / sqrt(var + epsilon)); - - float16x8_t mean_vec = vdupq_n_f16(mean); - float16x8_t stddev_inv_vec = vdupq_n_f16(stddev_inv); - - for(x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x) - { - float16x8_t data = vld1q_f16(in_ptr + x); - float16x8_t res = vmulq_f16(vsubq_f16(data, mean_vec), stddev_inv_vec); - // Store results - vst1q_f16(out_ptr + x, res); - } - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv; - } - }, - input_itr, output_itr); -} -#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) - } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp index 53af1e4b16139dc75fa7b5e5439bb0728d934dfe..32654df5dcca67fe63b4d538e51643c97b18bc53 100644 --- a/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Window.h" + #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" @@ -69,77 +70,76 @@ void neon_qasymm8_meanstddevnorm(ITensor *input, ITensor *output, float epsilon, const float32x4_t quant_min_vec = vdupq_n_f32(0.0f); execute_window_loop( - win, [&](const Coordinates &) - { - int x = window_start_x; - auto in_ptr = reinterpret_cast(input_itr.ptr()); - auto out_ptr = reinterpret_cast(output_itr.ptr()); + win, + [&](const Coordinates &) + { + int x = window_start_x; + auto in_ptr = reinterpret_cast(input_itr.ptr()); + auto out_ptr = reinterpret_cast(output_itr.ptr()); - uint32x4_t sum_vec = vdupq_n_u32(0); - uint32x4_t sum_sq_vec = vdupq_n_u32(0); + uint32x4_t sum_vec = vdupq_n_u32(0); + uint32x4_t sum_sq_vec = vdupq_n_u32(0); - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t data = vld1q_u8(in_ptr + x); - sum_vec = vaddq_u32(sum_vec, vpaddlq_u16(vpaddlq_u8(data))); - const uint16x8_t squares_low = vmull_u8(vget_low_u8(data), vget_low_u8(data)); - const uint16x8_t squares_high = vmull_u8(vget_high_u8(data), vget_high_u8(data)); - sum_sq_vec = vaddq_u32(sum_sq_vec, vaddq_u32(vpaddlq_u16(squares_low), vpaddlq_u16(squares_high))); - } + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t data = vld1q_u8(in_ptr + x); + sum_vec = vaddq_u32(sum_vec, vpaddlq_u16(vpaddlq_u8(data))); + const uint16x8_t squares_low = vmull_u8(vget_low_u8(data), vget_low_u8(data)); + const uint16x8_t squares_high = vmull_u8(vget_high_u8(data), vget_high_u8(data)); + sum_sq_vec = vaddq_u32(sum_sq_vec, vaddq_u32(vpaddlq_u16(squares_low), vpaddlq_u16(squares_high))); + } #ifdef __aarch64__ - sum_vec = vpaddq_u32(sum_vec, sum_vec); - sum_vec = vpaddq_u32(sum_vec, sum_vec); - uint32_t sum = vgetq_lane_u32(sum_vec, 0); - sum_sq_vec = 
vpaddq_u32(sum_sq_vec, sum_sq_vec); - sum_sq_vec = vpaddq_u32(sum_sq_vec, sum_sq_vec); - uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0); + sum_vec = vpaddq_u32(sum_vec, sum_vec); + sum_vec = vpaddq_u32(sum_vec, sum_vec); + uint32_t sum = vgetq_lane_u32(sum_vec, 0); + sum_sq_vec = vpaddq_u32(sum_sq_vec, sum_sq_vec); + sum_sq_vec = vpaddq_u32(sum_sq_vec, sum_sq_vec); + uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0); #elif __arm__ // #ifdef __aarch64__ - uint32_t sum = vgetq_lane_u32(sum_vec, 0) + - vgetq_lane_u32(sum_vec, 1) + - vgetq_lane_u32(sum_vec, 2) + - vgetq_lane_u32(sum_vec, 3); + uint32_t sum = vgetq_lane_u32(sum_vec, 0) + vgetq_lane_u32(sum_vec, 1) + vgetq_lane_u32(sum_vec, 2) + + vgetq_lane_u32(sum_vec, 3); - uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0) + - vgetq_lane_u32(sum_sq_vec, 1) + - vgetq_lane_u32(sum_sq_vec, 2) + - vgetq_lane_u32(sum_sq_vec, 3); + uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0) + vgetq_lane_u32(sum_sq_vec, 1) + + vgetq_lane_u32(sum_sq_vec, 2) + vgetq_lane_u32(sum_sq_vec, 3); #endif // #ifdef __aarch64__ - for(; x < window_end_x; ++x) - { - auto data = static_cast(*(in_ptr + x)); - sum += data; - sum_sq += (data * data); - } + for (; x < window_end_x; ++x) + { + auto data = static_cast(*(in_ptr + x)); + sum += data; + sum_sq += (data * data); + } - const float mean = (static_cast(sum) / static_cast(input->info()->dimension(0))); - const float var = (static_cast(sum_sq) / static_cast(input->info()->dimension(0))) - (mean * mean); - const float stdev_inv = 1.0f / sqrtf(var + epsilon); - const float32x4_t v_scale = vdupq_n_f32(stdev_inv * output_inv_scale); - const float32x4_t v_offset = vdupq_n_f32(-mean * stdev_inv * output_inv_scale + output_offset); - for(x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t data = vld1q_u8(in_ptr + x); - float32x4_t db1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(data))))); - float32x4_t db2 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(data))))); - float32x4_t db3 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(data))))); - float32x4_t db4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(data))))); - db1 = clamp_v4f32(vaddq_f32(vmulq_f32(db1, v_scale), v_offset), quant_min_vec, quant_max_vec); - db2 = clamp_v4f32(vaddq_f32(vmulq_f32(db2, v_scale), v_offset), quant_min_vec, quant_max_vec); - db3 = clamp_v4f32(vaddq_f32(vmulq_f32(db3, v_scale), v_offset), quant_min_vec, quant_max_vec); - db4 = clamp_v4f32(vaddq_f32(vmulq_f32(db4, v_scale), v_offset), quant_min_vec, quant_max_vec); - const uint8x16_t out = fuse_shorts_u16(fuse_words_f32(db1, db2), fuse_words_f32(db3, db4)); - vst1q_u8(out_ptr + x, out); - } + const float mean = (static_cast(sum) / static_cast(input->info()->dimension(0))); + const float var = + (static_cast(sum_sq) / static_cast(input->info()->dimension(0))) - (mean * mean); + const float stdev_inv = 1.0f / sqrtf(var + epsilon); + const float32x4_t v_scale = vdupq_n_f32(stdev_inv * output_inv_scale); + const float32x4_t v_offset = vdupq_n_f32(-mean * stdev_inv * output_inv_scale + output_offset); + for (x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t data = vld1q_u8(in_ptr + x); + float32x4_t db1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(data))))); + float32x4_t db2 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(data))))); + float32x4_t db3 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(data))))); + 
float32x4_t db4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(data))))); + db1 = clamp_v4f32(vaddq_f32(vmulq_f32(db1, v_scale), v_offset), quant_min_vec, quant_max_vec); + db2 = clamp_v4f32(vaddq_f32(vmulq_f32(db2, v_scale), v_offset), quant_min_vec, quant_max_vec); + db3 = clamp_v4f32(vaddq_f32(vmulq_f32(db3, v_scale), v_offset), quant_min_vec, quant_max_vec); + db4 = clamp_v4f32(vaddq_f32(vmulq_f32(db4, v_scale), v_offset), quant_min_vec, quant_max_vec); + const uint8x16_t out = fuse_shorts_u16(fuse_words_f32(db1, db2), fuse_words_f32(db3, db4)); + vst1q_u8(out_ptr + x, out); + } - for(; x < window_end_x; ++x) - { - auto data = static_cast(*(in_ptr + x)); - const uint8_t res = data * (stdev_inv * output_inv_scale) + (-mean * stdev_inv * output_inv_scale + output_offset); - *(out_ptr + x) = res; - } - }, - input_itr, output_itr); + for (; x < window_end_x; ++x) + { + auto data = static_cast(*(in_ptr + x)); + const uint8_t res = + data * (stdev_inv * output_inv_scale) + (-mean * stdev_inv * output_inv_scale + output_offset); + *(out_ptr + x) = res; + } + }, + input_itr, output_itr); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/pool2d/neon/fp16.cpp b/src/cpu/kernels/pool2d/neon/fp16.cpp index 4e15d3ad3fb4981d9310c07e71ac9e53764adf87..9d24d79afb9e182e3b801b4024e1cd733139b8e6 100644 --- a/src/cpu/kernels/pool2d/neon/fp16.cpp +++ b/src/cpu/kernels/pool2d/neon/fp16.cpp @@ -25,8 +25,10 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "src/cpu/kernels/pool2d/neon/impl.h" #include "src/cpu/kernels/pool2d/neon/list.h" #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) @@ -35,9 +37,127 @@ namespace arm_compute { namespace cpu { +#ifdef ENABLE_NCHW_KERNELS + namespace { -void pooling2_f16_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +float16x4_t +read_4_boundary_aware_fp16(int srcw, int srch, int pad_l, int pad_t, int x, int y, const float16_t *ptr, float16_t fval) +{ + float16_t vec[4]; + const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t))); + for (int i = 0; i < 4; i++) + { + if (row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l))) + { + vec[i] = *(ptr + i); + } + else + { + vec[i] = fval; + } + } + return wrapper::vload(vec); +} +} // namespace + +void pooling3_fp16_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) +{ + ARM_COMPUTE_UNUSED(dst1); + + Iterator in(src, window_src); + Iterator out(dst0, window); + + constexpr const int pool_size = 3; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int upper_bound_w = src_w + (pool_info.exclude_padding ? 
0 : pool_pad_right); + const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const float16_t fp16_min = get_initial_min(pool_info.use_inf_as_limit); + const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.f; + const unsigned char *const src_top_ptr = + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); + const unsigned char *const src_middle_ptr = + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); + const unsigned char *const src_bottom_ptr = + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 2)); + + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto x_val = id.x() * pool_stride_x; + const auto y_val_0 = id.y() * pool_stride_y; + const auto y_val_1 = (id.y() * pool_stride_y) + 1; + const auto y_val_2 = (id.y() * pool_stride_y) + 2; + float16x4_t top_data = + read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_0, + reinterpret_cast(src_top_ptr + in.offset()), fill_value); + float16x4_t middle_data = read_4_boundary_aware_fp16( + src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_1, + reinterpret_cast(src_middle_ptr + in.offset()), fill_value); + float16x4_t bottom_data = read_4_boundary_aware_fp16( + src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_2, + reinterpret_cast(src_bottom_ptr + in.offset()), fill_value); + float16x4_t res = {}; + + // Get power of 2 in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + top_data = vmul_f16(top_data, top_data); + middle_data = vmul_f16(middle_data, middle_data); + bottom_data = vmul_f16(bottom_data, bottom_data); + } + + if (pool_info.pool_type != PoolingType::MAX) + { + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, + pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float16x4_t scale_v = vdup_n_f16(scale); + // Perform pooling + const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data); + res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data); + res = vmul_f16(vpadd_f16(res, res), scale_v); + } + else + { + const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data); + res = vpmax_f16(vset_lane_f16(fp16_min, max_data, 3), max_data); + res = vpmax_f16(res, res); + } + + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + res = vsqrt_f16(res); + } + + *(reinterpret_cast(out.ptr())) = vget_lane_f16(res, 0); + }, + in, out); +} +#endif // ENABLE_NCHW_KERNELS + +void pooling2_f16_maxpool_indices(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { const int window_start_x = window.x().start(); const int window_end_x = window.x().end(); @@ -53,8 +173,8 @@ void pooling2_f16_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *ds const int pool_pad_top = pool_info.pad_stride_info.pad_top(); const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - int pool_stride_x = 0; - int pool_stride_y = 0; + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); const int pad_right = src->info()->padding().right; @@ -63,255 +183,477 @@ void pooling2_f16_maxpool_indices(const ITensor 
*src, ITensor *dst0, ITensor *ds const int in_stride_y = static_cast(src->info()->strides_in_bytes().y()); const int in_stride_z = static_cast(src->info()->strides_in_bytes().z()); - execute_window_loop(window_out, [&](const Coordinates & id) - { - const int idx_width = id.y() * pool_stride_x; - const int idx_height = id.z() * pool_stride_y; - const int pool_limit_y = pool_pad_top - idx_height; - const int pool_limit_x = pool_pad_left - idx_width; - - const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); - const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); - const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z()); - const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z()); - const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z()); - const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z()); - - int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) - { - const auto in_x0_ptr = reinterpret_cast(in.ptr() + in_x0_offset) + x_off; - const auto in_x1_ptr = reinterpret_cast(in.ptr() + in_x1_offset) + x_off; - const auto in_x2_ptr = reinterpret_cast(in.ptr() + in_x2_offset) + x_off; - const auto in_x3_ptr = reinterpret_cast(in.ptr() + in_x3_offset) + x_off; - const auto v_x0 = vld1q_f16(in_x0_ptr); - const auto v_x1 = vld1q_f16(in_x1_ptr); - const auto v_x2 = vld1q_f16(in_x2_ptr); - const auto v_x3 = vld1q_f16(in_x3_ptr); - float16x8_t vres = vmaxq_f16(vmaxq_f16(v_x2, v_x3), vmaxq_f16(v_x0, v_x1)); - // Store result - vst1q_f16(reinterpret_cast(out.ptr()) + x_off, vres); - - const uint32_t offset_base = offset_no_padding(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC); - const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off; - const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal; - const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_horizontal * src->info()->tensor_shape()[1]; - const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal; - const uint32x4_t voffset_x0_0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 }; - const uint32x4_t voffset_x0_1 = { offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7 }; - const uint16x8_t voffset_x0 = vcombine_u16(vmovn_u32(voffset_x0_0), vmovn_u32(voffset_x0_1)); - const uint32x4_t voffset_x1_0 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 }; - const uint32x4_t voffset_x1_1 = { offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7 }; - const uint16x8_t voffset_x1 = vcombine_u16(vmovn_u32(voffset_x1_0), vmovn_u32(voffset_x1_1)); - const uint32x4_t voffset_x2_0 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 }; - const uint32x4_t voffset_x2_1 = { offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7 }; - const uint16x8_t voffset_x2 = vcombine_u16(vmovn_u32(voffset_x2_0), vmovn_u32(voffset_x2_1)); - 
const uint32x4_t voffset_x3_0 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 }; - const uint32x4_t voffset_x3_1 = { offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7 }; - const uint16x8_t voffset_x3 = vcombine_u16(vmovn_u32(voffset_x3_0), vmovn_u32(voffset_x3_1)); - const uint16x8_t tmp_indices0 = vbslq_u16(vcgeq_f16(v_x0, v_x1), voffset_x0, voffset_x1); - const uint16x8_t tmp_indices1 = vbslq_u16(vcgeq_f16(v_x2, v_x3), voffset_x2, voffset_x3); - const uint16x8_t tmp_indices2 = vbslq_u16(vcgeq_f16(vmaxq_f16(v_x0, v_x1), vmaxq_f16(v_x2, v_x3)), tmp_indices0, tmp_indices1); - const uint32x4_t tmp_indeces3_0 = vmovl_u16(vget_low_u16(tmp_indices2)); - const uint32x4_t tmp_indeces3_1 = vmovl_u16(vget_high_u16(tmp_indices2)); - // Store indicies - vst1q_u32(reinterpret_cast(indices.ptr()) + x_off, tmp_indeces3_0); - vst1q_u32(reinterpret_cast(indices.ptr() + 16) + x_off, tmp_indeces3_1); - } - - // Left-overs loop - for(; x_off < window_end_x; ++x_off) + execute_window_loop( + window_out, + [&](const Coordinates &id) { - const auto x0 = *(reinterpret_cast(in.ptr() + in_x0_offset) + x_off); - const auto x1 = *(reinterpret_cast(in.ptr() + in_x1_offset) + x_off); - const auto x2 = *(reinterpret_cast(in.ptr() + in_x2_offset) + x_off); - const auto x3 = *(reinterpret_cast(in.ptr() + in_x3_offset) + x_off); - float16_t res = std::max(std::max(x2, x3), std::max(x0, x1)); + const int idx_width = id.y() * pool_stride_x; + const int idx_height = id.z() * pool_stride_y; + const int pool_limit_y = pool_pad_top - idx_height; + const int pool_limit_x = pool_pad_left - idx_width; + + const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); + const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); + const int in_x0_offset = + (pool_start_x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (pool_start_y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z()); + const int in_x1_offset = + (pool_start_x + 1 - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (pool_start_y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z()); + const int in_x2_offset = + (pool_start_x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (pool_start_y + 1 - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z()); + const int in_x3_offset = + (pool_start_x + 1 - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (pool_start_y + 1 - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z()); + + int x_off = window_start_x; + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) + { + const auto in_x0_ptr = reinterpret_cast(in.ptr() + in_x0_offset) + x_off; + const auto in_x1_ptr = reinterpret_cast(in.ptr() + in_x1_offset) + x_off; + const auto in_x2_ptr = reinterpret_cast(in.ptr() + in_x2_offset) + x_off; + const auto in_x3_ptr = reinterpret_cast(in.ptr() + in_x3_offset) + x_off; + const auto v_x0 = vld1q_f16(in_x0_ptr); + const auto v_x1 = vld1q_f16(in_x1_ptr); + const auto v_x2 = vld1q_f16(in_x2_ptr); + const auto v_x3 = vld1q_f16(in_x3_ptr); + float16x8_t vres = vmaxq_f16(vmaxq_f16(v_x2, v_x3), vmaxq_f16(v_x0, v_x1)); + // Store result + vst1q_f16(reinterpret_cast(out.ptr()) + x_off, vres); + + const uint32_t offset_base = offset_no_padding(in.offset(), id, *src->info(), pool_stride_x, + pool_stride_y, DataLayout::NHWC); + const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off; + const uint32_t offset_x1 = 
(uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal; + const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - + pad_horizontal * src->info()->tensor_shape()[1]; + const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal; + const uint32x4_t voffset_x0_0 = {offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3}; + const uint32x4_t voffset_x0_1 = {offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7}; + const uint16x8_t voffset_x0 = vcombine_u16(vmovn_u32(voffset_x0_0), vmovn_u32(voffset_x0_1)); + const uint32x4_t voffset_x1_0 = {offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3}; + const uint32x4_t voffset_x1_1 = {offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7}; + const uint16x8_t voffset_x1 = vcombine_u16(vmovn_u32(voffset_x1_0), vmovn_u32(voffset_x1_1)); + const uint32x4_t voffset_x2_0 = {offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3}; + const uint32x4_t voffset_x2_1 = {offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7}; + const uint16x8_t voffset_x2 = vcombine_u16(vmovn_u32(voffset_x2_0), vmovn_u32(voffset_x2_1)); + const uint32x4_t voffset_x3_0 = {offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3}; + const uint32x4_t voffset_x3_1 = {offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7}; + const uint16x8_t voffset_x3 = vcombine_u16(vmovn_u32(voffset_x3_0), vmovn_u32(voffset_x3_1)); + const uint16x8_t tmp_indices0 = vbslq_u16(vcgeq_f16(v_x0, v_x1), voffset_x0, voffset_x1); + const uint16x8_t tmp_indices1 = vbslq_u16(vcgeq_f16(v_x2, v_x3), voffset_x2, voffset_x3); + const uint16x8_t tmp_indices2 = + vbslq_u16(vcgeq_f16(vmaxq_f16(v_x0, v_x1), vmaxq_f16(v_x2, v_x3)), tmp_indices0, tmp_indices1); + const uint32x4_t tmp_indeces3_0 = vmovl_u16(vget_low_u16(tmp_indices2)); + const uint32x4_t tmp_indeces3_1 = vmovl_u16(vget_high_u16(tmp_indices2)); + // Store indicies + vst1q_u32(reinterpret_cast(indices.ptr()) + x_off, tmp_indeces3_0); + vst1q_u32(reinterpret_cast(indices.ptr() + 16) + x_off, tmp_indeces3_1); + } - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - - const uint32_t offset_base = offset_no_padding(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC); - const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off; - const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal; - const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_horizontal * src->info()->tensor_shape()[1]; - const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal; - const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1; - const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3; - const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? 
tmp_idx0 : tmp_idx1; - - // Store indices - *(reinterpret_cast(indices.ptr()) + x_off) = tmp_idx2; - } - }, - in, out, indices); -} + // Left-overs loop + for (; x_off < window_end_x; ++x_off) + { + const auto x0 = *(reinterpret_cast(in.ptr() + in_x0_offset) + x_off); + const auto x1 = *(reinterpret_cast(in.ptr() + in_x1_offset) + x_off); + const auto x2 = *(reinterpret_cast(in.ptr() + in_x2_offset) + x_off); + const auto x3 = *(reinterpret_cast(in.ptr() + in_x3_offset) + x_off); + float16_t res = std::max(std::max(x2, x3), std::max(x0, x1)); + + // Store result + *(reinterpret_cast(out.ptr()) + x_off) = res; + + const uint32_t offset_base = offset_no_padding(in.offset(), id, *src->info(), pool_stride_x, + pool_stride_y, DataLayout::NHWC); + const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off; + const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal; + const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - + pad_horizontal * src->info()->tensor_shape()[1]; + const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal; + const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1; + const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3; + const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1; + + // Store indices + *(reinterpret_cast(indices.ptr()) + x_off) = tmp_idx2; + } + }, + in, out, indices); } - -void poolingMxN_fp16_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +#ifdef ENABLE_NCHW_KERNELS + +void pooling2_fp16_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { - if(pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && dst1) + if (pool_info.pool_type == PoolingType::MAX && dst1) { - pooling2_f16_maxpool_indices(src, dst0, dst1, pool_info, window_src, window); + pooling2_nchw_maxpool_indices(src, dst0, dst1, pool_info, window_src, window); } - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 8; + else + { + Iterator in(src, window_src); + Iterator out(dst0, window); + constexpr int pool_size = 2; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x, pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const float16_t fp16_min = get_initial_min(pool_info.use_inf_as_limit); + const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? 
fp16_min : 0.0f; + + const unsigned char *const src_top_ptr = + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); + const unsigned char *const src_bottom_ptr = + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); + + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto in_top_ptr = reinterpret_cast(src_top_ptr + in.offset()); + const auto in_bottom_ptr = reinterpret_cast(src_bottom_ptr + in.offset()); + + const auto x_val = id.x() * pool_stride_x; + const auto y_val_0 = id.y() * pool_stride_y; + const auto y_val_1 = (id.y() * pool_stride_y) + 1; + float16x4_t top_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, x_val, + y_val_0, in_top_ptr, fill_value); + float16x4_t bottom_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, x_val, + y_val_1, in_bottom_ptr, fill_value); + float16x4_t res = {}; + + // Get power of 2 in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + top_data = vmul_f16(top_data, top_data); + bottom_data = vmul_f16(bottom_data, bottom_data); + } - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + if (pool_info.pool_type != PoolingType::MAX) + { + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float16x4_t scale_v = vdup_n_f16(scale); - Iterator in(src, window_src); - Iterator out(dst0, window_out); + const float16x4_t sum_data = vadd_f16(top_data, bottom_data); + res = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v); + } + else + { + const float16x4_t max_data = vmax_f16(top_data, bottom_data); + res = vpmax_f16(max_data, max_data); + } - const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 
0 : pool_pad_bottom); - const float16_t min_value = get_initial_min(pool_info.use_inf_as_limit); - float16x8_t vres; + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + res = vsqrt_f16(res); + } - execute_window_loop(window_out, [&](const Coordinates & id) - { - const int idx_width = id.y() * pool_stride_x; - const int idx_height = id.z() * pool_stride_y; - const int pool_limit_y = pool_pad_top - idx_height; - const int pool_limit_x = pool_pad_left - idx_width; - - const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); - const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y); - const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); - const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x); - - int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) + // Store result + *(reinterpret_cast(out.ptr())) = vget_lane_f16(res, 0); + }, + in, out); + } +} + +void poolingMxN_fp16_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) +{ + ARM_COMPUTE_UNUSED(dst1); + Iterator in(src, window_src); + Iterator out(dst0, window); + + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const float16_t fp16_min = get_initial_min(pool_info.use_inf_as_limit); + const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? 
fp16_min : 0.0f; + + execute_window_loop( + window, + [&](const Coordinates &id) { - if(pool_info.pool_type != PoolingType::MAX) + float16_t res = 0.0f; + + if (pool_info.pool_type != PoolingType::MAX) { // Calculate scale - const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float16x8_t scale_v = vdupq_n_f16(scale); + const float16_t scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); // Perform pooling - vres = vdupq_n_f16(0.0f); - for(int y = pool_start_y; y < pool_end_y; ++y) + for (int y = 0; y < pool_size_y; ++y) { - for(int x = pool_start_x; x < pool_end_x; ++x) + for (int x = 0; x < pool_size_x; ++x) { - const float16x8_t data = vld1q_f16(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); + const auto ptr = reinterpret_cast( + in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().y())); - // Get power of 2 in case of l2 pooling and accumulate - if(pool_info.pool_type == PoolingType::L2) - { - vres = vaddq_f16(vres, vmulq_f16(data, data)); - } - else + const int idx = x + id.x() * pool_stride_x - pool_pad_left; + const int idy = y + id.y() * pool_stride_y - pool_pad_top; + float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr; + + if (pool_info.pool_type == PoolingType::L2) { - vres = vaddq_f16(vres, data); + data *= data; } + + res += data; } } + // Divide by scale - vres = vmulq_f16(vres, scale_v); + res *= scale; } - else + else // if max pooling { - vres = vdupq_n_f16(min_value); + res = fp16_min; - for(int y = pool_start_y; y < pool_end_y; ++y) + for (int y = 0; y < pool_size_y; ++y) { - for(int x = pool_start_x; x < pool_end_x; ++x) + for (int x = 0; x < pool_size_x; ++x) { - const float16x8_t data = vld1q_f16(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - vres = vmaxq_f16(vres, data); + const auto ptr = reinterpret_cast( + in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().y())); + + const int idx = x + id.x() * pool_stride_x - pool_pad_left; + const int idy = y + id.y() * pool_stride_y - pool_pad_top; + float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? 
fill_value : *ptr; + res = std::max(res, data); } } } // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) + if (pool_info.pool_type == PoolingType::L2) { - float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres); - vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal)); + res = std::sqrt(res); } // Store result - vst1q_f16(reinterpret_cast(out.ptr()) + x_off, vres); - } + *(reinterpret_cast(out.ptr())) = res; + }, + in, out); +} +#endif // ENABLE_NCHW_KERNELS + +void poolingMxN_fp16_neon_nhwc(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) +{ + if (pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && dst1) + { + pooling2_f16_maxpool_indices(src, dst0, dst1, pool_info, window_src, window); + } + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 8; - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - float16_t res = 0.0f; + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - const float16_t scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); + Iterator in(src, window_src); + Iterator out(dst0, window_out); + + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 
0 : pool_pad_bottom); + const float16_t min_value = get_initial_min(pool_info.use_inf_as_limit); + float16x8_t vres; - for(int y = pool_start_y; y < pool_end_y; ++y) + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + const int idx_width = id.y() * pool_stride_x; + const int idx_height = id.z() * pool_stride_y; + const int pool_limit_y = pool_pad_top - idx_height; + const int pool_limit_x = pool_pad_left - idx_width; + + const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); + const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y); + const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); + const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x); + + int x_off = window_start_x; + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) + { + if (pool_info.pool_type != PoolingType::MAX) { - for(int x = pool_start_x; x < pool_end_x; ++x) + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float16x8_t scale_v = vdupq_n_f16(scale); + + // Perform pooling + vres = vdupq_n_f16(0.0f); + for (int y = pool_start_y; y < pool_end_y; ++y) { - const float data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - - // Get power of 2 in case of l2 pooling and accumulate - if(pool_info.pool_type == PoolingType::L2) + for (int x = pool_start_x; x < pool_end_x; ++x) { - res += data * data; + const float16x8_t data = vld1q_f16( + reinterpret_cast( + in.ptr() + + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z())) + + x_off); + + // Get power of 2 in case of l2 pooling and accumulate + if (pool_info.pool_type == PoolingType::L2) + { + vres = vaddq_f16(vres, vmulq_f16(data, data)); + } + else + { + vres = vaddq_f16(vres, data); + } } - else + } + // Divide by scale + vres = vmulq_f16(vres, scale_v); + } + else + { + vres = vdupq_n_f16(min_value); + + for (int y = pool_start_y; y < pool_end_y; ++y) + { + for (int x = pool_start_x; x < pool_end_x; ++x) { - res += data; + const float16x8_t data = vld1q_f16( + reinterpret_cast( + in.ptr() + + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z())) + + x_off); + vres = vmaxq_f16(vres, data); } } } - // Divide by scale - res *= scale; + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres); + vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal), + sqrt_reciprocal)); + } + + // Store result + vst1q_f16(reinterpret_cast(out.ptr()) + x_off, vres); } - else + + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - res = min_value; - for(int y = pool_start_y; y < pool_end_y; ++y) + float16_t res = 0.0f; + + if (pool_info.pool_type != PoolingType::MAX) + { + // Calculate scale + const float16_t scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, 
pool_stride_y); + + for (int y = pool_start_y; y < pool_end_y; ++y) + { + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const float data = + *(reinterpret_cast( + in.ptr() + + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z())) + + x_off); + + // Get power of 2 in case of l2 pooling and accumulate + if (pool_info.pool_type == PoolingType::L2) + { + res += data * data; + } + else + { + res += data; + } + } + } + + // Divide by scale + res *= scale; + } + else { - for(int x = pool_start_x; x < pool_end_x; ++x) + res = min_value; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const float16_t data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - res = std::max(res, data); + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const float16_t data = + *(reinterpret_cast( + in.ptr() + + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z())) + + x_off); + res = std::max(res, data); + } } } - } - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - res = std::sqrt(res); - } + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + res = std::sqrt(res); + } - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - } - }, - in, out); + // Store result + *(reinterpret_cast(out.ptr()) + x_off) = res; + } + }, + in, out); } } // namespace cpu } // namespace arm_compute -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ \ No newline at end of file +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/pool2d/neon/fp32.cpp b/src/cpu/kernels/pool2d/neon/fp32.cpp index a400f3a95d7353989cd6821912ac59446d76a612..aaa37863cb4b0e00ca5c0e4fc9f01cc4a49f79a1 100644 --- a/src/cpu/kernels/pool2d/neon/fp32.cpp +++ b/src/cpu/kernels/pool2d/neon/fp32.cpp @@ -24,8 +24,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" #include "src/cpu/kernels/pool2d/neon/list.h" namespace arm_compute @@ -34,7 +35,12 @@ namespace cpu { namespace { -void pooling2_f32_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void pooling2_f32_maxpool_indices(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { const int window_start_x = window.x().start(); const int window_end_x = window.x().end(); @@ -50,8 +56,8 @@ void pooling2_f32_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *ds const int pool_pad_top = pool_info.pad_stride_info.pad_top(); const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - int pool_stride_x = 0; - int pool_stride_y = 0; + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); float32x4_t vres; @@ -63,89 +69,102 @@ void pooling2_f32_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor 
*ds const int in_stride_y = static_cast(src->info()->strides_in_bytes().y()); const int in_stride_z = static_cast(src->info()->strides_in_bytes().z()); - execute_window_loop(window_out, [&](const Coordinates & id) - { - const int idx_width = id.y() * pool_stride_x; - const int idx_height = id.z() * pool_stride_y; - const int pool_limit_y = pool_pad_top - idx_height; - const int pool_limit_x = pool_pad_left - idx_width; - - const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); - const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); - - const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z()); - const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z()); - const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z()); - const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z()); - - int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) - { - const auto in_x0_ptr = reinterpret_cast(in.ptr() + in_x0_offset); - const auto in_x1_ptr = reinterpret_cast(in.ptr() + in_x1_offset); - const auto in_x2_ptr = reinterpret_cast(in.ptr() + in_x2_offset); - const auto in_x3_ptr = reinterpret_cast(in.ptr() + in_x3_offset); - const auto v_x0 = vld1q_f32(in_x0_ptr + x_off); - const auto v_x1 = vld1q_f32(in_x1_ptr + x_off); - const auto v_x2 = vld1q_f32(in_x2_ptr + x_off); - const auto v_x3 = vld1q_f32(in_x3_ptr + x_off); - vres = vmaxq_f32(vmaxq_f32(v_x2, v_x3), vmaxq_f32(v_x0, v_x1)); - // Store result - vst1q_f32(reinterpret_cast(out.ptr()) + x_off, vres); - - const uint32_t offset_base = offset_no_padding(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC); - const uint32_t offset_x0 = offset_base / sizeof(float) + x_off; - const uint32_t offset_x1 = offset_x0 + in_stride_y / sizeof(float) - pad_horizontal; - const uint32_t offset_x2 = offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1]; - const uint32_t offset_x3 = offset_x2 + in_stride_y / sizeof(float) - pad_horizontal; - const uint32x4_t voffset_x0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 }; - const uint32x4_t voffset_x1 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 }; - const uint32x4_t voffset_x2 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 }; - const uint32x4_t voffset_x3 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 }; - const uint32x4_t tmp_indices0 = vbslq_u32(vcgeq_f32(v_x0, v_x1), voffset_x0, voffset_x1); - const uint32x4_t tmp_indices1 = vbslq_u32(vcgeq_f32(v_x2, v_x3), voffset_x2, voffset_x3); - const uint32x4_t tmp_indices2 = vbslq_u32(vcgeq_f32(vmaxq_f32(v_x0, v_x1), vmaxq_f32(v_x2, v_x3)), tmp_indices0, tmp_indices1); - - // Store indices - vst1q_u32(reinterpret_cast(indices.ptr()) + x_off, tmp_indices2); - } - - // Left-overs loop - for(; x_off < window_end_x; ++x_off) + execute_window_loop( + window_out, + [&](const Coordinates &id) { - const auto x0 = *(reinterpret_cast(in.ptr() + 
in_x0_offset) + x_off); - const auto x1 = *(reinterpret_cast(in.ptr() + in_x1_offset) + x_off); - const auto x2 = *(reinterpret_cast(in.ptr() + in_x2_offset) + x_off); - const auto x3 = *(reinterpret_cast(in.ptr() + in_x3_offset) + x_off); - res = std::max(std::max(x2, x3), std::max(x0, x1)); - - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - - const uint32_t offset_base = offset_no_padding(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC); - const uint32_t offset_x0 = offset_base / sizeof(float) + x_off; - const uint32_t offset_x1 = offset_x0 + in_stride_y / sizeof(float) - pad_horizontal; - const uint32_t offset_x2 = offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1]; - const uint32_t offset_x3 = offset_x2 + in_stride_y / sizeof(float) - pad_horizontal; - const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1; - const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3; - const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1; - - // Store indices - *(reinterpret_cast(indices.ptr()) + x_off) = tmp_idx2; - } - }, - in, out, indices); + const int idx_width = id.y() * pool_stride_x; + const int idx_height = id.z() * pool_stride_y; + const int pool_limit_y = pool_pad_top - idx_height; + const int pool_limit_x = pool_pad_left - idx_width; + + const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); + const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); + + const int in_x0_offset = + (pool_start_x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (pool_start_y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z()); + const int in_x1_offset = + (pool_start_x + 1 - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (pool_start_y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z()); + const int in_x2_offset = + (pool_start_x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (pool_start_y + 1 - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z()); + const int in_x3_offset = + (pool_start_x + 1 - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (pool_start_y + 1 - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z()); + + int x_off = window_start_x; + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) + { + const auto in_x0_ptr = reinterpret_cast(in.ptr() + in_x0_offset); + const auto in_x1_ptr = reinterpret_cast(in.ptr() + in_x1_offset); + const auto in_x2_ptr = reinterpret_cast(in.ptr() + in_x2_offset); + const auto in_x3_ptr = reinterpret_cast(in.ptr() + in_x3_offset); + const auto v_x0 = vld1q_f32(in_x0_ptr + x_off); + const auto v_x1 = vld1q_f32(in_x1_ptr + x_off); + const auto v_x2 = vld1q_f32(in_x2_ptr + x_off); + const auto v_x3 = vld1q_f32(in_x3_ptr + x_off); + vres = vmaxq_f32(vmaxq_f32(v_x2, v_x3), vmaxq_f32(v_x0, v_x1)); + // Store result + vst1q_f32(reinterpret_cast(out.ptr()) + x_off, vres); + + const uint32_t offset_base = offset_no_padding(in.offset(), id, *src->info(), pool_stride_x, + pool_stride_y, DataLayout::NHWC); + const uint32_t offset_x0 = offset_base / sizeof(float) + x_off; + const uint32_t offset_x1 = offset_x0 + in_stride_y / sizeof(float) - pad_horizontal; + const uint32_t offset_x2 = + offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1]; + const uint32_t offset_x3 = offset_x2 + in_stride_y / sizeof(float) - 
pad_horizontal; + const uint32x4_t voffset_x0 = {offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3}; + const uint32x4_t voffset_x1 = {offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3}; + const uint32x4_t voffset_x2 = {offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3}; + const uint32x4_t voffset_x3 = {offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3}; + const uint32x4_t tmp_indices0 = vbslq_u32(vcgeq_f32(v_x0, v_x1), voffset_x0, voffset_x1); + const uint32x4_t tmp_indices1 = vbslq_u32(vcgeq_f32(v_x2, v_x3), voffset_x2, voffset_x3); + const uint32x4_t tmp_indices2 = + vbslq_u32(vcgeq_f32(vmaxq_f32(v_x0, v_x1), vmaxq_f32(v_x2, v_x3)), tmp_indices0, tmp_indices1); + + // Store indices + vst1q_u32(reinterpret_cast(indices.ptr()) + x_off, tmp_indices2); + } + + // Left-overs loop + for (; x_off < window_end_x; ++x_off) + { + const auto x0 = *(reinterpret_cast(in.ptr() + in_x0_offset) + x_off); + const auto x1 = *(reinterpret_cast(in.ptr() + in_x1_offset) + x_off); + const auto x2 = *(reinterpret_cast(in.ptr() + in_x2_offset) + x_off); + const auto x3 = *(reinterpret_cast(in.ptr() + in_x3_offset) + x_off); + res = std::max(std::max(x2, x3), std::max(x0, x1)); + + // Store result + *(reinterpret_cast(out.ptr()) + x_off) = res; + + const uint32_t offset_base = offset_no_padding(in.offset(), id, *src->info(), pool_stride_x, + pool_stride_y, DataLayout::NHWC); + const uint32_t offset_x0 = offset_base / sizeof(float) + x_off; + const uint32_t offset_x1 = offset_x0 + in_stride_y / sizeof(float) - pad_horizontal; + const uint32_t offset_x2 = + offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1]; + const uint32_t offset_x3 = offset_x2 + in_stride_y / sizeof(float) - pad_horizontal; + const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1; + const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3; + const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? 
tmp_idx0 : tmp_idx1; + + // Store indices + *(reinterpret_cast(indices.ptr()) + x_off) = tmp_idx2; + } + }, + in, out, indices); } } // namespace -void poolingMxN_fp32_neon_nhwc_kernel_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, const PoolingLayerInfo &pool_info, const Window &window) +void poolingMxN_fp32_neon_nhwc_kernel_indices( + const ITensor *src, ITensor *dst0, ITensor *dst1, const PoolingLayerInfo &pool_info, const Window &window) { - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); constexpr int window_step_x = 4; Window window_out = window; @@ -160,8 +179,8 @@ void poolingMxN_fp32_neon_nhwc_kernel_indices(const ITensor *src, ITensor *dst0, const int pool_pad_top = pool_info.pad_stride_info.pad_top(); const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - int pool_stride_x = 0; - int pool_stride_y = 0; + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); const float min_value = get_initial_min(pool_info.use_inf_as_limit); @@ -169,9 +188,9 @@ void poolingMxN_fp32_neon_nhwc_kernel_indices(const ITensor *src, ITensor *dst0, float32x4_t vres; uint32x4_t vidx; - constexpr int idx_width = 1; - constexpr int idx_height = 2; - constexpr int idx_batch = 3; + constexpr int idx_width = 1; + constexpr int idx_height = 2; + constexpr int idx_batch = 3; const int y_stride = static_cast(src->info()->strides_in_bytes().y()); const int z_stride = static_cast(src->info()->strides_in_bytes().z()); @@ -182,89 +201,97 @@ void poolingMxN_fp32_neon_nhwc_kernel_indices(const ITensor *src, ITensor *dst0, const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); - execute_window_loop(window_out, [&](const Coordinates & id) - { - const int idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; - const int idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + const int idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; + const int idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; - const int pool_start_x = std::max(0, -idx_width); - const int pool_start_y = std::max(0, -idx_height); + const int pool_start_x = std::max(0, -idx_width); + const int pool_start_y = std::max(0, -idx_height); - const int pool_end_x = std::min(pool_size_x, input_dim_w - idx_width); - const int pool_end_y = std::min(pool_size_y, input_dim_h - idx_height); + const int pool_end_x = std::min(pool_size_x, input_dim_w - idx_width); + const int pool_end_y = std::min(pool_size_y, input_dim_h - idx_height); - const uint8_t *in_ptr_n = in_ptr_start + id[idx_batch] * n_stride; + const uint8_t *in_ptr_n = in_ptr_start + id[idx_batch] * n_stride; - const int in_ptr_y_offset = (z_stride * idx_height) + (pool_start_y * z_stride); - const int in_ptr_x_offset = (y_stride * idx_width) + (pool_start_x * y_stride); + const int in_ptr_y_offset = (z_stride * idx_height) + (pool_start_y * z_stride); + const int in_ptr_x_offset = (y_stride * idx_width) + (pool_start_x * y_stride); - int x_off = window_start_x; + int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) - { - vres = vdupq_n_f32(min_value); - vidx = vdupq_n_u32(0U); - const uint8_t *in_ptr_y = in_ptr_n + in_ptr_y_offset + in_ptr_x_offset; - uint32_t 
curr_kernel_index = pool_size_x * pool_start_y; - for(int y = pool_start_y; y < pool_end_y; ++y) + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) { - const uint8_t *in_ptr_x = in_ptr_y + (x_off * sizeof(float)); - curr_kernel_index += pool_start_x; - for(int x = pool_start_x; x < pool_end_x; ++x) + vres = vdupq_n_f32(min_value); + vidx = vdupq_n_u32(0U); + const uint8_t *in_ptr_y = in_ptr_n + in_ptr_y_offset + in_ptr_x_offset; + uint32_t curr_kernel_index = pool_size_x * pool_start_y; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const float32x4_t data = vld1q_f32(reinterpret_cast(in_ptr_x)); - const uint32x4_t vidx_curr = vdupq_n_u32(curr_kernel_index); - const uint32x4_t idxMask = vcgtq_f32(data, vres); - vidx = vbslq_u32(idxMask, vidx_curr, vidx); - vres = vmaxq_f32(vres, data); - in_ptr_x += y_stride; - curr_kernel_index++; + const uint8_t *in_ptr_x = in_ptr_y + (x_off * sizeof(float)); + curr_kernel_index += pool_start_x; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const float32x4_t data = vld1q_f32(reinterpret_cast(in_ptr_x)); + const uint32x4_t vidx_curr = vdupq_n_u32(curr_kernel_index); + const uint32x4_t idxMask = vcgtq_f32(data, vres); + vidx = vbslq_u32(idxMask, vidx_curr, vidx); + vres = vmaxq_f32(vres, data); + in_ptr_x += y_stride; + curr_kernel_index++; + } + curr_kernel_index += (pool_size_x - pool_end_x); + in_ptr_y += z_stride; } - curr_kernel_index += (pool_size_x - pool_end_x); - in_ptr_y += z_stride; + // Store result + vst1q_f32(reinterpret_cast(out.ptr()) + x_off, vres); + vst1q_u32(reinterpret_cast(indices.ptr()) + x_off, vidx); } - // Store result - vst1q_f32(reinterpret_cast(out.ptr()) + x_off, vres); - vst1q_u32(reinterpret_cast(indices.ptr()) + x_off, vidx); - } - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - float res = min_value; - uint32_t idx = 0U; - const uint8_t *in_ptr_y = in_ptr_n + in_ptr_y_offset + in_ptr_x_offset; - for(int y = pool_start_y; y < pool_end_y; ++y) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - const uint8_t *in_ptr_x = in_ptr_y + (x_off * sizeof(float)); - for(int x = pool_start_x; x < pool_end_x; ++x) + float res = min_value; + uint32_t idx = 0U; + const uint8_t *in_ptr_y = in_ptr_n + in_ptr_y_offset + in_ptr_x_offset; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const float data = *(reinterpret_cast(in_ptr_x)); - if(data > res) + const uint8_t *in_ptr_x = in_ptr_y + (x_off * sizeof(float)); + for (int x = pool_start_x; x < pool_end_x; ++x) { - idx = pool_size_x * y + x; - res = data; + const float data = *(reinterpret_cast(in_ptr_x)); + if (data > res) + { + idx = pool_size_x * y + x; + res = data; + } + in_ptr_x += y_stride; } - in_ptr_x += y_stride; + in_ptr_y += z_stride; } - in_ptr_y += z_stride; - } - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - *(reinterpret_cast(indices.ptr()) + x_off) = idx; - } - }, - out, indices); + // Store result + *(reinterpret_cast(out.ptr()) + x_off) = res; + *(reinterpret_cast(indices.ptr()) + x_off) = idx; + } + }, + out, indices); } -void poolingMxN_fp32_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void poolingMxN_fp32_neon_nhwc(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { - if((pool_info.pool_type == PoolingType::MAX) && pool_info.use_kernel_indices && (dst1 != nullptr)) + if ((pool_info.pool_type == 
PoolingType::MAX) && pool_info.use_kernel_indices && (dst1 != nullptr)) { poolingMxN_fp32_neon_nhwc_kernel_indices(src, dst0, dst1, pool_info, window); } - else if(pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && !pool_info.pad_stride_info.has_padding() && (dst1 != nullptr)) + else if (pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && + !pool_info.pad_stride_info.has_padding() && (dst1 != nullptr)) { pooling2_f32_maxpool_indices(src, dst0, dst1, pool_info, window_src, window); } @@ -280,153 +307,174 @@ void poolingMxN_fp32_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, Iterator in(src, window_src); Iterator out(dst0, window_out); - const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; + const int pool_size_x = + pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; + const int pool_size_y = + pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right); const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 
0 : pool_pad_bottom); const float min_value = get_initial_min(pool_info.use_inf_as_limit); float32x4_t vres; - execute_window_loop(window_out, [&](const Coordinates & id) - { - const int idx_width = id.y() * pool_stride_x; - const int idx_height = id.z() * pool_stride_y; - const int pool_limit_y = pool_pad_top - idx_height; - const int pool_limit_x = pool_pad_left - idx_width; - - const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); - const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y); - const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); - const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x); - - int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) + execute_window_loop( + window_out, + [&](const Coordinates &id) { - if(pool_info.pool_type != PoolingType::MAX) + const int idx_width = id.y() * pool_stride_x; + const int idx_height = id.z() * pool_stride_y; + const int pool_limit_y = pool_pad_top - idx_height; + const int pool_limit_x = pool_pad_left - idx_width; + + const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); + const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y); + const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); + const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x); + + int x_off = window_start_x; + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) { - // Calculate scale - const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float32x4_t scale_v = vdupq_n_f32(scale); + if (pool_info.pool_type != PoolingType::MAX) + { + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float32x4_t scale_v = vdupq_n_f32(scale); - // Perform pooling - vres = vdupq_n_f32(0.0f); + // Perform pooling + vres = vdupq_n_f32(0.0f); - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) + for (int y = pool_start_y; y < pool_end_y; ++y) { - const float32x4_t data = vld1q_f32(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - - // Get power of 2 in case of l2 pooling and accumulate - if(pool_info.pool_type == PoolingType::L2) - { - vres = vmlaq_f32(vres, data, data); - } - else + for (int x = pool_start_x; x < pool_end_x; ++x) { - vres = vaddq_f32(vres, data); + const float32x4_t data = vld1q_f32( + reinterpret_cast( + in.ptr() + + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z())) + + x_off); + + // Get power of 2 in case of l2 pooling and accumulate + if (pool_info.pool_type == PoolingType::L2) + { + vres = vmlaq_f32(vres, data, data); + } + else + { + vres = vaddq_f32(vres, data); + } } } + // Divide by scale + vres = vmulq_f32(vres, scale_v); } - // Divide by scale - vres = vmulq_f32(vres, scale_v); - } - else - { - vres = vdupq_n_f32(min_value); - for(int y = pool_start_y; y < pool_end_y; 
++y) + else { - for(int x = pool_start_x; x < pool_end_x; ++x) + vres = vdupq_n_f32(min_value); + for (int y = pool_start_y; y < pool_end_y; ++y) { - const float32x4_t data = vld1q_f32(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - vres = vmaxq_f32(vres, data); + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const float32x4_t data = vld1q_f32( + reinterpret_cast( + in.ptr() + + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z())) + + x_off); + vres = vmaxq_f32(vres, data); + } } } - } - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - float32x4_t l2_res = { static_cast(sqrt(vgetq_lane_f32(vres, 0))), - static_cast(sqrt(vgetq_lane_f32(vres, 1))), - static_cast(sqrt(vgetq_lane_f32(vres, 2))), - static_cast(sqrt(vgetq_lane_f32(vres, 3))) - }; - vres = l2_res; - } - - // Store result - vst1q_f32(reinterpret_cast(out.ptr()) + x_off, vres); - } + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + float32x4_t l2_res = {static_cast(sqrt(vgetq_lane_f32(vres, 0))), + static_cast(sqrt(vgetq_lane_f32(vres, 1))), + static_cast(sqrt(vgetq_lane_f32(vres, 2))), + static_cast(sqrt(vgetq_lane_f32(vres, 3)))}; + vres = l2_res; + } - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - float res = 0.0f; + // Store result + vst1q_f32(reinterpret_cast(out.ptr()) + x_off, vres); + } - if(pool_info.pool_type != PoolingType::MAX) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - // Calculate scale - const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); + float res = 0.0f; - for(int y = pool_start_y; y < pool_end_y; ++y) + if (pool_info.pool_type != PoolingType::MAX) { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const float data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - // Get power of 2 in case of l2 pooling and accumulate - if(pool_info.pool_type == PoolingType::L2) + for (int y = pool_start_y; y < pool_end_y; ++y) + { + for (int x = pool_start_x; x < pool_end_x; ++x) { - res += data * data; + const float data = + *(reinterpret_cast( + in.ptr() + + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z())) + + x_off); + + // Get power of 2 in case of l2 pooling and accumulate + if (pool_info.pool_type == PoolingType::L2) + { + res += data * data; + } + else + { + res += data; + } } - else + } + + // Divide by scale + res *= scale; + } + else + { + res = min_value; + for (int y = pool_start_y; y < pool_end_y; ++y) + { + for (int x = pool_start_x; x < pool_end_x; ++x) { - res += data; + const float data = + *(reinterpret_cast( + in.ptr() + + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (y - 
pool_pad_top) * static_cast(src->info()->strides_in_bytes().z())) + + x_off); + res = std::max(res, data); } } } - // Divide by scale - res *= scale; - } - else - { - res = min_value; - for(int y = pool_start_y; y < pool_end_y; ++y) + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const float data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - res = std::max(res, data); - } + res = std::sqrt(res); } - } - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - res = std::sqrt(res); + // Store result + *(reinterpret_cast(out.ptr()) + x_off) = res; } - - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - } - }, - in, out); + }, + in, out); } } } // namespace cpu diff --git a/src/cpu/kernels/pool2d/neon/impl.h b/src/cpu/kernels/pool2d/neon/impl.h new file mode 100644 index 0000000000000000000000000000000000000000..008cf651e136d378ad8aa76df3c10150b818f8bf --- /dev/null +++ b/src/cpu/kernels/pool2d/neon/impl.h @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#ifndef ACL_SRC_CPU_KERNELS_POOL2D_NEON_IMPL_H
+#define ACL_SRC_CPU_KERNELS_POOL2D_NEON_IMPL_H
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+#include "src/cpu/kernels/pool2d/neon/list.h"
+
+#include <arm_neon.h>
+
+#ifdef ENABLE_NCHW_KERNELS
+namespace arm_compute
+{
+namespace cpu
+{
+
+namespace
+{
+template <typename T>
+auto read_2_boundary_aware_as_f32(int srcw, int srch, int pad_l, int pad_t, int x, int y, const T *ptr, T fval)
+{
+    T vec[2];
+    const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t)));
+    for (int i = 0; i < 2; i++)
+    {
+        if (row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
+        {
+            vec[i] = *(ptr + i);
+        }
+        else
+        {
+            vec[i] = fval;
+        }
+    }
+    float32_t vec_f32[2] = {vec[0], vec[1]};
+    return wrapper::vload(vec_f32);
+}
+} // namespace
+
+template <typename T>
+void pooling2_nchw_maxpool_indices(const ITensor    *src,
+                                   ITensor          *dst0,
+                                   ITensor          *dst1,
+                                   PoolingLayerInfo &pool_info,
+                                   const Window     &window_src,
+                                   const Window     &window)
+{
+    Iterator  in(src, window_src);
+    Iterator  out(dst0, window);
+    Iterator  indices(dst1, window);
+    const int pool_pad_top  = pool_info.pad_stride_info.pad_top();
+    const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+    int       pool_stride_x = 0;
+    int       pool_stride_y = 0;
+    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
+    const int src_w = src->info()->dimension(0);
+    const int src_h = src->info()->dimension(1);
+    const uint8_t *const src_top_ptr =
+        src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+    const uint8_t *const src_bottom_ptr =
+        src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+    const int pad_left    = src->info()->padding().left;
+    const int pad_right   = src->info()->padding().right;
+    const int in_stride_y = static_cast<int>(src->info()->strides_in_bytes().y());
+    const T   float_min   = get_initial_min<T>(pool_info.use_inf_as_limit);
+    const T   fill_value  = (pool_info.pool_type == PoolingType::MAX) ? float_min : 0.f;
+
+    execute_window_loop(
+        window,
+        [&](const Coordinates &id)
+        {
+            const auto x_val   = id.x() * pool_stride_x;
+            const auto y_val_0 = id.y() * pool_stride_y;
+            const auto y_val_1 = (id.y() * pool_stride_y) + 1;
+            auto top_data =
+                read_2_boundary_aware_as_f32(src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_0,
+                                             reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
+            auto bottom_data =
+                read_2_boundary_aware_as_f32(src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_1,
+                                             reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);
+
+            // Calculate max data, compare top first, then bottom, to make sure the first max is recorded.
+            const float32x2_t max_data_top    = vpmax_f32(top_data, top_data);
+            const float32x2_t max_data_bottom = vpmax_f32(bottom_data, bottom_data);
+            const float32x2_t max_data        = vmax_f32(max_data_top, max_data_bottom);
+            *(reinterpret_cast<T *>(out.ptr())) = static_cast<T>(vget_lane_f32(max_data, 0));
+
+            // Calculate max data index, which will be used in max unpool.
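+            // The lines below derive the flat element index of the 2x2 maximum inside the
+            // *unpadded* source tensor, which is what the max-unpooling kernel consumes:
+            //  - offset_no_padding() turns the padded byte offset of the current read position
+            //    into the byte offset the element would have if the tensor carried no padding,
+            //    and dividing by sizeof(T) gives the element index of the left top-row candidate.
+            //  - offset_bottom moves one row down (in_stride_y / sizeof(T)) and subtracts the
+            //    pad_left + pad_right elements that do not exist in the unpadded layout.
+            //  - Each vcge/vbsl pair keeps, lane by lane, the index belonging to the larger value
+            //    of a horizontal pair, and the final vbsl/vget_lane_u32 selects between the top
+            //    and bottom winners to yield the index of the overall maximum.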
+        const uint32_t offset_base =
+            offset_no_padding<T>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NCHW);
+        const uint32_t   offset_top     = (uint32_t)(offset_base / sizeof(T));
+        const uint32_t   offset_bottom  = offset_top + in_stride_y / sizeof(T) - pad_right - pad_left;
+        const uint32x2_t voffset_top    = {offset_top, offset_top + 1u};
+        const uint32x2_t voffset_bottom = {offset_bottom, offset_bottom + 1u};
+        const uint32x2_t tmp_indices_top =
+            vbsl_u32(vcge_f32(top_data, vrev64_f32(top_data)), voffset_top, vrev64_u32(voffset_top));
+        const uint32x2_t tmp_indices_bottom =
+            vbsl_u32(vcge_f32(bottom_data, vrev64_f32(bottom_data)), voffset_bottom, vrev64_u32(voffset_bottom));
+        *(reinterpret_cast<uint32_t *>(indices.ptr())) = vget_lane_u32(
+            vbsl_u32(vcge_f32(max_data_top, max_data_bottom), tmp_indices_top, tmp_indices_bottom), 0);
+    },
+    in, out, indices);
+}
+
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ENABLE_NCHW_KERNELS
+
+#endif // ACL_SRC_CPU_KERNELS_POOL2D_NEON_IMPL_H
diff --git a/src/cpu/kernels/pool2d/neon/list.h b/src/cpu/kernels/pool2d/neon/list.h
index eb141d6fcd6b102a2eca50ad25603377a3c7f429..5db843d56b7b60d2685de87aadbf0e518bd79af1 100644
--- a/src/cpu/kernels/pool2d/neon/list.h
+++ b/src/cpu/kernels/pool2d/neon/list.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,21 +21,24 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef SRC_CORE_NEON_KERNELS_POOLING_LIST_H
-#define SRC_CORE_NEON_KERNELS_POOLING_LIST_H
+#ifndef ACL_SRC_CPU_KERNELS_POOL2D_NEON_LIST_H
+#define ACL_SRC_CPU_KERNELS_POOL2D_NEON_LIST_H
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/Traits.h"
+
 #include "src/core/NEON/wrapper/wrapper.h"
 #include "src/cpu/kernels/pool2d/neon/quantized.h"
+
 #include <arm_neon.h>
 
 namespace arm_compute
 {
 namespace cpu
 {
-#define DECLARE_POOLING_KERNEL(func_name) \
-    void func_name(const ITensor *src0, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &, const Window &window_src, const Window &window)
+#define DECLARE_POOLING_KERNEL(func_name)                                                                            \
+    void func_name(const ITensor *src0, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &, const Window &window_src, \
+                   const Window &window)
 
 DECLARE_POOLING_KERNEL(poolingMxN_qasymm8_neon_nhwc);
 DECLARE_POOLING_KERNEL(poolingMxN_qasymm8_signed_neon_nhwc);
@@ -44,11 +47,11 @@
 DECLARE_POOLING_KERNEL(poolingMxN_fp32_neon_nhwc);
 
 #if defined(ENABLE_NCHW_KERNELS)
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+#if defined(ENABLE_FP16_KERNELS)
 DECLARE_POOLING_KERNEL(pooling2_fp16_neon_nchw);
 DECLARE_POOLING_KERNEL(pooling3_fp16_neon_nchw);
 DECLARE_POOLING_KERNEL(poolingMxN_fp16_neon_nchw);
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
+#endif /* defined(ENABLE_FP16_KERNELS) */
 
 DECLARE_POOLING_KERNEL(pooling2_fp32_neon_nchw);
 DECLARE_POOLING_KERNEL(pooling3_fp32_neon_nchw);
@@ -65,7 +68,12 @@ T get_initial_min(bool use_inf_as_limit)
 }
 
 template <typename T>
-inline uint32_t offset_no_padding(uint32_t padded_offset, const Coordinates &id, const ITensorInfo &info, int pool_stride_x, int pool_stride_y, DataLayout data_layout)
+inline uint32_t offset_no_padding(uint32_t           padded_offset,
+                                  const Coordinates &id,
+                                  const ITensorInfo &info,
+                                  int                pool_stride_x,
+                                  int                pool_stride_y,
+                                  DataLayout         data_layout)
 {
     const int pad_left  = info.padding().left;
     const int pad_right = info.padding().right;
@@ -76,22 +84,24 @@ inline uint32_t offset_no_padding(uint32_t padded_offset, const Coordinates &id,
     const int pad_horiz = pad_left + pad_right;
     const int pad_vert  = pad_top + pad_bottom;
 
-    if(data_layout == DataLayout::NCHW)
+    if (data_layout == DataLayout::NCHW)
     {
-        const uint32_t offset_base = padded_offset
-                                     - sizeof(T) * pad_horiz * id.y() * pool_stride_y /* subtract padding elems per row */
-                                     - pad_top * sizeof(T) /* top padding */
-                                     - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() - pad_vert * in_stride_y * id.z() /* for each Z plane there are height*pad_right padding elems */
-                                     - in_stride_w * id[3];
+        const uint32_t offset_base =
+            padded_offset - sizeof(T) * pad_horiz * id.y() * pool_stride_y /* subtract padding elems per row */
+            - pad_top * sizeof(T)                                          /* top padding */
+            - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() -
+            pad_vert * in_stride_y * id.z() /* for each Z plane there are height*pad_right padding elems */
+            - in_stride_w * id[3];
 
         return offset_base;
    }
    else
    {
-        const uint32_t offset_base = padded_offset
-                                     - sizeof(T) * pad_horiz * id.y() * pool_stride_x // subtract padding elems per row
-                                     - pad_top * sizeof(T) // top padding
-                                     - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() * pool_stride_y // for each Z plane there are width*pad_right padding elems
+        const uint32_t offset_base = padded_offset -
+                                     sizeof(T) * pad_horiz * id.y() * pool_stride_x // subtract padding elems per row
+                                     - pad_top * sizeof(T)                          // top padding
+                                     - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() *
+                                         pool_stride_y // for each Z plane there are width*pad_right padding elems
                                      - in_stride_w * id[3];
 
         return offset_base;
@@ -100,4 +110,4 @@ inline uint32_t offset_no_padding(uint32_t padded_offset, const Coordinates &id,
 } // namespace cpu
 } // namespace arm_compute
 
-#endif // SRC_CORE_NEON_KERNELS_POOLING_LIST_H
\ No newline at end of file
+#endif // ACL_SRC_CPU_KERNELS_POOL2D_NEON_LIST_H
diff --git a/src/cpu/kernels/pool2d/neon/nchw/all.cpp b/src/cpu/kernels/pool2d/neon/nchw/all.cpp
index c342b96426649f24768fa5bd8ecb892edd858b86..0602bea66776b26159f08aff96fe0b7e9d998b45 100644
--- a/src/cpu/kernels/pool2d/neon/nchw/all.cpp
+++ b/src/cpu/kernels/pool2d/neon/nchw/all.cpp
@@ -25,9 +25,12 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+#include "src/cpu/kernels/pool2d/neon/impl.h"
 #include "src/cpu/kernels/pool2d/neon/list.h"
+
 #include <arm_neon.h>
 
 #ifdef ENABLE_NCHW_KERNELS
@@ -38,15 +41,19 @@ namespace cpu
 #define READ_2_RIGHT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \
     (x == width + pad_left - 1) ? vset_lane_f32(*(ptr), vdup_n_f32(fval), 0) : vld1_f32(ptr)
 #define READ_2_LEFT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \
-    (x == pad_left - 1) ? vset_lane_f32(*(1 + ptr), vdup_n_f32(fval), 1) : READ_2_RIGHT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval)
-#define READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \
-    ((y < pad_top) || (x < pad_left - 1) || (y >= height + pad_top) || (x > width + pad_left - 1)) ? vdup_n_f32(fval) : READ_2_LEFT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval)
+    (x == pad_left - 1) ?
vset_lane_f32(*(1 + ptr), vdup_n_f32(fval), 1) \ + : READ_2_RIGHT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) +#define READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \ + ((y < pad_top) || (x < pad_left - 1) || (y >= height + pad_top) || (x > width + pad_left - 1)) \ + ? vdup_n_f32(fval) \ + : READ_2_LEFT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) #define READ_4_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \ vcombine_f32(READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval), \ READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, (x + 2), y, (ptr + 2), fval)) -float32x4x2_t read_8_boundary_aware(int height, int width, int pad_left, int pad_top, int x, int y, const float *ptr, float fval) +float32x4x2_t +read_8_boundary_aware(int height, int width, int pad_left, int pad_top, int x, int y, const float *ptr, float fval) { float32x4x2_t vec; vec.val[0] = READ_4_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval); @@ -54,509 +61,280 @@ float32x4x2_t read_8_boundary_aware(int height, int width, int pad_left, int pad return vec; } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -float16x4_t read_4_boundary_aware_fp16(int srcw, int srch, int pad_l, int pad_t, int x, int y, const float16_t *ptr, float16_t fval) -{ - float16_t vec[4]; - const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t))); - for(int i = 0; i < 4; i++) - { - if(row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l))) - { - vec[i] = *(ptr + i); - } - else - { - vec[i] = fval; - } - } - return wrapper::vload(vec); -} - -void pooling3_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void poolingMxN_fp32_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { ARM_COMPUTE_UNUSED(dst1); - Iterator in(src, window_src); Iterator out(dst0, window); - constexpr const int pool_size = 3; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int src_w = src->info()->dimension(0); - const int src_h = src->info()->dimension(1); - const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - const float16_t fp16_min = get_initial_min(pool_info.use_inf_as_limit); - const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? 
fp16_min : 0.f; - const unsigned char *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); - const unsigned char *const src_middle_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); - const unsigned char *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 2)); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto x_val = id.x() * pool_stride_x; - const auto y_val_0 = id.y() * pool_stride_y; - const auto y_val_1 = (id.y() * pool_stride_y) + 1; - const auto y_val_2 = (id.y() * pool_stride_y) + 2; - float16x4_t top_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, - x_val, y_val_0, reinterpret_cast(src_top_ptr + in.offset()), fill_value); - float16x4_t middle_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, - x_val, y_val_1, reinterpret_cast(src_middle_ptr + in.offset()), fill_value); - float16x4_t bottom_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, - x_val, y_val_2, reinterpret_cast(src_bottom_ptr + in.offset()), fill_value); - float16x4_t res = {}; - - // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - top_data = vmul_f16(top_data, top_data); - middle_data = vmul_f16(middle_data, middle_data); - bottom_data = vmul_f16(bottom_data, bottom_data); - } - - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float16x4_t scale_v = vdup_n_f16(scale); - // Perform pooling - const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data); - res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data); - res = vmul_f16(vpadd_f16(res, res), scale_v); - } - else + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const float min_value = get_initial_min(pool_info.use_inf_as_limit); + const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? 
min_value : 0.0f; + + execute_window_loop( + window, + [&](const Coordinates &id) { - const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data); - res = vpmax_f16(vset_lane_f16(fp16_min, max_data, 3), max_data); - res = vpmax_f16(res, res); - } + float res = 0.0f; - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - res = vsqrt_f16(res); - } - - *(reinterpret_cast(out.ptr())) = vget_lane_f16(res, 0); - }, - in, out); -} - -template -inline typename std::enable_if::value, float32x2_t>::type -f16_to_f32(float16x4_t in) -{ - float32x2_t out = { static_cast(vget_lane_f16(in, 0)), static_cast(vget_lane_f16(in, 1)) }; - return out; -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + if (pool_info.pool_type != PoolingType::MAX) + { + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); -template -inline typename std::enable_if::value, float32x2_t>::type -f16_to_f32(float32x2_t in) -{ - return in; -} + // Perform pooling + for (int y = 0; y < pool_size_y; ++y) + { + for (int x = 0; x < pool_size_x; ++x) + { + const auto ptr = reinterpret_cast( + in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().y())); -template -auto read_2_boundary_aware(int srcw, int srch, int pad_l, int pad_t, int x, int y, const T *ptr, T fval) -{ - T vec[2]; - const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t))); - for(int i = 0; i < 2; i++) - { - if(row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l))) - { - vec[i] = *(ptr + i); - } - else - { - vec[i] = fval; - } - } - return wrapper::vload(vec); -} + const int idx = x + id.x() * pool_stride_x - pool_pad_left; + const int idy = y + id.y() * pool_stride_y - pool_pad_top; + float data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr; -template -void pooling2_nchw_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - Iterator in(src, window_src); - Iterator out(dst0, window); - Iterator indices(dst1, window); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int src_w = src->info()->dimension(0); - const int src_h = src->info()->dimension(1); - const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); - const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); - const int pad_left = src->info()->padding().left; - const int pad_right = src->info()->padding().right; - const int in_stride_y = static_cast(src->info()->strides_in_bytes().y()); - const T float_min = get_initial_min(pool_info.use_inf_as_limit); - const T fill_value = (pool_info.pool_type == PoolingType::MAX) ? 
float_min : 0.f; - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto x_val = id.x() * pool_stride_x; - const auto y_val_0 = id.y() * pool_stride_y; - const auto y_val_1 = (id.y() * pool_stride_y) + 1; - auto top_data = read_2_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_top, - x_val, y_val_0, reinterpret_cast(src_top_ptr + in.offset()), fill_value); - auto bottom_data = read_2_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_top, - x_val, y_val_1, reinterpret_cast(src_bottom_ptr + in.offset()), fill_value); - float32x2_t top_data_f32 = f16_to_f32(top_data); - float32x2_t bottom_data_f32 = f16_to_f32(bottom_data); - - // Calculate max data, compare top first, then bottom, to make sue the first max is recorded. - const float32x2_t max_data_top = vpmax_f32(top_data_f32, top_data_f32); - const float32x2_t max_data_bottom = vpmax_f32(bottom_data_f32, bottom_data_f32); - const float32x2_t max_data = vmax_f32(max_data_top, max_data_bottom); - *(reinterpret_cast(out.ptr())) = static_cast(vget_lane_f32(max_data, 0)); - - // Calculate max data indice, which will be used in max unpool. - const uint32_t offset_base = offset_no_padding(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NCHW); - const uint32_t offset_top = (uint32_t)(offset_base / sizeof(T)); - const uint32_t offset_bottom = offset_top + in_stride_y / sizeof(T) - pad_right - pad_left; - const uint32x2_t voffset_top = { offset_top, offset_top + 1u }; - const uint32x2_t voffset_bottom = { offset_bottom, offset_bottom + 1u }; - const uint32x2_t tmp_indices_top = vbsl_u32(vcge_f32(top_data_f32, vrev64_f32(top_data_f32)), voffset_top, vrev64_u32(voffset_top)); - const uint32x2_t tmp_indices_bottom = vbsl_u32(vcge_f32(bottom_data_f32, vrev64_f32(bottom_data_f32)), voffset_bottom, vrev64_u32(voffset_bottom)); - *(reinterpret_cast(indices.ptr())) = vget_lane_u32(vbsl_u32(vcge_f32(max_data_top, max_data_bottom), tmp_indices_top, tmp_indices_bottom), 0); - }, - in, out, indices); -} + if (pool_info.pool_type == PoolingType::L2) + { + data *= data; + } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -void pooling2_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - if(pool_info.pool_type == PoolingType::MAX && dst1) - { - pooling2_nchw_maxpool_indices(src, dst0, dst1, pool_info, window_src, window); - } - else - { - Iterator in(src, window_src); - Iterator out(dst0, window); - constexpr int pool_size = 2; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x, pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int src_w = src->info()->dimension(0); - const int src_h = src->info()->dimension(1); - const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - const float16_t fp16_min = get_initial_min(pool_info.use_inf_as_limit); - const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? 
fp16_min : 0.0f; - - const unsigned char *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); - const unsigned char *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto in_top_ptr = reinterpret_cast(src_top_ptr + in.offset()); - const auto in_bottom_ptr = reinterpret_cast(src_bottom_ptr + in.offset()); - - const auto x_val = id.x() * pool_stride_x; - const auto y_val_0 = id.y() * pool_stride_y; - const auto y_val_1 = (id.y() * pool_stride_y) + 1; - float16x4_t top_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, - x_val, y_val_0, in_top_ptr, fill_value); - float16x4_t bottom_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, - x_val, y_val_1, in_bottom_ptr, fill_value); - float16x4_t res = {}; + res += data; + } + } - // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - top_data = vmul_f16(top_data, top_data); - bottom_data = vmul_f16(bottom_data, bottom_data); + // Divide by scale + res *= scale; } - - if(pool_info.pool_type != PoolingType::MAX) + else // if max pooling { - const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float16x4_t scale_v = vdup_n_f16(scale); + res = min_value; - const float16x4_t sum_data = vadd_f16(top_data, bottom_data); - res = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v); - } - else - { - const float16x4_t max_data = vmax_f16(top_data, bottom_data); - res = vpmax_f16(max_data, max_data); + for (int y = 0; y < pool_size_y; ++y) + { + for (int x = 0; x < pool_size_x; ++x) + { + const auto ptr = reinterpret_cast( + in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().y())); + + const int idx = x + id.x() * pool_stride_x - pool_pad_left; + const int idy = y + id.y() * pool_stride_y - pool_pad_top; + float data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr; + res = std::max(res, data); + } + } } // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) + if (pool_info.pool_type == PoolingType::L2) { - res = vsqrt_f16(res); + res = std::sqrt(res); } // Store result - *(reinterpret_cast(out.ptr())) = vget_lane_f16(res, 0); + *(reinterpret_cast(out.ptr())) = res; }, in, out); - } } -void poolingMxN_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void pooling2_fp32_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { - ARM_COMPUTE_UNUSED(dst1); - Iterator in(src, window_src); - Iterator out(dst0, window); - - const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? 
src->info()->tensor_shape().y() : pool_info.pool_size.height; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int src_w = src->info()->dimension(0); - const int src_h = src->info()->dimension(1); - const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - const float16_t fp16_min = get_initial_min(pool_info.use_inf_as_limit); - const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f; - - execute_window_loop(window, [&](const Coordinates & id) + if (pool_info.pool_type == PoolingType::MAX && dst1) { - float16_t res = 0.0f; - - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - const float16_t scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - - // Perform pooling - for(int y = 0; y < pool_size_y; ++y) + pooling2_nchw_maxpool_indices(src, dst0, dst1, pool_info, window_src, window); + } + else + { + Iterator in(src, window_src); + Iterator out(dst0, window); + constexpr int pool_size = 2; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const float min_value = get_initial_min(pool_info.use_inf_as_limit); + const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f; + + const uint8_t *const src_top_ptr = + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); + const uint8_t *const src_bottom_ptr = + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); + + execute_window_loop( + window, + [&](const Coordinates &id) { - for(int x = 0; x < pool_size_x; ++x) - { - const auto ptr = reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) - + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().y())); - - const int idx = x + id.x() * pool_stride_x - pool_pad_left; - const int idy = y + id.y() * pool_stride_y - pool_pad_top; - float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? 
fill_value : *ptr; + const auto in_top_ptr = reinterpret_cast(src_top_ptr + in.offset()); + const auto in_bottom_ptr = reinterpret_cast(src_bottom_ptr + in.offset()); + + const auto x_val = id.x() * pool_stride_x; + const auto y_val_0 = id.y() * pool_stride_y; + const auto y_val_1 = (id.y() * pool_stride_y) + 1; + auto top_data = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0, + in_top_ptr, fill_value); + auto bottom_data = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1, + in_bottom_ptr, fill_value); + float32x2_t res = {}; + float final_res = 0; - if(pool_info.pool_type == PoolingType::L2) - { - data *= data; - } - - res += data; + // Get power of 2 in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + top_data = vmul_f32(top_data, top_data); + bottom_data = vmul_f32(bottom_data, bottom_data); } - } - // Divide by scale - res *= scale; - } - else // if max pooling - { - res = fp16_min; - - for(int y = 0; y < pool_size_y; ++y) - { - for(int x = 0; x < pool_size_x; ++x) + if (pool_info.pool_type != PoolingType::MAX) { - const auto ptr = reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) - + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().y())); - - const int idx = x + id.x() * pool_stride_x - pool_pad_left; - const int idy = y + id.y() * pool_stride_y - pool_pad_top; - float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr; - res = std::max(res, data); + // Calculate scale + float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, + pool_size, upper_bound_w, upper_bound_h, pool_pad_left, + pool_pad_top, pool_stride_x, pool_stride_y); + const float32x2_t scale_v = vdup_n_f32(scale); + + // Perform pooling + const float32x2_t sum_data = vadd_f32(top_data, bottom_data); + res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v); } - } - } - - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - res = std::sqrt(res); - } - - // Store result - *(reinterpret_cast(out.ptr())) = res; - }, - in, out); -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -void poolingMxN_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - ARM_COMPUTE_UNUSED(dst1); - Iterator in(src, window_src); - Iterator out(dst0, window); - - const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int src_w = src->info()->dimension(0); - const int src_h = src->info()->dimension(1); - const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - const float min_value = get_initial_min(pool_info.use_inf_as_limit); - const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? 
min_value : 0.0f; - - execute_window_loop(window, [&](const Coordinates & id) - { - float res = 0.0f; - - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - - // Perform pooling - for(int y = 0; y < pool_size_y; ++y) - { - for(int x = 0; x < pool_size_x; ++x) + else { - const auto ptr = reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) - + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().y())); - - const int idx = x + id.x() * pool_stride_x - pool_pad_left; - const int idy = y + id.y() * pool_stride_y - pool_pad_top; - float data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr; - - if(pool_info.pool_type == PoolingType::L2) - { - data *= data; - } - - res += data; + const float32x2_t max_data = vmax_f32(top_data, bottom_data); + res = vpmax_f32(max_data, max_data); } - } + final_res = vget_lane_f32(res, 0); - // Divide by scale - res *= scale; - } - else // if max pooling - { - res = min_value; - - for(int y = 0; y < pool_size_y; ++y) - { - for(int x = 0; x < pool_size_x; ++x) + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) { - const auto ptr = reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) - + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().y())); - - const int idx = x + id.x() * pool_stride_x - pool_pad_left; - const int idy = y + id.y() * pool_stride_y - pool_pad_top; - float data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr; - res = std::max(res, data); + final_res = sqrt(final_res); } - } - } - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - res = std::sqrt(res); - } - - // Store result - *(reinterpret_cast(out.ptr())) = res; - }, - in, out); + // Store result + *(reinterpret_cast(out.ptr())) = final_res; + }, + in, out); + } } -void pooling2_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void pooling3_fp32_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { - if(pool_info.pool_type == PoolingType::MAX && dst1) - { - pooling2_nchw_maxpool_indices(src, dst0, dst1, pool_info, window_src, window); - } - else - { - Iterator in(src, window_src); - Iterator out(dst0, window); - constexpr int pool_size = 2; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int src_w = src->info()->dimension(0); - const int src_h = src->info()->dimension(1); - const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src_h + (pool_info.exclude_padding ? 
0 : pool_pad_bottom); - const float min_value = get_initial_min(pool_info.use_inf_as_limit); - const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f; - - const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); - const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); + ARM_COMPUTE_UNUSED(dst1); + Iterator in(src, window_src); + Iterator out(dst0, window); - execute_window_loop(window, [&](const Coordinates & id) + constexpr const int pool_size = 3; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const float min_value = get_initial_min(pool_info.use_inf_as_limit); + const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f; + + const uint8_t *const src_top_ptr = + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); + const uint8_t *const src_middle_ptr = + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); + const uint8_t *const src_bottom_ptr = + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 2)); + + execute_window_loop( + window, + [&](const Coordinates &id) { const auto in_top_ptr = reinterpret_cast(src_top_ptr + in.offset()); + const auto in_middle_ptr = reinterpret_cast(src_middle_ptr + in.offset()); const auto in_bottom_ptr = reinterpret_cast(src_bottom_ptr + in.offset()); - const auto x_val = id.x() * pool_stride_x; - const auto y_val_0 = id.y() * pool_stride_y; - const auto y_val_1 = (id.y() * pool_stride_y) + 1; - auto top_data = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0, in_top_ptr, fill_value); - auto bottom_data = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1, in_bottom_ptr, fill_value); - float32x2_t res = {}; - float final_res = 0; + const auto x_val = id.x() * pool_stride_x; + const auto y_val_0 = id.y() * pool_stride_y; + const auto y_val_1 = (id.y() * pool_stride_y) + 1; + const auto y_val_2 = (id.y() * pool_stride_y) + 2; + auto top_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0, in_top_ptr, + fill_value); + auto middle_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1, + in_middle_ptr, fill_value); + auto bottom_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_2, + in_bottom_ptr, fill_value); + + float32x2_t res = {}; + float final_res = 0; // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) + if (pool_info.pool_type == PoolingType::L2) { - top_data = vmul_f32(top_data, top_data); - bottom_data = vmul_f32(bottom_data, bottom_data); + top_data = vmulq_f32(top_data, top_data); + middle_data = vmulq_f32(middle_data, 
middle_data); + bottom_data = vmulq_f32(bottom_data, bottom_data); } - if(pool_info.pool_type != PoolingType::MAX) + if (pool_info.pool_type != PoolingType::MAX) { // Calculate scale - float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); + float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, + pool_size, upper_bound_w, upper_bound_h, pool_pad_left, + pool_pad_top, pool_stride_x, pool_stride_y); const float32x2_t scale_v = vdup_n_f32(scale); // Perform pooling - const float32x2_t sum_data = vadd_f32(top_data, bottom_data); - res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v); + const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data); + res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data)); + res = vmul_f32(vpadd_f32(res, res), scale_v); } else { - const float32x2_t max_data = vmax_f32(top_data, bottom_data); - res = vpmax_f32(max_data, max_data); + const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data); + res = vpmax_f32(vget_high_f32(vsetq_lane_f32(min_value, max_data, 3)), vget_low_f32(max_data)); + res = vpmax_f32(res, res); } final_res = vget_lane_f32(res, 0); // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) + if (pool_info.pool_type == PoolingType::L2) { final_res = sqrt(final_res); } @@ -565,191 +343,120 @@ void pooling2_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, P *(reinterpret_cast(out.ptr())) = final_res; }, in, out); - } -} - -void pooling3_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - ARM_COMPUTE_UNUSED(dst1); - Iterator in(src, window_src); - Iterator out(dst0, window); - - constexpr const int pool_size = 3; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int src_w = src->info()->dimension(0); - const int src_h = src->info()->dimension(1); - const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - const float min_value = get_initial_min(pool_info.use_inf_as_limit); - const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? 
min_value : 0.0f; - - const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); - const uint8_t *const src_middle_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); - const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 2)); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto in_top_ptr = reinterpret_cast(src_top_ptr + in.offset()); - const auto in_middle_ptr = reinterpret_cast(src_middle_ptr + in.offset()); - const auto in_bottom_ptr = reinterpret_cast(src_bottom_ptr + in.offset()); - - const auto x_val = id.x() * pool_stride_x; - const auto y_val_0 = id.y() * pool_stride_y; - const auto y_val_1 = (id.y() * pool_stride_y) + 1; - const auto y_val_2 = (id.y() * pool_stride_y) + 2; - auto top_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0, in_top_ptr, fill_value); - auto middle_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1, in_middle_ptr, fill_value); - auto bottom_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_2, in_bottom_ptr, fill_value); - - float32x2_t res = {}; - float final_res = 0; - - // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - top_data = vmulq_f32(top_data, top_data); - middle_data = vmulq_f32(middle_data, middle_data); - bottom_data = vmulq_f32(bottom_data, bottom_data); - } - - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float32x2_t scale_v = vdup_n_f32(scale); - - // Perform pooling - const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data); - res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data)); - res = vmul_f32(vpadd_f32(res, res), scale_v); - } - else - { - const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data); - res = vpmax_f32(vget_high_f32(vsetq_lane_f32(min_value, max_data, 3)), vget_low_f32(max_data)); - res = vpmax_f32(res, res); - } - final_res = vget_lane_f32(res, 0); - - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - final_res = sqrt(final_res); - } - - // Store result - *(reinterpret_cast(out.ptr())) = final_res; - }, - in, out); } -void pooling7_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void pooling7_fp32_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { ARM_COMPUTE_UNUSED(dst1); Iterator in(src, window_src); Iterator out(dst0, window); - constexpr const int pool_size = 7; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; + constexpr const int pool_size = 7; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top 
= pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int src_w = src->info()->dimension(0); - const int src_h = src->info()->dimension(1); - const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - const float min_value = get_initial_min(pool_info.use_inf_as_limit); - const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f; - - std::array src_ptrs{ {} }; - for(int i = 0; i < pool_size; ++i) + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const float min_value = get_initial_min(pool_info.use_inf_as_limit); + const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f; + + std::array src_ptrs{{}}; + for (int i = 0; i < pool_size; ++i) { - src_ptrs[i] = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + i)); + src_ptrs[i] = + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + i)); } - execute_window_loop(window, [&](const Coordinates & id) - { - auto in_ptr = reinterpret_cast(src_ptrs[0] + in.offset()); - - auto x_val = id.x() * pool_stride_x; - auto y_val = id.y() * pool_stride_y; - float32x4x2_t data = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value); + execute_window_loop( + window, + [&](const Coordinates &id) + { + auto in_ptr = reinterpret_cast(src_ptrs[0] + in.offset()); - float32x2_t res = {}; - float final_res = 0.f; + auto x_val = id.x() * pool_stride_x; + auto y_val = id.y() * pool_stride_y; + float32x4x2_t data = + read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value); - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float32x2_t scale_v = vdup_n_f32(scale); + float32x2_t res = {}; + float final_res = 0.f; - // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - data.val[0] = vmulq_f32(data.val[0], data.val[0]); - data.val[1] = vmulq_f32(data.val[1], data.val[1]); - } - float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3)); - for(int i = 1; i < pool_size; ++i) + if (pool_info.pool_type != PoolingType::MAX) { - in_ptr = reinterpret_cast(src_ptrs[i] + in.offset()); + // Calculate scale + float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, + pool_size, upper_bound_w, upper_bound_h, pool_pad_left, + pool_pad_top, pool_stride_x, pool_stride_y); + const float32x2_t scale_v = vdup_n_f32(scale); - x_val = id.x() * pool_stride_x; - y_val = (id.y() * pool_stride_y) + i; - data = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value); // Get power of 2 in case of l2 pooling - if(pool_info.pool_type 
== PoolingType::L2) + if (pool_info.pool_type == PoolingType::L2) { data.val[0] = vmulq_f32(data.val[0], data.val[0]); data.val[1] = vmulq_f32(data.val[1], data.val[1]); } - sum_data = vaddq_f32(sum_data, data.val[0]); - sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3)); + float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3)); + for (int i = 1; i < pool_size; ++i) + { + in_ptr = reinterpret_cast(src_ptrs[i] + in.offset()); + + x_val = id.x() * pool_stride_x; + y_val = (id.y() * pool_stride_y) + i; + data = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, + fill_value); + // Get power of 2 in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + data.val[0] = vmulq_f32(data.val[0], data.val[0]); + data.val[1] = vmulq_f32(data.val[1], data.val[1]); + } + sum_data = vaddq_f32(sum_data, data.val[0]); + sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3)); + } + res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data)); + res = vmul_f32(vpadd_f32(res, res), scale_v); } - res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data)); - res = vmul_f32(vpadd_f32(res, res), scale_v); - } - else - { - for(int i = 1; i < pool_size; ++i) + else { - in_ptr = reinterpret_cast(src_ptrs[i] + in.offset()); + for (int i = 1; i < pool_size; ++i) + { + in_ptr = reinterpret_cast(src_ptrs[i] + in.offset()); + + x_val = id.x() * pool_stride_x; + y_val = (id.y() * pool_stride_y) + i; + float32x4x2_t temp = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, + in_ptr, fill_value); + data = vmax2q_f32(data, temp); + } + res = vpmax_f32(vget_high_f32(vsetq_lane_f32(min_value, data.val[1], 3)), vget_low_f32(data.val[1])); + res = vpmax_f32(res, vpmax_f32(vget_high_f32(data.val[0]), vget_low_f32(data.val[0]))); + res = vpmax_f32(res, res); + } + final_res = vget_lane_f32(res, 0); - x_val = id.x() * pool_stride_x; - y_val = (id.y() * pool_stride_y) + i; - float32x4x2_t temp = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value); - data = vmax2q_f32(data, temp); + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + final_res = sqrt(final_res); } - res = vpmax_f32(vget_high_f32(vsetq_lane_f32(min_value, data.val[1], 3)), vget_low_f32(data.val[1])); - res = vpmax_f32(res, vpmax_f32(vget_high_f32(data.val[0]), vget_low_f32(data.val[0]))); - res = vpmax_f32(res, res); - } - final_res = vget_lane_f32(res, 0); - - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - final_res = sqrt(final_res); - } - // Store result - *(reinterpret_cast(out.ptr())) = final_res; - }, - in, out); + // Store result + *(reinterpret_cast(out.ptr())) = final_res; + }, + in, out); } } // namespace cpu } // namespace arm_compute -#endif // ENABLE_NCHW_KERNELS \ No newline at end of file +#endif // ENABLE_NCHW_KERNELS diff --git a/src/cpu/kernels/pool2d/neon/qasymm8.cpp b/src/cpu/kernels/pool2d/neon/qasymm8.cpp index 7f8841edd882ddb5c3b30fe343e943357dd7857e..44675b53943f9cedb0e263857858b5320d4d9c71 100644 --- a/src/cpu/kernels/pool2d/neon/qasymm8.cpp +++ b/src/cpu/kernels/pool2d/neon/qasymm8.cpp @@ -25,17 +25,23 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include "src/core/helpers/WindowHelpers.h" +#include 
"src/core/NEON/wrapper/intrinsics/intrinsics.h" #include "src/cpu/kernels/pool2d/neon/list.h" namespace arm_compute { namespace cpu { -void poolingMxN_qasymm8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void poolingMxN_qasymm8_neon_nhwc(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { poolingMxN_q8_neon_nhwc(src, dst0, dst1, pool_info, window_src, window); } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp b/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp index 8643651f27c93f53ffec8f2c83bcbb182a309cc5..d434323e89fbb6a0cf99a33520590e3fa2f2c1f3 100644 --- a/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp @@ -25,17 +25,23 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" #include "src/cpu/kernels/pool2d/neon/list.h" namespace arm_compute { namespace cpu { -void poolingMxN_qasymm8_signed_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void poolingMxN_qasymm8_signed_neon_nhwc(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { poolingMxN_q8_neon_nhwc(src, dst0, dst1, pool_info, window_src, window); } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/pool2d/neon/quantized.h b/src/cpu/kernels/pool2d/neon/quantized.h index a2cd3991be6bacf534260bbc50aec56f1abc6314..38f1b2f1f91c41e51d6f1a4ed600efa784a04b60 100644 --- a/src/cpu/kernels/pool2d/neon/quantized.h +++ b/src/cpu/kernels/pool2d/neon/quantized.h @@ -26,11 +26,13 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" + +#include "src/core/helpers/PoolingHelpers.h" #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/NEFixedPoint.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/PoolingHelpers.h" + #include namespace arm_compute @@ -38,7 +40,12 @@ namespace arm_compute namespace cpu { template -void poolingMxN_q8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void poolingMxN_q8_neon_nhwc(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { ARM_COMPUTE_UNUSED(dst1); @@ -60,15 +67,15 @@ void poolingMxN_q8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, P using q32_t = typename wrapper::traits::promote_t; using q32x4_t = typename wrapper::traits::neon_vector::type; - const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; + const int pool_size_x = pool_info.is_global_pooling ? 
src->info()->tensor_shape().y() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; const int pool_pad_right = pool_info.pad_stride_info.pad_right(); const int pool_pad_top = pool_info.pad_stride_info.pad_top(); const int pool_pad_left = pool_info.pad_stride_info.pad_left(); const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right); const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); @@ -80,233 +87,267 @@ void poolingMxN_q8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, P const float quant_rescale = dst_qinfo.scale / src_qinfo.scale; // "new_offset" doesn't have to consider the "half_scale_v" in its computation // With a requantization performed in a single step there won't be uncertainties introduced - const int32_t new_offset = dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / quant_rescale); + const int32_t new_offset = + dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / quant_rescale); - const float requant_scale = dst_qinfo.scale / src_qinfo.scale; - const int32_t requant_offset = dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / requant_scale); - const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); + const float requant_scale = dst_qinfo.scale / src_qinfo.scale; + const int32_t requant_offset = + dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / requant_scale); + const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); - execute_window_loop(window_out, [&](const Coordinates & id) - { - const int idx_width = id.y() * pool_stride_x; - const int idx_height = id.z() * pool_stride_y; - const int pool_limit_y = pool_pad_top - idx_height; - const int pool_limit_x = pool_pad_left - idx_width; - - const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); - const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y); - const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); - const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x); - - int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) + execute_window_loop( + window_out, + [&](const Coordinates &id) { - if(pool_info.pool_type != PoolingType::MAX) + const int idx_width = id.y() * pool_stride_x; + const int idx_height = id.z() * pool_stride_y; + const int pool_limit_y = pool_pad_top - idx_height; + const int pool_limit_x = pool_pad_left - idx_width; + + const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); + const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y); + const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); + const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x); + + int x_off = window_start_x; + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) { - q32x4_t vres1 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres2 = wrapper::vdup_n(static_cast(0.f), 
wrapper::traits::vector_128_tag{}); - q32x4_t vres3 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres4 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - - // Calculate scale - const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - - // Perform pooling - for(int y = pool_start_y; y < pool_end_y; ++y) + if (pool_info.pool_type != PoolingType::MAX) { - for(int x = pool_start_x; x < pool_end_x; ++x) + q32x4_t vres1 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres2 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres3 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres4 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + + // Perform pooling + for (int y = pool_start_y; y < pool_end_y; ++y) { - const q8x16_t data = wrapper::vloadq(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - - const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data)); - const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data)); - vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16))); - vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16))); - vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16))); - vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16))); + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const q8x16_t data = wrapper::vloadq( + reinterpret_cast( + in.ptr() + + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z())) + + x_off); + + const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data)); + const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data)); + vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16))); + vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16))); + vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16))); + vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16))); + } } - } - if(src_qinfo != dst_qinfo) - { - const float32x4x4_t vres = + if (src_qinfo != dst_qinfo) { - { + const float32x4x4_t vres = {{ vcvtq_f32_q32(vres1), vcvtq_f32_q32(vres2), vcvtq_f32_q32(vres3), vcvtq_f32_q32(vres4), - } - }; - const auto requantized_dst = vrequantize_pooling_with_scale(vres, quant_rescale, scale, new_offset); - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst)); - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst)); + }}; + const auto requantized_dst = + vrequantize_pooling_with_scale(vres, quant_rescale, scale, new_offset); + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst)); + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off + 8, + 
wrapper::vgethigh(requantized_dst)); + } + else + { + const float32x4_t scale_v = vdupq_n_f32(scale); + // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero + vres1 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v)); + vres2 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v)); + vres3 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v)); + vres4 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v)); + + const q8x8_t res1 = + wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2))); + const q8x8_t res2 = + wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4))); + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, res1); + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off + 8, res2); + } } else { - const float32x4_t scale_v = vdupq_n_f32(scale); - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - vres1 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v)); - vres2 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v)); - vres3 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v)); - vres4 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v)); - - const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2))); - const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4))); - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, res1); - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off + 8, res2); - } - } - else - { - q8x16_t vres = wrapper::vdup_n(std::numeric_limits::min(), wrapper::traits::vector_128_tag{}); + q8x16_t vres = wrapper::vdup_n(std::numeric_limits::min(), wrapper::traits::vector_128_tag{}); - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) + for (int y = pool_start_y; y < pool_end_y; ++y) { - const q8x16_t data = wrapper::vloadq(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - vres = wrapper::vmax(vres, data); + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const q8x16_t data = wrapper::vloadq( + reinterpret_cast( + in.ptr() + + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z())) + + x_off); + vres = wrapper::vmax(vres, data); + } } - } - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, (src_qinfo != dst_qinfo) ? vrequantize_pooling(wrapper::vgetlow(vres), wrapper::vgethigh(vres), - requant_qinfo) : - vres); + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, + (src_qinfo != dst_qinfo) + ? 
vrequantize_pooling(wrapper::vgetlow(vres), + wrapper::vgethigh(vres), requant_qinfo) + : vres); + } } - } - if(pool_info.pool_type == PoolingType::MAX) - { - for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x) + if (pool_info.pool_type == PoolingType::MAX) { - q8x8_t vres = wrapper::vdup_n(std::numeric_limits::min(), wrapper::traits::vector_64_tag{}); - for(int y = pool_start_y; y < pool_end_y; ++y) + for (; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x) { - for(int x = pool_start_x; x < pool_end_x; ++x) + q8x8_t vres = wrapper::vdup_n(std::numeric_limits::min(), wrapper::traits::vector_64_tag{}); + for (int y = pool_start_y; y < pool_end_y; ++y) { - const q8x8_t data = wrapper::vload(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - vres = wrapper::vmax(vres, data); + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const q8x8_t data = wrapper::vload( + reinterpret_cast( + in.ptr() + + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z())) + + x_off); + vres = wrapper::vmax(vres, data); + } } - } - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, - (src_qinfo != dst_qinfo) ? vrequantize_pooling(vres, requant_qinfo) : vres); + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, + (src_qinfo != dst_qinfo) ? vrequantize_pooling(vres, requant_qinfo) : vres); + } } - } - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - if(pool_info.pool_type != PoolingType::MAX) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - q32_t res = static_cast(0.f); + if (pool_info.pool_type != PoolingType::MAX) + { + q32_t res = static_cast(0.f); - // Calculate scale - const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - // Perform pooling - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) + // Perform pooling + for (int y = pool_start_y; y < pool_end_y; ++y) { - const T data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - res += data; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const T data = + *(reinterpret_cast( + in.ptr() + + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z())) + + x_off); + res += data; + } } - } - if(src_qinfo != dst_qinfo) - { - const float res_f = static_cast(res); - const float new_scale = quant_rescale / scale; - const auto requantized_dst = quantize(res_f, UniformQuantizationInfo(new_scale, new_offset)); + if (src_qinfo != dst_qinfo) + { + const float res_f = static_cast(res); + const float new_scale = quant_rescale / scale; + const auto requantized_dst = quantize(res_f, UniformQuantizationInfo(new_scale, new_offset)); - // Store result - 
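
The left-over loop above mirrors the vectorized path in scalar form: the window is summed into a wide accumulator, and the result is either requantized (folding the averaging factor into the quantization scale, new_scale = quant_rescale / scale) or divided by the window size with a +0.5f bias so that the truncating cast rounds to nearest. A minimal standalone sketch of that decision follows; the quantization parameters and clamping are simplified stand-ins, not the library's exact helpers.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// Simplified scalar model of the left-over store path above.
// `quant_rescale` and `new_offset` stand in for the precomputed requantization
// parameters; the exact formula used by the kernel may differ slightly.
inline uint8_t store_q8_avg(int32_t sum,           // accumulated window sum (quantized domain)
                            float   scale,         // 1 / effective window size
                            bool    same_qinfo,
                            float   quant_rescale, // dst_scale / src_scale (assumed)
                            int32_t new_offset)
{
    int32_t res;
    if (same_qinfo)
    {
        // Divide by the window size; the +0.5f turns truncation into round-to-nearest.
        res = static_cast<int32_t>(0.5f + static_cast<float>(sum) * scale);
    }
    else
    {
        // Fold the averaging factor into the requantization scale:
        // quantize(sum, new_scale) with new_scale = quant_rescale / scale.
        const float new_scale = quant_rescale / scale;
        res = static_cast<int32_t>(std::lround(static_cast<float>(sum) / new_scale)) + new_offset;
    }
    return static_cast<uint8_t>(std::clamp(res, 0, 255));
}
```
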
*(reinterpret_cast(out.ptr()) + x_off) = requantized_dst; + // Store result + *(reinterpret_cast(out.ptr()) + x_off) = requantized_dst; + } + else + { + // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero + res = static_cast(0.5f + static_cast(res) * scale); + + // Store result + *(reinterpret_cast(out.ptr()) + x_off) = res; + } } else { - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - res = static_cast(0.5f + static_cast(res) * scale); - - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - } - } - else - { - T res = std::numeric_limits::min(); + T res = std::numeric_limits::min(); - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) + for (int y = pool_start_y; y < pool_end_y; ++y) { - const T data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - res = std::max(res, data); + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const T data = + *(reinterpret_cast( + in.ptr() + + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z())) + + x_off); + res = std::max(res, data); + } } - } - // Store result - if(src_qinfo != dst_qinfo) - { - const float res_f = static_cast(res); - *(reinterpret_cast(out.ptr()) + x_off) = quantize(res_f, requant_qinfo); - } - else - { - *(reinterpret_cast(out.ptr()) + x_off) = res; + // Store result + if (src_qinfo != dst_qinfo) + { + const float res_f = static_cast(res); + *(reinterpret_cast(out.ptr()) + x_off) = quantize(res_f, requant_qinfo); + } + else + { + *(reinterpret_cast(out.ptr()) + x_off) = res; + } } } - } - - }, - in, out); + }, + in, out); } #if defined(ENABLE_NCHW_KERNELS) template -inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates &id, int id_offset, int step, - const int pool_size, const int upper_bound_w, const int upper_bound_h, - const int pad_x, const int pad_y, const int stride_x, const int stride_y) +inline void scale_vector_q16x8(bool exclude_padding, + TVec &v, + const Coordinates &id, + int id_offset, + int step, + const int pool_size, + const int upper_bound_w, + const int upper_bound_h, + const int pad_x, + const int pad_y, + const int stride_x, + const int stride_y) { int start_x = (id.x() + id_offset) * stride_x - pad_x; int start_y = id.y() * stride_y - pad_y; const int end_y = std::min(start_y + pool_size, upper_bound_h); - if(exclude_padding) + if (exclude_padding) { start_y = std::max(0, start_y); } - std::array elems = - { - { - wrapper::vgetlane(v, 0), - wrapper::vgetlane(v, 1), - wrapper::vgetlane(v, 2), - wrapper::vgetlane(v, 3), - wrapper::vgetlane(v, 4), - wrapper::vgetlane(v, 5), - wrapper::vgetlane(v, 6), - wrapper::vgetlane(v, 7), - } - }; - - for(auto &el : elems) + std::array elems = {{ + wrapper::vgetlane(v, 0), + wrapper::vgetlane(v, 1), + wrapper::vgetlane(v, 2), + wrapper::vgetlane(v, 3), + wrapper::vgetlane(v, 4), + wrapper::vgetlane(v, 5), + wrapper::vgetlane(v, 6), + wrapper::vgetlane(v, 7), + }}; + + for (auto &el : elems) { int c_start_x = start_x; const int end_x = std::min(c_start_x + pool_size, upper_bound_w); - if(exclude_padding) + if (exclude_padding) { c_start_x = std::max(0, c_start_x); } @@ -326,15 +367,16 @@ inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates } template -auto 
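
scale_vector_q16x8 applies a different averaging divisor to each of the eight 16-bit lanes, because every lane holds the window sum of a different output column and windows near the borders are clipped (and, with exclude_padding, shrunk to exclude the padded region). The body of that hunk is elided above, so the following is only a hedged scalar reconstruction of the idea; the divisor formula and the rounding behaviour are assumptions.

```cpp
#include <algorithm>
#include <array>
#include <cstdint>

// Scalar model of the per-lane scaling done by scale_vector_q16x8 above.
// Each of the 8 lanes holds the window sum of a different output column, so
// each lane is divided by its own (clamped) window area. Rounding here is
// illustrative only.
inline void scale_lanes_q16x8(std::array<int32_t, 8> &sums,
                              bool exclude_padding,
                              int  start_x,       // input x of lane 0's window; may be negative (padding)
                              int  start_y,       // input y of the window; may be negative (padding)
                              int  step,          // output columns advanced per lane
                              int  pool_size,
                              int  upper_bound_w,
                              int  upper_bound_h,
                              int  stride_x)
{
    const int end_y  = std::min(start_y + pool_size, upper_bound_h);
    const int eff_y  = exclude_padding ? std::max(0, start_y) : start_y;
    const int height = end_y - eff_y;

    for (auto &sum : sums)
    {
        const int end_x = std::min(start_x + pool_size, upper_bound_w);
        const int eff_x = exclude_padding ? std::max(0, start_x) : start_x;
        const int width = end_x - eff_x;

        // Divide the lane's sum by its own window area.
        const int area = std::max(1, width * height);
        sum = (sum + area / 2) / area;

        start_x += step * stride_x; // next lane maps to the next output column(s)
    }
}
```
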
load16_boundary_aware(int srcw, int srch, int pad_l, int pad_r, int pad_t, int pad_b, int x, int y, const T *ptr, T fval) +auto load16_boundary_aware( + int srcw, int srch, int pad_l, int pad_r, int pad_t, int pad_b, int x, int y, const T *ptr, T fval) { ARM_COMPUTE_UNUSED(pad_b, pad_r); T vec[16]; //handle reading a row out of the tensor const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t))); - for(int i = 0; i < 16; i++) + for (int i = 0; i < 16; i++) { - if(row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l))) + if (row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l))) { vec[i] = *(ptr + i); } @@ -349,24 +391,24 @@ auto load16_boundary_aware(int srcw, int srch, int pad_l, int pad_r, int pad_t, template inline void write16_boundary_aware(int x, int dst_w, const V &lower, const V &upper, T *ptr) { - if(deinterleave) + if (deinterleave) { - for(int i = 0; i < 8 && (i * 2 + x) < dst_w; ++i) + for (int i = 0; i < 8 && (i * 2 + x) < dst_w; ++i) { *(ptr + i * 2) = lower[i]; } - for(int i = 0; i < 8 && (i * 2 + x + 1) < dst_w; ++i) + for (int i = 0; i < 8 && (i * 2 + x + 1) < dst_w; ++i) { *(ptr + 1 + i * 2) = upper[i]; } } else { - for(int i = 0; i < 8 && (i + x) < dst_w; ++i) + for (int i = 0; i < 8 && (i + x) < dst_w; ++i) { *(ptr + i) = lower[i]; } - for(int i = 0; i < 8 && (i + x + 8) < dst_w; ++i) + for (int i = 0; i < 8 && (i + x + 8) < dst_w; ++i) { *(ptr + i + 8) = upper[i]; } @@ -376,14 +418,19 @@ inline void write16_boundary_aware(int x, int dst_w, const V &lower, const V &up template inline void write8_boundary_aware(int x, int dst_w, const V &v, T *ptr) { - for(int i = 0; i < 8 && (i + x) < dst_w; ++i) + for (int i = 0; i < 8 && (i + x) < dst_w; ++i) { *(ptr + i) = v[i]; } } template -void pooling2_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void pooling2_quantized_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { ARM_COMPUTE_UNUSED(dst1); Iterator in(src, window_src); @@ -397,129 +444,136 @@ void pooling2_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *ds using q16x8_t = typename wrapper::traits::neon_vector::type; using q16x8x2_t = typename wrapper::traits::neon_vector::type; - constexpr int pool_size = 2; - int pool_stride_x = 0; - int pool_stride_y = 0; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + constexpr int pool_size = 2; + int pool_stride_x = 0; + int pool_stride_y = 0; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 
0 : pool_pad_bottom); - const T *const src_top_ptr = reinterpret_cast(src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top)))); - const T *const src_bottom_ptr = reinterpret_cast(src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1))); + const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const T *const src_top_ptr = reinterpret_cast( + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top)))); + const T *const src_bottom_ptr = reinterpret_cast( + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1))); const int scale_step_x = (pool_stride_x == 1) ? 2 : 1; const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform(); const bool have_different_qinfo = src_qinfo != dst_qinfo; - const float requant_scale = dst_qinfo.scale / src_qinfo.scale; - const int32_t requant_offset = dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / requant_scale); - const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); - const int src_w = src->info()->dimension(0); - const int src_h = src->info()->dimension(1); - const int dst_w = dst0->info()->dimension(0); + const float requant_scale = dst_qinfo.scale / src_qinfo.scale; + const int32_t requant_offset = + dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / requant_scale); + const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int dst_w = dst0->info()->dimension(0); const T fill_value = (pool_info.pool_type == PoolingType::MAX) ? 
std::numeric_limits::min() : T(0); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto x_val = id.x() * pool_stride_x; - const auto y_val_0 = id.y() * pool_stride_y; - const auto y_val_1 = (id.y() * pool_stride_y) + 1; - - auto top_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, - x_val, y_val_0, reinterpret_cast(src_top_ptr + in.offset()), fill_value); - auto bottom_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, - x_val, y_val_1, reinterpret_cast(src_bottom_ptr + in.offset()), fill_value); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto x_val = id.x() * pool_stride_x; + const auto y_val_0 = id.y() * pool_stride_y; + const auto y_val_1 = (id.y() * pool_stride_y) + 1; - q8x8_t lower_res = {}; - q8x8_t upper_res = {}; + auto top_data = + load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val, + y_val_0, reinterpret_cast(src_top_ptr + in.offset()), fill_value); + auto bottom_data = + load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val, + y_val_1, reinterpret_cast(src_bottom_ptr + in.offset()), fill_value); - if(pool_info.pool_type != PoolingType::MAX) - { - const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } }; - const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } }; + q8x8_t lower_res = {}; + q8x8_t upper_res = {}; - // Add rows - const q16x8x2_t vrsum = + if (pool_info.pool_type != PoolingType::MAX) { - { + const q16x8x2_t top_data_q16 = { + {wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data))}}; + const q16x8x2_t bottom_data_q16 = { + {wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data))}}; + + // Add rows + const q16x8x2_t vrsum = {{ wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), - } - }; + }}; - // Pair-wise add row data - const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0])); - const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1])); + // Pair-wise add row data + const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0])); + const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1])); - q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2); + q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2); - // Scale lower result - scale_vector_q16x8(pool_info.exclude_padding, res_lower, id, 0, scale_step_x, - pool_size, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - lower_res = wrapper::vmovn(res_lower); + // Scale lower result + scale_vector_q16x8(pool_info.exclude_padding, res_lower, id, 0, scale_step_x, pool_size, + upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, + pool_stride_x, pool_stride_y); + lower_res = wrapper::vmovn(res_lower); - // Compute upper result for stride_x == 1 - if(pool_stride_x == 1) - { - // Shifted row sum - const q16x8x2_t vrsum_shifted = + // Compute upper result for stride_x == 1 + if (pool_stride_x == 1) { - { - wrapper::vext_1(vrsum.val[0], vrsum.val[1]), - 
wrapper::vext_1(vrsum.val[1], vrsum.val[1]) - } - }; - - // Pair-wise add shifted row - q16x8_t res_upper = wrapper::vcombine( - wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])), - wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]), wrapper::vgethigh(vrsum_shifted.val[1]))); - - // Scale upper result - scale_vector_q16x8(pool_info.exclude_padding, res_upper, id, 1, 2, - pool_size, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - upper_res = wrapper::vmovn(res_upper); + // Shifted row sum + const q16x8x2_t vrsum_shifted = { + {wrapper::vext_1(vrsum.val[0], vrsum.val[1]), wrapper::vext_1(vrsum.val[1], vrsum.val[1])}}; + + // Pair-wise add shifted row + q16x8_t res_upper = wrapper::vcombine( + wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])), + wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]), + wrapper::vgethigh(vrsum_shifted.val[1]))); + + // Scale upper result + scale_vector_q16x8(pool_info.exclude_padding, res_upper, id, 1, 2, pool_size, + upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, + pool_stride_x, pool_stride_y); + upper_res = wrapper::vmovn(res_upper); + } } - } - else - { - const q8x16_t max_data = wrapper::vmax(top_data, bottom_data); - lower_res = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data)); - if(pool_stride_x == 1) + else { - const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data); - upper_res = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted)); + const q8x16_t max_data = wrapper::vmax(top_data, bottom_data); + lower_res = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data)); + if (pool_stride_x == 1) + { + const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data); + upper_res = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted)); + } } - } - if(have_different_qinfo) - { - const auto requantized_dst = vrequantize_pooling(lower_res, upper_res, requant_qinfo); - lower_res = wrapper::vgetlow(requantized_dst); - upper_res = wrapper::vgethigh(requantized_dst); - } - auto out_ptr = reinterpret_cast(out.ptr()); - // Store result - if(pool_stride_x == 1) - { - write16_boundary_aware(id.x(), dst_w, lower_res, upper_res, out_ptr); - } - else - { - write8_boundary_aware(id.x(), dst_w, lower_res, out_ptr); - } - }, - in, out); + if (have_different_qinfo) + { + const auto requantized_dst = vrequantize_pooling(lower_res, upper_res, requant_qinfo); + lower_res = wrapper::vgetlow(requantized_dst); + upper_res = wrapper::vgethigh(requantized_dst); + } + auto out_ptr = reinterpret_cast(out.ptr()); + // Store result + if (pool_stride_x == 1) + { + write16_boundary_aware(id.x(), dst_w, lower_res, upper_res, out_ptr); + } + else + { + write8_boundary_aware(id.x(), dst_w, lower_res, out_ptr); + } + }, + in, out); } template -void pooling3_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void pooling3_quantized_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { ARM_COMPUTE_UNUSED(dst1); Iterator in(src, window_src); @@ -533,13 +587,13 @@ void pooling3_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *ds using q16x8_t = typename wrapper::traits::neon_vector::type; using q16x8x2_t = typename 
wrapper::traits::neon_vector::type; - constexpr int pool_size = 3; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; + constexpr int pool_size = 3; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); @@ -547,147 +601,145 @@ void pooling3_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *ds const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform(); const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform(); - const float requant_scale = dst_qinfo.scale / src_qinfo.scale; - const int32_t requant_offset = dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / requant_scale); - const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); + const float requant_scale = dst_qinfo.scale / src_qinfo.scale; + const int32_t requant_offset = + dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / requant_scale); + const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); - const T *const src_top_ptr = reinterpret_cast(src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top)))); - const T *const src_middle_ptr = reinterpret_cast(src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1))); - const T *const src_bottom_ptr = reinterpret_cast(src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 2))); + const T *const src_top_ptr = reinterpret_cast( + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top)))); + const T *const src_middle_ptr = reinterpret_cast( + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1))); + const T *const src_bottom_ptr = reinterpret_cast( + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 2))); const int src_w = src->info()->dimension(0); const int src_h = src->info()->dimension(1); const T fill_value = (pool_info.pool_type == PoolingType::AVG) ? 
T(0) : std::numeric_limits::min(); const int dst_w = dst0->info()->dimension(0); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto x_val = id.x() * pool_stride_x; - const auto y_val_0 = id.y() * pool_stride_y; - const auto y_val_1 = (id.y() * pool_stride_y) + 1; - const auto y_val_2 = (id.y() * pool_stride_y) + 2; - - auto top_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, - x_val, y_val_0, reinterpret_cast(src_top_ptr + in.offset()), fill_value); - auto middle_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, - x_val, y_val_1, reinterpret_cast(src_middle_ptr + in.offset()), fill_value); - auto bottom_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, - x_val, y_val_2, reinterpret_cast(src_bottom_ptr + in.offset()), fill_value); - - q8x8_t fres = {}; - q8x16_t fqres = {}; - - if(pool_info.pool_type == PoolingType::AVG) + execute_window_loop( + window, + [&](const Coordinates &id) { - // Convert data to u16 - const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } }; - const q16x8x2_t middle_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data)) } }; - const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } }; - - // Calculate row sums - const q16x8x2_t vrsum = + const auto x_val = id.x() * pool_stride_x; + const auto y_val_0 = id.y() * pool_stride_y; + const auto y_val_1 = (id.y() * pool_stride_y) + 1; + const auto y_val_2 = (id.y() * pool_stride_y) + 2; + + auto top_data = + load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val, + y_val_0, reinterpret_cast(src_top_ptr + in.offset()), fill_value); + auto middle_data = + load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val, + y_val_1, reinterpret_cast(src_middle_ptr + in.offset()), fill_value); + auto bottom_data = + load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val, + y_val_2, reinterpret_cast(src_bottom_ptr + in.offset()), fill_value); + + q8x8_t fres = {}; + q8x16_t fqres = {}; + + if (pool_info.pool_type == PoolingType::AVG) { + // Convert data to u16 + const q16x8x2_t top_data_q16 = { + {wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data))}}; + const q16x8x2_t middle_data_q16 = { + {wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data))}}; + const q16x8x2_t bottom_data_q16 = { + {wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data))}}; + + // Calculate row sums + const q16x8x2_t vrsum = {{ + wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]), + wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]), + }}; + const q16x8x2_t vrsum_shifted_1 = { + {wrapper::vext_1(vrsum.val[0], vrsum.val[1]), wrapper::vext_1(vrsum.val[1], vrsum.val[1])}}; + const q16x8x2_t vrsum_shifted_2 = { + {wrapper::vext_2(vrsum.val[0], vrsum.val[1]), wrapper::vext_2(vrsum.val[1], vrsum.val[1])}}; + // Calculate final sum + q16x8x2_t final_sum = {{ + wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), 
vrsum_shifted_2.val[0]), + wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]), + }}; + if (pool_stride_x == 2) { - wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]), - wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]), + q16x8_t res = { + wrapper::vgetlane(final_sum.val[0], 0), wrapper::vgetlane(final_sum.val[0], 2), + wrapper::vgetlane(final_sum.val[0], 4), wrapper::vgetlane(final_sum.val[0], 6), + wrapper::vgetlane(final_sum.val[1], 0), wrapper::vgetlane(final_sum.val[1], 2), + wrapper::vgetlane(final_sum.val[1], 4), wrapper::vgetlane(final_sum.val[1], 6), + }; + + scale_vector_q16x8(pool_info.exclude_padding, res, id, 0, 1, pool_size, + upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, + pool_stride_x, pool_stride_y); + fres = wrapper::vmovn(res); } - }; - const q16x8x2_t vrsum_shifted_1 = - { + else { - wrapper::vext_1(vrsum.val[0], vrsum.val[1]), - wrapper::vext_1(vrsum.val[1], vrsum.val[1]) + // Scale lower result + scale_vector_q16x8(pool_info.exclude_padding, final_sum.val[0], id, 0, 1, pool_size, + upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, + pool_stride_x, pool_stride_y); + // Scale lower result + scale_vector_q16x8(pool_info.exclude_padding, final_sum.val[1], id, 8, 1, pool_size, + upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, + pool_stride_x, pool_stride_y); + fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1])); } - }; - const q16x8x2_t vrsum_shifted_2 = + } + else { + const q8x16_t max_data = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data); + const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data); + const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data); + const q8x16_t final_max = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2); + + if (pool_stride_x == 2) { - wrapper::vext_2(vrsum.val[0], vrsum.val[1]), - wrapper::vext_2(vrsum.val[1], vrsum.val[1]) + const q8x8x2_t table = {{wrapper::vgetlow(final_max), wrapper::vgethigh(final_max)}}; + static const q8x8_t lookup_val = {0, 2, 4, 6, 8, 10, 12, 14}; + fres = wrapper::vtbl(table, lookup_val); } - }; - // Calculate final sum - q16x8x2_t final_sum = - { + else { - wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]), - wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]), + fqres = final_max; } - }; - if(pool_stride_x == 2) - { - q16x8_t res = - { - wrapper::vgetlane(final_sum.val[0], 0), - wrapper::vgetlane(final_sum.val[0], 2), - wrapper::vgetlane(final_sum.val[0], 4), - wrapper::vgetlane(final_sum.val[0], 6), - wrapper::vgetlane(final_sum.val[1], 0), - wrapper::vgetlane(final_sum.val[1], 2), - wrapper::vgetlane(final_sum.val[1], 4), - wrapper::vgetlane(final_sum.val[1], 6), - }; - - scale_vector_q16x8(pool_info.exclude_padding, res, id, 0, 1, - pool_size, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - fres = wrapper::vmovn(res); } - else - { - // Scale lower result - scale_vector_q16x8(pool_info.exclude_padding, final_sum.val[0], id, 0, 1, - pool_size, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - // Scale lower result - scale_vector_q16x8(pool_info.exclude_padding, final_sum.val[1], id, 8, 1, - pool_size, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, 
pool_stride_y); - fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1])); - } - } - else - { - const q8x16_t max_data = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data); - const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data); - const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data); - const q8x16_t final_max = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2); - if(pool_stride_x == 2) + // Store result + if (pool_stride_x == 1) { - const q8x8x2_t table = { { wrapper::vgetlow(final_max), wrapper::vgethigh(final_max) } }; - static const q8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 }; - fres = wrapper::vtbl(table, lookup_val); + if (src_qinfo != dst_qinfo) + { + fqres = vrequantize_pooling(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), + requant_qinfo); + } + write16_boundary_aware(id.x(), dst_w, wrapper::vgetlow(fqres), + wrapper::vgethigh(fqres), reinterpret_cast(out.ptr())); } else { - fqres = final_max; - } - } - - // Store result - if(pool_stride_x == 1) - { - if(src_qinfo != dst_qinfo) - { - fqres = vrequantize_pooling(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), requant_qinfo); - } - write16_boundary_aware(id.x(), dst_w, wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), reinterpret_cast(out.ptr())); - } - else - { - if(src_qinfo != dst_qinfo) - { - fres = vrequantize_pooling(fres, requant_qinfo); + if (src_qinfo != dst_qinfo) + { + fres = vrequantize_pooling(fres, requant_qinfo); + } + write8_boundary_aware(id.x(), dst_w, fres, reinterpret_cast(out.ptr())); } - write8_boundary_aware(id.x(), dst_w, fres, reinterpret_cast(out.ptr())); - } - }, - in, out); + }, + in, out); } template -void poolingMxN_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void poolingMxN_quantized_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { ARM_COMPUTE_UNUSED(dst1); Iterator in(src, window_src); @@ -697,74 +749,81 @@ void poolingMxN_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor * using q16_t = typename wrapper::traits::promote_t; using q32_t = typename wrapper::traits::promote_t; - const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? 
src->info()->tensor_shape().y() : pool_info.pool_size.height; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform(); - const int src_w = src->info()->dimension(0); - const int src_h = src->info()->dimension(1); - const T fill_value = (pool_info.pool_type == PoolingType::AVG) ? T(0) : std::numeric_limits::min(); - const int stridex_in_bytes = static_cast(src->info()->strides_in_bytes().x()); - const int stridey_in_bytes = static_cast(src->info()->strides_in_bytes().y()); - - execute_window_loop(window, [&](const Coordinates & id) - { - T res = std::numeric_limits::min(); - - if(pool_info.pool_type != PoolingType::MAX) + const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform(); + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const T fill_value = (pool_info.pool_type == PoolingType::AVG) ? T(0) : std::numeric_limits::min(); + const int stridex_in_bytes = static_cast(src->info()->strides_in_bytes().x()); + const int stridey_in_bytes = static_cast(src->info()->strides_in_bytes().y()); + + execute_window_loop( + window, + [&](const Coordinates &id) { - q32_t sres = 0; - - // Calculate scale - const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); + T res = std::numeric_limits::min(); - // Perform pooling - for(int y = 0; y < pool_size_y; ++y) + if (pool_info.pool_type != PoolingType::MAX) { - for(int x = 0; x < pool_size_x; ++x) + q32_t sres = 0; + + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + + // Perform pooling + for (int y = 0; y < pool_size_y; ++y) { - const auto in_ptr = reinterpret_cast(in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes); + for (int x = 0; x < pool_size_x; ++x) + { + const auto in_ptr = reinterpret_cast( + in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes); - const int idx = x + id.x() * pool_stride_x - pool_pad_left; - const int idy = y + id.y() * pool_stride_y - pool_pad_top; - const T data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr; - sres += data; + const int idx = x + id.x() * pool_stride_x - pool_pad_left; + const int idy = y + id.y() * pool_stride_y - pool_pad_top; + const T data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? 
fill_value : *in_ptr; + sres += data; + } } + // Divide by scale + res = static_cast(support::cpp11::round(sres * scale)); } - // Divide by scale - res = static_cast(support::cpp11::round(sres * scale)); - } - else - { - for(int y = 0; y < pool_size_y; ++y) + else { - for(int x = 0; x < pool_size_x; ++x) + for (int y = 0; y < pool_size_y; ++y) { - const auto in_ptr = reinterpret_cast(in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes); + for (int x = 0; x < pool_size_x; ++x) + { + const auto in_ptr = reinterpret_cast( + in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes); - const int idx = x + id.x() * pool_stride_x - pool_pad_left; - const int idy = y + id.y() * pool_stride_y - pool_pad_top; - const T data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr; - res = std::max(res, data); + const int idx = x + id.x() * pool_stride_x - pool_pad_left; + const int idy = y + id.y() * pool_stride_y - pool_pad_top; + const T data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr; + res = std::max(res, data); + } } } - } - // Store result - res = (src_qinfo != dst_qinfo) ? Qasymm8QuantizationHelper::quantize(Qasymm8QuantizationHelper::dequantize(res, src_qinfo), dst_qinfo) : res; - *(reinterpret_cast(out.ptr())) = res; - }, - in, out); + // Store result + res = (src_qinfo != dst_qinfo) ? Qasymm8QuantizationHelper::quantize( + Qasymm8QuantizationHelper::dequantize(res, src_qinfo), dst_qinfo) + : res; + *(reinterpret_cast(out.ptr())) = res; + }, + in, out); } #endif /* defined(ENABLE_NCHW_KERNELS) */ } // namespace cpu diff --git a/src/cpu/kernels/pool3d/neon/impl.cpp b/src/cpu/kernels/pool3d/neon/impl.cpp deleted file mode 100644 index 2b089f3079dcb3f42d7c6a97a8891c1bff7407b0..0000000000000000000000000000000000000000 --- a/src/cpu/kernels/pool3d/neon/impl.cpp +++ /dev/null @@ -1,462 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
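
When the source and destination quantization differ, the MxN NCHW path above requantizes the final scalar with a dequantize-then-quantize round trip (via Qasymm8QuantizationHelper). A self-contained sketch of that round trip for an asymmetric uint8 scheme follows; the rounding policy is simplified relative to the library helper.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

struct UniformQInfo
{
    float   scale;
    int32_t offset;
};

// Dequantize a uint8 value back to real numbers.
inline float dequantize_u8(uint8_t v, const UniformQInfo &qi)
{
    return (static_cast<int32_t>(v) - qi.offset) * qi.scale;
}

// Quantize a real value to uint8 (round-to-nearest here; the library's policy may differ).
inline uint8_t quantize_u8(float v, const UniformQInfo &qi)
{
    const int32_t q = static_cast<int32_t>(std::lround(v / qi.scale)) + qi.offset;
    return static_cast<uint8_t>(std::clamp(q, 0, 255));
}

// Equivalent in spirit to quantize(dequantize(v, src_qinfo), dst_qinfo) above.
inline uint8_t requantize_u8(uint8_t v, const UniformQInfo &src, const UniformQInfo &dst)
{
    return quantize_u8(dequantize_u8(v, src), dst);
}
```
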
- */ -#include "arm_compute/core/Helpers.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include "src/core/helpers/PoolingHelpers.h" -#include "src/core/helpers/WindowHelpers.h" -#include "src/cpu/kernels/pool3d/neon/quantized.h" - -#include "src/cpu/kernels/pool3d/neon/impl.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace -{ -template -void max_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, - const int window_start_x, const int window_end_x, const int window_step_x) - -{ - using vtype = wrapper::traits::neon_bitvector; - using vector_type = typename vtype::type; - using tag_type = typename vtype::tag_type; - - int pool_stride_x = static_cast(pool_info.stride.width); - int pool_stride_y = static_cast(pool_info.stride.height); - int pool_stride_z = static_cast(pool_info.stride.depth); - - const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; - const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth; - - const int pool_pad_top = static_cast(pool_info.padding.top); - const int pool_pad_left = static_cast(pool_info.padding.left); - const int pool_pad_front = static_cast(pool_info.padding.front); - - const int input_dim_w = src->info()->dimension(1); - const int input_dim_h = src->info()->dimension(2); - const int input_dim_d = src->info()->dimension(3); - - const int y_stride = static_cast(src->info()->strides_in_bytes().y()); - const int z_stride = static_cast(src->info()->strides_in_bytes().z()); - const int w_stride = static_cast(src->info()->strides_in_bytes()[3]); - const int n_stride = static_cast(src->info()->strides_in_bytes()[4]); - - const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); - - Iterator out(dst0, window_out); - - vector_type vres; - execute_window_loop(window_out, [&](const Coordinates & id) - { - // Computing the theoretical input starting/ending points - const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; - const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; - const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; - - const int pool_start_x = std::max(0, -in_idx_width); - const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); - const int pool_start_y = std::max(0, -in_idx_height); - const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); - - const int pool_start_z = std::max(0, -in_idx_depth); - const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); - - // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z - const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); - const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); - const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); - - const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; - - int x_off = window_start_x; - - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C - { - vres = wrapper::vdup_n(static_cast(-std::numeric_limits::infinity()), tag_type()); - for(int z = pool_start_z; z < pool_end_z; ++z) - { - const uint8_t *in_ptr_z = in_ptr_n + (z + 
in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) - { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const vector_type data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); - vres = wrapper::vmax(vres, data); - } - } - } - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, vres); - } - - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - T res(0); - res = -std::numeric_limits::infinity(); - for(int z = pool_start_z; z < pool_end_z; ++z) - { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) - { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const T data = *(reinterpret_cast(in_ptr_x) + x_off); - res = std::max(res, data); - } - } - } - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - } - }, - out); -} - -template -void avg_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, - const Window &window_out, const int window_start_x, const int window_end_x, const int window_step_x) -{ - using vtype = wrapper::traits::neon_bitvector; - using vector_type = typename vtype::type; - using tag_type = typename vtype::tag_type; - - int pool_stride_x = static_cast(pool_info.stride.width); - int pool_stride_y = static_cast(pool_info.stride.height); - int pool_stride_z = static_cast(pool_info.stride.depth); - - const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; - const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth; - - const int pool_pad_top = static_cast(pool_info.padding.top); - const int pool_pad_bottom = static_cast(pool_info.padding.bottom); - const int pool_pad_left = static_cast(pool_info.padding.left); - const int pool_pad_right = static_cast(pool_info.padding.right); - const int pool_pad_front = static_cast(pool_info.padding.front); - const int pool_pad_back = static_cast(pool_info.padding.back); - - const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - const int upper_bound_d = src->info()->dimension(3) + (pool_info.exclude_padding ? 
0 : pool_pad_back); - - const int input_dim_w = src->info()->dimension(1); - const int input_dim_h = src->info()->dimension(2); - const int input_dim_d = src->info()->dimension(3); - - const int y_stride = static_cast(src->info()->strides_in_bytes().y()); - const int z_stride = static_cast(src->info()->strides_in_bytes().z()); - const int w_stride = static_cast(src->info()->strides_in_bytes()[3]); - const int n_stride = static_cast(src->info()->strides_in_bytes()[4]); - - const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); - - Iterator out(dst0, window_out); - - vector_type vres; - execute_window_loop(window_out, [&](const Coordinates & id) - { - // Computing the theoretical input starting/ending points - const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; - const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; - const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; - - const int pool_start_x = std::max(0, -in_idx_width); - const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); - const int pool_start_y = std::max(0, -in_idx_height); - const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); - - const int pool_start_z = std::max(0, -in_idx_depth); - const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); - - // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z - const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); - const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); - const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); - - const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; - - // Calculate scale - const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, - pool_pad_top, pool_pad_front, pool_stride_x, - pool_stride_y, pool_stride_z); - const vector_type scale_v = wrapper::vdup_n(static_cast(scale), tag_type()); - - int x_off = window_start_x; - - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C - { - // Perform pooling - vres = wrapper::vdup_n(static_cast(0.0f), tag_type()); - for(int z = pool_start_z; z < pool_end_z; ++z) - { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) - { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const vector_type data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); - vres = wrapper::vadd(vres, data); - } - } - } - - // Divide by scale - vres = wrapper::vmul(vres, scale_v); - - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, vres); - } - - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - T res(0); - - for(int z = pool_start_z; z < pool_end_z; ++z) - { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) - { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const T data = *(reinterpret_cast(in_ptr_x) + x_off); - res += data; - 
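
Every NDHWC 3-D kernel in this file begins by clamping the pooling window against the input: the theoretical window origin out_idx * stride - pad may fall inside the padding, so the tap range is clipped before the loops run. Below is a standalone sketch of that per-axis arithmetic, using the same convention as the code above.

```cpp
#include <algorithm>

struct KernelRange
{
    int start; // first kernel tap that reads real input
    int end;   // one past the last kernel tap that reads real input
    int in0;   // input coordinate of kernel tap 0 (may be negative)
};

// Clamp a 1-D pooling window against the input extent, mirroring the
// pool_start_* / pool_end_* computation in the NDHWC kernels above.
inline KernelRange clamp_pool_window(int out_idx, int stride, int pad_before, int pool_size, int input_dim)
{
    const int in_idx = out_idx * stride - pad_before;                // theoretical window origin
    const int start  = std::max(0, -in_idx);                         // skip taps that land in the leading padding
    const int end_t  = std::min(input_dim + pad_before - in_idx, pool_size);
    const int end    = std::min(end_t, input_dim - in_idx);          // also exclude the trailing padding
    return {start, end, in_idx};
}
```

Taps start..end-1 then read the input at coordinate in0 + tap, which is what the in_ptr_x / in_ptr_y / in_ptr_z byte offsets above compute.
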
} - } - } - - // Divide by scale - res *= scale; - - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - } - }, - out); -} - -template -void l2_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, - const Window &window_out, const int window_start_x, const int window_end_x, const int window_step_x) -{ - using vtype = wrapper::traits::neon_bitvector; - using vector_type = typename vtype::type; - using tag_type = typename vtype::tag_type; - - int pool_stride_x = static_cast(pool_info.stride.width); - int pool_stride_y = static_cast(pool_info.stride.height); - int pool_stride_z = static_cast(pool_info.stride.depth); - - const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; - const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth; - - const int pool_pad_top = static_cast(pool_info.padding.top); - const int pool_pad_bottom = static_cast(pool_info.padding.bottom); - const int pool_pad_left = static_cast(pool_info.padding.left); - const int pool_pad_right = static_cast(pool_info.padding.right); - const int pool_pad_front = static_cast(pool_info.padding.front); - const int pool_pad_back = static_cast(pool_info.padding.back); - - const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - const int upper_bound_d = src->info()->dimension(3) + (pool_info.exclude_padding ? 0 : pool_pad_back); - - const int input_dim_w = src->info()->dimension(1); - const int input_dim_h = src->info()->dimension(2); - const int input_dim_d = src->info()->dimension(3); - - const int y_stride = static_cast(src->info()->strides_in_bytes().y()); - const int z_stride = static_cast(src->info()->strides_in_bytes().z()); - const int w_stride = static_cast(src->info()->strides_in_bytes()[3]); - const int n_stride = static_cast(src->info()->strides_in_bytes()[4]); - - const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); - - Iterator out(dst0, window_out); - - vector_type vres; - execute_window_loop(window_out, [&](const Coordinates & id) - { - // Computing the theoretical input starting/ending points - const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; - const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; - const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; - - const int pool_start_x = std::max(0, -in_idx_width); - const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); - const int pool_start_y = std::max(0, -in_idx_height); - const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); - - const int pool_start_z = std::max(0, -in_idx_depth); - const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); - - // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z - const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); - const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); - const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); - - const uint8_t *in_ptr_n = in_ptr_start + id[4] * 
n_stride; - - // Calculate scale - const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, - pool_pad_top, pool_pad_front, pool_stride_x, - pool_stride_y, pool_stride_z); - - int x_off = window_start_x; - - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C - { - // Perform pooling - vres = wrapper::vdup_n(static_cast(0.0f), tag_type()); - for(int z = pool_start_z; z < pool_end_z; ++z) - { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) - { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const vector_type data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); - vres = wrapper::vmla(vres, data, data); - } - } - } - - const vector_type scale_v = wrapper::vdup_n(static_cast(scale), tag_type()); - - // Divide by scale - vres = wrapper::vmul(vres, scale_v); - - // Calculate square-root - vres = wrapper::vinv(wrapper::vinvsqrt(vres)); - - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, vres); - } - - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - T res(0); - - for(int z = pool_start_z; z < pool_end_z; ++z) - { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) - { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const T data = *(reinterpret_cast(in_ptr_x) + x_off); - res += data * data; - } - } - } - - // Divide by scale - res *= scale; - - // Square root - res = std::sqrt(res); - - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - } - }, - out); -} -} // namespace - -template -void poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window) -{ - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - constexpr int window_step_x = 16 / sizeof(T); - Window window_out = window; - - // Needed to handle loop left-over - window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - - switch(pool_info.pool_type) - { - case PoolingType::MAX: - max_poolingMxNxD_fp_neon_ndhwc(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x); - break; - case PoolingType::AVG: - avg_poolingMxNxD_fp_neon_ndhwc(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x); - break; - case PoolingType::L2: - l2_poolingMxNxD_fp_neon_ndhwc(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x); - break; - default: - ARM_COMPUTE_ERROR("Pool operation not supported"); - } -} - -template -void poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window) -{ - constexpr int window_step_x = 16; - Window window_out = window; - - // Needed to handle loop left-over - window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - - switch(pool_info.pool_type) - { - case PoolingType::MAX: - max_poolingMxNxD_q8_neon_ndhwc(src, dst0, pool_info, window_out, window_step_x); - break; - case PoolingType::AVG: - avg_poolingMxNxD_q8_neon_ndhwc(src, dst0, pool_info, window_out, window_step_x); - break; - 
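
The L2 variant accumulates squared values with a multiply-accumulate, scales by the averaging factor, and takes a square root; the vector path forms sqrt(x) as the reciprocal of an inverse-square-root estimate (vinv(vinvsqrt(x))), while the left-over loop uses std::sqrt. A scalar reference of the per-channel result, assuming scale is 1 over the effective window volume:

```cpp
#include <cmath>
#include <cstddef>

// Scalar model of the L2 pooling result for one channel: sqrt(mean of squares).
// `scale` plays the role of the precomputed averaging factor (1 / window volume).
inline float l2_pool_reference(const float *window, std::size_t count, float scale)
{
    float sum_sq = 0.f;
    for (std::size_t i = 0; i < count; ++i)
    {
        sum_sq += window[i] * window[i]; // the vector path uses a fused multiply-accumulate (vmla)
    }
    // Divide by the window volume, then take the square root.
    // The NEON path approximates sqrt(x) as 1 / rsqrt(x) via vinv(vinvsqrt(x)).
    return std::sqrt(sum_sq * scale);
}
```
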
default: - ARM_COMPUTE_ERROR("Pool operation not supported"); - } -} - -template void poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window); -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -template void poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window); -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ -template void poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window); -template void poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window); -} // namespace cpu -} // namespace arm_compute diff --git a/src/cpu/kernels/pool3d/neon/impl.h b/src/cpu/kernels/pool3d/neon/impl.h index 7ad8c8eb05f06a0369d1d8032a3e5d29000edf13..ce89199b5d90d4faf155e878d31b70db5185bde0 100644 --- a/src/cpu/kernels/pool3d/neon/impl.h +++ b/src/cpu/kernels/pool3d/neon/impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,19 +26,459 @@ #include "arm_compute/core/Helpers.h" +#include "src/core/helpers/PoolingHelpers.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "src/cpu/kernels/pool3d/neon/quantized.h" + namespace arm_compute { -// Forward declarations -class ITensor; -class Window; -struct Pooling3dLayerInfo; namespace cpu { +namespace +{ +template +void max_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, + ITensor *dst0, + Pooling3dLayerInfo &pool_info, + const Window &window_out, + const int window_start_x, + const int window_end_x, + const int window_step_x) + +{ + using vtype = wrapper::traits::neon_bitvector; + using vector_type = typename vtype::type; + using tag_type = typename vtype::tag_type; + + int pool_stride_x = static_cast(pool_info.stride.width); + int pool_stride_y = static_cast(pool_info.stride.height); + int pool_stride_z = static_cast(pool_info.stride.depth); + + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; + const int pool_size_z = pool_info.is_global_pooling ? 
src->info()->tensor_shape()[3] : pool_info.pool_size.depth; + + const int pool_pad_top = static_cast(pool_info.padding.top); + const int pool_pad_left = static_cast(pool_info.padding.left); + const int pool_pad_front = static_cast(pool_info.padding.front); + + const int input_dim_w = src->info()->dimension(1); + const int input_dim_h = src->info()->dimension(2); + const int input_dim_d = src->info()->dimension(3); + + const int y_stride = static_cast(src->info()->strides_in_bytes().y()); + const int z_stride = static_cast(src->info()->strides_in_bytes().z()); + const int w_stride = static_cast(src->info()->strides_in_bytes()[3]); + const int n_stride = static_cast(src->info()->strides_in_bytes()[4]); + + const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); + + Iterator out(dst0, window_out); + + vector_type vres; + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; + + const int pool_start_x = std::max(0, -in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + + const int pool_start_z = std::max(0, -in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); + + const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + + int x_off = window_start_x; + + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C + { + vres = wrapper::vdup_n(static_cast(-std::numeric_limits::infinity()), tag_type()); + for (int z = pool_start_z; z < pool_end_z; ++z) + { + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) + { + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const vector_type data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); + vres = wrapper::vmax(vres, data); + } + } + } + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, vres); + } + + // Left-overs loop + for (; x_off < window_end_x; ++x_off) + { + T res(0); + res = -std::numeric_limits::infinity(); + for (int z = pool_start_z; z < pool_end_z; ++z) + { + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) + { + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const T data = *(reinterpret_cast(in_ptr_x) + x_off); + res = std::max(res, data); + } + } + } + // Store result + *(reinterpret_cast(out.ptr()) + x_off) = res; + } + }, + out); +} + +template +void 
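For reference, a minimal standalone sketch of the per-axis boundary clamping performed by the max-pooling kernel above (the net effect of the pool_start_*/pool_end_* computations); the struct and function names are illustrative and not part of this patch:

#include <algorithm>
#include <cassert>

struct AxisWindow
{
    int start; // first kernel tap that lands inside the real input
    int end;   // one past the last valid kernel tap
};

// Clamp the theoretical window [idx*stride - pad_before, ... + pool_size) to the unpadded input extent.
inline AxisWindow clamp_pool_window(int idx, int stride, int pad_before, int pool_size, int dim)
{
    const int in_idx = idx * stride - pad_before;      // can be negative inside the leading padding
    const int start  = std::max(0, -in_idx);           // skip taps that fall into the leading padding
    const int end    = std::min(pool_size, dim - in_idx); // skip taps past the end of the real input
    return {start, end};
}

int main()
{
    // Output element 0, stride 2, pad 1, kernel 3, input extent 5: the first tap is clipped away.
    const AxisWindow w = clamp_pool_window(0, 2, 1, 3, 5);
    assert(w.start == 1 && w.end == 3);
    return 0;
}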
avg_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, + ITensor *dst0, + Pooling3dLayerInfo &pool_info, + const Window &window_out, + const int window_start_x, + const int window_end_x, + const int window_step_x) +{ + using vtype = wrapper::traits::neon_bitvector; + using vector_type = typename vtype::type; + using tag_type = typename vtype::tag_type; + + int pool_stride_x = static_cast(pool_info.stride.width); + int pool_stride_y = static_cast(pool_info.stride.height); + int pool_stride_z = static_cast(pool_info.stride.depth); + + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; + const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth; + + const int pool_pad_top = static_cast(pool_info.padding.top); + const int pool_pad_bottom = static_cast(pool_info.padding.bottom); + const int pool_pad_left = static_cast(pool_info.padding.left); + const int pool_pad_right = static_cast(pool_info.padding.right); + const int pool_pad_front = static_cast(pool_info.padding.front); + const int pool_pad_back = static_cast(pool_info.padding.back); + + const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const int upper_bound_d = src->info()->dimension(3) + (pool_info.exclude_padding ? 0 : pool_pad_back); + + const int input_dim_w = src->info()->dimension(1); + const int input_dim_h = src->info()->dimension(2); + const int input_dim_d = src->info()->dimension(3); + + const int y_stride = static_cast(src->info()->strides_in_bytes().y()); + const int z_stride = static_cast(src->info()->strides_in_bytes().z()); + const int w_stride = static_cast(src->info()->strides_in_bytes()[3]); + const int n_stride = static_cast(src->info()->strides_in_bytes()[4]); + + const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); + + Iterator out(dst0, window_out); + + vector_type vres; + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; + + const int pool_start_x = std::max(0, -in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + + const int pool_start_z = std::max(0, -in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); + + const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + + // Calculate scale + const float scale = + calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, 
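The averaging scale computed below is the reciprocal of the number of window elements the sum is divided by. A standalone per-axis sketch of that idea follows; it mirrors the intent of calculate_avg_scale_pool3d() but is not its exact implementation, and the function names are illustrative:

#include <algorithm>

// upper_bound is dim when padding is excluded, or dim + trailing padding when it is counted.
inline int taps_on_axis(int out_idx, int stride, int pad_before, int pool_size, int upper_bound, bool exclude_padding)
{
    int start     = out_idx * stride - pad_before;
    const int end = std::min(start + pool_size, upper_bound);
    if (exclude_padding)
    {
        start = std::max(start, 0); // only taps on real input contribute to the count
    }
    return end - start; // assumes a non-empty window
}

inline float avg_scale_3d(const int out[3], const int stride[3], const int pad[3],
                          const int pool[3], const int upper_bound[3], bool exclude_padding)
{
    float count = 1.f;
    for (int a = 0; a < 3; ++a)
    {
        count *= static_cast<float>(taps_on_axis(out[a], stride[a], pad[a], pool[a], upper_bound[a], exclude_padding));
    }
    return 1.f / count; // multiplier applied to the accumulated sum
}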
pool_size_z, + upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top, + pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z); + const vector_type scale_v = wrapper::vdup_n(static_cast(scale), tag_type()); + + int x_off = window_start_x; + + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C + { + // Perform pooling + vres = wrapper::vdup_n(static_cast(0.0f), tag_type()); + for (int z = pool_start_z; z < pool_end_z; ++z) + { + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) + { + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const vector_type data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); + vres = wrapper::vadd(vres, data); + } + } + } + + // Divide by scale + vres = wrapper::vmul(vres, scale_v); + + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, vres); + } + + // Left-overs loop + for (; x_off < window_end_x; ++x_off) + { + T res(0); + + for (int z = pool_start_z; z < pool_end_z; ++z) + { + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) + { + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const T data = *(reinterpret_cast(in_ptr_x) + x_off); + res += data; + } + } + } + + // Divide by scale + res *= scale; + + // Store result + *(reinterpret_cast(out.ptr()) + x_off) = res; + } + }, + out); +} + +template +void l2_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, + ITensor *dst0, + Pooling3dLayerInfo &pool_info, + const Window &window_out, + const int window_start_x, + const int window_end_x, + const int window_step_x) +{ + using vtype = wrapper::traits::neon_bitvector; + using vector_type = typename vtype::type; + using tag_type = typename vtype::tag_type; + + int pool_stride_x = static_cast(pool_info.stride.width); + int pool_stride_y = static_cast(pool_info.stride.height); + int pool_stride_z = static_cast(pool_info.stride.depth); + + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; + const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth; + + const int pool_pad_top = static_cast(pool_info.padding.top); + const int pool_pad_bottom = static_cast(pool_info.padding.bottom); + const int pool_pad_left = static_cast(pool_info.padding.left); + const int pool_pad_right = static_cast(pool_info.padding.right); + const int pool_pad_front = static_cast(pool_info.padding.front); + const int pool_pad_back = static_cast(pool_info.padding.back); + + const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const int upper_bound_d = src->info()->dimension(3) + (pool_info.exclude_padding ? 
0 : pool_pad_back); + + const int input_dim_w = src->info()->dimension(1); + const int input_dim_h = src->info()->dimension(2); + const int input_dim_d = src->info()->dimension(3); + + const int y_stride = static_cast(src->info()->strides_in_bytes().y()); + const int z_stride = static_cast(src->info()->strides_in_bytes().z()); + const int w_stride = static_cast(src->info()->strides_in_bytes()[3]); + const int n_stride = static_cast(src->info()->strides_in_bytes()[4]); + + const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); + + Iterator out(dst0, window_out); + + vector_type vres; + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; + + const int pool_start_x = std::max(0, -in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + + const int pool_start_z = std::max(0, -in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); + + const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + + // Calculate scale + const float scale = + calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, + upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top, + pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z); + + int x_off = window_start_x; + + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C + { + // Perform pooling + vres = wrapper::vdup_n(static_cast(0.0f), tag_type()); + for (int z = pool_start_z; z < pool_end_z; ++z) + { + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) + { + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const vector_type data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); + vres = wrapper::vmla(vres, data, data); + } + } + } + + const vector_type scale_v = wrapper::vdup_n(static_cast(scale), tag_type()); + + // Divide by scale + vres = wrapper::vmul(vres, scale_v); + + // Calculate square-root + vres = wrapper::vinv(wrapper::vinvsqrt(vres)); + + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, vres); + } + + // Left-overs loop + for (; x_off < window_end_x; ++x_off) + { + T res(0); + + for (int z = pool_start_z; z < pool_end_z; ++z) + { + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) + { + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + 
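A scalar reference (illustrative only) for the L2 pooling result computed below: out = sqrt(scale * sum(x^2)) over the valid window. The vector path reaches the same value with wrapper::vinv(wrapper::vinvsqrt(v)), i.e. sqrt(v) expressed through reciprocal estimates:

#include <cmath>
#include <vector>

inline float l2_pool_reference(const std::vector<float> &window_values, float scale)
{
    float sum_sq = 0.f;
    for (const float v : window_values)
    {
        sum_sq += v * v; // same accumulation as wrapper::vmla(vres, data, data)
    }
    return std::sqrt(sum_sq * scale); // vector path: vinv(vinvsqrt(sum_sq * scale))
}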
in_idx_width) * y_stride; + const T data = *(reinterpret_cast(in_ptr_x) + x_off); + res += data * data; + } + } + } + + // Divide by scale + res *= scale; + + // Square root + res = std::sqrt(res); + + // Store result + *(reinterpret_cast(out.ptr()) + x_off) = res; + } + }, + out); +} +} // namespace + template -void poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window); +void poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window) +{ + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + constexpr int window_step_x = 16 / sizeof(T); + Window window_out = window; + + // Needed to handle loop left-over + window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + + switch (pool_info.pool_type) + { + case PoolingType::MAX: + max_poolingMxNxD_fp_neon_ndhwc(src, dst0, pool_info, window_out, window_start_x, window_end_x, + window_step_x); + break; + case PoolingType::AVG: + avg_poolingMxNxD_fp_neon_ndhwc(src, dst0, pool_info, window_out, window_start_x, window_end_x, + window_step_x); + break; + case PoolingType::L2: + l2_poolingMxNxD_fp_neon_ndhwc(src, dst0, pool_info, window_out, window_start_x, window_end_x, + window_step_x); + break; + default: + ARM_COMPUTE_ERROR("Pool operation not supported"); + } +} template -void poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window); +void poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window) +{ + constexpr int window_step_x = 16; + Window window_out = window; + + // Needed to handle loop left-over + window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + + switch (pool_info.pool_type) + { + case PoolingType::MAX: + max_poolingMxNxD_q8_neon_ndhwc(src, dst0, pool_info, window_out, window_step_x); + break; + case PoolingType::AVG: + avg_poolingMxNxD_q8_neon_ndhwc(src, dst0, pool_info, window_out, window_step_x); + break; + default: + ARM_COMPUTE_ERROR("Pool operation not supported"); + } +} } // namespace cpu } // namespace arm_compute #endif //define SRC_CORE_POOLING_3D_LAYER_IMPL_H diff --git a/src/cpu/kernels/pool3d/neon/quantized.h b/src/cpu/kernels/pool3d/neon/quantized.h index ac14f5eafa14383f771c91d5f94d645ea1559310..8819907901b4e1a0db970b4979c2dc33cee93b1b 100644 --- a/src/cpu/kernels/pool3d/neon/quantized.h +++ b/src/cpu/kernels/pool3d/neon/quantized.h @@ -26,17 +26,18 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/PoolingHelpers.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { namespace cpu { template -void avg_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, - const int window_step_x) +void avg_poolingMxNxD_q8_neon_ndhwc( + const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, const int window_step_x) { using q8x8_t = typename wrapper::traits::neon_vector::type; @@ -89,144 +90,147 @@ void avg_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3d const float quant_rescale = dst_qinfo.scale / src_qinfo.scale; // "new_offset" doesn't have to consider the "half_scale_v" in its computation // With a requantization performed in a single step there won't be uncertainties 
introduced - const int32_t new_offset = dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / quant_rescale); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - // Computing the theoretical input starting/ending points - const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; - const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; - const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; + const int32_t new_offset = + dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / quant_rescale); - const int pool_start_x = std::max(0, -in_idx_width); - const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); - const int pool_start_y = std::max(0, -in_idx_height); - const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; - const int pool_start_z = std::max(0, -in_idx_depth); - const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + const int pool_start_x = std::max(0, -in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); - // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z - const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); - const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); - const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); + const int pool_start_z = std::max(0, -in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); - // Calculate scale - const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, - pool_pad_top, pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z); + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); - const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + // Calculate scale + const float scale = + calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, + upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top, + pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z); - int x_off = window_start_x; + const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C - { - q32x4_t vres1 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres2 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres3 = wrapper::vdup_n(static_cast(0.f), 
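An illustrative derivation of the single-step requantization used here (not part of this patch; rounding and saturation details differ from the NEON path): with quant_rescale = dst_scale / src_scale and scale = 1 / window_count, quantizing the real-valued average directly gives q_dst = round(sum_q * scale / quant_rescale) + (dst_offset - src_offset / quant_rescale), which is why new_offset needs no half_scale_v term:

#include <cmath>
#include <cstdint>

inline int32_t requantize_avg_reference(int32_t sum_q, int window_count,
                                        float src_scale, int32_t src_offset,
                                        float dst_scale, int32_t dst_offset)
{
    const float   scale         = 1.f / static_cast<float>(window_count);
    const float   quant_rescale = dst_scale / src_scale;
    const int32_t new_offset    = dst_offset - static_cast<int32_t>(static_cast<float>(src_offset) / quant_rescale);
    // Equivalent to quantize(sum_q, UniformQuantizationInfo(quant_rescale / scale, new_offset)).
    return static_cast<int32_t>(std::lround(static_cast<float>(sum_q) * scale / quant_rescale)) + new_offset;
}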
wrapper::traits::vector_128_tag{}); - q32x4_t vres4 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + int x_off = window_start_x; - // Perform pooling - for(int z = pool_start_z; z < pool_end_z; ++z) + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + q32x4_t vres1 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres2 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres3 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres4 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + + // Perform pooling + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const q8x16_t data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); - - const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data)); - const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data)); - vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16))); - vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16))); - vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16))); - vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16))); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const q8x16_t data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); + + const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data)); + const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data)); + vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16))); + vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16))); + vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16))); + vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16))); + } } } - } - if(src_qinfo != dst_qinfo) - { - const float32x4x4_t vres = + if (src_qinfo != dst_qinfo) { - { + const float32x4x4_t vres = {{ vcvtq_f32_q32(vres1), vcvtq_f32_q32(vres2), vcvtq_f32_q32(vres3), vcvtq_f32_q32(vres4), - } - }; - const auto requantized_dst = vrequantize_pooling_with_scale(vres, quant_rescale, scale, new_offset); - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst)); - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst)); - } - else - { - const float32x4_t scale_v = vdupq_n_f32(scale); - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - vres1 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v)); - vres2 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v)); - vres3 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v)); - vres4 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v)); - - const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2))); - const 
q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4))); - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, res1); - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off + 8, res2); + }}; + const auto requantized_dst = + vrequantize_pooling_with_scale(vres, quant_rescale, scale, new_offset); + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst)); + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst)); + } + else + { + const float32x4_t scale_v = vdupq_n_f32(scale); + // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero + vres1 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v)); + vres2 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v)); + vres3 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v)); + vres4 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v)); + + const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2))); + const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4))); + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, res1); + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off + 8, res2); + } } - } - - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - q32_t res = static_cast(0.f); - // Perform pooling - for(int z = pool_start_z; z < pool_end_z; ++z) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + q32_t res = static_cast(0.f); + + // Perform pooling + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const T data = *(reinterpret_cast(in_ptr_x) + x_off); - res += data; + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const T data = *(reinterpret_cast(in_ptr_x) + x_off); + res += data; + } } } - } - if(src_qinfo != dst_qinfo) - { - const float res_f = static_cast(res); - const float new_scale = quant_rescale / scale; - const auto requantized_dst = quantize(res_f, UniformQuantizationInfo(new_scale, new_offset)); + if (src_qinfo != dst_qinfo) + { + const float res_f = static_cast(res); + const float new_scale = quant_rescale / scale; + const auto requantized_dst = quantize(res_f, UniformQuantizationInfo(new_scale, new_offset)); - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = requantized_dst; - } - else - { - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - res = static_cast(0.5f + static_cast(res) * scale); + // Store result + *(reinterpret_cast(out.ptr()) + x_off) = requantized_dst; + } + else + { + // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero + res = static_cast(0.5f + static_cast(res) * scale); - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; + // Store result + *(reinterpret_cast(out.ptr()) + 
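A small worked example (illustrative) of the rounding trick used above: for non-negative values, truncating (x + 0.5f) gives round-to-nearest, whereas a plain integer cast truncates towards zero:

#include <cassert>

inline int truncate_cast(float x) { return static_cast<int>(x); }

int main()
{
    assert(truncate_cast(2.6f) == 2);        // plain cast: truncation rounds 2.6 down ...
    assert(truncate_cast(2.6f + 0.5f) == 3); // ... adding 0.5 first yields the nearest integer
    return 0;
}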
x_off) = res; + } } - } - }, - out); + }, + out); } template -void max_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, - const int window_step_x) +void max_poolingMxNxD_q8_neon_ndhwc( + const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, const int window_step_x) { using q8x8_t = typename wrapper::traits::neon_vector::type; @@ -266,125 +270,130 @@ void max_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3d const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform(); - const float requant_scale = dst_qinfo.scale / src_qinfo.scale; - const int32_t requant_offset = dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / requant_scale); - const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - // Computing the theoretical input starting/ending points - const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; - const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; - const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; + const float requant_scale = dst_qinfo.scale / src_qinfo.scale; + const int32_t requant_offset = + dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / requant_scale); + const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); - const int pool_start_x = std::max(0, -in_idx_width); - const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); - const int pool_start_y = std::max(0, -in_idx_height); - const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; - const int pool_start_z = std::max(0, -in_idx_depth); - const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + const int pool_start_x = std::max(0, -in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); - // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z - const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); - const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); - const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); + const int pool_start_z = std::max(0, -in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); - const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + const int 
pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); - int x_off = window_start_x; + const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C - { - q8x16_t vres = wrapper::vdup_n(std::numeric_limits::min(), wrapper::traits::vector_128_tag{}); + int x_off = window_start_x; - // Perform pooling - for(int z = pool_start_z; z < pool_end_z; ++z) + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + q8x16_t vres = wrapper::vdup_n(std::numeric_limits::min(), wrapper::traits::vector_128_tag{}); + + // Perform pooling + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const q8x16_t data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); - - vres = wrapper::vmax(vres, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const q8x16_t data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); + + vres = wrapper::vmax(vres, data); + } } } - } - - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, (src_qinfo != dst_qinfo) ? vrequantize_pooling(wrapper::vgetlow(vres), wrapper::vgethigh(vres), - requant_qinfo) : - vres); - } - // Leftovers using half the window step - for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x) - { - q8x8_t vres = wrapper::vdup_n(std::numeric_limits::min(), wrapper::traits::vector_64_tag{}); + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, + (src_qinfo != dst_qinfo) + ? vrequantize_pooling(wrapper::vgetlow(vres), + wrapper::vgethigh(vres), requant_qinfo) + : vres); + } - // Perform pooling - for(int z = pool_start_z; z < pool_end_z; ++z) + // Leftovers using half the window step + for (; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x) { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + q8x8_t vres = wrapper::vdup_n(std::numeric_limits::min(), wrapper::traits::vector_64_tag{}); + + // Perform pooling + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const q8x8_t data = wrapper::vload(reinterpret_cast(in_ptr_x) + x_off); - - vres = wrapper::vmax(vres, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const q8x8_t data = wrapper::vload(reinterpret_cast(in_ptr_x) + x_off); + + vres = wrapper::vmax(vres, data); + } } } - } - - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, - (src_qinfo != dst_qinfo) ? 
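An illustrative note on why the max-pooling path above requantizes only the final value when src_qinfo != dst_qinfo: an affine quantization q = round(x/s) + o is monotonic in x, so the maximum taken directly on quantized inputs corresponds to the maximum of the real values. A standalone sketch (names are illustrative, not the library's vrequantize_pooling implementation):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

inline float   dequant(int32_t q, float scale, int32_t offset) { return scale * static_cast<float>(q - offset); }
inline int32_t quant(float x, float scale, int32_t offset)     { return static_cast<int32_t>(std::lround(x / scale)) + offset; }

// window_q must be non-empty.
inline int32_t max_pool_requantized(const std::vector<int32_t> &window_q,
                                    float src_scale, int32_t src_offset,
                                    float dst_scale, int32_t dst_offset)
{
    const int32_t max_q = *std::max_element(window_q.begin(), window_q.end()); // max in the quantized domain
    return quant(dequant(max_q, src_scale, src_offset), dst_scale, dst_offset); // requantize once at the end
}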
vrequantize_pooling(vres, requant_qinfo) : vres); - } - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - T res = std::numeric_limits::min(); + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, + (src_qinfo != dst_qinfo) ? vrequantize_pooling(vres, requant_qinfo) : vres); + } - for(int z = pool_start_z; z < pool_end_z; ++z) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + T res = std::numeric_limits::min(); + + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const T data = *(reinterpret_cast(in_ptr_x) + x_off); - - res = std::max(res, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const T data = *(reinterpret_cast(in_ptr_x) + x_off); + + res = std::max(res, data); + } } } - } - // Store result - if(src_qinfo != dst_qinfo) - { - const float res_f = static_cast(res); - *(reinterpret_cast(out.ptr()) + x_off) = quantize(res_f, requant_qinfo); - } - else - { - *(reinterpret_cast(out.ptr()) + x_off) = res; + // Store result + if (src_qinfo != dst_qinfo) + { + const float res_f = static_cast(res); + *(reinterpret_cast(out.ptr()) + x_off) = quantize(res_f, requant_qinfo); + } + else + { + *(reinterpret_cast(out.ptr()) + x_off) = res; + } } - } - }, - out); + }, + out); } } // namespace cpu } // namespace arm_compute -#endif // SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H \ No newline at end of file +#endif // SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H diff --git a/src/cpu/kernels/range/generic/neon/fp16.cpp b/src/cpu/kernels/range/generic/neon/fp16.cpp index 5d50dce9075ced4c9e1c8c9316e3c9c74443ef88..505c18c27d1727f04d39fe191b63b5472abcf9a6 100644 --- a/src/cpu/kernels/range/generic/neon/fp16.cpp +++ b/src/cpu/kernels/range/generic/neon/fp16.cpp @@ -23,10 +23,10 @@ */ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -#include "src/cpu/kernels/range/generic/neon/impl.h" - #include "arm_compute/core/Helpers.h" + #include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/range/generic/neon/impl.h" namespace arm_compute { diff --git a/src/cpu/kernels/range/generic/neon/fp32.cpp b/src/cpu/kernels/range/generic/neon/fp32.cpp index 6044f0f886349adab042532329ba8105b20559ff..e5e472abb5bc606d00dd377b308408512b3c7e70 100644 --- a/src/cpu/kernels/range/generic/neon/fp32.cpp +++ b/src/cpu/kernels/range/generic/neon/fp32.cpp @@ -22,10 +22,10 @@ * SOFTWARE. */ -#include "src/cpu/kernels/range/generic/neon/impl.h" - #include "arm_compute/core/Helpers.h" + #include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/range/generic/neon/impl.h" namespace arm_compute { diff --git a/src/cpu/kernels/range/generic/neon/impl.cpp b/src/cpu/kernels/range/generic/neon/impl.cpp deleted file mode 100644 index f91251c9141573350e2a63a9d26dfd40b0188a1c..0000000000000000000000000000000000000000 --- a/src/cpu/kernels/range/generic/neon/impl.cpp +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/cpu/kernels/range/generic/neon/impl.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/common/Registrars.h" - -namespace arm_compute -{ -namespace cpu -{ -template -void neon_range_function(ITensor *output, float start, float step, const Window &window) -{ - /** SIMD vector tag type. */ - using ExactTagType = typename wrapper::traits::neon_bitvector::tag_type; - - const auto step_vec = wrapper::vdup_n(static_cast(step), ExactTagType{}); - const auto start_vec = wrapper::vdup_n(static_cast(start), ExactTagType{}); - auto id_vec = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const int window_step_x = 16 / sizeof(T); - - Window win{ window }; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator output_it(output, win); - - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - const auto out_ptr = reinterpret_cast(output_it.ptr()); - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - for(int count = 0; count < window_step_x; ++count) - { - id_vec = wrapper::vsetlane(static_cast(x + count), id_vec, count); - } - - // start + step * id - const auto res_vec = wrapper::vmla(start_vec, id_vec, step_vec); - wrapper::vstore(out_ptr + x, res_vec); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto res = start + x * step; - *(out_ptr + x) = res; - } - - }, - output_it); -} - -template void neon_range_function(ITensor *output, float start, float step, const Window &window); -template void neon_range_function(ITensor *output, float start, float step, const Window &window); -template void neon_range_function(ITensor *output, float start, float step, const Window &window); -template void neon_range_function(ITensor *output, float start, float step, const Window &window); -template void neon_range_function(ITensor *output, float start, float step, const Window &window); -template void neon_range_function(ITensor *output, float start, float step, const Window &window); -template void neon_range_function(ITensor *output, float start, float step, const Window &window); - -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -template void 
neon_range_function(ITensor *output, float start, float step, const Window &window); -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ - -} // namespace cpu -} // namespace arm_compute diff --git a/src/cpu/kernels/range/generic/neon/impl.h b/src/cpu/kernels/range/generic/neon/impl.h index 7ac2fc9c1124aeb8eb690457006de46c37b1bfee..f8c30d52a0d1f660877c6b5a8c43ede2766cd112 100644 --- a/src/cpu/kernels/range/generic/neon/impl.h +++ b/src/cpu/kernels/range/generic/neon/impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,18 +21,64 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef SRC_CORE_NEON_KERNELS_RANGE_IMPL_H -#define SRC_CORE_NEON_KERNELS_RANGE_IMPL_H +#ifndef ACL_SRC_CPU_KERNELS_RANGE_GENERIC_NEON_IMPL_H +#define ACL_SRC_CPU_KERNELS_RANGE_GENERIC_NEON_IMPL_H + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" + +#include "src/core/common/Registrars.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { -class ITensor; -class Window; - namespace cpu { template -void neon_range_function(ITensor *output, float start, float step, const Window &window); +void neon_range_function(ITensor *output, float start, float step, const Window &window) +{ + /** SIMD vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_bitvector::tag_type; + + const auto step_vec = wrapper::vdup_n(static_cast(step), ExactTagType{}); + const auto start_vec = wrapper::vdup_n(static_cast(start), ExactTagType{}); + auto id_vec = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const int window_step_x = 16 / sizeof(T); + + Window win{window}; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator output_it(output, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + int x = window_start_x; + const auto out_ptr = reinterpret_cast(output_it.ptr()); + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + for (int count = 0; count < window_step_x; ++count) + { + id_vec = wrapper::vsetlane(static_cast(x + count), id_vec, count); + } + + // start + step * id + const auto res_vec = wrapper::vmla(start_vec, id_vec, step_vec); + wrapper::vstore(out_ptr + x, res_vec); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto res = start + x * step; + *(out_ptr + x) = res; + } + }, + output_it); +} } // namespace cpu } // namespace arm_compute -#endif //SRC_CORE_NEON_KERNELS_RANGE_IMPL_H +#endif // ACL_SRC_CPU_KERNELS_RANGE_GENERIC_NEON_IMPL_H diff --git a/src/cpu/kernels/range/list.h b/src/cpu/kernels/range/list.h index 25d52bfe7f51dfec03a3d81f8bd67ab02e59d310..cade91e8ddaaaae405d462562e1cd5a131bf2050 100644 --- a/src/cpu/kernels/range/list.h +++ b/src/cpu/kernels/range/list.h @@ -28,8 +28,7 @@ namespace arm_compute { namespace cpu { -#define DECLARE_RANGE_KERNEL(func_name) \ - void func_name(ITensor *output, float start, float step, const Window &window) +#define DECLARE_RANGE_KERNEL(func_name) void func_name(ITensor *output, float start, float step, const Window &window) DECLARE_RANGE_KERNEL(fp16_neon_range_function); DECLARE_RANGE_KERNEL(fp32_neon_range_function); diff --git a/src/cpu/kernels/roialign/generic/neon/fp16.cpp b/src/cpu/kernels/roialign/generic/neon/fp16.cpp index 
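A scalar reference (illustrative) for neon_range_function above: element i of the output is simply start + i * step; the vector path fills a lane-index vector with vsetlane and applies a single fused multiply-add per 16 bytes of output:

#include <cstddef>
#include <vector>

template <typename T>
std::vector<T> range_reference(float start, float step, std::size_t num_elements)
{
    std::vector<T> out(num_elements);
    for (std::size_t i = 0; i < num_elements; ++i)
    {
        out[i] = static_cast<T>(start + static_cast<float>(i) * step); // start + step * id
    }
    return out;
}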
c265d5d4eb410c45f9d4e4a0c4fc9cf3438b3051..cf998305622bf7624c615c416862ff15142cc661 100644 --- a/src/cpu/kernels/roialign/generic/neon/fp16.cpp +++ b/src/cpu/kernels/roialign/generic/neon/fp16.cpp @@ -29,7 +29,12 @@ namespace arm_compute { namespace cpu { -void neon_fp16_roialign(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info) +void neon_fp16_roialign(const ITensor *input, + ITensor *output, + const ITensor *rois, + ROIPoolingLayerInfo pool_info, + const Window &window, + const ThreadInfo &info) { return roi_align(input, output, rois, pool_info, window, info); } diff --git a/src/cpu/kernels/roialign/generic/neon/fp32.cpp b/src/cpu/kernels/roialign/generic/neon/fp32.cpp index 51355aaef0ca2b982bf3a82d2d2a07a9271d432f..c1dba99b5e06fe748a46c13a1b0717c0e8c1ad39 100644 --- a/src/cpu/kernels/roialign/generic/neon/fp32.cpp +++ b/src/cpu/kernels/roialign/generic/neon/fp32.cpp @@ -26,7 +26,12 @@ namespace arm_compute { namespace cpu { -void neon_fp32_roialign(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info) +void neon_fp32_roialign(const ITensor *input, + ITensor *output, + const ITensor *rois, + ROIPoolingLayerInfo pool_info, + const Window &window, + const ThreadInfo &info) { return roi_align(input, output, rois, pool_info, window, info); } diff --git a/src/cpu/kernels/roialign/generic/neon/impl.cpp b/src/cpu/kernels/roialign/generic/neon/impl.cpp deleted file mode 100644 index 630d649abaef12dd6f8dfea74bc00fe9f90703a1..0000000000000000000000000000000000000000 --- a/src/cpu/kernels/roialign/generic/neon/impl.cpp +++ /dev/null @@ -1,319 +0,0 @@ -/* - * Copyright (c) 2019-2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/cpu/kernels/roialign/generic/neon/impl.h" - -#include "src/core/NEON/INEKernel.h" -#include "src/cpu/CpuTypes.h" - -namespace arm_compute -{ -namespace cpu -{ -/** Average pooling over an aligned window */ -template -inline input_data_type roi_align_1x1(const ITensor *input, - unsigned int roi_batch, - float region_start_x, - float bin_size_x, - int grid_size_x, - float region_end_x, - float region_start_y, - float bin_size_y, - int grid_size_y, - float region_end_y, - int pz) -{ - if((region_end_x <= region_start_x) || (region_end_y <= region_start_y)) - { - return input_data_type(0); - } - else - { - const DataLayout data_layout = input->info()->data_layout(); - float avg = 0; - // Iterate through the aligned pooling region - for(int iy = 0; iy < grid_size_y; ++iy) - { - for(int ix = 0; ix < grid_size_x; ++ix) - { - // Align the window in the middle of every bin - float y = region_start_y + (iy + 0.5) * bin_size_y / float(grid_size_y); - float x = region_start_x + (ix + 0.5) * bin_size_x / float(grid_size_x); - - // Interpolation in the [0,0] [0,1] [1,0] [1,1] square - const int y_low = y; - const int x_low = x; - const int y_high = y_low + 1; - const int x_high = x_low + 1; - - const float ly = y - y_low; - const float lx = x - x_low; - const float hy = 1. - ly; - const float hx = 1. - lx; - - const float w1 = hy * hx; - const float w2 = hy * lx; - const float w3 = ly * hx; - const float w4 = ly * lx; - if(data_layout == DataLayout::NCHW) - { - const auto data1 = *reinterpret_cast(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))); - const auto data2 = *reinterpret_cast(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))); - const auto data3 = *reinterpret_cast(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))); - const auto data4 = *reinterpret_cast(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))); - avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; - } - else - { - const auto data1 = *reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))); - const auto data2 = *reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))); - const auto data3 = *reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))); - const auto data4 = *reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))); - avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; - } - } - } - - avg /= grid_size_x * grid_size_y; - return input_data_type(avg); - } -} - -/** Average pooling over an aligned window */ -template -inline input_data_type roi_align_1x1_qasymm8(const ITensor *input, - unsigned int roi_batch, - float region_start_x, - float bin_size_x, - int grid_size_x, - float region_end_x, - float region_start_y, - float bin_size_y, - int grid_size_y, - float region_end_y, - int pz, - const QuantizationInfo &out_qinfo) -{ - if((region_end_x <= region_start_x) || (region_end_y <= region_start_y)) - { - return input_data_type(out_qinfo.uniform().offset); - } - else - { - float avg = 0; - const UniformQuantizationInfo input_qinfo = input->info()->quantization_info().uniform(); - const bool is_qasymm_signed = is_data_type_quantized_asymmetric_signed(input->info()->data_type()); - const DataLayout data_layout = input->info()->data_layout(); - - // Iterate through the aligned pooling region - for(int iy = 0; iy < grid_size_y; ++iy) - { - for(int ix = 0; ix < grid_size_x; ++ix) - { - // Align the window in the middle of every bin - float 
y = region_start_y + (iy + 0.5) * bin_size_y / float(grid_size_y); - float x = region_start_x + (ix + 0.5) * bin_size_x / float(grid_size_x); - - // Interpolation in the [0,0] [0,1] [1,0] [1,1] square - const int y_low = y; - const int x_low = x; - const int y_high = y_low + 1; - const int x_high = x_low + 1; - - const float ly = y - y_low; - const float lx = x - x_low; - const float hy = 1. - ly; - const float hx = 1. - lx; - - const float w1 = hy * hx; - const float w2 = hy * lx; - const float w3 = ly * hx; - const float w4 = ly * lx; - - if(data_layout == DataLayout::NCHW) - { - if(is_qasymm_signed) - { - float data1 = dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))), input_qinfo); - float data2 = dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))), input_qinfo); - float data3 = dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))), input_qinfo); - float data4 = dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))), input_qinfo); - avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; - } - else - { - float data1 = dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))), input_qinfo); - float data2 = dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))), input_qinfo); - float data3 = dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))), input_qinfo); - float data4 = dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))), input_qinfo); - avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; - } - } - else - { - if(is_qasymm_signed) - { - const auto data1 = dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))), input_qinfo); - const auto data2 = dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))), input_qinfo); - const auto data3 = dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))), input_qinfo); - const auto data4 = dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))), input_qinfo); - avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; - } - else - { - const auto data1 = dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))), input_qinfo); - const auto data2 = dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))), input_qinfo); - const auto data3 = dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))), input_qinfo); - const auto data4 = dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))), input_qinfo); - avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; - } - } - } - } - - avg /= grid_size_x * grid_size_y; - - input_data_type res = 0; - if(is_qasymm_signed) - { - res = quantize_qasymm8_signed(avg, out_qinfo); - } - else - { - res = quantize_qasymm8(avg, out_qinfo); - } - return res; - } -} -inline float compute_region_coordinate(int p, float bin_size, float roi_anchor, float max_value) -{ - const float region_start = p * bin_size 
+ roi_anchor; - return utility::clamp(region_start, 0.0f, max_value); -} - -template -void roi_align(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - - const DataLayout data_layout = input->info()->data_layout(); - const size_t values_per_roi = rois->info()->dimension(0); - - const int roi_list_start = window.x().start(); - const int roi_list_end = window.x().end(); - - const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const unsigned int idx_depth = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - - const int input_width = input->info()->dimension(idx_width); - const int input_height = input->info()->dimension(idx_height); - const int input_chanels = input->info()->dimension(idx_depth); - const int pooled_w = pool_info.pooled_width(); - const int pooled_h = pool_info.pooled_height(); - - const DataType data_type = input->info()->data_type(); - const bool is_qasymm = is_data_type_quantized_asymmetric(data_type); - - const auto *rois_ptr = reinterpret_cast(rois->buffer()); - const QuantizationInfo &rois_qinfo = rois->info()->quantization_info(); - for(int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx) - { - const unsigned int roi_batch = rois_ptr[values_per_roi * roi_indx]; - - roi_data_type qx1 = rois_ptr[values_per_roi * roi_indx + 1]; - roi_data_type qy1 = rois_ptr[values_per_roi * roi_indx + 2]; - roi_data_type qx2 = rois_ptr[values_per_roi * roi_indx + 3]; - roi_data_type qy2 = rois_ptr[values_per_roi * roi_indx + 4]; - float x1(qx1); - float x2(qx2); - float y1(qy1); - float y2(qy2); - if(is_qasymm) - { - x1 = dequantize_qasymm16(qx1, rois_qinfo); - x2 = dequantize_qasymm16(qx2, rois_qinfo); - y1 = dequantize_qasymm16(qy1, rois_qinfo); - y2 = dequantize_qasymm16(qy2, rois_qinfo); - } - const float roi_anchor_x = x1 * pool_info.spatial_scale(); - const float roi_anchor_y = y1 * pool_info.spatial_scale(); - const float roi_dims_x = std::max((x2 - x1) * pool_info.spatial_scale(), 1.0f); - const float roi_dims_y = std::max((y2 - y1) * pool_info.spatial_scale(), 1.0f); - float bin_size_x = roi_dims_x / pool_info.pooled_width(); - float bin_size_y = roi_dims_y / pool_info.pooled_height(); - - // Iterate through all feature maps - for(int ch = 0; ch < input_chanels; ++ch) - { - // Iterate through all output pixels - for(int py = 0; py < pooled_h; ++py) - { - for(int px = 0; px < pooled_w; ++px) - { - const float region_start_x = compute_region_coordinate(px, bin_size_x, roi_anchor_x, input_width); - const float region_start_y = compute_region_coordinate(py, bin_size_y, roi_anchor_y, input_height); - const float region_end_x = compute_region_coordinate(px + 1, bin_size_x, roi_anchor_x, input_width); - const float region_end_y = compute_region_coordinate(py + 1, bin_size_y, roi_anchor_y, input_height); - const int roi_bin_grid_x = (pool_info.sampling_ratio() > 0) ? pool_info.sampling_ratio() : int(ceil(bin_size_x)); - const int roi_bin_grid_y = (pool_info.sampling_ratio() > 0) ? 
pool_info.sampling_ratio() : int(ceil(bin_size_y)); - input_data_type out_val(0); - if(is_qasymm) - { - out_val = roi_align_1x1_qasymm8( - input, roi_batch, region_start_x, bin_size_x, - roi_bin_grid_x, region_end_x, region_start_y, bin_size_y, - roi_bin_grid_y, region_end_y, ch, output->info()->quantization_info()); - } - else - { - out_val = roi_align_1x1( - input, roi_batch, region_start_x, bin_size_x, - roi_bin_grid_x, region_end_x, region_start_y, bin_size_y, - roi_bin_grid_y, region_end_y, ch); - } - - if(data_layout == DataLayout::NCHW) - { - auto out_ptr = reinterpret_cast(output->ptr_to_element(Coordinates(px, py, ch, roi_indx))); - *out_ptr = out_val; - } - else - { - auto out_ptr = reinterpret_cast(output->ptr_to_element(Coordinates(ch, px, py, roi_indx))); - *out_ptr = out_val; - } - } - } - } - } -} -template void roi_align(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info); -template void roi_align(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info); -template void roi_align(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info); -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -template void roi_align(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info); -#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -} // namespace cpu -} // namespace arm_compute diff --git a/src/cpu/kernels/roialign/generic/neon/impl.h b/src/cpu/kernels/roialign/generic/neon/impl.h index ff96b8edab0a7f9b513181af03842f28ca2f27e3..db2f67705d1e30bbf32396f218d7a0b19132ad02 100644 --- a/src/cpu/kernels/roialign/generic/neon/impl.h +++ b/src/cpu/kernels/roialign/generic/neon/impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2019-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -32,9 +32,349 @@ class ITensor; class Window; namespace cpu { +/** Average pooling over an aligned window */ +template +inline input_data_type roi_align_1x1(const ITensor *input, + unsigned int roi_batch, + float region_start_x, + float bin_size_x, + int grid_size_x, + float region_end_x, + float region_start_y, + float bin_size_y, + int grid_size_y, + float region_end_y, + int pz) +{ + if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y)) + { + return input_data_type(0); + } + else + { + const DataLayout data_layout = input->info()->data_layout(); + float avg = 0; + // Iterate through the aligned pooling region + for (int iy = 0; iy < grid_size_y; ++iy) + { + for (int ix = 0; ix < grid_size_x; ++ix) + { + // Align the window in the middle of every bin + float y = region_start_y + (iy + 0.5) * bin_size_y / float(grid_size_y); + float x = region_start_x + (ix + 0.5) * bin_size_x / float(grid_size_x); + + // Interpolation in the [0,0] [0,1] [1,0] [1,1] square + const int y_low = y; + const int x_low = x; + const int y_high = y_low + 1; + const int x_high = x_low + 1; + + const float ly = y - y_low; + const float lx = x - x_low; + const float hy = 1. - ly; + const float hx = 1. 
- lx; + + const float w1 = hy * hx; + const float w2 = hy * lx; + const float w3 = ly * hx; + const float w4 = ly * lx; + if (data_layout == DataLayout::NCHW) + { + const auto data1 = *reinterpret_cast( + input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))); + const auto data2 = *reinterpret_cast( + input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))); + const auto data3 = *reinterpret_cast( + input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))); + const auto data4 = *reinterpret_cast( + input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))); + avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; + } + else + { + const auto data1 = *reinterpret_cast( + input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))); + const auto data2 = *reinterpret_cast( + input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))); + const auto data3 = *reinterpret_cast( + input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))); + const auto data4 = *reinterpret_cast( + input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))); + avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; + } + } + } + + avg /= grid_size_x * grid_size_y; + return input_data_type(avg); + } +} + +/** Average pooling over an aligned window */ +template +inline input_data_type roi_align_1x1_qasymm8(const ITensor *input, + unsigned int roi_batch, + float region_start_x, + float bin_size_x, + int grid_size_x, + float region_end_x, + float region_start_y, + float bin_size_y, + int grid_size_y, + float region_end_y, + int pz, + const QuantizationInfo &out_qinfo) +{ + if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y)) + { + return input_data_type(out_qinfo.uniform().offset); + } + else + { + float avg = 0; + const UniformQuantizationInfo input_qinfo = input->info()->quantization_info().uniform(); + const bool is_qasymm_signed = is_data_type_quantized_asymmetric_signed(input->info()->data_type()); + const DataLayout data_layout = input->info()->data_layout(); + + // Iterate through the aligned pooling region + for (int iy = 0; iy < grid_size_y; ++iy) + { + for (int ix = 0; ix < grid_size_x; ++ix) + { + // Align the window in the middle of every bin + float y = region_start_y + (iy + 0.5) * bin_size_y / float(grid_size_y); + float x = region_start_x + (ix + 0.5) * bin_size_x / float(grid_size_x); + + // Interpolation in the [0,0] [0,1] [1,0] [1,1] square + const int y_low = y; + const int x_low = x; + const int y_high = y_low + 1; + const int x_high = x_low + 1; + + const float ly = y - y_low; + const float lx = x - x_low; + const float hy = 1. - ly; + const float hx = 1. 
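// Illustrative sketch, not part of the patch: a scalar model of the bilinear
// weighting that roi_align_1x1 above applies to each sampling point. `img` is
// a hypothetical row-major H x W float buffer; the caller must keep (x, y) at
// least one pixel away from the right/bottom edge so all four neighbours exist.
#include <vector>

static float bilinear_sample(const std::vector<float> &img, int width, float x, float y)
{
    const int   x_low = static_cast<int>(x);
    const int   y_low = static_cast<int>(y);
    const float lx    = x - x_low;
    const float ly    = y - y_low;
    const float hx    = 1.0f - lx;
    const float hy    = 1.0f - ly;
    // Same four weights as w1..w4 in the kernel: opposite-corner areas of the unit square.
    return hy * hx * img[y_low * width + x_low] +            // w1 * data1
           hy * lx * img[y_low * width + x_low + 1] +        // w2 * data2
           ly * hx * img[(y_low + 1) * width + x_low] +      // w3 * data3
           ly * lx * img[(y_low + 1) * width + x_low + 1];   // w4 * data4
}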
- lx; + + const float w1 = hy * hx; + const float w2 = hy * lx; + const float w3 = ly * hx; + const float w4 = ly * lx; + + if (data_layout == DataLayout::NCHW) + { + if (is_qasymm_signed) + { + float data1 = + dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element( + Coordinates(x_low, y_low, pz, roi_batch))), + input_qinfo); + float data2 = + dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element( + Coordinates(x_high, y_low, pz, roi_batch))), + input_qinfo); + float data3 = + dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element( + Coordinates(x_low, y_high, pz, roi_batch))), + input_qinfo); + float data4 = + dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element( + Coordinates(x_high, y_high, pz, roi_batch))), + input_qinfo); + avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; + } + else + { + float data1 = + dequantize_qasymm8(*reinterpret_cast( + input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))), + input_qinfo); + float data2 = + dequantize_qasymm8(*reinterpret_cast( + input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))), + input_qinfo); + float data3 = + dequantize_qasymm8(*reinterpret_cast( + input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))), + input_qinfo); + float data4 = + dequantize_qasymm8(*reinterpret_cast( + input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))), + input_qinfo); + avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; + } + } + else + { + if (is_qasymm_signed) + { + const auto data1 = + dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element( + Coordinates(pz, x_low, y_low, roi_batch))), + input_qinfo); + const auto data2 = + dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element( + Coordinates(pz, x_high, y_low, roi_batch))), + input_qinfo); + const auto data3 = + dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element( + Coordinates(pz, x_low, y_high, roi_batch))), + input_qinfo); + const auto data4 = + dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element( + Coordinates(pz, x_high, y_high, roi_batch))), + input_qinfo); + avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; + } + else + { + const auto data1 = + dequantize_qasymm8(*reinterpret_cast( + input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))), + input_qinfo); + const auto data2 = + dequantize_qasymm8(*reinterpret_cast( + input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))), + input_qinfo); + const auto data3 = + dequantize_qasymm8(*reinterpret_cast( + input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))), + input_qinfo); + const auto data4 = + dequantize_qasymm8(*reinterpret_cast( + input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))), + input_qinfo); + avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; + } + } + } + } + + avg /= grid_size_x * grid_size_y; + + input_data_type res = 0; + if (is_qasymm_signed) + { + res = quantize_qasymm8_signed(avg, out_qinfo); + } + else + { + res = quantize_qasymm8(avg, out_qinfo); + } + return res; + } +} +inline float compute_region_coordinate(int p, float bin_size, float roi_anchor, float max_value) +{ + const float region_start = p * bin_size + roi_anchor; + return utility::clamp(region_start, 0.0f, max_value); +} + template -void roi_align(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info); +void roi_align(const ITensor *input, + ITensor *output, + const 
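// Illustrative sketch, not part of the patch: the dequantize -> accumulate ->
// requantize pattern of roi_align_1x1_qasymm8 above, using a plain affine
// (scale, offset) model in place of ACL's UniformQuantizationInfo and
// quantize_qasymm8 helpers (their exact rounding/clamping may differ).
#include <algorithm>
#include <cmath>
#include <cstdint>

struct QParams
{
    float   scale;
    int32_t offset;
};

static float dequantize_u8(uint8_t q, QParams p)
{
    return (static_cast<int32_t>(q) - p.offset) * p.scale;
}

static uint8_t quantize_u8(float v, QParams p)
{
    const int32_t q = static_cast<int32_t>(std::lround(v / p.scale)) + p.offset;
    return static_cast<uint8_t>(std::min<int32_t>(std::max<int32_t>(q, 0), 255));
}

// Weighted samples are averaged in float and requantized once at the end,
// exactly as the kernel accumulates w1*data1 + ... + w4*data4 into `avg`.
static uint8_t average_requantized(const uint8_t *q, int n, QParams in, QParams out)
{
    float acc = 0.0f;
    for (int i = 0; i < n; ++i)
    {
        acc += dequantize_u8(q[i], in);
    }
    return quantize_u8(acc / n, out);
}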
ITensor *rois, + ROIPoolingLayerInfo pool_info, + const Window &window, + const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + + const DataLayout data_layout = input->info()->data_layout(); + const size_t values_per_roi = rois->info()->dimension(0); + + const int roi_list_start = window.x().start(); + const int roi_list_end = window.x().end(); + + const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const unsigned int idx_depth = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + const int input_width = input->info()->dimension(idx_width); + const int input_height = input->info()->dimension(idx_height); + const int input_chanels = input->info()->dimension(idx_depth); + const int pooled_w = pool_info.pooled_width(); + const int pooled_h = pool_info.pooled_height(); + + const DataType data_type = input->info()->data_type(); + const bool is_qasymm = is_data_type_quantized_asymmetric(data_type); + + const auto *rois_ptr = reinterpret_cast(rois->buffer()); + const QuantizationInfo &rois_qinfo = rois->info()->quantization_info(); + for (int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx) + { + const unsigned int roi_batch = rois_ptr[values_per_roi * roi_indx]; + + roi_data_type qx1 = rois_ptr[values_per_roi * roi_indx + 1]; + roi_data_type qy1 = rois_ptr[values_per_roi * roi_indx + 2]; + roi_data_type qx2 = rois_ptr[values_per_roi * roi_indx + 3]; + roi_data_type qy2 = rois_ptr[values_per_roi * roi_indx + 4]; + float x1(qx1); + float x2(qx2); + float y1(qy1); + float y2(qy2); + if (is_qasymm) + { + x1 = dequantize_qasymm16(qx1, rois_qinfo); + x2 = dequantize_qasymm16(qx2, rois_qinfo); + y1 = dequantize_qasymm16(qy1, rois_qinfo); + y2 = dequantize_qasymm16(qy2, rois_qinfo); + } + const float roi_anchor_x = x1 * pool_info.spatial_scale(); + const float roi_anchor_y = y1 * pool_info.spatial_scale(); + const float roi_dims_x = std::max((x2 - x1) * pool_info.spatial_scale(), 1.0f); + const float roi_dims_y = std::max((y2 - y1) * pool_info.spatial_scale(), 1.0f); + float bin_size_x = roi_dims_x / pool_info.pooled_width(); + float bin_size_y = roi_dims_y / pool_info.pooled_height(); + + // Iterate through all feature maps + for (int ch = 0; ch < input_chanels; ++ch) + { + // Iterate through all output pixels + for (int py = 0; py < pooled_h; ++py) + { + for (int px = 0; px < pooled_w; ++px) + { + const float region_start_x = compute_region_coordinate(px, bin_size_x, roi_anchor_x, input_width); + const float region_start_y = compute_region_coordinate(py, bin_size_y, roi_anchor_y, input_height); + const float region_end_x = compute_region_coordinate(px + 1, bin_size_x, roi_anchor_x, input_width); + const float region_end_y = + compute_region_coordinate(py + 1, bin_size_y, roi_anchor_y, input_height); + const int roi_bin_grid_x = + (pool_info.sampling_ratio() > 0) ? pool_info.sampling_ratio() : int(ceil(bin_size_x)); + const int roi_bin_grid_y = + (pool_info.sampling_ratio() > 0) ? 
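// Illustrative sketch, not part of the patch: how the driver loop above turns
// one ROI into per-bin geometry. Names are local to the sketch; pooled_w/h and
// sampling_ratio stand for the ROIPoolingLayerInfo fields used by roi_align.
#include <algorithm>
#include <cmath>

struct BinGrid
{
    float bin_w;
    float bin_h;
    int   grid_x;
    int   grid_y;
};

static BinGrid roi_bin_grid(
    float x1, float y1, float x2, float y2, float spatial_scale, int pooled_w, int pooled_h, int sampling_ratio)
{
    // Scale the ROI into feature-map coordinates and clamp it to a minimum extent of 1.
    const float roi_w = std::max((x2 - x1) * spatial_scale, 1.0f);
    const float roi_h = std::max((y2 - y1) * spatial_scale, 1.0f);
    BinGrid g;
    g.bin_w  = roi_w / pooled_w;
    g.bin_h  = roi_h / pooled_h;
    // A fixed sampling grid when requested, otherwise ceil(bin size) points per axis.
    g.grid_x = (sampling_ratio > 0) ? sampling_ratio : static_cast<int>(std::ceil(g.bin_w));
    g.grid_y = (sampling_ratio > 0) ? sampling_ratio : static_cast<int>(std::ceil(g.bin_h));
    return g;
}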
pool_info.sampling_ratio() : int(ceil(bin_size_y)); + input_data_type out_val(0); + if (is_qasymm) + { + out_val = roi_align_1x1_qasymm8( + input, roi_batch, region_start_x, bin_size_x, roi_bin_grid_x, region_end_x, region_start_y, + bin_size_y, roi_bin_grid_y, region_end_y, ch, output->info()->quantization_info()); + } + else + { + out_val = roi_align_1x1(input, roi_batch, region_start_x, bin_size_x, + roi_bin_grid_x, region_end_x, region_start_y, + bin_size_y, roi_bin_grid_y, region_end_y, ch); + } + if (data_layout == DataLayout::NCHW) + { + auto out_ptr = reinterpret_cast( + output->ptr_to_element(Coordinates(px, py, ch, roi_indx))); + *out_ptr = out_val; + } + else + { + auto out_ptr = reinterpret_cast( + output->ptr_to_element(Coordinates(ch, px, py, roi_indx))); + *out_ptr = out_val; + } + } + } + } + } +} } // namespace cpu } // namespace arm_compute #endif //define SRC_CORE_SVE_KERNELS_BOUNDINGBOXTRANFORM_IMPL_H diff --git a/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp b/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp index d6bd9a95cee3991c977204a7f3e610f48d600a4f..11c5770f53bd2922af813e06c60d55f2fbde2ad2 100644 --- a/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp @@ -26,7 +26,12 @@ namespace arm_compute { namespace cpu { -void neon_qu8_roialign(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info) +void neon_qu8_roialign(const ITensor *input, + ITensor *output, + const ITensor *rois, + ROIPoolingLayerInfo pool_info, + const Window &window, + const ThreadInfo &info) { return roi_align(input, output, rois, pool_info, window, info); } diff --git a/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp index a839581afffa7d4be546ea243cc3792a02cd5caa..7f93cc87b3b6536bdf8c13760f696eb28a0c7367 100644 --- a/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp @@ -26,7 +26,12 @@ namespace arm_compute { namespace cpu { -void neon_qs8_roialign(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info) +void neon_qs8_roialign(const ITensor *input, + ITensor *output, + const ITensor *rois, + ROIPoolingLayerInfo pool_info, + const Window &window, + const ThreadInfo &info) { return roi_align(input, output, rois, pool_info, window, info); } diff --git a/src/cpu/kernels/roialign/list.h b/src/cpu/kernels/roialign/list.h index 1c71b0248804bf176d2e75c5ccc7611fcc5df2b9..fdb3c0050db9476a65adff3d6e427ad7eecf3984 100644 --- a/src/cpu/kernels/roialign/list.h +++ b/src/cpu/kernels/roialign/list.h @@ -27,9 +27,9 @@ namespace arm_compute { namespace cpu { -#define DECLARE_ROIALIGN_KERNEL(func_name) \ - void func_name(const ITensor *input, ITensor *output, const ITensor *rois, \ - ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info) +#define DECLARE_ROIALIGN_KERNEL(func_name) \ + void func_name(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, \ + const Window &window, const ThreadInfo &info) DECLARE_ROIALIGN_KERNEL(neon_fp32_roialign); DECLARE_ROIALIGN_KERNEL(neon_fp16_roialign); DECLARE_ROIALIGN_KERNEL(neon_qu8_roialign); diff --git a/src/cpu/kernels/scale/neon/fp16.cpp b/src/cpu/kernels/scale/neon/fp16.cpp index 895f42215e4e976cfad80a0a3afefee131c8b6a7..c8a7b7038ea06a25362dcb0b904d4c1cbf945ff7 
100644 --- a/src/cpu/kernels/scale/neon/fp16.cpp +++ b/src/cpu/kernels/scale/neon/fp16.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -27,10 +27,12 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" + +#include "src/core/helpers/ScaleHelpers.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" #include "src/core/utils/ScaleUtils.h" +#include "src/cpu/kernels/scale/neon/list.h" #include "support/Rounding.h" #include @@ -41,8 +43,12 @@ namespace arm_compute { namespace { -void fp16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) +void fp16_neon_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) { const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; @@ -62,33 +68,46 @@ void fp16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *of const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - int32_t x = window_start_x; - const float16_t *in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - - for(; x <= window_end_x - window_step_x; x += window_step_x) - { - wrapper::vstore(reinterpret_cast(out.ptr()) + x, - wrapper::vloadq(in_ptr + offset + offset_row + x)); - } - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &id) { - *(reinterpret_cast(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); - } - }, - out); + const int32_t offset = + *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast( + align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + int32_t x = window_start_x; + const float16_t *in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); + + for (; x <= window_end_x - window_step_x; x += window_step_x) + { + wrapper::vstore(reinterpret_cast(out.ptr()) + x, + wrapper::vloadq(in_ptr + offset + offset_row + x)); + } + for (; x < window_end_x; ++x) + { + *(reinterpret_cast(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); + } + }, + out); } -void fp16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void fp16_neon_scale_bilinear(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + const auto hr = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); Iterator out(dst, window); const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; @@ -103,73 +122,155 @@ void fp16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *o win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); Iterator in(src, win_in); - if(border_mode == BorderMode::CONSTANT) + if (border_mode == BorderMode::CONSTANT) { using ConstType = typename std::conditional::value, half, float16_t>::type; const float16_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - const float16_t *in_ptr = reinterpret_cast(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; - - const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; - const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value; - const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; - const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? 
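// Illustrative sketch, not part of the patch: the source-row selection used by
// the *_neon_scale_nearest kernels above. std::round stands in here for
// utils::rounding::round_half_away_from_zero (both round halves away from zero).
#include <cmath>

static int nearest_src_index(int dst_idx, float ratio, float sampling_offset, bool align_corners)
{
    const float src = (dst_idx + sampling_offset) * ratio;
    return static_cast<int>(align_corners ? std::round(src) : std::floor(src));
}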
*(in_ptr + in_stride_c + in_stride_wc) : const_border_value; - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto offset = + *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + const float16_t *in_ptr = + reinterpret_cast(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; + + const auto a00 = + (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; + const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) + ? *(in_ptr + in_stride_c) + : const_border_value; + const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) + ? *(in_ptr + in_stride_wc) + : const_border_value; + const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) + ? *(in_ptr + in_stride_c + in_stride_wc) + : const_border_value; + + *reinterpret_cast(out.ptr()) = + static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); } - else if(border_mode == BorderMode::REPLICATE) + else if (border_mode == BorderMode::REPLICATE) { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - - auto clamped_w = utility::clamp(offset, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(offset + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(in_hi, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(in_hi + 1, 0, in_dim_h - 1); - - const auto a00 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); - const auto a01 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); - const auto a10 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); - const auto a11 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto offset = + *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + + auto clamped_w = utility::clamp(offset, 0, in_dim_w - 1); + auto clamped_w1 = utility::clamp(offset + 1, 0, in_dim_w - 1); + auto clamped_h = utility::clamp(in_hi, 0, in_dim_h - 1); + auto clamped_h1 = utility::clamp(in_hi + 1, 0, in_dim_h - 1); + + const auto a00 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + + clamped_h * in_stride_wc); + const 
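// Illustrative sketch, not part of the patch: the constant-border gather above
// in scalar form. Each of the four neighbours is replaced by the border value
// when it falls outside the input, then blended with the fractional offsets
// (dx, dy); the blend mirrors what scale_helpers::delta_bilinear computes.
static float bilinear_constant_border(const float *base, int stride_x, int stride_y,
                                      int x, int y, int width, int height,
                                      float dx, float dy, float border)
{
    const bool x0_ok = (x >= 0 && x < width);
    const bool x1_ok = (x + 1 >= 0 && x + 1 < width);
    const bool y0_ok = (y >= 0 && y < height);
    const bool y1_ok = (y + 1 >= 0 && y + 1 < height);

    const float a00 = (x0_ok && y0_ok) ? base[x * stride_x + y * stride_y] : border;
    const float a01 = (x1_ok && y0_ok) ? base[(x + 1) * stride_x + y * stride_y] : border;
    const float a10 = (x0_ok && y1_ok) ? base[x * stride_x + (y + 1) * stride_y] : border;
    const float a11 = (x1_ok && y1_ok) ? base[(x + 1) * stride_x + (y + 1) * stride_y] : border;

    return a00 * (1.0f - dx) * (1.0f - dy) + a01 * dx * (1.0f - dy) + a10 * (1.0f - dx) * dy + a11 * dx * dy;
}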
auto a01 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + + clamped_h * in_stride_wc); + const auto a10 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + + clamped_h1 * in_stride_wc); + const auto a11 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + + clamped_h1 * in_stride_wc); + + *reinterpret_cast(out.ptr()) = + static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); } else { ARM_COMPUTE_ERROR("Not implemented"); } } -} +} // namespace namespace cpu { -void fp16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +#ifdef ENABLE_NCHW_KERNELS +void fp16_bilinear_neon_scale_nchw(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + ARM_COMPUTE_UNUSED(policy); + arm_compute::cpu::scale_bilinear_nchw(src, dst, dx, dy, offsets, border_mode, constant_border_value, + sampling_offset, align_corners, window); +} + +void fp16_nearest_neon_scale_nchw(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { - if(policy == InterpolationPolicy::BILINEAR) + ARM_COMPUTE_UNUSED(policy); + ARM_COMPUTE_UNUSED(border_mode); + arm_compute::cpu::scale_nearest_nchw(src, dst, dx, dy, offsets, constant_border_value, sampling_offset, + align_corners, window); +} +#endif // ENABLE_NCHW_KERNELS +void fp16_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + if (policy == InterpolationPolicy::BILINEAR) { - fp16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + fp16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, + align_corners, window); } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { fp16_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); } } + +void fp16_common_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + arm_compute::cpu::common_neon_scale(src, dst, offsets, dx, dy, policy, border_mode, + constant_border_value, sampling_offset, align_corners, window); +} + } // namespace cpu } // namespace arm_compute -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ \ No newline at end of file +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/scale/neon/integer.cpp b/src/cpu/kernels/scale/neon/integer.cpp index 
2ab14cf83aed9dea44bd5fa7a1f8efce92da1fc9..bbf92e041282d36fb94d5a883d48f457f6703b60 100644 --- a/src/cpu/kernels/scale/neon/integer.cpp +++ b/src/cpu/kernels/scale/neon/integer.cpp @@ -22,8 +22,9 @@ * SOFTWARE. */ #include "arm_compute/core/Helpers.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/ScaleHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" @@ -33,8 +34,12 @@ namespace arm_compute { namespace { -void u8_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) +void u8_neon_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) { const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; @@ -54,43 +59,58 @@ void u8_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offs const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - int32_t x = window_start_x; - const uint8_t *in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - - for(; x <= window_end_x - window_step_x; x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &id) { - wrapper::vstore(reinterpret_cast(out.ptr()) + x, - wrapper::vloadq(in_ptr + offset + offset_row + x)); - } - for(; x < window_end_x; ++x) - { - *(reinterpret_cast(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); - } - }, - out); + const int32_t offset = + *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast( + align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + int32_t x = window_start_x; + const uint8_t *in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); + + for (; x <= window_end_x - window_step_x; x += window_step_x) + { + wrapper::vstore(reinterpret_cast(out.ptr()) + x, + wrapper::vloadq(in_ptr + offset + offset_row + x)); + } + for (; x < window_end_x; ++x) + { + *(reinterpret_cast(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); + } + }, + out); } -void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void u8_neon_scale_bilinear(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { // Compute the ratio between source and destination dimensions - const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); - const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + const float scale_x = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); + const float scale_y = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); const int input_width = src->info()->dimension(1); const int input_height = src->info()->dimension(2); - if(border_mode == BorderMode::CONSTANT) + if (border_mode == BorderMode::CONSTANT) { Iterator out(dst, window); - const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const int in_stride_wc = in_stride_c * (input_width + src->info()->padding().top + src->info()->padding().bottom); + const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const int in_stride_wc = + in_stride_c * (input_width + src->info()->padding().top + src->info()->padding().bottom); // Don't increment in Y and Z direction for the input tensor // A pointer to the start of this plane is needed as base for the precomputed offsets @@ -100,24 +120,37 @@ void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off Iterator in(src, win_in); const uint8_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int32_t in_hi = std::floor((id.z() + sampling_offset) * scale_y - sampling_offset); - const uint8_t *in_ptr = reinterpret_cast(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; - - const auto a00 = (0 <= offset && offset < input_width && 0 <= in_hi && in_hi < input_height) ? *in_ptr : const_border_value; - const auto a01 = (-1 <= offset && offset < input_width - 1 && 0 <= in_hi && in_hi < input_height) ? 
*(in_ptr + in_stride_c) : const_border_value; - const auto a10 = (0 <= offset && offset < input_width && -1 <= in_hi && in_hi < input_height - 1) ? *(in_ptr + in_stride_wc) : const_border_value; - const auto a11 = (-1 <= offset && offset < input_width - 1 && -1 <= in_hi && in_hi < input_height - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value; - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto offset = + *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int32_t in_hi = std::floor((id.z() + sampling_offset) * scale_y - sampling_offset); + const uint8_t *in_ptr = + reinterpret_cast(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; + + const auto a00 = (0 <= offset && offset < input_width && 0 <= in_hi && in_hi < input_height) + ? *in_ptr + : const_border_value; + const auto a01 = (-1 <= offset && offset < input_width - 1 && 0 <= in_hi && in_hi < input_height) + ? *(in_ptr + in_stride_c) + : const_border_value; + const auto a10 = (0 <= offset && offset < input_width && -1 <= in_hi && in_hi < input_height - 1) + ? *(in_ptr + in_stride_wc) + : const_border_value; + const auto a11 = (-1 <= offset && offset < input_width - 1 && -1 <= in_hi && in_hi < input_height - 1) + ? *(in_ptr + in_stride_c + in_stride_wc) + : const_border_value; + + *reinterpret_cast(out.ptr()) = + static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); } - else if(border_mode == BorderMode::REPLICATE) + else if (border_mode == BorderMode::REPLICATE) { using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; @@ -152,12 +185,12 @@ void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off const float fp_coord_offset_y = sampling_offset * (scale_y - 1); const float fp_coord_offset_x = sampling_offset * (scale_x - 1); - for(int bo = bo_start; bo < bo_end; bo += bo_step) + for (int bo = bo_start; bo < bo_end; bo += bo_step) { const uint8_t *in_ptr = in.ptr() + bo * in_stride_b; uint8_t *out_ptr = out.ptr() + bo * out_stride_b; - for(int yo = yo_start; yo < yo_end; yo += yo_step) + for (int yo = yo_start; yo < yo_end; yo += yo_step) { // Floating-point coordinate const float yi_f = yo * scale_y + fp_coord_offset_y; @@ -174,7 +207,7 @@ void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off const uint8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y; uint8_t *out_ptr_yo = out_ptr + yo * out_stride_y; - for(int xo = xo_start; xo < xo_end; xo += xo_step) + for (int xo = xo_start; xo < xo_end; xo += xo_step) { // Floating-point coordinate const float xi_f = xo * scale_x + fp_coord_offset_x; @@ -205,7 +238,7 @@ void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off uint8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x; int cout = 0; - for(; cout <= (out_dim_ch - step_cout); cout += step_cout) + for (; cout <= (out_dim_ch - step_cout); cout += step_cout) { const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(uint8_t)); const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(uint8_t)); @@ -270,19 +303,21 @@ void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off const auto out_2_int 
= wrapper::vcvta(out_2); const auto out_3_int = wrapper::vcvta(out_3); #else // defined(__aarch64__) && !defined(BARE_METAL) - const auto out_0_int = wrapper::vcvt(out_0); - const auto out_1_int = wrapper::vcvt(out_1); - const auto out_2_int = wrapper::vcvt(out_2); - const auto out_3_int = wrapper::vcvt(out_3); + const auto out_0_int = wrapper::vcvt(out_0); + const auto out_1_int = wrapper::vcvt(out_1); + const auto out_2_int = wrapper::vcvt(out_2); + const auto out_3_int = wrapper::vcvt(out_3); #endif // defined(__aarch64__) && !defined(BARE_METAL) - const auto low_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int))); - const auto high_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int))); - const auto out = wrapper::vcombine(low_part, high_part); + const auto low_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int))); + const auto high_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int))); + const auto out = wrapper::vcombine(low_part, high_part); wrapper::vstore(out_ptr_xo_yo + cout * sizeof(uint8_t), out); } - for(; cout < out_dim_ch; ++cout) + for (; cout < out_dim_ch; ++cout) { const uint8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(uint8_t)); const uint8_t in01 = *(in_ptr_xi1_yi0 + cout * sizeof(uint8_t)); @@ -311,18 +346,27 @@ void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off } } -void s8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void s8_neon_scale_bilinear(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { ARM_COMPUTE_UNUSED(dx, dy, offsets, constant_border_value); - if(border_mode == BorderMode::REPLICATE) + if (border_mode == BorderMode::REPLICATE) { using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; // Compute the ratio between source and destination dimensions - const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); - const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + const float scale_x = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); + const float scale_y = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); const int in_stride_x = src->info()->strides_in_bytes()[1]; const int in_stride_y = src->info()->strides_in_bytes()[2]; @@ -356,12 +400,12 @@ void s8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off const float fp_coord_offset_y = sampling_offset * (scale_y - 1); const float fp_coord_offset_x = sampling_offset * (scale_x - 1); - for(int bo = bo_start; bo < bo_end; bo += bo_step) + for (int bo = bo_start; bo < bo_end; bo += bo_step) { const int8_t *in_ptr = reinterpret_cast(in.ptr() + bo * in_stride_b); int8_t *out_ptr = reinterpret_cast(out.ptr() + bo * out_stride_b); - for(int yo = yo_start; yo < yo_end; yo += yo_step) + for (int yo = yo_start; yo < yo_end; yo += yo_step) { // 
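// Illustrative sketch, not part of the patch: why the quantized bilinear
// kernels above use wrapper::vcvta on AArch64 and fall back to wrapper::vcvt
// elsewhere. vcvta converts float->int rounding to nearest with ties away from
// zero, whereas the plain conversion truncates toward zero, so the two builds
// can disagree for fractional parts of 0.5 and above. Scalar model:
#include <cmath>

static int convert_like_vcvta(float v) { return static_cast<int>(std::round(v)); } // round, ties away from zero
static int convert_like_vcvt(float v)  { return static_cast<int>(v); }             // truncate toward zero

// Example: 2.5f -> 3 with the vcvta-style conversion, 2 with truncation.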
Floating-point coordinate const float yi_f = yo * scale_y + fp_coord_offset_y; @@ -378,7 +422,7 @@ void s8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off const int8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y; int8_t *out_ptr_yo = out_ptr + yo * out_stride_y; - for(int xo = xo_start; xo < xo_end; xo += xo_step) + for (int xo = xo_start; xo < xo_end; xo += xo_step) { // Floating-point coordinate const float xi_f = xo * scale_x + fp_coord_offset_x; @@ -409,7 +453,7 @@ void s8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off int8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x; int cout = 0; - for(; cout <= (out_dim_ch - step_cout); cout += step_cout) + for (; cout <= (out_dim_ch - step_cout); cout += step_cout) { const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(int8_t)); const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(int8_t)); @@ -479,14 +523,16 @@ void s8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off const auto out_2_int = wrapper::vcvt(out_2); const auto out_3_int = wrapper::vcvt(out_3); #endif // defined(__aarch64__) && !defined(BARE_METAL) - const auto low_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int))); - const auto high_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int))); - const auto out = wrapper::vcombine(low_part, high_part); + const auto low_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int))); + const auto high_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int))); + const auto out = wrapper::vcombine(low_part, high_part); wrapper::vstore(out_ptr_xo_yo + cout * sizeof(int8_t), out); } - for(; cout < out_dim_ch; ++cout) + for (; cout < out_dim_ch; ++cout) { const int8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(int8_t)); const int8_t in01 = *(in_ptr_xi1_yi0 + cout * sizeof(int8_t)); @@ -515,8 +561,12 @@ void s8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off } } -void s16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) +void s16_neon_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) { const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; @@ -536,33 +586,46 @@ void s16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *off const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - int32_t x = window_start_x; - const int16_t *in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - - for(; x <= window_end_x - window_step_x; x += window_step_x) - { - wrapper::vstore(reinterpret_cast(out.ptr()) + x, - wrapper::vloadq(in_ptr + offset + offset_row + x)); - } - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &id) { - *(reinterpret_cast(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); - } - }, - out); + const int32_t offset = + *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast( + align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + int32_t x = window_start_x; + const int16_t *in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); + + for (; x <= window_end_x - window_step_x; x += window_step_x) + { + wrapper::vstore(reinterpret_cast(out.ptr()) + x, + wrapper::vloadq(in_ptr + offset + offset_row + x)); + } + for (; x < window_end_x; ++x) + { + *(reinterpret_cast(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); + } + }, + out); } -void s16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void s16_neon_scale_bilinear(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + const auto hr = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); Iterator out(dst, window); const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; @@ -577,64 +640,93 @@ void s16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *of win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); Iterator in(src, win_in); - if(border_mode == BorderMode::CONSTANT) + if (border_mode == BorderMode::CONSTANT) { const int16_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - const int16_t *in_ptr = reinterpret_cast(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; - - const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; - const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? 
*(in_ptr + in_stride_c) : const_border_value; - const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; - const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value; - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto offset = + *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + const int16_t *in_ptr = + reinterpret_cast(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; + + const auto a00 = + (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; + const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) + ? *(in_ptr + in_stride_c) + : const_border_value; + const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) + ? *(in_ptr + in_stride_wc) + : const_border_value; + const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) + ? *(in_ptr + in_stride_c + in_stride_wc) + : const_border_value; + + *reinterpret_cast(out.ptr()) = + static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); } - else if(border_mode == BorderMode::REPLICATE) + else if (border_mode == BorderMode::REPLICATE) { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - - const auto clamped_w = utility::clamp(offset, 0, in_dim_w - 1); - const auto clamped_w1 = utility::clamp(offset + 1, 0, in_dim_w - 1); - const auto clamped_h = utility::clamp(in_hi, 0, in_dim_h - 1); - const auto clamped_h1 = utility::clamp(in_hi + 1, 0, in_dim_h - 1); - - const auto a00 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); - const auto a01 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); - const auto a10 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); - const auto a11 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto offset = + *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + + const auto clamped_w = utility::clamp(offset, 0, 
in_dim_w - 1); + const auto clamped_w1 = utility::clamp(offset + 1, 0, in_dim_w - 1); + const auto clamped_h = utility::clamp(in_hi, 0, in_dim_h - 1); + const auto clamped_h1 = utility::clamp(in_hi + 1, 0, in_dim_h - 1); + + const auto a00 = + *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); + const auto a01 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + + clamped_h * in_stride_wc); + const auto a10 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + + clamped_h1 * in_stride_wc); + const auto a11 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + + clamped_h1 * in_stride_wc); + + *reinterpret_cast(out.ptr()) = + static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); } else { ARM_COMPUTE_ERROR("Not implemented"); } } -} +} // namespace namespace cpu { -void s8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void s8_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { - if(policy == InterpolationPolicy::BILINEAR) + if (policy == InterpolationPolicy::BILINEAR) { - s8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + s8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, + align_corners, window); } else { @@ -642,32 +734,50 @@ void s8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, con } } -void u8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void u8_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { - if(policy == InterpolationPolicy::BILINEAR) + if (policy == InterpolationPolicy::BILINEAR) { - u8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + u8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, + align_corners, window); } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { u8_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); } } -void s16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void s16_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool 
align_corners, + const Window &window) { - if(policy == InterpolationPolicy::BILINEAR) + if (policy == InterpolationPolicy::BILINEAR) { - s16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + s16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, + align_corners, window); } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { s16_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); } } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/scale/neon/list.h b/src/cpu/kernels/scale/neon/list.h index 28a10872245bd2156fdfedd58bc14becc0f61ea4..153dc67c3db1c1cb3101bd0f6bb330df0a4481e7 100644 --- a/src/cpu/kernels/scale/neon/list.h +++ b/src/cpu/kernels/scale/neon/list.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,11 +21,12 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef SRC_CORE_NEON_KERNELS_SCALE_LIST_H -#define SRC_CORE_NEON_KERNELS_SCALE_LIST_H +#ifndef ACL_SRC_CPU_KERNELS_SCALE_NEON_LIST_H +#define ACL_SRC_CPU_KERNELS_SCALE_NEON_LIST_H #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Window.h" + #include "src/core/NEON/wrapper/wrapper.h" #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" @@ -34,28 +35,201 @@ namespace arm_compute { namespace cpu { -#define DECLARE_SCALE_KERNEL(func_name) \ - void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \ - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, \ - bool align_corners, const Window &window) +#define DECLARE_SCALE_KERNEL(func_name) \ + void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \ + InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, \ + float sampling_offset, bool align_corners, const Window &window) DECLARE_SCALE_KERNEL(s16_neon_scale); DECLARE_SCALE_KERNEL(u8_neon_scale); DECLARE_SCALE_KERNEL(s8_neon_scale); DECLARE_SCALE_KERNEL(qasymm8_neon_scale); DECLARE_SCALE_KERNEL(qasymm8_signed_neon_scale); +DECLARE_SCALE_KERNEL(fp16_common_neon_scale); +DECLARE_SCALE_KERNEL(fp16_bilinear_neon_scale_nchw); +DECLARE_SCALE_KERNEL(fp16_nearest_neon_scale_nchw); #undef DECLARE_SCALE_KERNEL +#ifdef ENABLE_NCHW_KERNELS +template +void scale_nearest_nchw(const ITensor *src, + ITensor *dst, + const ITensor *dx, + const ITensor *dy, + const ITensor *offsets, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + ARM_COMPUTE_UNUSED(dx, dy); + ARM_COMPUTE_UNUSED(constant_border_value); + const size_t in_stride_x = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + + // Compute the ratio between source height and destination height + const auto hr = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); + + // Don't increment in X and Y direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimX, 
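// Illustrative sketch, not part of the patch: the resize ratio that drives the
// NCHW templates above. This models the usual align_corners convention; it is
// an assumption about scale_utils::calculate_resize_ratio, not its actual code.
static float resize_ratio(int in_size, int out_size, bool align_corners)
{
    return (align_corners && out_size > 1)
               ? static_cast<float>(in_size - 1) / static_cast<float>(out_size - 1)
               : static_cast<float>(in_size) / static_cast<float>(out_size);
}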
Window::Dimension(0, 0, 0)); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + + // Set offsets window + Window win_off; + win_off.set(Window::DimX, window[Window::DimX]); + win_off.set(Window::DimY, window[Window::DimY]); + for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) + { + win_off.set(d, Window::Dimension(0, 0, 0)); + } + + // Create iterators + Iterator src_i(src, win_in); + Iterator dst_i(dst, window); + Iterator offsets_i(offsets, win_off); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto offsets_ptr = reinterpret_cast(offsets_i.ptr()); + const auto in_yi = static_cast( + align_corners ? utils::rounding::round_half_away_from_zero((id.y() + sampling_offset) * hr) + : std::floor((id.y() + sampling_offset) * hr)); + const int32_t offset_row = in_yi * in_stride_x; + *reinterpret_cast(dst_i.ptr()) = + *(reinterpret_cast(src_i.ptr()) + offsets_ptr[0] + offset_row); + }, + src_i, offsets_i, dst_i); +} + +template +void scale_bilinear_nchw(const ITensor *src, + ITensor *dst, + const ITensor *dx, + const ITensor *dy, + const ITensor *offsets, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + // Compute the ratio between source height and destination height + const auto hr = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); + Window win_off; + win_off.set(Window::DimX, window.x()); + win_off.set(Window::DimY, window.y()); + + // Don't increment in X and Y direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + + for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) + { + win_off.set(d, Window::Dimension(0, 0, 0)); + } + + Iterator src_i(src, win_in); + Iterator dst_i(dst, window); + Iterator offsets_i(offsets, win_off); + Iterator dx_i(dx, win_off); + Iterator dy_i(dy, win_off); + + const int32_t in_dim_w = src->info()->dimension(0); + const int32_t in_dim_h = src->info()->dimension(1); + const int32_t in_stride_w = in_dim_w + src->info()->padding().left + src->info()->padding().right; + + if (border_mode == BorderMode::CONSTANT) + { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + using ConstType = typename std::conditional::value, half, T>::type; +#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + using ConstType = T; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + const T const_border_value = static_cast(constant_border_value.get()); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int32_t index_h = std::floor((id.y() + sampling_offset) * hr - sampling_offset); + const auto index_w = *(reinterpret_cast(offsets_i.ptr())); + const auto dx_val = *(reinterpret_cast(dx_i.ptr())); + const auto dy_val = *(reinterpret_cast(dy_i.ptr())); + const auto pixel_row_ptr = reinterpret_cast(src_i.ptr()); + + const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) + ? (*(pixel_row_ptr + index_w + index_h * in_stride_w)) + : const_border_value; + const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) + ? 
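
The four conditionals in the CONSTANT-border branch above all encode the same test: a neighbour contributes its pixel only if it lies inside the image, otherwise the constant border value is substituted. A hedged helper expressing that check (illustrative, not part of the patch):

```cpp
// Returns the pixel at (x, y) if it is inside the image, otherwise the border value.
template <typename T>
static T sample_or_border(const T *base, int x, int y, int width, int height,
                          int stride_x, int stride_y, T border)
{
    const bool inside = (x >= 0) && (x < width) && (y >= 0) && (y < height);
    return inside ? base[x * stride_x + y * stride_y] : border;
}
```

For example, the condition `-1 <= index_w && index_w < in_dim_w - 1` used for `a01` is simply this test applied to `x = index_w + 1`.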
(*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w)) + : const_border_value; + const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) + ? (*(pixel_row_ptr + index_w + index_h * in_stride_w + in_stride_w)) + : const_border_value; + const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) + ? (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w + in_stride_w)) + : const_border_value; + + *reinterpret_cast(dst_i.ptr()) = + static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + src_i, offsets_i, dx_i, dy_i, dst_i); + } + else if (border_mode == BorderMode::REPLICATE) + { + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int index_h = std::floor((id.y() + sampling_offset) * hr - sampling_offset); + const auto index_w = *(reinterpret_cast(offsets_i.ptr())); + const auto dx_val = *(reinterpret_cast(dx_i.ptr())); + const auto dy_val = *(reinterpret_cast(dy_i.ptr())); + const auto pixel_row_ptr = reinterpret_cast(src_i.ptr()); + + auto clamped_x = utility::clamp(index_w, 0, in_dim_w - 1); + auto clamped_x1 = utility::clamp(index_w + 1, 0, in_dim_w - 1); + auto clamped_y = utility::clamp(index_h, 0, in_dim_h - 1); + auto clamped_y1 = utility::clamp(index_h + 1, 0, in_dim_h - 1); + + const auto a00 = *(pixel_row_ptr + clamped_x + clamped_y * in_stride_w); + const auto a01 = *(pixel_row_ptr + clamped_x1 + clamped_y * in_stride_w); + const auto a10 = *(pixel_row_ptr + clamped_x + clamped_y1 * in_stride_w); + const auto a11 = *(pixel_row_ptr + clamped_x1 + clamped_y1 * in_stride_w); + + *reinterpret_cast(dst_i.ptr()) = + static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + src_i, offsets_i, dx_i, dy_i, dst_i); + } + else + { + ARM_COMPUTE_ERROR("Not implemented"); + } +} +#endif // ENABLE_NCHW_KERNELS + template -void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, float sampling_offset, - bool align_corners, const Window &window) +void nearest_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) { ARM_COMPUTE_UNUSED(offsets); // Compute the ratio between source and destination dimensions - const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); - const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + const float scale_x = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); + const float scale_y = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); const int in_stride_y = src->info()->strides_in_bytes()[1]; const int in_stride_z = src->info()->strides_in_bytes()[2]; @@ -84,17 +258,17 @@ void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets const int bo_end = window_execution[3].end(); const int bo_step = window_execution[3].step(); - for(int bo = bo_start; bo < bo_end; bo += bo_step) + for (int bo = bo_start; bo < bo_end; bo += bo_step) { const uint8_t *in_ptr_base = in.ptr() + bo * in_stride_w; uint8_t *out_ptr_base = out.ptr() + bo * out_stride_w; - for(int yo = yo_start; yo < yo_end; yo += yo_step) + for (int yo = yo_start; yo < yo_end; yo += yo_step) { // Floating-point coordinate float yi_f = ((yo + sampling_offset) * 
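
The NHWC `nearest_neon_scale` template introduced above loops over batch, output row and output column, then copies the whole channel run for the selected source pixel. A plain C++ reference of that loop nest, assuming a dense unpadded NHWC buffer and indices that stay in range (the kernel works on strided, possibly padded buffers and vectorises the channel copy):

```cpp
#include <cmath>
#include <cstring>

// Reference NHWC nearest-neighbour resize mirroring the structure of nearest_neon_scale.
static void nearest_resize_nhwc(const float *src, float *dst,
                                int channels, int in_w, int in_h,
                                int out_w, int out_h, int batches,
                                float scale_x, float scale_y, float sampling_offset)
{
    for (int b = 0; b < batches; ++b)
    {
        for (int yo = 0; yo < out_h; ++yo)
        {
            const int yi = static_cast<int>(std::floor((yo + sampling_offset) * scale_y));
            for (int xo = 0; xo < out_w; ++xo)
            {
                const int xi = static_cast<int>(std::floor((xo + sampling_offset) * scale_x));
                const float *in_px  = src + ((b * in_h + yi) * in_w + xi) * channels;
                float       *out_px = dst + ((b * out_h + yo) * out_w + xo) * channels;
                // Channels are contiguous in NHWC, so the copy is a single run.
                std::memcpy(out_px, in_px, channels * sizeof(float));
            }
        }
    }
}
```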
scale_y); int yi = 0; - if(align_corners) + if (align_corners) { yi = utils::rounding::round_half_away_from_zero(yi_f); } @@ -103,12 +277,12 @@ void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets yi = static_cast(std::floor(yi_f)); } - for(int xo = xo_start; xo < xo_end; xo += xo_step) + for (int xo = xo_start; xo < xo_end; xo += xo_step) { // Floating-point coordinate float xi_f = ((xo + sampling_offset) * scale_x); int xi = 0; - if(align_corners) + if (align_corners) { xi = utils::rounding::round_half_away_from_zero(xi_f); } @@ -121,15 +295,15 @@ void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets uint8_t *out_ptr = out_ptr_base + xo * out_stride_y + yo * out_stride_z; int cout = 0; - for(; cout <= (out_dim_ch - step_cout); cout += step_cout) + for (; cout <= (out_dim_ch - step_cout); cout += step_cout) { auto out0 = wrapper::vloadq(reinterpret_cast(in_ptr + cout * sizeof(T))); wrapper::vstore(reinterpret_cast(out_ptr + cout * sizeof(T)), out0); } - for(; cout < out_dim_ch; ++cout) + for (; cout < out_dim_ch; ++cout) { - auto out0 = *(reinterpret_cast(in_ptr + cout * sizeof(T))); + auto out0 = *(reinterpret_cast(in_ptr + cout * sizeof(T))); *(reinterpret_cast(out_ptr + cout * sizeof(T))) = out0; } } @@ -138,9 +312,16 @@ void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets } template -void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void bilinear_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { ARM_COMPUTE_UNUSED(offsets); ARM_COMPUTE_UNUSED(dx); @@ -148,8 +329,10 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; // Compute the ratio between source and destination dimensions - const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); - const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + const float scale_x = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); + const float scale_y = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); const int in_stride_y = src->info()->strides_in_bytes()[1]; const int in_stride_z = src->info()->strides_in_bytes()[2]; @@ -180,7 +363,7 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset const int bo_end = window_execution[3].end(); const int bo_step = window_execution[3].step(); - if(border_mode == BorderMode::CONSTANT) + if (border_mode == BorderMode::CONSTANT) { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC using ConstType = typename std::conditional::value, half, T>::type; @@ -189,12 +372,12 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ const T const_border_value = static_cast(constant_border_value.get()); - for(int bo = bo_start; bo < bo_end; bo += bo_step) + for (int bo = bo_start; bo < bo_end; bo += 
bo_step) { const uint8_t *in_ptr_base = in.ptr() + bo * in_stride_w; uint8_t *out_ptr_base = out.ptr() + bo * out_stride_w; - for(int yo = yo_start; yo < yo_end; yo += yo_step) + for (int yo = yo_start; yo < yo_end; yo += yo_step) { // Floating-point coordinate const float yi_f = ((yo + sampling_offset) * scale_y - sampling_offset); @@ -204,7 +387,7 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset const auto a1 = (yi_f - static_cast(yi)); const auto b1 = (1.f - a1); - for(int xo = xo_start; xo < xo_end; xo += xo_step) + for (int xo = xo_start; xo < xo_end; xo += xo_step) { // Floating-point coordinate const float xi_f = ((xo + sampling_offset) * scale_x - sampling_offset); @@ -223,32 +406,35 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset uint8_t *out_ptr = out_ptr_base + xo * out_stride_y + yo * out_stride_z; int cout = 0; - for(; cout <= (out_dim_ch - step_cout); cout += step_cout) + for (; cout <= (out_dim_ch - step_cout); cout += step_cout) { auto in00 = wrapper::vdup_n(static_cast(const_border_value), ExactTagType{}); auto in01 = wrapper::vdup_n(static_cast(const_border_value), ExactTagType{}); auto in10 = wrapper::vdup_n(static_cast(const_border_value), ExactTagType{}); auto in11 = wrapper::vdup_n(static_cast(const_border_value), ExactTagType{}); - if((yi >= 0) && (yi < in_dim_h)) + if ((yi >= 0) && (yi < in_dim_h)) { - if((xi >= 0) && (xi < in_dim_w)) + if ((xi >= 0) && (xi < in_dim_w)) { in00 = wrapper::vloadq(reinterpret_cast(in_ptr + cout * sizeof(T))); } - if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w)) + if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w)) { - in01 = wrapper::vloadq(reinterpret_cast(in_ptr + cout * sizeof(T) + in_stride_y)); + in01 = wrapper::vloadq( + reinterpret_cast(in_ptr + cout * sizeof(T) + in_stride_y)); } } - if(((yi + 1) >= 0) && ((yi + 1) < in_dim_h)) + if (((yi + 1) >= 0) && ((yi + 1) < in_dim_h)) { - if((xi >= 0) && (xi < in_dim_w)) + if ((xi >= 0) && (xi < in_dim_w)) { - in10 = wrapper::vloadq(reinterpret_cast(in_ptr + cout * sizeof(T) + in_stride_z)); + in10 = wrapper::vloadq( + reinterpret_cast(in_ptr + cout * sizeof(T) + in_stride_z)); } - if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w)) + if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w)) { - in11 = wrapper::vloadq(reinterpret_cast(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z)); + in11 = wrapper::vloadq( + reinterpret_cast(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z)); } } @@ -264,32 +450,33 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset wrapper::vstore(reinterpret_cast(out_ptr + cout * sizeof(T)), out0); } - for(; cout < out_dim_ch; ++cout) + for (; cout < out_dim_ch; ++cout) { auto in00 = static_cast(const_border_value); auto in01 = static_cast(const_border_value); auto in10 = static_cast(const_border_value); auto in11 = static_cast(const_border_value); - if((yi >= 0) && (yi < in_dim_h)) + if ((yi >= 0) && (yi < in_dim_h)) { - if((xi >= 0) && (xi < in_dim_w)) + if ((xi >= 0) && (xi < in_dim_w)) { in00 = *(reinterpret_cast(in_ptr + cout * sizeof(T))); } - if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w)) + if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w)) { in01 = *(reinterpret_cast(in_ptr + cout * sizeof(T) + in_stride_y)); } } - if(((yi + 1) >= 0) && ((yi + 1) < in_dim_h)) + if (((yi + 1) >= 0) && ((yi + 1) < in_dim_h)) { - if((xi >= 0) && (xi < in_dim_w)) + if ((xi >= 0) && (xi < in_dim_w)) { in10 = *(reinterpret_cast(in_ptr + cout * sizeof(T) + in_stride_z)); } - if(((xi + 1) >= 0) && ((xi + 
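
The channel loops in `bilinear_neon_scale` follow the familiar "vector body plus scalar tail" pattern: process `step_cout` lanes per iteration while a full vector still fits, then finish the remaining channels one element at a time. A scalar illustration of just the loop structure (the body here is a placeholder copy; in the kernel it is the four guarded loads plus interpolation):

```cpp
static void vector_then_tail(const float *in, float *out, int out_dim_ch, int step_cout)
{
    int cout = 0;
    // Vector body: in the kernel this is a wrapper::vloadq / wrapper::vstore pair.
    for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
    {
        for (int lane = 0; lane < step_cout; ++lane)
        {
            out[cout + lane] = in[cout + lane];
        }
    }
    // Scalar tail for the channels that do not fill a whole vector.
    for (; cout < out_dim_ch; ++cout)
    {
        out[cout] = in[cout];
    }
}
```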
1) < in_dim_w)) + if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w)) { - in11 = *(reinterpret_cast(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z)); + in11 = *( + reinterpret_cast(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z)); } } auto out0 = static_cast(0); @@ -303,14 +490,14 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset } } } - else if(border_mode == BorderMode::REPLICATE) + else if (border_mode == BorderMode::REPLICATE) { - for(int bo = bo_start; bo < bo_end; bo += bo_step) + for (int bo = bo_start; bo < bo_end; bo += bo_step) { const uint8_t *in_ptr = in.ptr() + bo * in_stride_w; uint8_t *out_ptr = out.ptr() + bo * out_stride_w; - for(int yo = yo_start; yo < yo_end; yo += yo_step) + for (int yo = yo_start; yo < yo_end; yo += yo_step) { // Floating-point coordinate const float yi_f = ((yo + sampling_offset) * scale_y - sampling_offset); @@ -327,7 +514,7 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset const int yi1_offset = yi1 * in_stride_z; const int y_offset = yo * out_stride_z; - for(int xo = xo_start; xo < xo_end; xo += xo_step) + for (int xo = xo_start; xo < xo_end; xo += xo_step) { // Floating-point coordinate const float xi_f = ((xo + sampling_offset) * scale_x - sampling_offset); @@ -356,12 +543,16 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset const int offset = xo * out_stride_y + y_offset; int cout = 0; - for(; cout <= (out_dim_ch - step_cout); cout += step_cout) + for (; cout <= (out_dim_ch - step_cout); cout += step_cout) { - const auto in00 = wrapper::vloadq(reinterpret_cast(in_ptr + cout * sizeof(T) + xi0_offset + yi0_offset)); - const auto in01 = wrapper::vloadq(reinterpret_cast(in_ptr + cout * sizeof(T) + xi1_offset + yi0_offset)); - const auto in10 = wrapper::vloadq(reinterpret_cast(in_ptr + cout * sizeof(T) + xi0_offset + yi1_offset)); - const auto in11 = wrapper::vloadq(reinterpret_cast(in_ptr + cout * sizeof(T) + xi1_offset + yi1_offset)); + const auto in00 = wrapper::vloadq( + reinterpret_cast(in_ptr + cout * sizeof(T) + xi0_offset + yi0_offset)); + const auto in01 = wrapper::vloadq( + reinterpret_cast(in_ptr + cout * sizeof(T) + xi1_offset + yi0_offset)); + const auto in10 = wrapper::vloadq( + reinterpret_cast(in_ptr + cout * sizeof(T) + xi0_offset + yi1_offset)); + const auto in11 = wrapper::vloadq( + reinterpret_cast(in_ptr + cout * sizeof(T) + xi1_offset + yi1_offset)); auto out0 = wrapper::vmul(in00, s00); out0 = wrapper::vmla(out0, in01, s01); @@ -370,12 +561,16 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset wrapper::vstore(reinterpret_cast(out_ptr + offset + cout * sizeof(T)), out0); } - for(; cout < out_dim_ch; ++cout) + for (; cout < out_dim_ch; ++cout) { - const T in00 = *(reinterpret_cast(in_ptr + cout * sizeof(T) + xi0_offset + yi0_offset)); - const T in01 = *(reinterpret_cast(in_ptr + cout * sizeof(T) + xi1_offset + yi0_offset)); - const T in10 = *(reinterpret_cast(in_ptr + cout * sizeof(T) + xi0_offset + yi1_offset)); - const T in11 = *(reinterpret_cast(in_ptr + cout * sizeof(T) + xi1_offset + yi1_offset)); + const T in00 = + *(reinterpret_cast(in_ptr + cout * sizeof(T) + xi0_offset + yi0_offset)); + const T in01 = + *(reinterpret_cast(in_ptr + cout * sizeof(T) + xi1_offset + yi0_offset)); + const T in10 = + *(reinterpret_cast(in_ptr + cout * sizeof(T) + xi0_offset + yi1_offset)); + const T in11 = + *(reinterpret_cast(in_ptr + cout * sizeof(T) + xi1_offset + yi1_offset)); T out0 = in00 * s00_s; 
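
The `vmul`/`vmla` chain in the REPLICATE branch accumulates the four bilinear taps with precomputed weights `s00`..`s11`. Written out for one lane, and assuming the weights are the usual products of the fractional coordinates (the corner-to-weight mapping below is inferred, not copied from the kernel):

```cpp
// a0/a1 are the fractional x/y coordinates; b = 1 - a.
static float bilinear_accumulate(float in00, float in01, float in10, float in11,
                                 float a0, float a1)
{
    const float b0  = 1.f - a0;
    const float b1  = 1.f - a1;
    const float s00 = b0 * b1; // weight of (x0, y0)
    const float s01 = a0 * b1; // weight of (x1, y0)
    const float s10 = b0 * a1; // weight of (x0, y1)
    const float s11 = a0 * a1; // weight of (x1, y1)
    return in00 * s00 + in01 * s01 + in10 * s10 + in11 * s11;
}
```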
out0 += in01 * s01_s; @@ -394,15 +589,24 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset } template -void common_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void common_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { - if(policy == InterpolationPolicy::BILINEAR) + if (policy == InterpolationPolicy::BILINEAR) { - bilinear_neon_scale(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + bilinear_neon_scale(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, + align_corners, window); } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { nearest_neon_scale(src, dst, offsets, sampling_offset, align_corners, window); } @@ -410,4 +614,4 @@ void common_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, } // namespace cpu } // namespace arm_compute -#endif /* SRC_CORE_NEON_KERNELS_SCALE_LIST_H */ +#endif // ACL_SRC_CPU_KERNELS_SCALE_NEON_LIST_H diff --git a/src/cpu/kernels/scale/neon/qasymm8.cpp b/src/cpu/kernels/scale/neon/qasymm8.cpp index 778459ae39ebb46728436be253e368d17bfeb590..62a821daa538f7dff54a077dc88a29dc4ae53ee1 100644 --- a/src/cpu/kernels/scale/neon/qasymm8.cpp +++ b/src/cpu/kernels/scale/neon/qasymm8.cpp @@ -28,9 +28,16 @@ namespace arm_compute { namespace { -void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void qasymm8_neon_scale_bilinear(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { // Data layout is NHWC const int32_t input_width = src->info()->dimension(1); @@ -40,10 +47,12 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); // Compute the ratio between source and destination dimensions - const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); - const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + const float scale_x = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); + const float scale_y = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); - if(border_mode == BorderMode::CONSTANT) + if (border_mode == BorderMode::CONSTANT) { const int32_t in_stride_y = src->info()->strides_in_bytes()[1]; const int32_t in_stride_z = src->info()->strides_in_bytes()[2]; @@ -59,7 +68,7 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor win_in.set(1, 
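
For readers scanning `list.h` above, `DECLARE_SCALE_KERNEL` only stamps out the shared kernel signature. Expanded for one of the declared kernels it reads as below (the types are the ACL ones used throughout this patch; this is the macro expansion, not new API):

```cpp
void u8_neon_scale(const ITensor      *src,
                   ITensor            *dst,
                   const ITensor      *offsets,
                   const ITensor      *dx,
                   const ITensor      *dy,
                   InterpolationPolicy policy,
                   BorderMode          border_mode,
                   PixelValue          constant_border_value,
                   float               sampling_offset,
                   bool                align_corners,
                   const Window       &window);
```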
Window::Dimension(0, 0, 0)); win_in.set(2, Window::Dimension(0, 0, 0)); - for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) + for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) { win_off.set(d, Window::Dimension(0, 0, 0)); } @@ -68,36 +77,41 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor Iterator out(dst, window); const uint8_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const int32_t index_h = std::floor((id[2] + sampling_offset) * scale_y - sampling_offset); - const int32_t index_w = *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[1], id[2])))); - const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[1], id[2])))); - const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[1], id[2])))); - const auto pixel_row_ptr = reinterpret_cast(in.ptr()); - - const auto a00 = (0 <= index_w && index_w < input_width && 0 <= index_h && index_h < input_height) ? - (*(pixel_row_ptr + index_w * in_stride_y + index_h * in_stride_z)) : - const_border_value; - const auto a01 = (-1 <= index_w && index_w + 1 < input_width && 0 <= index_h && index_h < input_height) ? - (*(pixel_row_ptr + (index_w + 1) * in_stride_y + index_h * in_stride_z)) : - const_border_value; - const auto a10 = (0 <= index_w && index_w < input_width && -1 <= index_h && index_h < input_height - 1) ? - (*(pixel_row_ptr + index_w * in_stride_y + (index_h + 1) * in_stride_z)) : - const_border_value; - const auto a11 = (-1 <= index_w && index_w < input_width - 1 && -1 <= index_h && index_h < input_height - 1) ? - (*(pixel_row_ptr + (index_w + 1) * in_stride_y + (index_h + 1) * in_stride_z)) : - const_border_value; - - const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); - *reinterpret_cast(out.ptr()) = Qasymm8QuantizationHelper::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int32_t index_h = std::floor((id[2] + sampling_offset) * scale_y - sampling_offset); + const int32_t index_w = + *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[1], id[2])))); + const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[1], id[2])))); + const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[1], id[2])))); + const auto pixel_row_ptr = reinterpret_cast(in.ptr()); + + const auto a00 = (0 <= index_w && index_w < input_width && 0 <= index_h && index_h < input_height) + ? (*(pixel_row_ptr + index_w * in_stride_y + index_h * in_stride_z)) + : const_border_value; + const auto a01 = (-1 <= index_w && index_w + 1 < input_width && 0 <= index_h && index_h < input_height) + ? (*(pixel_row_ptr + (index_w + 1) * in_stride_y + index_h * in_stride_z)) + : const_border_value; + const auto a10 = (0 <= index_w && index_w < input_width && -1 <= index_h && index_h < input_height - 1) + ? (*(pixel_row_ptr + index_w * in_stride_y + (index_h + 1) * in_stride_z)) + : const_border_value; + const auto a11 = + (-1 <= index_w && index_w < input_width - 1 && -1 <= index_h && index_h < input_height - 1) + ? 
(*(pixel_row_ptr + (index_w + 1) * in_stride_y + (index_h + 1) * in_stride_z)) + : const_border_value; + + const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); + const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); + const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); + const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); + *reinterpret_cast(out.ptr()) = Qasymm8QuantizationHelper::quantize( + scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); + }, + in, out); } - else if(border_mode == BorderMode::REPLICATE) + else if (border_mode == BorderMode::REPLICATE) { using FloatTagType = typename wrapper::traits::neon_bitvector_tag_t; using Int32TagType = typename wrapper::traits::neon_bitvector_tag_t; @@ -141,12 +155,12 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor const float fp_coord_offset_y = sampling_offset * (scale_y - 1); const float fp_coord_offset_x = sampling_offset * (scale_x - 1); - for(int bo = bo_start; bo < bo_end; bo += bo_step) + for (int bo = bo_start; bo < bo_end; bo += bo_step) { const uint8_t *in_ptr = in.ptr() + bo * in_stride_b; uint8_t *out_ptr = out.ptr() + bo * out_stride_b; - for(int yo = yo_start; yo < yo_end; yo += yo_step) + for (int yo = yo_start; yo < yo_end; yo += yo_step) { // Floating-point coordinate const float yi_f = yo * scale_y + fp_coord_offset_y; @@ -163,7 +177,7 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor const uint8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y; uint8_t *out_ptr_yo = out_ptr + yo * out_stride_y; - for(int xo = xo_start; xo < xo_end; xo += xo_step) + for (int xo = xo_start; xo < xo_end; xo += xo_step) { // Floating-point coordinate const float xi_f = xo * scale_x + fp_coord_offset_x; @@ -194,7 +208,7 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor uint8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x; int cout = 0; - for(; cout <= (out_dim_ch - step_cout); cout += step_cout) + for (; cout <= (out_dim_ch - step_cout); cout += step_cout) { const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(uint8_t)); const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(uint8_t)); @@ -204,34 +218,82 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor const uint16x8_t in00_low = wrapper::vmovl(wrapper::vgetlow(in00)); const uint16x8_t in00_high = wrapper::vmovl(wrapper::vgethigh(in00)); - const auto in00_0 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in00_low))), voffset_in)), vscale_in); - const auto in00_1 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in00_low))), voffset_in)), vscale_in); - const auto in00_2 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in00_high))), voffset_in)), vscale_in); - const auto in00_3 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in00_high))), voffset_in)), vscale_in); + const auto in00_0 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in00_low))), voffset_in)), + vscale_in); + const auto in00_1 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in00_low))), voffset_in)), + vscale_in); + const auto in00_2 = wrapper::vmul( + 
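
The QASYMM8 bilinear path above dequantizes the four neighbours with the input quantization info, interpolates in float, and requantizes with the output quantization info. A scalar sketch of one output pixel under the uniform affine model `v = scale * (q - offset)`; it rounds to nearest, whereas the kernel selects the rounding policy per platform as visible in the hunk:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

static uint8_t qasymm8_bilinear_px(uint8_t a00, uint8_t a01, uint8_t a10, uint8_t a11,
                                   float dx, float dy,
                                   float in_scale, int in_offset,
                                   float out_scale, int out_offset)
{
    // Dequantize with the input quantization info.
    const auto deq = [&](uint8_t q) { return in_scale * (static_cast<int>(q) - in_offset); };

    // Interpolate in the real domain.
    const float v = deq(a00) * (1.f - dx) * (1.f - dy) + deq(a01) * dx * (1.f - dy) +
                    deq(a10) * (1.f - dx) * dy         + deq(a11) * dx * dy;

    // Requantize with the output quantization info and saturate to the uint8 range.
    const int q = static_cast<int>(std::lround(v / out_scale)) + out_offset;
    return static_cast<uint8_t>(std::clamp(q, 0, 255));
}
```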
wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in00_high))), voffset_in)), + vscale_in); + const auto in00_3 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in00_high))), voffset_in)), + vscale_in); const uint16x8_t in01_low = wrapper::vmovl(wrapper::vgetlow(in01)); const uint16x8_t in01_high = wrapper::vmovl(wrapper::vgethigh(in01)); - const auto in01_0 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in01_low))), voffset_in)), vscale_in); - const auto in01_1 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in01_low))), voffset_in)), vscale_in); - const auto in01_2 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in01_high))), voffset_in)), vscale_in); - const auto in01_3 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in01_high))), voffset_in)), vscale_in); + const auto in01_0 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in01_low))), voffset_in)), + vscale_in); + const auto in01_1 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in01_low))), voffset_in)), + vscale_in); + const auto in01_2 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in01_high))), voffset_in)), + vscale_in); + const auto in01_3 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in01_high))), voffset_in)), + vscale_in); const uint16x8_t in10_low = wrapper::vmovl(wrapper::vgetlow(in10)); const uint16x8_t in10_high = wrapper::vmovl(wrapper::vgethigh(in10)); - const auto in10_0 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in10_low))), voffset_in)), vscale_in); - const auto in10_1 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in10_low))), voffset_in)), vscale_in); - const auto in10_2 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in10_high))), voffset_in)), vscale_in); - const auto in10_3 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in10_high))), voffset_in)), vscale_in); + const auto in10_0 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in10_low))), voffset_in)), + vscale_in); + const auto in10_1 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in10_low))), voffset_in)), + vscale_in); + const auto in10_2 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in10_high))), voffset_in)), + vscale_in); + const auto in10_3 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in10_high))), voffset_in)), + vscale_in); const uint16x8_t in11_low = wrapper::vmovl(wrapper::vgetlow(in11)); const uint16x8_t in11_high = wrapper::vmovl(wrapper::vgethigh(in11)); - const auto in11_0 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in11_low))), voffset_in)), vscale_in); - const auto in11_1 = 
wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in11_low))), voffset_in)), vscale_in); - const auto in11_2 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in11_high))), voffset_in)), vscale_in); - const auto in11_3 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in11_high))), voffset_in)), vscale_in); + const auto in11_0 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in11_low))), voffset_in)), + vscale_in); + const auto in11_1 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in11_low))), voffset_in)), + vscale_in); + const auto in11_2 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in11_high))), voffset_in)), + vscale_in); + const auto in11_3 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in11_high))), voffset_in)), + vscale_in); auto out_0 = wrapper::vmul(in00_0, s00); out_0 = wrapper::vmla(out_0, in01_0, s01); @@ -264,14 +326,16 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor const auto out_2_int = wrapper::vcvt(wrapper::vmla(voffset_o, out_2, invvscale_o)); const auto out_3_int = wrapper::vcvt(wrapper::vmla(voffset_o, out_3, invvscale_o)); #endif // defined(__aarch64__) && !defined(BARE_METAL) - const auto low_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int))); - const auto high_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int))); - const auto out = wrapper::vcombine(low_part, high_part); + const auto low_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int))); + const auto high_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int))); + const auto out = wrapper::vcombine(low_part, high_part); wrapper::vstore(out_ptr_xo_yo + cout * sizeof(uint8_t), out); } - for(; cout < out_dim_ch; ++cout) + for (; cout < out_dim_ch; ++cout) { const uint8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(uint8_t)); const uint8_t in01 = *(in_ptr_xi1_yi0 + cout * sizeof(uint8_t)); @@ -292,7 +356,8 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor #if defined(__aarch64__) && !defined(BARE_METAL) *(out_ptr_xo_yo + cout * sizeof(uint8_t)) = quantize_qasymm8(out, oq_info); #else // defined(__aarch64__) && !defined(BARE_METAL) - *(out_ptr_xo_yo + cout * sizeof(uint8_t)) = quantize_qasymm8(out, oq_info, RoundingPolicy::TO_ZERO); + *(out_ptr_xo_yo + cout * sizeof(uint8_t)) = + quantize_qasymm8(out, oq_info, RoundingPolicy::TO_ZERO); #endif // defined(__aarch64__) && !defined(BARE_METAL) } } @@ -304,28 +369,38 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor ARM_COMPUTE_ERROR("Not implemented"); } } -} +} // namespace namespace cpu { -void qasymm8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void qasymm8_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + 
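
The long `vmovl`/`vgetlow`/`vgethigh`/`vcvt` chains being reflowed here widen one 16-byte quantized load into four float32x4 vectors; per lane the arithmetic is simply `(q - offset) * scale`. A scalar equivalent for one block (illustrative only, the kernel does this four lanes at a time):

```cpp
#include <cstdint>

static void dequantize_block_u8(const uint8_t *q, float *f, float scale, int32_t offset)
{
    for (int lane = 0; lane < 16; ++lane)
    {
        f[lane] = (static_cast<int32_t>(q[lane]) - offset) * scale;
    }
}
```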
BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { - if(policy == InterpolationPolicy::BILINEAR) + if (policy == InterpolationPolicy::BILINEAR) { - if(src->info()->quantization_info() == dst->info()->quantization_info()) + if (src->info()->quantization_info() == dst->info()->quantization_info()) { - u8_neon_scale(src, dst, offsets, dx, dy, policy, border_mode, constant_border_value, sampling_offset, align_corners, window); + u8_neon_scale(src, dst, offsets, dx, dy, policy, border_mode, constant_border_value, sampling_offset, + align_corners, window); } else { - qasymm8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + qasymm8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, + align_corners, window); } } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { nearest_neon_scale(src, dst, offsets, sampling_offset, align_corners, window); } } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/scale/neon/qasymm8_signed.cpp b/src/cpu/kernels/scale/neon/qasymm8_signed.cpp index cd63dfba63c84af2b484ef158a2eb985225fed3f..5a885178a7653849ccdb11c21231dba2d0f18769 100644 --- a/src/cpu/kernels/scale/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/scale/neon/qasymm8_signed.cpp @@ -28,9 +28,16 @@ namespace arm_compute { namespace { -void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void qasymm8_signed_neon_scale_bilinear(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { // Data layout is NHWC const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform(); @@ -40,10 +47,12 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const const int32_t input_height = src->info()->dimension(2); // Compute the ratio between source and destination dimensions - const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); - const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + const float scale_x = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); + const float scale_y = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); - if(border_mode == BorderMode::CONSTANT) + if (border_mode == BorderMode::CONSTANT) { const int32_t in_stride_y = src->info()->strides_in_bytes()[1]; const int32_t in_stride_z = src->info()->strides_in_bytes()[2]; @@ -58,7 +67,7 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const win_in.set(1, Window::Dimension(0, 0, 0)); win_in.set(2, Window::Dimension(0, 0, 0)); - for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) + for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) { win_off.set(d, 
Window::Dimension(0, 0, 0)); } @@ -67,36 +76,41 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const Iterator out(dst, window); const int8_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const int32_t index_h = std::floor((id[2] + sampling_offset) * scale_y - sampling_offset); - const int32_t index_w = *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[1], id[2])))); - const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[1], id[2])))); - const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[1], id[2])))); - const auto pixel_row_ptr = reinterpret_cast(in.ptr()); - - const auto a00 = (0 <= index_w && index_w < input_width && 0 <= index_h && index_h < input_height) ? - (*(pixel_row_ptr + index_w * in_stride_y + index_h * in_stride_z)) : - const_border_value; - const auto a01 = (-1 <= index_w && index_w + 1 < input_width && 0 <= index_h && index_h < input_height) ? - (*(pixel_row_ptr + (index_w + 1) * in_stride_y + index_h * in_stride_z)) : - const_border_value; - const auto a10 = (0 <= index_w && index_w < input_width && -1 <= index_h && index_h < input_height - 1) ? - (*(pixel_row_ptr + index_w * in_stride_y + (index_h + 1) * in_stride_z)) : - const_border_value; - const auto a11 = (-1 <= index_w && index_w < input_width - 1 && -1 <= index_h && index_h < input_height - 1) ? - (*(pixel_row_ptr + (index_w + 1) * in_stride_y + (index_h + 1) * in_stride_z)) : - const_border_value; - - const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); - *reinterpret_cast(out.ptr()) = Qasymm8QuantizationHelper::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int32_t index_h = std::floor((id[2] + sampling_offset) * scale_y - sampling_offset); + const int32_t index_w = + *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[1], id[2])))); + const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[1], id[2])))); + const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[1], id[2])))); + const auto pixel_row_ptr = reinterpret_cast(in.ptr()); + + const auto a00 = (0 <= index_w && index_w < input_width && 0 <= index_h && index_h < input_height) + ? (*(pixel_row_ptr + index_w * in_stride_y + index_h * in_stride_z)) + : const_border_value; + const auto a01 = (-1 <= index_w && index_w + 1 < input_width && 0 <= index_h && index_h < input_height) + ? (*(pixel_row_ptr + (index_w + 1) * in_stride_y + index_h * in_stride_z)) + : const_border_value; + const auto a10 = (0 <= index_w && index_w < input_width && -1 <= index_h && index_h < input_height - 1) + ? (*(pixel_row_ptr + index_w * in_stride_y + (index_h + 1) * in_stride_z)) + : const_border_value; + const auto a11 = + (-1 <= index_w && index_w < input_width - 1 && -1 <= index_h && index_h < input_height - 1) + ? 
(*(pixel_row_ptr + (index_w + 1) * in_stride_y + (index_h + 1) * in_stride_z)) + : const_border_value; + + const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); + const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); + const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); + const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); + *reinterpret_cast(out.ptr()) = Qasymm8QuantizationHelper::quantize( + scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); + }, + in, out); } - else if(border_mode == BorderMode::REPLICATE) + else if (border_mode == BorderMode::REPLICATE) { using FloatTagType = typename wrapper::traits::neon_bitvector_tag_t; using Int32TagType = typename wrapper::traits::neon_bitvector_tag_t; @@ -140,12 +154,12 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const const float32x4_t invvscale_o = wrapper::vdup_n(1.f / oq_info.scale, FloatTagType{}); const float32x4_t voffset_o = vdupq_n_f32(oq_info.offset); - for(int bo = bo_start; bo < bo_end; bo += bo_step) + for (int bo = bo_start; bo < bo_end; bo += bo_step) { const int8_t *in_ptr = reinterpret_cast(in.ptr() + bo * in_stride_b); int8_t *out_ptr = reinterpret_cast(out.ptr() + bo * out_stride_b); - for(int yo = yo_start; yo < yo_end; yo += yo_step) + for (int yo = yo_start; yo < yo_end; yo += yo_step) { // Floating-point coordinate const float yi_f = yo * scale_y + fp_coord_offset_y; @@ -162,7 +176,7 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const const int8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y; int8_t *out_ptr_yo = out_ptr + yo * out_stride_y; - for(int xo = xo_start; xo < xo_end; xo += xo_step) + for (int xo = xo_start; xo < xo_end; xo += xo_step) { // Floating-point coordinate const float xi_f = xo * scale_x + fp_coord_offset_x; @@ -193,7 +207,7 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const int8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x; int cout = 0; - for(; cout <= (out_dim_ch - step_cout); cout += step_cout) + for (; cout <= (out_dim_ch - step_cout); cout += step_cout) { const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(int8_t)); const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(int8_t)); @@ -203,34 +217,70 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const const int16x8_t in00_low = wrapper::vmovl(wrapper::vgetlow(in00)); const int16x8_t in00_high = wrapper::vmovl(wrapper::vgethigh(in00)); - const auto in00_0 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in00_low)), voffset_in)), vscale_in); - const auto in00_1 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in00_low)), voffset_in)), vscale_in); - const auto in00_2 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in00_high)), voffset_in)), vscale_in); - const auto in00_3 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in00_high)), voffset_in)), vscale_in); + const auto in00_0 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in00_low)), voffset_in)), + vscale_in); + const auto in00_1 = wrapper::vmul(wrapper::vcvt(wrapper::vsub( + wrapper::vmovl(wrapper::vgethigh(in00_low)), voffset_in)), + vscale_in); + const auto in00_2 = wrapper::vmul(wrapper::vcvt(wrapper::vsub( + wrapper::vmovl(wrapper::vgetlow(in00_high)), voffset_in)), + vscale_in); + const auto 
in00_3 = + wrapper::vmul(wrapper::vcvt( + wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in00_high)), voffset_in)), + vscale_in); const int16x8_t in01_low = wrapper::vmovl(wrapper::vgetlow(in01)); const int16x8_t in01_high = wrapper::vmovl(wrapper::vgethigh(in01)); - const auto in01_0 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in01_low)), voffset_in)), vscale_in); - const auto in01_1 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in01_low)), voffset_in)), vscale_in); - const auto in01_2 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in01_high)), voffset_in)), vscale_in); - const auto in01_3 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in01_high)), voffset_in)), vscale_in); + const auto in01_0 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in01_low)), voffset_in)), + vscale_in); + const auto in01_1 = wrapper::vmul(wrapper::vcvt(wrapper::vsub( + wrapper::vmovl(wrapper::vgethigh(in01_low)), voffset_in)), + vscale_in); + const auto in01_2 = wrapper::vmul(wrapper::vcvt(wrapper::vsub( + wrapper::vmovl(wrapper::vgetlow(in01_high)), voffset_in)), + vscale_in); + const auto in01_3 = + wrapper::vmul(wrapper::vcvt( + wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in01_high)), voffset_in)), + vscale_in); const int16x8_t in10_low = wrapper::vmovl(wrapper::vgetlow(in10)); const int16x8_t in10_high = wrapper::vmovl(wrapper::vgethigh(in10)); - const auto in10_0 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in10_low)), voffset_in)), vscale_in); - const auto in10_1 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in10_low)), voffset_in)), vscale_in); - const auto in10_2 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in10_high)), voffset_in)), vscale_in); - const auto in10_3 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in10_high)), voffset_in)), vscale_in); + const auto in10_0 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in10_low)), voffset_in)), + vscale_in); + const auto in10_1 = wrapper::vmul(wrapper::vcvt(wrapper::vsub( + wrapper::vmovl(wrapper::vgethigh(in10_low)), voffset_in)), + vscale_in); + const auto in10_2 = wrapper::vmul(wrapper::vcvt(wrapper::vsub( + wrapper::vmovl(wrapper::vgetlow(in10_high)), voffset_in)), + vscale_in); + const auto in10_3 = + wrapper::vmul(wrapper::vcvt( + wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in10_high)), voffset_in)), + vscale_in); const int16x8_t in11_low = wrapper::vmovl(wrapper::vgetlow(in11)); const int16x8_t in11_high = wrapper::vmovl(wrapper::vgethigh(in11)); - const auto in11_0 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in11_low)), voffset_in)), vscale_in); - const auto in11_1 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in11_low)), voffset_in)), vscale_in); - const auto in11_2 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in11_high)), voffset_in)), vscale_in); - const auto in11_3 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in11_high)), voffset_in)), vscale_in); + const auto in11_0 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in11_low)), voffset_in)), + vscale_in); + const auto in11_1 = wrapper::vmul(wrapper::vcvt(wrapper::vsub( + wrapper::vmovl(wrapper::vgethigh(in11_low)), voffset_in)), + 
vscale_in); + const auto in11_2 = wrapper::vmul(wrapper::vcvt(wrapper::vsub( + wrapper::vmovl(wrapper::vgetlow(in11_high)), voffset_in)), + vscale_in); + const auto in11_3 = + wrapper::vmul(wrapper::vcvt( + wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in11_high)), voffset_in)), + vscale_in); auto out_0 = wrapper::vmul(in00_0, s00); out_0 = wrapper::vmla(out_0, in01_0, s01); @@ -263,14 +313,16 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const const auto out_2_int = wrapper::vcvt(wrapper::vmla(voffset_o, out_2, invvscale_o)); const auto out_3_int = wrapper::vcvt(wrapper::vmla(voffset_o, out_3, invvscale_o)); #endif // defined(__aarch64__) && !defined(BARE_METAL) - const auto low_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int))); - const auto high_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int))); - const auto out = wrapper::vcombine(low_part, high_part); + const auto low_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int))); + const auto high_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int))); + const auto out = wrapper::vcombine(low_part, high_part); wrapper::vstore(out_ptr_xo_yo + cout * sizeof(int8_t), out); } - for(; cout < out_dim_ch; ++cout) + for (; cout < out_dim_ch; ++cout) { const int8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(int8_t)); const int8_t in01 = *(in_ptr_xi1_yi0 + cout * sizeof(int8_t)); @@ -291,7 +343,8 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const #if defined(__aarch64__) && !defined(BARE_METAL) *(out_ptr_xo_yo + cout * sizeof(int8_t)) = quantize_qasymm8_signed(out, oq_info); #else // defined(__aarch64__) && !defined(BARE_METAL) - *(out_ptr_xo_yo + cout * sizeof(int8_t)) = quantize_qasymm8_signed(out, oq_info, RoundingPolicy::TO_ZERO); + *(out_ptr_xo_yo + cout * sizeof(int8_t)) = + quantize_qasymm8_signed(out, oq_info, RoundingPolicy::TO_ZERO); #endif // defined(__aarch64__) && !defined(BARE_METAL) } } @@ -303,28 +356,39 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ARM_COMPUTE_ERROR("Not implemented"); } } -} +} // namespace namespace cpu { -void qasymm8_signed_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void qasymm8_signed_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { - if(policy == InterpolationPolicy::BILINEAR) + if (policy == InterpolationPolicy::BILINEAR) { - if(src->info()->quantization_info() == dst->info()->quantization_info() && border_mode == BorderMode::REPLICATE) + if (src->info()->quantization_info() == dst->info()->quantization_info() && + border_mode == BorderMode::REPLICATE) { - s8_neon_scale(src, dst, offsets, dx, dy, policy, border_mode, constant_border_value, sampling_offset, align_corners, window); + s8_neon_scale(src, dst, offsets, dx, dy, policy, border_mode, constant_border_value, sampling_offset, + align_corners, window); } else { - qasymm8_signed_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, 
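
The dispatcher that follows reuses the raw s8/u8 kernels when the input and output quantization infos match. The reason is that bilinear interpolation is a weighted average with weights summing to one, and the affine dequantization `q -> scale * (q - offset)` commutes with such averages, so interpolating raw integers and then dequantizing gives the same value (up to rounding) as dequantizing first. A small self-checking sketch of that equivalence, with made-up numbers:

```cpp
#include <cassert>
#include <cmath>

static void quantized_bilinear_equivalence()
{
    const float scale  = 0.25f;
    const int   offset = 3;
    const int   q[4]   = {10, 20, 30, 40};
    const float w[4]   = {0.1f, 0.2f, 0.3f, 0.4f}; // bilinear weights, sum to 1

    float raw = 0.f, deq = 0.f;
    for (int i = 0; i < 4; ++i)
    {
        raw += w[i] * q[i];                         // interpolate raw integers
        deq += w[i] * (scale * (q[i] - offset));    // interpolate dequantized values
    }
    // Interpolate-then-dequantize equals dequantize-then-interpolate.
    assert(std::abs(scale * (raw - offset) - deq) < 1e-5f);
}
```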
constant_border_value, sampling_offset, align_corners, window); + qasymm8_signed_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, + sampling_offset, align_corners, window); } } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { nearest_neon_scale(src, dst, offsets, sampling_offset, align_corners, window); } } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/scale/sve/fp16.cpp b/src/cpu/kernels/scale/sve/fp16.cpp index ceda19f366cd5ea033640a3a30be6432f22b0665..cb28f4cb1c6691aa59a48a4e4f2cb407a0c2686f 100644 --- a/src/cpu/kernels/scale/sve/fp16.cpp +++ b/src/cpu/kernels/scale/sve/fp16.cpp @@ -27,9 +27,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" + +#include "src/core/helpers/ScaleHelpers.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" @@ -41,8 +42,12 @@ namespace arm_compute { namespace { -void fp16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) +void fp16_sve_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) { const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; @@ -61,38 +66,50 @@ void fp16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *off const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - const auto out_ptr = reinterpret_cast(out.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &id) { - // Store results - svst1_f16(pg, out_ptr + x, svld1_f16(pg, in_ptr + offset + offset_row + x)); + const int32_t offset = + *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast( + align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); + const auto out_ptr = reinterpret_cast(out.ptr()); - x += svcntw(); - pg = svwhilelt_b16(x, window_end_x); - } - while(svptest_any(svptrue_b16(), pg)); - }, - out); -} + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do + { + // Store results + svst1_f16(pg, out_ptr + x, svld1_f16(pg, in_ptr + offset + offset_row + x)); + + x += svcntw(); + pg = svwhilelt_b16(x, window_end_x); + } while (svptest_any(svptrue_b16(), pg)); + }, + out); } +} // namespace namespace cpu { -void fp16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void fp16_sve_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value); - if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { fp16_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); } @@ -103,4 +120,4 @@ void fp16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, co } } // namespace cpu } // namespace arm_compute -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ \ No newline at end of file +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/scale/sve/fp32.cpp b/src/cpu/kernels/scale/sve/fp32.cpp index f3472f1efdf512cc7563a97598fddd62187dc2f6..cbb345edbbfa7e3a1c8b80b4b12afe4fb4a8d79c 100644 --- a/src/cpu/kernels/scale/sve/fp32.cpp +++ b/src/cpu/kernels/scale/sve/fp32.cpp @@ -25,23 +25,27 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" + +#include "src/core/helpers/ScaleHelpers.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" +#include #include #include -#include - namespace arm_compute { namespace { -void fp32_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) +void fp32_sve_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) { const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; @@ -60,38 +64,50 @@ void fp32_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *off const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - execute_window_loop(win, [&](const 
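
The SVE nearest-neighbour kernels in this part of the patch all share the same predicated loop shape: `svwhilelt` builds a predicate covering the remaining elements, the body loads and stores only the active lanes, and the loop exits once no lane is active. A minimal sketch of that pattern for a float32 row copy, mirroring the intrinsics used in the hunk (requires SVE hardware and `arm_sve.h`; the helper name is illustrative):

```cpp
#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>

static void copy_row_f32_sve(const float *in, float *out, int len)
{
    int      x  = 0;
    svbool_t pg = svwhilelt_b32(x, len); // predicate for the remaining elements
    do
    {
        // Only the active lanes are loaded and stored.
        svst1_f32(pg, out + x, svld1_f32(pg, in + x));
        x += static_cast<int>(svcntw()); // number of 32-bit lanes per SVE vector
        pg = svwhilelt_b32(x, len);
    } while (svptest_any(svptrue_b32(), pg));
}
#endif // __ARM_FEATURE_SVE
```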
Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - const auto out_ptr = reinterpret_cast(out.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b32(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &id) { - // Store results - svst1_f32(pg, out_ptr + x, svld1_f32(pg, in_ptr + offset + offset_row + x)); + const int32_t offset = + *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast( + align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); + const auto out_ptr = reinterpret_cast(out.ptr()); - x += svcntw(); - pg = svwhilelt_b32(x, window_end_x); - } - while(svptest_any(svptrue_b32(), pg)); - }, - out); -} + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b32(x, window_end_x); + do + { + // Store results + svst1_f32(pg, out_ptr + x, svld1_f32(pg, in_ptr + offset + offset_row + x)); + + x += svcntw(); + pg = svwhilelt_b32(x, window_end_x); + } while (svptest_any(svptrue_b32(), pg)); + }, + out); } +} // namespace namespace cpu { -void fp32_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void fp32_sve_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value); - if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { fp32_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); } diff --git a/src/cpu/kernels/scale/sve/integer.cpp b/src/cpu/kernels/scale/sve/integer.cpp index 82c70ee360d329a834244c3927f9bf8ee6e48be2..df950b1789cbcf4f1e8b21e7449345c99f808245 100644 --- a/src/cpu/kernels/scale/sve/integer.cpp +++ b/src/cpu/kernels/scale/sve/integer.cpp @@ -25,9 +25,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" + +#include "src/core/helpers/ScaleHelpers.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" @@ -39,8 +40,12 @@ namespace arm_compute { namespace { -void u8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) +void u8_sve_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window 
&window) { const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; @@ -59,32 +64,40 @@ void u8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offse const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - const auto out_ptr = reinterpret_cast(out.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &id) { - // Store results - svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x)); - - x += svcntw(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(svptrue_b8(), pg)); - }, - out); + const int32_t offset = + *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast( + align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); + const auto out_ptr = reinterpret_cast(out.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + // Store results + svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x)); + + x += svcntw(); + pg = svwhilelt_b8(x, window_end_x); + } while (svptest_any(svptrue_b8(), pg)); + }, + out); } -void s16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) +void s16_sve_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) { const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; @@ -103,38 +116,50 @@ void s16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offs const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - const auto out_ptr = reinterpret_cast(out.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &id) { - // Store results - svst1_s16(pg, out_ptr + x, svld1_s16(pg, in_ptr + offset + offset_row + x)); - - x += svcntw(); - pg = svwhilelt_b16(x, window_end_x); - } - while(svptest_any(svptrue_b16(), pg)); - }, - out); -} + const int32_t offset = + *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast( + align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); + const auto out_ptr = reinterpret_cast(out.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do + { + // Store results + svst1_s16(pg, out_ptr + x, svld1_s16(pg, in_ptr + offset + offset_row + x)); + + x += svcntw(); + pg = svwhilelt_b16(x, window_end_x); + } while (svptest_any(svptrue_b16(), pg)); + }, + out); } +} // namespace namespace cpu { -void u8_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void u8_sve_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value); - if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { u8_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); } @@ -144,12 +169,20 @@ void u8_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, cons } } -void s16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void s16_sve_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value); - if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { s16_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); } diff --git a/src/cpu/kernels/scale/sve/list.h b/src/cpu/kernels/scale/sve/list.h index b9c3a10a78f03fc0de5817eecf07aa7caea9bab2..aff741a4a7f48843d467738ef13084a9174244a6 100644 --- a/src/cpu/kernels/scale/sve/list.h +++ b/src/cpu/kernels/scale/sve/list.h @@ 
-28,10 +28,10 @@ namespace arm_compute { namespace cpu { -#define DECLARE_SCALE_KERNEL(func_name) \ - void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \ - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, \ - bool align_corners, const Window &window) +#define DECLARE_SCALE_KERNEL(func_name) \ + void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \ + InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, \ + float sampling_offset, bool align_corners, const Window &window) DECLARE_SCALE_KERNEL(fp16_sve_scale); DECLARE_SCALE_KERNEL(fp32_sve_scale); diff --git a/src/cpu/kernels/scale/sve/qasymm8.cpp b/src/cpu/kernels/scale/sve/qasymm8.cpp index d45a69e43b13b69d09f15ec91e242024437aa623..0fc794c6c2653a71ef494584ace74ac05243db18 100644 --- a/src/cpu/kernels/scale/sve/qasymm8.cpp +++ b/src/cpu/kernels/scale/sve/qasymm8.cpp @@ -25,10 +25,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" + +#include "src/core/helpers/ScaleHelpers.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" -#include "src/core/helpers/ScaleHelpers.h" #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" @@ -40,8 +40,12 @@ namespace arm_compute { namespace { -void qasymm8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) +void qasymm8_sve_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) { const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; @@ -60,38 +64,50 @@ void qasymm8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor * const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - const auto out_ptr = reinterpret_cast(out.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &id) { - // Store results - svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x)); + const int32_t offset = + *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast( + align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); + const auto out_ptr = reinterpret_cast(out.ptr()); - x += svcntw(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(svptrue_b8(), pg)); - }, - out); -} + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + // Store results + svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x)); + + x += svcntw(); + pg = svwhilelt_b8(x, window_end_x); + } while (svptest_any(svptrue_b8(), pg)); + }, + out); } +} // namespace namespace cpu { -void qasymm8_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void qasymm8_sve_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value); - if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { qasymm8_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); } diff --git a/src/cpu/kernels/scale/sve/qasymm8_signed.cpp b/src/cpu/kernels/scale/sve/qasymm8_signed.cpp index 67bca65f582c8697e149845ffdb8f6571f6e7907..68ea01e29e31811ccbf686891482685a953c042f 100644 --- a/src/cpu/kernels/scale/sve/qasymm8_signed.cpp +++ b/src/cpu/kernels/scale/sve/qasymm8_signed.cpp @@ -25,10 +25,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" + +#include "src/core/helpers/ScaleHelpers.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" -#include "src/core/helpers/ScaleHelpers.h" #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" @@ -40,8 +40,12 @@ namespace arm_compute { namespace { -void qasymm8_signed_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) +void qasymm8_signed_sve_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) { const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; @@ -60,38 +64,50 @@ void qasymm8_signed_sve_scale_nearest(const ITensor *src, ITensor *dst, const IT const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - const auto out_ptr = reinterpret_cast(out.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &id) { - // Store results - svst1_s8(pg, out_ptr + x, svld1_s8(pg, in_ptr + offset + offset_row + x)); + const int32_t offset = + *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast( + align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); + const auto out_ptr = reinterpret_cast(out.ptr()); - x += svcntw(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(svptrue_b8(), pg)); - }, - out); -} + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + // Store results + svst1_s8(pg, out_ptr + x, svld1_s8(pg, in_ptr + offset + offset_row + x)); + + x += svcntw(); + pg = svwhilelt_b8(x, window_end_x); + } while (svptest_any(svptrue_b8(), pg)); + }, + out); } +} // namespace namespace cpu { -void qasymm8_signed_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void qasymm8_signed_sve_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value); - if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { qasymm8_signed_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); } diff --git a/src/cpu/kernels/select/generic/neon/fp16.cpp b/src/cpu/kernels/select/generic/neon/fp16.cpp index b460213c726419b1ecdfd9852f67799247585ca2..38a58099bd9ee54854db0d46ebe04fa527050c32 100644 --- a/src/cpu/kernels/select/generic/neon/fp16.cpp +++ b/src/cpu/kernels/select/generic/neon/fp16.cpp @@ -23,20 +23,22 @@ */ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -#include "src/cpu/kernels/select/generic/neon/impl.h" - #include "arm_compute/core/Helpers.h" + #include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/select/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void neon_f16_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_f16_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_16(c, x, y, output, window); } -void neon_f16_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_f16_select_not_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, 
ITensor *output, const Window &window) { return select_op_not_same_rank(c, x, y, output, window); } @@ -45,4 +47,4 @@ void neon_f16_select_not_same_rank(const ITensor *c, const ITensor *x, const ITe } // namespace arm_compute -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ \ No newline at end of file +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/select/generic/neon/fp32.cpp b/src/cpu/kernels/select/generic/neon/fp32.cpp index 63fd594901ee1f60d4b583db1a0afc3c5771a0f8..50a80cb338854feb3492edbed1d2ea0685b0c568 100644 --- a/src/cpu/kernels/select/generic/neon/fp32.cpp +++ b/src/cpu/kernels/select/generic/neon/fp32.cpp @@ -22,20 +22,22 @@ * SOFTWARE. */ -#include "src/cpu/kernels/select/generic/neon/impl.h" - #include "arm_compute/core/Helpers.h" + #include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/select/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void neon_f32_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_f32_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_32(c, x, y, output, window); } -void neon_f32_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_f32_select_not_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_not_same_rank(c, x, y, output, window); } diff --git a/src/cpu/kernels/select/generic/neon/impl.cpp b/src/cpu/kernels/select/generic/neon/impl.cpp deleted file mode 100644 index 62c46f3c18f1186e1271b178936f3a090ea2a840..0000000000000000000000000000000000000000 --- a/src/cpu/kernels/select/generic/neon/impl.cpp +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/core/TensorInfo.h" -#include "src/cpu/kernels/select/generic/neon/impl.h" -#include "src/core/NEON/NEAsymm.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace cpu -{ -template -void select_op(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - const int window_step_x, const int window_start_x, const int window_end_x, const int limit, VectorType (*condition_conversion)(const uint8_t *)) -{ - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator condition(cond, win); - Iterator input1(in1, win); - Iterator input2(in2, win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto condition_ptr = reinterpret_cast(condition.ptr()); - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - - int x = window_start_x; - for(; x <= limit; x += window_step_x) - { - const auto c = (*condition_conversion)(condition_ptr + x); - const auto a = wrapper::vloadq(input1_ptr + x); - const auto b = wrapper::vloadq(input2_ptr + x); - wrapper::vstore(output_ptr + x, wrapper::vbsl(c, a, b)); - } - for(; x < window_end_x; ++x) - { - const auto c = *(condition_ptr + x); - const auto a = *(input1_ptr + x); - const auto b = *(input2_ptr + x); - *(output_ptr + x) = static_cast(c) ? a : b; - } - }, - condition, input1, input2, output); -} - -template -void select_op_8(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - const auto window_step_x = 16 / sizeof(ScalarType); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - select_op(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType - { - static const auto zero = wrapper::vdup_n(static_cast(0), arm_compute::wrapper::traits::vector_128_tag()); - return wrapper::vcgt(wrapper::vloadq(condition_ptr), zero); - }); -} - -template -void select_op_16(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - const auto window_step_x = 16 / sizeof(ScalarType); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - select_op(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType - { - static const auto zero = wrapper::vdup_n(static_cast(0), arm_compute::wrapper::traits::vector_128_tag()); - return wrapper::vcgt(wrapper::vmovl(wrapper::vload(condition_ptr)), zero); - }); -} - -template -void select_op_32(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - const auto window_step_x = 16 / sizeof(ScalarType); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - select_op(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType - { - static const auto zero = wrapper::vdup_n(static_cast(0), arm_compute::wrapper::traits::vector_128_tag()); - return wrapper::vcgt(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vload(condition_ptr)))), zero); - }); -} - 
-template -void select_op_not_same_rank(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - ARM_COMPUTE_UNUSED(window); - - auto output_ptr = reinterpret_cast(out->buffer()); - const auto condition_ptr = reinterpret_cast(cond->buffer()); - const auto input1_ptr = reinterpret_cast(in1->buffer()); - const auto input2_ptr = reinterpret_cast(in2->buffer()); - - const int outer_size = cond->info()->total_size() / cond->info()->element_size(); - const int inner_size = (in1->info()->total_size() / in1->info()->element_size()) / outer_size; - int offset = 0; - const int step = 16 / in1->info()->element_size(); - - for(int i = 0; i < outer_size; ++i) - { - int x = offset; - const auto input_ptr = static_cast(*(condition_ptr + i)) ? input1_ptr : input2_ptr; - for(; x <= offset + inner_size - step; x += step) - { - wrapper::vstore(output_ptr + x, wrapper::vloadq(input_ptr + x)); - } - if(x <= offset + inner_size - (step / 2)) - { - wrapper::vstore(output_ptr + x, wrapper::vload(input_ptr + x)); - x += step / 2; - } - for(; x < offset + inner_size; ++x) - { - *(output_ptr + x) = *(input_ptr + x); - } - offset += inner_size; - } -} - -template void select_op_32(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void select_op_not_same_rank(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void select_op_8(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void select_op_16(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -template void select_op_16(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -#endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ - -template void select_op_32(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void select_op_not_same_rank(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void select_op_not_same_rank(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -template void select_op_not_same_rank(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -#endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ - -template void select_op_not_same_rank(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void select_op_8(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void select_op_16(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void select_op_32(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void select_op_not_same_rank(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void select_op_not_same_rank(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void 
select_op_not_same_rank(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -} // namespace cpu - -} // namespace arm_compute diff --git a/src/cpu/kernels/select/generic/neon/impl.h b/src/cpu/kernels/select/generic/neon/impl.h index 2bbc38b7444393e156a68681f6f3edfc7b397fc1..7ce640b6ff7bd7d4f6fb147d30e1630d9db73ab5 100644 --- a/src/cpu/kernels/select/generic/neon/impl.h +++ b/src/cpu/kernels/select/generic/neon/impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,34 +21,157 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef SRC_CORE_NEON_KERNELS_SELECT_IMPL_H -#define SRC_CORE_NEON_KERNELS_SELECT_IMPL_H +#ifndef ACL_SRC_CPU_KERNELS_SELECT_GENERIC_NEON_IMPL_H +#define ACL_SRC_CPU_KERNELS_SELECT_GENERIC_NEON_IMPL_H -#include +#include "arm_compute/core/TensorInfo.h" + +#include "src/core/NEON/NEAsymm.h" +#include "src/cpu/kernels/select/generic/neon/impl.h" + +#include +#include +#include namespace arm_compute { -class ITensor; -class Window; - namespace cpu { -template -void select_op_not_same_rank(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - template -void select_op_32(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +void select_op(const ITensor *cond, + const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window, + const int window_step_x, + const int window_start_x, + const int window_end_x, + const int limit, + VectorType (*condition_conversion)(const uint8_t *)) +{ + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator condition(cond, win); + Iterator input1(in1, win); + Iterator input2(in2, win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast(output.ptr()); + const auto condition_ptr = reinterpret_cast(condition.ptr()); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + + int x = window_start_x; + for (; x <= limit; x += window_step_x) + { + const auto c = (*condition_conversion)(condition_ptr + x); + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + wrapper::vstore(output_ptr + x, wrapper::vbsl(c, a, b)); + } + for (; x < window_end_x; ++x) + { + const auto c = *(condition_ptr + x); + const auto a = *(input1_ptr + x); + const auto b = *(input2_ptr + x); + *(output_ptr + x) = static_cast(c) ? 
a : b; + } + }, + condition, input1, input2, output); +} template -void select_op_16(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +void select_op_8(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + const auto window_step_x = 16 / sizeof(ScalarType); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + select_op( + cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, + [](const uint8_t *condition_ptr) -> VectorType + { + static const auto zero = + wrapper::vdup_n(static_cast(0), arm_compute::wrapper::traits::vector_128_tag()); + return wrapper::vcgt(wrapper::vloadq(condition_ptr), zero); + }); +} template -void select_op_8(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +void select_op_16(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + const auto window_step_x = 16 / sizeof(ScalarType); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + select_op( + cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, + [](const uint8_t *condition_ptr) -> VectorType + { + static const auto zero = + wrapper::vdup_n(static_cast(0), arm_compute::wrapper::traits::vector_128_tag()); + return wrapper::vcgt(wrapper::vmovl(wrapper::vload(condition_ptr)), zero); + }); +} template -void select_op(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - const int window_step_x, const int window_start_x, const int window_end_x, const int limit, VectorType (*condition_conversion)(const uint8_t *)); +void select_op_32(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + const auto window_step_x = 16 / sizeof(ScalarType); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + select_op( + cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, + [](const uint8_t *condition_ptr) -> VectorType + { + static const auto zero = + wrapper::vdup_n(static_cast(0), arm_compute::wrapper::traits::vector_128_tag()); + return wrapper::vcgt(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vload(condition_ptr)))), zero); + }); +} + +template +void select_op_not_same_rank( + const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + ARM_COMPUTE_UNUSED(window); + + auto output_ptr = reinterpret_cast(out->buffer()); + const auto condition_ptr = reinterpret_cast(cond->buffer()); + const auto input1_ptr = reinterpret_cast(in1->buffer()); + const auto input2_ptr = reinterpret_cast(in2->buffer()); + + const int outer_size = cond->info()->total_size() / cond->info()->element_size(); + const int inner_size = (in1->info()->total_size() / in1->info()->element_size()) / outer_size; + int offset = 0; + const int step = 16 / in1->info()->element_size(); + for (int i = 0; i < outer_size; ++i) + { + int x = offset; + const auto input_ptr = static_cast(*(condition_ptr + i)) ? 
input1_ptr : input2_ptr; + for (; x <= offset + inner_size - step; x += step) + { + wrapper::vstore(output_ptr + x, wrapper::vloadq(input_ptr + x)); + } + if (x <= offset + inner_size - (step / 2)) + { + wrapper::vstore(output_ptr + x, wrapper::vload(input_ptr + x)); + x += step / 2; + } + for (; x < offset + inner_size; ++x) + { + *(output_ptr + x) = *(input_ptr + x); + } + offset += inner_size; + } +} } // namespace cpu } // namespace arm_compute -#endif //SRC_CORE_NEON_KERNELS_SELECT_IMPL_H \ No newline at end of file +#endif // ACL_SRC_CPU_KERNELS_SELECT_GENERIC_NEON_IMPL_H diff --git a/src/cpu/kernels/select/generic/neon/integer.cpp b/src/cpu/kernels/select/generic/neon/integer.cpp index 71b2f0b9336ece96a4f18dc9b2adcb33c57a2c6a..135087c2618bae32e75b4082fcd03dd19e3b8ca7 100644 --- a/src/cpu/kernels/select/generic/neon/integer.cpp +++ b/src/cpu/kernels/select/generic/neon/integer.cpp @@ -25,59 +25,71 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include <arm_neon.h> - #include "src/cpu/kernels/select/generic/neon/impl.h" +#include <arm_neon.h> + namespace arm_compute { namespace cpu { -void neon_s8_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_s8_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_8<int8_t, uint8x16_t>(c, x, y, output, window); } -void neon_s16_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_s16_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_16<int16_t, uint16x8_t>(c, x, y, output, window); } -void neon_s32_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_s32_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_32<int32_t, uint32x4_t>(c, x, y, output, window); } -void neon_s8_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_s8_select_not_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_not_same_rank<int8_t>(c, x, y, output, window); } -void neon_s16_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_s16_select_not_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_not_same_rank<int16_t>(c, x, y, output, window); } -void neon_s32_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_s32_select_not_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_not_same_rank<int32_t>(c, x, y, output, window); } -void neon_u8_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_u8_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_8<uint8_t, uint8x16_t>(c, x, y, output, window); } -void neon_u16_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_u16_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return
select_op_16<uint16_t, uint16x8_t>(c, x, y, output, window); } -void neon_u32_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_u32_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_32<uint32_t, uint32x4_t>(c, x, y, output, window); } -void neon_u8_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_u8_select_not_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_not_same_rank<uint8_t>(c, x, y, output, window); } -void neon_u16_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_u16_select_not_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_not_same_rank<uint16_t>(c, x, y, output, window); } -void neon_u32_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_u32_select_not_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_not_same_rank<uint32_t>(c, x, y, output, window); } diff --git a/src/cpu/kernels/softmax/generic/neon/fp16.cpp b/src/cpu/kernels/softmax/generic/neon/fp16.cpp index f6556696b06ca8ba7776c7130c691066001cce24..2e2adf33e08385cb14b99f406113f2d1e49301a8 100644 --- a/src/cpu/kernels/softmax/generic/neon/fp16.cpp +++ b/src/cpu/kernels/softmax/generic/neon/fp16.cpp @@ -23,6 +23,7 @@ */ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) #include "arm_compute/core/Helpers.h" + #include "src/cpu/CpuTypes.h" #include "src/cpu/kernels/softmax/generic/neon/impl.h" @@ -30,8 +31,13 @@ namespace arm_compute { namespace cpu { -void neon_fp16_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void neon_fp16_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return neon_softmax_logits_1d_float<float16_t>(in, max, tmp, out, beta, is_log, window); } @@ -40,6 +46,6 @@ void neon_fp16_logits(const ITensor *in, ITensor *out, const Window &window) { return neon_logits_1d_max<float16_t>(in, out, window); } -} +} // namespace cpu } // namespace arm_compute #endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/src/cpu/kernels/softmax/generic/neon/fp32.cpp b/src/cpu/kernels/softmax/generic/neon/fp32.cpp index ddd270ae709392491231583627c0da33262680b7..61df40c1b5d4ba9ae47c9b8e037ac8351dc49a30 100644 --- a/src/cpu/kernels/softmax/generic/neon/fp32.cpp +++ b/src/cpu/kernels/softmax/generic/neon/fp32.cpp @@ -22,14 +22,20 @@ * SOFTWARE.
*/ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void neon_fp32_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void neon_fp32_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return neon_softmax_logits_1d_float(in, max, tmp, out, beta, is_log, window); } @@ -38,5 +44,5 @@ void neon_fp32_logits(const ITensor *in, ITensor *out, const Window &window) { return neon_logits_1d_max(in, out, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/impl.cpp b/src/cpu/kernels/softmax/generic/neon/impl.cpp index 5654bb52ca14a3cea1cf21e8fba519dcc49ab3bc..5d6e6a4f807f6089879d309858074683fe61c9be 100644 --- a/src/cpu/kernels/softmax/generic/neon/impl.cpp +++ b/src/cpu/kernels/softmax/generic/neon/impl.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,77 +22,21 @@ * SOFTWARE. */ #include "src/cpu/kernels/softmax/generic/neon/impl.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "support/SaturateCast.h" namespace arm_compute { namespace cpu { -template -void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) -{ - /** SIMD vector tag type. */ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - - constexpr int window_step_x = 16 / sizeof(T); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win{ window }; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator input(in, win); - Iterator output(out, win); - - const int sum_stages = log2(window_step_x / 2); - execute_window_loop(win, [&](const Coordinates &) - { - // Get pointers - const auto in_ptr = reinterpret_cast(input.ptr()); - const auto out_ptr = reinterpret_cast(output.ptr()); - - // Init max value - auto vec_max = wrapper::vdup_n(support::cpp11::lowest(), ExactTagType{}); - int x = window_start_x; - - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto current_value = wrapper::vloadq(in_ptr + x); - vec_max = wrapper::vmax(vec_max, current_value); - } - auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); - - for(int i = 0; i < sum_stages; ++i) - { - carry_max = wrapper::vpmax(carry_max, carry_max); - } - T max_val = wrapper::vgetlane(carry_max, 0); - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - max_val = *(in_ptr + x) > max_val ? 
*(in_ptr + x) : max_val; - } - - *out_ptr = max_val; - }, - input, output); -} - -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) -template void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); -#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) -template void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); template void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); template void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); template -void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window) +void neon_softmax_logits_1d_quantized( + const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window) { - static_assert(std::is_same::value - || std::is_same::value, + static_assert(std::is_same::value || std::is_same::value, "quantized type should be either qasymm8_t or qasymm8_signed_t."); const int start_x = in->info()->valid_region().anchor.x(); @@ -106,294 +50,174 @@ void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, voi Iterator out_it(out, window); constexpr int vec_size = 16; - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast(tmp); - - float sum{}; - float sum_inversed{}; - - /* Compute exponentials and sum */ + execute_window_loop( + window, + [&](const Coordinates &) { - /* Get max value */ - const auto max_val = *reinterpret_cast(max_it.ptr()); - const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{}); + /* Get pointers */ + const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast(tmp); - /* Init sum to zero */ - float32x4x4_t vec_sum = - { - vdupq_n_f32(0.f), - vdupq_n_f32(0.f), - vdupq_n_f32(0.f), - vdupq_n_f32(0.f), - }; + float sum{}; + float sum_inversed{}; - /* Loop over row and compute exponentials and sum */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) + /* Compute exponentials and sum */ { - auto vec_elements = wrapper::vloadq(in_ptr + x); - vec_elements = wrapper::vqsub(vec_max, vec_elements); - auto vec_elements_flt = convert_int_to_float(vec_elements); - - if(is_log) - { - vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec); - vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec); - vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec); - vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec); - vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0])); - vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1])); - vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2])); - vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3])); - } - else - { - vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec)); - vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec)); - vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec)); - vec_elements_flt.val[3] = 
vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec)); - vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]); - vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]); - vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]); - vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]); - } - - vst4q_f32(tmp_ptr + x, vec_elements_flt); - } - - /* Reduce sum */ - const auto sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3])); - auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte)); - sum_res = vpadd_f32(sum_res, sum_res); - sum = wrapper::vgetlane(sum_res, 0); + /* Get max value */ + const auto max_val = *reinterpret_cast(max_it.ptr()); + const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{}); + + /* Init sum to zero */ + float32x4x4_t vec_sum = { + vdupq_n_f32(0.f), + vdupq_n_f32(0.f), + vdupq_n_f32(0.f), + vdupq_n_f32(0.f), + }; + + /* Loop over row and compute exponentials and sum */ + int x = 0; + for (; x <= (input_width - vec_size); x += vec_size) + { + auto vec_elements = wrapper::vloadq(in_ptr + x); + vec_elements = wrapper::vqsub(vec_max, vec_elements); + auto vec_elements_flt = convert_int_to_float(vec_elements); + + if (is_log) + { + vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec); + vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec); + vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec); + vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec); + vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0])); + vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1])); + vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2])); + vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3])); + } + else + { + vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec)); + vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec)); + vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec)); + vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec)); + vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]); + vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]); + vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]); + vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]); + } - /* Run remaining elements */ - for(; x < input_width; ++x) - { - float element{}; - if(is_log) - { - element = (max_val - in_ptr[x]) * scale_beta; - sum += std::exp(element); - } - else - { - element = std::exp((max_val - in_ptr[x]) * scale_beta); - sum += element; + vst4q_f32(tmp_ptr + x, vec_elements_flt); } - tmp_ptr[x] = element; - } + /* Reduce sum */ + const auto sum_16_byte = + vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3])); + auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte)); + sum_res = vpadd_f32(sum_res, sum_res); + sum = wrapper::vgetlane(sum_res, 0); - if(!is_log) - { - sum_inversed = 256.f / sum; - } - else - { - sum = std::log(sum); - } - } - - /* Normalize exponentials */ - { - constexpr bool is_qasymm8_signed = std::is_same::value; - /* Loop over row and compute softmax */ - int x = 0; - for(; x <= 
(input_width - vec_size); x += vec_size) - { - using int_vec_type = wrapper::traits::neon_vector_t; - float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x); - int_vec_type normalized_value{}; - if(is_log) - { - const float32x4x4_t sub = - { - vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)), - }; - normalized_value = convert_float_to_int(sub); - } - else + /* Run remaining elements */ + for (; x < input_width; ++x) { - float32x4x4_t mul = + float element{}; + if (is_log) { - vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)), - }; - - if(is_qasymm8_signed) + element = (max_val - in_ptr[x]) * scale_beta; + sum += std::exp(element); + } + else { - const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{}); - mul.val[0] = wrapper::vsub(mul.val[0], offset_vec); - mul.val[1] = wrapper::vsub(mul.val[1], offset_vec); - mul.val[2] = wrapper::vsub(mul.val[2], offset_vec); - mul.val[3] = wrapper::vsub(mul.val[3], offset_vec); + element = std::exp((max_val - in_ptr[x]) * scale_beta); + sum += element; } - normalized_value = convert_float_to_int(mul); - } - wrapper::vstore(out_ptr + x, normalized_value); - } - /* Run remaining elements */ - for(; x < input_width; ++x) - { - if(is_log) - { - out_ptr[x] = utils::cast::saturate_cast(tmp_ptr[x] - sum); - } - else - { - out_ptr[x] = utils::cast::saturate_cast((tmp_ptr[x] * sum_inversed) - (is_qasymm8_signed ? 128.f : 0)); - } - } - } - }, - in_it, max_it, out_it); -} - -template void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window); -template void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window); -template -void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) -{ - const int start_x = in->info()->valid_region().anchor.x(); - const int input_width = in->info()->valid_region().shape.x(); - - Iterator in_it(in, window); - Iterator max_it(max, window); - Iterator out_it(out, window); - - /** SIMD vector tag type. 
*/ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - - constexpr int vec_size = 16 / sizeof(T); - const int sum_stages = log2(vec_size / 2); - - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast(tmp); - - T sum{}; - T sum_inversed{}; - - /* Compute exponentials and sum */ - { - /* Get max value */ - const auto max_val = *reinterpret_cast(max_it.ptr()); - const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{}); - - /* Init sum to zero */ - auto vec_sum = wrapper::vdup_n(static_cast(0), ExactTagType{}); - - /* Loop over row and compute exponentials and sum */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - auto vec_elements = wrapper::vloadq(in_ptr + x); - vec_elements = wrapper::vsub(vec_elements, vec_max); - if(is_log) - { - vec_elements = wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast(beta), ExactTagType{})); - vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements)); - } - else - { - vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast(beta), ExactTagType{}))); - vec_sum = wrapper::vadd(vec_sum, vec_elements); + tmp_ptr[x] = element; } - wrapper::vstore(tmp_ptr + x, vec_elements); - } - /* Reduce sum */ - auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum)); - for(int i = 0; i < sum_stages; ++i) - { - sum_res = wrapper::vpadd(sum_res, sum_res); - } - sum = wrapper::vgetlane(sum_res, 0); - - /* Run remaining elements */ - for(; x < input_width; ++x) - { - T element{}; - - if(is_log) + if (!is_log) { - element = (in_ptr[x] - max_val) * beta; - sum += std::exp(element); + sum_inversed = 256.f / sum; } else { - element = std::exp((in_ptr[x] - max_val) * beta); - sum += element; + sum = std::log(sum); } - tmp_ptr[x] = element; - } - - if(!is_log) - { - sum_inversed = T(1) / sum; - } - else - { - sum = static_cast(std::log(sum)); } - } - /* Normalize exponentials */ - { - /* Loop over row and compute softmax */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) + /* Normalize exponentials */ { - auto vec_in = wrapper::vloadq(tmp_ptr + x); - auto normalized_value = wrapper::vdup_n(static_cast(0), ExactTagType{}); - if(is_log) - { - normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast(sum), ExactTagType{})); - } - else - { - normalized_value = wrapper::vmul(vec_in, wrapper::vdup_n(static_cast(sum_inversed), ExactTagType{})); - } - wrapper::vstore(out_ptr + x, normalized_value); - } - /* Run remaining elements */ - for(; x < input_width; ++x) - { - if(is_log) - { - out_ptr[x] = tmp_ptr[x] - sum; + constexpr bool is_qasymm8_signed = std::is_same::value; + /* Loop over row and compute softmax */ + int x = 0; + for (; x <= (input_width - vec_size); x += vec_size) + { + using int_vec_type = wrapper::traits::neon_vector_t; + float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x); + int_vec_type normalized_value{}; + if (is_log) + { + const float32x4x4_t sub = { + vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)), + vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)), + vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)), + vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)), + }; + normalized_value = convert_float_to_int(sub); + } + else + { + float32x4x4_t mul = { + vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)), + vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)), + 
vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)), + vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)), + }; + + if (is_qasymm8_signed) + { + const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{}); + mul.val[0] = wrapper::vsub(mul.val[0], offset_vec); + mul.val[1] = wrapper::vsub(mul.val[1], offset_vec); + mul.val[2] = wrapper::vsub(mul.val[2], offset_vec); + mul.val[3] = wrapper::vsub(mul.val[3], offset_vec); + } + + normalized_value = convert_float_to_int(mul); + } + wrapper::vstore(out_ptr + x, normalized_value); } - else + /* Run remaining elements */ + for (; x < input_width; ++x) { - out_ptr[x] = tmp_ptr[x] * sum_inversed; + if (is_log) + { + out_ptr[x] = utils::cast::saturate_cast(tmp_ptr[x] - sum); + } + else + { + out_ptr[x] = utils::cast::saturate_cast((tmp_ptr[x] * sum_inversed) - + (is_qasymm8_signed ? 128.f : 0)); + } } } - } - }, - in_it, max_it, out_it); + }, + in_it, max_it, out_it); } -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) -template void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window); -#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) -template void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window); + +template void neon_softmax_logits_1d_quantized(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + float beta, + bool is_log, + const Window &window); +template void neon_softmax_logits_1d_quantized(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + float beta, + bool is_log, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/impl.h b/src/cpu/kernels/softmax/generic/neon/impl.h index 6ca659919a9e8399bd384464a874cf05cebd3541..4d9b7892974f167290c24ebe6ae56d00594cbbc9 100644 --- a/src/cpu/kernels/softmax/generic/neon/impl.h +++ b/src/cpu/kernels/softmax/generic/neon/impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,20 +26,209 @@ #include "arm_compute/core/Helpers.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" + namespace arm_compute { namespace cpu { template -void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); +void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) +{ + /** SIMD vector tag type. 
*/ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; + + constexpr int window_step_x = 16 / sizeof(T); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + Window win{window}; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator input(in, win); + Iterator output(out, win); + + const int sum_stages = log2(window_step_x / 2); + execute_window_loop( + win, + [&](const Coordinates &) + { + // Get pointers + const auto in_ptr = reinterpret_cast(input.ptr()); + const auto out_ptr = reinterpret_cast(output.ptr()); + + // Init max value + auto vec_max = wrapper::vdup_n(support::cpp11::lowest(), ExactTagType{}); + int x = window_start_x; + + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto current_value = wrapper::vloadq(in_ptr + x); + vec_max = wrapper::vmax(vec_max, current_value); + } + auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); + + for (int i = 0; i < sum_stages; ++i) + { + carry_max = wrapper::vpmax(carry_max, carry_max); + } + T max_val = wrapper::vgetlane(carry_max, 0); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val; + } + + *out_ptr = max_val; + }, + input, output); +} template -void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window); +void neon_softmax_logits_1d_quantized(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + float beta, + bool is_log, + const Window &window); template -void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window); +void neon_softmax_logits_1d_float(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) +{ + const int start_x = in->info()->valid_region().anchor.x(); + const int input_width = in->info()->valid_region().shape.x(); + + Iterator in_it(in, window); + Iterator max_it(max, window); + Iterator out_it(out, window); + + /** SIMD vector tag type. 
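// Illustrative float-only rendering of the horizontal max reduction that
// neon_logits_1d_max performs above through the wrapper:: templates -- a hedged sketch,
// not the library code. reduce_max_f32 is a hypothetical name and the snippet needs an
// Arm toolchain (<arm_neon.h>); the structure (vector max, pairwise fold, scalar tail)
// mirrors the kernel.
#include <arm_neon.h>

#include <algorithm>
#include <limits>

static inline float reduce_max_f32(const float *ptr, int len)
{
    float32x4_t vec_max = vdupq_n_f32(std::numeric_limits<float>::lowest());
    int         x       = 0;
    for (; x <= len - 4; x += 4)
    {
        vec_max = vmaxq_f32(vec_max, vld1q_f32(ptr + x));
    }
    // Fold the two 64-bit halves together, then pairwise-max down to a single lane.
    float32x2_t carry_max = vpmax_f32(vget_high_f32(vec_max), vget_low_f32(vec_max));
    carry_max             = vpmax_f32(carry_max, carry_max);
    float max_val         = vget_lane_f32(carry_max, 0);
    // Scalar tail for the leftover elements, as in the kernel's compute-left-over loop.
    for (; x < len; ++x)
    {
        max_val = std::max(max_val, ptr[x]);
    }
    return max_val;
}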
*/ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; + + constexpr int vec_size = 16 / sizeof(T); + const int sum_stages = log2(vec_size / 2); + + execute_window_loop( + window, + [&](const Coordinates &) + { + /* Get pointers */ + const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast(tmp); + + T sum{}; + T sum_inversed{}; + + /* Compute exponentials and sum */ + { + /* Get max value */ + const auto max_val = *reinterpret_cast(max_it.ptr()); + const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{}); + + /* Init sum to zero */ + auto vec_sum = wrapper::vdup_n(static_cast(0), ExactTagType{}); + + /* Loop over row and compute exponentials and sum */ + int x = 0; + for (; x <= (input_width - vec_size); x += vec_size) + { + auto vec_elements = wrapper::vloadq(in_ptr + x); + vec_elements = wrapper::vsub(vec_elements, vec_max); + if (is_log) + { + vec_elements = + wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast(beta), ExactTagType{})); + vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements)); + } + else + { + vec_elements = wrapper::vexpq( + wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast(beta), ExactTagType{}))); + vec_sum = wrapper::vadd(vec_sum, vec_elements); + } + wrapper::vstore(tmp_ptr + x, vec_elements); + } + + /* Reduce sum */ + auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum)); + for (int i = 0; i < sum_stages; ++i) + { + sum_res = wrapper::vpadd(sum_res, sum_res); + } + sum = wrapper::vgetlane(sum_res, 0); + + /* Run remaining elements */ + for (; x < input_width; ++x) + { + T element{}; + + if (is_log) + { + element = (in_ptr[x] - max_val) * beta; + sum += std::exp(element); + } + else + { + element = std::exp((in_ptr[x] - max_val) * beta); + sum += element; + } + tmp_ptr[x] = element; + } + + if (!is_log) + { + sum_inversed = T(1) / sum; + } + else + { + sum = static_cast(std::log(sum)); + } + } + + /* Normalize exponentials */ + { + /* Loop over row and compute softmax */ + int x = 0; + for (; x <= (input_width - vec_size); x += vec_size) + { + auto vec_in = wrapper::vloadq(tmp_ptr + x); + auto normalized_value = wrapper::vdup_n(static_cast(0), ExactTagType{}); + if (is_log) + { + normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast(sum), ExactTagType{})); + } + else + { + normalized_value = + wrapper::vmul(vec_in, wrapper::vdup_n(static_cast(sum_inversed), ExactTagType{})); + } + wrapper::vstore(out_ptr + x, normalized_value); + } + /* Run remaining elements */ + for (; x < input_width; ++x) + { + if (is_log) + { + out_ptr[x] = tmp_ptr[x] - sum; + } + else + { + out_ptr[x] = tmp_ptr[x] * sum_inversed; + } + } + } + }, + in_it, max_it, out_it); +} } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp index a5728915615e3e3222b91d263579b7d501e892f6..40713dc49659f4f337a83cb23c85f49e4c717be0 100644 --- a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp @@ -22,14 +22,20 @@ * SOFTWARE. 
*/ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void neon_qasymm8_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void neon_qasymm8_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return neon_softmax_logits_1d_quantized(in, max, tmp, out, beta, is_log, window); } @@ -38,5 +44,5 @@ void neon_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window) { return neon_logits_1d_max(in, out, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp index 7d3fe6e046f0f1a62e7a2cdc0c2da64048329a15..2c5e284f5450a19fd9b10011ebe8c44eec2074da 100644 --- a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp @@ -22,14 +22,20 @@ * SOFTWARE. */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void neon_qasymm8_signed_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void neon_qasymm8_signed_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return neon_softmax_logits_1d_quantized(in, max, tmp, out, beta, is_log, window); } @@ -38,5 +44,5 @@ void neon_qasymm8_singed_logits(const ITensor *in, ITensor *out, const Window &w { return neon_logits_1d_max(in, out, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/fp16.cpp b/src/cpu/kernels/softmax/generic/sve/fp16.cpp index 15a523bfc9b1fbe7b10326fec76c359b96ac6c4d..5e94f72faf4092ccc95a81841592112e9329dbc0 100644 --- a/src/cpu/kernels/softmax/generic/sve/fp16.cpp +++ b/src/cpu/kernels/softmax/generic/sve/fp16.cpp @@ -23,14 +23,20 @@ */ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) #include "arm_compute/core/Helpers.h" + #include "src/cpu/CpuTypes.h" #include "src/cpu/kernels/softmax/generic/sve/impl.h" namespace arm_compute { namespace cpu { -void sve_fp16_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void sve_fp16_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return sve_softmax_logits_1d_float(in, max, tmp, out, beta, is_log, window); } @@ -39,6 +45,6 @@ void sve_fp16_logits(const ITensor *in, ITensor *out, const Window &window) { return sve_logits_1d_max(in, out, window); } -} +} // namespace cpu } // namespace arm_compute #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/softmax/generic/sve/fp32.cpp b/src/cpu/kernels/softmax/generic/sve/fp32.cpp index 55c4aee426689ab7b3a9dd42c05f435b5887b3a0..d692cc2477b73ecfd881c58c977e0b1350a101e3 100644 --- a/src/cpu/kernels/softmax/generic/sve/fp32.cpp +++ b/src/cpu/kernels/softmax/generic/sve/fp32.cpp @@ -23,14 +23,20 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/sve/impl.h" namespace arm_compute { 
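// Illustrative scalar reference for the (log-)softmax contract that the NEON and SVE
// kernels in this patch vectorise -- a sketch, not the library code; the function name
// is hypothetical and a non-empty row (width > 0) is assumed. Each kernel follows the
// same three steps per row: subtract the row maximum for numerical stability, scale by
// beta and exponentiate while accumulating the sum, then either divide by the sum
// (softmax) or subtract log(sum) (log-softmax).
#include <algorithm>
#include <cmath>
#include <vector>

static void softmax_row_reference(const float *in, float *out, int width, float beta, bool is_log)
{
    // Subtracting the row max leaves the result unchanged but keeps exp() in range.
    const float max_val = *std::max_element(in, in + width);

    std::vector<float> tmp(width);
    float              sum = 0.f;
    for (int x = 0; x < width; ++x)
    {
        const float shifted = (in[x] - max_val) * beta;
        tmp[x]              = is_log ? shifted : std::exp(shifted);
        sum += std::exp(shifted);
    }

    for (int x = 0; x < width; ++x)
    {
        out[x] = is_log ? tmp[x] - std::log(sum) : tmp[x] / sum;
    }
}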
namespace cpu { -void sve_fp32_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void sve_fp32_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return sve_softmax_logits_1d_float(in, max, tmp, out, beta, is_log, window); } @@ -39,5 +45,5 @@ void sve_fp32_logits(const ITensor *in, ITensor *out, const Window &window) { return sve_logits_1d_max(in, out, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/impl.cpp b/src/cpu/kernels/softmax/generic/sve/impl.cpp index 2340a31cbd4dba077677116a31d350f7c9eb6258..24f1bb8143db6553c73483ec9c489df5a5b8fa49 100644 --- a/src/cpu/kernels/softmax/generic/sve/impl.cpp +++ b/src/cpu/kernels/softmax/generic/sve/impl.cpp @@ -23,6 +23,7 @@ */ #include "src/cpu/kernels/softmax/generic/sve/impl.h" + #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" namespace arm_compute @@ -36,42 +37,48 @@ void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator input(in, win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - // Get pointers - const auto in_ptr = reinterpret_cast(input.ptr()); - const auto out_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + // Get pointers + const auto in_ptr = reinterpret_cast(input.ptr()); + const auto out_ptr = reinterpret_cast(output.ptr()); - // Init max value - auto vec_max = wrapper::svdup_n(support::cpp11::lowest()); + // Init max value + auto vec_max = wrapper::svdup_n(support::cpp11::lowest()); - int x = window_start_x; - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do - { - const auto current_value = svld1(pg, in_ptr + x); - vec_max = svmax_m(pg, vec_max, current_value); + int x = window_start_x; + svbool_t pg = wrapper::svwhilelt(x, window_end_x); + do + { + const auto current_value = svld1(pg, in_ptr + x); + vec_max = svmax_m(pg, vec_max, current_value); - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); - auto max_val = svmaxv(all_true_pg, vec_max); + auto max_val = svmaxv(all_true_pg, vec_max); - *out_ptr = max_val; - }, - input, output); + *out_ptr = max_val; + }, + input, output); } template -void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void sve_softmax_logits_1d_float(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { const int start_x = in->info()->valid_region().anchor.x(); const int input_width = in->info()->valid_region().shape.x(); @@ -82,88 +89,88 @@ void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *co const auto all_true_pg = wrapper::svptrue(); - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; - 
const auto tmp_ptr = reinterpret_cast(tmp); - - ScalarType sum{ 0 }; - - /* Compute exponentials and sum */ + execute_window_loop( + window, + [&](const Coordinates &) { - /* Get max value */ - const auto max_val = *reinterpret_cast(max_it.ptr()); - const auto vec_max = wrapper::svdup_n(max_val); - const auto vec_beta = wrapper::svdup_n(static_cast(beta)); + /* Get pointers */ + const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast(tmp); - /* Init sum to zero */ - auto vec_sum = wrapper::svdup_n(static_cast(0)); + ScalarType sum{0}; - /* Loop over row and compute exponentials and sum */ - int x = 0; - svbool_t pg = wrapper::svwhilelt(x, input_width); - do + /* Compute exponentials and sum */ { - auto vec_elements = svld1(pg, in_ptr + x); - vec_elements = svmul_z(pg, svsub_z(pg, vec_elements, vec_max), vec_beta); - if(!is_log) + /* Get max value */ + const auto max_val = *reinterpret_cast(max_it.ptr()); + const auto vec_max = wrapper::svdup_n(max_val); + const auto vec_beta = wrapper::svdup_n(static_cast(beta)); + + /* Init sum to zero */ + auto vec_sum = wrapper::svdup_n(static_cast(0)); + + /* Loop over row and compute exponentials and sum */ + int x = 0; + svbool_t pg = wrapper::svwhilelt(x, input_width); + do { - vec_elements = wrapper::svexp_z(pg, vec_elements); - vec_sum = svadd_m(pg, vec_sum, vec_elements); + auto vec_elements = svld1(pg, in_ptr + x); + vec_elements = svmul_z(pg, svsub_z(pg, vec_elements, vec_max), vec_beta); + if (!is_log) + { + vec_elements = wrapper::svexp_z(pg, vec_elements); + vec_sum = svadd_m(pg, vec_sum, vec_elements); + } + svst1(pg, tmp_ptr + x, vec_elements); + + if (is_log) + { + vec_sum = svadd_m(pg, vec_sum, wrapper::svexp_z(pg, vec_elements)); + } + + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, input_width); + } while (svptest_any(all_true_pg, pg)); + + /* Reduce sum */ + sum = svaddv(all_true_pg, vec_sum); + + if (is_log) + { + sum = static_cast(std::log(sum)); } - svst1(pg, tmp_ptr + x, vec_elements); - - if(is_log) + else { - vec_sum = svadd_m(pg, vec_sum, wrapper::svexp_z(pg, vec_elements)); + sum = ScalarType(1) / sum; } - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, input_width); } - while(svptest_any(all_true_pg, pg)); - /* Reduce sum */ - sum = svaddv(all_true_pg, vec_sum); - - if(is_log) - { - sum = static_cast(std::log(sum)); - } - else - { - sum = ScalarType(1) / sum; - } - } - - /* Normalize exponentials */ - { - /* Loop over row and compute softmax */ - int x = 0; - svbool_t pg = wrapper::svwhilelt(x, input_width); - do + /* Normalize exponentials */ { - auto vec_in = svld1(pg, tmp_ptr + x); - auto normalized_value = wrapper::svdup_n(static_cast(0)); - if(is_log) - { - normalized_value = svsub_z(pg, vec_in, wrapper::svdup_n(static_cast(sum))); - } - else + /* Loop over row and compute softmax */ + int x = 0; + svbool_t pg = wrapper::svwhilelt(x, input_width); + do { - normalized_value = svmul_z(pg, vec_in, wrapper::svdup_n(static_cast(sum))); - } - svst1(pg, out_ptr + x, normalized_value); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, input_width); + auto vec_in = svld1(pg, tmp_ptr + x); + auto normalized_value = wrapper::svdup_n(static_cast(0)); + if (is_log) + { + normalized_value = svsub_z(pg, vec_in, wrapper::svdup_n(static_cast(sum))); + } + else + { + normalized_value = svmul_z(pg, vec_in, wrapper::svdup_n(static_cast(sum))); + } + svst1(pg, out_ptr + x, normalized_value); + + x += 
wrapper::svcnt(); + pg = wrapper::svwhilelt(x, input_width); + } while (svptest_any(all_true_pg, pg)); } - while(svptest_any(all_true_pg, pg)); - } - }, - in_it, max_it, out_it); + }, + in_it, max_it, out_it); } template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); @@ -171,9 +178,19 @@ template void sve_logits_1d_max(const ITensor *in, ITensor *out, cons template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); -template void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window); -template void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window); +template void sve_softmax_logits_1d_float(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window); +template void sve_softmax_logits_1d_float(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/impl.h b/src/cpu/kernels/softmax/generic/sve/impl.h index 4f76ec6a26a27315207e8f9403e47a87a2392dfb..89a30d042f15515d6a31ce9cde886ed3983d4a2a 100644 --- a/src/cpu/kernels/softmax/generic/sve/impl.h +++ b/src/cpu/kernels/softmax/generic/sve/impl.h @@ -33,8 +33,13 @@ template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); template -void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window); +void sve_softmax_logits_1d_float(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp index e9044d5fc987785e08b6d8c8a6a42fc64bbd157e..85e5ccfea17f4d324bcc8a1d5d3e7380c050773f 100644 --- a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp +++ b/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/sve/impl.h" namespace arm_compute @@ -33,5 +34,5 @@ void sve_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window) { return sve_logits_1d_max(in, out, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp index ab45ce598d58f24b4af6016c830876b5f039430b..4be2e2eed6619297bd58ab09d8ebd8937e02f621 100644 --- a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp +++ b/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/sve/impl.h" namespace arm_compute @@ -33,5 +34,5 @@ void sve_qasymm8_signed_logits(const ITensor *in, ITensor *out, const Window &wi { return sve_logits_1d_max(in, out, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve2/impl.cpp b/src/cpu/kernels/softmax/generic/sve2/impl.cpp index 
8f677c62d4075f988e1af24d00486b3b8fa28240..98b2f5117fd8d7a162d0ced13b1752be9c380c15 100644 --- a/src/cpu/kernels/softmax/generic/sve2/impl.cpp +++ b/src/cpu/kernels/softmax/generic/sve2/impl.cpp @@ -23,7 +23,9 @@ */ #include "src/cpu/kernels/softmax/generic/sve2/impl.h" + #include "arm_compute/core/Types.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute @@ -31,8 +33,8 @@ namespace arm_compute namespace cpu { template -void sve2_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window) +void sve2_softmax_logits_1d_quantized( + const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window) { const int start_x = in->info()->valid_region().anchor.x(); const int input_width = in->info()->valid_region().shape.x(); @@ -50,162 +52,173 @@ void sve2_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, voi const int inc_2 = static_cast(2 * svcntw()); const int inc_3 = static_cast(3 * svcntw()); - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast(tmp); + execute_window_loop( + window, + [&](const Coordinates &) + { + /* Get pointers */ + const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast(tmp); - float sum{}; + float sum{}; - /* Compute exponentials and sum */ - { - /* Get max value */ - const auto max_val = *reinterpret_cast(max_it.ptr()); - const auto vec_max = wrapper::svdup_n(max_val); - - /* Init sum to zero */ - auto vec_sum_0 = svdup_n_f32(0.f); - auto vec_sum_1 = svdup_n_f32(0.f); - auto vec_sum_2 = svdup_n_f32(0.f); - auto vec_sum_3 = svdup_n_f32(0.f); - - /* Loop over row and compute exponentials and sum */ - int x = 0; - svbool_t pg = wrapper::svwhilelt(x, input_width); - svbool_t pg_0 = svunpklo(svunpklo(pg)); - svbool_t pg_1 = svunpkhi(svunpklo(pg)); - svbool_t pg_2 = svunpklo(svunpkhi(pg)); - svbool_t pg_3 = svunpkhi(svunpkhi(pg)); - do + /* Compute exponentials and sum */ { - const auto vec_elements = svld1(pg, in_ptr + x); - const auto vec_elements_sub = svreinterpret_u8(svsub_z(pg, vec_max, vec_elements)); + /* Get max value */ + const auto max_val = *reinterpret_cast(max_it.ptr()); + const auto vec_max = wrapper::svdup_n(max_val); + + /* Init sum to zero */ + auto vec_sum_0 = svdup_n_f32(0.f); + auto vec_sum_1 = svdup_n_f32(0.f); + auto vec_sum_2 = svdup_n_f32(0.f); + auto vec_sum_3 = svdup_n_f32(0.f); + + /* Loop over row and compute exponentials and sum */ + int x = 0; + svbool_t pg = wrapper::svwhilelt(x, input_width); + svbool_t pg_0 = svunpklo(svunpklo(pg)); + svbool_t pg_1 = svunpkhi(svunpklo(pg)); + svbool_t pg_2 = svunpklo(svunpkhi(pg)); + svbool_t pg_3 = svunpkhi(svunpkhi(pg)); + do + { + const auto vec_elements = svld1(pg, in_ptr + x); + const auto vec_elements_sub = svreinterpret_u8(svsub_z(pg, vec_max, vec_elements)); + + auto vec_elements_flt_0 = svcvt_f32_z(pg_0, svunpklo(svunpklo(vec_elements_sub))); + auto vec_elements_flt_1 = svcvt_f32_z(pg_1, svunpkhi(svunpklo(vec_elements_sub))); + auto vec_elements_flt_2 = svcvt_f32_z(pg_2, svunpklo(svunpkhi(vec_elements_sub))); + auto vec_elements_flt_3 = svcvt_f32_z(pg_3, svunpkhi(svunpkhi(vec_elements_sub))); - auto vec_elements_flt_0 = 
svcvt_f32_z(pg_0, svunpklo(svunpklo(vec_elements_sub))); - auto vec_elements_flt_1 = svcvt_f32_z(pg_1, svunpkhi(svunpklo(vec_elements_sub))); - auto vec_elements_flt_2 = svcvt_f32_z(pg_2, svunpklo(svunpkhi(vec_elements_sub))); - auto vec_elements_flt_3 = svcvt_f32_z(pg_3, svunpkhi(svunpkhi(vec_elements_sub))); + if (is_log) + { + vec_elements_flt_0 = svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec); + vec_elements_flt_1 = svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec); + vec_elements_flt_2 = svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec); + vec_elements_flt_3 = svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec); + vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, svexp_f32_z(pg_0, vec_elements_flt_0)); + vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, svexp_f32_z(pg_1, vec_elements_flt_1)); + vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, svexp_f32_z(pg_2, vec_elements_flt_2)); + vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, svexp_f32_z(pg_3, vec_elements_flt_3)); + } + else + { + vec_elements_flt_0 = svexp_f32_z(pg_0, svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec)); + vec_elements_flt_1 = svexp_f32_z(pg_1, svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec)); + vec_elements_flt_2 = svexp_f32_z(pg_2, svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec)); + vec_elements_flt_3 = svexp_f32_z(pg_3, svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec)); + vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, vec_elements_flt_0); + vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, vec_elements_flt_1); + vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, vec_elements_flt_2); + vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, vec_elements_flt_3); + } - if(is_log) + svst1_f32(pg_0, tmp_ptr + x, vec_elements_flt_0); + svst1_f32(pg_1, tmp_ptr + x + inc_1, vec_elements_flt_1); + svst1_f32(pg_2, tmp_ptr + x + inc_2, vec_elements_flt_2); + svst1_f32(pg_3, tmp_ptr + x + inc_3, vec_elements_flt_3); + + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, input_width); + pg_0 = svunpklo(svunpklo(pg)); + pg_1 = svunpkhi(svunpklo(pg)); + pg_2 = svunpklo(svunpkhi(pg)); + pg_3 = svunpkhi(svunpkhi(pg)); + } while (svptest_any(all_true_pg, pg)); + + /* Reduce sum */ + const auto vec_sum = svadd_f32_z(all_true_pg, svadd_f32_z(all_true_pg, vec_sum_0, vec_sum_1), + svadd_f32_z(all_true_pg, vec_sum_2, vec_sum_3)); + sum = svaddv_f32(all_true_pg, vec_sum); + + /* Run remaining elements */ + x = 0; + if (is_log) { - vec_elements_flt_0 = svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec); - vec_elements_flt_1 = svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec); - vec_elements_flt_2 = svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec); - vec_elements_flt_3 = svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec); - vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, svexp_f32_z(pg_0, vec_elements_flt_0)); - vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, svexp_f32_z(pg_1, vec_elements_flt_1)); - vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, svexp_f32_z(pg_2, vec_elements_flt_2)); - vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, svexp_f32_z(pg_3, vec_elements_flt_3)); + sum = std::log(sum); } else { - vec_elements_flt_0 = svexp_f32_z(pg_0, svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec)); - vec_elements_flt_1 = svexp_f32_z(pg_1, svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec)); - vec_elements_flt_2 = svexp_f32_z(pg_2, svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec)); - vec_elements_flt_3 = svexp_f32_z(pg_3, svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec)); - vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, vec_elements_flt_0); - vec_sum_1 = svadd_f32_m(pg_1, 
vec_sum_1, vec_elements_flt_1); - vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, vec_elements_flt_2); - vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, vec_elements_flt_3); + sum = 256.f / sum; } - - svst1_f32(pg_0, tmp_ptr + x, vec_elements_flt_0); - svst1_f32(pg_1, tmp_ptr + x + inc_1, vec_elements_flt_1); - svst1_f32(pg_2, tmp_ptr + x + inc_2, vec_elements_flt_2); - svst1_f32(pg_3, tmp_ptr + x + inc_3, vec_elements_flt_3); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, input_width); - pg_0 = svunpklo(svunpklo(pg)); - pg_1 = svunpkhi(svunpklo(pg)); - pg_2 = svunpklo(svunpkhi(pg)); - pg_3 = svunpkhi(svunpkhi(pg)); } - while(svptest_any(all_true_pg, pg)); - /* Reduce sum */ - const auto vec_sum = svadd_f32_z(all_true_pg, svadd_f32_z(all_true_pg, vec_sum_0, vec_sum_1), svadd_f32_z(all_true_pg, vec_sum_2, vec_sum_3)); - sum = svaddv_f32(all_true_pg, vec_sum); - - /* Run remaining elements */ - x = 0; - if(is_log) - { - sum = std::log(sum); - } - else + /* Normalize exponentials */ { - sum = 256.f / sum; - } - } - - /* Normalize exponentials */ - { - constexpr bool is_qasymm8_signed = std::is_same::value; - /* Loop over row and compute softmax */ - int x = 0; - svbool_t pg = wrapper::svwhilelt(x, input_width); - svbool_t pg_0 = svunpklo(svunpklo(pg)); - svbool_t pg_1 = svunpkhi(svunpklo(pg)); - svbool_t pg_2 = svunpklo(svunpkhi(pg)); - svbool_t pg_3 = svunpkhi(svunpkhi(pg)); - do - { - auto vec_in_0 = svld1_f32(pg_0, tmp_ptr + x); - auto vec_in_1 = svld1_f32(pg_1, tmp_ptr + x + inc_1); - auto vec_in_2 = svld1_f32(pg_2, tmp_ptr + x + inc_2); - auto vec_in_3 = svld1_f32(pg_3, tmp_ptr + x + inc_3); - - svfloat32_t res_0{}; - svfloat32_t res_1{}; - svfloat32_t res_2{}; - svfloat32_t res_3{}; - - if(is_log) + constexpr bool is_qasymm8_signed = std::is_same::value; + /* Loop over row and compute softmax */ + int x = 0; + svbool_t pg = wrapper::svwhilelt(x, input_width); + svbool_t pg_0 = svunpklo(svunpklo(pg)); + svbool_t pg_1 = svunpkhi(svunpklo(pg)); + svbool_t pg_2 = svunpklo(svunpkhi(pg)); + svbool_t pg_3 = svunpkhi(svunpkhi(pg)); + do { - res_0 = svsub_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); - res_1 = svsub_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); - res_2 = svsub_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); - res_3 = svsub_f32_z(pg_3, vec_in_3, svdup_n_f32(sum)); - } - else - { - res_0 = svmul_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); - res_1 = svmul_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); - res_2 = svmul_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); - res_3 = svmul_f32_z(pg_3, vec_in_3, svdup_n_f32(sum)); + auto vec_in_0 = svld1_f32(pg_0, tmp_ptr + x); + auto vec_in_1 = svld1_f32(pg_1, tmp_ptr + x + inc_1); + auto vec_in_2 = svld1_f32(pg_2, tmp_ptr + x + inc_2); + auto vec_in_3 = svld1_f32(pg_3, tmp_ptr + x + inc_3); + + svfloat32_t res_0{}; + svfloat32_t res_1{}; + svfloat32_t res_2{}; + svfloat32_t res_3{}; - if(is_qasymm8_signed) + if (is_log) { - const auto offset_vec = svdup_n_f32(128.f); - res_0 = svsub_z(pg_0, res_0, offset_vec); - res_1 = svsub_z(pg_1, res_1, offset_vec); - res_2 = svsub_z(pg_2, res_2, offset_vec); - res_3 = svsub_z(pg_3, res_3, offset_vec); + res_0 = svsub_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); + res_1 = svsub_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); + res_2 = svsub_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); + res_3 = svsub_f32_z(pg_3, vec_in_3, svdup_n_f32(sum)); + } + else + { + res_0 = svmul_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); + res_1 = svmul_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); + res_2 = svmul_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); + res_3 = svmul_f32_z(pg_3, 
vec_in_3, svdup_n_f32(sum)); + + if (is_qasymm8_signed) + { + const auto offset_vec = svdup_n_f32(128.f); + res_0 = svsub_z(pg_0, res_0, offset_vec); + res_1 = svsub_z(pg_1, res_1, offset_vec); + res_2 = svsub_z(pg_2, res_2, offset_vec); + res_3 = svsub_z(pg_3, res_3, offset_vec); + } } - } - // Store value - const auto out = convert_float_to_int(res_0, res_1, res_2, res_3); - svst1(pg, out_ptr + x, out); - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, input_width); - pg_0 = svunpklo(svunpklo(pg)); - pg_1 = svunpkhi(svunpklo(pg)); - pg_2 = svunpklo(svunpkhi(pg)); - pg_3 = svunpkhi(svunpkhi(pg)); + // Store value + const auto out = convert_float_to_int(res_0, res_1, res_2, res_3); + svst1(pg, out_ptr + x, out); + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, input_width); + pg_0 = svunpklo(svunpklo(pg)); + pg_1 = svunpkhi(svunpklo(pg)); + pg_2 = svunpklo(svunpkhi(pg)); + pg_3 = svunpkhi(svunpkhi(pg)); + } while (svptest_any(all_true_pg, pg)); } - while(svptest_any(all_true_pg, pg)); - } - }, - in_it, max_it, out_it); + }, + in_it, max_it, out_it); } -template void sve2_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window); -template void sve2_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window); +template void sve2_softmax_logits_1d_quantized(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + float beta, + bool is_log, + const Window &window); +template void sve2_softmax_logits_1d_quantized(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + float beta, + bool is_log, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve2/impl.h b/src/cpu/kernels/softmax/generic/sve2/impl.h index abbcc151815bc022fc67f7722c44e0e1bc21b32f..33fcc26cda1c3b3f1bfd17de591bf3ba3fb7cba7 100644 --- a/src/cpu/kernels/softmax/generic/sve2/impl.h +++ b/src/cpu/kernels/softmax/generic/sve2/impl.h @@ -31,8 +31,13 @@ namespace arm_compute namespace cpu { template -void sve2_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window); +void sve2_softmax_logits_1d_quantized(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + float beta, + bool is_log, + const Window &window); } // namespace cpu } // namespace arm_compute #endif /* SRC_CORE_SVE2_KERNELS_SOFTMAX_IMPL_H */ diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp index 810035eb9cf1730b63ecd4ff1a7642d578854a87..95623786b33a9e133406c2e2c6affee6af19e173 100644 --- a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp +++ b/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp @@ -23,16 +23,22 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/sve2/impl.h" namespace arm_compute { namespace cpu { -void sve2_qasymm8_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void sve2_qasymm8_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return sve2_softmax_logits_1d_quantized(in, max, tmp, out, beta, is_log, window); } -} +} // namespace cpu } // namespace arm_compute diff --git 
a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp index 283b55e9cead991bb24bc1640d90ffc3a1f58579..c20462fcef98c90f7a3174abe65c84c0c0ef4737 100644 --- a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp +++ b/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp @@ -23,16 +23,22 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/sve2/impl.h" namespace arm_compute { namespace cpu { -void sve2_qasymm8_signed_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void sve2_qasymm8_signed_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return sve2_softmax_logits_1d_quantized(in, max, tmp, out, beta, is_log, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/list.h b/src/cpu/kernels/softmax/list.h index ed3515f417d13d6dfc5d6405519d9154f9618193..627ce0c2641eb813d84b6c57c01812233e1d1dc4 100644 --- a/src/cpu/kernels/softmax/list.h +++ b/src/cpu/kernels/softmax/list.h @@ -28,9 +28,9 @@ namespace arm_compute { namespace cpu { -#define DECLARE_SOFTMAX_KERNEL(func_name) \ - void func_name(const ITensor *in, const ITensor *max, void *const tmp, \ - ITensor *out, const float beta, bool is_log, const Window &window) +#define DECLARE_SOFTMAX_KERNEL(func_name) \ + void func_name(const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, \ + bool is_log, const Window &window) DECLARE_SOFTMAX_KERNEL(neon_fp32_softmax); DECLARE_SOFTMAX_KERNEL(neon_fp16_softmax); @@ -43,8 +43,7 @@ DECLARE_SOFTMAX_KERNEL(sve2_qasymm8_softmax); #undef DECLARE_SOFTMAX_KERNEL -#define DECLARE_LOGITS_KERNEL(func_name) \ - void func_name(const ITensor *in, ITensor *out, const Window &window) +#define DECLARE_LOGITS_KERNEL(func_name) void func_name(const ITensor *in, ITensor *out, const Window &window) DECLARE_LOGITS_KERNEL(neon_fp32_logits); DECLARE_LOGITS_KERNEL(neon_fp16_logits); diff --git a/src/cpu/kernels/sub/neon/fp16.cpp b/src/cpu/kernels/sub/neon/fp16.cpp new file mode 100644 index 0000000000000000000000000000000000000000..023068817bd1b16a006b5f0919b2d6c2565ece83 --- /dev/null +++ b/src/cpu/kernels/sub/neon/fp16.cpp @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" + +#include "src/cpu/kernels/sub/neon/impl.h" + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +namespace arm_compute +{ +namespace cpu +{ +void sub_same_neon_fp16( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + sub_same_neon(src0, src1, dst, policy, window); +} +} // namespace cpu +} // namespace arm_compute + +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/sub/neon/impl.h b/src/cpu/kernels/sub/neon/impl.h new file mode 100644 index 0000000000000000000000000000000000000000..6123f7e25abb67bc1138d127d45be415f7452e6c --- /dev/null +++ b/src/cpu/kernels/sub/neon/impl.h @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ACL_SRC_CPU_KERNELS_SUB_NEON_IMPL_H +#define ACL_SRC_CPU_KERNELS_SUB_NEON_IMPL_H + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "src/core/NEON/wrapper/scalar/sub.h" + +namespace arm_compute +{ +namespace cpu +{ +template +void sub_same_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + /** SIMD vector tag type. 
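// Sketch of the per-type dispatch pattern behind sub_same_neon_fp16 above and the other
// thin entry points in this patch: a shared template does the work and each translation
// unit pins one element type, so only the required instantiations get compiled (the fp16
// one additionally sits behind the __ARM_FEATURE_FP16_VECTOR_ARITHMETIC /
// ENABLE_FP16_KERNELS guards shown above). The names below are hypothetical and the body
// is deliberately scalar -- illustrative only, not the patch's code.
template <typename T>
void example_sub_kernel(const T *a, const T *b, T *dst, int len)
{
    for (int i = 0; i < len; ++i)
    {
        dst[i] = a[i] - b[i];
    }
}

void example_sub_kernel_fp32(const float *a, const float *b, float *dst, int len)
{
    // Explicitly naming the element type selects the instantiation for this entry point.
    example_sub_kernel<float>(a, b, dst, len);
}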
*/ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; + + bool is_sat = policy == ConvertPolicy::SATURATE; + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + constexpr int window_step_x = 16 / sizeof(T); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); + + Iterator input1(src0, window.broadcast_if_dimension_le_one(src0->info()->tensor_shape())); + Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape())); + Iterator output(dst, window); + + if (is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + const T broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + auto res = is_sat ? wrapper::vqsub(broadcast_value_vec, non_broadcast_v) + : wrapper::vsub(broadcast_value_vec, non_broadcast_v); + if (is_broadcast_input_2) + { + res = wrapper::vmul(res, wrapper::vdup_n(static_cast(-1), ExactTagType{})); + } + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + auto res = + is_sat ? 
wrapper::sub_sat(broadcast_value, non_broadcast_v) : broadcast_value - non_broadcast_v; + if (is_broadcast_input_2) + { + res = static_cast(-1) * res; + } + + *(output_ptr + x) = res; + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto val1 = wrapper::vloadq(input1_ptr + x); + const auto val2 = wrapper::vloadq(input2_ptr + x); + const auto res = is_sat ? wrapper::vqsub(val1, val2) : wrapper::vsub(val1, val2); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto val1 = *(input1_ptr + x); + const auto val2 = *(input2_ptr + x); + *(output_ptr + x) = is_sat ? wrapper::sub_sat(val1, val2) : val1 - val2; + } + }, + input1, input2, output); + } +} +} // namespace cpu +} // namespace arm_compute + +#endif // ACL_SRC_CPU_KERNELS_SUB_NEON_IMPL_H diff --git a/src/cpu/kernels/sub/neon/list.h b/src/cpu/kernels/sub/neon/list.h index f7e1a040bdc95db5d45eca702f031096242b9b86..f29571f122eab8fa688adc8c911fc3e7e2025dbb 100644 --- a/src/cpu/kernels/sub/neon/list.h +++ b/src/cpu/kernels/sub/neon/list.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,143 +21,30 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef SRC_CORE_NEON_KERNELS_SUB_LIST_H -#define SRC_CORE_NEON_KERNELS_SUB_LIST_H +#ifndef ACL_SRC_CPU_KERNELS_SUB_NEON_LIST_H +#define ACL_SRC_CPU_KERNELS_SUB_NEON_LIST_H #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { namespace cpu { -#define DECLARE_SUB_KERNEL(func_name) \ - void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +#define DECLARE_SUB_KERNEL(func_name) \ + void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, \ + const Window &window) DECLARE_SUB_KERNEL(sub_qasymm8_neon_fixedpoint); DECLARE_SUB_KERNEL(sub_qasymm8_signed_neon_fixedpoint); DECLARE_SUB_KERNEL(sub_qasymm8_neon); DECLARE_SUB_KERNEL(sub_qasymm8_signed_neon); DECLARE_SUB_KERNEL(sub_qsymm16_neon); +DECLARE_SUB_KERNEL(sub_same_neon_fp16); #undef DECLARE_SUB_KERNEL - -template -void sub_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) -{ - /** SIMD vector tag type. 
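// Illustrative scalar sketch of the broadcast branch of sub_same_neon above -- not the
// library code. When one operand is broadcast along x, the loop always computes
// broadcast_value - other_value and then flips the sign if the broadcast operand was
// actually the second input, so the stored result is always src0 - src1. Wrapping
// (non-saturating) float arithmetic and a hypothetical function name are assumed; the
// kernel additionally offers a saturating path via wrapper::vqsub / sub_sat.
static void sub_broadcast_reference(
    const float *non_broadcast, float broadcast_value, float *dst, int len, bool broadcast_is_input2)
{
    for (int x = 0; x < len; ++x)
    {
        float res = broadcast_value - non_broadcast[x];
        if (broadcast_is_input2)
        {
            res = -res; // equivalent to the kernel's multiply by -1
        }
        dst[x] = res;
    }
}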
*/ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - - bool is_sat = policy == ConvertPolicy::SATURATE; - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - constexpr int window_step_x = 16 / sizeof(T); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); - - Iterator input1(src0, window.broadcast_if_dimension_le_one(src0->info()->tensor_shape())); - Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape())); - Iterator output(dst, window); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(dst, win); - - execute_window_loop( - win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const T broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); - auto res = is_sat ? wrapper::vqsub(broadcast_value_vec, non_broadcast_v) : wrapper::vsub(broadcast_value_vec, non_broadcast_v); - if(is_broadcast_input_2) - { - res = wrapper::vmul(res, wrapper::vdup_n(static_cast(-1), ExactTagType{})); - } - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto non_broadcast_v = *(non_broadcast_input_ptr + x); - auto res = is_sat ? 
wrapper::sub_sat(broadcast_value, non_broadcast_v) : broadcast_value - non_broadcast_v; - if(is_broadcast_input_2) - { - res = static_cast(-1) * res; - } - - *(output_ptr + x) = res; - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src0, input1_win); - Iterator input2(src1, input2_win); - Iterator output(dst, win); - - execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto val1 = wrapper::vloadq(input1_ptr + x); - const auto val2 = wrapper::vloadq(input2_ptr + x); - const auto res = is_sat ? wrapper::vqsub(val1, val2) : wrapper::vsub(val1, val2); - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto val1 = *(input1_ptr + x); - const auto val2 = *(input2_ptr + x); - *(output_ptr + x) = is_sat ? wrapper::sub_sat(val1, val2) : val1 - val2; - } - }, - input1, input2, output); - } -} } // namespace cpu } // namespace arm_compute -#endif // SRC_CORE_NEON_KERNELS_SUB_LIST_H +#endif // ACL_SRC_CPU_KERNELS_SUB_NEON_LIST_H diff --git a/src/cpu/kernels/sub/neon/qasymm8.cpp b/src/cpu/kernels/sub/neon/qasymm8.cpp index ea6e5826ddba8b2c57dabf61c73c100712358ace..b750afce6e666a7f86d2f118a518338cb8a0ca2a 100644 --- a/src/cpu/kernels/sub/neon/qasymm8.cpp +++ b/src/cpu/kernels/sub/neon/qasymm8.cpp @@ -23,21 +23,24 @@ */ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" + #include "src/cpu/kernels/add/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void sub_qasymm8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void sub_qasymm8_neon_fixedpoint( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { add_sub_q8_neon_fixedpoint(src0, src1, dst, policy, window, false /*is_addition*/); } -void sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void sub_qasymm8_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { add_sub_qasymm8_neon(src0, src1, dst, policy, window, false /*is_addition*/); } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/sub/neon/qasymm8_signed.cpp b/src/cpu/kernels/sub/neon/qasymm8_signed.cpp index a86c7f22f671312be3f28a54a661506b6b79b647..fb0bb626822442e14b54198f67aadc8abed1af8a 100644 --- a/src/cpu/kernels/sub/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/sub/neon/qasymm8_signed.cpp @@ -24,21 +24,24 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" + #include "src/cpu/kernels/add/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void sub_qasymm8_signed_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void sub_qasymm8_signed_neon_fixedpoint( + const ITensor *src0, const ITensor *src1, 
ITensor *dst, const ConvertPolicy &policy, const Window &window) { add_sub_q8_neon_fixedpoint(src0, src1, dst, policy, window, false /*is_addition*/); } -void sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void sub_qasymm8_signed_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { add_sub_qasymm8_signed_neon(src0, src1, dst, policy, window, false /*is_addition*/); } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/sub/neon/qsymm16.cpp b/src/cpu/kernels/sub/neon/qsymm16.cpp index 4dfdc0e78c5a75003e653dac258569c8e5e8ba96..23e4b03843e924c8a934ddee6cf02e34ffeab9f3 100644 --- a/src/cpu/kernels/sub/neon/qsymm16.cpp +++ b/src/cpu/kernels/sub/neon/qsymm16.cpp @@ -25,14 +25,16 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" namespace arm_compute { namespace cpu { -void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void sub_qsymm16_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { ARM_COMPUTE_UNUSED(policy); @@ -57,7 +59,7 @@ void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -65,7 +67,7 @@ void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src1 : src0; const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); // Clear X Dimension on execution window as we handle manually non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); @@ -74,61 +76,62 @@ void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - const int16_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value); + const int16_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value); - const float32x4x2_t bf = - { - { - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2), - } - }; - const float bfs = static_cast(broadcast_value) * broadcast_qinfo.scale; + const float32x4x2_t bf = {{ + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2), + }}; + const float bfs = static_cast(broadcast_value) * broadcast_qinfo.scale; - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x); - const float32x4x2_t af = + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x); + const float32x4x2_t af = {{ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1), vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1), - } - }; + }}; - const int32x4x4_t rf = - { - { + const int32x4x4_t rf = {{ #ifdef __aarch64__ - vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), + vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) + : vsubq_f32(af.val[0], bf.val[0]), + invvscaleo)), + vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) + : vsubq_f32(af.val[1], bf.val[1]), + invvscaleo)), #else //__aarch64__ - vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), + vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? 
vsubq_f32(bf.val[0], af.val[0]) + : vsubq_f32(af.val[0], bf.val[0]), + invvscaleo)), + vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) + : vsubq_f32(af.val[1], bf.val[1]), + invvscaleo)), #endif //__aarch64__ - } - }; + }}; - const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])); - vst1q_s16(output_ptr + x, pa); - } + const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])); + vst1q_s16(output_ptr + x, pa); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale; - *(output_ptr + x) = quantize_qsymm16(is_broadcast_input_2 ? (bfs - afs) : (afs - bfs), oq_info); - } - }, - broadcast_input, non_broadcast_input, output); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const float afs = static_cast(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale; + *(output_ptr + x) = quantize_qsymm16(is_broadcast_input_2 ? (bfs - afs) : (afs - bfs), oq_info); + } + }, + broadcast_input, non_broadcast_input, output); } else { @@ -140,38 +143,32 @@ void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co Iterator input2(src1, input2_win); Iterator output(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int16x8_t a = vld1q_s16(input1_ptr + x); - const int16x8_t b = vld1q_s16(input2_ptr + x); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - const float32x4x2_t af = + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const int16x8_t a = vld1q_s16(input1_ptr + x); + const int16x8_t b = vld1q_s16(input2_ptr + x); + + const float32x4x2_t af = {{ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1), vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1), - } - }; + }}; - const float32x4x2_t bf = - { - { + const float32x4x2_t bf = {{ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2), vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2), - } - }; + }}; - const int32x4x2_t rf = - { - { + const int32x4x2_t rf = {{ #ifdef __aarch64__ vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), @@ -179,23 +176,22 @@ void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), #endif //__aarch64__ - } - }; + }}; - const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])); - vst1q_s16(output_ptr + x, pa); - } + const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])); + vst1q_s16(output_ptr + x, pa); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast((*(input1_ptr + x))) * iq1_info.scale; - const float bfs = 
static_cast((*(input2_ptr + x))) * iq2_info.scale; - *(output_ptr + x) = quantize_qsymm16((afs - bfs), dst->info()->quantization_info()); - } - }, - input1, input2, output); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const float afs = static_cast((*(input1_ptr + x))) * iq1_info.scale; + const float bfs = static_cast((*(input2_ptr + x))) * iq2_info.scale; + *(output_ptr + x) = quantize_qsymm16((afs - bfs), dst->info()->quantization_info()); + } + }, + input1, input2, output); } } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/operators/CpuActivation.cpp b/src/cpu/operators/CpuActivation.cpp index 197e9850b9c80f8821157beada0b8635b636609c..44d70cf503243da5c5158a70695a757b2d2b3b2c 100644 --- a/src/cpu/operators/CpuActivation.cpp +++ b/src/cpu/operators/CpuActivation.cpp @@ -24,6 +24,7 @@ #include "src/cpu/operators/CpuActivation.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/IOperator.h" #include "src/common/utils/LegacySupport.h" #include "src/common/utils/Log.h" @@ -42,7 +43,8 @@ void CpuActivation::configure(const ITensorInfo *input, ITensorInfo *output, con _kernel = std::move(k); } -Status CpuActivation::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info) +Status +CpuActivation::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info) { return kernels::CpuActivationKernel::validate(input, output, activation_info); } @@ -54,13 +56,17 @@ void CpuActivation::run(ITensorPack &tensors) NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors); } -std::tuple CpuContext::create_activation(const AclTensorDescriptor &src, const AclTensorDescriptor &dst, const AclActivationDescriptor &act, bool is_validate) +std::tuple CpuContext::create_activation(const AclTensorDescriptor &src, + const AclTensorDescriptor &dst, + const AclActivationDescriptor &act, + bool is_validate) { TensorInfo src_info = detail::convert_to_legacy_tensor_info(src); TensorInfo dst_info = detail::convert_to_legacy_tensor_info(dst); auto info = detail::convert_to_activation_info(act); - if(is_validate && !bool(CpuActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info))) + if (is_validate && + !bool(CpuActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info))) { return std::make_tuple(nullptr, StatusCode::UnsupportedConfig); } @@ -69,7 +75,7 @@ std::tuple CpuContext::create_activation(const AclTenso act_op->configure(&src_info, &dst_info, info); auto op = new arm_compute::IOperator(static_cast(this)); - if(op == nullptr) + if (op == nullptr) { ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources"); return std::make_tuple(nullptr, StatusCode::OutOfMemory); diff --git a/src/cpu/operators/CpuActivation.h b/src/cpu/operators/CpuActivation.h index e21fc7d32c16e6862e14e11548479cf15cd596c3..ec442f92c8e0daa662da60b5f731234ef7e7f103 100644 --- a/src/cpu/operators/CpuActivation.h +++ b/src/cpu/operators/CpuActivation.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_ACTIVATION_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/cpu/ICpuOperator.h" namespace arm_compute diff --git a/src/cpu/operators/CpuAdd.cpp b/src/cpu/operators/CpuAdd.cpp index 41def8e22f3f3c4d18e4e350c3ddf7f0aeab801d..53cd7fa1b7520027d389b431fb1a8cef6028499b 100644 --- 
a/src/cpu/operators/CpuAdd.cpp +++ b/src/cpu/operators/CpuAdd.cpp @@ -23,17 +23,20 @@ */ #include "src/cpu/operators/CpuAdd.h" -#include "src/cpu/kernels/CpuAddKernel.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" #include "src/common/utils/Log.h" - -#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/cpu/kernels/CpuAddKernel.h" namespace arm_compute { namespace cpu { -void CpuAdd::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info) +void CpuAdd::configure(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); ARM_COMPUTE_LOG_PARAMS(src0, src1, dst, policy, act_info); @@ -42,7 +45,11 @@ void CpuAdd::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensor _kernel = std::move(k); } -Status CpuAdd::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status CpuAdd::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return kernels::CpuAddKernel::validate(src0, src1, dst, policy); diff --git a/src/cpu/operators/CpuAdd.h b/src/cpu/operators/CpuAdd.h index db05c100cccd03a63c0123e006be367ba588e6c5..5f60102de25320fd9f713efbd58e9dfcdc608b11 100644 --- a/src/cpu/operators/CpuAdd.h +++ b/src/cpu/operators/CpuAdd.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_ADD_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/cpu/ICpuOperator.h" namespace arm_compute @@ -55,14 +56,22 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. * */ - void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuAdd::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run(ITensorPack &tensors) override; diff --git a/src/cpu/operators/CpuAddMulAdd.cpp b/src/cpu/operators/CpuAddMulAdd.cpp index 3fd690e3f94298f91061c6f25c2fe09e388e96ec..2f19f2f842223f19e112addd5ec90a816f500e8b 100644 --- a/src/cpu/operators/CpuAddMulAdd.cpp +++ b/src/cpu/operators/CpuAddMulAdd.cpp @@ -21,39 +21,49 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
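Note: the leftover-element path of sub_qsymm16_neon in the hunk above amounts to: dequantise each operand with its input scale, subtract in float, then requantise with the output scale and saturate to the int16 range. A standalone sketch of that arithmetic, using hypothetical scale values and plain C++ instead of the ACL types:

// Standalone sketch of the scalar tail of sub_qsymm16_neon: dequantise,
// subtract in float, requantise with the output scale, saturate to int16.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

int16_t sub_qsymm16_scalar(int16_t a, int16_t b, float scale_a, float scale_b, float scale_out)
{
    const float afs     = static_cast<float>(a) * scale_a;   // dequantise lhs
    const float bfs     = static_cast<float>(b) * scale_b;   // dequantise rhs
    const float res     = (afs - bfs) / scale_out;            // requantise the difference
    const float rounded = std::round(res);                    // round to nearest
    const float clamped = std::min(32767.0f, std::max(-32768.0f, rounded)); // saturate
    return static_cast<int16_t>(clamped);
}

int main()
{
    // Illustrative scales: both inputs quantised at 0.01, output at 0.02.
    std::cout << sub_qsymm16_scalar(1000, 250, 0.01f, 0.01f, 0.02f) << '\n'; // prints 375
    return 0;
}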
*/ +#include "src/cpu/operators/CpuAddMulAdd.h" + #include "arm_compute/core/experimental/Types.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "src/common/utils/Log.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/kernels/CpuAddMulAddKernel.h" -#include "src/cpu/operators/CpuAddMulAdd.h" #include "src/cpu/utils/CpuAuxTensorHandler.h" namespace arm_compute { namespace cpu { -void CpuAddMulAdd::configure(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const ITensorInfo *bn_add, - ITensorInfo *add_output, ITensorInfo *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +void CpuAddMulAdd::configure(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + ITensorInfo *add_output, + ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info); auto k = std::make_unique(); const DataType data_type = input1->data_type(); - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { _dequantize_bn_mul.configure(bn_mul, &_dequantized_bn_mul); _dequantize_bn_add.configure(bn_add, &_dequantized_bn_add); - k->configure(input1, input2, &_dequantized_bn_mul, &_dequantized_bn_add, add_output, final_output, policy, act_info); + k->configure(input1, input2, &_dequantized_bn_mul, &_dequantized_bn_add, add_output, final_output, policy, + act_info); // Save auxilary memory requirements after configuration - _aux_mem[DequantizedBnMul] = experimental::MemoryInfo(offset_int_vec(DequantizedBnMul), experimental::MemoryLifetime::Temporary, _dequantized_bn_mul.total_size()); - _aux_mem[DequantizedBnAdd] = experimental::MemoryInfo(offset_int_vec(DequantizedBnAdd), experimental::MemoryLifetime::Temporary, _dequantized_bn_add.total_size()); + _aux_mem[DequantizedBnMul] = + experimental::MemoryInfo(offset_int_vec(DequantizedBnMul), experimental::MemoryLifetime::Temporary, + _dequantized_bn_mul.total_size()); + _aux_mem[DequantizedBnAdd] = + experimental::MemoryInfo(offset_int_vec(DequantizedBnAdd), experimental::MemoryLifetime::Temporary, + _dequantized_bn_add.total_size()); } else { @@ -63,63 +73,64 @@ void CpuAddMulAdd::configure(const ITensorInfo *input1, const ITensorInfo *input _kernel = std::move(k); } -Status CpuAddMulAdd::validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const ITensorInfo *bn_add, - const ITensorInfo *add_output, const ITensorInfo *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status CpuAddMulAdd::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { const DataType data_type = input1->data_type(); - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { - TensorInfo dequantized_bn_mul; - TensorInfo dequantized_bn_add; + TensorInfo dequantized_bn_mul = bn_mul->clone()->set_data_type(DataType::F32); + TensorInfo dequantized_bn_add = bn_add->clone()->set_data_type(DataType::F32); ARM_COMPUTE_RETURN_ON_ERROR(CpuDequantize::validate(bn_mul, &dequantized_bn_mul)); ARM_COMPUTE_RETURN_ON_ERROR(CpuDequantize::validate(bn_add, &dequantized_bn_add)); - return kernels::CpuAddMulAddKernel::validate(input1, input2, 
&dequantized_bn_mul, &dequantized_bn_add, add_output, final_output, policy, act_info); + return kernels::CpuAddMulAddKernel::validate(input1, input2, &dequantized_bn_mul, &dequantized_bn_add, + add_output, final_output, policy, act_info); } else { - return kernels::CpuAddMulAddKernel::validate(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info); + return kernels::CpuAddMulAddKernel::validate(input1, input2, bn_mul, bn_add, add_output, final_output, policy, + act_info); } } void CpuAddMulAdd::run(ITensorPack &tensors) { - const DataType data_type = tensors.get_const_tensor(TensorType::ACL_SRC_0)->info()->data_type(); + const DataType data_type = tensors.get_const_tensor(TensorType::ACL_SRC_0)->info()->data_type(); - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { const ITensor *bn_mul = tensors.get_const_tensor(TensorType::ACL_SRC_2); const ITensor *bn_add = tensors.get_const_tensor(TensorType::ACL_SRC_3); - CpuAuxTensorHandler dequantized_bn_mul_handler(offset_int_vec(DequantizedBnMul), _dequantized_bn_mul, tensors, true); - CpuAuxTensorHandler dequantized_bn_add_handler(offset_int_vec(DequantizedBnAdd), _dequantized_bn_add, tensors, true); + CpuAuxTensorHandler dequantized_bn_mul_handler(offset_int_vec(DequantizedBnMul), _dequantized_bn_mul, tensors, + true); + CpuAuxTensorHandler dequantized_bn_add_handler(offset_int_vec(DequantizedBnAdd), _dequantized_bn_add, tensors, + true); - ITensorPack dequantize_mul_pack = - { - { TensorType::ACL_SRC_0, bn_mul }, - { TensorType::ACL_DST_0, dequantized_bn_mul_handler.get() } - }; + ITensorPack dequantize_mul_pack = {{TensorType::ACL_SRC_0, bn_mul}, + {TensorType::ACL_DST_0, dequantized_bn_mul_handler.get()}}; - ITensorPack dequantize_add_pack = - { - { TensorType::ACL_SRC_0, bn_add }, - { TensorType::ACL_DST_0, dequantized_bn_add_handler.get() } - }; + ITensorPack dequantize_add_pack = {{TensorType::ACL_SRC_0, bn_add}, + {TensorType::ACL_DST_0, dequantized_bn_add_handler.get()}}; _dequantize_bn_mul.run(dequantize_mul_pack); _dequantize_bn_add.run(dequantize_add_pack); - ITensorPack add_mul_add_pack = - { - { TensorType::ACL_SRC_0, tensors.get_const_tensor(TensorType::ACL_SRC_0) }, - { TensorType::ACL_SRC_1, tensors.get_const_tensor(TensorType::ACL_SRC_1) }, - { TensorType::ACL_SRC_2, dequantized_bn_mul_handler.get() }, - { TensorType::ACL_SRC_3, dequantized_bn_add_handler.get() }, - { TensorType::ACL_DST_0, tensors.get_tensor(TensorType::ACL_DST_0) }, - { TensorType::ACL_DST_1, tensors.get_tensor(TensorType::ACL_DST_1) }, + ITensorPack add_mul_add_pack = { + {TensorType::ACL_SRC_0, tensors.get_const_tensor(TensorType::ACL_SRC_0)}, + {TensorType::ACL_SRC_1, tensors.get_const_tensor(TensorType::ACL_SRC_1)}, + {TensorType::ACL_SRC_2, dequantized_bn_mul_handler.get()}, + {TensorType::ACL_SRC_3, dequantized_bn_add_handler.get()}, + {TensorType::ACL_DST_0, tensors.get_tensor(TensorType::ACL_DST_0)}, + {TensorType::ACL_DST_1, tensors.get_tensor(TensorType::ACL_DST_1)}, }; NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), add_mul_add_pack); diff --git a/src/cpu/operators/CpuAddMulAdd.h b/src/cpu/operators/CpuAddMulAdd.h index cf1ece68f1e1c39550decc6e563daa5b40971359..47db75c37efafbf4c6fad2cb1cf1382afda54316 100644 --- a/src/cpu/operators/CpuAddMulAdd.h +++ b/src/cpu/operators/CpuAddMulAdd.h @@ -42,20 +42,28 @@ public: * Similar to @ref NEAddMulAdd::configure() * */ - void configure(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const 
ITensorInfo *bn_add, - ITensorInfo *add_output, ITensorInfo *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info); + void configure(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + ITensorInfo *add_output, + ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuAddMulAdd::configure() * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const ITensorInfo *bn_add, - const ITensorInfo *add_output, const ITensorInfo *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info); // Inherited methods overridden: void run(ITensorPack &tensors) override; @@ -77,7 +85,7 @@ private: TensorInfo _dequantized_bn_mul{}; TensorInfo _dequantized_bn_add{}; - experimental::MemoryRequirements _aux_mem{ Count }; + experimental::MemoryRequirements _aux_mem{Count}; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuCast.cpp b/src/cpu/operators/CpuCast.cpp index 1cfd8c1d0ecc1fe0dfb7331242833fd3c3881f75..55b9204d716b2623e3e68ef424a520950620894d 100644 --- a/src/cpu/operators/CpuCast.cpp +++ b/src/cpu/operators/CpuCast.cpp @@ -23,9 +23,8 @@ */ #include "src/cpu/operators/CpuCast.h" -#include "src/cpu/kernels/CpuCastKernel.h" - #include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuCastKernel.h" namespace arm_compute { diff --git a/src/cpu/operators/CpuCast.h b/src/cpu/operators/CpuCast.h index 356b033dbd37805ff7e885c57b71a19dc16d6141..1f4da6e2a0bf17679465a9211762be3652b3d225 100644 --- a/src/cpu/operators/CpuCast.h +++ b/src/cpu/operators/CpuCast.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CPU_CAST_H -#define ARM_COMPUTE_CPU_CAST_H +#ifndef ACL_SRC_CPU_OPERATORS_CPUCAST_H +#define ACL_SRC_CPU_OPERATORS_CPUCAST_H #include "src/cpu/ICpuOperator.h" @@ -51,14 +51,13 @@ public: * |S16 | QASYMM8_SIGNED, U8, S32 | * |F16 | QASYMM8_SIGNED, QASYMM8, F32, S32, U8 | * |S32 | QASYMM8_SIGNED, QASYMM8, F16, F32, U8 | - * |F32 | QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8| + * |F32 | QASYMM8_SIGNED, QASYMM8, F16, S32, U8| * |S64 | F32 | * * @param[in] src The source tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/S64/F16/F32. * @param[out] dst The destination tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. * @param[in] policy Conversion policy. 
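Note: the ConvertPolicy parameter documented here follows the usual ACL meaning: SATURATE clamps out-of-range results to the destination type's limits, while WRAP keeps only the low-order bits. A standalone illustration of the two behaviours for a narrowing float to uint8_t conversion (plain C++, not the ACL cast kernel):

// Illustration of ConvertPolicy semantics for a narrowing cast.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

enum class ConvertPolicy { WRAP, SATURATE };

uint8_t cast_f32_to_u8(float v, ConvertPolicy policy)
{
    const long long i = std::llround(v); // round to the nearest integer first
    if (policy == ConvertPolicy::SATURATE)
    {
        return static_cast<uint8_t>(std::clamp<long long>(i, 0, 255)); // clamp into [0, 255]
    }
    return static_cast<uint8_t>(i); // WRAP: keep the low 8 bits (value modulo 256)
}

int main()
{
    std::cout << int(cast_f32_to_u8(300.0f, ConvertPolicy::SATURATE)) << '\n'; // 255
    std::cout << int(cast_f32_to_u8(300.0f, ConvertPolicy::WRAP)) << '\n';     // 44
    return 0;
}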
* - * @deprecated Support for BFLOAT16 will be removed in 23.05 release * */ void configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); @@ -72,4 +71,4 @@ public: }; } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_ACTIVATION_H */ +#endif // ACL_SRC_CPU_OPERATORS_CPUCAST_H diff --git a/src/cpu/operators/CpuConcatenate.cpp b/src/cpu/operators/CpuConcatenate.cpp index 4021fd8dedf3628191b5961f4648fe7161121fa2..5f517a8fcbe0070b7ee08a21d4ecc36e92041c41 100644 --- a/src/cpu/operators/CpuConcatenate.cpp +++ b/src/cpu/operators/CpuConcatenate.cpp @@ -23,21 +23,20 @@ */ #include "src/cpu/operators/CpuConcatenate.h" -#include "src/cpu/kernels/CpuConcatenateBatchKernel.h" -#include "src/cpu/kernels/CpuConcatenateDepthKernel.h" -#include "src/cpu/kernels/CpuConcatenateHeightKernel.h" -#include "src/cpu/kernels/CpuConcatenateWidthKernel.h" - -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - #include "arm_compute/core/Error.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" +#include "src/cpu/kernels/CpuConcatenateBatchKernel.h" +#include "src/cpu/kernels/CpuConcatenateDepthKernel.h" +#include "src/cpu/kernels/CpuConcatenateHeightKernel.h" +#include "src/cpu/kernels/CpuConcatenateWidthKernel.h" namespace arm_compute { @@ -59,9 +58,9 @@ void CpuConcatenate::configure(const std::vector &srcs_vect unsigned int offset = 0; - for(unsigned int i = 0; i < _num_srcs; ++i) + for (unsigned int i = 0; i < _num_srcs; ++i) { - switch(axis) + switch (axis) { case Window::DimX: { @@ -98,16 +97,17 @@ void CpuConcatenate::configure(const std::vector &srcs_vect } } -Status CpuConcatenate::validate(const std::vector &srcs_vector, const ITensorInfo *dst, size_t axis) +Status +CpuConcatenate::validate(const std::vector &srcs_vector, const ITensorInfo *dst, size_t axis) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); ARM_COMPUTE_RETURN_ERROR_ON(srcs_vector.size() < 2); unsigned int offset = 0; - for(const auto &src : srcs_vector) + for (const auto &src : srcs_vector) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); - switch(axis) + switch (axis) { case Window::DimX: { @@ -135,7 +135,7 @@ Status CpuConcatenate::validate(const std::vector &srcs_vec offset += src->dimension(axis); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(srcs_vector, axis); ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != dst->tensor_shape().total_size()); @@ -146,18 +146,18 @@ Status CpuConcatenate::validate(const std::vector &srcs_vec void CpuConcatenate::run(ITensorPack &tensors) { - if(tensors.empty()) + if (tensors.empty()) { ARM_COMPUTE_ERROR("No inputs provided"); } - if(static_cast(tensors.size() - 1) != static_cast(_num_srcs)) + if (static_cast(tensors.size() - 1) != static_cast(_num_srcs)) { ARM_COMPUTE_ERROR("Configured with different number of inputs"); } int i = 0; - for(auto &k : _concat_kernels) + for (auto &k : _concat_kernels) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i)); diff --git a/src/cpu/operators/CpuConcatenate.h b/src/cpu/operators/CpuConcatenate.h index 
eb11926b486f62cae3f104a04d64a6e5155530b5..c36977c70f73dbeb89c946a5140bb14b106ec83b 100644 --- a/src/cpu/operators/CpuConcatenate.h +++ b/src/cpu/operators/CpuConcatenate.h @@ -68,8 +68,8 @@ public: private: std::vector> _concat_kernels{}; - unsigned int _num_srcs{ 0 }; - unsigned int _axis{ 0 }; + unsigned int _num_srcs{0}; + unsigned int _axis{0}; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuConv2d.cpp b/src/cpu/operators/CpuConv2d.cpp index 447b7409892a8f499d271a9d20b65ed9cdfac3d6..19311733dbaa4475d1b7c21f981a6535c7d3bb27 100644 --- a/src/cpu/operators/CpuConv2d.cpp +++ b/src/cpu/operators/CpuConv2d.cpp @@ -22,8 +22,10 @@ * SOFTWARE. */ #include "src/cpu/operators/CpuConv2d.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/operators/CpuDirectConv2d.h" #include "src/cpu/operators/CpuGemm.h" @@ -35,26 +37,35 @@ namespace arm_compute { namespace cpu { -CpuConv2d::CpuConv2d() - : _function() +CpuConv2d::CpuConv2d() : _function() { } CpuConv2d::~CpuConv2d() = default; -void CpuConv2d::configure(ITensorInfo *input, ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +void CpuConv2d::configure(ITensorInfo *input, + ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_UNUSED(num_groups); - ARM_COMPUTE_ERROR_THROW_ON(CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, - enable_fast_math, num_groups)); + ARM_COMPUTE_ERROR_THROW_ON(CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, + act_info, enable_fast_math, num_groups)); - ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, + enable_fast_math, num_groups); const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups); - switch(CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math)) + switch (CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, + enable_fast_math)) { case ConvolutionMethod::WINOGRAD: { @@ -92,19 +103,30 @@ void CpuConv2d::configure(ITensorInfo *input, ITensorInfo *weights, const ITenso _aux_mem = _function->workspace(); } -Status CpuConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +Status CpuConv2d::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + 
const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1), "Grouping (num_groups != 1) is not supported on Neon"); const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups); - switch(CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math)) + switch (CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, + enable_fast_math)) { case ConvolutionMethod::WINOGRAD: - ARM_COMPUTE_RETURN_ON_ERROR(CpuWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math)); + ARM_COMPUTE_RETURN_ON_ERROR( + CpuWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math)); break; case ConvolutionMethod::GEMM: - ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math)); + ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, + dilation, act_info, enable_fast_math)); break; case ConvolutionMethod::GEMM_CONV2D: ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmDirectConv2d::validate(input, weights, biases, output, info)); @@ -120,9 +142,14 @@ Status CpuConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, return Status{}; } -ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math) +ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, weights); ARM_COMPUTE_UNUSED(weights_info); @@ -137,35 +164,46 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, co using ConvolutionConfiguration = std::tuple; using ConfigurationMethod = std::pair; - const std::vector known_configs = - { + const std::vector known_configs = { // Alexnet - ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U)), ConvolutionMethod::GEMM), + ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), + PadStrideInfo(1U, 1U, 2U, 2U)), + ConvolutionMethod::GEMM), // VGG16 / VGG19 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)), ConvolutionMethod::GEMM), + ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), + PadStrideInfo(1U, 1U, 1U, 1U)), + ConvolutionMethod::GEMM), // Mobilenet 224 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM), + ConfigurationMethod( + ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), + ConvolutionMethod::GEMM), // Mobilenet 160 - 
ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM) - }; + ConfigurationMethod( + ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), + ConvolutionMethod::GEMM)}; const auto find_config = [&](ConfigurationMethod c) { const ConvolutionConfiguration config = c.first; const PadStrideInfo info = std::get<3>(config); - return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) - && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() - && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride(); + return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && + std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) && + std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && + info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() && + info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && + info.stride() == conv_info.stride(); }; std::vector::const_iterator found; - if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end()) + if ((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end()) { return (*found).second; } - if(dilation != Size2D(1U, 1U)) + if (dilation != Size2D(1U, 1U)) { return ConvolutionMethod::GEMM; } @@ -173,59 +211,66 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, co { // SRGAN // Output might not be initialized when it is an internal tensor of the layer using the convolution - if(input->total_size() > 1e7 && (weights->dimension(idx_h) > 7) - && (CpuDirectConv2d::validate(input, weights, nullptr, output, conv_info, act_info))) + if (input->total_size() > 1e7 && (weights->dimension(idx_h) > 7) && + (CpuDirectConv2d::validate(input, weights, nullptr, output, conv_info, act_info))) { return ConvolutionMethod::DIRECT; } - if(input->dimension(idx_c) < 16) + if (input->dimension(idx_c) < 16) { return ConvolutionMethod::GEMM; } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC // This heuristics only applies to F16 data type on A55r1 - if(NEScheduler::get().cpu_info().get_cpu_model() == CPUModel::A55r1 && enable_fast_math && input->data_type() == DataType::F16) + if (NEScheduler::get().cpu_info().get_cpu_model() == CPUModel::A55r1 && enable_fast_math && + input->data_type() == DataType::F16) { // Exclude known bad winograd configs (and defaults to GEMM) - const std::vector known_bad_winograd_f16_with_fastmath_configs = - { + const std::vector known_bad_winograd_f16_with_fastmath_configs = { // Squeezenet_V1_1 fire2 and fire3 - ConvolutionConfiguration(Size2D(56U, 56U), Size2D(3U, 3U), Size2D(16U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)), + ConvolutionConfiguration(Size2D(56U, 56U), Size2D(3U, 3U), Size2D(16U, 64U), + PadStrideInfo(1U, 1U, 1U, 1U)), // Squeezenet_V1_1 fire6 and fire7 - ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(48U, 192U), PadStrideInfo(1U, 1U, 1U, 1U)), + 
ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(48U, 192U), + PadStrideInfo(1U, 1U, 1U, 1U)), // Squeezenet_V1_1 fire8 and fire9 - ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(64U, 256U), PadStrideInfo(1U, 1U, 1U, 1U)), + ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(64U, 256U), + PadStrideInfo(1U, 1U, 1U, 1U)), }; const auto find_conv_config = [&](ConvolutionConfiguration c) { const PadStrideInfo info = std::get<3>(c); - return std::get<0>(c) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(c) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) - && std::get<2>(c) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() - && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride(); + return std::get<0>(c) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && + std::get<1>(c) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) && + std::get<2>(c) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && + info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() && + info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && + info.stride() == conv_info.stride(); }; - bool found_bad = std::find_if(known_bad_winograd_f16_with_fastmath_configs.begin(), known_bad_winograd_f16_with_fastmath_configs.end(), - find_conv_config) - != known_bad_winograd_f16_with_fastmath_configs.end(); - if(found_bad) + bool found_bad = std::find_if(known_bad_winograd_f16_with_fastmath_configs.begin(), + known_bad_winograd_f16_with_fastmath_configs.end(), + find_conv_config) != known_bad_winograd_f16_with_fastmath_configs.end(); + if (found_bad) { return ConvolutionMethod::GEMM; } } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + // For 1x1 convolutions run the default GEMM - if(weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1) + if (weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1) { return ConvolutionMethod::GEMM; } - if(bool(CpuWinogradConv2d::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math))) + if (bool(CpuWinogradConv2d::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math))) { return ConvolutionMethod::WINOGRAD; } - if(bool(CpuGemmDirectConv2d::validate(input, weights, nullptr, output, info))) + if (bool(CpuGemmDirectConv2d::validate(input, weights, nullptr, output, info))) { return ConvolutionMethod::GEMM_CONV2D; } diff --git a/src/cpu/operators/CpuConv2d.h b/src/cpu/operators/CpuConv2d.h index 0908ac0cbbde8266ed3e1e845d9a458e3f5bf11a..71b9e15dc1e758ef1268bbd3fdf85ba4e09da7ee 100644 --- a/src/cpu/operators/CpuConv2d.h +++ b/src/cpu/operators/CpuConv2d.h @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" @@ -102,17 +103,32 @@ public: * available which may introduce a drop of accuracy as well. Default is false * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. 
num_groups != 1 is not supported */ - void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(), - const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1); + void configure(ITensorInfo *src, + ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration of @ref CpuConv2d * * Similar to CpuConv2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, - unsigned int num_groups = 1); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); /** Static function to check if given info will return the convolution called by @ref CpuConv2d * * @param[in] src Source tensor info. 
3 lower dimensions represent a single input [width, height, IFM], @@ -132,11 +148,17 @@ public: * * @return the Convolution Method Hint */ - static ConvolutionMethod get_convolution_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + static ConvolutionMethod get_convolution_method(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp b/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp index 810ffb1e4eb96ce12e439419d177879301d1cb43..49e31926e3c7070e3fb00edb8fef1c2d1130ea03 100644 --- a/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp +++ b/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp @@ -24,6 +24,7 @@ #include "src/cpu/operators/CpuConvertFullyConnectedWeights.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h" @@ -31,7 +32,10 @@ namespace arm_compute { namespace cpu { -void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout) +void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout) { ARM_COMPUTE_LOG_PARAMS(src, dst, original_src_shape, data_layout); auto k = std::make_unique(); @@ -39,7 +43,10 @@ void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src, ITensorI _kernel = std::move(k); } -Status CpuConvertFullyConnectedWeights::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout) +Status CpuConvertFullyConnectedWeights::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout) { return kernels::CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout); } @@ -48,5 +55,5 @@ void CpuConvertFullyConnectedWeights::run(ITensorPack &tensors) { NEScheduler::get().schedule_op(_kernel.get(), Window::DimZ, _kernel->window(), tensors); } -} // namesapce cpu +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuConvertFullyConnectedWeights.h b/src/cpu/operators/CpuConvertFullyConnectedWeights.h index ea70eee1349892eb574e4b6fa74f743ce15f8691..e208cca3a0c81f0ae972a3438c48c907fabc5145 100644 --- a/src/cpu/operators/CpuConvertFullyConnectedWeights.h +++ b/src/cpu/operators/CpuConvertFullyConnectedWeights.h @@ -41,14 +41,18 @@ public: * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer). * @param[in] data_layout The data layout the weights have been trained in. 
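Note: converting fully connected weights between training and inference data layouts is, in essence, a remap of the flattened index of the original input shape from one layout's ordering to the other's. The sketch below shows that index remap for a hypothetical W x H x C shape; it is plain C++ illustrating the idea, not the ACL kernel:

// Map a flat NCHW index (c*H*W + h*W + w) to the corresponding NHWC index.
#include <cstddef>
#include <iostream>

std::size_t nchw_to_nhwc_index(std::size_t idx, std::size_t W, std::size_t H, std::size_t C)
{
    const std::size_t c = idx / (H * W);
    const std::size_t r = idx % (H * W);
    const std::size_t h = r / W;
    const std::size_t w = r % W;
    return h * (W * C) + w * C + c;
}

int main()
{
    // Hypothetical original input shape: W=4, H=3, C=2.
    const std::size_t W = 4, H = 3, C = 2;
    for (std::size_t i = 0; i < W * H * C; ++i)
    {
        std::cout << i << " -> " << nchw_to_nhwc_index(i, W, H, C) << '\n';
    }
    return 0;
}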
*/ - void configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); + void + configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuConvertFullyConnectedWeights::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout); // Inherited methods overridden: void run(ITensorPack &tensors) override; }; diff --git a/src/cpu/operators/CpuCopy.cpp b/src/cpu/operators/CpuCopy.cpp index 7420ff6240ba6c7778c1de610b37ad3650727bb5..92c19d4df2b03cf2f916551f19f4947cd933d44a 100644 --- a/src/cpu/operators/CpuCopy.cpp +++ b/src/cpu/operators/CpuCopy.cpp @@ -23,9 +23,8 @@ */ #include "src/cpu/operators/CpuCopy.h" -#include "src/cpu/kernels/CpuCopyKernel.h" - #include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuCopyKernel.h" namespace arm_compute { diff --git a/src/cpu/operators/CpuDepthwiseConv2d.cpp b/src/cpu/operators/CpuDepthwiseConv2d.cpp index 884fe5c4ed024ec43613f87a0e7dd36685316679..54075f2afa765e54782f552b24270309a31a2495 100644 --- a/src/cpu/operators/CpuDepthwiseConv2d.cpp +++ b/src/cpu/operators/CpuDepthwiseConv2d.cpp @@ -24,10 +24,11 @@ #include "src/cpu/operators/CpuDepthwiseConv2d.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h" @@ -37,11 +38,16 @@ namespace cpu { namespace { -Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info) +Status validate_arguments_optimized(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - if(!is_data_type_quantized_per_channel(weights->data_type())) + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + if (!is_data_type_quantized_per_channel(weights->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); } @@ -49,14 +55,17 @@ Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *w ARM_COMPUTE_RETURN_ERROR_ON(info.dilation.x() < 1 || info.dilation.y() < 1); const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH); const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) > src->dimension(idx_w) + info.pad_stride_info.pad_left() + - info.pad_stride_info.pad_right()); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + 
(weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) > src->dimension(idx_h) + info.pad_stride_info.pad_top() + - info.pad_stride_info.pad_bottom()); - - if(biases != nullptr) + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) > + src->dimension(idx_w) + info.pad_stride_info.pad_left() + + info.pad_stride_info.pad_right()); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) > + src->dimension(idx_h) + info.pad_stride_info.pad_top() + + info.pad_stride_info.pad_bottom()); + + if (biases != nullptr) { - const unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL); + const unsigned int channel_idx = + get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx)); } @@ -64,7 +73,7 @@ Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *w ARM_COMPUTE_RETURN_ON_ERROR(CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, biases, dst, info)); // Validate Activation Layer - if(info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info)) + if (info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info)) { ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info)); } @@ -80,8 +89,8 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorI { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, (biases == nullptr) ? nullptr : biases, - dst, info)); + ARM_COMPUTE_ERROR_THROW_ON( + CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, (biases == nullptr) ? 
nullptr : biases, dst, info)); _is_quantized = is_data_type_quantized_asymmetric(src->data_type()); _has_bias = biases != nullptr; @@ -91,10 +100,11 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorI _are_weights_const = weights->are_values_constant(); // Configure pipeline - _is_activationlayer_enabled = info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info); + _is_activationlayer_enabled = + info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info); _dwc_optimized_func = std::make_unique(); - if(_is_nchw) + if (_is_nchw) { _permute_input = std::make_unique(); _permute_weights = std::make_unique(); @@ -128,7 +138,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorI } // Configure activation - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activationlayer_function = std::make_unique(); _activationlayer_function->configure(dst, nullptr, info.act_info); @@ -155,7 +165,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &t auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4); // Permute input - if(_permute) + if (_permute) { ITensorPack pack; auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); @@ -166,7 +176,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &t } // Run assembly function - if(_is_nchw) + if (_is_nchw) { auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); @@ -198,7 +208,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &t } // Permute output - if(_is_nchw) + if (_is_nchw) { ITensorPack pack; auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); @@ -208,7 +218,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &t } // Run activation - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, dst); @@ -221,7 +231,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPac { // if weights are not constant then we need to repack so that weights // can be updated in-place - if(!_are_weights_const) + if (!_are_weights_const) { auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); @@ -238,14 +248,14 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPac return; } - if(!_is_prepared) + if (!_is_prepared) { auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4); // Permute weights - if(_permute) + if (_permute) { auto permuted_weights = tensors.get_tensor(TensorType::ACL_INT_1); @@ -279,11 +289,15 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPac } } -void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info) +void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2d::validate(src, 
weights, (biases == nullptr) ? nullptr : biases, - dst, info)); + ARM_COMPUTE_ERROR_THROW_ON( + CpuDepthwiseConv2d::validate(src, weights, (biases == nullptr) ? nullptr : biases, dst, info)); _is_nchw = src->data_layout() == DataLayout::NCHW; _is_prepared = !_is_nchw; @@ -294,9 +308,10 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, auto input_perm = std::make_unique(); auto weights_perm = std::make_unique(); - auto output_perm = std::make_unique(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape())); + auto output_perm = std::make_unique( + dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape())); - if(_is_nchw) + if (_is_nchw) { _permute_input = std::make_unique(); _permute_weights = std::make_unique(); @@ -315,7 +330,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, _depthwise_conv_kernel = std::make_unique(); _depthwise_conv_kernel->configure(input_to_use, weights_to_use, biases, output_to_use, info); - if(_is_nchw) + if (_is_nchw) { _permute_output = std::make_unique(); _permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U)); @@ -324,43 +339,61 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, //Configure Activation Layer _is_activationlayer_enabled = info.act_info.enabled(); - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activationlayer_function = std::make_unique(); _activationlayer_function->configure(dst, nullptr, info.act_info); } } -Status CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, +Status CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, const ConvolutionInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - if(src->data_layout() == DataLayout::NCHW) + if (src->data_layout() == DataLayout::NCHW) { TensorShape permuted_input_shape = src->tensor_shape(); TensorShape permuted_weights_shape = weights->tensor_shape(); - TensorShape permuted_output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); + TensorShape permuted_output_shape = + misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); permute(permuted_input_shape, PermutationVector(2U, 0U, 1U)); permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U)); permute(permuted_output_shape, PermutationVector(2U, 0U, 1U)); - const TensorInfo permuted_input = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC)); - const TensorInfo permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC)); - const TensorInfo permuted_output = TensorInfo(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW)); + const TensorInfo permuted_input = TensorInfo(src->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(permuted_input_shape) + .set_data_layout(DataLayout::NHWC)); + const TensorInfo permuted_weights = TensorInfo(weights->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(permuted_weights_shape) + 
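Note: on the NCHW path above the input and weights are permuted to NHWC for the native kernel and the result is permuted back afterwards. With the convention out[i] = in[perm[i]] used in the sketch below, {2, 0, 1} and {1, 2, 0} are exactly such a pair of inverse reorderings (ACL's PermutationVector may apply its indices differently; the point is only that the two vectors undo each other):

// Shape permutation sketch: {2, 0, 1} turns an NCHW-ordered shape [W, H, C]
// into NHWC order [C, W, H]; {1, 2, 0} is its inverse.
#include <array>
#include <cstddef>
#include <iostream>

using Shape3 = std::array<std::size_t, 3>;
using Perm3  = std::array<std::size_t, 3>;

Shape3 permute(const Shape3 &in, const Perm3 &perm)
{
    Shape3 out{};
    for (std::size_t i = 0; i < in.size(); ++i)
    {
        out[i] = in[perm[i]];
    }
    return out;
}

int main()
{
    const Perm3  to_nhwc{2, 0, 1};
    const Perm3  to_nchw{1, 2, 0};
    const Shape3 nchw{224, 224, 3};               // [W, H, C]
    const Shape3 nhwc = permute(nchw, to_nhwc);   // -> [3, 224, 224]
    const Shape3 back = permute(nhwc, to_nchw);   // -> [224, 224, 3] again
    std::cout << nhwc[0] << 'x' << nhwc[1] << 'x' << nhwc[2] << '\n';
    std::cout << back[0] << 'x' << back[1] << 'x' << back[2] << '\n';
    return 0;
}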
.set_data_layout(DataLayout::NHWC)); + const TensorInfo permuted_output = TensorInfo(dst->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(permuted_output_shape) + .set_data_layout(DataLayout::NCHW)); ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &permuted_input, PermutationVector(2U, 0U, 1U))); ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U))); ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&permuted_output, dst, PermutationVector(1U, 2U, 0U))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, info)); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate( + &permuted_input, &permuted_weights, biases, &permuted_output, info)); } else { - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(src, weights, biases, dst, info)); + ARM_COMPUTE_RETURN_ON_ERROR( + cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(src, weights, biases, dst, info)); } // Validate Activation Layer - if(info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info)) + if (info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info)) { ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info)); } @@ -375,7 +408,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors) auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2); auto dst = tensors.get_tensor(TensorType::ACL_DST_0); - if(_is_nchw) + if (_is_nchw) { prepare(tensors); auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); @@ -392,7 +425,8 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors) pack_depth.add_const_tensor(TensorType::ACL_SRC_1, weights_perm); pack_depth.add_tensor(TensorType::ACL_SRC_2, biases); pack_depth.add_tensor(TensorType::ACL_DST, dst_perm); - NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth); + NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), + pack_depth); } else { @@ -401,10 +435,11 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors) pack_depth.add_tensor(TensorType::ACL_SRC_1, weights); pack_depth.add_tensor(TensorType::ACL_SRC_2, biases); pack_depth.add_tensor(TensorType::ACL_DST, dst); - NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth); + NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), + pack_depth); } - if(_is_nchw) + if (_is_nchw) { ITensorPack pack; auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); @@ -413,7 +448,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors) _permute_output->run(pack); } - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, dst); @@ -424,7 +459,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors) void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); @@ -441,12 +476,17 
@@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::prepare(ITensorPack &tensors } } -void CpuDepthwiseConv2d::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info) +void CpuDepthwiseConv2d::configure(ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info) { ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, info); - _depth_conv_func = get_depthwiseconvolution_function(src, weights, (biases != nullptr) ? biases : nullptr, dst, info); - switch(_depth_conv_func) + _depth_conv_func = + get_depthwiseconvolution_function(src, weights, (biases != nullptr) ? biases : nullptr, dst, info); + switch (_depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: _func_optimized.configure(src, weights, biases, dst, info); @@ -459,10 +499,14 @@ void CpuDepthwiseConv2d::configure(ITensorInfo *src, const ITensorInfo *weights, } } -Status CpuDepthwiseConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info) +Status CpuDepthwiseConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info) { DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(src, weights, biases, dst, info); - switch(depth_conv_func) + switch (depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: return CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info); @@ -475,10 +519,13 @@ Status CpuDepthwiseConv2d::validate(const ITensorInfo *src, const ITensorInfo *w } } -DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, +DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_function(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, const ConvolutionInfo &info) { - if(bool(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info))) + if (bool(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info))) { return DepthwiseConvolutionFunction::OPTIMIZED; } @@ -490,7 +537,7 @@ DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_functi void CpuDepthwiseConv2d::run(ITensorPack &tensors) { - switch(_depth_conv_func) + switch (_depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: _func_optimized.run(tensors); @@ -505,7 +552,7 @@ void CpuDepthwiseConv2d::run(ITensorPack &tensors) void CpuDepthwiseConv2d::prepare(ITensorPack &tensors) { - switch(_depth_conv_func) + switch (_depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: _func_optimized.prepare(tensors); diff --git a/src/cpu/operators/CpuDepthwiseConv2d.h b/src/cpu/operators/CpuDepthwiseConv2d.h index 3d8719ee44254e9e62af1da814f594fa0f5b763b..7eaa0df8574006d6446bfbd1393795c41c804067 100644 --- a/src/cpu/operators/CpuDepthwiseConv2d.h +++ b/src/cpu/operators/CpuDepthwiseConv2d.h @@ -24,8 +24,9 @@ #ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H #define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H -#include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/ITensorInfo.h" + #include "src/cpu/ICpuKernel.h" #include "src/cpu/ICpuOperator.h" #include 
"src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h" @@ -56,14 +57,22 @@ public: * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED. * @param[in] info Depthwise convolution meta-data. */ - void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info); + void configure(ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuDepthwiseConv2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info); /** Static function to choose the best depthwise convolution function for @ref CpuDepthwiseConv2d * * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32 @@ -76,7 +85,10 @@ public: * * @return a Depthwise Convolution Function */ - static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, + static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, const ConvolutionInfo &info); // Inherited methods overriden: @@ -118,32 +130,40 @@ private: * @param[out] dst Destination tensor info. Data type supported: same as @p src. * @param[in] info Depthwise convolution meta-data. 
          */
-        void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
+        void configure(ITensorInfo *src,
+                       const ITensorInfo *weights,
+                       const ITensorInfo *biases,
+                       ITensorInfo *dst,
+                       const ConvolutionInfo &info);
         /** Static function to check if given info will lead to a valid configuration
          *
          * Similar to CpuDepthwiseConv2dOptimizedInternal::configure()
          *
          * @return a status
          */
-        static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
+        static Status validate(const ITensorInfo *src,
+                               const ITensorInfo *weights,
+                               const ITensorInfo *biases,
+                               const ITensorInfo *dst,
+                               const ConvolutionInfo &info);
 
         // Inherited methods overriden:
         void run(ITensorPack &tensors) override;
         void prepare(ITensorPack &tensors) override;
 
     private:
-        std::unique_ptr<CpuDepthwiseConv2dAssemblyDispatch> _dwc_optimized_func{ nullptr };
-        std::unique_ptr<CpuPermute> _permute_input{ nullptr };
-        std::unique_ptr<CpuPermute> _permute_weights{ nullptr };
-        std::unique_ptr<CpuPermute> _permute_output{ nullptr };
-        std::unique_ptr<CpuActivation> _activationlayer_function{ nullptr };
-        bool _has_bias{ false };
-        bool _is_quantized{ false };
-        bool _is_nchw{ true };
-        bool _permute{ false };
-        bool _is_activationlayer_enabled{ false };
-        bool _is_prepared{ false };
-        bool _are_weights_const{ true };
+        std::unique_ptr<CpuDepthwiseConv2dAssemblyDispatch> _dwc_optimized_func{nullptr};
+        std::unique_ptr<CpuPermute> _permute_input{nullptr};
+        std::unique_ptr<CpuPermute> _permute_weights{nullptr};
+        std::unique_ptr<CpuPermute> _permute_output{nullptr};
+        std::unique_ptr<CpuActivation> _activationlayer_function{nullptr};
+        bool _has_bias{false};
+        bool _is_quantized{false};
+        bool _is_nchw{true};
+        bool _permute{false};
+        bool _is_activationlayer_enabled{false};
+        bool _is_prepared{false};
+        bool _are_weights_const{true};
     };
 
     /** Basic function to execute a generic depthwise convolution. This function calls the following kernel:
@@ -176,7 +196,11 @@ private:
          * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
          * @param[in] info Depthwise convolution meta-data.
*/ - void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info); + void configure(ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info); /** Static function to check if given info will lead to a valid configuration * @@ -184,24 +208,28 @@ private: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info); // Inherited methods overridden: void run(ITensorPack &tensors) override; void prepare(ITensorPack &tensors) override; private: - std::unique_ptr _depthwise_conv_kernel{ nullptr }; - std::unique_ptr _permute_input{ nullptr }; - std::unique_ptr _permute_weights{ nullptr }; - std::unique_ptr _permute_output{ nullptr }; - std::unique_ptr _activationlayer_function{ nullptr }; - bool _is_nchw{ true }; - bool _is_prepared{ false }; - bool _is_activationlayer_enabled{ false }; + std::unique_ptr _depthwise_conv_kernel{nullptr}; + std::unique_ptr _permute_input{nullptr}; + std::unique_ptr _permute_weights{nullptr}; + std::unique_ptr _permute_output{nullptr}; + std::unique_ptr _activationlayer_function{nullptr}; + bool _is_nchw{true}; + bool _is_prepared{false}; + bool _is_activationlayer_enabled{false}; }; - DepthwiseConvolutionFunction _depth_conv_func{ DepthwiseConvolutionFunction::GENERIC }; + DepthwiseConvolutionFunction _depth_conv_func{DepthwiseConvolutionFunction::GENERIC}; CpuDepthwiseConv2dOptimizedInternal _func_optimized{}; CpuDepthwiseConv2dGeneric _func_generic{}; }; diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp index d078155155ce84c4becba6b16209c63e38e3152f..8d3741de9605102b389e65ee187a72684d5d80fa 100644 --- a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp +++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -38,15 +39,14 @@ namespace cpu { struct CpuDepthwiseConv2dAssemblyDispatch::LocalImpl { - std::unique_ptr asm_kernel{ nullptr }; - bool is_prepared{ false }; - bool are_weights_const{ true }; + std::unique_ptr asm_kernel{nullptr}; + bool is_prepared{false}; + bool are_weights_const{true}; experimental::MemoryRequirements mem_req{}; }; #ifndef DOXYGEN_SKIP_THIS -CpuDepthwiseConv2dAssemblyDispatch::CpuDepthwiseConv2dAssemblyDispatch() - : _pImpl(std::make_unique()) +CpuDepthwiseConv2dAssemblyDispatch::CpuDepthwiseConv2dAssemblyDispatch() : _pImpl(std::make_unique()) { } #endif /* DOXYGEN_SKIP_THIS */ @@ -66,7 +66,7 @@ void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src, _pImpl->are_weights_const = weights->are_values_constant(); // If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured() - if(!CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, bias, dst, info)) + if (!CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, bias, dst, info)) { return; } @@ -77,12 +77,16 @@ void 
CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src, // Compute memory requirements for assembly kernels constexpr size_t alignment = 4096; - _pImpl->mem_req.push_back({ TensorType::ACL_INT_0, dwc_wrapper->get_working_size(num_threads), alignment }); - _pImpl->mem_req.push_back({ TensorType::ACL_INT_1, dwc_wrapper->get_storage_size(), alignment }); + _pImpl->mem_req.push_back({TensorType::ACL_INT_0, dwc_wrapper->get_working_size(num_threads), alignment}); + _pImpl->mem_req.push_back({TensorType::ACL_INT_1, dwc_wrapper->get_storage_size(), alignment}); _pImpl->asm_kernel = std::move(dwc_wrapper); } -Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info) +Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const ConvolutionInfo &info) { return kernels::CpuDepthwiseConv2dAssemblyWrapperKernel::validate(src, weights, bias, dst, info); } @@ -111,7 +115,7 @@ void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors) { const ITensor *weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - if((!_pImpl->are_weights_const && weights != nullptr) || !_pImpl->is_prepared) + if ((!_pImpl->are_weights_const && weights != nullptr) || !_pImpl->is_prepared) { // Pack weights and bias const ITensor *bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); @@ -125,11 +129,12 @@ void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors) const auto weights_padding = weights->info()->padding(); const size_t ld_weights_col = weights_shape[0] + weights_padding.left + weights_padding.right; - const size_t ld_weights_row = ld_weights_col * (weights_shape[1] + weights_padding.top + weights_padding.bottom); + const size_t ld_weights_row = + ld_weights_col * (weights_shape[1] + weights_padding.top + weights_padding.bottom); _pImpl->asm_kernel->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weights_row); weights->mark_as_unused(); - if(bias != nullptr) + if (bias != nullptr) { bias->mark_as_unused(); } diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h index f222ab9cf946e1159b9d486a87749dffca299c9f..f1816625d25f991030317093b353d8e795daf2e2 100644 --- a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h +++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" @@ -53,14 +54,22 @@ public: * @param[out] dst Destination tensor info. Data type supported: same as @p src. * @param[in] info Depthwise convolution meta-data. 
*/ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const ConvolutionInfo &info); + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *dst, + const ConvolutionInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuDepthwiseConv2dAssemblyDispatch::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const ConvolutionInfo &info); /** Checks if activation is supported by the assembly kernels * * @param[in] activation Activation to check @@ -70,8 +79,8 @@ public: static bool is_activation_supported(const ActivationLayerInfo &activation); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/cpu/operators/CpuDequantize.cpp b/src/cpu/operators/CpuDequantize.cpp index 12dc136ba369927fb7f28733ff356bee4f83c85c..c05a23f3a7bc4ffb089187cbe9272369da4da89b 100644 --- a/src/cpu/operators/CpuDequantize.cpp +++ b/src/cpu/operators/CpuDequantize.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuDequantizeKernel.h" diff --git a/src/cpu/operators/CpuDirectConv2d.cpp b/src/cpu/operators/CpuDirectConv2d.cpp index 9cdbdb61c1acd654dc6ef527450d0074e938f5a8..135a3bb2b9525000aee2faa0b051245d029647be 100644 --- a/src/cpu/operators/CpuDirectConv2d.cpp +++ b/src/cpu/operators/CpuDirectConv2d.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" namespace arm_compute @@ -36,12 +37,25 @@ namespace cpu CpuDirectConv2d::~CpuDirectConv2d() = default; CpuDirectConv2d::CpuDirectConv2d(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false), - _is_activationlayer_enabled(false), _dim_split(Window::DimZ), _is_padding_required() + : _memory_group(std::move(memory_manager)), + _output_stage_kernel(), + _conv_kernel(), + _input_border_handler(), + _activationlayer_function(), + _accumulator(), + _has_bias(false), + _is_activationlayer_enabled(false), + _dim_split(Window::DimZ), + _is_padding_required() { } -void CpuDirectConv2d::configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void CpuDirectConv2d::configure(ITensorInfo *src, + ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); ARM_COMPUTE_LOG_PARAMS(src, weights, bias, dst, conv_info, act_info); @@ -51,7 +65,7 @@ void CpuDirectConv2d::configure(ITensorInfo *src, ITensorInfo *weights, const IT 
_input_border_handler = std::make_unique(); // Free accumulator - if(_accumulator.buffer() != nullptr) + if (_accumulator.buffer() != nullptr) { _accumulator.allocator()->free(); } @@ -62,28 +76,33 @@ void CpuDirectConv2d::configure(ITensorInfo *src, ITensorInfo *weights, const IT _has_bias = (bias != nullptr); _conv_kernel->configure(src, weights, dst, conv_info); - if(_has_bias) + if (_has_bias) { _output_stage_kernel->configure(dst, bias); } _is_padding_required = !_conv_kernel->border_size().empty(); - if(_is_padding_required) + if (_is_padding_required) { // Add zero padding XY - _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast(0.f))); + _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT, + PixelValue(static_cast(0.f))); } //Configure Activation Layer _is_activationlayer_enabled = act_info.enabled(); - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activationlayer_function = std::make_unique(); _activationlayer_function->configure(dst, dst, act_info); } } -Status CpuDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info, +Status CpuDirectConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); @@ -95,7 +114,7 @@ Status CpuDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weig // Validate Convolution kernel ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dKernel::validate(src, weights, &accumulator, conv_info)); - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias); ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3), @@ -106,7 +125,7 @@ Status CpuDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weig // Validate bias kernel ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dOutputStageKernel::validate(&accumulator, bias, dst)); - if(act_info.enabled()) + if (act_info.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, act_info)); } @@ -122,14 +141,15 @@ void CpuDirectConv2d::run(ITensorPack &tensors) auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); auto dst = tensors.get_tensor(TensorType::ACL_DST); - if(_is_padding_required) + if (_is_padding_required) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC_DST, src); - NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(), pack); + NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(), + pack); } NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors); - if(_has_bias) + if (_has_bias) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC_0, dst); @@ -138,7 +158,7 @@ void CpuDirectConv2d::run(ITensorPack &tensors) NEScheduler::get().schedule_op(_output_stage_kernel.get(), Window::DimY, _output_stage_kernel->window(), pack); } - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, dst); diff --git a/src/cpu/operators/CpuDirectConv2d.h b/src/cpu/operators/CpuDirectConv2d.h index 
fa8d61e0832c36bfbafbefa64ca07c5cb4aaeef1..73c85f2dcd5c1c6a17b531f865587687df3fc66c 100644 --- a/src/cpu/operators/CpuDirectConv2d.h +++ b/src/cpu/operators/CpuDirectConv2d.h @@ -24,13 +24,14 @@ #ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_H #define ARM_COMPUTE_CPU_DIRECTCONV2D_H +#include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/experimental/Types.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/NEON/kernels/NEFillBorderKernel.h" #include "src/cpu/ICpuKernel.h" #include "src/cpu/ICpuOperator.h" @@ -75,14 +76,23 @@ public: * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensorInfo *src, + ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuDirectConv2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info, + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: @@ -95,10 +105,10 @@ private: std::unique_ptr _input_border_handler; std::unique_ptr _activationlayer_function; Tensor _accumulator; - bool _has_bias{ false }; - bool _is_activationlayer_enabled{ false }; - unsigned int _dim_split{ 0 }; - bool _is_padding_required{ false }; + bool _has_bias{false}; + bool _is_activationlayer_enabled{false}; + unsigned int _dim_split{0}; + bool _is_padding_required{false}; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuDirectConv3d.cpp b/src/cpu/operators/CpuDirectConv3d.cpp index aa74e420a6a4216aaafd14a0d9978f44830c5769..626f1c6775e76507c2e283c9b9a34a1d2ab71316 100644 --- a/src/cpu/operators/CpuDirectConv3d.cpp +++ b/src/cpu/operators/CpuDirectConv3d.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" namespace arm_compute @@ -36,11 +37,17 @@ namespace cpu CpuDirectConv3d::~CpuDirectConv3d() = default; CpuDirectConv3d::CpuDirectConv3d(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _conv_kernel(), _activationlayer_function(), _accumulator(), _is_activationlayer_enabled(false), _dim_split(Window::DimZ) + : _memory_group(std::move(memory_manager)), + _conv_kernel(), + _activationlayer_function(), + _accumulator(), + _is_activationlayer_enabled(false), + _dim_split(Window::DimZ) { } -void CpuDirectConv3d::configure(ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info) +void 
CpuDirectConv3d::configure( + ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info) { ARM_COMPUTE_LOG_PARAMS(src0, src1, src2, dst, conv_info); ARM_COMPUTE_ERROR_ON(src0->data_layout() != DataLayout::NDHWC); @@ -48,7 +55,7 @@ void CpuDirectConv3d::configure(ITensorInfo *src0, ITensorInfo *src1, const ITen _conv_kernel = std::make_unique(); // Free accumulator - if(_accumulator.buffer() != nullptr) + if (_accumulator.buffer() != nullptr) { _accumulator.allocator()->free(); } @@ -59,21 +66,25 @@ void CpuDirectConv3d::configure(ITensorInfo *src0, ITensorInfo *src1, const ITen //Configure Activation Layer _is_activationlayer_enabled = conv_info.act_info.enabled(); - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activationlayer_function = std::make_unique(); _activationlayer_function->configure(dst, dst, conv_info.act_info); } } -Status CpuDirectConv3d::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo conv_info) +Status CpuDirectConv3d::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo conv_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); // Validate Convolution kernel ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv3dKernel::validate(src0, src1, src2, dst, conv_info)); - if(conv_info.act_info.enabled()) + if (conv_info.act_info.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, conv_info.act_info)); } @@ -89,7 +100,7 @@ void CpuDirectConv3d::run(ITensorPack &tensors) NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors); - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, dst); @@ -98,4 +109,4 @@ void CpuDirectConv3d::run(ITensorPack &tensors) } } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/operators/CpuDirectConv3d.h b/src/cpu/operators/CpuDirectConv3d.h index cde01f07c2830fa3cc19c16d7ff0fa058e1e96e3..3ad1e09a14c37d37c28bfe4ae8d92544f85cc090 100644 --- a/src/cpu/operators/CpuDirectConv3d.h +++ b/src/cpu/operators/CpuDirectConv3d.h @@ -24,14 +24,15 @@ #ifndef ARM_COMPUTE_CPU_DIRECTCONV3D_H #define ARM_COMPUTE_CPU_DIRECTCONV3D_H +#include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/experimental/Types.h" #include "arm_compute/runtime/FunctionDescriptors.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/NEON/kernels/NEFillBorderKernel.h" #include "src/cpu/ICpuKernel.h" #include "src/cpu/ICpuOperator.h" @@ -76,14 +77,19 @@ public: * The 1st dimensions must be equal to the 1st dimension of the @p kernels tensor. * @param[in] conv_info Contains padding, stride, acitvation information. 
*/ - void configure(ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info); + void configure( + ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuDirectConv3d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo conv_info); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo conv_info); // Inherited methods overridden: void run(ITensorPack &tensors) override; @@ -93,8 +99,8 @@ private: std::unique_ptr _conv_kernel; std::unique_ptr _activationlayer_function; Tensor _accumulator; - bool _is_activationlayer_enabled{ false }; - unsigned int _dim_split{ 0 }; + bool _is_activationlayer_enabled{false}; + unsigned int _dim_split{0}; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuElementwise.cpp b/src/cpu/operators/CpuElementwise.cpp index b88ae3e51465627c7bbcdd823f6e07e3d78832d7..c2ae8773c6305881d15d4c09df064a1bbff5303c 100644 --- a/src/cpu/operators/CpuElementwise.cpp +++ b/src/cpu/operators/CpuElementwise.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "src/cpu/operators/CpuElementwise.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/CpuElementwiseKernel.h" @@ -33,7 +34,7 @@ namespace cpu void CpuElementwiseBase::run(ITensorPack &tensors) { // If the kernel has been configured, use the window from the kernel. - if(_kernel->is_window_configured()) + if (_kernel->is_window_configured()) { ICpuOperator::run(tensors); return; @@ -101,12 +102,16 @@ void CpuElementwiseComparisonStatic::configure(const ITensorInfo *src0, con } template -Status CpuElementwiseComparisonStatic::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +Status +CpuElementwiseComparisonStatic::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) { return kernels::CpuComparisonKernel::validate(COP, src0, src1, dst); } -void CpuElementwiseComparison::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ComparisonOperation op) +void CpuElementwiseComparison::configure(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + ComparisonOperation op) { ARM_COMPUTE_LOG_PARAMS(src0, src1, dst); auto k = std::make_unique(); @@ -114,7 +119,10 @@ void CpuElementwiseComparison::configure(const ITensorInfo *src0, const ITensorI _kernel = std::move(k); } -Status CpuElementwiseComparison::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op) +Status CpuElementwiseComparison::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + ComparisonOperation op) { return kernels::CpuComparisonKernel::validate(op, src0, src1, dst); } @@ -127,4 +135,4 @@ template class CpuElementwiseComparisonStatic template class CpuElementwiseComparisonStatic; template class CpuElementwiseComparisonStatic; } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/operators/CpuElementwise.h b/src/cpu/operators/CpuElementwise.h index 
b6c61cf245e92f66f24a34d2019e97d2ebe224b4..5db53c80261750e0534339363d4b7f6d33942c1e 100644 --- a/src/cpu/operators/CpuElementwise.h +++ b/src/cpu/operators/CpuElementwise.h @@ -139,7 +139,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op); + static Status + validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op); }; /** Basic function to run @ref cpu::kernels::CpuComparisonKernel @@ -182,4 +183,4 @@ using NELessEqual = CpuElementwiseComparisonStaticis_window_configured()) + if (_kernel->is_window_configured()) { ICpuOperator::run(tensors); return; @@ -57,4 +58,4 @@ void CpuElementwiseUnary::run(ITensorPack &tensors) ICpuOperator::run(tensors, compute_output_shape_and_window(src_info->tensor_shape()).second); } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/operators/CpuElementwiseUnary.h b/src/cpu/operators/CpuElementwiseUnary.h index 5e8e98d04729b62f8d74e576dd1f934a66eaf2fa..1e51bfaa1ca5e3000b454265cfa4d321b99f3cba 100644 --- a/src/cpu/operators/CpuElementwiseUnary.h +++ b/src/cpu/operators/CpuElementwiseUnary.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H #include "arm_compute/core/Types.h" + #include "src/cpu/ICpuOperator.h" namespace arm_compute @@ -56,4 +57,4 @@ public: } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H */ diff --git a/src/cpu/operators/CpuFill.cpp b/src/cpu/operators/CpuFill.cpp index 3d8f62fe07ff80f61137e7b9f3dd0241dcf53b23..1890d0b9163e844e9c0ec87cdd8ce230374ee3dc 100644 --- a/src/cpu/operators/CpuFill.cpp +++ b/src/cpu/operators/CpuFill.cpp @@ -23,9 +23,8 @@ */ #include "src/cpu/operators/CpuFill.h" -#include "src/cpu/kernels/CpuFillKernel.h" - #include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuFillKernel.h" namespace arm_compute { diff --git a/src/cpu/operators/CpuFill.h b/src/cpu/operators/CpuFill.h index 41d9a9fa8aa33a856d826caec6dbb167a384759e..cb83745d29df8b22704ad005b0d818b158c18073 100644 --- a/src/cpu/operators/CpuFill.h +++ b/src/cpu/operators/CpuFill.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_FILL_H #include "arm_compute/core/PixelValue.h" + #include "src/cpu/ICpuOperator.h" namespace arm_compute diff --git a/src/cpu/operators/CpuFlatten.cpp b/src/cpu/operators/CpuFlatten.cpp index f6ae13979426255c41f6d0a58647e25552b4de70..2609d44590a7f6dc898586dda1c84c3f22542e76 100644 --- a/src/cpu/operators/CpuFlatten.cpp +++ b/src/cpu/operators/CpuFlatten.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. 
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,25 +23,34 @@
  */
 #include "src/cpu/operators/CpuFlatten.h"
 
-#include "src/cpu/kernels/CpuReshapeKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/cpu/operators/CpuReshape.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
+CpuFlatten::CpuFlatten() : _reshape(nullptr)
+{
+}
+
+CpuFlatten::~CpuFlatten() = default;
+
 void CpuFlatten::configure(const ITensorInfo *src, ITensorInfo *dst)
 {
     ARM_COMPUTE_LOG_PARAMS(src, dst);
-    auto k = std::make_unique<kernels::CpuReshapeKernel>();
-    k->configure(src, dst);
-    _kernel = std::move(k);
+    _reshape = std::make_unique<CpuReshape>();
+    _reshape->configure(src, dst);
 }
 
 Status CpuFlatten::validate(const ITensorInfo *src, const ITensorInfo *dst)
 {
-    return kernels::CpuReshapeKernel::validate(src, dst);
+    return CpuReshape::validate(src, dst);
+}
+
+void CpuFlatten::run(ITensorPack &tensors)
+{
+    _reshape->run(tensors);
 }
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/operators/CpuFlatten.h b/src/cpu/operators/CpuFlatten.h
index 0e9fcbdc3512bd5a3d61de2d9265c2b482a92154..911760dd956ccb7108a111355618d0025c3576a2 100644
--- a/src/cpu/operators/CpuFlatten.h
+++ b/src/cpu/operators/CpuFlatten.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_CPU_FLATTEN_H
-#define ARM_COMPUTE_CPU_FLATTEN_H
+#ifndef ACL_SRC_CPU_OPERATORS_CPUFLATTEN_H
+#define ACL_SRC_CPU_OPERATORS_CPUFLATTEN_H
 
 #include "src/cpu/ICpuOperator.h"
 
@@ -30,10 +30,15 @@ namespace arm_compute
 {
 namespace cpu
 {
+class CpuReshape;
 /** Basic function to flatten a given input */
 class CpuFlatten : public ICpuOperator
 {
 public:
+    /** Constructor */
+    CpuFlatten();
+    /** Destructor */
+    ~CpuFlatten();
     /** Configure operator for a given list of arguments
      *
      * Valid data layouts:
@@ -58,7 +63,13 @@ public:
      * @return a status
      */
     static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+
+private:
+    std::unique_ptr<CpuReshape> _reshape;
 };
 } // namespace cpu
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_FLATTEN_H */
+#endif // ACL_SRC_CPU_OPERATORS_CPUFLATTEN_H
diff --git a/src/cpu/operators/CpuFloor.cpp b/src/cpu/operators/CpuFloor.cpp
index 868add7d29bb69e11f520901933cbf068b66bbd9..a107393b0133ad7b0bb81278a4085a32d6978ad6 100644
--- a/src/cpu/operators/CpuFloor.cpp
+++ b/src/cpu/operators/CpuFloor.cpp
@@ -23,9 +23,8 @@
  */
 #include "src/cpu/operators/CpuFloor.h"
 
-#include "src/cpu/kernels/CpuFloorKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuFloorKernel.h"
 
 namespace arm_compute
 {
diff --git a/src/cpu/operators/CpuFullyConnected.cpp b/src/cpu/operators/CpuFullyConnected.cpp
index 395d8d2aa54ca8f1018e351264b355e1c352441a..85a0b0311bca77b8eed6efd2427146b81ebff450 100644
--- a/src/cpu/operators/CpuFullyConnected.cpp
+++ b/src/cpu/operators/CpuFullyConnected.cpp
@@ -25,10 +25,11 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/MemoryHelpers.h"
@@ -49,8 +50,11 @@ using
namespace arm_compute::misc::shape_calculator; namespace { -Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act, - GEMMLowpOutputStageInfo &gemmlowp_output_stage_info) +Status get_gemmlowp_output_stage_info(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const ActivationLayerInfo &act, + GEMMLowpOutputStageInfo &gemmlowp_output_stage_info) { const auto data_type = src->data_type(); const QuantizationInfo oq_info = dst->quantization_info(); @@ -62,10 +66,11 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo int32_t output_multiplier; int32_t output_shift; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); - int32_t type_min = 0; - int32_t type_max = 0; + int32_t type_min = 0; + int32_t type_max = 0; std::tie(type_min, type_max) = quantization::get_quantized_asymmetric_output_min_max(oq_info, act, data_type); gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier; @@ -78,14 +83,22 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo return Status{}; } -Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ActivationLayerInfo &act, bool enable_fast_math, WeightFormat weight_format) +Status validate_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ActivationLayerInfo &act, + bool enable_fast_math, + WeightFormat weight_format) { - if(is_data_type_quantized_asymmetric(src->data_type())) + if (is_data_type_quantized_asymmetric(src->data_type())) { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate src and weights offset - const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, -src->quantization_info().uniform().offset); - const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, -weights->quantization_info().uniform().offset); + const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, + -src->quantization_info().uniform().offset); + const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, + -weights->quantization_info().uniform().offset); GEMMLowpOutputStageInfo gemmlowp_output_stage_info; ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(src, weights, dst, act, gemmlowp_output_stage_info)); @@ -97,11 +110,8 @@ Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITe // Validate gemmlowp function TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info); TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info); - ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmLowpMatrixMultiplyCore::validate(&src_info, - &weights_info, - biases, - dst, - gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CpuGemmLowpMatrixMultiplyCore::validate(&src_info, &weights_info, biases, dst, gemm_info)); } else { @@ -142,21 +152,28 @@ CpuFullyConnected::CpuFullyConnected() CpuFullyConnected::~CpuFullyConnected() = default; -void CpuFullyConnected::configure_mm(const ITensorInfo *src, const ITensorInfo *weights, 
const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act) +void CpuFullyConnected::configure_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act) { - if(_is_quantized_asymmetric) + if (_is_quantized_asymmetric) { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate src and weights offset - const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, -src->quantization_info().uniform().offset); - const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, -weights->quantization_info().uniform().offset); + const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, + -src->quantization_info().uniform().offset); + const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, + -weights->quantization_info().uniform().offset); TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info); TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info); // Configure gemmlowp function and output stage for asymmetric quantized types GEMMLowpOutputStageInfo gemmlowp_output_stage_info; - const Status status = get_gemmlowp_output_stage_info(&src_info, &weights_info, dst, act, gemmlowp_output_stage_info); + const Status status = + get_gemmlowp_output_stage_info(&src_info, &weights_info, dst, act, gemmlowp_output_stage_info); ARM_COMPUTE_ERROR_ON(status.error_code() != ErrorCode::OK); GEMMInfo gemm_info; @@ -179,7 +196,11 @@ void CpuFullyConnected::configure_mm(const ITensorInfo *src, const ITensorInfo * } } -void CpuFullyConnected::configure_conv_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act) +void CpuFullyConnected::configure_conv_fc(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act) { ARM_COMPUTE_ERROR_ON((weights->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); @@ -195,7 +216,11 @@ void CpuFullyConnected::configure_conv_fc(const ITensorInfo *src, const ITensorI configure_mm(&_flattened_src, weights, biases, dst, act); } -void CpuFullyConnected::configure_fc_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act) +void CpuFullyConnected::configure_fc_fc(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act) { ARM_COMPUTE_ERROR_ON(src->dimension(0) != weights->dimension(1)); @@ -203,17 +228,17 @@ void CpuFullyConnected::configure_fc_fc(const ITensorInfo *src, const ITensorInf configure_mm(src, weights, biases, dst, act); } -void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, - FullyConnectedLayerInfo fc_info, const WeightsInfo &weights_info) +void CpuFullyConnected::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + FullyConnectedLayerInfo fc_info, + const WeightsInfo &weights_info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(CpuFullyConnected::validate(src, - weights, - biases != nullptr ? 
biases : nullptr, - dst, - fc_info, - weights_info)); + ARM_COMPUTE_ERROR_THROW_ON( + CpuFullyConnected::validate(src, weights, biases != nullptr ? biases : nullptr, dst, fc_info, weights_info)); ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, fc_info); _needs_weights_conversion = false; @@ -238,9 +263,11 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei // Check if we have a fully connected layer with batches const bool is_batched_fc_layer = dst->dimension(1) > 1; - if(is_batched_fc_layer) + if (is_batched_fc_layer) { - _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), dst->tensor_shape().cbegin() + 1)); + _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), + dst->tensor_shape().cbegin() + 1)); } else { @@ -248,7 +275,7 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei } // Reshape weights if needed - if(_needs_weights_reshape) + if (_needs_weights_reshape) { // Reshape the weights _transpose_weights = std::make_unique(); @@ -260,13 +287,11 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei } // Convert weights if needed - if(_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) + if (_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) { // Convert weights _convert_weights = std::make_unique(); - _convert_weights->configure(weights_to_use, - &_converted_weights, - src->tensor_shape(), + _convert_weights->configure(weights_to_use, &_converted_weights, src->tensor_shape(), fc_info.weights_trained_layout); _converted_weights.set_are_values_constant(weights_to_use->are_values_constant()); @@ -275,7 +300,7 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei _trans_weights_idx = AuxTensorIdx::ConvertedWeights; } - if(_is_fc_after_conv) + if (_is_fc_after_conv) { // Fully Connected layer after a Convolution Layer without batches configure_conv_fc(src, weights_to_use, biases, dst, fc_info.activation_info); @@ -287,54 +312,57 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei } // Retain the tensorinfo with the weights to use - if(_needs_weights_reshape || _needs_weights_conversion) + if (_needs_weights_reshape || _needs_weights_conversion) { _trans_weights = *weights_to_use; } // Set auxiliary memory requirements auto gemm_mem_req = (_is_quantized_asymmetric) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace(); - for(unsigned int i = 0; i < gemm_mem_req.size(); ++i) + for (unsigned int i = 0; i < gemm_mem_req.size(); ++i) { _aux_mem[i] = gemm_mem_req[i]; } - if(_aux_mem[Pretranspose].size > 0) + if (_aux_mem[Pretranspose].size > 0) { // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch // Do not release them if biases are dynamic and data type is quantized, since the weights tensor will be used for biases offset calculation // Keep all the auxiliary tensors in case of dynamic weights as they are recalculated every time. _aux_mem[TransposedWeights] = MemoryInfo( offset_int_vec(TransposedWeights), - _dynamic_weights ? MemoryLifetime::Temporary : - (_is_quantized_asymmetric && biases && !(biases->are_values_constant())) ? MemoryLifetime::Persistent : - MemoryLifetime::Prepare, + _dynamic_weights ? 
MemoryLifetime::Temporary + : (_is_quantized_asymmetric && biases && !(biases->are_values_constant())) ? MemoryLifetime::Persistent + : MemoryLifetime::Prepare, _reshaped_weights.total_size()); - _aux_mem[ConvertedWeights] = MemoryInfo( - offset_int_vec(ConvertedWeights), - _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Prepare, - _converted_weights.total_size()); + _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), + _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Prepare, + _converted_weights.total_size()); } else { - _aux_mem[TransposedWeights] = MemoryInfo( - offset_int_vec(TransposedWeights), - _dynamic_weights ? MemoryLifetime::Temporary : - _needs_weights_conversion ? MemoryLifetime::Prepare : - MemoryLifetime::Persistent, - _reshaped_weights.total_size()); + _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), + _dynamic_weights ? MemoryLifetime::Temporary + : _needs_weights_conversion ? MemoryLifetime::Prepare + : MemoryLifetime::Persistent, + _reshaped_weights.total_size()); _aux_mem[ConvertedWeights] = MemoryInfo( - offset_int_vec(ConvertedWeights), - _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Persistent, + offset_int_vec(ConvertedWeights), _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Persistent, _converted_weights.total_size()); } - _aux_mem[FlattenedSrc] = MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size()); + _aux_mem[FlattenedSrc] = + MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size()); } -Status CpuFullyConnected::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, - const ITensorInfo *biases, const ITensorInfo *dst, FullyConnectedLayerInfo fc_info, WeightsInfo weights_info) +Status CpuFullyConnected::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + FullyConnectedLayerInfo fc_info, + WeightsInfo weights_info) { GEMMInfo gemm_info; gemm_info.set_activation_info(fc_info.activation_info); @@ -345,12 +373,17 @@ Status CpuFullyConnected::has_opt_impl(arm_compute::WeightFormat &expected_weigh return CpuGemm::has_opt_impl(expected_weight_format, src, weights, biases, dst, gemm_info); } -Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - FullyConnectedLayerInfo fc_info, const WeightsInfo &weights_info) +Status CpuFullyConnected::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + FullyConnectedLayerInfo fc_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); if (is_fixed_format_fast_math(weights_info.weight_format())) { @@ -364,15 +397,22 @@ Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *we } ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && 
is_data_type_quantized(src->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU - && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); + ARM_COMPUTE_RETURN_ERROR_ON( + fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; bool is_fc_after_conv = true; - const ITensorInfo &flatten_src = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src))); - const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); - const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone()); + const ITensorInfo &flatten_src = + TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src))); + const ITensorInfo &reshaped_weights = TensorInfo( + weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); + const ITensorInfo &converted_weights = weights_reshaped + ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) + : TensorInfo(*reshaped_weights.clone()); // With the Fully Connected layer we can have 4 different cases: // 1) Convolution layer -> Fully Connected layer without batches @@ -386,10 +426,10 @@ Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *we // Check if we have a fully connected layer with batches const bool is_batched_fc_layer = dst->dimension(1) > 1; - if(biases != nullptr) + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - if(is_data_type_quantized(src->data_type())) + if (is_data_type_quantized(src->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -399,36 +439,37 @@ Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *we } } - if(is_batched_fc_layer) + if (is_batched_fc_layer) { - is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), dst->tensor_shape().cbegin() + 1)); + is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), + dst->tensor_shape().cbegin() + 1)); } else { is_fc_after_conv = src->num_dimensions() > 1; } - if(!weights_reshaped) + if (!weights_reshaped) { // Validate reshape weights kernel ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuTransposeKernel::validate(weights, &reshaped_weights)); weights_to_use = &reshaped_weights; } - if(is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) + if (is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) { // Validate convert weights kernel - ARM_COMPUTE_RETURN_ON_ERROR(CpuConvertFullyConnectedWeights::validate(weights_to_use, - 
&converted_weights, - src->tensor_shape(), - fc_info.weights_trained_layout)); + ARM_COMPUTE_RETURN_ON_ERROR(CpuConvertFullyConnectedWeights::validate( + weights_to_use, &converted_weights, src->tensor_shape(), fc_info.weights_trained_layout)); weights_to_use = &converted_weights; } - if(is_fc_after_conv) + if (is_fc_after_conv) { // Fully Connected layer after a Convolution Layer without batches - ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); + ARM_COMPUTE_RETURN_ERROR_ON( + (weights_to_use->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); // Validate flatten kernel ARM_COMPUTE_RETURN_ON_ERROR(CpuFlatten::validate(src, &flatten_src)); @@ -440,7 +481,8 @@ Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *we ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != weights_to_use->dimension(1)); } // Validate matrix multiply kernel - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(src_to_use, weights_to_use, biases, dst, fc_info.activation_info, fc_info.enable_fast_math, weights_info.weight_format())); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(src_to_use, weights_to_use, biases, dst, fc_info.activation_info, + fc_info.enable_fast_math, weights_info.weight_format())); return Status{}; } @@ -460,21 +502,21 @@ void CpuFullyConnected::run(ITensorPack &tensors) CpuAuxTensorHandler transformed_wei(offset_int_vec(_trans_weights_idx), _trans_weights, tensors, false); // Linearize src if it comes from a convolutional layer - if(_is_fc_after_conv) + if (_is_fc_after_conv) { - ITensorPack flatten_pack{ { ACL_SRC, src }, { ACL_DST, flattened_src.get() } }; + ITensorPack flatten_pack{{ACL_SRC, src}, {ACL_DST, flattened_src.get()}}; _flatten->run(flatten_pack); } ITensorPack gemm_pack = tensors; gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? 
flattened_src.get() : src); - if(_needs_weights_reshape || _needs_weights_conversion) + if (_needs_weights_reshape || _needs_weights_conversion) { gemm_pack.add_const_tensor(ACL_SRC_1, transformed_wei.get()); } // Run matrix multiply - if(_is_quantized_asymmetric) + if (_is_quantized_asymmetric) { _mm_gemmlowp->run(gemm_pack); } @@ -486,7 +528,7 @@ void CpuFullyConnected::run(ITensorPack &tensors) void CpuFullyConnected::prepare(ITensorPack &tensors) { - if(!_is_prepared || _dynamic_weights) + if (!_is_prepared || _dynamic_weights) { #ifdef ARM_COMPUTE_ASSERTS_ENABLED ++_asrt_prepare_count; @@ -502,20 +544,21 @@ void CpuFullyConnected::prepare(ITensorPack &tensors) const ITensor *cur_weights = weights; // Reshape of the weights (happens only once) - if(_needs_weights_reshape) + if (_needs_weights_reshape) { // Run reshape weights kernel and mark weights as unused - ITensorPack transpose_pack{ { ACL_SRC, weights }, { ACL_DST, reshaped_weights.get() } }; - NEScheduler::get().schedule_op(_transpose_weights.get(), Window::DimY, _transpose_weights->window(), transpose_pack); + ITensorPack transpose_pack{{ACL_SRC, weights}, {ACL_DST, reshaped_weights.get()}}; + NEScheduler::get().schedule_op(_transpose_weights.get(), Window::DimY, _transpose_weights->window(), + transpose_pack); cur_weights->mark_as_unused(); cur_weights = reshaped_weights.get(); } // Convert weights if needed (happens only once) - if(_needs_weights_conversion) + if (_needs_weights_conversion) { - ITensorPack convert_pack{ { ACL_SRC, cur_weights }, { ACL_DST, converted_weights.get() } }; + ITensorPack convert_pack{{ACL_SRC, cur_weights}, {ACL_DST, converted_weights.get()}}; _convert_weights->run(convert_pack); cur_weights->mark_as_unused(); @@ -526,7 +569,7 @@ void CpuFullyConnected::prepare(ITensorPack &tensors) gemm_pack.add_const_tensor(ACL_SRC_1, cur_weights); // Prepare GEMM prepare and release unused weights - if(!_is_quantized_asymmetric) + if (!_is_quantized_asymmetric) { _mm_gemm->prepare(gemm_pack); } diff --git a/src/cpu/operators/CpuFullyConnected.h b/src/cpu/operators/CpuFullyConnected.h index 1e8c6478d09283ae863078cc3f0b083afcbef55c..b72f77e5c43269e2dc4ed071a55f711b0c8d3751 100644 --- a/src/cpu/operators/CpuFullyConnected.h +++ b/src/cpu/operators/CpuFullyConnected.h @@ -21,14 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_CPU_FULLY_CONNECTED_H -#define ARM_COMPUTE_CPU_FULLY_CONNECTED_H - -#include "src/cpu/ICpuOperator.h" +#ifndef ACL_SRC_CPU_OPERATORS_CPUFULLYCONNECTED_H +#define ACL_SRC_CPU_OPERATORS_CPUFULLYCONNECTED_H #include "arm_compute/core/TensorInfo.h" #include "arm_compute/function_info/FullyConnectedLayerInfo.h" +#include "src/cpu/ICpuOperator.h" + #include namespace arm_compute @@ -86,16 +86,24 @@ public: * @param[in] fc_info (Optional) Fully connected layer additional info * @param[in] weights_info (Optional) Stores neccessary compute information when weights are already reshaped */ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, - FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), const WeightsInfo &weights_info = WeightsInfo()); + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), + const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CpuFullyConnected * * Similar to @ref CpuFullyConnected::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), const WeightsInfo &weights_info = WeightsInfo()); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), + const WeightsInfo &weights_info = WeightsInfo()); /** Static function that queries whether there exists fixed-format kernel and if it exists it will return in the first argument in what format * weights are expected to be reshaped as defined by WeightFormat class. 
Apart from the first argument the rest of the arguments are the same @@ -103,31 +111,49 @@ public: * * @return a status */ - static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, - const ITensorInfo *biases, const ITensorInfo *dst, - FullyConnectedLayerInfo fc_info, WeightsInfo weights_info); + static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + FullyConnectedLayerInfo fc_info, + WeightsInfo weights_info); //Inherited methods override - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: - void configure_fc_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act); - void configure_conv_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act); - void configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act); + void configure_fc_fc(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act); + void configure_conv_fc(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act); + void configure_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act); enum AuxTensorIdx { AsmGemmWorkspace = 0, Pretranspose, - GemmTemp1, // Both CpuGemm and CpuGemmLowpMatrixMultiplyCore - GemmTemp2, // Both CpuGemm and CpuGemmLowpMatrixMultiplyCore - GemmTemp3, // Both CpuGemm and CpuGemmLowpMatrixMultiplyCore - GemmTemp4, // CpuGemmLowpMatrixMultiplyCore only - GemmTemp5, // CpuGemmLowpMatrixMultiplyCore only - GemmTemp6, // CpuGemmLowpMatrixMultiplyCore only - GemmTemp7, // CpuGemmLowpMatrixMultiplyCore only + GemmTemp1, + GemmTemp2, + GemmTemp3, + GemmTemp4, + GemmTemp5, + GemmTemp6, + GemmTemp7, + GemmTemp8, + // Slots above (0-9) reserved for either CpuGemm or CpuGemmLowpMatrixMultiplyCore TransposedWeights, ConvertedWeights, FlattenedSrc, @@ -165,4 +191,4 @@ private: }; } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_FULLY_CONNECTED_H */ +#endif // ACL_SRC_CPU_OPERATORS_CPUFULLYCONNECTED_H diff --git a/src/cpu/operators/CpuGemm.cpp b/src/cpu/operators/CpuGemm.cpp index 34b845928df58f7431676fcdee1ee3577b04b1d8..e035de01316633dfaaf9519b47f39e4143d05c81 100644 --- a/src/cpu/operators/CpuGemm.cpp +++ b/src/cpu/operators/CpuGemm.cpp @@ -24,9 +24,10 @@ #include "src/cpu/operators/CpuGemm.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -52,22 +53,32 @@ cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info) asm_info.fast_mode = info.fast_math(); asm_info.fixed_format = info.fixed_format(); asm_info.weight_format = 
info.weight_format(); + asm_info.transpose_b = + info.pretranspose_B(); // Note: the "pretranspose_B" flag here is not the same as the pretranspose_B_array method; it signals to pretranspose_B_array whether an additional transpose of B should be performed before that method runs return asm_info; } } // namespace -void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, float alpha, float beta, const GEMMInfo &gemm_info) +void CpuGemm::configure(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); ARM_COMPUTE_ERROR_THROW_ON(CpuGemm::validate(a, b, c, d, alpha, beta, gemm_info)); ARM_COMPUTE_LOG_PARAMS(a, b, c, d, alpha, beta, gemm_info); - const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); - const bool is_c_bias = beta == 1 && c != nullptr; - bool run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, (is_c_bias) ? c : nullptr, d, asm_info)) && - (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient. - !(!b->are_values_constant() && b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently. + const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); + const bool is_c_bias = beta == 1 && c != nullptr; + const bool run_optimised = + bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, (is_c_bias) ? c : nullptr, d, asm_info)) && + (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient. + !(!b->are_values_constant() && + b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently. // Check if we need to reshape the matrix B only on the first run _is_prepared = false; @@ -76,144 +87,215 @@ void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITenso _run_alpha_scale = alpha != 1.f; _run_bias_addition = is_c_bias; _run_addition = beta != 0 && beta != 1 && c != nullptr; - _run_activation = gemm_info.activation_info().enabled() && (!run_optimised || (run_optimised && !cpu::CpuGemmAssemblyDispatch::is_activation_supported(gemm_info.activation_info()))); + _run_activation = + gemm_info.activation_info().enabled() && + (!run_optimised || + (run_optimised && !cpu::CpuGemmAssemblyDispatch::is_activation_supported(gemm_info.activation_info()))); - if(run_optimised) + if (run_optimised) { + _run_interleave_transpose = false; const ITensorInfo *c_to_use = is_c_bias ?
c : nullptr; _asm_glue = std::make_unique(); _asm_glue->configure(a, b, c_to_use, d, asm_info); ARM_COMPUTE_ERROR_ON(!_asm_glue->is_configured()); - auto asm_mem_req = _asm_glue->workspace(); - _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace]; - _aux_mem[Pretraspose] = asm_mem_req[Pretraspose]; + const auto asm_mem_req = _asm_glue->workspace(); + for (unsigned int slot = 0; slot < asm_mem_req.size(); ++slot) + { + _aux_mem[slot] = asm_mem_req[slot]; + } // Scale product by alpha - if(_run_alpha_scale) + if (_run_alpha_scale) { _alpha_scale_func = std::make_unique(); - _alpha_scale_func->configure(d, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f)); + _alpha_scale_func->configure( + d, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f)); } } else { + _run_interleave_transpose = !_run_vector_matrix_multiplication; // Pick output tensor in case bias addition should be performed ITensorInfo *gemm_output_to_use = (_run_bias_addition) ? &_tmp_d : d; + // Pick b tensor in case pretranspose should be performed + const ITensorInfo *b_to_use = b; _mm_kernel = std::make_unique(); + // Configure rhs pretranspose + if (gemm_info.pretranspose_B()) + { + _pretranspose_b_func = std::make_unique(); + _pretranspose_b_func->configure(b_to_use, &_pretransposed_b); + MemoryLifetime lifetime; + if (_reshape_b_only_on_first_run) + { + if (_run_interleave_transpose) + { + // PreTransposedRHS tensor is only used in prepare(), but is then succeeded by Transposed1xWRHS + // So PreTransposedRHS can be freed inside prepare() + lifetime = MemoryLifetime::Prepare; + } + else + { + // PreTransposedRHS tensor is only used in prepare(), but is the final transformation of rhs + // So PreTransposedRHS needs to persist beyond prepare() + lifetime = MemoryLifetime::Persistent; + } + } + else + { + // PreTransposedRHS tensor is always used in run() and doesn't need to persist + lifetime = MemoryLifetime::Temporary; + } + _aux_mem[PreTransposedRHS] = + MemoryInfo(offset_int_vec(PreTransposedRHS), lifetime, _pretransposed_b.total_size()); + b_to_use = &_pretransposed_b; + } + // Select between GEMV and GEMM - if(_run_vector_matrix_multiplication) + if (_run_vector_matrix_multiplication) { // Configure the matrix multiply kernel - _mm_kernel->configure(a, b, gemm_output_to_use, alpha, false); + _mm_kernel->configure(a, b_to_use, gemm_output_to_use, alpha, false); } else { - const int m = a->dimension(1); - const int n = b->dimension(0); - const int k = a->dimension(0); - + ARM_COMPUTE_ERROR_ON(!_run_interleave_transpose); // Configure interleave kernel _interleave_kernel = std::make_unique(); _interleave_kernel->configure(a, &_tmp_a); - _aux_mem[InterleavedLHS] = MemoryInfo(offset_int_vec(InterleavedLHS), MemoryLifetime::Temporary, _tmp_a.total_size()); + _aux_mem[InterleavedLHS] = + MemoryInfo(offset_int_vec(InterleavedLHS), MemoryLifetime::Temporary, _tmp_a.total_size()); - // Configure transpose kernel - _transpose_kernel = std::make_unique(); - _transpose_kernel->configure(b, &_tmp_b); - _aux_mem[TransposedRHS] = MemoryInfo(offset_int_vec(TransposedRHS), MemoryLifetime::Persistent, _tmp_b.total_size()); + // Configure rhs transpose1xw kernel + _transpose1xW_b_kernel = std::make_unique(); + _transpose1xW_b_kernel->configure(b_to_use, &_tmp_b); + _aux_mem[Transposed1xWRHS] = + MemoryInfo(offset_int_vec(Transposed1xWRHS), MemoryLifetime::Persistent, _tmp_b.total_size()); + + // Use a and b here instead of _tmp_a and _tmp_b because 
CpuGemmMatrixMultiplyKernel requires the original m,n,k in case of interleaved a and transposed1xw b + const int m = a->dimension(1); + const int n = b_to_use->dimension(0); + const int k = a->dimension(0); // Configure matrix multiplication kernel - _mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, true, GEMMReshapeInfo(m, n, k)); + _mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, _run_interleave_transpose, + GEMMReshapeInfo(m, n, k)); } - if(_run_bias_addition) + if (_run_bias_addition) { _add_bias = std::make_unique(); _add_bias->configure(gemm_output_to_use, c, d, ConvertPolicy::SATURATE); - _aux_mem[TempResult] = MemoryInfo(offset_int_vec(TempResult), MemoryLifetime::Temporary, _tmp_d.total_size()); + _aux_mem[TempResult] = + MemoryInfo(offset_int_vec(TempResult), MemoryLifetime::Temporary, _tmp_d.total_size()); } } // Configure matrix addition kernel - if(_run_addition) + if (_run_addition) { _ma_kernel = std::make_unique(); _ma_kernel->configure(c, d, beta); } // Configure activation - if(_run_activation) + if (_run_activation) { _activation_func = std::make_unique(); _activation_func->configure(d, nullptr, gemm_info.activation_info()); } } -Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, float alpha, float beta, const GEMMInfo &gemm_info) +Status CpuGemm::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); const bool is_c_bias = beta == 1 && c != nullptr; const bool run_addition = c != nullptr && beta != 0 && beta != 1; + // Check if we should use the pretransposed_b or original b + // TODO: COMPMID-6597 + // Note that this check should only apply to the non-optimized path. The reason we brought this at the beginning + // instead of only for the fallback path is because of the checks performed below, between here and the run_optimised decision + // We should simplify this by + // 1. Moving the checks between "fix-start" and "fix-end" into their corresponding ops / kernels (e.g. the weights format checks can and should be moved into CpuGemmAssemblyDispatch) + // 2. Moving this b_to_use check back into the non-optimized path + TensorInfo pretransposed_b = b->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*b)); + const ITensorInfo *b_to_use = gemm_info.pretranspose_B() ? 
&pretransposed_b : b; + // TODO: COMPMID-6597 fix-start ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a); ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::BFLOAT16, DataType::F16, DataType::F32); - if(is_fixed_format_fast_math(gemm_info.weight_format())) + if (is_fixed_format_fast_math(gemm_info.weight_format())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(a, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b, DataType::BFLOAT16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b_to_use, DataType::BFLOAT16); } else { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b_to_use); } const int block_by = arm_compute::block_by(gemm_info.weight_format()); // test if im2col has changed the dimensions that are needed for padding - if(a->dimension(0) != b->dimension(1) && block_by > 1) + if (a->dimension(0) != b_to_use->dimension(1) && block_by > 1) { // have to verify bias const size_t dim0_sz = a->dimension(0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((dim0_sz % block_by) != 0, ("The matrix A number of columns must be a multiple of block_by=" + std::to_string(block_by)).c_str()); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (dim0_sz % block_by) != 0, + ("The matrix A number of columns must be a multiple of block_by=" + std::to_string(block_by)).c_str()); // a->dimension(0) = kernel_area * input_channel + kernel_area * input_pad_right - // b->dimension(1) = kernel_area * input_channel - // a->dimension(0) = b->dimension(1) + kernel_area * input_pad_right - const size_t input_pad_right = (dim0_sz - b->dimension(1)) % block_by; - const size_t kernel_area = (dim0_sz - b->dimension(1)) / input_pad_right; - ARM_COMPUTE_RETURN_ERROR_ON_MSG((dim0_sz - kernel_area * input_pad_right) != b->dimension(1), "The product AB is defined only if A number of columns and B number of rows are related"); + // b_to_use->dimension(1) = kernel_area * input_channel + // a->dimension(0) = b_to_use->dimension(1) + kernel_area * input_pad_right + const size_t input_pad_right = (dim0_sz - b_to_use->dimension(1)) % block_by; + const size_t kernel_area = (dim0_sz - b_to_use->dimension(1)) / input_pad_right; + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (dim0_sz - kernel_area * input_pad_right) != b_to_use->dimension(1), + "The product AB is defined only if A number of columns and B number of rows are related"); } else { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + a->dimension(0) != b_to_use->dimension(1), + "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); } ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported"); - if(a->data_type() != DataType::BFLOAT16) + if (a->data_type() != DataType::BFLOAT16) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, d); } - if(run_addition) + if (run_addition) { ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.depth_output_gemm3d() != 0); ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.reinterpret_input_as_3d()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(c, d); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1), "The C matrix must have the same number of rows as the matrix A"); - 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->dimension(0), "The C matrix must have the same number of columns as the matrix B"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1), + "The C matrix must have the same number of rows as the matrix A"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(b_to_use->dimension(0) != c->dimension(0), + "The C matrix must have the same number of columns as the matrix B"); } - if(d->total_size() != 0) + if (d->total_size() != 0) { // For fixed format we are expecting some kind of blocked format for B/RHS so the dimension won't necessarily match the result matrix any more. - ARM_COMPUTE_RETURN_ERROR_ON(!gemm_info.fixed_format() && b->dimension(0) != d->dimension(0)); - if(gemm_info.depth_output_gemm3d() != 0) + ARM_COMPUTE_RETURN_ERROR_ON(!gemm_info.fixed_format() && b_to_use->dimension(0) != d->dimension(0)); + if (gemm_info.depth_output_gemm3d() != 0) { - if(gemm_info.reinterpret_input_as_3d()) + if (gemm_info.reinterpret_input_as_3d()) { ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1)); ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != d->dimension(2)); @@ -228,74 +310,89 @@ Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITens ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1)); } } + // TODO: COMPMID-6597 fix-end // Check if we need to run the optimized assembly kernel - cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); - const bool run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, d, asm_info)) && - (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient. - !(!b->are_values_constant() && b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently. + cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); + + // Note we use b instead of b_to_use here because asm_info also captures the pretranspose_b() flag + // so we pass the original b to CpuGemmAssemblyDispatch + const bool run_optimised = + bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, d, asm_info)) && + (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient. + !(!b->are_values_constant() && + b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently. - if(!run_optimised) + if (!run_optimised) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "CpuGemm cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "CpuGemm cannot reinterpret the output tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), + "CpuGemm cannot reinterpret the input tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, + "CpuGemm cannot reinterpret the output tensor as 3D"); // Check if the first input tensor is a vector. 
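As an illustration of the block_by padding relationship validated a few lines above, here is a small compile-time sketch; the numbers are purely hypothetical and are not taken from this patch:

    // Hypothetical example: block_by = 4, 3x3 kernel, 3 input channels, 1 padded column per patch row.
    constexpr int block_by    = 4;
    constexpr int kernel_area = 9;                                // 3 x 3
    constexpr int in_c        = 3;
    constexpr int pad_right   = 1;                                // (in_c + pad_right) % block_by == 0
    constexpr int a_cols      = kernel_area * (in_c + pad_right); // 36, plays the role of a->dimension(0)
    constexpr int b_rows      = kernel_area * in_c;               // 27, plays the role of b_to_use->dimension(1)
    static_assert(a_cols % block_by == 0, "A columns are a multiple of block_by");
    static_assert((a_cols - b_rows) % block_by == pad_right, "recovers input_pad_right");
    static_assert((a_cols - b_rows) / pad_right == kernel_area, "recovers kernel_area");
    static_assert(a_cols - kernel_area * pad_right == b_rows, "A columns and B rows stay consistent");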
const bool run_vector_matrix_multiplication = a->dimension(1) < 2; // Check if we need to reshape the matrix A and matrix B - const bool run_interleave_transpose = !run_vector_matrix_multiplication && !b->are_values_constant(); + const bool run_interleave_transpose = !run_vector_matrix_multiplication; // Arguments used by GEMMReshapeInfo // If we pass the matrix A and matrix B reshaped to CpuGemmMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo // in order to know how the matrices have been reshaped const int m = a->dimension(1); - const int n = b->dimension(0); + const int n = b_to_use->dimension(0); const int k = a->dimension(0); int mult_transpose1xW_width = 1; int mult_interleave4x4_height = 1; - const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d()); + const GEMMReshapeInfo reshape_info = GEMMReshapeInfo( + m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d()); const ITensorInfo *matrix_a_info = a; - const ITensorInfo *matrix_b_info = b; + const ITensorInfo *matrix_b_info = b_to_use; TensorInfo tmp_a_info{}; TensorInfo tmp_b_info{}; TensorInfo tmp_output_info = *d->clone(); - if(run_interleave_transpose) + if (run_interleave_transpose) { matrix_a_info = &tmp_a_info; matrix_b_info = &tmp_b_info; // Validate interleave kernel - auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()))); + auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape( + *a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()))); ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmInterleave4x4Kernel::validate(a, &tmp_a_info)); // Validate transpose kernel - auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmTranspose1xWKernel::validate(b, &tmp_b_info)); + auto_init_if_empty(tmp_b_info, + b_to_use->clone()->set_tensor_shape( + compute_transpose1xW_with_element_size_shape(*b_to_use, mult_transpose1xW_width))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmTranspose1xWKernel::validate(b_to_use, &tmp_b_info)); } // Validate matrix multiply - auto_init_if_empty(tmp_output_info, matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info)); + auto_init_if_empty(tmp_output_info, + matrix_a_info->clone()->set_tensor_shape(compute_mm_shape( + *matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixMultiplyKernel::validate( + matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info)); - if(is_c_bias) + if (is_c_bias) { ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuAdd::validate(&tmp_output_info, c, d, ConvertPolicy::SATURATE)); } } // Validate matrix addition kernel - if(run_addition) + if (run_addition) { ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixAdditionKernel::validate(c, d, beta)); } // Validate activation const ActivationLayerInfo &activation = gemm_info.activation_info(); - 
if(activation.enabled()) + if (activation.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuActivation::validate(d, nullptr, activation)); } @@ -312,85 +409,123 @@ void CpuGemm::run(ITensorPack &tensors) auto c = tensors.get_const_tensor(ACL_SRC_2); auto d = tensors.get_tensor(ACL_DST); - if(_asm_glue && _asm_glue->is_configured()) + if (_asm_glue && _asm_glue->is_configured()) { // Pass c to asm dispatch only if it's the bias tensor ITensorPack asm_pack = tensors; asm_pack.add_const_tensor(ACL_SRC_2, _run_bias_addition ? c : nullptr); _asm_glue->run(asm_pack); - if(_run_alpha_scale) + if (_run_alpha_scale) { - ITensorPack pack{ { ACL_SRC, d }, { ACL_DST, d } }; + ITensorPack pack{{ACL_SRC, d}, {ACL_DST, d}}; _alpha_scale_func->run(pack); } } else { CpuAuxTensorHandler interleaved_a(offset_int_vec(InterleavedLHS), _tmp_a, tensors, true); - CpuAuxTensorHandler transposed_b(offset_int_vec(TransposedRHS), _tmp_b, tensors, true); + CpuAuxTensorHandler pretransposed_b(offset_int_vec(PreTransposedRHS), _pretransposed_b, tensors); + CpuAuxTensorHandler transposed1xw_b(offset_int_vec(Transposed1xWRHS), _tmp_b, tensors, true); CpuAuxTensorHandler temp_d(offset_int_vec(TempResult), _tmp_d, tensors, true); - ITensorPack mm_pack{ { ACL_SRC_0, a }, { ACL_SRC_1, b }, { ACL_DST, (_run_bias_addition) ? temp_d.get() : d } }; - if(!_run_vector_matrix_multiplication) + ITensorPack mm_pack{{ACL_SRC_0, a}, {ACL_SRC_1, b}, {ACL_DST, (_run_bias_addition) ? temp_d.get() : d}}; + + if (_run_interleave_transpose) { // Run interleave kernel - ITensorPack interleave_pack{ { ACL_SRC, a }, { ACL_DST, interleaved_a.get() } }; - NEScheduler::get().schedule_op(_interleave_kernel.get(), Window::DimY, _interleave_kernel->window(), interleave_pack); + ITensorPack interleave_pack{{ACL_SRC, a}, {ACL_DST, interleaved_a.get()}}; + NEScheduler::get().schedule_op(_interleave_kernel.get(), Window::DimY, _interleave_kernel->window(), + interleave_pack); + // Use reshaped matrices + mm_pack.add_const_tensor(ACL_SRC_0, interleaved_a.get()); + } - if(!_reshape_b_only_on_first_run) + const ITensor *b_to_use = b; + if (_pretranspose_b_func) + { + if (!_reshape_b_only_on_first_run) { - // Run transpose kernel - ITensorPack transpose_pack{ { ACL_SRC, b }, { ACL_DST, transposed_b.get() } }; - NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), transpose_pack); + // Run pretranspose kernel + ITensorPack pretranspose_pack{{ACL_SRC, b_to_use}, {ACL_DST, pretransposed_b.get()}}; + _pretranspose_b_func->run(pretranspose_pack); } - - // Use reshaped matrices - mm_pack.add_const_tensor(ACL_SRC_0, interleaved_a.get()); - mm_pack.add_const_tensor(ACL_SRC_1, transposed_b.get()); + b_to_use = pretransposed_b.get(); } + if (_run_interleave_transpose) + { + if (!_reshape_b_only_on_first_run) + { + // Run transpose1xw kernel + ITensorPack transpose_pack{{ACL_SRC, b_to_use}, {ACL_DST, transposed1xw_b.get()}}; + NEScheduler::get().schedule_op(_transpose1xW_b_kernel.get(), Window::DimY, + _transpose1xW_b_kernel->window(), transpose_pack); + } + b_to_use = transposed1xw_b.get(); + } + // Use reshaped matrices + mm_pack.add_const_tensor(ACL_SRC_1, b_to_use); - NEScheduler::get().schedule_op(_mm_kernel.get(), _run_vector_matrix_multiplication ? Window::DimX : Window::DimY, _mm_kernel->window(), mm_pack); + NEScheduler::get().schedule_op(_mm_kernel.get(), + _run_vector_matrix_multiplication ? 
Window::DimX : Window::DimY, + _mm_kernel->window(), mm_pack); // Run bias addition kernel - if(_run_bias_addition) + if (_run_bias_addition) { - ITensorPack pack{ { ACL_SRC_0, temp_d.get() }, { ACL_SRC_1, c }, { ACL_DST, d } }; + ITensorPack pack{{ACL_SRC_0, temp_d.get()}, {ACL_SRC_1, c}, {ACL_DST, d}}; _add_bias->run(pack); } } // Run matrix addition kernel - if(_run_addition) + if (_run_addition) { - ITensorPack c_add_pack{ { ACL_SRC, c }, { ACL_DST, d } }; + ITensorPack c_add_pack{{ACL_SRC, c}, {ACL_DST, d}}; NEScheduler::get().schedule_op(_ma_kernel.get(), Window::DimY, _ma_kernel->window(), c_add_pack); } // Run activation function - if(_run_activation) + if (_run_activation) { - ITensorPack pack{ { ACL_SRC, d }, { ACL_DST, d } }; + ITensorPack pack{{ACL_SRC, d}, {ACL_DST, d}}; _activation_func->run(pack); } } void CpuGemm::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { - if(_asm_glue && _asm_glue->is_configured()) + if (_asm_glue && _asm_glue->is_configured()) { _asm_glue->prepare(tensors); } - else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication) + else if (_reshape_b_only_on_first_run) { - const ITensor *b = tensors.get_const_tensor(ACL_SRC_1); - ITensor *b_aux = utils::cast::polymorphic_cast(tensors.get_tensor(offset_int_vec(TransposedRHS))); - ARM_COMPUTE_ERROR_ON_NULLPTR(b, b_aux); - - CpuAuxTensorHandler transposed_b(_tmp_b, *b_aux); - ITensorPack transpose_pack{ { ACL_SRC, b }, { ACL_DST, transposed_b.get() } }; - NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), transpose_pack); + const ITensor *b = tensors.get_const_tensor(ACL_SRC_1); + const ITensor *b_to_use = b; + CpuAuxTensorHandler pretransposed_b( + offset_int_vec(PreTransposedRHS), _pretransposed_b, tensors, + false /*pack_inject: no need to inject into tensors*/, + _pretranspose_b_func == + nullptr /*bypass_alloc: no need to allocate if _pretranspose_b_func is not run*/); + CpuAuxTensorHandler transposed1xw_b(offset_int_vec(Transposed1xWRHS), _tmp_b, tensors, + false /*pack_inject*/, !_run_interleave_transpose /*bypass_alloc*/); + + if (_pretranspose_b_func) + { + // Run pretranspose kernel + ITensorPack pretranspose_pack{{ACL_SRC, b_to_use}, {ACL_DST, pretransposed_b.get()}}; + _pretranspose_b_func->run(pretranspose_pack); + b_to_use = pretransposed_b.get(); + } + if (_run_interleave_transpose) + { + // Run transpose kernel + ITensorPack transpose_pack{{ACL_SRC, b_to_use}, {ACL_DST, transposed1xw_b.get()}}; + NEScheduler::get().schedule_op(_transpose1xW_b_kernel.get(), Window::DimY, + _transpose1xW_b_kernel->window(), transpose_pack); + } } _is_prepared = true; } @@ -401,8 +536,12 @@ experimental::MemoryRequirements CpuGemm::workspace() const return _aux_mem; } -Status CpuGemm::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, - const GEMMInfo &gemm_info) +Status CpuGemm::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + const GEMMInfo &gemm_info) { const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); diff --git a/src/cpu/operators/CpuGemm.h b/src/cpu/operators/CpuGemm.h index 9b08e5d0f69c3f484ad6089cc373a7664f9001da..a05258d20631b4381406f6b66c8e4f48243a02c5 100644 --- a/src/cpu/operators/CpuGemm.h +++ b/src/cpu/operators/CpuGemm.h @@ -21,21 +21,22 @@ * OUT OF OR IN CONNECTION WITH THE 
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CPU_GEMM_H -#define ARM_COMPUTE_CPU_GEMM_H - -#include "src/cpu/ICpuOperator.h" +#ifndef ACL_SRC_CPU_OPERATORS_CPUGEMM_H +#define ACL_SRC_CPU_OPERATORS_CPUGEMM_H #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/function_info/GEMMInfo.h" + +#include "src/cpu/ICpuOperator.h" #include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h" #include "src/cpu/kernels/CpuGemmMatrixAdditionKernel.h" #include "src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h" #include "src/cpu/kernels/CpuGemmTranspose1xWKernel.h" #include "src/cpu/operators/CpuActivation.h" #include "src/cpu/operators/CpuAdd.h" +#include "src/cpu/operators/CpuTranspose.h" #include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h" #include @@ -93,16 +94,26 @@ public: * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and * if the reshape of matrix B should happen only for the first run */ - void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, - float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo()); + void configure(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + float alpha, + float beta, + const GEMMInfo &gemm_info = GEMMInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CpuGemm. * * Similar to @ref CpuGemm::configure() * * @return a status */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, - float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo()); + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + float alpha, + float beta, + const GEMMInfo &gemm_info = GEMMInfo()); /** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters. * @@ -111,12 +122,16 @@ public: * the value of arm_compute::WeightFormat need to be passed via the * parameter gemm_info. */ - static Status has_opt_impl(arm_compute::WeightFormat &weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, - const GEMMInfo &gemm_info = GEMMInfo()); + static Status has_opt_impl(arm_compute::WeightFormat &weight_format, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + const GEMMInfo &gemm_info = GEMMInfo()); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; /** Indicates if the convolution executes in variable weights mode. 
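For orientation, a minimal sketch of how the reworked CpuGemm interface above is typically driven. This is illustrative only: the helper name and tensor shapes are invented, and the auxiliary-workspace handling that workspace() reports is omitted for brevity.

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/ITensor.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "src/cpu/operators/CpuGemm.h"

    using namespace arm_compute;

    // Hypothetical helper: computes d = a * b for F32 tensors of illustrative shapes.
    void run_cpu_gemm_example(ITensor *a_tensor, ITensor *b_tensor, ITensor *d_tensor)
    {
        const TensorInfo a_info(TensorShape(32U, 64U), 1, DataType::F32); // K = 32, M = 64
        const TensorInfo b_info(TensorShape(16U, 32U), 1, DataType::F32); // N = 16, K = 32
        TensorInfo       d_info(TensorShape(16U, 64U), 1, DataType::F32); // N = 16, M = 64

        const GEMMInfo gemm_info{}; // default GEMMInfo; see init_assembly_metadata above for how its flags are consumed

        // Validate statically, then configure once and run per invocation.
        ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuGemm::validate(&a_info, &b_info, nullptr, &d_info, 1.f, 0.f, gemm_info));
        cpu::CpuGemm gemm;
        gemm.configure(&a_info, &b_info, nullptr, &d_info, 1.f, 0.f, gemm_info);

        // prepare() performs the one-off B reshape/pretranspose when B is constant; run() executes the GEMM.
        ITensorPack pack{{ACL_SRC_0, a_tensor}, {ACL_SRC_1, b_tensor}, {ACL_DST, d_tensor}};
        gemm.prepare(pack);
        gemm.run(pack);
    }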
@@ -130,37 +145,41 @@ public: private: enum AuxTensorIdx { - AsmGemmWorkspace = 0, - Pretraspose, - InterleavedLHS, - TransposedRHS, + /* Slots 0 - 2 reserved for CpuGemmAssemblyDispatch */ + InterleavedLHS = 3, + PreTransposedRHS, + Transposed1xWRHS, TempResult, Count }; - std::unique_ptr _interleave_kernel{ nullptr }; - std::unique_ptr _transpose_kernel{ nullptr }; - std::unique_ptr _mm_kernel{ nullptr }; - std::unique_ptr _asm_glue{ nullptr }; - std::unique_ptr _ma_kernel{ nullptr }; - std::unique_ptr _alpha_scale_func{ nullptr }; - std::unique_ptr _add_bias{ nullptr }; - std::unique_ptr _activation_func{ nullptr }; + std::unique_ptr _interleave_kernel{nullptr}; + std::unique_ptr _pretranspose_b_func{nullptr}; + std::unique_ptr _transpose1xW_b_kernel{nullptr}; + std::unique_ptr _mm_kernel{nullptr}; + std::unique_ptr _asm_glue{nullptr}; + std::unique_ptr _ma_kernel{nullptr}; + std::unique_ptr _alpha_scale_func{nullptr}; + std::unique_ptr _add_bias{nullptr}; + std::unique_ptr _activation_func{nullptr}; TensorInfo _tmp_a{}; + TensorInfo _pretransposed_b{}; TensorInfo _tmp_b{}; TensorInfo _tmp_d{}; - bool _run_vector_matrix_multiplication{ false }; - bool _run_alpha_scale{ false }; - bool _run_addition{ false }; - bool _run_bias_addition{ false }; - bool _run_activation{ false }; - bool _reshape_b_only_on_first_run{ false }; - bool _is_prepared{ false }; + bool _run_vector_matrix_multiplication{false}; + bool _run_interleave_transpose{ + true}; /**< If we run CpuGemmInterleave4x4Kernel on lhs and CpuGemmTranspose1xWKernel on rhs */ + bool _run_alpha_scale{false}; + bool _run_addition{false}; + bool _run_bias_addition{false}; + bool _run_activation{false}; + bool _reshape_b_only_on_first_run{false}; + bool _is_prepared{false}; - experimental::MemoryRequirements _aux_mem{ Count }; + experimental::MemoryRequirements _aux_mem{Count}; }; } // namespace cpu } // namespace arm_compute -#endif /*ARM_COMPUTE_CPU_GEMM_H */ +#endif // ACL_SRC_CPU_OPERATORS_CPUGEMM_H diff --git a/src/cpu/operators/CpuGemmConv2d.cpp b/src/cpu/operators/CpuGemmConv2d.cpp index 7c0e58b94e9fb7a46915b440f98d99272acc5f87..31c873c2bac1a92146173cbb9b4d0ab49e7b5f85 100644 --- a/src/cpu/operators/CpuGemmConv2d.cpp +++ b/src/cpu/operators/CpuGemmConv2d.cpp @@ -26,20 +26,22 @@ #include "arm_compute/core/Size2D.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/MemoryHelpers.h" +#include "src/core/helpers/Utils.h" #include "src/cpu/kernels/CpuCol2ImKernel.h" #include "src/cpu/kernels/CpuIm2ColKernel.h" -#include "src/cpu/kernels/CpuReshapeKernel.h" #include "src/cpu/kernels/CpuWeightsReshapeKernel.h" #include "src/cpu/operators/CpuGemm.h" #include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h" #include "src/cpu/operators/CpuGemmLowpOutputStage.h" +#include "src/cpu/operators/CpuReshape.h" #include "src/cpu/utils/CpuAuxTensorHandler.h" #include @@ -52,8 +54,122 @@ namespace arm_compute { namespace cpu { -CpuGemmConv2d::SkipInfo CpuGemmConv2d::skip_im_col_info(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info, - const Size2D &dilation, const ActivationLayerInfo &act_info) + +/** @section 
note_CpuGemmConv2d_weight_transformation Weight Transformations in CpuGemmConv2d + * + * A. Terminology + * Throughout CpuGemmConv2d, we use the following terms in ways that may differ from other operators / kernels: + * - "Transform" or "Reshape" of the weights: they both mean all the operations that we perform on the weight + * tensor up until they are consumed by gemm (CpuGemm or CpuGemmLowpMatrixMultiplyCore) + * Note that the specific gemm operator may perform further transformations on the weights, but the + * transformations here only mean those performed in CpuGemmConv2d + * - "Transpose" of weights: The @ref CpuTranspose operation. I.e. transpose of the weights' lowest two + * dimensions + * + * B. Gemm-based conv2d + * We want to convert the 2d convolution op (ignoring bias): + * dst = conv2d(src, weight) + * into a matrix multiplication op: + * gemm_dst = gemm(lhs, rhs) + * + * E.g.: For data layout NHWC + * 3 (hi) <----------> (lo) 0 + * src.shape = [batch, in_h , in_w, in_c] + * weight.shape = [out_c, k_h , k_w, in_c] + * dst.shape = [batch, out_h, out_w, out_c] + * + * This requires three transformations: + * * src -> lhs, transform conv input to gemm lhs; gemm_lhs is a 2d matrix where each row (or column, + * depending on the convention) is a linearized "patch" of the conv_input that corresponds to + * the receptive field of the corresponding output element. + * The convention is to use "column", but to disambiguate from the column vector of a matrix, + * in this documentation we shall use "patch". + * This transform is called im2col (for details see @ref CpuIm2ColKernel) + * * weight -> rhs, transform conv weight to gemm rhs, known as weight transform/reshape (wt) + * * gemm_dst -> dst, transform gemm output back to conv output, known as col2im (for details see + * @ref CpuCol2ImKernel) + * + * This section focuses on the weight transformation and assumes the im2col is already performed + * + * C. Weight Transformation + * After im2col, assume: lhs.shape = [num_patch, patch_size], + * where patch_size is the number of elements in a "patch": patch_size = k_h * k_w * in_c + * num_patch is the number of patches; we can ignore it here (for details see @ref CpuIm2ColKernel) + * + * After wt, rhs should have the shape: rhs = [patch_size, out_c] + * + * Therefore, the weight transformation consists of two steps: + * 1. Collapsing all 3 spatial dimensions: [out_c, k_h, k_w, in_c] -> [out_c, patch_size] + * 2. Transpose the collapsed shape: [out_c, patch_size] -> [patch_size, out_c] + * + * D. Implementation + * There are 4 paths for weight transformation + * + * 1. Path 1: Fixed weight format - no transformation + * The underlying gemm kernel may adopt fixed weight format (isVarWeightsKernel() == true), which requires + * that no weight transformation shall be performed + * Note that this no-transform requirement applies both to this op (CpuGemmConv2d) and the constituent ops, up + * until the fixed format kernels themselves + * + * 2. Path 2: Reinterpret then transpose later + * If the weight tensor has no "holes" (see @ref has_holes), there are two optimizations we can apply: + * - We can ignore the first step (collapsing of spatial dimensions) by simply re-interpreting the shape + * in TensorInfo + * - Instead of performing transpose here, we can pass the transpose flag to the underlying gemm. The gemm + * may then decide to fuse the transpose with any further transformations + * + * 3. 
Path 3: Reshape then transpose later + * If the weight tensor has holes, then we use a dedicated @ref CpuReshape, followed by transpose later + * + * 4. Path 4: Fused reshape and transpose + * This is only for quantized types for now (TODO: Remove (COMPMID-6596)). We fall back to a legacy + * non-optimized kernel @ref CpuWeightsReshapeKernel to perform a fused reshape + transpose + * + * Path 1 is the long term solution that we shall migrate to once (if) we adopt fixed weight format for all gemm + * kernels. + * In the short term, Path 2 is the favored, more performant path. + */ + +namespace +{ +/** Initialize reshaped / transformed weight info + * + * @param[in] weights Input weights + * @param[out] reshaped_weights Transformed weights + */ +void initialize_reshaped_weight_info(const ITensorInfo &weights, ITensorInfo &reshaped_weights) +{ + auto_init_if_empty(reshaped_weights, weights); + if (is_data_type_quantized(weights.data_type())) + { + // WT method: FusedReshapeAndTranspose + reshaped_weights.set_tensor_shape(compute_weights_reshaped_shape(weights, /* has_bias */ false)); + } + else + { + TensorShape collapsed_weights = weights.tensor_shape(); + collapsed_weights.collapse(3); + reshaped_weights.set_tensor_shape(collapsed_weights); + } +} +} // namespace + +CpuGemmConv2d::WeightTransformMethod CpuGemmConv2d::get_wt_method(const ITensorInfo &weights) +{ + // TODO: Extend ReinterpretThenTranspose support for quantized data types COMPMID-6596 + if (is_data_type_quantized(weights.data_type())) + { + return WeightTransformMethod::FusedReshapeAndTranspose; + } + return has_holes(weights) ? WeightTransformMethod::ReshapeThenTranspose + : WeightTransformMethod::ReinterpretThenTranspose; +} + +CpuGemmConv2d::SkipInfo CpuGemmConv2d::skip_im_col_info(const ITensorInfo *src, + const ITensorInfo *weights, + const PadStrideInfo &conv_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info) { const DataLayout data_layout = src->data_layout(); const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); @@ -62,63 +178,83 @@ CpuGemmConv2d::SkipInfo CpuGemmConv2d::skip_im_col_info(const ITensorInfo *src, const unsigned int kernel_height = weights->dimension(idx_height); unsigned int conv_w = 0; unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), - src->dimension(idx_height), - kernel_width, - kernel_height, - conv_info, - dilation); - const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1); - - if(skip_im2col) + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv_info, dilation); + const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && + conv_info.stride().first == 1 && conv_info.stride().second == 1); + + if (skip_im2col) { - const bool skip_col2im = (data_layout == DataLayout::NHWC && (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ true)))); - if(skip_col2im) + const bool skip_col2im = + (data_layout == DataLayout::NHWC && + (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ true)))); + if (skip_col2im) { - return { true, true }; + return {true, true}; } } else { - const bool skip_col2im = (data_layout == DataLayout::NHWC && (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, 
/*skip_im2col*/ false)))); - if(skip_col2im) + const bool skip_col2im = + (data_layout == DataLayout::NHWC && + (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ false)))); + if (skip_col2im) { - return { false, true }; + return {false, true}; } } // Default case when we cannot reinterpret the input and output as 3D. - return { false, false }; + return {false, false}; } CpuGemmConv2d::CpuGemmConv2d() - : _weights_reshape_kernel(nullptr), _im2col_kernel(), _mm_gemm(), _mm_gemmlowp(), _col2im_kernel(), _reshape_kernel(), _im2col_output(), _weights_reshaped(), _gemm_output(), _gemm_output_3d(), - _data_layout(DataLayout::NCHW), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _is_prepared(false), _aux_mem(AuxTensorIdx::Count) + : _weights_reshape(nullptr), + _weights_reshape_and_transpose_kernel(nullptr), + _im2col_kernel(), + _mm_gemm(), + _mm_gemmlowp(), + _col2im_kernel(), + _reshape(), + _im2col_output(), + _weights_reshaped(), + _gemm_output(), + _gemm_output_3d(), + _data_layout(DataLayout::NCHW), + _skip_im2col(false), + _skip_col2im(false), + _is_quantized(false), + _is_prepared(false), + _wt_method(WeightTransformMethod::ReshapeThenTranspose), + _run_wt(true), + _aux_mem(AuxTensorIdx::Count) { } CpuGemmConv2d::~CpuGemmConv2d() = default; -void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act_info, - bool enable_fast_math, int gemm_3d_depth, bool fixed_format, arm_compute::WeightFormat weight_format) +void CpuGemmConv2d::configure_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + int gemm_3d_depth, + bool fixed_format, + arm_compute::WeightFormat weight_format) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights); - ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, act_info, enable_fast_math, gemm_3d_depth, _skip_im2col, fixed_format, weight_format)); - - // Create GEMMInfo structure - const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, - gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, - false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, experimental::PostOpList(), fixed_format, weight_format); + ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, act_info, enable_fast_math, gemm_3d_depth, + _skip_im2col, fixed_format, weight_format)); // Supported activations in GEMM - const std::set supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; + const std::set supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; - if(_is_quantized) + if (_is_quantized) { - TensorInfo tmp_src{ *src }; - TensorInfo tmp_weights{ *weights }; + TensorInfo tmp_src{*src}; + TensorInfo tmp_weights{*weights}; // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset const QuantizationInfo iqinfo = src->quantization_info(); @@ -129,7 +265,7 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig const DataType data_type = src->data_type(); 
tmp_src.set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset)); - if(!is_data_type_quantized_per_channel(tmp_weights.data_type())) + if (!is_data_type_quantized_per_channel(tmp_weights.data_type())) { const UniformQuantizationInfo uwqinfo = wqinfo.uniform(); tmp_weights.set_quantization_info(QuantizationInfo(uwqinfo.scale, -uwqinfo.offset)); @@ -142,7 +278,7 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig int32_t min_activation = type_min.get(); int32_t max_activation = type_max.get(); - if(supported_acts.count(act_info.activation()) != 0) + if (supported_acts.count(act_info.activation()) != 0) { std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo); } @@ -156,41 +292,53 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info); _mm_gemmlowp = std::make_unique(); - _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, enable_fast_math, false, act_info, - experimental::PostOpList(), fixed_format, weight_format)); + _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, + GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, + enable_fast_math, false, act_info, fixed_format, weight_format, + false /* pretranspose_B. TODO: COMPMID-6596 */)); auto mm_mem_req = _mm_gemmlowp->workspace(); - for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) + for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) { _aux_mem[cont] = mm_mem_req[cont]; } } else { + // Create GEMMInfo structure + const GEMMInfo &gemm_info = + GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth, + _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, + GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format, + true /*pretranspose_B. 
For fp gemm (wt path 1 - 3), We always pretranspose B (for wt path 1 this + flag is ignored)*/); // Configure matrix multiply function _mm_gemm = std::make_unique(); _mm_gemm->configure(src, weights, biases, dst, 1.0f, 1.0f, gemm_info); auto mm_mem_req = _mm_gemm->workspace(); - for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) + for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) { _aux_mem[cont] = mm_mem_req[cont]; } } } -Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const ActivationLayerInfo &act_info, bool enable_fast_math, int gemm_3d_depth, bool skip_im2col, bool fixed_format, arm_compute::WeightFormat weight_format) +Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + int gemm_3d_depth, + bool skip_im2col, + bool fixed_format, + arm_compute::WeightFormat weight_format) { const DataType data_type = src->data_type(); const bool is_quantized = is_data_type_quantized_asymmetric(data_type); const bool is_activation_enabled = act_info.enabled(); - // Create GEMMInfo structure - const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, - gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, - false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, experimental::PostOpList(), fixed_format, weight_format); - - if(is_quantized) + if (is_quantized) { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset @@ -206,11 +354,10 @@ Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *wei int32_t min_activation = type_min.get(); int32_t max_activation = type_max.get(); - const std::set supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; - if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0) + const std::set supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; + if (is_activation_enabled && supported_acts.count(act_info.activation()) != 0) { std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo); } @@ -229,46 +376,64 @@ Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *wei input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset)); weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset)); - return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info, false, enable_fast_math, - false, act_info)); + return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst, + GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, + output_info, false, enable_fast_math, false, act_info, + false /* pretranspose_B. 
TODO: COMPMID-6596 */)); } else { + // Create GEMMInfo structure + const GEMMInfo gemm_info = + GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth, + skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, + GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format, + true /*pretranspose_B. For fp gemm (wt path 1 - 3), We always pretranspose B (for wt path 1 this + flag is ignored)*/); + // Perform validation step on Matrix multiply function return CpuGemm::validate(src, weights, biases, dst, 1.0f, 1.0f, gemm_info); } } -Status CpuGemmConv2d::validate_gemm3d(const ITensorInfo *input_info, const ITensorInfo *weights_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col) +Status CpuGemmConv2d::validate_gemm3d(const ITensorInfo *input_info, + const ITensorInfo *weights_info, + const ActivationLayerInfo &act_info, + int gemm_3d_depth, + bool skip_im2col) { const DataType data_type = input_info->data_type(); const unsigned int mult_y = skip_im2col ? 1U : gemm_3d_depth; const unsigned int mult_z = skip_im2col ? gemm_3d_depth : 1U; // Set dummy tensor shapes for the validation - const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type, input_info->quantization_info()); + const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type, + input_info->quantization_info()); const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type, weights_info->quantization_info()); - const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type, input_info->quantization_info()); + const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type, + input_info->quantization_info()); - return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, false, gemm_3d_depth, skip_im2col); + return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, false, + gemm_3d_depth, skip_im2col); } -void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +void CpuGemmConv2d::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_UNUSED(num_groups, weights_info); - ARM_COMPUTE_ERROR_THROW_ON(CpuGemmConv2d::validate(src, - weights, - biases, - dst, - conv_info, - weights_info, - dilation, - act_info, - enable_fast_math, - num_groups)); - ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + ARM_COMPUTE_ERROR_THROW_ON(CpuGemmConv2d::validate(src, weights, biases, dst, conv_info, weights_info, dilation, + act_info, enable_fast_math, num_groups)); + ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, weights_info, dilation, act_info, enable_fast_math, + num_groups); const DataType data_type = src->data_type(); const DataLayout data_layout = src->data_layout(); @@ -283,7 +448,8 @@ void CpuGemmConv2d::configure(const 
ITensorInfo *src, const ITensorInfo *weights _is_prepared = weights_info.retain_internal_weights(); _is_quantized = is_data_type_quantized_asymmetric(src->data_type()); _data_layout = data_layout; - _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1); + _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && + conv_info.stride().first == 1 && conv_info.stride().second == 1); const ITensorInfo *gemm_input_to_use = src; ITensorInfo *gemm_output_to_use = dst; @@ -291,54 +457,50 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights // Get convolved dimensions unsigned int conv_w = 0; unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), - src->dimension(idx_height), - kernel_width, - kernel_height, - conv_info, - dilation); + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv_info, dilation); ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h), "Output shape does not match the expected one"); // Check if GEMM3D is supported - const CpuGemmConv2d::SkipInfo skip_info = CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info); - _skip_im2col = skip_info.skip_im2col; - _skip_col2im = skip_info.skip_col2im; + const CpuGemmConv2d::SkipInfo skip_info = + CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info); + _skip_im2col = skip_info.skip_im2col; + _skip_col2im = skip_info.skip_col2im; // Get parameters from conv_info unsigned int stride_x = 0; unsigned int stride_y = 0; std::tie(stride_x, stride_y) = conv_info.stride(); - unsigned int mat_weights_cols = weights->dimension(idx_kernels); - - // _weights_reshaped will be auto configured in the kernel. - // Just append biases and do not transpose 1xW as it will be reshaped in CpuGemm - _weights_reshape_kernel = std::make_unique(); - _weights_reshape_kernel->configure(weights, nullptr, &_weights_reshaped); - _weights_reshaped.set_quantization_info(weights->quantization_info()); + // Initialize reshaped weights + initialize_reshaped_weight_info(*weights, _weights_reshaped); // Create tensor to store im2col reshaped inputs - if(!_skip_im2col) + if (!_skip_im2col) { const int block_by = arm_compute::block_by(weights_info.weight_format()); unsigned int input_pad_right = 0; - if(block_by > 1) + if (block_by > 1) { - input_pad_right = (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by); + input_pad_right = + (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by); } // Configure _im2col_kernel = std::make_unique(); - _im2col_kernel->configure(src, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation, num_groups, input_pad_right); + _im2col_kernel->configure(src, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation, + num_groups, input_pad_right); // Update GEMM input gemm_input_to_use = &_im2col_output; } + const unsigned int mat_weights_cols = weights->dimension(idx_kernels); + // Create temporary GEMM output tensor in case we cannot skip col2im const DataType output_data_type = data_type == DataType::BFLOAT16 ? 
DataType::F32 : data_type; - if(!_skip_col2im) + if (!_skip_col2im) { TensorShape shape_gemm; @@ -368,9 +530,39 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights // In case we need to skip col2im, GEMM3D (gemm_3d_depth != 0) must be called in order to avoid reshaping the output matrix const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0; const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED; - configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, act_info, enable_fast_math, gemm_3d_depth, fixed_format, weights_info.weight_format()); - - if(!_skip_col2im && _data_layout == DataLayout::NCHW) + /** @section note_CpuGemmConv2d_weight_use_in_configure Which weights tensor should we use to configure gemm + * + * A. The problem: + * In principle, we should use the weights tensor corresponding to the weights transformation path. I.e.: + * - If no weight transformation (_run_wt == false): Use original weights + * - else: Use transformed weights + * However in practice we have a dilemma: + * - We need to know _run_wt before we can configure gemm with the corresponding weights, but + * - _run_wt depends on isVarWeightsKernel(), which is only known after gemm is configured + * + * B. The decision: + * To simplify the matter, we decide to always use the transformed weights, regardless of _run_wt + * + * This decision requires the following conditions: + * 1. The underlying gemm where isVarWeightsKernel() == true, must guarantee that: + * A. Ignore the flag to transpose weights (GEMMInfo::pretranspose_B) + * B. Use weights/B tensor passed to it at prepare() or run() instead of that passed at configure() + * 2. CpuGemmConv2d where isVarWeightsKernel() == true, must guarantee that: + * A. Pass original weights instead of reshaped or reinterpreted weights + * + * C. Future actions: + * Condition 2 is a given, based on our implementation. + * If condition 1 cannot hold, we must make changes to the underlying gemm to: + * 1. Either expose isVarWeightsKernel() before gemm is configured somehow, or + * 2. Take in an additional "original_weights" tensor info at configure + */ + configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, act_info, enable_fast_math, + gemm_3d_depth, fixed_format, weights_info.weight_format()); + + // Can only decide isVarWeightsKernel after gemm is configured + _run_wt = !isVarWeightsKernel(); + + if (!_skip_col2im && _data_layout == DataLayout::NCHW) { // Configure col2im _col2im_kernel = std::make_unique(); @@ -379,25 +571,44 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights else { // Configure reshape layer - _reshape_kernel = std::make_unique(); - _reshape_kernel->configure(gemm_output_to_use, dst); + _reshape = std::make_unique(); + _reshape->configure(gemm_output_to_use, dst); } - // Check if GEMM transforms weights - // Modernise through COMPMID-4535 - bool gemm_trans_wei = _aux_mem[1].size > 0; // Asm Pretranspose - gemm_trans_wei = _mm_gemm != nullptr ? _aux_mem[3].size > 0 : gemm_trans_wei; // Tranpose RHS - gemm_trans_wei = _mm_gemmlowp != nullptr ? _aux_mem[5].size > 0 : gemm_trans_wei; // Transpose RHS - // Check lifetime - _aux_mem[Im2ColOutput] = MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size()); - _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), gemm_trans_wei ? 
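/* A minimal sketch of the contract described in the note above, using only the members and
   helpers already present in this file:

     // configure():
     initialize_reshaped_weight_info(*weights, _weights_reshaped);              // describe the transformed weights
     configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, ...); // always configure gemm with them
     _run_wt = !isVarWeightsKernel();                                            // the real path is known only now

     // prepare()/run():
     //   _run_wt == true  -> ACL_SRC_1 is swapped for the reshaped/reinterpreted weights
     //   _run_wt == false -> ACL_SRC_1 keeps the original weights; the fixed-format gemm ignores
     //                       GEMMInfo::pretranspose_B and consumes the weights passed at prepare()/run()
*/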
MemoryLifetime::Prepare : MemoryLifetime::Persistent, _weights_reshaped.total_size()); - _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size()); + _aux_mem[Im2ColOutput] = + MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size()); + // Add WeightsReshaped memory requirement to workspace + // Note that in case of WeightTransformMethod::ReinterpretThenTranspose, we do not need to allocate this memory + // However since we cannot determine weight transformation method until prepare (see prepare()), we will have to + // settle with allocating more + if (_run_wt) + { + // Check if GEMM transforms weights + // If weight is further transformed by underlying gemm after ReshapeThenTranspose then we can free + // WeightsReshaped in prepare + // Otherwise WeightsReshaped is the final transformation of weights and needs to persist + bool gemm_trans_wei = _aux_mem[GemmAsmPretransposedRHS].size > 0; + gemm_trans_wei = _mm_gemm != nullptr ? _aux_mem[GemmTransposed1xWRHS].size > 0 : gemm_trans_wei; + gemm_trans_wei = _mm_gemmlowp != nullptr ? _aux_mem[GemmLowpTransposed1xWRHS].size > 0 : gemm_trans_wei; + + _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), + gemm_trans_wei ? MemoryLifetime::Prepare : MemoryLifetime::Persistent, + _weights_reshaped.total_size()); + } + _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size()); } -Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, const bool enable_fast_math) +Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + const bool enable_fast_math) { const DataLayout data_layout = src->data_layout(); const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); @@ -406,36 +617,52 @@ Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_fo const unsigned int kernel_height = weights->dimension(idx_height); unsigned int conv_w = 0; unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), - src->dimension(idx_height), - kernel_width, - kernel_height, - conv_info, - dilation); + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv_info, dilation); - const CpuGemmConv2d::SkipInfo skip_info = CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, - dilation, act_info); + const CpuGemmConv2d::SkipInfo skip_info = + CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info); const bool skip_im2col = skip_info.skip_im2col; const bool skip_col2im = skip_info.skip_col2im; const unsigned int gemm_3d_depth = skip_col2im ? 
conv_h : 0; const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED; - const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, - gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, - false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, experimental::PostOpList(), fixed_format, weights_info.weight_format()); + + /** @section note_CpuGemmConv2d_weight_use_in_has_opt_impl Which weights tensor should we use for has_opt_impl + * + * For the pretranspose_B flag, this shares a similar problem and thus the same decision as that of + * @ref note_CpuGemmConv2d_weight_use_in_configure + * + * But for the weights, we shall always use the original instead of reshaped weights here + */ + const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth, + skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, + GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, + fixed_format, weights_info.weight_format(), true /* pretranspose_B */); return CpuGemm::has_opt_impl(expected_weight_format, src, weights, biases, dst, gemm_info); } -Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +Status CpuGemmConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!"); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::BFLOAT16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, + DataType::F16, DataType::F32); - if(!is_fixed_format(weights_info.weight_format())) + if (!is_fixed_format(weights_info.weight_format())) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights); } @@ -468,29 +695,25 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weight unsigned int conv_w = 0; unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), - src->dimension(idx_height), - kernel_width, - kernel_height, - conv_info, - dilation); + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv_info, dilation); // Check if GEMM3D is supported - const CpuGemmConv2d::SkipInfo skip_info = CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, - 
dilation, act_info); - const bool skip_im2col = skip_info.skip_im2col, skip_col2im = skip_info.skip_col2im; + const CpuGemmConv2d::SkipInfo skip_info = + CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info); + const bool skip_im2col = skip_info.skip_im2col, skip_col2im = skip_info.skip_col2im; ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_channel) != src->dimension(idx_channel)); ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); // Validate biases - if(biases != nullptr) + if (biases != nullptr) { - if(is_quantized) + if (is_quantized) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } - else if(is_bf16) + else if (is_bf16) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32); } @@ -503,20 +726,25 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weight } unsigned int mat_weights_cols = weights->dimension(idx_kernels); - unsigned int mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel); + unsigned int mat_weights_rows = + weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel); - weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, append_bias), 1, weights->data_type()); - weights_reshaped_info.set_quantization_info(weights->quantization_info()); + // Initialize reshaped weights + initialize_reshaped_weight_info(*weights, weights_reshaped_info); + // No need to call CpuReshape::validate() or CpuTranspose::validate() as the dst info is auto-configured from the + // src weights_to_use = &weights_reshaped_info; - if(!skip_im2col) + if (!skip_im2col) { const int block_by = arm_compute::block_by(weights_info.weight_format()); int input_pad_right = 0; - if(block_by > 1) + if (block_by > 1) { - input_pad_right = (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by); - mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * (weights->dimension(idx_channel) + input_pad_right); + input_pad_right = + (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by); + mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * + (weights->dimension(idx_channel) + input_pad_right); } // Create tensor info for im2col reshaped inputs @@ -528,13 +756,15 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weight im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type); im2col_reshaped_info.set_quantization_info(src->quantization_info()); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuIm2ColKernel::validate(src, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation, num_groups, input_pad_right)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuIm2ColKernel::validate(src, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), + conv_info, append_bias, dilation, num_groups, input_pad_right)); gemm_input_to_use = &im2col_reshaped_info; } // Create temporary GEMM output tensor in case we cannot skip col2im const DataType output_data_type = data_type == DataType::BFLOAT16 ? 
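/* Worked example for the block_by padding arithmetic above: with block_by = 4 and src channels
   C = 3,
     input_pad_right = 4 - (3 % 4) = 1
   so the padded channel count is C + input_pad_right = 4 and
     mat_weights_rows = kernel_width * kernel_height * 4
   If C is already a multiple of block_by, input_pad_right stays 0 and mat_weights_rows is
   unchanged. */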
DataType::F32 : data_type; - if(!skip_col2im) + if (!skip_col2im) { TensorShape shape_gemm = gemm_input_to_use->tensor_shape(); shape_gemm.set(0, mat_weights_cols); @@ -549,13 +779,16 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weight gemm_output_to_use = &info_gemm; const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED; - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, enable_fast_math, skip_col2im ? conv_h : 0, skip_im2col, fixed_format, + // See note_CpuGemmConv2d_weight_use_in_configure regarding the choice of the weights + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, + enable_fast_math, skip_col2im ? conv_h : 0, skip_im2col, fixed_format, weights_info.weight_format())); // Validate Col2Im/ReshapeLayer - if(!skip_col2im && (data_layout == DataLayout::NCHW)) + if (!skip_col2im && (data_layout == DataLayout::NCHW)) { - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h))); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h))); } return Status{}; @@ -571,18 +804,13 @@ void CpuGemmConv2d::run(ITensorPack &tensors) CpuAuxTensorHandler im2col_output(offset_int_vec(Im2ColOutput), _im2col_output, tensors, false); CpuAuxTensorHandler gemm_output(offset_int_vec(GemmOutput), _gemm_output, tensors, false); - CpuAuxTensorHandler reshaped_wei(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors, false); bool out_has_padding = _skip_col2im && (dst->info()->padding().bottom != 0 || dst->info()->padding().top != 0); - if(!_skip_im2col) + if (!_skip_im2col) { // Run input reshaping unsigned int y_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - ITensorPack pack = - { - { TensorType::ACL_SRC, src }, - { TensorType::ACL_DST, im2col_output.get() } - }; + ITensorPack pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, im2col_output.get()}}; NEScheduler::get().schedule_op(_im2col_kernel.get(), y_dim, _im2col_kernel->window(), pack); gemm_input_to_use = im2col_output.get(); } @@ -595,93 +823,150 @@ void CpuGemmConv2d::run(ITensorPack &tensors) gemm3d.allocator()->import_memory(out_to_use->buffer()); auto gemm_output_to_use = gemm_output.get(); - if(_skip_im2col) + if (_skip_im2col) { gemm_output_to_use = &gemm3d; } - if(_skip_col2im && !out_has_padding) + if (_skip_col2im && !out_has_padding) { gemm_output_to_use = dst; } - // Runs CpuGemm or CpuGemmLowpMatrixMultiplyCore functions - ITensorPack pack_mm = tensors; - pack_mm.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use); - if(!this->isVarWeightsKernel()) - { - pack_mm.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get()); - } - pack_mm.add_tensor(TensorType::ACL_DST, gemm_output_to_use); - if(_is_quantized) - { - // Run gemmlowp - _mm_gemmlowp->run(pack_mm); - } - else + ITensorPack gemm_pack = tensors; + gemm_pack.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use); + gemm_pack.add_tensor(TensorType::ACL_DST, gemm_output_to_use); + // Allocate reshaped weights if required + auto weights = gemm_pack.get_const_tensor(TensorType::ACL_SRC_1); + ARM_COMPUTE_ERROR_ON_NULLPTR(weights); + // Re-interpreted weights. Only tensor shape is changed. 
Only memory import, no allocation + CpuAuxTensorHandler reinterpreted_wei( + _weights_reshaped, *weights, + /* import only if we chose the ReinterpretThenTranspose path, because otherwise the weight may have been freed */ + !(_run_wt && _wt_method == WeightTransformMethod::ReinterpretThenTranspose)); + CpuAuxTensorHandler reshaped_wei(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors); + // Update the weights to use if it has been reshaped + if (_run_wt) { - // Run gemm - _mm_gemm->run(pack_mm); + if (_wt_method == WeightTransformMethod::ReinterpretThenTranspose) + { + gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reinterpreted_wei.get()); + } + else if (_wt_method == WeightTransformMethod::ReshapeThenTranspose || + _wt_method == WeightTransformMethod::FusedReshapeAndTranspose) + { + gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get()); + } } + // Runs CpuGemm or CpuGemmLowpMatrixMultiplyCore functions + _is_quantized ? _mm_gemmlowp->run(gemm_pack) : _mm_gemm->run(gemm_pack); + // Reshape output matrix - if(!_skip_col2im) + if (!_skip_col2im) { - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { - ITensorPack pack = - { - { TensorType::ACL_SRC, gemm_output.get() }, - { TensorType::ACL_DST, dst } - }; + ITensorPack pack = {{TensorType::ACL_SRC, gemm_output.get()}, {TensorType::ACL_DST, dst}}; NEScheduler::get().schedule_op(_col2im_kernel.get(), Window::DimY, _col2im_kernel->window(), pack); } else { - ITensorPack pack = - { - { TensorType::ACL_SRC, gemm_output_to_use }, - { TensorType::ACL_DST, dst } - }; - NEScheduler::get().schedule_op(_reshape_kernel.get(), Window::DimY, _reshape_kernel->window(), pack); + ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}}; + _reshape->run(pack); } } - else if(out_has_padding) + else if (out_has_padding) { - ITensorPack pack = - { - { TensorType::ACL_SRC, gemm_output_to_use }, - { TensorType::ACL_DST, dst } - }; - NEScheduler::get().schedule_op(_reshape_kernel.get(), Window::DimY, _reshape_kernel->window(), pack); + ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}}; + _reshape->run(pack); } } void CpuGemmConv2d::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { - // Variable weights executions that use fixed-format kernels - // need no reshaping of the weights. - if(this->isVarWeightsKernel()) + auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + // Determine which weights reshape path to take + // Note that this decision can only occur at prepare instead of configure because it relies on the presence of + // any holes in the weight tensor, which may change after configure (e.g. from extending padding) + if (_run_wt) { - _is_quantized ? 
_mm_gemmlowp->prepare(tensors) : _mm_gemm->prepare(tensors); - _is_prepared = true; - return; + _wt_method = get_wt_method(*(weights->info())); + switch (_wt_method) + { + case (WeightTransformMethod::FusedReshapeAndTranspose): + { + ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Perform weight transformation: FusedReshapeAndTranspose"); + _weights_reshape_and_transpose_kernel = std::make_unique(); + _weights_reshape_and_transpose_kernel->configure(weights->info(), nullptr, &_weights_reshaped); + break; + } + case (WeightTransformMethod::ReshapeThenTranspose): + { + ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Perform weight transformation: ReshapeThenTranspose"); + _weights_reshape = std::make_unique(); + _weights_reshape->configure(weights->info(), &_weights_reshaped); + break; + } + case (WeightTransformMethod::ReinterpretThenTranspose): + { + ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Perform weight transformation: ReinterpretThenTranspose"); + // Nothing to configure + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported weight transform method"); + } + } } - - // Run weights reshaping and mark original weights tensor as unused - CpuAuxTensorHandler weights_reshaped(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors); - auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - ITensorPack pack = + else { - { TensorType::ACL_SRC, weights }, - { TensorType::ACL_DST, weights_reshaped.get() } - }; - NEScheduler::get().schedule_op(_weights_reshape_kernel.get(), 3, _weights_reshape_kernel->window(), pack); - weights->mark_as_unused(); + ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("No weight transformation is performed"); + } ITensorPack gemm_pack = tensors; - gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, weights_reshaped.get()); + // Allocate reshaped weights if required + CpuAuxTensorHandler reinterpreted_wei( + _weights_reshaped, + *weights); // Re-interpreted weights. Only tensor shape is changed. No allocation + CpuAuxTensorHandler reshaped_wei(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors); + // Run weights reshape if required + if (_run_wt) + { + switch (_wt_method) + { + case (WeightTransformMethod::FusedReshapeAndTranspose): + { + ITensorPack pack = {{TensorType::ACL_SRC, weights}, {TensorType::ACL_DST, reshaped_wei.get()}}; + NEScheduler::get().schedule_op(_weights_reshape_and_transpose_kernel.get(), Window::DimW, + _weights_reshape_and_transpose_kernel->window(), pack); + weights->mark_as_unused(); + gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get()); + break; + } + case (WeightTransformMethod::ReshapeThenTranspose): + { + ITensorPack pack = {{TensorType::ACL_SRC, weights}, {TensorType::ACL_DST, reshaped_wei.get()}}; + _weights_reshape->run(pack); + weights->mark_as_unused(); + gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get()); + break; + } + case (WeightTransformMethod::ReinterpretThenTranspose): + { + gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reinterpreted_wei.get()); + // Nothing to run + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported weight transform method"); + } + } + } _is_quantized ? 
_mm_gemmlowp->prepare(gemm_pack) : _mm_gemm->prepare(gemm_pack); + _is_prepared = true; } } diff --git a/src/cpu/operators/CpuGemmConv2d.h b/src/cpu/operators/CpuGemmConv2d.h index 81d34ae93de7f1d00d0b202738b951eebbfa4dc9..48a0d11107bef620fbe14fb5d946a202a502f113 100644 --- a/src/cpu/operators/CpuGemmConv2d.h +++ b/src/cpu/operators/CpuGemmConv2d.h @@ -21,12 +21,13 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CPU_GEMM_CONV2D_H -#define ARM_COMPUTE_CPU_GEMM_CONV2D_H +#ifndef ACL_SRC_CPU_OPERATORS_CPUGEMMCONV2D_H +#define ACL_SRC_CPU_OPERATORS_CPUGEMMCONV2D_H #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/cpu/ICpuOperator.h" #include @@ -38,24 +39,15 @@ namespace cpu class CpuGemm; class CpuGemmLowpMatrixMultiplyCore; class CpuGemmLowpOutputStage; +class CpuReshape; namespace kernels { -class CpuWeightsReshapeKernel; class CpuIm2ColKernel; class CpuCol2ImKernel; -class CpuReshapeKernel; +class CpuWeightsReshapeKernel; } // namespace kernels -/** Basic function to compute the convolution layer. This function calls the following kernels/functions: - * - * -# @ref cpu::kernels::CpuIm2ColKernel - * -# @ref CpuGemm (if the data type is BFLOAT16/FP16/FP32) - * -# @ref CpuGemmLowpMatrixMultiplyCore (if the data type is QASYMM8/QASYMM8_SIGNED) - * -# @ref CpuGemmLowpOutputStage (if the data type is QASYMM8/QASYMM8_SIGNED) - * -# @ref cpu::kernels::CpuCol2ImKernel (if NCHW data layout) - * -# @ref kernels::CpuWeightsReshapeKernel - * - */ +/** Basic function to compute the convolution layer. @ref note_CpuGemmConv2d_weight_transformation */ class CpuGemmConv2d : public ICpuOperator { public: @@ -98,7 +90,7 @@ public: * @param[out] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights + * @param[in] weights_info Specifies if the weights tensor has been reshaped with CpuWeightsReshapeKernel. If this is not part of the fully connected layer the weights * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. @@ -106,32 +98,53 @@ public: * available which may introduce a drop of accuracy as well. Default is false * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. 
num_groups != 1 is not supported */ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(), - const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1); + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuGemmConvolution::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), - bool enable_fast_math = false, unsigned int num_groups = 1); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); /** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters. * - * The paramter list is the same as @ref NEGEMMConvolutionLayer::has_opt_impl + * The parameter list is the same as @ref NEGEMMConvolutionLayer::has_opt_impl * * @return a status. */ - static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), - const bool enable_fast_math = false); + static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + const bool enable_fast_math = false); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: @@ -150,8 +163,15 @@ private: * @param[in] fixed_format (Optional) Select GEMM execution with variable weights. * @param[in] weight_format (Optional) The layout to be used for the weights tensor when running GEMM with variable weights. 
*/ - void configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo(), - bool enable_fast_math = false, int gemm_3d_depth = 1, bool fixed_format = false, arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED); + void configure_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + int gemm_3d_depth = 1, + bool fixed_format = false, + arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer matrix multiply routines * * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. @@ -170,8 +190,16 @@ private: * * @return a status */ - static Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo(), - bool enable_fast_math = false, int gemm_3d_depth = 1, bool skip_im2col = false, bool fixed_format = false, arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED); + static Status validate_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + int gemm_3d_depth = 1, + bool skip_im2col = false, + bool fixed_format = false, + arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED); /** Static function to check if GEMM3D is supported in @ref NEGEMM or in @ref CpuGemmMLowpMatrixMultiplyCore * * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. @@ -182,7 +210,11 @@ private: * * @return a status */ - static Status validate_gemm3d(const ITensorInfo *src, const ITensorInfo *weights, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col); + static Status validate_gemm3d(const ITensorInfo *src, + const ITensorInfo *weights, + const ActivationLayerInfo &act_info, + int gemm_3d_depth, + bool skip_im2col); struct SkipInfo { @@ -200,8 +232,11 @@ private: * * @return a SkipInfo instance. */ - static SkipInfo skip_im_col_info(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info, - const Size2D &dilation, const ActivationLayerInfo &act_info); + static SkipInfo skip_im_col_info(const ITensorInfo *src, + const ITensorInfo *weights, + const PadStrideInfo &conv_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info); /** Indicates if the convolution executes in variable weights mode. * @@ -210,19 +245,39 @@ private: bool isVarWeightsKernel() const; enum AuxTensorIdx { - // CpuGemmLowpMatrixMultiplyCore has up to 8 internal tensors - Im2ColOutput = 9, + GemmAsmPretransposedRHS = 2, // CpuGemmAssemblyDispatch::Pretranspose + GemmTransposed1xWRHS = 5, // CpuGemm::Transposed1xWRHS + GemmLowpTransposed1xWRHS = 6, // CpuGemmLowpMatrixMultiplyCore::TmpB + /* Slots 0 - 9 reserved and shared by CpuGemmLowpMatrixMultiplyCore and CpuGemm */ + Im2ColOutput = 10, WeightsReshaped, GemmOutput, Count }; - std::unique_ptr _weights_reshape_kernel; - std::unique_ptr _im2col_kernel; + /** Weight transformation method. 
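 * In brief (as used by prepare() and run()):
 *  - ReinterpretThenTranspose : no copy is made here; the original weights are only re-interpreted
 *    (tensor shape change, memory imported) and the transpose itself is left to the gemm.
 *  - ReshapeThenTranspose     : CpuReshape writes the weights into the WeightsReshaped workspace
 *    tensor, which the gemm then transposes.
 *  - FusedReshapeAndTranspose : CpuWeightsReshapeKernel performs the reshape and transpose in a
 *    single pass into the WeightsReshaped workspace tensor.
 * The method is selected only in prepare(), because it depends on holes/padding in the weight
 * tensor, which may still change after configure().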
See @ref note_CpuGemmConv2d_weight_transformation */ + enum class WeightTransformMethod + { + ReinterpretThenTranspose, + ReshapeThenTranspose, + FusedReshapeAndTranspose, + }; + + /** Select weight transformation method + * + * @param[in] weights Input weights + * + * @return WeightTransformMethod + */ + static WeightTransformMethod get_wt_method(const ITensorInfo &weights); + + std::unique_ptr _weights_reshape; + std::unique_ptr _weights_reshape_and_transpose_kernel; + std::unique_ptr _im2col_kernel; std::unique_ptr _mm_gemm; std::unique_ptr _mm_gemmlowp; std::unique_ptr _col2im_kernel; - std::unique_ptr _reshape_kernel; + std::unique_ptr _reshape; TensorInfo _im2col_output; TensorInfo _weights_reshaped; @@ -231,13 +286,15 @@ private: DataLayout _data_layout; - bool _skip_im2col; - bool _skip_col2im; - bool _is_quantized; - bool _is_prepared; + bool _skip_im2col; + bool _skip_col2im; + bool _is_quantized; + bool _is_prepared; + WeightTransformMethod _wt_method; + bool _run_wt; - experimental::MemoryRequirements _aux_mem{ Count }; + experimental::MemoryRequirements _aux_mem{Count}; }; } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_GEMM_CONV2D_H */ +#endif // ACL_SRC_CPU_OPERATORS_CPUGEMMCONV2D_H diff --git a/src/cpu/operators/CpuGemmDirectConv2d.cpp b/src/cpu/operators/CpuGemmDirectConv2d.cpp index 5ce285cb6f414f64a1cb88765ea55f65873a198c..918792754123f299bc844c053355e5d962fcef54 100644 --- a/src/cpu/operators/CpuGemmDirectConv2d.cpp +++ b/src/cpu/operators/CpuGemmDirectConv2d.cpp @@ -26,10 +26,10 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/FunctionDescriptors.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/utils/CpuAuxTensorHandler.h" - #include "support/Cast.h" #include @@ -43,7 +43,10 @@ using namespace arm_compute::utils::cast; namespace { -GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act) +GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const ActivationLayerInfo &act) { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset @@ -53,16 +56,15 @@ GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src, const UniformQuantizationInfo uoqinfo = oqinfo.uniform(); const DataType data_type = src->data_type(); // Merge activation with output stage - const std::set supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; - PixelValue type_min{}; - PixelValue type_max{}; + const std::set supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; + PixelValue type_min{}; + PixelValue type_max{}; std::tie(type_min, type_max) = get_min_max(data_type); int32_t min_activation = type_min.get(); int32_t max_activation = type_max.get(); - if(supported_acts.count(act.activation()) != 0) + if (supported_acts.count(act.activation()) != 0) { std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act, data_type, uoqinfo); } @@ -107,58 +109,70 
@@ CpuGemmDirectConv2d::CpuGemmDirectConv2d() CpuGemmDirectConv2d::~CpuGemmDirectConv2d() = default; -void CpuGemmDirectConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &info) +void CpuGemmDirectConv2d::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(CpuGemmDirectConv2d::validate(src, - weights, - biases != nullptr ? biases : nullptr, - dst, - info)); + ARM_COMPUTE_ERROR_THROW_ON( + CpuGemmDirectConv2d::validate(src, weights, biases != nullptr ? biases : nullptr, dst, info)); ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, info); _run_activation = info.act_info.enabled() && !_gemm_asm_func->is_activation_supported(info.act_info); _is_prepared = false; - _weights_permute_func->configure(weights, &_perm_weights, PermutationVector{ 3, 0, 1, 2 }); + _weights_permute_func->configure(weights, &_perm_weights, PermutationVector{3, 0, 1, 2}); // Configure assembly dispatch cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false); - if(is_data_type_quantized(src->data_type())) + if (is_data_type_quantized(src->data_type())) { asm_info.output_stage = calculate_output_stage_metadata(src, weights, dst, info.act_info); } _gemm_asm_func->configure(src, &_perm_weights, biases, dst, asm_info); // Configure activation - if(_run_activation) + if (_run_activation) { _activation_func->configure(dst, nullptr, info.act_info); } // Add auxiliary memory requirements of the assembly dispatch - auto asm_mem_req = _gemm_asm_func->workspace(); - _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace]; - _aux_mem[Pretranspose] = asm_mem_req[Pretranspose]; + const auto asm_mem_req = _gemm_asm_func->workspace(); + for (unsigned int slot = 0; slot < asm_mem_req.size(); ++slot) + { + _aux_mem[slot] = asm_mem_req[slot]; + } - if(_aux_mem[Pretranspose].size > 0) + if (_aux_mem[Pretranspose].size > 0) { // Release permuted weights at the of prepare as they are further transposed by the assembly dispatch - _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, weights->total_size()); + _aux_mem[PermutedWeights] = + MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, weights->total_size()); } else { // We must permute weights if they are WeightFormat::UNSPECIFIED - if(info.weights_info.weight_format() == WeightFormat::UNSPECIFIED) - _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Persistent, weights->total_size()); + if (info.weights_info.weight_format() == WeightFormat::UNSPECIFIED) + _aux_mem[PermutedWeights] = + MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Persistent, weights->total_size()); } } -Status CpuGemmDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info) +Status CpuGemmDirectConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv2dInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, 
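/* The slot-for-slot copy of the assembly workspace above mirrors the whole CpuGemmAssemblyDispatch
   requirements into this operator's _aux_mem, so AuxTensorIdx reserves the leading indices for it:

     _aux_mem[GemmTemp0]        // = asm_mem_req[0]
     _aux_mem[GemmTemp1]        // = asm_mem_req[1]
     _aux_mem[Pretranspose]     // = asm_mem_req[2]
     _aux_mem[PermutedWeights]  // first slot owned by CpuGemmDirectConv2d itself

   (GemmTemp0/GemmTemp1/Pretranspose are the enum values added in CpuGemmDirectConv2d.h.) */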
DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32); - if(!is_fixed_format(info.weights_info.weight_format())) + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::BFLOAT16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, + DataType::F16, DataType::F32); + if (!is_fixed_format(info.weights_info.weight_format())) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights); } @@ -171,13 +185,13 @@ Status CpuGemmDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo * ARM_COMPUTE_RETURN_ERROR_ON(info.dilation != Size2D(1U, 1U)); ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); // Validate biases - if(biases != nullptr) + if (biases != nullptr) { - if(is_data_type_quantized_asymmetric(data_type)) + if (is_data_type_quantized_asymmetric(data_type)) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } - else if(data_type == DataType::BFLOAT16) + else if (data_type == DataType::BFLOAT16) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32); } @@ -198,31 +212,32 @@ void CpuGemmDirectConv2d::run(ITensorPack &tensors) prepare(tensors); _gemm_asm_func->run(tensors); - if(_run_activation) + if (_run_activation) { ITensor *io = tensors.get_tensor(ACL_DST); - ITensorPack pack{ { ACL_SRC, io }, { ACL_DST, io } }; + ITensorPack pack{{ACL_SRC, io}, {ACL_DST, io}}; _activation_func->run(pack); } } void CpuGemmDirectConv2d::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { // If we are using fixed-format kernel the weights are already reshaped - if(_gemm_asm_func && _gemm_asm_func->isVarWeightsKernel()) + if (_gemm_asm_func && _gemm_asm_func->isVarWeightsKernel()) { _gemm_asm_func->prepare(tensors); _is_prepared = true; return; } - const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1); - ITensor *weights_aux = utils::cast::polymorphic_cast(tensors.get_tensor(offset_int_vec(PermutedWeights))); + const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1); + ITensor *weights_aux = + utils::cast::polymorphic_cast(tensors.get_tensor(offset_int_vec(PermutedWeights))); ARM_COMPUTE_ERROR_ON_NULLPTR(weights, weights_aux); CpuAuxTensorHandler permuted_weights(_perm_weights, *weights_aux); - ITensorPack permute_tensors{ { ACL_SRC, weights }, { ACL_DST, permuted_weights.get() } }; + ITensorPack permute_tensors{{ACL_SRC, weights}, {ACL_DST, permuted_weights.get()}}; _weights_permute_func->run(permute_tensors); tensors.add_const_tensor(ACL_SRC_1, permuted_weights.get()); diff --git a/src/cpu/operators/CpuGemmDirectConv2d.h b/src/cpu/operators/CpuGemmDirectConv2d.h index e55a461f36fe3c4c27097c41e82442eb430b5cfc..a7365615b9416616435cdcbeecdf890a8ea4e53c 100644 --- a/src/cpu/operators/CpuGemmDirectConv2d.h +++ b/src/cpu/operators/CpuGemmDirectConv2d.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,10 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H -#define ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H +#ifndef ACL_SRC_CPU_OPERATORS_CPUGEMMDIRECTCONV2D_H +#define ACL_SRC_CPU_OPERATORS_CPUGEMMDIRECTCONV2D_H #include "arm_compute/core/TensorInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" #include "src/cpu/operators/CpuActivation.h" @@ -69,25 +70,35 @@ public: * Data types supported: Same as @p input. * @param[in] info Contains padding and stride information described in @ref PadStrideInfo. */ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &info); + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmDirectConv2d * * Similar to CpuGemmDirectConv2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv2dInfo &info); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; private: enum AuxTensorIdx { - AsmGemmWorkspace = 0, + GemmTemp0 = 0, + GemmTemp1, Pretranspose, + /* Slots above (0-2) are reserved for CpuGemmAssemblyDispatch */ PermutedWeights, Count }; @@ -103,4 +114,4 @@ private: } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H */ +#endif // ACL_SRC_CPU_OPERATORS_CPUGEMMDIRECTCONV2D_H diff --git a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp index 8ca128fb07f9d00e14216fd0b6d6f9bf2e0cbcac..b25505a85dc97532e0ff50f45df0dd49a36019a8 100644 --- a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp +++ b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp @@ -28,14 +28,14 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/TensorAllocator.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/MemoryHelpers.h" #include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h" #include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h" #include "src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h" @@ -59,12 +59,12 @@ namespace cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info) { cpu::AsmGemmInfo asm_info; - asm_info.method = cpu::AsmConvMethod::Im2Col; - asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d(); - asm_info.depth_output_gemm3d = info.depth_output_gemm3d(); - asm_info.activation_info = info.activation_info(); - asm_info.output_stage = info.gemmlowp_output_stage(); - asm_info.fast_mode = info.fast_math(); + asm_info.method = 
cpu::AsmConvMethod::Im2Col; + asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d(); + asm_info.depth_output_gemm3d = info.depth_output_gemm3d(); + asm_info.activation_info = info.activation_info(); + asm_info.output_stage = info.gemmlowp_output_stage(); + asm_info.fast_mode = info.fast_math(); return asm_info; } @@ -105,7 +105,8 @@ CpuGemmLowpMatrixMultiplyCore::CpuGemmLowpMatrixMultiplyCore() } CpuGemmLowpMatrixMultiplyCore::~CpuGemmLowpMatrixMultiplyCore() = default; -void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info) +void CpuGemmLowpMatrixMultiplyCore::configure( + const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, dst); ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpMatrixMultiplyCore::validate(a, b, c, dst, gemm_info)); @@ -122,28 +123,31 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso _reshape_b_only_on_first_run = b->are_values_constant(); _is_prepared = false; _fused_assembly_path = false; - _flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run; - _gemm_info = gemm_info; + _flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && + _reshape_b_only_on_first_run; + _gemm_info = gemm_info; _asm_glue = std::make_unique(); const ITensorInfo *a_to_use = a; // Convert to QASYMM8 -> QASYMM8_SIGNED and back - if(_flip_signedness) + if (_flip_signedness) { const int32_t offset_correction = 128; const DataType dt = DataType::QASYMM8_SIGNED; const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform(); - _signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)); + _signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info( + QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)); _convert_to_signed_asymm = std::make_unique(); _convert_to_signed_asymm->configure(a_to_use, &_signed_a); a_to_use = &_signed_a; _a_offset = _signed_a.quantization_info().uniform().offset; const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); - _signed_output = dst->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)); + _signed_output = dst->clone()->set_data_type(dt).set_quantization_info( + QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)); // Output stage correction GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage(); @@ -157,7 +161,7 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso } // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage - if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) + if (info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) { _fuse_output_stage = true; _mm_result_s32 = TensorInfo(dst->tensor_shape(), 1, DataType::S32); @@ -166,16 +170,18 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso // Initialize assembly kernel meta-data const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); #ifdef __aarch64__ - if(!(!b->are_values_constant() && b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching 
differently. + if (!(!b->are_values_constant() && + b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently. { - switch(a->data_type()) + switch (a->data_type()) { case DataType::QASYMM8: case DataType::QASYMM8_SIGNED: case DataType::U8: case DataType::S8: { - if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (is_data_type_quantized_asymmetric(a_to_use->data_type()) && + info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { auto c_info_to_use = c == nullptr ? nullptr : c; _asm_glue->configure(a_to_use, b, c_info_to_use, dst, asm_info); @@ -197,13 +203,14 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso } } #endif /* __aarch64__ */ - if(!(_assembly_path || _run_vector_matrix_multiplication)) + if (!(_assembly_path || _run_vector_matrix_multiplication)) { matrix_a = &_tmp_a; matrix_b = &_tmp_b; // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] - _tmp_a = TensorInfo(compute_interleaved_shape(*a_to_use), 1, a_to_use->data_type(), a_to_use->quantization_info()); + _tmp_a = + TensorInfo(compute_interleaved_shape(*a_to_use), 1, a_to_use->data_type(), a_to_use->quantization_info()); // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] _tmp_b = TensorInfo(compute_transpose1xW_shape(*b), 1, b->data_type(), b->quantization_info()); @@ -216,13 +223,13 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso _mtx_b_reshape_kernel->configure(b, &_tmp_b); } - if(!_fused_assembly_path) + if (!_fused_assembly_path) { // Build reduction info const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false); // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0) + if (_a_offset != 0) { _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32); @@ -232,7 +239,7 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso } // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) + if (_b_offset != 0) { _vector_sum_row = TensorInfo(compute_reductionB_shape(*a_to_use), 1, DataType::S32); @@ -241,24 +248,23 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso _mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info); } - if(_fuse_output_stage) + if (_fuse_output_stage) { // Configure matrix multiply kernel - if(!_assembly_path) + if (!_assembly_path) { _mm_kernel = std::make_unique(); _mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32); } - _offset_contribution_output_stage_kernel = std::make_unique(); - _offset_contribution_output_stage_kernel->configure(&_mm_result_s32, - _a_offset == 0 ? nullptr : &_vector_sum_col, - _b_offset == 0 ? nullptr : &_vector_sum_row, c, - _flip_signedness ? &_signed_output : dst, - a->dimension(0), - _a_offset, _b_offset, info.gemmlowp_output_stage()); + _offset_contribution_output_stage_kernel = + std::make_unique(); + _offset_contribution_output_stage_kernel->configure( + &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, c, _flip_signedness ? 
&_signed_output : dst, + a->dimension(0), _a_offset, _b_offset, info.gemmlowp_output_stage()); - if(_flip_signedness) + if (_flip_signedness) { _convert_from_signed_asymm = std::make_unique(); _convert_from_signed_asymm->configure(&_signed_output, dst); @@ -267,55 +273,73 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso else { // Configure matrix multiply kernel - if(!_assembly_path) + if (!_assembly_path) { _mm_kernel = std::make_unique(); _mm_kernel->configure(matrix_a, matrix_b, dst); } // Configure offset contribution kernel _offset_contribution_kernel = std::make_unique(); - _offset_contribution_kernel->configure(dst, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->dimension(0), + _offset_contribution_kernel->configure(dst, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->dimension(0), _a_offset, _b_offset); } } // Configure activation const ActivationLayerInfo &activation = gemm_info.activation_info(); - _run_activation = activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation)); - if(_run_activation) + _run_activation = + activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation)); + if (_run_activation) { _activation_func = std::make_unique(); _activation_func->configure(dst, nullptr, activation); } - if(_assembly_path) + if (_assembly_path) { - auto asm_mem_req = _asm_glue->workspace(); - _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace]; - _aux_mem[Pretranspose] = asm_mem_req[Pretranspose]; + const auto asm_mem_req = _asm_glue->workspace(); + for (unsigned int slot = 0; slot < asm_mem_req.size(); ++slot) + { + _aux_mem[slot] = asm_mem_req[slot]; + } } // Request memory for LHS and RHS reshape matrix - _aux_mem[VectorSumCol] = MemoryInfo(offset_int_vec(VectorSumCol), !_fused_assembly_path && _a_offset != 0 - && _reshape_b_only_on_first_run ? - MemoryLifetime::Persistent : - MemoryLifetime::Temporary, - _vector_sum_col.total_size()); - _aux_mem[VectorSumRow] = MemoryInfo(offset_int_vec(VectorSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size()); - _aux_mem[TmpA] = MemoryInfo(offset_int_vec(TmpA), MemoryLifetime::Temporary, _tmp_a.total_size()); - _aux_mem[TmpB] = MemoryInfo(offset_int_vec(TmpB), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); - _aux_mem[MMResultS32] = MemoryInfo(offset_int_vec(MMResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size()); - _aux_mem[SignedA] = MemoryInfo(offset_int_vec(SignedA), MemoryLifetime::Temporary, _signed_a.total_size()); - _aux_mem[SignedOutput] = MemoryInfo(offset_int_vec(SignedOutput), MemoryLifetime::Temporary, _signed_output.total_size()); + _aux_mem[VectorSumCol] = + MemoryInfo(offset_int_vec(VectorSumCol), + !_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run ? MemoryLifetime::Persistent + : MemoryLifetime::Temporary, + _vector_sum_col.total_size()); + _aux_mem[VectorSumRow] = + MemoryInfo(offset_int_vec(VectorSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size()); + _aux_mem[TmpA] = MemoryInfo(offset_int_vec(TmpA), MemoryLifetime::Temporary, _tmp_a.total_size()); + _aux_mem[TmpB] = MemoryInfo(offset_int_vec(TmpB), + _reshape_b_only_on_first_run ? 
MemoryLifetime::Persistent : MemoryLifetime::Temporary, + _tmp_b.total_size()); + _aux_mem[MMResultS32] = + MemoryInfo(offset_int_vec(MMResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size()); + _aux_mem[SignedA] = MemoryInfo(offset_int_vec(SignedA), MemoryLifetime::Temporary, _signed_a.total_size()); + _aux_mem[SignedOutput] = + MemoryInfo(offset_int_vec(SignedOutput), MemoryLifetime::Temporary, _signed_output.total_size()); } -Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info) +Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1), - "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && + gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, + "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (a)->dimension(0) != (b)->dimension(1), + "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported"); @@ -333,28 +357,32 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens int32_t b_offset = b->quantization_info().uniform().offset; bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE; - if(fuse_output_stage) + if (fuse_output_stage) { - auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32)); + auto_init_if_empty(mm_result_s32_info, + a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32)); } // Convert QASYMM8->QASYMM8_SIGNED TensorInfo signed_a{}; TensorInfo signed_output{}; - bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run(); - if(flip_signedness) + bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && + (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run(); + if (flip_signedness) { const int32_t offset_correction = 128; const DataType dt = DataType::QASYMM8_SIGNED; const 
UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform(); - signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)); + signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info( + QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)); ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a)); a_to_use = &signed_a; a_offset = signed_a.quantization_info().uniform().offset; const UniformQuantizationInfo oqinfo = output->quantization_info().uniform(); - signed_output = output->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)); + signed_output = output->clone()->set_data_type(dt).set_quantization_info( + QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)); // Output stage correction GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage(); @@ -374,25 +402,28 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens bool run_optimised = false; bool run_optimised_requantized = false; - if(!(!b->are_values_constant() && b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently. + if (!(!b->are_values_constant() && + b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently. { - if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (is_data_type_quantized_asymmetric(a_to_use->data_type()) && + info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { run_optimised = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, c, output, asm_info)); run_optimised_requantized = run_optimised; } else { - run_optimised = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info)); + run_optimised = bool(CpuGemmAssemblyDispatch::validate( + a_to_use, b, nullptr, fuse_output_stage ? 
&mm_result_s32_info : output, asm_info)); } } - if(run_optimised) + if (run_optimised) { ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0)); - if(info.depth_output_gemm3d() != 0) + if (info.depth_output_gemm3d() != 0) { - if(info.reinterpret_input_as_3d()) + if (info.reinterpret_input_as_3d()) { ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2)); @@ -409,11 +440,13 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens } else { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), + "NEGEMM cannot reinterpret the input tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, + "NEGEMM cannot reinterpret the output tensor as 3D"); const bool run_vector_matrix_multiplication = a->dimension(1) < 2; - if(!run_vector_matrix_multiplication) + if (!run_vector_matrix_multiplication) { matrix_a_info = &tmp_a_info; matrix_b_info = &tmp_b_info; @@ -437,7 +470,7 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens } } - if(!run_optimised_requantized) + if (!run_optimised_requantized) { TensorInfo info_vector_sum_col{}; TensorInfo info_vector_sum_row{}; @@ -445,62 +478,70 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false); // Validate matrix B reduction kernel only if _a_offset is not equal to 0 - if(a_offset != 0) + if (a_offset != 0) { info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32); // Configure Matrix B reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuGemmLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info)); } // Validate Matrix A reduction kernel only if _b_offset is not equal to 0 - if(b_offset != 0) + if (b_offset != 0) { info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); // Configure matrix A reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuGemmLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info)); } - if(fuse_output_stage) + if (fuse_output_stage) { - if(!run_optimised) + if (!run_optimised) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D"); - - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + info.reinterpret_input_as_3d(), + "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + info.depth_output_gemm3d() != 0, + "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D"); + + 
ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate( + matrix_a_info, matrix_b_info, &mm_result_s32_info)); } // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - c, - flip_signedness ? &signed_output : output, - a_offset, b_offset, - info.gemmlowp_output_stage())); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionOutputStageKernel::validate( + &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col, + b_offset == 0 ? nullptr : &info_vector_sum_row, c, flip_signedness ? &signed_output : output, a_offset, + b_offset, info.gemmlowp_output_stage())); } else { - if(!run_optimised) + if (!run_optimised) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D"); - - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + info.reinterpret_input_as_3d(), + "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + info.depth_output_gemm3d() != 0, + "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D"); + + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output)); } // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate(output, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - a_offset, b_offset)); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate( + output, a_offset == 0 ? nullptr : &info_vector_sum_col, b_offset == 0 ? nullptr : &info_vector_sum_row, + a_offset, b_offset)); } } // Validate activation const ActivationLayerInfo &activation = gemm_info.activation_info(); - if(activation.enabled()) + if (activation.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, activation)); } @@ -529,24 +570,22 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) CpuAuxTensorHandler signed_output(offset_int_vec(SignedOutput), _signed_output, tensors, false); // Convert QASYMM8->QASYMM8_SIGNED - if(_flip_signedness) + if (_flip_signedness) { - ITensorPack pack = - { - { TensorType::ACL_SRC, a }, - { TensorType::ACL_DST, signed_a.get() } - }; - NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY, _convert_to_signed_asymm->window(), pack); + ITensorPack pack = {{TensorType::ACL_SRC, a}, {TensorType::ACL_DST, signed_a.get()}}; + NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY, _convert_to_signed_asymm->window(), + pack); a_to_use = signed_a.get(); matrix_a = signed_a.get(); } // Run GEMM - if(_asm_glue->is_configured()) + if (_asm_glue->is_configured()) { ITensorPack asm_glue_tensors = tensors; auto output_to_use = (_fuse_output_stage ? 
mm_result_s32.get() : dst); - if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && _gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && + _gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use); asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b); @@ -563,35 +602,25 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) } else { - if(!_run_vector_matrix_multiplication) + if (!_run_vector_matrix_multiplication) { matrix_a = tmp_a.get(); matrix_b = tmp_b.get(); // Run interleave kernel - ITensorPack pack_a = - { - { TensorType::ACL_SRC, a_to_use }, - { TensorType::ACL_DST, tmp_a.get() } - }; - NEScheduler::get().schedule_op(_mtx_a_reshape_kernel.get(), Window::DimY, _mtx_a_reshape_kernel->window(), pack_a); + ITensorPack pack_a = {{TensorType::ACL_SRC, a_to_use}, {TensorType::ACL_DST, tmp_a.get()}}; + NEScheduler::get().schedule_op(_mtx_a_reshape_kernel.get(), Window::DimY, _mtx_a_reshape_kernel->window(), + pack_a); - if(!_reshape_b_only_on_first_run) + if (!_reshape_b_only_on_first_run) { - ITensorPack pack_b = - { - { TensorType::ACL_SRC, b }, - { TensorType::ACL_DST, tmp_b.get() } - }; + ITensorPack pack_b = {{TensorType::ACL_SRC, b}, {TensorType::ACL_DST, tmp_b.get()}}; // Run transpose kernel - NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), pack_b); + NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, + _mtx_b_reshape_kernel->window(), pack_b); } } - ITensorPack pack_mm = - { - { TensorType::ACL_SRC_0, matrix_a }, - { TensorType::ACL_SRC_1, matrix_b } - }; - if(_fuse_output_stage) + ITensorPack pack_mm = {{TensorType::ACL_SRC_0, matrix_a}, {TensorType::ACL_SRC_1, matrix_b}}; + if (_fuse_output_stage) { pack_mm.add_tensor(TensorType::ACL_DST, mm_result_s32.get()); } @@ -602,31 +631,25 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) NEScheduler::get().schedule_op(_mm_kernel.get(), Window::DimY, _mm_kernel->window(), pack_mm); } - if(!_fused_assembly_path) + if (!_fused_assembly_path) { // Run matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) + if (_b_offset != 0) { - ITensorPack pack = - { - { TensorType::ACL_SRC, a_to_use }, - { TensorType::ACL_DST, vector_sum_row.get() } - }; - NEScheduler::get().schedule_op(_mtx_a_reduction_kernel.get(), Window::DimX, _mtx_a_reduction_kernel->window(), pack); + ITensorPack pack = {{TensorType::ACL_SRC, a_to_use}, {TensorType::ACL_DST, vector_sum_row.get()}}; + NEScheduler::get().schedule_op(_mtx_a_reduction_kernel.get(), Window::DimX, + _mtx_a_reduction_kernel->window(), pack); } // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0 && !_reshape_b_only_on_first_run) + if (_a_offset != 0 && !_reshape_b_only_on_first_run) { - ITensorPack pack = - { - { TensorType::ACL_SRC, b }, - { TensorType::ACL_DST, vector_sum_col.get() } - }; - NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, _mtx_b_reduction_kernel->window(), pack); + ITensorPack pack = {{TensorType::ACL_SRC, b}, {TensorType::ACL_DST, vector_sum_col.get()}}; + NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, + _mtx_b_reduction_kernel->window(), pack); } - if(_fuse_output_stage) + if (_fuse_output_stage) { ITensorPack pack; 
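The matrix A/B reduction kernels scheduled above are gated on _b_offset != 0 and _a_offset != 0 because their row/column sums only feed the offset-contribution correction applied next. For reference, a minimal scalar sketch of that correction (hypothetical standalone code, not the library's CpuGemmLowpOffsetContribution*Kernel) — it assumes the S32 accumulators acc[i*N + j] already hold sum_k(a_q * b_q):

#include <cstdint>
#include <vector>

// Expanding sum_k (a - a_off)(b - b_off) gives:
//   acc - a_off * col_sum[j] - b_off * row_sum[i] + K * a_off * b_off
// row_sums[i] = sum_k a[i][k]  (A reduction, needed only when b_offset != 0)
// col_sums[j] = sum_k b[k][j]  (B reduction, needed only when a_offset != 0)
void offset_contribution(std::vector<int32_t>       &acc,
                         const std::vector<int32_t> &row_sums,
                         const std::vector<int32_t> &col_sums,
                         int M, int N, int K, int32_t a_offset, int32_t b_offset)
{
    for (int i = 0; i < M; ++i)
    {
        for (int j = 0; j < N; ++j)
        {
            acc[i * N + j] += K * a_offset * b_offset - a_offset * col_sums[j] - b_offset * row_sums[i];
        }
    }
}

This is also why the VectorSumCol auxiliary tensor can be given a persistent lifetime when B is constant and only reshaped on the first run: its column sums do not change between executions.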
pack.add_tensor(TensorType::ACL_SRC_0, mm_result_s32.get()); @@ -636,7 +659,8 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) pack.add_tensor(TensorType::ACL_DST, _flip_signedness ? signed_output.get() : dst); // Run offset contribution kernel - NEScheduler::get().schedule_op(_offset_contribution_output_stage_kernel.get(), Window::DimY, _offset_contribution_output_stage_kernel->window(), pack); + NEScheduler::get().schedule_op(_offset_contribution_output_stage_kernel.get(), Window::DimY, + _offset_contribution_output_stage_kernel->window(), pack); } else { @@ -646,68 +670,57 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) pack.add_tensor(TensorType::ACL_DST, dst); // Run offset contribution kernel - NEScheduler::get().schedule_op(_offset_contribution_kernel.get(), Window::DimY, _offset_contribution_kernel->window(), pack); + NEScheduler::get().schedule_op(_offset_contribution_kernel.get(), Window::DimY, + _offset_contribution_kernel->window(), pack); } } // Convert QASYMM8_SIGNED->QASYMM8 - if(!_fused_assembly_path && _fuse_output_stage && _flip_signedness) + if (!_fused_assembly_path && _fuse_output_stage && _flip_signedness) { - ITensorPack pack = - { - { TensorType::ACL_SRC, signed_output.get() }, - { TensorType::ACL_DST, dst } - }; - NEScheduler::get().schedule_op(_convert_from_signed_asymm.get(), Window::DimY, _convert_from_signed_asymm->window(), pack); + ITensorPack pack = {{TensorType::ACL_SRC, signed_output.get()}, {TensorType::ACL_DST, dst}}; + NEScheduler::get().schedule_op(_convert_from_signed_asymm.get(), Window::DimY, + _convert_from_signed_asymm->window(), pack); } // Run fused activation unless already run in the fused assembly - if(_run_activation) + if (_run_activation) { - ITensorPack pack = - { - { TensorType::ACL_SRC, dst }, - { TensorType::ACL_DST, dst } - }; + ITensorPack pack = {{TensorType::ACL_SRC, dst}, {TensorType::ACL_DST, dst}}; _activation_func->run(pack); } } void CpuGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { auto original_b = tensors.get_const_tensor(TensorType::ACL_SRC_1); // Run assembly reshape - if(_asm_glue->is_configured()) + if (_asm_glue->is_configured()) { _asm_glue->prepare(tensors); } // Run non-assembly reshape - else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured()) + else if (_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured()) { // Run reshape kernel and mark original weights tensor as unused - ITensor *tmp_b_p = utils::cast::polymorphic_downcast(tensors.get_tensor(offset_int_vec(TmpB))); + ITensor *tmp_b_p = utils::cast::polymorphic_downcast(tensors.get_tensor(offset_int_vec(TmpB))); CpuAuxTensorHandler tmp_b(_tmp_b, *tmp_b_p); - ITensorPack pack = - { - { TensorType::ACL_SRC, original_b }, - { TensorType::ACL_DST, tmp_b.get() } - }; - NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), pack); + ITensorPack pack = {{TensorType::ACL_SRC, original_b}, {TensorType::ACL_DST, tmp_b.get()}}; + NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), + pack); } // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run) + if (!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run) { - ITensor *vector_sum_col_p = 
utils::cast::polymorphic_downcast(tensors.get_tensor(offset_int_vec(VectorSumCol))); + ITensor *vector_sum_col_p = + utils::cast::polymorphic_downcast(tensors.get_tensor(offset_int_vec(VectorSumCol))); CpuAuxTensorHandler vector_sum_col(_vector_sum_col, *vector_sum_col_p); - ITensorPack pack = - { - { TensorType::ACL_SRC, original_b }, - { TensorType::ACL_DST, vector_sum_col.get() } - }; - NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, _mtx_b_reduction_kernel->window(), pack); + ITensorPack pack = {{TensorType::ACL_SRC, original_b}, {TensorType::ACL_DST, vector_sum_col.get()}}; + NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, + _mtx_b_reduction_kernel->window(), pack); } _is_prepared = true; } diff --git a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h index a1b34291d06fb367f5193e278ae937ef84a4b232..78065a89535188f1dcb61562b466117f321a0ca3 100644 --- a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h +++ b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h @@ -21,11 +21,12 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_CORE_H -#define ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_CORE_H +#ifndef ACL_SRC_CPU_OPERATORS_CPUGEMMLOWPMATRIXMULTIPLYCORE_H +#define ACL_SRC_CPU_OPERATORS_CPUGEMMLOWPMATRIXMULTIPLYCORE_H #include "arm_compute/core/TensorInfo.h" #include "arm_compute/function_info/GEMMInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" @@ -108,26 +109,33 @@ public: * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and * if the reshape of matrix B should be executed only for the first run */ - void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info = GEMMInfo()); + void configure(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *dst, + const GEMMInfo &gemm_info = GEMMInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuGemmLowpMatrixMultiplyCore::configure() * * @return a status */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *dst, const GEMMInfo &gemm_info = GEMMInfo()); + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *dst, + const GEMMInfo &gemm_info = GEMMInfo()); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: enum AuxTensorIdx { - AsmGemmWorkspace = 0, - Pretranspose, - VectorSumCol, + /* Slots 0 - 2 reserved for CpuGemmAssemblyDispatch */ + VectorSumCol = 3, VectorSumRow, TmpA, TmpB, @@ -172,4 +180,4 @@ private: }; } // namespace cpu } // namespace arm_compute -#endif /*ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_CORE_H */ +#endif // ACL_SRC_CPU_OPERATORS_CPUGEMMLOWPMATRIXMULTIPLYCORE_H diff --git a/src/cpu/operators/CpuGemmLowpOutputStage.cpp b/src/cpu/operators/CpuGemmLowpOutputStage.cpp index 58f98acff0199eb5ff279d6c1fdda4822141126e..4215eed1991dc17d5d60ee36e573cb977dd5166d 100644 --- a/src/cpu/operators/CpuGemmLowpOutputStage.cpp +++ b/src/cpu/operators/CpuGemmLowpOutputStage.cpp @@ -26,6 
+26,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h" #include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h" @@ -36,36 +37,42 @@ namespace arm_compute { namespace cpu { -void CpuGemmLowpOutputStage::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info) +void CpuGemmLowpOutputStage::configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info) { // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpOutputStage::validate(src, bias, dst, info)); ARM_COMPUTE_LOG_PARAMS(src, bias, dst, info); - switch(info.type) + switch (info.type) { case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: { - switch(info.output_data_type) + switch (info.output_data_type) { case DataType::QASYMM8: { auto k = std::make_unique(); - k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound); + k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, + info.gemmlowp_min_bound, info.gemmlowp_max_bound); _kernel = std::move(k); break; } case DataType::QASYMM8_SIGNED: { auto k = std::make_unique(); - k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound); + k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, + info.gemmlowp_min_bound, info.gemmlowp_max_bound); _kernel = std::move(k); break; } case DataType::QSYMM16: { auto k = std::make_unique(); - k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound, info.gemmlowp_max_bound); + k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound, + info.gemmlowp_max_bound); _kernel = std::move(k); break; } @@ -79,7 +86,7 @@ void CpuGemmLowpOutputStage::configure(ITensorInfo *src, ITensorInfo *bias, ITen } case GEMMLowpOutputStageType::QUANTIZE_DOWN: { - switch(info.output_data_type) + switch (info.output_data_type) { case DataType::QASYMM8: case DataType::QASYMM8_SIGNED: @@ -102,32 +109,41 @@ void CpuGemmLowpOutputStage::configure(ITensorInfo *src, ITensorInfo *bias, ITen } } -Status CpuGemmLowpOutputStage::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info) +Status CpuGemmLowpOutputStage::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::UNKNOWN, "CpuGemmLowpOutputStage cannot be used with UNKNOWN output data type."); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16); - ARM_COMPUTE_RETURN_ERROR_ON((info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN) && (info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::UNKNOWN, + "CpuGemmLowpOutputStage cannot be used with UNKNOWN output data type."); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + 
DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON((info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN) && + (info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)); - switch(info.type) + switch (info.type) { case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: { - switch(dst->data_type()) + switch (dst->data_type()) { case DataType::QASYMM8: - return kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound); + return kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate( + src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound); case DataType::QASYMM8_SIGNED: - return kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound); + return kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate( + src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound); case DataType::QSYMM16: - return kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound); + return kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate( + src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound); default: return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type."); } } case GEMMLowpOutputStageType::QUANTIZE_DOWN: { - switch(dst->data_type()) + switch (dst->data_type()) { case DataType::QASYMM8: case DataType::QASYMM8_SIGNED: @@ -146,4 +162,4 @@ void CpuGemmLowpOutputStage::run(ITensorPack &tensors) NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/operators/CpuGemmLowpOutputStage.h b/src/cpu/operators/CpuGemmLowpOutputStage.h index 39394f6b5f0562019d7753b67252bc698a5b1e3e..e5e2f41fa9440fa11a859778fcc16ce6b12910ed 100644 --- a/src/cpu/operators/CpuGemmLowpOutputStage.h +++ b/src/cpu/operators/CpuGemmLowpOutputStage.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_GEMMLOWP_OUTPUT_STAGE_H #include "arm_compute/core/Types.h" + #include "src/cpu/ICpuOperator.h" /** This file contains all available output stages for GEMMLowp. 
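The QUANTIZE_DOWN_FIXEDPOINT kernels selected in the switch above rescale the S32 accumulator to an 8/16-bit quantized value from (gemmlowp_multiplier, gemmlowp_shift, gemmlowp_offset) and the min/max bounds. A conceptual scalar model follows; it is an illustrative sketch only, assuming the common encoding real_multiplier ~= gemmlowp_multiplier * 2^-(31 + gemmlowp_shift), whereas the kernels perform the same step in pure fixed-point arithmetic with exact rounding and saturation:

#include <algorithm>
#include <cmath>
#include <cstdint>

int8_t quantize_down_fixedpoint(int32_t acc,
                                int32_t gemmlowp_multiplier,
                                int32_t gemmlowp_shift, // right shift; assumed >= 0 here
                                int32_t gemmlowp_offset,
                                int32_t min_bound,
                                int32_t max_bound)
{
    // Recover the real rescale factor encoded as a Q0.31 multiplier plus a right shift.
    const double real_multiplier = gemmlowp_multiplier * std::ldexp(1.0, -(31 + gemmlowp_shift));
    int32_t v = static_cast<int32_t>(std::lround(acc * real_multiplier)) + gemmlowp_offset;
    v = std::min(std::max(v, min_bound), max_bound); // clamp to the requested output range
    return static_cast<int8_t>(v);
}

In the operators touched by this patch, min_bound/max_bound also carry any fused activation clamp, as computed by get_quantized_asymmetric_output_min_max in the CpuMatMul changes further below.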
@@ -76,7 +77,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info); // Inherited methods overridden: void run(ITensorPack &tensors) override; diff --git a/src/cpu/operators/CpuMatMul.cpp b/src/cpu/operators/CpuMatMul.cpp index 8811a7ea6b850ed7b232a7324123220579c99774..89087129c37da0a0497d50aac29d14754578f4ac 100644 --- a/src/cpu/operators/CpuMatMul.cpp +++ b/src/cpu/operators/CpuMatMul.cpp @@ -23,14 +23,16 @@ */ #include "src/cpu/operators/CpuMatMul.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" + #include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/function_info/MatMulInfo.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/NEON/functions/NEMatMul.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -46,8 +48,11 @@ namespace cpu { namespace { -Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act, - GEMMLowpOutputStageInfo &gemmlowp_output_stage_info) +Status get_gemmlowp_output_stage_info(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const ActivationLayerInfo &act, + GEMMLowpOutputStageInfo &gemmlowp_output_stage_info) { const auto data_type = src->data_type(); const QuantizationInfo oq_info = dst->quantization_info(); @@ -59,10 +64,11 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo int32_t output_multiplier; int32_t output_shift; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); - int32_t type_min = 0; - int32_t type_max = 0; + int32_t type_min = 0; + int32_t type_max = 0; std::tie(type_min, type_max) = quantization::get_quantized_asymmetric_output_min_max(oq_info, act, data_type); gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier; @@ -77,14 +83,27 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo } // namespace CpuMatMul::CpuMatMul() - : _transpose_kernel_lhs(), _transpose_kernel_rhs(), _asm_glue(), _lhs_transposed(), _rhs_transposed(), _original_lhs_shape(), _original_rhs_shape(), _original_dst_shape() + : _transpose_kernel_lhs(), + _transpose_kernel_rhs(), + _asm_glue(), + _lhs_transposed(), + _rhs_transposed(), + _original_lhs_shape(), + _original_rhs_shape(), + _original_dst_shape() { } -Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info) +Status CpuMatMul::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) { 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->are_values_constant(), "LHS Tensor must be dynamic."); ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs->are_values_constant(), "RHS Tensor must be dynamic."); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(lhs); @@ -103,34 +122,39 @@ Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const gemm_info.fast_mode = settings.fast_math(); // Validate and then permute a/b - if(adj_lhs) + if (adj_lhs) { - auto_init_if_empty(lhs_transposed, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*lhs))); + auto_init_if_empty(lhs_transposed, + lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*lhs))); ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuTransposeKernel::validate(lhs_to_use, &lhs_transposed)); // Assign lhs_to_use pointer to use transposed TensorInfo lhs_to_use = &lhs_transposed; } - if(adj_rhs) + if (adj_rhs) { - auto_init_if_empty(rhs_transposed, rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*rhs))); + auto_init_if_empty(rhs_transposed, + rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*rhs))); ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuTransposeKernel::validate(rhs_to_use, &rhs_transposed)); // Assign rhs_to_use pointer to use transposed TensorInfo rhs_to_use = &rhs_transposed; } ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(0) != rhs_to_use->dimension(1), - "The product AB is defined only if the number of columns in A is equal to the number of rows in B (after transpose)"); + "The product AB is defined only if the number of columns in A is equal to the " + "number of rows in B (after transpose)"); // Iterate over dimensions to be collapsed in operator - check dimensions are equivalent between tensors - for(unsigned int i = 2; i < Coordinates::num_max_dimensions; i++) + for (unsigned int i = 2; i < Coordinates::num_max_dimensions; i++) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(i) != rhs_to_use->dimension(i), "Broadcasting in Batch dimension is unsupported by this operator."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(i) != rhs_to_use->dimension(i), + "Broadcasting in Batch dimension is unsupported by this operator."); } // Quantized-specific configuration - if(is_data_type_quantized(lhs->data_type())) + if (is_data_type_quantized(lhs->data_type())) { - ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(lhs_to_use, rhs_to_use, dst, gemm_info.activation_info, gemm_info.output_stage)); + ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(lhs_to_use, rhs_to_use, dst, + gemm_info.activation_info, gemm_info.output_stage)); } cpu::CpuGemmAssemblyDispatch::validate(lhs_to_use, rhs_to_use, nullptr, dst, gemm_info); @@ -138,7 +162,12 @@ Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const return Status{}; } -void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info) +void CpuMatMul::configure(ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *dst, + const MatMulInfo &info, + const 
CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, info, settings); @@ -163,21 +192,23 @@ void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, _original_rhs_shape = rhs_to_use.tensor_shape(); // Reshape lhs for use with assembly kernels. - lhs_to_use.set_tensor_shape(TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, _original_lhs_shape.collapsed_from(2).z())); - dst_to_use.set_tensor_shape(TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, _original_dst_shape.collapsed_from(2).z())); + lhs_to_use.set_tensor_shape( + TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, _original_lhs_shape.collapsed_from(2).z())); + dst_to_use.set_tensor_shape( + TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, _original_dst_shape.collapsed_from(2).z())); rhs_to_use.set_tensor_shape(_original_rhs_shape.collapsed_from(2)); // 2. Configuration for transpose of lhs/rhs // ------------------------------------------------------ // Initialise transposed TensorInfo class for aux tensors (intermediary tensors) - if(_adj_lhs) + if (_adj_lhs) { // Setup transpose LHS _transpose_kernel_lhs = std::make_unique(); _transpose_kernel_lhs->configure(&lhs_to_use, &_lhs_transposed); } - if(_adj_rhs) + if (_adj_rhs) { // Setup transpose RHS _transpose_kernel_rhs = std::make_unique(); @@ -196,20 +227,22 @@ void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, rhs_to_use = (_adj_rhs) ? _rhs_transposed : rhs_to_use; // Quantized-specific configuration - if(is_data_type_quantized(lhs->data_type())) + if (is_data_type_quantized(lhs->data_type())) { - get_gemmlowp_output_stage_info(&lhs_to_use, &rhs_to_use, &dst_to_use, _gemm_info.activation_info, _gemm_info.output_stage); + get_gemmlowp_output_stage_info(&lhs_to_use, &rhs_to_use, &dst_to_use, _gemm_info.activation_info, + _gemm_info.output_stage); } // Configure Asm Kernel _asm_glue = std::make_unique(); - _asm_glue->configure(&lhs_to_use, &rhs_to_use, nullptr, &dst_to_use, _gemm_info); // c is nullptr as bias not supported in MatMul + _asm_glue->configure(&lhs_to_use, &rhs_to_use, nullptr, &dst_to_use, + _gemm_info); // c is nullptr as bias not supported in MatMul // Specify memory requirements for intermediate tensors auto asm_mem_req = _asm_glue->workspace(); // Specify memory required by gemm kernel int idx = 0; - for(const auto &aux : asm_mem_req) + for (const auto &aux : asm_mem_req) { _aux_mem[idx] = aux; idx++; @@ -228,8 +261,12 @@ void CpuMatMul::run(ITensorPack &tensors) // Reshape LHS and DST to ensure compatibility with GEMM asm kernel (Batch dimensions is 4th for lhs and dst within asm) // Collapse RHS (necessary to support dimensions larger than 3 in gemm assembly) - lhs->info()->set_tensor_shape(TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, _original_lhs_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z - dst->info()->set_tensor_shape(TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, _original_dst_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z + lhs->info()->set_tensor_shape( + TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, + _original_lhs_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z + dst->info()->set_tensor_shape( + TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, + _original_dst_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions 
into z rhs->info()->set_tensor_shape(_original_rhs_shape.collapsed_from(2)); // Initialise object to handle stored transposed tensors in auxillary memory @@ -240,17 +277,19 @@ void CpuMatMul::run(ITensorPack &tensors) ITensorPack asm_tensors(tensors); // Run transpose lhs if necessary - if(_adj_lhs) + if (_adj_lhs) { - ITensorPack lhs_transpose_pack = { { TensorType::ACL_SRC, lhs }, { TensorType::ACL_DST, lhs_transposed.get() } }; - NEScheduler::get().schedule_op(_transpose_kernel_lhs.get(), Window::DimY, _transpose_kernel_lhs->window(), lhs_transpose_pack); + ITensorPack lhs_transpose_pack = {{TensorType::ACL_SRC, lhs}, {TensorType::ACL_DST, lhs_transposed.get()}}; + NEScheduler::get().schedule_op(_transpose_kernel_lhs.get(), Window::DimY, _transpose_kernel_lhs->window(), + lhs_transpose_pack); asm_tensors.add_const_tensor(TensorType::ACL_SRC_0, lhs_transposed.get()); } // Run transpose rhs if necessary - if(_adj_rhs) + if (_adj_rhs) { - ITensorPack rhs_transpose_pack = { { TensorType::ACL_SRC, rhs }, { TensorType::ACL_DST, rhs_transposed.get() } }; - NEScheduler::get().schedule_op(_transpose_kernel_rhs.get(), Window::DimY, _transpose_kernel_rhs->window(), rhs_transpose_pack); + ITensorPack rhs_transpose_pack = {{TensorType::ACL_SRC, rhs}, {TensorType::ACL_DST, rhs_transposed.get()}}; + NEScheduler::get().schedule_op(_transpose_kernel_rhs.get(), Window::DimY, _transpose_kernel_rhs->window(), + rhs_transpose_pack); asm_tensors.add_const_tensor(TensorType::ACL_SRC_1, rhs_transposed.get()); } // Run asm kernel diff --git a/src/cpu/operators/CpuMatMul.h b/src/cpu/operators/CpuMatMul.h index 475c019fd0fd99c915527fdeff8b37cd8962159d..2b1b4cf0ffac0fca0d108b5df49663ec6b4ab762 100644 --- a/src/cpu/operators/CpuMatMul.h +++ b/src/cpu/operators/CpuMatMul.h @@ -21,10 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ACL_SRC_CPU_OPERATORS_CPUMATMUL -#define ACL_SRC_CPU_OPERATORS_CPUMATMUL +#ifndef ACL_SRC_CPU_OPERATORS_CPUMATMUL_H +#define ACL_SRC_CPU_OPERATORS_CPUMATMUL_H #include "arm_compute/core/TensorInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" #include "src/cpu/kernels/CpuTransposeKernel.h" @@ -66,34 +67,42 @@ public: * @param[in] settings The settings for matmul operation (i.e fast math) * @param[in] act_info Class containing information about fused activation function. 
*/ - void configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *dst, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuMatMul::configure() * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulInfo &info, + const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: - void run(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: enum InternalTensorIdx { - AsmGemmWorkspace = 0, // Pre-allocate workspace tensors for CpuGemmAssemblyDispatch - PretransposeRHS, // Pre-allocate workspace tensors for CpuGemmAssemblyDispatch - TransposeLHS, + /* Slots 0 - 2 reserved for CpuGemmAssemblyDispatch */ + TransposeLHS = 3, TransposeRHS, Count }; // Define unique pointers to kernels/operators used by matmul - std::unique_ptr _transpose_kernel_lhs{ nullptr }; - std::unique_ptr _transpose_kernel_rhs{ nullptr }; - std::unique_ptr _asm_glue{ nullptr }; + std::unique_ptr _transpose_kernel_lhs{nullptr}; + std::unique_ptr _transpose_kernel_rhs{nullptr}; + std::unique_ptr _asm_glue{nullptr}; // TensorInfo for tensors stored in auxillary memory TensorInfo _lhs_transposed{}; @@ -105,13 +114,13 @@ private: TensorShape _original_dst_shape{}; // Note : adj_lhs means the same as transposing lhs - bool _adj_lhs{ false }; - bool _adj_rhs{ false }; - bool _fast_math{ false }; + bool _adj_lhs{false}; + bool _adj_rhs{false}; + bool _fast_math{false}; AsmGemmInfo _gemm_info{}; - experimental::MemoryRequirements _aux_mem{ Count }; + experimental::MemoryRequirements _aux_mem{Count}; }; -} -} +} // namespace cpu +} // namespace arm_compute -#endif /* ACL_SRC_CPU_OPERATORS_CPUMATMUL */ +#endif // ACL_SRC_CPU_OPERATORS_CPUMATMUL_H diff --git a/src/cpu/operators/CpuMaxUnpooling.cpp b/src/cpu/operators/CpuMaxUnpooling.cpp index 24e9fd6d46d0588017ae4708652f13c7dd1d553b..697fc40ab33d5f4a7c30ba6d5e9eafa17cda52bb 100644 --- a/src/cpu/operators/CpuMaxUnpooling.cpp +++ b/src/cpu/operators/CpuMaxUnpooling.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "src/cpu/operators/CpuMaxUnpooling.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h" @@ -29,7 +30,10 @@ namespace arm_compute { namespace cpu { -void CpuMaxUnpooling::configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info) +void CpuMaxUnpooling::configure(const ITensorInfo *src, + const ITensorInfo *indices, + ITensorInfo *dst, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_LOG_PARAMS(src, indices, dst, pool_info); auto k = std::make_unique(); @@ -37,9 +41,12 @@ void CpuMaxUnpooling::configure(const ITensorInfo *src, const ITensorInfo *indic _kernel = std::move(k); } -Status CpuMaxUnpooling::validate(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info) +Status CpuMaxUnpooling::validate(const ITensorInfo *src, + const ITensorInfo *indices, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info) { return kernels::CpuMaxUnpoolingLayerKernel::validate(src, indices, dst, pool_info); } -} // namesapce cpu +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuMaxUnpooling.h b/src/cpu/operators/CpuMaxUnpooling.h index aa1f1072a5ce5db957c4aedfb29a89301f0f396b..5dc00bce9e07407b86c7a17d65a7b194b082fb57 100644 --- a/src/cpu/operators/CpuMaxUnpooling.h +++ b/src/cpu/operators/CpuMaxUnpooling.h @@ -44,14 +44,18 @@ public: * @param[out] dst Destination tensor. Data types supported: Same as @p src * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. */ - void configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info); + void + configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuMaxUnpooling::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *indices, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info); }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuMul.cpp b/src/cpu/operators/CpuMul.cpp index 4c15015206b0c2a652bc6349b8e8603552e75560..ac9847111dfc76899245709d932f52dcd08b25c4 100644 --- a/src/cpu/operators/CpuMul.cpp +++ b/src/cpu/operators/CpuMul.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuMulKernel.h" @@ -33,14 +34,24 @@ namespace arm_compute { namespace cpu { -Status CpuMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, +Status CpuMul::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return kernels::CpuMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy); } -void CpuMul::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, 
RoundingPolicy rounding_policy, +void CpuMul::configure(ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); @@ -58,13 +69,19 @@ void CpuMul::run(ITensorPack &tensors) NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors); } -Status CpuComplexMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status CpuComplexMul::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return kernels::CpuComplexMulKernel::validate(src1, src2, dst); } -void CpuComplexMul::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void CpuComplexMul::configure(ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); @@ -80,4 +97,4 @@ void CpuComplexMul::run(ITensorPack &tensors) NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/operators/CpuMul.h b/src/cpu/operators/CpuMul.h index 3e0edbf0508f8805905619876be274393cb1ecb5..82b309830b9e91b6b68f40a1e2da88b07bbfdbf8 100644 --- a/src/cpu/operators/CpuMul.h +++ b/src/cpu/operators/CpuMul.h @@ -26,6 +26,7 @@ #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/cpu/ICpuOperator.h" namespace arm_compute @@ -61,7 +62,12 @@ public: * @param[in] rounding_policy Rounding policy. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, + void configure(ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -69,7 +75,12 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: @@ -89,14 +100,20 @@ public: * @param[out] dst The dst tensor. Data types supported: same as @p src1. Number of channels: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. 
*/ - void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuComplexMul::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run(ITensorPack &tensors) override; diff --git a/src/cpu/operators/CpuPermute.cpp b/src/cpu/operators/CpuPermute.cpp index babaf21b6f5ebe74f903ac18e75029017168efee..25acc92d0027e49f43d5c306127ffff3cfe6e1aa 100644 --- a/src/cpu/operators/CpuPermute.cpp +++ b/src/cpu/operators/CpuPermute.cpp @@ -23,9 +23,8 @@ */ #include "src/cpu/operators/CpuPermute.h" -#include "src/cpu/kernels/CpuPermuteKernel.h" - #include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuPermuteKernel.h" namespace arm_compute { @@ -43,5 +42,5 @@ Status CpuPermute::validate(const ITensorInfo *src, const ITensorInfo *dst, cons { return kernels::CpuPermuteKernel::validate(src, dst, perm); } -} // namesapce cpu +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuPool2d.cpp b/src/cpu/operators/CpuPool2d.cpp index 722cd36ee5f261079deb58752f33d7a0b256ef08..b72bde69782e6a3fc141baf8a0ef676427933350 100644 --- a/src/cpu/operators/CpuPool2d.cpp +++ b/src/cpu/operators/CpuPool2d.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuPool2dKernel.h" #include "src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h" @@ -53,7 +54,8 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer ARM_COMPUTE_LOG_PARAMS(src, dst, pool_info, indices); // Check if we can run assembly kernels. Currently, indices are not supported by those kernels - const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); + const bool run_optimised = + bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); // Get data layout _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? 
src->data_layout() : pool_info.data_layout; @@ -61,10 +63,11 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer // Check if we have Global Pooling Layer const unsigned int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); const unsigned int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - _is_global_pooling_layer = (src->dimension(idx_width) == pool_info.pool_size.width) && (src->dimension(idx_height) == pool_info.pool_size.height); - _use_kernel_indices = pool_info.use_kernel_indices; + _is_global_pooling_layer = (src->dimension(idx_width) == pool_info.pool_size.width) && + (src->dimension(idx_height) == pool_info.pool_size.height); + _use_kernel_indices = pool_info.use_kernel_indices; - if(run_optimised) + if (run_optimised) { const CPUInfo &ci = NEScheduler::get().cpu_info(); const unsigned int num_threads = NEScheduler::get().num_threads(); @@ -76,7 +79,7 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer // Get kernel's memory requirements constexpr size_t alignment = 4096; const size_t workspace_size = pooling_wrapper->get_working_size(num_threads); - _aux_mem[0] = MemoryInfo(TensorType::ACL_INT_0, MemoryLifetime::Temporary, workspace_size, alignment); + _aux_mem[0] = MemoryInfo(TensorType::ACL_INT_0, MemoryLifetime::Temporary, workspace_size, alignment); _asm_glue = std::move(pooling_wrapper); } @@ -89,11 +92,15 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer } } -Status CpuPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +Status CpuPool2d::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) { - const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); + const bool run_optimised = + bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); - if(run_optimised) + if (run_optimised) { return Status{}; } @@ -105,20 +112,24 @@ void CpuPool2d::run(ITensorPack &tensors) { ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No tensors provided"); - if(_asm_glue) + if (_asm_glue) { const auto hints = (_is_global_pooling_layer) ? Window::DimX : Window::DimY; NEScheduler::get().schedule_op(_asm_glue.get(), hints, _asm_glue->window(), tensors); } else { - switch(_data_layout) + switch (_data_layout) { case DataLayout::NCHW: - NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), _is_global_pooling_layer ? Window::DimZ : Window::DimY, _pooling_layer_kernel->window(), tensors); + NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), + _is_global_pooling_layer ? Window::DimZ : Window::DimY, + _pooling_layer_kernel->window(), tensors); break; case DataLayout::NHWC: - NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), (_use_kernel_indices ? Window::DimY : Window::DimX), _pooling_layer_kernel->window(), tensors); + NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), + (_use_kernel_indices ? 
Window::DimY : Window::DimX), + _pooling_layer_kernel->window(), tensors); break; default: ARM_COMPUTE_ERROR("Data layout not supported"); diff --git a/src/cpu/operators/CpuPool2d.h b/src/cpu/operators/CpuPool2d.h index 5c571db88a22adc814d6df38065f15993b708dad..ea73e3f335d3cd7b90b3420154c543c8b0064bb4 100644 --- a/src/cpu/operators/CpuPool2d.h +++ b/src/cpu/operators/CpuPool2d.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_POOL2D_H #include "arm_compute/core/experimental/Types.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" @@ -58,17 +59,21 @@ public: * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. */ - void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); + void + configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuPool2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices = nullptr); // Inherited methods overridden: - void run(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/cpu/operators/CpuPool3d.cpp b/src/cpu/operators/CpuPool3d.cpp index 14e4ac6c977823fedf2712a68c104cf3390dea2f..7fa78c1f802b46d703a3ed7729b27617918cee94 100644 --- a/src/cpu/operators/CpuPool3d.cpp +++ b/src/cpu/operators/CpuPool3d.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/runtime/Scheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuPool3dKernel.h" @@ -35,8 +36,7 @@ namespace arm_compute { namespace cpu { -CpuPool3d::CpuPool3d() - : _aux_mem(1) +CpuPool3d::CpuPool3d() : _aux_mem(1) { } @@ -70,4 +70,4 @@ experimental::MemoryRequirements CpuPool3d::workspace() const } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/operators/CpuPool3d.h b/src/cpu/operators/CpuPool3d.h index 8a73f8a0af9ac8b0e077393190340ba94e768c05..235d798095e6753d194812d069cea6a8855fc3aa 100644 --- a/src/cpu/operators/CpuPool3d.h +++ b/src/cpu/operators/CpuPool3d.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_POOL3D_H #include "arm_compute/core/experimental/Types.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" @@ -61,7 +62,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info); // Inherited methods overridden: - void run(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/cpu/operators/CpuQuantize.cpp b/src/cpu/operators/CpuQuantize.cpp index f9e14d1f88cd8f40b0a41cf1c78359799941b439..4315499c3978787f9ad4dfaf572ec8e025fe3896 100644 --- a/src/cpu/operators/CpuQuantize.cpp +++ b/src/cpu/operators/CpuQuantize.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include 
"arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuQuantizeKernel.h" diff --git a/src/cpu/operators/CpuReshape.cpp b/src/cpu/operators/CpuReshape.cpp index 79e7b8fe6e0b30fb31a0f7f7bddf4d90f9fc9fd2..a423abb49aab292fd559a8f4d81c19f0bcbdeb15 100644 --- a/src/cpu/operators/CpuReshape.cpp +++ b/src/cpu/operators/CpuReshape.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,9 +23,10 @@ */ #include "src/cpu/operators/CpuReshape.h" -#include "src/cpu/kernels/CpuReshapeKernel.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" #include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuReshapeKernel.h" namespace arm_compute { @@ -43,5 +44,17 @@ Status CpuReshape::validate(const ITensorInfo *src, const ITensorInfo *dst) { return kernels::CpuReshapeKernel::validate(src, dst); } + +void CpuReshape::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + if (!_is_prepared) + { + static_cast(_kernel.get())->prepare(tensors); + _is_prepared = true; + } + const auto split_dimension = static_cast(_kernel.get())->get_split_dimension(); + NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors); +} } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuReshape.h b/src/cpu/operators/CpuReshape.h index 92dcb09aa9fa418b657f7096a06c415826e4874f..33da792319bef8ea62cbccdba3a12ccf23ff50ac 100644 --- a/src/cpu/operators/CpuReshape.h +++ b/src/cpu/operators/CpuReshape.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,6 +24,8 @@ #ifndef ARM_COMPUTE_CPU_RESHAPE_H #define ARM_COMPUTE_CPU_RESHAPE_H +#include "arm_compute/core/Window.h" + #include "src/cpu/ICpuOperator.h" namespace arm_compute @@ -47,6 +49,12 @@ public: * @return a status */ static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + +private: + bool _is_prepared{false}; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuScale.cpp b/src/cpu/operators/CpuScale.cpp index 8a712bf088a28489c7a93852643952ef41954639..7df92969312ef66f973438ffd567d5bbddbfc2cc 100644 --- a/src/cpu/operators/CpuScale.cpp +++ b/src/cpu/operators/CpuScale.cpp @@ -24,8 +24,9 @@ #include "src/cpu/operators/CpuScale.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/utils/ScaleUtils.h" #include "src/cpu/kernels/CpuScaleKernel.h" @@ -37,11 +38,12 @@ namespace cpu { namespace { -void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, SamplingPolicy sampling_policy, bool align_corners) +void precompute_dx_dy_offsets( + ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, SamplingPolicy sampling_policy, bool align_corners) { ARM_COMPUTE_ERROR_ON(offsets == nullptr); float sampling_offset = 0.0f; - if(sampling_policy == SamplingPolicy::CENTER) + if (sampling_policy == SamplingPolicy::CENTER) { sampling_offset = 0.5f; } @@ -50,38 +52,44 @@ void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float win.set(Window::DimX, Window::Dimension(0, offsets->info()->dimension(0), 1)); 
win.set(Window::DimY, Window::Dimension(0, offsets->info()->dimension(1), 1)); - if(dx != nullptr && dy != nullptr) + if (dx != nullptr && dy != nullptr) { // Pre-compute the offset and pixel's distance for BILINEAR interpolation Iterator offsets_it(offsets, win); Iterator dx_it(dx, win); Iterator dy_it(dy, win); - execute_window_loop(win, [&](const Coordinates & id) - { - const float in_x = (id.x() + sampling_offset) * wr - sampling_offset; - const float in_y = (id.y() + sampling_offset) * hr - sampling_offset; - const int in_xi = std::floor(in_x); - const int in_yi = std::floor(in_y); + execute_window_loop( + win, + [&](const Coordinates &id) + { + const float in_x = (id.x() + sampling_offset) * wr - sampling_offset; + const float in_y = (id.y() + sampling_offset) * hr - sampling_offset; + const int in_xi = std::floor(in_x); + const int in_yi = std::floor(in_y); - *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi; - *reinterpret_cast<float *>(dx_it.ptr()) = in_x - in_xi; - *reinterpret_cast<float *>(dy_it.ptr()) = in_y - in_yi; - }, - offsets_it, dx_it, dy_it); + *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi; + *reinterpret_cast<float *>(dx_it.ptr()) = in_x - in_xi; + *reinterpret_cast<float *>(dy_it.ptr()) = in_y - in_yi; + }, + offsets_it, dx_it, dy_it); } else { // Pre-compute the offset for NEAREST interpolation Iterator offsets_it(offsets, win); - execute_window_loop(win, [&](const Coordinates & id) - { - const float float_in_xi = (id.x() + sampling_offset) * wr; - const auto in_xi = static_cast<int32_t>(align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi) : std::floor(float_in_xi)); - *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi; - }, - offsets_it); + execute_window_loop( + win, + [&](const Coordinates &id) + { + const float float_in_xi = (id.x() + sampling_offset) * wr; + const auto in_xi = static_cast<int32_t>( + align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi) + : std::floor(float_in_xi)); + *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi; + }, + offsets_it); } } } // namespace @@ -96,20 +104,24 @@ void CpuScale::configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelIn _is_prepared = false; // Get data layout and width/height indices - _data_layout = _scale_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : _scale_info.data_layout; - const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + _data_layout = _scale_info.data_layout == DataLayout::UNKNOWN ? 
src->data_layout() : _scale_info.data_layout; + const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); // Compute the ratio between source width/height and destination width/height - const bool is_align_corners_used = _scale_info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy); - const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), is_align_corners_used); - const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), is_align_corners_used); + const bool is_align_corners_used = + _scale_info.align_corners && + arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy); + const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), + dst->dimension(idx_width), is_align_corners_used); + const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), + dst->dimension(idx_height), is_align_corners_used); // Area interpolation behaves as Nearest Neighbour in case of up-sampling - InterpolationPolicy policy_to_use = (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f - && hr <= 1.f) ? - InterpolationPolicy::NEAREST_NEIGHBOR : - _scale_info.interpolation_policy; + InterpolationPolicy policy_to_use = + (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) + ? InterpolationPolicy::NEAREST_NEIGHBOR + : _scale_info.interpolation_policy; // Get the tensor shape TensorShape shape(dst->dimension(idx_width)); @@ -122,7 +134,7 @@ void CpuScale::configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelIn auto dy = std::make_unique(tensor_info_dxdy); auto offsets = std::make_unique(tensor_info_offsets); auto scale_kernel = std::make_unique(); - switch(policy_to_use) + switch (policy_to_use) { case InterpolationPolicy::NEAREST_NEIGHBOR: { @@ -148,7 +160,8 @@ void CpuScale::configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelIn Status CpuScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && info.sampling_policy != SamplingPolicy::TOP_LEFT); + ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && + info.sampling_policy != SamplingPolicy::TOP_LEFT); ITensorInfo *offsets = nullptr; ITensorInfo *dx = nullptr; @@ -160,19 +173,25 @@ Status CpuScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); // Compute the ratio between source width/height and destination width/height - const bool is_align_corners_used = info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy); - const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), is_align_corners_used); - const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), is_align_corners_used); + const bool is_align_corners_used = + info.align_corners && 
arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy); + const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), + dst->dimension(idx_width), is_align_corners_used); + const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), + dst->dimension(idx_height), is_align_corners_used); // Area interpolation behaves as Nearest Neighbour in case of up-sampling - InterpolationPolicy policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : info.interpolation_policy; + InterpolationPolicy policy_to_use = + (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) + ? InterpolationPolicy::NEAREST_NEIGHBOR + : info.interpolation_policy; // Get the tensor shape of auxilary buffers const TensorShape shape(dst->dimension(idx_width), dst->dimension(idx_height)); TensorInfo tensor_info_offsets(shape, Format::S32); TensorInfo tensor_info_dx(shape, Format::F32); TensorInfo tensor_info_dy(shape, Format::F32); - switch(policy_to_use) + switch (policy_to_use) { case InterpolationPolicy::NEAREST_NEIGHBOR: offsets = &tensor_info_offsets; @@ -186,13 +205,14 @@ Status CpuScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const break; } - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuScaleKernel::validate(src->clone().get(), dx, dy, offsets, dst->clone().get(), info)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuScaleKernel::validate(src->clone().get(), dx, dy, offsets, dst->clone().get(), info)); return Status{}; } void CpuScale::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { _is_prepared = true; const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); @@ -206,22 +226,27 @@ void CpuScale::prepare(ITensorPack &tensors) const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); // Compute the ratio between source width/height and destination width/height - const bool is_align_corners_used = _scale_info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy); - const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->info()->dimension(idx_width), dst->info()->dimension(idx_width), is_align_corners_used); - const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), is_align_corners_used); + const bool is_align_corners_used = + _scale_info.align_corners && + arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy); + const auto wr = arm_compute::scale_utils::calculate_resize_ratio( + src->info()->dimension(idx_width), dst->info()->dimension(idx_width), is_align_corners_used); + const auto hr = arm_compute::scale_utils::calculate_resize_ratio( + src->info()->dimension(idx_height), dst->info()->dimension(idx_height), is_align_corners_used); // Area interpolation behaves as Nearest Neighbour in case of up-sampling - InterpolationPolicy policy_to_use = (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f - && hr <= 1.f) ? - InterpolationPolicy::NEAREST_NEIGHBOR : - _scale_info.interpolation_policy; + InterpolationPolicy policy_to_use = + (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) + ? 
InterpolationPolicy::NEAREST_NEIGHBOR + : _scale_info.interpolation_policy; const SamplingPolicy sampling_policy = _scale_info.sampling_policy; - bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required(_data_layout, src->info()->data_type(), policy_to_use, _scale_info.border_mode); + bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required( + _data_layout, src->info()->data_type(), policy_to_use, _scale_info.border_mode); - if(precompute_indices_weights) + if (precompute_indices_weights) { - switch(policy_to_use) + switch (policy_to_use) { case InterpolationPolicy::NEAREST_NEIGHBOR: { @@ -245,7 +270,8 @@ void CpuScale::prepare(ITensorPack &tensors) } else { - if(policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR && policy_to_use != InterpolationPolicy::BILINEAR && policy_to_use != InterpolationPolicy::AREA) + if (policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR && + policy_to_use != InterpolationPolicy::BILINEAR && policy_to_use != InterpolationPolicy::AREA) { ARM_COMPUTE_ERROR("Unsupported interpolation mode"); } diff --git a/src/cpu/operators/CpuScale.h b/src/cpu/operators/CpuScale.h index ee7c523bad6ce4b14394e5e18acce00f739ed2be..c12a8e733a5dfc474863a03353e9abe46c55d3cb 100644 --- a/src/cpu/operators/CpuScale.h +++ b/src/cpu/operators/CpuScale.h @@ -24,9 +24,10 @@ #ifndef ARM_COMPUTE_CPU_SCALE_H #define ARM_COMPUTE_CPU_SCALE_H +#include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/experimental/Types.h" + #include "src/cpu/ICpuKernel.h" #include "src/cpu/ICpuOperator.h" @@ -62,9 +63,9 @@ public: void run(ITensorPack &tensors) override; private: - ScaleKernelInfo _scale_info{ InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED }; - DataLayout _data_layout{ DataLayout::UNKNOWN }; - bool _is_prepared{ false }; + ScaleKernelInfo _scale_info{InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED}; + DataLayout _data_layout{DataLayout::UNKNOWN}; + bool _is_prepared{false}; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuSoftmax.cpp b/src/cpu/operators/CpuSoftmax.cpp index bf4c2fa3a266683b10548cd7cfd9b25baf9e3586..e55d7f903e046556e87b56ad32782df4ba067c20 100644 --- a/src/cpu/operators/CpuSoftmax.cpp +++ b/src/cpu/operators/CpuSoftmax.cpp @@ -25,9 +25,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/core/helpers/SoftmaxHelpers.h" @@ -63,13 +64,15 @@ void CpuSoftmaxGeneric::configure(const ITensorInfo *src, ITensorInfo *d ARM_COMPUTE_ERROR_THROW_ON(CpuSoftmaxGeneric::validate(src, dst, beta, axis)); ARM_COMPUTE_LOG_PARAMS(src, dst, beta, axis); - const unsigned int actual_axis = static_cast(wrap_around(axis, static_cast(src->num_dimensions()))); + const unsigned int actual_axis = + static_cast(wrap_around(axis, static_cast(src->num_dimensions()))); _needs_permute = actual_axis > 0; - if(_needs_permute) + if (_needs_permute) { - _permute_input.configure(src, &_input_permuted, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis)); + _permute_input.configure(src, &_input_permuted, + 
softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis)); } // We want to deal with a 2D input. Either it is the permuted version of the original input (4D case) @@ -79,10 +82,11 @@ void CpuSoftmaxGeneric::configure(const ITensorInfo *src, ITensorInfo *d // Create intermediate tensors shapes TensorShape max_sum_shape = tmp_input->tensor_shape(); max_sum_shape.set(0, 1); - const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true); - DataType tmp_data_type = is_data_type_quantized_asymmetric(tmp_input->data_type()) ? DataType::F32 : tmp_input->data_type(); - TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type)); - TensorInfo max_info(tmp_input->clone()->set_tensor_shape(max_sum_shape)); + const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true); + DataType tmp_data_type = + is_data_type_quantized_asymmetric(tmp_input->data_type()) ? DataType::F32 : tmp_input->data_type(); + TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type)); + TensorInfo max_info(tmp_input->clone()->set_tensor_shape(max_sum_shape)); // Init intermediate tensors _max = TensorInfo(max_info); @@ -94,13 +98,14 @@ void CpuSoftmaxGeneric::configure(const ITensorInfo *src, ITensorInfo *d _max_kernel = std::move(mk); auto sm = std::make_unique>(); - if(_needs_permute) + if (_needs_permute) { // The normalization kernel stores the result in a permuted output tensor sm->configure(tmp_input, &_max, &_output_permuted, beta, &_tmp); // Re-permute the permuted output into the requested (4D) output - _permute_output.configure(&_output_permuted, dst, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis)); + _permute_output.configure(&_output_permuted, dst, + softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis)); } else { @@ -109,11 +114,15 @@ void CpuSoftmaxGeneric::configure(const ITensorInfo *src, ITensorInfo *d } _softmax_kernel = std::move(sm); - _aux_mem[InternalTensorIdx::MAX] = MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max.total_size()); - _aux_mem[InternalTensorIdx::TMP] = MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size()); + _aux_mem[InternalTensorIdx::MAX] = + MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max.total_size()); + _aux_mem[InternalTensorIdx::TMP] = + MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size()); - _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), MemoryLifetime::Temporary, _input_permuted.total_size()); - _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), MemoryLifetime::Temporary, _output_permuted.total_size()); + _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), + MemoryLifetime::Temporary, _input_permuted.total_size()); + _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), + MemoryLifetime::Temporary, _output_permuted.total_size()); } template @@ -123,7 +132,8 @@ Status CpuSoftmaxGeneric::validate(const ITensorInfo *src, const ITensor ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 4, "Only up to 4 dimensions are supported"); ARM_COMPUTE_UNUSED(beta); - ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast(-src->num_dimensions()) || 
static_cast(src->num_dimensions()) <= axis); + ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast(-src->num_dimensions()) || + static_cast(src->num_dimensions()) <= axis); // Create intermediate tensor info DataType tmp_data_type = src->data_type(); @@ -131,25 +141,33 @@ Status CpuSoftmaxGeneric::validate(const ITensorInfo *src, const ITensor TensorShape max_sum_shape = src->tensor_shape(); max_sum_shape.set(0, 1); - const TensorInfo tensor_info_max_sum(src->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(src->quantization_info()).set_is_resizable(true)); + const TensorInfo tensor_info_max_sum(src->clone() + ->set_tensor_shape(max_sum_shape) + .set_data_type(tmp_data_type) + .set_quantization_info(src->quantization_info()) + .set_is_resizable(true)); const TensorInfo dont_care; - const unsigned int actual_axis = static_cast(wrap_around(axis, static_cast(src->num_dimensions()))); + const unsigned int actual_axis = + static_cast(wrap_around(axis, static_cast(src->num_dimensions()))); const bool needs_permute = actual_axis > 0; - if(needs_permute) + if (needs_permute) { - const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); - const TensorShape permuted_shape = misc::shape_calculator::compute_permutation_output_shape(*src, permutation_vector); - TensorInfo input_permuted(src->clone()->set_tensor_shape(permuted_shape)); + const PermutationVector permutation_vector = + softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); + const TensorShape permuted_shape = + misc::shape_calculator::compute_permutation_output_shape(*src, permutation_vector); + TensorInfo input_permuted(src->clone()->set_tensor_shape(permuted_shape)); ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &input_permuted, permutation_vector)); TensorInfo output_permuted(dst->clone()->set_tensor_shape(permuted_shape)); ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&output_permuted, dst, permutation_vector)); } ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DMaxKernel::validate(src, &tensor_info_max_sum)); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DSoftmaxKernel::validate(&tensor_info_tmp, &tensor_info_max_sum, dst, beta, &dont_care)); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DSoftmaxKernel::validate( + &tensor_info_tmp, &tensor_info_max_sum, dst, beta, &dont_care)); return Status{}; } @@ -166,43 +184,38 @@ void CpuSoftmaxGeneric::run(ITensorPack &tensors) CpuAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max, tensors, true); CpuAuxTensorHandler input_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _input_permuted, tensors, true); - CpuAuxTensorHandler output_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _output_permuted, tensors, true); + CpuAuxTensorHandler output_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _output_permuted, tensors, + true); ITensorPack max_pack; ITensorPack softmax_pack; - if(_needs_permute) + if (_needs_permute) { - ITensorPack permute_in_pack = { { TensorType::ACL_SRC, src }, { TensorType::ACL_DST, input_permuted.get() } }; + ITensorPack permute_in_pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, input_permuted.get()}}; _permute_input.run(permute_in_pack); - max_pack = { { TensorType::ACL_SRC, input_permuted.get() }, { TensorType::ACL_DST, max.get() } }; + max_pack = {{TensorType::ACL_SRC, input_permuted.get()}, {TensorType::ACL_DST, max.get()}}; - softmax_pack = - { - { TensorType::ACL_SRC_0, 
input_permuted.get() }, - { TensorType::ACL_SRC_1, max.get() }, - { TensorType::ACL_DST_0, output_permuted.get() }, - { TensorType::ACL_DST_1, tmp.get() } - }; + softmax_pack = {{TensorType::ACL_SRC_0, input_permuted.get()}, + {TensorType::ACL_SRC_1, max.get()}, + {TensorType::ACL_DST_0, output_permuted.get()}, + {TensorType::ACL_DST_1, tmp.get()}}; } else { - max_pack = { { TensorType::ACL_SRC, src }, { TensorType::ACL_DST, max.get() } }; - - softmax_pack = - { - { TensorType::ACL_SRC_0, src }, - { TensorType::ACL_SRC_1, max.get() }, - { TensorType::ACL_DST_0, dst }, - { TensorType::ACL_DST_1, tmp.get() } - }; + max_pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, max.get()}}; + + softmax_pack = {{TensorType::ACL_SRC_0, src}, + {TensorType::ACL_SRC_1, max.get()}, + {TensorType::ACL_DST_0, dst}, + {TensorType::ACL_DST_1, tmp.get()}}; } NEScheduler::get().schedule_op(_max_kernel.get(), Window::DimY, _max_kernel->window(), max_pack); NEScheduler::get().schedule_op(_softmax_kernel.get(), Window::DimY, _softmax_kernel->window(), softmax_pack); - if(_needs_permute) + if (_needs_permute) { ITensorPack permute_out_pack; permute_out_pack.add_tensor(TensorType::ACL_SRC, output_permuted.get()); @@ -211,7 +224,7 @@ void CpuSoftmaxGeneric::run(ITensorPack &tensors) } } -template +template experimental::MemoryRequirements CpuSoftmaxGeneric::workspace() const { return _aux_mem; diff --git a/src/cpu/operators/CpuSoftmax.h b/src/cpu/operators/CpuSoftmax.h index 64df8704f9e713058117d91851c64c89b9b51afd..8cab70e14f32b69e82650beba4513e4541df1ea9 100644 --- a/src/cpu/operators/CpuSoftmax.h +++ b/src/cpu/operators/CpuSoftmax.h @@ -24,11 +24,13 @@ #ifndef ARM_COMPUTE_CPU_SOFTMAX_H #define ARM_COMPUTE_CPU_SOFTMAX_H -#include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/TensorInfo.h" + #include "src/cpu/ICpuKernel.h" #include "src/cpu/ICpuOperator.h" #include "src/cpu/operators/CpuPermute.h" + #include namespace arm_compute @@ -77,7 +79,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0); // Inherited methods overridden: - void run(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/cpu/operators/CpuSub.cpp b/src/cpu/operators/CpuSub.cpp index 91a5b6e63c9f48f42c8f9f13b5828a8767093a92..7d27efbc96e15a9c568ae909f1db9e4d7d42a056 100644 --- a/src/cpu/operators/CpuSub.cpp +++ b/src/cpu/operators/CpuSub.cpp @@ -23,17 +23,20 @@ */ #include "src/cpu/operators/CpuSub.h" -#include "src/cpu/kernels/CpuSubKernel.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" #include "src/common/utils/Log.h" - -#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/cpu/kernels/CpuSubKernel.h" namespace arm_compute { namespace cpu { -void CpuSub::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info) +void CpuSub::configure(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); ARM_COMPUTE_LOG_PARAMS(src0, src1, dst, policy); @@ -42,7 +45,11 @@ void CpuSub::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensor _kernel = std::move(k); } -Status CpuSub::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const 
ActivationLayerInfo &act_info) +Status CpuSub::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return kernels::CpuSubKernel::validate(src0, src1, dst, policy); diff --git a/src/cpu/operators/CpuSub.h b/src/cpu/operators/CpuSub.h index 88908637aa1dcec46a54a4b8315649947db48e6b..d1782a1d3c5d39d0aac228c33745f9e1f06d8e58 100644 --- a/src/cpu/operators/CpuSub.h +++ b/src/cpu/operators/CpuSub.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_SUB_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/cpu/ICpuOperator.h" namespace arm_compute @@ -53,14 +54,22 @@ public: * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuSub::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run(ITensorPack &tensors) override; diff --git a/src/cpu/operators/CpuTranspose.cpp b/src/cpu/operators/CpuTranspose.cpp index 4e7854fd6ea6812fc44ae21aee9207e7a14f5898..ea548e05114e3a5b6c9ef8cbbedaba5fde015492 100644 --- a/src/cpu/operators/CpuTranspose.cpp +++ b/src/cpu/operators/CpuTranspose.cpp @@ -23,9 +23,8 @@ */ #include "src/cpu/operators/CpuTranspose.h" -#include "src/cpu/kernels/CpuTransposeKernel.h" - #include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuTransposeKernel.h" namespace arm_compute { @@ -43,5 +42,5 @@ Status CpuTranspose::validate(const ITensorInfo *src, const ITensorInfo *dst) { return kernels::CpuTransposeKernel::validate(src, dst); } -} // namesapce cpu +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuWinogradConv2d.cpp b/src/cpu/operators/CpuWinogradConv2d.cpp index c4edd89964136952c09b3488011ea02766879c58..e4bcdc0b64fc43f5a70aff9a920028bf3929cde4 100644 --- a/src/cpu/operators/CpuWinogradConv2d.cpp +++ b/src/cpu/operators/CpuWinogradConv2d.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,23 +22,25 @@ * SOFTWARE. 
*/ #include "src/cpu/operators/CpuWinogradConv2d.h" + #include "arm_compute/core/Error.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/FunctionDescriptors.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/CPP/Validate.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/core/helpers/WindowHelpers.h" #include "src/core/NEON/kernels/assembly/winograd.hpp" #include "src/core/NEON/kernels/convolution/common/tensor.hpp" #include "src/core/NEON/kernels/convolution/common/utils.hpp" -#include "src/core/helpers/MemoryHelpers.h" -#include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/AssemblyUtils.h" -#include "src/cpu/kernels/CpuWinogradConv2dKernel.h" #include "src/cpu/kernels/assembly/arm_gemm.hpp" +#include "src/cpu/kernels/CpuWinogradConv2dKernel.h" #include "src/cpu/operators/CpuActivation.h" #include "src/cpu/operators/CpuPermute.h" #include "src/cpu/utils/CpuAuxTensorHandler.h" @@ -56,21 +58,26 @@ namespace inline Tensor4DShape internal_get_shape(const ITensorInfo *in) { const DataLayout data_layout = in->data_layout(); - const int in_width = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)); - const int in_height = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT)); - const int in_channels = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)); - const int in_batches = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES)); + const int in_width = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)); + const int in_height = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT)); + const int in_channels = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)); + const int in_batches = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES)); - return Tensor4DShape{ in_batches, in_height, in_width, in_channels }; + return Tensor4DShape{in_batches, in_height, in_width, in_channels}; } -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info) { ARM_COMPUTE_UNUSED(dst, weights); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd layer only supports unit strides."); - if(biases != nullptr) + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, + "Winograd layer only supports unit strides."); + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases); ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); @@ -80,43 +87,46 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co return Status{}; } -bool get_winograd_kernel_implementation(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, - const PadStrideInfo 
&conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math, - arm_conv::winograd::WinogradImpl *winograd_impl, std::unique_ptr &conv_args) +bool get_winograd_kernel_implementation(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + arm_conv::winograd::WinogradImpl *winograd_impl, + std::unique_ptr &conv_args) { arm_conv::winograd::WinogradConfig winograd_cfg; arm_gemm::GemmConfig cfg; const DataType data_type = src->data_type(); - Tensor4DShape in_shape{ internal_get_shape(src) }; - Tensor4DShape out_shape{ internal_get_shape(dst) }; - Tensor4DShape kernel_shape{ internal_get_shape(weights) }; + Tensor4DShape in_shape{internal_get_shape(src)}; + Tensor4DShape out_shape{internal_get_shape(dst)}; + Tensor4DShape kernel_shape{internal_get_shape(weights)}; uint32_t nthreads = NEScheduler::get().num_threads(); // Get configuration arguments for Winograd winograd_cfg.output_rows = 0; winograd_cfg.output_cols = 0; conv_args = std::make_unique( - in_shape.n_batches, - arm_conv::Shape2D{ static_cast(in_shape.n_rows), static_cast(in_shape.n_cols) }, - in_shape.n_channels, - conv_info.pad_top(), - conv_info.pad_left(), - arm_conv::Shape2D{ static_cast(out_shape.n_rows), static_cast(out_shape.n_cols) }, - out_shape.n_channels, - arm_conv::Shape2D{ static_cast(kernel_shape.n_rows), static_cast(kernel_shape.n_cols) }, - assembly_utils::map_to_arm_gemm_activation(act_info)); + in_shape.n_batches, + arm_conv::Shape2D{static_cast(in_shape.n_rows), static_cast(in_shape.n_cols)}, + in_shape.n_channels, conv_info.pad_top(), conv_info.pad_left(), + arm_conv::Shape2D{static_cast(out_shape.n_rows), static_cast(out_shape.n_cols)}, + out_shape.n_channels, + arm_conv::Shape2D{static_cast(kernel_shape.n_rows), static_cast(kernel_shape.n_cols)}, + assembly_utils::map_to_arm_gemm_activation(act_info)); bool success = false; - if(data_type == DataType::F32) + if (data_type == DataType::F32) { - success = arm_conv::winograd::get_implementation( - *winograd_impl, &CPUInfo::get(), *conv_args, nthreads, enable_fast_math, &winograd_cfg, nullptr); + success = arm_conv::winograd::get_implementation(*winograd_impl, &CPUInfo::get(), *conv_args, nthreads, + enable_fast_math, &winograd_cfg, nullptr); } #if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - else if(data_type == DataType::F16) + else if (data_type == DataType::F16) { - success = arm_conv::winograd::get_implementation<__fp16>( - *winograd_impl, &CPUInfo::get(), *conv_args, nthreads, enable_fast_math, &winograd_cfg, nullptr); + success = arm_conv::winograd::get_implementation<__fp16>(*winograd_impl, &CPUInfo::get(), *conv_args, nthreads, + enable_fast_math, &winograd_cfg, nullptr); } #endif // defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) else @@ -127,7 +137,8 @@ bool get_winograd_kernel_implementation(const ITensorInfo *src, const ITensorInf } inline bool fuse_function_supported(const ActivationLayerInfo &act_info) { - return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU || act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU; + return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU || + act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU; } } // namespace @@ -141,7 +152,7 @@ CpuWinogradConv2d::CpuWinogradConv2d() _permute_output(std::make_unique()), 
_permute_weights(std::make_unique()), _aux_mem(AuxTensorIdx::Count), - _conv_args{ nullptr }, + _conv_args{nullptr}, _winograd_impl{}, _data_layout(), _winograd_transformed_input{}, @@ -152,15 +163,20 @@ CpuWinogradConv2d::CpuWinogradConv2d() _weights_hwio(), _input_nhwc(), _output_nhwc(), - _is_prepared{ false }, - _run_activation{ false } + _is_prepared{false}, + _run_activation{false} { } CpuWinogradConv2d::~CpuWinogradConv2d() = default; -void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math) +void CpuWinogradConv2d::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_ERROR_THROW_ON(validate(src, weights, biases, dst, conv_info, act_info, enable_fast_math)); @@ -169,21 +185,29 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei const DataType data_type = src->data_type(); uint32_t nthreads = NEScheduler::get().num_threads(); _data_layout = src->data_layout(); - const Tensor4DShape kernel_shape{ internal_get_shape(weights) }; - - bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math, &_winograd_impl, _conv_args); - - ARM_COMPUTE_EXIT_ON_MSG_VAR(!success, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows, kernel_shape.n_cols); - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n", _winograd_impl.input_transform->get_name().c_str()); - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n", _winograd_impl.input_transform->get_name().c_str()); - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n", _winograd_impl.input_transform->get_name().c_str()); - - const bool has_impl = ((_winograd_impl.input_transform != nullptr) && (_winograd_impl.output_transform != nullptr) && (_winograd_impl.gemm_args != nullptr)); - if(has_impl) + const Tensor4DShape kernel_shape{internal_get_shape(weights)}; + + bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math, + &_winograd_impl, _conv_args); + + ARM_COMPUTE_EXIT_ON_MSG_VAR(!success, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows, + kernel_shape.n_cols); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n", + _winograd_impl.input_transform->get_name().c_str()); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n", + _winograd_impl.input_transform->get_name().c_str()); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n", + _winograd_impl.input_transform->get_name().c_str()); + + const bool has_impl = ((_winograd_impl.input_transform != nullptr) && + (_winograd_impl.output_transform != nullptr) && (_winograd_impl.gemm_args != nullptr)); + if (has_impl) { // Determine how much working space is required, allocate it. 
- const size_t input_workspace_size = _winograd_impl.input_transform->get_working_space_size(*_conv_args, nthreads); - const size_t output_workspace_size = _winograd_impl.output_transform->get_working_space_size(*_conv_args, nthreads); + const size_t input_workspace_size = + _winograd_impl.input_transform->get_working_space_size(*_conv_args, nthreads); + const size_t output_workspace_size = + _winograd_impl.output_transform->get_working_space_size(*_conv_args, nthreads); TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, DataType::U8); TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, DataType::U8); @@ -232,7 +256,7 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei PermutationVector weights_permutation_vector(3U, 0U, 1U, 2U); // Configure the kernel to transform the input tensor from NCHW -> NHWC - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { _permute_input->configure(src, &_input_nhwc, PermutationVector(2U, 0U, 1U)); weights_permutation_vector = PermutationVector(3U, 2U, 0U, 1U); @@ -242,74 +266,90 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei _permute_weights->configure(weights, &_weights_hwio, weights_permutation_vector); // Reorder the convoluted output to ACL's ordering NCHW - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output() - TensorInfo info(TensorShape(dst->dimension(2), dst->dimension(0), - dst->dimension(1), dst->dimension(3)), - 1, dst->data_type()); + TensorInfo info(TensorShape(dst->dimension(2), dst->dimension(0), dst->dimension(1), dst->dimension(3)), 1, + dst->data_type()); _output_nhwc = info; _permute_output->configure(&_output_nhwc, dst, PermutationVector(1U, 2U, 0U)); } // Configure input transform kernel - _transform_input_kernel = std::make_unique(_winograd_impl, *_conv_args, nthreads); + _transform_input_kernel = + std::make_unique(_winograd_impl, *_conv_args, nthreads); // Configure GEMM function - _gemm_function->configure(&_winograd_transformed_input, &_winograd_transformed_weights, nullptr, &_winograd_transformed_output, 1.0f, 0.f); + _gemm_function->configure(&_winograd_transformed_input, &_winograd_transformed_weights, nullptr, + &_winograd_transformed_output, 1.0f, 0.f); // Configure output transform kernel - _transform_output_kernel = std::make_unique(_winograd_impl, *_conv_args, nthreads); + _transform_output_kernel = + std::make_unique(_winograd_impl, *_conv_args, nthreads); //Configure Activation Layer _run_activation = act_info.enabled() && !fuse_function_supported(act_info); - if(_run_activation) + if (_run_activation) { _activation_func->configure(dst, nullptr, act_info); } - auto asm_mem_req = _gemm_function->workspace(); - _aux_mem[GemmWorkspace] = asm_mem_req[GemmWorkspace]; - _aux_mem[Pretranspose] = asm_mem_req[Pretranspose]; - _aux_mem[InterleavedLHS] = asm_mem_req[InterleavedLHS]; - _aux_mem[TransposedRHS] = asm_mem_req[TransposedRHS]; - _aux_mem[TempResult] = asm_mem_req[TempResult]; + const auto mm_mem_req = _gemm_function->workspace(); + for (unsigned int slot = 0; slot < mm_mem_req.size(); ++slot) + { + _aux_mem[slot] = mm_mem_req[slot]; + } // Request temporary memory. Overlap memory needed for Input/Output transformations as they run on different non-overlapping time-steps. 
- _aux_mem[TransformedInput] = MemoryInfo(offset_int_vec(TransformedInput), MemoryLifetime::Temporary, wds.input_matrix_size_bytes, storage_alignment); - _aux_mem[TransformedOutput] = MemoryInfo(offset_int_vec(TransformedOutput), MemoryLifetime::Temporary, wds.output_matrix_size_bytes, storage_alignment); - _aux_mem[WorkspaceIO] = MemoryInfo(offset_int_vec(WorkspaceIO), MemoryLifetime::Temporary, std::max(input_workspace_size, output_workspace_size)); - _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size()); - _aux_mem[TransformedWeights] = MemoryInfo(offset_int_vec(TransformedWeights), MemoryLifetime::Persistent, wds.weight_matrix_size_bytes, storage_alignment); - if(_data_layout == DataLayout::NCHW) + _aux_mem[TransformedInput] = MemoryInfo(offset_int_vec(TransformedInput), MemoryLifetime::Temporary, + wds.input_matrix_size_bytes, storage_alignment); + _aux_mem[TransformedOutput] = MemoryInfo(offset_int_vec(TransformedOutput), MemoryLifetime::Temporary, + wds.output_matrix_size_bytes, storage_alignment); + _aux_mem[WorkspaceIO] = MemoryInfo(offset_int_vec(WorkspaceIO), MemoryLifetime::Temporary, + std::max(input_workspace_size, output_workspace_size)); + _aux_mem[PermutedWeights] = + MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size()); + _aux_mem[TransformedWeights] = MemoryInfo(offset_int_vec(TransformedWeights), MemoryLifetime::Persistent, + wds.weight_matrix_size_bytes, storage_alignment); + if (_data_layout == DataLayout::NCHW) { _aux_mem[PermutedInput].merge(offset_int_vec(PermutedInput), src->total_size()); _aux_mem[PermutedOutput].merge(offset_int_vec(PermutedOutput), dst->total_size()); } } } -Status CpuWinogradConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math) +Status CpuWinogradConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info)); // Disable winograd for fp16 if fast math is false. 
- if(!enable_fast_math) + if (!enable_fast_math) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); } - const Tensor4DShape kernel_shape{ internal_get_shape(weights) }; + const Tensor4DShape kernel_shape{internal_get_shape(weights)}; arm_conv::winograd::WinogradImpl winograd_impl{}; std::unique_ptr conv_args; - const bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math, &winograd_impl, conv_args); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(success == false, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows, kernel_shape.n_cols); - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n", winograd_impl.input_transform->get_name().c_str()); - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n", winograd_impl.input_transform->get_name().c_str()); - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n", winograd_impl.input_transform->get_name().c_str()); + const bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math, + &winograd_impl, conv_args); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(success == false, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows, + kernel_shape.n_cols); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n", + winograd_impl.input_transform->get_name().c_str()); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n", + winograd_impl.input_transform->get_name().c_str()); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n", + winograd_impl.input_transform->get_name().c_str()); return Status{}; } @@ -328,24 +368,29 @@ void CpuWinogradConv2d::run(ITensorPack &tensors) // Wrap the winograd-domain tensorInfos created in configuration in tensors and allocate the required memory. CpuAuxTensorHandler input_nhwc(offset_int_vec(PermutedInput), _input_nhwc, tensors, true); - CpuAuxTensorHandler winograd_input_transformed(offset_int_vec(TransformedInput), _winograd_transformed_input, tensors, true); + CpuAuxTensorHandler winograd_input_transformed(offset_int_vec(TransformedInput), _winograd_transformed_input, + tensors, true); CpuAuxTensorHandler input_workspace(offset_int_vec(WorkspaceIO), _input_workspace, tensors, true); const bool is_nchw = _data_layout == DataLayout::NCHW; - if(is_nchw) + if (is_nchw) { //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC - ITensorPack pack{ { ACL_SRC, src }, { ACL_DST, input_nhwc.get() } }; + ITensorPack pack{{ACL_SRC, src}, {ACL_DST, input_nhwc.get()}}; _permute_input->run(pack); } - CpuAuxTensorHandler winograd_output_transformed(offset_int_vec(TransformedOutput), _winograd_transformed_output, tensors, true); + CpuAuxTensorHandler winograd_output_transformed(offset_int_vec(TransformedOutput), _winograd_transformed_output, + tensors, true); CpuAuxTensorHandler output_workspace(offset_int_vec(WorkspaceIO), _output_workspace, tensors, true); CpuAuxTensorHandler output_nhwc(offset_int_vec(PermutedOutput), _output_nhwc, tensors, true); - ITensorPack transform_input_pack{ { ACL_SRC, is_nchw ? input_nhwc.get() : src }, { ACL_DST, winograd_input_transformed.get() }, { ACL_INT, input_workspace.get() } }; + ITensorPack transform_input_pack{{ACL_SRC, is_nchw ? 
input_nhwc.get() : src}, + {ACL_DST, winograd_input_transformed.get()}, + {ACL_INT, input_workspace.get()}}; NEScheduler::get().schedule_op(_transform_input_kernel.get(), Window::DimX, win, transform_input_pack); - CpuAuxTensorHandler winograd_weights_transformed(offset_int_vec(TransformedWeights), _winograd_transformed_weights, tensors, true); + CpuAuxTensorHandler winograd_weights_transformed(offset_int_vec(TransformedWeights), _winograd_transformed_weights, + tensors, true); // Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs ITensorPack gemm_pack = tensors; @@ -356,30 +401,34 @@ void CpuWinogradConv2d::run(ITensorPack &tensors) _gemm_function->run(gemm_pack); // Output transform - ITensorPack transform_output_pack{ { ACL_SRC_0, winograd_output_transformed.get() }, { ACL_DST, is_nchw ? output_nhwc.get() : output }, { ACL_SRC_1, biases }, { ACL_INT, output_workspace.get() } }; + ITensorPack transform_output_pack{{ACL_SRC_0, winograd_output_transformed.get()}, + {ACL_DST, is_nchw ? output_nhwc.get() : output}, + {ACL_SRC_1, biases}, + {ACL_INT, output_workspace.get()}}; NEScheduler::get().schedule_op(_transform_output_kernel.get(), Window::DimX, win, transform_output_pack); - if(is_nchw) + if (is_nchw) { // Reorder the convoluted output to ACL's ordering NCHW - ITensorPack pack{ { ACL_SRC, output_nhwc.get() }, { ACL_DST, output } }; + ITensorPack pack{{ACL_SRC, output_nhwc.get()}, {ACL_DST, output}}; _permute_output->run(pack); } - if(_run_activation) + if (_run_activation) { - ITensorPack pack{ { ACL_SRC, output }, { ACL_DST, output } }; + ITensorPack pack{{ACL_SRC, output}, {ACL_DST, output}}; _activation_func->run(pack); } } void CpuWinogradConv2d::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { - const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1); - ITensor *weights_aux = utils::cast::polymorphic_cast(tensors.get_tensor(offset_int_vec(PermutedWeights))); + const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1); + ITensor *weights_aux = + utils::cast::polymorphic_cast(tensors.get_tensor(offset_int_vec(PermutedWeights))); CpuAuxTensorHandler permuted_weights(_weights_hwio, *weights_aux); - ITensorPack permute_tensors{ { ACL_SRC, weights }, { ACL_DST, permuted_weights.get() } }; + ITensorPack permute_tensors{{ACL_SRC, weights}, {ACL_DST, permuted_weights.get()}}; _permute_weights->run(permute_tensors); const int element_size_in_bytes = permuted_weights.get()->info()->element_size(); // Weights were in OHWI format, before being permuted "permuted_weights" to be in HWIO format. 
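In the run() path above, every kernel and sub-operator receives its operands through an ITensorPack keyed by slots such as ACL_SRC/ACL_DST/ACL_INT, which is what lets the same transform kernels operate on either the original tensors or the NHWC-permuted auxiliaries. The sketch below is a deliberately simplified stand-in (a plain map over raw float buffers with invented slot names), not the library's ITensorPack API.

#include <cstdio>
#include <unordered_map>

// Hypothetical stand-in: the real ITensorPack maps slot ids (ACL_SRC, ACL_DST, ...) to
// ITensor pointers; a map over float buffers is enough to show the indirection.
enum Slot { SRC = 0, DST = 1 };
using Pack = std::unordered_map<int, float *>;

// A "kernel" only sees the pack, so the same code runs whether SRC is the original
// input or a permuted copy chosen by the caller.
void negate_kernel(const Pack &pack, int n)
{
    const float *src = pack.at(SRC);
    float       *dst = pack.at(DST);
    for (int i = 0; i < n; ++i)
    {
        dst[i] = -src[i];
    }
}

int main()
{
    float in[4]  = {1.f, 2.f, 3.f, 4.f};
    float out[4] = {};
    Pack  pack{{SRC, in}, {DST, out}};
    negate_kernel(pack, 4);
    std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
    return 0;
}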
@@ -387,31 +436,32 @@ void CpuWinogradConv2d::prepare(ITensorPack &tensors) const unsigned int width_idx = 2; // W in HWIO const unsigned int channel_idx = 1; // I in HWIO - const int permuted_weight_row_stride = permuted_weights.get()->info()->strides_in_bytes()[height_idx] / element_size_in_bytes; - const int permuted_weight_col_stride = permuted_weights.get()->info()->strides_in_bytes()[width_idx] / element_size_in_bytes; - const int permuted_weight_channel_stride = permuted_weights.get()->info()->strides_in_bytes()[channel_idx] / element_size_in_bytes; + const int permuted_weight_row_stride = + permuted_weights.get()->info()->strides_in_bytes()[height_idx] / element_size_in_bytes; + const int permuted_weight_col_stride = + permuted_weights.get()->info()->strides_in_bytes()[width_idx] / element_size_in_bytes; + const int permuted_weight_channel_stride = + permuted_weights.get()->info()->strides_in_bytes()[channel_idx] / element_size_in_bytes; // Wrap the winograd-domain transformed weight TensorInfo in Auxiliary tensor and allocate the required memory. - ITensor *weights_transf = utils::cast::polymorphic_cast(tensors.get_tensor(offset_int_vec(TransformedWeights))); + ITensor *weights_transf = + utils::cast::polymorphic_cast(tensors.get_tensor(offset_int_vec(TransformedWeights))); ARM_COMPUTE_ERROR_ON_NULLPTR(weights_transf); CpuAuxTensorHandler winograd_transformed_weights(_winograd_transformed_weights, *weights_transf); const void *permuted_weights_ptr; void *win_wght_transf_ptr; - permuted_weights_ptr = reinterpret_cast(permuted_weights.get()->buffer() + permuted_weights.get()->info()->offset_first_element_in_bytes()); - win_wght_transf_ptr = reinterpret_cast(winograd_transformed_weights.get()->buffer() + winograd_transformed_weights.get()->info()->offset_first_element_in_bytes()); + permuted_weights_ptr = reinterpret_cast( + permuted_weights.get()->buffer() + permuted_weights.get()->info()->offset_first_element_in_bytes()); + win_wght_transf_ptr = + reinterpret_cast(winograd_transformed_weights.get()->buffer() + + winograd_transformed_weights.get()->info()->offset_first_element_in_bytes()); // Prepare Weights _winograd_impl.weight_transform->execute( - *_conv_args, - permuted_weights_ptr, - permuted_weight_row_stride, - permuted_weight_col_stride, - permuted_weight_channel_stride, - win_wght_transf_ptr, - _winograd_impl.winograd_spec, - 0, 1 // Thread 1 of 1 + *_conv_args, permuted_weights_ptr, permuted_weight_row_stride, permuted_weight_col_stride, + permuted_weight_channel_stride, win_wght_transf_ptr, _winograd_impl.winograd_spec, 0, 1 // Thread 1 of 1 ); ITensorPack gemm_pack = tensors; gemm_pack.add_const_tensor(ACL_SRC_1, winograd_transformed_weights.get()); diff --git a/src/cpu/operators/CpuWinogradConv2d.h b/src/cpu/operators/CpuWinogradConv2d.h index e0df34e2db9b2b1990b72b076f17c1eeaf8e601a..03bfc51a460b7540213b529fcac13f6a1e6ed554 100644 --- a/src/cpu/operators/CpuWinogradConv2d.h +++ b/src/cpu/operators/CpuWinogradConv2d.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,15 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_CPU_WINOGRAD_CONV2D_KERNEL_H -#define ARM_COMPUTE_CPU_WINOGRAD_CONV2D_KERNEL_H +#ifndef ACL_SRC_CPU_OPERATORS_CPUWINOGRADCONV2D_H +#define ACL_SRC_CPU_OPERATORS_CPUWINOGRADCONV2D_H #include "arm_compute/core/TensorInfo.h" #include "arm_compute/runtime/FunctionDescriptors.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" -#include "src/cpu/kernels/CpuWinogradConv2dKernel.h" #include "src/cpu/kernels/assembly/gemm_common.hpp" +#include "src/cpu/kernels/CpuWinogradConv2dKernel.h" #include "src/cpu/operators/CpuActivation.h" #include "src/cpu/operators/CpuGemm.h" #include "src/cpu/operators/CpuPermute.h" @@ -64,7 +65,7 @@ public: * while every optional dimension from 4 and above represent a batch of inputs. * Data types supported: F16/F32. * @param[in] weights Weights tensor Info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input. - * Currently only 3x3 and 5x5 kernels are supported. + * For supported kernel sizes, see @ref arm_compute::NEWinogradConvolutionLayer * @param[in] biases Biases tensor Info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights. * @param[out] dst Destination tensor Info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. * Data types supported: Same as @p input. @@ -73,7 +74,11 @@ public: * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation * available which may introduce a drop of accuracy as well. Default is false */ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); /** Static function to check if given info will lead to a valid configuration of @ref CpuWinogradConv2d @@ -82,55 +87,56 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; private: enum AuxTensorIdx { - GemmWorkspace = 0, - Pretranspose = 1, - InterleavedLHS = 2, - TransposedRHS = 3, - TempResult = 4, - TransformedInput = 5, - TransformedOutput = 6, - WorkspaceIO = 7, - TransformedWeights = 8, - PermutedWeights = 9, - PermutedInput = TransformedOutput, - PermutedOutput = TransformedInput, - Count = 10 + /** Slot 0 - 6 reserved for CpuGemm */ + TransformedInput = 7, + TransformedOutput, + WorkspaceIO, + TransformedWeights, + PermutedWeights, + Count, + PermutedInput = TransformedOutput, + PermutedOutput = TransformedInput }; - std::unique_ptr _gemm_function; - std::unique_ptr 
_activation_func; - std::unique_ptr _transform_input_kernel; - std::unique_ptr _transform_output_kernel; - std::unique_ptr _permute_input; - std::unique_ptr _permute_output; - std::unique_ptr _permute_weights; - experimental::MemoryRequirements _aux_mem{ Count }; - std::unique_ptr _conv_args; // Make it unique ptr because this type does not have a default constructor - arm_conv::winograd::WinogradImpl _winograd_impl; - DataLayout _data_layout; - TensorInfo _winograd_transformed_input; - TensorInfo _winograd_transformed_output; - TensorInfo _winograd_transformed_weights; - TensorInfo _input_workspace; - TensorInfo _output_workspace; - TensorInfo _weights_hwio; - TensorInfo _input_nhwc; - TensorInfo _output_nhwc; - bool _is_prepared; - bool _run_activation; + std::unique_ptr _gemm_function; + std::unique_ptr _activation_func; + std::unique_ptr _transform_input_kernel; + std::unique_ptr _transform_output_kernel; + std::unique_ptr _permute_input; + std::unique_ptr _permute_output; + std::unique_ptr _permute_weights; + experimental::MemoryRequirements _aux_mem{Count}; + std::unique_ptr + _conv_args; // Make it unique ptr because this type does not have a default constructor + arm_conv::winograd::WinogradImpl _winograd_impl; + DataLayout _data_layout; + TensorInfo _winograd_transformed_input; + TensorInfo _winograd_transformed_output; + TensorInfo _winograd_transformed_weights; + TensorInfo _input_workspace; + TensorInfo _output_workspace; + TensorInfo _weights_hwio; + TensorInfo _input_nhwc; + TensorInfo _output_nhwc; + bool _is_prepared; + bool _run_activation; }; } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_WINOGRAD_CONV2D_KERNEL_H */ +#endif // ACL_SRC_CPU_OPERATORS_CPUWINOGRADCONV2D_H diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp index 3069d6b541250f3ca953ccf4688c5576c3b7098f..82bd465c9970856a574d213f7aff60ae621b5ec1 100644 --- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp +++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp @@ -24,12 +24,14 @@ #include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/core/CPP/Validate.h" -#include "src/core/NEON/kernels/arm_gemm/utils.hpp" #include "src/core/helpers/MemoryHelpers.h" +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" #include "src/core/utils/AssemblyUtils.h" -#include "src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h" #include "src/cpu/kernels/assembly/arm_gemm.hpp" +#include "src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h" +#include "src/cpu/operators/CpuTranspose.h" #include "src/cpu/utils/CpuAuxTensorHandler.h" #include @@ -53,7 +55,12 @@ namespace * @param[in] num_threads Number of threads to run this method. 
Must be >= 1 */ template -void run_parallel_pretranspose_B_array(arm_gemm::GemmCommon *gemm_asm, ITensor *dst, const TypeInput *src, int src_ld, int src_multi_stride, unsigned int num_threads) +void run_parallel_pretranspose_B_array(arm_gemm::GemmCommon *gemm_asm, + ITensor *dst, + const TypeInput *src, + int src_ld, + int src_multi_stride, + unsigned int num_threads) { ARM_COMPUTE_ERROR_ON(gemm_asm == nullptr); ARM_COMPUTE_ERROR_ON(num_threads == 0); @@ -61,14 +68,14 @@ void run_parallel_pretranspose_B_array(arm_gemm::GemmCommonget_B_pretranspose_window_size(); std::vector workloads(num_threads); - for(unsigned int t = 0; t < num_threads; ++t) + for (unsigned int t = 0; t < num_threads; ++t) { - workloads[t] = [ = ](const ThreadInfo & info) + workloads[t] = [=](const ThreadInfo &info) { const unsigned int start = (info.thread_id * wsize) / num_threads; const unsigned int end = ((info.thread_id + 1) * wsize) / num_threads; - if(start < end) + if (start < end) { gemm_asm->pretranspose_B_array_part(dst->buffer(), src, src_ld, src_multi_stride, start, end); } @@ -113,7 +120,7 @@ Params extract_parameters(const ITensorInfo *a, const ITensorInfo *b, const ITen p.sections = 1; p.indirect = false; - if(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect) + if (info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect) { p.indirect = true; p.sections = b->tensor_shape()[2] * b->tensor_shape()[3]; @@ -125,7 +132,7 @@ Params extract_parameters(const ITensorInfo *a, const ITensorInfo *b, const ITen } // Update M in case of GEMM3D for output - if(info.depth_output_gemm3d != 0) + if (info.depth_output_gemm3d != 0) { p.M = d->tensor_shape().y() * d->tensor_shape().z(); p.batches = d->tensor_shape().total_size_upper(3) / p.multis; @@ -139,19 +146,24 @@ IScheduler::Hints scheduling_hint_heuristic(arm_gemm::GemmMethod method, DataTyp // Schedule assembly kernel const int granule_threshold = 200; IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX); - if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && data_type == DataType::F32) + if (method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && data_type == DataType::F32) { scheduling_hint = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold); } - else if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && (data_type == DataType::F32 || data_type == DataType::F16 || data_type == DataType::U8 || data_type == DataType::S8)) + else if (method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && + (data_type == DataType::F32 || data_type == DataType::F16 || data_type == DataType::U8 || + data_type == DataType::S8)) { //GEMM_INTERLEAVED supports 2D parallelism, IScheduler::split_dimensions_all signals to parallelise over all window dimensions - scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); + scheduling_hint = + IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); } - else if(method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D && (data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED)) + else if (method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D && + (data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED)) { //special case for QASYMM8 to support 2D parallelism, scheduler here may be tweaked differently compared to FP32 case - scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, 
IScheduler::StrategyHint::STATIC, granule_threshold); + scheduling_hint = + IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); } return scheduling_hint; @@ -175,8 +187,12 @@ public: * @param[in] gemm_info GEMM meta-data * @param[in] os Output stage meta-data. */ - void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, - arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info, + void configure(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + arm_gemm::GemmArgs args, + const AsmGemmInfo &gemm_info, const OutputStage &os = {}); /** Set requantization shifts to be used @@ -193,19 +209,20 @@ public: * * @return A tuple with the pointers to the shift and multiplier data respectively */ - std::tuple set_requantize_data(const std::vector &shifts, - const std::vector &multipliers); + std::tuple + set_requantize_data(const std::vector &shifts, const std::vector &multipliers); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; bool is_configured() const override; experimental::MemoryRequirements workspace() const override; bool isVarWeightsKernel() const override { - if(!_gemm_kernel_asm) + if (!_gemm_kernel_asm) return false; - const arm_compute::WeightFormat wf = assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format); + const arm_compute::WeightFormat wf = + assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format); return wf != arm_compute::WeightFormat::UNSPECIFIED && wf != arm_compute::WeightFormat::ANY; } @@ -213,6 +230,7 @@ private: enum AuxTensorIdx { AsmGemmWorkspace = 0, + PrePretransposedB, /* Transposed B (rhs) before being passed to gemm or pretranspose_B_array */ Pretranspose, Count }; @@ -228,16 +246,20 @@ private: /** Prepare the indirect buffer */ void prepare_indirect_buffer(ITensorPack &tensors); + /** Operator to transpose B before gemm or pretranspose_B_array*/ + std::unique_ptr _pre_pretranspose_b{nullptr}; /** Assembly Gemm kernel */ - std::shared_ptr> _gemm_kernel_asm{ nullptr }; + std::shared_ptr> _gemm_kernel_asm{nullptr}; /** Optimised Arm® Neon™ kernel */ - std::unique_ptr _optimised_kernel{ nullptr }; + std::unique_ptr _optimised_kernel{nullptr}; /** Assembly GEMM workspace tensor info */ TensorInfo _workspace_info{}; + /** Pre-pre-transposed B tensor info */ + TensorInfo _pre_pretransposed_b_info{}; /** Pre-transpose tensor info */ TensorInfo _pretranspose_info{}; /** Prepared flag */ - bool _is_prepared{ false }; + bool _is_prepared{false}; /** GEMM meta-data */ AsmGemmInfo _gemm_info{}; /** GEMM kernel description */ @@ -251,26 +273,27 @@ private: /** Indirect buffer */ std::unique_ptr _indirect_arg{}; std::unique_ptr _indirect_buf{}; - std::vector _indirect_pad{}; - arm_gemm::ConvolutionParameters _cp{}; - experimental::MemoryRequirements _aux_mem{ Count }; - bool _B_pretranspose_required{ false }; - bool _is_b_constant{ true }; - bool _is_c_constant{ true }; + std::vector _indirect_pad{}; + arm_gemm::ConvolutionParameters _cp{}; + experimental::MemoryRequirements _aux_mem{Count}; + bool _B_pretranspose_required{false}; + bool _is_b_constant{true}; + bool _is_c_constant{true}; }; template std::tuple -Fallback::set_requantize_data(const std::vector &shifts, const std::vector &multipliers) 
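run_parallel_pretranspose_B_array above gives thread t the half-open range [t*wsize/num_threads, (t+1)*wsize/num_threads) of the pretranspose window and skips threads whose range is empty. The small self-contained program below just replays that arithmetic with made-up numbers to show the ranges tile the window without gaps or overlap, including when wsize < num_threads.

#include <cstdio>

int main()
{
    const unsigned int wsize       = 10;
    const unsigned int num_threads = 4;
    for (unsigned int t = 0; t < num_threads; ++t)
    {
        // Same split as in the hunk above.
        const unsigned int start = (t * wsize) / num_threads;
        const unsigned int end   = ((t + 1) * wsize) / num_threads;
        if (start < end)
        {
            std::printf("thread %u: [%u, %u)\n", t, start, end);
        }
        else
        {
            std::printf("thread %u: idle\n", t); // safe when wsize < num_threads
        }
    }
    return 0;
}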
+Fallback::set_requantize_data(const std::vector &shifts, + const std::vector &multipliers) { _multipliers = multipliers; _shifts = shifts; bool need_left = false; - for(const auto s : _shifts) + for (const auto s : _shifts) { left_shifts.push_back(std::max(-s, int32_t(0))); right_shifts.push_back(std::min(-s, int32_t(0))); - if(s < 0 && !need_left) + if (s < 0 && !need_left) { need_left = true; } @@ -295,32 +318,35 @@ void Fallback::prepare_indirect_buffer(ITens const int multi_size = batch_size * batches; const size_t multi_stride = multi_size / sizeof(TypeInput); - for(int64_t m = 0; m < multis; m++) + for (int64_t m = 0; m < multis; m++) { - for(int64_t b = 0; b < batches; b++) + for (int64_t b = 0; b < batches; b++) { - for(int64_t output_y = 0; output_y < _cp.output_height; output_y++) + for (int64_t output_y = 0; output_y < _cp.output_height; output_y++) { - for(int64_t output_x = 0; output_x < _cp.output_width; output_x++) + for (int64_t output_x = 0; output_x < _cp.output_width; output_x++) { int64_t output_xy = (output_y * _cp.output_width) + output_x; - for(int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++) + for (int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++) { - for(int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++) + for (int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++) { int64_t input_x = (output_x * _cp.output_stride_w) + kernel_x - _cp.padding_left; int64_t input_y = (output_y * _cp.output_stride_h) + kernel_y - _cp.padding_top; int64_t kernel_xy = (kernel_y * _cp.kernel_width) + kernel_x; int64_t input_xy = (input_y * _cp.input_width) + input_x; - if(input_x < 0 || input_x >= _cp.input_width || input_y < 0 || input_y >= _cp.input_height) + if (input_x < 0 || input_x >= _cp.input_width || input_y < 0 || input_y >= _cp.input_height) { - _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = _indirect_pad.data(); + _indirect_buf + .get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = + _indirect_pad.data(); } else { - _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = + _indirect_buf + .get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = A_ptr + (m * multi_stride_A + b * batch_stride_A + input_xy * stride_A); } } @@ -332,12 +358,15 @@ void Fallback::prepare_indirect_buffer(ITens } template -void Fallback::configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info) +void Fallback::configure_indirect(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *d, + const AsmGemmInfo &info) { ARM_COMPUTE_ERROR_ON(!(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect)); float zeropad = 0.f; - if(is_data_type_quantized(a->data_type())) + if (is_data_type_quantized(a->data_type())) { zeropad = a->quantization_info().uniform().offset; } @@ -350,16 +379,25 @@ void Fallback::configure_indirect(const ITen const int64_t output_width = static_cast(d->tensor_shape()[1]); const int64_t output_height = static_cast(d->tensor_shape()[2]); - _cp = { input_width, input_height, input_channels, kernel_width, kernel_height, output_width, output_height, - info.ps_info.stride().first, info.ps_info.stride().second, info.padding_top, info.padding_left, zeropad - }; - - if(info.method == AsmConvMethod::Conv) + _cp = {input_width, + input_height, + input_channels, + kernel_width, + kernel_height, + output_width, + output_height, 
+ info.ps_info.stride().first, + info.ps_info.stride().second, + info.padding_top, + info.padding_left, + zeropad}; + + if (info.method == AsmConvMethod::Conv) { _gemm_kernel_asm->set_convolution_parameters(_cp); } - if(info.method == AsmConvMethod::Indirect) + if (info.method == AsmConvMethod::Indirect) { const unsigned int multis = 1; const unsigned int batches = a->tensor_shape().total_size_upper(3); @@ -372,19 +410,22 @@ void Fallback::configure_indirect(const ITen const int multi_size = batch_size * batches; const size_t multi_stride = multi_size / sizeof(TypeInputPtr); - _indirect_buf = std::unique_ptr(reinterpret_cast(malloc(multi_size * multis))); - _indirect_arg = std::unique_ptr(reinterpret_cast(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches))); + _indirect_buf = std::unique_ptr( + reinterpret_cast(malloc(multi_size * multis))); + _indirect_arg = std::unique_ptr( + reinterpret_cast(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches))); _indirect_pad = std::vector(_cp.input_channels, TypeInput(zeropad)); // Set indirect argument int64_t pos = 0; - for(int64_t m = 0; m < multis; m++) + for (int64_t m = 0; m < multis; m++) { - for(int64_t b = 0; b < batches; b++) + for (int64_t b = 0; b < batches; b++) { - for(int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++) + for (int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++) { - (_indirect_arg.get())[pos++] = _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw; + (_indirect_arg.get())[pos++] = + _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw; } } } @@ -394,8 +435,12 @@ void Fallback::configure_indirect(const ITen } template -void Fallback::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, - arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info, +void Fallback::configure(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + arm_gemm::GemmArgs args, + const AsmGemmInfo &gemm_info, const OutputStage &os) { ARM_COMPUTE_UNUSED(c); @@ -404,7 +449,7 @@ void Fallback::configure(const ITensorInfo * _is_c_constant = c ? c->are_values_constant() : true; _gemm_kernel_asm = arm_gemm::gemm(args, os); - if(_gemm_kernel_asm == nullptr) + if (_gemm_kernel_asm == nullptr) { //configuration not supported: Leave function unconfigured: return; @@ -419,13 +464,14 @@ void Fallback::configure(const ITensorInfo * const size_t workspace_size = _gemm_kernel_asm->get_working_size(); const unsigned int alignment = 4096; _workspace_info = TensorInfo(TensorShape(workspace_size), 1, DataType::U8); - _aux_mem[AsmGemmWorkspace] = MemoryInfo(offset_int_vec(AsmGemmWorkspace), MemoryLifetime::Temporary, workspace_size, alignment); + _aux_mem[AsmGemmWorkspace] = + MemoryInfo(offset_int_vec(AsmGemmWorkspace), MemoryLifetime::Temporary, workspace_size, alignment); //if we disable this code below in brackets then ConvLayer deadlocks when threads > 1 and //the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001 { const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size(); - if(window_size < static_cast(args._maxthreads)) + if (window_size < static_cast(args._maxthreads)) { _gemm_kernel_asm->set_nthreads(window_size); } @@ -433,19 +479,56 @@ void Fallback::configure(const ITensorInfo * _optimised_kernel = std::move(acl_gemm_wrapper); _gemm_info = gemm_info; + // Check if we need to pre-pretranspose B. Fixed format kernels need no pre-pretranspose. 
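configure_indirect() and prepare_indirect_buffer() above build a table of pointers: for each (output position, kernel tap) pair the entry points either into the input tensor or at a shared zero-padding row when the tap falls outside the image. The 1-D sketch below reproduces that addressing rule with invented sizes; it illustrates the idea only and is not the library's layout, which also folds in batches, multis and channel strides.

#include <cstdio>
#include <vector>

int main()
{
    const int input_width  = 5;
    const int kernel_width = 3;
    const int stride       = 1;
    const int pad_left     = 1;
    const int output_width = (input_width + 2 * pad_left - kernel_width) / stride + 1;

    std::vector<float>         input{1.f, 2.f, 3.f, 4.f, 5.f};
    std::vector<float>         pad(1, 0.f); // shared padding "row" (one channel here)
    std::vector<const float *> indirect(output_width * kernel_width);

    for (int out_x = 0; out_x < output_width; ++out_x)
    {
        for (int k_x = 0; k_x < kernel_width; ++k_x)
        {
            // Out-of-range taps point at the shared padding buffer, as in
            // prepare_indirect_buffer() above.
            const int in_x = out_x * stride + k_x - pad_left;
            indirect[out_x * kernel_width + k_x] =
                (in_x < 0 || in_x >= input_width) ? pad.data() : &input[in_x];
        }
    }

    for (int out_x = 0; out_x < output_width; ++out_x)
    {
        for (int k_x = 0; k_x < kernel_width; ++k_x)
        {
            std::printf("%4.1f ", *indirect[out_x * kernel_width + k_x]);
        }
        std::printf("\n");
    }
    return 0;
}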
+ const bool run_pre_pretranspose_b = _gemm_info.transpose_b && !isVarWeightsKernel(); + if (run_pre_pretranspose_b) + { + _pre_pretranspose_b = std::make_unique(); + _pre_pretranspose_b->configure(b, &_pre_pretransposed_b_info); + MemoryLifetime lifetime; + if (_is_b_constant) + { + if (_gemm_kernel_asm->B_pretranspose_required()) + { + // PrePretransposedB tensor is only used in prepare(), but is then succeeded by Pretranspose + // So PrePretransposedB can be freed inside prepare() + lifetime = MemoryLifetime::Prepare; + } + else + { + // PrePretransposedB tensor is only used in prepare(), but is the final transformation of B + // So PrePretransposedB needs to persist beyond prepare() + lifetime = MemoryLifetime::Persistent; + } + } + else + { + // PrePretransposedB tensor is always used in run() and doesn't need to persist + lifetime = MemoryLifetime::Temporary; + } + // Forcing 128-byte alignment (required by 32-bit kernels) + const unsigned int alignment = 128; + _aux_mem[PrePretransposedB] = + MemoryInfo(offset_int_vec(PrePretransposedB), lifetime, _pre_pretransposed_b_info.total_size(), alignment); + } + // Check for pre-transposed support - if(_gemm_kernel_asm->B_pretranspose_required()) + if (_gemm_kernel_asm->B_pretranspose_required()) { + // Fixed format kernels need no pretranspose. + ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format( + assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format))); // Forcing 128-byte alignment (required by 32-bit kernels) const unsigned int alignment = 128; const size_t B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size(); _pretranspose_info = TensorInfo(TensorShape(B_pretranspose_size), 1, DataType::U8); - _aux_mem[Pretranspose] = MemoryInfo(offset_int_vec(Pretranspose), MemoryLifetime::Persistent, B_pretranspose_size, alignment); - _B_pretranspose_required = true; + _aux_mem[Pretranspose] = + MemoryInfo(offset_int_vec(Pretranspose), MemoryLifetime::Persistent, B_pretranspose_size, alignment); + _B_pretranspose_required = true; } // Handle indirect GEMM convolution - if(gemm_info.method == AsmConvMethod::Conv || gemm_info.method == AsmConvMethod::Indirect) + if (gemm_info.method == AsmConvMethod::Conv || gemm_info.method == AsmConvMethod::Indirect) { configure_indirect(a, b, d, gemm_info); } @@ -454,34 +537,56 @@ void Fallback::configure(const ITensorInfo * template void Fallback::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1); auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2); // Setup up matrix bias in the assembly kernel, it's just a pointer to matrix C. 
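The PrePretransposedB lifetime choice above has three cases, spelled out in the comments: Prepare when B is constant and a further pretranspose_B_array step follows, Persistent when B is constant and this transpose is the final form of B, Temporary when B is not constant. The helper below merely restates that decision table; Lifetime is a stand-in enum, not the library's MemoryLifetime.

#include <iostream>

enum class Lifetime
{
    Prepare,    // freed after prepare()
    Persistent, // kept alive across runs
    Temporary   // rebuilt on every run()
};

Lifetime pre_pretransposed_b_lifetime(bool b_is_constant, bool b_pretranspose_required)
{
    if (!b_is_constant)
    {
        // B changes between runs, so the transposed copy is recreated in run().
        return Lifetime::Temporary;
    }
    // B is constant: the copy is produced once in prepare(). If a further
    // pretranspose_B_array step follows, the copy is only an intermediate.
    return b_pretranspose_required ? Lifetime::Prepare : Lifetime::Persistent;
}

int main()
{
    std::cout << static_cast<int>(pre_pretransposed_b_lifetime(true, true)) << "\n";  // Prepare
    std::cout << static_cast<int>(pre_pretransposed_b_lifetime(true, false)) << "\n"; // Persistent
    std::cout << static_cast<int>(pre_pretransposed_b_lifetime(false, true)) << "\n"; // Temporary
    return 0;
}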
- if(c && c->info()->data_type() == DataType::S32) + if (c && c->info()->data_type() == DataType::S32) { - _gemm_kernel_asm->set_quantized_bias(reinterpret_cast(c->buffer() + c->info()->offset_first_element_in_bytes()), 0); + _gemm_kernel_asm->set_quantized_bias( + reinterpret_cast(c->buffer() + c->info()->offset_first_element_in_bytes()), 0); + } + const ITensor *b_to_use = b; + // Pre-pretranspose B if required + const bool run_pre_pretranspose_b = _gemm_info.transpose_b && !isVarWeightsKernel(); + CpuAuxTensorHandler pre_pretransposed_b( + offset_int_vec(PrePretransposedB), _pre_pretransposed_b_info, tensors, + /*pack_inject: no need to inject into tensors*/ + false, + /*bypass_alloc: no need to allocate if pre-pretranspose B is not required as this handle will not be used*/ + !run_pre_pretranspose_b); + if (run_pre_pretranspose_b) + { + ARM_COMPUTE_ERROR_ON(_pre_pretranspose_b == nullptr); + ITensorPack pre_pretranspose_pack{{ACL_SRC, b_to_use}, {ACL_DST, pre_pretransposed_b.get()}}; + _pre_pretranspose_b->run(pre_pretranspose_pack); + b_to_use = pre_pretransposed_b.get(); } // Pretranspose B if required - if(_gemm_kernel_asm->B_pretranspose_required()) + if (_gemm_kernel_asm->B_pretranspose_required()) { // Fixed format kernels need no pretranspose. - ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format(assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format))); - const int ldb = b->info()->strides_in_bytes().y() / b->info()->element_size(); - const auto in1_ptr = reinterpret_cast(b->buffer() + b->info()->offset_first_element_in_bytes()); - const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size(); + ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format( + assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format))); + const int ldb = b_to_use->info()->strides_in_bytes().y() / b_to_use->info()->element_size(); + const auto in1_ptr = reinterpret_cast(b_to_use->buffer() + + b_to_use->info()->offset_first_element_in_bytes()); + const int multi_stride_b = b_to_use->info()->strides_in_bytes().z() / b_to_use->info()->element_size(); CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, false); ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr); - run_parallel_pretranspose_B_array(_gemm_kernel_asm.get(), pretranspose.get(), in1_ptr, ldb, multi_stride_b, NEScheduler::get().num_threads()); + run_parallel_pretranspose_B_array(_gemm_kernel_asm.get(), pretranspose.get(), + in1_ptr, ldb, multi_stride_b, + NEScheduler::get().num_threads()); b->mark_as_unused(); + // Note that we don't need to mark b_to_use as unused, as if it's been assigned to pre_pretransposed_b, its memory will be auto-managed by the handler } - if(_gemm_info.method == AsmConvMethod::Indirect) + if (_gemm_info.method == AsmConvMethod::Indirect) { prepare_indirect_buffer(tensors); } @@ -526,43 +631,67 @@ void Fallback::run(ITensorPack &tensors) int multi_stride_b = 0; const int multi_stride_d = d->info()->strides_in_bytes()[d_multi_idx] / d->info()->element_size(); - auto in0_ptr = reinterpret_cast(a->buffer() + a->info()->offset_first_element_in_bytes()); + auto in0_ptr = reinterpret_cast(a->buffer() + a->info()->offset_first_element_in_bytes()); const TypeInput *in1_ptr = nullptr; auto out_ptr = reinterpret_cast(d->buffer() + d->info()->offset_first_element_in_bytes()); + const ITensor *b_to_use = b; + + // Pre-pretranspose B if required + const bool run_pre_pretranspose_b = 
_gemm_info.transpose_b && !isVarWeightsKernel(); + CpuAuxTensorHandler pre_pretransposed_b( + offset_int_vec(PrePretransposedB), _pre_pretransposed_b_info, tensors, + false /*pack_inject: no need to inject into tensors*/, + !run_pre_pretranspose_b /*bypass_alloc: no need to allocate if pre-pretranspose B is not required as this handle will not be used*/); + if (b_to_use && !_is_b_constant && run_pre_pretranspose_b) + { + ARM_COMPUTE_ERROR_ON(_pre_pretranspose_b == nullptr); + ITensorPack pre_pretranspose_pack{{ACL_SRC, b_to_use}, {ACL_DST, pre_pretransposed_b.get()}}; + _pre_pretranspose_b->run(pre_pretranspose_pack); + b_to_use = pre_pretransposed_b.get(); + } + // Check if B is pre-tranposed and de-reference if not - if(!_gemm_kernel_asm->B_is_pretransposed()) + if (!_gemm_kernel_asm->B_is_pretransposed()) { - ldb = b->info()->strides_in_bytes().y() / b->info()->element_size(); - multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size(); - in1_ptr = reinterpret_cast(b->buffer() + b->info()->offset_first_element_in_bytes()); + ldb = b_to_use->info()->strides_in_bytes().y() / b_to_use->info()->element_size(); + multi_stride_b = b_to_use->info()->strides_in_bytes().z() / b_to_use->info()->element_size(); + in1_ptr = + reinterpret_cast(b_to_use->buffer() + b_to_use->info()->offset_first_element_in_bytes()); } // If necessary, run pretranspose every time if either weights or biases are non-constant - if((b && !_is_b_constant) || (c && !_is_c_constant && c->info()->data_type() == DataType::S32)) + if ((b_to_use && !_is_b_constant) || (c && !_is_c_constant && c->info()->data_type() == DataType::S32)) { - if(c && c->info()->data_type() == DataType::S32) + if (c && c->info()->data_type() == DataType::S32) { - _gemm_kernel_asm->set_quantized_bias(reinterpret_cast(c->buffer() + c->info()->offset_first_element_in_bytes()), 0); + _gemm_kernel_asm->set_quantized_bias( + reinterpret_cast(c->buffer() + c->info()->offset_first_element_in_bytes()), 0); } // Pretranspose B if required - if(_B_pretranspose_required) + if (_B_pretranspose_required) { - const int ldb = b->info()->strides_in_bytes().y() / b->info()->element_size(); - const auto b_ptr = reinterpret_cast(b->buffer() + b->info()->offset_first_element_in_bytes()); - const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size(); + // Fixed format kernels need no pretranspose. 
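The ldb and multi_stride_b values above are byte strides divided by the element size, i.e. the strides re-expressed in elements as the assembly GEMM expects. A tiny worked example of the same conversion, with invented shapes, for a row-major float matrix and a batch of such matrices stacked along z:

#include <cstddef>
#include <cstdio>

int main()
{
    const std::size_t rows = 3, cols = 4;

    const std::size_t element_size   = sizeof(float);
    const std::size_t stride_y_bytes = cols * element_size;        // distance to the next row
    const std::size_t stride_z_bytes = rows * cols * element_size; // distance to the next matrix

    const int ldb            = static_cast<int>(stride_y_bytes / element_size); // 4 elements
    const int multi_stride_b = static_cast<int>(stride_z_bytes / element_size); // 12 elements

    std::printf("ldb=%d multi_stride_b=%d\n", ldb, multi_stride_b);
    return 0;
}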
+ ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format( + assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format))); + const int ldb = b_to_use->info()->strides_in_bytes().y() / b_to_use->info()->element_size(); + const auto b_ptr = reinterpret_cast(b_to_use->buffer() + + b_to_use->info()->offset_first_element_in_bytes()); + const int multi_stride_b = b_to_use->info()->strides_in_bytes().z() / b_to_use->info()->element_size(); CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, true); ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr); - if(_is_b_constant) + if (_is_b_constant) { _gemm_kernel_asm->requantize_bias(pretranspose.get()->buffer(), b_ptr, ldb, multi_stride_b); } else { - run_parallel_pretranspose_B_array(_gemm_kernel_asm.get(), pretranspose.get(), b_ptr, ldb, multi_stride_b, NEScheduler::get().num_threads()); + run_parallel_pretranspose_B_array(_gemm_kernel_asm.get(), pretranspose.get(), + b_ptr, ldb, multi_stride_b, + NEScheduler::get().num_threads()); } } } @@ -571,17 +700,17 @@ void Fallback::run(ITensorPack &tensors) // Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads CpuAuxTensorHandler workspace(offset_int_vec(AsmGemmWorkspace), _workspace_info, tensors, false); - if(workspace.get()->buffer() != nullptr) + if (workspace.get()->buffer() != nullptr) { _gemm_kernel_asm->set_working_space(reinterpret_cast(workspace.get()->buffer())); const unsigned int split_dim = scheduling_hint.split_dimension(); const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size(); unsigned int num_threads = NEScheduler::get().num_threads(); - if(window_size < num_threads) + if (window_size < num_threads) { num_threads = window_size; } - if(split_dim != IScheduler::split_dimensions_all) + if (split_dim != IScheduler::split_dimensions_all) { // Make sure the kernel does not expect more threads than we can actually spawn const unsigned int num_iterations = _optimised_kernel.get()->window().num_iterations(split_dim); @@ -595,12 +724,12 @@ void Fallback::run(ITensorPack &tensors) // Setup up matrix bias in the assembly kernel, it's just a pointer to matrix C. 
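Further down in run() the code clamps the thread count twice: first to the total window size, then (when a single split dimension is used) to the number of iterations along that dimension, so the assembly kernel is never told to expect more threads than can actually be given work. A compressed restatement with made-up numbers:

#include <algorithm>
#include <cstdio>

int main()
{
    unsigned int       num_threads    = 8; // what the scheduler offers
    const unsigned int window_size    = 6; // total work items in the GEMM window
    const unsigned int num_iterations = 3; // iterations along the chosen split dimension

    num_threads = std::min(num_threads, window_size);    // never more threads than work items
    num_threads = std::min(num_threads, num_iterations); // nor more than splittable iterations

    std::printf("threads used: %u\n", num_threads); // prints: threads used: 3
    return 0;
}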
TypeOutput *bias = nullptr; - if(c && c->info()->data_type() != DataType::S32) + if (c && c->info()->data_type() != DataType::S32) { bias = reinterpret_cast(c->buffer() + c->info()->offset_first_element_in_bytes()); } - if(_gemm_info.method == AsmConvMethod::Indirect) + if (_gemm_info.method == AsmConvMethod::Indirect) { in0_ptr = nullptr; lda = 0; @@ -609,18 +738,20 @@ void Fallback::run(ITensorPack &tensors) } // Set gemm parameters - _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a, - in1_ptr, ldb, multi_stride_b, - out_ptr, ldd, batch_stride_d, multi_stride_d, - bias, 0); + _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a, in1_ptr, ldb, multi_stride_b, out_ptr, + ldd, batch_stride_d, multi_stride_d, bias, 0); // Schedule NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint); } template void create_arm_gemm(std::unique_ptr &arm_gemm, - const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, - arm_gemm::Activation activation, const AsmGemmInfo &info) + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + arm_gemm::Activation activation, + const AsmGemmInfo &info) { Params p = extract_parameters(a, b, d, info); const CPUInfo &ci = NEScheduler::get().cpu_info(); @@ -628,7 +759,8 @@ void create_arm_gemm(std::unique_ptr &arm_ge arm_gemm::GemmConfig cfg; cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format); - arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fixed_format, info.fast_mode, &cfg); + arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, + info.fixed_format, info.fast_mode, &cfg); // Create arm_gemm fallback auto fallback = std::make_unique>(); @@ -638,8 +770,12 @@ void create_arm_gemm(std::unique_ptr &arm_ge template void create_arm_gemm_quant(std::unique_ptr &arm_gemm, - const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, - arm_gemm::Activation activation, const AsmGemmInfo &info) + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + arm_gemm::Activation activation, + const AsmGemmInfo &info) { ARM_COMPUTE_UNUSED(activation); Params p = extract_parameters(a, b, d, info); @@ -648,7 +784,8 @@ void create_arm_gemm_quant(std::unique_ptr & arm_gemm::GemmConfig cfg; cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format); - arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fixed_format, info.fast_mode, &cfg); + arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, + info.fixed_format, info.fast_mode, &cfg); // Create arm_gemm fallback auto fallback = std::make_unique>(); @@ -660,22 +797,20 @@ void create_arm_gemm_quant(std::unique_ptr & const GEMMLowpOutputStageInfo os_info = info.output_stage; arm_gemm::Requantize32 gemm_requant_info{}; - if(os_info.gemmlowp_shifts.size() > 1) + if (os_info.gemmlowp_shifts.size() > 1) { - const auto requantize_data = fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers); - gemm_requant_info = arm_gemm::Requantize32(nullptr, 0, - a_offset, b_offset, os_info.gemmlowp_offset, - (std::get<0>(requantize_data)) ? 
std::get<1>(requantize_data) : nullptr, - std::get<2>(requantize_data), - std::get<3>(requantize_data), - os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound); + const auto requantize_data = + fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers); + gemm_requant_info = arm_gemm::Requantize32( + nullptr, 0, a_offset, b_offset, os_info.gemmlowp_offset, + (std::get<0>(requantize_data)) ? std::get<1>(requantize_data) : nullptr, std::get<2>(requantize_data), + std::get<3>(requantize_data), os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound); } else { - gemm_requant_info = arm_gemm::Requantize32(nullptr, 0, - a_offset, b_offset, os_info.gemmlowp_offset, - -os_info.gemmlowp_shift, os_info.gemmlowp_multiplier, - os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound); + gemm_requant_info = + arm_gemm::Requantize32(nullptr, 0, a_offset, b_offset, os_info.gemmlowp_offset, -os_info.gemmlowp_shift, + os_info.gemmlowp_multiplier, os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound); } // Configure fallback @@ -684,13 +819,16 @@ void create_arm_gemm_quant(std::unique_ptr & } } //namespace -CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch() - : _arm_gemm(nullptr) +CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch() : _arm_gemm(nullptr) { } -Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, - const AsmGemmInfo &info) +Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + const AsmGemmInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); ARM_COMPUTE_UNUSED(c); @@ -701,53 +839,62 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected arm_gemm::GemmConfig cfg; cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format); arm_gemm::WeightFormat arm_gemm_expected_wf = assembly_utils::map_to_arm_gemm_weight_format(expected_weight_format); - arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, act, num_threads, info.fixed_format, info.fast_mode, &cfg); - switch(a->data_type()) + arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, act, num_threads, + info.fixed_format, info.fast_mode, &cfg); + // TODO: Incorporate info.transpose_b COMPMID-6595 + switch (a->data_type()) { case DataType::F32: - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm(arm_gemm_expected_wf, args, {})), - "We could not find an optimized kernel for F32 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm(arm_gemm_expected_wf, args, {})), + "We could not find an optimized kernel for F32 input"); break; #ifdef __aarch64__ case DataType::U8: case DataType::QASYMM8: - if(d->data_type() == DataType::S32) + if (d->data_type() == DataType::S32) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm(arm_gemm_expected_wf, args, {})), - "We could not find an optimized kernel for U8/QASYMM8 input and U32 output"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm(arm_gemm_expected_wf, args, {})), + "We could not find an optimized kernel for U8/QASYMM8 input and U32 output"); } else { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm(arm_gemm_expected_wf, args, {})), - "We could not find an optimized kernel for U8 input and U8 output"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( 
+ !(arm_gemm::has_opt_gemm(arm_gemm_expected_wf, args, {})), + "We could not find an optimized kernel for U8 input and U8 output"); } break; case DataType::S8: case DataType::QASYMM8_SIGNED: - if(d->data_type() == DataType::S32) + if (d->data_type() == DataType::S32) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm(arm_gemm_expected_wf, args, {})), - "We could not find an optimized kernel for S8/QASYMM8_SIGNED input and S32 output"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm(arm_gemm_expected_wf, args, {})), + "We could not find an optimized kernel for S8/QASYMM8_SIGNED input and S32 output"); } else { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm(arm_gemm_expected_wf, args, {})), - "We could not find an optimized kernel for S8 input and S8 output"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm(arm_gemm_expected_wf, args, {})), + "We could not find an optimized kernel for S8 input and S8 output"); } break; #endif /* __aarch64__ */ #if defined(ARM_COMPUTE_ENABLE_BF16) case DataType::BFLOAT16: { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm(arm_gemm_expected_wf, args, {})), - "We could not find an optimized kernel for BFLOAT16 input and F32 output"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm(arm_gemm_expected_wf, args, {})), + "We could not find an optimized kernel for BFLOAT16 input and F32 output"); break; } #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm(arm_gemm_expected_wf, args, {})), - "We could not find an optimized kernel for F16 input and F16 output"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm(arm_gemm_expected_wf, args, {})), + "We could not find an optimized kernel for F16 input and F16 output"); break; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ default: @@ -759,26 +906,30 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected return Status{}; } -Status CpuGemmAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info) +Status CpuGemmAssemblyDispatch::validate( + const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info) { ARM_COMPUTE_UNUSED(c, info); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, d); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a); ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(info.reshape_b_only_on_first_run), "Assembly kernel will not be executed when reshape_b_only_on_first_run is false"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(info.reshape_b_only_on_first_run), + "Assembly kernel will not be executed when reshape_b_only_on_first_run is false"); #ifndef __aarch64__ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->element_size() == 1, "8bit integer types only supported for aarch64"); #endif /* __aarch64__ */ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, - DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8, - DataType::BFLOAT16, DataType::F16, DataType::F32); - if(is_data_type_quantized_per_channel(b->data_type())) + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, 
DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S8, DataType::BFLOAT16, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8, + DataType::BFLOAT16, DataType::F16, DataType::F32); + if (is_data_type_quantized_per_channel(b->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8_SIGNED, DataType::S8); } - else if(is_fixed_format_fast_math(info.weight_format)) + else if (is_fixed_format_fast_math(info.weight_format)) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(a, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b, DataType::BFLOAT16); @@ -787,22 +938,29 @@ Status CpuGemmAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32, "Only F32 output supported for F32 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16, "Only F16 output supported for F16 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::BFLOAT16 && d->data_type() != DataType::F32, "Only F32 output supported for BFLOAT16 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32, "Only U32 output supported for U8 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32, "Only S32 output supported for S8 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 && (d->data_type() != DataType::QASYMM8 && d->data_type() != DataType::S32), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32, + "Only F32 output supported for F32 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16, + "Only F16 output supported for F16 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::BFLOAT16 && d->data_type() != DataType::F32, + "Only F32 output supported for BFLOAT16 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32, + "Only U32 output supported for U8 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32, + "Only S32 output supported for S8 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 && + (d->data_type() != DataType::QASYMM8 && d->data_type() != DataType::S32), "Only QASYMM8/S32 output supported for QASYMM8 input"); arm_compute::WeightFormat expected_weight_format = arm_compute::WeightFormat::UNSPECIFIED; const Status ret = CpuGemmAssemblyDispatch::has_opt_impl(expected_weight_format, a, b, c, d, info); - if((bool)ret && expected_weight_format != arm_compute::WeightFormat::ANY) + if ((bool)ret && expected_weight_format != arm_compute::WeightFormat::ANY) { // Correctness check: if the format expected by the kernel is // not "any", make sure that the one found matches the format // intended by the caller. 
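The validate() checks above pin each input data type to a specific output type (F32 to F32, F16 to F16, BFLOAT16 to F32, U8 to U32, S8 to S32, QASYMM8 to QASYMM8 or S32). The function below condenses those pairings for readability; DT is a stand-in for the library's DataType and deliberately covers only the cases named in the quoted checks, everything else falls through to false in this sketch.

#include <cstdio>

enum class DT { F32, F16, BF16, U8, S8, QASYMM8, U32, S32 };

bool output_type_ok(DT a, DT d)
{
    switch (a)
    {
        case DT::F32:     return d == DT::F32; // "Only F32 output supported for F32 input"
        case DT::F16:     return d == DT::F16; // "Only F16 output supported for F16 input"
        case DT::BF16:    return d == DT::F32; // "Only F32 output supported for BFLOAT16 input"
        case DT::U8:      return d == DT::U32; // "Only U32 output supported for U8 input"
        case DT::S8:      return d == DT::S32; // "Only S32 output supported for S8 input"
        case DT::QASYMM8: return d == DT::QASYMM8 || d == DT::S32;
        default:          return false; // cases not covered by the quoted checks
    }
}

int main()
{
    std::printf("%d %d\n", output_type_ok(DT::BF16, DT::F32) ? 1 : 0,
                output_type_ok(DT::U8, DT::S32) ? 1 : 0); // prints: 1 0
    return 0;
}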
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((expected_weight_format != info.weight_format), - "The format expected by the kernel does not correspond with the one requested by the user."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (expected_weight_format != info.weight_format), + "The format expected by the kernel does not correspond with the one requested by the user."); } return ret; } @@ -813,18 +971,19 @@ bool CpuGemmAssemblyDispatch::is_activation_supported(const ActivationLayerInfo return act.type != arm_gemm::Activation::Type::None; } -void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info) +void CpuGemmAssemblyDispatch::configure( + const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(info.activation_info); //If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured() - if(!CpuGemmAssemblyDispatch::validate(a, b, c, d, info)) + if (!CpuGemmAssemblyDispatch::validate(a, b, c, d, info)) { return; } - switch(a->data_type()) + switch (a->data_type()) { case DataType::F32: create_arm_gemm(_arm_gemm, a, b, c, d, act, info); @@ -832,7 +991,7 @@ void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo #ifdef __aarch64__ case DataType::U8: case DataType::QASYMM8: - if(d->data_type() == DataType::S32) + if (d->data_type() == DataType::S32) { create_arm_gemm(_arm_gemm, a, b, c, d, act, info); } @@ -843,7 +1002,7 @@ void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo break; case DataType::S8: case DataType::QASYMM8_SIGNED: - if(d->data_type() == DataType::S32) + if (d->data_type() == DataType::S32) { create_arm_gemm(_arm_gemm, a, b, c, d, act, info); } diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h index ceb7a3f7756ac7a8248a49841bd2b43823a7cc7c..671a222fed6ebeebc38842c39d023542edf2a6f0 100644 --- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h +++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h @@ -21,10 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H -#define ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H +#ifndef ACL_SRC_CPU_OPERATORS_INTERNAL_CPUGEMMASSEMBLYDISPATCH_H +#define ACL_SRC_CPU_OPERATORS_INTERNAL_CPUGEMMASSEMBLYDISPATCH_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" @@ -42,20 +43,27 @@ enum class AsmConvMethod struct AsmGemmInfo { - AsmConvMethod method{ AsmConvMethod::Im2Col }; + AsmConvMethod method{AsmConvMethod::Im2Col}; PadStrideInfo ps_info{}; ActivationLayerInfo activation_info{}; GEMMLowpOutputStageInfo output_stage{}; - bool negated_offsets{ true }; - bool reinterpret_input_as_3d{ false }; - bool depth_output_gemm3d{ false }; - int64_t padding_top{ 0 }; - int64_t padding_left{ 0 }; - float padding_value{ 0.f }; - bool fast_mode{ false }; - bool fixed_format{ false }; - arm_compute::WeightFormat weight_format{ arm_compute::WeightFormat::UNSPECIFIED }; - bool reshape_b_only_on_first_run{ true }; + bool negated_offsets{true}; + bool reinterpret_input_as_3d{false}; + bool depth_output_gemm3d{false}; + int64_t padding_top{0}; + int64_t padding_left{0}; + float padding_value{0.f}; + bool fast_mode{false}; + bool fixed_format{false}; + arm_compute::WeightFormat weight_format{arm_compute::WeightFormat::UNSPECIFIED}; + bool reshape_b_only_on_first_run{true}; + /** Whether we want to perform an additional transpose of b before passing it to gemm or pretranspose_B_array + * @note This transpose b operation is also considered a form of "reshape" or "transform", so should be counted for + * by the reshape_b_only_on_first_run flag + * @note This flag will be silently ignored (assumed to be false) when the weight_format is a fixed format. Because + * fixed format kernels do not accept weights (B) with any prior transformations + */ + bool transpose_b{false}; }; /** Assembly kernel glue */ @@ -72,12 +80,12 @@ public: class IFallback { public: - virtual void run(ITensorPack &tensors) = 0; - virtual void prepare(ITensorPack &tensors) = 0; - virtual experimental::MemoryRequirements workspace() const = 0; - virtual bool is_configured() const = 0; - virtual bool isVarWeightsKernel() const = 0; - virtual ~IFallback() = default; + virtual void run(ITensorPack &tensors) = 0; + virtual void prepare(ITensorPack &tensors) = 0; + virtual experimental::MemoryRequirements workspace() const = 0; + virtual bool is_configured() const = 0; + virtual bool isVarWeightsKernel() const = 0; + virtual ~IFallback() = default; }; public: @@ -121,7 +129,8 @@ public: * @param[out] d Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0. * @param[in] info GEMM meta-data */ - void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info); + void configure( + const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info); /** Indicates whether or not this function can be used to process the given parameters. * @@ -133,7 +142,11 @@ public: * * @return a status. 
*/ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info); + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + const AsmGemmInfo &info); /** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters. * @@ -144,7 +157,12 @@ public: * * @return a status. */ - static Status has_opt_impl(arm_compute::WeightFormat &weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info); + static Status has_opt_impl(arm_compute::WeightFormat &weight_format, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + const AsmGemmInfo &info); /** Checks if activation is supported by the gemm assembly dispatcher * * @param[in] activation Activation to check @@ -167,8 +185,8 @@ public: } // Inherited methods overridden: - void prepare(ITensorPack &tensors) override; - void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: @@ -176,4 +194,4 @@ private: }; } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H */ +#endif // ACL_SRC_CPU_OPERATORS_INTERNAL_CPUGEMMASSEMBLYDISPATCH_H diff --git a/src/cpu/utils/CpuAuxTensorHandler.h b/src/cpu/utils/CpuAuxTensorHandler.h index ae1cffb659f624e934d9c84a3367dc4b283211b7..0a39fdba81a3647a83f8180a64967c44f82122d3 100644 --- a/src/cpu/utils/CpuAuxTensorHandler.h +++ b/src/cpu/utils/CpuAuxTensorHandler.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_CPU_UTILS_CPU_AUX_TENSOR_HANDLER_H -#define ARM_COMPUTE_CPU_UTILS_CPU_AUX_TENSOR_HANDLER_H +#ifndef ACL_SRC_CPU_UTILS_CPUAUXTENSORHANDLER_H +#define ACL_SRC_CPU_UTILS_CPUAUXTENSORHANDLER_H #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/TensorInfo.h" @@ -39,25 +39,26 @@ namespace cpu class CpuAuxTensorHandler { public: - CpuAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false) + CpuAuxTensorHandler( + int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false) : _tensor() { - if(info.total_size() == 0) + if (info.total_size() == 0) { return; } _tensor.allocator()->soft_init(info); ITensor *packed_tensor = utils::cast::polymorphic_downcast(pack.get_tensor(slot_id)); - if((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size())) + if ((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size())) { - if(!bypass_alloc) + if (!bypass_alloc) { _tensor.allocator()->allocate(); ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Allocating auxiliary tensor"); } - if(pack_inject) + if (pack_inject) { pack.add_tensor(slot_id, &_tensor); _injected_tensor_pack = &pack; @@ -70,22 +71,32 @@ public: } } - CpuAuxTensorHandler(TensorInfo &info, ITensor &tensor) - : _tensor() + /** Create a temporary handle to the original tensor with a new @ref TensorInfo + * This is useful if we want to change a tensor's tensor info at run time without modifying the original tensor + * + * @param[in] info New tensor info to "assign" to @p tensor + * @param[in] tensor Tensor to be assigned a new @ref TensorInfo + * @param[in] bypass_import Bypass importing @p tensor's memory into the handler + */ + CpuAuxTensorHandler(TensorInfo &info, const ITensor &tensor, bool bypass_import = false) : _tensor() { _tensor.allocator()->soft_init(info); - if(info.total_size() <= tensor.info()->total_size()) + if (!bypass_import) { - _tensor.allocator()->import_memory(tensor.buffer()); + ARM_COMPUTE_ERROR_ON(tensor.info() == nullptr); + if (info.total_size() <= tensor.info()->total_size()) + { + _tensor.allocator()->import_memory(tensor.buffer()); + } } } - CpuAuxTensorHandler(const CpuAuxTensorHandler &) = delete; + CpuAuxTensorHandler(const CpuAuxTensorHandler &) = delete; CpuAuxTensorHandler &operator=(const CpuAuxTensorHandler) = delete; ~CpuAuxTensorHandler() { - if(_injected_tensor_pack) + if (_injected_tensor_pack) { _injected_tensor_pack->remove_tensor(_injected_slot_id); } @@ -103,9 +114,9 @@ public: private: Tensor _tensor{}; - ITensorPack *_injected_tensor_pack{ nullptr }; - int _injected_slot_id{ TensorType::ACL_UNKNOWN }; + ITensorPack *_injected_tensor_pack{nullptr}; + int _injected_slot_id{TensorType::ACL_UNKNOWN}; }; } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_UTILS_CPU_AUX_TENSOR_HANDLER_H */ \ No newline at end of file +#endif // ACL_SRC_CPU_UTILS_CPUAUXTENSORHANDLER_H diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp index 15a5632d0b1ceba7ba5d427d344ca538de7ee870..9ca20fa152d588bd0b086c9905f76b876fb59c11 100644 --- a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp +++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp @@ -22,14 +22,15 @@ * SOFTWARE. 
*/ #include "ClKernelRuntime.h" + #include "arm_compute/core/CL/ICLTensor.h" + #include "src/core/CL/CLUtils.h" #ifdef ACL_INTERNAL_TEST_CKW_IN_DF #include "src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h" #endif // ACL_INTERNAL_TEST_CKW_IN_DF #include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h" #include "src/gpu/cl/ClKernelLibrary.h" - #include "support/Cast.h" namespace arm_compute { @@ -43,13 +44,12 @@ void ClKernelRuntime::configure(const ClCompileContext &compile_ctx, const GpuKe { // Create kernel from kernel source string opencl::ClKernelLibrary &klib = opencl::ClKernelLibrary::get(); - _kernel = static_cast(compile_ctx.create_kernel(code.name(), - code.name(), // program name has to be provided to differentiate between different unfusable components' kernels. - // Each program contains exactly one kernel - code.code(), - klib.kernel_path() /* Kernel path: Used in cases of embedded kernels */, - code.build_options().options(), - false /* Is source binary */)); + _kernel = static_cast(compile_ctx.create_kernel( + code.name(), + code.name(), // program name has to be provided to differentiate between different unfusable components' kernels. + // Each program contains exactly one kernel + code.code(), klib.kernel_path() /* Kernel path: Used in cases of embedded kernels */, + code.build_options().options(), false /* Is source binary */)); // Configure execution window IClKernel::configure_internal(code.window()); @@ -63,11 +63,15 @@ void ClKernelRuntime::configure(const ClCompileContext &compile_ctx, const GpuKe #ifndef ACL_INTERNAL_TEST_CKW_IN_DF -inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKernelArgumentInfo &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector &cl_images) +inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, + const GpuKernelArgumentInfo &arg, + const ICLTensor *tensor, + const Window &arg_slice, + std::vector &cl_images) { ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); - switch(arg.type) + switch (arg.type) { case GpuKernelArgumentInfo::Type::Scalar: { @@ -95,9 +99,13 @@ inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKer } case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D: { - const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3)); + const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * + tensor->info()->dimension(2) * + tensor->info()->dimension(3)); const size_t image_row_pitch = tensor->info()->strides_in_bytes()[1]; - cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + cl::Image2D tensor_image2d = + create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, + tensor->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); cl_images.push_back(tensor_image2d); _kernel.setArg(idx++, tensor_image2d); break; @@ -111,9 +119,13 @@ inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKer } case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D: { - const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3)); + const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * + 
tensor->info()->dimension(2) * + tensor->info()->dimension(3)); const size_t image_row_pitch = tensor->info()->strides_in_bytes()[1]; - cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + cl::Image2D tensor_image2d = + create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, + tensor->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); cl_images.push_back(tensor_image2d); _kernel.setArg(idx++, tensor_image2d); _kernel.setArg(idx++, static_cast(tensor->info()->strides_in_bytes()[2])); @@ -142,8 +154,9 @@ inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKer const size_t image_h = tensor->info()->tensor_shape().total_size_upper(1); const size_t image_stride_y = tensor->info()->strides_in_bytes()[1]; - cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), - TensorShape(image_w, image_h), tensor->info()->data_type(), image_stride_y, CLImage2DType::ReadOnly); + cl::Image2D tensor_image2d = create_image2d_from_buffer( + CLKernelLibrary::get().context(), tensor->cl_buffer(), TensorShape(image_w, image_h), + tensor->info()->data_type(), image_stride_y, CLImage2DType::ReadOnly); cl_images.push_back(tensor_image2d); _kernel.setArg(idx++, tensor_image2d); @@ -170,13 +183,16 @@ inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKer } #else // ACL_INTERNAL_TEST_CKW_IN_DF -inline void ClKernelRuntime::add_kernel_argument(unsigned int &idx, const GpuKernelArgumentBinding &arg, const ICLTensor *tensor, std::vector &cl_images) +inline void ClKernelRuntime::add_kernel_argument(unsigned int &idx, + const GpuKernelArgumentBinding &arg, + const ICLTensor *tensor, + std::vector &cl_images) { - switch(arg.type()) + switch (arg.type()) { case GpuKernelArgumentBinding::Type::TensorStorage: { - switch(arg.tensor_storage_type()) + switch (arg.tensor_storage_type()) { case TensorStorageType::ClBufferUint8Ptr: { @@ -238,7 +254,7 @@ void ClKernelRuntime::run_op(ITensorPack &tensors, const Window &window, cl::Com // CLImages created from tensor arguments. 
Need to be retained until enqueue std::vector cl_images; #ifndef ACL_INTERNAL_TEST_CKW_IN_DF - for(auto id_arg : _arguments) + for (auto id_arg : _arguments) { const auto arg = id_arg.second; auto tensor = utils::cast::polymorphic_downcast(tensors.get_tensor(id_arg.first)); @@ -248,7 +264,7 @@ void ClKernelRuntime::run_op(ITensorPack &tensors, const Window &window, cl::Com } #else // ACL_INTERNAL_TEST_CKW_IN_DF - for(const auto &arg : _arguments) + for (const auto &arg : _arguments) { auto tensor = utils::cast::polymorphic_downcast(tensors.get_tensor(arg.id())); ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); @@ -259,8 +275,7 @@ void ClKernelRuntime::run_op(ITensorPack &tensors, const Window &window, cl::Com // Dispatch kernel enqueue(queue, *this, slice, lws_hint(), use_dummy_work_items); - } - while(skip_sliding_window && window.slide_window_slice_3D(slice)); + } while (skip_sliding_window && window.slide_window_slice_3D(slice)); } } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h index 92e73503cecee2d332257ded4294af597da9e401..e78567eb9d936a1863a3066d534ad97ecc165f47 100644 --- a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h +++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h @@ -68,7 +68,11 @@ private: * @param[in] arg_slice Window the kernel will be run on * @param[out] cl_images Extra cl images created from the tensor (will need to be retained until the kernel is enqueued) */ - inline void add_tensor_argument(unsigned int &idx, const GpuKernelArgumentInfo &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector &cl_images); + inline void add_tensor_argument(unsigned int &idx, + const GpuKernelArgumentInfo &arg, + const ICLTensor *tensor, + const Window &arg_slice, + std::vector &cl_images); #else // ACL_INTERNAL_TEST_CKW_IN_DF /** Set a kernel argument as part of a tensor * @@ -77,7 +81,10 @@ private: * @param[in] tensor Tensor of which the kernel argument @p arg is a part of * @param[out] cl_images Extra cl images created from the tensor (will need to be retained until the kernel is enqueued) */ - inline void add_kernel_argument(unsigned int &idx, const GpuKernelArgumentBinding &arg, const ICLTensor *tensor, std::vector &cl_images); + inline void add_kernel_argument(unsigned int &idx, + const GpuKernelArgumentBinding &arg, + const ICLTensor *tensor, + std::vector &cl_images); #endif // ACL_INTERNAL_TEST_CKW_IN_DF private: diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp b/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp index cd21b10180fc593cf5cf7766ff1cc35d849d5955..ba39ff4c9d1dae26f67c48140927444328eff116 100644 --- a/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp +++ b/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/experimental/Types.h" #include "arm_compute/runtime/CL/CLTensor.h" + #include "src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h" #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h" @@ -55,14 +56,14 @@ public: { DataView() = default; DataView(CLTensor *tensor, const TensorInfo &tensor_info, const AuxMemoryInfo &memory_info) - : tensor{ tensor }, tensor_info{ tensor_info }, memory_info{ memory_info } + : tensor{tensor}, tensor_info{tensor_info}, memory_info{memory_info} { } - ~DataView() = default; - DataView(const DataView &other) = default; + ~DataView() = default; + DataView(const DataView 
&other) = default; DataView &operator=(const DataView &other) = default; DataView(DataView &&other) = default; - DataView &operator=(DataView &&other) = default; + DataView &operator=(DataView &&other) = default; CLTensor *tensor{}; /**< Pointer to the auxiliary tensor */ TensorInfo tensor_info{}; /**< Associated tensor info */ AuxMemoryInfo memory_info{}; /**< Memory requirement */ @@ -92,7 +93,7 @@ private: { const auto t_id = tensor_info.id(); auto find_tensor_pair = _owned_tensors.find(t_id); - if(find_tensor_pair != _owned_tensors.end()) + if (find_tensor_pair != _owned_tensors.end()) { return find_tensor_pair->second.get(); } @@ -107,7 +108,7 @@ private: } std::map> _owned_tensors{}; - std::vector _tensors{}; + std::vector _tensors{}; }; /** Construct auxiliary tensors required by @ref GpuWorkloadSourceCode * @@ -120,12 +121,12 @@ private: */ Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode &code) { - for(auto t_id : code.tensors()) + for (auto t_id : code.tensors()) { // Get tensor object const auto workload_arg = code.query_tensor(t_id); ICLTensor *tensor_object = nullptr; - if(workload_arg->memory_descriptor()->memory_type == MemoryType::Auxiliary) + if (workload_arg->memory_descriptor()->memory_type == MemoryType::Auxiliary) { // Create aux tensor CLTensor object const TensorInfo tensor_info = *workload_arg->tensor_info(); @@ -133,7 +134,7 @@ Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode const auto aux_memory_info = workload_arg->memory_descriptor()->aux_memory_info; tensor_object = aux_tensors->add_aux_tensor(tensor_info, aux_memory_info); - if(tensor_object == nullptr) + if (tensor_object == nullptr) { return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Failed to construct an auxiliary tensor"); } @@ -156,7 +157,7 @@ public: ITensorPack *find_tensor_pack(UnitWorkloadId uwk_id) { auto tensor_pack = _tensor_packs.find(uwk_id); - if(tensor_pack != _tensor_packs.end()) + if (tensor_pack != _tensor_packs.end()) { return &(tensor_pack->second); } @@ -173,7 +174,10 @@ public: return _tensor_packs.at(uwk_id); } - friend Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &code, const std::vector &user_tensors, const ClAuxTensors &aux_tensors); + friend Status create_tensor_lut(ClTensorLUT *tensor_lut, + const GpuWorkloadSourceCode &code, + const std::vector &user_tensors, + const ClAuxTensors &aux_tensors); private: /** Add a tensor pack and associate it with @ref UnitWorkloadId @p uwk_id @@ -197,19 +201,22 @@ private: * * @return Status */ -Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &code, const std::vector &user_tensors, const ClAuxTensors &aux_tensors) +Status create_tensor_lut(ClTensorLUT *tensor_lut, + const GpuWorkloadSourceCode &code, + const std::vector &user_tensors, + const ClAuxTensors &aux_tensors) { // Combine user tensors and aux tensors std::map tensor_map; - for(auto tensor : user_tensors) + for (auto tensor : user_tensors) { const auto t_id = tensor->info()->id(); - if(tensor_map.find(t_id) != tensor_map.end()) + if (tensor_map.find(t_id) != tensor_map.end()) { // In case of elementwise in-place: give another Id to the In/Out tensor when passed again std::vector ids; - for(auto &t : tensor_map) + for (auto &t : tensor_map) { ids.push_back(t.first); } @@ -221,11 +228,11 @@ Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &c tensor_map[t_id] = tensor; } } - for(const auto &data : aux_tensors.get_tensors()) + 
for (const auto &data : aux_tensors.get_tensors()) { const auto t_id = data.tensor_info.id(); const auto tensor = data.tensor; - if(tensor_map.find(t_id) != tensor_map.end()) + if (tensor_map.find(t_id) != tensor_map.end()) { return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Clashing tensor ids"); } @@ -233,25 +240,25 @@ Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &c } // Add tensor objects into corresponding tensor packs - for(auto id_tensor : tensor_map) + for (auto id_tensor : tensor_map) { const auto t_id = id_tensor.first; const auto tensor_object = id_tensor.second; - if(tensor_object == nullptr) + if (tensor_object == nullptr) { return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Trying to add a nullptr into the tensor packs"); } - if(tensor_object->allocator()->info().total_size() == 0U) + if (tensor_object->allocator()->info().total_size() == 0U) { return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "No allocated memory found in tensor"); } - for(auto uwk_id : code.get_unit_workloads_from_tensor(t_id)) + for (auto uwk_id : code.get_unit_workloads_from_tensor(t_id)) { ITensorPack *tensor_pack = tensor_lut->find_tensor_pack(uwk_id); - if(tensor_pack == nullptr) + if (tensor_pack == nullptr) { - tensor_lut->add_tensor_pack(uwk_id, ITensorPack{ { t_id, tensor_object } }); + tensor_lut->add_tensor_pack(uwk_id, ITensorPack{{t_id, tensor_object}}); } else { @@ -269,15 +276,14 @@ struct ClWorkloadRuntime::Implementation { std::map> _kernels{}; std::map> _kernels_prep{}; - bool _is_configured{ false }; - bool _is_prepared{ false }; - ClTensorLUT _tensor_lut{}; - ClAuxTensors _aux_tensors{}; - GpuWorkloadSourceCode _source_code{}; + bool _is_configured{false}; + bool _is_prepared{false}; + ClTensorLUT _tensor_lut{}; + ClAuxTensors _aux_tensors{}; + GpuWorkloadSourceCode _source_code{}; }; -ClWorkloadRuntime::ClWorkloadRuntime() - : _impl{ std::make_unique() } +ClWorkloadRuntime::ClWorkloadRuntime() : _impl{std::make_unique()} { } @@ -286,18 +292,19 @@ ClWorkloadRuntime::~ClWorkloadRuntime() = default; Status ClWorkloadRuntime::configure(const GpuWorkloadSketch &sketch) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(_impl->_is_configured, "ClWorkloadRuntime cannot be re-configured"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(sketch.gpu_context()->gpu_language() != GpuLanguage::OpenCL, "ClWorkloadRuntime cannot be configured with non-OpenCL workload sketch"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(sketch.gpu_context()->gpu_language() != GpuLanguage::OpenCL, + "ClWorkloadRuntime cannot be configured with non-OpenCL workload sketch"); // Generate source code _impl->_source_code = sketch.implementation().generate_source_code(); // Configure unit workload from source code - for(auto uwk_id : _impl->_source_code.unit_workloads()) + for (auto uwk_id : _impl->_source_code.unit_workloads()) { const auto work = _impl->_source_code.query_unit_workload(uwk_id); const auto stage = work.stage().stage; auto k = std::make_unique(); k->configure(*sketch.gpu_context()->cl_compile_context(), work.code()); - switch(stage) + switch (stage) { case UnitWorkloadStage::Stage::Run: { @@ -323,9 +330,9 @@ Status ClWorkloadRuntime::configure(const GpuWorkloadSketch &sketch) void ClWorkloadRuntime::prepare() { - if(!_impl->_is_prepared) + if (!_impl->_is_prepared) { - for(auto &id_kernel_pair : _impl->_kernels_prep) + for (auto &id_kernel_pair : _impl->_kernels_prep) { const bool flush_queue = false; const auto uwk_id = id_kernel_pair.first; @@ -344,7 +351,7 @@ Status 
ClWorkloadRuntime::run(const std::vector &tensors) const auto st = create_tensor_lut(&_impl->_tensor_lut, _impl->_source_code, tensors, _impl->_aux_tensors); ARM_COMPUTE_RETURN_ON_ERROR(st); prepare(); - for(auto &id_kernel_pair : _impl->_kernels) + for (auto &id_kernel_pair : _impl->_kernels) { // Flush the command queue on the last kernel const bool flush_queue = false; @@ -358,7 +365,7 @@ Status ClWorkloadRuntime::run(const std::vector &tensors) std::vector> ClWorkloadRuntime::get_auxiliary_tensors() { std::vector> aux_tensors; - for(const auto &data : _impl->_aux_tensors.get_tensors()) + for (const auto &data : _impl->_aux_tensors.get_tensors()) { aux_tensors.emplace_back(data.tensor, data.tensor_info, data.memory_info); } diff --git a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp index 84fb279237a1351b8c68f95f7f02b47429c1a20e..7044b0ea66319a8e04e424493813b3ee1683f8df 100644 --- a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp +++ b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp @@ -30,14 +30,17 @@ namespace experimental { namespace dynamic_fusion { -void cl_add_tensor_component_argument(cl::Kernel &kernel, unsigned int &idx, const ICLTensor *tensor, TensorComponentType component) +void cl_add_tensor_component_argument(cl::Kernel &kernel, + unsigned int &idx, + const ICLTensor *tensor, + TensorComponentType component) { ARM_COMPUTE_ERROR_ON(tensor == nullptr); const auto *info = tensor->info(); const auto &strides = info->strides_in_bytes(); - switch(component) + switch (component) { case TensorComponentType::OffsetFirstElement: kernel.setArg(idx++, info->offset_first_element_in_bytes()); diff --git a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h index 4cbb157a48e05d9096b5016bb9a8fa1760b32675..306d547acb2f32e4dfcf068b79342d0ca659d5ef 100644 --- a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h +++ b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h @@ -42,7 +42,10 @@ namespace dynamic_fusion * @param[in] tensor Tensor from which to access the tensor component. * @param[in] component Tensor component to select such as tensor dimensions, strides, etc. */ -void cl_add_tensor_component_argument(cl::Kernel &kernel, unsigned int &idx, const ICLTensor *tensor, TensorComponentType component); +void cl_add_tensor_component_argument(cl::Kernel &kernel, + unsigned int &idx, + const ICLTensor *tensor, + TensorComponentType component); /** Add an OpenCL buffer object to the kernel's arguments at the specified index @p idx. * diff --git a/src/dynamic_fusion/sketch/ArgumentPack.h b/src/dynamic_fusion/sketch/ArgumentPack.h index f118d7d8515a9026ec3a5b6f0e7f0dc7df94fd83..d030bc3d458c2ee52b65a837bffd784c975974c6 100644 --- a/src/dynamic_fusion/sketch/ArgumentPack.h +++ b/src/dynamic_fusion/sketch/ArgumentPack.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,10 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef SRC_DYNAMIC_FUSION_SKETCH_ARGUMENTPACK -#define SRC_DYNAMIC_FUSION_SKETCH_ARGUMENTPACK +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_ARGUMENTPACK_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_ARGUMENTPACK_H #include "arm_compute/core/experimental/Types.h" + #include #include @@ -45,33 +46,28 @@ template class ArgumentPack { public: - /** @ref TensorType encodes the position of a tensor argument within the pack */ + /** @ref arm_compute::TensorType encodes the position of a tensor argument within the pack */ using Id = TensorType; /** A single argument element within the pack * It contains either a const pointer or a non-const pointer to the Tensor-related type T, but never at the same time */ struct PackElement { - PackElement() = default; - PackElement(const PackElement &elem) = default; + PackElement() = default; + PackElement(const PackElement &elem) = default; PackElement &operator=(const PackElement &elem) = default; PackElement(PackElement &&elem) = default; - PackElement &operator=(PackElement &&elem) = default; - PackElement(Id id, T *tensor) - : id(id), tensor(tensor), ctensor(nullptr) + PackElement &operator=(PackElement &&elem) = default; + PackElement(Id id, T *tensor) : id(id), tensor(tensor), ctensor(nullptr) { } - PackElement(Id id, const T *ctensor) - : id(id), tensor(nullptr), ctensor(ctensor) + PackElement(Id id, const T *ctensor) : id(id), tensor(nullptr), ctensor(ctensor) { } - Id id{ ACL_UNKNOWN }; /**< Argument id within the pack */ - T *tensor{ nullptr }; /**< Non-const pointer to tensor-related object */ - const T *ctensor - { - nullptr - }; /**< Const pointer to tensor-related object */ + Id id{ACL_UNKNOWN}; /**< Argument id within the pack */ + T *tensor{nullptr}; /**< Non-const pointer to tensor-related object */ + const T *ctensor{nullptr}; /**< Const pointer to tensor-related object */ }; public: @@ -88,10 +84,9 @@ public: /** Allow instances of this class to be moved */ ArgumentPack &operator=(ArgumentPack &&other) = default; /** Initializer list Constructor */ - ArgumentPack(const std::initializer_list &l) - : _pack{} + ArgumentPack(const std::initializer_list &l) : _pack{} { - for(const auto &e : l) + for (const auto &e : l) { _pack[e.id] = e; } @@ -134,7 +129,7 @@ public: const T *get_const_tensor(Id id) const { auto it = _pack.find(id); - if(it != _pack.end()) + if (it != _pack.end()) { return it->second.ctensor != nullptr ? 
it->second.ctensor : it->second.tensor; } @@ -171,10 +166,10 @@ public: std::vector get_src_tensors() { std::vector src_tensors{}; - for(int id = static_cast(TensorType::ACL_SRC); id <= static_cast(TensorType::ACL_SRC_END); ++id) + for (int id = static_cast(TensorType::ACL_SRC); id <= static_cast(TensorType::ACL_SRC_END); ++id) { auto tensor = get_tensor(static_cast(id)); - if(tensor != nullptr) + if (tensor != nullptr) { src_tensors.push_back(tensor); } @@ -188,10 +183,10 @@ public: std::vector get_const_src_tensors() const { std::vector src_tensors{}; - for(int id = static_cast(TensorType::ACL_SRC); id <= static_cast(TensorType::ACL_SRC_END); ++id) + for (int id = static_cast(TensorType::ACL_SRC); id <= static_cast(TensorType::ACL_SRC_END); ++id) { auto tensor = get_const_tensor(static_cast(id)); - if(tensor != nullptr) + if (tensor != nullptr) { src_tensors.push_back(tensor); } @@ -205,10 +200,10 @@ public: std::vector get_dst_tensors() { std::vector dst_tensors{}; - for(int id = static_cast(TensorType::ACL_DST); id <= static_cast(TensorType::ACL_DST_END); ++id) + for (int id = static_cast(TensorType::ACL_DST); id <= static_cast(TensorType::ACL_DST_END); ++id) { auto tensor = get_tensor(static_cast(id)); - if(tensor != nullptr) + if (tensor != nullptr) { dst_tensors.push_back(tensor); } @@ -222,10 +217,10 @@ public: std::vector get_const_dst_tensors() const { std::vector dst_tensors{}; - for(int id = static_cast(TensorType::ACL_DST); id <= static_cast(TensorType::ACL_DST_END); ++id) + for (int id = static_cast(TensorType::ACL_DST); id <= static_cast(TensorType::ACL_DST_END); ++id) { auto tensor = get_const_tensor(static_cast(id)); - if(tensor != nullptr) + if (tensor != nullptr) { dst_tensors.push_back(tensor); } @@ -239,4 +234,4 @@ private: } // namespace dynamic_fusion } // namespace experimental } // namespace arm_compute -#endif /* SRC_DYNAMIC_FUSION_SKETCH_ARGUMENTPACK */ +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_ARGUMENTPACK_H diff --git a/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp b/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp index 3a5657e07b8a8e6492dae1abfb30515f888dcc25..6f3816568ce921fbb71dc65c1d17c38625558f48 100644 --- a/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp +++ b/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp @@ -69,7 +69,8 @@ uint32_t DepthwiseConv2dAttributes::depth_multiplier() const return _depth_multiplier; } -DepthwiseConv2dAttributes &DepthwiseConv2dAttributes::dimension_rounding_type(const DimensionRoundingType &dimension_rounding_type) +DepthwiseConv2dAttributes & +DepthwiseConv2dAttributes::dimension_rounding_type(const DimensionRoundingType &dimension_rounding_type) { _dimension_rounding_type = dimension_rounding_type; return *this; diff --git a/src/dynamic_fusion/sketch/attributes/MatMulAttributes.cpp b/src/dynamic_fusion/sketch/attributes/MatMulAttributes.cpp new file mode 100644 index 0000000000000000000000000000000000000000..027b5503770e4672fbe484664e7639f0a939d5ea --- /dev/null +++ b/src/dynamic_fusion/sketch/attributes/MatMulAttributes.cpp @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/dynamic_fusion/sketch/attributes/MatMulAttributes.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +MatMulAttributes MatMulAttributes::adj_lhs(bool adj_lhs) +{ + _adj_lhs = adj_lhs; + return *this; +} +MatMulAttributes MatMulAttributes::adj_rhs(bool adj_rhs) +{ + _adj_rhs = adj_rhs; + return *this; +} +bool MatMulAttributes::adj_lhs() const +{ + return _adj_lhs; +} +bool MatMulAttributes::adj_rhs() const +{ + return _adj_rhs; +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp b/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp index c28791f5fe32ac872a2422ea76dfecaa56fe4b28..80f65f926ad239cd950df3a6721336eb5e853790 100644 --- a/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp +++ b/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h" + #include "arm_compute/core/Size2D.h" namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h index 226e1a2df3a8293f62c9300472f574e764f901c9..03817173f4a576be3006e91d76975dacd0016622 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h @@ -61,11 +61,10 @@ struct GpuKernelArgumentInfo /** Default constructor */ GpuKernelArgumentInfo() = default; /** Constructor */ - GpuKernelArgumentInfo(Type type) - : type{ type } + GpuKernelArgumentInfo(Type type) : type{type} { } - Type type{ Type::Tensor_4D_t_Buffer }; + Type type{Type::Tensor_4D_t_Buffer}; }; bool operator==(const GpuKernelArgumentInfo &info0, const GpuKernelArgumentInfo &info1); /** Kernel argument information linked with its corresponding @ref ITensorInfo @@ -79,10 +78,8 @@ public: * @param[in] tensor_info Associated @ref ITensorInfo * @param[in] kernel_arg_info Associated @ref GpuKernelArgumentInfo */ - GpuKernelArgument(const ITensorInfo &tensor_info, - const GpuKernelArgumentInfo &kernel_arg_info) - : _tensor_info{ tensor_info }, - _kernel_arg_info{ kernel_arg_info } + GpuKernelArgument(const ITensorInfo &tensor_info, const GpuKernelArgumentInfo &kernel_arg_info) + : _tensor_info{tensor_info}, _kernel_arg_info{kernel_arg_info} { } /** Get workload tensor id */ 
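Editor's note (illustrative sketch, not part of the patch): the MatMulAttributes type introduced in MatMulAttributes.cpp above only carries the two adjoint flags, so a minimal usage example follows. The header path and the adj_lhs()/adj_rhs() accessors are taken from the patch itself; the dynamic fusion matmul operator that would consume these attributes is assumed and not shown in these hunks.

#include "arm_compute/dynamic_fusion/sketch/attributes/MatMulAttributes.h"

using arm_compute::experimental::dynamic_fusion::MatMulAttributes;

// Build attributes where only the right-hand operand is flagged as adjoint (transposed).
MatMulAttributes make_matmul_attributes()
{
    MatMulAttributes attr{};
    attr.adj_lhs(false); // use the LHS operand as-is
    attr.adj_rhs(true);  // treat the RHS operand as adjoint
    return attr;
}

Setting each flag in its own statement keeps the sketch independent of whether the setters support chaining.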
@@ -200,12 +197,12 @@ public: TensorComponent /** @ref TensorComponentType */ }; GpuKernelArgumentBinding(ITensorInfo::Id id, TensorStorageType storage) - : _type{ Type::TensorStorage }, _id{ id }, _value{} + : _type{Type::TensorStorage}, _id{id}, _value{} { _value.tensor_storage_type = storage; } GpuKernelArgumentBinding(ITensorInfo::Id id, TensorComponentType component) - : _type{ Type::TensorComponent }, _id{ id }, _value{} + : _type{Type::TensorComponent}, _id{id}, _value{} { _value.tensor_component_type = component; } diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp index 5a65ede38b77103e2def9ca64684d3471bab3844..1a458c98623e10b0d275cde8e265b10ebccd1a39 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp @@ -31,35 +31,31 @@ namespace experimental { namespace dynamic_fusion { -std::vector GpuKernelComponentGraph::get_tensor_ids(const std::vector tensors) +std::vector +GpuKernelComponentGraph::get_tensor_ids(const std::vector tensors) { std::vector tensor_ids{}; - std::transform( - std::begin(tensors), std::end(tensors), - std::back_inserter(tensor_ids), - [](const auto & t) - { - return t->id(); - }); + std::transform(std::begin(tensors), std::end(tensors), std::back_inserter(tensor_ids), + [](const auto &t) { return t->id(); }); return tensor_ids; } GpuKernelComponentGraph::GpuKernelComponentGraph(GpuWorkloadContext *context, GpuComponentServices *services) - : _context{ context }, _services{ services }, _components{}, _tensors{}, _dependency_graph{} + : _context{context}, _services{services}, _components{}, _tensors{}, _dependency_graph{} { } GpuKernelComponentStream GpuKernelComponentGraph::fuse(const MemoryDescriptorMap &mem_map) const { - GpuKernelComponentStream stream{ _context, _services, mem_map }; + GpuKernelComponentStream stream{_context, _services, mem_map}; const auto op_seq = _dependency_graph.build_operators_sequence(); stream.new_component_group(); - for(auto op : op_seq) + for (auto op : op_seq) { const auto component = _components.at(op.op).get(); const auto success = stream.add_component(component); - if(!success) // Assume first failure was because the root component is unfusable + if (!success) // Assume first failure was because the root component is unfusable { stream.new_component_group(); const auto success = stream.add_component(component); diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h index 85c9b4584004c349748015a5d2843b855a2fc5f6..6f871a3c90aa137fff5e7354316633c3d8f8ed6d 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h @@ -70,21 +70,21 @@ public: * @param[in] args Component arguments except for component id, which is auto-allocated */ template - void add_new_component(Args &&... 
args) + void add_new_component(Args &&...args) { - auto comp = _services->component_factory().create(std::forward(args)...); - ArgumentPack tensors = comp->tensors(); + auto comp = _services->component_factory().create(std::forward(args)...); + ArgumentPack tensors = comp->tensors(); const auto src_tensor_ids = get_tensor_ids(tensors.get_const_src_tensors()); const auto dst_tensor_ids = get_tensor_ids(tensors.get_const_dst_tensors()); - bool success = _dependency_graph.add_operator(comp->id(), src_tensor_ids, dst_tensor_ids); + bool success = _dependency_graph.add_operator(comp->id(), src_tensor_ids, dst_tensor_ids); ARM_COMPUTE_UNUSED(success); ARM_COMPUTE_ERROR_ON(!success); _components[comp->id()] = std::move(comp); - for(auto t : tensors.get_const_src_tensors()) + for (auto t : tensors.get_const_src_tensors()) { _tensors[t->id()] = t; } - for(auto t : tensors.get_const_dst_tensors()) + for (auto t : tensors.get_const_dst_tensors()) { _tensors[t->id()] = t; } @@ -99,11 +99,11 @@ public: private: static std::vector get_tensor_ids(const std::vector tensors); - GpuWorkloadContext *_context; - GpuComponentServices *_services; + GpuWorkloadContext *_context; + GpuComponentServices *_services; std::map> _components; std::map _tensors; - DependencyGraph _dependency_graph{}; + DependencyGraph _dependency_graph{}; }; } // namespace dynamic_fusion } // namespace experimental diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp index 81c3f0c8006ea7d7a73d4644128a01afd93bb75c..5a6d125d9632d9a23e8f39fef750aecae0a96ac4 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/Validate.h" + #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" #include @@ -37,86 +38,87 @@ namespace dynamic_fusion { bool GpuKernelComponentGroup::add_component(ComponentPtr component) { - ARM_COMPUTE_ERROR_ON_MSG( - _finalized, "The component group has been finalized and cannot be altered."); + ARM_COMPUTE_ERROR_ON_MSG(_finalized, "The component group has been finalized and cannot be altered."); // note: Constraint 1 is guaranteed as a precondition // Constraint 2 - if(component->type() != GpuComponentType::Output && _components.size() >= max_fused_components) + if (component->type() != GpuComponentType::Output && _components.size() >= max_fused_components) { return false; } // Constraint 3.1: Pattern: (Unfusable + Output) - if(!_components.empty() && get_root_component()->type() == GpuComponentType::Unfusable && component->type() != GpuComponentType::Output) + if (!_components.empty() && get_root_component()->type() == GpuComponentType::Unfusable && + component->type() != GpuComponentType::Output) { return false; } // Constraint 3.2 - if(!_components.empty() && (component->type() != GpuComponentType::Simple && component->type() != GpuComponentType::Output)) + if (!_components.empty() && + (component->type() != GpuComponentType::Simple && component->type() != GpuComponentType::Output)) { return false; } // Constraint 4 - if(component->type() != GpuComponentType::Unfusable && component->tensors().get_const_dst_tensors().size() != 1U) + if (component->type() != GpuComponentType::Unfusable && component->tensors().get_const_dst_tensors().size() != 1U) { return false; } // Constraint 5 - if(!_components.empty() && !(get_root_component()->properties() == 
component->properties())) + if (!_components.empty() && !(get_root_component()->properties() == component->properties())) { return false; } // Constraint 7 - if(!_components.empty()) + if (!_components.empty()) { const auto root_dst_tensors = get_root_component()->tensors().get_const_dst_tensors(); ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty()); const auto first_dst_tensor = root_dst_tensors[0]; const auto dst_tensors = component->tensors().get_const_dst_tensors(); - for(const auto &t : root_dst_tensors) + for (const auto &t : root_dst_tensors) { - if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) + if (detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) { return false; } } - for(const auto &t : dst_tensors) + for (const auto &t : dst_tensors) { - if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) + if (detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) { return false; } } } // Constraint 8 - if(!_components.empty()) + if (!_components.empty()) { const auto root_dst_tensors = get_root_component()->tensors().get_const_dst_tensors(); ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty()); const auto first_dst_tensor_layout = root_dst_tensors[0]->data_layout(); const auto dst_tensors = component->tensors().get_const_dst_tensors(); - for(const auto &t : root_dst_tensors) + for (const auto &t : root_dst_tensors) { - if(t->data_layout() != first_dst_tensor_layout) + if (t->data_layout() != first_dst_tensor_layout) { return false; } } - for(const auto &t : dst_tensors) + for (const auto &t : dst_tensors) { - if(t->data_layout() != first_dst_tensor_layout) + if (t->data_layout() != first_dst_tensor_layout) { return false; } } } // Constraint 9 - if(component->tensors().get_const_dst_tensors().size() >= max_dst_tensors) + if (component->tensors().get_const_dst_tensors().size() >= max_dst_tensors) { return false; } // Constraint 9 corollary - if(component->type() == GpuComponentType::Output && _components.size() >= max_fused_components + max_dst_tensors) + if (component->type() == GpuComponentType::Output && _components.size() >= max_fused_components + max_dst_tensors) { return false; } @@ -126,36 +128,36 @@ bool GpuKernelComponentGroup::add_component(ComponentPtr component) void GpuKernelComponentGroup::finalize() { - if(_finalized) + if (_finalized) { return; } _finalized = true; - std::set output_tensors; + std::set output_tensors; std::map> possible_tile_map; - std::map tile_usages; + std::map tile_usages; - for(auto component : _components) + for (auto component : _components) { - const auto tensors = component->tensors(); + const auto tensors = component->tensors(); const auto src_tensors = tensors.get_const_src_tensors(); const auto dst_tensors = tensors.get_const_dst_tensors(); // Detect input, output and intermediate tensors. - for(auto tensor : src_tensors) + for (auto tensor : src_tensors) { const auto output_tensors_it = output_tensors.find(tensor); - if(output_tensors_it != output_tensors.end()) + if (output_tensors_it != output_tensors.end()) { // This tensor is the output of another operator. // It must be marked as intermediate tensor. 
output_tensors.erase(output_tensors_it); _interm_tensors.insert(tensor); } - else if(_interm_tensors.find(tensor) == _interm_tensors.end()) + else if (_interm_tensors.find(tensor) == _interm_tensors.end()) { _input_tensors.insert(tensor); @@ -164,7 +166,7 @@ void GpuKernelComponentGroup::finalize() } } - for(auto tensor : dst_tensors) + for (auto tensor : dst_tensors) { ARM_COMPUTE_ERROR_ON(_input_tensors.find(tensor) != _input_tensors.end()); ARM_COMPUTE_ERROR_ON(output_tensors.find(tensor) != output_tensors.end()); @@ -177,27 +179,27 @@ void GpuKernelComponentGroup::finalize() // Check if the output can overwrite the input tile. const auto component_type = component->type(); - if(component_type == GpuComponentType::Simple || component_type == GpuComponentType::Output) + if (component_type == GpuComponentType::Simple || component_type == GpuComponentType::Output) { ARM_COMPUTE_ERROR_ON(dst_tensors.size() != 1); - const auto dst_tensor = dst_tensors[0]; - const auto &dst_shape = dst_tensor->tensor_shape(); - const auto &dst_type = dst_tensor->data_type(); + const auto dst_tensor = dst_tensors[0]; + const auto &dst_shape = dst_tensor->tensor_shape(); + const auto &dst_type = dst_tensor->data_type(); tile_usages[dst_tensor] = 0; - for(auto src_tensor : src_tensors) + for (auto src_tensor : src_tensors) { const auto &src_shape = src_tensor->tensor_shape(); - const auto &src_type = src_tensor->data_type(); + const auto &src_type = src_tensor->data_type(); - if(src_shape == dst_shape && src_type == dst_type) + if (src_shape == dst_shape && src_type == dst_type) { const auto tile_usages_it = tile_usages.find(src_tensor); ARM_COMPUTE_ERROR_ON(tile_usages_it == tile_usages.end()); - if(component_type == GpuComponentType::Simple || tile_usages_it->second > 0) + if (component_type == GpuComponentType::Simple || tile_usages_it->second > 0) { // Increase the number of tile usages unless this component is an output // and the tile has not been shared with any component. @@ -212,7 +214,7 @@ void GpuKernelComponentGroup::finalize() else { // Outputs of complex and unfusable components need dedicated tile. - for(auto tensor : dst_tensors) + for (auto tensor : dst_tensors) { tile_usages[tensor] = 0; } @@ -220,25 +222,25 @@ void GpuKernelComponentGroup::finalize() } // Find the smallest list of tiles that the intermediate tensors need to write to. - for(auto tensor : _input_tensors) + for (auto tensor : _input_tensors) { _tile_map[tensor] = tensor; } - for(auto component : _components) + for (auto component : _components) { const auto dst_tensors = component->tensors().get_const_dst_tensors(); - for(auto tensor : dst_tensors) + for (auto tensor : dst_tensors) { const auto target_tiles = possible_tile_map.at(tensor); - _tile_map[tensor] = tensor; + _tile_map[tensor] = tensor; - for(auto target : target_tiles) + for (auto target : target_tiles) { const auto num_usage = tile_usages[target]; - if(num_usage <= 1) + if (num_usage <= 1) { // The target tile is consumed by only this operator, so we can reuse it // for the destination tensor data. @@ -249,26 +251,23 @@ void GpuKernelComponentGroup::finalize() } } - for(auto tensor : output_tensors) + for (auto tensor : output_tensors) { _tile_map[tensor] = tensor; } // All intermediate tensors that cannot be shared with any previous tensor // will need to be declared as tile variable. 
- for(auto tensor_tile : _tile_map) + for (auto tensor_tile : _tile_map) { - if(tensor_tile.first == tensor_tile.second && - _interm_tensors.find(tensor_tile.first) != _interm_tensors.end()) + if (tensor_tile.first == tensor_tile.second && _interm_tensors.find(tensor_tile.first) != _interm_tensors.end()) { _tiles.push_back(tensor_tile.first); } } - std::set_union( - _input_tensors.begin(), _input_tensors.end(), - output_tensors.begin(), output_tensors.end(), - std::back_inserter(_argument_tensors)); + std::set_union(_input_tensors.begin(), _input_tensors.end(), output_tensors.begin(), output_tensors.end(), + std::back_inserter(_argument_tensors)); _any_output_tensor = *output_tensors.begin(); } @@ -282,7 +281,7 @@ const ITensorInfo *GpuKernelComponentGroup::get_tile_for_tensor(const ITensorInf { ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized."); - if(_tile_map.find(tensor) != _tile_map.end()) + if (_tile_map.find(tensor) != _tile_map.end()) { return _tile_map.at(tensor); } @@ -304,7 +303,7 @@ std::vector GpuKernelComponentGroup::get_argument_tensors() GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_root_component() const { - if(empty()) + if (empty()) { return nullptr; } diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h index c939aec369743b12fb0871e1925b2d31acb29f15..6ad71abb3996808ac40a1fa5de9786fb57306f19 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h @@ -25,12 +25,11 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGROUP #include "components/Types.h" - #include #include -#include -#include #include +#include +#include namespace arm_compute { @@ -129,9 +128,9 @@ public: /** Get the number of components within the group */ size_t size() const; /** Check if the component group is empty */ - bool empty() const; - ComponentPtr &operator[](size_t index); - const ComponentPtr &operator[](size_t index) const; + bool empty() const; + ComponentPtr &operator[](size_t index); + const ComponentPtr &operator[](size_t index) const; typename std::vector::iterator begin(); typename std::vector::iterator end(); typename std::vector::const_iterator begin() const; @@ -142,13 +141,13 @@ public: private: std::vector _components{}; - bool _finalized{ false }; + bool _finalized{false}; - std::vector _argument_tensors{}; - std::set _input_tensors{}; - std::set _interm_tensors{}; - const ITensorInfo *_any_output_tensor{ nullptr }; - std::vector _tiles{}; + std::vector _argument_tensors{}; + std::set _input_tensors{}; + std::set _interm_tensors{}; + const ITensorInfo *_any_output_tensor{nullptr}; + std::vector _tiles{}; std::map _tile_map{}; }; } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp index a2b6623370afc24657368c8ddf273fc51bb20b60..8042e3dd089570cc59051c0065c82f9634016c73 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp @@ -23,9 +23,9 @@ */ #include "GpuKernelComponentStream.h" +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" #include "src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h" #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h" -#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" namespace arm_compute { @@ -33,8 
+33,10 @@ namespace experimental { namespace dynamic_fusion { -GpuKernelComponentStream::GpuKernelComponentStream(GpuWorkloadContext *context, GpuComponentServices *services, const MemoryDescriptorMap &mem_map) - : _context{ context }, _services{ services }, _component_groups{}, _mem_map{ mem_map } +GpuKernelComponentStream::GpuKernelComponentStream(GpuWorkloadContext *context, + GpuComponentServices *services, + const MemoryDescriptorMap &mem_map) + : _context{context}, _services{services}, _component_groups{}, _mem_map{mem_map} { } @@ -42,7 +44,7 @@ GpuWorkloadSourceCode GpuKernelComponentStream::write_workload_code() { GpuWorkloadSourceCode source_code; // Traverse through component groups and assemble workload together - for(auto && group : _component_groups) + for (auto &&group : _component_groups) { group.finalize(); diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h index ba2503a9381036b6cd17b4f76caa77e19796099f..ef8a8a15b09cfdce63eec40f9367c3763a68640f 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTSTREAM #include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h" + #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h" @@ -53,7 +54,9 @@ public: * @param[in] services @ref GpuComponentServices to be used throughout the stream * @param[in] mem_map @ref MemoryDescriptor map used to assemble the @ref GpuWorkloadSourceCode */ - GpuKernelComponentStream(GpuWorkloadContext *context, GpuComponentServices *services, const MemoryDescriptorMap &mem_map); + GpuKernelComponentStream(GpuWorkloadContext *context, + GpuComponentServices *services, + const MemoryDescriptorMap &mem_map); /** Allow instances of this class to be copy constructed */ GpuKernelComponentStream(const GpuKernelComponentStream &stream) = default; /** Allow instances of this class to be copied */ diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h index 64e1cdc3bc5b5f4bce08947fc60e4fd72fcbfc45..24812cd8a746717bb2fe2e6968a429ca4cf5b4a1 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLCompileContext.h" #include "arm_compute/core/Window.h" + #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" #ifndef ACL_INTERNAL_TEST_CKW_IN_DF diff --git a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp index c99984fc0e1d9294634cfb669fe6b71354fec6e8..502ceab807266959516d4fee6854170599dfc4bf 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp @@ -26,9 +26,9 @@ #include "arm_compute/core/experimental/Types.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h" -#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h" +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" +#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h" #ifndef ACL_INTERNAL_TEST_CKW_IN_DF #include 
"src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h" #else // ACL_INTERNAL_TEST_CKW_IN_DF @@ -42,7 +42,7 @@ namespace experimental namespace dynamic_fusion { GpuLogicalKernel::GpuLogicalKernel(GpuComponentServices *services, const GpuKernelComponentGroup &components) - : _comp_group{ components }, _store_components{} + : _comp_group{components}, _store_components{} { ARM_COMPUTE_UNUSED(services); } @@ -51,9 +51,9 @@ GpuKernelSourceCode GpuLogicalKernel::write_kernel_code() { GpuKernelSourceCode code; #ifndef ACL_INTERNAL_TEST_CKW_IN_DF - ClTemplateWriter writer { _comp_group }; + ClTemplateWriter writer{_comp_group}; #else // ACL_INTERNAL_TEST_CKW_IN_DF - GpuCkwDriver writer { _comp_group }; + GpuCkwDriver writer{_comp_group}; #endif // ACL_INTERNAL_TEST_CKW_IN_DF code.name(writer.get_name()); diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp index 7bb14c8698875343eb113e4075089a8f0cdb7fdf..aec8b9db4f0939b98c736ea84683d697287f54a8 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp @@ -36,20 +36,15 @@ namespace std::vector get_tensor_ids(const std::vector tensors) { std::vector tensor_ids{}; - std::transform( - std::begin(tensors), std::end(tensors), - std::back_inserter(tensor_ids), - [](const auto & t) - { - return t->id(); - }); + std::transform(std::begin(tensors), std::end(tensors), std::back_inserter(tensor_ids), + [](const auto &t) { return t->id(); }); return tensor_ids; } } // namespace Operator::Operator(OperatorId id, GpuOperatorType operator_type, const ArgumentPack &tensors) - : _id{ id }, _operator_type{ operator_type }, _tensors{ tensors } + : _id{id}, _operator_type{operator_type}, _tensors{tensors} { } @@ -73,69 +68,69 @@ bool GpuOperatorGroup::try_add_operator(const Operator &op, bool is_output) cons const auto src_tensor_ids = get_tensor_ids(op.tensors().get_const_src_tensors()); const auto dst_tensor_ids = get_tensor_ids(op.tensors().get_const_dst_tensors()); // Constraint 1 - if(!_graph.try_add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids, is_output)) + if (!_graph.try_add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids, is_output)) { return false; } // Constraint 2 - if(_operators.size() >= max_fused_operators) + if (_operators.size() >= max_fused_operators) { return false; } // Constraint 3.1: Pattern: (Unfusable) - if(_operators.size() > 0 && get_root_operator()->operator_type() == GpuOperatorType::Unfusable) + if (_operators.size() > 0 && get_root_operator()->operator_type() == GpuOperatorType::Unfusable) { return false; } // Constraint 3.2 - if(_operators.size() > 0 && (op.operator_type() != GpuOperatorType::Simple)) + if (_operators.size() > 0 && (op.operator_type() != GpuOperatorType::Simple)) { return false; } // Constraint 4 - if(op.operator_type() != GpuOperatorType::Unfusable && op.tensors().get_const_dst_tensors().size() != 1U) + if (op.operator_type() != GpuOperatorType::Unfusable && op.tensors().get_const_dst_tensors().size() != 1U) { return false; } // Constraint 5 - if(_operators.size() > 0) + if (_operators.size() > 0) { const auto root_dst_tensors = get_root_operator()->tensors().get_const_dst_tensors(); ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty()); const auto first_dst_tensor = root_dst_tensors[0]; const auto dst_tensors = op.tensors().get_const_dst_tensors(); - for(const auto &t : root_dst_tensors) + for (const auto &t : root_dst_tensors) { - 
if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) + if (detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) { return false; } } - for(const auto &t : dst_tensors) + for (const auto &t : dst_tensors) { - if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) + if (detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) { return false; } } } // Constraint 6 - if(_operators.size() > 0) + if (_operators.size() > 0) { const auto root_dst_tensors = get_root_operator()->tensors().get_const_dst_tensors(); ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty()); const auto first_dst_tensor_layout = root_dst_tensors[0]->data_layout(); const auto dst_tensors = op.tensors().get_const_dst_tensors(); - for(const auto &t : root_dst_tensors) + for (const auto &t : root_dst_tensors) { - if(t->data_layout() != first_dst_tensor_layout) + if (t->data_layout() != first_dst_tensor_layout) { return false; } } - for(const auto &t : dst_tensors) + for (const auto &t : dst_tensors) { - if(t->data_layout() != first_dst_tensor_layout) + if (t->data_layout() != first_dst_tensor_layout) { return false; } @@ -151,16 +146,17 @@ void GpuOperatorGroup::add_operator(const Operator &op, bool is_output) _graph.add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids, is_output); _operators[op.id()] = op; } -Operator GpuOperatorGroup::new_operator(const GpuOperatorType &operator_type, const ArgumentPack &tensors) const +Operator GpuOperatorGroup::new_operator(const GpuOperatorType &operator_type, + const ArgumentPack &tensors) const { auto new_id = static_cast(_operators.size()); - return Operator{ new_id, operator_type, tensors }; + return Operator{new_id, operator_type, tensors}; } const Operator *GpuOperatorGroup::get_root_operator() const { const auto roots = _graph.get_root_ops(); ARM_COMPUTE_ERROR_ON(roots.size() > 1); - if(roots.empty()) + if (roots.empty()) { return nullptr; } diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h index 308a9d796ae3e0222f5560e5f720d09b3789ef91..0a2369d357d50f18a785e4c4dd2e6d2a19f1aa73 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h +++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h @@ -25,9 +25,11 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORGROUP #include "arm_compute/core/ITensorInfo.h" + #include "src/dynamic_fusion/sketch/ArgumentPack.h" #include "src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h" #include "src/dynamic_fusion/sketch/utils/DependencyGraph.h" + #include namespace arm_compute @@ -104,7 +106,7 @@ public: const Operator *get_root_operator() const; private: - DependencyGraph _graph{}; + DependencyGraph _graph{}; std::map _operators{}; }; } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp index c2bd01270395198341f94d267471ebba3620f149..36cad790c77a185960ed9510d7158737046a123a 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp @@ -23,7 +23,9 @@ */ #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h" + #include "arm_compute/core/CL/CLCompileContext.h" + #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h" namespace arm_compute @@ -33,7 +35,7 @@ namespace experimental namespace dynamic_fusion { 
GpuWorkloadContext::GpuWorkloadContext(CLCompileContext *cl_compile_ctx) - : _impl{ std::make_unique(GpuLanguage::OpenCL, cl_compile_ctx) } + : _impl{std::make_unique(GpuLanguage::OpenCL, cl_compile_ctx)} { } @@ -74,7 +76,11 @@ const GpuWorkloadContext::Impl &GpuWorkloadContext::implementation() const } GpuWorkloadContext::Impl::Impl(GpuLanguage gpu_language, CLCompileContext *cl_compile_ctx) - : _gpu_language(gpu_language), _cl_compile_ctx(cl_compile_ctx), _next_tensor_id(1), _mem_map(), _managed_tensor_info() + : _gpu_language(gpu_language), + _cl_compile_ctx(cl_compile_ctx), + _next_tensor_id(1), + _mem_map(), + _managed_tensor_info() { } @@ -100,7 +106,7 @@ void GpuWorkloadContext::Impl::register_user_tensor(ITensorInfo &tensor_info) const auto tensor_id = next_tensor_id(); tensor_info.set_id(tensor_id); - _mem_map[tensor_id] = MemoryDescriptor{ MemoryType::User }; + _mem_map[tensor_id] = MemoryDescriptor{MemoryType::User}; // Save a *copy* of the user tensor info in workload context for future reference // Note that this means if the user modifies the @p tensor_info, the change will not be reflected in the context _managed_tensor_info.emplace(tensor_info.id(), std::make_unique(tensor_info)); @@ -111,7 +117,7 @@ ITensorInfo *GpuWorkloadContext::Impl::create_virtual_tensor() auto tensor_info = std::make_unique(); const auto tensor_id = -next_tensor_id(); tensor_info->set_id(tensor_id); - _mem_map[tensor_id] = MemoryDescriptor{ MemoryType::Virtual }; + _mem_map[tensor_id] = MemoryDescriptor{MemoryType::Virtual}; auto inserted = _managed_tensor_info.emplace(tensor_info->id(), std::move(tensor_info)); return inserted.first->second.get(); } @@ -121,7 +127,7 @@ ITensorInfo *GpuWorkloadContext::Impl::create_auxiliary_tensor(const ITensorInfo auto tensor_info = std::make_unique(itensor_info); const auto tensor_id = next_tensor_id(); tensor_info->set_id(tensor_id); - _mem_map[tensor_id] = MemoryDescriptor{ MemoryType::Auxiliary, AuxMemoryInfo{ tensor_info->total_size() } }; + _mem_map[tensor_id] = MemoryDescriptor{MemoryType::Auxiliary, AuxMemoryInfo{tensor_info->total_size()}}; auto inserted = _managed_tensor_info.emplace(tensor_info->id(), std::move(tensor_info)); return inserted.first->second.get(); } diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h index c169476a70c01d82c00f403abb5b68f36853df41..7d9699031f90ed51bc9a4e7a56f1a74f61a4aadf 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h +++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h @@ -27,8 +27,8 @@ #include "arm_compute/core/CL/CLCompileContext.h" #include "arm_compute/core/ITensorInfo.h" -#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h" #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h" +#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h" namespace arm_compute { @@ -93,8 +93,8 @@ private: GpuLanguage _gpu_language; CLCompileContext *_cl_compile_ctx; - ITensorInfo::Id _next_tensor_id; - MemoryDescriptorMap _mem_map; + ITensorInfo::Id _next_tensor_id; + MemoryDescriptorMap _mem_map; std::map> _managed_tensor_info; }; diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp index d3a20c0dfe4ce9119510b0c6793035130ecd8910..973f7c747fa6e7f0b7549200aec54fac10fba356 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" + #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" namespace arm_compute @@ -30,8 +31,7 @@ namespace experimental { namespace dynamic_fusion { -GpuWorkloadSketch::GpuWorkloadSketch(Context *context) - : _impl{ std::make_unique(context) } +GpuWorkloadSketch::GpuWorkloadSketch(Context *context) : _impl{std::make_unique(context)} { } GpuWorkloadSketch::~GpuWorkloadSketch() diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h index d3033898e9a722c82e4c1105eb9e444a17a50215..fea4fe957724bb5f77f4df4dbd9fa43cb1b3ce92 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h +++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h @@ -24,8 +24,9 @@ #ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL #define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL -#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h" #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" +#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h" + #include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h" #include "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h" @@ -45,12 +46,8 @@ public: * * @param[in] context global workload creation context */ - explicit Implementation( - Context *context) - : _context{ context }, - _comp_services{}, - _component_graph{ _context, &_comp_services }, - _operator_group{} + explicit Implementation(Context *context) + : _context{context}, _comp_services{}, _component_graph{_context, &_comp_services}, _operator_group{} { } /** Prevent instances of this class from being copy constructed */ diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h index 578366daaf5b600ce9f0a1f1e6345eb3ceb015f9..43bcc47fa09fd12c6bf45fccf9ec856d4525a8eb 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h +++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h @@ -26,6 +26,7 @@ #include "arm_compute/core/experimental/Types.h" #include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h" + #include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h" #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h" @@ -45,7 +46,7 @@ namespace */ GpuKernelArgumentList extract_kernel_args_for_one_tensor(GpuKernelArgumentList &flat_kernel_args) { - if(flat_kernel_args.empty()) + if (flat_kernel_args.empty()) { return {}; } @@ -56,10 +57,10 @@ GpuKernelArgumentList extract_kernel_args_for_one_tensor(GpuKernelArgumentList & flat_kernel_args.pop_front(); const auto tensor_id = karg_head.id(); - while(!flat_kernel_args.empty()) + while (!flat_kernel_args.empty()) { const GpuKernelArgumentBinding &karg = flat_kernel_args.front(); - if(karg.id() != tensor_id) // Encounter the next tensor, return the current tensor's kernel arguments + if (karg.id() != tensor_id) // Encounter the next tensor, return the current tensor's kernel arguments { return tensor_kargs; } @@ -68,7 +69,7 @@ GpuKernelArgumentList extract_kernel_args_for_one_tensor(GpuKernelArgumentList & } return tensor_kargs; } -} +} // namespace #endif // ACL_INTERNAL_TEST_CKW_IN_DF /** Uniquely identifies a @ref GpuUnitWorkload within a @ref GpuWorkloadSourceCode */ using UnitWorkloadId = int32_t; @@ -92,9 +93,7 @@ public: GpuWorkloadArgument(const ITensorInfo &tensor_info, const MemoryDescriptor 
&mem_desc, const GpuKernelArgumentInfo &kernel_arg_info) - : _tensor_info{ tensor_info }, - _mem_desc{ mem_desc }, - _kernel_arg_info{ kernel_arg_info } + : _tensor_info{tensor_info}, _mem_desc{mem_desc}, _kernel_arg_info{kernel_arg_info} { } #else // ACL_INTERNAL_TEST_CKW_IN_DF @@ -107,9 +106,7 @@ public: GpuWorkloadArgument(const ITensorInfo &tensor_info, const MemoryDescriptor &mem_desc, const GpuKernelArgumentList &kernel_args) - : _tensor_info{ tensor_info }, - _mem_desc{ mem_desc }, - _kernel_args{ kernel_args } + : _tensor_info{tensor_info}, _mem_desc{mem_desc}, _kernel_args{kernel_args} { } #endif // ACL_INTERNAL_TEST_CKW_IN_DF @@ -175,9 +172,9 @@ private: TensorInfo _tensor_info{}; MemoryDescriptor _mem_desc{}; #ifndef ACL_INTERNAL_TEST_CKW_IN_DF - GpuKernelArgumentInfo _kernel_arg_info {}; + GpuKernelArgumentInfo _kernel_arg_info{}; #else // ACL_INTERNAL_TEST_CKW_IN_DF - GpuKernelArgumentList _kernel_args {}; + GpuKernelArgumentList _kernel_args{}; #endif // ACL_INTERNAL_TEST_CKW_IN_DF }; @@ -190,7 +187,7 @@ struct UnitWorkloadStage Prepare, /**< Only run once at the beginning. */ Run, /**< Run every time after the first time. */ }; - Stage stage{ Stage::Run }; + Stage stage{Stage::Run}; }; inline bool operator==(const UnitWorkloadStage &stage0, const UnitWorkloadStage &stage1) @@ -212,7 +209,7 @@ public: * @param[in] stage Stage of the unit workload */ GpuUnitWorkload(UnitWorkloadId id, const GpuKernelSourceCode &kernel_code, const UnitWorkloadStage &stage) - : _id{ id }, _kernel_code{ kernel_code }, _stage{ stage } + : _id{id}, _kernel_code{kernel_code}, _stage{stage} { } /** Get the id of the unit workload */ @@ -253,7 +250,10 @@ public: * * @return UnitWorkloadId Allocated unit workload id */ - UnitWorkloadId add_unit_workload(const GpuKernelSourceCode &kernel_code, const UnitWorkloadStage &stage, const MemoryDescriptorMap &mem_map, const GpuWorkloadContext *context) + UnitWorkloadId add_unit_workload(const GpuKernelSourceCode &kernel_code, + const UnitWorkloadStage &stage, + const MemoryDescriptorMap &mem_map, + const GpuWorkloadContext *context) { // Use the size of the kernel codes as Id const auto uwk_id = static_cast(_unit_workloads.size()); @@ -262,12 +262,13 @@ public: #ifndef ACL_INTERNAL_TEST_CKW_IN_DF ARM_COMPUTE_UNUSED(context); // Assemble kernel argument with memory descriptor to form workload argument - for(const auto &id_arg : kernel_code.arguments()) + for (const auto &id_arg : kernel_code.arguments()) { - const auto arg_id = id_arg.first; - const auto arg = id_arg.second; - _workload_arguments[arg_id] = GpuWorkloadArgument{ *arg.tensor_info(), mem_map.at(arg_id), *arg.kernel_argument_info() }; - if(_tensor_uwork_map.find(arg_id) == _tensor_uwork_map.end()) + const auto arg_id = id_arg.first; + const auto arg = id_arg.second; + _workload_arguments[arg_id] = + GpuWorkloadArgument{*arg.tensor_info(), mem_map.at(arg_id), *arg.kernel_argument_info()}; + if (_tensor_uwork_map.find(arg_id) == _tensor_uwork_map.end()) { _tensor_uwork_map[arg_id] = std::set(); } @@ -276,18 +277,19 @@ public: #else // ACL_INTERNAL_TEST_CKW_IN_DF GpuKernelArgumentList flat_kernel_args = kernel_code.arguments(); GpuKernelArgumentList tensor_kargs{}; - while(true) + while (true) { tensor_kargs = extract_kernel_args_for_one_tensor(flat_kernel_args); - if(tensor_kargs.empty()) + if (tensor_kargs.empty()) { break; } else { const auto tensor_id = tensor_kargs.at(0).id(); - _workload_arguments[tensor_id] = GpuWorkloadArgument{ *context->implementation().get_tensor_info(tensor_id), 
mem_map.at(tensor_id), tensor_kargs }; - if(_tensor_uwork_map.find(tensor_id) == _tensor_uwork_map.end()) + _workload_arguments[tensor_id] = GpuWorkloadArgument{ + *context->implementation().get_tensor_info(tensor_id), mem_map.at(tensor_id), tensor_kargs}; + if (_tensor_uwork_map.find(tensor_id) == _tensor_uwork_map.end()) { _tensor_uwork_map[tensor_id] = std::set(); } @@ -308,7 +310,7 @@ public: { std::vector ids{}; - for(const auto &uwk : _unit_workloads) + for (const auto &uwk : _unit_workloads) { ids.push_back(uwk.id()); } @@ -323,7 +325,7 @@ public: std::vector tensors() const { std::vector ids{}; - for(const auto &id_tensor : _workload_arguments) + for (const auto &id_tensor : _workload_arguments) { ids.push_back(id_tensor.first); } @@ -337,7 +339,7 @@ public: } private: - std::vector _unit_workloads{}; + std::vector _unit_workloads{}; std::map _workload_arguments{}; std::map> _tensor_uwork_map{}; }; diff --git a/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h index 1d8b231efdab759548a92738c3f42925c73d665c..ad474674f9e6c1157f6e5f5d356ab136fa1c0991 100644 --- a/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h +++ b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLCompileContext.h" #include "arm_compute/core/Window.h" + #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h" diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp index 4b4c22fa1d23779c0aeadad606f0f16b32914caf..c4ab110c92877952802c31c591969f95b3b3da88 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp @@ -23,6 +23,7 @@ */ #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h" + #include "ckw/Error.h" namespace arm_compute @@ -36,12 +37,12 @@ GpuCkwComponentArgument::GpuCkwComponentArgument() { } -GpuCkwComponentArgument::GpuCkwComponentArgument(ckw::TensorOperand &tensor) - : _tensor(&tensor) +GpuCkwComponentArgument::GpuCkwComponentArgument(ckw::TensorOperand &tensor) : _tensor(&tensor) { } -GpuCkwComponentArgument &GpuCkwComponentArgument::init_virtual_tensor(ckw::TileOperand &tile, const ckw::TensorTileSampler &tile_sampler) +GpuCkwComponentArgument &GpuCkwComponentArgument::init_virtual_tensor(ckw::TileOperand &tile, + const ckw::TensorTileSampler &tile_sampler) { CKW_ASSERT(_tile == nullptr); diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h index 80f91389a039e84810429c9d11ecf96508cc7401..863989a7bdceadd8af1657bfd2d10617c6a99ca8 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h @@ -110,9 +110,9 @@ public: const ckw::TensorTileSampler &tile_sampler() const; private: - ckw::TensorOperand *_tensor{ nullptr }; - ckw::TileOperand *_tile{ nullptr }; - ckw::TensorTileSampler _tile_sampler{}; + ckw::TensorOperand *_tensor{nullptr}; + ckw::TileOperand *_tile{nullptr}; + ckw::TensorTileSampler _tile_sampler{}; }; } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp index 
a24a172d773157bd0d2bbea046c8077bcc1c5f6a..c927f32bde13fea5f4ff6d0c50c516a905afbe2a 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp @@ -23,17 +23,16 @@ */ #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h" -#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" - #include "arm_compute/core/Error.h" #include "arm_compute/core/Window.h" + #include "src/common/utils/Log.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" - #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h" +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" using namespace ckw; namespace arm_compute @@ -43,11 +42,11 @@ namespace experimental namespace dynamic_fusion { GpuCkwDriver::GpuCkwDriver(const GpuKernelComponentGroup &components) - : _components{ components }, _kernel{ GpuTargetLanguage::OpenCL }, _code{} + : _components{components}, _kernel{GpuTargetLanguage::OpenCL}, _code{} { // Generate kernel name std::string name = ""; - for(auto &comp : _components) + for (auto &comp : _components) { auto ckw_driver = comp->ckw_component_driver(); ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr); @@ -60,7 +59,7 @@ GpuCkwDriver::GpuCkwDriver(const GpuKernelComponentGroup &components) GpuCkwScopedKernelWriter writer(&root_writer); GpuCkwVariableTable vtable{}; - for(auto &comp : _components) + for (auto &comp : _components) { auto ckw_driver = comp->ckw_component_driver(); ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr); @@ -82,7 +81,7 @@ std::string GpuCkwDriver::get_code() std::string GpuCkwDriver::get_config_id() { std::string id = ""; - for(auto &comp : _components) + for (auto &comp : _components) { auto ckw_driver = comp->ckw_component_driver(); ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr); @@ -101,9 +100,9 @@ Window GpuCkwDriver::get_window() const GpuKernelArgumentList GpuCkwDriver::get_kernel_arguments() { GpuKernelArgumentList args{}; - for(const auto &arg : _kernel.arguments()) + for (const auto &arg : _kernel.arguments()) { - switch(arg.type()) + switch (arg.type()) { case KernelArgument::Type::TensorStorage: { diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h index 19db575fea7870535981890cc88b50ca3c0be24f..2ca5fb435cadaec9586ef44cb5d9e84f66469dd7 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h @@ -24,12 +24,12 @@ #ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER +#include "ckw/Kernel.h" + #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h" -#include "ckw/Kernel.h" - #include #include diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp index 
ca4f121566b546184a9bb342250376e4b77751f2..5f8ce919e324436896bec7dfe3f61417b34c552f 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp @@ -23,10 +23,12 @@ */ #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h" + #include "ckw/Error.h" #include "ckw/TileInfo.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h" + namespace arm_compute { namespace experimental @@ -34,21 +36,21 @@ namespace experimental namespace dynamic_fusion { -GpuCkwKernelWriter::GpuCkwKernelWriter(ckw::Kernel &kernel) - : KernelWriter(kernel) +GpuCkwKernelWriter::GpuCkwKernelWriter(ckw::Kernel &kernel) : KernelWriter(kernel) { } void GpuCkwKernelWriter::op_load_once(GpuCkwComponentArgument *tensor_or_tile, const ckw::TensorTileSampler &sampler) { - if(!tensor_or_tile->has_tile()) + if (!tensor_or_tile->has_tile()) { CKW_ASSERT(tensor_or_tile->has_tensor()); auto &tensor = tensor_or_tile->tensor(); const auto tile_name = tensor.name() + "_tile"; - auto &tile = declare_tile(tile_name.c_str(), ckw::TileInfo(tensor.data_type(), sampler.height(), sampler.width())); + auto &tile = + declare_tile(tile_name.c_str(), ckw::TileInfo(tensor.data_type(), sampler.height(), sampler.width())); op_load(tile, tensor, sampler); diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp index 043fda9e6f13d807ae90f3e231ce36051c3d173c..cbadbd96395626cf7e73b6351e158b6825381820 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp @@ -23,6 +23,7 @@ */ #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" + #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h index 4d11b5e3e41212a357ce29f6e275c1012be4bd28..81049bfe37e3f852dfda29eb63fb4d6bf4428db0 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h @@ -63,7 +63,7 @@ public: private: GpuCkwKernelWriter *_writer; - int32_t _parent_id_space; + int32_t _parent_id_space; }; } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp index 37c27cd116c0661f7a36302583d5518fea8caef1..88a0cf7f43d70ea770fff0cb49358aa7dc0cebae 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp @@ -23,11 +23,12 @@ */ #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include 
"src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" + #include namespace arm_compute @@ -36,19 +37,22 @@ namespace experimental { namespace dynamic_fusion { -GpuCkwComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group, GpuCkwScopedKernelWriter &writer, const ITensorInfo *tensor, TensorStorageType storage, - const std::string &alias) +GpuCkwComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group, + GpuCkwScopedKernelWriter &writer, + const ITensorInfo *tensor, + TensorStorageType storage, + const std::string &alias) { ARM_COMPUTE_ERROR_ON_MSG(!tensor->has_valid_id(), "Tensor info with valid id expected"); // Do not re-declare if the variable associated with the tensor has already been declared auto it = _vars.find(tensor->id()); - if(it != _vars.end()) + if (it != _vars.end()) { return &it->second; } - if(comp_group.is_intermediate_tensor(tensor)) + if (comp_group.is_intermediate_tensor(tensor)) { // Create a virtual tensor variable GpuCkwComponentArgument var; @@ -61,7 +65,7 @@ GpuCkwComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelCo std::stringstream ss; ss << alias << "_t" << abs(tensor->id()); const auto uniq_name = ss.str(); - GpuCkwComponentArgument var{ writer->declare_tensor_argument(uniq_name, to_ckw(*tensor), to_ckw(storage)) }; + GpuCkwComponentArgument var{writer->declare_tensor_argument(uniq_name, to_ckw(*tensor), to_ckw(storage))}; auto &&inserted = _vars.emplace(tensor->id(), var); return &(inserted.first->second); } diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h index 0649dcba9df0f137bd5e335a95ba6197d7b89d00..2b118911b8ffb13e346a8529feb6cd79ab571f9c 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h @@ -25,6 +25,7 @@ #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE #include "arm_compute/core/ITensorInfo.h" + #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h" #include @@ -58,8 +59,11 @@ public: * * @return GpuCkwComponentArgument* */ - GpuCkwComponentArgument *declare_variable(const GpuKernelComponentGroup &comp_group, GpuCkwScopedKernelWriter &writer, const ITensorInfo *tensor, TensorStorageType storage, - const std::string &alias = "unnamed"); + GpuCkwComponentArgument *declare_variable(const GpuKernelComponentGroup &comp_group, + GpuCkwScopedKernelWriter &writer, + const ITensorInfo *tensor, + TensorStorageType storage, + const std::string &alias = "unnamed"); private: std::map _vars{}; diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h index 14086f785e0b79b5fb4b9d0d1b06c5ff73e5f801..52e56e2e35c05dc15156a53ece8e8a124829d309 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h @@ -25,6 +25,7 @@ #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_IGPUCKWCOMPONENTDRIVER #include "arm_compute/core/Window.h" + #include "src/dynamic_fusion/sketch/ArgumentPack.h" #include "src/dynamic_fusion/sketch/gpu/components/Types.h" @@ -73,8 +74,7 @@ public: * @param[in] id Component id * @param[in] tensors Tensor arguments to the components */ - IGpuCkwComponentDriver(ComponentId id, 
const ArgumentPack &tensors) - : _id{ id }, _tensors{ tensors } + IGpuCkwComponentDriver(ComponentId id, const ArgumentPack &tensors) : _id{id}, _tensors{tensors} { } /** Destructor */ @@ -89,7 +89,9 @@ public: * * @note @p writer can only be passed via value since the new scope is created in the copy constructor */ - virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const = 0; + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const = 0; /** Get tensor arguments */ ArgumentPack tensors() const { @@ -128,7 +130,7 @@ public: } private: - ComponentId _id{ -1 }; + ComponentId _id{-1}; ArgumentPack _tensors{}; }; } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp index c07fac0e0d997ac125ab91c232f19d307f5d5cce..c3b1b3c8bc9af902ad7c66aff3bcc411bda8c5e8 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp @@ -24,16 +24,18 @@ #include "GpuCkwActivation.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/Validate.h" #include "ckw/TensorTileSampler.h" + #include "src/core/helpers/WindowHelpers.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" + #include using namespace ckw; @@ -87,24 +89,25 @@ inline TensorTileSampler create_sampler(GpuCkwScopedKernelWriter &writer, int32_ GpuCkwActivation::GpuCkwActivation(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuCkwComponentDriver{ id, tensors }, - _src{}, - _dst{}, - _attributes{ attributes } + : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); } -void GpuCkwActivation::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwActivation::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); const unsigned int n0 = root_window.x().step(); const unsigned int m0 = root_window.y().step(); - GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); - GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + GpuCkwComponentArgument 
*src = + vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); load_src_dst_tiles_and_prepare_sampler(writer, src, dst, m0, n0, create_sampler); @@ -119,7 +122,7 @@ void GpuCkwActivation::write_component_code(const ComponentGroup &comp_group, Gp const auto &constant_B = writer->declare_tile("B_VAL", _attributes.b()); // Perform the operation. - switch(_attributes.activation()) + switch (_attributes.activation()) { case ActivationLayerInfo::ActivationFunction::LOGISTIC: { @@ -179,9 +182,10 @@ Window GpuCkwActivation::get_window() const // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged // This is in line with the collapsing convention used by operators like Conv2d output_shape.collapse(2U, 1U); - constexpr unsigned int vector_size_byte_opencl = 16; - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); - Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); + constexpr unsigned int vector_size_byte_opencl = 16; + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); + Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); return win; } diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h index e157e36cbf298455acbace56e01c1011e251d20b..386e933a7221710f676488fafca114462cd34fbb 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h @@ -46,15 +46,15 @@ public: * @param[in] tensors Tensor arguments to the component * @param[in] attributes Component attributes */ - GpuCkwActivation(ComponentId id, - const ArgumentPack &tensors, - const Attributes &attributes); + GpuCkwActivation(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes); ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwActivation); /** Destructor */ ~GpuCkwActivation() override = default; // Inherited methods overriden: - virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const override; - Window get_window() const override; + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; private: const ITensorInfo *_src; diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp index 6ecf2bac44a0a0b4f2c84d738fc395ccd10b827a..e8e5087633f402ef54ba31be680f3e42080742a1 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp @@ -24,16 +24,18 @@ #include "GpuCkwCast.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/Validate.h" #include "ckw/TensorTileSampler.h" + #include "src/core/helpers/WindowHelpers.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" -#include 
"src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" + #include using namespace ckw; @@ -84,30 +86,29 @@ inline TensorTileSampler create_sampler(GpuCkwScopedKernelWriter &writer, int32_ } } // namespace -GpuCkwCast::GpuCkwCast(ComponentId id, - const ArgumentPack &tensors, - const Attributes &attributes) - : IGpuCkwComponentDriver{ id, tensors }, - _src{}, - _dst{}, - _attributes{ attributes } +GpuCkwCast::GpuCkwCast(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) + : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); } -void GpuCkwCast::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwCast::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); const unsigned int n0 = root_window.x().step(); const unsigned int m0 = root_window.y().step(); - GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); - GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + GpuCkwComponentArgument *src = + vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); // Load the source tile and prepare the sampler. - if(!src->has_tile()) + if (!src->has_tile()) { const auto sampler = create_sampler(writer, m0, n0); writer->op_load_once(src, sampler); @@ -122,7 +123,7 @@ void GpuCkwCast::write_component_code(const ComponentGroup &comp_group, GpuCkwVa const auto &sampler = src->tile_sampler(); // Prepare the output tile. - if(!dst->has_tile()) + if (!dst->has_tile()) { // Get Target datatype and convert it to ckw::DataType. 
ckw::DataType target_dt = dynamic_fusion::to_ckw(_attributes.data_type()); @@ -143,7 +144,7 @@ void GpuCkwCast::write_component_code(const ComponentGroup &comp_group, GpuCkwVa const size_t dst_size = data_size_from_type(_dst->data_type()); const bool cast_down = (src_size >= dst_size); - if(cast_down && is_data_type_quantized(_src->data_type())) + if (cast_down && is_data_type_quantized(_src->data_type())) { const auto &constant_x80 = writer->declare_tile("0x80", 0x80); writer->op_binary_expression(src_tile, src_tile, BinaryOp::BitwiseXOR, constant_x80); @@ -151,7 +152,7 @@ void GpuCkwCast::write_component_code(const ComponentGroup &comp_group, GpuCkwVa ckw::ConvertPolicy convert_policy = ckw::ConvertPolicy::None; - if(cast_down && (is_data_type_float(_src->data_type()) || _attributes.convert_policy() == ConvertPolicy::SATURATE)) + if (cast_down && (is_data_type_float(_src->data_type()) || _attributes.convert_policy() == ConvertPolicy::SATURATE)) { convert_policy = ckw::ConvertPolicy::Saturate; } @@ -167,9 +168,10 @@ Window GpuCkwCast::get_window() const // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged // This is in line with the collapsing convention used by operators like Conv2d output_shape.collapse(2U, 1U); - constexpr unsigned int vector_size_byte_opencl = 16; - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); - Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); + constexpr unsigned int vector_size_byte_opencl = 16; + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); + Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); return win; } diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h index 821cec1e19b97e8ae05bbf2d8def591e6884f1ba..2389301196f87f889da035d45248754bab38cca6 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h @@ -46,15 +46,15 @@ public: * @param[in] tensors Tensor arguments to the component * @param[in] attributes Component attributes */ - GpuCkwCast(ComponentId id, - const ArgumentPack &tensors, - const Attributes &attributes); + GpuCkwCast(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes); ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwCast); /** Destructor */ ~GpuCkwCast() override = default; // Inherited methods overriden: - virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const override; - Window get_window() const override; + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; private: const ITensorInfo *_src; diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1e09c78f473cd6dfde353f2b4147ddee5f46744d --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Validate.h" +#include "ckw/TensorTileSampler.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" + +#include +#include + +using namespace ckw; +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +GpuCkwDepthwiseConv2d::GpuCkwDepthwiseConv2d(ComponentId id, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuCkwComponentDriver{id, tensors}, + _src{}, + _weight{}, + _bias{}, + _dst{}, + _attributes{attributes}, + _settings{settings} +{ + _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); + _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); + if (this->tensors().get_const_tensor(TensorType::ACL_SRC_2)) + { + _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2); + } + _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _weight, _bias, _dst); +} + +void GpuCkwDepthwiseConv2d::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const +{ + // Data Layout is NHWC + constexpr int32_t width_idx = 1; + constexpr int32_t height_idx = 2; + + const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); + + // Tunable parameters + // Currently only m0 and n0 = 1 are supported. 
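// To make the indexing below concrete, a worked example under the m0 = n0 = 1 assumption: for an NHWC
// destination with dimensions (C, W, H, N) = (8, 4, 3, 2), get_window() collapses H and N into the third
// dispatch dimension, so 8 x 4 x 6 work-items are launched and each one recovers its coordinates as
//
//   c    = gid_0;       // channel
//   x    = gid_1;       // output width position
//   bout = gid_2 / H;   // batch index;            e.g. gid_2 = 5, H = 3  ->  bout = 1
//   yo   = gid_2 % H;   // output height position; same example           ->  yo   = 2
//
// which is what the gid_2 division/modulo pair further down computes.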
+ const int32_t m0 = root_window.y().step(); + const int32_t n0 = root_window.x().step(); + constexpr int32_t m0_a_val = 1; + constexpr int32_t n0_a_val = 1; + constexpr int32_t m0_b_val = 1; + + GpuCkwComponentArgument *src = + vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); + GpuCkwComponentArgument *wei = + vtable.declare_variable(comp_group, writer, _weight, TensorStorageType::ClBufferUint8Ptr, "wei"); + GpuCkwComponentArgument *bia = nullptr; + + if (_bias && _bias->has_valid_id()) + { + bia = vtable.declare_variable(comp_group, writer, _bias, TensorStorageType::ClBufferUint8Ptr, "bia"); + } + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + + // Constants + const auto &const_1 = writer->declare_tile("1", 1); + const auto &wei_height = writer->declare_tile("WEI_HEIGHT", static_cast(_weight->dimension(height_idx))); + const auto &wei_width = writer->declare_tile("WEI_WIDTH", static_cast(_weight->dimension(width_idx))); + const auto &dst_height = writer->declare_tile("DST_HEIGHT", static_cast(_dst->dimension(height_idx))); + const auto &stride_x = writer->declare_tile("STRIDE_X", static_cast(_attributes.stride().x())); + const auto &stride_y = writer->declare_tile("STRIDE_Y", static_cast(_attributes.stride().y())); + const auto &pad_left = writer->declare_tile("PAD_LEFT", static_cast(_attributes.pad().left)); + const auto &pad_top = writer->declare_tile("PAD_TOP", static_cast(_attributes.pad().top)); + const auto &depth_multiplier = + writer->declare_tile("DEPTH_MULTIPLIER", static_cast(_attributes.depth_multiplier())); + auto &const_0 = writer->declare_tile("0", 0); + auto &yo = writer->declare_tile("yo", ckw::DataType::Int32); + + auto &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32); + auto &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32); + auto &gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32); + + writer->op_get_global_id(gid_0, 0); + writer->op_get_global_id(gid_1, 1); + writer->op_get_global_id(gid_2, 2); + + auto &bout = writer->declare_tile("bout", ckw::DataType::Int32); + writer->op_binary_expression(bout, gid_2, ckw::BinaryOp::Div, dst_height); // gid_2 / h + writer->op_binary_expression(yo, gid_2, ckw::BinaryOp::Mod, dst_height); // gid_2 % h + + const int32_t dst_partial_n0_v = _dst->tensor_shape()[0] % n0; + const int32_t dst_partial_m0_v = _dst->tensor_shape()[1] % m0; + auto &g_ind_0 = writer->declare_tile("g_ind_0", ckw::DataType::Int32); + auto &g_ind_1 = writer->declare_tile("g_ind_1", ckw::DataType::Int32); + get_coord(writer, g_ind_0, gid_0, n0, dst_partial_n0_v, "dst_x_", const_0); + get_coord(writer, g_ind_1, gid_1, m0, dst_partial_m0_v, "dst_y_", const_0); + + TensorTileSampler src_sampler; + src_sampler.width(m0_a_val); + src_sampler.height(n0_a_val); + src_sampler.format(TensorSamplerFormat::C_W_H); + src_sampler.address_mode_x(TensorSamplerAddressModeX::None); + src_sampler.address_mode_y(TensorSamplerAddressModeY::Skip); + src_sampler.address_mode_z(TensorSamplerAddressModeZ::Skip); + + TensorTileSampler wei_sampler; + wei_sampler.width(m0_b_val); + wei_sampler.height(n0); + wei_sampler.format(TensorSamplerFormat::C_W_H); + wei_sampler.address_mode_x(TensorSamplerAddressModeX::None); + wei_sampler.address_mode_y(TensorSamplerAddressModeY::None); + wei_sampler.address_mode_z(TensorSamplerAddressModeZ::None); + + TensorTileSampler dst_sampler; + dst_sampler.width(n0); + dst_sampler.height(m0); + 
dst_sampler.format(TensorSamplerFormat::C_W_H); + dst_sampler.address_mode_x(TensorSamplerAddressModeX::None); + dst_sampler.address_mode_y(TensorSamplerAddressModeY::None); + dst_sampler.address_mode_z(TensorSamplerAddressModeZ::Skip); + dst_sampler.x(g_ind_0); + dst_sampler.y(g_ind_1); + dst_sampler.z(yo); + dst_sampler.b(bout); + + if (!dst->has_tile()) + { + auto &dst_tile = writer->declare_tile("dst_tile", ckw::TileInfo(to_ckw(_dst->data_type()), m0, n0)); + dst->init_virtual_tensor(dst_tile, dst_sampler); + } + auto &dst_tile = dst->tile(); + + writer->op_assign(dst_tile, const_0); + + auto &xi = writer->declare_tile("xi", ckw::DataType::Int32); + writer->op_binary_expression(xi, g_ind_1, ckw::BinaryOp::Mul, stride_x); + writer->op_binary_expression(xi, xi, ckw::BinaryOp::Sub, pad_left); + + auto &yi = writer->declare_tile("yi", ckw::DataType::Int32); + writer->op_binary_expression(yi, yo, ckw::BinaryOp::Mul, stride_y); + writer->op_binary_expression(yi, yi, ckw::BinaryOp::Sub, pad_top); + + auto &a_x = writer->declare_tile("a_x", ckw::DataType::Int32); + writer->op_binary_expression(a_x, g_ind_0, BinaryOp::Div, depth_multiplier); + + // src_tile + auto &a = writer->declare_tile("a", ckw::TileInfo(to_ckw(_src->data_type()), m0_a_val, n0_a_val)); + // wei_tile + auto &b = writer->declare_tile("b", ckw::TileInfo(to_ckw(_weight->data_type()), m0_b_val, n0)); + + // Loop variables + auto &yk = writer->declare_tile("yk", ckw::DataType::Int32); + auto &xk = writer->declare_tile("xk", ckw::DataType::Int32); + + // Because 1x1 blocks are being used here, scalar values are being loaded from memory instead of using tiles, since tile vector access currently is not available. Hence the values are loaded in the inner loop. + // This loop will be reworked. 
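// Schematically, the two nested op_for_loop calls below make the generated kernel accumulate, per output
// point (every load is a scalar because the blocks are 1x1), something along these lines:
//
//   acc = 0;
//   for (yk = 0; yk < WEI_HEIGHT; ++yk)
//       for (xk = 0; xk < WEI_WIDTH; ++xk)
//       {
//           a    = src value at (batch = bout, height = yi + yk, width = xi + xk, channel = g_ind_0 / DEPTH_MULTIPLIER);
//           b    = wei value at (channel = g_ind_0, position = yk * WEI_WIDTH + xk);
//           acc += a * b;
//       }
//
// acc corresponds to dst_tile; the bias, when present, is added to it after both loops.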
+ writer->op_assign(yk, const_0); + writer->op_for_loop(yk, BinaryOp::Less, wei_height, yk, AssignmentOp::Increment, const_1, + [&]() + { + // xk = 0 + writer->op_assign(xk, const_0); + writer->op_for_loop( + xk, BinaryOp::Less, wei_width, xk, AssignmentOp::Increment, const_1, + [&]() + { + writer->op_assign(b, const_0); + writer->op_assign(a, const_0); + + // src_tile loading + auto &xi_curr = writer->declare_tile("xi_curr", ckw::DataType::Int32); + writer->op_binary_expression(xi_curr, xi, BinaryOp::Add, xk); + auto &a_y = writer->declare_tile("a_y", ckw::DataType::Int32); + writer->op_binary_expression(a_y, yi, BinaryOp::Add, yk); + src_sampler.x(a_x); + src_sampler.y(xi_curr); + src_sampler.z(a_y); + src_sampler.b(bout); + writer->op_load(a, src->tensor(), src_sampler); + + // wei_tile loading + auto &b_y = writer->declare_tile("b_y", ckw::DataType::Int32); + writer->op_binary_expression(b_y, wei_width, BinaryOp::Mul, yk); + writer->op_binary_expression(b_y, b_y, BinaryOp::Add, xk); + wei_sampler.x(g_ind_0); + wei_sampler.y(b_y); + wei_sampler.z(const_0); + wei_sampler.b(const_0); + writer->op_load(b, wei->tensor(), wei_sampler); + + // Do the accumulation + auto &mul_result = writer->declare_tile("mul_results", a.data_type()); + writer->op_binary_expression(mul_result, a, BinaryOp::Mul, b); + writer->op_binary_expression(dst_tile, dst_tile, BinaryOp::Add, mul_result); + }); + }); + + // Add Bias + if (_bias && _bias->has_valid_id()) + { + TensorTileSampler bias_sampler; + bias_sampler.width(n0); + bias_sampler.height(1); + bias_sampler.format(TensorSamplerFormat::C_W_H); + bias_sampler.address_mode_x(TensorSamplerAddressModeX::None); + bias_sampler.address_mode_y(TensorSamplerAddressModeY::None); + bias_sampler.address_mode_z(TensorSamplerAddressModeZ::None); + bias_sampler.x(g_ind_0); + bias_sampler.y(const_0); + bias_sampler.z(const_0); + bias_sampler.b(const_0); + + auto &bias_tile = writer->declare_tile("bias_tile", ckw::TileInfo(to_ckw(_bias->data_type()), 1, n0)); + writer->op_load(bias_tile, bia->tensor(), bias_sampler); + writer->op_binary_expression(dst_tile, dst_tile, BinaryOp::Add, bias_tile); + } +} + +Window GpuCkwDepthwiseConv2d::get_window() const +{ + ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); + TensorShape output_shape = _dst->tensor_shape(); + // Currently only m0 and n0 = 1 are supported. + Window win = calculate_max_window(output_shape, Steps(1U, 1U)); + return win.collapse(win, Window::DimZ); +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h new file mode 100644 index 0000000000000000000000000000000000000000..f9bcaabaec365ec823ebd4de916297c1c28b41ed --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDEPTHWISECONV2D_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDEPTHWISECONV2D_H + +#include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h" + +#include "src/core/common/Macros.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h" +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ + +class GpuCkwDepthwiseConv2d : public IGpuCkwComponentDriver +{ +public: + using Attributes = ClComponentDepthwiseConv2d::Attributes; + using Settings = ClComponentDepthwiseConv2d::Settings; + + /** Constructor + * + * For supported configurations please refer to @ref ClComponentDepthwiseConv2d::validate() + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the component + * @param[in] attributes Component attributes + * @param[in] settings Component settings + */ + GpuCkwDepthwiseConv2d(ComponentId id, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwDepthwiseConv2d); + /** Destructor */ + ~GpuCkwDepthwiseConv2d() override = default; + // Inherited methods overriden: + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; + +private: + const ITensorInfo *_src; + const ITensorInfo *_weight; + const ITensorInfo *_bias; + const ITensorInfo *_dst; + Attributes _attributes; + Settings _settings; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDEPTHWISECONV2D_H diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp index 3c906646a67d8451ad0f13b116b3e655322d8573..7833da23343a5d81272bb7a1d4e17989902c2070 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp @@ -25,21 +25,20 @@ #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h" 
#include "arm_compute/core/Error.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" - +#include "arm_compute/core/Validate.h" #include "ckw/TensorTileSampler.h" #include "ckw/TileInfo.h" #include "src/core/helpers/WindowHelpers.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" namespace arm_compute { @@ -54,13 +53,7 @@ GpuCkwDirectConv2d::GpuCkwDirectConv2d(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes, const Settings &settings) - : IGpuCkwComponentDriver{ id, tensors }, - _src{}, - _wei{}, - _bia{}, - _dst{}, - _attributes{ attributes }, - _settings{ settings } + : IGpuCkwComponentDriver{id, tensors}, _src{}, _wei{}, _bia{}, _dst{}, _attributes{attributes}, _settings{settings} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _wei = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); @@ -69,7 +62,9 @@ GpuCkwDirectConv2d::GpuCkwDirectConv2d(ComponentId id, ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _wei, _dst); // Bias can be null } -void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { const auto desc = _settings.direct_conv_descriptor(); ARM_COMPUTE_ERROR_ON_MSG(desc.export_input_to_cl_image || desc.export_output_to_cl_image, @@ -99,15 +94,18 @@ void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, // extra loop to compute the left-over elements. const bool use_cl_image_for_weights = desc.export_weights_to_cl_image && (k0 == 4) && (K % 4 == 0); - GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); + GpuCkwComponentArgument *src = + vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); GpuCkwComponentArgument *wei = vtable.declare_variable( - comp_group, writer, _wei, use_cl_image_for_weights ? TensorStorageType::ClImage2dReadOnly : TensorStorageType::ClBufferUint8Ptr, "wei"); - GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + comp_group, writer, _wei, + use_cl_image_for_weights ? 
TensorStorageType::ClImage2dReadOnly : TensorStorageType::ClBufferUint8Ptr, "wei"); + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); GpuCkwComponentArgument *bia = nullptr; const bool using_bias = _bia != nullptr; - if(using_bias) + if (using_bias) { bia = vtable.declare_variable(comp_group, writer, _bia, TensorStorageType::ClBufferUint8Ptr, "bia"); } @@ -154,7 +152,8 @@ void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, src_sampler.address_mode_x(TensorSamplerAddressModeX::None); // We cannot have out-of-bounds reads when the kernel height is equal to 1. Otherwise, we need to ensure the // indirection buffer mi does not contain negative values representing out-of-bounds reads. - src_sampler.address_mode_y(kernel_height == 1 ? TensorSamplerAddressModeY::None : TensorSamplerAddressModeY::SkipMinEdgeOnly); + src_sampler.address_mode_y(kernel_height == 1 ? TensorSamplerAddressModeY::None + : TensorSamplerAddressModeY::SkipMinEdgeOnly); src_sampler.address_mode_z(TensorSamplerAddressModeZ::None); TensorTileSampler wei_sampler; @@ -178,7 +177,7 @@ void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, dst_sampler.z(tile_0); dst_sampler.b(tile_bout); - if(!dst->has_tile()) + if (!dst->has_tile()) { auto &tile = writer->declare_tile("dst", TileInfo(to_ckw(_dst->data_type()), m0, n0)); dst->init_virtual_tensor(tile, dst_sampler); @@ -189,10 +188,10 @@ void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, // We create a 2d container of size (M0, 1) to store the indices for iteration TileContainer it; - for(int m = 0; m < m0; ++m) + for (int m = 0; m < m0; ++m) { - std::vector idx { std::to_string(m) }; - it.push_back({ idx }); + std::vector idx{std::to_string(m)}; + it.push_back({idx}); } const auto &tile_it = writer->declare_tile("it", it, ckw::DataType::Int32); @@ -289,9 +288,9 @@ void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, // Bias addition // NOTE: This operation will be removed from this kernel as the interface is standardized. The intended way of // performing bias addition is to fuse this convolution kernel with a following elementwise addition kernel. 
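// For reference, the bias path below loads the bias through the destination sampler and folds it into the
// accumulator; per output block the effect is roughly
//
//   dst_tile[m][n] += bias[cout + n];   // "cout" is an illustrative name for the first output channel of
//                                       // the block: one bias value per output channel, broadcast over rows
//
// which is also why the note above says this belongs in a separate, fused elementwise-addition component
// rather than in the convolution writer itself.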
- if(using_bias) + if (using_bias) { - if(!bia->has_tile()) + if (!bia->has_tile()) { // Reuse the destination sampler for the bias writer->op_load_once(bia, dst_sampler); diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp index c8bf999261c26577a0615585e28d32e83b677491..2935ba45ea93f0c0895057343ac60f2e728d6970 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp @@ -24,22 +24,24 @@ #include "GpuCkwElementwiseBinary.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" #include "ckw/TensorTileSampler.h" #include "ckw/types/TensorSamplerTypes.h" + #include "src/core/helpers/WindowHelpers.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h" #include "src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "support/StringSupport.h" + #include #include @@ -53,11 +55,7 @@ namespace dynamic_fusion GpuCkwElementwiseBinary::GpuCkwElementwiseBinary(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuCkwComponentDriver{ id, tensors }, - _lhs{}, - _rhs{}, - _dst{}, - _attributes{ attributes } + : IGpuCkwComponentDriver{id, tensors}, _lhs{}, _rhs{}, _dst{}, _attributes{attributes} { _lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _rhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); @@ -65,15 +63,20 @@ GpuCkwElementwiseBinary::GpuCkwElementwiseBinary(ComponentId ARM_COMPUTE_ERROR_ON_NULLPTR(_lhs, _rhs, _dst); } -void GpuCkwElementwiseBinary::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwElementwiseBinary::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); const auto n0 = static_cast(root_window.x().step()); const auto m0 = static_cast(root_window.y().step()); - GpuCkwComponentArgument *lhs = vtable.declare_variable(comp_group, writer, _lhs, TensorStorageType::ClBufferUint8Ptr, "lhs"); - 
GpuCkwComponentArgument *rhs = vtable.declare_variable(comp_group, writer, _rhs, TensorStorageType::ClBufferUint8Ptr, "rhs"); - GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + GpuCkwComponentArgument *lhs = + vtable.declare_variable(comp_group, writer, _lhs, TensorStorageType::ClBufferUint8Ptr, "lhs"); + GpuCkwComponentArgument *rhs = + vtable.declare_variable(comp_group, writer, _rhs, TensorStorageType::ClBufferUint8Ptr, "rhs"); + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); auto &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32); auto &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32); @@ -86,32 +89,36 @@ void GpuCkwElementwiseBinary::write_component_code(const ComponentGroup &comp_gr auto &const_0 = writer->declare_tile("0", 0); // Load the LHS and RHS tiles - if(!lhs->has_tile()) + if (!lhs->has_tile()) { - auto sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _lhs->dimension(0), _lhs->dimension(1), n0, m0, "lhs_", const_0); + auto sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _lhs->dimension(0), _lhs->dimension(1), + n0, m0, "lhs_", const_0); sampler.format(TensorSamplerFormat::C_WH_1); // 3rd dimension collapsed with 2nd dimension sampler.z(const_0); sampler.b(gid_2); writer->op_load_once(lhs, sampler); } - if(!rhs->has_tile()) + if (!rhs->has_tile()) { - auto sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _rhs->dimension(0), _rhs->dimension(1), n0, m0, "rhs_", const_0); + auto sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _rhs->dimension(0), _rhs->dimension(1), + n0, m0, "rhs_", const_0); sampler.format(TensorSamplerFormat::C_WH_1); // 3rd dimension collapsed with 2nd dimension sampler.z(const_0); sampler.b(gid_2); writer->op_load_once(rhs, sampler); } - auto dst_sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _dst->dimension(0), _dst->dimension(1), n0, m0, "dst_", const_0); + auto dst_sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _dst->dimension(0), _dst->dimension(1), + n0, m0, "dst_", const_0); dst_sampler.format(TensorSamplerFormat::C_WH_1); // 3rd dimension collapsed with 2nd dimension dst_sampler.z(const_0); dst_sampler.b(gid_2); // Prepare the output tile. 
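// Illustrative sketch (not taken from this patch): one common way boundary-aware block coordinates of
// the kind produced by create_boundary_aware_2d_sampler are derived. When the dimension is not a
// multiple of the block size n0, the trailing blocks are shifted back so the last block still lies fully
// inside the tensor and simply overlaps its neighbour (OverlappingMin-style addressing). Plain C++ model:
#include <algorithm>
// Start coordinate of block 'gid' along a dimension processed n0 elements at a time,
// where partial_n0 == dim % n0.
int boundary_aware_coord(int gid, int n0, int partial_n0)
{
    const int shift = (partial_n0 == 0) ? 0 : (n0 - partial_n0);
    return std::max(gid * n0 - shift, 0);
}
// Example: dim = 10, n0 = 4 -> partial_n0 = 2; blocks start at 0, 2 and 6 and cover
// [0,4), [2,6) and [6,10), so no access falls outside the tensor.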
- if(!dst->has_tile()) + if (!dst->has_tile()) { - auto &tile = writer->declare_tile("dst_tile", ckw::TileInfo(to_ckw(_dst->data_type()), dst_sampler.height(), dst_sampler.width())); + auto &tile = writer->declare_tile( + "dst_tile", ckw::TileInfo(to_ckw(_dst->data_type()), dst_sampler.height(), dst_sampler.width())); dst->init_virtual_tensor(tile, dst_sampler); } @@ -131,9 +138,10 @@ Window GpuCkwElementwiseBinary::get_window() const // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged // This is in line with the collapsing convention used by operators like Conv2d output_shape.collapse(2U, 1U); - constexpr unsigned int vector_size_byte_opencl = 16; - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); - Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); + constexpr unsigned int vector_size_byte_opencl = 16; + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); + Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); return win; } @@ -141,11 +149,12 @@ Window GpuCkwElementwiseBinary::get_window() const std::string GpuCkwElementwiseBinary::get_name(const ComponentGroup &comp_group) const { ARM_COMPUTE_UNUSED(comp_group); - const std::vector build_params = - { + const std::vector build_params = { "elementwise_binary", - "op", to_string(_attributes.operation()), - "dt", lower_string(string_from_data_type(_dst->data_type())), + "op", + to_string(_attributes.operation()), + "dt", + lower_string(string_from_data_type(_dst->data_type())), }; return join(build_params, "_"); } @@ -154,13 +163,16 @@ std::string GpuCkwElementwiseBinary::get_tuner_id(const ComponentGroup &comp_gro { ARM_COMPUTE_UNUSED(comp_group); /// NOTE: Hardcoded for now, the parameters should ideally be exported by ckw (a selection of constant tiles) - std::vector build_params = - { + std::vector build_params = { "elementwise_binary", - "op", to_string(_attributes.operation()), - "dt", lower_string(string_from_data_type(_dst->data_type())), - "dst_dim0", support::cpp11::to_string(_dst->dimension(0)), - "dst_dim1", support::cpp11::to_string(_dst->dimension(1)), + "op", + to_string(_attributes.operation()), + "dt", + lower_string(string_from_data_type(_dst->data_type())), + "dst_dim0", + support::cpp11::to_string(_dst->dimension(0)), + "dst_dim1", + support::cpp11::to_string(_dst->dimension(1)), }; return join(build_params, "_"); } diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h index e9c41530f8703c4b9fd7e2c364ac519e9a695935..1a20d4c5335903cca299fe0f105f6cd33bd6f21d 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h @@ -46,17 +46,17 @@ public: * @param[in] tensors Tensor arguments to the component * @param[in] attributes Component attributes */ - GpuCkwElementwiseBinary(ComponentId id, - const ArgumentPack &tensors, - const Attributes &attributes); + GpuCkwElementwiseBinary(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes); ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwElementwiseBinary); /** Destructor */ ~GpuCkwElementwiseBinary() override = default; // Inherited methods overriden: - virtual void 
write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const override; - Window get_window() const override; - std::string get_name(const ComponentGroup &comp_group) const override; - std::string get_tuner_id(const ComponentGroup &comp_group) const override; + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; + std::string get_name(const ComponentGroup &comp_group) const override; + std::string get_tuner_id(const ComponentGroup &comp_group) const override; private: const ITensorInfo *_lhs; diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9beba0359806118585af673fb7198764e5f76a5d --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h" + +#include "arm_compute/core/utils/helpers/AdjustVecSize.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "support/StringSupport.h" + +using namespace ckw; +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ + +GpuCkwMatMul::GpuCkwMatMul(ComponentId id, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuCkwComponentDriver{id, tensors}, _lhs{}, _rhs{}, _dst{}, _attributes{attributes}, _settings{settings} +{ + _lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); + _rhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); + _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(_lhs, _rhs, _dst); +} + +void GpuCkwMatMul::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const +{ + const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); + + GpuCkwComponentArgument *lhs = + vtable.declare_variable(comp_group, writer, _lhs, TensorStorageType::ClBufferUint8Ptr, "lhs"); + GpuCkwComponentArgument *rhs = + vtable.declare_variable(comp_group, writer, _rhs, TensorStorageType::ClBufferUint8Ptr, "rhs"); + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + + // Constants + const int height_idx = get_data_layout_dimension_index(_lhs->data_layout(), DataLayoutDimension::HEIGHT); + const auto &rhs_h = writer->declare_tile("rhs_h", static_cast(_rhs->dimension(height_idx))); + const int m = static_cast(_dst->dimension(1)); + const int n = static_cast(_dst->dimension(0)); + const int k = + _attributes.adj_lhs() ? 
static_cast(_lhs->tensor_shape().y()) : static_cast(_lhs->tensor_shape().x()); + const int m0 = root_window.y().step(); + const int n0 = root_window.x().step(); + const int k0 = _settings.k0(); + const int partial_store_m0 = m % m0; + const int partial_store_n0 = n % n0; + + const auto &const_1 = writer->declare_tile("1", 1); + auto &const_0 = writer->declare_tile("0", 0); + auto &k0_tile = writer->declare_tile("k0", k0); + auto &k_tile = writer->declare_tile("k", k); + + auto &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32); + auto &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32); + auto &gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32); + + writer->op_get_global_id(gid_0, 0); + writer->op_get_global_id(gid_1, 1); + writer->op_get_global_id(gid_2, 2); + + auto &x = writer->declare_tile("x", ckw::DataType::Int32); + auto &y = writer->declare_tile("y", ckw::DataType::Int32); + auto &z = writer->declare_tile("z", ckw::DataType::Int32); + + get_coord(writer, x, gid_0, n0, partial_store_n0, "gid_x_", const_0); + get_coord(writer, y, gid_1, m0, partial_store_m0, "gid_y_", const_0); + get_coord(writer, z, gid_2, 1, 0, "gid_z_", const_0); + + TensorTileSampler lhs_sampler; + lhs_sampler.height(m0); + lhs_sampler.width(k0); + lhs_sampler.format(TensorSamplerFormat::C_W_H); + lhs_sampler.address_mode_x(TensorSamplerAddressModeX::None); + lhs_sampler.address_mode_y(TensorSamplerAddressModeY::None); + lhs_sampler.address_mode_z(TensorSamplerAddressModeZ::None); + + TensorTileSampler rhs_sampler; + rhs_sampler.height(k0); + rhs_sampler.width(n0); + rhs_sampler.format(TensorSamplerFormat::C_WH_1); + rhs_sampler.address_mode_x(TensorSamplerAddressModeX::None); + rhs_sampler.address_mode_y(TensorSamplerAddressModeY::None); + rhs_sampler.address_mode_z(TensorSamplerAddressModeZ::None); + + TensorTileSampler dst_sampler; + dst_sampler.width(n0); + dst_sampler.height(m0); + dst_sampler.format(TensorSamplerFormat::C_W_H); + dst_sampler.address_mode_x(TensorSamplerAddressModeX::OverlappingMin); + dst_sampler.address_mode_y(TensorSamplerAddressModeY::None); + dst_sampler.address_mode_z(TensorSamplerAddressModeZ::None); + dst_sampler.x(x); + dst_sampler.y(y); + dst_sampler.z(z); + dst_sampler.b(const_0); + + if (!dst->has_tile()) + { + auto &dst_tile = writer->declare_tile("dst_tile", ckw::TileInfo(to_ckw(_dst->data_type()), m0, n0)); + dst->init_virtual_tensor(dst_tile, dst_sampler); + } + auto &dst_tile = dst->tile(); + + // Initialize the accumulators + writer->op_assign(dst_tile, const_0); + + auto &rhs_z = writer->declare_tile("rhs_z", ckw::DataType::Int32); + writer->op_binary_expression(rhs_z, z, BinaryOp::Mul, rhs_h); + + auto &k_i = writer->declare_tile("k_i", ckw::DataType::Int32); + auto &k_limit = writer->declare_tile("k_limit", k - k0); + + auto &x_i = writer->declare_tile("x_i", ckw::DataType::Int32); + writer->op_assign(x_i, const_0); + + writer->op_assign(k_i, const_0); + + // *INDENT-OFF* + // clang-format off + writer->op_for_loop(k_i, BinaryOp::LessEqual, k_limit, k_i, AssignmentOp::Increment, k0_tile, + [&]() + { + //Initialize tiles + // lhs_tile + auto &a = writer->declare_tile("a", ckw::TileInfo(to_ckw(_lhs->data_type()), m0, k0)); + // rhs_tile + auto &b = writer->declare_tile("b", ckw::TileInfo(to_ckw(_rhs->data_type()), n0, k0)); + writer->op_assign(a, const_0); + writer->op_assign(b, const_0); + + // Loading the tiles + // LHS + lhs_sampler.x(x_i); + lhs_sampler.y(y); + lhs_sampler.z(z); + lhs_sampler.b(const_0); + writer->op_load(a, lhs->tensor(), 
lhs_sampler); + + // RHS + auto &y_i = writer->declare_tile("y_i", ckw::DataType::Int32); + writer->op_binary_expression(y_i, x, BinaryOp::Add, rhs_z); + rhs_sampler.x(k_i); + rhs_sampler.y(y_i); + rhs_sampler.z(const_0); + rhs_sampler.b(const_0); + writer->op_load(b, rhs->tensor(), rhs_sampler); + + // Perform Matmul + writer->op_binary_expression(dst_tile, a, BinaryOp::MatMul_Nt_T, b); + writer->op_binary_expression(x_i, x_i, BinaryOp::Add, k0_tile); + }); +// *INDENT-ON* + // clang-format on + + // Handling leftovers + if (k % k0 != 0) + { + // *INDENT-OFF* + // clang-format off + writer->op_for_loop(k_i, BinaryOp::Less, k_tile, k_i, AssignmentOp::Increment, const_1, + [&]() + { + //Initialize tiles + // lhs_tile + auto &a = + writer->declare_tile("a_leftover", ckw::TileInfo(to_ckw(_lhs->data_type()), m0, 1)); + // rhs_tile + auto &b = + writer->declare_tile("b_leftover", ckw::TileInfo(to_ckw(_rhs->data_type()), n0, 1)); + writer->op_assign(a, const_0); + writer->op_assign(b, const_0); + + // Loading the tiles + // LHS + lhs_sampler.x(x_i); + lhs_sampler.y(y); + lhs_sampler.z(z); + lhs_sampler.b(const_0); + writer->op_load(a, lhs->tensor(), lhs_sampler); + + // RHS + auto &y_i = writer->declare_tile("y_i_leftover", ckw::DataType::Int32); + writer->op_binary_expression(y_i, x, BinaryOp::Add, rhs_z); + rhs_sampler.x(k_i); + rhs_sampler.y(y_i); + rhs_sampler.z(const_0); + rhs_sampler.b(const_0); + writer->op_load(b, rhs->tensor(), rhs_sampler); + + // Perform Matmul + writer->op_binary_expression(dst_tile, a, BinaryOp::MatMul_Nt_T, b); + writer->op_binary_expression(x_i, x_i, BinaryOp::Add, const_1); + }); +// *INDENT-ON* + // clang-format on + } +} + +Window GpuCkwMatMul::get_window() const +{ + ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); + + const int m = _dst->dimension(1); + const int n = _dst->dimension(0); + const bool adj_lhs = _attributes.adj_lhs(); + + int m0 = adj_lhs ? adjust_vec_size(_settings.m0(), m) : std::min(_settings.m0(), m); + int n0 = adjust_vec_size(_settings.n0(), n); + + // Configure kernel window + Window win = calculate_max_window(_dst->tensor_shape(), Steps(n0, m0)); + win = win.collapse(win, Window::DimZ); + + return win; +} + +std::string GpuCkwMatMul::get_name(const ComponentGroup &comp_group) const +{ + ARM_COMPUTE_UNUSED(comp_group); + + std::string kernel_name("mat_mul_native"); + + const int m = _dst->dimension(1); + const int n = _dst->dimension(0); + const int k = _attributes.adj_lhs() ? _lhs->tensor_shape().y() : _lhs->tensor_shape().x(); + + kernel_name += _attributes.adj_lhs() ? "_t" : "_nt"; + kernel_name += _attributes.adj_rhs() ? 
"_t" : "_nt"; + kernel_name += "_"; + kernel_name += support::cpp11::to_string(m); + kernel_name += "_"; + kernel_name += support::cpp11::to_string(n); + kernel_name += "_"; + kernel_name += support::cpp11::to_string(k); + kernel_name += "_"; + kernel_name += support::cpp11::to_string(_dst->dimension(2)); + kernel_name += "_"; + kernel_name += support::cpp11::to_string(_settings.m0()); + kernel_name += "_"; + kernel_name += support::cpp11::to_string(_settings.n0()); + kernel_name += "_"; + kernel_name += support::cpp11::to_string(_settings.k0()); + + return kernel_name; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h new file mode 100644 index 0000000000000000000000000000000000000000..ae2ea09f05da067db83888c8fa9cc63bc126c5d9 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWMATMUL_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWMATMUL_H + +#include "arm_compute/dynamic_fusion/sketch/attributes/MatMulAttributes.h" + +#include "src/core/common/Macros.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class GpuCkwMatMul final : public IGpuCkwComponentDriver +{ +public: + using Attributes = ClComponentMatMul::Attributes; + using Settings = ClComponentMatMul::Settings; + +public: + /** Constructor + * + * For supported configurations please refer to @ref ClComponentMatMul::validate() + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the component + * @param[in] attributes Component attributes. Attributes are a set of parameters that define what a component does + * @param[in] settings Component settings. 
Settings are a set of parameters that influence the implementation of a component + */ + GpuCkwMatMul(ComponentId id, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); + + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwMatMul); + + /** Destructor */ + ~GpuCkwMatMul() override = default; + + // Inherited methods overriden + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; + std::string get_name(const ComponentGroup &comp_group) const override; + +private: + const ITensorInfo *_lhs; + const ITensorInfo *_rhs; + const ITensorInfo *_dst; + + const Attributes _attributes; + const Settings _settings; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWMATMUL_H diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8ab3ec3a55611f70978b61ed065e2ef255e1c3d5 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp @@ -0,0 +1,354 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/Validate.h" +#include "ckw/TensorTileSampler.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" + +using namespace ckw; + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +GpuCkwPool2d::GpuCkwPool2d(ComponentId id, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes}, _settings{settings} + +{ + _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); + _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); +} + +void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const +{ + const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); + const unsigned int n0 = root_window.x().step(); + const unsigned int m0 = root_window.y().step(); + + GpuCkwComponentArgument *src = + vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + + TileOperand &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32); + TileOperand &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32); + TileOperand &gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32); + + writer->op_get_global_id(gid_0, 0); + writer->op_get_global_id(gid_1, 1); + writer->op_get_global_id(gid_2, 2); + + // Data Layout is NHWC + constexpr int width_idx = 1; + constexpr int height_idx = 2; + + const int32_t pool_size_x = static_cast(_attributes.pool_size().x()); + const int32_t pool_size_y = static_cast(_attributes.pool_size().y()); + const int32_t pad_x = static_cast(_attributes.pad().left); + const int32_t pad_y = static_cast(_attributes.pad().top); + const int32_t src_width = static_cast(_src->dimension(width_idx)); + const int32_t src_height = static_cast(_src->dimension(height_idx)); + const auto src_data_type = _src->data_type(); + + // Check if this is global pooling path + const bool is_global_pooling = + (pool_size_x == src_width) && (pool_size_y == src_height) && (pad_x == 0) && (pad_y == 0); + // Check if this a case of FP_MIXED_PRECISION + const bool use_fp_mixed_precision = + (src_data_type == DataType::F16) && _settings.mixed_precision() && _attributes.pool_type() != PoolingType::MAX; + const auto acc_data_type = (use_fp_mixed_precision) ? 
(DataType::F32) : (src_data_type); + + TileOperand &const_0 = writer->declare_tile("0", 0); + const TileOperand &const_1 = writer->declare_tile("1", 1); + const TileOperand &const_lowest_value = writer->declare_tile("LOWEST_VALUE", std::numeric_limits::lowest()); + const TileOperand &pool_size_x_tile = writer->declare_tile("POOL_SIZE_X", pool_size_x); + const TileOperand &pool_size_y_tile = writer->declare_tile("POOL_SIZE_Y", pool_size_y); + const TileOperand &stride_x_tile = writer->declare_tile("STRIDE_X", static_cast(_attributes.stride().x())); + const TileOperand &stride_y_tile = writer->declare_tile("STRIDE_Y", static_cast(_attributes.stride().y())); + const TileOperand &pad_x_tile = writer->declare_tile("PAD_X", pad_x); + const TileOperand &pad_y_tile = writer->declare_tile("PAD_Y", pad_y); + const TileOperand &dst_height_tile = + writer->declare_tile("DST_HEIGHT", static_cast(_dst->dimension(height_idx))); + const TileOperand &src_height_tile = writer->declare_tile("SRC_HEIGHT", src_height); + const TileOperand &src_width_tile = writer->declare_tile("SRC_WIDTH", src_width); + + TileOperand &idx_out_n = writer->declare_tile("idx_out_n", ckw::DataType::Int32); + TileOperand &idx_out_h = writer->declare_tile("idx_out_h", ckw::DataType::Int32); + TileOperand &idx_out_w = writer->declare_tile("idx_out_w", ckw::DataType::Int32); + TileOperand &idx_out_c = writer->declare_tile("idx_out_c", ckw::DataType::Int32); + + const int32_t dst_partial_n0_v = _dst->tensor_shape()[0] % n0; + + get_coord(writer, idx_out_c, gid_0, n0, dst_partial_n0_v, "dst_x_", const_0); + get_coord(writer, idx_out_w, gid_1, 1, 0, "dst_y_", const_0); + + writer->op_binary_expression(idx_out_h, gid_2, BinaryOp::Mod, dst_height_tile); // gid_2 % h + writer->op_binary_expression(idx_out_n, gid_2, BinaryOp::Div, dst_height_tile); // gid_2 / h + + TensorTileSampler src_sampler; + src_sampler.width(n0); + src_sampler.height(m0); + src_sampler.format(TensorSamplerFormat::C_W_H); + src_sampler.address_mode_x(TensorSamplerAddressModeX::None); + src_sampler.address_mode_y(TensorSamplerAddressModeY::None); + src_sampler.address_mode_z(TensorSamplerAddressModeZ::None); + src_sampler.x(idx_out_c); + src_sampler.b(idx_out_n); + + TensorTileSampler dst_sampler; + dst_sampler.width(n0); + dst_sampler.height(m0); + dst_sampler.format(TensorSamplerFormat::C_W_H); + dst_sampler.address_mode_x(TensorSamplerAddressModeX::OverlappingMin); + dst_sampler.address_mode_y(TensorSamplerAddressModeY::None); + dst_sampler.address_mode_z(TensorSamplerAddressModeZ::None); + dst_sampler.x(idx_out_c); + dst_sampler.y(idx_out_w); + dst_sampler.z(idx_out_h); + dst_sampler.b(idx_out_n); + + // Prepare dst tensor and tile + TileInfo dst_tile_info = TileInfo(to_ckw(src_data_type), m0, n0); + if (!dst->has_tile()) + { + TileOperand &dst_tile = writer->declare_tile("dst_tile", dst_tile_info); + dst->init_virtual_tensor(dst_tile, dst_sampler); + } + const TileOperand &dst_tile = dst->tile(); + + // A tile used to temporarily store results or as an accumulator in case of AVG and L2 pooling. 
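// Illustrative sketch (not taken from this patch): scalar model of the window clamping and
// filter-size logic emitted just below. The start/end offsets are clamped to the part of the
// pooling window that overlaps the input, and with exclude_padding only that overlap divides
// the accumulator.
#include <algorithm>
struct PoolWindow
{
    int x_start, x_end, y_start, y_end, filter_size;
};
PoolWindow clamp_pool_window(int idx_out_w, int idx_out_h, int stride_x, int stride_y,
                             int pad_x, int pad_y, int pool_size_x, int pool_size_y,
                             int src_w, int src_h, bool exclude_padding)
{
    const int idx_in_w = idx_out_w * stride_x - pad_x; // top-left input coordinate of the window
    const int idx_in_h = idx_out_h * stride_y - pad_y;
    PoolWindow w{};
    w.x_start = std::max(0, -idx_in_w);
    w.x_end   = std::min(pool_size_x, src_w - idx_in_w);
    w.y_start = std::max(0, -idx_in_h);
    w.y_end   = std::min(pool_size_y, src_h - idx_in_h);
    w.filter_size = exclude_padding ? (w.y_end - w.y_start) * (w.x_end - w.x_start)
                                    : pool_size_x * pool_size_y;
    return w;
}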
+ const TileOperand &res_tile = writer->declare_tile("res_tile", TileInfo(to_ckw(acc_data_type), m0, n0)); + + // Initialise result tile with appropriate value + if (_attributes.pool_type() == PoolingType::MAX) + { + if (_settings.use_inf_as_limit()) + { + TileContainer minus_inf_tile_container; + std::vector value = std::vector(n0, "(-INFINITY)"); + minus_inf_tile_container.push_back({value}); + const TileOperand &minus_inf = + writer->declare_tile("minus_inf_const", minus_inf_tile_container, to_ckw(acc_data_type)); + writer->op_assign(res_tile, minus_inf); + } + else + { + writer->op_assign(res_tile, const_lowest_value); + } + } + else + { + writer->op_assign(res_tile, const_0); + } + + // idx_in_w = idx_out_w * STRIDE_X - PAD_X + TileOperand &idx_in_w = writer->declare_tile("idx_in_w", ckw::DataType::Int32); + writer->op_binary_expression(idx_in_w, idx_out_w, BinaryOp::Mul, stride_x_tile); + writer->op_binary_expression(idx_in_w, idx_in_w, BinaryOp::Sub, pad_x_tile); + + // idx_in_h = idx_out_h * STRIDE_Y - PAD_Y + TileOperand &idx_in_h = writer->declare_tile("idx_in_h", ckw::DataType::Int32); + writer->op_binary_expression(idx_in_h, idx_out_h, BinaryOp::Mul, stride_y_tile); + writer->op_binary_expression(idx_in_h, idx_in_h, BinaryOp::Sub, pad_y_tile); + + TileOperand &minus_idx_in_w = writer->declare_tile("minus_idx_in_w", ckw::DataType::Int32); + TileOperand &minus_idx_in_h = writer->declare_tile("minus_idx_in_h", ckw::DataType::Int32); + + writer->op_unary_expression(minus_idx_in_w, UnaryOp::Negate, idx_in_w); + writer->op_unary_expression(minus_idx_in_h, UnaryOp::Negate, idx_in_h); + + // Pooling starting/ending offsets for X dim + TileOperand &pool_x_s = writer->declare_tile("pool_x_s", ckw::DataType::Int32); + TileOperand &pool_x_e = writer->declare_tile("pool_x_e", ckw::DataType::Int32); + + writer->op_binary_elementwise_function(pool_x_s, BinaryFunction::Max, const_0, minus_idx_in_w); + writer->op_binary_expression(pool_x_e, src_width_tile, BinaryOp::Add, minus_idx_in_w); + writer->op_binary_elementwise_function(pool_x_e, BinaryFunction::Min, pool_size_x_tile, pool_x_e); + + // Pooling starting/ending offsets for Y dim + TileOperand &pool_y_s = writer->declare_tile("pool_y_s", ckw::DataType::Int32); + TileOperand &pool_y_e = writer->declare_tile("pool_y_e", ckw::DataType::Int32); + + writer->op_binary_elementwise_function(pool_y_s, BinaryFunction::Max, const_0, minus_idx_in_h); + writer->op_binary_expression(pool_y_e, src_height_tile, BinaryOp::Add, minus_idx_in_h); + writer->op_binary_elementwise_function(pool_y_e, BinaryFunction::Min, pool_size_y_tile, pool_y_e); + + const TileOperand &filter_size = writer->declare_tile("filter_size", ckw::DataType::Int32); + if (_attributes.exclude_padding()) + { + const TileOperand &y_diff = writer->declare_tile("y_diff", ckw::DataType::Int32); + const TileOperand &x_diff = writer->declare_tile("x_diff", ckw::DataType::Int32); + + writer->op_binary_expression(y_diff, pool_y_e, BinaryOp::Sub, pool_y_s); + writer->op_binary_expression(x_diff, pool_x_e, BinaryOp::Sub, pool_x_s); + + writer->op_binary_expression(filter_size, y_diff, BinaryOp::Mul, x_diff); + } + else + { + writer->op_binary_expression(filter_size, pool_size_x_tile, BinaryOp::Mul, pool_size_y_tile); + } + + const TileOperand &x = writer->declare_tile("x", ckw::DataType::Int32); + const TileOperand &y = writer->declare_tile("y", ckw::DataType::Int32); + + if (is_global_pooling) + { + writer->op_assign(x, const_0); + writer->op_assign(y, const_0); + + writer->op_assign(pool_y_e, 
pool_size_y_tile); + writer->op_assign(pool_x_e, pool_size_x_tile); + } + else + { + writer->op_assign(x, pool_x_s); + writer->op_assign(y, pool_y_s); + } + + // Y dim for-loop + writer->op_for_loop( + y, BinaryOp::Less, pool_y_e, y, AssignmentOp::Increment, const_1, + [&]() + { + // Reset the iterator for the inner loop + if (is_global_pooling) + { + writer->op_assign(x, const_0); + } + else + { + writer->op_assign(x, pool_x_s); + } + + TileOperand &a_y = writer->declare_tile("a_y", ckw::DataType::Int32); + writer->op_binary_expression(a_y, idx_in_h, BinaryOp::Add, y); + + // X dim for-loop + writer->op_for_loop( + x, BinaryOp::Less, pool_x_e, x, AssignmentOp::Increment, const_1, + [&]() + { + TileOperand &a_x = writer->declare_tile("a_x", ckw::DataType::Int32); + writer->op_binary_expression(a_x, idx_in_w, BinaryOp::Add, x); + + TileOperand &src_tile = writer->declare_tile("src_tile", TileInfo(to_ckw(acc_data_type), m0, n0)); + + src_sampler.y(a_x); + src_sampler.z(a_y); + + // Load src tile + if (use_fp_mixed_precision) + { + TileOperand &src_uncasted_tile = writer->declare_tile("uncasted_src_tile", dst_tile_info); + writer->op_load(src_uncasted_tile, src->tensor(), src_sampler); + writer->op_cast_expression(src_tile, src_uncasted_tile, ckw::ConvertPolicy::None); + } + else + { + writer->op_load(src_tile, src->tensor(), src_sampler); + } + + // Take the square of the input, for L2 Pooling + if (_attributes.pool_type() == PoolingType::L2) + { + writer->op_binary_expression(src_tile, src_tile, BinaryOp::Mul, src_tile); + } + + // Perfom Pooling op + if (_attributes.pool_type() == PoolingType::MAX) + { + writer->op_binary_elementwise_function(res_tile, BinaryFunction::Max, res_tile, src_tile); + } + else + { + writer->op_binary_expression(res_tile, res_tile, BinaryOp::Add, src_tile); + } + }); + }); + + if ((_attributes.pool_type() == PoolingType::AVG) || (_attributes.pool_type() == PoolingType::L2)) + { + // filter_size is automatically broadcasted in the operation + writer->op_binary_expression(res_tile, res_tile, BinaryOp::Div, filter_size); + } + + // Take square root of the result in L2 pooling + if (_attributes.pool_type() == PoolingType::L2) + { + writer->op_unary_elementwise_function(res_tile, UnaryFunction::Sqrt, res_tile); + } + + // Store the results and do casting if FP_MIXED_PRECISION + if (use_fp_mixed_precision) + { + writer->op_cast_expression(dst_tile, res_tile, ckw::ConvertPolicy::None); + } + else + { + writer->op_assign(dst_tile, res_tile); + } +} + +Window GpuCkwPool2d::get_window() const +{ + ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); + + TensorShape output_shape = _dst->tensor_shape(); + const unsigned int vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0)); + // Create and configure kernel window + auto win = calculate_max_window(output_shape, Steps(vec_size)); + win = win.collapse_if_possible(win, Window::DimZ); // collapse window on batch size. 
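// Illustrative sketch (not taken from this patch): scalar reference of the per-element reduction
// written above. MAX pooling starts from the lowest representable value (or -inf), AVG/L2 start
// from zero; L2 squares the inputs, AVG/L2 divide by filter_size, and L2 finishes with a square root.
#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>
enum class PoolType { MAX, AVG, L2 };
float pool_reference(const std::vector<float> &window, int filter_size, PoolType type)
{
    float acc = (type == PoolType::MAX) ? std::numeric_limits<float>::lowest() : 0.0f;
    for (const float v : window)
    {
        if (type == PoolType::MAX)
        {
            acc = std::max(acc, v);
        }
        else
        {
            acc += (type == PoolType::L2) ? v * v : v; // square the input for L2 pooling
        }
    }
    if (type != PoolType::MAX)
    {
        acc /= static_cast<float>(filter_size);
    }
    return (type == PoolType::L2) ? std::sqrt(acc) : acc;
}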
+ return win; +} + +std::string GpuCkwPool2d::get_name(const ComponentGroup &comp_group) const +{ + ARM_COMPUTE_UNUSED(comp_group); + + return "pool2dMxN"; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h new file mode 100644 index 0000000000000000000000000000000000000000..822282a10887549d257810b0a0ed89cf560c8882 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWPOOL2D_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWPOOL2D_H + +#include "src/core/common/Macros.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h" + +#include + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class GpuCkwPool2d : public IGpuCkwComponentDriver +{ +public: + using Attributes = ClComponentPool2d::Attributes; + using Settings = ClComponentPool2d::Settings; + + /** Constructor + * + * For supported configurations please refer to @ref ClComponentCast::validate() + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the component + * @param[in] attributes Component attributes + * @param[in] settings Component settings + */ + GpuCkwPool2d(ComponentId id, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwPool2d); + /** Destructor */ + ~GpuCkwPool2d() override = default; + // Inherited methods overriden: + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; + std::string get_name(const ComponentGroup &comp_group) const override; + +private: + const ITensorInfo *_src; + const ITensorInfo *_dst; + Attributes _attributes; + Settings _settings; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWPOOL2D_H diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f2a7d41afdebe5060324f450a0a853403eabe246 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp @@ -0,0 +1,534 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h" + +#include "arm_compute/core/utils/helpers/AdjustVecSize.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/utils/ScaleUtils.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +constexpr unsigned int opencl_vector_size_in_bytes = 16; +} // namespace + +GpuCkwResize::GpuCkwResize(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) + : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes} +{ + _src = this->tensors().get_const_tensor(TensorType::ACL_SRC); + _dst = this->tensors().get_const_tensor(TensorType::ACL_DST); + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); +} + +void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const +{ + const size_t width_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::WIDTH); + const size_t height_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::HEIGHT); + + const Window root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); + const int32_t n0 = root_window.x().step(); + const int32_t m0 = root_window.y().step(); + const int32_t partial_n0 = _dst->dimension(0) % n0; + + GpuCkwComponentArgument *src = + vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + + // Constants + const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx), + _attributes.align_corners()); + const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx), + _attributes.align_corners()); + const auto &tile_scale_x = writer->declare_tile("scale_x", scale_x); + const auto &tile_scale_y = writer->declare_tile("scale_y", scale_y); + const auto &tile_0 = writer->declare_tile("0", 0); + const auto &tile_half = writer->declare_tile("half", 0.5f); + const auto &tile_1 = writer->declare_tile("1", 1); + const auto &tile_src_w = writer->declare_tile("src_w", static_cast(_src->dimension(width_idx))); + const auto &tile_src_h = writer->declare_tile("src_h", static_cast(_src->dimension(height_idx))); + const auto &tile_dst_h = writer->declare_tile("dst_h", static_cast(_dst->dimension(height_idx))); + + const auto &tile_gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32); + const auto &tile_gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32); + const auto &tile_gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32); + + writer->op_get_global_id(tile_gid_0, 0); + writer->op_get_global_id(tile_gid_1, 1); + writer->op_get_global_id(tile_gid_2, 2); + + auto &tile_co = 
writer->declare_tile("co", ckw::DataType::Int32); // OFM + auto &tile_xo = writer->declare_tile("xo", ckw::DataType::Int32); // WIDTH + auto &tile_yo = writer->declare_tile("yo", ckw::DataType::Int32); // HEIGHT + auto &tile_bo = writer->declare_tile("bo", ckw::DataType::Int32); // BATCH SIZE IDX + + // Get the boundary aware coordinates at each global dimension index + get_coord(writer, tile_co, tile_gid_0, n0, partial_n0, tile_co.name() + "_dim0_", tile_0); + get_coord(writer, tile_xo, tile_gid_1, 1, 0, tile_xo.name() + "_dim1_", tile_0); + get_coord(writer, tile_yo, tile_gid_2, 1, 0, tile_yo.name() + "_dim2_", tile_0); + get_coord(writer, tile_bo, tile_gid_2, 1, 0, tile_yo.name() + "_dim3_", tile_0); + + writer->op_binary_expression(tile_yo, tile_yo, BinaryOp::Mod, tile_dst_h); + writer->op_binary_expression(tile_bo, tile_bo, BinaryOp::Div, tile_dst_h); + + const auto &tile_xi_f = writer->declare_tile("xi_f", ckw::DataType::Fp32); + const auto &tile_yi_f = writer->declare_tile("yi_f", ckw::DataType::Fp32); + + switch (_attributes.sampling_policy()) + { + case SamplingPolicy::TOP_LEFT: + // xi_f = (xo * scale_x) + // yi_f = (yo * scale_y) + writer->op_binary_expression(tile_xi_f, tile_xo, BinaryOp::Mul, tile_scale_x); + writer->op_binary_expression(tile_yi_f, tile_yo, BinaryOp::Mul, tile_scale_y); + break; + case SamplingPolicy::CENTER: + { + // xi_f = ((xo + 0.5f) * scale_x) + // yi_f = ((yo + 0.5f) * scale_y) + const auto &tile_xo_plus_half = writer->declare_tile("xo_plus_half", ckw::DataType::Fp32); + const auto &tile_yo_plus_half = writer->declare_tile("yo_plus_half", ckw::DataType::Fp32); + + writer->op_binary_expression(tile_xo_plus_half, tile_xo, BinaryOp::Add, tile_half); + writer->op_binary_expression(tile_yo_plus_half, tile_yo, BinaryOp::Add, tile_half); + + writer->op_binary_expression(tile_xi_f, tile_xo_plus_half, BinaryOp::Mul, tile_scale_x); + writer->op_binary_expression(tile_yi_f, tile_yo_plus_half, BinaryOp::Mul, tile_scale_y); + } + break; + default: + ARM_COMPUTE_ERROR("Unsupported sampling policy"); + } + + if (_attributes.align_corners()) + { + writer->op_unary_elementwise_function(tile_xi_f, UnaryFunction::Round, tile_xi_f); + writer->op_unary_elementwise_function(tile_yi_f, UnaryFunction::Round, tile_yi_f); + } + + // xi0 = clamp((int)xi_f, 0, (int)src_w - 1) + // yi0 = clamp((int)yi_f, 0, (int)src_h - 1) + const auto &tile_xi_f_int = writer->declare_tile("xi_f_int", ckw::DataType::Int32); + const auto &tile_yi_f_int = writer->declare_tile("yi_f_int", ckw::DataType::Int32); + + writer->op_cast_expression(tile_xi_f_int, tile_xi_f, ckw::ConvertPolicy::None); + writer->op_cast_expression(tile_yi_f_int, tile_yi_f, ckw::ConvertPolicy::None); + + const auto &tile_src_w_minus_1 = writer->declare_tile("src_w_minus_1", ckw::DataType::Int32); + const auto &tile_src_h_minus_1 = writer->declare_tile("src_h_minus_1", ckw::DataType::Int32); + + writer->op_binary_expression(tile_src_w_minus_1, tile_src_w, BinaryOp::Sub, tile_1); + writer->op_binary_expression(tile_src_h_minus_1, tile_src_h, BinaryOp::Sub, tile_1); + + auto &tile_xi0 = writer->declare_tile("xi0", ckw::DataType::Int32); + auto &tile_yi0 = writer->declare_tile("yi0", ckw::DataType::Int32); + + writer->op_ternary_elementwise_function(tile_xi0, TernaryFunction::Clamp, tile_xi_f_int, tile_0, + tile_src_w_minus_1); + writer->op_ternary_elementwise_function(tile_yi0, TernaryFunction::Clamp, tile_yi_f_int, tile_0, + tile_src_h_minus_1); + + TensorTileSampler src_sampler; + src_sampler.x(tile_co); + 
src_sampler.y(tile_xi0); + src_sampler.z(tile_yi0); + src_sampler.b(tile_bo); + src_sampler.height(m0); + src_sampler.width(n0); + // We guarantee to not have out-of-bounds accesses + src_sampler.format(TensorSamplerFormat::C_W_H); + src_sampler.address_mode_x(TensorSamplerAddressModeX::None); + src_sampler.address_mode_y(TensorSamplerAddressModeY::None); + src_sampler.address_mode_z(TensorSamplerAddressModeZ::None); + + writer->op_load_once(src, src_sampler); + auto &tile_src = src->tile(); + + TensorTileSampler dst_sampler; + dst_sampler.x(tile_co); + dst_sampler.y(tile_xo); + dst_sampler.z(tile_yo); + dst_sampler.b(tile_bo); + dst_sampler.height(m0); + dst_sampler.width(n0); + dst_sampler.format(TensorSamplerFormat::C_W_H); + // Do not write to the same memory location with multiple threads + dst_sampler.address_mode_x(TensorSamplerAddressModeX::OverlappingMin); + dst_sampler.address_mode_y(TensorSamplerAddressModeY::None); + dst_sampler.address_mode_z(TensorSamplerAddressModeZ::None); + + auto &tile_dst = writer->declare_tile("dst", TileInfo(to_ckw(_dst->data_type()), m0, n0)); + dst->init_virtual_tensor(tile_dst, dst_sampler); + + writer->op_assign(tile_dst, tile_src); +} + +void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const +{ + const size_t width_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::WIDTH); + const size_t height_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::HEIGHT); + + const Window root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); + const int32_t n0 = root_window.x().step(); + const int32_t m0 = root_window.y().step(); + const int32_t partial_n0 = _dst->dimension(0) % n0; + + GpuCkwComponentArgument *src = + vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + + // Constants + const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx), + _attributes.align_corners()); + const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx), + _attributes.align_corners()); + const auto &tile_scale_x = writer->declare_tile("scale_x", scale_x); + const auto &tile_scale_y = writer->declare_tile("scale_y", scale_y); + const auto &tile_0 = writer->declare_tile("0", 0); + const auto &tile_half = writer->declare_tile("half", 0.5f); + const auto &tile_1 = writer->declare_tile("1", 1); + const auto &tile_src_w = writer->declare_tile("src_w", static_cast(_src->dimension(width_idx))); + const auto &tile_src_h = writer->declare_tile("src_h", static_cast(_src->dimension(height_idx))); + const auto &tile_dst_h = writer->declare_tile("dst_h", static_cast(_dst->dimension(height_idx))); + + const auto &tile_gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32); + const auto &tile_gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32); + const auto &tile_gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32); + + writer->op_get_global_id(tile_gid_0, 0); + writer->op_get_global_id(tile_gid_1, 1); + writer->op_get_global_id(tile_gid_2, 2); + + auto &tile_co = writer->declare_tile("co", ckw::DataType::Int32); // OFM + auto &tile_xo = writer->declare_tile("xo", ckw::DataType::Int32); // WIDTH + auto &tile_yo = 
writer->declare_tile("yo", ckw::DataType::Int32); // HEIGHT + auto &tile_bo = writer->declare_tile("bo", ckw::DataType::Int32); // BATCH SIZE IDX + + // Get the boundary aware coordinates at each global dimension index + get_coord(writer, tile_co, tile_gid_0, n0, partial_n0, tile_co.name() + "_dim0_", tile_0); + get_coord(writer, tile_xo, tile_gid_1, 1, 0, tile_xo.name() + "_dim1_", tile_0); + get_coord(writer, tile_yo, tile_gid_2, 1, 0, tile_yo.name() + "_dim2_", tile_0); + get_coord(writer, tile_bo, tile_gid_2, 1, 0, tile_yo.name() + "_dim3_", tile_0); + + // yo = coord_dim2 % dst_h + // bo = coord_dim2 / dst_h + writer->op_binary_expression(tile_yo, tile_yo, BinaryOp::Mod, tile_dst_h); + writer->op_binary_expression(tile_bo, tile_bo, BinaryOp::Div, tile_dst_h); + + const auto &tile_xi_f = writer->declare_tile("xi_f", ckw::DataType::Fp32); + const auto &tile_yi_f = writer->declare_tile("yi_f", ckw::DataType::Fp32); + + switch (_attributes.sampling_policy()) + { + case SamplingPolicy::TOP_LEFT: + // xi_f = (xo * scale_x) + // yi_f = (yo * scale_y) + writer->op_binary_expression(tile_xi_f, tile_xo, BinaryOp::Mul, tile_scale_x); + writer->op_binary_expression(tile_yi_f, tile_yo, BinaryOp::Mul, tile_scale_y); + break; + case SamplingPolicy::CENTER: + { + // xi_f = ((xo + 0.5f) * scale_x - 0.5f) + // yi_f = ((yo + 0.5f) * scale_y - 0.5f) + const auto &tile_xo_plus_half = writer->declare_tile("xo_plus_half", ckw::DataType::Fp32); + const auto &tile_yo_plus_half = writer->declare_tile("yo_plus_half", ckw::DataType::Fp32); + writer->op_binary_expression(tile_xo_plus_half, tile_xo, BinaryOp::Add, tile_half); + writer->op_binary_expression(tile_yo_plus_half, tile_yo, BinaryOp::Add, tile_half); + + writer->op_binary_expression(tile_xi_f, tile_xo_plus_half, BinaryOp::Mul, tile_scale_x); + writer->op_binary_expression(tile_yi_f, tile_yo_plus_half, BinaryOp::Mul, tile_scale_y); + + writer->op_binary_expression(tile_xi_f, tile_xi_f, BinaryOp::Sub, tile_half); + writer->op_binary_expression(tile_yi_f, tile_yi_f, BinaryOp::Sub, tile_half); + } + break; + default: + ARM_COMPUTE_ERROR("Unsupported sampling policy"); + } + + // xi = (int)floor(xi_f); + // yi = (int)floor(yi_f); + const auto &tile_xi_f_floor = writer->declare_tile("xi_f_floor", ckw::DataType::Fp32); + const auto &tile_yi_f_floor = writer->declare_tile("yi_f_floor", ckw::DataType::Fp32); + writer->op_unary_elementwise_function(tile_xi_f_floor, UnaryFunction::Floor, tile_xi_f); + writer->op_unary_elementwise_function(tile_yi_f_floor, UnaryFunction::Floor, tile_yi_f); + + const auto &tile_xi = writer->declare_tile("xi", ckw::DataType::Int32); + const auto &tile_yi = writer->declare_tile("yi", ckw::DataType::Int32); + writer->op_cast_expression(tile_xi, tile_xi_f_floor, ckw::ConvertPolicy::None); + writer->op_cast_expression(tile_yi, tile_yi_f_floor, ckw::ConvertPolicy::None); + + // xi0 = clamp(xi, 0, (int)src_w - 1); + // yi0 = clamp(yi, 0, (int)src_h - 1); + // xi1 = clamp(xi + 1, 0, (int)src_w - 1); + // yi1 = clamp(yi + 1, 0, (int)src_h - 1); + const auto &tile_src_w_minus_1 = writer->declare_tile("src_w_minus_1", ckw::DataType::Int32); + const auto &tile_src_h_minus_1 = writer->declare_tile("src_h_minus_1", ckw::DataType::Int32); + writer->op_binary_expression(tile_src_w_minus_1, tile_src_w, BinaryOp::Sub, tile_1); + writer->op_binary_expression(tile_src_h_minus_1, tile_src_h, BinaryOp::Sub, tile_1); + + const auto &tile_xi_plus_1 = writer->declare_tile("xi_plus_1", ckw::DataType::Int32); + const auto &tile_yi_plus_1 = 
writer->declare_tile("yi_plus_1", ckw::DataType::Int32); + writer->op_binary_expression(tile_xi_plus_1, tile_xi, BinaryOp::Add, tile_1); + writer->op_binary_expression(tile_yi_plus_1, tile_yi, BinaryOp::Add, tile_1); + + auto &tile_xi0 = writer->declare_tile("xi0", ckw::DataType::Int32); + auto &tile_yi0 = writer->declare_tile("yi0", ckw::DataType::Int32); + auto &tile_xi1 = writer->declare_tile("xi1", ckw::DataType::Int32); + auto &tile_yi1 = writer->declare_tile("yi1", ckw::DataType::Int32); + + writer->op_ternary_elementwise_function(tile_xi0, TernaryFunction::Clamp, tile_xi, tile_0, tile_src_w_minus_1); + writer->op_ternary_elementwise_function(tile_yi0, TernaryFunction::Clamp, tile_yi, tile_0, tile_src_h_minus_1); + writer->op_ternary_elementwise_function(tile_xi1, TernaryFunction::Clamp, tile_xi_plus_1, tile_0, + tile_src_w_minus_1); + writer->op_ternary_elementwise_function(tile_yi1, TernaryFunction::Clamp, tile_yi_plus_1, tile_0, + tile_src_h_minus_1); + + TensorTileSampler in_sampler; + in_sampler.x(tile_co); + in_sampler.b(tile_bo); + in_sampler.height(1); + in_sampler.width(n0); + // We guarantee to not have out-of-bounds accesses + in_sampler.format(TensorSamplerFormat::C_W_H); + in_sampler.address_mode_x(TensorSamplerAddressModeX::None); + in_sampler.address_mode_y(TensorSamplerAddressModeY::None); + in_sampler.address_mode_z(TensorSamplerAddressModeZ::None); + + TensorTileSampler in00_sampler = in_sampler; + in00_sampler.y(tile_xi0); + in00_sampler.z(tile_yi0); + + TensorTileSampler in01_sampler = in_sampler; + in01_sampler.y(tile_xi1); + in01_sampler.z(tile_yi0); + + TensorTileSampler in10_sampler = in_sampler; + in10_sampler.y(tile_xi0); + in10_sampler.z(tile_yi1); + + TensorTileSampler in11_sampler = in_sampler; + in11_sampler.y(tile_xi1); + in11_sampler.z(tile_yi1); + + auto &tile_in00 = writer->declare_tile("in00", TileInfo(to_ckw(_src->data_type()), 1, n0)); + auto &tile_in01 = writer->declare_tile("in01", TileInfo(to_ckw(_src->data_type()), 1, n0)); + auto &tile_in10 = writer->declare_tile("in10", TileInfo(to_ckw(_src->data_type()), 1, n0)); + auto &tile_in11 = writer->declare_tile("in11", TileInfo(to_ckw(_src->data_type()), 1, n0)); + + writer->op_load(tile_in00, src->tensor(), in00_sampler); + writer->op_load(tile_in01, src->tensor(), in01_sampler); + writer->op_load(tile_in10, src->tensor(), in10_sampler); + writer->op_load(tile_in11, src->tensor(), in11_sampler); + + TensorTileSampler dst_sampler; + dst_sampler.x(tile_co); + dst_sampler.y(tile_xo); + dst_sampler.z(tile_yo); + dst_sampler.b(tile_bo); + dst_sampler.height(m0); + dst_sampler.width(n0); + dst_sampler.format(TensorSamplerFormat::C_W_H); + // Do not write to the same memory location with multiple threads + dst_sampler.address_mode_x(TensorSamplerAddressModeX::OverlappingMin); + dst_sampler.address_mode_y(TensorSamplerAddressModeY::None); + dst_sampler.address_mode_z(TensorSamplerAddressModeZ::None); + + auto &tile_dst = writer->declare_tile("dst", TileInfo(to_ckw(_dst->data_type()), m0, n0)); + dst->init_virtual_tensor(tile_dst, dst_sampler); + + // Weights of each nearest pixel + const auto &tile_a = writer->declare_tile("a", ckw::DataType::Fp32); + const auto &tile_b = writer->declare_tile("b", ckw::DataType::Fp32); + const auto &tile_a1 = writer->declare_tile("a1", ckw::DataType::Fp32); + const auto &tile_b1 = writer->declare_tile("b1", ckw::DataType::Fp32); + + // a = (xi_f - (float)xi) + // b = (1.f - a) + // a1 = (yi_f - (float)yi) + // b1 = (1.f - a1) + const auto &tile_xi_float = 
writer->declare_tile("xi_float", ckw::DataType::Fp32); + const auto &tile_yi_float = writer->declare_tile("yi_float", ckw::DataType::Fp32); + writer->op_cast_expression(tile_xi_float, tile_xi, ckw::ConvertPolicy::None); + writer->op_cast_expression(tile_yi_float, tile_yi, ckw::ConvertPolicy::None); + + writer->op_binary_expression(tile_a, tile_xi_f, BinaryOp::Sub, tile_xi_float); + writer->op_binary_expression(tile_b, tile_1, BinaryOp::Sub, tile_a); + writer->op_binary_expression(tile_a1, tile_yi_f, BinaryOp::Sub, tile_yi_float); + writer->op_binary_expression(tile_b1, tile_1, BinaryOp::Sub, tile_a1); + + if (is_data_type_float(_src->data_type())) + { + // Cast weights to source type + const auto &tile_a_src_type = writer->declare_tile("a_src_t", to_ckw(_src->data_type())); + const auto &tile_b_src_type = writer->declare_tile("b_src_t", to_ckw(_src->data_type())); + const auto &tile_a1_src_type = writer->declare_tile("a1_src_t", to_ckw(_src->data_type())); + const auto &tile_b1_src_type = writer->declare_tile("b1_src_t", to_ckw(_src->data_type())); + + writer->op_cast_expression(tile_a_src_type, tile_a, ckw::ConvertPolicy::None); + writer->op_cast_expression(tile_b_src_type, tile_b, ckw::ConvertPolicy::None); + writer->op_cast_expression(tile_a1_src_type, tile_a1, ckw::ConvertPolicy::None); + writer->op_cast_expression(tile_b1_src_type, tile_b1, ckw::ConvertPolicy::None); + + // in00 * b * b1 + writer->op_binary_expression(tile_in00, tile_in00, BinaryOp::Mul, tile_b_src_type); + writer->op_binary_expression(tile_in00, tile_in00, BinaryOp::Mul, tile_b1_src_type); + + // in01 * a * b1 + writer->op_binary_expression(tile_in01, tile_in01, BinaryOp::Mul, tile_a_src_type); + writer->op_binary_expression(tile_in01, tile_in01, BinaryOp::Mul, tile_b1_src_type); + + // in10 * b * a1 + writer->op_binary_expression(tile_in10, tile_in10, BinaryOp::Mul, tile_b_src_type); + writer->op_binary_expression(tile_in10, tile_in10, BinaryOp::Mul, tile_a1_src_type); + + // in11 * a * a1 + writer->op_binary_expression(tile_in11, tile_in11, BinaryOp::Mul, tile_a_src_type); + writer->op_binary_expression(tile_in11, tile_in11, BinaryOp::Mul, tile_a1_src_type); + + // Summation of above terms + writer->op_assign(tile_dst, tile_in00); + writer->op_binary_expression(tile_dst, tile_dst, BinaryOp::Add, tile_in01); + writer->op_binary_expression(tile_dst, tile_dst, BinaryOp::Add, tile_in10); + writer->op_binary_expression(tile_dst, tile_dst, BinaryOp::Add, tile_in11); + } + else + { + // Cast to float + const auto &tile_in00_f = writer->declare_tile("in00_f", TileInfo(ckw::DataType::Fp32, 1, n0)); + const auto &tile_in01_f = writer->declare_tile("in01_f", TileInfo(ckw::DataType::Fp32, 1, n0)); + const auto &tile_in10_f = writer->declare_tile("in10_f", TileInfo(ckw::DataType::Fp32, 1, n0)); + const auto &tile_in11_f = writer->declare_tile("in11_f", TileInfo(ckw::DataType::Fp32, 1, n0)); + writer->op_cast_expression(tile_in00_f, tile_in00, ckw::ConvertPolicy::None); + writer->op_cast_expression(tile_in01_f, tile_in01, ckw::ConvertPolicy::None); + writer->op_cast_expression(tile_in10_f, tile_in10, ckw::ConvertPolicy::None); + writer->op_cast_expression(tile_in11_f, tile_in11, ckw::ConvertPolicy::None); + + // in00 * b * b1 + writer->op_binary_expression(tile_in00_f, tile_in00_f, BinaryOp::Mul, tile_b); + writer->op_binary_expression(tile_in00_f, tile_in00_f, BinaryOp::Mul, tile_b1); + + // in01 * a * b1 + writer->op_binary_expression(tile_in01_f, tile_in01_f, BinaryOp::Mul, tile_a); + writer->op_binary_expression(tile_in01_f, 
tile_in01_f, BinaryOp::Mul, tile_b1); + + // in10 * b * a1 + writer->op_binary_expression(tile_in10_f, tile_in10_f, BinaryOp::Mul, tile_b); + writer->op_binary_expression(tile_in10_f, tile_in10_f, BinaryOp::Mul, tile_a1); + + // in11 * a * a1 + writer->op_binary_expression(tile_in11_f, tile_in11_f, BinaryOp::Mul, tile_a); + writer->op_binary_expression(tile_in11_f, tile_in11_f, BinaryOp::Mul, tile_a1); + + // Summation of above terms + writer->op_binary_expression(tile_in00_f, tile_in00_f, BinaryOp::Add, tile_in01_f); + writer->op_binary_expression(tile_in00_f, tile_in00_f, BinaryOp::Add, tile_in10_f); + writer->op_binary_expression(tile_in00_f, tile_in00_f, BinaryOp::Add, tile_in11_f); + + // Cast to destination type with saturation + writer->op_cast_expression(tile_dst, tile_in00_f, ckw::ConvertPolicy::Saturate); + } +} + +void GpuCkwResize::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const +{ + switch (_attributes.interpolation_policy()) + { + case InterpolationPolicy::NEAREST_NEIGHBOR: + do_nearest_neighbor_resize(comp_group, vtable, writer); + break; + case InterpolationPolicy::BILINEAR: + do_bilinear_resize(comp_group, vtable, writer); + break; + default: + ARM_COMPUTE_ERROR("Unsupported interpolation policy"); + } +} + +Window GpuCkwResize::get_window() const +{ + ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); + + const unsigned int n0 = adjust_vec_size(opencl_vector_size_in_bytes / _src->element_size(), _src->dimension(0)); + Window win = calculate_max_window(*_dst, Steps(n0)); + return win.collapse(win, Window::DimZ); +} + +std::string GpuCkwResize::get_tuner_id(const ComponentGroup &comp_group) const +{ + ARM_COMPUTE_UNUSED(comp_group); + + std::string tuner_id = "resize_"; + tuner_id += _attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR ? "nearest_neighbor" : ""; + tuner_id += _attributes.interpolation_policy() == InterpolationPolicy::BILINEAR ? "bilinear" : ""; + tuner_id += "_"; + tuner_id += _attributes.sampling_policy() == SamplingPolicy::CENTER ? "center" : "topleft"; + tuner_id += "_"; + tuner_id += support::cpp11::to_string(_dst->dimension(0)); + tuner_id += "_"; + tuner_id += support::cpp11::to_string(_dst->dimension(1)); + tuner_id += "_"; + tuner_id += support::cpp11::to_string(_dst->dimension(2)); + tuner_id += "_"; + tuner_id += support::cpp11::to_string(_dst->dimension(3)); + + return tuner_id; +} + +std::string GpuCkwResize::get_name(const ComponentGroup &comp_group) const +{ + ARM_COMPUTE_UNUSED(comp_group); + + std::string name = "resize_"; + name += _attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR ? "nearest_neighbor" : ""; + name += _attributes.interpolation_policy() == InterpolationPolicy::BILINEAR ? "bilinear" : ""; + + return name; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h new file mode 100644 index 0000000000000000000000000000000000000000..1266c059217b059bf66e5c60ec3ae05899ab189b --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2023 Arm Limited. 
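For orientation, the bilinear path above maps each destination pixel back into the source image, clamps the four neighbouring coordinates, and blends the four loads with the fractional weights a, b, a1 and b1. A minimal scalar sketch of that arithmetic for the CENTER sampling policy follows; it is plain C++ rather than CKW writer calls, and the dense row-major single-channel src layout and the function name are illustrative assumptions, not library code.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Scalar sketch of the per-pixel math generated by do_bilinear_resize()
// for SamplingPolicy::CENTER. 'src' is assumed to be a dense row-major
// H x W single-channel float image (illustrative layout only).
float bilinear_center_sample(const float *src, int32_t src_w, int32_t src_h,
                             int32_t xo, int32_t yo, float scale_x, float scale_y)
{
    // xi_f = ((xo + 0.5f) * scale_x - 0.5f), same for yi_f
    const float xi_f = (static_cast<float>(xo) + 0.5f) * scale_x - 0.5f;
    const float yi_f = (static_cast<float>(yo) + 0.5f) * scale_y - 0.5f;

    // xi = (int)floor(xi_f); the four neighbours are clamped into the image
    const int32_t xi  = static_cast<int32_t>(std::floor(xi_f));
    const int32_t yi  = static_cast<int32_t>(std::floor(yi_f));
    const int32_t xi0 = std::clamp(xi, 0, src_w - 1);
    const int32_t yi0 = std::clamp(yi, 0, src_h - 1);
    const int32_t xi1 = std::clamp(xi + 1, 0, src_w - 1);
    const int32_t yi1 = std::clamp(yi + 1, 0, src_h - 1);

    // Fractional weights: a/b along x, a1/b1 along y
    const float a  = xi_f - static_cast<float>(xi);
    const float b  = 1.f - a;
    const float a1 = yi_f - static_cast<float>(yi);
    const float b1 = 1.f - a1;

    const float in00 = src[yi0 * src_w + xi0];
    const float in01 = src[yi0 * src_w + xi1];
    const float in10 = src[yi1 * src_w + xi0];
    const float in11 = src[yi1 * src_w + xi1];

    // dst = in00*b*b1 + in01*a*b1 + in10*b*a1 + in11*a*a1
    return in00 * b * b1 + in01 * a * b1 + in10 * b * a1 + in11 * a * a1;
}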
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWRESIZE_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWRESIZE_H + +#include "src/core/common/Macros.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class GpuCkwResize final : public IGpuCkwComponentDriver +{ +public: + using Attributes = ClComponentResize::Attributes; + +public: + /** Constructor + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the components + * @param[in] attributes Component attributes + */ + GpuCkwResize(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes); + + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwResize); + + /** Destructor */ + ~GpuCkwResize() override = default; + + // Inherited methods overriden + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; + std::string get_name(const ComponentGroup &comp_group) const override; + std::string get_tuner_id(const ComponentGroup &comp_group) const override; + +private: + /** Resize using nearest neighbor interpolation + * + * @param[in] comp_group Component group to which this component belongs to + * @param[in, out] vtable Table of variables declared by this component + * @param[in, out] writer CKW writer that writes code scoped to this kernel component + */ + void do_nearest_neighbor_resize(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const; + + /** Resize using bilinear interpolation + * + * @param[in] comp_group Component group to which this component belongs to + * @param[in, out] vtable Table of variables declared by this component + * @param[in, out] writer CKW writer that writes code scoped to this kernel component + */ + void do_bilinear_resize(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const; + + const ITensorInfo *_src; + const ITensorInfo *_dst; + Attributes _attributes; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif // 
ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWRESIZE_H diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp index 891739153783e7145e6c279edf4b663b73e743a8..889706b0c085ff2d81043e502e59cd52f635ed4f 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp @@ -24,10 +24,12 @@ #include "GpuCkwStore.h" #include "arm_compute/core/Error.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" + #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" + #include namespace arm_compute @@ -37,12 +39,14 @@ namespace experimental namespace dynamic_fusion { GpuCkwStore::GpuCkwStore(ComponentId id, const ArgumentPack &tensors) - : IGpuCkwComponentDriver{ id, tensors }, _src{}, _dst{} + : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); } -void GpuCkwStore::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwStore::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { auto src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); auto dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h index 8e35651caf8ff21136eb7df9e9bad6a5c89191fd..f1f0e6747b70c0daf461be952278c6e28cc35a13 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h @@ -48,8 +48,10 @@ public: /** Destructor */ ~GpuCkwStore() override = default; // Inherited methods overriden: - virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const override; - std::string get_name(const ComponentGroup &comp_group) const override; + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + std::string get_name(const ComponentGroup &comp_group) const override; private: const ITensorInfo *_src; diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h index f4a056c5a084d2e1102eb233b6a7ef92a6c836e5..6ba2b2f6513f3f601163d8a4e41ddfec6334b523 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h @@ -21,12 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
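The resize and sampler helpers above rely on "boundary aware" coordinates (see get_coord and create_boundary_aware_2d_sampler), whose bodies are not visible in this hunk. As an illustration of the general idea only, and not of the exact expressions get_coord emits: the start of the last tile along a dimension is pulled back so that an n0-wide access stays inside the tensor.

#include <algorithm>
#include <cstdint>

// Illustration of the boundary-aware idea, assuming n0 <= dim. This is not
// the expression generated by get_coord(); it only shows the intent: the
// final (partial) tile is shifted back so a full n0-wide access never runs
// past the end of the dimension.
int32_t boundary_aware_start(int32_t gid, int32_t n0, int32_t dim)
{
    const int32_t start = gid * n0;
    return std::max(std::min(start, dim - n0), 0);
}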
*/ -#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_WRITERHELPER -#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_WRITERHELPER +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_WRITERHELPER_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_WRITERHELPER_H #include "arm_compute/core/utils/misc/Utility.h" #include "ckw/TensorTileSampler.h" + #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include @@ -43,9 +45,14 @@ using SamplerCreator = std::functionhas_tile()) + if (!src->has_tile()) { const auto sampler = create_sampler(writer, m0, n0); writer->op_load_once(src, sampler); @@ -60,7 +67,7 @@ inline void load_src_dst_tiles_and_prepare_sampler(GpuCkwScopedKernelWriter &wri const auto &sampler = src->tile_sampler(); // Prepare the output tile. - if(!dst->has_tile()) + if (!dst->has_tile()) { auto &tile = writer->declare_tile("dst_tile", src_tile.tile_info()); dst->init_virtual_tensor(tile, sampler); @@ -77,7 +84,13 @@ inline void load_src_dst_tiles_and_prepare_sampler(GpuCkwScopedKernelWriter &wri * @param[in] prefix Prefix to all the tiles declared within this function * @param[in] const_0 Constant tile of value 0 */ -inline void get_coord(GpuCkwScopedKernelWriter writer, TileOperand &coord, TileOperand &gid, int32_t step_v, int32_t leftover_step_v, const std::string &prefix, TileOperand &const_0) +inline void get_coord(GpuCkwScopedKernelWriter writer, + TileOperand &coord, + const TileOperand &gid, + int32_t step_v, + int32_t leftover_step_v, + const std::string &prefix, + const TileOperand &const_0) { auto &step = writer->declare_tile(prefix + "step", step_v); auto &leftover_step = writer->declare_tile(prefix + "leftover_step", leftover_step_v); @@ -121,8 +134,15 @@ inline void get_coord(GpuCkwScopedKernelWriter writer, TileOperand &coord, TileO * * @return TensorTileSampler */ -inline TensorTileSampler create_boundary_aware_2d_sampler(GpuCkwScopedKernelWriter writer, TileOperand &gid_0, TileOperand &gid_1, int32_t dim0_v, int32_t dim1_v, int32_t n0_v, int32_t m0_v, - const std::string prefix, TileOperand &const_0) +inline TensorTileSampler create_boundary_aware_2d_sampler(GpuCkwScopedKernelWriter writer, + TileOperand &gid_0, + TileOperand &gid_1, + int32_t dim0_v, + int32_t dim1_v, + int32_t n0_v, + int32_t m0_v, + const std::string prefix, + TileOperand &const_0) { // Clamp tile size [n0, m0] against dimension [dim0, dim1] // This is needed to: @@ -158,4 +178,4 @@ inline TensorTileSampler create_boundary_aware_2d_sampler(GpuCkwScopedKernelWrit } // namespace dynamic_fusion } // namespace experimental } // namespace arm_compute -#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_WRITERHELPER */ +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_WRITERHELPER_H diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h index 34b1283add66beb1a59ae4e4abc9f6c3426cb25a..5da317bf38f2c4effc79ec450fd06c91af4492dd 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h @@ -28,6 +28,7 @@ #include "arm_compute/core/TensorShape.h" #include 
"arm_compute/core/Types.h" #include "ckw/TensorInfo.h" + #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" namespace arm_compute @@ -38,7 +39,7 @@ namespace dynamic_fusion { inline ckw::DataType to_ckw(DataType dt) { - switch(dt) + switch (dt) { case DataType::F32: return ckw::DataType::Fp32; @@ -65,21 +66,16 @@ inline ckw::DataType to_ckw(DataType dt) inline ckw::TensorShape to_ckw(const TensorShape &shape) { - ARM_COMPUTE_ERROR_ON(shape.num_max_dimensions < std::tuple_size {}); - ARM_COMPUTE_ERROR_ON(std::tuple_size {} != 5); + ARM_COMPUTE_ERROR_ON(shape.num_max_dimensions < std::tuple_size{}); + ARM_COMPUTE_ERROR_ON(std::tuple_size{} != 5); /// NOTE: Overflow danger. Use size_t? - return ckw::TensorShape - { - static_cast(shape[0]), - static_cast(shape[1]), - static_cast(shape[2]), - static_cast(shape[3]), - static_cast(shape[4]) - }; + return ckw::TensorShape{static_cast(shape[0]), static_cast(shape[1]), + static_cast(shape[2]), static_cast(shape[3]), + static_cast(shape[4])}; } inline ckw::TensorDataLayout to_ckw(DataLayout dl) { - switch(dl) + switch (dl) { case DataLayout::NHWC: return ckw::TensorDataLayout::Nhwc; @@ -91,18 +87,13 @@ inline ckw::TensorDataLayout to_ckw(DataLayout dl) } inline ckw::TensorInfo to_ckw(const ITensorInfo &tensor_info) { - return ckw::TensorInfo - { - to_ckw(tensor_info.data_type()), - to_ckw(tensor_info.tensor_shape()), - to_ckw(tensor_info.data_layout()), - tensor_info.id() - }; + return ckw::TensorInfo{to_ckw(tensor_info.data_type()), to_ckw(tensor_info.tensor_shape()), + to_ckw(tensor_info.data_layout()), tensor_info.id()}; } inline TensorComponentType from_ckw(const ckw::TensorComponentType &component) { - switch(component) + switch (component) { case ckw::TensorComponentType::OffsetFirstElement: return TensorComponentType::OffsetFirstElement; @@ -142,7 +133,7 @@ inline TensorComponentType from_ckw(const ckw::TensorComponentType &component) inline ckw::TensorStorageType to_ckw(const TensorStorageType &storage) { - switch(storage) + switch (storage) { case TensorStorageType::ClBufferUint8Ptr: return ckw::TensorStorageType::BufferUint8Ptr; @@ -159,7 +150,7 @@ inline ckw::TensorStorageType to_ckw(const TensorStorageType &storage) } inline TensorStorageType from_ckw(const ckw::TensorStorageType &storage) { - switch(storage) + switch (storage) { case ckw::TensorStorageType::BufferUint8Ptr: return TensorStorageType::ClBufferUint8Ptr; diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h index 9cb022fc10e0d626141b092616f05897feaba6e2..0cba258940c7cc8bb3ad9147884ec71209266858 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h @@ -25,6 +25,7 @@ #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY #include "ckw/types/Operators.h" + #include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h" namespace arm_compute @@ -35,7 +36,7 @@ namespace dynamic_fusion { inline ckw::BinaryOp to_ckw(const ElementwiseBinaryCommonAttributes &attributes) { - switch(attributes.operation()) + switch (attributes.operation()) { case ElementwiseBinaryCommonAttributes::ElementwiseOp::Add: return ckw::BinaryOp::Add; diff --git a/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h 
b/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h index f7f0029618d52401d730e36b658126187ddea7b1..ee109a7e2bec46c7767de9d614de956af16365b1 100644 --- a/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h +++ b/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h @@ -24,8 +24,9 @@ #ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY -#include "Types.h" #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" + +#include "Types.h" #include namespace arm_compute @@ -49,13 +50,13 @@ public: * @return std::unique_ptr */ template - std::unique_ptr create(Args &&... args) + std::unique_ptr create(Args &&...args) { return std::make_unique(_count++, std::forward(args)...); } private: - ComponentId _count{ 0 }; + ComponentId _count{0}; }; } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h index af766a7eceb7eb6850015e7525c869c45b376a69..4b8eea2f57f5707f485e8c5645b018b6b5e445ae 100644 --- a/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h +++ b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h @@ -24,11 +24,11 @@ #ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT -#include "Types.h" - #include "src/dynamic_fusion/sketch/ArgumentPack.h" #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h" +#include "Types.h" + namespace arm_compute { namespace experimental @@ -76,13 +76,8 @@ public: * @param[in] properties Kernel component properties * @param[in] tensors Tensor arguments to the components */ - IGpuKernelComponent( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors) - : _id{ id }, - _properties{ properties }, - _tensors{ tensors } + IGpuKernelComponent(ComponentId id, const Properties &properties, const ArgumentPack &tensors) + : _id{id}, _properties{properties}, _tensors{tensors} { } /** Destructor */ @@ -117,7 +112,7 @@ public: virtual GpuComponentType type() const = 0; private: - ComponentId _id{ -1 }; + ComponentId _id{-1}; Properties _properties{}; ArgumentPack _tensors{}; }; diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp index c41257d18c328ad18fb8d70f9a0105892e67f34c..fdf528a65d9756af1e7506fcb55b02b78e2746b5 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp @@ -68,17 +68,11 @@ ClComponentActivation::ClComponentActivation(ComponentId const IGpuKernelComponent::Properties &properties, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuKernelComponent{ id, properties, tensors }, + : IGpuKernelComponent{id, properties, tensors}, #ifndef ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer -{ - std::make_unique(id, tensors, attributes) -} + _component_writer{std::make_unique(id, tensors, attributes)} #else //ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer -{ - std::make_unique(id, tensors, attributes) -} + _component_writer{std::make_unique(id, tensors, attributes)} #endif //ACL_INTERNAL_TEST_CKW_IN_DF { } diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h 
b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h index 9b090af988a107b845c711a89b74144c045bd9f9..02c854356a11be28862a653584b002220429108e 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h @@ -25,9 +25,8 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTACTIVATION #include "arm_compute/function_info/ActivationLayerInfo.h" -#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" -#include "arm_compute/function_info/ActivationLayerInfo.h" +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" namespace arm_compute { @@ -79,20 +78,17 @@ public: * |F16 |F16 | * |F32 |F32 | */ - static Status validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes); + static Status + validate(const Properties &properties, const ArgumentPack &tensors, const Attributes &attributes); /** Constructor * * Similar to @ref ClComponentActivation::validate() */ - ClComponentActivation( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes); + ClComponentActivation(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes); /** Destructor */ ~ClComponentActivation() override; diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp index 635869f817a32e875de2ef0b03be4bb903b3e406..b1636795a3b04b448f0a8220b97b317c87a6a62c 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp @@ -24,6 +24,7 @@ #include "ClComponentCast.h" #include "arm_compute/core/Error.h" + #include "src/core/CL/CLValidate.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" #ifndef ACL_INTERNAL_TEST_CKW_IN_DF @@ -38,11 +39,10 @@ namespace experimental { namespace dynamic_fusion { -Status ClComponentCast::validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings) +Status ClComponentCast::validate(const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings) { ARM_COMPUTE_UNUSED(properties, attributes, settings); @@ -53,13 +53,15 @@ Status ClComponentCast::validate( ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(dst); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(src == dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == attributes.data_type(), "input and target data types should be different"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == attributes.data_type(), + "input and target data types should be different"); // Validate in case of configured dst - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != attributes.data_type(), "dst and target data types should be same"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != attributes.data_type(), + "dst and target data types should be same"); } return Status{}; @@ -69,17 +71,11 @@ ClComponentCast::ClComponentCast(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes, const Settings &settings) - : IGpuKernelComponent{ id, properties, tensors }, + : IGpuKernelComponent{id, properties, 
tensors}, #ifndef ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer -{ - std::make_unique(id, tensors, attributes) -} + _component_writer{std::make_unique(id, tensors, attributes)} #else //ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer -{ - std::make_unique(id, tensors, attributes) -} + _component_writer{std::make_unique(id, tensors, attributes)} #endif //ACL_INTERNAL_TEST_CKW_IN_DF { ARM_COMPUTE_UNUSED(attributes, settings); diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h index 37b8cbb6c91698e749f570d5718b27c397ae8397..ed77b1203b68755fc0f663adf119c1f281048c20 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTCAST #include "arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h" + #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" namespace arm_compute @@ -93,11 +94,10 @@ public: * |F16 | U8, S8, U16, S16, U32, S32, F32 | * |F32 | U8, S8, U16, S16, U32, S32, F16 | */ - static Status validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings); + static Status validate(const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); /** Constructor * diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp index 56260930799bf4af8d6e3c78194972562670ec86..ca8037c393733b75c79c60e5537620f1dcacae2d 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. 
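Several components in this patch (Activation and Cast above; DepthwiseConv2d, DirectConv2d and ElementwiseBinary further down) now select their code-writer backend at compile time via ACL_INTERNAL_TEST_CKW_IN_DF, keeping the legacy template writer as the default. A stripped-down sketch of that pattern follows; the class names are placeholders, not the real writer types.

#include <memory>

// Placeholder interfaces standing in for the template-writer and CKW-driver
// base classes; only the compile-time selection pattern is of interest here.
struct ITemplateWriter { virtual ~ITemplateWriter() = default; };
struct ICkwDriver      { virtual ~ICkwDriver() = default; };
struct TemplateCastWriter : ITemplateWriter {};
struct CkwCastDriver      : ICkwDriver {};

class CastComponent
{
public:
    CastComponent()
#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
        : _component_writer{std::make_unique<TemplateCastWriter>()}
#else  // ACL_INTERNAL_TEST_CKW_IN_DF
        : _component_writer{std::make_unique<CkwCastDriver>()}
#endif // ACL_INTERNAL_TEST_CKW_IN_DF
    {
    }

private:
#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
    std::unique_ptr<ITemplateWriter> _component_writer;
#else  // ACL_INTERNAL_TEST_CKW_IN_DF
    std::unique_ptr<ICkwDriver> _component_writer;
#endif // ACL_INTERNAL_TEST_CKW_IN_DF
};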
* * SPDX-License-Identifier: MIT * @@ -26,8 +26,13 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h" + #include "src/core/CL/CLValidate.h" +#ifndef ACL_INTERNAL_TEST_CKW_IN_DF #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h" +#else //ACL_INTERNAL_TEST_CKW_IN_DF +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h" +#endif //ACL_INTERNAL_TEST_CKW_IN_DF namespace arm_compute { @@ -103,11 +108,10 @@ unsigned int Settings::m0() const return _m0; } -Status ClComponentDepthwiseConv2d::validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings) +Status ClComponentDepthwiseConv2d::validate(const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings) { ARM_COMPUTE_UNUSED(properties, settings); const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); @@ -121,7 +125,7 @@ Status ClComponentDepthwiseConv2d::validate( // Matching data type ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, wei); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - if(bia != nullptr) + if (bia != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bia); } @@ -129,7 +133,7 @@ Status ClComponentDepthwiseConv2d::validate( // Matching data layout ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, wei); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); - if(bia != nullptr) + if (bia != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, bia); } @@ -138,7 +142,7 @@ Status ClComponentDepthwiseConv2d::validate( ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0); ARM_COMPUTE_RETURN_ERROR_ON(wei->tensor_shape().total_size() == 0); ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); - if(bia != nullptr) + if (bia != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(bia->tensor_shape().total_size() == 0); } @@ -148,16 +152,17 @@ Status ClComponentDepthwiseConv2d::validate( const DataLayout data_layout = src->data_layout(); const size_t channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON(wei->dimension(channel_idx) != (src->dimension(channel_idx) * attributes.depth_multiplier())); + ARM_COMPUTE_RETURN_ERROR_ON(wei->dimension(channel_idx) != + (src->dimension(channel_idx) * attributes.depth_multiplier())); ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->num_dimensions() > 3, "Weights can be at most 3 dimensional"); // dst shape is correct - const PadStrideInfo pad_stride_info = PadStrideInfo(attributes.stride().x(), attributes.stride().y(), - attributes.pad().left, attributes.pad().right, - attributes.pad().top, attributes.pad().bottom, - attributes.dimension_rounding_type()); - const ConvolutionInfo conv_info{ pad_stride_info, attributes.depth_multiplier(), ActivationLayerInfo(), attributes.dilation() }; - const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *wei, conv_info); + const PadStrideInfo pad_stride_info = + PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, attributes.pad().right, + attributes.pad().top, attributes.pad().bottom, attributes.dimension_rounding_type()); + const ConvolutionInfo conv_info{pad_stride_info, attributes.depth_multiplier(), 
ActivationLayerInfo(), + attributes.dilation()}; + const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *wei, conv_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape); @@ -168,19 +173,22 @@ Status ClComponentDepthwiseConv2d::validate( ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first > 1 && settings.m0() != 1); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation.x() > 1 && settings.m0() != 1); - if(conv_info.depth_multiplier > 1 && settings.n0() > 1) + if (conv_info.depth_multiplier > 1 && settings.n0() > 1) { ARM_COMPUTE_RETURN_ERROR_ON((conv_info.depth_multiplier % settings.n0()) != 0); } // Check export weights to cl image - ARM_COMPUTE_RETURN_ERROR_ON_MSG((settings.export_weights_to_cl_image() == true) && (export_to_cl_image(wei) == false), "Weights cannot be exported to cl_image!"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((settings.export_weights_to_cl_image() == true) && + (export_to_cl_image(wei) == false), + "Weights cannot be exported to cl_image!"); ARM_COMPUTE_RETURN_ERROR_ON((settings.export_weights_to_cl_image() == true) && ((settings.n0() % 4) != 0)); - ARM_COMPUTE_RETURN_ERROR_ON(wei->dimension(channel_idx) != (src->dimension(channel_idx) * conv_info.depth_multiplier)); + ARM_COMPUTE_RETURN_ERROR_ON(wei->dimension(channel_idx) != + (src->dimension(channel_idx) * conv_info.depth_multiplier)); // bia shape is correct - if(bia != nullptr) + if (bia != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->dimension(0) != output_shape[channel_idx], "Biases size and number of dst feature maps should match"); @@ -198,20 +206,28 @@ Status ClComponentDepthwiseConv2d::validate( return Status{}; } -ClComponentDepthwiseConv2d::ClComponentDepthwiseConv2d( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings) - : IGpuKernelComponent{ id, properties, tensors }, - _component_writer{ std::make_unique(id, tensors, attributes, settings) } +ClComponentDepthwiseConv2d::ClComponentDepthwiseConv2d(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuKernelComponent{id, properties, tensors}, +#ifndef ACL_INTERNAL_TEST_CKW_IN_DF + _component_writer{std::make_unique(id, tensors, attributes, settings)} +#else //ACL_INTERNAL_TEST_CKW_IN_DF + _component_writer{std::make_unique(id, tensors, attributes, settings)} +#endif //ACL_INTERNAL_TEST_CKW_IN_DF { + ARM_COMPUTE_UNUSED(attributes, settings); } ClComponentDepthwiseConv2d::~ClComponentDepthwiseConv2d() { } +#ifndef ACL_INTERNAL_TEST_CKW_IN_DF const IGpuTemplateComponentWriter *ClComponentDepthwiseConv2d::template_writer() const +#else //ACL_INTERNAL_TEST_CKW_IN_DF +const IGpuCkwComponentDriver *ClComponentDepthwiseConv2d::ckw_component_driver() const +#endif //ACL_INTERNAL_TEST_CKW_IN_DF { return _component_writer.get(); } diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h index 0e2b5f14cbcbee177c1445f6883e245b466e02b3..01168e9dedd1d00bb41c2b9b6e505c2463449f87 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -21,11 +21,13 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDEPTHWISECONV2D -#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDEPTHWISECONV2D +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDEPTHWISECONV2D_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDEPTHWISECONV2D_H #include "arm_compute/core/Error.h" + #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" + #include namespace arm_compute @@ -41,6 +43,13 @@ template class ArgumentPack; class DepthwiseConv2dAttributes; +/** Forward declaration */ +#ifndef ACL_INTERNAL_TEST_CKW_IN_DF +class ClTemplateDepthwiseConv2d; +#else //ACL_INTERNAL_TEST_CKW_IN_DF +class GpuCkwDepthwiseConv2d; +#endif //ACL_INTERNAL_TEST_CKW_IN_DF + /** Component specific settings */ class ClComponentDepthwiseConv2dSettings @@ -77,12 +86,12 @@ public: unsigned int m0() const; private: - bool _export_input_to_cl_image{ false }; /**< Export input to cl_image */ - bool _export_weights_to_cl_image{ false }; /**< Export the weights to cl_image */ - bool _fast_relaxed_math{ true }; /**< Enable/disable -cl-fast-relaxed-math flag */ - bool _is_fma_available{ false }; /**< Is fma instruction available */ - unsigned int _n0{ 0 }; /**< Number of columns processed by each thread */ - unsigned int _m0{ 0 }; /**< Number of rows processed by each thread */ + bool _export_input_to_cl_image{false}; /**< Export input to cl_image */ + bool _export_weights_to_cl_image{false}; /**< Export the weights to cl_image */ + bool _fast_relaxed_math{true}; /**< Enable/disable -cl-fast-relaxed-math flag */ + bool _is_fma_available{false}; /**< Is fma instruction available */ + unsigned int _n0{0}; /**< Number of columns processed by each thread */ + unsigned int _m0{0}; /**< Number of rows processed by each thread */ }; /** Forward declaration */ @@ -127,22 +136,20 @@ public: * |F16 |F16 |F16 |F16 | * |F32 |F32 |F32 |F32 | */ - static Status validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings); + static Status validate(const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); /** Constructor * * Similar to @ref ClComponentDepthwiseConv2d::validate() */ - ClComponentDepthwiseConv2d( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings); + ClComponentDepthwiseConv2d(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); /** Destructor */ ~ClComponentDepthwiseConv2d() override; @@ -155,7 +162,12 @@ public: /** Allow instances of this class to be moved */ ClComponentDepthwiseConv2d &operator=(ClComponentDepthwiseConv2d &&component) = default; /** Get template writer for the component */ +#ifndef ACL_INTERNAL_TEST_CKW_IN_DF const IGpuTemplateComponentWriter *template_writer() const override; +#else //ACL_INTERNAL_TEST_CKW_IN_DF + const IGpuCkwComponentDriver *ckw_component_driver() const override; +#endif //ACL_INTERNAL_TEST_CKW_IN_DF + /** Get component type */ GpuComponentType type() const override { @@ -163,9 +175,13 @@ public: } private: +#ifndef ACL_INTERNAL_TEST_CKW_IN_DF std::unique_ptr _component_writer; +#else //ACL_INTERNAL_TEST_CKW_IN_DF + 
std::unique_ptr _component_writer; +#endif //ACL_INTERNAL_TEST_CKW_IN_DF }; } // namespace dynamic_fusion } // namespace experimental } // namespace arm_compute -#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDEPTHWISECONV2D */ +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDEPTHWISECONV2D_H diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp index a713c8200352fd1754b16bda57210cfef54eb459..98f3d6a882c125fbca5f5078702ed4c42504e5d9 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp @@ -23,8 +23,8 @@ */ #include "ClComponentDirectConv2d.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h" #include "src/core/CL/CLValidate.h" @@ -57,7 +57,8 @@ bool ClComponentDirectConv2dSettings::fast_relaxed_math() const return _fast_relaxed_math; } -ClComponentDirectConv2dSettings &ClComponentDirectConv2dSettings::direct_conv_descriptor(const DirectConvComputeKernelInfo &desc) +ClComponentDirectConv2dSettings & +ClComponentDirectConv2dSettings::direct_conv_descriptor(const DirectConvComputeKernelInfo &desc) { _desc = desc; return *this; @@ -68,11 +69,10 @@ DirectConvComputeKernelInfo ClComponentDirectConv2dSettings::direct_conv_descrip return _desc; } -Status ClComponentDirectConv2d::validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings) +Status ClComponentDirectConv2d::validate(const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings) { ARM_COMPUTE_UNUSED(properties); const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); @@ -86,7 +86,7 @@ Status ClComponentDirectConv2d::validate( // Matching data type ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, wei); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - if(bia != nullptr) + if (bia != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bia); } @@ -94,7 +94,7 @@ Status ClComponentDirectConv2d::validate( // Matching data layout ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, wei); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); - if(bia != nullptr) + if (bia != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, bia); } @@ -103,7 +103,7 @@ Status ClComponentDirectConv2d::validate( ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0); ARM_COMPUTE_RETURN_ERROR_ON(wei->tensor_shape().total_size() == 0); ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); - if(bia != nullptr) + if (bia != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(bia->tensor_shape().total_size() == 0); } @@ -112,22 +112,23 @@ Status ClComponentDirectConv2d::validate( // wei shape is correct const DataLayout data_layout = src->data_layout(); const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->dimension(channel_idx) != src->dimension(channel_idx), "Weights feature map dimension should match the respective src's one"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->dimension(channel_idx) != src->dimension(channel_idx), + "Weights feature map dimension 
should match the respective src's one"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->num_dimensions() > 4, "Weights can be at most 4 dimensional"); // dst shape is correct - PadStrideInfo legacy_pad_stride(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, attributes.pad().right, attributes.pad().top, - attributes.pad().bottom, DimensionRoundingType{}); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), - misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, legacy_pad_stride)); + PadStrideInfo legacy_pad_stride(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, + attributes.pad().right, attributes.pad().top, attributes.pad().bottom, + DimensionRoundingType{}); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, legacy_pad_stride)); // bia shape is correct - if(bia != nullptr) + if (bia != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->dimension(0) != wei->dimension(3), "Biases size and number of dst feature maps should match"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->num_dimensions() > 1, - "Biases should be one dimensional"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->num_dimensions() > 1, "Biases should be one dimensional"); } // 2. Check support level @@ -137,24 +138,25 @@ Status ClComponentDirectConv2d::validate( ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); const auto desc = settings.direct_conv_descriptor(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && desc.n0 != 16, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && + desc.n0 != 16, "N0 can only be: 1, 2, 3, 4, 8, and 16"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && + desc.k0 != 16, "K0 can only be: 1, 2, 3, 4, 8, and 16"); return Status{}; } -ClComponentDirectConv2d::ClComponentDirectConv2d( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings) - : IGpuKernelComponent{ id, properties, tensors }, +ClComponentDirectConv2d::ClComponentDirectConv2d(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuKernelComponent{id, properties, tensors}, #ifndef ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer{ std::make_unique(id, tensors, attributes, settings) } -#else // ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer{ std::make_unique(id, tensors, attributes, settings) } + _component_writer{std::make_unique(id, tensors, attributes, settings)} +#else // ACL_INTERNAL_TEST_CKW_IN_DF + _component_writer{std::make_unique(id, tensors, attributes, settings)} #endif // ACL_INTERNAL_TEST_CKW_IN_DF { } @@ -165,7 +167,7 @@ ClComponentDirectConv2d::~ClComponentDirectConv2d() #ifndef ACL_INTERNAL_TEST_CKW_IN_DF const IGpuTemplateComponentWriter *ClComponentDirectConv2d::template_writer() const -#else // ACL_INTERNAL_TEST_CKW_IN_DF +#else // ACL_INTERNAL_TEST_CKW_IN_DF const IGpuCkwComponentDriver *ClComponentDirectConv2d::ckw_component_driver() const #endif // ACL_INTERNAL_TEST_CKW_IN_DF { diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h 
b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h index 24acb1b2c155ae91e2d5655e4fb7ea86845d3045..d6d9705d3ce40f5ccd8efd23a8b3d112c6b3f2f2 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h @@ -26,7 +26,9 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/KernelDescriptors.h" + #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" + #include namespace arm_compute @@ -61,7 +63,7 @@ public: DirectConvComputeKernelInfo direct_conv_descriptor() const; private: - bool _fast_relaxed_math{ true }; + bool _fast_relaxed_math{true}; DirectConvComputeKernelInfo _desc{}; // Direct convolution descriptor }; @@ -111,22 +113,20 @@ public: * |F16 |F16 |F16 |F16 | * |F32 |F32 |F32 |F32 | */ - static Status validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings); + static Status validate(const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); /** Constructor * * Similar to @ref ClComponentDirectConv2d::validate() */ - ClComponentDirectConv2d( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings); + ClComponentDirectConv2d(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); /** Destructor */ ~ClComponentDirectConv2d() override; @@ -142,7 +142,7 @@ public: #ifndef ACL_INTERNAL_TEST_CKW_IN_DF const IGpuTemplateComponentWriter *template_writer() const override; #else // ACL_INTERNAL_TEST_CKW_IN_DF - const IGpuCkwComponentDriver *ckw_component_driver() const override; + const IGpuCkwComponentDriver *ckw_component_driver() const override; #endif // ACL_INTERNAL_TEST_CKW_IN_DF /** Get component type */ GpuComponentType type() const override diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp index 88d729170c7f713b2fb7580da8d4828a75b6099c..5b136427e42defbcf2656eb83fe7d235e5ae6a03 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp @@ -24,6 +24,7 @@ #include "ClComponentElementwiseBinary.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #ifndef ACL_INTERNAL_TEST_CKW_IN_DF #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h" @@ -39,56 +40,55 @@ namespace dynamic_fusion { namespace { -std::set supported_ops -{ - ElementwiseBinaryCommonAttributes::ElementwiseOp::Add, - ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub, - ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul -}; +std::set supported_ops{ + ElementwiseBinaryCommonAttributes::ElementwiseOp::Add, ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub, + ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul}; } -Status ClComponentElementwiseBinary::validate(const ArgumentPack &tensors, const ElementwiseBinaryCommonAttributes &attributes) +Status ClComponentElementwiseBinary::validate(const ArgumentPack &tensors, + const ElementwiseBinaryCommonAttributes &attributes) { const auto lhs = tensors.get_const_tensor(TensorType::ACL_SRC_0); const auto rhs = 
tensors.get_const_tensor(TensorType::ACL_SRC_1); const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0); // Check operator type - ARM_COMPUTE_RETURN_ERROR_ON_MSG(supported_ops.find(attributes.operation()) == supported_ops.end(), "Provided Elementwise operation not supported."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(supported_ops.find(attributes.operation()) == supported_ops.end(), + "Provided Elementwise operation not supported."); // Check validity ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); //Check data type for different elementwise operators - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::S32, DataType::S16, DataType::U8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::S32, + DataType::S16, DataType::U8); // dst shape is correct const TensorShape out_shape = TensorShape::broadcast_shape(lhs->tensor_shape(), rhs->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), + "Wrong shape for dst."); const auto &lhs_shape = lhs->tensor_shape(); const auto &rhs_shape = rhs->tensor_shape(); const auto &dst_shape = dst->tensor_shape(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(lhs_shape, dst_shape, 0) && detail::have_different_dimensions(rhs_shape, dst_shape, 0), - "Only LHS or RHS can be broadcasting, not both."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(lhs_shape, dst_shape, 0) && + detail::have_different_dimensions(rhs_shape, dst_shape, 0), + "Only LHS or RHS can be broadcasting, not both."); // Dimension Y and Z are collapsed together in the current kernel implementation, // hence they cannot be independently broadcast or non-broadcast. 
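The validate() checks here and immediately below encode the broadcasting contract of the fused elementwise binary: only one of LHS/RHS may broadcast towards dst, dimensions Y and Z must broadcast together because the kernel collapses them, and broadcasting from dimension 3 upwards is rejected. A standalone sketch of those rules follows; the 4-D shape type and function names are illustrative, not the ACL implementation.

#include <array>
#include <cstddef>
#include <cstdint>

using Shape4 = std::array<int32_t, 4>; // [x, y, z, w], illustrative only

// True if 's' differs from 'dst' in any dimension >= first_dim
bool differs_from(const Shape4 &s, const Shape4 &dst, std::size_t first_dim)
{
    for (std::size_t d = first_dim; d < s.size(); ++d)
    {
        if (s[d] != dst[d])
            return true;
    }
    return false;
}

bool broadcast_is_supported(const Shape4 &lhs, const Shape4 &rhs, const Shape4 &dst)
{
    // Only LHS or RHS can be broadcasting, not both
    if (differs_from(lhs, dst, 0) && differs_from(rhs, dst, 0))
        return false;
    // Y and Z are collapsed by the kernel: broadcast them together or not at all
    const bool y_bcast = (lhs[1] != dst[1]) || (rhs[1] != dst[1]);
    const bool z_bcast = (lhs[2] != dst[2]) || (rhs[2] != dst[2]);
    if (y_bcast != z_bcast)
        return false;
    // No broadcasting in dimension 3 or higher
    return !differs_from(lhs, dst, 3) && !differs_from(rhs, dst, 3);
}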
// See: ClTemplateElementwiseBinary::get_window - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - (lhs_shape[1] != dst_shape[1] || rhs_shape[1] != dst_shape[1]) != (lhs_shape[2] != dst_shape[2] || rhs_shape[2] != dst_shape[2]), - "Dimension Y and Z must both be either broadcast or non-broadcast."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_shape[1] != dst_shape[1] || rhs_shape[1] != dst_shape[1]) != + (lhs_shape[2] != dst_shape[2] || rhs_shape[2] != dst_shape[2]), + "Dimension Y and Z must both be either broadcast or non-broadcast."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(lhs_shape, dst_shape, 3), - "LHS broadcast in dimension 3 or higher is not supported."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(lhs_shape, dst_shape, 3), + "LHS broadcast in dimension 3 or higher is not supported."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(rhs_shape, dst_shape, 3), - "RHS broadcast in dimension 3 or higher is not supported."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(rhs_shape, dst_shape, 3), + "RHS broadcast in dimension 3 or higher is not supported."); // Matching data type ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); @@ -112,22 +112,15 @@ Status ClComponentElementwiseBinary::validate(const ArgumentPack &t ClComponentElementwiseBinary::~ClComponentElementwiseBinary() { } -ClComponentElementwiseBinary::ClComponentElementwiseBinary( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes) - : IGpuKernelComponent{ id, properties, tensors }, +ClComponentElementwiseBinary::ClComponentElementwiseBinary(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes) + : IGpuKernelComponent{id, properties, tensors}, #ifndef ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer -{ - std::make_unique(id, tensors, attributes) -} + _component_writer{std::make_unique(id, tensors, attributes)} #else //ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer -{ - std::make_unique(id, tensors, attributes) -} + _component_writer{std::make_unique(id, tensors, attributes)} #endif //ACL_INTERNAL_TEST_CKW_IN_DF { } diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h index f7175903d0153bf04f2bfdfe5d4ecc4e8f694572..7589b9732c4746144fe9b851cf77e138597bf420 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h @@ -82,17 +82,17 @@ public: * |S16 |S16 |S16 | * |U8 |U8 |U8 | */ - static Status validate(const ArgumentPack &tensors, const ElementwiseBinaryCommonAttributes &attributes); + static Status validate(const ArgumentPack &tensors, + const ElementwiseBinaryCommonAttributes &attributes); /** Constructor * * Similar to @ref ClComponentElementwiseBinary::validate() */ - ClComponentElementwiseBinary( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes); + ClComponentElementwiseBinary(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes); /** Destructor */ ~ClComponentElementwiseBinary() override; diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp index 
279c77e2272ab7435fdb0f81f50d88c0ab063da1..27c13bd654645fedfd6d2aea178fb8eaad5da35e 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp @@ -25,9 +25,10 @@ #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h" + #include "src/core/CL/CLValidate.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h" @@ -37,10 +38,9 @@ namespace experimental { namespace dynamic_fusion { -Status ClComponentLogits1DMaxShiftExpSum::validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes) +Status ClComponentLogits1DMaxShiftExpSum::validate(const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes) { ARM_COMPUTE_UNUSED(properties, attributes); @@ -75,8 +75,8 @@ ClComponentLogits1DMaxShiftExpSum::ClComponentLogits1DMaxShiftExpSum(ComponentId const Properties &properties, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuKernelComponent{ id, properties, tensors }, - _component_writer{ std::make_unique(id, tensors, attributes) } + : IGpuKernelComponent{id, properties, tensors}, + _component_writer{std::make_unique(id, tensors, attributes)} { } diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h index b5db4582489d6cfc40e4a163fb286414b9acd2fc..91ab5de3b5bde8a67f1b5f03cfa5e7286bba81f1 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTLOGITS1DMAXSHIFTEXPSUM #include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h" + #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" namespace arm_compute @@ -89,10 +90,8 @@ public: * |F16 | F16 | F16 | * |F32 | F32 | F32 | */ - static Status validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes); + static Status + validate(const Properties &properties, const ArgumentPack &tensors, const Attributes &attributes); /** Constructor * diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp index 7864d56d29729b397c77afc902a728714322deeb..fb2544385c33fe2a42030faf9f97598d8d214737 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp @@ -25,9 +25,10 @@ #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h" + #include "src/core/CL/CLValidate.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.h" @@ -37,10 +38,9 
@@ namespace experimental { namespace dynamic_fusion { -Status ClComponentLogits1DNorm::validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes) +Status ClComponentLogits1DNorm::validate(const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes) { ARM_COMPUTE_UNUSED(properties, attributes); @@ -77,8 +77,8 @@ ClComponentLogits1DNorm::ClComponentLogits1DNorm(ComponentId const Properties &properties, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuKernelComponent{ id, properties, tensors }, - _component_writer{ std::make_unique(id, tensors, attributes) } + : IGpuKernelComponent{id, properties, tensors}, + _component_writer{std::make_unique(id, tensors, attributes)} { } diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h index 5bd350b9bd4ffe2c3e32989ca38f287d52c346be..74c0273604afc32bc9faf3aad01173b0dbc8268a 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTLOGITS1DNORM #include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h" + #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" namespace arm_compute @@ -86,10 +87,8 @@ public: * |F16 | F16 | F16 | * |F32 | F32 | F32 | */ - static Status validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes); + static Status + validate(const Properties &properties, const ArgumentPack &tensors, const Attributes &attributes); /** Constructor * diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f238d42d98d18e209df4f1437747bec3611efd15 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifdef ACL_INTERNAL_TEST_CKW_IN_DF
+
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/MatMulAttributes.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h"
+#include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+using Attributes = MatMulAttributes;
+using Settings = GpuMatMulSettings;
+
+Status validate_matmul_kernel_info(Attributes attributes, Settings settings)
+{
+    const bool adj_lhs = attributes.adj_lhs();
+    const bool adj_rhs = attributes.adj_rhs();
+    const int m0 = settings.m0();
+    const int n0 = settings.n0();
+    const int k0 = settings.k0();
+
+    // Validate M0
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0");
+
+    if (adj_lhs)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16),
+                                        "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed");
+    }
+
+    // Validate N0
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16),
+                                    "Only 1,2,3,4,8,16 are supported for N0");
+
+    // Validate K0
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 < 1, "Only positive integers are supported for K0");
+    if (!adj_lhs || adj_rhs)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16),
+                                        "Only 1,2,3,4,8,16 are supported for K0");
+    }
+
+    return Status{};
+}
+
+} // namespace
+
+Status ClComponentMatMul::validate(const Properties &properties,
+                                   const ArgumentPack<ITensorInfo> &tensors,
+                                   const Attributes &attributes,
+                                   const Settings &settings)
+{
+    ARM_COMPUTE_UNUSED(properties);
+    ARM_COMPUTE_UNUSED(attributes);
+
+    const auto lhs = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+    const auto rhs = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+    const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
+
+    // Currently, the only supported case is when adj_lhs = false and adj_rhs = true
+    ARM_COMPUTE_RETURN_ERROR_ON((attributes.adj_lhs() != false) && (attributes.adj_rhs() != true));
+
+    // Check if Matching data type
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
+
+    // Data type
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
+
+    // All tensor infos are initialized
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs->tensor_shape().total_size() == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(rhs->tensor_shape().total_size() == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+
+    // Device requirements are met
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(lhs);
+
+    // Check if block sizes are supported
+    MatMulKernelInfo matmul_kernel_info =
+        MatMulKernelInfo(attributes.adj_lhs(), attributes.adj_rhs(), settings.m0(), settings.n0(), settings.k0());
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(attributes, settings));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        opencl::kernels::validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
+
+    // Check if dst shape is correct
+    const auto expected_dst_shape =
+
misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), expected_dst_shape); + + return Status{}; +} + +ClComponentMatMul::ClComponentMatMul(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuKernelComponent{id, properties, tensors}, + _component_writer{std::make_unique(id, tensors, attributes, settings)} +{ +} + +ClComponentMatMul::~ClComponentMatMul() +{ +} + +const IGpuCkwComponentDriver *ClComponentMatMul::ckw_component_driver() const +{ + return _component_writer.get(); +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif // ACL_INTERNAL_TEST_CKW_IN_DF diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h new file mode 100644 index 0000000000000000000000000000000000000000..41833e4adb127b5ff2430eba2551e6d78aa6fe69 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTMATMUL_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTMATMUL_H + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMatMul.h" + +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" + +#include + +namespace arm_compute +{ +/** Forward declaration */ +class ITensorInfo; +namespace experimental +{ +namespace dynamic_fusion +{ +/** Forward declaration */ +template +class ArgumentPack; +class MatMulAttributes; +class GpuCkwMatMul; + +class ClComponentMatMul final : public IGpuKernelComponent +{ +public: + /** Attributes are a set of backend-agnostic parameters that define what a component does */ + using Attributes = MatMulAttributes; + /** Settings are a set of backend-specific parameters that influence the implementation of a component */ + using Settings = GpuMatMulSettings; + + /** Validate the component + * + * @param[in] properties Component properties + * @param[in,out] tensors Tensor arguments to the component + * @param[in] attributes Component attributes + * @param[in] settings Component settings + * + * @return Status Validation results + * + * Tensor argument names: + * - ACL_SRC_0: LHS + * - ACL_SRC_1: RHS + * - ACL_DST_0: Output + * + * Tensor argument constness: + * - ACL_SRC_0: Const + * - ACL_SRC_1: Const + * - ACL_DST_0: Const + * + * Valid data layouts: + * - NHWC + * + * Valid data type configurations: + * |ACL_SRC_0 |ACL_SRC_1 |ACL_DST_0 | + * |:--------------|:--------------|:--------------| + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | + */ + static Status validate(const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); + + /** Constructor + * + * Similar to @ref ClComponentMatMul::validate() + */ + ClComponentMatMul(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); + /** Destructor */ + ~ClComponentMatMul() override; + /** Prevent instances of this class from being copy constructed */ + ClComponentMatMul(const ClComponentMatMul &component) = delete; + /** Prevent instances of this class from being copied */ + ClComponentMatMul &operator=(const ClComponentMatMul &component) = delete; + /** Allow instances of this class to be move constructed */ + ClComponentMatMul(ClComponentMatMul &&component) = default; + /** Allow instances of this class to be moved */ + ClComponentMatMul &operator=(ClComponentMatMul &&component) = default; + /** Get writer for the component */ + const IGpuCkwComponentDriver *ckw_component_driver() const override; + /** Get component type */ + GpuComponentType type() const override + { + return GpuComponentType::Complex; + } + +private: + std::unique_ptr _component_writer; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTMATMUL_H diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp index 2b01803224c9b961699eafd9d1bf73e8e2cf15f8..409b191df516eebcf1f3e85266e17b31fd6f8751 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp @@ -24,12 +24,15 @@ #include "ClComponentPool2d.h" #include 
"arm_compute/core/Error.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h" + #include "src/core/CL/CLValidate.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h" #include "src/dynamic_fusion/utils/Utils.h" + #include namespace arm_compute @@ -38,23 +41,24 @@ namespace experimental { namespace dynamic_fusion { -Status ClComponentPool2d::validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings) +Status ClComponentPool2d::validate(const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings) { ARM_COMPUTE_UNUSED(properties); const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_ON_MSG((attributes.pool_type() != PoolingType::AVG && attributes.pool_type() != PoolingType::MAX), "Unsupported Pooling type"); + ARM_COMPUTE_ERROR_ON_MSG((attributes.pool_type() != PoolingType::AVG && attributes.pool_type() != PoolingType::MAX), + "Unsupported Pooling type"); // 1. Check validity // Check if pooling is valid - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_region_entirely_outside_input(convert_pool_attr_to_pool_info(attributes, settings.mixed_precision())), - "Pooling region that is entirely outside input tensor is unsupported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + is_pool_region_entirely_outside_input(convert_pool_attr_to_pool_info(attributes, settings.mixed_precision())), + "Pooling region that is entirely outside input tensor is unsupported"); // Matching data type ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); @@ -69,8 +73,9 @@ Status ClComponentPool2d::validate( // Device requirements are met ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), - misc::shape_calculator::compute_pool_shape(*src, convert_pool_attr_to_pool_info(attributes, settings.mixed_precision()))); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), misc::shape_calculator::compute_pool_shape( + *src, convert_pool_attr_to_pool_info(attributes, settings.mixed_precision()))); // 2. 
Check support level // Data type @@ -82,23 +87,33 @@ Status ClComponentPool2d::validate( return Status{}; } -ClComponentPool2d::ClComponentPool2d( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings) - : IGpuKernelComponent{ id, properties, tensors }, - _component_writer{ std::make_unique(id, tensors, attributes, settings) } +ClComponentPool2d::ClComponentPool2d(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuKernelComponent{id, properties, tensors}, +#ifndef ACL_INTERNAL_TEST_CKW_IN_DF + _component_writer{std::make_unique(id, tensors, attributes, settings)} +#else //ACL_INTERNAL_TEST_CKW_IN_DF + _component_writer{std::make_unique(id, tensors, attributes, settings)} +#endif //ACL_INTERNAL_TEST_CKW_IN_DF { } ClComponentPool2d::~ClComponentPool2d() { } +#ifndef ACL_INTERNAL_TEST_CKW_IN_DF const IGpuTemplateComponentWriter *ClComponentPool2d::template_writer() const { return _component_writer.get(); } +#else //ACL_INTERNAL_TEST_CKW_IN_DF +const IGpuCkwComponentDriver *ClComponentPool2d::ckw_component_driver() const +{ + return _component_writer.get(); +} +#endif //ACL_INTERNAL_TEST_CKW_IN_DF } // namespace dynamic_fusion } // namespace experimental } // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h index 896048e27aa82ec91366ee8f649f3db680754133..98fed65004d6528fe664fe5e344f4cddca663ae6 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h @@ -21,10 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTPOOL2D -#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTPOOL2D +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTPOOL2D_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTPOOL2D_H #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h" + #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" namespace arm_compute @@ -41,7 +42,11 @@ class ArgumentPack; class Pool2dAttributes; /** Forward declaration */ +#ifndef ACL_INTERNAL_TEST_CKW_IN_DF class ClTemplatePool2d; +#else // ACL_INTERNAL_TEST_CKW_IN_DF +class GpuCkwPool2d; +#endif // ACL_INTERNAL_TEST_CKW_IN_DF class ClComponentPool2d final : public IGpuKernelComponent { @@ -78,11 +83,10 @@ public: * |F16 |F16 | * |F32 |F32 | */ - static Status validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings); + static Status validate(const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); /** Constructor * @@ -92,12 +96,11 @@ public: * @param[in] attributes Component attributes * @param[in] settings Component settings */ - ClComponentPool2d( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings); + ClComponentPool2d(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); /** Destructor */ ~ClComponentPool2d() override; @@ -113,20 +116,28 @@ public: /** Allow instances of this class to be moved */ ClComponentPool2d &operator=(ClComponentPool2d &&component) = default; - +#ifndef ACL_INTERNAL_TEST_CKW_IN_DF /** Get template writer for the component */ const IGpuTemplateComponentWriter *template_writer() const override; +#else // ACL_INTERNAL_TEST_CKW_IN_DF + /** Get GPU kernel writer for the component */ + const IGpuCkwComponentDriver *ckw_component_driver() const override; +#endif // ACL_INTERNAL_TEST_CKW_IN_DF /** Get component type */ GpuComponentType type() const override { - return GpuComponentType::Unfusable; + return GpuComponentType::Complex; } private: +#ifndef ACL_INTERNAL_TEST_CKW_IN_DF std::unique_ptr _component_writer; +#else // ACL_INTERNAL_TEST_CKW_IN_DF + std::unique_ptr _component_writer; +#endif // ACL_INTERNAL_TEST_CKW_IN_DF }; } // namespace dynamic_fusion } // namespace experimental } // namespace arm_compute -#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTPOOL2D */ +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTPOOL2D_H diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp index 66e2ee695644c249a9c9df2c96d1c51088de67c0..0ece9de97076aa38fb36047c510306e9c62750a7 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp @@ -22,8 +22,10 @@ * SOFTWARE. 
*/ #include "ClComponentReshape.h" + #include "arm_compute/core/Error.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h" @@ -49,12 +51,10 @@ Status ClComponentReshape::validate(const ArgumentPack &tensors) return Status{}; } -ClComponentReshape::ClComponentReshape( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors) - : IGpuKernelComponent{ id, properties, tensors }, - _component_writer{ std::make_unique(id, tensors) } +ClComponentReshape::ClComponentReshape(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors) + : IGpuKernelComponent{id, properties, tensors}, _component_writer{std::make_unique(id, tensors)} { } ClComponentReshape::~ClComponentReshape() diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h index f8d165b4c8d2b9a3d105c70587b51d6f7fc2ac23..78163d660367a0d9e98bc04d7a879a661c499039 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h @@ -73,10 +73,7 @@ public: * @param[in] properties Component properties @ref Properties * @param[in] tensors Tensor arguments to the component */ - ClComponentReshape( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors); + ClComponentReshape(ComponentId id, const Properties &properties, const ArgumentPack &tensors); /** Destructor */ ~ClComponentReshape() override; diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp index 895b854ae2d51f938f1cea555ca3e769ea146123..b05eb046985efa6e7bd7e6a303ab648b3ab50558 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -25,10 +25,16 @@ #include "ClComponentResize.h" #include "arm_compute/core/Error.h" + #include "src/core/CL/CLValidate.h" #include "src/core/utils/ScaleUtils.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" + +#ifndef ACL_INTERNAL_TEST_CKW_IN_DF #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.h" +#else // ACL_INTERNAL_TEST_CKW_IN_DF +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h" +#endif // ACL_INTERNAL_TEST_CKW_IN_DF namespace arm_compute { @@ -36,6 +42,13 @@ namespace experimental { namespace dynamic_fusion { +/** Forward declaration */ +#ifndef ACL_INTERNAL_TEST_CKW_IN_DF +class ClTemplateResize; +#else // ACL_INTERNAL_TEST_CKW_IN_DF +class GpuCkwResize; +#endif // ACL_INTERNAL_TEST_CKW_IN_DF + Status ClComponentResize::validate(const IGpuKernelComponent::Properties &properties, const ArgumentPack &tensors, const ClComponentResize::Attributes &attributes) @@ -53,7 +66,9 @@ Status ClComponentResize::validate(const IGpuKernelComponent::Properties &proper ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); // Align corners and sampling policy conformance - ARM_COMPUTE_RETURN_ERROR_ON(attributes.align_corners() && !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(attributes.sampling_policy())); + ARM_COMPUTE_RETURN_ERROR_ON( + attributes.align_corners() && + !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(attributes.sampling_policy())); // All tensor infos are initialized ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0); @@ -66,8 +81,12 @@ ClComponentResize::ClComponentResize(ComponentId id, const IGpuKernelComponent::Properties &properties, const ArgumentPack &tensors, const ClComponentResize::Attributes &attributes) - : IGpuKernelComponent{ id, properties, tensors }, - _component_writer{ std::make_unique(id, tensors, attributes) } + : IGpuKernelComponent{id, properties, tensors}, +#ifndef ACL_INTERNAL_TEST_CKW_IN_DF + _component_writer{std::make_unique(id, tensors, attributes)} +#else // ACL_INTERNAL_TEST_CKW_IN_DF + _component_writer{std::make_unique(id, tensors, attributes)} +#endif // ACL_INTERNAL_TEST_CKW_IN_DF { } @@ -75,7 +94,11 @@ ClComponentResize::~ClComponentResize() { } +#ifndef ACL_INTERNAL_TEST_CKW_IN_DF const IGpuTemplateComponentWriter *ClComponentResize::template_writer() const +#else // ACL_INTERNAL_TEST_CKW_IN_DF +const IGpuCkwComponentDriver *ClComponentResize::ckw_component_driver() const +#endif // ACL_INTERNAL_TEST_CKW_IN_DF { return _component_writer.get(); } diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h index 87d5a61ce33913d8af47780afd9d1de9ff40d53c..29276c3257a6e05fe40832804900b04b42dab0c3 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,6 +26,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESIZE #include "arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h" + #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" namespace arm_compute @@ -41,7 +42,11 @@ template class ArgumentPack; /** Forward declaration */ +#ifndef ACL_INTERNAL_TEST_CKW_IN_DF class ClTemplateResize; +#else // ACL_INTERNAL_TEST_CKW_IN_DF +class GpuCkwResize; +#endif // ACL_INTERNAL_TEST_CKW_IN_DF class ClComponentResize final : public IGpuKernelComponent { @@ -78,10 +83,8 @@ public: * |U8 |U8 | * |S16 |S16 | */ - static Status validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes); + static Status + validate(const Properties &properties, const ArgumentPack &tensors, const Attributes &attributes); /** Constructor * @@ -107,8 +110,12 @@ public: /** Allow instances of this class to be moved */ ClComponentResize &operator=(ClComponentResize &&component) = default; - /** Get template writer for the component */ + /** Get writer for the component */ +#ifndef ACL_INTERNAL_TEST_CKW_IN_DF const IGpuTemplateComponentWriter *template_writer() const override; +#else // ACL_INTERNAL_TEST_CKW_IN_DF + const IGpuCkwComponentDriver *ckw_component_driver() const override; +#endif // ACL_INTERNAL_TEST_CKW_IN_DF /** Get component type */ GpuComponentType type() const override @@ -117,7 +124,11 @@ public: } private: +#ifndef ACL_INTERNAL_TEST_CKW_IN_DF std::unique_ptr _component_writer; +#else // ACL_INTERNAL_TEST_CKW_IN_DF + std::unique_ptr _component_writer; +#endif // ACL_INTERNAL_TEST_CKW_IN_DF }; } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp index 12b81c3d568c0b4167cec46ec93322dc8fbd8e24..dcbecaff3507f3cd59c1293f1356f45a33d9f143 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp @@ -38,25 +38,19 @@ namespace experimental { namespace dynamic_fusion { -Status ClComponentStore::validate( - const Properties &properties, - const ArgumentPack &tensors) +Status ClComponentStore::validate(const Properties &properties, const ArgumentPack &tensors) { ARM_COMPUTE_UNUSED(properties, tensors); return Status{}; } -ClComponentStore::ClComponentStore(ComponentId id, const Properties &properties, const ArgumentPack &tensors) - : IGpuKernelComponent{ id, properties, tensors }, +ClComponentStore::ClComponentStore(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors) + : IGpuKernelComponent{id, properties, tensors}, #ifndef ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer -{ - std::make_unique(id, tensors) -} + _component_writer{std::make_unique(id, tensors)} #else //ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer -{ - std::make_unique(id, tensors) -} + _component_writer{std::make_unique(id, tensors)} #endif //ACL_INTERNAL_TEST_CKW_IN_DF { } diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h index 853ee39012bcbc00ce12b683c4943eb8f3b6768d..948785c4809f732afe08b1de42cfc359cb7b8f7a 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE 
#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" + #include namespace arm_compute @@ -70,9 +71,7 @@ public: * |:--------------|:--------------| * |All |All | */ - static Status validate( - const Properties &properties, - const ArgumentPack &tensors); + static Status validate(const Properties &properties, const ArgumentPack &tensors); /** Constructor * * Similar to @ref ClComponentStore::validate() diff --git a/src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h index bc7133f4df8fac9d7e0811ad9cfad68bd75924a8..4c3e84e59da574d0e10b4251fecc913cf75ce962 100644 --- a/src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h +++ b/src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h @@ -46,18 +46,16 @@ using namespace experimental::dynamic_fusion; */ inline ::std::ostream &operator<<(::std::ostream &os, const ClComponentElementwiseBinary::Attributes::ElementwiseOp &op) { - const std::map op_name = - { - { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Add, "add" }, - { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Div, "div" }, - { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Max, "max" }, - { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Min, "min" }, - { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Mul, "mul" }, - { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Power, "power" }, - { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Prelu, "prelu" }, - { ClComponentElementwiseBinary::Attributes::ElementwiseOp::SquaredDiff, "squareddiff" }, - { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Sub, "sub" } - }; + const std::map op_name = { + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Add, "add"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Div, "div"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Max, "max"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Min, "min"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Mul, "mul"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Power, "power"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Prelu, "prelu"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::SquaredDiff, "squareddiff"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Sub, "sub"}}; os << op_name.at(op); return os; } diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp index e7ee1c10dfff533e50623b0920750a420a8a64a4..2cec67dc6544f75da841ec1cad1810ad9cd7ea0c 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h" + #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" #include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h" @@ -32,12 +33,11 @@ namespace experimental { namespace dynamic_fusion { -Status GpuAdd::validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *lhs, - const ITensorInfo *rhs) +Status GpuAdd::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *lhs, const ITensorInfo *rhs) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, DataType::S16, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, + DataType::S16, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type"); // Set the elementwise operation to Add then call the elementwise common validate_op @@ -46,12 +46,11 @@ Status GpuAdd::validate_op(const GpuWorkloadSketch &sketch, return GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, common_attributes); } -Status GpuAdd::is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *lhs, - const ITensorInfo *rhs) +Status GpuAdd::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, DataType::S16, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, + DataType::S16, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type"); // Set the elementwise operation to Add then call the elementwise common is_supported_op @@ -60,9 +59,7 @@ Status GpuAdd::is_supported_op(const GpuWorkloadContext &context, return GpuElementwiseBinaryCommon::is_supported_op(context, lhs, rhs, common_attributes); } -ITensorInfo *GpuAdd::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *lhs, - ITensorInfo *rhs) +ITensorInfo *GpuAdd::create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs) { // No need to log or validate as they'll be handled inside GpuElementwiseBinaryCommon::create_op() // Set the elementwise operation to Add then call the elementwise common create_op diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp index 33c2d43e07d70d040b6f19022c2cd31d64481df4..6f35e66ea8197a966eaf89c2096c345baf474a2b 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp @@ -23,12 +23,11 @@ */ #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h" +#include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h" - -#include "src/common/utils/Log.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" namespace arm_compute { @@ -49,7 +48,7 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, TensorInfo dst_info_to_validate; const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; - 
if(dst != nullptr) + if (dst != nullptr) { dst_info_to_validate_ptr = dst; } @@ -58,25 +57,22 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, // Check support level // Data Type - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, - 1, - DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S16, - DataType::U16, DataType::U32, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst_info_to_validate_ptr, - 1, - DataType::U8, DataType::S8, DataType::QASYMM8, DataType::S16, - DataType::U16, DataType::U32, DataType::S32, DataType::F16, - DataType::F32); - - if(context.gpu_language() == GpuLanguage::OpenCL) + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + src, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, + DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst_info_to_validate_ptr, 1, DataType::U8, DataType::S8, + DataType::QASYMM8, DataType::S16, DataType::U16, DataType::U32, + DataType::S32, DataType::F16, DataType::F32); + + if (context.gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = context.cl_compile_context(); ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); // Validate Cast Component { - const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); - auto settings = ClComponentCast::Settings(); + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + auto settings = ClComponentCast::Settings(); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, src); @@ -94,16 +90,13 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, constexpr GpuOperatorType operator_type = GpuOperatorType::Simple; } // namespace -Status GpuCast::is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *src, - const CastAttributes &attributes) +Status +GpuCast::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const CastAttributes &attributes) { return is_supported_op_helper(context, src, nullptr, attributes); } -Status GpuCast::validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *src, - const CastAttributes &attributes) +Status GpuCast::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const CastAttributes &attributes) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id()); @@ -127,9 +120,7 @@ Status GpuCast::validate_op(const GpuWorkloadSketch &sketch, return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes); } -ITensorInfo *GpuCast::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - const CastAttributes &attributes) +ITensorInfo *GpuCast::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const CastAttributes &attributes) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, attributes); @@ -145,14 +136,15 @@ ITensorInfo *GpuCast::create_op(GpuWorkloadSketch &sketch, GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph(); const auto *sketch_ctx = sketch.implementation().context(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { ARM_COMPUTE_ERROR_ON(sketch_ctx->cl_compile_context() == nullptr); 
// Add Depthwise Conv2d Component { - const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); - auto settings = ClComponentCast::Settings(); + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + auto settings = ClComponentCast::Settings(); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, src); diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp index 89b533c9b86abad028faf739c40beb080835a7d7..697b7d4e1fa69b9971e82dddf41f7f8976f20921 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp @@ -25,14 +25,13 @@ #include "arm_compute/core/experimental/Types.h" +#include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace experimental @@ -48,12 +47,13 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(attributes.max_val() < attributes.min_val(), "Maximum clamp value cannot be lower than minimum value"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(attributes.max_val() < attributes.min_val(), + "Maximum clamp value cannot be lower than minimum value"); TensorInfo dst_info_to_validate; const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; - if(dst != nullptr) + if (dst != nullptr) { dst_info_to_validate_ptr = dst; } @@ -61,16 +61,15 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, auto_init_if_empty(dst_info_to_validate, *src->clone()); // CLAMP operator is implemented as LU_BOUNDED_RELU with the alpha and beta variables swapped - const ClComponentActivation::Attributes act_info - { - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, attributes.max_val(), attributes.min_val() - }; + const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + attributes.max_val(), attributes.min_val()}; // Check components - if(context.gpu_language() == GpuLanguage::OpenCL) + if (context.gpu_language() == GpuLanguage::OpenCL) { // Validate Activation Component - const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC, src); @@ -87,16 +86,13 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, constexpr GpuOperatorType operator_type = GpuOperatorType::Simple; } // namespace -Status GpuClamp::is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *src, - const ClampAttributes &attributes) +Status +GpuClamp::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const ClampAttributes &attributes) { return is_supported_op_helper(context, src, nullptr, attributes); } 
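// Editorial sketch (not part of the patch; helper name and values are hypothetical): GpuClamp
// reuses the existing LU_BOUNDED_RELU activation, defined as f(x) = min(alpha, max(beta, x)),
// so passing alpha = attributes.max_val() and beta = attributes.min_val() yields
// clamp(x, min_val, max_val). A standalone illustration (needs <algorithm> for std::min/std::max):
static float clamp_via_lu_bounded_relu(float x, float min_val, float max_val)
{
    const float alpha = max_val; // upper bound
    const float beta  = min_val; // lower bound
    return std::min(alpha, std::max(beta, x));
}
// clamp_via_lu_bounded_relu(7.5f, 0.0f, 6.0f) == 6.0f
// clamp_via_lu_bounded_relu(-1.0f, 0.0f, 6.0f) == 0.0f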
-Status GpuClamp::validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *src, - const ClampAttributes &attributes) +Status GpuClamp::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const ClampAttributes &attributes) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); @@ -121,9 +117,7 @@ Status GpuClamp::validate_op(const GpuWorkloadSketch &sketch, return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes); } -ITensorInfo *GpuClamp::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - const ClampAttributes &attributes) +ITensorInfo *GpuClamp::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const ClampAttributes &attributes) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, attributes); @@ -139,18 +133,16 @@ ITensorInfo *GpuClamp::create_op(GpuWorkloadSketch &sketch, GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph(); // CLAMP operator is implemented as LU_BOUNDED_RELU with the alpha and beta variables swapped - const ClComponentActivation::Attributes act_info - { - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, attributes.max_val(), attributes.min_val() - }; + const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + attributes.max_val(), attributes.min_val()}; const auto *const sketch_ctx = sketch.implementation().context(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { // Add Activation Component auto properties = IGpuKernelComponent::Properties(); - properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC, src); diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp index cb270ed4b06249f3313c627b159d09d65f283d91..aaeec543f8b23567c0fa4d3ceff90a077492ecc0 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp @@ -24,15 +24,15 @@ #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h" #include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include "src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h" #include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h" @@ -45,24 +45,30 @@ namespace dynamic_fusion { namespace { -DirectConvComputeKernelInfo config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo +config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info) { // Get GPU target GPUTarget gpu_target = CLScheduler::get().target(); - std::unique_ptr t = 
arm_compute::cl_direct_conv::ClDirectConvKernelConfigurationFactory::create(gpu_target); + std::unique_ptr t = + arm_compute::cl_direct_conv::ClDirectConvKernelConfigurationFactory::create(gpu_target); return t->configure(src, weights, conv_info); } -void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *src, const ITensorInfo *wei, const Conv2dAttributes &attributes) +void calculate_and_init_dst_if_empty(ITensorInfo *dst, + const ITensorInfo *src, + const ITensorInfo *wei, + const Conv2dAttributes &attributes) { - if(dst->total_size() == 0U) + if (dst->total_size() == 0U) { - const auto shape = misc::shape_calculator::compute_deep_convolution_shape(src->tensor_shape(), src->data_layout(), wei->tensor_shape(), - PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, - attributes.pad().right, - attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR)); // use the default DimensionRoundingType + const auto shape = misc::shape_calculator::compute_deep_convolution_shape( + src->tensor_shape(), src->data_layout(), wei->tensor_shape(), + PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, + attributes.pad().right, attributes.pad().top, attributes.pad().bottom, + DimensionRoundingType::FLOOR)); // use the default DimensionRoundingType auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape)); } @@ -83,7 +89,7 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, TensorInfo dst_info_to_validate; const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; - if(dst != nullptr) + if (dst != nullptr) { dst_info_to_validate_ptr = dst; } @@ -98,18 +104,20 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, // Check components const auto gpu_target = context.gpu_target(); - if(context.gpu_language() == GpuLanguage::OpenCL) + if (context.gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = context.cl_compile_context(); ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); // Validate Direct Conv2d Component { - const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); - auto settings = ClComponentDirectConv2d::Settings(); + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + auto settings = ClComponentDirectConv2d::Settings(); settings.fast_relaxed_math( - (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) - && (dst_info_to_validate_ptr->data_type() == DataType::F32 || dst_info_to_validate_ptr->data_type() == DataType::F16)); + (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) && + (dst_info_to_validate_ptr->data_type() == DataType::F32 || + dst_info_to_validate_ptr->data_type() == DataType::F16)); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, src); @@ -142,14 +150,14 @@ Status GpuConv2d::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const ITensorInfo *wei, const ITensorInfo *bia, - const Conv2dAttributes &attributes) + const Conv2dAttributes &attributes) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei); ARM_COMPUTE_RETURN_ERROR_ON_MSG(!wei->are_values_constant(), "Dynamic weights are not supported"); // Check if tensors have valid id. I.e. 
they are created from a sketch ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id() || !wei->has_valid_id()); - if(bia != nullptr) + if (bia != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(!bia->has_valid_id()); } @@ -178,16 +186,13 @@ Status GpuConv2d::validate_op(const GpuWorkloadSketch &sketch, return is_supported_op_helper(*sketch.gpu_context(), src, wei, bia, &dst_info_to_validate, attributes); } -ITensorInfo *GpuConv2d::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - ITensorInfo *wei, - ITensorInfo *bia, - const Conv2dAttributes &attributes) +ITensorInfo *GpuConv2d::create_op( + GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *wei, ITensorInfo *bia, const Conv2dAttributes &attributes) { ARM_COMPUTE_LOG_PARAMS(src, wei, bia, attributes); PadStrideInfo conv_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, - attributes.pad().right, - attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR); + attributes.pad().right, attributes.pad().top, attributes.pad().bottom, + DimensionRoundingType::FLOOR); // Initialize the direct convolution descriptor const DirectConvComputeKernelInfo desc = config_direct_convolution_nhwc(src, wei, conv_info); @@ -207,7 +212,7 @@ ITensorInfo *GpuConv2d::create_op(GpuWorkloadSketch &sketch, const auto gpu_target = sketch_ctx->gpu_target(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = sketch_ctx->cl_compile_context(); ARM_COMPUTE_ERROR_ON(cl_compile_ctx == nullptr); @@ -216,17 +221,17 @@ ITensorInfo *GpuConv2d::create_op(GpuWorkloadSketch &sketch, // Add Direct Conv2d Component { auto properties = IGpuKernelComponent::Properties(); - properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); auto settings = ClComponentDirectConv2d::Settings(); settings.fast_relaxed_math( - (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) - && (dst->data_type() == DataType::F32 || dst->data_type() == DataType::F16)); + (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) && + (dst->data_type() == DataType::F32 || dst->data_type() == DataType::F16)); settings.direct_conv_descriptor(desc); - if(settings.export_to_cl_image()) + if (settings.export_to_cl_image()) { arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(wei); } diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp index c72098e9439ee0f2450a0e267f80760920f9a4dc..e2b673bd4388988d75eca37d72aa9128c15c24c2 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp @@ -28,8 +28,8 @@ #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h" #include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" @@ -42,20 +42,20 @@ namespace dynamic_fusion { namespace { -void calculate_and_init_dst_if_empty(ITensorInfo 
*dst, const ITensorInfo *src, const ITensorInfo *wei, const DepthwiseConv2dAttributes &attributes) +void calculate_and_init_dst_if_empty(ITensorInfo *dst, + const ITensorInfo *src, + const ITensorInfo *wei, + const DepthwiseConv2dAttributes &attributes) { - if(dst->total_size() == 0U) + if (dst->total_size() == 0U) { - const PadStrideInfo pad_stride_info(attributes.stride().x(), - attributes.stride().y(), - attributes.pad().left, - attributes.pad().right, - attributes.pad().top, - attributes.pad().bottom, + const PadStrideInfo pad_stride_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, + attributes.pad().right, attributes.pad().top, attributes.pad().bottom, attributes.dimension_rounding_type()); - const ConvolutionInfo conv_info{ pad_stride_info, attributes.depth_multiplier(), ActivationLayerInfo(), attributes.dilation() }; - const TensorShape shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *wei, conv_info); + const ConvolutionInfo conv_info{pad_stride_info, attributes.depth_multiplier(), ActivationLayerInfo(), + attributes.dilation()}; + const TensorShape shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *wei, conv_info); auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape)); } @@ -76,7 +76,7 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, TensorInfo dst_info_to_validate; const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; - if(dst != nullptr) + if (dst != nullptr) { dst_info_to_validate_ptr = dst; } @@ -91,40 +91,44 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, const GpuTarget gpu_target = context.gpu_target(); - if(context.gpu_language() == GpuLanguage::OpenCL) + if (context.gpu_language() == GpuLanguage::OpenCL) { const CLCompileContext *cl_compile_ctx = context.cl_compile_context(); ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); // Validate Depthwise Conv2d Component { - const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); - auto settings = ClComponentDepthwiseConv2d::Settings(); + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + auto settings = ClComponentDepthwiseConv2d::Settings(); - const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, - attributes.pad().right, - attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR); + const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(), + attributes.pad().left, attributes.pad().right, attributes.pad().top, + attributes.pad().bottom, DimensionRoundingType::FLOOR); // Get the depthwise convolution compute parameters - auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target); - const DWCComputeKernelInfo dwc_info = t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier()); + auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target); + const DWCComputeKernelInfo dwc_info = + t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier()); settings.fast_relaxed_math( - (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) - && (dst_info_to_validate_ptr->data_type() == DataType::F32 || dst_info_to_validate_ptr->data_type() == DataType::F16)); + (gpu_target != GPUTarget::G71 && (gpu_target 
& GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) && + (dst_info_to_validate_ptr->data_type() == DataType::F32 || + dst_info_to_validate_ptr->data_type() == DataType::F16)); settings.is_fma_available(get_arch_from_target(gpu_target) == GPUTarget::MIDGARD) - .m0(dwc_info.m0) - .n0(dwc_info.n0) - .export_input_to_cl_image(dwc_info.export_input_to_cl_image) - .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image); + .m0(dwc_info.m0) + .n0(dwc_info.n0) + .export_input_to_cl_image(dwc_info.export_input_to_cl_image) + .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, src); arguments.add_const_tensor(ACL_SRC_1, wei); arguments.add_const_tensor(ACL_SRC_2, bia); arguments.add_const_tensor(ACL_DST_0, dst_info_to_validate_ptr); - ARM_COMPUTE_RETURN_ON_ERROR(ClComponentDepthwiseConv2d::validate(properties, arguments, attributes, settings)); + ARM_COMPUTE_RETURN_ON_ERROR( + ClComponentDepthwiseConv2d::validate(properties, arguments, attributes, settings)); } } else @@ -158,7 +162,7 @@ Status GpuDepthwiseConv2d::validate_op(const GpuWorkloadSketch &sketch, ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id() || !wei->has_valid_id()); - if(bia != nullptr) + if (bia != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(!bia->has_valid_id()); } @@ -205,35 +209,37 @@ ITensorInfo *GpuDepthwiseConv2d::create_op(GpuWorkloadSketch &sket const auto *sketch_ctx = sketch.implementation().context(); const GpuTarget gpu_target = sketch_ctx->gpu_target(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { ARM_COMPUTE_ERROR_ON_NULLPTR(sketch_ctx->cl_compile_context()); // Add Depthwise Conv2d Component { - const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); - auto settings = ClComponentDepthwiseConv2d::Settings(); + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + auto settings = ClComponentDepthwiseConv2d::Settings(); - const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, - attributes.pad().right, - attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR); + const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(), + attributes.pad().left, attributes.pad().right, attributes.pad().top, + attributes.pad().bottom, DimensionRoundingType::FLOOR); // Get the depthwise convolution compute parameters - auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target); - const DWCComputeKernelInfo dwc_info = t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier()); + auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target); + const DWCComputeKernelInfo dwc_info = + t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier()); settings.is_fma_available(get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) - .m0(dwc_info.m0) - .n0(dwc_info.n0) - .export_input_to_cl_image(dwc_info.export_input_to_cl_image) - .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image); + .m0(dwc_info.m0) + .n0(dwc_info.n0) + .export_input_to_cl_image(dwc_info.export_input_to_cl_image) + .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image); - if(settings.export_input_to_cl_image()) + if (settings.export_input_to_cl_image()) { 
arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(src); } - if(settings.export_weights_to_cl_image()) + if (settings.export_weights_to_cl_image()) { arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(wei); } diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e24629a0366219c9ce7f1549ba0e30a48dd5768e --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef ACL_INTERNAL_TEST_CKW_IN_DF + +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMatMul.h" + +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +void calculate_and_init_dst_if_empty(ITensorInfo *dst, + const ITensorInfo *lhs, + const ITensorInfo *rhs, + const MatMulAttributes &attributes, + const GpuMatMulSettings &settings) +{ + ARM_COMPUTE_UNUSED(attributes); + + if (dst->total_size() == 0U) + { + const auto dst_shape = misc::shape_calculator::compute_matmul_shape( + lhs->tensor_shape(), rhs->tensor_shape(), + MatMulKernelInfo(attributes.adj_lhs(), attributes.adj_rhs(), settings.m0(), settings.n0(), settings.k0())); + + auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(dst_shape)); + } +} + +/* A helper method to reduce the duplication in dst tensor initialization +* when calling validate() +*/ +Status is_supported_op_helper(const GpuWorkloadContext &context, + const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulAttributes &attributes, + const GpuMatMulSettings &settings) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs); + + TensorInfo dst_info_to_validate; + const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; + + if (dst != nullptr) + { + dst_info_to_validate_ptr = dst; + } + + 
calculate_and_init_dst_if_empty(&dst_info_to_validate, lhs, rhs, attributes, settings); + + // Check support level + // Data type + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32); + + // Check components + if (context.gpu_language() == GpuLanguage::OpenCL) + { + const auto cl_compile_ctx = context.cl_compile_context(); + ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); + // Validate MatMul Component + { + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + + ArgumentPack arguments; + arguments.add_const_tensor(ACL_SRC_0, lhs); + arguments.add_const_tensor(ACL_SRC_1, rhs); + arguments.add_const_tensor(ACL_DST_0, dst_info_to_validate_ptr); + + ARM_COMPUTE_RETURN_ON_ERROR(ClComponentMatMul::validate(properties, arguments, attributes, settings)); + } + } + else + { + ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language"); + } + return Status{}; +} + +constexpr GpuOperatorType operator_type = GpuOperatorType::Complex; +} // namespace + +int GpuMatMulSettings::n0() const +{ + return _n0; +} + +GpuMatMulSettings &GpuMatMulSettings::n0(int n0) +{ + _n0 = n0; + return *this; +} + +int GpuMatMulSettings::m0() const +{ + return _m0; +} + +GpuMatMulSettings &GpuMatMulSettings::m0(int m0) +{ + _m0 = m0; + return *this; +} + +int GpuMatMulSettings::k0() const +{ + return _k0; +} + +GpuMatMulSettings &GpuMatMulSettings::k0(int k0) +{ + _k0 = k0; + return *this; +} + +Status GpuMatMul::is_supported_op(const GpuWorkloadContext &context, + const ITensorInfo *lhs, + const ITensorInfo *rhs, + const MatMulAttributes &attributes, + const GpuMatMulSettings &settings) +{ + return is_supported_op_helper(context, lhs, rhs, nullptr, attributes, settings); +} + +Status GpuMatMul::validate_op(const GpuWorkloadSketch &sketch, + const ITensorInfo *lhs, + const ITensorInfo *rhs, + const MatMulAttributes &attributes, + const GpuMatMulSettings &settings) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs); + + // Check if tensors have valid id. I.e. they are created from a sketch + ARM_COMPUTE_RETURN_ERROR_ON(!lhs->has_valid_id() || !rhs->has_valid_id()); + + // Refer to GpuMatmul::validate_op() for id-validness of this TensorInfo object + TensorInfo dst_info_to_validate; + + // Auto initialize dst tensor info + calculate_and_init_dst_if_empty(&dst_info_to_validate, lhs, rhs, attributes, settings); + + // Perform fusion test + // Check if operator meets fusion constraints + ArgumentPack tensors; + tensors.add_const_tensor(ACL_SRC_0, lhs); + tensors.add_const_tensor(ACL_SRC_1, rhs); + tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate); + const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op), + "Operator fusion test failed. 
This operator cannot be fused into the workload"); + + // Check if configuration is supported + return is_supported_op_helper(*sketch.gpu_context(), lhs, rhs, &dst_info_to_validate, attributes, settings); +} + +ITensorInfo *GpuMatMul::create_op(GpuWorkloadSketch &sketch, + ITensorInfo *lhs, + ITensorInfo *rhs, + const Attributes &attributes, + const Settings &settings) +{ + ARM_COMPUTE_LOG_PARAMS(lhs, rhs, attributes, settings); + + ITensorInfo *dst = sketch.implementation().create_virtual_tensor(); + + // Assert validation + ARM_COMPUTE_ERROR_THROW_ON(GpuMatMul::validate_op(sketch, lhs, rhs, attributes, settings)); + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); + + // Auto initialize dst tensor + calculate_and_init_dst_if_empty(dst, lhs, rhs, attributes, settings); + + // Translate into components and add to component graph + auto &comp_graph = sketch.implementation().component_graph(); + const auto sketch_ctx = sketch.implementation().context(); + + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + { + auto properties = IGpuKernelComponent::Properties(); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + + ArgumentPack arguments; + arguments.add_const_tensor(ACL_SRC_0, lhs); + arguments.add_const_tensor(ACL_SRC_1, rhs); + arguments.add_const_tensor(ACL_DST_0, dst); + comp_graph.add_new_component(properties, arguments, attributes, settings); + } + else + { + ARM_COMPUTE_ERROR("Unimplemented Gpu language"); + } + + // Set up fusion test by adding to the Operator Group + // Note this has to be performed after all the components have been successfully added to the component graph + + // Pack tensor infos + ArgumentPack tensors; + tensors.add_const_tensor(ACL_SRC_0, lhs); + tensors.add_const_tensor(ACL_SRC_1, rhs); + tensors.add_const_tensor(ACL_DST_0, dst); + + const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + sketch.implementation().operator_group().add_operator(op); + + return dst; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif // ACL_INTERNAL_TEST_CKW_IN_DF diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp index 464a32cbad2fd5803c0add2ff29f1ff105a73e0f..b871171e8dd756bab42bd90ef8ba0d73eb8063c8 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h" + #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" #include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h" @@ -32,9 +33,7 @@ namespace experimental { namespace dynamic_fusion { -Status GpuMul::validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *lhs, - const ITensorInfo *rhs) +Status GpuMul::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *lhs, const ITensorInfo *rhs) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32); @@ -46,9 +45,7 @@ Status GpuMul::validate_op(const GpuWorkloadSketch &sketch, return GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, common_attributes); } -Status GpuMul::is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *lhs, - const ITensorInfo *rhs) +Status GpuMul::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32); @@ -60,9 +57,7 @@ Status GpuMul::is_supported_op(const GpuWorkloadContext &context, return GpuElementwiseBinaryCommon::is_supported_op(context, lhs, rhs, common_attributes); } -ITensorInfo *GpuMul::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *lhs, - ITensorInfo *rhs) +ITensorInfo *GpuMul::create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs) { // Set the elementwise operation to Mul then call the elementwise common create_op ElementwiseBinaryCommonAttributes common_attributes{}; diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp index 107a5e5fa7a4a618cd152868f4feef46869925c5..f0d368d75718b3973f4dc62ee8304bd29016fb97 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp @@ -26,10 +26,9 @@ #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" - #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/utils/Utils.h" namespace arm_compute @@ -43,9 +42,7 @@ namespace constexpr GpuOperatorType operator_type = GpuOperatorType::Simple; } // namespace -Status GpuOutput::is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *src, - const ITensorInfo *dst) +Status GpuOutput::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); @@ -60,9 +57,7 @@ Status GpuOutput::is_supported_op(const GpuWorkloadContext &context, return Status{}; } -Status GpuOutput::validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *src, - const ITensorInfo *dst) +Status GpuOutput::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id()); @@ -90,9 +85,7 @@ Status GpuOutput::validate_op(const GpuWorkloadSketch &sketch, return status; } -void GpuOutput::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - ITensorInfo *dst) +void GpuOutput::create_op(GpuWorkloadSketch 
&sketch, ITensorInfo *src, ITensorInfo *dst) { ARM_COMPUTE_LOG_PARAMS(src, dst); ARM_COMPUTE_ERROR_THROW_ON(GpuOutput::validate_op(sketch, src, dst)); @@ -104,14 +97,14 @@ void GpuOutput::create_op(GpuWorkloadSketch &sketch, auto &comp_graph = sketch.implementation().component_graph(); const auto sketch_ctx = sketch.implementation().context(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { ARM_COMPUTE_ERROR_ON(sketch_ctx->cl_compile_context() == nullptr); // Add store component { IGpuKernelComponent::Properties properties; - properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, src); diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp index e464be8607aae8d31e5b8ba6f8680cf6b7080e45..55c604aacc97abe06286fd843218838f567528b8 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp @@ -22,20 +22,21 @@ * SOFTWARE. */ +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h" + #include "arm_compute/core/CL/CLCompileContext.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h" - #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" -#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h" #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h" -#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h" #include "src/dynamic_fusion/utils/Utils.h" namespace arm_compute @@ -46,7 +47,20 @@ namespace dynamic_fusion { namespace { -constexpr GpuOperatorType operator_type = GpuOperatorType::Unfusable; +void calculate_and_init_dst_if_empty(ITensorInfo *dst, + const ITensorInfo *src, + const Pool2dAttributes &attributes, + const GpuPool2dSettings &settings) +{ + if (dst->total_size() == 0U) + { + auto shape = misc::shape_calculator::compute_pool_shape( + *src, convert_pool_attr_to_pool_info(attributes, settings.mixed_precision())); + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape)); + } +} + +constexpr GpuOperatorType operator_type = GpuOperatorType::Complex; } // namespace GpuPool2dSettings &GpuPool2dSettings::mixed_precision(bool mixed_precision) @@ -73,19 +87,16 @@ bool GpuPool2dSettings::use_inf_as_limit() const Status GpuPool2d::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, - const ITensorInfo *dst, - const Pool2dAttributes &attributes, + const Pool2dAttributes &attributes, const GpuPool2dSettings &settings) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id() || !dst->has_valid_id()); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id()); // Auto initialize dst tensor info - TensorInfo dst_info_to_validate = *dst; - { - auto shape = misc::shape_calculator::compute_pool_shape(*src, 
convert_pool_attr_to_pool_info(attributes, settings.mixed_precision())); - auto_init_if_empty(dst_info_to_validate, src->clone()->set_tensor_shape(shape)); - } + TensorInfo dst_info_to_validate; + + calculate_and_init_dst_if_empty(&dst_info_to_validate, src, attributes, settings); // Perform fusion test // Pack tensor infos @@ -98,39 +109,38 @@ Status GpuPool2d::validate_op(const GpuWorkloadSketch &sketch, "Operator fusion test failed. This operator cannot be fused into the workload"); // Check if configuration is supported - return is_supported_op(*sketch.gpu_context(), src, &dst_info_to_validate, attributes, settings); + return is_supported_op(*sketch.gpu_context(), src, attributes, settings); } Status GpuPool2d::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, - const ITensorInfo *dst, const Pool2dAttributes &attributes, - const GpuPool2dSettings &settings) + const GpuPool2dSettings &settings) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); // Data type ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); // Data layout ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); // Check exclude padding is not false - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!attributes.exclude_padding(), "Exclude padding must be set to true in Attributes!"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!attributes.exclude_padding(), + "Exclude padding must be set to true in Attributes!"); // Auto initialize dst tensor info - TensorInfo dst_info_to_validate = *dst; - { - auto shape = misc::shape_calculator::compute_pool_shape(*src, convert_pool_attr_to_pool_info(attributes, settings.mixed_precision())); - auto_init_if_empty(dst_info_to_validate, src->clone()->set_tensor_shape(shape)); - } + TensorInfo dst_info_to_validate; + + calculate_and_init_dst_if_empty(&dst_info_to_validate, src, attributes, settings); // Check components - if(context.gpu_language() == GpuLanguage::OpenCL) + if (context.gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = context.cl_compile_context(); ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); // Validate Component { - const KernelProperties properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + const KernelProperties properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, src); @@ -145,28 +155,27 @@ Status GpuPool2d::is_supported_op(const GpuWorkloadContext &context, return Status{}; } -void GpuPool2d::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - ITensorInfo *dst, - const Pool2dAttributes &attributes, - const GpuPool2dSettings &settings) +ITensorInfo *GpuPool2d::create_op(GpuWorkloadSketch &sketch, + ITensorInfo *src, + const Pool2dAttributes &attributes, + const GpuPool2dSettings &settings) { // Assert validation - ARM_COMPUTE_ERROR_THROW_ON(GpuPool2d::validate_op(sketch, src, dst, attributes, settings)); - ARM_COMPUTE_LOG_PARAMS(src, dst, attributes, settings); + ARM_COMPUTE_ERROR_THROW_ON(GpuPool2d::validate_op(sketch, src, attributes, settings)); + ARM_COMPUTE_LOG_PARAMS(src, attributes, settings); + + ITensorInfo *dst = sketch.implementation().create_virtual_tensor(); + ARM_COMPUTE_ERROR_ON_NULLPTR(dst); // Auto initialize dst tensor - { - auto shape = misc::shape_calculator::compute_pool_shape(*src, convert_pool_attr_to_pool_info(attributes, 
settings.mixed_precision())); // use the default DimensionRoundingType - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape)); - } + calculate_and_init_dst_if_empty(dst, src, attributes, settings); // Translate into components and add to component graph auto &comp_graph = sketch.implementation().component_graph(); const auto sketch_ctx = sketch.implementation().context(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = sketch_ctx->cl_compile_context(); ARM_COMPUTE_UNUSED(cl_compile_ctx); @@ -175,7 +184,7 @@ void GpuPool2d::create_op(GpuWorkloadSketch &sketch, // Add Component { auto properties = IGpuKernelComponent::Properties(); - properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, src); @@ -198,6 +207,8 @@ void GpuPool2d::create_op(GpuWorkloadSketch &sketch, const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors); sketch.implementation().operator_group().add_operator(op); + + return dst; } } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp index 0f43a578dff0d7e5b1e129e408561ec15f3e18ca..3def7a1a81c5b6c2057bc27b53bccc3288e14f92 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp @@ -22,12 +22,14 @@ * SOFTWARE. */ #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuReshape.h" + #include "arm_compute/core/Error.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" namespace arm_compute { @@ -40,14 +42,14 @@ namespace Status is_supported_op_helper(const GpuWorkloadContext &context, const ITensorInfo *src, const ITensorInfo *dst, - const ReshapeAttributes &attributes) + const ReshapeAttributes &attributes) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); TensorInfo dst_info_to_validate; const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; - if(dst != nullptr) + if (dst != nullptr) { dst_info_to_validate_ptr = dst; } @@ -55,7 +57,7 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, auto_init_if_empty(dst_info_to_validate, src->clone()->set_tensor_shape(attributes.shape())); // Check components - if(context.gpu_language() == GpuLanguage::OpenCL) + if (context.gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = context.cl_compile_context(); ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); @@ -78,16 +80,13 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, GpuOperatorType operator_type = GpuOperatorType::Complex; } // namespace -Status GpuReshape::is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *src, - const Attributes &attributes) +Status +GpuReshape::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const Attributes &attributes) { return is_supported_op_helper(context, src, nullptr, attributes); } -Status GpuReshape::validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *src, - const Attributes &attributes) 
+Status GpuReshape::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const Attributes &attributes) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id()); @@ -111,9 +110,7 @@ Status GpuReshape::validate_op(const GpuWorkloadSketch &sketch, return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes); } -ITensorInfo *GpuReshape::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - const Attributes &attributes) +ITensorInfo *GpuReshape::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const Attributes &attributes) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, attributes.shape()); @@ -127,7 +124,7 @@ ITensorInfo *GpuReshape::create_op(GpuWorkloadSketch &sketch, // Translate into components and add to component graph auto &comp_graph = sketch.implementation().component_graph(); const auto sketch_ctx = sketch.implementation().context(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = sketch_ctx->cl_compile_context(); ARM_COMPUTE_UNUSED(cl_compile_ctx); @@ -136,7 +133,7 @@ ITensorInfo *GpuReshape::create_op(GpuWorkloadSketch &sketch, // Add ElementwiseBinary Component { auto properties = IGpuKernelComponent::Properties(); - properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, src); diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp index 5f52eea7d07c4137b5f5f437e41782044792a5b9..fb09875b337990e3841d88471fc88b0b86cf0af3 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp @@ -26,12 +26,12 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h" - -#include "src/common/utils/Log.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" namespace arm_compute { @@ -43,7 +43,7 @@ namespace { void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *src, const ResizeAttributes &attributes) { - if(dst->total_size() == 0U) + if (dst->total_size() == 0U) { TensorShape out_shape = src->tensor_shape(); @@ -64,7 +64,7 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, TensorInfo dst_info_to_validate; const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; - if(dst != nullptr) + if (dst != nullptr) { dst_info_to_validate_ptr = dst; } @@ -73,22 +73,25 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, // Check support level // Data type - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::U8, DataType::S16, DataType::F16, DataType::F32); // Data layout ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); // Interpolation policy - 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(attributes.interpolation_policy() != InterpolationPolicy::NEAREST_NEIGHBOR && attributes.interpolation_policy() != InterpolationPolicy::BILINEAR, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(attributes.interpolation_policy() != InterpolationPolicy::NEAREST_NEIGHBOR && + attributes.interpolation_policy() != InterpolationPolicy::BILINEAR, "Interpolation policy must be NEAREST_NEIGHBOR or BILINEAR"); // Check components - if(context.gpu_language() == GpuLanguage::OpenCL) + if (context.gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = context.cl_compile_context(); ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); // Validate Activation Component { - const KernelProperties properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + const KernelProperties properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, src); @@ -107,16 +110,14 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, constexpr GpuOperatorType operator_type = GpuOperatorType::Complex; } // namespace -Status GpuResize::is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *src, - const Attributes &attributes) +Status +GpuResize::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const Attributes &attributes) { return is_supported_op_helper(context, src, nullptr, attributes); } -Status GpuResize::validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *src, - const GpuResize::Attributes &attributes) +Status +GpuResize::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const GpuResize::Attributes &attributes) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id()); @@ -141,9 +142,7 @@ Status GpuResize::validate_op(const GpuWorkloadSketch &sketch, return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes); } -ITensorInfo *GpuResize::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - const GpuResize::Attributes &attributes) +ITensorInfo *GpuResize::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const GpuResize::Attributes &attributes) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, attributes); @@ -159,13 +158,14 @@ ITensorInfo *GpuResize::create_op(GpuWorkloadSketch &sketch, GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph(); const auto *sketch_ctx = sketch.implementation().context(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { ARM_COMPUTE_ERROR_ON_NULLPTR(sketch_ctx->cl_compile_context()); // Add Resize Component { - const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, src); diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp index 09debad9692468ec7b6debb706210ec3083d50ce..a2260c8c36ea2852a5571e345753d0e4afe7f2b2 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp @@ -23,14 +23,15 @@ */ #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.h" 
+ #include "arm_compute/core/experimental/Types.h" #include "arm_compute/function_info/ActivationLayerInfo.h" #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" namespace arm_compute { @@ -40,9 +41,7 @@ namespace dynamic_fusion { namespace { -Status is_supported_op_helper(const GpuWorkloadContext &context, - const ITensorInfo *src, - const ITensorInfo *dst) +Status is_supported_op_helper(const GpuWorkloadContext &context, const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); @@ -50,20 +49,21 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, TensorInfo dst_info_to_validate; const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; - if(dst != nullptr) + if (dst != nullptr) { dst_info_to_validate_ptr = dst; } auto_init_if_empty(dst_info_to_validate, *src->clone()); - const ClComponentActivation::Attributes act_info{ ActivationLayerInfo::ActivationFunction::LOGISTIC }; + const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LOGISTIC}; // Check components - if(context.gpu_language() == GpuLanguage::OpenCL) + if (context.gpu_language() == GpuLanguage::OpenCL) { // Validate Activation Component - const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC, src); @@ -80,14 +80,12 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, constexpr GpuOperatorType operator_type = GpuOperatorType::Simple; } // namespace -Status GpuSigmoid::is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *src) +Status GpuSigmoid::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src) { return is_supported_op_helper(context, src, nullptr); } -Status GpuSigmoid::validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *src) +Status GpuSigmoid::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); @@ -112,8 +110,7 @@ Status GpuSigmoid::validate_op(const GpuWorkloadSketch &sketch, return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate); } -ITensorInfo *GpuSigmoid::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src) +ITensorInfo *GpuSigmoid::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src); @@ -128,15 +125,15 @@ ITensorInfo *GpuSigmoid::create_op(GpuWorkloadSketch &sketch, // Translate into components and add to component graph GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph(); - const ClComponentActivation::Attributes act_info{ ActivationLayerInfo::ActivationFunction::LOGISTIC }; + const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LOGISTIC}; const auto *const sketch_ctx = sketch.implementation().context(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { 
// Add Activation Component auto properties = IGpuKernelComponent::Properties(); - properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC, src); diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp index ffc4553a7db5433bcce0f4a8076bbb654ec7afda..c87b282aecb28977486c95be720d87c9256a7a25 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp @@ -22,13 +22,14 @@ * SOFTWARE. */ #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.h" + #include "arm_compute/core/Error.h" -#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h" -#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h" #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h" #include "src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h" #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" @@ -52,7 +53,7 @@ Status GpuSoftmax::is_supported_op(const GpuWorkloadContext &context, TensorInfo dst_info_to_validate; // Auto initialize dst tensor info - if(dst != nullptr) + if (dst != nullptr) { dst_info_to_validate = *dst; } @@ -61,11 +62,12 @@ Status GpuSoftmax::is_supported_op(const GpuWorkloadContext &context, auto_init_if_empty(dst_info_to_validate, *src->clone()); } // Check components - if(context.gpu_language() == GpuLanguage::OpenCL) + if (context.gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = context.cl_compile_context(); ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); - const KernelProperties properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + const KernelProperties properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); TensorShape logits_sum_shape = src->tensor_shape(); TensorInfo logits(src->clone()->set_tensor_shape(logits_sum_shape)); @@ -86,7 +88,8 @@ Status GpuSoftmax::is_supported_op(const GpuWorkloadContext &context, arguments_norm.add_const_tensor(ACL_SRC_1, &sum); arguments_norm.add_const_tensor(ACL_DST_0, &dst_info_to_validate); - ARM_COMPUTE_RETURN_ON_ERROR(ClComponentLogits1DMaxShiftExpSum::validate(properties, arguments_exp_sum, attributes)); + ARM_COMPUTE_RETURN_ON_ERROR( + ClComponentLogits1DMaxShiftExpSum::validate(properties, arguments_exp_sum, attributes)); ARM_COMPUTE_RETURN_ON_ERROR(ClComponentLogits1DNorm::validate(properties, arguments_norm, attributes)); } else @@ -105,14 +108,16 @@ Status GpuSoftmax::validate_op(const GpuWorkloadSketch &sketch, ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id() || !dst->has_valid_id()); ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 4, "Only up to 4 dimensions are supported"); - ARM_COMPUTE_RETURN_ERROR_ON(attributes.axis() < static_cast(-src->num_dimensions()) || static_cast(src->num_dimensions()) <= attributes.axis()); + ARM_COMPUTE_RETURN_ERROR_ON(attributes.axis() < static_cast(-src->num_dimensions()) || + static_cast(src->num_dimensions()) <= 
attributes.axis()); // Auto initialize dst tensor info TensorInfo dst_info_to_validate = *dst; auto_init_if_empty(dst_info_to_validate, *src->clone()); - const size_t actual_axis = static_cast<size_t>(wrap_around(attributes.axis(), static_cast<int32_t>(src->num_dimensions()))); - const bool needs_permute = actual_axis != 0; + const size_t actual_axis = + static_cast<size_t>(wrap_around(attributes.axis(), static_cast<int32_t>(src->num_dimensions()))); + const bool needs_permute = actual_axis != 0; ARM_COMPUTE_RETURN_ERROR_ON_MSG(needs_permute, "Dynamic fusion softmax on axis!=0 not supported yet."); // Perform fusion test and check if the operator meets the fusion constraints @@ -128,17 +133,16 @@ Status GpuSoftmax::validate_op(const GpuWorkloadSketch &sketch, return is_supported_op(*sketch.gpu_context(), src, &dst_info_to_validate, attributes); } -void GpuSoftmax::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - ITensorInfo *dst, - const Attributes &attributes) +void GpuSoftmax::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *dst, const Attributes &attributes) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_LOG_PARAMS(src, dst, attributes); TensorShape logits_sum_shape = src->tensor_shape(); - ITensorInfo *logits = sketch.implementation().create_auxiliary_tensor(src->clone()->set_id(ITensorInfo::invalid_tensor_id).set_tensor_shape(logits_sum_shape)); + ITensorInfo *logits = sketch.implementation().create_auxiliary_tensor( + src->clone()->set_id(ITensorInfo::invalid_tensor_id).set_tensor_shape(logits_sum_shape)); logits_sum_shape.set(0, 1); - ITensorInfo *sum = sketch.implementation().create_auxiliary_tensor(src->clone()->set_id(ITensorInfo::invalid_tensor_id).set_tensor_shape(logits_sum_shape)); + ITensorInfo *sum = sketch.implementation().create_auxiliary_tensor( + src->clone()->set_id(ITensorInfo::invalid_tensor_id).set_tensor_shape(logits_sum_shape)); // Auto initialize dst tensor info and the auxiliary tensor infos as well auto_init_if_empty(*dst, *src->clone()); @@ -151,7 +155,7 @@ void GpuSoftmax::create_op(GpuWorkloadSketch &sketch, auto &comp_graph = sketch.implementation().component_graph(); const auto sketch_ctx = sketch.implementation().context(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = sketch_ctx->cl_compile_context(); ARM_COMPUTE_UNUSED(cl_compile_ctx); @@ -160,7 +164,7 @@ void GpuSoftmax::create_op(GpuWorkloadSketch &sketch, // Add Direct Conv2d Component { auto properties = IGpuKernelComponent::Properties(); - properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack<ITensorInfo> arguments_exp_sum; ArgumentPack<ITensorInfo> arguments_norm; diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp index 8240008f2aa94f3f94905740d36e5c80268f516c..e5d62c993029c30d9221cb5f8da54fde138e7add 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp @@ -22,6 +22,7 @@ * SOFTWARE.
*/ #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h" + #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" #include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h" @@ -32,12 +33,11 @@ namespace experimental { namespace dynamic_fusion { -Status GpuSub::validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *lhs, - const ITensorInfo *rhs) +Status GpuSub::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *lhs, const ITensorInfo *rhs) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, DataType::S16, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, + DataType::S16, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type"); // Set the elementwise operation to Sub then call the elementwise common validate_op @@ -46,12 +46,11 @@ Status GpuSub::validate_op(const GpuWorkloadSketch &sketch, return GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, common_attributes); } -Status GpuSub::is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *lhs, - const ITensorInfo *rhs) +Status GpuSub::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, DataType::S16, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, + DataType::S16, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type"); // Set the elementwise operation to Sub then call the elementwise common is_supported_op @@ -60,9 +59,7 @@ Status GpuSub::is_supported_op(const GpuWorkloadContext &context, return GpuElementwiseBinaryCommon::is_supported_op(context, lhs, rhs, common_attributes); } -ITensorInfo *GpuSub::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *lhs, - ITensorInfo *rhs) +ITensorInfo *GpuSub::create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs) { // No need to log or validate as they'll be handled inside GpuElementwiseBinaryCommon::create_op() // Set the elementwise operation to Sub then call the elementwise common create_op diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp index c00716c76eff467912b4348259b2985d723ba361..bf0f274c5c998a94a42cd0937b823fe0e7e66aba 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp @@ -23,14 +23,15 @@ */ #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuTanh.h" + #include "arm_compute/core/experimental/Types.h" +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/common/utils/Log.h" namespace arm_compute { @@ 
-40,9 +41,7 @@ namespace dynamic_fusion { namespace { -Status is_supported_op_helper(const GpuWorkloadContext &context, - const ITensorInfo *src, - const ITensorInfo *dst) +Status is_supported_op_helper(const GpuWorkloadContext &context, const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); @@ -50,20 +49,21 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, TensorInfo dst_info_to_validate; const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; - if(dst != nullptr) + if (dst != nullptr) { dst_info_to_validate_ptr = dst; } auto_init_if_empty(dst_info_to_validate, *src->clone()); - const ClComponentActivation::Attributes act_info{ ActivationLayerInfo::ActivationFunction::TANH }; + const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::TANH}; // Check components - if(context.gpu_language() == GpuLanguage::OpenCL) + if (context.gpu_language() == GpuLanguage::OpenCL) { // Validate Activation Component - const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC, src); @@ -80,14 +80,12 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, constexpr GpuOperatorType operator_type = GpuOperatorType::Simple; } // namespace -Status GpuTanh::is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *src) +Status GpuTanh::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src) { return is_supported_op_helper(context, src, nullptr); } -Status GpuTanh::validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *src) +Status GpuTanh::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); @@ -112,8 +110,7 @@ Status GpuTanh::validate_op(const GpuWorkloadSketch &sketch, return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate); } -ITensorInfo *GpuTanh::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src) +ITensorInfo *GpuTanh::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src); @@ -128,15 +125,15 @@ ITensorInfo *GpuTanh::create_op(GpuWorkloadSketch &sketch, // Translate into components and add to component graph GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph(); - const ClComponentActivation::Attributes act_info{ ActivationLayerInfo::ActivationFunction::TANH }; + const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::TANH}; const auto *const sketch_ctx = sketch.implementation().context(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { // Add Activation Component auto properties = IGpuKernelComponent::Properties(); - properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC, src); diff --git a/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp index 
7c087c9a7bb0c490658e2661593c6d5150aec5ed..d79a4c42c9c233ae257b80842bce4ef4e5951da9 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp @@ -22,11 +22,12 @@ * SOFTWARE. */ #include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" namespace arm_compute { @@ -38,9 +39,10 @@ namespace { void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *lhs, const ITensorInfo *rhs) { - if(dst->total_size() == 0U) + if (dst->total_size() == 0U) { - const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*lhs, *rhs); + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*lhs, *rhs); auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(broadcast_pair.first)); } } @@ -56,7 +58,7 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, TensorInfo dst_info_to_validate; const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; - if(dst != nullptr) + if (dst != nullptr) { dst_info_to_validate_ptr = dst; } @@ -64,7 +66,7 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, calculate_and_init_dst_if_empty(&dst_info_to_validate, lhs, rhs); // Check components - if(context.gpu_language() == GpuLanguage::OpenCL) + if (context.gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = context.cl_compile_context(); ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); @@ -90,7 +92,8 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, GpuOperatorType operator_type = GpuOperatorType::Simple; } // namespace -ElementwiseBinaryCommonAttributes &ElementwiseBinaryCommonAttributes::operation(const ElementwiseBinaryCommonAttributes::ElementwiseOp &operation) +ElementwiseBinaryCommonAttributes & +ElementwiseBinaryCommonAttributes::operation(const ElementwiseBinaryCommonAttributes::ElementwiseOp &operation) { _operation = operation; return *this; @@ -157,14 +160,14 @@ ITensorInfo *GpuElementwiseBinaryCommon::create_op(GpuWorkloadSketch const auto sketch_ctx = sketch.implementation().context(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { ARM_COMPUTE_ERROR_ON_NULLPTR(sketch_ctx->cl_compile_context()); // Add ElementwiseBinary Component { auto properties = IGpuKernelComponent::Properties(); - properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack<ITensorInfo> arguments; arguments.add_const_tensor(ACL_SRC_0, lhs); diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp index 0972b4e8e2b381f836c8a8449945b9af672eaf67..775b0a0c8ced6f329c0ddfa154977bad6e45bd9a 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp @@ -22,8 +22,10 @@ * SOFTWARE.
*/ #include "GpuKernelVariableTable.h" + #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/ITensorInfo.h" + #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" namespace arm_compute @@ -32,14 +34,17 @@ namespace experimental { namespace dynamic_fusion { -void GpuKernelVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group, const ITensorInfo *tensor, GpuKernelArgumentInfo argument_info, const std::string &alias) +void GpuKernelVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group, + const ITensorInfo *tensor, + GpuKernelArgumentInfo argument_info, + const std::string &alias) { ARM_COMPUTE_ERROR_ON_MSG(!tensor->has_valid_id(), "Tensor info with valid id expected"); // Do not re-declare if the variable associated with the tensor has already been declared auto it = _vars.find(tensor->id()); - if(it != _vars.end()) + if (it != _vars.end()) { ARM_COMPUTE_ERROR_ON(!(it->second.kernel_argument_info == argument_info)); return; @@ -47,14 +52,12 @@ void GpuKernelVariableTable::declare_variable(const GpuKernelComponentGroup &com const auto target = comp_group.get_tile_for_tensor(tensor); - if(target != tensor) + if (target != tensor) { // If the tensor uses a shared tile, don't declare another variable. it = _vars.find(target->id()); - ARM_COMPUTE_ERROR_ON_MSG( - it == _vars.end(), - "The variable used for this tensor must have been declared."); + ARM_COMPUTE_ERROR_ON_MSG(it == _vars.end(), "The variable used for this tensor must have been declared."); _vars[tensor->id()] = it->second; } @@ -64,7 +67,7 @@ void GpuKernelVariableTable::declare_variable(const GpuKernelComponentGroup &com std::stringstream ss; ss << alias << "_t" << abs(tensor->id()); const auto uniq_name = ss.str(); - TensorVariable var{ tensor->id(), uniq_name, argument_info }; + TensorVariable var{tensor->id(), uniq_name, argument_info}; _vars.emplace(tensor->id(), var); } @@ -76,12 +79,13 @@ GpuKernelVariableTable::TensorVariable GpuKernelVariableTable::get_variable(cons return var; } -GpuKernelVariableTable::VariableList GpuKernelVariableTable::get_variable_list(const std::vector<const ITensorInfo *> &tensors) const +GpuKernelVariableTable::VariableList +GpuKernelVariableTable::get_variable_list(const std::vector<const ITensorInfo *> &tensors) const { VariableList vars{}; - for(const auto &tensor : tensors) + for (const auto &tensor : tensors) { - if(!tensor->has_valid_id()) + if (!tensor->has_valid_id()) { continue; } @@ -90,23 +94,19 @@ GpuKernelVariableTable::VariableList GpuKernelVariableTable::get_variable_list(c return vars; } -TagVal::TagVal(const GpuKernelVariableTable::TensorVariable &var) - : value{ var.uniq_name } +TagVal::TagVal(const GpuKernelVariableTable::TensorVariable &var) : value{var.uniq_name} { } -TagVal::TagVal(const std::string &val) - : value{ val } +TagVal::TagVal(const std::string &val) : value{val} { } -TagVal::TagVal(const char *val) - : value{ std::string(val) } +TagVal::TagVal(const char *val) : value{std::string(val)} { } -TagVal::TagVal(const DataType &data_type) - : value{ get_cl_type_from_data_type(data_type) } +TagVal::TagVal(const DataType &data_type) : value{get_cl_type_from_data_type(data_type)} { } } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h index a49d38e10c832d6aab16378eac2678ecbbcf894c..c17f131ada3b42a93f1b983be956432c328b736c 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h +++ 
b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_GPUKERNELVARIABLETABLE #include "arm_compute/core/ITensorInfo.h" + #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" #include "support/AclRequires.h" #include "support/StringSupport.h" @@ -55,11 +56,11 @@ public: struct TensorVariable { public: - TensorVariable() = default; - TensorVariable(const TensorVariable &) = default; + TensorVariable() = default; + TensorVariable(const TensorVariable &) = default; TensorVariable &operator=(const TensorVariable &) = default; - ITensorInfo::Id id{ ITensorInfo::invalid_tensor_id }; - std::string uniq_name{ "empty" }; // Unique name, also the final variable name used in the built code + ITensorInfo::Id id{ITensorInfo::invalid_tensor_id}; + std::string uniq_name{"empty"}; // Unique name, also the final variable name used in the built code GpuKernelArgumentInfo kernel_argument_info{}; bool has_valid_id() const { @@ -76,7 +77,10 @@ public: * @param[in] argument_info Kernel argument information * @param[in] alias Alias for the variable. Will be used as part of the variable name */ - void declare_variable(const GpuKernelComponentGroup &comp_group, const ITensorInfo *tensor, GpuKernelArgumentInfo argument_info, const std::string &alias = "unnamed"); + void declare_variable(const GpuKernelComponentGroup &comp_group, + const ITensorInfo *tensor, + GpuKernelArgumentInfo argument_info, + const std::string &alias = "unnamed"); /** Get the @ref TensorVariable associated with @p tensor * * @param[in] tensor Tensor info to be queried @@ -106,8 +110,7 @@ struct TagVal TagVal(const GpuKernelVariableTable::TensorVariable &var); /** Construct a @ref TagVal from an integral type */ template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_integral<T>::value)> - TagVal(T val) - : value{ support::cpp11::to_string(val) } + TagVal(T val) : value{support::cpp11::to_string(val)} { } /** Construct a @ref TagVal from a string */ diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h b/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h index 4a1fb142d6aa32e3aca469b1e34a21d6b2a5da10..9d0b4f592acdff467070d138cdf9afb576757d82 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/CLCompileContext.h" #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/Window.h" + #include "src/dynamic_fusion/sketch/ArgumentPack.h" #include "src/dynamic_fusion/sketch/gpu/components/Types.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h" @@ -57,8 +58,7 @@ public: * @param[in] id Component id * @param[in] tensors Tensor arguments to the components */ - IGpuTemplateComponentWriter(ComponentId id, const ArgumentPack<ITensorInfo> &tensors) - : _id{ id }, _tensors{ tensors } + IGpuTemplateComponentWriter(ComponentId id, const ArgumentPack<ITensorInfo> &tensors) : _id{id}, _tensors{tensors} { } /** Destructor */ @@ -112,7 +112,7 @@ public: /** Generate the header list used in the component */ virtual std::set<std::string> get_headers_list() const { - return std::set<std::string> {}; + return std::set<std::string>{}; } /** Generate the execution window for the component */ virtual Window get_window() const @@ -131,7 +131,7 @@ public: } private: - ComponentId _id{ -1 }; + ComponentId _id{-1}; ArgumentPack<ITensorInfo> _tensors{}; }; } // namespace dynamic_fusion diff --git
a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp index 3c7c843dd872f650d6ac1d8a8e23420fbc1de334..c165fb5f33ec550d761ec99932a4e5af05d76df2 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/WindowHelpers.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "support/StringSupport.h" @@ -39,10 +40,7 @@ namespace dynamic_fusion ClTemplateActivation::ClTemplateActivation(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuTemplateComponentWriter{ id, tensors }, - _src{}, - _dst{}, - _attributes{ attributes } + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST); @@ -62,7 +60,7 @@ std::string ClTemplateActivation::get_component_code(const ComponentGroup &comp_ code = R"_( //------------------ START KERNEL {{meta_kernel_id}} --------------------- )_"; - if(is_root) + if (is_root) { code += R"_( // IN(src) {{src}} @@ -104,17 +102,11 @@ LOOP_UNROLLING(int, i, 0, 1, M0, void ClTemplateActivation::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "src"); - - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "dst"); + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "src"); + + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "dst"); } TagLUT ClTemplateActivation::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const @@ -173,7 +165,7 @@ std::string ClTemplateActivation::get_config_id() const std::set ClTemplateActivation::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h", "activation_float_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h", "activation_float_helpers.h"}; } Window ClTemplateActivation::get_window() const diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h index ec78cf6ce52520bf6bc80c49ce9cb04ccd58e524..88ee37034214564997c946aacc55d15c784aa79a 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h @@ -26,6 +26,7 @@ #include "arm_compute/core/experimental/Types.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp 
b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp index 4956879ad3fe6722bfbf7044965c4505f7868f04..0da3a73801d6eacb13015299645abb730581a9d2 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/WindowHelpers.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" @@ -35,7 +36,7 @@ namespace experimental namespace dynamic_fusion { ClTemplateCast::ClTemplateCast(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuTemplateComponentWriter{ id, tensors }, _src{}, _dst{}, _attributes{ attributes } + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); @@ -62,7 +63,7 @@ std::string ClTemplateCast::get_component_code(const ComponentGroup &comp_group) //------------------ START KERNEL {{meta_kernel_id}} CAST --------------------- )_"; - if(is_root) + if (is_root) { code += R"_( // IN_0(src) {{src}} @@ -82,14 +83,15 @@ TILE(uint, M0, 1, g_dst_indirect_y); { )_"; - if(kernel_name == "cast_down" && is_data_type_quantized(_src->data_type())) + if (kernel_name == "cast_down" && is_data_type_quantized(_src->data_type())) { code += R"_( {{tmp}}[m0].v ^= (VEC_DATA_TYPE({{DATA_TYPE_IN}}, N0))0x80; )_"; } - if(kernel_name == "cast_down" && (is_data_type_float(_src->data_type()) || _attributes.convert_policy() == ConvertPolicy::SATURATE)) + if (kernel_name == "cast_down" && + (is_data_type_float(_src->data_type()) || _attributes.convert_policy() == ConvertPolicy::SATURATE)) { code += R"_( {{dst}}[m0].v = CONVERT_SAT({{tmp}}[m0].v, VEC_DATA_TYPE({{DATA_TYPE_OUT}}, N0)); @@ -106,7 +108,7 @@ TILE(uint, M0, 1, g_dst_indirect_y); }) )_"; - if(is_root) + if (is_root) { code += R"_( LOOP_UNROLLING(int, i, 0, 1, M0, @@ -128,17 +130,11 @@ TILE(uint, M0, 1, g_dst_indirect_y); void ClTemplateCast::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "src"); - - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "dst"); + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "src"); + + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "dst"); } TagLUT ClTemplateCast::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const @@ -199,7 +195,7 @@ std::string ClTemplateCast::get_config_id() const std::set ClTemplateCast::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h"}; } Window ClTemplateCast::get_window() const diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp index ab7cc9f05aaa29e6f521f69f9b9a43f29b02112c..8380620ab2b5f7740e25bb7a639f4eb2b9356f28 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp +++ 
b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp @@ -36,17 +36,17 @@ ClTemplateDepthwiseConv2d::ClTemplateDepthwiseConv2d(ComponentId const ArgumentPack &tensors, const Attributes &attributes, const Settings &settings) - : IGpuTemplateComponentWriter{ id, tensors }, + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _weight{}, _bias{}, _dst{}, - _attributes{ attributes }, - _settings{ settings } + _attributes{attributes}, + _settings{settings} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); - if(this->tensors().get_const_tensor(TensorType::ACL_SRC_2)) + if (this->tensors().get_const_tensor(TensorType::ACL_SRC_2)) { _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2); } @@ -71,7 +71,7 @@ std::string ClTemplateDepthwiseConv2d::get_component_code(const ComponentGroup & // IN_1(wei) {{weight}} )_"; - if(_bias != nullptr && _bias->has_valid_id()) + if (_bias != nullptr && _bias->has_valid_id()) { code += R"_( // IN_1(bia) {{bias}} @@ -113,7 +113,7 @@ TILE(uint, M0, 1, g_dst_indirect_y); }) )_"; - if(_weight->dimension(height_idx) < 5) + if (_weight->dimension(height_idx) < 5) { code += R"_( LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT, @@ -147,7 +147,7 @@ TILE(uint, M0, 1, g_dst_indirect_y); { )_"; - if(!_settings.is_fma_available()) + if (!_settings.is_fma_available()) { code += R"_( {{dst}}[m0].v += a[xk + m0].v * b[xk].v; @@ -166,14 +166,14 @@ TILE(uint, M0, 1, g_dst_indirect_y); } )_"; - if(_weight->dimension(height_idx) < 5) + if (_weight->dimension(height_idx) < 5) { code += R"_( ) )_"; } - if(_bias && _bias->has_valid_id()) + if (_bias && _bias->has_valid_id()) { code += R"_( TILE({{BIA_DATA_TYPE}}, 1, N0, {{bias}}); @@ -198,44 +198,31 @@ TILE(uint, M0, 1, g_dst_indirect_y); return code; } -void ClTemplateDepthwiseConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +void ClTemplateDepthwiseConv2d::declare_variables(GpuKernelVariableTable &vtable, + const ComponentGroup &comp_group) const { - const GpuKernelArgumentInfo::Type input_type = _settings.export_input_to_cl_image() ? - GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : - GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; - - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(input_type), - "src"); - - const GpuKernelArgumentInfo::Type weight_type = _settings.export_weights_to_cl_image() ? - GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : - GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; - - vtable.declare_variable( - comp_group, - _weight, - GpuKernelArgumentInfo(weight_type), - "weight"); - - if(_bias != nullptr && _bias->has_valid_id()) // optional bias + const GpuKernelArgumentInfo::Type input_type = _settings.export_input_to_cl_image() + ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image + : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; + + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(input_type), "src"); + + const GpuKernelArgumentInfo::Type weight_type = _settings.export_weights_to_cl_image() + ? 
GpuKernelArgumentInfo::Type::Tensor_4D_t_Image + : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; + + vtable.declare_variable(comp_group, _weight, GpuKernelArgumentInfo(weight_type), "weight"); + + if (_bias != nullptr && _bias->has_valid_id()) // optional bias { - vtable.declare_variable( - comp_group, - _bias, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector), - "bias"); + vtable.declare_variable(comp_group, _bias, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector), "bias"); } - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "dst"); + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "dst"); } -TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, + const ComponentGroup &comp_group) const { TagLUT lut{}; @@ -243,7 +230,7 @@ TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtab lut["src"] = vtable.get_variable(_src); lut["weight"] = vtable.get_variable(_weight); - if(_bias != nullptr && _bias->has_valid_id()) // optional bias + if (_bias != nullptr && _bias->has_valid_id()) // optional bias { lut["bias"] = vtable.get_variable(_bias); lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type()); @@ -259,7 +246,7 @@ TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtab lut["SRC_DATA_TYPE"] = _src->data_type(); lut["WEI_DATA_TYPE"] = _weight->data_type(); - switch(vtable.get_variable(_src).kernel_argument_info.type) + switch (vtable.get_variable(_src).kernel_argument_info.type) { case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D: case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D: @@ -271,7 +258,7 @@ TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtab break; } - switch(vtable.get_variable(_weight).kernel_argument_info.type) + switch (vtable.get_variable(_weight).kernel_argument_info.type) { case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D: case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D: @@ -318,7 +305,7 @@ CLBuildOptions ClTemplateDepthwiseConv2d::get_build_options(const ComponentGroup CLBuildOptions build_opts{}; - if(_settings.fast_relaxed_math()) + if (_settings.fast_relaxed_math()) { build_opts.add_option("-cl-fast-relaxed-math"); } @@ -361,7 +348,7 @@ std::string ClTemplateDepthwiseConv2d::get_config_id() const std::set ClTemplateDepthwiseConv2d::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h"}; } Window ClTemplateDepthwiseConv2d::get_window() const diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h index 84b689ef643bb13928ccc9ae872daec6aee48365..5d04c687c3101ea0f1bea5e40b9ce6b1bff0fe47 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDEPTHWISECONV2D #include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h" + #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h" #include 
"src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp index 3322487910e7f82bf084ce7203f0df33bc3f665f..f6a7a58d1d358efe833f13a782c07f67eb2adc27 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp @@ -23,14 +23,13 @@ */ #include "ClTemplateDirectConv2d.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" -#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" - #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" -#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "support/StringSupport.h" namespace arm_compute @@ -43,17 +42,17 @@ ClTemplateDirectConv2d::ClTemplateDirectConv2d(ComponentId const ArgumentPack &tensors, const Attributes &attributes, const Settings &settings) - : IGpuTemplateComponentWriter{ id, tensors }, + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _weight{}, _bias{}, _dst{}, - _attributes{ attributes }, - _settings{ settings } + _attributes{attributes}, + _settings{settings} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); - if(this->tensors().get_const_tensor(TensorType::ACL_SRC_2)) + if (this->tensors().get_const_tensor(TensorType::ACL_SRC_2)) { _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2); } @@ -79,7 +78,7 @@ std::string ClTemplateDirectConv2d::get_component_code(const ComponentGroup &com // IN_0(src) {{src}} // IN_1(wei) {{weight}} )_"; - if(_bias && _bias->has_valid_id()) + if (_bias && _bias->has_valid_id()) { code += R"_( // IN_1(bia) {{bias}} @@ -161,7 +160,7 @@ TILE(uint, M0, 1, g_dst_indirect_y); } )_"; - if(leftover_loop) + if (leftover_loop) { code += R"_( for(; ck < _ISRC_CHANNELS; ++ck) @@ -186,9 +185,9 @@ TILE(uint, M0, 1, g_dst_indirect_y); T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, 1, NT, T, a, b, {{dst}}); } )_"; -} + } -code += R"_( + code += R"_( #undef _I_WEI_WIDTH #undef _I_WEI_HEIGHT #undef _ISRC_WIDTH @@ -202,7 +201,7 @@ code += R"_( } )_"; - if(_bias && _bias->has_valid_id()) + if (_bias && _bias->has_valid_id()) { code += R"_( TILE({{BIA_DATA_TYPE}}, 1, N0, bias0); @@ -211,9 +210,9 @@ code += R"_( T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}}); )_"; -} + } -code += R"_( + code += R"_( LOOP_UNROLLING(int, i, 0, 1, M0, { g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{DST_WIDTH}} * {{DST_HEIGHT}}) - 1); @@ -227,32 +226,19 @@ code += R"_( void ClTemplateDirectConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "src"); - - const GpuKernelArgumentInfo::Type weight_type = _settings.export_to_cl_image() ? 
GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; - vtable.declare_variable( - comp_group, - _weight, - GpuKernelArgumentInfo(weight_type), - "weight"); - - if(_bias && _bias->has_valid_id()) // optional bias + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "src"); + + const GpuKernelArgumentInfo::Type weight_type = _settings.export_to_cl_image() + ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image + : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; + vtable.declare_variable(comp_group, _weight, GpuKernelArgumentInfo(weight_type), "weight"); + + if (_bias && _bias->has_valid_id()) // optional bias { - vtable.declare_variable( - comp_group, - _bias, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector), - "bias"); + vtable.declare_variable(comp_group, _bias, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector), "bias"); } - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(common_tensor_type), - "dst"); + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(common_tensor_type), "dst"); } TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const @@ -262,7 +248,7 @@ TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, lut["src"] = vtable.get_variable(_src); lut["weight"] = vtable.get_variable(_weight); - if(_bias && _bias->has_valid_id()) // optional bias + if (_bias && _bias->has_valid_id()) // optional bias { lut["bias"] = vtable.get_variable(_bias); lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type()); @@ -279,34 +265,34 @@ TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, lut["WEI_DATA_TYPE"] = _weight->data_type(); lut["SRC_TENSOR_TYPE"] = "BUFFER"; - switch(vtable.get_variable(_weight).kernel_argument_info.type) + switch (vtable.get_variable(_weight).kernel_argument_info.type) { case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D: case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D: case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image: - { - lut["WEI_TENSOR_TYPE"] = "IMAGE"; - break; - } + { + lut["WEI_TENSOR_TYPE"] = "IMAGE"; + break; + } default: - { - lut["WEI_TENSOR_TYPE"] = "BUFFER"; - break; - } + { + lut["WEI_TENSOR_TYPE"] = "BUFFER"; + break; + } } - const auto width_idx = 1; - const auto height_idx = 2; + const auto width_idx = 1; + const auto height_idx = 2; const auto channel_idx = 0; - lut["SRC_WIDTH"] = _src->dimension(width_idx); - lut["SRC_HEIGHT"] = _src->dimension(height_idx); + lut["SRC_WIDTH"] = _src->dimension(width_idx); + lut["SRC_HEIGHT"] = _src->dimension(height_idx); lut["SRC_CHANNELS"] = _src->dimension(channel_idx); - lut["WEI_WIDTH"] = _weight->dimension(width_idx); - lut["WEI_HEIGHT"] = _weight->dimension(height_idx); + lut["WEI_WIDTH"] = _weight->dimension(width_idx); + lut["WEI_HEIGHT"] = _weight->dimension(height_idx); - lut["DST_WIDTH"] = _dst->dimension(width_idx); - lut["DST_HEIGHT"] = _dst->dimension(height_idx); + lut["DST_WIDTH"] = _dst->dimension(width_idx); + lut["DST_HEIGHT"] = _dst->dimension(height_idx); lut["DST_CHANNELS"] = _dst->dimension(channel_idx); lut["STRIDE_X"] = _attributes.stride().x(); @@ -324,14 +310,14 @@ CLBuildOptions ClTemplateDirectConv2d::get_build_options(const ComponentGroup &c { const unsigned int channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL); - const auto 
root_window = comp_group.get_root_component()->template_writer()->get_window(); - const unsigned int n0 = root_window.x().step(); - const unsigned int m0 = root_window.y().step(); - const unsigned int k0 = adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx)); + const auto root_window = comp_group.get_root_component()->template_writer()->get_window(); + const unsigned int n0 = root_window.x().step(); + const unsigned int m0 = root_window.y().step(); + const unsigned int k0 = adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx)); const unsigned int partial_store_n0 = _dst->dimension(0) % n0; CLBuildOptions build_opts{}; - if(_settings.fast_relaxed_math()) + if (_settings.fast_relaxed_math()) { build_opts.add_option("-cl-fast-relaxed-math"); } @@ -379,7 +365,7 @@ std::string ClTemplateDirectConv2d::get_config_id() const std::set ClTemplateDirectConv2d::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h"}; } Window ClTemplateDirectConv2d::get_window() const diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h index 8988d3ca1c2c6111bcfea5461694057eb089fca2..03c8cd2f151d143735f4fa3949dd7a60aabb6cfb 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h @@ -26,6 +26,7 @@ #include "arm_compute/core/experimental/Types.h" #include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h" + #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp index c0481ae190e3577f191386382f207107c6c5dd16..78bff3c3f32bf3271688ee5729f1aba227da84c1 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp @@ -23,14 +23,13 @@ */ #include "ClTemplateElementwiseBinary.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" -#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h" - #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" -#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "support/StringSupport.h" namespace arm_compute @@ -44,11 +43,7 @@ constexpr unsigned int vector_size_byte_opencl = 16; ClTemplateElementwiseBinary::ClTemplateElementwiseBinary(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuTemplateComponentWriter{ id, tensors }, - _lhs{}, - _rhs{}, - _dst{}, - _attributes{ attributes } + : IGpuTemplateComponentWriter{id, tensors}, _lhs{}, _rhs{}, _dst{}, _attributes{attributes} { _lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _rhs = 
this->tensors().get_const_tensor(TensorType::ACL_SRC_1); @@ -69,67 +64,67 @@ std::string ClTemplateElementwiseBinary::get_component_code(const ComponentGroup const bool is_rhs_input = comp_group.is_input_tensor(_rhs); code = -R"_( + R"_( //------------------ START KERNEL {{meta_kernel_id}} {{ELTWISE_OP}} --------------------- )_"; - if(is_root) + if (is_root) { code += -R"_( + R"_( TILE(uint, M0, 1, g_dst_indirect_y); )_"; } - if(is_lhs_input) + if (is_lhs_input) { code += -R"_( + R"_( TILE({{DATA_TYPE}}, {{lhs_m0}}, N0, {{lhs}}); )_"; } - if(is_rhs_input) + if (is_rhs_input) { code += -R"_( + R"_( TILE({{DATA_TYPE}}, {{rhs_m0}}, N0, {{rhs}}); )_"; } code += -R"_( + R"_( { )_"; - if(is_lhs_input) + if (is_lhs_input) { code += -R"_( + R"_( {{lhs}}_offset_first_element_in_bytes += g_ind_2 * {{lhs}}_stride_w; T_LOAD({{DATA_TYPE}}, {{lhs_m0}}, {{lhs_n0}}, BUFFER, {{lhs}}, {{lhs_start_ind_0}}, {{lhs_start_ind_1}}, 1, {{lhs}}_stride_y, {{lhs}}); )_"; } - if(is_rhs_input) + if (is_rhs_input) { code += -R"_( + R"_( {{rhs}}_offset_first_element_in_bytes += g_ind_2 * {{rhs}}_stride_w; T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{rhs}}, {{rhs_start_ind_0}}, {{rhs_start_ind_1}}, 1, {{rhs}}_stride_y, {{rhs}}); )_"; } code += -R"_( + R"_( T_ELTWISE_{{BROADCAST_OP}}{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, {{lhs}}, {{rhs}}, {{dst}}); )_"; - if(is_root) + if (is_root) { // Calculate the destination indirect Y code += -R"_( + R"_( LOOP_UNROLLING(int, i, 0, 1, M0, { g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{arg_dst}}_w * {{arg_dst}}_h) - 1); @@ -139,7 +134,7 @@ R"_( } code += -R"_( + R"_( } //------------------ END KERNEL {{meta_kernel_id}} {{ELTWISE_OP}} --------------------- )_"; @@ -147,28 +142,18 @@ R"_( return code; } -void ClTemplateElementwiseBinary::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +void ClTemplateElementwiseBinary::declare_variables(GpuKernelVariableTable &vtable, + const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _lhs, - GpuKernelArgumentInfo(common_tensor_type), - "lhs"); - - vtable.declare_variable( - comp_group, - _rhs, - GpuKernelArgumentInfo(common_tensor_type), - "rhs"); - - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(common_tensor_type), - "dst"); + vtable.declare_variable(comp_group, _lhs, GpuKernelArgumentInfo(common_tensor_type), "lhs"); + + vtable.declare_variable(comp_group, _rhs, GpuKernelArgumentInfo(common_tensor_type), "rhs"); + + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(common_tensor_type), "dst"); } -TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vtable, + const ComponentGroup &comp_group) const { TagLUT lut{}; @@ -182,7 +167,7 @@ TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vt lut["dst"] = vtable.get_variable(_dst); lut["arg_dst"] = vtable.get_variable(comp_group.get_any_dst_tensor()); - switch(_attributes.operation()) + switch (_attributes.operation()) { case Attributes::ElementwiseOp::Add: lut["ELTWISE_OP"] = "ADD"; @@ -197,10 +182,10 @@ TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vt ARM_COMPUTE_ERROR("Arithmetic Operation not supported"); } - ARM_COMPUTE_ERROR_ON( - comp_group.is_intermediate_tensor(_lhs) && detail::have_different_dimensions(_lhs->tensor_shape(), _dst->tensor_shape(), 0)); - 
ARM_COMPUTE_ERROR_ON( - comp_group.is_intermediate_tensor(_rhs) && detail::have_different_dimensions(_rhs->tensor_shape(), _dst->tensor_shape(), 0)); + ARM_COMPUTE_ERROR_ON(comp_group.is_intermediate_tensor(_lhs) && + detail::have_different_dimensions(_lhs->tensor_shape(), _dst->tensor_shape(), 0)); + ARM_COMPUTE_ERROR_ON(comp_group.is_intermediate_tensor(_rhs) && + detail::have_different_dimensions(_rhs->tensor_shape(), _dst->tensor_shape(), 0)); // Set broadcast parameters // PRE: All tensors are broadcast-compatible @@ -228,9 +213,7 @@ TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vt lut["rhs_m0"] = (rhs_broadcast_yz) ? "1" : "M0"; lut["rhs_start_ind_1"] = (rhs_broadcast_yz) ? "0" : "g_ind_1"; - lut["BROADCAST_OP"] = (lhs_broadcast_yz) ? "BROADCAST_LHS_X_" : - (rhs_broadcast_yz) ? "BROADCAST_RHS_X_" : - ""; + lut["BROADCAST_OP"] = (lhs_broadcast_yz) ? "BROADCAST_LHS_X_" : (rhs_broadcast_yz) ? "BROADCAST_RHS_X_" : ""; return lut; } @@ -268,7 +251,7 @@ std::string ClTemplateElementwiseBinary::get_config_id() const std::set ClTemplateElementwiseBinary::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h"}; } Window ClTemplateElementwiseBinary::get_window() const @@ -279,8 +262,9 @@ Window ClTemplateElementwiseBinary::get_window() const // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) and upper dimensions unchanged // This is in line with the collapsing convention used by operators like Conv2d output_shape.collapse(2U, 1U); - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); - Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); + Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); return win; } diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h index 8cca954efe9e3d4be22a6d72d04836313df26e60..991c0eca448cec1cecd66ebd89cd78779a78163d 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEELEMENTWISEBINARY #include "arm_compute/core/experimental/Types.h" + #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" @@ -48,9 +49,7 @@ public: * @param[in] tensors Tensor arguments to the components * @param[in] attributes Component attributes */ - ClTemplateElementwiseBinary(ComponentId id, - const ArgumentPack &tensors, - const Attributes &attributes); + ClTemplateElementwiseBinary(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes); /** Prevent instances of this class from being copy constructed */ ClTemplateElementwiseBinary(const ClTemplateElementwiseBinary &elementwise) = delete; /** Prevent instances of this class from being copied */ diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp 
b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp index a8d8d32b122da92d5ffef14aaa049132d195e935..522c33a022e34ccb6be84177d35cf3e41023f5c3 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/WindowHelpers.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "support/StringSupport.h" @@ -38,16 +39,12 @@ namespace dynamic_fusion { namespace { - constexpr unsigned int serial_vector_size = 8; +constexpr unsigned int serial_vector_size = 8; } // namespace ClTemplateLogits1DMaxShiftExpSum::ClTemplateLogits1DMaxShiftExpSum(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuTemplateComponentWriter{ id, tensors }, - _src{}, - _sum{}, - _dst{}, - _attributes{ attributes } + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _sum{}, _dst{}, _attributes{attributes} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _sum = this->tensors().get_const_tensor(TensorType::ACL_DST_0); @@ -79,7 +76,7 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_component_code(const Component const bool beta_defined = (_attributes.beta() != 1.f); - if(beta_defined) + if (beta_defined) { code += R"_( VEC_TYPE beta = (VEC_TYPE){{BETA}}; @@ -91,7 +88,7 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_component_code(const Component const unsigned int vector_size = adjust_vec_size(_serial_vector_size, reduction_dim_size); const bool non_multiple_of_n0 = ((reduction_dim_size % vector_size) != 0); - if(non_multiple_of_n0) + if (non_multiple_of_n0) { code += R"_( VEC_TYPE data = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)src_addr); @@ -111,19 +108,19 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_component_code(const Component VEC_TYPE sum1D = 0; )_"; - if(non_multiple_of_n0) + if (non_multiple_of_n0) { code += R"_( data -= max_val; )_"; - if(beta_defined) + if (beta_defined) { code += R"_( data *= beta; )_"; } - if(_attributes.is_log_softmax()) + if (_attributes.is_log_softmax()) { code += R"_( VSTORE_PARTIAL(N0, PARTIAL_N0) @@ -153,14 +150,14 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_component_code(const Component data -= max_val; )_"; - if(beta_defined) + if (beta_defined) { code += R"_( data *= beta; )_"; } - if(_attributes.is_log_softmax()) + if (_attributes.is_log_softmax()) { code += R"_( VSTORE(N0) @@ -191,28 +188,18 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_component_code(const Component return code; } -void ClTemplateLogits1DMaxShiftExpSum::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +void ClTemplateLogits1DMaxShiftExpSum::declare_variables(GpuKernelVariableTable &vtable, + const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), - "src"); - - vtable.declare_variable( - comp_group, - _sum, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), - "sum"); - - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), - "dst"); + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "src"); + + vtable.declare_variable(comp_group, _sum, 
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "sum"); + + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "dst"); } -TagLUT ClTemplateLogits1DMaxShiftExpSum::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +TagLUT ClTemplateLogits1DMaxShiftExpSum::get_tag_lut(const GpuKernelVariableTable &vtable, + const ComponentGroup &comp_group) const { ARM_COMPUTE_UNUSED(comp_group); @@ -241,8 +228,8 @@ CLBuildOptions ClTemplateLogits1DMaxShiftExpSum::get_build_options(const Compone ARM_COMPUTE_UNUSED(comp_group); CLBuildOptions build_opts{}; - const unsigned int reduction_dim_size = _src->dimension(0); - const unsigned int vector_size = adjust_vec_size(serial_vector_size, reduction_dim_size); + const unsigned int reduction_dim_size = _src->dimension(0); + const unsigned int vector_size = adjust_vec_size(serial_vector_size, reduction_dim_size); build_opts.add_option("-DN0=" + support::cpp11::to_string(vector_size)); build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string((reduction_dim_size % vector_size))); @@ -264,7 +251,7 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_config_id() const std::set ClTemplateLogits1DMaxShiftExpSum::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h"}; } Window ClTemplateLogits1DMaxShiftExpSum::get_window() const diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h index 5d232c0cf25d7a148630c2877285972cd76afaf0..ac9ddaa9d45ad99497df2c094dfaea8899e60744 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h @@ -46,7 +46,9 @@ public: * @param[in] tensors Tensor arguments to the components * @param[in] attributes Component attributes */ - ClTemplateLogits1DMaxShiftExpSum(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes); + ClTemplateLogits1DMaxShiftExpSum(ComponentId id, + const ArgumentPack &tensors, + const Attributes &attributes); /** Prevent instances of this class from being copy constructed */ ClTemplateLogits1DMaxShiftExpSum(const ClTemplateLogits1DMaxShiftExpSum &) = delete; /** Prevent instances of this class from being copied */ diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp index 056e570a25c502000b32b8c5fe8d5193c9a786aa..7d7c3e6673584d3a51a8bc01fbb5535aad2c7520 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp @@ -25,6 +25,7 @@ #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" + #include "src/core/helpers/WindowHelpers.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "support/StringSupport.h" @@ -38,11 +39,7 @@ namespace dynamic_fusion ClTemplateLogits1DNorm::ClTemplateLogits1DNorm(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuTemplateComponentWriter{ id, tensors }, - _src{}, - _sum{}, - _dst{}, - _attributes{ attributes } + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _sum{}, 
_dst{}, _attributes{attributes} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _sum = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); @@ -76,7 +73,7 @@ std::string ClTemplateLogits1DNorm::get_component_code(const ComponentGroup &com data0 = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)src_addr); )_"; - if(_attributes.is_log_softmax()) + if (_attributes.is_log_softmax()) { code += R"_( sum_val = log(sum_val); @@ -101,23 +98,11 @@ std::string ClTemplateLogits1DNorm::get_component_code(const ComponentGroup &com void ClTemplateLogits1DNorm::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), - "src"); - - vtable.declare_variable( - comp_group, - _sum, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), - "sum"); - - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), - "dst"); + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "src"); + + vtable.declare_variable(comp_group, _sum, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "sum"); + + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "dst"); } TagLUT ClTemplateLogits1DNorm::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const @@ -168,14 +153,14 @@ std::string ClTemplateLogits1DNorm::get_config_id() const std::set ClTemplateLogits1DNorm::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h"}; } Window ClTemplateLogits1DNorm::get_window() const { ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); constexpr unsigned int serial_vector_size = 16; - const unsigned int vector_size = adjust_vec_size(serial_vector_size, _src->dimension(0)); + const unsigned int vector_size = adjust_vec_size(serial_vector_size, _src->dimension(0)); Window win = calculate_max_window(*_src, Steps(vector_size)); return win.collapse(win, Window::DimZ); diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp index 34840c21007e6491f1f13c56c7c8e06dca0d969b..ebb037450144caf167cb72c8f8435cfe15a76b74 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp @@ -23,14 +23,13 @@ */ #include "ClTemplatePool2d.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" -#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" - -#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" -#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "support/StringSupport.h" namespace arm_compute @@ -50,11 +49,7 @@ ClTemplatePool2d::ClTemplatePool2d(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes, const Settings &settings) - : 
IGpuTemplateComponentWriter{ id, tensors }, - _src{}, - _dst{}, - _attributes{ attributes }, - _settings{ settings } + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes}, _settings{settings} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); @@ -71,7 +66,7 @@ std::string ClTemplatePool2d::get_component_code(const ComponentGroup &comp_grou ARM_COMPUTE_UNUSED(comp_group); // Condition to use 2x2 optimized kernel - if(_attributes.pool_size() == Size2D(2, 2)) + if (_attributes.pool_size() == Size2D(2, 2)) { return get_2x2_kernel_code(); } @@ -83,11 +78,13 @@ std::string ClTemplatePool2d::get_component_code(const ComponentGroup &comp_grou std::string ClTemplatePool2d::get_MxN_kernel_code() const { - const auto pool_type = _attributes.pool_type(); - const bool fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX; + const auto pool_type = _attributes.pool_type(); + const bool fp_mixed_precision = + (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX; // Define pool op macro. - std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_" : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_"; + std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_" + : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_"; // Kernel start // Note: If C is not multiple of N0, we shift back of PARTIAL_N0 elements to compute the leftover elements for get_global_id(0) == 0 @@ -129,7 +126,7 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const )_"; // Determine filter size depending on if padding is excluded or not - if(_attributes.exclude_padding()) + if (_attributes.exclude_padding()) { code += R"_( const int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s); @@ -144,7 +141,8 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const // Loop through pool size // if global pooling - if(_attributes.pool_size().x() == _src->dimension(width_idx) && _attributes.pool_size().y() == _src->dimension(height_idx)) + if (_attributes.pool_size().x() == _src->dimension(width_idx) && + _attributes.pool_size().y() == _src->dimension(height_idx)) { // Begin loop code += R"_( @@ -173,7 +171,7 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const // if condition inside loop - use 32bit acc if mixed_precision. // End loop through pooling section. - if(fp_mixed_precision) + if (fp_mixed_precision) { // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE code += R"_( @@ -194,7 +192,7 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const } // For Pool AVG ONLY, divide pool output by filter size - if(pool_type == PoolingType::AVG) + if (pool_type == PoolingType::AVG) { code += R"_( res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))filter_size; @@ -202,7 +200,7 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const } // If mixed precision convert datatype before storing. Then end kernel. 
- if(fp_mixed_precision) + if (fp_mixed_precision) { code += R"_( VEC_DATA_TYPE({{DATA_TYPE}}, N0) @@ -228,9 +226,11 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const std::string ClTemplatePool2d::get_2x2_kernel_code() const { - const auto pool_type = _attributes.pool_type(); - const bool fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX; - std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_" : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_"; + const auto pool_type = _attributes.pool_type(); + const bool fp_mixed_precision = + (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX; + std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_" + : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_"; std::string code = R"_( //------------------ START KERNEL {{meta_kernel_id}} --------------------- @@ -274,7 +274,7 @@ std::string ClTemplatePool2d::get_2x2_kernel_code() const REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0), data, 0); )_"; - if(fp_mixed_precision) + if (fp_mixed_precision) { // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE code += R"_( @@ -294,7 +294,7 @@ std::string ClTemplatePool2d::get_2x2_kernel_code() const )_"; } - if(pool_type != PoolingType::MAX) + if (pool_type != PoolingType::MAX) { // Make invalid the values loaded if the x or y coordinate was clamped (out-of-bound) code += R"_( @@ -321,10 +321,10 @@ std::string ClTemplatePool2d::get_2x2_kernel_code() const res0 = POOL_OP(res0, data3); )_"; - if(pool_type == PoolingType::AVG) + if (pool_type == PoolingType::AVG) { // If avg pooling divide result accordingly. - if(_attributes.exclude_padding()) + if (_attributes.exclude_padding()) { code += R"_( res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))filter_size; @@ -339,7 +339,7 @@ std::string ClTemplatePool2d::get_2x2_kernel_code() const } // Store result - if(fp_mixed_precision) + if (fp_mixed_precision) { code += R"_( VEC_DATA_TYPE({{DATA_TYPE}}, N0) @@ -365,17 +365,11 @@ std::string ClTemplatePool2d::get_2x2_kernel_code() const void ClTemplatePool2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "src"); - - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "dst"); + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "src"); + + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "dst"); } TagLUT ClTemplatePool2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const @@ -391,12 +385,15 @@ TagLUT ClTemplatePool2d::get_tag_lut(const GpuKernelVariableTable &vtable, const lut["meta_kernel_id"] = id(); // Retrieve relevant data - const auto padding = _attributes.pad(); - const auto stride = _attributes.stride(); - const auto pool_size = _attributes.pool_size(); - const auto data_type = _src->data_type(); - const auto use_fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && _attributes.pool_type() != PoolingType::MAX; - const std::string max_initial_value = _settings.use_inf_as_limit() ? 
"(-INFINITY)" : float_to_string_with_full_precision(std::numeric_limits::lowest()); + const auto padding = _attributes.pad(); + const auto stride = _attributes.stride(); + const auto pool_size = _attributes.pool_size(); + const auto data_type = _src->data_type(); + const auto use_fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && + _attributes.pool_type() != PoolingType::MAX; + const std::string max_initial_value = + _settings.use_inf_as_limit() ? "(-INFINITY)" + : float_to_string_with_full_precision(std::numeric_limits::lowest()); // pool specific lut["STRIDE_X"] = stride.x(); @@ -407,7 +404,8 @@ TagLUT ClTemplatePool2d::get_tag_lut(const GpuKernelVariableTable &vtable, const lut["POOL_SIZE_Y"] = pool_size.height; // Datatypes and variables - lut["ACC_DATA_TYPE"] = get_cl_type_from_data_type((use_fp_mixed_precision) ? (DataType::F32) : (data_type)); // Type of accumulators to use. + lut["ACC_DATA_TYPE"] = get_cl_type_from_data_type( + (use_fp_mixed_precision) ? (DataType::F32) : (data_type)); // Type of accumulators to use. lut["DATA_TYPE"] = get_cl_type_from_data_type(data_type); lut["SRC_WIDTH"] = _src->dimension(width_idx); lut["SRC_HEIGHT"] = _src->dimension(height_idx); @@ -454,14 +452,14 @@ std::string ClTemplatePool2d::get_config_id() const std::set ClTemplatePool2d::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h", "repeat.h" }; + return std::set{"helpers.h", "tile_helpers.h", "repeat.h"}; } Window ClTemplatePool2d::get_window() const { ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); const auto output_shape = _dst->tensor_shape(); - const unsigned int vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0)); + const unsigned int vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 
2 : 4), _dst->dimension(0)); // Create and configure kernel window auto win = calculate_max_window(output_shape, Steps(vec_size)); diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h index ef1c100f44d6248818fe93b5fd48d6528630ef7a..d1d3c01669639b566542519ee4e87a9077752218 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h @@ -27,6 +27,7 @@ #include "arm_compute/core/experimental/Types.h" #include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h" #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h" + #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp index 8b50f1e209d8fc018c802de142b5ac51a47bc4de..c882353fcb31c661b6284bf7e8791a9bb7d8dfa1 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/WindowHelpers.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" @@ -36,11 +37,8 @@ namespace dynamic_fusion { constexpr unsigned int vector_size_byte_opencl = 16; -ClTemplateReshape::ClTemplateReshape(ComponentId id, - const ArgumentPack &tensors) - : IGpuTemplateComponentWriter{ id, tensors }, - _src{}, - _dst{} +ClTemplateReshape::ClTemplateReshape(ComponentId id, const ArgumentPack &tensors) + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); @@ -97,23 +95,17 @@ TILE(uint, M0, 1, g_dst_indirect_y); void ClTemplateReshape::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(common_tensor_type), // GpuKernelArgumentInfo::Type::Image_3D - "src"); - - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(common_tensor_type), - "dst"); + vtable.declare_variable(comp_group, _src, + GpuKernelArgumentInfo(common_tensor_type), // GpuKernelArgumentInfo::Type::Image_3D + "src"); + + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(common_tensor_type), "dst"); } TagLUT ClTemplateReshape::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { ARM_COMPUTE_UNUSED(comp_group); - TagLUT lut{}; + TagLUT lut{}; // Arguments and global shared variables lut["src"] = vtable.get_variable(_src); @@ -153,7 +145,7 @@ std::string ClTemplateReshape::get_config_id() const std::set ClTemplateReshape::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h"}; } Window ClTemplateReshape::get_window() const diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h index 
56b6585b613a14ecf2ef36cc3e06c0db5872b930..838a21db6d556d5f2ec01ae4e9f658532f892df2 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATERESHAPE #include "arm_compute/core/experimental/Types.h" + #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" @@ -42,8 +43,7 @@ public: * @param[in] id Component id * @param[in] tensors Tensor arguments to the components */ - ClTemplateReshape(ComponentId id, - const ArgumentPack &tensors); + ClTemplateReshape(ComponentId id, const ArgumentPack &tensors); /** Prevent instances of this class from being copy constructed */ ClTemplateReshape(const ClTemplateReshape &reshape) = delete; /** Prevent instances of this class from being copied */ diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp index aaed1d990d5b25fa1131c54c269bf6ae00f1281a..846c712cebe1f0f4de04cec362c7347e43e3e0b9 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/ScaleUtils.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" @@ -37,8 +38,10 @@ namespace experimental { namespace dynamic_fusion { -ClTemplateResize::ClTemplateResize(ComponentId id, const ArgumentPack &tensors, const ClTemplateResize::Attributes &attributes) - : IGpuTemplateComponentWriter{ id, tensors }, _src{}, _dst{}, _attributes{ attributes } +ClTemplateResize::ClTemplateResize(ComponentId id, + const ArgumentPack &tensors, + const ClTemplateResize::Attributes &attributes) + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); @@ -63,9 +66,9 @@ TILE(uint, 1, 1, g_dst_indirect_y); const int bout = g_ind_2 / {{arg_dst}}_h; )_"; - if(_attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR) + if (_attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR) { - if(_attributes.sampling_policy() == SamplingPolicy::TOP_LEFT) + if (_attributes.sampling_policy() == SamplingPolicy::TOP_LEFT) { code += R"_( float xi_f = (g_ind_1 * {{SCALE_X}}); @@ -80,7 +83,7 @@ TILE(uint, 1, 1, g_dst_indirect_y); )_"; } - if(_attributes.align_corners()) + if (_attributes.align_corners()) { code += R"_( xi_f = round(xi_f); @@ -95,9 +98,9 @@ TILE(uint, 1, 1, g_dst_indirect_y); T_LOAD_NHWC_WITH_DILATION({{SRC_DATA_TYPE}}, 1, 1, N0, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yi0, xi0, g_ind_0, {{src}}_w, {{src}}_h, 1, 1, false, {{dst}}); )_"; } - else if(_attributes.interpolation_policy() == InterpolationPolicy::BILINEAR) + else if (_attributes.interpolation_policy() == InterpolationPolicy::BILINEAR) { - if(_attributes.sampling_policy() == SamplingPolicy::TOP_LEFT) + if (_attributes.sampling_policy() == SamplingPolicy::TOP_LEFT) { code += R"_( float xi_f = (g_ind_1 * {{SCALE_X}}); @@ -137,7 +140,7 @@ TILE(uint, 1, 1, 
g_dst_indirect_y); T_LOAD_NHWC_WITH_DILATION({{SRC_DATA_TYPE}}, 1, 1, N0, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yi1, xi1, g_ind_0, {{src}}_w, {{src}}_h, 1, 1, false, in11); )_"; - if(is_data_type_float(_src->data_type())) + if (is_data_type_float(_src->data_type())) { code += R"_( const {{SRC_DATA_TYPE}} a = ({{SRC_DATA_TYPE}})(xi_f - (float)xi); @@ -158,9 +161,9 @@ TILE(uint, 1, 1, g_dst_indirect_y); const float b1 = (1.f - a1); {{dst}}[0].v = CONVERT_SAT( - (CONVERT(in00[0].v, VEC_DATA_TYPE(float, N0)) * b * b1) + + (CONVERT(in00[0].v, VEC_DATA_TYPE(float, N0)) * b * b1) + (CONVERT(in01[0].v, VEC_DATA_TYPE(float, N0)) * a * b1) + - (CONVERT(in10[0].v, VEC_DATA_TYPE(float, N0)) * b * a1) + + (CONVERT(in10[0].v, VEC_DATA_TYPE(float, N0)) * b * a1) + (CONVERT(in11[0].v, VEC_DATA_TYPE(float, N0)) * a * a1), VEC_DATA_TYPE({{DST_DATA_TYPE}}, N0)); )_"; } @@ -179,22 +182,18 @@ TILE(uint, 1, 1, g_dst_indirect_y); return code; } -void ClTemplateResize::declare_variables(GpuKernelVariableTable &vtable, const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const +void ClTemplateResize::declare_variables(GpuKernelVariableTable &vtable, + const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "src"); - - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "dst"); + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "src"); + + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "dst"); } -TagLUT ClTemplateResize::get_tag_lut(const GpuKernelVariableTable &vtable, const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const +TagLUT ClTemplateResize::get_tag_lut(const GpuKernelVariableTable &vtable, + const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const { TagLUT lut{}; @@ -212,8 +211,10 @@ TagLUT ClTemplateResize::get_tag_lut(const GpuKernelVariableTable &vtable, const lut["DST_DATA_TYPE"] = get_cl_type_from_data_type(_dst->data_type()); lut["CONSTANT_VALUE"] = string_from_pixel_value(0, _src->data_type()); - const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(1), _dst->dimension(1), _attributes.align_corners()); - const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(2), _dst->dimension(2), _attributes.align_corners()); + const float scale_x = + scale_utils::calculate_resize_ratio(_src->dimension(1), _dst->dimension(1), _attributes.align_corners()); + const float scale_y = + scale_utils::calculate_resize_ratio(_src->dimension(2), _dst->dimension(2), _attributes.align_corners()); lut["SCALE_X"] = float_to_string_with_full_precision(scale_x); lut["SCALE_Y"] = float_to_string_with_full_precision(scale_y); @@ -242,7 +243,8 @@ std::string ClTemplateResize::get_config_id() const std::string config_id{}; config_id += "resize_"; - config_id += (_attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR ? "NEAREST_NEIGHBOR" : ""); + config_id += + (_attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR ? "NEAREST_NEIGHBOR" : ""); config_id += (_attributes.interpolation_policy() == InterpolationPolicy::BILINEAR ? "BILINEAR" : ""); config_id += "_"; config_id += (_attributes.sampling_policy() == SamplingPolicy::CENTER ? 
"center" : "topleft"); @@ -260,7 +262,7 @@ std::string ClTemplateResize::get_config_id() const std::set ClTemplateResize::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h"}; } Window ClTemplateResize::get_window() const diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp index 217214ced3ea4dfe6b956588321190ec02508919..d0ec91e0a914ae5a0b1ccd7bf0453615529b1af2 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp @@ -32,7 +32,7 @@ namespace experimental namespace dynamic_fusion { ClTemplateStore::ClTemplateStore(ComponentId id, const ArgumentPack &tensors) - : IGpuTemplateComponentWriter{ id, tensors }, _src{}, _dst{} + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); @@ -61,16 +61,10 @@ std::string ClTemplateStore::get_component_code(const ComponentGroup &comp_group void ClTemplateStore::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "src"); - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "dst"); + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "src"); + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "dst"); } TagLUT ClTemplateStore::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h index 3f97a82204d504aaa6e3c46ca8c8753a6de6b500..b8c82ceadd5097d8036a0c9e9b721105476aaed7 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE #include "arm_compute/core/experimental/Types.h" + #include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp index eda15f1d9541889b60b1585f553d84b0d7e6cb9a..d3d7c8db8305253d911ecb87acceaa03a3812ad0 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp @@ -24,6 +24,7 @@ #include "ClTemplateWriter.h" #include "arm_compute/core/CL/CLKernelLibrary.h" + #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" @@ -39,11 +40,11 @@ std::string ClTemplateWriter::replace_tags(const std::string &code_template, con std::string replaced_code = ""; bool scanning_pattern = false; std::string pattern_found = ""; - for(size_t i = 0; i < code_template.size() - 
1; ++i) + for (size_t i = 0; i < code_template.size() - 1; ++i) { - if(!scanning_pattern) + if (!scanning_pattern) { - if(code_template[i] == '{' && code_template[i + 1] == '{') + if (code_template[i] == '{' && code_template[i + 1] == '{') { i += 1; scanning_pattern = true; @@ -56,7 +57,7 @@ std::string ClTemplateWriter::replace_tags(const std::string &code_template, con } else { - if(code_template[i] == '}' && code_template[i + 1] == '}') + if (code_template[i] == '}' && code_template[i + 1] == '}') { i += 1; scanning_pattern = false; @@ -76,8 +77,7 @@ std::string ClTemplateWriter::replace_tags(const std::string &code_template, con ClTemplateWriter::~ClTemplateWriter() { } -ClTemplateWriter::ClTemplateWriter(const GpuKernelComponentGroup &components) - : _components{ components } +ClTemplateWriter::ClTemplateWriter(const GpuKernelComponentGroup &components) : _components{components} { } std::string ClTemplateWriter::get_name() @@ -91,7 +91,7 @@ std::string ClTemplateWriter::get_code() std::string ClTemplateWriter::get_config_id() { std::string config_id = get_name(); - for(const auto &comp : _components) + for (const auto &comp : _components) { config_id += "--" + comp->template_writer()->get_config_id() + "--"; } @@ -103,7 +103,7 @@ CLBuildOptions ClTemplateWriter::get_build_options() { CLBuildOptions build_opts{}; - for(const auto &comp : _components) + for (const auto &comp : _components) { build_opts.add_options(comp->template_writer()->get_build_options(_components).options()); } @@ -122,11 +122,9 @@ std::map ClTemplateWriter::get_tensors() { // Assemble GpuKernelArguments std::map tensors; - for(const auto t : _components.get_argument_tensors()) + for (const auto t : _components.get_argument_tensors()) { - tensors.emplace( - t->id(), - GpuKernelArgument{ *t, _vtable.get_variable(t).kernel_argument_info }); + tensors.emplace(t->id(), GpuKernelArgument{*t, _vtable.get_variable(t).kernel_argument_info}); } return tensors; } @@ -141,22 +139,24 @@ std::string ClTemplateWriter::write_code() std::vector component_codes{}; // vector because order matters // Pass 1: Declare all kernel variables - for(auto &component : _components) + for (auto &component : _components) { component->template_writer()->declare_variables(_vtable, _components); } // Pass 2: Generate component codes - for(auto &component : _components) + for (auto &component : _components) { const auto component_writer = component->template_writer(); auto curr_headers_list = component_writer->get_headers_list(); auto curr_additional_macros = component_writer->get_additional_macros(); auto curr_component_code = component_writer->get_component_code(_components); - const auto var_lut = component_writer->get_tag_lut(_vtable, _components); // Ideally can be merged with get_component_code once we have finer-grained code generation technique + const auto var_lut = component_writer->get_tag_lut( + _vtable, + _components); // Ideally can be merged with get_component_code once we have finer-grained code generation technique component_codes.push_back(replace_tags(curr_component_code, var_lut)); headers_list.insert(curr_headers_list.begin(), curr_headers_list.end()); - if(!additional_macros.empty()) // Some components might not have any + if (!additional_macros.empty()) // Some components might not have any { additional_macros.insert(replace_tags(curr_additional_macros, var_lut)); } @@ -165,7 +165,7 @@ std::string ClTemplateWriter::write_code() // Step 3: Assemble the data gathered by traversing the graph into the string "code" std::string 
code = ""; - for(auto &header : headers_list) + for (auto &header : headers_list) { #if defined(EMBEDDED_KERNELS) code += CLKernelLibrary::get().get_program(header).first; @@ -174,16 +174,14 @@ std::string ClTemplateWriter::write_code() #endif // defined(EMBEDDED_KERNELS) } - for(auto ¯os : additional_macros) + for (auto ¯os : additional_macros) { code += macros; } auto arguments = _components.get_argument_tensors(); - std::sort(arguments.begin(), arguments.end(), [](const ITensorInfo * l, const ITensorInfo * r) - { - return l->id() < r->id(); - }); + std::sort(arguments.begin(), arguments.end(), + [](const ITensorInfo *l, const ITensorInfo *r) { return l->id() < r->id(); }); code += write_kernel_signature(_vtable.get_variable_list(arguments)); code += "\n{\n\n"; @@ -198,7 +196,7 @@ std::string ClTemplateWriter::write_code() tiles_ss << " //------------------ START TILE DECLARATION ---------------------\n"; - for(auto tile : tiles) + for (auto tile : tiles) { const auto var = _vtable.get_variable(tile); const auto data_type = get_cl_type_from_data_type(tile->data_type()); @@ -212,7 +210,7 @@ std::string ClTemplateWriter::write_code() code += tiles_ss.str(); } - for(const auto &component_code : component_codes) + for (const auto &component_code : component_codes) { code += component_code; code += "\n"; @@ -231,7 +229,8 @@ std::string ClTemplateWriter::write_global_section() const auto leftover_w = dst_w % tile_w; std::string code = ""; - code += std::string(" int g_ind_0 = GET_SPATIAL_IDX(0, ") + std::to_string(tile_w) + ", " + std::to_string(leftover_w) + ");\n"; + code += std::string(" int g_ind_0 = GET_SPATIAL_IDX(0, ") + std::to_string(tile_w) + ", " + + std::to_string(leftover_w) + ");\n"; code += std::string(" int g_ind_1 = GET_SPATIAL_IDX(1, ") + std::to_string(tile_h) + ", " + "0);\n"; code += std::string(" int g_ind_2 = GET_SPATIAL_IDX(2, 1, 0);\n\n"); @@ -243,7 +242,7 @@ std::string ClTemplateWriter::write_global_section() const std::string ClTemplateWriter::write_argument_declaration(const GpuKernelVariableTable::TensorVariable &var) const { std::string code; - switch(var.kernel_argument_info.type) + switch (var.kernel_argument_info.type) { case GpuKernelArgumentInfo::Type::Vector: { @@ -293,11 +292,11 @@ std::string ClTemplateWriter::write_kernel_signature(const GpuKernelVariableTabl { std::string code = "\n__kernel void " + write_kernel_name() + "("; - for(int i = 0; i < static_cast(argument_list.size()) - 1; ++i) + for (int i = 0; i < static_cast(argument_list.size()) - 1; ++i) { code += write_argument_declaration(argument_list[i]) + ","; } - if(static_cast(argument_list.size()) - 1 >= 0) + if (static_cast(argument_list.size()) - 1 >= 0) { code += write_argument_declaration(argument_list[argument_list.size() - 1]); } @@ -308,12 +307,12 @@ std::string ClTemplateWriter::write_kernel_signature(const GpuKernelVariableTabl } std::string ClTemplateWriter::write_kernel_name() const { - if(_components.empty()) + if (_components.empty()) { return "empty_kernel"; } std::string name = _components.empty() ? 
"" : _components[0]->template_writer()->get_name(); - for(size_t i = 1; i < _components.size(); ++i) + for (size_t i = 1; i < _components.size(); ++i) { name += "___"; name += _components[i]->template_writer()->get_name(); diff --git a/src/dynamic_fusion/sketch/utils/DependencyGraph.h b/src/dynamic_fusion/sketch/utils/DependencyGraph.h index c891e76d8b1d346d6366acdbcb0f4c158c101569..c157c2b21c78448272d274a6d51f4efdfbedfb02 100644 --- a/src/dynamic_fusion/sketch/utils/DependencyGraph.h +++ b/src/dynamic_fusion/sketch/utils/DependencyGraph.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_UTILS_DEPENDENCYGRAPH #include "arm_compute/core/Error.h" + #include #include #include @@ -68,12 +69,10 @@ public: OperatorId op{}; std::vector inputs{}; std::vector outputs{}; - friend bool operator==(const OpPack &opp0, const OpPack &opp1) + friend bool operator==(const OpPack &opp0, const OpPack &opp1) { - return std::make_tuple( - opp0.op, opp0.inputs, opp0.outputs) - == std::make_tuple( - opp1.op, opp1.inputs, opp1.outputs); + return std::make_tuple(opp0.op, opp0.inputs, opp0.outputs) == + std::make_tuple(opp1.op, opp1.inputs, opp1.outputs); } }; @@ -95,10 +94,13 @@ public: * @return true If the operator can be added while keeping the graph as a linear sequence * @return false Otherwise */ - bool try_add_operator_as_linear(OperatorId op, const std::vector &inputs, const std::vector &outputs, bool is_output = false) const + bool try_add_operator_as_linear(OperatorId op, + const std::vector &inputs, + const std::vector &outputs, + bool is_output = false) const { ARM_COMPUTE_UNUSED(op, is_output); - if(all_ops().empty()) + if (all_ops().empty()) { return true; } @@ -106,25 +108,25 @@ public: // If the new operator is not the first operator, at least one input tensor must be // the output tensor of the last non-output operator. All other input tensors must be // the global input of the graph (i.e. not the output of any operator). - if(_last_op_available) + if (_last_op_available) { auto use_input_from_last_op = false; - for(auto src_tensor : inputs) + for (auto src_tensor : inputs) { const auto src_ops = _adj_src_ops.find(src_tensor); - if(src_ops != _adj_src_ops.end()) + if (src_ops != _adj_src_ops.end()) { ARM_COMPUTE_ERROR_ON(src_ops->second.size() > 1); - if(!src_ops->second.empty()) + if (!src_ops->second.empty()) { const auto src_op = src_ops->second[0]; - if(src_op == _last_op) + if (src_op == _last_op) { - if(use_input_from_last_op) + if (use_input_from_last_op) { // To be safe, we also forbid using the output tensor // of the last operator twice. @@ -143,7 +145,7 @@ public: } } - if(!use_input_from_last_op) + if (!use_input_from_last_op) { // At least one input tensor must be the output tensor of the last non-output operator. return false; @@ -152,9 +154,9 @@ public: // The output tensor of the new operator must not be the input tensor of any previously // added operator. 
- for(auto dst_tensor : outputs) + for (auto dst_tensor : outputs) { - if(_adj_dst_ops.find(dst_tensor) != _adj_dst_ops.end()) + if (_adj_dst_ops.find(dst_tensor) != _adj_dst_ops.end()) { return false; } @@ -168,7 +170,10 @@ public: * INVARIANT: The list can only grow from head to tail * INVARIANT: POSTCONDITION: The graph is linear */ - void add_operator_as_linear(OperatorId op, const std::vector &inputs, const std::vector &outputs, bool is_output = false) + void add_operator_as_linear(OperatorId op, + const std::vector &inputs, + const std::vector &outputs, + bool is_output = false) { const auto success = add_operator(op, inputs, outputs, is_output); ARM_COMPUTE_UNUSED(success); @@ -183,24 +188,27 @@ public: * @param[in] outputs Output tensors to the operator * @param[in] is_output Whether this is an output operator */ - bool add_operator(OperatorId op, const std::vector &inputs, const std::vector &outputs, bool is_output = false) + bool add_operator(OperatorId op, + const std::vector &inputs, + const std::vector &outputs, + bool is_output = false) { - if(operator_exists(op)) + if (operator_exists(op)) { return false; } _adj_src_tensors[op] = {}; _adj_dst_tensors[op] = {}; - for(auto in_tensor : inputs) + for (auto in_tensor : inputs) { // Linking input tensor to operator node will never create a cycle / loop because we guarantee // each op is newly created, so every pair / edge is new link_input(op, in_tensor); } - for(auto out_tensor : outputs) + for (auto out_tensor : outputs) { // If there exists a back path from op's output tensor to op already, then linking the two will create a loop / cycle - if(path_exists_from_tensor_to_op(out_tensor, op)) + if (path_exists_from_tensor_to_op(out_tensor, op)) { remove_operator(op); return false; @@ -211,10 +219,10 @@ public: } } - if(!is_output) + if (!is_output) { _last_op_available = true; - _last_op = op; + _last_op = op; } return true; @@ -230,16 +238,16 @@ public: std::vector build_operators_sequence() const { std::vector ops_seq; - std::set done_ops; - std::set done_tensors; + std::set done_ops; + std::set done_tensors; const auto input_tensors = global_src_tensors(); - for(auto tensor : input_tensors) + for (auto tensor : input_tensors) { done_tensors.insert(tensor); - for(auto op : _adj_dst_ops.at(tensor)) + for (auto op : _adj_dst_ops.at(tensor)) { build_operators_sequence_from_op(op, ops_seq, done_ops, done_tensors); } @@ -260,10 +268,8 @@ public: friend bool operator==(const DependencyGraph &g0, const DependencyGraph &g1) { // Do not compare id allocators - return std::make_tuple( - g0._adj_src_tensors, g0._adj_dst_tensors, g0._adj_src_ops, g0._adj_dst_ops) - == std::make_tuple( - g1._adj_src_tensors, g1._adj_dst_tensors, g1._adj_src_ops, g1._adj_dst_ops); + return std::make_tuple(g0._adj_src_tensors, g0._adj_dst_tensors, g0._adj_src_ops, g0._adj_dst_ops) == + std::make_tuple(g1._adj_src_tensors, g1._adj_dst_tensors, g1._adj_src_ops, g1._adj_dst_ops); } std::vector src_ops_from_tensor(TensorId tensor) const { @@ -280,10 +286,8 @@ public: std::vector all_tensors() const { std::vector tensors{}; - std::transform(std::begin(_adj_src_ops), std::end(_adj_src_ops), std::back_inserter(tensors), [](const auto & it) - { - return it.first; - }); + std::transform(std::begin(_adj_src_ops), std::end(_adj_src_ops), std::back_inserter(tensors), + [](const auto &it) { return it.first; }); return tensors; } /** Get source tensors of the whole graph @@ -293,9 +297,9 @@ public: std::vector global_src_tensors() const { std::vector tensors; - for(auto 
tensor_src_ops : _adj_src_ops) + for (auto tensor_src_ops : _adj_src_ops) { - if(tensor_src_ops.second.empty()) + if (tensor_src_ops.second.empty()) { tensors.push_back(tensor_src_ops.first); } @@ -309,9 +313,9 @@ public: std::vector global_dst_tensors() const { std::vector tensors; - for(auto tensor_dst_ops : _adj_dst_ops) + for (auto tensor_dst_ops : _adj_dst_ops) { - if(tensor_dst_ops.second.empty()) + if (tensor_dst_ops.second.empty()) { tensors.push_back(tensor_dst_ops.first); } @@ -328,14 +332,14 @@ public: // If a tensor is used to connect the input of an operator and the output of another operator, // it is not allocated in the memory. The tensor exists as a temporary variable only. - for(auto src_tensor : _adj_src_ops) + for (auto src_tensor : _adj_src_ops) { - if(!src_tensor.second.empty()) + if (!src_tensor.second.empty()) { const auto dst_tensor = _adj_dst_ops.find(src_tensor.first); - if(dst_tensor != _adj_dst_ops.end()) + if (dst_tensor != _adj_dst_ops.end()) { - if(!dst_tensor->second.empty()) + if (!dst_tensor->second.empty()) { tensors.push_back(src_tensor.first); } @@ -354,9 +358,9 @@ public: std::vector ops{}; const auto op_list = all_ops(); - for(auto op : op_list) + for (auto op : op_list) { - if(src_ops(op).empty()) + if (src_ops(op).empty()) { ops.emplace_back(op); } @@ -368,7 +372,7 @@ private: void link_input(OperatorId op, TensorId in_tensor) { ARM_COMPUTE_ERROR_ON(!operator_exists(op)); - if(!tensor_exists(in_tensor)) + if (!tensor_exists(in_tensor)) { insert_new_tensor(in_tensor); } @@ -379,7 +383,7 @@ private: void link_output(OperatorId op, TensorId out_tensor) { ARM_COMPUTE_ERROR_ON(!operator_exists(op)); - if(!tensor_exists(out_tensor)) + if (!tensor_exists(out_tensor)) { insert_new_tensor(out_tensor); } @@ -392,7 +396,7 @@ private: { ARM_COMPUTE_ERROR_ON(!operator_exists(op)); std::vector ops{}; - for(TensorId src_tensor : src_tensors(op)) + for (TensorId src_tensor : src_tensors(op)) { ops.insert(ops.end(), std::begin(_adj_src_ops.at(src_tensor)), std::end(_adj_src_ops.at(src_tensor))); } @@ -402,7 +406,7 @@ private: { ARM_COMPUTE_ERROR_ON(!operator_exists(op)); std::vector ops{}; - for(TensorId dst_tensor : _adj_dst_tensors.at(op)) + for (TensorId dst_tensor : _adj_dst_tensors.at(op)) { ops.insert(ops.end(), std::begin(_adj_dst_ops.at(dst_tensor)), std::end(_adj_dst_ops.at(dst_tensor))); } @@ -436,10 +440,8 @@ private: std::vector all_ops() const { std::vector ops{}; - std::transform(std::begin(_adj_src_tensors), std::end(_adj_src_tensors), std::back_inserter(ops), [](const auto & it) - { - return it.first; - }); + std::transform(std::begin(_adj_src_tensors), std::end(_adj_src_tensors), std::back_inserter(ops), + [](const auto &it) { return it.first; }); return ops; } /** Remove an operator from graph. 
@@ -448,25 +450,21 @@ private: */ void remove_operator(OperatorId op) { - for(auto src_tensor : _adj_src_tensors.at(op)) + for (auto src_tensor : _adj_src_tensors.at(op)) { auto &dst_ops = _adj_dst_ops.at(src_tensor); - dst_ops.erase( - std::remove(std::begin(dst_ops), std::end(dst_ops), op), - std::end(dst_ops)); + dst_ops.erase(std::remove(std::begin(dst_ops), std::end(dst_ops), op), std::end(dst_ops)); } - for(auto dst_tensor : _adj_dst_tensors.at(op)) + for (auto dst_tensor : _adj_dst_tensors.at(op)) { auto &src_ops = _adj_src_ops.at(dst_tensor); - src_ops.erase( - std::remove(std::begin(src_ops), std::end(src_ops), op), - std::end(src_ops)); + src_ops.erase(std::remove(std::begin(src_ops), std::end(src_ops), op), std::end(src_ops)); } // Remove any isolated tensors // An isolated tensor is one where both its _adj_src_ops and _adj_dst_ops are empty - for(auto t : all_tensors()) + for (auto t : all_tensors()) { - if(_adj_src_ops.at(t).empty() && _adj_dst_ops.at(t).empty()) + if (_adj_src_ops.at(t).empty() && _adj_dst_ops.at(t).empty()) { _adj_src_ops.erase(t); _adj_dst_ops.erase(t); @@ -486,11 +484,12 @@ private: } bool operator_exists(OperatorId op) const { - return _adj_src_tensors.find(op) != _adj_src_tensors.end() && _adj_dst_tensors.find(op) != _adj_dst_tensors.end(); + return _adj_src_tensors.find(op) != _adj_src_tensors.end() && + _adj_dst_tensors.find(op) != _adj_dst_tensors.end(); } bool is_src_tensor_of(OperatorId op, TensorId tensor) const { - if(!operator_exists(op) || !tensor_exists(tensor)) + if (!operator_exists(op) || !tensor_exists(tensor)) { return false; } @@ -499,7 +498,7 @@ private: } bool is_dst_tensor_of(OperatorId op, TensorId tensor) const { - if(!operator_exists(op) || !tensor_exists(tensor)) + if (!operator_exists(op) || !tensor_exists(tensor)) { return false; } @@ -525,9 +524,9 @@ private: std::vector ops{}; const auto op_list = all_ops(); - for(auto op : op_list) + for (auto op : op_list) { - if(is_dst_op(op)) + if (is_dst_op(op)) { ops.emplace_back(op); } @@ -536,13 +535,13 @@ private: } bool path_exists_from_tensor_to_op(TensorId src_tensor, OperatorId dst_op) const { - if(!tensor_exists(src_tensor) || !operator_exists(dst_op)) + if (!tensor_exists(src_tensor) || !operator_exists(dst_op)) { return false; } - for(auto child_op : dst_ops_from_tensor(src_tensor)) + for (auto child_op : dst_ops_from_tensor(src_tensor)) { - if(path_exists_from_op_to_op(child_op, dst_op)) + if (path_exists_from_op_to_op(child_op, dst_op)) { return true; } @@ -552,21 +551,21 @@ private: bool path_exists_from_op_to_op(OperatorId src_op, OperatorId dst_op) const { - if(!operator_exists(src_op) || !operator_exists(dst_op)) + if (!operator_exists(src_op) || !operator_exists(dst_op)) { return false; } - if(src_op == dst_op) + if (src_op == dst_op) { return true; } - if(is_in(src_op, get_dst_ops())) + if (is_in(src_op, get_dst_ops())) { return false; } - for(auto child_tensor : dst_tensors(src_op)) + for (auto child_tensor : dst_tensors(src_op)) { - if(path_exists_from_tensor_to_op(child_tensor, dst_op)) + if (path_exists_from_tensor_to_op(child_tensor, dst_op)) { return true; } @@ -574,16 +573,15 @@ private: return false; } - void build_operators_sequence_from_op( - Id op, - std::vector &ops_seq, - std::set &done_ops, - std::set &done_tensors) const + void build_operators_sequence_from_op(Id op, + std::vector &ops_seq, + std::set &done_ops, + std::set &done_tensors) const { - while(true) + while (true) { // If the operator has been added to the sequence, ignore it. 
- if(done_ops.find(op) != done_ops.end()) + if (done_ops.find(op) != done_ops.end()) { return; } @@ -593,9 +591,9 @@ private: // is added to the sequence. const auto src_tensors = _adj_src_tensors.at(op); - for(auto src : src_tensors) + for (auto src : src_tensors) { - if(done_tensors.find(src) == done_tensors.end()) + if (done_tensors.find(src) == done_tensors.end()) { return; } @@ -606,24 +604,24 @@ private: done_ops.insert(op); - OpPack pack{ op, src_tensors, dst_tensors }; + OpPack pack{op, src_tensors, dst_tensors}; ops_seq.push_back(pack); done_tensors.insert(dst_tensors.begin(), dst_tensors.end()); // Visit all the sink operators. // Call this function recursively unless there is only one sink. - if(dst_tensors.size() == 1 && _adj_dst_ops.at(dst_tensors[0]).size() == 1) + if (dst_tensors.size() == 1 && _adj_dst_ops.at(dst_tensors[0]).size() == 1) { op = _adj_dst_ops.at(dst_tensors[0])[0]; } else { - for(auto dst_tensor : dst_tensors) + for (auto dst_tensor : dst_tensors) { const auto dst_ops = _adj_dst_ops.at(dst_tensor); - for(auto dst_op : dst_ops) + for (auto dst_op : dst_ops) { build_operators_sequence_from_op(dst_op, ops_seq, done_ops, done_tensors); } @@ -640,8 +638,8 @@ private: AdjList _adj_src_ops{}; AdjList _adj_dst_ops{}; - bool _last_op_available{ false }; - OperatorId _last_op{ 0 }; + bool _last_op_available{false}; + OperatorId _last_op{0}; }; } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/utils/Utils.h b/src/dynamic_fusion/utils/Utils.h index c9fc2c610f5097fcb0ec8d1c42c34dbd21790ad7..3f4a2edd0399e293fa35f7ec3e0a4773ea6a90f1 100644 --- a/src/dynamic_fusion/utils/Utils.h +++ b/src/dynamic_fusion/utils/Utils.h @@ -63,17 +63,21 @@ inline bool is_invalid_tensor(const ITensorInfo *tensor_info) /** Inline function to convert @ref Pool2dAttributes to PoolingLayerInfo */ -inline PoolingLayerInfo convert_pool_attr_to_pool_info(const Pool2dAttributes &pool_attr, bool mixed_precision = false, DataLayout data_layout = DataLayout::NHWC) +inline PoolingLayerInfo convert_pool_attr_to_pool_info(const Pool2dAttributes &pool_attr, + bool mixed_precision = false, + DataLayout data_layout = DataLayout::NHWC) { // Create PadStrideInfo const Size2D stride = pool_attr.stride(); const Padding2D padding = pool_attr.pad(); - const PadStrideInfo pad_stride(stride.x(), stride.y(), padding.left, padding.top, arm_compute::DimensionRoundingType::FLOOR); + const PadStrideInfo pad_stride(stride.x(), stride.y(), padding.left, padding.top, + arm_compute::DimensionRoundingType::FLOOR); - return PoolingLayerInfo(pool_attr.pool_type(), pool_attr.pool_size(), data_layout, pad_stride, pool_attr.exclude_padding(), mixed_precision); -} -} -} + return PoolingLayerInfo(pool_attr.pool_type(), pool_attr.pool_size(), data_layout, pad_stride, + pool_attr.exclude_padding(), mixed_precision); } +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute #endif /* SRC_DYNAMIC_FUSION_UTILS_UTILS */ diff --git a/src/gpu/cl/ClContext.cpp b/src/gpu/cl/ClContext.cpp index d8ef18e62e3acb80719b35c30e9779c7f75df4d0..611c1cb5018bceffe4f53bd28c038eaf20827289 100644 --- a/src/gpu/cl/ClContext.cpp +++ b/src/gpu/cl/ClContext.cpp @@ -23,11 +23,11 @@ */ #include "src/gpu/cl/ClContext.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" + #include "src/gpu/cl/ClQueue.h" #include "src/gpu/cl/ClTensor.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" - namespace arm_compute { namespace gpu @@ -41,7 +41,7 @@ mlgo::MLGOHeuristics populate_mlgo(const char *filename) bool status = false; 
mlgo::MLGOHeuristics heuristics; - if(filename != nullptr) + if (filename != nullptr) { status = heuristics.reload_from_file(filename); } @@ -50,12 +50,9 @@ mlgo::MLGOHeuristics populate_mlgo(const char *filename) } // namespace ClContext::ClContext(const AclContextOptions *options) - : IContext(Target::GpuOcl), - _mlgo_heuristics(), - _cl_ctx(), - _cl_dev() + : IContext(Target::GpuOcl), _mlgo_heuristics(), _cl_ctx(), _cl_dev() { - if(options != nullptr) + if (options != nullptr) { _mlgo_heuristics = populate_mlgo(options->kernel_config_file); } @@ -80,7 +77,7 @@ const mlgo::MLGOHeuristics &ClContext::mlgo() const bool ClContext::set_cl_ctx(::cl::Context ctx) { - if(this->refcount() == 0) + if (this->refcount() == 0) { _cl_ctx = ctx; CLScheduler::get().set_context(ctx); @@ -92,7 +89,7 @@ bool ClContext::set_cl_ctx(::cl::Context ctx) ITensorV2 *ClContext::create_tensor(const AclTensorDescriptor &desc, bool allocate) { ClTensor *tensor = new ClTensor(this, desc); - if(tensor != nullptr && allocate) + if (tensor != nullptr && allocate) { tensor->allocate(); } diff --git a/src/gpu/cl/ClContext.h b/src/gpu/cl/ClContext.h index a50b03124bdf8ea27e996e6ca8bd92e52805d511..2c67ccf4d2475591117624bd2d9e42056921ceb0 100644 --- a/src/gpu/cl/ClContext.h +++ b/src/gpu/cl/ClContext.h @@ -24,11 +24,11 @@ #ifndef SRC_GPU_CLCONTEXT_H #define SRC_GPU_CLCONTEXT_H +#include "arm_compute/core/CL/OpenCL.h" + #include "src/common/IContext.h" #include "src/runtime/CL/mlgo/MLGOHeuristics.h" -#include "arm_compute/core/CL/OpenCL.h" - namespace arm_compute { namespace gpu @@ -74,9 +74,9 @@ public: bool set_cl_ctx(::cl::Context ctx); // Inherrited methods overridden - ITensorV2 *create_tensor(const AclTensorDescriptor &desc, bool allocate) override; - IQueue *create_queue(const AclQueueOptions *options) override; - std::tuple create_activation(const AclTensorDescriptor &src, + ITensorV2 *create_tensor(const AclTensorDescriptor &desc, bool allocate) override; + IQueue *create_queue(const AclQueueOptions *options) override; + std::tuple create_activation(const AclTensorDescriptor &src, const AclTensorDescriptor &dst, const AclActivationDescriptor &act, bool is_validate) override; @@ -90,4 +90,4 @@ private: } // namespace gpu } // namespace arm_compute -#endif /* SRC_GPU_CLCONTEXT_H */ \ No newline at end of file +#endif /* SRC_GPU_CLCONTEXT_H */ diff --git a/src/gpu/cl/ClKernelLibrary.cpp b/src/gpu/cl/ClKernelLibrary.cpp index de2e9f9742f2bfbd91f56b5cdf6f25477bca18aa..4544a66e3975159242c724685e92cacf2912055d 100644 --- a/src/gpu/cl/ClKernelLibrary.cpp +++ b/src/gpu/cl/ClKernelLibrary.cpp @@ -37,24 +37,16 @@ namespace { /* Decoding table */ -constexpr std::array b64_invtab = -{ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 62, 0, 0, 0, 63, - 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 0, 0, 0, 0, - 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, - 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
+constexpr std::array b64_invtab = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 62, 0, 0, 0, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 0, 0, 0, 0, 0, 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; /** Decode a base64 encoded string @@ -68,13 +60,13 @@ std::string decode_base64(const std::string &str) constexpr const char pad_char = '='; // Handle empty string - if(str.empty()) + if (str.empty()) { return {}; } // Base64 encoded string has size multiple of 4 - if(str.length() % 4) + if (str.length() % 4) { return {}; } @@ -92,7 +84,7 @@ std::string decode_base64(const std::string &str) // Block decoding function (exclude padding) int c = 0; const int end = str_len - 4 - padding; - for(; c <= end; c += 4) + for (; c <= end; c += 4) { const int byte0 = b64_invtab[str[c]]; const int byte1 = b64_invtab[str[c + 1]]; @@ -105,7 +97,7 @@ std::string decode_base64(const std::string &str) } // Last step that might contain padding symbols - if(padding == 1) + if (padding == 1) { const int byte0 = b64_invtab[str[c]]; const int byte1 = b64_invtab[str[c + 1]]; @@ -114,7 +106,7 @@ std::string decode_base64(const std::string &str) dec_b64.push_back((byte0 << 2) | (byte1 >> 4)); dec_b64.push_back((byte1 << 4) | (byte2 >> 2)); } - else if(padding == 2) + else if (padding == 2) { const int byte0 = b64_invtab[str[c]]; const int byte1 = b64_invtab[str[c + 1]]; @@ -135,7 +127,7 @@ std::string decompress_zlib(const std::string &str) { // Create and initialize decompression stream z_stream ds{}; - if(inflateInit(&ds) != Z_OK) + if (inflateInit(&ds) != Z_OK) { return std::string(); } @@ -152,16 +144,15 @@ std::string decompress_zlib(const std::string &str) ds.next_out = reinterpret_cast(roll_buff); status = inflate(&ds, 0); - if(inflated_str.size() < ds.total_out) + if (inflated_str.size() < ds.total_out) { inflated_str.append(roll_buff, ds.total_out - inflated_str.size()); } - } - while(status == Z_OK); + } while (status == Z_OK); // Finalize decompression stream inflateEnd(&ds); - if(status != Z_STREAM_END) + if (status != Z_STREAM_END) { return std::string(); } @@ -175,328 +166,317 @@ namespace arm_compute { namespace opencl { -const std::map ClKernelLibrary::_kernel_program_map = -{ +const std::map ClKernelLibrary::_kernel_program_map = { // Common Kernels - { "activation_layer", "common/activation_layer.cl" }, - { "activation_layer_quant", "common/activation_layer_quant.cl" }, - { "activation_layer_quant_f32", "common/activation_layer_quant.cl" }, - { "arg_min_max_x", "common/arg_min_max.cl" }, - { "arg_min_max_y", "common/arg_min_max.cl" }, - { "arg_min_max_z", "common/arg_min_max.cl" }, - { "arg_min_max_w", "common/arg_min_max.cl" }, - { "bitwise_or", "common/bitwise_op.cl" }, - { "bitwise_and", "common/bitwise_op.cl" }, - { "bitwise_xor", "common/bitwise_op.cl" }, - { "bitwise_not", "common/bitwise_op.cl" }, - { "bounding_box_transform", 
"common/bounding_box_transform.cl" }, - { "bounding_box_transform_quantized", "common/bounding_box_transform_quantized.cl" }, - { "compare_equal", "common/comparisons.cl" }, - { "compare_equal_quantized", "common/comparisons.cl" }, - { "compare_notequal", "common/comparisons.cl" }, - { "compare_notequal_quantized", "common/comparisons.cl" }, - { "compare_greater", "common/comparisons.cl" }, - { "compare_greater_quantized", "common/comparisons.cl" }, - { "compare_greaterequal", "common/comparisons.cl" }, - { "compare_greaterequal_quantized", "common/comparisons.cl" }, - { "compare_less", "common/comparisons.cl" }, - { "compare_less_quantized", "common/comparisons.cl" }, - { "compare_lessequal", "common/comparisons.cl" }, - { "compare_lessequal_quantized", "common/comparisons.cl" }, - { "concatenate", "common/concatenate.cl" }, - { "concatenate_width", "common/concatenate.cl" }, - { "concatenate_height", "common/concatenate.cl" }, - { "concatenate_width_x2", "common/concatenate.cl" }, - { "concatenate_width_x4", "common/concatenate.cl" }, - { "col2im", "common/col2im.cl" }, - { "cast_down", "common/cast.cl" }, - { "cast_up", "common/cast.cl" }, - { "convert_fc_weights", "common/convert_fc_weights.cl" }, - { "copy_tensor", "common/copy_tensor.cl" }, - { "crop_tensor", "common/crop_tensor.cl" }, - { "deconvolution_reshape", "common/deconvolution_layer.cl" }, - { "deconvolution_upsample", "common/deconvolution_layer.cl" }, - { "dequantization_layer", "common/dequantization_layer.cl" }, - { "elementwise_operation_ADD", "common/elementwise_operation.cl" }, - { "elementwise_operation_SUB", "common/elementwise_operation.cl" }, - { "elementwise_operation_MAX", "common/elementwise_operation.cl" }, - { "elementwise_operation_MIN", "common/elementwise_operation.cl" }, - { "elementwise_operation_DIV", "common/elementwise_operation.cl" }, - { "elementwise_operation_SQUARED_DIFF", "common/elementwise_operation.cl" }, - { "elementwise_operation_POWER", "common/elementwise_operation.cl" }, - { "elementwise_operation_PRELU", "common/elementwise_operation.cl" }, - { "elementwise_operation_AND", "common/elementwise_operation.cl" }, - { "elementwise_operation_OR", "common/elementwise_operation.cl" }, - { "elementwise_operation_ADD_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_SUB_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_MAX_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_MIN_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_DIV_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_SQUARED_DIFF_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_PRELU_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_unary", "common/elementwise_unary.cl" }, - { "elementwise_unary_quantized", "common/elementwise_unary_quantized.cl" }, - { "fft_digit_reverse_axis_0", "common/fft_digit_reverse.cl" }, - { "fft_digit_reverse_axis_1", "common/fft_digit_reverse.cl" }, - { "fft_radix_2_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_2_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_2_axis_0", "common/fft.cl" }, - { "fft_radix_2_axis_1", "common/fft.cl" }, - { "fft_radix_3_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_3_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_3_axis_0", "common/fft.cl" }, - { "fft_radix_3_axis_1", "common/fft.cl" }, - { 
"fft_radix_4_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_4_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_4_axis_0", "common/fft.cl" }, - { "fft_radix_4_axis_1", "common/fft.cl" }, - { "fft_radix_5_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_5_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_5_axis_0", "common/fft.cl" }, - { "fft_radix_5_axis_1", "common/fft.cl" }, - { "fft_radix_7_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_7_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_7_axis_0", "common/fft.cl" }, - { "fft_radix_7_axis_1", "common/fft.cl" }, - { "fft_radix_8_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_8_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_8_axis_0", "common/fft.cl" }, - { "fft_radix_8_axis_1", "common/fft.cl" }, - { "fft_scale_conj", "common/fft_scale.cl" }, - { "fill_image_borders_constant", "common/fill_border.cl" }, - { "fill_image_borders_replicate", "common/fill_border.cl" }, - { "floor_layer", "common/floor.cl" }, - { "fuse_batchnormalization_layer", "common/batchnormalization_layer.cl" }, - { "gather", "common/gather.cl" }, - { "gemm_ma_f16", "common/gemm.cl" }, - { "gemm_ma_f32", "common/gemm.cl" }, - { "gemm_mv", "common/gemv.cl" }, - { "gemm_mv_quantized", "common/gemv.cl" }, - { "gemm_mm_native", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_nt_mmul", "common/gemm_reshaped_only_rhs_mmul.cl" }, - { "gemm_mm_reshaped_only_rhs_nt_mmul_texture", "common/gemm_reshaped_only_rhs_mmul.cl" }, - { "gemm_mm_native_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl" }, - { "gemm_mm_reshaped_lhs_nt_rhs_t", "common/gemm.cl" }, - { "gemm_mm_reshaped_lhs_nt_rhs_t_texture", "common/gemm.cl" }, - { "gemm_mm_reshaped_lhs_t_rhs_nt", "common/gemm.cl" }, - { "gemm_mm_reshaped_lhs_t_rhs_nt_texture", "common/gemm.cl" }, - { "gemm_mm_reshaped_lhs_nt_rhs_t_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" }, - { "gemm_mm_reshaped_lhs_nt_rhs_t_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" }, - { "gemm_mm_reshaped_lhs_t_rhs_nt_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" }, - { "gemm_mm_reshaped_lhs_t_rhs_nt_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" }, - { "gemm_mm_reshaped_only_rhs_nt", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_nt_texture", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_t", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_t_texture", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_nt_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" }, - { "gemm_mm_reshaped_only_rhs_nt_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" }, - { "gemm_mm_reshaped_only_rhs_t_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" }, - { "gemm_mm_reshaped_only_rhs_t_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" }, - { "gemm_lc_vm_f32", "common/gemm.cl" }, - { "gemm_reshape_lhs_matrix_nt", "common/gemm_utils.cl" }, - { "gemm_reshape_lhs_matrix_t", "common/gemm_utils.cl" }, - { "gemm_reshape_rhs_matrix_nt", 
"common/gemm_utils.cl" }, - { "gemm_reshape_rhs_matrix_t", "common/gemm_utils.cl" }, - { "gemmlowp_matrix_a_reduction", "common/gemmlowp.cl" }, - { "gemmlowp_matrix_a_reduction_dot8", "common/gemmlowp.cl" }, - { "gemmlowp_matrix_b_reduction", "common/gemmlowp.cl" }, - { "gemmlowp_mm_native", "common/gemmlowp.cl" }, - { "gemmlowp_mm_reshaped_lhs_nt_rhs_t", "common/gemmlowp.cl" }, - { "gemmlowp_mm_reshaped_only_rhs_t", "common/gemmlowp.cl" }, - { "gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint", "common/gemmlowp.cl" }, - { "gemmlowp_mm_reshaped_only_rhs_mmul", "common/gemmlowp_reshaped_only_rhs_mmul.cl" }, - { "gemmlowp_offset_contribution", "common/gemmlowp.cl" }, - { "gemmlowp_offset_contribution_quantize_down", "common/gemmlowp.cl" }, - { "gemmlowp_offset_contribution_quantize_down_fixedpoint", "common/gemmlowp.cl" }, - { "gemmlowp_output_stage_quantize_down", "common/gemmlowp.cl" }, - { "gemmlowp_output_stage_quantize_down_fixedpoint", "common/gemmlowp.cl" }, - { "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16", "common/gemmlowp.cl" }, - { "gemmlowp_output_stage_quantize_down_float", "common/gemmlowp.cl" }, - { "generate_proposals_compute_all_anchors", "common/generate_proposals.cl" }, - { "generate_proposals_compute_all_anchors_quantized", "common/generate_proposals_quantized.cl" }, - { "instance_normalization", "common/instance_normalization.cl" }, - { "compute_mean_var", "common/instance_normalization.cl" }, - { "l2_normalize_x", "common/l2_normalize.cl" }, - { "l2_normalize_y", "common/l2_normalize.cl" }, - { "l2_normalize_z", "common/l2_normalize.cl" }, - { "mat_mul_native_mmul_nt_nt", "common/mat_mul_mmul.cl" }, - { "mat_mul_native_mmul_t_nt", "common/mat_mul_mmul.cl" }, - { "mat_mul_native_mmul_nt_t", "common/mat_mul_mmul.cl" }, - { "mat_mul_native_mmul_t_t", "common/mat_mul_mmul.cl" }, - { "mat_mul_native_nt_nt", "common/mat_mul.cl" }, - { "mat_mul_native_nt_t", "common/mat_mul.cl" }, - { "mat_mul_native_t_nt", "common/mat_mul.cl" }, - { "mat_mul_native_t_t", "common/mat_mul.cl" }, - { "mat_mul_native_quantized_nt_nt", "common/mat_mul_quantized.cl" }, - { "mat_mul_native_quantized_nt_t", "common/mat_mul_quantized.cl" }, - { "mat_mul_native_quantized_t_nt", "common/mat_mul_quantized.cl" }, - { "mat_mul_native_quantized_t_t", "common/mat_mul_quantized.cl" }, - { "max_unpooling_layer_2", "common/unpooling_layer.cl" }, - { "mean_stddev_normalization", "common/mean_stddev_normalization.cl" }, - { "memset", "common/memset.cl" }, - { "minmax_layer", "common/minmax_layer.cl" }, - { "non_max_suppression", "common/nonmax.cl" }, - { "pad_layer_constant", "common/pad_layer.cl" }, - { "pad_layer_symmetric_reflect", "common/pad_layer.cl" }, - { "permute", "common/permute.cl" }, - { "pixelwise_mul_complex", "common/pixelwise_mul_float.cl" }, - { "pixelwise_mul_float", "common/pixelwise_mul_float.cl" }, - { "pixelwise_mul_int", "common/pixelwise_mul_int.cl" }, - { "pixelwise_mul_quantized", "common/pixelwise_mul_int.cl" }, - { "qlstm_layer_normalization", "common/qlstm_layer_normalization.cl" }, - { "quantization_layer", "common/quantization_layer.cl" }, - { "range", "common/range.cl" }, - { "range_quantized", "common/range.cl" }, - { "reduction_operation_x", "common/reduction_operation.cl" }, - { "reduction_operation_non_parallel_x", "common/reduction_operation.cl" }, - { "reduction_operation_y", "common/reduction_operation.cl" }, - { "reduction_operation_z", "common/reduction_operation.cl" }, - { "reduction_operation_w", "common/reduction_operation.cl" }, - { 
"reshape_layer", "common/reshape_layer.cl" }, - { "reshape_to_columns", "common/convolution_layer.cl" }, - { "reverse", "common/reverse.cl" }, - { "roi_align_layer", "common/roi_align_layer.cl" }, - { "roi_align_layer_quantized", "common/roi_align_layer_quantized.cl" }, - { "roi_pooling_layer", "common/roi_pooling_layer.cl" }, - { "select_same_rank", "common/select.cl" }, - { "select_different_rank_2", "common/select.cl" }, - { "select_different_rank_n", "common/select.cl" }, - { "softmax_layer_norm", "common/softmax_layer.cl" }, - { "softmax_layer_norm_quantized", "common/softmax_layer_quantized.cl" }, - { "softmax_layer_max_shift_exp_sum_quantized_serial", "common/softmax_layer_quantized.cl" }, - { "softmax_layer_max_shift_exp_sum_quantized_parallel", "common/softmax_layer_quantized.cl" }, - { "softmax_layer_max_shift_exp_sum_serial", "common/softmax_layer.cl" }, - { "softmax_layer_max_shift_exp_sum_parallel", "common/softmax_layer.cl" }, - { "stack_layer", "common/stack_layer.cl" }, - { "strided_slice", "common/slice_ops.cl" }, - { "tile", "common/tile.cl" }, - { "transpose", "common/transpose.cl" }, + {"activation_layer", "common/activation_layer.cl"}, + {"activation_layer_quant", "common/activation_layer_quant.cl"}, + {"activation_layer_quant_f32", "common/activation_layer_quant.cl"}, + {"arg_min_max_x", "common/arg_min_max.cl"}, + {"arg_min_max_y", "common/arg_min_max.cl"}, + {"arg_min_max_z", "common/arg_min_max.cl"}, + {"arg_min_max_w", "common/arg_min_max.cl"}, + {"bitwise_or", "common/bitwise_op.cl"}, + {"bitwise_and", "common/bitwise_op.cl"}, + {"bitwise_xor", "common/bitwise_op.cl"}, + {"bitwise_not", "common/bitwise_op.cl"}, + {"bounding_box_transform", "common/bounding_box_transform.cl"}, + {"bounding_box_transform_quantized", "common/bounding_box_transform_quantized.cl"}, + {"compare_equal", "common/comparisons.cl"}, + {"compare_equal_quantized", "common/comparisons.cl"}, + {"compare_notequal", "common/comparisons.cl"}, + {"compare_notequal_quantized", "common/comparisons.cl"}, + {"compare_greater", "common/comparisons.cl"}, + {"compare_greater_quantized", "common/comparisons.cl"}, + {"compare_greaterequal", "common/comparisons.cl"}, + {"compare_greaterequal_quantized", "common/comparisons.cl"}, + {"compare_less", "common/comparisons.cl"}, + {"compare_less_quantized", "common/comparisons.cl"}, + {"compare_lessequal", "common/comparisons.cl"}, + {"compare_lessequal_quantized", "common/comparisons.cl"}, + {"concatenate", "common/concatenate.cl"}, + {"concatenate_width", "common/concatenate.cl"}, + {"concatenate_height", "common/concatenate.cl"}, + {"concatenate_width_x2", "common/concatenate.cl"}, + {"concatenate_width_x4", "common/concatenate.cl"}, + {"col2im", "common/col2im.cl"}, + {"cast_down", "common/cast.cl"}, + {"cast_up", "common/cast.cl"}, + {"convert_fc_weights", "common/convert_fc_weights.cl"}, + {"copy_tensor", "common/copy_tensor.cl"}, + {"crop_tensor", "common/crop_tensor.cl"}, + {"deconvolution_reshape", "common/deconvolution_layer.cl"}, + {"deconvolution_upsample", "common/deconvolution_layer.cl"}, + {"dequantization_layer", "common/dequantization_layer.cl"}, + {"elementwise_operation_ADD", "common/elementwise_operation.cl"}, + {"elementwise_operation_SUB", "common/elementwise_operation.cl"}, + {"elementwise_operation_MAX", "common/elementwise_operation.cl"}, + {"elementwise_operation_MIN", "common/elementwise_operation.cl"}, + {"elementwise_operation_DIV", "common/elementwise_operation.cl"}, + {"elementwise_operation_SQUARED_DIFF", 
"common/elementwise_operation.cl"}, + {"elementwise_operation_POWER", "common/elementwise_operation.cl"}, + {"elementwise_operation_PRELU", "common/elementwise_operation.cl"}, + {"elementwise_operation_AND", "common/elementwise_operation.cl"}, + {"elementwise_operation_OR", "common/elementwise_operation.cl"}, + {"elementwise_operation_ADD_quantized", "common/elementwise_operation_quantized.cl"}, + {"elementwise_operation_SUB_quantized", "common/elementwise_operation_quantized.cl"}, + {"elementwise_operation_MAX_quantized", "common/elementwise_operation_quantized.cl"}, + {"elementwise_operation_MIN_quantized", "common/elementwise_operation_quantized.cl"}, + {"elementwise_operation_DIV_quantized", "common/elementwise_operation_quantized.cl"}, + {"elementwise_operation_SQUARED_DIFF_quantized", "common/elementwise_operation_quantized.cl"}, + {"elementwise_operation_PRELU_quantized", "common/elementwise_operation_quantized.cl"}, + {"elementwise_unary", "common/elementwise_unary.cl"}, + {"elementwise_unary_quantized", "common/elementwise_unary_quantized.cl"}, + {"fft_digit_reverse_axis_0", "common/fft_digit_reverse.cl"}, + {"fft_digit_reverse_axis_1", "common/fft_digit_reverse.cl"}, + {"fft_radix_2_first_stage_axis_0", "common/fft.cl"}, + {"fft_radix_2_first_stage_axis_1", "common/fft.cl"}, + {"fft_radix_2_axis_0", "common/fft.cl"}, + {"fft_radix_2_axis_1", "common/fft.cl"}, + {"fft_radix_3_first_stage_axis_0", "common/fft.cl"}, + {"fft_radix_3_first_stage_axis_1", "common/fft.cl"}, + {"fft_radix_3_axis_0", "common/fft.cl"}, + {"fft_radix_3_axis_1", "common/fft.cl"}, + {"fft_radix_4_first_stage_axis_0", "common/fft.cl"}, + {"fft_radix_4_first_stage_axis_1", "common/fft.cl"}, + {"fft_radix_4_axis_0", "common/fft.cl"}, + {"fft_radix_4_axis_1", "common/fft.cl"}, + {"fft_radix_5_first_stage_axis_0", "common/fft.cl"}, + {"fft_radix_5_first_stage_axis_1", "common/fft.cl"}, + {"fft_radix_5_axis_0", "common/fft.cl"}, + {"fft_radix_5_axis_1", "common/fft.cl"}, + {"fft_radix_7_first_stage_axis_0", "common/fft.cl"}, + {"fft_radix_7_first_stage_axis_1", "common/fft.cl"}, + {"fft_radix_7_axis_0", "common/fft.cl"}, + {"fft_radix_7_axis_1", "common/fft.cl"}, + {"fft_radix_8_first_stage_axis_0", "common/fft.cl"}, + {"fft_radix_8_first_stage_axis_1", "common/fft.cl"}, + {"fft_radix_8_axis_0", "common/fft.cl"}, + {"fft_radix_8_axis_1", "common/fft.cl"}, + {"fft_scale_conj", "common/fft_scale.cl"}, + {"fill_image_borders_constant", "common/fill_border.cl"}, + {"fill_image_borders_replicate", "common/fill_border.cl"}, + {"floor_layer", "common/floor.cl"}, + {"fuse_batchnormalization_layer", "common/batchnormalization_layer.cl"}, + {"gather", "common/gather.cl"}, + {"gemm_ma_f16", "common/gemm.cl"}, + {"gemm_ma_f32", "common/gemm.cl"}, + {"gemm_mv", "common/gemv.cl"}, + {"gemm_mv_quantized", "common/gemv.cl"}, + {"gemm_mm_native", "common/gemm.cl"}, + {"gemm_mm_reshaped_only_rhs_nt_mmul", "common/gemm_reshaped_only_rhs_mmul.cl"}, + {"gemm_mm_reshaped_only_rhs_nt_mmul_texture", "common/gemm_reshaped_only_rhs_mmul.cl"}, + {"gemm_mm_reshaped_lhs_nt_rhs_t", "common/gemm.cl"}, + {"gemm_mm_reshaped_lhs_nt_rhs_t_texture", "common/gemm.cl"}, + {"gemm_mm_reshaped_lhs_t_rhs_nt", "common/gemm.cl"}, + {"gemm_mm_reshaped_lhs_t_rhs_nt_texture", "common/gemm.cl"}, + {"gemm_mm_reshaped_only_rhs_nt", "common/gemm.cl"}, + {"gemm_mm_reshaped_only_rhs_nt_texture", "common/gemm.cl"}, + {"gemm_mm_reshaped_only_rhs_t", "common/gemm.cl"}, + {"gemm_mm_reshaped_only_rhs_t_texture", "common/gemm.cl"}, + {"gemm_lc_vm_f32", "common/gemm.cl"}, + 
{"gemm_reshape_lhs_matrix_nt", "common/gemm_utils.cl"}, + {"gemm_reshape_lhs_matrix_t", "common/gemm_utils.cl"}, + {"gemm_reshape_rhs_matrix_nt", "common/gemm_utils.cl"}, + {"gemm_reshape_rhs_matrix_t", "common/gemm_utils.cl"}, + {"gemmlowp_matrix_a_reduction", "common/gemmlowp.cl"}, + {"gemmlowp_matrix_a_reduction_dot8", "common/gemmlowp.cl"}, + {"gemmlowp_matrix_b_reduction", "common/gemmlowp.cl"}, + {"gemmlowp_mm_native", "common/gemmlowp.cl"}, + {"gemmlowp_mm_reshaped_lhs_nt_rhs_t", "common/gemmlowp.cl"}, + {"gemmlowp_mm_reshaped_only_rhs_t", "common/gemmlowp.cl"}, + {"gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint", "common/gemmlowp.cl"}, + {"gemmlowp_mm_reshaped_only_rhs_mmul", "common/gemmlowp_reshaped_only_rhs_mmul.cl"}, + {"gemmlowp_offset_contribution", "common/gemmlowp.cl"}, + {"gemmlowp_offset_contribution_quantize_down", "common/gemmlowp.cl"}, + {"gemmlowp_offset_contribution_quantize_down_fixedpoint", "common/gemmlowp.cl"}, + {"gemmlowp_output_stage_quantize_down", "common/gemmlowp.cl"}, + {"gemmlowp_output_stage_quantize_down_fixedpoint", "common/gemmlowp.cl"}, + {"gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16", "common/gemmlowp.cl"}, + {"gemmlowp_output_stage_quantize_down_float", "common/gemmlowp.cl"}, + {"generate_proposals_compute_all_anchors", "common/generate_proposals.cl"}, + {"generate_proposals_compute_all_anchors_quantized", "common/generate_proposals_quantized.cl"}, + {"instance_normalization", "common/instance_normalization.cl"}, + {"compute_mean_var", "common/instance_normalization.cl"}, + {"l2_normalize_x", "common/l2_normalize.cl"}, + {"l2_normalize_y", "common/l2_normalize.cl"}, + {"l2_normalize_z", "common/l2_normalize.cl"}, + {"mat_mul_native_mmul_nt_nt", "common/mat_mul_mmul.cl"}, + {"mat_mul_native_mmul_t_nt", "common/mat_mul_mmul.cl"}, + {"mat_mul_native_mmul_nt_t", "common/mat_mul_mmul.cl"}, + {"mat_mul_native_mmul_t_t", "common/mat_mul_mmul.cl"}, + {"mat_mul_native_nt_nt", "common/mat_mul.cl"}, + {"mat_mul_native_nt_t", "common/mat_mul.cl"}, + {"mat_mul_native_t_nt", "common/mat_mul.cl"}, + {"mat_mul_native_t_t", "common/mat_mul.cl"}, + {"mat_mul_native_quantized_nt_nt", "common/mat_mul_quantized.cl"}, + {"mat_mul_native_quantized_nt_t", "common/mat_mul_quantized.cl"}, + {"mat_mul_native_quantized_t_nt", "common/mat_mul_quantized.cl"}, + {"mat_mul_native_quantized_t_t", "common/mat_mul_quantized.cl"}, + {"mat_mul_native_quantized_mmul_nt_nt", "common/mat_mul_quantized_mmul.cl"}, + {"mat_mul_native_quantized_mmul_nt_t", "common/mat_mul_quantized_mmul.cl"}, + {"mat_mul_native_quantized_mmul_t_nt", "common/mat_mul_quantized_mmul.cl"}, + {"mat_mul_native_quantized_mmul_t_t", "common/mat_mul_quantized_mmul.cl"}, + {"max_unpooling_layer_2", "common/unpooling_layer.cl"}, + {"mean_stddev_normalization", "common/mean_stddev_normalization.cl"}, + {"memset", "common/memset.cl"}, + {"minmax_layer", "common/minmax_layer.cl"}, + {"non_max_suppression", "common/nonmax.cl"}, + {"pad_layer_constant", "common/pad_layer.cl"}, + {"pad_layer_symmetric_reflect", "common/pad_layer.cl"}, + {"permute", "common/permute.cl"}, + {"pixelwise_mul_complex", "common/pixelwise_mul_float.cl"}, + {"pixelwise_mul_float", "common/pixelwise_mul_float.cl"}, + {"pixelwise_mul_int", "common/pixelwise_mul_int.cl"}, + {"pixelwise_mul_quantized", "common/pixelwise_mul_int.cl"}, + {"qlstm_layer_normalization", "common/qlstm_layer_normalization.cl"}, + {"quantization_layer", "common/quantization_layer.cl"}, + {"range", "common/range.cl"}, + {"range_quantized", 
"common/range.cl"}, + {"reduction_operation_x", "common/reduction_operation.cl"}, + {"reduction_operation_non_parallel_x", "common/reduction_operation.cl"}, + {"reduction_operation_y", "common/reduction_operation.cl"}, + {"reduction_operation_z", "common/reduction_operation.cl"}, + {"reduction_operation_w", "common/reduction_operation.cl"}, + {"reshape_layer", "common/reshape_layer.cl"}, + {"reshape_to_columns", "common/convolution_layer.cl"}, + {"reverse", "common/reverse.cl"}, + {"roi_align_layer", "common/roi_align_layer.cl"}, + {"roi_align_layer_quantized", "common/roi_align_layer_quantized.cl"}, + {"roi_pooling_layer", "common/roi_pooling_layer.cl"}, + {"select_same_rank", "common/select.cl"}, + {"select_different_rank_2", "common/select.cl"}, + {"select_different_rank_n", "common/select.cl"}, + {"softmax_x", "common/softmax_layer.cl"}, + {"softmax_non_x", "common/softmax_layer.cl"}, + {"stack_layer", "common/stack_layer.cl"}, + {"strided_slice", "common/slice_ops.cl"}, + {"tile", "common/tile.cl"}, + {"transpose", "common/transpose.cl"}, #ifdef ENABLE_NCHW_KERNELS - { "batch_to_space_nchw", "nchw/batch_to_space.cl" }, - { "batch_to_space_static_nchw", "nchw/batch_to_space.cl" }, - { "batchnormalization_layer_nchw", "nchw/batchnormalization_layer.cl" }, - { "channel_shuffle_nchw", "nchw/channel_shuffle.cl" }, - { "depth_to_space_nchw", "nchw/depth_to_space.cl" }, - { "dequantization_layer_per_channel_nchw", "nchw/dequantization_layer.cl" }, - { "direct_convolution1x1", "nchw/direct_convolution1x1.cl" }, - { "direct_convolution_nchw", "nchw/direct_convolution.cl" }, + {"batch_to_space_nchw", "nchw/batch_to_space.cl"}, + {"batch_to_space_static_nchw", "nchw/batch_to_space.cl"}, + {"batchnormalization_layer_nchw", "nchw/batchnormalization_layer.cl"}, + {"channel_shuffle_nchw", "nchw/channel_shuffle.cl"}, + {"depth_to_space_nchw", "nchw/depth_to_space.cl"}, + {"dequantization_layer_per_channel_nchw", "nchw/dequantization_layer.cl"}, + {"direct_convolution1x1", "nchw/direct_convolution1x1.cl"}, + {"direct_convolution_nchw", "nchw/direct_convolution.cl"}, - { "im2col1x1_stridex1_nchw", "nchw/im2col.cl" }, - { "im2col3x3_nchw", "nchw/im2col.cl" }, - { "im2col5x5_nchw", "nchw/im2col.cl" }, - { "im2col11x11_padx0_pady0_nchw", "nchw/im2col.cl" }, - { "im2col_generic_nchw", "nchw/im2col.cl" }, - { "im2col_generic_padx0_pady0_nchw", "nchw/im2col.cl" }, - { "normalization_layer_cross_map_nchw", "nchw/normalization_layer.cl" }, - { "normalization_layer_in_map_nchw", "nchw/normalization_layer.cl" }, - { "normalize_planar_yuv_layer_nchw", "nchw/normalize_planar_yuv_layer.cl" }, - { "normalize_planar_yuv_layer_q8_nchw", "nchw/normalize_planar_yuv_layer_quantized.cl" }, - { "pooling_layer_MxN_nchw", "nchw/pooling_layer.cl" }, - { "pooling_layer_2_nchw_indices", "nchw/pooling_layer.cl" }, - { "prior_box_layer_nchw", "nchw/prior_box_layer.cl" }, - { "reorg_layer_nchw", "nchw/reorg_layer.cl" }, - { "scale_nearest_neighbour_nchw", "nchw/scale.cl" }, - { "scale_bilinear_nchw", "nchw/scale.cl" }, - { "space_to_batch_nchw", "nchw/space_to_batch.cl" }, - { "space_to_batch_static_nchw", "nchw/space_to_batch.cl" }, - { "space_to_depth_nchw", "nchw/space_to_depth.cl" }, - { "upsample_layer_nchw", "nchw/upsample_layer.cl" }, - { "winograd_filter_transform_2x2_3x3_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_2x1_3x1_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x2_1x3_nchw", "nchw/winograd_filter_transform.cl" }, - { 
"winograd_filter_transform_4x4_3x3_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x1_3x1_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x4_1x3_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x4_5x5_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x1_5x1_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x4_1x5_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_input_transform_2x2_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_2x2_3x3_stepz2_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_2x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_2x1_3x1_stepz2_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_1x2_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_1x2_1x3_stepz2_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_4x4_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_4x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_1x4_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_4x4_5x5_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_4x1_5x1_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_1x4_1x5_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_output_transform_2x2_3x3_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_2x1_3x1_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_1x2_1x3_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_4x4_3x3_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_4x1_3x1_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_1x4_1x3_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_4x4_5x5_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_4x1_5x1_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_1x4_1x5_nchw", "nchw/winograd_output_transform.cl" }, + {"im2col1x1_stridex1_nchw", "nchw/im2col.cl"}, + {"im2col3x3_nchw", "nchw/im2col.cl"}, + {"im2col5x5_nchw", "nchw/im2col.cl"}, + {"im2col11x11_padx0_pady0_nchw", "nchw/im2col.cl"}, + {"im2col_generic_nchw", "nchw/im2col.cl"}, + {"im2col_generic_padx0_pady0_nchw", "nchw/im2col.cl"}, + {"normalization_layer_cross_map_nchw", "nchw/normalization_layer.cl"}, + {"normalization_layer_in_map_nchw", "nchw/normalization_layer.cl"}, + {"normalize_planar_yuv_layer_nchw", "nchw/normalize_planar_yuv_layer.cl"}, + {"normalize_planar_yuv_layer_q8_nchw", "nchw/normalize_planar_yuv_layer_quantized.cl"}, + {"pooling_layer_MxN_nchw", "nchw/pooling_layer.cl"}, + {"pooling_layer_2_nchw_indices", "nchw/pooling_layer.cl"}, + {"prior_box_layer_nchw", "nchw/prior_box_layer.cl"}, + {"reorg_layer_nchw", "nchw/reorg_layer.cl"}, + {"scale_nearest_neighbour_nchw", "nchw/scale.cl"}, + {"scale_bilinear_nchw", "nchw/scale.cl"}, + {"space_to_batch_nchw", "nchw/space_to_batch.cl"}, + {"space_to_batch_static_nchw", "nchw/space_to_batch.cl"}, + {"space_to_depth_nchw", "nchw/space_to_depth.cl"}, + {"upsample_layer_nchw", "nchw/upsample_layer.cl"}, + {"winograd_filter_transform_2x2_3x3_nchw", 
"nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_2x1_3x1_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_1x2_1x3_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_4x4_3x3_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_4x1_3x1_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_1x4_1x3_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_4x4_5x5_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_4x1_5x1_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_1x4_1x5_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_input_transform_2x2_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_2x2_3x3_stepz2_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_2x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_2x1_3x1_stepz2_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_1x2_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_1x2_1x3_stepz2_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_4x4_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_4x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_1x4_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_4x4_5x5_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_4x1_5x1_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_1x4_1x5_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_output_transform_2x2_3x3_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_2x1_3x1_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_1x2_1x3_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_4x4_3x3_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_4x1_3x1_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_1x4_1x3_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_4x4_5x5_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_4x1_5x1_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_1x4_1x5_nchw", "nchw/winograd_output_transform.cl"}, #endif /* ENABLE_NCHW_KERNELS */ #ifdef ENABLE_NHWC_KERNELS - { "batch_to_space_nhwc", "nhwc/batch_to_space.cl" }, - { "batch_to_space_static_nhwc", "nhwc/batch_to_space.cl" }, - { "batchnormalization_layer_nhwc", "nhwc/batchnormalization_layer.cl" }, - { "channel_shuffle_nhwc", "nhwc/channel_shuffle.cl" }, - { "depth_to_space_nhwc", "nhwc/depth_to_space.cl" }, - { "dequantization_layer_per_channel_nhwc", "nhwc/dequantization_layer.cl" }, - { "dwc_native_fp_nhwc", "nhwc/dwc_native_fp_nhwc.cl" }, - { "dwc_native_quantized_nhwc", "nhwc/dwc_native_quantized_nhwc.cl" }, - { "direct_convolution_nhwc", "nhwc/direct_convolution.cl" }, - { "direct_convolution3d_ndhwc", "nhwc/direct_convolution3d.cl" }, - { "im2col3x3_nhwc", "nhwc/im2col.cl" }, - { "im2col9x9_nhwc", "nhwc/im2col.cl" }, - { "im2col_generic_nhwc", "nhwc/im2col.cl" }, - { "indirect_convolution_nhwc", "nhwc/indirect_convolution.cl" }, - { "indirect_convolution_address_precalculation", "nhwc/indirect_convolution.cl" }, - { "normalization_layer_cross_map_nhwc", 
"nhwc/normalization_layer.cl" }, - { "normalization_layer_in_map_nhwc", "nhwc/normalization_layer.cl" }, - { "normalize_planar_yuv_layer_nhwc", "nhwc/normalize_planar_yuv_layer.cl" }, - { "normalize_planar_yuv_layer_q8_nhwc", "nhwc/normalize_planar_yuv_layer_quantized.cl" }, - { "pooling_layer_MxN_nhwc", "nhwc/pooling_layer.cl" }, - { "pooling_layer_2x2_nhwc", "nhwc/pooling_layer.cl" }, - { "pooling_layer_MxN_quantized_nhwc", "nhwc/pooling_layer_quantized.cl" }, - { "pooling_3d_layer_MxN_ndhwc", "nhwc/pooling_3d_layer.cl" }, - { "pooling_3d_layer_MxN_ndhwc_quantized", "nhwc/pooling_3d_layer_quantized.cl" }, - { "reorg_layer_nhwc", "nhwc/reorg_layer.cl" }, - { "scale_nearest_neighbour_nhwc", "nhwc/scale.cl" }, - { "scale_bilinear_nhwc", "nhwc/scale.cl" }, - { "space_to_batch_nhwc", "nhwc/space_to_batch.cl" }, - { "space_to_batch_static_nhwc", "nhwc/space_to_batch.cl" }, - { "space_to_depth_nhwc", "nhwc/space_to_depth.cl" }, - { "transposed_convolution_nhwc", "nhwc/transposed_convolution.cl" }, - { "upsample_layer_nhwc", "nhwc/upsample_layer.cl" }, - { "winograd_filter_transform_4x1_3x1_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x4_1x3_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x4_3x3_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x4_5x5_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x1_5x1_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x4_1x5_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_2x2_7x7_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_2x1_7x1_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x2_1x7_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_input_transform_4x1_3x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_1x4_1x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_4x4_3x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_4x4_5x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_4x1_5x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_1x4_1x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_2x2_7x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_2x1_7x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_1x2_1x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_output_transform_4x1_3x1_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_1x4_1x3_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_4x4_3x3_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_4x4_5x5_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_4x1_5x1_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_1x4_1x5_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_2x2_7x7_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_2x1_7x1_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_1x2_1x7_nhwc", "nhwc/winograd_output_transform.cl" }, + {"batch_to_space_nhwc", "nhwc/batch_to_space.cl"}, + {"batch_to_space_static_nhwc", "nhwc/batch_to_space.cl"}, + {"batchnormalization_layer_nhwc", 
"nhwc/batchnormalization_layer.cl"}, + {"channel_shuffle_nhwc", "nhwc/channel_shuffle.cl"}, + {"depth_to_space_nhwc", "nhwc/depth_to_space.cl"}, + {"dequantization_layer_per_channel_nhwc", "nhwc/dequantization_layer.cl"}, + {"dwc_native_fp_nhwc", "nhwc/dwc_native_fp_nhwc.cl"}, + {"dwc_native_quantized_nhwc", "nhwc/dwc_native_quantized_nhwc.cl"}, + {"direct_convolution_nhwc", "nhwc/direct_convolution.cl"}, + {"direct_convolution3d_ndhwc", "nhwc/direct_convolution3d.cl"}, + {"im2col3x3_nhwc", "nhwc/im2col.cl"}, + {"im2col9x9_nhwc", "nhwc/im2col.cl"}, + {"im2col_generic_nhwc", "nhwc/im2col.cl"}, + {"indirect_convolution_nhwc", "nhwc/indirect_convolution.cl"}, + {"indirect_convolution_address_precalculation", "nhwc/indirect_convolution.cl"}, + {"normalization_layer_cross_map_nhwc", "nhwc/normalization_layer.cl"}, + {"normalization_layer_in_map_nhwc", "nhwc/normalization_layer.cl"}, + {"normalize_planar_yuv_layer_nhwc", "nhwc/normalize_planar_yuv_layer.cl"}, + {"normalize_planar_yuv_layer_q8_nhwc", "nhwc/normalize_planar_yuv_layer_quantized.cl"}, + {"pooling_layer_MxN_nhwc", "nhwc/pooling_layer.cl"}, + {"pooling_layer_2x2_nhwc", "nhwc/pooling_layer.cl"}, + {"pooling_layer_MxN_quantized_nhwc", "nhwc/pooling_layer_quantized.cl"}, + {"pooling_3d_layer_MxN_ndhwc", "nhwc/pooling_3d_layer.cl"}, + {"pooling_3d_layer_MxN_ndhwc_quantized", "nhwc/pooling_3d_layer_quantized.cl"}, + {"reorg_layer_nhwc", "nhwc/reorg_layer.cl"}, + {"scale_nearest_neighbour_nhwc", "nhwc/scale.cl"}, + {"scale_bilinear_nhwc", "nhwc/scale.cl"}, + {"space_to_batch_nhwc", "nhwc/space_to_batch.cl"}, + {"space_to_batch_static_nhwc", "nhwc/space_to_batch.cl"}, + {"space_to_depth_nhwc", "nhwc/space_to_depth.cl"}, + {"transposed_convolution_nhwc", "nhwc/transposed_convolution.cl"}, + {"upsample_layer_nhwc", "nhwc/upsample_layer.cl"}, + {"winograd_filter_transform_4x1_3x1_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_1x4_1x3_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_4x4_3x3_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_4x4_5x5_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_4x1_5x1_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_1x4_1x5_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_2x2_7x7_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_2x1_7x1_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_1x2_1x7_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_input_transform_4x1_3x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_input_transform_1x4_1x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_input_transform_4x4_3x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_input_transform_4x4_5x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_input_transform_4x1_5x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_input_transform_1x4_1x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_input_transform_2x2_7x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_input_transform_2x1_7x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_input_transform_1x2_1x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_output_transform_4x1_3x1_nhwc", "nhwc/winograd_output_transform.cl"}, + {"winograd_output_transform_1x4_1x3_nhwc", "nhwc/winograd_output_transform.cl"}, + 
{"winograd_output_transform_4x4_3x3_nhwc", "nhwc/winograd_output_transform.cl"}, + {"winograd_output_transform_4x4_5x5_nhwc", "nhwc/winograd_output_transform.cl"}, + {"winograd_output_transform_4x1_5x1_nhwc", "nhwc/winograd_output_transform.cl"}, + {"winograd_output_transform_1x4_1x5_nhwc", "nhwc/winograd_output_transform.cl"}, + {"winograd_output_transform_2x2_7x7_nhwc", "nhwc/winograd_output_transform.cl"}, + {"winograd_output_transform_2x1_7x1_nhwc", "nhwc/winograd_output_transform.cl"}, + {"winograd_output_transform_1x2_1x7_nhwc", "nhwc/winograd_output_transform.cl"}, #endif /* ENABLE_NHWC_KERNELS */ }; -const std::map ClKernelLibrary::_program_source_map = -{ +const std::map ClKernelLibrary::_program_source_map = { #ifdef EMBEDDED_KERNELS { "activation_float_helpers.h", @@ -621,26 +601,6 @@ const std::map ClKernelLibrary::_program_source_map = { "common/gemm_utils.cl", #include "./cl_kernels/common/gemm_utils.clembed" - }, - { - "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h", -#include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.hembed" - }, - { - "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h", -#include "./cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.hembed" - }, - { - "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl", -#include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.clembed" - }, - { - "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl", -#include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.clembed" - }, - { - "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl", -#include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.clembed" }, { "common/gemmlowp.cl", @@ -769,10 +729,6 @@ const std::map ClKernelLibrary::_program_source_map = { "common/softmax_layer.cl", #include "./cl_kernels/common/softmax_layer.clembed" - }, - { - "common/softmax_layer_quantized.cl", -#include "./cl_kernels/common/softmax_layer_quantized.clembed" }, { "common/slice_ops.cl", @@ -810,6 +766,10 @@ const std::map ClKernelLibrary::_program_source_map = "common/mat_mul_quantized.cl", #include "./cl_kernels/common/mat_mul_quantized.clembed" }, + { + "common/mat_mul_quantized_mmul.cl", +#include "./cl_kernels/common/mat_mul_quantized_mmul.clembed" + }, #ifdef ENABLE_NCHW_KERNELS { "nchw/batch_to_space.cl", @@ -1017,7 +977,7 @@ std::string ClKernelLibrary::program_name(const std::string &kernel_name) const // Find which program contains the kernel auto kernel_program_it = _kernel_program_map.find(kernel_name); - if(_kernel_program_map.end() == kernel_program_it) + if (_kernel_program_map.end() == kernel_program_it) { ARM_COMPUTE_ERROR_VAR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str()); } @@ -1043,14 +1003,14 @@ ClKernelLibrary::ClProgramInfo ClKernelLibrary::program(const std::string &progr #ifdef EMBEDDED_KERNELS #ifdef ARM_COMPUTE_COMPRESSED_KERNELS const auto inflatted_program_source_it = _decompressed_source_map.find(program_name); - if(inflatted_program_source_it != _decompressed_source_map.end()) + if (inflatted_program_source_it != _decompressed_source_map.end()) { - return ClProgramInfo{ inflatted_program_source_it->second, false }; + return ClProgramInfo{inflatted_program_source_it->second, 
false}; } #endif /* ARM_COMPUTE_COMPRESSED_KERNELS */ const auto program_source_it = _program_source_map.find(program_name); - if(program_source_it == _program_source_map.end()) + if (program_source_it == _program_source_map.end()) { ARM_COMPUTE_ERROR_VAR("Embedded program for %s does not exist.", program_name.c_str()); } @@ -1063,7 +1023,7 @@ ClKernelLibrary::ClProgramInfo ClKernelLibrary::program(const std::string &progr program_source = std::move(decompressed_program_source); #endif /* ARM_COMPUTE_COMPRESSED_KERNELS */ - return ClProgramInfo{ program_source, false }; + return ClProgramInfo{program_source, false}; #else /* EMBEDDED_KERNELS */ // Check for binary std::string source_name = _kernel_path + program_name; @@ -1071,12 +1031,12 @@ ClKernelLibrary::ClProgramInfo ClKernelLibrary::program(const std::string &progr std::string program_source{}; bool is_binary = false; - if(std::ifstream(binary_name).is_open()) + if (std::ifstream(binary_name).is_open()) { program_source = read_file(binary_name, true); is_binary = true; } - else if(std::ifstream(source_name).is_open()) + else if (std::ifstream(source_name).is_open()) { program_source = read_file(source_name, false); } @@ -1085,7 +1045,7 @@ ClKernelLibrary::ClProgramInfo ClKernelLibrary::program(const std::string &progr ARM_COMPUTE_ERROR_VAR("Kernel file %s does not exist.", source_name.c_str()); } - return ClProgramInfo{ program_source, is_binary }; + return ClProgramInfo{program_source, is_binary}; #endif /* EMBEDDED_KERNELS */ } } // namespace opencl diff --git a/src/gpu/cl/ClKernelLibrary.h b/src/gpu/cl/ClKernelLibrary.h index 42bec95032484886f2a144872fecfd8c577a2ab4..cd1d6891993e3d31c418e0bf134fcfa52da44079 100644 --- a/src/gpu/cl/ClKernelLibrary.h +++ b/src/gpu/cl/ClKernelLibrary.h @@ -52,8 +52,8 @@ public: /** Structure to encapsulte program related information */ struct ClProgramInfo { - std::string program{}; /**< Program raw string */ - bool is_binary{ false }; /**< Flag that indicates if is in binary format */ + std::string program{}; /**< Program raw string */ + bool is_binary{false}; /**< Flag that indicates if is in binary format */ }; public: @@ -84,10 +84,12 @@ public: std::string program_name(const std::string &kernel_name) const; private: - std::string _kernel_path{}; /**< Path to the kernels folder. */ - mutable std::map _decompressed_source_map{}; /**< Map holding the decompressed files when compression is used */ - static const std::map _kernel_program_map; /**< Map that associates kernel names with programs. */ - static const std::map _program_source_map; /**< Contains sources for all programs. + std::string _kernel_path{}; /**< Path to the kernels folder. */ + mutable std::map + _decompressed_source_map{}; /**< Map holding the decompressed files when compression is used */ + static const std::map + _kernel_program_map; /**< Map that associates kernel names with programs. */ + static const std::map _program_source_map; /**< Contains sources for all programs. Used for compile-time kernel inclusion. 
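The lookup path for these tables is straightforward: program_name() resolves a kernel name through _kernel_program_map and errors out for unknown kernels, while program() returns the matching source (embedded, decompressed, or read from disk) wrapped in a ClProgramInfo. A minimal standalone sketch of that pattern follows; the local ProgramInfo, kernel_program_map and program_name names, and the exception used for error handling, are simplified stand-ins rather than the ClKernelLibrary API itself, and the two sample entries are copied from the table above.

#include <iostream>
#include <map>
#include <stdexcept>
#include <string>

// Simplified stand-ins for ClProgramInfo and _kernel_program_map.
struct ProgramInfo
{
    std::string program;   // program source (or binary) contents
    bool        is_binary; // true when loaded from a prebuilt binary
};

static const std::map<std::string, std::string> kernel_program_map = {
    {"mat_mul_native_quantized_mmul_nt_nt", "common/mat_mul_quantized_mmul.cl"},
    {"winograd_output_transform_4x4_3x3_nhwc", "nhwc/winograd_output_transform.cl"},
};

std::string program_name(const std::string &kernel_name)
{
    const auto it = kernel_program_map.find(kernel_name);
    if (kernel_program_map.end() == it)
    {
        // The real code reports this via ARM_COMPUTE_ERROR_VAR instead of throwing.
        throw std::runtime_error("Kernel " + kernel_name + " not found");
    }
    return it->second;
}

int main()
{
    // Prints: common/mat_mul_quantized_mmul.cl
    std::cout << program_name("mat_mul_native_quantized_mmul_nt_nt") << "\n";
}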
>*/ }; } // namespace opencl diff --git a/src/gpu/cl/ClQueue.cpp b/src/gpu/cl/ClQueue.cpp index 2123adcf394bf7ae3f8b54ab81b0f91aa6cad5b8..0cb7af5b6142a4fcdd594e30b50cd74e30c9895b 100644 --- a/src/gpu/cl/ClQueue.cpp +++ b/src/gpu/cl/ClQueue.cpp @@ -36,7 +36,7 @@ namespace { CLTunerMode map_tuner_mode(AclTuningMode mode) { - switch(mode) + switch (mode) { case AclRapid: return CLTunerMode::RAPID; @@ -55,7 +55,7 @@ CLTunerMode map_tuner_mode(AclTuningMode mode) std::unique_ptr populate_tuner(const AclQueueOptions *options) { - if(options == nullptr || options->mode == AclTuningModeNone) + if (options == nullptr || options->mode == AclTuningModeNone) { return nullptr; } @@ -68,8 +68,7 @@ std::unique_ptr populate_tuner(const AclQueueOptions *options) } } // namespace -ClQueue::ClQueue(IContext *ctx, const AclQueueOptions *options) - : IQueue(ctx), _tuner(nullptr) +ClQueue::ClQueue(IContext *ctx, const AclQueueOptions *options) : IQueue(ctx), _tuner(nullptr) { _tuner = populate_tuner(options); } diff --git a/src/gpu/cl/ClQueue.h b/src/gpu/cl/ClQueue.h index b16a0f4e8386af45c055eaf76ef36036fc2e3c1d..09ffb06cf305b73145c7c569a23e4d0a4e97fdc4 100644 --- a/src/gpu/cl/ClQueue.h +++ b/src/gpu/cl/ClQueue.h @@ -24,10 +24,10 @@ #ifndef SRC_GPU_CLQUEUE_H #define SRC_GPU_CLQUEUE_H -#include "src/common/IQueue.h" - #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/IQueue.h" + #include namespace arm_compute diff --git a/src/gpu/cl/ClTensor.cpp b/src/gpu/cl/ClTensor.cpp index 0df07813e31ba9b23a9c6a6a5905e50124d1ab65..27422a41300b93f41a353597e32ed1caebecefff 100644 --- a/src/gpu/cl/ClTensor.cpp +++ b/src/gpu/cl/ClTensor.cpp @@ -31,8 +31,7 @@ namespace gpu { namespace opencl { -ClTensor::ClTensor(IContext *ctx, const AclTensorDescriptor &desc) - : ITensorV2(ctx), _legacy_tensor() +ClTensor::ClTensor(IContext *ctx, const AclTensorDescriptor &desc) : ITensorV2(ctx), _legacy_tensor() { ARM_COMPUTE_ASSERT((ctx != nullptr) && (ctx->type() == Target::GpuOcl)); _legacy_tensor = std::make_unique(); @@ -43,7 +42,7 @@ void *ClTensor::map() { ARM_COMPUTE_ASSERT(_legacy_tensor.get() != nullptr); - if(_legacy_tensor == nullptr) + if (_legacy_tensor == nullptr) { ARM_COMPUTE_LOG_ERROR_ACL("[ClTensor:map]: Backing tensor does not exist!"); return nullptr; @@ -57,7 +56,7 @@ StatusCode ClTensor::unmap() { ARM_COMPUTE_ASSERT(_legacy_tensor.get() != nullptr); - if(_legacy_tensor == nullptr) + if (_legacy_tensor == nullptr) { ARM_COMPUTE_LOG_ERROR_ACL("[ClTensor:unmap]: Backing tensor does not exist!"); return StatusCode::RuntimeError; diff --git a/src/gpu/cl/ClTensor.h b/src/gpu/cl/ClTensor.h index 99d228c0b8cfc78e65a11ca43c46e58489ef5bd5..70184cd4bd1bb3a0e892142961955f2e53cf77d9 100644 --- a/src/gpu/cl/ClTensor.h +++ b/src/gpu/cl/ClTensor.h @@ -24,10 +24,10 @@ #ifndef SRC_GPU_CLTENSOR_H #define SRC_GPU_CLTENSOR_H -#include "src/common/ITensorV2.h" - #include "arm_compute/runtime/CL/CLTensor.h" +#include "src/common/ITensorV2.h" + namespace arm_compute { namespace gpu @@ -54,7 +54,7 @@ public: void *map() override; StatusCode unmap() override; arm_compute::ITensor *tensor() const override; - StatusCode import(void *handle, ImportMemoryType type) override; + StatusCode import(void *handle, ImportMemoryType type) override; private: std::unique_ptr _legacy_tensor; @@ -63,4 +63,4 @@ private: } // namespace gpu } // namespace arm_compute -#endif /* SRC_GPU_CLTENSOR_H */ \ No newline at end of file +#endif /* SRC_GPU_CLTENSOR_H */ diff --git a/src/gpu/cl/IClKernel.h b/src/gpu/cl/IClKernel.h index 
52ea3c9183245790db9f1b02398ec4a827028186..4f07e9ad68c945af726add2be67cc8ca98197463 100644 --- a/src/gpu/cl/IClKernel.h +++ b/src/gpu/cl/IClKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_ICL_KERNEL_H #include "arm_compute/core/ITensorInfo.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute diff --git a/src/gpu/cl/kernels/ClActivationKernel.cpp b/src/gpu/cl/kernels/ClActivationKernel.cpp index ab1543729f9f1d810a576d73d9ec94ef5a5c079e..a85296f7cd78cdf28b79ea7de0f2db5af9851365 100644 --- a/src/gpu/cl/kernels/ClActivationKernel.cpp +++ b/src/gpu/cl/kernels/ClActivationKernel.cpp @@ -28,14 +28,14 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/ActivationFunctionUtils.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" - #include "support/StringSupport.h" #include @@ -51,36 +51,47 @@ namespace Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM16, DataType::F16, DataType::F32); - static std::set quantized_supported_activations = - { - ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LOGISTIC, - ActivationLayerInfo::ActivationFunction::TANH, - ActivationLayerInfo::ActivationFunction::HARD_SWISH, + static std::set quantized_supported_activations = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, ActivationLayerInfo::ActivationFunction::LOGISTIC, + ActivationLayerInfo::ActivationFunction::TANH, ActivationLayerInfo::ActivationFunction::HARD_SWISH, ActivationLayerInfo::ActivationFunction::LEAKY_RELU, }; - const DataType data_type = src->data_type(); - const QuantizationInfo &oq_info = (dst != nullptr) ? 
dst->quantization_info() : src->quantization_info(); - const ActivationLayerInfo::ActivationFunction f_act = act_info.activation(); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(data_type) && (quantized_supported_activations.count(f_act) == 0), - "For Quantized data type only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported"); - - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 128))); - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, 0))); - - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 32768.f, 0))); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 32768.f, 0))); - - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 0))); - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, -128))); + const DataType data_type = src->data_type(); + const QuantizationInfo &oq_info = (dst != nullptr) ? dst->quantization_info() : src->quantization_info(); + const ActivationLayerInfo::ActivationFunction f_act = act_info.activation(); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(data_type) && + (quantized_supported_activations.count(f_act) == 0), + "For Quantized data type only hard swish, leaky relu, tanh, logistic, relu and " + "lower/upper bounded relu are supported"); + + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && + (f_act == ActivationLayerInfo::ActivationFunction::TANH) && + (oq_info != QuantizationInfo(1.f / 128.f, 128))); + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && + (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && + (oq_info != QuantizationInfo(1.f / 256.f, 0))); + + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && + (f_act == ActivationLayerInfo::ActivationFunction::TANH) && + (oq_info != QuantizationInfo(1.f / 32768.f, 0))); + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && + (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && + (oq_info != QuantizationInfo(1.f / 32768.f, 0))); + + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && + (f_act == ActivationLayerInfo::ActivationFunction::TANH) && + (oq_info != QuantizationInfo(1.f / 128.f, 0))); + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && + (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && + (oq_info != QuantizationInfo(1.f / 256.f, -128))); // Checks performed when destination is configured - if((dst != nullptr) && (dst->total_size() != 0)) + if ((dst != nullptr) && (dst->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); @@ -95,15 +106,18 @@ ClActivationKernel::ClActivationKernel() _type = CLKernelType::ELEMENTWISE; } -void ClActivationKernel::configure(const ClCompileContext &compile_context, 
ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo act_info) +void ClActivationKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + ActivationLayerInfo act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); _run_in_place = (dst == nullptr) || (dst == src); - if(dst != nullptr) + if (dst != nullptr) { // Destination auto inizialitation if not yet initialized auto_init_if_empty(*dst, *src->clone()); @@ -119,11 +133,10 @@ void ClActivationKernel::configure(const ClCompileContext &compile_context, ITen const ActivationLayerInfo::ActivationFunction f_act = act_info.activation(); const bool is_quantized = is_data_type_quantized(dt); - const bool perform_activation_in_float = - (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - || (f_act == ActivationLayerInfo::ActivationFunction::TANH) - || (f_act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - || (f_act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU); + const bool perform_activation_in_float = (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) || + (f_act == ActivationLayerInfo::ActivationFunction::TANH) || + (f_act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) || + (f_act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU); // Set build options CLBuildOptions build_opts; @@ -132,22 +145,23 @@ void ClActivationKernel::configure(const ClCompileContext &compile_context, ITen build_opts.add_option("-DACT=" + lower_string(string_from_activation_func(f_act))); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); std::string kernel_name = std::string("activation_layer"); // Set quantization info build options - if(is_quantized) + if (is_quantized) { const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); - if(!perform_activation_in_float) + if (!perform_activation_in_float) { int a_const_int = 0; int b_const_int = 0; // Create quantized version of constants a, b if needed - switch(dt) + switch (dt) { case DataType::QASYMM8: { @@ -180,22 +194,25 @@ void ClActivationKernel::configure(const ClCompileContext &compile_context, ITen } // Quantized value of 0 corresponds to the offset o1 - build_opts.add_option(("-DCONST_0=" + (is_data_type_quantized_asymmetric(dt) ? support::cpp11::to_string(iq_info.offset) : "0"))); + build_opts.add_option( + ("-DCONST_0=" + (is_data_type_quantized_asymmetric(dt) ? support::cpp11::to_string(iq_info.offset) : "0"))); build_opts.add_option(("-DS1_VAL=" + float_to_string_with_full_precision(iq_info.scale))); - build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DO1_VAL=" + support::cpp11::to_string(iq_info.offset)); + build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), + "-DO1_VAL=" + support::cpp11::to_string(iq_info.offset)); // Set correct kernel name kernel_name += perform_activation_in_float ? 
std::string("_quant_f32") : std::string("_quant"); // Set scale and offset of the source and destination if they have different quantization info - if(dst != nullptr) + if (dst != nullptr) { const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); - if(iq_info != oq_info) + if (iq_info != oq_info) { build_opts.add_option(("-DS2_VAL=" + float_to_string_with_full_precision(oq_info.scale))); - build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DO2_VAL=" + support::cpp11::to_string(oq_info.offset)); + build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), + "-DO2_VAL=" + support::cpp11::to_string(oq_info.offset)); } } } @@ -235,8 +252,9 @@ void ClActivationKernel::run_op(ITensorPack &tensors, const Window &window, ::cl ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON(_run_in_place && src != dst); Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); @@ -246,13 +264,12 @@ void ClActivationKernel::run_op(ITensorPack &tensors, const Window &window, ::cl { unsigned int idx = 0; add_3D_tensor_argument(idx, src, slice); - if(!_run_in_place) + if (!_run_in_place) { add_3D_tensor_argument(idx, dst, slice); } enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClActivationKernel.h b/src/gpu/cl/kernels/ClActivationKernel.h index 82e35b61044fc848a2b521b3ea8e3c3efe912d28..ab7607bb8224473a69384c1029d81b92120e0750 100644 --- a/src/gpu/cl/kernels/ClActivationKernel.h +++ b/src/gpu/cl/kernels/ClActivationKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_ACTIVATION_KERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -51,7 +52,10 @@ public: * @param[out] dst Destination tensor info. Data type supported: same as @p src * @param[in] act_info Activation layer information. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo act_info); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + ActivationLayerInfo act_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClActivationKernel::configure() @@ -64,7 +68,7 @@ public: void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; private: - bool _run_in_place{ false }; + bool _run_in_place{false}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp b/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp index 3d8ecf1fcc3a3ba362eacd8a44fff5d4250d1334..a853f6bc1b481b38545599ee1b559339f7a1adb5 100644 --- a/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp +++ b/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" - #include "support/StringSupport.h" namespace arm_compute @@ -66,12 +66,15 @@ ClBatchConcatenateKernel::ClBatchConcatenateKernel() _type = CLKernelType::ELEMENTWISE; } -void ClBatchConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst) +void ClBatchConcatenateKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + unsigned int batch_offset, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, batch_offset, dst)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); _batch_offset = batch_offset; @@ -81,8 +84,9 @@ void ClBatchConcatenateKernel::configure(const CLCompileContext &compile_context CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); - if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + if (is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) { const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); @@ -136,8 +140,9 @@ void ClBatchConcatenateKernel::run_op(ITensorPack &tensors, const Window &window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); Window slice = window.first_slice_window_3D(); @@ -152,9 +157,8 
@@ void ClBatchConcatenateKernel::run_op(ITensorPack &tensors, const Window &window add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } -} // namespace opencl } // namespace kernels +} // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClBatchConcatenateKernel.h b/src/gpu/cl/kernels/ClBatchConcatenateKernel.h index f6b7c0ed094544714f629357c344547968c784e4..549576b6289ce0b975e9cd65a69f87226f2713db 100644 --- a/src/gpu/cl/kernels/ClBatchConcatenateKernel.h +++ b/src/gpu/cl/kernels/ClBatchConcatenateKernel.h @@ -53,7 +53,8 @@ public: * @note: The gaps between the two lowest dimensions of src and dst need to be divisible by 2. * */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst); + void + configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClBatchConcatenateKernel::configure() @@ -66,7 +67,7 @@ public: void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; private: - unsigned int _batch_offset{ 0 }; + unsigned int _batch_offset{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClCastKernel.cpp b/src/gpu/cl/kernels/ClCastKernel.cpp index f621ad62d79155e828224370e47c3a2afc1aaf88..9ca35634f40ae72ce92b3be4bfcee301aa586ff7 100644 --- a/src/gpu/cl/kernels/ClCastKernel.cpp +++ b/src/gpu/cl/kernels/ClCastKernel.cpp @@ -32,10 +32,10 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -52,20 +52,17 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Conver ARM_COMPUTE_UNUSED(policy); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON(src == dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, - 1, - DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S16, - DataType::U16, DataType::U32, DataType::S32, DataType::F16, - DataType::F32, DataType::S64, DataType::U64); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, - 1, - DataType::U8, DataType::S8, DataType::QASYMM8, DataType::S16, - DataType::U16, DataType::U32, DataType::S32, DataType::F16, - DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::U8, DataType::S8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, + DataType::S16, DataType::U16, DataType::U32, DataType::S32, + DataType::F16, DataType::F32, DataType::S64, DataType::U64); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::S8, DataType::QASYMM8, + DataType::S16, DataType::U16, DataType::U32, DataType::S32, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == dst->data_type(), "src and dst data types must be different"); // Validate in case of configured dst - if(dst->total_size() > 0) + if (dst->total_size() > 0) { 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); } @@ -79,7 +76,10 @@ ClCastKernel::ClCastKernel() _type = CLKernelType::ELEMENTWISE; } -void ClCastKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy) +void ClCastKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + ConvertPolicy policy) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -88,7 +88,7 @@ void ClCastKernel::configure(const CLCompileContext &compile_context, const ITen ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, policy)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); // Get data sizes const size_t src_size = data_size_from_type(src->data_type()); @@ -100,12 +100,14 @@ void ClCastKernel::configure(const CLCompileContext &compile_context, const ITen // Set build options CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst->data_type())); // Conversions from float always SATURATE as out-of-bounds conversion from float->integer is implementation defined build_opts.add_option_if(is_data_type_float(src->data_type()) || policy == ConvertPolicy::SATURATE, "-DSATURATE"); - build_opts.add_option_if(is_data_type_float(src->data_type()) || is_data_type_float(dst->data_type()), "-DIS_DATA_TYPE_FLOAT"); + build_opts.add_option_if(is_data_type_float(src->data_type()) || is_data_type_float(dst->data_type()), + "-DIS_DATA_TYPE_FLOAT"); build_opts.add_option_if(is_data_type_quantized(src->data_type()), "-DIS_DATA_TYPE_QUANTIZED"); // Create kernel @@ -148,8 +150,9 @@ void ClCastKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::Comm ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -162,8 +165,7 @@ void ClCastKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::Comm add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClCastKernel.h b/src/gpu/cl/kernels/ClCastKernel.h index a021b3c78c74bdbe7af95efebd141a35f25bfa50..07b0b6144310c3f52abf5e50622c4f9d724728bc 100644 --- a/src/gpu/cl/kernels/ClCastKernel.h +++ b/src/gpu/cl/kernels/ClCastKernel.h @@ -64,7 +64,8 @@ public: * @param[out] dst The destination tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32. 
* @param[in] policy Conversion policy */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); + void + configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClCastKernel::configure() diff --git a/src/gpu/cl/kernels/ClCol2ImKernel.cpp b/src/gpu/cl/kernels/ClCol2ImKernel.cpp index 3316742912c1e8bcea2eac1d4c980711be859c7c..9972e07f05b8e12ef659f9fb33c623b19da142cb 100644 --- a/src/gpu/cl/kernels/ClCol2ImKernel.cpp +++ b/src/gpu/cl/kernels/ClCol2ImKernel.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -47,29 +48,38 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *dst, + const Size2D &convolved_dims, + unsigned int num_groups) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); // Checks performed when output is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), compute_col2im_shape(*src, convolved_dims, true, num_groups)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), compute_col2im_shape(*src, convolved_dims, true, num_groups)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_layout() != DataLayout::NCHW, "Col2Im output's data layout must always be NCHW"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_layout() != DataLayout::NCHW, + "Col2Im output's data layout must always be NCHW"); } return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups) +std::pair +validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // Output auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_col2im_shape(*src, convolved_dims, true, num_groups)).set_data_layout(DataLayout::NCHW)); + auto_init_if_empty(*dst, src->clone() + ->set_tensor_shape(compute_col2im_shape(*src, convolved_dims, true, num_groups)) + .set_data_layout(DataLayout::NCHW)); constexpr unsigned int num_elems_read_per_iteration = 8; @@ -80,18 +90,22 @@ std::pair validate_and_configure_window(ITensorInfo *src, ITenso AccessWindowHorizontal input_access(src, 0, num_elems_read_per_iteration); bool window_changed = update_window_and_padding(win, input_access); - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace -ClCol2ImKernel::ClCol2ImKernel() - : _convolved_dims() +ClCol2ImKernel::ClCol2ImKernel() : _convolved_dims() { _type = CLKernelType::ELEMENTWISE; } -void ClCol2ImKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups) +void ClCol2ImKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const Size2D &convolved_dims, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -132,11 +146,15 @@ void ClCol2ImKernel::configure(const CLCompileContext &compile_context, ITensorI _config_id += support::cpp11::to_string(dst->dimension(1)); } -Status ClCol2ImKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups) +Status ClCol2ImKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const Size2D &convolved_dims, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, convolved_dims, num_groups)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), convolved_dims, num_groups).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(src->clone().get(), dst->clone().get(), convolved_dims, num_groups).first); return Status{}; } @@ -168,8 +186,7 @@ void ClCol2ImKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm add_3D_tensor_argument(idx, src, slice); add_4D_tensor_argument(idx, dst, slice_out); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice) && collapsed_out.slide_window_slice_4D(slice_out)); + } while (collapsed.slide_window_slice_3D(slice) && collapsed_out.slide_window_slice_4D(slice_out)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClCol2ImKernel.h b/src/gpu/cl/kernels/ClCol2ImKernel.h index e19b7c8e16a13bcaf41d510b6c8dfb79d7e60809..34194aba01b447a032396c40308e3849b607e3c4 100644 --- a/src/gpu/cl/kernels/ClCol2ImKernel.h +++ b/src/gpu/cl/kernels/ClCol2ImKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_COL2IM_KERNEL_H #include "arm_compute/core/Size2D.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -68,14 +69,19 @@ public: * @param[in] convolved_dims Output convolved dimensions. 
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups = 1); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const Size2D &convolved_dims, + unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration * * Similar to ClCol2ImKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups = 1); + static Status + validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups = 1); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp index 716dec1f306ddfe3f379ce4bcefea26660665e01..85d3c3939c3187814aaa95d9109e04954f8d4e8d 100644 --- a/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp +++ b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -45,17 +46,21 @@ ClConvertFullyConnectedWeightsKernel::ClConvertFullyConnectedWeightsKernel() _type = CLKernelType::ELEMENTWISE; } -void ClConvertFullyConnectedWeightsKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, - DataLayout data_layout) +void ClConvertFullyConnectedWeightsKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // Output tensor auto initialisation if not yet initialized auto_init_if_empty(*dst, *src->clone()); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); - ARM_COMPUTE_ERROR_THROW_ON(ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout)); + ARM_COMPUTE_ERROR_THROW_ON( + ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout)); const DataLayout src_data_layout = (data_layout == DataLayout::NCHW) ? 
DataLayout::NHWC : DataLayout::NCHW; @@ -85,8 +90,10 @@ void ClConvertFullyConnectedWeightsKernel::configure(const CLCompileContext &com ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, - DataLayout data_layout) +Status ClConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); @@ -96,7 +103,7 @@ Status ClConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, co ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN); // Checks performed when dst is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); @@ -110,8 +117,9 @@ void ClConvertFullyConnectedWeightsKernel::run_op(ITensorPack &tensors, const Wi ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); unsigned int idx = 0; diff --git a/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h index 16000e82f6a0f98a3404446707718c4c9d0bf2ae..0ddb54561abf82fbdb7091bbca5ff976d8a2269b 100644 --- a/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h +++ b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h @@ -55,14 +55,21 @@ public: * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer). * @param[in] data_layout The data layout the weights have been trained in. 
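// Editorial sketch (the general idea only, not ClConvertFullyConnectedWeightsKernel's
// OpenCL code): converting fully connected weights between layouts re-flattens each
// input index of the original shape (W, H, C). Decode a flat NCHW offset, re-encode
// it as NHWC; the kernel applies the equivalent permutation to the weight rows.
#include <cstddef>

std::size_t nchw_to_nhwc_offset(std::size_t nchw_offset, std::size_t W, std::size_t H, std::size_t C)
{
    const std::size_t w = nchw_offset % W;       // fastest-moving index in NCHW
    const std::size_t h = (nchw_offset / W) % H;
    const std::size_t c = nchw_offset / (W * H); // slowest-moving index in NCHW
    return c + C * (w + W * h);                  // channels become the fastest-moving index in NHWC
}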
*/ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClConvertFullyConnectedWeightsKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClCopyKernel.cpp b/src/gpu/cl/kernels/ClCopyKernel.cpp index 47194488194c38a092e8e8d4c1dfa821e0abb63d..c80ef664f52acce0c7d51bdf502f7fbea9646379 100644 --- a/src/gpu/cl/kernels/ClCopyKernel.cpp +++ b/src/gpu/cl/kernels/ClCopyKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -50,11 +51,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Window ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); // Validate dst if initialized - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - if(dst_window == nullptr) + if (dst_window == nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst->tensor_shape()); } @@ -74,12 +75,15 @@ ClCopyKernel::ClCopyKernel() _type = CLKernelType::ELEMENTWISE; } -void ClCopyKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Window *dst_window) +void ClCopyKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + Window *dst_window) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, dst_window)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); // Create kernel CLBuildOptions build_opts; @@ -93,7 +97,7 @@ void ClCopyKernel::configure(const CLCompileContext &compile_context, const ITen const Window win_config = calculate_max_window(*src, Steps(vec_size_x)); - if(dst_window != nullptr) + if (dst_window != nullptr) { _has_dst_window = true; _dst_window = Window(*dst_window); @@ -101,9 +105,11 @@ void ClCopyKernel::configure(const CLCompileContext &compile_context, const ITen const int vec_size_x_leftover = width_x % vec_size_x; const bool multi_access_x = width_x >= static_cast(vec_size_x); - if(multi_access_x) + if (multi_access_x) { - _dst_window.set(Window::DimX, Window::Dimension(dst_window->x().start(), ceil_to_multiple(dst_window->x().end(), vec_size_x), vec_size_x)); + _dst_window.set(Window::DimX, + Window::Dimension(dst_window->x().start(), + ceil_to_multiple(dst_window->x().end(), vec_size_x), vec_size_x)); } build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftover)); @@ 
-127,7 +133,8 @@ void ClCopyKernel::configure(const CLCompileContext &compile_context, const ITen ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClCopyKernel::validate(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst, Window *dst_window) +Status +ClCopyKernel::validate(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst, Window *dst_window) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, dst_window)); @@ -139,12 +146,13 @@ void ClCopyKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comman ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); Window slice; - if(_has_dst_window) + if (_has_dst_window) { slice = window.first_slice_window_3D(); Window out_slice = _dst_window.first_slice_window_3D(); @@ -154,8 +162,7 @@ void ClCopyKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comman add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, out_slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice) && _dst_window.slide_window_slice_3D(out_slice)); + } while (window.slide_window_slice_3D(slice) && _dst_window.slide_window_slice_3D(out_slice)); } else { @@ -167,8 +174,7 @@ void ClCopyKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comman add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } } // namespace kernels diff --git a/src/gpu/cl/kernels/ClCopyKernel.h b/src/gpu/cl/kernels/ClCopyKernel.h index 63fd806586e3d9c3a3632ce6cd2bc4ed57bc7080..f915bf672d1b070a5ae94e66db7a4ee945c3a9ce 100644 --- a/src/gpu/cl/kernels/ClCopyKernel.h +++ b/src/gpu/cl/kernels/ClCopyKernel.h @@ -47,7 +47,10 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src. * @param[in] dst_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr. 
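// Editorial sketch of the ceil_to_multiple() rounding used when ClCopyKernel writes
// into a sub-window above: the window end is rounded up to a whole number of vector
// accesses and the remainder is handled via -DVEC_SIZE_LEFTOVER.
#include <cstddef>

std::size_t ceil_to_multiple_sketch(std::size_t value, std::size_t multiple)
{
    return ((value + multiple - 1) / multiple) * multiple;
}

// e.g. ceil_to_multiple_sketch(130, 16) == 144, leaving 130 % 16 == 2 leftover elements.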
*/ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Window *dst_window = nullptr); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + Window *dst_window = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClCopyKernel::configure() diff --git a/src/gpu/cl/kernels/ClCropKernel.cpp b/src/gpu/cl/kernels/ClCropKernel.cpp index 87ad6b49d9497262e639439031df519aafa81554..0c503e13fc4759896bdeef04672509cdf6852e4b 100644 --- a/src/gpu/cl/kernels/ClCropKernel.cpp +++ b/src/gpu/cl/kernels/ClCropKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" + #include "src/core/CPP/Validate.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" @@ -46,8 +47,14 @@ ClCropKernel::ClCropKernel() _type = CLKernelType::ELEMENTWISE; } -void ClCropKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, - float extrapolation_value, Window *dst_window) +void ClCropKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst, start, end, batch_index, extrapolation_value, dst_window)); @@ -60,7 +67,7 @@ void ClCropKernel::configure(const CLCompileContext &compile_context, const ITen // Create and update the window (if needed) Window win = calculate_max_window(*dst); - if(dst_window != nullptr) + if (dst_window != nullptr) { ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *dst_window); win = *dst_window; @@ -70,7 +77,7 @@ void ClCropKernel::configure(const CLCompileContext &compile_context, const ITen const bool multi_access_x = dst_width_x >= vec_size_x; const bool remainder_x = dst_width_x % vec_size_x > 0; - if(multi_access_x) + if (multi_access_x) { win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); @@ -81,13 +88,21 @@ void ClCropKernel::configure(const CLCompileContext &compile_context, const ITen CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); - build_opts.add_option_if(multi_access_x && remainder_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max(dst_width_x - vec_size_x, 0))); + build_opts.add_option_if(multi_access_x && remainder_x, + "-DLAST_ACCESSED_X=" + + support::cpp11::to_string(std::max(dst_width_x - vec_size_x, 0))); build_opts.add_option_if(start.x > end.x, "-DWIDTH_FLIPPED="); build_opts.add_option_if(start.y > end.y, "-DHEIGHT_FLIPPED="); _kernel = create_kernel(compile_context, "crop_tensor", build_opts.options()); } -Status ClCropKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window) +Status ClCropKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) { ARM_COMPUTE_UNUSED(extrapolation_value, dst_window); 
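// Editorial sketch of the ClCropKernel build-option arithmetic above (hypothetical
// values): LAST_ACCESSED_X is the x index of the last full vector load, clamped to 0
// for tensors narrower than one vector, and the *_FLIPPED defines are emitted when
// the crop coordinates are given in reverse order.
#include <algorithm>
#include <cstdint>

int32_t last_accessed_x(int32_t dst_width_x, int32_t vec_size_x)
{
    return std::max(dst_width_x - vec_size_x, 0);
}

bool width_flipped(int32_t start_x, int32_t end_x)
{
    return start_x > end_x; // the kernel then reads the cropped region right-to-left
}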
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); @@ -95,14 +110,15 @@ Status ClCropKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, Co ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON(start.x < 0 || start.y < 0 || end.x < 0 || end.y < 0); - ARM_COMPUTE_RETURN_ERROR_ON(start.x >= static_cast(src->dimension(1)) || start.y >= static_cast(src->dimension(2)) - || end.x >= static_cast(src->dimension(1)) || end.y >= static_cast(src->dimension(2))); + ARM_COMPUTE_RETURN_ERROR_ON( + start.x >= static_cast(src->dimension(1)) || start.y >= static_cast(src->dimension(2)) || + end.x >= static_cast(src->dimension(1)) || end.y >= static_cast(src->dimension(2))); ARM_COMPUTE_RETURN_ERROR_ON(batch_index >= src->dimension(3)); - if(dst_window != nullptr) + if (dst_window != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(dst_window->x().step() != 1); } - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(dst, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); @@ -116,12 +132,15 @@ void ClCropKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comman ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); Window in_slice = Window(); in_slice.use_tensor_dimensions(src->info()->tensor_shape()); - in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), ceil_to_multiple(in_slice.x().end(), window.x().step()), window.x().step())); + in_slice.set(Window::DimX, + Window::Dimension(in_slice.x().start(), ceil_to_multiple(in_slice.x().end(), window.x().step()), + window.x().step())); in_slice.set(3, Window::Dimension(_batch_index, _batch_index + 1, 1)); unsigned int idx = 0; diff --git a/src/gpu/cl/kernels/ClCropKernel.h b/src/gpu/cl/kernels/ClCropKernel.h index 2f166e184c95a03b994105dd4066df2f00b00ddf..506262608c07123f94c51d1c6c129bec4b94d8d0 100644 --- a/src/gpu/cl/kernels/ClCropKernel.h +++ b/src/gpu/cl/kernels/ClCropKernel.h @@ -53,16 +53,27 @@ public: * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0. * @param[in] dst_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr. 
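// Editorial sketch of the coordinate checks performed by ClCropKernel::validate()
// above: both crop corners must lie inside the source image independently (flipped
// crops are allowed), and the batch index must address an existing batch.
#include <cstdint>

bool crop_corner_valid(int32_t x, int32_t y, int32_t src_width, int32_t src_height)
{
    return x >= 0 && y >= 0 && x < src_width && y < src_height;
}

bool batch_index_valid(uint32_t batch_index, uint32_t num_batches)
{
    return batch_index < num_batches;
}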
*/ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, - Window *dst_window = nullptr); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value = 0, + Window *dst_window = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClCropKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, - Window *dst_window = nullptr); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value = 0, + Window *dst_window = nullptr); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp b/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp index a05cd1321e562f3b64abc0a412244320010d8e0e..ec44d88f01ba35df5d3f1bc6d728b8e3b1f091c6 100644 --- a/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp +++ b/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" - #include "support/StringSupport.h" namespace arm_compute @@ -48,7 +48,8 @@ Status validate_arguments(const ITensorInfo *src, unsigned int depth_offset, con { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX)); @@ -60,18 +61,20 @@ Status validate_arguments(const ITensorInfo *src, unsigned int depth_offset, con } } // namespace -ClDepthConcatenateKernel::ClDepthConcatenateKernel() - : _depth_offset(0) +ClDepthConcatenateKernel::ClDepthConcatenateKernel() : _depth_offset(0) { _type = CLKernelType::ELEMENTWISE; } -void ClDepthConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst) +void ClDepthConcatenateKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + unsigned int depth_offset, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, depth_offset, dst)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); _depth_offset = depth_offset; @@ -81,8 +84,9 @@ void ClDepthConcatenateKernel::configure(const CLCompileContext &compile_context CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DVEC_SIZE=" + 
support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); - if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + if (is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) { const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); @@ -122,8 +126,9 @@ void ClDepthConcatenateKernel::run_op(ITensorPack &tensors, const Window &window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); Window slice = window.first_slice_window_3D(); @@ -138,8 +143,7 @@ void ClDepthConcatenateKernel::run_op(ITensorPack &tensors, const Window &window add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClDepthConcatenateKernel.h b/src/gpu/cl/kernels/ClDepthConcatenateKernel.h index 4739677f3bfddf955dddbbae08c0224673875e49..539f01030316236eaf1800a9ea7f8a80cc644c2c 100644 --- a/src/gpu/cl/kernels/ClDepthConcatenateKernel.h +++ b/src/gpu/cl/kernels/ClDepthConcatenateKernel.h @@ -53,7 +53,8 @@ public: * @note: The gaps between the two lowest dimensions of src and dst need to be divisible by 2. 
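// Editorial sketch of the value mapping implied when src and dst quantization infos
// differ in ClDepthConcatenateKernel (standard asymmetric requantisation; clamping to
// the output type range is omitted here, and this is not the kernel's OpenCL code):
//   real  = in_scale * (q_src - in_offset);  q_dst = round(real / out_scale) + out_offset
#include <cmath>
#include <cstdint>

int32_t requantize(int32_t q_src, float in_scale, int32_t in_offset, float out_scale, int32_t out_offset)
{
    const float real = in_scale * static_cast<float>(q_src - in_offset);
    return static_cast<int32_t>(std::lround(real / out_scale)) + out_offset;
}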
* */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst); + void + configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClDepthConcatenateKernel::configure() diff --git a/src/gpu/cl/kernels/ClDequantizeKernel.cpp b/src/gpu/cl/kernels/ClDequantizeKernel.cpp index 756cd56a8b93c0898ab5fbbba289f048152b9686..53429ab1aafa454c918ee4f067b78e480dbe93c1 100644 --- a/src/gpu/cl/kernels/ClDequantizeKernel.cpp +++ b/src/gpu/cl/kernels/ClDequantizeKernel.cpp @@ -34,7 +34,6 @@ #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -49,9 +48,11 @@ namespace Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, + DataType::QSYMM16); - if(dst->tensor_shape().total_size() > 0) + if (dst->tensor_shape().total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32); @@ -74,7 +75,7 @@ void ClDequantizeKernel::configure(const CLCompileContext &compile_context, ITen // Output tensor auto initialization if not yet initialized auto_init_if_empty(*dst, src->tensor_shape(), 1, DataType::F32); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); @@ -87,7 +88,7 @@ void ClDequantizeKernel::configure(const CLCompileContext &compile_context, ITen // Create kernel CLBuildOptions build_opts; - if(!is_quantized_per_channel) + if (!is_quantized_per_channel) { const UniformQuantizationInfo qinfo = src->quantization_info().uniform(); const int qoffset = is_data_type_quantized_asymmetric(src->data_type()) ? 
qinfo.offset : 0; @@ -103,16 +104,18 @@ void ClDequantizeKernel::configure(const CLCompileContext &compile_context, ITen build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); build_opts.add_option("-DDATA_TYPE_SRC=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DDATA_TYPE_DST=" + get_cl_type_from_data_type(dst->data_type())); - build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max(output_width_x - vec_size_x, 0))); + build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string( + std::max(output_width_x - vec_size_x, 0))); // Create kernel name _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Configure kernel window Window win = calculate_max_window(*dst); - if(multi_access_x) + if (multi_access_x) { - win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); + win.set(Window::DimX, + Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); } ICLKernel::configure_internal(win); @@ -136,10 +139,11 @@ void ClDequantizeKernel::run_op(ITensorPack &tensors, const Window &window, cl:: const bool is_quantized_per_channel = is_data_type_quantized_per_channel(src->info()->data_type()); // Collapse windo - Window new_window = is_quantized_per_channel ? window.collapse_if_possible(ICLKernel::window(), 4) : window.collapse_if_possible(ICLKernel::window(), 3); + Window new_window = is_quantized_per_channel ? window.collapse_if_possible(ICLKernel::window(), 4) + : window.collapse_if_possible(ICLKernel::window(), 3); Window slice = new_window.first_slice_window_3D(); - if(is_quantized_per_channel) + if (is_quantized_per_channel) { unsigned int idx = num_arguments_per_3D_tensor() * 2; //Skip the input and output parameters _kernel.setArg(idx++, src->quantization().scale->cl_buffer()); @@ -151,8 +155,7 @@ void ClDequantizeKernel::run_op(ITensorPack &tensors, const Window &window, cl:: add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(new_window.slide_window_slice_3D(slice)); + } while (new_window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp index 7ad398412a0bf4fe0cd6869d1b40d8ecb1d6f4db..7cf1958c1ba1867c6ed95e599224cd8c003f0c27 100644 --- a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp +++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp @@ -23,17 +23,18 @@ */ #include "src/gpu/cl/kernels/ClDirectConv2dKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLUtils.h" #include "src/core/CL/CLValidate.h" @@ -51,11 +52,17 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, 
const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); const DataLayout data_layout = src->data_layout(); @@ -63,41 +70,56 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), "Weights feature map dimension should match the respective src's one"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), + "Weights feature map dimension should match the respective src's one"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_input_to_cl_image == true, "Export to CLImage is not supported for the input tensor"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_output_to_cl_image == true, "Export to CLImage is not supported for the output tensor"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_input_to_cl_image == true, + "Export to CLImage is not supported for the input tensor"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_output_to_cl_image == true, + "Export to CLImage is not supported for the output tensor"); - if(data_layout == DataLayout::NCHW) + if (data_layout == DataLayout::NCHW) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx), "Weights should have same width and height"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 1) && std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported for 1x1 convolution."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 3 || weights->dimension(width_idx) == 5 || weights->dimension(width_idx) == 9) && std::get<0>(conv_info.stride()) > 2, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx), + "Weights should have same width and height"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 1) && std::get<0>(conv_info.stride()) > 3, + "Strides larger than 3 not supported for 1x1 convolution."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 3 || weights->dimension(width_idx) == 5 || + weights->dimension(width_idx) == 9) && + std::get<0>(conv_info.stride()) > 2, "Strides larger than 2 not supported for 3x3, 5x5, 9x9 convolution."); ARM_COMPUTE_RETURN_ERROR_ON_MSG(act_info.enabled(), "Fused activation is not supported for NCHW layout"); - if(is_data_type_quantized(src->data_type())) + if (is_data_type_quantized(src->data_type())) { - 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9, - "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported with quantized data types"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && + weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9, + "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported with quantized data types"); } else { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5, - "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported with float data types"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && + weights->dimension(width_idx) != 5, + "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported with float data types"); } } - if(data_layout == DataLayout::NHWC) + if (data_layout == DataLayout::NHWC) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(act_info.enabled() && !is_data_type_float(src->data_type()), "Fused activation in NHWC is only supported for floating point."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, "M0 can only be greater than 0 and less than or equal to 8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && desc.n0 != 16, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(act_info.enabled() && !is_data_type_float(src->data_type()), + "Fused activation in NHWC is only supported for floating point."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, + "M0 can only be greater than 0 and less than or equal to 8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && + desc.n0 != 16, "N0 can only be: 1, 2, 3, 4, 8, and 16"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && + desc.k0 != 16, "K0 can only be: 1, 2, 3, 4, 8, and 16"); - if(desc.export_weights_to_cl_image) + if (desc.export_weights_to_cl_image) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16, "K0 can only be: 4, 8, and 16"); @@ -106,9 +128,9 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co } } - if(biases != nullptr) + if (biases != nullptr) { - if(is_data_type_quantized_asymmetric(src->data_type())) + if (is_data_type_quantized_asymmetric(src->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -118,20 +140,19 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co } ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(0) != weights->dimension(3), "Biases size and number of dst feature maps should match"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, - "Biases should be one dimensional"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, "Biases should be one dimensional"); } // Checks performed when dst is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), - misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, 
conv_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); } const auto data_type = src->data_type(); - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); @@ -140,7 +161,8 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; int output_multiplier = 0; int output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); } return Status{}; } @@ -151,8 +173,14 @@ ClDirectConv2dKernel::ClDirectConv2dKernel() _type = CLKernelType::DIRECT; } -void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc) +void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); @@ -178,14 +206,11 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); // Output auto inizialitation if not yet initialized - auto_init_if_empty(*dst, output_shape, - 1, - src->data_type(), - src->quantization_info()); + auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info()); // Configure kernel window Window win; - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { output_shape.collapse(2U, 1U); const unsigned int n0 = adjust_vec_size(desc.n0, output_shape[0]); @@ -194,7 +219,7 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT // Create window and update padding win = calculate_max_window(output_shape, Steps(n0, m0)); } - else if(_data_layout == DataLayout::NCHW) + else if (_data_layout == DataLayout::NCHW) { _num_elems_processed_per_iteration = 1u; win = calculate_max_window(*dst, Steps(_num_elems_processed_per_iteration)); @@ -205,7 +230,7 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT std::stringstream kernel_name; CLBuildOptions build_options; - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { kernel_name << "direct_convolution_nhwc"; @@ -221,22 +246,22 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT _export_output_to_cl_image = desc.export_output_to_cl_image; // Update the padding for the weights tensor if we can export to cl_image - if(_export_weights_to_cl_image) + if (_export_weights_to_cl_image) { gemm::update_padding_for_cl_image(weights); } - if(_export_output_to_cl_image) + if (_export_output_to_cl_image) { gemm::update_padding_for_cl_image(dst); } - 
if(_export_input_to_cl_image) + if (_export_input_to_cl_image) { gemm::update_padding_for_cl_image(src); } - if(biases != nullptr) + if (biases != nullptr) { build_options.add_option(std::string("-DHAS_BIAS")); build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type()))); @@ -246,9 +271,10 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT const auto act_function = act_info.activation(); const auto dst_data_type = dst->data_type(); - if((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) - && (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - && (dst_data_type == DataType::F32 || dst_data_type == DataType::F16)) + if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) && + (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || + act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) && + (dst_data_type == DataType::F32 || dst_data_type == DataType::F16)) { // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations @@ -259,7 +285,8 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT build_options.add_option("-cl-fast-relaxed-math"); } - build_options.add_option_if_else(_export_input_to_cl_image, "-DSRC_TENSOR_TYPE=IMAGE", "-DSRC_TENSOR_TYPE=BUFFER"); + build_options.add_option_if_else(_export_input_to_cl_image, "-DSRC_TENSOR_TYPE=IMAGE", + "-DSRC_TENSOR_TYPE=BUFFER"); build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src->dimension(0))); build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(1))); @@ -267,9 +294,11 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(0))); build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(1))); build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(2))); - build_options.add_option_if_else(_export_output_to_cl_image, "-DDST_TENSOR_TYPE=IMAGE", "-DDST_TENSOR_TYPE=BUFFER"); + build_options.add_option_if_else(_export_output_to_cl_image, "-DDST_TENSOR_TYPE=IMAGE", + "-DDST_TENSOR_TYPE=BUFFER"); build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst_data_type)); - build_options.add_option_if_else(_export_weights_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER"); + build_options.add_option_if_else(_export_weights_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", + "-DWEI_TENSOR_TYPE=BUFFER"); build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights->dimension(width_idx))); build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx))); build_options.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(weights->data_type())); @@ -284,7 +313,7 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT build_options.add_option_if((src->dimension(channel_idx) % k0) != 0, "-DLEFTOVER_LOOP"); build_options.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_function))); 
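// Editorial sketch of the fixed-point decomposition behind the float multiplier
// iq.scale * wq.scale / oq.scale computed above. This is the usual gemmlowp/TFLite-style
// split into a Q0.31 mantissa and an exponent, not claimed to be byte-for-byte the
// library's calculate_quantized_multiplier():
#include <cmath>
#include <cstdint>

void decompose_multiplier(float real_multiplier, int32_t &quantized_multiplier, int &exponent)
{
    const double mantissa = std::frexp(static_cast<double>(real_multiplier), &exponent); // in [0.5, 1)
    int64_t q = std::llround(mantissa * (1ll << 31));
    if (q == (1ll << 31)) // rounding can push the mantissa up to exactly 1.0
    {
        q /= 2;
        ++exponent;
    }
    quantized_multiplier = static_cast<int32_t>(q);
}

// e.g. iq.scale = 0.5f, wq.scale = 0.25f, oq.scale = 0.0625f gives a real multiplier of 2.0,
// which decomposes into quantized_multiplier = 2^30 and exponent = 2.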
- if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); @@ -314,11 +343,13 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(0)); build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(0)); build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(0)); - build_options.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); - build_options.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); + build_options.add_option_if(act_info.enabled(), + "-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); + build_options.add_option_if(act_info.enabled(), + "-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); } - if(compile_context.get_ddk_version() >= 30) + if (compile_context.get_ddk_version() >= 30) { build_options.add_option("-fregister-allocation=64"); } @@ -340,13 +371,17 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx))); build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type))); build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type))); - build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx)))); + build_options.add_option( + std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx)))); build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x))); build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type))); - build_options.add_option(std::string("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration))); - build_options.add_option(std::string("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration))); + build_options.add_option( + std::string("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration))); + build_options.add_option( + std::string("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration))); - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); @@ -405,8 +440,13 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT _config_id += lower_string(string_from_data_layout(_data_layout)); } -Status ClDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc) +Status ClDirectConv2dKernel::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, 
weights, biases, dst, conv_info, act_info, desc)); return Status{}; @@ -420,52 +460,55 @@ void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl // Get initial windows Window slice = window.first_slice_window_3D(); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto weights = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto biases = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto weights = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto biases = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { cl::Image2D weights_cl_image; cl::Image2D output_cl_image; cl::Image2D input_cl_image; - if(_export_weights_to_cl_image) + if (_export_weights_to_cl_image) { // Export tensor to cl_image weights_cl_image = create_image2d_from_tensor(weights, CLImage2DType::ReadOnly); } - if(_export_output_to_cl_image) + if (_export_output_to_cl_image) { // Export tensor to cl_image output_cl_image = create_image2d_from_tensor(dst, CLImage2DType::WriteOnly); } - if(_export_input_to_cl_image) + if (_export_input_to_cl_image) { // Export tensor to cl_image input_cl_image = create_image2d_from_tensor(src, CLImage2DType::ReadOnly); } unsigned int idx = 0; - if(_export_input_to_cl_image) + if (_export_input_to_cl_image) { _kernel.setArg(idx++, input_cl_image); } add_4d_tensor_nhwc_argument(idx, src); - if(_export_output_to_cl_image) + if (_export_output_to_cl_image) { _kernel.setArg(idx++, output_cl_image); } add_4d_tensor_nhwc_argument(idx, dst); - if(_export_weights_to_cl_image) + if (_export_weights_to_cl_image) { _kernel.setArg(idx++, weights_cl_image); } add_4d_tensor_nhwc_argument(idx, weights); - if(biases != nullptr) + if (biases != nullptr) { add_1D_tensor_argument(idx, biases, slice); } @@ -476,7 +519,7 @@ void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl unsigned int idx1 = 2 * num_arguments_per_3D_tensor(); add_3D_tensor_argument(idx1, weights, slice); - if(biases != nullptr) + if (biases != nullptr) { Window slice_biases; slice_biases.use_tensor_dimensions(biases->info()->tensor_shape()); @@ -491,8 +534,7 @@ void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } } // namespace kernels diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.h b/src/gpu/cl/kernels/ClDirectConv2dKernel.h index 7132762b35aa161298d60fc9148b38057406aa67..c934c825cabbc58ac73910e4d9d08d14975d1880 100644 --- a/src/gpu/cl/kernels/ClDirectConv2dKernel.h +++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ 
-68,16 +69,27 @@ public: * @param[in] act_info Contains activaton information described in @ref ActivationLayerInfo. * @param[in] desc Direct convolution descriptor used to build the NHWC direct convolution kernel. For NCHW, this parameter is ignored. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc); /** Static function to check if given info will lead to a valid configuration * * Similar to ClDirectConv2dKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; @@ -85,9 +97,9 @@ public: public: DataLayout _data_layout{}; PadStrideInfo _conv_info{}; - bool _export_weights_to_cl_image{ false }; - bool _export_output_to_cl_image{ false }; - bool _export_input_to_cl_image{ false }; + bool _export_weights_to_cl_image{false}; + bool _export_output_to_cl_image{false}; + bool _export_input_to_cl_image{false}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp index 6191178911ef6f2a235cebc82258506b3bee2e1d..8002520a879fb1c277a0c6a4116c0a250a10da49 100644 --- a/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp +++ b/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" @@ -40,7 +41,11 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info) +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv3d_info) { ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_LAYOUT(src0, src1, dst); ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->data_layout() != DataLayout::NDHWC, "Only NDHWC layout supported"); @@ -49,20 +54,25 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv3d_info.act_info.enabled(), "Fused activation not supported"); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32, 
DataType::QASYMM8, + DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); ARM_COMPUTE_RETURN_ERROR_ON(conv3d_info.dilation != Size3D(1U, 1U, 1U)); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->dimension(1) != src0->dimension(0), "Weights feature map dimension should match the respective src's one"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->dimension(1) != src0->dimension(0), + "Weights feature map dimension should match the respective src's one"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 5, "Weights can be at most 5 dimensional"); - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(2) > (src0->dimension(1) + conv3d_info.padding.left + conv3d_info.padding.right)); - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(3) > (src0->dimension(2) + conv3d_info.padding.top + conv3d_info.padding.bottom)); - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(4) > (src0->dimension(3) + conv3d_info.padding.front + conv3d_info.padding.back)); + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(2) > + (src0->dimension(1) + conv3d_info.padding.left + conv3d_info.padding.right)); + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(3) > + (src0->dimension(2) + conv3d_info.padding.top + conv3d_info.padding.bottom)); + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(4) > + (src0->dimension(3) + conv3d_info.padding.front + conv3d_info.padding.back)); - if(src2 != nullptr) + if (src2 != nullptr) { - if(is_data_type_quantized(src0->data_type())) + if (is_data_type_quantized(src0->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::S32); } @@ -70,15 +80,18 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->dimension(0) != src1->dimension(0), "Biases size and number of dst feature maps should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->dimension(0) != src1->dimension(0), + "Biases size and number of dst feature maps should match"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->num_dimensions() > 1, "Biases should be one dimensional"); } // Checks performed when dst is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src1->dimension(0), "Weights and dst OFMs should match"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv3d_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), + misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv3d_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); } @@ -91,8 +104,12 @@ ClDirectConv3dKernel::ClDirectConv3dKernel() _type = CLKernelType::DIRECT; } -void ClDirectConv3dKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, - const Conv3dInfo &conv3d_info) +void ClDirectConv3dKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + const Conv3dInfo &conv3d_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); @@ -149,13 +166,13 @@ void ClDirectConv3dKernel::configure(const CLCompileContext &compile_context, co build_options.add_option("-DK0=" + support::cpp11::to_string(k0)); build_options.add_option("-DPARTIAL_N0=" + 
support::cpp11::to_string(partial_store_n0)); - if(src2 != nullptr) + if (src2 != nullptr) { build_options.add_option(std::string("-DHAS_BIAS")); build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(src2->data_type()))); } - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { const UniformQuantizationInfo iqinfo = src0->quantization_info().uniform(); const UniformQuantizationInfo wqinfo = src1->quantization_info().uniform(); @@ -218,7 +235,11 @@ void ClDirectConv3dKernel::configure(const CLCompileContext &compile_context, co _config_id += support::cpp11::to_string(dst_channels); } -Status ClDirectConv3dKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info) +Status ClDirectConv3dKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv3d_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, conv3d_info)); return Status{}; @@ -229,21 +250,28 @@ void ClDirectConv3dKernel::run_op(ITensorPack &tensors, const Window &window, cl ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto weights = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto biases = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto weights = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto biases = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); // Get initial windows Window slice = window.first_slice_window_3D(); - slice.set(Window::DimY, Window::Dimension(0, ceil_to_multiple(dst->info()->dimension(1) * dst->info()->dimension(2) * dst->info()->dimension(3), slice.y().step()), slice.y().step())); + slice.set(Window::DimY, Window::Dimension(0, + ceil_to_multiple(dst->info()->dimension(1) * dst->info()->dimension(2) * + dst->info()->dimension(3), + slice.y().step()), + slice.y().step())); slice.set(Window::DimZ, Window::Dimension(0, dst->info()->dimension(4), 1)); unsigned int idx = 0; add_4D_tensor_argument(idx, src, slice); add_4D_tensor_argument(idx, dst, slice); add_4D_tensor_argument(idx, weights, slice); - if(biases != nullptr) + if (biases != nullptr) { add_1D_tensor_argument(idx, biases, slice); } diff --git a/src/gpu/cl/kernels/ClDirectConv3dKernel.h b/src/gpu/cl/kernels/ClDirectConv3dKernel.h index de4f0ce216c26ab7dd77d8b888bc958d6fab4920..cb7509d8fa2aa707fd0a74e7a805a1a69fb86be8 100644 --- a/src/gpu/cl/kernels/ClDirectConv3dKernel.h +++ b/src/gpu/cl/kernels/ClDirectConv3dKernel.h @@ -70,14 +70,23 @@ public: * @param[out] dst Destination tensor. 4 lower dimensions represent a single dst [OFM, width, height, depth], while the rest represent batch of dsts. * @param[in] conv3d_info Contains strides, padding, rounding, activation, dilation and fast math information. Activation and fast math are currently unused. 
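The reflowed Window::DimY bound in ClDirectConv3dKernel::run_op above is unchanged in meaning: it is the product of destination dimensions 1-3 (width * height * depth for the NDHWC layout), rounded up to the slice's Y step. A minimal, self-contained sketch of that arithmetic follows; the dimension values are hypothetical and this local ceil_to_multiple is only a stand-in for the library helper of the same name.

    // Sketch only, not part of the patch. Dimensions are hypothetical.
    #include <cstddef>
    #include <iostream>

    static std::size_t ceil_to_multiple(std::size_t value, std::size_t divisor)
    {
        // Round 'value' up to the next multiple of 'divisor'.
        return ((value + divisor - 1) / divisor) * divisor;
    }

    int main()
    {
        const std::size_t w = 7, h = 5, d = 3; // hypothetical dst width/height/depth
        const std::size_t step_y = 4;          // hypothetical Y step of the slice
        // Upper bound used for Window::DimY: w * h * d rounded up to the step.
        std::cout << ceil_to_multiple(w * h * d, step_y) << '\n'; // prints 108
        return 0;
    }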
*/ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv3d_info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + const Conv3dInfo &conv3d_info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClDirectConv3dKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv3d_info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClElementwiseKernel.cpp b/src/gpu/cl/kernels/ClElementwiseKernel.cpp index 6beee576b5d21cdd3fc583f71c4156ed27489290..cdb3527a92147430454be9bc5f650d46a448fa69 100644 --- a/src/gpu/cl/kernels/ClElementwiseKernel.cpp +++ b/src/gpu/cl/kernels/ClElementwiseKernel.cpp @@ -23,18 +23,20 @@ */ #include "src/gpu/cl/kernels/ClElementwiseKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/common/utils/Validate.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" #include "support/StringSupport.h" + #include namespace arm_compute @@ -47,25 +49,20 @@ namespace { constexpr unsigned int vector_size_byte_opencl = 16; -std::map supported_arithmetic_ops = -{ - { ArithmeticOperation::ADD, "ADD" }, - { ArithmeticOperation::SUB, "SUB" }, - { ArithmeticOperation::DIV, "DIV" }, - { ArithmeticOperation::SQUARED_DIFF, "SQUARED_DIFF" }, - { ArithmeticOperation::MIN, "MIN" }, - { ArithmeticOperation::MAX, "MAX" }, - { ArithmeticOperation::POWER, "POWER" }, - { ArithmeticOperation::PRELU, "PRELU" }, +std::map supported_arithmetic_ops = { + {ArithmeticOperation::ADD, "ADD"}, {ArithmeticOperation::SUB, "SUB"}, + {ArithmeticOperation::DIV, "DIV"}, {ArithmeticOperation::SQUARED_DIFF, "SQUARED_DIFF"}, + {ArithmeticOperation::MIN, "MIN"}, {ArithmeticOperation::MAX, "MAX"}, + {ArithmeticOperation::POWER, "POWER"}, {ArithmeticOperation::PRELU, "PRELU"}, }; -std::map supported_sat_arithmetic_ops = -{ - { ArithmeticOperation::ADD, "ADD" }, - { ArithmeticOperation::SUB, "SUB" }, +std::map supported_sat_arithmetic_ops = { + {ArithmeticOperation::ADD, "ADD"}, + {ArithmeticOperation::SUB, "SUB"}, }; -std::string generate_id_for_tuning_common(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) +std::string +generate_id_for_tuning_common(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) { std::string config_id; // Set config_id for enabling LWS tuning @@ -79,12 +76,18 @@ std::string generate_id_for_tuning_common(const std::string &kernel_name, const return config_id; } -Status validate_in_place_output_shape(const bool in_place, const bool src1_in_place, const 
ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst, const TensorShape &out_shape) +Status validate_in_place_output_shape(const bool in_place, + const bool src1_in_place, + const ITensorInfo &src1, + const ITensorInfo &src2, + const ITensorInfo &dst, + const TensorShape &out_shape) { - if(in_place) + if (in_place) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, src1_in_place ? src1.tensor_shape() : src2.tensor_shape(), 0), - "Wrong shape for dst, cannot do in_place calculation"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, src1_in_place ? src1.tensor_shape() : src2.tensor_shape(), 0), + "Wrong shape for dst, cannot do in_place calculation"); } else { @@ -94,7 +97,9 @@ Status validate_in_place_output_shape(const bool in_place, const bool src1_in_pl return Status{}; } -Status validate_arguments_with_float_only_supported_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) +Status validate_arguments_with_float_only_supported_rules(const ITensorInfo &src1, + const ITensorInfo &src2, + const ITensorInfo &dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(&src1, &src2, &dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src1); @@ -110,11 +115,12 @@ Status validate_arguments_with_float_only_supported_rules(const ITensorInfo &src ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); // Validate in case of configured dst - if(dst.total_size() > 0) + if (dst.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape)); } return Status{}; @@ -136,25 +142,27 @@ Status validate_arguments_divide_operation(const ITensorInfo *src1, const ITenso ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); // Validate in case of configured dst - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, *src1, *src2, *dst, out_shape)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_in_place_output_shape(in_place, src1_in_place, *src1, *src2, *dst, out_shape)); } return Status{}; } -Status validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) +Status +validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, DataType::F16, - DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, + DataType::F16, DataType::S32, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &src2); - if(is_data_type_quantized_symmetric(src1.data_type())) + if 
(is_data_type_quantized_symmetric(src1.data_type())) { const int32_t in1_offset = src1.quantization_info().uniform().offset; const int32_t in2_offset = src2.quantization_info().uniform().offset; @@ -170,13 +178,15 @@ Status validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const I ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); // Validate in case of configured dst - if(dst.total_size() > 0) + if (dst.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), "Wrong shape for dst"); - ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), + "Wrong shape for dst"); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape)); - if(is_data_type_quantized_symmetric(dst.data_type())) + if (is_data_type_quantized_symmetric(dst.data_type())) { const int32_t offset = dst.quantization_info().uniform().offset; ARM_COMPUTE_RETURN_ERROR_ON_MSG(offset != 0, "For quantized symmetric, offset must be zero"); @@ -185,19 +195,26 @@ Status validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const I return Status{}; } -CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst, const std::string &operation_string) +CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &src1, + const ITensorInfo &src2, + const ITensorInfo &dst, + const std::string &operation_string) { CLBuildOptions build_opts; - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0)); + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0)); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src1.data_type())); - build_opts.add_option("-DVEC_SIZE_IN1=" + support::cpp11::to_string(src1.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_IN2=" + support::cpp11::to_string(src2.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_IN1=" + + support::cpp11::to_string(src1.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_IN2=" + + support::cpp11::to_string(src2.dimension(0) == 1 ? 
1 : num_elems_processed_per_iteration)); build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(dst.dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(dst.dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DOP=" + operation_string); - if(is_data_type_quantized(src1.data_type())) + if (is_data_type_quantized(src1.data_type())) { const UniformQuantizationInfo iq1info = src1.quantization_info().uniform(); const UniformQuantizationInfo iq2info = src2.quantization_info().uniform(); @@ -223,14 +240,17 @@ CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &s std::pair configure_window_arithmetic_common(ITensorInfo &dst) { - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0)); - Window win = calculate_max_window(dst, Steps(num_elems_processed_per_iteration)); + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0)); + Window win = calculate_max_window(dst, Steps(num_elems_processed_per_iteration)); return std::make_pair(Status{}, win); } -std::pair validate_and_configure_window_for_arithmetic_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) +std::pair +validate_and_configure_window_for_arithmetic_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) { - const std::pair broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(src1, src2); + const std::pair broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(src1, src2); const TensorShape &out_shape = broadcast_pair.first; auto_init_if_empty(dst, out_shape, 1, src1.data_type()); @@ -238,9 +258,11 @@ std::pair validate_and_configure_window_for_arithmetic_operators return configure_window_arithmetic_common(dst); } -std::pair validate_and_configure_window_for_logical_binary_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) +std::pair +validate_and_configure_window_for_logical_binary_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) { - const std::pair broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(src1, src2); + const std::pair broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(src1, src2); const TensorShape &out_shape = broadcast_pair.first; set_shape_if_empty(dst, out_shape); @@ -249,9 +271,11 @@ std::pair validate_and_configure_window_for_logical_binary_opera return configure_window_arithmetic_common(dst); } -std::pair validate_and_configure_window_for_division(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) +std::pair +validate_and_configure_window_for_division(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) { - const std::pair broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(src1, src2); + const std::pair broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(src1, src2); const TensorShape &out_shape = broadcast_pair.first; auto_init_if_empty(dst, out_shape, 1, src1.data_type()); @@ -265,21 +289,24 @@ ClElementwiseKernel::ClElementwiseKernel() _type = CLKernelType::ELEMENTWISE; } -void ClElementwiseKernel::configure_common(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst) +void ClElementwiseKernel::configure_common(const ClCompileContext 
&compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst) { // Configure kernel window auto win_config = validate_and_configure_window(*src1, *src2, *dst); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); std::string kernel_name = "elementwise_operation_" + name(); - if(is_data_type_quantized(src1->data_type())) + if (is_data_type_quantized(src1->data_type())) { kernel_name += "_quantized"; } // Set kernel build options CLBuildOptions build_opts = generate_build_options(*src1, *src2, *dst); - if(_act_info.enabled()) + if (_act_info.enabled()) { build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(_act_info.activation()))); build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(_act_info.a())); @@ -299,9 +326,11 @@ void ClElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, ::c ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src_0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src_1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src_0 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src_1 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src_0, src_1, dst); @@ -311,17 +340,18 @@ void ClElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, ::c bool can_collapse = true; const bool is_vector = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1; - if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector) + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector) { can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) { can_collapse = (in_shape1[d] == in_shape2[d]); } } bool has_collapsed = false; - Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; const TensorShape &in_shape2_collapsed = has_collapsed ? 
in_shape2.collapsed_from(Window::DimZ) : in_shape2; @@ -337,7 +367,7 @@ void ClElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, ::c unsigned int idx = 0; add_3D_tensor_argument(idx, src_0, slice_src1); add_3D_tensor_argument(idx, src_1, slice_src2); - if(!in_place) + if (!in_place) { add_3D_tensor_argument(idx, dst, slice); } @@ -345,13 +375,16 @@ void ClElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, ::c enqueue(queue, *this, slice, lws_hint()); ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_src1)); ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_src2)); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } /** Logical binary */ -void ClLogicalBinaryKernel::configure(const ClCompileContext &compile_context, LogicalOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst) +void ClLogicalBinaryKernel::configure(const ClCompileContext &compile_context, + LogicalOperation op, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); ARM_COMPUTE_ERROR_THROW_ON(ClLogicalBinaryKernel::validate(op, src1, src2, dst)); @@ -359,7 +392,10 @@ void ClLogicalBinaryKernel::configure(const ClCompileContext &compile_context, L configure_common(compile_context, src1, src2, dst); } -Status ClLogicalBinaryKernel::validate(LogicalOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) +Status ClLogicalBinaryKernel::validate(LogicalOperation op, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst) { ARM_COMPUTE_UNUSED(op); ARM_COMPUTE_ASSERT(op != LogicalOperation::Unknown && op != LogicalOperation::Not); @@ -369,14 +405,16 @@ Status ClLogicalBinaryKernel::validate(LogicalOperation op, const ITensorInfo *s ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*src1, *src2, *dst)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_logical_binary_operators(*src1->clone(), *src2->clone(), *dst->clone()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_for_logical_binary_operators(*src1->clone(), *src2->clone(), *dst->clone()) + .first); return Status{}; } std::string ClLogicalBinaryKernel::name() { - switch(_op) + switch (_op) { case LogicalOperation::And: return "AND"; @@ -390,30 +428,38 @@ std::string ClLogicalBinaryKernel::name() return ""; } -std::pair ClLogicalBinaryKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) +std::pair +ClLogicalBinaryKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) { return validate_and_configure_window_for_logical_binary_operators(src1, src2, dst); } -CLBuildOptions ClLogicalBinaryKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) +CLBuildOptions +ClLogicalBinaryKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) { // The arithmetic utility functions can be share return generate_build_options_with_arithmetic_rules(src1, src2, dst, name()); } -std::string ClLogicalBinaryKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) +std::string ClLogicalBinaryKernel::generate_id_for_tuning(const std::string &kernel_name, + const ITensorInfo &src1, + const ITensorInfo &dst) { return 
generate_id_for_tuning_common(kernel_name, src1, dst); } /** Arithmetic operations with saturation*/ -void ClSaturatedArithmeticKernel::configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, +void ClSaturatedArithmeticKernel::configure(const ClCompileContext &compile_context, + ArithmeticOperation op, + ITensorInfo *input1, + ITensorInfo *input2, + ITensorInfo *output, const ConvertPolicy &policy, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_ERROR_THROW_ON(ClSaturatedArithmeticKernel::validate(op, input1, input2, output, policy, act_info)); - auto padding_info = get_padding_info({ input1, input2, output }); + auto padding_info = get_padding_info({input1, input2, output}); _policy = policy; _op = op; @@ -422,24 +468,34 @@ void ClSaturatedArithmeticKernel::configure(const ClCompileContext &compile_cont ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClSaturatedArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ConvertPolicy &policy, +Status ClSaturatedArithmeticKernel::validate(ArithmeticOperation op, + const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ConvertPolicy &policy, const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(op, policy); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*input1, *input2, *output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_arithmetic_operators(*input1->clone(), *input2->clone(), *output->clone()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_for_arithmetic_operators(*input1->clone(), *input2->clone(), *output->clone()) + .first); ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(output->data_type())); return Status{}; } -std::pair ClSaturatedArithmeticKernel::validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) +std::pair ClSaturatedArithmeticKernel::validate_and_configure_window(ITensorInfo &input1, + ITensorInfo &input2, + ITensorInfo &output) { return validate_and_configure_window_for_arithmetic_operators(input1, input2, output); } -CLBuildOptions ClSaturatedArithmeticKernel::generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) +CLBuildOptions ClSaturatedArithmeticKernel::generate_build_options(const ITensorInfo &input1, + const ITensorInfo &input2, + const ITensorInfo &output) { const bool has_float_out = is_data_type_float(output.data_type()); auto build_options = generate_build_options_with_arithmetic_rules(input1, input2, output, name()); @@ -447,7 +503,9 @@ CLBuildOptions ClSaturatedArithmeticKernel::generate_build_options(const ITensor return build_options; } -std::string ClSaturatedArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output) +std::string ClSaturatedArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name, + const ITensorInfo &input1, + const ITensorInfo &output) { auto config_id = generate_id_for_tuning_common(kernel_name, input1, output); config_id += (_policy == ConvertPolicy::WRAP) ? 
"_wrap_" : "_saturate_"; @@ -461,12 +519,16 @@ std::string ClSaturatedArithmeticKernel::name() } /** Arithmetic operations*/ -void ClArithmeticKernel::configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, +void ClArithmeticKernel::configure(const ClCompileContext &compile_context, + ArithmeticOperation op, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); ARM_COMPUTE_ERROR_THROW_ON(ClArithmeticKernel::validate(op, src1, src2, dst, act_info)); - auto padding_info = get_padding_info({ src1, src2, dst }); + auto padding_info = get_padding_info({src1, src2, dst}); _op = op; _act_info = act_info; @@ -474,33 +536,42 @@ void ClArithmeticKernel::configure(const ClCompileContext &compile_context, Arit ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClArithmeticKernel::validate(ArithmeticOperation op, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst); - if(op == ArithmeticOperation::DIV) + if (op == ArithmeticOperation::DIV) { // Partial integer support S32/F32/F16 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_divide_operation(src1, src2, dst)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first); } - else if(op == ArithmeticOperation::POWER) + else if (op == ArithmeticOperation::POWER) { // Power operators doesn't support integer arithmetic ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_float_only_supported_rules(*src1, *src2, *dst)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first); } else { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*src1, *src2, *dst)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_arithmetic_operators(*src1->clone(), *src2->clone(), *dst->clone()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_for_arithmetic_operators(*src1->clone(), *src2->clone(), *dst->clone()) + .first); } ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type())); return Status{}; } -std::pair ClArithmeticKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) +std::pair +ClArithmeticKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) { - if(_op == ArithmeticOperation::DIV || _op == ArithmeticOperation::POWER) + if (_op == ArithmeticOperation::DIV || _op == ArithmeticOperation::POWER) { // Division and Power operators don't support integer arithmetic return validate_and_configure_window_for_division(src1, src2, dst); @@ -511,11 +582,14 @@ std::pair ClArithmeticKernel::validate_and_configure_window(ITen } } -CLBuildOptions ClArithmeticKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) 
+CLBuildOptions +ClArithmeticKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) { return generate_build_options_with_arithmetic_rules(src1, src2, dst, name()); } -std::string ClArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) +std::string ClArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name, + const ITensorInfo &src1, + const ITensorInfo &dst) { return generate_id_for_tuning_common(kernel_name, src1, dst); } diff --git a/src/gpu/cl/kernels/ClElementwiseKernel.h b/src/gpu/cl/kernels/ClElementwiseKernel.h index ea3ddb2124c14a3cb08730a307b0b33bc623de20..73e54542b2fb748934448a654770e4ba874765c4 100644 --- a/src/gpu/cl/kernels/ClElementwiseKernel.h +++ b/src/gpu/cl/kernels/ClElementwiseKernel.h @@ -25,8 +25,9 @@ #define ARM_COMPUTE_CL_ELEMENTWISE_KERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" -#include "src/core/KernelTypes.h" + #include "src/core/common/Macros.h" +#include "src/core/KernelTypes.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -65,24 +66,28 @@ protected: * * @return a pair of Status and Window */ - virtual std::pair validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) = 0; + virtual std::pair + validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) = 0; /** Generate the build options for the specific kernel * * @reutrn a CLBuildOptions struct */ - virtual CLBuildOptions generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) = 0; + virtual CLBuildOptions + generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) = 0; /** Generate the identifier for tuning * * @reutrn a string */ - virtual std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) = 0; + virtual std::string + generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) = 0; /** Commmon configure function for element-wise operators with no additional options (e.g., Div, Min, Max, SquaredDiff) * */ - void configure_common(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst); + void + configure_common(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst); ActivationLayerInfo _act_info{}; }; @@ -100,23 +105,31 @@ public: * @param[in] src2 Second source tensor info. Data types supported: same as @p src1. * @param[in] dst Destination tensor info. Data types supported: same as @p src1. 
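The generate_build_options_with_arithmetic_rules hunks reflowed above derive every vector size from a 16-byte OpenCL vector budget: the per-iteration element count is that budget divided by the destination element size and adjusted to the destination width, an input's VEC_SIZE collapses to 1 when it broadcasts along X, and VEC_SIZE_LEFTOVER is the tail that does not fill a whole vector. A self-contained sketch with hypothetical sizes; adjust_vec_size below is a simplified stand-in for the library helper.

    #include <iostream>

    // Simplified stand-in: cap the vector size by the tensor width.
    static unsigned int adjust_vec_size(unsigned int vec_size, unsigned int dim0)
    {
        return vec_size > dim0 ? dim0 : vec_size;
    }

    int main()
    {
        const unsigned int vector_size_byte_opencl = 16;
        const unsigned int element_size            = 4;  // hypothetical: 4-byte elements (e.g. F32)
        const unsigned int dst_dim0                = 23; // hypothetical dst width
        const unsigned int src1_dim0               = 23;
        const unsigned int src2_dim0               = 1;  // broadcast along X

        const unsigned int n = adjust_vec_size(vector_size_byte_opencl / element_size, dst_dim0);

        std::cout << "-DVEC_SIZE_IN1=" << (src1_dim0 == 1 ? 1 : n) << '\n'; // 4
        std::cout << "-DVEC_SIZE_IN2=" << (src2_dim0 == 1 ? 1 : n) << '\n'; // 1
        std::cout << "-DVEC_SIZE_OUT=" << n << '\n';                        // 4
        std::cout << "-DVEC_SIZE_LEFTOVER=" << (dst_dim0 % n) << '\n';      // 3
        return 0;
    }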
*/ - void configure(const ClCompileContext &compile_context, LogicalOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst); + void configure(const ClCompileContext &compile_context, + LogicalOperation op, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClLogicalBinaryKernel::configure() * * @return a status */ - static Status validate(LogicalOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst); + static Status + validate(LogicalOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst); private: // Inherited methods overridden: std::string name() override; - std::pair validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override; - CLBuildOptions generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override; - std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override; - - LogicalOperation _op{ LogicalOperation::Unknown }; + std::pair + validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override; + CLBuildOptions + generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override; + std::string + generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override; + + LogicalOperation _op{LogicalOperation::Unknown}; }; /** Addition operation */ @@ -135,7 +148,12 @@ public: * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
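Beyond the one-parameter-per-line re-wrapping, two purely cosmetic rules account for most of the remaining churn in these headers and sources: braced initializers such as _op{LogicalOperation::Unknown} lose their inner spaces, and control-flow keywords gain a space before the parenthesis. A small illustration with a stand-in struct (Flags is not a library type):

    #include <iostream>

    struct Flags
    {
        bool export_weights_to_cl_image{false}; // was: { false }
    };

    int main()
    {
        Flags f;
        if (!f.export_weights_to_cl_image) // was: if(!f.export_weights_to_cl_image)
        {
            std::cout << "cl_image export disabled\n";
        }
        return 0;
    }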
*/ - void configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ConvertPolicy &policy, + void configure(const ClCompileContext &compile_context, + ArithmeticOperation op, + ITensorInfo *input1, + ITensorInfo *input2, + ITensorInfo *output, + const ConvertPolicy &policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration @@ -144,15 +162,23 @@ public: * * @return a status */ - static Status validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ConvertPolicy &policy, + static Status validate(ArithmeticOperation op, + const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ConvertPolicy &policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); protected: // Inherited methods overridden: std::string name() override; - std::pair validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) override; - CLBuildOptions generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override; - std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output) override; + std::pair + validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) override; + CLBuildOptions + generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override; + std::string generate_id_for_tuning(const std::string &kernel_name, + const ITensorInfo &input1, + const ITensorInfo &output) override; private: ConvertPolicy _policy{}; @@ -174,7 +200,11 @@ public: * @param[in] dst Destination tensor info. Data types supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
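The validate() overloads reflowed in ClElementwiseKernel.cpp call the window-configuration helpers on clone()d tensor infos and keep only the Status half of the returned pair, which appears intended to keep the auto-initialisation inside those helpers from touching the caller's infos. A simplified sketch of that pattern; Info, Status and Window below are stand-ins, not the library classes.

    #include <memory>
    #include <string>
    #include <utility>

    struct Info
    {
        int width{0};
        std::unique_ptr<Info> clone() const { return std::make_unique<Info>(*this); }
    };
    struct Status { std::string error; };
    struct Window { int step{1}; };

    // Mutates 'dst' (auto-initialisation if empty), so callers that must not
    // alter their infos pass clones.
    std::pair<Status, Window> validate_and_configure_window(Info &src, Info &dst)
    {
        if (dst.width == 0)
        {
            dst.width = src.width;
        }
        return std::make_pair(Status{}, Window{});
    }

    Status validate(const Info *src, const Info *dst)
    {
        // Work on clones and keep only the Status part of the pair.
        return validate_and_configure_window(*src->clone(), *dst->clone()).first;
    }

    int main()
    {
        Info src;
        src.width = 8;
        Info dst; // left empty on purpose
        const Status s = validate(&src, &dst);
        return s.error.empty() && dst.width == 0 ? 0 : 1; // dst stays untouched
    }

The configure() path, by contrast, passes the real infos, so the same helpers can initialise an empty destination there.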
*/ - void configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, + void configure(const ClCompileContext &compile_context, + ArithmeticOperation op, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration @@ -183,14 +213,21 @@ public: * * @return a status */ - static Status validate(ArithmeticOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(ArithmeticOperation op, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); protected: // Inherited methods overridden: std::string name() override; - std::pair validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override; - CLBuildOptions generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override; - std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override; + std::pair + validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override; + CLBuildOptions + generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override; + std::string + generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override; private: ArithmeticOperation _op{}; diff --git a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp index 744a3a40c70c3f35ada7556aff2c265ff5d2a558..f7c198ee54a64c642da93ed06891efb0899682d6 100644 --- a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp +++ b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp @@ -23,11 +23,12 @@ */ #include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h" -#include "arm_compute/core/Utils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" @@ -46,17 +47,18 @@ constexpr unsigned int vector_size_byte_opencl = 16; Status validate_arguments(const ITensorInfo &src, const ITensorInfo &dst, const ElementWiseUnary op) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src); - if(op == ElementWiseUnary::LOGICAL_NOT) + if (op == ElementWiseUnary::LOGICAL_NOT) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::U8); } - else if(op == ElementWiseUnary::NEG) + else if (op == ElementWiseUnary::NEG) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32); } - else if(op == ElementWiseUnary::RSQRT) // Allow quantized types for only RSQRT. + else if (op == ElementWiseUnary::RSQRT) // Allow quantized types for only RSQRT. 
{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); } else { @@ -64,7 +66,7 @@ Status validate_arguments(const ITensorInfo &src, const ITensorInfo &dst, const } // Validate in case of configured dst - if(dst.total_size() > 0) + if (dst.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst); @@ -80,19 +82,23 @@ ClElementWiseUnaryKernel::ClElementWiseUnaryKernel() _type = CLKernelType::ELEMENTWISE; } -void ClElementWiseUnaryKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const ElementWiseUnary &op) +void ClElementWiseUnaryKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const ElementWiseUnary &op) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src, *dst, op)); - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst->element_size(), dst->dimension(0)); + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / dst->element_size(), dst->dimension(0)); - std::string kernel_name = "elementwise_unary"; - const int vec_size_x = num_elems_processed_per_iteration; - const int dst_width_x = dst->dimension(0); - if(is_data_type_quantized(src->data_type())) + std::string kernel_name = "elementwise_unary"; + const int vec_size_x = num_elems_processed_per_iteration; + const int dst_width_x = dst->dimension(0); + if (is_data_type_quantized(src->data_type())) { kernel_name += "_quantized"; } @@ -101,7 +107,7 @@ void ClElementWiseUnaryKernel::configure(const CLCompileContext &compile_context build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); build_opts.add_option("-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max(dst_width_x - vec_size_x, 0))); - if(is_data_type_quantized(src->data_type())) + if (is_data_type_quantized(src->data_type())) { const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); @@ -110,7 +116,7 @@ void ClElementWiseUnaryKernel::configure(const CLCompileContext &compile_context build_opts.add_option("-DSCALE_IN=" + float_to_string_with_full_precision(iqinfo.scale)); build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oqinfo.scale)); } - switch(op) + switch (op) { case ElementWiseUnary::RSQRT: build_opts.add_option("-DOPERATION=rsqrt_op"); @@ -169,8 +175,9 @@ void ClElementWiseUnaryKernel::run_op(ITensorPack &tensors, const Window &window Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); Window slice = collapsed.first_slice_window_3D(); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = 
utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); do { @@ -178,8 +185,7 @@ void ClElementWiseUnaryKernel::run_op(ITensorPack &tensors, const Window &window add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.h b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.h index 0f270f25e887b91172bd6ed107c5930f8e221b9d..81721f8ca8365ec231f7a03890872744f71ceff5 100644 --- a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.h +++ b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.h @@ -47,7 +47,10 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src. * @param[in] op Element wise unary operation to perform. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const ElementWiseUnary &op); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const ElementWiseUnary &op); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClElementWiseUnaryKernel::configure() diff --git a/src/gpu/cl/kernels/ClFillKernel.cpp b/src/gpu/cl/kernels/ClFillKernel.cpp index a9345ee334a508bf7d12a5ffe2a066a6ed0c3a8a..96ad503730eacbf08d1fe9754e01bc3284ed43db 100644 --- a/src/gpu/cl/kernels/ClFillKernel.cpp +++ b/src/gpu/cl/kernels/ClFillKernel.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -47,9 +48,10 @@ ClFillKernel::ClFillKernel() _type = CLKernelType::ELEMENTWISE; } -void ClFillKernel::configure(const CLCompileContext &compile_context, ITensorInfo *tensor, - const PixelValue &constant_value, - Window *window) +void ClFillKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *tensor, + const PixelValue &constant_value, + Window *window) { ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); ARM_COMPUTE_ERROR_THROW_ON(validate(tensor, constant_value, window)); @@ -60,7 +62,7 @@ void ClFillKernel::configure(const CLCompileContext &compile_context, ITensorInf // Create and update the window (if needed) _full_window = calculate_max_window(*tensor); Window win = _full_window; - if(window != nullptr) + if (window != nullptr) { ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *window); win = *window; @@ -70,9 +72,10 @@ void ClFillKernel::configure(const CLCompileContext &compile_context, ITensorInf const bool multi_access_x = output_width_x >= vec_size_x; const bool remainder_x = output_width_x % vec_size_x > 0; - if(multi_access_x) + if (multi_access_x) { - win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); + win.set(Window::DimX, + Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); } ICLKernel::configure_internal(win); @@ -81,7 +84,9 @@ void ClFillKernel::configure(const CLCompileContext &compile_context, ITensorInf build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(constant_value, data_type)); build_opts.add_option_if(multi_access_x, 
"-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); - build_opts.add_option_if(multi_access_x && remainder_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max(output_width_x - vec_size_x, 0))); + build_opts.add_option_if(multi_access_x && remainder_x, + "-DLAST_ACCESSED_X=" + + support::cpp11::to_string(std::max(output_width_x - vec_size_x, 0))); _kernel = create_kernel(compile_context, "memset", build_opts.options()); } @@ -89,7 +94,7 @@ Status ClFillKernel::validate(const ITensorInfo *tensor, const PixelValue &const { ARM_COMPUTE_UNUSED(tensor); ARM_COMPUTE_UNUSED(constant_value); - if(window != nullptr) + if (window != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(window->x().step() != 1); } @@ -101,7 +106,8 @@ void ClFillKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comman ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto tensor = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto tensor = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); // Collapse all the batches on the third Window collapsed = window.collapse_if_possible(_full_window, Window::DimZ); @@ -112,8 +118,7 @@ void ClFillKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comman unsigned int idx = 0; add_3D_tensor_argument(idx, tensor, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClFillKernel.h b/src/gpu/cl/kernels/ClFillKernel.h index f25cf928ad702752f5936a3f42eceaa12fbd5355..5d69fbfbd1a2626ed7d3c984f97cb51659438575 100644 --- a/src/gpu/cl/kernels/ClFillKernel.h +++ b/src/gpu/cl/kernels/ClFillKernel.h @@ -47,7 +47,10 @@ public: * @param[in] constant_value The value used to fill the planes of the tensor * @param[in] window Window to be used in case setting only part of a tensor. Default is nullptr. 
*/ - void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr); + void configure(const CLCompileContext &compile_context, + ITensorInfo *tensor, + const PixelValue &constant_value, + Window *window = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClFillKernel::configure() diff --git a/src/gpu/cl/kernels/ClFloorKernel.cpp b/src/gpu/cl/kernels/ClFloorKernel.cpp index f9f834875ad2d7f1f28551fd96d0a280421aa198..358e84012bf93d3048a5cec6f08a870eca8ae05c 100644 --- a/src/gpu/cl/kernels/ClFloorKernel.cpp +++ b/src/gpu/cl/kernels/ClFloorKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -52,7 +53,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); // Validate in case of configured output - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); @@ -76,9 +77,9 @@ void ClFloorKernel::configure(const ClCompileContext &compile_context, const ITe // Validate ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); - const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0)); + const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0)); const int vec_size_x_leftovers = src->dimension(0) % vec_size_x; CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); @@ -105,8 +106,9 @@ void ClFloorKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comma ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IClKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); Window collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ); Window slice = collapsed.first_slice_window_3D(); @@ -117,8 +119,7 @@ void ClFloorKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comma add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp index accafeecc267ac03ea25b4ec51226f79174838bf..e0d925dfb2981b36faa232bf1c193c6320ea4b31 100644 --- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp @@ -29,14 +29,13 @@ #include 
"arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "src/core/AccessWindowStatic.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -50,26 +49,35 @@ namespace { using ElementsProcessed = Steps; -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info) +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - if(src0->data_type() == DataType::QASYMM8) + if (src0->data_type() == DataType::QASYMM8) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); } else { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, + DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, + "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, + "The number of dimensions for the RHS matrix must be <= 3"); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), + "Only 2,3,4,8,16 are supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), + "Only 2,3,4,8,16 are supported for n0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); const int m = gemm_info.m(); @@ -83,7 +91,7 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast(k)); ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) != static_cast(n)); ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != static_cast(k)); - if(gemm_info.reinterpret_input_as_3d()) + if (gemm_info.reinterpret_input_as_3d()) { ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast(m)); } @@ -92,9 +100,10 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo 
*src1, cons ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != static_cast(m)); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); + const TensorInfo tensor_info_dst = + dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); } @@ -102,8 +111,13 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons return Status{}; } -std::pair validate_and_configure_window(const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed) +std::pair validate_and_configure_window(const ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info, + ElementsProcessed &num_elements_processed) { unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; @@ -115,17 +129,19 @@ std::pair validate_and_configure_window(const ITensorInfo *src0, // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_dst_as_3d to be false. - if(reinterpret_input_as_3d == reinterpret_dst_as_3d) + if (reinterpret_input_as_3d == reinterpret_dst_as_3d) { reinterpret_dst_as_3d = false; } // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32)); + auto_init_if_empty(*dst, src0->clone() + ->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)) + .set_data_type(DataType::S32)); TensorInfo tmp_info(*dst); - if(reinterpret_dst_as_3d) + if (reinterpret_dst_as_3d) { // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -138,12 +154,12 @@ std::pair validate_and_configure_window(const ITensorInfo *src0, num_elems_processed_per_iteration_x = rhs_info.n0; num_elems_processed_per_iteration_y = lhs_info.m0; - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win = + calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); // RHS matrix still needs padding on the X - AccessWindowStatic src1_access(src1, 0, 0, - ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), - src1->dimension(1)); + AccessWindowStatic src1_access( + src1, 0, 0, ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), src1->dimension(1)); window_changed = update_window_and_padding(win, src1_access); // window used by the execute_window_loop @@ -153,7 +169,8 @@ std::pair validate_and_configure_window(const ITensorInfo *src0, const unsigned int dimension_to_collapse = std::min(static_cast(dst->num_dimensions()), 2u); collapsed = win.collapse(win, dimension_to_collapse); - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, collapsed); } } // namespace @@ -163,8 +180,13 @@ ClGemmLowpMatrixMultiplyNativeKernel::ClGemmLowpMatrixMultiplyNativeKernel() _type = CLKernelType::GEMM; } -void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) +void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); @@ -175,11 +197,11 @@ void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); // We still need padding on the X dimension for the RHS matrix - auto padding_info = get_padding_info({ src0, dst }); + auto padding_info = get_padding_info({src0, dst}); // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_dst_as_3d to be false. - if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) + if (_reinterpret_input_as_3d == _reinterpret_output_as_3d) { _reinterpret_input_as_3d = false; _reinterpret_output_as_3d = false; @@ -192,7 +214,8 @@ void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com ElementsProcessed num_elements_processed{}; // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); + auto win_config = + validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second); @@ -212,8 +235,10 @@ void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com CLBuildOptions build_opts; build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); build_opts.add_option("-DM=" + support::cpp11::to_string(src0->dimension(1))); @@ -258,19 +283,19 @@ void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status 
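Illustrative aside, not part of the patch: HEIGHT_GEMM3D and DEPTH_GEMM3D above are set to dst->dimension(1) and dst->dimension(2). A hedged reading of what such constants are conventionally needed for, namely recovering 3D output coordinates from the collapsed 2D row index the GEMM iterates over (the actual kernel macros may differ in detail):

#include <cassert>

struct Coord3D
{
    unsigned int y; // row inside one output plane
    unsigned int z; // output plane (depth) index
};

// Split a collapsed row index r in [0, HEIGHT_GEMM3D * DEPTH_GEMM3D) back into (y, z).
Coord3D split_row_index(unsigned int r, unsigned int height_gemm3d)
{
    return {r % height_gemm3d, r / height_gemm3d};
}

int main()
{
    const unsigned int height = 7, depth = 3;
    for (unsigned int r = 0; r < height * depth; ++r)
    {
        const Coord3D c = split_row_index(r, height);
        assert(c.y < height && c.z < depth);
    }
    return 0;
}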
ClGemmLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) +Status ClGemmLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info) { ElementsProcessed num_elements_processed{}; ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - dst->clone().get(), - lhs_info, - rhs_info, - gemm_info, + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(), + dst->clone().get(), lhs_info, rhs_info, gemm_info, num_elements_processed) - .first); + .first); return Status{}; } @@ -280,11 +305,13 @@ void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Wi ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - if(src1->info()->num_dimensions() < 3) + if (src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); @@ -296,7 +323,7 @@ void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Wi slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - if(_reinterpret_input_as_3d) + if (_reinterpret_input_as_3d) { // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3; @@ -304,10 +331,10 @@ void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Wi _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); } - if(_reinterpret_output_as_3d) + if (_reinterpret_output_as_3d) { // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0); + const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 
1 : 0); const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); } @@ -317,7 +344,7 @@ void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Wi Window slice_b = slice; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) + if (!_slide_matrix_b) { slice_b = slice_matrix_b; } @@ -330,8 +357,7 @@ void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Wi _kernel.setArg(idx++, static_cast(src1->info()->strides_in_bytes()[2])); _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[2])); enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h index 4b328e0ab80cb162b97be356d939d693ca99252f..4f87096158c512e1a322a3b7618ce8340e10a715 100644 --- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_NATIVE_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -55,25 +56,34 @@ public: * rhs_info.k0: same as lhs_info.k0 * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpMatrixMultiplyNativeKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _slide_matrix_b{ true }; - bool _reinterpret_input_as_3d{ false }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; + bool _slide_matrix_b{true}; + bool _reinterpret_input_as_3d{false}; + bool _reinterpret_output_as_3d{false}; + bool _use_dummy_work_items{false}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp index 
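Illustrative aside, not part of the patch: a usage sketch of the static validate() declared above, assuming it is compiled inside the library tree (the internal header path and the chosen block sizes are placeholders; only m, n and k come from the caller):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h"

using namespace arm_compute;

// Asks whether a QASYMM8 A[M,K] x B[K,N] -> S32 C[M,N] configuration would be
// accepted. Shapes follow the x-first TensorShape convention used by the checks
// above: A is (K, M), B is (N, K), C is (N, M).
Status check_native_lowp(int m, int n, int k)
{
    const TensorInfo a(TensorShape(k, m), 1, DataType::QASYMM8);
    const TensorInfo b(TensorShape(n, k), 1, DataType::QASYMM8);
    const TensorInfo c(TensorShape(n, m), 1, DataType::S32);

    GEMMLHSMatrixInfo lhs_info;
    lhs_info.m0 = 4; // 1..8
    lhs_info.k0 = 4; // one of 2,3,4,8,16

    GEMMRHSMatrixInfo rhs_info;
    rhs_info.n0 = 4;           // one of 2,3,4,8,16
    rhs_info.k0 = lhs_info.k0; // must equal lhs_info.k0

    const GEMMReshapeInfo gemm_info(m, n, k); // remaining constructor arguments left at their defaults
    return opencl::kernels::ClGemmLowpMatrixMultiplyNativeKernel::validate(&a, &b, &c, lhs_info, rhs_info, gemm_info);
}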
15493f7ddcf7082aad42f55572b020290eeb6f7b..ddbc809cdde94080dd3b966e1388a429817fb234 100644 --- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp @@ -29,13 +29,12 @@ #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -51,45 +50,55 @@ namespace { using ElementsProcessed = Steps; -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, + "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, + "The number of dimensions for the RHS matrix must be <= 3"); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose); ARM_COMPUTE_RETURN_ERROR_ON(!rhs_info.transpose); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), + "Only 2,3,4,8,16 are supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), + "Only 2,3,4,8,16 are supported for n0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); const int m = gemm_info.m(); const int n = gemm_info.n(); const int k = gemm_info.k(); - TensorShape tensor_shape0{ src0->tensor_shape() }; + TensorShape tensor_shape0{src0->tensor_shape()}; tensor_shape0.set(0, k); tensor_shape0.set(1, m); - TensorShape tensor_shape1{ src1->tensor_shape() }; + TensorShape tensor_shape1{src1->tensor_shape()}; tensor_shape1.set(0, n); tensor_shape1.set(1, k); const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0); const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped0 = src0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info)); - const TensorInfo 
tensor_info_reshaped1 = src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + const TensorInfo tensor_info_reshaped0 = + src0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info)); + const TensorInfo tensor_info_reshaped1 = + src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); @@ -99,19 +108,24 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons return Status{}; } -std::pair validate_and_configure_window(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info, - ElementsProcessed &num_elements_processed) +std::pair validate_and_configure_window(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info, + ElementsProcessed &num_elements_processed) { unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32)); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32)); TensorInfo tmp_info(*dst); - if(reinterpret_output_as_3d) + if (reinterpret_output_as_3d) { // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -123,7 +137,8 @@ std::pair validate_and_configure_window(const ITensorInfo *src0, // Configure kernel window num_elems_processed_per_iteration_x = rhs_info.n0; num_elems_processed_per_iteration_y = lhs_info.m0; - Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + Window win = + calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); // Collapse along the Z direction // This collapse needs to be here in order to tune the Z dimension of LWS @@ -140,8 +155,13 @@ ClGemmLowpMatrixMultiplyReshapedKernel::ClGemmLowpMatrixMultiplyReshapedKernel() _type = CLKernelType::GEMM; } -void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) +void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, 
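Illustrative aside, not part of the patch: for this kernel src0/src1 are already-reshaped blobs, and the checks above rebuild the shapes they are expected to have. A caller can derive the same shapes with the public shape calculator; the helper below is hypothetical and picks QASYMM8 arbitrarily:

#include <utility>

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"

using namespace arm_compute;
using namespace arm_compute::misc::shape_calculator;

// Expected shapes of the reshaped LHS/RHS inputs for an A[M,K] x B[K,N] GEMM,
// computed from the un-reshaped (K, M) and (N, K) descriptions in the same way
// as the validation above.
std::pair<TensorShape, TensorShape> expected_reshaped_shapes(int m, int n, int k,
                                                             const GEMMLHSMatrixInfo &lhs_info,
                                                             const GEMMRHSMatrixInfo &rhs_info)
{
    const TensorInfo lhs(TensorShape(k, m), 1, DataType::QASYMM8);
    const TensorInfo rhs(TensorShape(n, k), 1, DataType::QASYMM8);
    return {compute_lhs_reshaped_shape(lhs, lhs_info), compute_rhs_reshaped_shape(rhs, rhs_info)};
}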
dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info)); @@ -154,11 +174,12 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &c const unsigned int num_dimensionssrc0 = src0->num_dimensions(); _slide_matrix_b = (src1->num_dimensions() >= num_dimensionssrc0); - auto padding_info = get_padding_info({ src0, src1, dst }); + auto padding_info = get_padding_info({src0, src1, dst}); ElementsProcessed num_elements_processed{}; // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); + auto win_config = + validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second); @@ -171,8 +192,10 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &c // Create build options CLBuildOptions build_opts; build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); + build_opts.add_option_if(_reinterpret_output_as_3d, + "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); + build_opts.add_option_if(_reinterpret_output_as_3d, + "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE"); build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); @@ -230,19 +253,19 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &c ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) +Status ClGemmLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info) { ElementsProcessed num_elements_processed{}; ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - dst->clone().get(), - lhs_info, - rhs_info, - gemm_info, + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(), + dst->clone().get(), lhs_info, rhs_info, gemm_info, num_elements_processed) - .first); + .first); return Status{}; } @@ -252,11 +275,13 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + 
utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - if(src1->info()->num_dimensions() < 3) + if (src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); @@ -268,7 +293,7 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - if(_reinterpret_output_as_3d) + if (_reinterpret_output_as_3d) { // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 4; @@ -281,7 +306,7 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Window slice_b = slice; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) + if (!_slide_matrix_b) { slice_b = slice_matrix_b; } @@ -295,8 +320,7 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const _kernel.setArg(idx++, static_cast(src1->info()->strides_in_bytes()[2])); _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[2])); enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h index a16f500f11ca7c043570a7466aff8cd5f7180ce7..d7b785996f6b66ba47d6ca73a4f68f7b05191159 100644 --- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -64,25 +65,34 @@ public: * * @note lhs_info.k0 must be equal to rhs_info.k0 */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpMatrixMultiplyReshapedKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info); 
// Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _slide_matrix_b{ true }; - bool _reinterpret_output_as_3d{ false }; - unsigned int _k{ 1 }; - bool _use_dummy_work_items{ false }; + bool _slide_matrix_b{true}; + bool _reinterpret_output_as_3d{false}; + unsigned int _k{1}; + bool _use_dummy_work_items{false}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp index 5d552b8d63befc6f5bc808b74f3ab5a97392c2b6..2f1f3b8df0f20351b88e4f821022f6387182299f 100644 --- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp @@ -29,14 +29,13 @@ #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" #include "src/core/AccessWindowStatic.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -54,45 +53,57 @@ namespace { using ElementsProcessed = Steps; -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - if(src0->data_type() == DataType::QASYMM8) + if (src0->data_type() == DataType::QASYMM8) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); } else { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, + DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, + "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, + "The number of dimensions for the RHS matrix must be <= 3"); const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info; const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info; const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; - ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3) || (rhs_info.k0 > 16)), "Only 2,3,4,8,16 are supported for k0"); + 
ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3) || (rhs_info.k0 > 16)), + "Only 2,3,4,8,16 are supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3) || rhs_info.n0 > 16), "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3) || rhs_info.n0 > 16), + "Only 2,3,4,8,16 are supported for n0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); const int m = gemm_info.m; const int n = gemm_info.n; const int k = gemm_info.k; - TensorShape tensor_shape1{ src1->tensor_shape() }; + TensorShape tensor_shape1{src1->tensor_shape()}; tensor_shape1.set(0, n); tensor_shape1.set(1, k); - const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); + const TensorInfo tensor_info_reshaped1 = + src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast(k)); - if(gemm_info.reinterpret_input_as_3d) + if (gemm_info.reinterpret_input_as_3d) { ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast(m)); } @@ -103,11 +114,11 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_dst_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); - if(output_stage.type == GEMMLowpOutputStageType::NONE) + if (output_stage.type == GEMMLowpOutputStageType::NONE) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); } @@ -117,39 +128,42 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons } } - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != bias->dimension(0)); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT), + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || + (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT), "Only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT is supported"); // Checks performed if the dst stage needs to be fused - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { // If a_offset == 0, vector_sum_col can be a nullptr - if(gemm_info.a_offset != 0) + if (gemm_info.a_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != expected_dst_shape[0]); } // If b_offset == 0, vector_sum_row can be a nullptr - 
if(gemm_info.b_offset != 0) + if (gemm_info.b_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); // Check if mm result is a 3D reinterpretation - const bool reinterpret_as_3d = expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = + expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x(); // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (expected_dst_shape[1] * expected_dst_shape[2])); + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != + (expected_dst_shape[1] * expected_dst_shape[2])); ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != expected_dst_shape[1]); - if(expected_dst_shape.num_dimensions() > 1) + if (expected_dst_shape.num_dimensions() > 1) { const unsigned int dst_batch_idx = reinterpret_as_3d ? 3 : 2; @@ -161,30 +175,32 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != collapsed_dst_shape[dst_batch_idx], "vector_sum_row must have the same number of batches of dst tensor"); - if(gemm_info.a_offset != 0) + if (gemm_info.a_offset != 0) { TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); vector_sum_col_shape.collapse_from(1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && + vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of " + "vector_sum_row_shape or the number of batches must be set to 1"); } } } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type()); } ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); - if(output_multipliers != nullptr && output_shifts != nullptr) + if (output_multipliers != nullptr && output_shifts != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); - if(output_stage.is_quantized_per_channel) + if (output_stage.is_quantized_per_channel) { ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_shifts->dimension(0)); ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_multipliers->dimension(0)); @@ -194,9 +210,16 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons return Status{}; } -std::pair validate_and_configure_window(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias, - ITensorInfo *output_multipliers, ITensorInfo *output_shifts, ElementsProcessed &num_elements_processed) +std::pair validate_and_configure_window(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col, + 
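Illustrative aside, not part of the patch: the a_offset / b_offset gating above follows from expanding the quantized product, sum_k (qa - za)(qb - zb) = sum qa*qb - za*sum qb - zb*sum qa + K*za*zb, so per-column sums of B are only needed when A has a non-zero offset and per-row sums of A only when B does. A scalar reference of that identity (the library may fold the sign of the zero points into a_offset/b_offset):

#include <cstdint>

// One output element: acc is the raw int32 dot product of a row of A (length k)
// with a column of B; sum_row_a / sum_col_b are the matching entries of
// vector_sum_row / vector_sum_col; za / zb are the quantization zero points.
int32_t offset_corrected(int32_t acc, int32_t sum_row_a, int32_t sum_col_b,
                         int32_t za, int32_t zb, int32_t k)
{
    return acc - za * sum_col_b - zb * sum_row_a + k * za * zb;
}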
const ITensorInfo *vector_sum_row, + ITensorInfo *bias, + ITensorInfo *output_multipliers, + ITensorInfo *output_shifts, + ElementsProcessed &num_elements_processed) { const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; @@ -211,16 +234,17 @@ std::pair validate_and_configure_window(const ITensorInfo *src0, // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(reinterpret_input_as_3d == reinterpret_output_as_3d) + if (reinterpret_input_as_3d == reinterpret_output_as_3d) { reinterpret_output_as_3d = false; } // dst tensor auto initialization if not yet initialized const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info); - if(output_stage.type != GEMMLowpOutputStageType::NONE) + if (output_stage.type != GEMMLowpOutputStageType::NONE) { - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type)); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type)); } else { @@ -229,7 +253,7 @@ std::pair validate_and_configure_window(const ITensorInfo *src0, TensorInfo tmp_info(*dst); - if(reinterpret_output_as_3d) + if (reinterpret_output_as_3d) { // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -242,12 +266,14 @@ std::pair validate_and_configure_window(const ITensorInfo *src0, num_elems_processed_per_iteration_x = gemm_info.rhs_info.n0; num_elems_processed_per_iteration_y = gemm_info.lhs_info.m0; - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win = + calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win_out = + calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { - if(gemm_info.a_offset != 0) + if (gemm_info.a_offset != 0) { AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration_x); window_changed = window_changed || update_window_and_padding(win_out, vector_sum_col_access); @@ -255,17 +281,19 @@ std::pair validate_and_configure_window(const ITensorInfo *src0, // No access window needed for vector_sum_row ARM_COMPUTE_UNUSED(vector_sum_row); - if(bias != nullptr) + if (bias != nullptr) { AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration_x); window_changed = window_changed || update_window_and_padding(win_out, bias_access); } - if(output_multipliers != nullptr && output_stage.is_quantized_per_channel) + if (output_multipliers != nullptr && output_stage.is_quantized_per_channel) { - AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_processed_per_iteration_x); + AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, + num_elems_processed_per_iteration_x); AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_processed_per_iteration_x); - window_changed = window_changed || update_window_and_padding(win_out, 
output_multipliers_access, output_shifts_access); + window_changed = + window_changed || update_window_and_padding(win_out, output_multipliers_access, output_shifts_access); } } @@ -275,7 +303,8 @@ std::pair validate_and_configure_window(const ITensorInfo *src0, const unsigned int dimension_to_collapse = std::min(static_cast(dst->num_dimensions()), 2u); collapsed = win.collapse(win, dimension_to_collapse); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, collapsed); } } // namespace @@ -285,15 +314,22 @@ ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::ClGemmLowpMatrixMultiplyReshapedO _type = CLKernelType::GEMM; } -void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, - const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias, - ITensorInfo *output_multipliers, ITensorInfo *output_shifts) +void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + ITensorInfo *bias, + ITensorInfo *output_multipliers, + ITensorInfo *output_shifts) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, + output_multipliers, output_shifts)); - auto padding_info = get_padding_info({ src0, src1, dst, vector_sum_row }); + auto padding_info = get_padding_info({src0, src1, dst, vector_sum_row}); const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info; const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info; const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; @@ -307,7 +343,7 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileCon // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. 
- if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) + if (_reinterpret_input_as_3d == _reinterpret_output_as_3d) { _reinterpret_input_as_3d = false; _reinterpret_output_as_3d = false; @@ -320,7 +356,8 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileCon ElementsProcessed num_elements_processed{}; // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts, num_elements_processed); + auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, + output_multipliers, output_shifts, num_elements_processed); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second); @@ -341,8 +378,10 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileCon CLBuildOptions build_opts; build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); @@ -361,12 +400,12 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileCon std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_"); kernel_name += rhs_info.transpose ? "t" : "nt"; - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { kernel_name += "_fused_output_stage_fixedpoint"; _fuse_output_stage = true; // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0 && vector_sum_col != nullptr) + if (a_offset != 0 && vector_sum_col != nullptr) { build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); @@ -377,9 +416,10 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileCon build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset)); // In case of _is_quantized_per_channel, RESULT_MULTIPLIER and RESULT_SHIFT are not utilized, but they are passed as a part of T_QUANTIZE8 macro. 
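Illustrative aside, not part of the patch: a scalar reference of what a fused QUANTIZE_DOWN_FIXEDPOINT stage conventionally does with RESULT_MULTIPLIER, RESULT_SHIFT and RESULT_OFFSET. This mirrors the well-known gemmlowp reference arithmetic, not the OpenCL kernel itself, and treats a positive shift as a rounding right shift:

#include <algorithm>
#include <cstdint>
#include <limits>

int32_t sat_rounding_doubling_high_mul(int32_t a, int32_t b)
{
    if (a == std::numeric_limits<int32_t>::min() && b == a)
    {
        return std::numeric_limits<int32_t>::max(); // the single overflowing case saturates
    }
    const int64_t ab    = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    const int64_t nudge = (ab >= 0) ? (INT64_C(1) << 30) : (1 - (INT64_C(1) << 30));
    return static_cast<int32_t>((ab + nudge) / (INT64_C(1) << 31));
}

int32_t rounding_shift_right(int32_t x, int exponent)
{
    const int32_t mask      = (INT32_C(1) << exponent) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
    return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
}

// acc -> requantized output, clamped to [qmin, qmax] (the gemmlowp_min/max bounds).
int32_t quantize_down_fixedpoint(int32_t acc, int32_t multiplier, int shift, int32_t offset,
                                 int32_t qmin, int32_t qmax)
{
    const int32_t v = rounding_shift_right(sat_rounding_doubling_high_mul(acc, multiplier), shift) + offset;
    return std::min(qmax, std::max(qmin, v));
}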
- if(!_is_quantized_per_channel) + if (!_is_quantized_per_channel) { - build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0])); + build_opts.add_option("-DRESULT_MULTIPLIER=" + + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0])); build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0])); } else @@ -432,42 +472,56 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileCon ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - dst->clone().get(), - gemm_info, - vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr, - vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, - bias != nullptr ? bias->clone().get() : nullptr, - output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr, - output_shifts != nullptr ? output_shifts->clone().get() : nullptr, - num_elements_processed) - .first); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, + output_multipliers, output_shifts)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(src0->clone().get(), src1->clone().get(), dst->clone().get(), gemm_info, + vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr, + vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, + bias != nullptr ? bias->clone().get() : nullptr, + output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr, + output_shifts != nullptr ? 
output_shifts->clone().get() : nullptr, + num_elements_processed) + .first); return Status{}; } -void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); - const auto vector_sum_col = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); - const auto vector_sum_row = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); - const auto output_shifts = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SHIFTS)); - const auto output_multipliers = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - if(src1->info()->num_dimensions() < 3) + const auto src0 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto bias = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); + const auto vector_sum_col = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); + const auto vector_sum_row = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); + const auto output_shifts = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SHIFTS)); + const auto output_multipliers = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + + if (src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); @@ -479,7 +533,7 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - if(_reinterpret_input_as_3d) + if (_reinterpret_input_as_3d) { // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3; @@ -487,10 +541,10 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); } - if(_reinterpret_output_as_3d) + if (_reinterpret_output_as_3d) { // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0); + const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 
1 : 0); const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); } @@ -515,7 +569,7 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, Window slice_b = slice; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) + if (!_slide_matrix_b) { slice_b = slice_matrix_b; } @@ -527,19 +581,19 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, _kernel.setArg(idx++, static_cast(src0->info()->strides_in_bytes()[2])); _kernel.setArg(idx++, static_cast(src1->info()->strides_in_bytes()[2])); _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[2])); - if(_reinterpret_input_as_3d) + if (_reinterpret_input_as_3d) { // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor idx++; } - if(_reinterpret_output_as_3d) + if (_reinterpret_output_as_3d) { // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor idx++; } - if(_fuse_output_stage) + if (_fuse_output_stage) { add_2D_tensor_argument_if((vector_sum_col != nullptr), idx, vector_sum_col, win_vector_sum_col); add_2D_tensor_argument_if((vector_sum_row != nullptr), idx, vector_sum_row, win_vector_sum_row); @@ -548,8 +602,7 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_shifts, biases_slice); } enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h index a77604db7c5558092acceedf3d8bd7cacb6e3cac..1d4696b089f37f664284d67b0b701d63589c32d6 100644 --- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -70,31 +71,44 @@ public: * @param[in] output_shifts (Optional) Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). * Supported data types: S32. 
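Illustrative aside, not part of the patch: the per-channel output_multipliers / output_shifts described above (and RESULT_MULTIPLIER / RESULT_SHIFT in the single-scale case) are typically derived by splitting the real rescale factor scale_src0 * scale_src1 / scale_dst into a Q0.31 multiplier and a right shift. The library ships its own helper for this; the frexp-based sketch below is the common reference formulation and assumes a factor in (0, 1):

#include <cmath>
#include <cstdint>

void quantize_multiplier(double real_multiplier, int32_t &quant_multiplier, int &right_shift)
{
    int          exponent    = 0;
    const double significand = std::frexp(real_multiplier, &exponent); // real = significand * 2^exponent, significand in [0.5, 1)
    int64_t      q           = static_cast<int64_t>(std::llround(significand * (INT64_C(1) << 31)));
    if (q == (INT64_C(1) << 31)) // the significand rounded all the way up to 1.0
    {
        q /= 2;
        ++exponent;
    }
    quant_multiplier = static_cast<int32_t>(q); // Q0.31 fixed-point multiplier
    right_shift      = -exponent;               // >= 0 for a real_multiplier below 1
}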
*/ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, ITensorInfo *bias = nullptr, - ITensorInfo *output_multipliers = nullptr, ITensorInfo *output_shifts = nullptr); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col = nullptr, + const ITensorInfo *vector_sum_row = nullptr, + ITensorInfo *bias = nullptr, + ITensorInfo *output_multipliers = nullptr, + ITensorInfo *output_shifts = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, const ITensorInfo *bias = nullptr, - const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + const ITensorInfo *vector_sum_col = nullptr, + const ITensorInfo *vector_sum_row = nullptr, + const ITensorInfo *bias = nullptr, + const ITensorInfo *output_multipliers = nullptr, + const ITensorInfo *output_shifts = nullptr); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _slide_matrix_b{ true }; - bool _reinterpret_input_as_3d{ false }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; - bool _is_quantized_per_channel{ false }; - bool _fuse_output_stage{ false }; + bool _slide_matrix_b{true}; + bool _reinterpret_input_as_3d{false}; + bool _reinterpret_output_as_3d{false}; + bool _use_dummy_work_items{false}; + bool _is_quantized_per_channel{false}; + bool _fuse_output_stage{false}; }; } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp index 792c71da76eb2455bf0102f0a721221a4ca464d8..030c11d069f81f0a12d34658df80b5d853482157 100644 --- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp @@ -23,16 +23,15 @@ */ #include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" namespace arm_compute @@ -47,39 +46,51 @@ namespace { using 
ElementsProcessed = Steps; -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), "The extension cl_arm_matrix_multiply is not supported on the target platform"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), + "The extension cl_arm_matrix_multiply is not supported on the target platform"); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, + "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, + "The number of dimensions for the RHS matrix must be <= 3"); const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info; const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info; const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.k0 != 4 || lhs_info.k0 != 4, "Only 4 is supported as value for k0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(lhs_info.m0 == 1 || lhs_info.m0 == 2 || lhs_info.m0 == 4), "Only 1,2,4 are supported for m0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(rhs_info.n0 == 1 || rhs_info.n0 == 4 || rhs_info.n0 == 8), "Only 1,4,8 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(lhs_info.m0 == 1 || lhs_info.m0 == 2 || lhs_info.m0 == 4), + "Only 1,2,4 are supported for m0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(rhs_info.n0 == 1 || rhs_info.n0 == 4 || rhs_info.n0 == 8), + "Only 1,4,8 are supported for n0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); const int m = gemm_info.m; const int n = gemm_info.n; const int k = gemm_info.k; - TensorShape tensor_shape1{ src1->tensor_shape() }; + TensorShape tensor_shape1{src1->tensor_shape()}; tensor_shape1.set(0, n); tensor_shape1.set(1, k); - const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); + const TensorInfo tensor_info_reshaped1 = + src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast(k)); - if(gemm_info.reinterpret_input_as_3d) + if (gemm_info.reinterpret_input_as_3d) { ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) 
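Illustrative aside, not part of the patch: the MMUL variant above is gated on the cl_arm_matrix_multiply extension. The library wraps this query in arm_matrix_multiply_supported(); a standalone check through the plain OpenCL C++ wrapper would look roughly like this (the exact header name depends on the OpenCL headers in use):

#include <string>

#include <CL/opencl.hpp> // older setups ship the same wrapper as <CL/cl2.hpp>

bool supports_arm_matrix_multiply(const cl::Device &device)
{
    // CL_DEVICE_EXTENSIONS is a space-separated list of extension names.
    const std::string extensions = device.getInfo<CL_DEVICE_EXTENSIONS>();
    return extensions.find("cl_arm_matrix_multiply") != std::string::npos;
}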
!= static_cast(m)); } @@ -90,11 +101,11 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_dst_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); - if(output_stage.type == GEMMLowpOutputStageType::NONE) + if (output_stage.type == GEMMLowpOutputStageType::NONE) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); } @@ -104,38 +115,41 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons } } - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != bias->dimension(0)); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT), + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || + (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT), "Only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT is supported"); // Checks performed if the dst stage needs to be fused - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { // If a_offset == 0, vector_sum_col can be a nullptr - if(gemm_info.a_offset != 0) + if (gemm_info.a_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != expected_dst_shape[0]); } // If b_offset == 0, vector_sum_row can be a nullptr - if(gemm_info.b_offset != 0) + if (gemm_info.b_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); // Check if mm result is a 3D reinterpretation - const bool reinterpret_as_3d = expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = + expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x(); // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (expected_dst_shape[1] * expected_dst_shape[2])); + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != + (expected_dst_shape[1] * expected_dst_shape[2])); ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != expected_dst_shape[1]); - if(expected_dst_shape.num_dimensions() > 1) + if (expected_dst_shape.num_dimensions() > 1) { const unsigned int dst_batch_idx = reinterpret_as_3d ? 
3 : 2; @@ -147,30 +161,32 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != collapsed_dst_shape[dst_batch_idx], "vector_sum_row must have the same number of batches of dst tensor"); - if(gemm_info.a_offset != 0) + if (gemm_info.a_offset != 0) { TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); vector_sum_col_shape.collapse_from(1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && + vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of " + "vector_sum_row_shape or the number of batches must be set to 1"); } } } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type()); } ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); - if(output_multipliers != nullptr && output_shifts != nullptr) + if (output_multipliers != nullptr && output_shifts != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); - if(output_stage.is_quantized_per_channel) + if (output_stage.is_quantized_per_channel) { ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_shifts->dimension(0)); ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_multipliers->dimension(0)); @@ -180,9 +196,16 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons return Status{}; } -std::pair validate_and_configure_window(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias, - ITensorInfo *output_multipliers, ITensorInfo *output_shifts, ElementsProcessed &num_elements_processed) +std::pair validate_and_configure_window(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + ITensorInfo *bias, + ITensorInfo *output_multipliers, + ITensorInfo *output_shifts, + ElementsProcessed &num_elements_processed) { const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; @@ -200,9 +223,10 @@ std::pair validate_and_configure_window(const ITensorInfo *src0, reinterpret_output_as_3d = false; // dst tensor auto initialization if not yet initialized const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info); - if(output_stage.type != GEMMLowpOutputStageType::NONE) + if (output_stage.type != GEMMLowpOutputStageType::NONE) { - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type)); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type)); } else { @@ -211,7 +235,7 @@ std::pair validate_and_configure_window(const ITensorInfo *src0, TensorInfo tmp_info(*dst); - 
if(reinterpret_output_as_3d) + if (reinterpret_output_as_3d) { // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -224,11 +248,12 @@ std::pair validate_and_configure_window(const ITensorInfo *src0, num_elems_processed_per_iteration_x = 1; num_elems_processed_per_iteration_y = 1; - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win = + calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { - if(gemm_info.a_offset != 0) + if (gemm_info.a_offset != 0) { AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration_x); window_changed = window_changed || update_window_and_padding(win, vector_sum_col_access); @@ -236,17 +261,19 @@ std::pair validate_and_configure_window(const ITensorInfo *src0, // No access window needed for vector_sum_row ARM_COMPUTE_UNUSED(vector_sum_row); - if(bias != nullptr) + if (bias != nullptr) { AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration_x); window_changed = window_changed || update_window_and_padding(win, bias_access); } - if(output_multipliers != nullptr && output_stage.is_quantized_per_channel) + if (output_multipliers != nullptr && output_stage.is_quantized_per_channel) { - AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_processed_per_iteration_x); + AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, + num_elems_processed_per_iteration_x); AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_processed_per_iteration_x); - window_changed = window_changed || update_window_and_padding(win, output_multipliers_access, output_shifts_access); + window_changed = + window_changed || update_window_and_padding(win, output_multipliers_access, output_shifts_access); } } @@ -278,7 +305,8 @@ std::pair validate_and_configure_window(const ITensorInfo *src0, collapsed.set(Window::DimX, x_dimension); collapsed.set(Window::DimY, y_dimension); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, collapsed); } } // namespace @@ -288,15 +316,22 @@ ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::ClGemmLowpMatrixMultiplyResha _type = CLKernelType::GEMM; } -void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, - const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias, - ITensorInfo *output_multipliers, ITensorInfo *output_shifts) +void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + ITensorInfo *bias, + ITensorInfo *output_multipliers, + ITensorInfo *output_shifts) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, + output_multipliers, output_shifts)); - auto padding_info = get_padding_info({ src0, src1, dst, vector_sum_row }); + auto padding_info = get_padding_info({src0, src1, dst, vector_sum_row}); const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info; const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info; const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; @@ -313,7 +348,8 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompil ElementsProcessed num_elements_processed{}; // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts, num_elements_processed); + auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, + output_multipliers, output_shifts, num_elements_processed); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second); @@ -334,18 +370,19 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompil build_opts.add_option("-DMMUL_M0=" + support::cpp11::to_string(mmul_m0)); build_opts.add_option("-DMMUL_N0=" + support::cpp11::to_string(mmul_n0)); build_opts.add_option("-DMMUL_K0=" + support::cpp11::to_string(mmul_k0)); - build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); + build_opts.add_option("-DACTIVATION_TYPE=" + + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_mmul"); - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { build_opts.add_option("-DFUSED_OUTPUT_STAGE_FIXED_POINT"); _fuse_output_stage = true; // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0 && vector_sum_col != nullptr) + if (a_offset != 0 && 
vector_sum_col != nullptr) { build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); @@ -396,42 +433,54 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompil ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - dst->clone().get(), - gemm_info, - vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr, - vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, - bias != nullptr ? bias->clone().get() : nullptr, - output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr, - output_shifts != nullptr ? output_shifts->clone().get() : nullptr, - num_elements_processed) - .first); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, + output_multipliers, output_shifts)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(src0->clone().get(), src1->clone().get(), dst->clone().get(), gemm_info, + vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr, + vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, + bias != nullptr ? bias->clone().get() : nullptr, + output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr, + output_shifts != nullptr ? 
output_shifts->clone().get() : nullptr, + num_elements_processed) + .first); return Status{}; } -void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - const auto vector_sum_col = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); - const auto vector_sum_row = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto src2 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + const auto vector_sum_col = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); + const auto vector_sum_row = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - if(src1->info()->num_dimensions() < 3) + if (src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); @@ -449,7 +498,7 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tens add_3d_tensor_nhw_argument(idx, src1); // Bias buffer (_add_bias == true) - if(src2 != nullptr) + if (src2 != nullptr) { add_3d_tensor_nhw_argument(idx, src2); } @@ -461,21 +510,20 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tens _kernel.setArg(idx++, _n); _kernel.setArg(idx++, _k); - if(_fuse_output_stage) + if (_fuse_output_stage) { - if(vector_sum_col != nullptr) + if (vector_sum_col != nullptr) { add_3d_tensor_nhw_argument(idx, vector_sum_col); } - if(vector_sum_row != nullptr) + if (vector_sum_row != nullptr) { add_3d_tensor_nhw_argument(idx, vector_sum_row); } } enqueue(queue, *this, slice, cl::NDRange(32, 2), false); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h index 0ae549cd538da36b5333433f4b5defed663f7b4f..fc8b73140d5680193bf7e74401382d128cb65fd9 100644 --- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMUL_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include 
"src/gpu/cl/IClKernel.h" @@ -65,29 +66,42 @@ public: * @param[in] output_multipliers (Optional) Output multipliers tensor. Supported data types: S32. * @param[in] output_shifts (Optional) Output shifts tensor. Supported data types: S32. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, ITensorInfo *bias = nullptr, - ITensorInfo *output_multipliers = nullptr, ITensorInfo *output_shifts = nullptr); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col = nullptr, + const ITensorInfo *vector_sum_row = nullptr, + ITensorInfo *bias = nullptr, + ITensorInfo *output_multipliers = nullptr, + ITensorInfo *output_shifts = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, const ITensorInfo *bias = nullptr, - const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + const ITensorInfo *vector_sum_col = nullptr, + const ITensorInfo *vector_sum_row = nullptr, + const ITensorInfo *bias = nullptr, + const ITensorInfo *output_multipliers = nullptr, + const ITensorInfo *output_shifts = nullptr); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _fuse_output_stage{ false }; - signed int _m{ 1 }; - signed int _n{ 1 }; - signed int _k{ 1 }; + bool _fuse_output_stage{false}; + signed int _m{1}; + signed int _n{1}; + signed int _k{1}; }; } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMULKERNEL_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMULKERNEL_H */ diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp index 9ec0b5182f98ddf839e8fe5f6c34f7027c8d6f44..d93dbde95a8cb5005d3bf8414df53ee215a3ab78 100644 --- a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp @@ -28,11 +28,10 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -44,12 +43,16 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - int32_t a_offset, int32_t b_offset) +Status validate_arguments(const ITensorInfo *mm_result, + const ITensorInfo 
*vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + int32_t a_offset, + int32_t b_offset) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); @@ -57,26 +60,28 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto } // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) + if (a_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); } // If b_offset == 0, vector_sum_row can be a nullptr - if(b_offset != 0) + if (b_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = + mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != + (mm_result->dimension(1) * mm_result->dimension(2))); ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); TensorShape output_shape = mm_result->tensor_shape(); - if(output_shape.num_dimensions() > 1) + if (output_shape.num_dimensions() > 1) { const unsigned int output_batch_idx = reinterpret_as_3d ? 
3 : 2; @@ -87,13 +92,15 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], "mm_result tensor must have the same number of batches of output tensor"); - if(a_offset != 0) + if (a_offset != 0) { TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); vector_sum_col_shape.collapse_from(1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && + vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of " + "vector_sum_row_shape or the number of batches must be set to 1"); } } } @@ -108,29 +115,34 @@ ClGemmLowpOffsetContributionKernel::ClGemmLowpOffsetContributionKernel() } void ClGemmLowpOffsetContributionKernel::configure(const CLCompileContext &compile_context, - const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - int32_t k, int32_t a_offset, int32_t b_offset) + const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + int32_t k, + int32_t a_offset, + int32_t b_offset) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset)); - auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias }); + auto padding_info = get_padding_info({mm_result, vector_sum_col, vector_sum_row, bias}); // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = vector_sum_row != nullptr - && mm_result->num_dimensions() > 1 - && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->num_dimensions() > 1 && + mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->dimension(0)); // Set the arguments to pass at compile time CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration)); // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) + if (a_offset != 0) { build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); @@ -138,8 +150,10 @@ void ClGemmLowpOffsetContributionKernel::configure(const CLCompileContext &compi // If b_offset == 0, vector_sum_row can be a nullptr build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k)); - build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1))); - 
build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2))); + build_opts.add_option_if(reinterpret_as_3d, + "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1))); + build_opts.add_option_if(reinterpret_as_3d, + "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2))); build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); std::string kernel_name("gemmlowp_offset_contribution"); @@ -165,10 +179,15 @@ void ClGemmLowpOffsetContributionKernel::configure(const CLCompileContext &compi ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - int32_t a_offset, int32_t b_offset) +Status ClGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + int32_t a_offset, + int32_t b_offset) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset)); return Status{}; } @@ -177,10 +196,13 @@ void ClGemmLowpOffsetContributionKernel::run_op(ITensorPack &tensors, const Wind ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IClKernel::window(), window); - const auto vector_sum_col = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); - const auto vector_sum_row = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); - const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); - const auto mm_result = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_SRC_DST)); + const auto vector_sum_col = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); + const auto vector_sum_row = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); + const auto bias = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); + const auto mm_result = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_SRC_DST)); Window collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ); Window slice = collapsed.first_slice_window_3D(); @@ -209,8 +231,7 @@ void ClGemmLowpOffsetContributionKernel::run_op(ITensorPack &tensors, const Wind add_1D_tensor_argument_if((bias != nullptr), idx, bias, biases_slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h index 48926e280b166dca91a44efbca1484efd3748812..2080a3a0916c80af8c4594be2c88adfb095b0472 100644 --- a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h @@ -67,15 +67,25 @@ public: * @param[in] b_offset Offset to be added to each element of the matrix B. 
*/ void configure(const CLCompileContext &compile_context, - const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - int32_t k, int32_t a_offset, int32_t b_offset); + const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + int32_t k, + int32_t a_offset, + int32_t b_offset); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpOffsetContributionKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, int32_t a_offset, int32_t b_offset); + static Status validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + int32_t a_offset, + int32_t b_offset); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp index c5fb54f524772e63ac24f7e36d94039ca7b77a4c..26f479f61a5d80d8f25885f9bdc46eb26ebb62c1 100644 --- a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp @@ -34,7 +34,6 @@ #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -46,12 +45,20 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, - int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status validate_arguments(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *dst, + int32_t a_offset, + int32_t b_offset, + const GEMMLowpOutputStageInfo &output_stage, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); @@ -62,33 +69,35 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); - if(output_stage.is_quantized_per_channel) + if (output_stage.is_quantized_per_channel) { ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_shifts->dimension(0)); ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_multipliers->dimension(0)); } // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) + if (a_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); } // If b_offset == 
0, vector_sum_row can be a nullptr - if(b_offset != 0) + if (b_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = + mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != + (mm_result->dimension(1) * mm_result->dimension(2))); ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); TensorShape output_shape = mm_result->tensor_shape(); - if(output_shape.num_dimensions() > 1) + if (output_shape.num_dimensions() > 1) { const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2; @@ -99,20 +108,22 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], "mm_result tensor must have the same number of batches of output tensor"); - if(a_offset != 0) + if (a_offset != 0) { TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); vector_sum_col_shape.collapse_from(1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && + vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of " + "vector_sum_row_shape or the number of batches must be set to 1"); } } } ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type == GEMMLowpOutputStageType::NONE); // Checks performed when output is configured - if((dst != nullptr) && (dst->total_size() != 0)) + if ((dst != nullptr) && (dst->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type()); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); @@ -120,7 +131,8 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto } ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_stage.gemmlowp_multipliers.size() != output_stage.gemmlowp_shifts.size(), "per channel quantization info is incorrect"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_stage.gemmlowp_multipliers.size() != output_stage.gemmlowp_shifts.size(), + "per channel quantization info is incorrect"); return Status{}; } @@ -131,16 +143,26 @@ ClGemmLowpOffsetContributionOutputStageKernel::ClGemmLowpOffsetContributionOutpu _type = CLKernelType::ELEMENTWISE; } -void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileContext &compile_context, - const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, - int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +void 
ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + ITensorInfo *dst, + int32_t k, + int32_t a_offset, + int32_t b_offset, + const GEMMLowpOutputStageInfo &output_stage, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, dst, output_multipliers, output_shifts); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage, output_multipliers, output_shifts)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, + b_offset, output_stage, output_multipliers, output_shifts)); - auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias, dst, output_multipliers, output_shifts }); + auto padding_info = + get_padding_info({mm_result, vector_sum_col, vector_sum_row, bias, dst, output_multipliers, output_shifts}); const int min = output_stage.gemmlowp_min_bound; const int max = output_stage.gemmlowp_max_bound; @@ -148,9 +170,8 @@ void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileCon _is_quantized_per_channel = output_stage.is_quantized_per_channel; // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = vector_sum_row != nullptr - && mm_result->num_dimensions() > 1 - && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->num_dimensions() > 1 && + mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); // Auto initialize the output auto_init_if_empty(*dst, mm_result->clone()->set_data_type(output_stage.output_data_type)); @@ -160,10 +181,11 @@ void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileCon // Set the arguments to pass at compile time CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration)); // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) + if (a_offset != 0) { build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); @@ -171,8 +193,10 @@ void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileCon // If b_offset == 0, vector_sum_row can be a nullptr build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k)); - build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1))); - build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2))); + build_opts.add_option_if(reinterpret_as_3d, + "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1))); + build_opts.add_option_if(reinterpret_as_3d, + "-DDEPTH_INPUT3D=" + 
support::cpp11::to_string(mm_result->dimension(2))); build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset)); build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0])); @@ -210,26 +234,42 @@ void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileCon ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *dst, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status ClGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *dst, + int32_t a_offset, + int32_t b_offset, + const GEMMLowpOutputStageInfo &output_stage, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage, output_multipliers, output_shifts)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, + b_offset, output_stage, output_multipliers, output_shifts)); return Status{}; } -void ClGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto mm_result = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); - const auto vector_sum_col = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); - const auto vector_sum_row = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); - const auto output_shifts = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SHIFTS)); - const auto output_multipliers = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto mm_result = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto bias = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); + const auto vector_sum_col = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); + const auto vector_sum_row = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); + const auto output_shifts = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SHIFTS)); + const auto output_multipliers = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS)); + auto dst = 
utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); Window slice = collapsed.first_slice_window_3D(); @@ -260,8 +300,7 @@ void ClGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_multipliers, biases_slice); add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_shifts, biases_slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h index cee04473c460961d7a8e7155c96a34cbb6e80e68..97ee9bc97f0fb686082b3f30e94fff8a509126a7 100644 --- a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h @@ -66,23 +66,40 @@ public: * @param[in] output_shifts Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). * Supported data types: S32 */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, - int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + ITensorInfo *dst, + int32_t k, + int32_t a_offset, + int32_t b_offset, + const GEMMLowpOutputStageInfo &output_stage, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpOffsetContributionOutputStageKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, int32_t a_offset, - int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts); + static Status validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *dst, + int32_t a_offset, + int32_t b_offset, + const GEMMLowpOutputStageInfo &output_stage, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _is_quantized_per_channel{ false }; + bool _is_quantized_per_channel{false}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp index 39754385a13fd0cda2ddac7e736a36488c8d729b..7b7beab12c87257b72ca2341e9e31211277661ef 100644 --- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp +++ 
b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp @@ -27,15 +27,14 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -47,20 +46,23 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); // Check biases if exist - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != info->output_data_type, "Mismatching dst data type"); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); @@ -75,7 +77,9 @@ ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::ClGemmLowpQuantizeDownInt32S _type = CLKernelType::ELEMENTWISE; } -Status ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, +Status ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -84,14 +88,17 @@ Status ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITenso return Status{}; } -void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, +void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info)); - auto padding_info = get_padding_info({ src, bias, dst }); + auto padding_info = get_padding_info({src, bias, dst}); // dst auto inizialitation if not yet initialized auto_init_if_empty(*dst, src->clone()->set_data_type(info->output_data_type)); @@ -103,19 +110,26 @@ void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompi auto max = info->gemmlowp_max_bound; CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % 
num_elems_processed_per_iteration)); build_opts.add_option("-DRESULT_OFFSET_AFTER_SHIFT=" + support::cpp11::to_string(info->gemmlowp_offset)); build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(info->gemmlowp_multiplier)); build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(info->gemmlowp_shift)); build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); - build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max), - "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max), - "-DMAX_BOUND=" + support::cpp11::to_string(max)); + build_opts.add_option_if( + (min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && + (min != max), + "-DMIN_BOUND=" + support::cpp11::to_string(min)); + build_opts.add_option_if( + (max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && + (min != max), + "-DMAX_BOUND=" + support::cpp11::to_string(max)); build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); // Create kernel - const std::string kernel_name = (info->output_data_type == DataType::QSYMM16) ? "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16" : "gemmlowp_output_stage_quantize_down_fixedpoint"; + const std::string kernel_name = (info->output_data_type == DataType::QSYMM16) + ? "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16" + : "gemmlowp_output_stage_quantize_down_fixedpoint"; // A macro guard to compile ONLY the kernel of interest build_opts.add_option("-D" + upper_string(kernel_name)); @@ -129,14 +143,18 @@ void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompi ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto bias = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); // Create src window Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); @@ -144,7 +162,7 @@ void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::run_op(ITensorPack &ten // Setup bias slice unsigned int idx1 = num_arguments_per_3D_tensor(); - if(bias != nullptr) + if (bias != nullptr) { Window biases_slice(slice); biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); @@ -158,8 +176,7 @@ void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::run_op(ITensorPack &ten add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx1, dst, 
slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h index 69b5fc5018bb2d204889333bd71e597f1deb16e1..71c9f4b752e56efd4d15f08e7946712adf6a71a7 100644 --- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h @@ -60,14 +60,21 @@ public: * @param[out] dst Destination tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16. * @param[in] info Output stage info. Used to pass the quantized output data type */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp index f3796983266f50f8575eb46912d5e3da344acef9..52ebd32d46f26e08862420f335ca3b9a71c34d5d 100644 --- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp @@ -27,15 +27,14 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -47,23 +46,31 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON((info->output_data_type != DataType::QASYMM8) && (info->output_data_type != DataType::QASYMM8_SIGNED)); - ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))); - ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_min_bound < 
std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)) - || info->gemmlowp_min_bound > info->gemmlowp_max_bound); + ARM_COMPUTE_RETURN_ERROR_ON((info->output_data_type != DataType::QASYMM8) && + (info->output_data_type != DataType::QASYMM8_SIGNED)); + ARM_COMPUTE_RETURN_ERROR_ON( + info->gemmlowp_max_bound > + std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))); + ARM_COMPUTE_RETURN_ERROR_ON( + info->gemmlowp_min_bound < + std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)) || + info->gemmlowp_min_bound > info->gemmlowp_max_bound); // Check biases if exist - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != info->output_data_type, "Mismatching output data type"); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); @@ -78,7 +85,9 @@ ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::ClGemmLowpQuantizeDownInt32ScaleB _type = CLKernelType::ELEMENTWISE; } -Status ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, +Status ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -87,14 +96,17 @@ Status ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo return Status{}; } -void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, +void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info)); - auto padding_info = get_padding_info({ src, bias, dst }); + auto padding_info = get_padding_info({src, bias, dst}); // Output auto inizialitation if not yet initialized auto_init_if_empty(*dst, src->clone()->set_data_type(info->output_data_type)); @@ -107,7 +119,8 @@ void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileCon // Set the arguments to pass at compile time CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DREAL_MULTIPLIER=" + float_to_string_with_full_precision(info->gemmlowp_real_multiplier)); build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(info->gemmlowp_offset)); build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); @@ -130,14 +143,18 @@ void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileCon ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -void 
ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - const auto bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto bias = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); // Create input window Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); @@ -145,7 +162,7 @@ void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::run_op(ITensorPack &tensors, // Setup bias slice unsigned int idx1 = num_arguments_per_3D_tensor(); - if(bias != nullptr) + if (bias != nullptr) { Window biases_slice(slice); biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); @@ -159,8 +176,7 @@ void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::run_op(ITensorPack &tensors, add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx1, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h index 8eda24d25f92ab597977d9956cc2959e0c22ec38..057c66767f0ffb53b5206af1ca2170c5acfe088e 100644 --- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h @@ -62,14 +62,21 @@ public: * @param[out] dst Destination tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED * @param[in] info Output stage info. 
Used to pass the quantized output data type */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp index 5d54db214a1c88643b4a4e9ee8cadb9a047c82a2..31434ce61b0a54e73583220243784d774d36224a 100644 --- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp @@ -26,15 +26,14 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -46,25 +45,34 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON((output_stage->output_data_type != DataType::QASYMM8) && (output_stage->output_data_type != DataType::QASYMM8_SIGNED)); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) - || output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound); + ARM_COMPUTE_RETURN_ERROR_ON((output_stage->output_data_type != DataType::QASYMM8) && + (output_stage->output_data_type != DataType::QASYMM8_SIGNED)); + ARM_COMPUTE_RETURN_ERROR_ON( + output_stage->gemmlowp_max_bound > + std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))); + ARM_COMPUTE_RETURN_ERROR_ON( + output_stage->gemmlowp_min_bound < + std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) || + output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound); // Check biases if exist - if(bias != nullptr) + if (bias != 
nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != output_stage->output_data_type, "Mismatching output data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != output_stage->output_data_type, + "Mismatching output data type"); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); } @@ -77,7 +85,10 @@ ClGemmLowpQuantizeDownInt32ScaleKernel::ClGemmLowpQuantizeDownInt32ScaleKernel() _type = CLKernelType::ELEMENTWISE; } -Status ClGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) +Status ClGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, output_stage)); @@ -85,14 +96,17 @@ Status ClGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, return Status{}; } -void ClGemmLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, +void ClGemmLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, output_stage)); - auto padding_info = get_padding_info({ src, bias, dst }); + auto padding_info = get_padding_info({src, bias, dst}); // Output auto inizialitation if not yet initialized auto_init_if_empty(*dst, src->clone()->set_data_type(output_stage->output_data_type)); @@ -104,13 +118,18 @@ void ClGemmLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &c auto max = output_stage->gemmlowp_max_bound; CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage->gemmlowp_offset)); build_opts.add_option("-DRESULT_MULT_INT=" + support::cpp11::to_string(output_stage->gemmlowp_multiplier)); build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage->gemmlowp_shift)); - build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max), + build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type( + output_stage->output_data_type))) && + (min != max), "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max), + build_opts.add_option_if((max < 
std::get<1>(quantization::get_min_max_values_from_quantized_data_type( + output_stage->output_data_type))) && + (min != max), "-DMAX_BOUND=" + support::cpp11::to_string(max)); build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); @@ -135,15 +154,17 @@ void ClGemmLowpQuantizeDownInt32ScaleKernel::run_op(ITensorPack &tensors, const ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - const auto bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto bias = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); Window slice = collapsed.first_slice_window_3D(); unsigned int idx1 = num_arguments_per_3D_tensor(); - if(bias != nullptr) + if (bias != nullptr) { Window biases_slice(slice); biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); @@ -157,8 +178,7 @@ void ClGemmLowpQuantizeDownInt32ScaleKernel::run_op(ITensorPack &tensors, const add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx1, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h index 84c5060362134b9d6098667365edaed5ba491fb6..e6390801f13b492bf8dc775aee04f2e4f5d0d260 100644 --- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h @@ -62,14 +62,21 @@ public: * @param[out] dst Destination tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED * @param[in] output_stage GEMMLowp output stage metadata. 
*/ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage); + static Status validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; @@ -77,4 +84,4 @@ public: } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp index ea88b485a045fc2dfc918044dee64995f0f5a42c..ee4a191fed55fec782a5fcdc1f1d3512bdaa005a 100644 --- a/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp @@ -32,7 +32,6 @@ #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -47,12 +46,15 @@ namespace Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8); - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(1), "Output vector must have length equal to the number of rows of the input matrix"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + dst->dimension(0) != src->dimension(1), + "Output vector must have length equal to the number of rows of the input matrix"); } return Status{}; } @@ -60,12 +62,15 @@ Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITens Status validate_arguments_matrix_b_reduction(const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(0), "Output vector must have length equal to the number of columns of the input matrix"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + dst->dimension(0) != src->dimension(0), + "Output 
vector must have length equal to the number of columns of the input matrix"); } return Status{}; } @@ -76,7 +81,10 @@ IClGemmLowpReductionKernel::IClGemmLowpReductionKernel() _type = CLKernelType::ELEMENTWISE; } -void ClGemmLowpMatrixAReductionKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_a, ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) +void ClGemmLowpMatrixAReductionKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *mtx_a, + ITensorInfo *vector_sum_row, + const GEMMLowpReductionKernelInfo &info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row); @@ -85,7 +93,7 @@ void ClGemmLowpMatrixAReductionKernel::configure(const CLCompileContext &compile // Output auto initialization if not yet initialized auto_init_if_empty(*vector_sum_row, TensorShape(mtx_a->dimension(1)), 1, DataType::S32); - auto padding_info = get_padding_info({ mtx_a, vector_sum_row }); + auto padding_info = get_padding_info({mtx_a, vector_sum_row}); // Set the arguments to pass at compile time CLBuildOptions build_opts; @@ -120,7 +128,9 @@ void ClGemmLowpMatrixAReductionKernel::configure(const CLCompileContext &compile ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) +Status ClGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, + const ITensorInfo *vector_sum_row, + const GEMMLowpReductionKernelInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row)); @@ -133,8 +143,9 @@ void ClGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY); Window slice_in = collapsed.first_slice_window_2D(); @@ -151,11 +162,13 @@ void ClGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window add_3D_tensor_argument(idx, src, slice_in); add_2D_tensor_argument(idx, dst, slice_out); enqueue(queue, *this, slice_out, lws_hint()); - } - while(collapsed.slide_window_slice_2D(slice_out)); + } while (collapsed.slide_window_slice_2D(slice_out)); } -void ClGemmLowpMatrixBReductionKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_b, ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) +void ClGemmLowpMatrixBReductionKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *mtx_b, + ITensorInfo *vector_sum_col, + const GEMMLowpReductionKernelInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col)); @@ -163,14 +176,15 @@ void ClGemmLowpMatrixBReductionKernel::configure(const CLCompileContext &compile // Output auto initialization if not yet initialized auto_init_if_empty(*vector_sum_col, TensorShape(mtx_b->dimension(0)), 1, 
DataType::S32); - auto padding_info = get_padding_info({ mtx_b, vector_sum_col }); + auto padding_info = get_padding_info({mtx_b, vector_sum_col}); const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, mtx_b->dimension(0)); // Set the arguments to pass at compile time CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mtx_b->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(mtx_b->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(mtx_b->dimension(0))); build_opts.add_option("-DROWS_B=" + support::cpp11::to_string(mtx_b->dimension(1))); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_b->data_type())); @@ -192,7 +206,9 @@ void ClGemmLowpMatrixBReductionKernel::configure(const CLCompileContext &compile ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) +Status ClGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, + const ITensorInfo *vector_sum_col, + const GEMMLowpReductionKernelInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col)); @@ -205,8 +221,9 @@ void ClGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); Window collapsed = window.collapse_if_possible(IKernel::window(), Window::DimY); @@ -222,8 +239,7 @@ void ClGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window add_3D_tensor_argument(idx, src, slice_in); add_2D_tensor_argument(idx, dst, slice_out); enqueue(queue, *this, slice_out, lws_hint()); - } - while(collapsed.slide_window_slice_2D(slice_out)); + } while (collapsed.slide_window_slice_2D(slice_out)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h index 7119b5fee0e01b796e079109044141c8446bf534..c81543e4c296e22e87c8ea65a337469386141aac 100644 --- a/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_GEMMLOWP_REDUCTION_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -52,7 +53,10 @@ public: * - scalar Scalar value to multiply each reduced column/row by. * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. 
*/ - virtual void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const GEMMLowpReductionKernelInfo &info) = 0; + virtual void configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + ITensorInfo *output, + const GEMMLowpReductionKernelInfo &info) = 0; }; /** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A. @@ -74,14 +78,18 @@ public: * - scalar Scalar value to multiply each reduced column/row by. * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_a, ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override; + void configure(const CLCompileContext &compile_context, + const ITensorInfo *mtx_a, + ITensorInfo *vector_sum_row, + const GEMMLowpReductionKernelInfo &info) override; /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info); + static Status + validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; @@ -106,14 +114,18 @@ public: * - scalar Scalar value to multiply each reduced column/row by. * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_b, ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override; + void configure(const CLCompileContext &compile_context, + const ITensorInfo *mtx_b, + ITensorInfo *vector_sum_col, + const GEMMLowpReductionKernelInfo &info) override; /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info); + static Status + validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp index 5fea097ae38aed476c9bce81b4ce418583a66aa5..fd23aa992447f702f5f6dd0efbf02f10a74c801a 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp @@ -23,19 +23,19 @@ */ #include "src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include 
"arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLUtils.h" -#include "src/core/experimental/PostOpUtils.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/helpers/float_ops.h" @@ -52,26 +52,13 @@ namespace { using ElementsProcessed = Steps; -const auto post_op_utils = experimental::PostOpCLKernelUtils( -{ - // PostOp sequence -> {Kernel Postfix, PostOp Slots} - { {}, { "", {} } }, - { { experimental::PostOpType::Activation }, { "", { 1 } } }, - - { { experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 2 } } }, - { { experimental::PostOpType::Eltwise_PRelu }, { "_post_act_eltwise_op_act", { 2 } } }, - - { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 1, 2 } } }, - { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_PRelu }, { "_post_act_eltwise_op_act", { 1, 2 } } }, - - { { experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } }, - { { experimental::PostOpType::Eltwise_PRelu, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } }, - - { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } }, - { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_PRelu, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } } -}); - -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) { @@ -79,18 +66,22 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F32, DataType::F16); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, + "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, + "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), + "Only 2,3,4,8,16 are supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 
!= nullptr) - && (!gemm_info.broadcast_bias), - "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), + "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) && + (!gemm_info.broadcast_bias), + "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for GEMM native"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.is_post_op_sequence_supported(gemm_info.post_ops), "The sequence of Post Ops is not supported"); const unsigned int m = gemm_info.m; const unsigned int n = gemm_info.n; @@ -103,7 +94,7 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k); ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) != n); ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != k); - if(gemm_info.reinterpret_input_as_3d) + if (gemm_info.reinterpret_input_as_3d) { ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m); } @@ -112,15 +103,16 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != m); } - if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) + if (src2 != nullptr && !(helpers::float_ops::is_zero(beta))) { const unsigned int src2_dim0 = src2->dimension(0); const unsigned int src2_dim1 = src2->dimension(1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1); - if(gemm_info.broadcast_bias) + if (gemm_info.broadcast_bias) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), + "Incorrect dimension of bias matrix which is to be broadcasted"); } else { @@ -128,20 +120,25 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons } } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); + const TensorInfo tensor_info_dst = + dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.are_post_op_shapes_compliant(dst, gemm_info.post_ops), "The Post Op shapes are not compliant"); } return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, +std::pair validate_and_configure_window(ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed) + const GEMMKernelInfo &gemm_info, + ElementsProcessed &num_elements_processed) { unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; unsigned int 
&num_elems_processed_per_iteration_y = num_elements_processed[1]; @@ -154,17 +151,18 @@ std::pair validate_and_configure_window(ITensorInfo *src0, ITens // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(reinterpret_input_as_3d == reinterpret_output_as_3d) + if (reinterpret_input_as_3d == reinterpret_output_as_3d) { reinterpret_output_as_3d = false; } // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); TensorInfo tmp_info(*dst); - if(reinterpret_output_as_3d) + if (reinterpret_output_as_3d) { // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -177,34 +175,34 @@ std::pair validate_and_configure_window(ITensorInfo *src0, ITens num_elems_processed_per_iteration_x = rhs_info.n0; num_elems_processed_per_iteration_y = lhs_info.m0; - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win = + calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win_out = + calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - AccessWindowStatic src0_access(src0, 0, 0, - src0->dimension(0), - src0->dimension(1)); - AccessWindowStatic src1_access(src1, 0, 0, - ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), - src1->dimension(1)); - AccessWindowStatic dst_access(dst, 0, 0, - dst->dimension(0), - dst->dimension(1)); + AccessWindowStatic src0_access(src0, 0, 0, src0->dimension(0), src0->dimension(1)); + AccessWindowStatic src1_access( + src1, 0, 0, ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), src1->dimension(1)); + AccessWindowStatic dst_access(dst, 0, 0, dst->dimension(0), dst->dimension(1)); - if(src2 != nullptr) + if (src2 != nullptr) { const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x; - AccessWindowStatic src2_access(src2, 0, 0, - ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x), + AccessWindowStatic src2_access(src2, 0, 0, ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x), src2->dimension(1)); - window_changed = update_window_and_padding(win, src0_access, src1_access, src2_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor + window_changed = update_window_and_padding(win, src0_access, src1_access, + src2_access) || // window used by the execute_window_loop + update_window_and_padding( + win_out, dst_access); // window used to update the padding requirements of dst tensor } else { - window_changed = update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor + window_changed = + update_window_and_padding(win, src0_access, src1_access) 
|| // window used by the execute_window_loop + update_window_and_padding(win_out, + dst_access); // window used to update the padding requirements of dst tensor } // Collapse along the Z direction @@ -213,7 +211,8 @@ std::pair validate_and_configure_window(ITensorInfo *src0, ITens const unsigned int dimension_to_collapse = std::min(static_cast(dst->num_dimensions()), 2u); collapsed = win.collapse(win, dimension_to_collapse); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, collapsed); } } // namespace @@ -223,28 +222,34 @@ ClGemmMatrixMultiplyNativeKernel::ClGemmMatrixMultiplyNativeKernel() _type = CLKernelType::GEMM; } -void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, +void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - auto padding_info = get_padding_info({ src0, dst }); + auto padding_info = get_padding_info({src0, dst}); _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); _add_bias = src2 != nullptr; - _num_post_op_args = gemm_info.post_ops.total_num_arguments(); // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) + if (_reinterpret_input_as_3d == _reinterpret_output_as_3d) { _reinterpret_input_as_3d = false; _reinterpret_output_as_3d = false; @@ -257,7 +262,8 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile ElementsProcessed num_elements_processed{}; // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, src2 != nullptr ? src2 : nullptr, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); + auto win_config = validate_and_configure_window(src0, src1, src2 != nullptr ? 
src2 : nullptr, dst, lhs_info, + rhs_info, gemm_info, num_elements_processed); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); IClKernel::configure_internal(win_config.second); @@ -283,14 +289,17 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile // Create build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); - build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); + build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), + "-DALPHA=" + float_to_string_with_full_precision(alpha)); build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS"); build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); @@ -298,20 +307,15 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - // If post_ops are used, then we disable the use of gemm_info.activation_info - if(gemm_info.post_ops.size() > 0) - { - post_op_utils.set_post_ops_cl_build_options(build_opts, gemm_info.post_ops); - } - else - { - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); - } + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DACTIVATION_TYPE=" + + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); std::string kernel_name("gemm_mm_native"); - post_op_utils.set_post_ops_cl_kernel_name(kernel_name, gemm_info.post_ops); // A macro guard to 
compile ONLY the kernel of interest build_opts.add_option("-D" + upper_string(kernel_name)); @@ -346,21 +350,23 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, +Status ClGemmMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ElementsProcessed num_elements_processed{}; ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(), src2 != nullptr ? src2->clone().get() : nullptr, - dst->clone().get(), - lhs_info, - rhs_info, - gemm_info, + dst->clone().get(), lhs_info, rhs_info, gemm_info, num_elements_processed) - .first); + .first); return Status{}; } @@ -370,15 +376,18 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto src2 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto src2 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); - if(src1->info()->num_dimensions() < 3) + if (src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); @@ -390,33 +399,33 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - if(_reinterpret_input_as_3d) + if (_reinterpret_input_as_3d) { // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor unsigned int idx0; - if(_add_bias) + if (_add_bias) { - idx0 = (4 + _num_post_op_args) * num_arguments_per_2D_tensor() + (7 + _num_post_op_args); + idx0 = 4 * num_arguments_per_2D_tensor() + 7; } else { - idx0 = (3 + _num_post_op_args) * num_arguments_per_2D_tensor() + (6 + _num_post_op_args); + idx0 = 3 * num_arguments_per_2D_tensor() + 6; } const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom; 
_kernel.setArg(idx0, static_cast(total_cross_plane_pad)); } - if(_reinterpret_output_as_3d) + if (_reinterpret_output_as_3d) { // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor unsigned int idx0; - if(_add_bias) + if (_add_bias) { - idx0 = (4 + _num_post_op_args) * num_arguments_per_2D_tensor() + 7 + (_reinterpret_input_as_3d ? 1 : 0) + _num_post_op_args; + idx0 = 4 * num_arguments_per_2D_tensor() + 7 + (_reinterpret_input_as_3d ? 1 : 0); } else { - idx0 = (3 + _num_post_op_args) * num_arguments_per_2D_tensor() + 6 + (_reinterpret_input_as_3d ? 1 : 0) + _num_post_op_args; + idx0 = 3 * num_arguments_per_2D_tensor() + 6 + (_reinterpret_input_as_3d ? 1 : 0); } const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); @@ -427,7 +436,7 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window Window slice_b = slice; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) + if (!_slide_matrix_b) { slice_b = slice_matrix_b; } @@ -435,30 +444,19 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window unsigned int idx = 0; add_2D_tensor_argument(idx, src0, slice); add_2D_tensor_argument(idx, src1, slice_b); - if(_add_bias) + if (_add_bias) { add_2D_tensor_argument(idx, src2, slice); } add_2D_tensor_argument(idx, dst, slice); - // post op argument buffers - for(size_t i = 0; i < _num_post_op_args; ++i) - { - const auto post_op_arg = utils::cast::polymorphic_downcast(tensors.get_const_tensor(experimental::get_post_op_arg_type(i))); - add_2D_tensor_argument(idx, post_op_arg, slice); - } + _kernel.setArg(idx++, static_cast(src0->info()->strides_in_bytes()[2])); _kernel.setArg(idx++, static_cast(src1->info()->strides_in_bytes()[2])); - if(_add_bias) + if (_add_bias) { _kernel.setArg(idx++, static_cast(src2->info()->strides_in_bytes()[2])); } _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[2])); - // post op argument stride_z - for(size_t i = 0; i < _num_post_op_args; ++i) - { - const auto post_op_arg = utils::cast::polymorphic_downcast(tensors.get_const_tensor(experimental::get_post_op_arg_type(i))); - _kernel.setArg(idx++, static_cast(post_op_arg->info()->strides_in_bytes()[2])); - } // Pass m, n and k at runtime _kernel.setArg(idx++, _m); @@ -466,8 +464,7 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window _kernel.setArg(idx++, _k); enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h index e478df727ad2783d4a849ec4eab9b4da14170a58..da6c9a5bb759d169bffb02ea1fb6166c4abd4484 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,10 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_NATIVE_KERNEL_H -#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_NATIVE_KERNEL_H +#ifndef ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H +#define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -58,7 +59,13 @@ public: * rhs_info.k0: same of lhs_info.k0 * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, + void configure(const ClCompileContext &compile_context, + ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float alpha, + float beta, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); @@ -68,7 +75,13 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); @@ -76,17 +89,16 @@ public: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _slide_matrix_b{ true }; - bool _reinterpret_input_as_3d{ false }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; - bool _add_bias{ false }; - signed int _m{ 1 }; - signed int _n{ 1 }; - signed int _k{ 1 }; - unsigned int _num_post_op_args{ 0 }; // (EXPERIMENTAL_POST_OPS) total number of post op arguments + bool _slide_matrix_b{true}; + bool _reinterpret_input_as_3d{false}; + bool _reinterpret_output_as_3d{false}; + bool _use_dummy_work_items{false}; + bool _add_bias{false}; + signed int _m{1}; + signed int _n{1}; + signed int _k{1}; }; } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_NATIVE_KERNEL_H */ +#endif // ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp index f14a6f1900399d5731c6fbdbc2733794add4164e..4fe6bddb36ac3899ab2d047130247e142490c1f9 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp @@ -23,19 +23,19 @@ */ #include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CL/CLUtils.h" #include "src/core/CL/CLValidate.h" -#include 
"src/core/experimental/PostOpUtils.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/helpers/float_ops.h" @@ -53,26 +53,13 @@ namespace { using ElementsProcessed = Steps; -const auto post_op_utils = experimental::PostOpCLKernelUtils( -{ - // PostOp sequence -> {Kernel Postfix, PostOp Slots} - { {}, { "", {} } }, - { { experimental::PostOpType::Activation }, { "", { 1 } } }, - - { { experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 2 } } }, - { { experimental::PostOpType::Eltwise_PRelu }, { "_post_act_eltwise_op_act", { 2 } } }, - - { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 1, 2 } } }, - { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_PRelu }, { "_post_act_eltwise_op_act", { 1, 2 } } }, - - { { experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } }, - { { experimental::PostOpType::Eltwise_PRelu, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } }, - - { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } }, - { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_PRelu, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } } -}); - -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) { @@ -81,43 +68,50 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, + "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, + "The number of dimensions for the RHS matrix must be <= 3"); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose == rhs_info.transpose); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), + "Only 2,3,4,8,16 are supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_info.transpose) && ((lhs_info.m0 & (lhs_info.m0 - 1)) && lhs_info.m0 != 3), "Only 2,3,4,8,16 are supported for m0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.transpose) && ((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 
are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) - && (!gemm_info.broadcast_bias), - "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision && (src0->data_type() == DataType::F32), "Mixed precision only supported for F16 data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_info.transpose) && ((lhs_info.m0 & (lhs_info.m0 - 1)) && lhs_info.m0 != 3), + "Only 2,3,4,8,16 are supported for m0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.transpose) && ((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), + "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) && + (!gemm_info.broadcast_bias), + "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision && (src0->data_type() == DataType::F32), + "Mixed precision only supported for F16 data type"); ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.is_post_op_sequence_supported(gemm_info.post_ops), "The sequence of Post Ops is not supported"); const unsigned int m = gemm_info.m; const unsigned int n = gemm_info.n; const unsigned int k = gemm_info.k; - TensorShape tensor_shape0{ src0->tensor_shape() }; + TensorShape tensor_shape0{src0->tensor_shape()}; tensor_shape0.set(0, k); tensor_shape0.set(1, m); - TensorShape tensor_shape1{ src1->tensor_shape() }; + TensorShape tensor_shape1{src1->tensor_shape()}; tensor_shape1.set(0, n); tensor_shape1.set(1, k); - if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) + if (src2 != nullptr && !(helpers::float_ops::is_zero(beta))) { const unsigned int src2_dim0 = src2->dimension(0); const unsigned int src2_dim1 = src2->dimension(1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1); - if(gemm_info.broadcast_bias) + if (gemm_info.broadcast_bias) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), + "Incorrect dimension of bias matrix which is to be broadcasted"); } else { @@ -128,26 +122,33 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0); const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped0 = src0->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(tensor_info0, lhs_info)); - const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + const TensorInfo tensor_info_reshaped0 = + src0->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(tensor_info0, lhs_info)); + const TensorInfo tensor_info_reshaped1 = + src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); - if(dst->total_size() != 0) + 
if (dst->total_size() != 0) { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); + const TensorInfo tensor_info_dst = + dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.are_post_op_shapes_compliant(dst, gemm_info.post_ops), "The Post Op shapes are not compliant"); } return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, +std::pair validate_and_configure_window(ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed) + const GEMMKernelInfo &gemm_info, + ElementsProcessed &num_elements_processed) { ARM_COMPUTE_UNUSED(src0, src1, src2); unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; @@ -156,7 +157,7 @@ std::pair validate_and_configure_window(ITensorInfo *src0, ITens TensorInfo tmp_info(*dst); - if(reinterpret_output_as_3d) + if (reinterpret_output_as_3d) { // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -169,7 +170,8 @@ std::pair validate_and_configure_window(ITensorInfo *src0, ITens num_elems_processed_per_iteration_x = rhs_info.n0; num_elems_processed_per_iteration_y = lhs_info.m0; - Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + Window win = + calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); // Collapse along the Z direction // This collapse needs to be here in order to tune the Z dimension of LWS @@ -186,23 +188,30 @@ ClGemmMatrixMultiplyReshapedKernel::ClGemmMatrixMultiplyReshapedKernel() _type = CLKernelType::GEMM; } -void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, - const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) +void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - auto padding_info = get_padding_info({ src0, src1, src2, dst }); + auto padding_info = get_padding_info({src0, src1, src2, dst}); _reinterpret_output_as_3d = 
gemm_info.depth_output_gemm3d != 0; _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); _add_bias = src2 != nullptr; _export_to_cl_image = rhs_info.export_to_cl_image; - _num_post_op_args = gemm_info.post_ops.total_num_arguments(); // Check if we need to slide the matrix B const unsigned int num_dimensions_src0 = src0->num_dimensions(); @@ -211,14 +220,9 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi ElementsProcessed num_elements_processed{}; // Configure kernel window - auto win_config = validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - (src2 != nullptr) ? src2->clone().get() : nullptr, - dst->clone().get(), - lhs_info, - rhs_info, - gemm_info, - num_elements_processed); + auto win_config = validate_and_configure_window( + src0->clone().get(), src1->clone().get(), (src2 != nullptr) ? src2->clone().get() : nullptr, dst->clone().get(), + lhs_info, rhs_info, gemm_info, num_elements_processed); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second); @@ -236,12 +240,15 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi // Create build options CLBuildOptions build_opts; - build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); + build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), + "-DALPHA=" + float_to_string_with_full_precision(alpha)); build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); + build_opts.add_option_if(_reinterpret_output_as_3d, + "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); + build_opts.add_option_if(_reinterpret_output_as_3d, + "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS"); build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE"); @@ -252,7 +259,9 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT"); build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(src1->dimension(1))); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); - build_opts.add_option("-DDATA_TYPE_ACCUMULATOR=" + (enable_mixed_precision ? get_cl_type_from_data_type(DataType::F32) : get_cl_type_from_data_type(data_type))); + build_opts.add_option("-DDATA_TYPE_ACCUMULATOR=" + (enable_mixed_precision + ? 
get_cl_type_from_data_type(DataType::F32) + : get_cl_type_from_data_type(data_type))); build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0)); build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0)); @@ -260,23 +269,18 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - // If post_ops are used, then we disable the use of gemm_info.activation_info - if(gemm_info.post_ops.size() > 0) - { - post_op_utils.set_post_ops_cl_build_options(build_opts, gemm_info.post_ops); - } - else - { - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); - } + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DACTIVATION_TYPE=" + + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); std::string kernel_name("gemm_mm_reshaped_"); kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_"; kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt"; kernel_name += rhs_info.export_to_cl_image ? 
"_texture" : ""; - post_op_utils.set_post_ops_cl_kernel_name(kernel_name, gemm_info.post_ops); // A macro guard to compile ONLY the kernel of interest build_opts.add_option("-D" + upper_string(kernel_name)); @@ -319,9 +323,15 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, +Status ClGemmMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); return Status{}; @@ -332,15 +342,18 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto src2 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto src2 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); - if(src1->info()->num_dimensions() < 3) + if (src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); @@ -356,12 +369,14 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind cl::Image2D src1_image2d; - if(_export_to_cl_image) + if (_export_to_cl_image) { - const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2)); + const TensorShape shape2d(src1->info()->dimension(0) / 4, + src1->info()->dimension(1) * src1->info()->dimension(2)); const size_t image_row_pitch = src1->info()->strides_in_bytes()[1]; - src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, + src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); } do @@ -369,7 +384,7 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind Window slice_b = slice; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the matrix multiplication is used to perform a convolution 
operation - if(!_slide_matrix_b) + if (!_slide_matrix_b) { slice_b = slice_matrix_b; } @@ -380,7 +395,7 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind add_2D_tensor_argument(idx, src0, slice); // RHS buffer or RHS OpenCL image (_export_to_cl_image == true) - if(_export_to_cl_image) + if (_export_to_cl_image) { _kernel.setArg(idx++, src1_image2d); } @@ -395,13 +410,6 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind // dst buffer add_2D_tensor_argument(idx, dst, slice); - // post op argument buffers - for(size_t i = 0; i < _num_post_op_args; ++i) - { - const auto post_op_arg = utils::cast::polymorphic_downcast(tensors.get_const_tensor(experimental::get_post_op_arg_type(i))); - add_2D_tensor_argument(idx, post_op_arg, slice); - } - // LHS stride_z _kernel.setArg(idx++, static_cast(src0->info()->strides_in_bytes()[2])); @@ -409,7 +417,7 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind _kernel.setArg(idx++, static_cast(src1->info()->strides_in_bytes()[2])); // Bias stride_z (if _add_bias == true) - if(_add_bias) + if (_add_bias) { _kernel.setArg(idx++, static_cast(src2->info()->strides_in_bytes()[2])); } @@ -417,14 +425,8 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind // dst stride_z _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[2])); - // post op argument stride_z - for(size_t i = 0; i < _num_post_op_args; ++i) - { - const auto post_op_arg = utils::cast::polymorphic_downcast(tensors.get_const_tensor(experimental::get_post_op_arg_type(i))); - _kernel.setArg(idx++, static_cast(post_op_arg->info()->strides_in_bytes()[2])); - } // Cross-plan padding (if _reinterpret_output_as_3d = true) - if(_reinterpret_output_as_3d) + if (_reinterpret_output_as_3d) { _kernel.setArg(idx++, static_cast(total_cross_plane_pad)); } @@ -438,8 +440,7 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind // Dispatch kernel enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h index 2d668b91a3da0b73989537d6429836713cecd4db..30928c4e1d0f4c0eacb1b2ca3ac90b659ee7471c 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,15 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
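// --- Illustrative sketch, not part of the patch ----------------------------------------------------
// Fused activation for the reshaped kernel above is now driven solely by gemm_info.activation_info;
// the post-op kernel-name suffix and build options are gone. A bounded ReLU, for instance, would be
// requested roughly as below and surface as the -DACTIVATION_TYPE / -DA_VAL / -DB_VAL defines added in
// the hunk above (the commented call assumes the usual headers and a prepared compile context).
GEMMKernelInfo gemm_info{};
gemm_info.activation_info = ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f);
// kernel.configure(compile_context, src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info);
// ----------------------------------------------------------------------------------------------------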
*/ -#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_KERNEL_H -#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_KERNEL_H +#ifndef ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H +#define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" -#include "arm_compute/core/KernelDescriptors.h" - namespace arm_compute { namespace opencl @@ -83,16 +83,29 @@ public: * * @note lhs_info.k0 must be equal to rhs_info.k0 */ - void configure(const ClCompileContext &compile_context, - const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); + void configure(const ClCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmMatrixMultiplyReshapedKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); @@ -100,17 +113,16 @@ public: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _slide_matrix_b{ true }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; - bool _add_bias{ false }; - bool _export_to_cl_image{ false }; - signed int _m{ 1 }; - signed int _n{ 1 }; - signed int _k{ 1 }; - unsigned int _num_post_op_args{ 0 }; // (EXPERIMENTAL_POST_OPS) total number of post op arguments + bool _slide_matrix_b{true}; + bool _reinterpret_output_as_3d{false}; + bool _use_dummy_work_items{false}; + bool _add_bias{false}; + bool _export_to_cl_image{false}; + signed int _m{1}; + signed int _n{1}; + signed int _k{1}; }; } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_KERNEL_H */ \ No newline at end of file +#endif // ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp index f780538f53a3c12964b8990cf55d99021b7359c1..1b19f1ec5bb3669e2a257b383c1f32b4ce6f9368 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp @@ -23,13 +23,13 @@ */ #include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLUtils.h" #include 
"src/core/CL/CLValidate.h" -#include "src/core/experimental/PostOpUtils.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/helpers/float_ops.h" @@ -47,64 +47,57 @@ namespace { using ElementsProcessed = Steps; -const auto post_op_utils = experimental::PostOpCLKernelUtils( -{ - // PostOp sequence -> {Kernel Postfix, PostOp Slots} - { {}, { "", {} } }, - { { experimental::PostOpType::Activation }, { "", { 1 } } }, - - { { experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 2 } } }, - { { experimental::PostOpType::Eltwise_PRelu }, { "_post_act_eltwise_op_act", { 2 } } }, - - { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 1, 2 } } }, - { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_PRelu }, { "_post_act_eltwise_op_act", { 1, 2 } } }, - - { { experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } }, - { { experimental::PostOpType::Eltwise_PRelu, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } }, - - { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } }, - { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_PRelu, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } } -}); - -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, + "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, + "The number of dimensions for the RHS matrix must be <= 3"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_info.m0 < 1 || lhs_info.m0 > 8, "Only 1,2,3,4,5,6,7,8 are supported for m0"); ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16 || rhs_info.k0 < 2); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), + "Only 2,3,4,8,16 are supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16 || rhs_info.n0 < 2); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); - 
ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) - && (!gemm_info.broadcast_bias), - "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), + "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) && + (!gemm_info.broadcast_bias), + "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported"); ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.is_post_op_sequence_supported(gemm_info.post_ops), "The sequence of Post Ops is not supported"); const unsigned int m = gemm_info.m; const unsigned int n = gemm_info.n; const unsigned int k = gemm_info.k; - TensorShape tensor_shape1{ src1->tensor_shape() }; + TensorShape tensor_shape1{src1->tensor_shape()}; tensor_shape1.set(0, n); tensor_shape1.set(1, k); - if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) + if (src2 != nullptr && !(helpers::float_ops::is_zero(beta))) { const unsigned int src2_dim0 = src2->dimension(0); const unsigned int src2_dim1 = src2->dimension(1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src0); - if(gemm_info.broadcast_bias) + if (gemm_info.broadcast_bias) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), + "Incorrect dimension of bias matrix which is to be broadcasted"); } else { @@ -114,10 +107,11 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + const TensorInfo tensor_info_reshaped1 = + src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k); - if(gemm_info.reinterpret_input_as_3d) + if (gemm_info.reinterpret_input_as_3d) { ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m); } @@ -127,19 +121,25 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons } ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); + const TensorInfo tensor_info_dst = + dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.are_post_op_shapes_compliant(dst, gemm_info.post_ops), "The Post Op shapes are not compliant"); } return Status{}; } -Window validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const 
GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed) +Window validate_and_configure_window(ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info, + ElementsProcessed &num_elements_processed) { ARM_COMPUTE_UNUSED(src0, src1, src2); unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; @@ -150,14 +150,14 @@ Window validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITens // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. // This approach should only be used when the input/dst tensors have pad on the y direction - if((reinterpret_input_as_3d == reinterpret_output_as_3d) && gemm_info.has_pad_y) + if ((reinterpret_input_as_3d == reinterpret_output_as_3d) && gemm_info.has_pad_y) { reinterpret_output_as_3d = false; } TensorInfo tmp_info(*dst); - if(reinterpret_output_as_3d) + if (reinterpret_output_as_3d) { // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -170,7 +170,8 @@ Window validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITens num_elems_processed_per_iteration_x = rhs_info.n0; num_elems_processed_per_iteration_y = lhs_info.m0; - Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + Window win = + calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); // Collapse along the Z direction // This collapse needs to be here in order to tune the Z dimension of LWS @@ -186,14 +187,22 @@ ClGemmMatrixMultiplyReshapedOnlyRhsKernel::ClGemmMatrixMultiplyReshapedOnlyRhsKe _type = CLKernelType::GEMM; } -void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context, - const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) +void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); @@ -203,13 +212,12 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext _add_bias = src2 != nullptr; _export_to_cl_image = rhs_info.export_to_cl_image; _has_pad_y = gemm_info.has_pad_y; - _num_post_op_args = gemm_info.post_ops.total_num_arguments(); - auto padding_info = get_padding_info({ src0, src1, src2, dst }); + auto 
padding_info = get_padding_info({src0, src1, src2, dst}); // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if((_reinterpret_input_as_3d == _reinterpret_output_as_3d) && _has_pad_y) + if ((_reinterpret_input_as_3d == _reinterpret_output_as_3d) && _has_pad_y) { _reinterpret_input_as_3d = false; _reinterpret_output_as_3d = false; @@ -222,8 +230,9 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext ElementsProcessed num_elements_processed{}; // Configure kernel window - Window win = validate_and_configure_window(src0->clone().get(), src1->clone().get(), (src2 != nullptr) ? src2->clone().get() : nullptr, dst->clone().get(), lhs_info, rhs_info, gemm_info, - num_elements_processed); + Window win = validate_and_configure_window(src0->clone().get(), src1->clone().get(), + (src2 != nullptr) ? src2->clone().get() : nullptr, dst->clone().get(), + lhs_info, rhs_info, gemm_info, num_elements_processed); ICLKernel::configure_internal(win); // If _reinterpret_input_as_3d = reinterpret_output_as_3d = true, @@ -248,7 +257,8 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext // Create build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); - build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); + build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), + "-DALPHA=" + float_to_string_with_full_precision(alpha)); build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS"); @@ -263,29 +273,27 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - if(_has_pad_y) + if (_has_pad_y) { build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); - } - // If post_ops are used, then we disable the use of gemm_info.activation_info - if(gemm_info.post_ops.size() > 0) - { - post_op_utils.set_post_ops_cl_build_options(build_opts, gemm_info.post_ops); - } - else - { - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); + 
build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); } + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DACTIVATION_TYPE=" + + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); + std::string kernel_name("gemm_mm_reshaped_only_rhs_"); kernel_name += rhs_info.transpose ? "t" : "nt"; kernel_name += rhs_info.export_to_cl_image ? "_texture" : ""; - post_op_utils.set_post_ops_cl_kernel_name(kernel_name, gemm_info.post_ops); // A macro guard to compile ONLY the kernel of interest build_opts.add_option("-D" + upper_string(kernel_name)); @@ -325,28 +333,39 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, +Status ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); return Status{}; } -void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto src2 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto src2 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); - if(src1->info()->num_dimensions() < 3) + if (src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); @@ -372,12 +391,14 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con cl::Image2D src1_image2d; - if(_export_to_cl_image) + if (_export_to_cl_image) { - const TensorShape shape2d(src1->info()->dimension(0) 
/ 4, src1->info()->dimension(1) * src1->info()->dimension(2)); + const TensorShape shape2d(src1->info()->dimension(0) / 4, + src1->info()->dimension(1) * src1->info()->dimension(2)); const size_t image_row_pitch = src1->info()->strides_in_bytes()[1]; - src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, + src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); } do @@ -385,7 +406,7 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con Window slice_b = slice; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) + if (!_slide_matrix_b) { slice_b = slice_matrix_b; } @@ -396,7 +417,7 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con add_2D_tensor_argument(idx, src0, slice); // RHS buffer or RHS OpenCL image (_export_to_cl_image == true) - if(_export_to_cl_image) + if (_export_to_cl_image) { _kernel.setArg(idx++, src1_image2d); } @@ -411,13 +432,6 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con // dst buffer add_2D_tensor_argument(idx, dst, slice); - // post op argument buffers - for(size_t i = 0; i < _num_post_op_args; ++i) - { - const auto post_op_arg = utils::cast::polymorphic_downcast(tensors.get_const_tensor(experimental::get_post_op_arg_type(i))); - add_2D_tensor_argument(idx, post_op_arg, slice); - } - // LHS stride_z _kernel.setArg(idx++, static_cast(src0->info()->strides_in_bytes()[lhs_idx_batch_size])); @@ -425,28 +439,23 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con _kernel.setArg(idx++, static_cast(src1->info()->strides_in_bytes()[rhs_idx_batch_size])); // Bias stride_z (if _add_bias == true) - if(_add_bias) + if (_add_bias) { - _kernel.setArg(idx++, static_cast(src2->info()->strides_in_bytes()[bia_idx_batch_size])); + _kernel.setArg(idx++, + static_cast(src2->info()->strides_in_bytes()[bia_idx_batch_size])); } // dst stride_z _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[out_idx_batch_size])); - // post op argument stride_z - for(size_t i = 0; i < _num_post_op_args; ++i) - { - const auto post_op_arg = utils::cast::polymorphic_downcast(tensors.get_const_tensor(experimental::get_post_op_arg_type(i))); - _kernel.setArg(idx++, static_cast(post_op_arg->info()->strides_in_bytes()[2])); - } // Cross-plan padding (if _reinterpret_input_as_3d = true) - if(_reinterpret_input_as_3d && _has_pad_y) + if (_reinterpret_input_as_3d && _has_pad_y) { _kernel.setArg(idx++, static_cast(total_cross_plane_pad_lhs)); } // Cross-plan padding (if reinterpret_output_as_3d = true) - if(_reinterpret_output_as_3d && _has_pad_y) + if (_reinterpret_output_as_3d && _has_pad_y) { _kernel.setArg(idx++, static_cast(total_cross_plane_pad_out)); } @@ -457,8 +466,7 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con _kernel.setArg(idx++, _k); enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git 
a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h index 00cdb299ce4f96a02eb74ec7729ce73a6eb493b5..e8fd78d476d228ef61908de692b676f64fd73069 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022 Arm Limited. + * Copyright (c) 2019-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,15 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H -#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H +#ifndef ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H +#define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" -#include "arm_compute/core/KernelDescriptors.h" - namespace arm_compute { namespace opencl @@ -74,35 +74,48 @@ public: * rhs_info.transpose: true,false * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices */ - void configure(const ClCompileContext &compile_context, - const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); + void configure(const ClCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _slide_matrix_b{ true }; - bool _reinterpret_input_as_3d{ false }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; - bool _add_bias{ false }; - bool _export_to_cl_image{ false }; - bool _has_pad_y{ false }; - signed int _m{ 1 }; - signed int _n{ 1 }; - signed int _k{ 1 }; - unsigned int _num_post_op_args{ 0 }; // (EXPERIMENTAL_POST_OPS) total number of post op arguments + bool _slide_matrix_b{true}; + bool _reinterpret_input_as_3d{false}; + bool _reinterpret_output_as_3d{false}; + bool _use_dummy_work_items{false}; + bool _add_bias{false}; + bool _export_to_cl_image{false}; + bool _has_pad_y{false}; + signed int _m{1}; + signed int _n{1}; + signed int _k{1}; }; } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* 
ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H */ +#endif // ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp index 734f8f9b4c934dc213ca5ab19ff295b6fa0f7afa..9a2a4890f3ed93ba0eccecc20a7d32ace1b9acf3 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp @@ -23,16 +23,17 @@ */ #include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/CL/CLUtils.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -56,23 +57,36 @@ constexpr int mmul_m0 = 4; constexpr int mmul_n0 = 4; constexpr int mmul_k0 = 4; -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), "The extension cl_arm_matrix_multiply is not supported on the target platform"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), + "The extension cl_arm_matrix_multiply is not supported on the target platform"); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, + "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, + "The number of dimensions for the RHS matrix must be <= 3"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_info.m0 < 1, "Only values greater than 0 are supported for m0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.n0 != 1 && rhs_info.n0 != 2 && rhs_info.n0 != 3 && rhs_info.n0 != 4 && rhs_info.n0 != 8 && rhs_info.n0 != 16, "Only 1,2,3,4,8, and 16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.n0 != 1 && rhs_info.n0 != 2 && rhs_info.n0 != 3 && rhs_info.n0 != 4 && + rhs_info.n0 != 8 && rhs_info.n0 != 16, + "Only 1,2,3,4,8, and 16 are supported for n0"); 
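// --- Illustrative sketch, not part of the patch ----------------------------------------------------
// One RHS block configuration that satisfies the MMUL-variant checks in this function: n0 drawn from
// {1,2,3,4,8,16}, k0 fixed to 1, h0 fixed to 4, interleaved and not transposed (see the checks that
// follow). Field names are those of GEMMRHSMatrixInfo as already used in this file; the values are
// just one valid combination, not a recommendation.
GEMMRHSMatrixInfo rhs_info{};
rhs_info.n0         = 4;     // any of 1,2,3,4,8,16
rhs_info.k0         = 1;     // only 1 is accepted on the cl_arm_matrix_multiply path
rhs_info.h0         = 4;     // only 4 is accepted
rhs_info.interleave = true;  // must be true
rhs_info.transpose  = false; // must be false
// ----------------------------------------------------------------------------------------------------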
ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.k0 != 1 || lhs_info.k0 != 1), "Only 1 is supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.h0 != 4), "Only 4 is supported for h0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.interleave != true, "Only true is supported for interleave with mmul extension enabled"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.transpose != false, "Only false is supported for transpose with mmul extension enabled"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.interleave != true, + "Only true is supported for interleave with mmul extension enabled"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.transpose != false, + "Only false is supported for transpose with mmul extension enabled"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported"); ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info)); @@ -87,7 +101,7 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k); // Validate the reinterpreted-as-3D-case - if(gemm_info.depth_output_gemm3d != 0) + if (gemm_info.depth_output_gemm3d != 0) { ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m); } @@ -97,9 +111,9 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons } // Validate the gemm-batched case - if(src1->num_dimensions() > 2) + if (src1->num_dimensions() > 2) { - if(gemm_info.depth_output_gemm3d != 0) + if (gemm_info.depth_output_gemm3d != 0) { ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(3) != src1->dimension(2)); } @@ -109,15 +123,16 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons } } - if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) + if (src2 != nullptr && !(helpers::float_ops::is_zero(beta))) { const unsigned int src2_dim0 = src2->dimension(0); const unsigned int src2_dim1 = src2->dimension(1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1); - if(gemm_info.broadcast_bias) + if (gemm_info.broadcast_bias) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), + "Incorrect dimension of bias matrix which is to be broadcasted"); } else { @@ -125,18 +140,20 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons } } - TensorShape tensor_shape1{ src1->tensor_shape() }; + TensorShape tensor_shape1{src1->tensor_shape()}; tensor_shape1.set(0, n); tensor_shape1.set(1, k); - const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); + const TensorInfo tensor_info_reshaped1 = + src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); + const TensorInfo tensor_info_dst = + dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); 
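// --- Illustrative note, not part of the patch -------------------------------------------------------
// compute_mm_shape() above derives the dst shape that validate_arguments() checks against and that
// configure() uses to auto-initialise an empty dst. Stated as an assumption about the usual dimension
// layout (innermost dimension first) rather than quoted from the patch, for an M x K by K x N product
// with B batches the expectation is roughly:
const unsigned int M = 64, N = 32, B = 2, D = 4;   // hypothetical problem sizes
const TensorShape  dst_plain(N, M, B);             // gemm_info.depth_output_gemm3d == 0
const TensorShape  dst_gemm3d(N, M / D, D, B);     // gemm_info.depth_output_gemm3d == D
// ----------------------------------------------------------------------------------------------------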
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); } @@ -144,7 +161,11 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, +std::pair validate_and_configure_window(ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) { @@ -152,11 +173,12 @@ std::pair validate_and_configure_window(ITensorInfo *src0, ITens bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); TensorInfo tmp_info(*dst); - if(reinterpret_output_as_3d) + if (reinterpret_output_as_3d) { // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -204,19 +226,26 @@ ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::ClGemmMatrixMultiplyReshapedOnlyR _type = CLKernelType::GEMM; } -void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, +void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - auto padding_info = get_padding_info({ src0, src1, src2, dst }); + auto padding_info = get_padding_info({src0, src1, src2, dst}); _add_bias = src2 != nullptr; _export_to_cl_image = rhs_info.export_to_cl_image; @@ -236,7 +265,8 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileCon // Create build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); - build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); + build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), + "-DALPHA=" + float_to_string_with_full_precision(alpha)); build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS"); @@ -249,7 +279,8 @@ void 
ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileCon build_opts.add_option("-DMMUL_M0=" + support::cpp11::to_string(mmul_m0)); build_opts.add_option("-DMMUL_N0=" + support::cpp11::to_string(mmul_n0)); build_opts.add_option("-DMMUL_K0=" + support::cpp11::to_string(mmul_k0)); - build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); + build_opts.add_option("-DACTIVATION_TYPE=" + + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); @@ -283,37 +314,44 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileCon ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, +Status ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(), src2 != nullptr ? 
src2->clone().get() : nullptr, - dst->clone().get(), - lhs_info, - rhs_info, - gemm_info) - .first); + dst->clone().get(), lhs_info, rhs_info, gemm_info) + .first); return Status{}; } -void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto src2 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto src2 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); - if(src1->info()->num_dimensions() < 3) + if (src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); @@ -321,12 +359,14 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, cl::Image2D src1_image2d; - if(_export_to_cl_image) + if (_export_to_cl_image) { - const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2)); + const TensorShape shape2d(src1->info()->dimension(0) / 4, + src1->info()->dimension(1) * src1->info()->dimension(2)); const size_t image_row_pitch = src1->info()->strides_in_bytes()[1]; - src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, + src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); } Window slice = window.first_slice_window_3D(); @@ -336,14 +376,14 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, unsigned int idx = 0; add_3d_tensor_nhw_argument(idx, src0); - if(_export_to_cl_image) + if (_export_to_cl_image) { _kernel.setArg(idx++, src1_image2d); } add_3d_tensor_nhw_argument(idx, src1); // Bias buffer (_add_bias == true) - if(_add_bias) + if (_add_bias) { add_3d_tensor_nhw_argument(idx, src2); } @@ -358,8 +398,7 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, // LWS_x should be multiple of 16 at least. 
(32, 2) has been chosen to have more work-items on a single core // LWS also enforces the order of execution of the workitems which improves cache utilization enqueue(queue, *this, slice, cl::NDRange(32, 2), false); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h index 59612fcf5db4113bd6148ef213715f649585e4fe..86d3012f6edc6a8c12c4552d8ac5d1dae9333ba3 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMUL_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -59,7 +60,13 @@ public: * rhs_info.transpose: false * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, + void configure(const ClCompileContext &compile_context, + ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float alpha, + float beta, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); @@ -69,7 +76,13 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); @@ -77,11 +90,11 @@ public: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _add_bias{ false }; - bool _export_to_cl_image{ false }; - signed int _m{ 1 }; - signed int _n{ 1 }; - signed int _k{ 1 }; + bool _add_bias{false}; + bool _export_to_cl_image{false}; + signed int _m{1}; + signed int _n{1}; + signed int _k{1}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp index bf4b664b6e71279191e6c21ac65c62746966c3ba..eea2a169a33e5273f2b8b20019255be18d49103c 100644 --- a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -46,13 +47,17 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + bool reinterpret_input_as_3d) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); 
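The ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h hunk above declares the kernel's static validate() entry point, which can be used to probe a configuration before anything is allocated. A hedged sketch of such a probe follows; query_mmul_gemm_support is an illustrative wrapper (not library API), the RHS is assumed to be already reshaped as in the earlier sketch, and the CL backend must be initialised beforehand so the device extension check in validate_arguments() can run:

// Sketch only: thin wrapper around the static validate() declared in the
// header diffed above. GEMMKernelInfo's default constructor and its m/n/k
// members are assumed; the internal src/ header path is the one shown here.
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Types.h"
#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h"

arm_compute::Status query_mmul_gemm_support(const arm_compute::ITensorInfo         &src0,
                                            const arm_compute::ITensorInfo         &rhs_reshaped,
                                            const arm_compute::ITensorInfo         &dst,
                                            const arm_compute::GEMMLHSMatrixInfo   &lhs_info,
                                            const arm_compute::GEMMRHSMatrixInfo   &rhs_info,
                                            unsigned int m, unsigned int n, unsigned int k)
{
    using namespace arm_compute;

    GEMMKernelInfo gemm_info{};
    gemm_info.m = m; // original (un-reshaped) GEMM dimensions
    gemm_info.n = n;
    gemm_info.k = k;

    // No bias (src2 == nullptr) and beta == 0, so the bias checks above are skipped.
    return opencl::kernels::ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(
        &src0, &rhs_reshaped, nullptr, &dst, 1.0f /* alpha */, 0.0f /* beta */, lhs_info, rhs_info, gemm_info);
}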
ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 == 0); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 == 0); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.v0 == 0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), + "Only 2,3,4,8,16 are supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8); ARM_COMPUTE_RETURN_ERROR_ON((lhs_info.m0 > 4 && lhs_info.m0 < 8) && lhs_info.transpose); @@ -60,10 +65,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), - misc::shape_calculator::compute_lhs_reshaped_shape(*src, lhs_info, reinterpret_input_as_3d)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), + misc::shape_calculator::compute_lhs_reshaped_shape(*src, lhs_info, reinterpret_input_as_3d)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); } @@ -71,14 +77,15 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const return Status{}; } -Window configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) +Window +configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) { const unsigned int num_elems_processed_per_iteration_x = lhs_info.k0; const unsigned int num_elems_processed_per_iteration_y = lhs_info.m0; TensorInfo tmp_info(*src); - if(reinterpret_input_as_3d) + if (reinterpret_input_as_3d) { // Since the src tensor has to be reinterpreted as 3D and the execute window is based on a 2D interleave, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -88,10 +95,12 @@ Window configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixI } // dst auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(*src, lhs_info, reinterpret_input_as_3d))); + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape( + *src, lhs_info, reinterpret_input_as_3d))); // Configure window - Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + Window win = + calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); // Collapse along the Z direction // This collapse needs to be here in order to tune the Z dimension of LWS @@ -106,14 +115,18 @@ ClGemmReshapeLhsMatrixKernel::ClGemmReshapeLhsMatrixKernel() _type = CLKernelType::ELEMENTWISE; } -void ClGemmReshapeLhsMatrixKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) +void ClGemmReshapeLhsMatrixKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + bool reinterpret_input_as_3d) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // 
Perform validate step ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, lhs_info, reinterpret_input_as_3d)); - auto padding_info = get_padding_info({ src }); + auto padding_info = get_padding_info({src}); const unsigned int src_w = src->dimension(0); const unsigned int m = reinterpret_input_as_3d ? src->dimension(1) * src->dimension(2) : src->dimension(1); @@ -168,7 +181,10 @@ void ClGemmReshapeLhsMatrixKernel::configure(const CLCompileContext &compile_con ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmReshapeLhsMatrixKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) +Status ClGemmReshapeLhsMatrixKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + bool reinterpret_input_as_3d) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, lhs_info, reinterpret_input_as_3d)); return Status{}; @@ -179,8 +195,9 @@ void ClGemmReshapeLhsMatrixKernel::run_op(ITensorPack &tensors, const Window &wi ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -192,8 +209,7 @@ void ClGemmReshapeLhsMatrixKernel::run_op(ITensorPack &tensors, const Window &wi add_3d_tensor_nhw_argument(idx, src); add_3d_tensor_nhw_argument(idx, dst); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h index db88e0d73553876aa4abf5340695a7c097836148..8e84e8ad8e13e17350c61f3a9bb3c0dc5ab27b7b 100644 --- a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h +++ b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h @@ -57,14 +57,21 @@ public: * lhs_info.interleave: true, false * @param[in] reinterpret_src_as_3d (Optional) True if the src has to be reinterpreted as 3D tensor */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_src_as_3d = false); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + bool reinterpret_src_as_3d = false); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmReshapeLhsMatrixKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_src_as_3d); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + bool reinterpret_src_as_3d); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; @@ -72,4 +79,4 @@ public: } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H */ \ No newline at end of file 
+#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp index b3a03880edbd5ecfc0761d20d6d4da7081fdc870..b9ce3873c7331a5d3da3e1258fdac24ab4400e5a 100644 --- a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -52,8 +53,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 == 0); ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 == 0); ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.h0 == 0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && (rhs_info.k0 != 1) && (rhs_info.k0 != 3)), "Only 1,2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), + "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && (rhs_info.k0 != 1) && (rhs_info.k0 != 3)), + "Only 1,2,3,4,8,16 are supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16); ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16); ARM_COMPUTE_RETURN_ERROR_ON((rhs_info.k0 == 1) && (rhs_info.transpose)); @@ -61,15 +64,17 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - if(rhs_info.export_to_cl_image) + if (rhs_info.export_to_cl_image) { - const TensorInfo tensor_reshaped_info(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info), 1, src->data_type()); + const TensorInfo tensor_reshaped_info(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info), 1, + src->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info)); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); } @@ -77,23 +82,27 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) +std::pair +validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) { const unsigned int num_elems_processed_per_iteration_x = rhs_info.n0; const unsigned int num_elems_processed_per_iteration_y = rhs_info.k0; bool window_changed = false; // dst auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info))); + auto_init_if_empty( + *dst, 
src->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info))); // Configure window - Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + Window win = + calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - AccessWindowRectangle src_access(src, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + AccessWindowRectangle src_access(src, 0, 0, num_elems_processed_per_iteration_x, + num_elems_processed_per_iteration_y); window_changed = update_window_and_padding(win, src_access); - if(rhs_info.export_to_cl_image) + if (rhs_info.export_to_cl_image) { gemm::update_padding_for_cl_image(dst); } @@ -102,7 +111,8 @@ std::pair validate_and_configure_window(ITensorInfo *src, ITenso // This collapse needs to be here in order to tune the Z dimension of LWS Window collapsed = win.collapse(win, Window::DimZ); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, collapsed); } } // namespace @@ -112,7 +122,10 @@ ClGemmReshapeRhsMatrixKernel::ClGemmReshapeRhsMatrixKernel() _type = CLKernelType::ELEMENTWISE; } -void ClGemmReshapeRhsMatrixKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) +void ClGemmReshapeRhsMatrixKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const GEMMRHSMatrixInfo &rhs_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -143,7 +156,9 @@ void ClGemmReshapeRhsMatrixKernel::configure(const CLCompileContext &compile_con _kernel.setArg(idx++, rhs_info.h0); } -Status ClGemmReshapeRhsMatrixKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) +Status ClGemmReshapeRhsMatrixKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const GEMMRHSMatrixInfo &rhs_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, rhs_info)); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), rhs_info).first); @@ -156,8 +171,9 @@ void ClGemmReshapeRhsMatrixKernel::run_op(ITensorPack &tensors, const Window &wi ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -169,9 +185,8 @@ void ClGemmReshapeRhsMatrixKernel::run_op(ITensorPack &tensors, const Window &wi add_3d_tensor_nhw_argument(idx, src); add_3d_tensor_nhw_argument(idx, dst); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h 
b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h index 31eaa46e023cf33bb931b0102683876de5d07288..7203d574fb6a5b2469322e0ef53f861cad53c22b 100644 --- a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h +++ b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h @@ -66,7 +66,10 @@ public: * rhs_info.transpose: true, false * rhs_info.interleave: true, false */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const GEMMRHSMatrixInfo &rhs_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmReshapeRhsMatrixKernel::configure() @@ -81,4 +84,4 @@ public: } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp b/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp index 719201d1fe239c4ab65ba0832d11e6cb1e05b4e4..2e1cefc6e7a6f230590bf22751f140b49a408c14 100644 --- a/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp +++ b/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" - #include "support/StringSupport.h" namespace arm_compute @@ -52,7 +52,7 @@ Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, co ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) + height_offset > dst->dimension(Window::DimY)); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != dst->dimension(0)); - for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = 2; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i)); } @@ -62,8 +62,7 @@ Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, co } } // namespace -ClHeightConcatenateKernel::ClHeightConcatenateKernel() - : _height_offset(0) +ClHeightConcatenateKernel::ClHeightConcatenateKernel() : _height_offset(0) { _type = CLKernelType::ELEMENTWISE; } @@ -74,12 +73,15 @@ Status ClHeightConcatenateKernel::validate(const ITensorInfo *src, unsigned int return Status{}; } -void ClHeightConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst) +void ClHeightConcatenateKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + unsigned int height_offset, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, height_offset, dst)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); _height_offset = height_offset; @@ -90,9 +92,10 @@ void ClHeightConcatenateKernel::configure(const CLCompileContext &compile_contex build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); build_opts.add_option("-DHEIGHT_OFFSET=" + support::cpp11::to_string(_height_offset)); - 
build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); - if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) + if (is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) { const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); @@ -125,8 +128,9 @@ void ClHeightConcatenateKernel::run_op(ITensorPack &tensors, const Window &windo ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); unsigned int idx = 0; add_4D_tensor_argument(idx, src, window); diff --git a/src/gpu/cl/kernels/ClHeightConcatenateKernel.h b/src/gpu/cl/kernels/ClHeightConcatenateKernel.h index d3c077fc224176c2b2e7f7e0d281b7bada38a74a..5a391a1212793b07f1fc7382e7dbfe7c5d324732 100644 --- a/src/gpu/cl/kernels/ClHeightConcatenateKernel.h +++ b/src/gpu/cl/kernels/ClHeightConcatenateKernel.h @@ -50,7 +50,8 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src. * */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst); + void + configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClHeightConcatenateKernel::configure() @@ -64,7 +65,7 @@ public: private: unsigned int _height_offset; - int32_t _depth{ 0 }; + int32_t _depth{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClIm2ColKernel.cpp b/src/gpu/cl/kernels/ClIm2ColKernel.cpp index e89084719929214573a4751b38316288e400234c..ef7a52828f16c9d15e2dcec98c910f68889d3e16 100644 --- a/src/gpu/cl/kernels/ClIm2ColKernel.cpp +++ b/src/gpu/cl/kernels/ClIm2ColKernel.cpp @@ -29,9 +29,10 @@ #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -60,13 +61,19 @@ struct Im2ColConfiguration bool is_padding_required_nchw{}; }; -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, - unsigned int num_groups) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation, + unsigned int num_groups) { const unsigned int channel_idx = 
get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(src->data_type()) && has_bias); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1)); @@ -82,9 +89,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const const unsigned total_height = src->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom(); ARM_COMPUTE_RETURN_ERROR_ON((total_width < kernel_dims.width) || (total_height < kernel_dims.height)); - if(dst->total_size() > 0) + if (dst->total_size() > 0) { - const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups)); + const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape( + compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); @@ -93,13 +101,21 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, - unsigned int num_elems_processed_per_iteration, bool is_padding_required_nchw, unsigned int num_groups) +std::pair validate_and_configure_window(ITensorInfo *src, + ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation, + unsigned int num_elems_processed_per_iteration, + bool is_padding_required_nchw, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // Output tensor auto initialization if not yet initialized - TensorShape expected_output_shape = compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups); + TensorShape expected_output_shape = + compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups); auto_init_if_empty(*dst, src->clone()->set_tensor_shape(expected_output_shape)); @@ -113,22 +129,22 @@ std::pair validate_and_configure_window(ITensorInfo *src, ITenso bool window_changed = false; Window win; - if(data_layout == DataLayout::NHWC) + if (data_layout == DataLayout::NHWC) { win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration)); } else { - if(is_padding_required_nchw) + if (is_padding_required_nchw) { - const BorderSize border(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left()); - win = calculate_max_window(*src, - Steps(num_elems_processed_per_iteration * conv_info.stride().first, conv_info.stride().second)); - AccessWindowStatic input_access(src, - -border.left, - -border.top, - ceil_to_multiple(input_width + border.right, kernel_dims.width * num_elems_processed_per_iteration), - input_height + border.bottom); + const BorderSize 
border(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), + conv_info.pad_left()); + win = calculate_max_window( + *src, Steps(num_elems_processed_per_iteration * conv_info.stride().first, conv_info.stride().second)); + AccessWindowStatic input_access( + src, -border.left, -border.top, + ceil_to_multiple(input_width + border.right, kernel_dims.width * num_elems_processed_per_iteration), + input_height + border.bottom); window_changed = window_changed || update_window_and_padding(win, input_access); } else @@ -142,11 +158,17 @@ std::pair validate_and_configure_window(ITensorInfo *src, ITenso // set the Z dimension's step same size as the whole dimension so that one can't split across the Z dimension win.set_dimension_step(Window::DimZ, win[Window::DimZ].end() - win[Window::DimZ].start()); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } -Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, unsigned int num_groups) +Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation, + unsigned int num_groups) { const DataLayout data_layout = src->data_layout(); const DataType data_type = src->data_type(); @@ -157,7 +179,8 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D const unsigned int input_height = src->dimension(height_idx); const unsigned int input_channel = src->dimension(channel_idx); - const std::pair convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation); + const std::pair convolved_dims = + scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation); // Im2Col configuration std::string kernel_name = "im2col_generic_"; @@ -184,21 +207,22 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x())); build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y())); build_opts.add_option_if(num_groups > 1, "-DNUM_GROUPS=" + support::cpp11::to_string(num_groups)); - build_opts.add_option_if_else(is_data_type_quantized(data_type), "-DPAD_VALUE=" + support::cpp11::to_string(qinfo.offset), "-DPAD_VALUE=0"); + build_opts.add_option_if_else(is_data_type_quantized(data_type), + "-DPAD_VALUE=" + support::cpp11::to_string(qinfo.offset), "-DPAD_VALUE=0"); build_opts.add_option_if(has_bias, "-DHAS_BIAS"); - if(data_layout == DataLayout::NHWC) + if (data_layout == DataLayout::NHWC) { num_elems_processed_per_iteration = std::min(2U, input_channel); is_padding_required_nchw = false; // Only the 3x3 and 9x9 cases are optimized for NHWC - if(kernel_dims == Size2D(3U, 3U)) + if (kernel_dims == Size2D(3U, 3U)) { kernel_name = "im2col3x3_"; build_opts.add_option("-DIM2COL_3X3"); } - else if(kernel_dims == Size2D(9U, 9U)) + else if (kernel_dims == Size2D(9U, 9U)) { kernel_name = "im2col9x9_"; build_opts.add_option("-DIM2COL_9X9"); @@ -219,17 +243,17 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D } else { - if(dilation == Size2D(1U, 1U)) + if 
(dilation == Size2D(1U, 1U)) { const bool squared_im2col = kernel_dims.width == kernel_dims.height; - if(squared_im2col) + if (squared_im2col) { // Check if we can run an optimized im2col for NCHW - switch(kernel_dims.width) + switch (kernel_dims.width) { case 1: // Optimized im2col1x1 if stride_x = 1 and conv_info.has_padding() = false - if(conv_info.stride().first == 1 && !conv_info.has_padding()) + if (conv_info.stride().first == 1 && !conv_info.has_padding()) { kernel_name = "im2col1x1_stridex1_"; num_elems_processed_per_iteration = 4; @@ -248,7 +272,7 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D break; case 11: // Optimized im2col11x11 if pad_x = pad_y = 0 - if(!conv_info.has_padding()) + if (!conv_info.has_padding()) { kernel_name = "im2col11x11_padx0_pady0_"; num_elems_processed_per_iteration = 1; @@ -262,7 +286,7 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D break; } } - else if(kernel_dims.width > 1 && !conv_info.has_padding()) + else if (kernel_dims.width > 1 && !conv_info.has_padding()) { kernel_name = "im2col_generic_padx0_pady0_"; num_elems_processed_per_iteration = 1; @@ -297,19 +321,29 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D } // namespace ClIm2ColKernel::ClIm2ColKernel() - : _data_layout(DataLayout::UNKNOWN), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info(), _num_groups() + : _data_layout(DataLayout::UNKNOWN), + _convolved_dims(), + _num_elems_processed_per_iteration(1), + _kernel_dims(), + _conv_info(), + _num_groups() { _type = CLKernelType::ELEMENTWISE; } -void ClIm2ColKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, - const Size2D &dilation, - unsigned int num_groups) +void ClIm2ColKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); _data_layout = src->data_layout(); const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); @@ -320,19 +354,22 @@ void ClIm2ColKernel::configure(const ClCompileContext &compile_context, ITensorI // Select and configure the optimal OpenCL kernel to run. 
// This function returns the OpenCL kernel's name, the arguments to pass at compile time, the number of elements processed per iteration // and the padding requirement flag - Im2ColConfiguration im2col_config = configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups); + Im2ColConfiguration im2col_config = + configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups); // Create kernel _kernel = create_kernel(compile_context, im2col_config.kernel_name, im2col_config.build_options); - _convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation); + _convolved_dims = + scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation); _num_elems_processed_per_iteration = im2col_config.num_elems_processed_per_iteration; _kernel_dims = kernel_dims; // Only needed by the Tuner _conv_info = conv_info; // Only needed by the Tuner _num_groups = num_groups; // Configure kernel window - auto win_config = validate_and_configure_window(src, dst, kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration, + auto win_config = validate_and_configure_window(src, dst, kernel_dims, conv_info, has_bias, dilation, + im2col_config.num_elems_processed_per_iteration, im2col_config.is_padding_required_nchw, num_groups); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); IClKernel::configure_internal(win_config.second); @@ -353,14 +390,22 @@ void ClIm2ColKernel::configure(const ClCompileContext &compile_context, ITensorI ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::NHWC && has_padding_changed(padding_info)); } -Status ClIm2ColKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, - unsigned int num_groups) +Status ClIm2ColKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation, + unsigned int num_groups) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups)); - Im2ColConfiguration im2col_config = configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration, + Im2ColConfiguration im2col_config = + configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), kernel_dims, + conv_info, has_bias, dilation, + im2col_config.num_elems_processed_per_iteration, im2col_config.is_padding_required_nchw, num_groups) - .first); + .first); return Status{}; } @@ -388,7 +433,7 @@ void ClIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm Window slice_in = first_slice_3d; Window slice_out = window_output.first_slice_window_2D(); - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { const Window tmp_win = window.collapse_if_possible(ICLKernel::window(), 3); const int num_batches = tmp_win[3].end(); @@ -398,7 +443,10 @@ void ClIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm } else { - slice.set(0, Window::Dimension(0, 
static_cast(ceil_to_multiple(_convolved_dims.first, _num_elems_processed_per_iteration)), _num_elems_processed_per_iteration)); + slice.set(0, + Window::Dimension( + 0, static_cast(ceil_to_multiple(_convolved_dims.first, _num_elems_processed_per_iteration)), + _num_elems_processed_per_iteration)); slice.set(1, Window::Dimension(0, static_cast(_convolved_dims.second), 1)); // Note: In case of NCHW the 3rd dimension is already set collapsing the input window } @@ -414,14 +462,16 @@ void ClIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - unsigned int idx = num_arguments_per_3D_tensor() + (_num_groups == 1 ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor()); + unsigned int idx = num_arguments_per_3D_tensor() + + (_num_groups == 1 ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor()); _kernel.setArg(idx++, static_cast(src->info()->strides_in_bytes()[3])); - _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[((_num_groups == 1) ? 2 : 3)])); + _kernel.setArg(idx++, + static_cast(dst->info()->strides_in_bytes()[((_num_groups == 1) ? 2 : 3)])); do { unsigned int idx = 0; add_3D_tensor_argument(idx, src, slice_in); - if(_num_groups == 1) + if (_num_groups == 1) { add_2D_tensor_argument(idx, dst, slice_out); } @@ -430,8 +480,8 @@ void ClIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm add_3D_tensor_argument(idx, dst, slice_out); } enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice) && window_output.slide_window_slice_2D(slice_out) && window_collapsed.slide_window_slice_3D(slice_in)); + } while (window_collapsed.slide_window_slice_3D(slice) && window_output.slide_window_slice_2D(slice_out) && + window_collapsed.slide_window_slice_3D(slice_in)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClIm2ColKernel.h b/src/gpu/cl/kernels/ClIm2ColKernel.h index a637ad215db096d9b686c602144ebbaffa59946b..c8cd5b328d9472f3f43ca9b4ac90b78d67f723f4 100644 --- a/src/gpu/cl/kernels/ClIm2ColKernel.h +++ b/src/gpu/cl/kernels/ClIm2ColKernel.h @@ -26,6 +26,7 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Size2D.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -77,28 +78,38 @@ public: * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. 
num_groups != 1 is only supported for NCHW data layout */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, - const Size2D &dilation = Size2D(1U, 1U), - unsigned int num_groups = 1); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation = Size2D(1U, 1U), + unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration * * Similar to ClIm2ColKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U), - unsigned int num_groups = 1); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation = Size2D(1U, 1U), + unsigned int num_groups = 1); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; public: - DataLayout _data_layout; + DataLayout _data_layout; std::pair _convolved_dims; - unsigned int _num_elems_processed_per_iteration; - Size2D _kernel_dims; - PadStrideInfo _conv_info; - unsigned int _num_groups; + unsigned int _num_elems_processed_per_iteration; + Size2D _kernel_dims; + PadStrideInfo _conv_info; + unsigned int _num_groups; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp index d291fad76c655fd5644b965842b592ddc6d8c79a..8c493d08c6442449fc0499c778a0953f5454d3b4 100644 --- a/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp +++ b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -43,26 +44,29 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != src->dimension(0), "Weights feature map dimension should match the respective src's one"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != src->dimension(0), + "Weights feature map dimension should match the respective src's one"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, "M0 can only be 
greater than 0 and less than or equal to 8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, + "M0 can only be greater than 0 and less than or equal to 8"); // Checks performed when dst is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), - misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), - src->data_layout(), - weights->tensor_shape(), - conv_info, - desc)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), + misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), src->data_layout(), + weights->tensor_shape(), conv_info, desc)); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); } @@ -75,8 +79,12 @@ ClIndirectConv2dAddressPrecalculationKernel::ClIndirectConv2dAddressPrecalculati _type = CLKernelType::ELEMENTWISE; } -void ClIndirectConv2dAddressPrecalculationKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, - const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc) +void ClIndirectConv2dAddressPrecalculationKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, dst, conv_info, desc)); @@ -85,11 +93,8 @@ void ClIndirectConv2dAddressPrecalculationKernel::configure(const CLCompileConte constexpr unsigned int height_idx = 2; // Get dst shape - TensorShape output_shape = misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), - src->data_layout(), - weights->tensor_shape(), - conv_info, - desc); + TensorShape output_shape = misc::shape_calculator::compute_indirect_buffer_shape( + src->tensor_shape(), src->data_layout(), weights->tensor_shape(), conv_info, desc); TensorShape output_conv_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); @@ -136,14 +141,19 @@ void ClIndirectConv2dAddressPrecalculationKernel::configure(const CLCompileConte // Since this kernel should be called only once, we do not need to set the config_id for tuning } -Status ClIndirectConv2dAddressPrecalculationKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc) +Status ClIndirectConv2dAddressPrecalculationKernel::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, dst, conv_info, desc)); return Status{}; } -void ClIndirectConv2dAddressPrecalculationKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClIndirectConv2dAddressPrecalculationKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); diff --git a/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h index ff7f4be14775d2d8f58927611d96a39f43bbe969..b565609c6a99c812a922027001c3c93958a996c9 100644 --- 
a/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h +++ b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h @@ -60,16 +60,23 @@ public: * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] desc Direct convolution descriptor used to build the NHWC direct/indirect convolution kernel. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, - const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const DirectConvComputeKernelInfo &desc); /** Static function to check if given info will lead to a valid configuration * * Similar to ClIndirectConv2dAddressPreCalculationKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const DirectConvComputeKernelInfo &desc); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp index a337eb50fda55e586b47b9106ffee1809cfbe511..3510b6970cb6555871d8b4cb4fd27679534fb4a4 100644 --- a/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp +++ b/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp @@ -23,13 +23,14 @@ */ #include "src/gpu/cl/kernels/ClIndirectConv2dKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLUtils.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -46,8 +47,14 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *indirect_buffer, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *indirect_buffer, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_UNUSED(act_info); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); @@ -55,37 +62,38 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indirect_buffer, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(indirect_buffer->tensor_shape(), - misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), - 
src->data_layout(), - weights->tensor_shape(), - conv_info, - desc)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + indirect_buffer->tensor_shape(), + misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), src->data_layout(), + weights->tensor_shape(), conv_info, desc)); constexpr int channel_idx = 0; constexpr int batch_idx = 3; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), "Weights feature map dimension should match the respective src's one"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), + "Weights feature map dimension should match the respective src's one"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, "M0 can only be greater than 0 and less than or equal to 8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, + "M0 can only be greater than 0 and less than or equal to 8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && desc.n0 != 16, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && + desc.n0 != 16, "N0 can only be: 1, 2, 3, 4, 8, and 16"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && + desc.k0 != 16, "K0 can only be: 1, 2, 3, 4, 8, and 16"); - if(desc.export_weights_to_cl_image) + if (desc.export_weights_to_cl_image) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16, - "K0 can only be: 4, 8, and 16"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16, "K0 can only be: 4, 8, and 16"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_to_cl_image(weights), "Export to CLImage is not supported for this weight configuration"); } - if(biases != nullptr) + if (biases != nullptr) { - if(is_data_type_quantized_asymmetric(src->data_type())) + if (is_data_type_quantized_asymmetric(src->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -95,15 +103,14 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co } ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(channel_idx) != weights->dimension(batch_idx), "Biases size and number of dst feature maps should match"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, - "Biases should be one dimensional"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, "Biases should be one dimensional"); } // Checks performed when dst is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), - misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); } @@ -116,13 +123,21 @@ ClIndirectConv2dKernel::ClIndirectConv2dKernel() _type = CLKernelType::DIRECT; } -void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo 
*biases, ITensorInfo *indirect_buffer, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc) +void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *indirect_buffer, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, indirect_buffer, dst); // Perform validation - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, indirect_buffer, dst, conv_info, act_info, desc)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(src, weights, biases, indirect_buffer, dst, conv_info, act_info, desc)); constexpr unsigned int channel_idx = 0; constexpr unsigned int width_idx = 1; @@ -137,10 +152,7 @@ void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); // Output auto inizialitation if not yet initialized - auto_init_if_empty(*dst, output_shape, - 1, - src->data_type(), - src->quantization_info()); + auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info()); // Configure kernel window Window win; @@ -164,7 +176,7 @@ void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, _export_to_cl_image = desc.export_weights_to_cl_image; // Update the padding for the weights tensor if we can export to cl_image - if(_export_to_cl_image) + if (_export_to_cl_image) { gemm::update_padding_for_cl_image(weights); } @@ -173,11 +185,12 @@ void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, // When M0 is 5, 6, and 7, we use vload8 to fetch the data from the buffer const unsigned int load_indirect_buf_size = m0 > 4 ? 
8 : m0; const unsigned int indirect_buf_width = indirect_buffer->tensor_shape()[0]; - const unsigned int round_up_width = ((indirect_buf_width + load_indirect_buf_size - 1) / load_indirect_buf_size) * load_indirect_buf_size; - const unsigned int padding = round_up_width - indirect_buf_width; + const unsigned int round_up_width = + ((indirect_buf_width + load_indirect_buf_size - 1) / load_indirect_buf_size) * load_indirect_buf_size; + const unsigned int padding = round_up_width - indirect_buf_width; indirect_buffer->extend_padding(PaddingSize(0, indirect_buffer->padding().right + padding, 0, 0)); - if(biases != nullptr) + if (biases != nullptr) { build_options.add_option(std::string("-DHAS_BIAS")); build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type()))); @@ -186,9 +199,10 @@ void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, // Conditions of -cl-fast-relaxed-math causing accuracy issues can be traced from COMPMID-5324 const auto act_function = act_info.activation(); - if((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) - && (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - && (data_type == DataType::F32 || data_type == DataType::F16)) + if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) && + (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || + act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) && + (data_type == DataType::F32 || data_type == DataType::F16)) { // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations @@ -224,7 +238,7 @@ void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, // A macro guard to compile ONLY the kernel of interest build_options.add_option("-D" + upper_string(kernel_name.str())); - if(compile_context.get_ddk_version() >= 30) + if (compile_context.get_ddk_version() >= 30) { build_options.add_option("-fregister-allocation=64"); } @@ -253,10 +267,17 @@ void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, _config_id += support::cpp11::to_string(dst->dimension(channel_idx)); } -Status ClIndirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *indirect_buffer, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc) +Status ClIndirectConv2dKernel::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *indirect_buffer, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, indirect_buffer, dst, conv_info, act_info, desc)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(src, weights, biases, indirect_buffer, dst, conv_info, act_info, desc)); return Status{}; } @@ -268,35 +289,42 @@ void ClIndirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, // Get initial windows Window slice = window.first_slice_window_3D(); - const auto src = 
utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto weights = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto biases = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - const auto indirect_buffer = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_3)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto weights = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto biases = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + const auto indirect_buffer = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_3)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); cl::Image2D weights_cl_image; - if(_export_to_cl_image) + if (_export_to_cl_image) { - const size_t image_w = weights->info()->dimension(0) / 4; - const size_t image_h = weights->info()->dimension(1) * weights->info()->dimension(2) * weights->info()->dimension(3); + const size_t image_w = weights->info()->dimension(0) / 4; + const size_t image_h = + weights->info()->dimension(1) * weights->info()->dimension(2) * weights->info()->dimension(3); const TensorShape shape2d(image_w, image_h); const size_t image_row_pitch = weights->info()->strides_in_bytes()[1]; // Export cl_buffer to cl_image - weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), weights->cl_buffer(), shape2d, weights->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + weights_cl_image = + create_image2d_from_buffer(CLKernelLibrary::get().context(), weights->cl_buffer(), shape2d, + weights->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); } unsigned int idx = 0; add_4d_tensor_nhwc_argument(idx, src); add_4d_tensor_nhwc_argument(idx, indirect_buffer); add_4d_tensor_nhwc_argument(idx, dst); - if(_export_to_cl_image) + if (_export_to_cl_image) { _kernel.setArg(idx++, weights_cl_image); } add_4d_tensor_nhwc_argument(idx, weights); - if(biases != nullptr) + if (biases != nullptr) { add_1D_tensor_argument(idx, biases, slice); } diff --git a/src/gpu/cl/kernels/ClIndirectConv2dKernel.h b/src/gpu/cl/kernels/ClIndirectConv2dKernel.h index b6c7b35fa48c4849887323369174751ceb57ff0d..04166d417e830b396f46ac0f9a198cb7793b21cd 100644 --- a/src/gpu/cl/kernels/ClIndirectConv2dKernel.h +++ b/src/gpu/cl/kernels/ClIndirectConv2dKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -60,22 +61,35 @@ public: * @param[in] act_info Contains activaton information described in @ref ActivationLayerInfo. * @param[in] desc Direct convolution descriptor used to build the NHWC indirect convolution kernel. 
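// Worked example (hypothetical numbers, not part of this patch) for the indirect-buffer
// padding logic reformatted in the ClIndirectConv2dKernel::configure() hunk above: when
// M0 > 4 the kernel fetches the indirect buffer with vload8, so the buffer width is
// rounded up to the next multiple of the load size and the difference is added to the
// tensor's right padding.
const unsigned int m0_example          = 6;                               // assumed M0
const unsigned int load_size           = m0_example > 4 ? 8 : m0_example; // vload8 path -> 8
const unsigned int buf_width           = 27;                              // assumed indirect buffer width
const unsigned int rounded_width       = ((buf_width + load_size - 1) / load_size) * load_size; // -> 32
const unsigned int extra_right_padding = rounded_width - buf_width;       // -> 5 extra elements of right padding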
*/ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *off, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *off, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc); /** Static function to check if given info will lead to a valid configuration * * Similar to ClIndirectConv2dKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *off, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc); + static Status validate(const ITensorInfo *src, + const ITensorInfo *off, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; public: - bool _export_to_cl_image{ false }; + bool _export_to_cl_image{false}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp index f7fdbe2c238bc100ed48c29aaba171dcf5fb6372..0bb6b0c083826d73f583e396fd59c3d14cf683f7 100644 --- a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp +++ b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp @@ -23,11 +23,12 @@ */ #include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/QuantizationInfo.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" @@ -37,9 +38,7 @@ #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/gpu/cl/ClCompileContext.h" - -#include "arm_compute/core/QuantizationInfo.h" - +#include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h" #include "support/Cast.h" #include "support/StringSupport.h" @@ -62,69 +61,62 @@ Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info) // Validate M0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0"); - if(adj_lhs) + if (adj_lhs) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16), "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16), + "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); } // Validate N0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16), "Only 1,2,3,4,8,16 are supported for N0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 
16), + "Only 1,2,3,4,8,16 are supported for N0"); // Validate K0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 < 1, "Only positive integers are supported for K0"); - if(!adj_lhs || adj_rhs) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16), "Only 1,2,3,4,8,16 are supported for K0"); - } - - return Status{}; -} - -Status validate_input_shapes(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const MatMulKernelInfo &matmul_kernel_info) -{ - const size_t lhs_k = matmul_kernel_info.adj_lhs ? lhs_shape.y() : lhs_shape.x(); - const size_t rhs_k = matmul_kernel_info.adj_rhs ? rhs_shape.x() : rhs_shape.y(); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_k != rhs_k, "K dimension in Lhs and Rhs matrices must match."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_shape.total_size() == 0, "Lhs tensor can't be empty"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_shape.total_size() == 0, "Rhs tensor can't be empty"); - - constexpr size_t batch_dim_start = 2; - for(size_t i = batch_dim_start; i < Coordinates::num_max_dimensions; ++i) + if (!adj_lhs || adj_rhs) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_shape[i] != rhs_shape[i], "Batch dimension broadcasting is not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16), + "Only 1,2,3,4,8,16 are supported for K0"); } return Status{}; } -} +} // namespace ClMatMulLowpNativeKernel::ClMatMulLowpNativeKernel() { _type = CLKernelType::GEMM; } -Status ClMatMulLowpNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, +Status ClMatMulLowpNativeKernel::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *bias, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.activation() != ActivationFunction::IDENTITY && act_info.activation() != ActivationFunction::RELU - && act_info.activation() != ActivationFunction::LU_BOUNDED_RELU && act_info.activation() != ActivationFunction::BOUNDED_RELU), + ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.activation() != ActivationFunction::IDENTITY && + act_info.activation() != ActivationFunction::RELU && + act_info.activation() != ActivationFunction::LU_BOUNDED_RELU && + act_info.activation() != ActivationFunction::BOUNDED_RELU), "Activation Function specified is unsupported."); - const TensorShape expected_output_shape = misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info); + const TensorShape expected_output_shape = + misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output); 
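// The identical per-kernel validate_input_shapes() helpers deleted in these hunks are
// replaced by validate_matmul_input_shapes() from the new
// "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h" header, which is not part of this
// diff. The sketch below (note the _sketch suffix; it is a reconstruction, not the actual
// helper) shows what that shared function presumably checks, based on the removed copies.
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorShape.h"

using namespace arm_compute;

Status validate_matmul_input_shapes_sketch(const TensorShape      &lhs_shape,
                                           const TensorShape      &rhs_shape,
                                           const MatMulKernelInfo &info)
{
    // K is the inner dimension: x() of Lhs unless Lhs is transposed, y() of Rhs unless Rhs is transposed
    const size_t lhs_k = info.adj_lhs ? lhs_shape.y() : lhs_shape.x();
    const size_t rhs_k = info.adj_rhs ? rhs_shape.x() : rhs_shape.y();

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_k != rhs_k, "K dimension in Lhs and Rhs matrices must match.");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_shape.total_size() == 0, "Lhs tensor can't be empty");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_shape.total_size() == 0, "Rhs tensor can't be empty");

    // Dimensions 2 and above are batch dimensions and must match exactly (no broadcasting)
    for (size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_shape[i] != rhs_shape[i], "Batch dimension broadcasting is not supported");
    }
    return Status{};
}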
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); } - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); @@ -133,7 +125,12 @@ Status ClMatMulLowpNativeKernel::validate(const ITensorInfo *lhs, const ITensorI return Status{}; } -void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, +void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *bias, + ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst, &compile_context, &matmul_kernel_info); @@ -141,7 +138,8 @@ void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info)); // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); + auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape( + lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); const int m = dst->dimension(1); const int n = dst->dimension(0); @@ -235,10 +233,13 @@ void ClMatMulLowpNativeKernel::run_op(ITensorPack &tensors, const Window &window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const ICLTensor *lhs = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const ICLTensor *rhs = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const ICLTensor *bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - ICLTensor *dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const ICLTensor *lhs = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const ICLTensor *rhs = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const ICLTensor *bias = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + ICLTensor *dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst); @@ -247,7 +248,7 @@ void ClMatMulLowpNativeKernel::run_op(ITensorPack &tensors, const Window &window add_3d_tensor_nhw_argument(idx, lhs); add_3d_tensor_nhw_argument(idx, rhs); - if(bias != nullptr) + if (bias != nullptr) { add_3d_tensor_nhw_argument(idx, bias); } diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h index 64415f42f7bd345693be8e674c68e0725a682075..ffdb72085519a455cf9add27752a263b4547829f 100644 --- a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h +++ b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h @@ -25,6 +25,7 @@ #define ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEKERNEL #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -54,7 +55,12 @@ public: * 
@param[in] matmul_kernel_info Attributes for Batch MatMul Kernel * @param[in] act_info (Optional) Class containing information about fused activation function. */ - void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, + void configure(const ClCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *bias, + ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -62,7 +68,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *bias, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1df0ca0410fccc72f2bfeb419c36d9d1f0900e72 --- /dev/null +++ b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/utils/StringUtils.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h" +#include "support/Cast.h" +#include "support/StringSupport.h" +#include "utils/TypePrinter.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +// Block size dimensions for the MMUL extension +constexpr int mmul_m0 = 4; +constexpr int mmul_n0 = 4; +constexpr int mmul_k0 = 16; + +Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info) +{ + const bool adj_lhs = matmul_kernel_info.adj_lhs; + const int m0 = matmul_kernel_info.m0; + const int n0 = matmul_kernel_info.n0; + const int k0 = matmul_kernel_info.k0; + + // Validate M0 + ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0"); + + if (adj_lhs) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((m0 != 1) && (m0 != 2) && (m0 != 3) && (m0 != 4) && (m0 != 8) && (m0 != 16), + "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); + } + + // Validate N0 + ARM_COMPUTE_RETURN_ERROR_ON_MSG((n0 != 1) && (n0 != 2) && (n0 != 3) && (n0 != 4) && (n0 != 8) && (n0 != 16), + "Only 1,2,3,4,8,16 are supported for N0"); + + // Validate K0 + ARM_COMPUTE_RETURN_ERROR_ON_MSG((k0 != 4), "Only 4 is supported for k0"); + + // Validate ExportToCLImage + ARM_COMPUTE_RETURN_ERROR_ON_MSG(matmul_kernel_info.export_rhs_to_cl_image, "Export to CLImage is not supported!"); + + return Status{}; +} +} // namespace + +ClMatMulLowpNativeMMULKernel::ClMatMulLowpNativeMMULKernel() +{ + _type = CLKernelType::GEMM; +} + +Status ClMatMulLowpNativeMMULKernel::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *bias, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), + "The extension cl_arm_matrix_multiply is not supported on the target platform"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); + ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info)); + + const TensorShape &lhs_shape = lhs->tensor_shape(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_input_shapes(lhs_shape, rhs->tensor_shape(), matmul_kernel_info)); + + const size_t lhs_k = matmul_kernel_info.adj_lhs ? 
lhs_shape.y() : lhs_shape.x(); + ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR((lhs_k % mmul_k0) != 0, "K dimension must be a multiple of %d", mmul_k0); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.activation() != ActivationFunction::IDENTITY), + "Activation Function specified is unsupported."); + const TensorShape expected_output_shape = + misc::shape_calculator::compute_matmul_shape(lhs_shape, rhs->tensor_shape(), matmul_kernel_info); + + if (dst->total_size() != 0) + { + const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(expected_output_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); + } + + if (bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(expected_output_shape[0] != bias->dimension(0)); + } + + return Status{}; +} + +void ClMatMulLowpNativeMMULKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *bias, + ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); + ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst, matmul_kernel_info, act_info); + ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info)); + + // dst tensor auto initialization if not yet initialized + auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape( + lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); + + ARM_COMPUTE_UNUSED(compile_context, lhs, rhs, bias, matmul_kernel_info, act_info); + CLBuildOptions build_opts; + + const int m = dst->dimension(1); + const int n = dst->dimension(0); + const int k = matmul_kernel_info.adj_lhs ? lhs->tensor_shape().y() : lhs->tensor_shape().x(); + + const int m0 = std::min(matmul_kernel_info.m0, m); + const int n0 = adjust_vec_size(matmul_kernel_info.n0, n); + + // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks + // at the end of a row/column if any. This is to avoid padding. 
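// Numeric illustration (values assumed, not from the patch) of the leftover computation
// just below: with M = 70 and m0 clamped via std::min(matmul_kernel_info.m0, m), say
// m0 = 4, the final row block covers only 70 % 4 = 2 rows, so the kernel is compiled with
// -DM0_LEFTOVER=2 and performs a partial store for that block instead of requiring padded
// output. N0_LEFTOVER plays the same role along the N dimension.
const int m_example           = 70;
const int m0_example          = 4;
const int m0_leftover_example = m_example % m0_example; // == 2 -> partial block at the end of each column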
+ const unsigned int m0_leftover = m % m0; + const unsigned int n0_leftover = n % n0; + + // Configure kernel window + const auto win_config = + validate_and_configure_window_for_mmul_kernels(lhs, rhs, dst, matmul_kernel_info, mmul_m0, mmul_n0); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + IClKernel::configure_internal(win_config.second); + + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(lhs->data_type())); + build_opts.add_option("-DM=" + support::cpp11::to_string(m)); + build_opts.add_option("-DN=" + support::cpp11::to_string(n)); + build_opts.add_option("-DK=" + support::cpp11::to_string(k)); + build_opts.add_option("-DM0=" + support::cpp11::to_string(m0)); + build_opts.add_option("-DN0=" + support::cpp11::to_string(n0)); + build_opts.add_option("-DK0=" + support::cpp11::to_string(matmul_kernel_info.k0)); + build_opts.add_option("-DM0_LEFTOVER=" + support::cpp11::to_string(m0_leftover)); + build_opts.add_option("-DN0_LEFTOVER=" + support::cpp11::to_string(n0_leftover)); + build_opts.add_option("-DMMUL_M0=" + support::cpp11::to_string(mmul_m0)); + build_opts.add_option("-DMMUL_N0=" + support::cpp11::to_string(mmul_n0)); + build_opts.add_option("-DMMUL_K0=" + support::cpp11::to_string(mmul_k0)); + build_opts.add_option_if(bias != nullptr, "-DBIAS"); + + const UniformQuantizationInfo lqinfo = lhs->quantization_info().uniform(); + const UniformQuantizationInfo rqinfo = rhs->quantization_info().uniform(); + const UniformQuantizationInfo dqinfo = dst->quantization_info().uniform(); + + float multiplier = lqinfo.scale * rqinfo.scale / dqinfo.scale; + int output_multiplier = 0; + int output_shift = 0; + arm_compute::quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); + + build_opts.add_option("-DDST_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); + build_opts.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift)); + + // Note : Offset is not negated, unlike gemmlowp kernels + build_opts.add_option("-DLHS_OFFSET=" + support::cpp11::to_string(lqinfo.offset)); + build_opts.add_option("-DRHS_OFFSET=" + support::cpp11::to_string(rqinfo.offset)); + build_opts.add_option("-DDST_OFFSET=" + support::cpp11::to_string(dqinfo.offset)); + + std::string kernel_name("mat_mul_native_quantized_mmul"); + kernel_name += matmul_kernel_info.adj_lhs ? "_t" : "_nt"; + kernel_name += matmul_kernel_info.adj_rhs ? 
"_t" : "_nt"; + + // A macro guard to compile ONLY the kernel of interest + build_opts.add_option("-D" + upper_string(kernel_name)); + + // Create kernel + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Set config_id for enabling LWS tuning + _config_id = kernel_name; + _config_id += "_"; + _config_id += lower_string(string_from_data_type(lhs->data_type())); + _config_id += "_"; + _config_id += support::cpp11::to_string(m); + _config_id += "_"; + _config_id += support::cpp11::to_string(n); + _config_id += "_"; + _config_id += support::cpp11::to_string(k); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(2)); + _config_id += "_"; + _config_id += support::cpp11::to_string(m0); + _config_id += "_"; + _config_id += support::cpp11::to_string(n0); +} + +void ClMatMulLowpNativeMMULKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto *lhs = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto *rhs = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto *bias = utils::cast::polymorphic_downcast( + tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present + auto *dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); + ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst); + + unsigned int idx = 0; + add_3d_tensor_nhw_argument(idx, lhs); + add_3d_tensor_nhw_argument(idx, rhs); + + if (bias != nullptr) + { + add_3d_tensor_nhw_argument(idx, bias); + } + add_3d_tensor_nhw_argument(idx, dst); + + // LWS_x should be multiple of 16 at least. (32, 2) has been chosen to have more work-items on a single core + // LWS also enforces the order of execution of the work items which improves cache utilization + enqueue(queue, *this, window, cl::NDRange(32, 2), false); +} + +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..e84d46d34a4df42b822d5bd0f9203db996e6f87c --- /dev/null +++ b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEMMULKERNEL_H +#define ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEMMULKERNEL_H + +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +// Forward declerations +struct MatMulKernelInfo; +namespace opencl +{ +namespace kernels +{ +class ClMatMulLowpNativeMMULKernel : public IClKernel +{ +public: + ClMatMulLowpNativeMMULKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClMatMulLowpNativeMMULKernel); + + /** Initialise the kernel's input and output. + * + * Similar to @ref ClMatMulLowpNativeKernel::configure() + * + */ + void configure(const ClCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *bias, + ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClMatMulLowpNativeKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *bias, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif // ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEMMULKERNEL_H diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp index 8f8ccfc41f39ca5858278f50fae8245e6ac2c4c3..a1fa9fa9ab22af29a926165a90aea844398a7d1f 100644 --- a/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp +++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp @@ -23,11 +23,11 @@ */ #include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" @@ -37,7 +37,7 @@ #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" - +#include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h" #include "support/Cast.h" #include "support/StringSupport.h" @@ -60,38 +60,23 @@ Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info) // Validate M0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0"); - if(adj_lhs) + if (adj_lhs) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16), "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16), + "Only 1,2,3,4,8,16 are supported for M0 for Lhs 
transposed"); } // Validate N0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16), "Only 1,2,3,4,8,16 are supported for N0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16), + "Only 1,2,3,4,8,16 are supported for N0"); // Validate K0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 < 1, "Only positive integers are supported for K0"); - if(!adj_lhs || adj_rhs) + if (!adj_lhs || adj_rhs) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16), "Only 1,2,3,4,8,16 are supported for K0"); - } - - return Status{}; -} - -Status validate_input_shapes(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const MatMulKernelInfo &matmul_kernel_info) -{ - const size_t lhs_k = matmul_kernel_info.adj_lhs ? lhs_shape.y() : lhs_shape.x(); - const size_t rhs_k = matmul_kernel_info.adj_rhs ? rhs_shape.x() : rhs_shape.y(); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_k != rhs_k, "K dimension in Lhs and Rhs matrices must match."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_shape.total_size() == 0, "Lhs tensor can't be empty"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_shape.total_size() == 0, "Rhs tensor can't be empty"); - - constexpr size_t batch_dim_start = 2; - for(size_t i = batch_dim_start; i < Coordinates::num_max_dimensions; ++i) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_shape[i] != rhs_shape[i], "Batch dimension broadcasting is not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16), + "Only 1,2,3,4,8,16 are supported for K0"); } return Status{}; @@ -100,30 +85,37 @@ Status validate_input_shapes(const TensorShape &lhs_shape, const TensorShape &rh Status validate_export_to_cl_image(const ITensorInfo *rhs, const MatMulKernelInfo &matmul_kernel_info) { ARM_COMPUTE_RETURN_ERROR_ON(matmul_kernel_info.export_rhs_to_cl_image && rhs->lock_paddings()); - if(matmul_kernel_info.export_rhs_to_cl_image) + if (matmul_kernel_info.export_rhs_to_cl_image) { - if(matmul_kernel_info.adj_rhs) + if (matmul_kernel_info.adj_rhs) { const int k0 = matmul_kernel_info.k0; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 != 4 && k0 != 8 && k0 != 16, "K0 can only be: 4, 8, and 16 for Rhs transposed"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 != 4 && k0 != 8 && k0 != 16, + "K0 can only be: 4, 8, and 16 for Rhs transposed"); } else { const int n0 = matmul_kernel_info.n0; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 != 4 && n0 != 8 && n0 != 16, "N0 can only be: 4, 8, and 16 for Rhs non-transposed"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 != 4 && n0 != 8 && n0 != 16, + "N0 can only be: 4, 8, and 16 for Rhs non-transposed"); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_to_cl_image(rhs), "Export to CLImage is not supported for this device/configuration"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_to_cl_image(rhs), + "Export to CLImage is not supported for this device/configuration"); } return Status{}; } -} +} // namespace ClMatMulNativeKernel::ClMatMulNativeKernel() { _type = CLKernelType::GEMM; } -Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, +Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *bias, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); @@ -131,28 +123,36 @@ Status 
ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); ARM_COMPUTE_RETURN_ON_ERROR(validate_export_to_cl_image(rhs, matmul_kernel_info)); - const TensorShape expected_output_shape = misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info); + const TensorShape expected_output_shape = + misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); } - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(bias, lhs); ARM_COMPUTE_RETURN_ERROR_ON_MSG((bias->num_dimensions() > 1), "Multi dimensional bias is unsupported."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != expected_output_shape[0], "First dimension of bias and output tensors must match."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != expected_output_shape[0], + "First dimension of bias and output tensors must match."); } return Status{}; } -void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, +void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *bias, + ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst, &compile_context, &matmul_kernel_info); @@ -160,7 +160,8 @@ void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, IT ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info)); // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); + auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape( + lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); const int m = dst->dimension(1); const int n = dst->dimension(0); @@ -204,7 +205,7 @@ void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, IT // A macro guard to compile ONLY the kernel of interest build_opts.add_option("-D" + upper_string(kernel_name)); - if(_export_rhs_to_cl_image) + if (_export_rhs_to_cl_image) { gemm::update_padding_for_cl_image(rhs); } @@ -239,10 +240,13 @@ void ClMatMulNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const ICLTensor *lhs = 
utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const ICLTensor *rhs = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const ICLTensor *bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present - ICLTensor *dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const ICLTensor *lhs = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const ICLTensor *rhs = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const ICLTensor *bias = utils::cast::polymorphic_downcast( + tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present + ICLTensor *dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst); @@ -252,7 +256,7 @@ void ClMatMulNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl add_3d_tensor_nhw_argument(idx, lhs); cl::Image2D rhs_cl_image; - if(_export_rhs_to_cl_image) + if (_export_rhs_to_cl_image) { const size_t image_w = rhs->info()->dimension(0) / 4; const size_t image_h = rhs->info()->tensor_shape().total_size() / rhs->info()->dimension(0); @@ -260,12 +264,13 @@ void ClMatMulNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl const size_t image_row_pitch = rhs->info()->strides_in_bytes()[1]; // Export cl_buffer to cl_image - rhs_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), rhs->cl_buffer(), shape2d, rhs->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + rhs_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), rhs->cl_buffer(), shape2d, + rhs->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); _kernel.setArg(idx++, rhs_cl_image); } add_3d_tensor_nhw_argument(idx, rhs); - if(bias != nullptr) + if (bias != nullptr) { add_3d_tensor_nhw_argument(idx, bias); } diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.h b/src/gpu/cl/kernels/ClMatMulNativeKernel.h index fe2b787c1208b383f2b576257e9d7fc45386cdbe..2cb150bc8fd4af2f753981da47679b18277cec98 100644 --- a/src/gpu/cl/kernels/ClMatMulNativeKernel.h +++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.h @@ -25,6 +25,7 @@ #define ACL_SRC_GPU_CL_KERNELS_CLMATMULNATIVEKERNEL #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -52,7 +53,12 @@ public: * @param[in] matmul_kernel_info Attributes for Batch MatMul Kernel * @param[in] act_info (Optional) Specifies activation function to use after Matrix multiplication. Default is Identity function. 
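// Hedged usage sketch (illustrative only, not part of this patch) for the declarations
// reformatted above. Shapes follow the convention visible in the .cpp hunks (for
// non-transposed operands Lhs is [K, M] and Rhs is [N, K], so dst becomes [N, M]); all
// sizes, the helper name, the MatMulKernelInfo header, and the
// CLKernelLibrary::get().get_compile_context() accessor are assumptions made for the sketch.
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"

void matmul_native_setup_sketch() // hypothetical helper, illustration only
{
    using namespace arm_compute;

    MatMulKernelInfo kernel_info{};
    kernel_info.adj_lhs = false;
    kernel_info.adj_rhs = false;
    kernel_info.m0      = 4;
    kernel_info.n0      = 4;
    kernel_info.k0      = 4;

    TensorInfo lhs_info(TensorShape(64U, 32U), 1, DataType::F16); // K = 64, M = 32
    TensorInfo rhs_info(TensorShape(16U, 64U), 1, DataType::F16); // N = 16, K = 64
    TensorInfo dst_info{};                                        // left empty; configure() auto-initialises it

    // Usual validate-then-configure flow; configure() itself also throws on invalid arguments
    const Status status = opencl::kernels::ClMatMulNativeKernel::validate(&lhs_info, &rhs_info,
                                                                          nullptr /* no bias */, &dst_info,
                                                                          kernel_info);
    ARM_COMPUTE_ERROR_THROW_ON(status);

    opencl::kernels::ClMatMulNativeKernel kernel;
    kernel.configure(CLKernelLibrary::get().get_compile_context(), &lhs_info, &rhs_info, nullptr, &dst_info,
                     kernel_info);
}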
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, + void configure(const ClCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *bias, + ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -60,14 +66,18 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *bias, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _export_rhs_to_cl_image{ false }; + bool _export_rhs_to_cl_image{false}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp index 44c720a40c8015af2f95f1862ee77c14730d7b4c..76bf846e745213d0c3be161dc8f78401ba28f210 100644 --- a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp +++ b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp @@ -34,8 +34,7 @@ #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - +#include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h" #include "support/Cast.h" #include "support/StringSupport.h" @@ -52,13 +51,6 @@ constexpr int mmul_m0 = 4; constexpr int mmul_n0 = 4; constexpr int mmul_k0 = 4; -inline std::pair adjust_m0_n0(int m0, int n0, int m, int n) -{ - m0 = std::min(m0, m); - n0 = adjust_vec_size(n0, n); - return { m0, n0 }; -} - Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info) { const bool adj_lhs = matmul_kernel_info.adj_lhs; @@ -69,123 +61,81 @@ Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info) // Validate M0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0"); - if(adj_lhs) + if (adj_lhs) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((m0 != 1) && (m0 != 2) && (m0 != 3) && (m0 != 4) && (m0 != 8) && (m0 != 16), "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((m0 != 1) && (m0 != 2) && (m0 != 3) && (m0 != 4) && (m0 != 8) && (m0 != 16), + "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); } // Validate N0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((n0 != 1) && (n0 != 2) && (n0 != 3) && (n0 != 4) && (n0 != 8) && (n0 != 16), "Only 1,2,3,4,8,16 are supported for N0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((n0 != 1) && (n0 != 2) && (n0 != 3) && (n0 != 4) && (n0 != 8) && (n0 != 16), + "Only 1,2,3,4,8,16 are supported for N0"); // Validate K0 ARM_COMPUTE_RETURN_ERROR_ON_MSG((k0 != 1), "Only 1 is supported for k0"); return Status{}; } - -Status validate_input_shapes(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const MatMulKernelInfo &matmul_kernel_info) -{ - const size_t lhs_k = matmul_kernel_info.adj_lhs ? 
lhs_shape.y() : lhs_shape.x(); - const size_t rhs_k = matmul_kernel_info.adj_rhs ? rhs_shape.x() : rhs_shape.y(); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_k != rhs_k, "K dimension in Lhs and Rhs matrices must match."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR((lhs_k % mmul_k0) != 0, "K dimension must be a multiple of %d", mmul_k0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_shape.total_size() == 0, "Lhs tensor can't be empty"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_shape.total_size() == 0, "Rhs tensor can't be empty"); - - constexpr size_t batch_dim_start = 2; - for(size_t i = batch_dim_start; i < Coordinates::num_max_dimensions; ++i) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_shape[i] != rhs_shape[i], "Batch dimension broadcasting is not supported"); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info) -{ - ARM_COMPUTE_UNUSED(lhs, rhs); - - const Window win = calculate_max_window(*dst, Steps(1, 1)); - - // Collapse along the Z direction - // This collapse needs to be here in order to tune the Z dimension of LWS - Window collapsed = win.collapse(win, Window::DimZ); - - // Reconfigure window size, one arm_matrix_multiply call needs 16 threads to finish. - Window::Dimension x_dimension = collapsed.x(); - Window::Dimension y_dimension = collapsed.y(); - - const int m = dst->dimension(1); - const int n = dst->dimension(0); - - int m0{}; - int n0{}; - std::tie(m0, n0) = adjust_m0_n0(matmul_kernel_info.m0, matmul_kernel_info.n0, m, n); - - // Make M and N multiple of M0 and N0 respectively - const unsigned int ceil_to_multiple_n_n0 = ceil_to_multiple(n, n0); - const unsigned int ceil_to_multiple_m_m0 = ceil_to_multiple(m, m0); - - // Divide M and N by M0 and N0 respectively - const unsigned int n_div_n0 = ceil_to_multiple_n_n0 / n0; - const unsigned int m_div_m0 = ceil_to_multiple_m_m0 / m0; - - // Make n_div_n0 and m_div_m0 multiple of mmul_n0 and mmul_m0 respectively - const unsigned int ceil_to_multiple_n_div_n0_mmul_n0 = ceil_to_multiple(n_div_n0, mmul_n0); - const unsigned int ceil_to_multiple_m_div_m0_mmul_m0 = ceil_to_multiple(m_div_m0, mmul_m0); - - // Ensure x_dimension is multiple of MMUL block size (mmul_m0 * mmul_n0) - x_dimension.set_end(ceil_to_multiple_n_div_n0_mmul_n0 * mmul_m0); - y_dimension.set_end(ceil_to_multiple_m_div_m0_mmul_m0 / mmul_m0); - - collapsed.set(Window::DimX, x_dimension); - collapsed.set(Window::DimY, y_dimension); - - return std::make_pair(Status{}, collapsed); -} -} +} // namespace ClMatMulNativeMMULKernel::ClMatMulNativeMMULKernel() { _type = CLKernelType::GEMM; } -Status ClMatMulNativeMMULKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info) +Status ClMatMulNativeMMULKernel::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *bias, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), "The extension cl_arm_matrix_multiply is not supported on the target platform"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), + "The extension cl_arm_matrix_multiply is not supported on the target platform"); 
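// Quick illustration (numbers assumed) of the K-dimension constraint that the rewritten
// validate() enforces a few lines below: the MMUL kernels need K to be a multiple of the
// MMUL block depth, fixed in this patch at mmul_k0 = 4 for this FP kernel and mmul_k0 = 16
// for the new quantized ClMatMulLowpNativeMMULKernel.
constexpr int mmul_k0_fp   = 4;
constexpr int mmul_k0_lowp = 16;
const int     k_example    = 100;                             // assumed K (inner) dimension
const bool    fp_accepts   = (k_example % mmul_k0_fp) == 0;   // 100 % 4  == 0 -> accepted here
const bool    lowp_accepts = (k_example % mmul_k0_lowp) == 0; // 100 % 16 != 0 -> rejected by the lowp MMUL kernel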
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
 
-    const TensorShape expected_output_shape = misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info);
+    const TensorShape &lhs_shape = lhs->tensor_shape();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_input_shapes(lhs_shape, rhs->tensor_shape(), matmul_kernel_info));
+
+    const size_t lhs_k = matmul_kernel_info.adj_lhs ? lhs_shape.y() : lhs_shape.x();
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR((lhs_k % mmul_k0) != 0, "K dimension must be a multiple of %d", mmul_k0);
+
+    const TensorShape expected_output_shape =
+        misc::shape_calculator::compute_matmul_shape(lhs_shape, rhs->tensor_shape(), matmul_kernel_info);
 
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
     }
 
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MSG((bias->num_dimensions() > 1), "Multi dimensional bias is unsupported.");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != expected_output_shape[0], "First dimension of bias and output tensors must match.");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != expected_output_shape[0],
                                         "First dimension of bias and output tensors must match.");
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, bias);
     }
 
     return Status{};
 }
 
-void ClMatMulNativeMMULKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info)
+void ClMatMulNativeMMULKernel::configure(const ClCompileContext &compile_context,
+                                         ITensorInfo            *lhs,
+                                         ITensorInfo            *rhs,
+                                         ITensorInfo            *bias,
+                                         ITensorInfo            *dst,
+                                         const MatMulKernelInfo &matmul_kernel_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
     ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst, matmul_kernel_info);
     ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info));
 
     // dst tensor auto initialization if not yet initialized
-    auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
+    auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(
+                                 lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
 
     const int m = dst->dimension(1);
     const int n = dst->dimension(0);
@@ -195,12 +145,12 @@ void ClMatMulNativeMMULKernel::configure(const ClCompileContext &compile_context
     _n = n;
     _k = k;
 
-    int m0{};
-    int n0{};
-    std::tie(m0, n0) = adjust_m0_n0(matmul_kernel_info.m0, matmul_kernel_info.n0, m, n);
+    const int m0 = std::min(matmul_kernel_info.m0, m);
+    const int n0 = adjust_vec_size(matmul_kernel_info.n0, n);
 
     // Configure kernel window
-    const auto win_config = validate_and_configure_window(lhs, rhs, dst, matmul_kernel_info);
+    const auto win_config =
+        validate_and_configure_window_for_mmul_kernels(lhs, rhs, dst, matmul_kernel_info, mmul_m0, mmul_n0);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     IClKernel::configure_internal(win_config.second);
 
@@ -251,17 +201,20 @@ void ClMatMulNativeMMULKernel::run_op(ITensorPack &tensors, const Window &window
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const ICLTensor *lhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
-    const ICLTensor *rhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
-    const ICLTensor *bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present
-    ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const ICLTensor *lhs =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const ICLTensor *rhs =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const ICLTensor *bias = utils::cast::polymorphic_downcast<const ICLTensor *>(
+        tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present
+    ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
     ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst);
 
     unsigned int idx = 0;
     add_3d_tensor_nhw_argument(idx, lhs);
     add_3d_tensor_nhw_argument(idx, rhs);
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         add_3d_tensor_nhw_argument(idx, bias);
     }
diff --git a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h
index 80448974c413cbba0da65f86e230c0f732cb1ce2..1aeb896325c7da48537edf4b34529d9a4510b956 100644
--- a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h
+++ b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h
@@ -72,22 +72,31 @@ public:
      * @param[out] dst             Output tensor info.
      * @param[in]  matmul_info     Attributes for Batch MatMul Kernel
      */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_info);
+    void configure(const ClCompileContext &compile_context,
+                   ITensorInfo            *lhs,
+                   ITensorInfo            *rhs,
+                   ITensorInfo            *bias,
+                   ITensorInfo            *dst,
+                   const MatMulKernelInfo &matmul_info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClMatMulNativeMMULKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_info);
+    static Status validate(const ITensorInfo      *lhs,
+                           const ITensorInfo      *rhs,
+                           const ITensorInfo      *bias,
+                           const ITensorInfo      *dst,
+                           const MatMulKernelInfo &matmul_info);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 
 private:
-    int _m{ 1 };
-    int _n{ 1 };
-    int _k{ 1 };
+    int _m{1};
+    int _n{1};
+    int _k{1};
 };
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClMulKernel.cpp b/src/gpu/cl/kernels/ClMulKernel.cpp
index 5ca0639852f0aa68504abb81dbe91ea0d2a9d972..3b59c2a7fc264ee62c438accb8b4e4df1fc34423 100644
--- a/src/gpu/cl/kernels/ClMulKernel.cpp
+++ b/src/gpu/cl/kernels/ClMulKernel.cpp
@@ -23,15 +23,16 @@
  */
 #include "src/gpu/cl/kernels/ClMulKernel.h"
 
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -46,24 +47,25 @@ namespace kernels
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale,
-                          ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+Status validate_arguments(const ITensorInfo         *src1,
+                          const ITensorInfo         *src2,
+                          const ITensorInfo         *dst,
+                          float                      scale,
+                          ConvertPolicy              overflow_policy,
+                          RoundingPolicy             rounding_policy,
+                          const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(overflow_policy);
     ARM_COMPUTE_UNUSED(rounding_policy);
 
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1,
-                                                         1,
-                                                         DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
-                                                         DataType::S16, DataType::QSYMM16, DataType::F16, DataType::S32,
-                                                         DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2,
-                                                         1,
-                                                         DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
-                                                         DataType::S16, DataType::QSYMM16, DataType::F16, DataType::S32,
-                                                         DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8,
+                                                         DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16,
+                                                         DataType::F16, DataType::S32, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8,
+                                                         DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16,
+                                                         DataType::F16, DataType::S32, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative.");
     ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type()));
 
@@ -76,27 +78,35 @@ Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, cons
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
 
     // Validate in case of configured dst
-    if(dst->total_size() > 0)
+    if (dst->total_size() > 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst,
-                                                             1,
-                                                             DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
-                                                             DataType::S16, DataType::QSYMM16, DataType::F16,
-                                                             DataType::S32, DataType::F32);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::U8 && (src1->data_type() != DataType::U8 || src2->data_type() != DataType::U8),
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8,
+                                                             DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16,
+                                                             DataType::F16, DataType::S32, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::U8 &&
+                                            (src1->data_type() != DataType::U8 || src2->data_type() != DataType::U8),
                                         "Dst can only be U8 if both src are U8");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QASYMM8 && (src1->data_type() != DataType::QASYMM8 || src2->data_type() != DataType::QASYMM8),
-                                        "Dst can only be QASYMM8 if both src are QASYMM8");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QASYMM8_SIGNED && (src1->data_type() != DataType::QASYMM8_SIGNED || src2->data_type() != DataType::QASYMM8_SIGNED),
-                                        "Dst can only be QASYMM8_SIGNED if both src are QASYMM8_SIGNED");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QSYMM16 && (src1->data_type() != DataType::QSYMM16 || src2->data_type() != DataType::QSYMM16),
-                                        "Dst can only be QSYMM16 if both src are QSYMM16");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG((src1->data_type() == DataType::S32 || src2->data_type() == DataType::S32) && (dst->data_type() != DataType::S32),
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            dst->data_type() == DataType::QASYMM8 &&
+                (src1->data_type() != DataType::QASYMM8 || src2->data_type() != DataType::QASYMM8),
+            "Dst can only be QASYMM8 if both src are QASYMM8");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            dst->data_type() == DataType::QASYMM8_SIGNED &&
+                (src1->data_type() != DataType::QASYMM8_SIGNED || src2->data_type() != DataType::QASYMM8_SIGNED),
+            "Dst can only be QASYMM8_SIGNED if both src are QASYMM8_SIGNED");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            dst->data_type() == DataType::QSYMM16 &&
+                (src1->data_type() != DataType::QSYMM16 || src2->data_type() != DataType::QSYMM16),
+            "Dst can only be QSYMM16 if both src are QSYMM16");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((src1->data_type() == DataType::S32 || src2->data_type() == DataType::S32) &&
+                                            (dst->data_type() != DataType::S32),
                                         "Dst must be S32 if source tensors are S32");
-        if(in_place)
+        if (in_place)
         {
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, src1_in_place ? src1->tensor_shape() : src2->tensor_shape(), 0),
-                                            "Wrong shape for dst, cannot do in_place calculation");
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                detail::have_different_dimensions(out_shape,
+                                                  src1_in_place ? src1->tensor_shape() : src2->tensor_shape(), 0),
+                "Wrong shape for dst, cannot do in_place calculation");
         }
         else
         {
@@ -114,14 +124,19 @@ ClMulKernel::ClMulKernel()
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale,
-                            ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+void ClMulKernel::configure(const CLCompileContext    &compile_context,
+                            ITensorInfo               *src1,
+                            ITensorInfo               *src2,
+                            ITensorInfo               *dst,
+                            float                      scale,
+                            ConvertPolicy              overflow_policy,
+                            RoundingPolicy             rounding_policy,
+                            const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst,
-                                                  scale, overflow_policy, rounding_policy, act_info));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info));
 
-    auto padding_info = get_padding_info({ src1, src2, dst });
+    auto padding_info = get_padding_info({src1, src2, dst});
 
     const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
     auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape));
@@ -133,7 +148,7 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo
     // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
     // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= 14
     // Moreover, it will be negative as we deal with 1/2^n
-    if((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1))
+    if ((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1))
     {
         // Store the positive exponent. We know that we compute 1/2^n
         // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5
@@ -142,19 +157,19 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo
 
     std::string acc_type;
     // Check if it has float src and dst
-    if(is_data_type_float(src1->data_type()) || is_data_type_float(src2->data_type()))
+    if (is_data_type_float(src1->data_type()) || is_data_type_float(src2->data_type()))
     {
         scale_int = -1;
         acc_type  = (src1->data_type() == DataType::F32 || src2->data_type() == DataType::F32) ? "float" : "half";
     }
     else
     {
-        if(src1->element_size() == 4 || src2->element_size() == 4)
+        if (src1->element_size() == 4 || src2->element_size() == 4)
        {
             // use 64 bit accumulator for 32-bit input
             acc_type = "long";
         }
-        else if(src1->element_size() == 2 || src2->element_size() == 2)
+        else if (src1->element_size() == 2 || src2->element_size() == 2)
         {
             // Use 32-bit accumulator for 16-bit input
             acc_type = "int";
@@ -176,11 +191,15 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo
     build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(src1->data_type()));
     build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(src2->data_type()));
     build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst->data_type()));
-    build_opts.add_option("-DVEC_SIZE_IN1=" + ((dst->dimension(0) != 1 && src1->dimension(0) == 1) ? "1" : support::cpp11::to_string(vec_size)));
-    build_opts.add_option("-DVEC_SIZE_IN2=" + ((dst->dimension(0) != 1 && src2->dimension(0) == 1) ? "1" : support::cpp11::to_string(vec_size)));
+    build_opts.add_option("-DVEC_SIZE_IN1=" + ((dst->dimension(0) != 1 && src1->dimension(0) == 1)
+                                                   ? "1"
                                                    : support::cpp11::to_string(vec_size)));
+    build_opts.add_option("-DVEC_SIZE_IN2=" + ((dst->dimension(0) != 1 && src2->dimension(0) == 1)
+                                                   ? "1"
+                                                   : support::cpp11::to_string(vec_size)));
     build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(vec_size));
     build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
-    if(is_quantized && (dst->data_type() != DataType::S32))
+    if (is_quantized && (dst->data_type() != DataType::S32))
     {
         const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform();
         const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform();
@@ -200,12 +219,14 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo
     else
     {
         kernel_name += (scale_int >= 0) ? "_int" : "_float";
-        build_opts.add_option_if_else(overflow_policy == ConvertPolicy::WRAP || is_data_type_float(dst->data_type()), "-DWRAP", "-DSATURATE");
+        build_opts.add_option_if_else(overflow_policy == ConvertPolicy::WRAP || is_data_type_float(dst->data_type()),
+                                      "-DWRAP", "-DSATURATE");
         build_opts.add_option_if_else(rounding_policy == RoundingPolicy::TO_ZERO, "-DROUND=_rtz", "-DROUND=_rte");
         build_opts.add_option("-DACC_DATA_TYPE=" + acc_type);
-        if(act_info.enabled())
+        if (act_info.enabled())
         {
-            build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
+            build_opts.add_option("-DACTIVATION_TYPE=" +
                                   lower_string(string_from_activation_func(act_info.activation())));
             build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
             build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
         }
@@ -223,7 +244,7 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo
     // Set scale argument
     unsigned int idx = (in_place ? 2 : 3) * num_arguments_per_3D_tensor(); // Skip the src and dst parameters
 
-    if(scale_int >= 0 && !is_quantized)
+    if (scale_int >= 0 && !is_quantized)
     {
         _kernel.setArg(idx++, scale_int);
     }
@@ -261,8 +282,13 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo
     _config_id += support::cpp11::to_string(dst->dimension(2));
 }
 
-Status ClMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale,
-                             ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+Status ClMulKernel::validate(const ITensorInfo         *src1,
+                             const ITensorInfo         *src2,
+                             const ITensorInfo         *dst,
+                             float                      scale,
+                             ConvertPolicy              overflow_policy,
+                             RoundingPolicy             rounding_policy,
+                             const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info));
@@ -275,9 +301,11 @@ void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::Command
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src_0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
-    const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
-    auto       dst   = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src_0 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src_1 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(src_0, src_1, dst);
 
@@ -286,17 +314,18 @@ void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::Command
     const TensorShape &in_shape1 = src_0->info()->tensor_shape();
     const TensorShape &in_shape2 = src_1->info()->tensor_shape();
     const TensorShape &out_shape = dst->info()->tensor_shape();
 
     bool can_collapse = true;
-    if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+    if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
     {
         can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
-        for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
+        for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
         {
             can_collapse = (in_shape1[d] == in_shape2[d]);
         }
     }
 
     bool   has_collapsed = false;
-    Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
+    Window collapsed =
+        can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
 
     const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
     const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
@@ -312,7 +341,7 @@ void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::Command
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, src_0, slice_input1);
         add_3D_tensor_argument(idx, src_1, slice_input2);
-        if(!in_place)
+        if (!in_place)
         {
             add_3D_tensor_argument(idx, dst, slice);
         }
@@ -320,15 +349,17 @@ void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::Command
 
         ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
         ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 
 namespace
 {
 constexpr unsigned int vec_size_complex = 1;
 
-Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+Status validate_arguments_complex(const ITensorInfo         *src1,
+                                  const ITensorInfo         *src2,
+                                  const ITensorInfo         *dst,
+                                  const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F16, DataType::F32);
@@ -340,11 +371,12 @@ Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *sr
     ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type()));
 
     // Validate in case of configured dst
-    if(dst->total_size() > 0)
+    if (dst->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F16, DataType::F32);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, dst);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0),
+                                        "Wrong shape for dst");
     }
 
     return Status{};
@@ -356,19 +388,23 @@ ClComplexMulKernel::ClComplexMulKernel()
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void ClComplexMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
+void ClComplexMulKernel::configure(const CLCompileContext    &compile_context,
+                                   ITensorInfo               *src1,
+                                   ITensorInfo               *src2,
+                                   ITensorInfo               *dst,
+                                   const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst, act_info));
 
-    auto padding_info = get_padding_info({ src1, src2, dst });
+    auto padding_info = get_padding_info({src1, src2, dst});
 
     const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
     auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape));
 
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
-    if(act_info.enabled())
+    if (act_info.enabled())
     {
         build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
         build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
@@ -384,7 +420,10 @@ void ClComplexMulKernel::configure(const CLCompileContext &compile_context, ITen
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status ClComplexMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+Status ClComplexMulKernel::validate(const ITensorInfo         *src1,
+                                    const ITensorInfo         *src2,
+                                    const ITensorInfo         *dst,
+                                    const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst, act_info));
@@ -397,26 +436,29 @@ void ClComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src_0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
-    const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
-    auto       dst   = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src_0 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src_1 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     const TensorShape &in_shape1 = src_0->info()->tensor_shape();
     const TensorShape &in_shape2 = src_1->info()->tensor_shape();
     const TensorShape &out_shape = dst->info()->tensor_shape();
 
     bool can_collapse = true;
-    if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+    if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
     {
         can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
-        for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
+        for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
         {
             can_collapse = (in_shape1[d] == in_shape2[d]);
        }
     }
 
     bool   has_collapsed = false;
-    Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
+    Window collapsed =
+        can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
 
     const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
     const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
@@ -435,8 +477,7 @@ void ClComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::
 
         ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
         ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClMulKernel.h b/src/gpu/cl/kernels/ClMulKernel.h
index 4e62a6d67a61795e0c29c6185b61b41e6d1e87ee..76a3ce02c1921e61fd3f1ff388ea039c71b696a0 100644
--- a/src/gpu/cl/kernels/ClMulKernel.h
+++ b/src/gpu/cl/kernels/ClMulKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CL_MUL_KERNEL_H
 
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/core/common/Macros.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClKernel.h"
@@ -72,16 +73,27 @@ public:
      * @param[in]  rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
      * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
      */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale,
-                   ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(const CLCompileContext    &compile_context,
+                   ITensorInfo               *src1,
+                   ITensorInfo               *src2,
+                   ITensorInfo               *dst,
+                   float                      scale,
+                   ConvertPolicy              overflow_policy,
+                   RoundingPolicy             rounding_policy,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration
      *
     * Similar to @ref ClMulKernel::configure()
      *
     * @return a status
      */
-    static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale,
-                           ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    static Status validate(const ITensorInfo         *src1,
+                           const ITensorInfo         *src2,
+                           const ITensorInfo         *dst,
+                           float                      scale,
+                           ConvertPolicy              overflow_policy,
+                           RoundingPolicy             rounding_policy,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
@@ -101,14 +113,21 @@ public:
      * @param[out] dst             The dst tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
      * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
      */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(const CLCompileContext    &compile_context,
+                   ITensorInfo               *src1,
+                   ITensorInfo               *src2,
+                   ITensorInfo               *dst,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration
      *
     * Similar to @ref ClComplexMulKernel::configure()
      *
     * @return a status
      */
-    static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    static Status validate(const ITensorInfo         *src1,
+                           const ITensorInfo         *src2,
+                           const ITensorInfo         *dst,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/gpu/cl/kernels/ClPermuteKernel.cpp b/src/gpu/cl/kernels/ClPermuteKernel.cpp
index 8d4655114b27d4f179284048d1033f522025e9f9..a4755782ed848556e5a8e2673b7d24f1e29e7dea 100644
--- a/src/gpu/cl/kernels/ClPermuteKernel.cpp
+++ b/src/gpu/cl/kernels/ClPermuteKernel.cpp
@@ -29,8 +29,9 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -60,13 +61,13 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
                                     "Permutation up to 4-D src tensor is supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(perm.num_dimensions() < 1 || perm.num_dimensions() > 4,
                                     "Permutation vector size should be less than or equal to 4");
-    for(const auto &p : perm)
+    for (const auto &p : perm)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(p >= perm.num_dimensions(), "Permutation vector has invalid values");
     }
 
     // Validate configured dst
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
@@ -82,10 +83,13 @@ ClPermuteKernel::ClPermuteKernel()
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void ClPermuteKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm)
+void ClPermuteKernel::configure(const CLCompileContext  &compile_context,
+                                const ITensorInfo       *src,
+                                ITensorInfo             *dst,
+                                const PermutationVector &perm)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-    auto padding_info = get_padding_info({ src, dst });
+    auto              padding_info = get_padding_info({src, dst});
     const TensorShape dst_shape    = get_dst_shape(src, perm);
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape));
@@ -96,7 +100,8 @@ void ClPermuteKernel::configure(const CLCompileContext &compile_context, const I
 
     // Create kernel
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(src->data_type())));
+    build_opts.add_option("-DDATA_TYPE=" +
+                          get_cl_unsigned_type_from_element_size(data_size_from_type(src->data_type())));
     build_opts.add_option("-DDEPTH_IN=" + support::cpp11::to_string(src->dimension(2)));
     // New positions of  width(W), height(H), channel(C) and batch(D) based on permutation vector
     build_opts.add_option("-DP1=" + support::cpp11::to_string((_perm.num_dimensions() >= 1) ? perm[0] : 0));
@@ -126,8 +131,9 @@ void ClPermuteKernel::run_op(ITensorPack &tensors, const Window &window, cl::Com
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
 
-    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
 
@@ -144,9 +150,8 @@ void ClPermuteKernel::run_op(ITensorPack &tensors, const Window &window, cl::Com
         add_4D_tensor_argument(idx, src, slice_in);
         add_4D_tensor_argument(idx, dst, slice_out);
         enqueue(queue, *this, slice_in, lws_hint());
-    }
-    while(window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+    } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
 }
 } // namespace kernels
 } // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClPermuteKernel.h b/src/gpu/cl/kernels/ClPermuteKernel.h
index 0d349e739bb610653cdd2184c532464309183394..2413b10284b8be16d0fa6d5f8ca7ba5264facd23 100644
--- a/src/gpu/cl/kernels/ClPermuteKernel.h
+++ b/src/gpu/cl/kernels/ClPermuteKernel.h
@@ -52,7 +52,10 @@ public:
      * @param[in] dst             The dst tensor info. Data types supported: Same as @p src
      * @param[in] perm            Permutation vector
      */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm);
+    void configure(const CLCompileContext  &compile_context,
+                   const ITensorInfo       *src,
+                   ITensorInfo             *dst,
+                   const PermutationVector &perm);
     /** Static function to check if given info will lead to a valid configuration
      *
     * Similar to @ref ClPermuteKernel::configure()
diff --git a/src/gpu/cl/kernels/ClPool2dKernel.cpp b/src/gpu/cl/kernels/ClPool2dKernel.cpp
index a1afc585e0d952e6ccf2219a87e95020dc8602d5..41ab4d6922b4dbfc7a5871750ae518e41679c718 100644
--- a/src/gpu/cl/kernels/ClPool2dKernel.cpp
+++ b/src/gpu/cl/kernels/ClPool2dKernel.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -43,37 +44,47 @@ using namespace arm_compute::misc::shape_calculator;
 
 namespace
 {
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status validate_arguments(const ITensorInfo      *src,
+                          const ITensorInfo      *dst,
+                          const PoolingLayerInfo &pool_info,
+                          const ITensorInfo      *indices)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(src->data_type()) && pool_info.pool_type == PoolingType::L2),
-                                    "Unsupported combination of parameters!");
-
-    const auto   data_layout       = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
-    const int    idx_width         = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int    idx_height        = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const bool   is_global_pooling = pool_info.is_global_pooling;
-    unsigned int pool_size_x       = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
-    unsigned int pool_size_y       = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
-    int          output_width      = 0;
-    int          output_height     = 0;
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_region_entirely_outside_input(pool_info), "Pooling region that is entirely outside input tensor is unsupported");
-
-    std::tie(output_width, output_height) = scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height],
-                                                                     pool_size_x, pool_size_y, pool_info.pad_stride_info);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), "Calculated output dimension size is invalid");
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        (is_data_type_quantized_asymmetric(src->data_type()) && pool_info.pool_type == PoolingType::L2),
+        "Unsupported combination of parameters!");
+
+    const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
+    const int  idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int  idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const bool is_global_pooling = pool_info.is_global_pooling;
+    unsigned int pool_size_x     = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
+    unsigned int pool_size_y     = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
+    int          output_width    = 0;
+    int          output_height   = 0;
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_region_entirely_outside_input(pool_info),
+                                    "Pooling region that is entirely outside input tensor is unsupported");
+
+    std::tie(output_width, output_height) =
+        scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], pool_size_x,
+                                 pool_size_y, pool_info.pad_stride_info);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1),
+                                    "Calculated output dimension size is invalid");
 
     // Check indices
-    if(indices)
+    if (indices)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX,
+                                        "Pooling indices only supported for MAX pooling method");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.pool_size != Size2D(2, 2)),
+                                        "Pooling indices only supported for pool size 2x2");
 
-        if(indices->total_size() != 0)
+        if (indices->total_size() != 0)
         {
             TensorInfo idx_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, DataType::U32));
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &idx_info);
@@ -81,7 +92,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
     }
 
     // Checks performed when dst is configured
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
@@ -98,42 +109,47 @@ ClPool2dKernel::ClPool2dKernel()
     _type = CLKernelType::POOL;
 }
 
-void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices)
+void ClPool2dKernel::configure(const ClCompileContext &compile_context,
+                               ITensorInfo            *src,
+                               ITensorInfo            *dst,
+                               const PoolingLayerInfo &pool_info,
+                               ITensorInfo            *indices)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices));
 
-    auto padding_info = get_padding_info({ src, dst, indices });
+    auto padding_info = get_padding_info({src, dst, indices});
 
     // Auto init if empty
     TensorShape out_shape = compute_pool_shape(*src, pool_info);
     auto_init_if_empty(*dst, src->clone()->set_tensor_shape(out_shape));
-    if(indices)
+    if (indices)
     {
         auto_init_if_empty(*indices, src->clone()->set_tensor_shape(out_shape).set_data_type(DataType::U32));
     }
 
     // Set instance variables
-    _pool_info                          = pool_info;
-    _data_layout                        = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
-    _num_elems_processed_per_iteration = (_data_layout == DataLayout::NCHW) ? 1 : ((dst->data_type() == DataType::F32) ? 2 : 4);
+    _pool_info   = pool_info;
+    _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
+    _num_elems_processed_per_iteration =
+        (_data_layout == DataLayout::NCHW) ? 1 : ((dst->data_type() == DataType::F32) ? 2 : 4);
     _num_elems_processed_per_iteration = adjust_vec_size(_num_elems_processed_per_iteration, dst->dimension(0));
 
-    int                 pool_stride_x   = 0;
-    int                 pool_stride_y   = 0;
-    const PoolingType   pool_type       = pool_info.pool_type;
-    const int           idx_width       = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
-    const int           idx_height      = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-    const int           idx_channel     = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
-    const int           idx_batch_size  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES);
-    const int           pool_size_x     = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
-    const int           pool_size_y     = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
-    const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
-    const bool          exclude_padding = pool_info.exclude_padding;
+    int                 pool_stride_x   = 0;
+    int                 pool_stride_y   = 0;
+    const PoolingType   pool_type       = pool_info.pool_type;
+    const int           idx_width       = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const int           idx_height      = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+    const int           idx_channel     = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+    const int           idx_batch_size  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES);
+    const int           pool_size_x     = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
+    const int           pool_size_y     = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
+    const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
+    const bool          exclude_padding = pool_info.exclude_padding;
     std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
-    const int      pool_pad_top  = pad_stride_info.pad_top();
-    const int      pool_pad_left = pad_stride_info.pad_left();
-    const DataType data_type     = src->data_type();
+    const int      pool_pad_top  = pad_stride_info.pad_top();
+    const int      pool_pad_left = pad_stride_info.pad_left();
+    const DataType data_type     = src->data_type();
 
     // Set build options
     CLBuildOptions build_opts;
@@ -148,20 +164,23 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI
     build_opts.add_option("-DPOOL_SIZE_Y=" + support::cpp11::to_string(pool_size_y));
     build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width)));
     build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height)));
-    build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left)));
-    build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top)));
+    build_opts.add_option("-DMAX_WIDTH=" +
+                          support::cpp11::to_string(src->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left)));
+    build_opts.add_option("-DMAX_HEIGHT=" +
+                          support::cpp11::to_string(src->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top)));
 
     // Tensor paddings are used to calculate the indicies for MAX pooling
-    if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type))
+    if (pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices &&
+        is_data_type_float(data_type))
     {
         build_opts.add_option("-DSRC_BATCH=" + support::cpp11::to_string(src->tensor_shape().total_size_lower(3)));
     }
 
-    if(is_data_type_quantized_asymmetric(data_type))
+    if (is_data_type_quantized_asymmetric(data_type))
     {
         build_opts.add_option("-DQUANTIZED");
 
-        if(src->quantization_info() != dst->quantization_info())
+        if (src->quantization_info() != dst->quantization_info())
         {
             const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
             const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
@@ -174,9 +193,9 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI
     }
 
     // Set the initial value for the pooling operation accordingly with the data type
-    if(pool_type == PoolingType::MAX)
+    if (pool_type == PoolingType::MAX)
     {
-        if(is_data_type_quantized(data_type))
+        if (is_data_type_quantized(data_type))
         {
             PixelValue type_min{};
             std::tie(type_min, std::ignore) = get_min_max(data_type);
@@ -184,7 +203,9 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI
         }
         else
         {
-            std::string initial_value = pool_info.use_inf_as_limit ? "(-INFINITY)" : float_to_string_with_full_precision(std::numeric_limits<float>::lowest());
+            std::string initial_value = pool_info.use_inf_as_limit
                                             ? "(-INFINITY)"
+                                            : float_to_string_with_full_precision(std::numeric_limits<float>::lowest());
             build_opts.add_option("-DINITIAL_VALUE=" + initial_value);
         }
     }
@@ -195,22 +216,25 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI
     }
 
     // Create kernel
-    switch(_data_layout)
+    switch (_data_layout)
     {
         case DataLayout::NCHW:
         {
             const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision;
             const auto use_wider_accumulator  = use_fp_mixed_precision && (pool_type != PoolingType::MAX);
-            const auto acc_data_type          = get_cl_type_from_data_type(use_wider_accumulator ? DataType::F32 : (is_data_type_quantized(data_type) ? DataType::S32 : data_type));
+            const auto acc_data_type          = get_cl_type_from_data_type(
+                use_wider_accumulator ? DataType::F32
                                       : (is_data_type_quantized(data_type) ? DataType::S32 : data_type));
             build_opts.add_option("-DACC_DATA_TYPE=" + acc_data_type);
             build_opts.add_option_if(use_wider_accumulator, "-DFP_MIXED_PRECISION");
 
-            if(pool_type != PoolingType::MAX)
+            if (pool_type != PoolingType::MAX)
             {
                 build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
             }
 
-            if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type))
+            if (pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices &&
+                is_data_type_float(data_type))
             {
                 // For max pooling with pool2x2, store indicies which will be used in max unpooling
                 std::string kernel_name = "pooling_layer_2_nchw_indices";
@@ -226,18 +250,19 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI
         case DataLayout::NHWC:
         {
             // Floating point mixed precision is support on F16 only
-            const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX;
+            const auto use_fp_mixed_precision =
+                (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX;
 
             // Wider accumulation is required to avoid accuracy loss
             // Case 1: Floating point mixed precision (fp16 src data and fp32 accumulation)
             // Cast 2: Quantized (int8/uint8 src data and int32 accumulation )
             DataType acc_data_type = data_type;
 
-            if(use_fp_mixed_precision)
+            if (use_fp_mixed_precision)
             {
                 acc_data_type = DataType::F32;
             }
-            else if(is_data_type_quantized(data_type) && pool_type != PoolingType::MAX)
+            else if (is_data_type_quantized(data_type) && pool_type != PoolingType::MAX)
             {
                 acc_data_type = DataType::S32;
             }
@@ -250,8 +275,9 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI
             build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(idx_height)));
             build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(idx_channel)));
             build_opts.add_option("-DDST_BATCH_SIZE=" + support::cpp11::to_string(dst->dimension(idx_batch_size)));
-            build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration));
-            if(pool_info.pool_size == Size2D(2, 2) && is_data_type_float(data_type))
+            build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                                  support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration));
+            if (pool_info.pool_size == Size2D(2, 2) && is_data_type_float(data_type))
             {
                 build_opts.add_option_if(indices != nullptr && pool_type == PoolingType::MAX, "-DEXTRACT_MAX_INDEX");
 
@@ -260,7 +286,9 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI
             }
             else
             {
-                std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized_nhwc" : "pooling_layer_MxN_nhwc";
+                std::string kernel_name = is_data_type_quantized_asymmetric(data_type)
                                               ? "pooling_layer_MxN_quantized_nhwc"
+                                              : "pooling_layer_MxN_nhwc";
                 _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
             }
             break;
@@ -290,7 +318,10 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status ClPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status ClPool2dKernel::validate(const ITensorInfo      *src,
+                                const ITensorInfo      *dst,
+                                const PoolingLayerInfo &pool_info,
+                                const ITensorInfo      *indices)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices));
     return Status{};
@@ -301,18 +332,19 @@ void ClPool2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    unsigned int pool_stride_x = 0;
-    unsigned int pool_stride_y = 0;
+    unsigned int pool_stride_x            = 0;
+    unsigned int pool_stride_y            = 0;
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
 
-    const auto src     = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    auto       dst     = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_0));
-    auto       indices = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_1));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst     = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_0));
+    auto indices = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_1));
 
     // Collapse window
     Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
 
-    switch(_data_layout)
+    switch (_data_layout)
     {
         case DataLayout::NCHW:
         {
@@ -323,13 +355,12 @@ void ClPool2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm
                 unsigned int idx = 0;
                 add_3D_tensor_argument(idx, src, slice);
                 add_3D_tensor_argument(idx, dst, slice);
-                if(indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_size == Size2D(2, 2)))
+                if (indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_size == Size2D(2, 2)))
                 {
                     add_3D_tensor_argument(idx, indices, slice);
                 }
                 enqueue(queue, *this, slice, lws_hint());
-            }
-            while(window_collapsed.slide_window_slice_3D(slice));
+            } while (window_collapsed.slide_window_slice_3D(slice));
             break;
         }
         case DataLayout::NHWC:
        {
@@ -338,7 +369,8 @@ void ClPool2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm
 
             Window slice    = window_collapsed.first_slice_window_4D();
             Window in_slice = window_collapsed.first_slice_window_4D();
-            in_slice.set(Window::DimX, Window::Dimension(0, src->info()->dimension(0), _num_elems_processed_per_iteration));
+            in_slice.set(Window::DimX,
+                         Window::Dimension(0, src->info()->dimension(0), _num_elems_processed_per_iteration));
             in_slice.set(Window::DimY, Window::Dimension(0, src->info()->dimension(1), pool_stride_x));
             in_slice.set(Window::DimZ, Window::Dimension(0, src->info()->dimension(2), pool_stride_y));
             in_slice.set(3, Window::Dimension(0, batch_size, 1));
@@ -348,13 +380,13 @@ void ClPool2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm
                 unsigned int idx = 0;
                 add_4D_tensor_argument(idx, src, in_slice);
                 add_4D_tensor_argument(idx, dst, slice);
-                if(indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2)))
+                if (indices && is_data_type_float(src->info()->data_type()) &&
+                    (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2)))
                 {
                     add_4D_tensor_argument(idx, indices, slice);
                 }
                 enqueue(queue, *this, slice, lws_hint());
-            }
-            while(window.slide_window_slice_4D(slice) && window.slide_window_slice_4D(in_slice));
+            } while (window.slide_window_slice_4D(slice) && window.slide_window_slice_4D(in_slice));
             break;
         }
         default:
diff --git a/src/gpu/cl/kernels/ClPool2dKernel.h b/src/gpu/cl/kernels/ClPool2dKernel.h
index f5bb0687e8709cc168976b241dca54eb6f5d054c..56b95a37d5860a1369340cc6b7617ba338b1bd34 100644
--- a/src/gpu/cl/kernels/ClPool2dKernel.h
+++ b/src/gpu/cl/kernels/ClPool2dKernel.h
@@ -50,22 +50,29 @@ public:
      * @param[in]  pool_info       Contains pooling operation information described in @ref PoolingLayerInfo.
      * @param[out] indices         (optional) The indices of the maximal values. Data type supported: U32.
      */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
+    void configure(const ClCompileContext &compile_context,
+                   ITensorInfo            *src,
+                   ITensorInfo            *dst,
+                   const PoolingLayerInfo &pool_info,
+                   ITensorInfo            *indices = nullptr);
     /** Static function to check if given info will lead to a valid configuration
      *
     * Similar to ClPool2dKernel::configure()
      *
     * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
+    static Status validate(const ITensorInfo      *src,
+                           const ITensorInfo      *dst,
+                           const PoolingLayerInfo &pool_info,
+                           const ITensorInfo      *indices = nullptr);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 
 public:
     PoolingLayerInfo _pool_info{};
-    DataLayout       _data_layout{ DataLayout::UNKNOWN };
-    unsigned int     _num_elems_processed_per_iteration{ 1 };
+    DataLayout       _data_layout{DataLayout::UNKNOWN};
+    unsigned int     _num_elems_processed_per_iteration{1};
 };
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClPool3dKernel.cpp b/src/gpu/cl/kernels/ClPool3dKernel.cpp
index d068832fedd9a890dbec6d0ea693e4ece1bd80c5..a08c5d4be71783e026ddfd92458353b807f9409e 100644
--- a/src/gpu/cl/kernels/ClPool3dKernel.cpp
+++ b/src/gpu/cl/kernels/ClPool3dKernel.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -50,10 +51,13 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NDHWC, "Only NDHWC layout supported");
 
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.stride.x() == 0 || pool_info.stride.y() == 0 || pool_info.stride.z() == 0), "Strides cannot be zero.");
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8_SIGNED, DataType::QASYMM8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) && (!pool_info.exclude_padding
-                                                                                && (pool_info.pool_type == PoolingType::AVG)),
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        (pool_info.stride.x() == 0 || pool_info.stride.y() == 0 || pool_info.stride.z() == 0),
+        "Strides cannot be zero.");
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8_SIGNED,
+                                                         DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) &&
+                                        (!pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG)),
                                     "Exclude padding is unsupported for non-float types for Avg op");
 
     const auto data_layout = src->data_layout();
@@ -68,17 +72,21 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
     int output_height = 0;
     int output_depth  = 0;
 
-    bool round_type_ceil_with_asymm_padding = (pool_info.round_type == DimensionRoundingType::CEIL) && (!is_symmetric(pool_info.padding));
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(round_type_ceil_with_asymm_padding, "Cannot use dimension round type CEIL when padding is asymmetric.");
+    bool round_type_ceil_with_asymm_padding =
+        (pool_info.round_type == DimensionRoundingType::CEIL) && (!is_symmetric(pool_info.padding));
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(round_type_ceil_with_asymm_padding,
+                                    "Cannot use dimension round type CEIL when padding is asymmetric.");
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_3d_region_entirely_outside_input(pool_info), "Pooling region that is entirely outside input tensor is unsupported");
-    std::tie(output_width, output_height, output_depth) = scaled_3d_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height],
-                                                                                       src->tensor_shape()[idx_depth], pool_size_x, pool_size_y,
-                                                                                       pool_size_z, pool_info);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_3d_region_entirely_outside_input(pool_info),
+                                    "Pooling region that is entirely outside input tensor is unsupported");
+    std::tie(output_width, output_height, output_depth) =
+        scaled_3d_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height],
+                                    src->tensor_shape()[idx_depth], pool_size_x, pool_size_y, pool_size_z, pool_info);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1), "Calculated output dimension size is invalid");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1),
+                                    "Calculated output dimension size is invalid");
 
     // Checks performed when dst is configured
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
@@ -95,11 +103,14 @@ ClPool3dKernel::ClPool3dKernel()
     _type = CLKernelType::POOL;
 }
 
-void ClPool3dKernel::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info)
+void ClPool3dKernel::configure(const ClCompileContext   &compile_context,
+                               const ITensorInfo        *src,
+                               ITensorInfo              *dst,
+                               const Pooling3dLayerInfo &pool_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info));
-    auto padding_info = get_padding_info({ src, dst });
+    auto padding_info = get_padding_info({src, dst});
 
     // Auto init if empty
     TensorShape out_shape = compute_pool3d_shape(src->tensor_shape(), pool_info);
@@ -112,23 +123,23 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT
     _num_elems_processed_per_iteration = (dst->data_type() == DataType::F32) ? 2 : 4;
     _num_elems_processed_per_iteration = adjust_vec_size(_num_elems_processed_per_iteration, dst->dimension(0));
 
-    const PoolingType pool_type      = pool_info.pool_type;
-    const int         idx_width      = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
-    const int         idx_height     = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-    const int         idx_depth      = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::DEPTH);
-    const int         idx_channel    = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
-    const int         idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES);
-    const int         pool_size_x    = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
-    const int         pool_size_y    = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
-    const int         pool_size_z    = pool_info.is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth;
-    const bool        exclude_padding = pool_info.exclude_padding;
-    const int         pool_stride_x  = pool_info.stride.x();
-    const int         pool_stride_y  = pool_info.stride.y();
-    const int         pool_stride_z  = pool_info.stride.z();
-    const int         pool_pad_top   = pool_info.padding.top;
-    const int         pool_pad_left  = pool_info.padding.left;
-    const int         pool_pad_front = pool_info.padding.front;
-    const DataType    data_type      = src->data_type();
+    const PoolingType pool_type      = pool_info.pool_type;
+    const int         idx_width      = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const int         idx_height     = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+    const int         idx_depth      = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::DEPTH);
+    const int         idx_channel    = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+    const int         idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES);
+    const int         pool_size_x    = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
+    const int         pool_size_y    = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
+    const int         pool_size_z    = pool_info.is_global_pooling ? 
src->dimension(idx_depth) : pool_info.pool_size.depth; + const bool exclude_padding = pool_info.exclude_padding; + const int pool_stride_x = pool_info.stride.x(); + const int pool_stride_y = pool_info.stride.y(); + const int pool_stride_z = pool_info.stride.z(); + const int pool_pad_top = pool_info.padding.top; + const int pool_pad_left = pool_info.padding.left; + const int pool_pad_front = pool_info.padding.front; + const DataType data_type = src->data_type(); // Set build options CLBuildOptions build_opts; @@ -149,7 +160,7 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(idx_depth))); // If datatype is quantized add relevant parameters - if(is_data_type_quantized_asymmetric(data_type) && src->quantization_info() != dst->quantization_info()) + if (is_data_type_quantized_asymmetric(data_type) && src->quantization_info() != dst->quantization_info()) { const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); @@ -161,9 +172,9 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT } // Set the initial value for the pooling operation accordingly with the data type - if(pool_type == PoolingType::MAX) + if (pool_type == PoolingType::MAX) { - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { PixelValue type_min{}; std::tie(type_min, std::ignore) = get_min_max(data_type); @@ -171,7 +182,8 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT } else { - build_opts.add_option("-DINITIAL_VALUE=" + float_to_string_with_full_precision(std::numeric_limits::lowest())); + build_opts.add_option("-DINITIAL_VALUE=" + + float_to_string_with_full_precision(std::numeric_limits::lowest())); } } else @@ -181,16 +193,18 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT } // Create kernel // Floating point mixed precision is support on F16 only - const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX; + const auto use_fp_mixed_precision = + (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX; // Wider accumulation is required to avoid accuracy loss // Case 1: Floating point mixed precision (fp16 src data and fp32 accumulation) DataType acc_data_type = data_type; - if(use_fp_mixed_precision) + if (use_fp_mixed_precision) { acc_data_type = DataType::F32; } - else if(is_data_type_quantized(data_type) && pool_type != PoolingType::MAX) // Use S32 for avg pooling to allow for integer division + else if (is_data_type_quantized(data_type) && + pool_type != PoolingType::MAX) // Use S32 for avg pooling to allow for integer division { acc_data_type = DataType::S32; } @@ -202,11 +216,13 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT build_opts.add_option("-DDST_DEPTH=" + support::cpp11::to_string(dst->dimension(idx_depth))); build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(idx_channel))); build_opts.add_option("-DDST_BATCH_SIZE=" + support::cpp11::to_string(dst->dimension(idx_batch_size))); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % 
_num_elems_processed_per_iteration)); // if datatype is quantized use quantized kernel function - std::string kernel_name = (is_data_type_quantized_asymmetric(data_type) ? "pooling_3d_layer_MxN_ndhwc_quantized" : "pooling_3d_layer_MxN_ndhwc"); - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + std::string kernel_name = (is_data_type_quantized_asymmetric(data_type) ? "pooling_3d_layer_MxN_ndhwc_quantized" + : "pooling_3d_layer_MxN_ndhwc"); + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Configure kernel window Window win = calculate_max_window(*dst, Steps(_num_elems_processed_per_iteration)); @@ -240,8 +256,9 @@ void ClPool3dKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_0)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_0)); // Collapse 3D window Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); diff --git a/src/gpu/cl/kernels/ClPool3dKernel.h b/src/gpu/cl/kernels/ClPool3dKernel.h index 00852349e69017e101a2e4c7209e3a8b5ecc5572..6cd229c427aa318fefcdc4c30e377d6ba7e817b5 100644 --- a/src/gpu/cl/kernels/ClPool3dKernel.h +++ b/src/gpu/cl/kernels/ClPool3dKernel.h @@ -50,7 +50,10 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src. * @param[in] pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo. 
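// --- Editorial sketch (not part of the patch) --------------------------------
// ClPool3dKernel::configure above processes 2 (F32) or 4 (other types) elements
// per iteration, clamps that count to the innermost dimension with
// adjust_vec_size(), and passes the remainder as -DVEC_SIZE_LEFTOVER so the
// OpenCL kernel can finish with a partial vector. toy_adjust_vec_size() below
// is an illustrative stand-in for that idea, not the library's implementation.
#include <cstddef>
#include <iostream>

static unsigned int toy_adjust_vec_size(unsigned int requested, std::size_t dim0)
{
    // Halve the requested vector width until it fits the innermost dimension.
    while (requested > 1 && requested > dim0)
    {
        requested /= 2;
    }
    return requested;
}

int main()
{
    const std::size_t  dim0     = 9;                            // e.g. dst->dimension(0)
    const unsigned int vec_size = toy_adjust_vec_size(4, dim0); // stays 4, it already fits
    const std::size_t  leftover = dim0 % vec_size;              // 9 % 4 == 1 trailing element

    std::cout << "vec size: " << vec_size << ", leftover: " << leftover << "\n";
    return 0;
}
// ------------------------------------------------------------------------------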
*/ - void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info); + void configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const Pooling3dLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClPool3dKernel::configure() @@ -64,8 +67,8 @@ public: private: Pooling3dLayerInfo _pool_info{}; - DataLayout _data_layout{ DataLayout::UNKNOWN }; - unsigned int _num_elems_processed_per_iteration{ 1 }; + DataLayout _data_layout{DataLayout::UNKNOWN}; + unsigned int _num_elems_processed_per_iteration{1}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClQuantizeKernel.cpp b/src/gpu/cl/kernels/ClQuantizeKernel.cpp index 5c8bf97f0f00ed1e6ff95acfad6834e7e6f88b7f..e8df420f6717d1a10f4f5aee2182e74c0c1adafb 100644 --- a/src/gpu/cl/kernels/ClQuantizeKernel.cpp +++ b/src/gpu/cl/kernels/ClQuantizeKernel.cpp @@ -29,13 +29,12 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -50,12 +49,14 @@ namespace Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F32, DataType::F16); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); // Output must always be initialized ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QASYMM16); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); return Status{}; @@ -71,7 +72,7 @@ void ClQuantizeKernel::configure(const CLCompileContext &compile_context, const { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); @@ -84,7 +85,7 @@ void ClQuantizeKernel::configure(const CLCompileContext &compile_context, const float scale_to_apply = qinfo.scale; int32_t offset_to_apply = qinfo.offset; - if(is_data_type_quantized_asymmetric(src->data_type())) + if (is_data_type_quantized_asymmetric(src->data_type())) { /* * In case of requantization of a quantized input tensor to an output tensor with another quantization @@ -132,8 +133,10 @@ void ClQuantizeKernel::configure(const CLCompileContext &compile_context, const build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output_data_type)); - build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + 
support::cpp11::to_string(std::max(input_width_x - vec_size_x, 0))); - std::pair min_max_quant_values = quantization::get_min_max_values_from_quantized_data_type(output_data_type); + build_opts.add_option_if( + multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max(input_width_x - vec_size_x, 0))); + std::pair min_max_quant_values = + quantization::get_min_max_values_from_quantized_data_type(output_data_type); build_opts.add_option("-DMIN_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.first)); build_opts.add_option("-DMAX_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.second)); @@ -141,9 +144,10 @@ void ClQuantizeKernel::configure(const CLCompileContext &compile_context, const // Configure kernel window Window win = calculate_max_window(*src, Steps()); - if(multi_access_x) + if (multi_access_x) { - win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); + win.set(Window::DimX, + Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); } ICLKernel::configure_internal(win); @@ -173,8 +177,7 @@ void ClQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, cl::Co add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); + } while (window_collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClReshapeKernel.cpp b/src/gpu/cl/kernels/ClReshapeKernel.cpp index 121bb33edf3caeb78463b04a414b3aaef8267db1..53889f3a6b4dc7bc5892658224995a6b16c70eb1 100644 --- a/src/gpu/cl/kernels/ClReshapeKernel.cpp +++ b/src/gpu/cl/kernels/ClReshapeKernel.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" @@ -51,7 +52,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - if(dst->tensor_shape().total_size() != 0) + if (dst->tensor_shape().total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); @@ -72,27 +73,17 @@ void ClReshapeKernel::configure(const CLCompileContext &compile_context, const I ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); // Create kernel - std::set build_opts = { "-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size()) }; + std::set build_opts = {"-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size())}; _kernel = create_kernel(compile_context, "reshape_layer", build_opts); // Add static arguments - const cl_int2 src_shape = - { - { - static_cast(src->tensor_shape()[0]), - static_cast(src->tensor_shape()[1]) - } - }; - const cl_int2 dst_shape = - { - { - static_cast(dst->tensor_shape()[0]), - static_cast(dst->tensor_shape()[1]) - } - }; + const cl_int2 src_shape = { + {static_cast(src->tensor_shape()[0]), static_cast(src->tensor_shape()[1])}}; + const cl_int2 dst_shape = { + {static_cast(dst->tensor_shape()[0]), 
static_cast(dst->tensor_shape()[1])}}; unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters _kernel.setArg(idx++, src_shape); _kernel.setArg(idx++, dst_shape); @@ -119,8 +110,9 @@ void ClReshapeKernel::run_op(ITensorPack &tensors, const Window &window, cl::Com Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); Window slice = window_collapsed.first_slice_window_3D(); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); // Set srcs unsigned int idx = 0; diff --git a/src/gpu/cl/kernels/ClReshapeKernel.h b/src/gpu/cl/kernels/ClReshapeKernel.h index db6ab5da585cf633ca7330cf025f365efbe97f96..95eae8208690e7078589e9a7763577d68e951b31 100644 --- a/src/gpu/cl/kernels/ClReshapeKernel.h +++ b/src/gpu/cl/kernels/ClReshapeKernel.h @@ -58,7 +58,7 @@ public: // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; }; -} // namespace opencl } // namespace kernels +} // namespace opencl } // namespace arm_compute #endif /* ARM_COMPUTE_CL_RESHAPE_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClScaleKernel.cpp b/src/gpu/cl/kernels/ClScaleKernel.cpp index 4c4373a215381d88616d3364e0676995609b4628..4305acad262a721be285d25f4ec1c4c892d9a17e 100644 --- a/src/gpu/cl/kernels/ClScaleKernel.cpp +++ b/src/gpu/cl/kernels/ClScaleKernel.cpp @@ -27,8 +27,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/InterpolationPolicyUtils.h" +#include "arm_compute/core/utils/StringUtils.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" @@ -43,7 +44,8 @@ namespace kernels { namespace { -inline std::tuple calculate_scale_factors(const ITensorInfo *src, const ITensorInfo *dst, DataLayout data_layout, bool align_corners) +inline std::tuple +calculate_scale_factors(const ITensorInfo *src, const ITensorInfo *dst, DataLayout data_layout, bool align_corners) { const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); @@ -64,20 +66,25 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::U8, DataType::S16, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(dst == src); - ARM_COMPUTE_RETURN_ERROR_ON(src->num_channels()!=1); - ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && 
!arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy)); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(src->data_type()) && !is_data_type_quantized_asymmetric(src->data_type())); + ARM_COMPUTE_RETURN_ERROR_ON(src->num_channels() != 1); + ARM_COMPUTE_RETURN_ERROR_ON( + info.align_corners && + !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy)); + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(src->data_type()) && + !is_data_type_quantized_asymmetric(src->data_type())); float scale_x = 0.f; float scale_y = 0.f; const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout; - std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, data_layout, info.align_corners); + std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, data_layout, info.align_corners); - ARM_COMPUTE_RETURN_ERROR_ON(info.interpolation_policy == InterpolationPolicy::AREA && (scale_x > 1.f || scale_y > 1.f)); + ARM_COMPUTE_RETURN_ERROR_ON(info.interpolation_policy == InterpolationPolicy::AREA && + (scale_x > 1.f || scale_y > 1.f)); return Status{}; } @@ -94,23 +101,26 @@ ClScaleKernel::ClScaleKernel() _type = CLKernelType::ELEMENTWISE; } -void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info) +void ClScaleKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const ScaleKernelInfo &info) { ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, info)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); // Info required for the static tuning _data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout; const bool is_nhwc = _data_layout == DataLayout::NHWC; - float scale_x = 0.f; - float scale_y = 0.f; + float scale_x = 0.f; + float scale_y = 0.f; std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, _data_layout, info.align_corners); // Area interpolation behaves as Nearest Neighbour in case of up-sampling auto interpolation_policy_to_use = info.interpolation_policy; - if(info.interpolation_policy == InterpolationPolicy::AREA && scale_x <= 1.f && scale_y <= 1.f) + if (info.interpolation_policy == InterpolationPolicy::AREA && scale_x <= 1.f && scale_y <= 1.f) { interpolation_policy_to_use = InterpolationPolicy::NEAREST_NEIGHBOR; } @@ -127,7 +137,7 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn unsigned int vec_size_leftover = 0; CLBuildOptions build_opts; - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { vec_size = adjust_vec_size(src->data_type() == DataType::F32 ? 
4 : 8, dst_channels); vec_size_leftover = dst_channels % vec_size; @@ -135,7 +145,8 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn build_opts.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DDST_TENSOR_TYPE=BUFFER"); build_opts.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); - build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type())); + build_opts.add_option("-DCONSTANT_VALUE=" + + string_from_pixel_value(info.constant_border_value, src->data_type())); build_opts.add_option("-DN0=" + support::cpp11::to_string(vec_size)); build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(vec_size_leftover)); build_opts.add_option("-DSCALE_" + string_from_interpolation_policy(interpolation_policy_to_use)); @@ -144,27 +155,33 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT"); build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS"); build_opts.add_option_if(is_data_type_float(src->data_type()), "-DIS_FLOATING_POINT"); - build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT"); + build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", + "-DSAMPLING_POLICY_TOP_LEFT"); } - else if(_data_layout == DataLayout::NCHW) + else if (_data_layout == DataLayout::NCHW) { vec_size = adjust_vec_size(4, dst_width); vec_size_leftover = dst_width % vec_size; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type())); + build_opts.add_option("-DCONSTANT_VALUE=" + + string_from_pixel_value(info.constant_border_value, src->data_type())); build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_width)); build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_height)); build_opts.add_option("-DSCALE_X=" + float_to_string_with_full_precision(scale_x)); build_opts.add_option("-DSCALE_Y=" + float_to_string_with_full_precision(scale_y)); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + ((vec_size_leftover == 0) ? support::cpp11::to_string(vec_size) : support::cpp11::to_string(vec_size_leftover))); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + ((vec_size_leftover == 0) + ? 
support::cpp11::to_string(vec_size) + : support::cpp11::to_string(vec_size_leftover))); build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE"); build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT"); build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS"); - build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT"); + build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", + "-DSAMPLING_POLICY_TOP_LEFT"); - const bool is_qasymm_bilinear = is_data_type_quantized_asymmetric(src->data_type()) && info.interpolation_policy == InterpolationPolicy::BILINEAR; - if(is_qasymm_bilinear) + const bool is_qasymm_bilinear = is_data_type_quantized_asymmetric(src->data_type()) && + info.interpolation_policy == InterpolationPolicy::BILINEAR; + if (is_qasymm_bilinear) { const UniformQuantizationInfo qinfo = src->quantization_info().uniform(); build_opts.add_option("-DSCALE=" + support::cpp11::to_string(qinfo.scale)); @@ -190,7 +207,7 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); // Pass scale kernel arguments - if(is_nhwc) + if (is_nhwc) { unsigned int idx = 2 * num_arguments_per_4d_tensor_nhwc(); _kernel.setArg(idx++, scale_x); @@ -219,7 +236,7 @@ void ClScaleKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comma auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - switch(_data_layout) + switch (_data_layout) { case DataLayout::NCHW: { @@ -231,8 +248,7 @@ void ClScaleKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comma add_2D_tensor_argument(idx, src, slice); add_2D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); + } while (window.slide_window_slice_2D(slice)); break; } case DataLayout::NHWC: diff --git a/src/gpu/cl/kernels/ClScaleKernel.h b/src/gpu/cl/kernels/ClScaleKernel.h index dd09e92ee2975da668e861039ad7bf38f5001aab..c09659017d367ed6b67592936d717ffd061ae7d3 100644 --- a/src/gpu/cl/kernels/ClScaleKernel.h +++ b/src/gpu/cl/kernels/ClScaleKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_SCALE_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -49,7 +50,8 @@ public: * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. * @param[in] info @ref ScaleKernelInfo Kernel descriptor to be used to configure. 
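// --- Editorial sketch (not part of the patch) --------------------------------
// ClScaleKernel::configure above downgrades AREA interpolation to
// NEAREST_NEIGHBOR when both scale factors are <= 1. Assuming the factors are
// input/output size ratios (consistent with the "up-sampling" comment in the
// hunk), the rule can be modelled standalone like this; scale_factors() and
// Policy are illustrative names, not Compute Library API.
#include <iostream>
#include <utility>

enum class Policy { NEAREST_NEIGHBOR, BILINEAR, AREA };

static std::pair<float, float> scale_factors(int src_w, int src_h, int dst_w, int dst_h)
{
    return {static_cast<float>(src_w) / static_cast<float>(dst_w),
            static_cast<float>(src_h) / static_cast<float>(dst_h)};
}

int main()
{
    const auto [sx, sy] = scale_factors(16, 16, 32, 32); // up-sampling: factors 0.5, 0.5

    Policy policy = Policy::AREA;
    if (policy == Policy::AREA && sx <= 1.f && sy <= 1.f)
    {
        policy = Policy::NEAREST_NEIGHBOR; // mirrors interpolation_policy_to_use above
    }

    std::cout << "effective policy: " << (policy == Policy::NEAREST_NEIGHBOR ? "nearest" : "area") << "\n";
    return 0;
}
// ------------------------------------------------------------------------------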
*/ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info); + void + configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClScaleKernel::configure() @@ -62,7 +64,7 @@ public: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - DataLayout _data_layout{ DataLayout::UNKNOWN }; + DataLayout _data_layout{DataLayout::UNKNOWN}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClSoftmaxKernel.cpp b/src/gpu/cl/kernels/ClSoftmaxKernel.cpp index 59299fa4415c5e53859988fd680bb45c77fae958..796345a9237fb9e72d5ac797669b5acdc5131a83 100644 --- a/src/gpu/cl/kernels/ClSoftmaxKernel.cpp +++ b/src/gpu/cl/kernels/ClSoftmaxKernel.cpp @@ -22,330 +22,242 @@ * SOFTWARE. */ #include "src/gpu/cl/kernels/ClSoftmaxKernel.h" + +#include "arm_compute/core/CL/CLCompileContext.h" +#include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/CoreTypes.h" +#include "arm_compute/core/Dimensions.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/Steps.h" +#include "arm_compute/core/TensorShape.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/utils/DataTypeUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" #include "support/StringSupport.h" +#include + namespace arm_compute { namespace opencl { namespace kernels { -namespace -{ -/** Calculates softmax parameters from the quantized input scale and scaling factor for the exponent and places them as build options. - * - * Prepares these build options: - * -INPUT_BETA_MULTIPLIER, INPUT_BETA_LEFT_SHIFT - quantized representation of beta multiplier. - * -DIFF_MIN - threshold difference between maximum value of input data and current processed value, - * it defines whether the value will be taken into account or not. 
- * - * @param[in] build_opts Build options to extend - * @param[in] input_scale Input scaling factor - * @param[in] beta Exponent scaling factor beta - */ -CLBuildOptions prepare_quantized_softmax_build_options(float input_scale, float beta) -{ - // Number of integer bits in temporary fixed-point representation of current-to-max difference - static const int scaled_diff_int_bits = 5; - // Number of integer bits used in temporary fixed-point representation of exponent accumulator - static const int exp_accumulation_in_bits = 12; - - const double beta_multiplier = std::min( - 1.0 * beta * input_scale * (1 << (31 - scaled_diff_int_bits)), - (1LL << 31) - 1.0); - int input_beta_multiplier; - int input_beta_left_shift; - quantization::calculate_quantized_multiplier_greater_than_one(beta_multiplier, &input_beta_multiplier, &input_beta_left_shift); - - const double max_input_rescaled = 1.0 * ((1 << scaled_diff_int_bits) - 1) * (1LL << (31 - scaled_diff_int_bits)) / (1LL << input_beta_left_shift); - const int diff_min = -1.f * std::floor(max_input_rescaled); - - CLBuildOptions build_opts; - build_opts.add_option("-DSCALED_DIFF_INT_BITS=" + support::cpp11::to_string(scaled_diff_int_bits)); - build_opts.add_option("-DEXP_ACCUMULATION_INT_BITS=" + support::cpp11::to_string(exp_accumulation_in_bits)); - build_opts.add_option("-DINPUT_BETA_MULTIPLIER=" + support::cpp11::to_string(input_beta_multiplier)); - build_opts.add_option("-DINPUT_BETA_LEFT_SHIFT=" + support::cpp11::to_string(input_beta_left_shift)); - build_opts.add_option("-DDIFF_MIN=" + support::cpp11::to_string(diff_min)); - return build_opts; +ClSoftmaxKernel::ClSoftmaxKernel() +{ } -Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum) +Status ClSoftmaxKernel::validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info) { - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &max); - - const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type()); + ARM_COMPUTE_UNUSED(src, dst, info); - // Checks performed when output is configured - if(dst.total_size() != 0) - { - if(is_quantized_asymmetric) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::S32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst); - } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst); - } - - // Checks performed when sum is configured - if(sum.total_size() != 0) - { - if(is_quantized_asymmetric) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&sum, 1, DataType::S32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&max, &sum); - } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&max, &sum); - } + ARM_COMPUTE_RETURN_ERROR_ON(src.num_dimensions() > 4); - return Status{}; -} + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst); -Status validate_arguments_1DNorm(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::S32, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &sum); - ARM_COMPUTE_RETURN_ERROR_ON(info.is_log && 
!is_data_type_float(info.input_data_type)); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN( // + &src, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst); - // Note: output should always have a scale of 1/256 and offset 0 - const QuantizationInfo allowed_quantization_info = get_softmax_output_quantization_info(info.input_data_type, info.is_log); - const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type); + ARM_COMPUTE_RETURN_ERROR_ON(info.input_data_type != src.data_type()); + ARM_COMPUTE_RETURN_ERROR_ON(info.axis < static_cast(-src.num_dimensions()) || + static_cast(src.num_dimensions()) <= info.axis); - // Checks performed when output is configured - if(dst.total_size() != 0) + if (is_data_type_quantized_asymmetric(src.data_type())) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst); - if(!is_quantized_asymmetric) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() != allowed_quantization_info); - } + ARM_COMPUTE_RETURN_ERROR_ON(src.quantization_info().uniform().scale < 0); + + ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() != + get_softmax_output_quantization_info(src.data_type(), info.is_log)); } return Status{}; } -} // namespace - -/**< Grid size (obtained through auto-tuning) */ -const unsigned int ClLogits1DMaxShiftExpSumKernel::_grid_size = 64; -/**< Vector size in the serial case (obtained through auto-tuning) */ -const unsigned int ClLogits1DMaxShiftExpSumKernel::_serial_vector_size = 8; -/**< Vector size in the parallel case (obtained through auto-tuning, enables the best memory access pattern for Bifrost) .*/ -const unsigned int ClLogits1DMaxShiftExpSumKernel::_parallel_vector_size = 4; -ClLogits1DMaxShiftExpSumKernel::ClLogits1DMaxShiftExpSumKernel() +void ClSoftmaxKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo &src, + ITensorInfo &dst, + const SoftmaxKernelInfo &info) { - _type = CLKernelType::ELEMENTWISE; -} + ARM_COMPUTE_UNUSED(compile_context, src, dst, info); -void ClLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &max, ITensorInfo &dst, ITensorInfo &sum, const SoftmaxKernelInfo &info) -{ - auto padding_info = get_padding_info({ &src, &max, &dst, &sum }); + const auto &dst_shape = dst.tensor_shape(); - // Output auto initialization if not yet initialized - auto_init_if_empty(sum, src.clone()->set_tensor_shape(max.tensor_shape())); - auto_init_if_empty(dst, *src.clone()); + const auto data_type = src.data_type(); + const auto element_size = src.element_size(); - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DMaxShiftExpSum(src, max, dst, sum)); + const auto is_quantized = data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED; + const auto src_qinfo = src.quantization_info().uniform(); + const auto dst_qinfo = dst.quantization_info().uniform(); - const DataType dt = src.data_type(); - const UniformQuantizationInfo qinfo = src.quantization_info().uniform(); - const size_t reduction_dim_size = src.dimension(0); - const float beta = info.beta; - const auto is_signed_qasymm8 = is_data_type_quantized_asymmetric_signed(info.input_data_type); - const int min_value = is_signed_qasymm8 ? 
CL_SCHAR_MIN : 0; + const auto axis = wrap_around(info.axis, static_cast(src.num_dimensions())); + const auto length = dst_shape[axis]; - const unsigned int vector_size = adjust_vec_size(_serial_vector_size, reduction_dim_size); + const auto tmp_data_type = is_quantized ? DataType::F32 : data_type; - // Set build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)); - build_opts.add_option("-DMIN_VALUE=" + support::cpp11::to_string(min_value)); - build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size)); - build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(reduction_dim_size)); - build_opts.add_option("-DVECTOR_SIZE_LEFTOVER=" + support::cpp11::to_string(reduction_dim_size % vector_size)); - build_opts.add_option("-DLOG_VECTOR_SIZE=" + support::cpp11::to_string(lround(log2(vector_size)))); - build_opts.add_option_if((reduction_dim_size % vector_size) != 0, "-DNON_MULTIPLE_OF_VECTOR_SIZE"); - build_opts.add_option_if(is_signed_qasymm8, "-DQASYMM8_SIGNED"); - build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), "-DBETA=" + float_to_string_with_full_precision(beta)); - build_opts.add_option_if(is_data_type_float(dt) && info.is_log, "-DLOG_SOFTMAX"); - build_opts.add_option_if(is_data_type_float(dt), "-DMINVAL=" + ((dt == DataType::F16) ? std::string("-HALF_MAX") : std::string("-FLT_MAX"))); - build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DSCALE=" + float_to_string_with_full_precision(qinfo.scale)); - build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DBETA=" + float_to_string_with_full_precision(beta)); - build_opts.add_options_if(is_data_type_quantized_asymmetric(dt), prepare_quantized_softmax_build_options(qinfo.scale, beta).options()); - - cl::NDRange lws_hint(cl::NullRange); - std::string kernel_name = std::string("softmax_layer_max_shift_exp_sum_") + (is_data_type_quantized_asymmetric(dt) ? "quantized_" : "") + "serial"; - - // Create kernel. - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure window - Window win = calculate_max_window(src, Steps(reduction_dim_size)); - IClKernel::configure_internal(win, lws_hint); + const auto vec_size = adjust_vec_size(16 / element_size, dst_shape[0]); + const auto vec_size_leftover = dst_shape[0] % vec_size; - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClLogits1DMaxShiftExpSumKernel::validate(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DMaxShiftExpSum(src, max, dst, sum)); - return Status{}; -} + std::string kernel_name("softmax"); + CLBuildOptions build_opts; -ClLogits1DMaxShiftExpSumKernel::ParallelReductionInfo ClLogits1DMaxShiftExpSumKernel::is_parallel_reduction(size_t size) -{ - bool is_parallel_reduction = (size >= (_grid_size * _serial_vector_size)) && (_grid_size > 1); - unsigned int vector_size = is_parallel_reduction ? 
_parallel_vector_size : _serial_vector_size; - return std::make_tuple(is_parallel_reduction, vector_size); -} + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); + build_opts.add_option("-DTMP_DATA_TYPE=" + get_cl_type_from_data_type(tmp_data_type)); + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover)); + build_opts.add_option("-DLENGTH=" + support::cpp11::to_string(length)); + build_opts.add_option_if(info.is_log, "-DIS_LOG"); + build_opts.add_option("-DBETA=" + float_to_string_with_full_precision(info.beta)); + + build_opts.add_option_if(is_quantized, "-DIS_QUANTIZED"); + build_opts.add_option_if(is_quantized, "-DSRC_OFFSET=" + float_to_string_with_full_precision(src_qinfo.offset)); + build_opts.add_option_if(is_quantized, "-DSRC_SCALE=" + float_to_string_with_full_precision(src_qinfo.scale)); + build_opts.add_option_if(is_quantized, "-DDST_OFFSET=" + float_to_string_with_full_precision(dst_qinfo.offset)); + build_opts.add_option_if(is_quantized, "-DDST_SCALE=" + float_to_string_with_full_precision(dst_qinfo.scale)); + + if (axis == 0) + { + kernel_name += "_x"; + build_opts.add_option("-DSOFTMAX_X"); -void ClLogits1DMaxShiftExpSumKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + if (is_quantized) + { + _tmp_info = TensorInfo(dst_shape, 1, tmp_data_type); + } + } + else + { + kernel_name += "_non_x"; + build_opts.add_option("-DSOFTMAX_NON_X"); - auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - auto max = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_INT_0)); - auto sum = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_INT_1)); + TensorShape tmp_shape; - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, max, sum); + tmp_shape.set(0, length * vec_size, false); + tmp_shape.set(1, dst_shape[0] + (vec_size - vec_size_leftover) % vec_size, false); - // Collapse window in Z dimension - Window window_collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ); + for (size_t i = 2; i <= static_cast(axis); ++i) + { + tmp_shape.set(i, dst_shape[i - 1], false); + } - // Reconfigure window in case of parallel reduction - ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(src->info()->dimension(0)); - if(std::get<0>(parallel_reduction_info)) - { - // Launch grid_size parallel work items - window_collapsed.set(Window::DimX, Window::Dimension(0, _grid_size, 1)); - } + for (size_t i = axis + 1; i < dst_shape.num_dimensions(); ++i) + { + tmp_shape.set(i, dst_shape[i], false); + } - // Get slices - Window slice = window_collapsed.first_slice_window_3D(); - do - { - unsigned int idx = 0; - // Set inputs - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, max, slice); - add_3D_tensor_argument(idx, dst, slice); - add_3D_tensor_argument(idx, sum, slice); - enqueue(queue, *this, slice, lws_hint()); + _tmp_info = TensorInfo(tmp_shape, 1, tmp_data_type); } - while(window_collapsed.slide_window_slice_3D(slice)); -} -ClLogits1DNormKernel::ClLogits1DNormKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClLogits1DNormKernel::configure(const CLCompileContext &compile_context, const 
ITensorInfo &src, const ITensorInfo &sum, ITensorInfo &dst, const SoftmaxKernelInfo &info) -{ - auto padding_info = get_padding_info({ &src, &dst, &sum }); - - // Note: output should always have a scale of 1/256 and offset 0 - const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type); - const DataType output_data_type = info.input_data_type; - const QuantizationInfo allowed_quantization_info = get_softmax_output_quantization_info(info.input_data_type, info.is_log); - const UniformQuantizationInfo qinfo = src.quantization_info().uniform(); + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - // Output auto initialization if not yet initialized - auto_init_if_empty(dst, src.clone()->set_data_type(output_data_type).set_quantization_info(allowed_quantization_info)); + // Configure kernel window and kernel arguments. + Window win = calculate_max_window(src, Steps(vec_size)); - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DNorm(src, sum, dst, info)); + bool has_collapsed = true; - const auto is_signed_qasymm8 = is_data_type_quantized_asymmetric_signed(info.input_data_type); - const int min_value = is_signed_qasymm8 ? CL_SCHAR_MIN : 0; - const unsigned int vector_size = adjust_vec_size(16, src.dimension(0)); + win = win.shift_dimensions(1, axis); // Remove this axis from the window/GWS. + win = win.collapse_if_possible(win, 2, has_collapsed); + ARM_COMPUTE_ERROR_ON(!has_collapsed); - // Set build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(info.input_data_type)); - build_opts.add_option("-DMIN_VALUE=" + support::cpp11::to_string(min_value)); - build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size)); - build_opts.add_option("-DVECTOR_SIZE_LEFTOVER=" + support::cpp11::to_string(src.dimension(0) % vector_size)); - build_opts.add_option_if(is_data_type_quantized_asymmetric_signed(info.input_data_type), "-DQASYMM8_SIGNED"); - build_opts.add_options_if(is_quantized_asymmetric, - prepare_quantized_softmax_build_options(qinfo.scale, info.beta).options()); - build_opts.add_option_if(info.is_log, "-DLOG_SOFTMAX"); - build_opts.add_option_if(is_quantized_asymmetric, "-DSCALE=" + float_to_string_with_full_precision(qinfo.scale)); - build_opts.add_option_if(is_quantized_asymmetric, "-DBETA=" + float_to_string_with_full_precision(info.beta)); - - // Create kernel - std::string kernel_name = std::string("softmax_layer_norm") + (is_quantized_asymmetric ? 
"_quantized" : ""); - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure window - auto win = calculate_max_window(src, Steps(vector_size)); ICLKernel::configure_internal(win); - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} + _axis = axis; -Status ClLogits1DNormKernel::validate(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DNorm(src, sum, dst, info)); - - return Status{}; + _config_id = "softmax_" + lower_string(string_from_data_type(data_type)); + _config_id += "_" + std::to_string(axis); + _config_id += "_" + std::to_string(length); } -void ClLogits1DNormKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) +void ClSoftmaxKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - auto sum = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_INT_0)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + ICLTensor *tmp = (_tmp_info.total_size() > 0) + ? utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_INT_0)) + : nullptr; - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, sum); + if (!_prepared) + { + _prepared = true; + + const auto *src_info = src->info(); + const auto *dst_info = dst->info(); + auto src_strides = src_info->strides_in_bytes(); + auto dst_strides = dst_info->strides_in_bytes(); + + const auto src_stride_axis = src_strides[_axis]; + const auto dst_stride_axis = dst_strides[_axis]; + + // This axis has been removed from execution window, hence we remove it from the list of strides + // provided to the kernel. + // In case axis > 0, src/dst_stride_axis will be provided in dedicated argument independent from global ID. + src_strides.remove(_axis); + dst_strides.remove(_axis); + + // Argument 0: src_ptr. + _kernel.setArg(1, src_strides[0]); + _kernel.setArg(2, src_strides[1]); + _kernel.setArg(3, src_strides[2]); + _kernel.setArg(4, src_info->offset_first_element_in_bytes()); + + // Argument 5: dst_ptr. + _kernel.setArg(6, dst_strides[0]); + _kernel.setArg(7, dst_strides[1]); + _kernel.setArg(8, dst_strides[2]); + _kernel.setArg(9, dst_info->offset_first_element_in_bytes()); + + if (tmp != nullptr) + { + const auto *tmp_info = tmp->info(); + const auto &tmp_strides = tmp_info->strides_in_bytes(); + + // Argument 10: tmp_ptr. 
+ _kernel.setArg(11, tmp_strides[1]); + _kernel.setArg(12, tmp_strides[2]); + _kernel.setArg(13, tmp_strides[3]); + _kernel.setArg(14, 0); + } + + if (_axis > 0) + { + _kernel.setArg(15, src_stride_axis); + _kernel.setArg(16, dst_stride_axis); + } + } - Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = window_collapsed.first_slice_window_3D(); + _kernel.setArg(0, src->cl_buffer()); + _kernel.setArg(5, dst->cl_buffer()); - do + if (tmp != nullptr) { - Window sum_slice = slice; - sum_slice.set(Window::DimX, Window::Dimension(0, 1, 1)); - - unsigned int idx = 0; - // Set inputs - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, sum, sum_slice); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); + _kernel.setArg(10, tmp->cl_buffer()); } - while(window_collapsed.slide_window_slice_3D(slice)); + + enqueue(queue, *this, window, lws_hint()); } + +const TensorInfo &ClSoftmaxKernel::tmp_tensor_info() const +{ + return _tmp_info; +} + } // namespace kernels } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClSoftmaxKernel.h b/src/gpu/cl/kernels/ClSoftmaxKernel.h index a221e12132ed59885049380150accf9ce128c245..130dc7835c07cedc34b61915d5c233816ce1a0ed 100644 --- a/src/gpu/cl/kernels/ClSoftmaxKernel.h +++ b/src/gpu/cl/kernels/ClSoftmaxKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,11 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CL_SOFTMAX_KERNEL_H -#define ARM_COMPUTE_CL_SOFTMAX_KERNEL_H +#ifndef ACL_SRC_GPU_CL_KERNELS_CLSOFTMAXKERNEL_H +#define ACL_SRC_GPU_CL_KERNELS_CLSOFTMAXKERNEL_H #include "arm_compute/core/Error.h" #include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Window.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -36,83 +39,47 @@ namespace opencl { namespace kernels { -/** Interface for max, shifting, exponentiating and summing the logits */ -class ClLogits1DMaxShiftExpSumKernel : public IClKernel -{ - /**< Grid size (obtained through auto-tuning) */ - static const unsigned int _grid_size; - /**< Vector size in the serial case (obtained through auto-tuning) */ - static const unsigned int _serial_vector_size; - /**< Vector size in the parallel case (obtained through auto-tuning, enables the best memory access pattern for Bifrost) .*/ - static const unsigned int _parallel_vector_size; +/** The CL kernel that performs softmax function. */ +class ClSoftmaxKernel : public IClKernel +{ public: - /** Info for whether a parallel reduction will be run and the vector size of the execution. */ - using ParallelReductionInfo = std::tuple; + ClSoftmaxKernel(); - ClLogits1DMaxShiftExpSumKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClLogits1DMaxShiftExpSumKernel); - /** Configure the kernel using the given information about tensors - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 - * @param[in,out] max Max values tensor. Data types supported: same as @p src - * @param[out] dst Destination tensor. Data types supported: same as @p src - * @param[out] sum Sum of 1D logits tensor. 
Data types supported: same as @p src - * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo. - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &max, ITensorInfo &dst, ITensorInfo &sum, const SoftmaxKernelInfo &info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClLogits1DMaxShiftExpSumKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum); - /** Checks if the given size is eligible for parallel reduction - * - * @note Serial reduction is launched for width < (_grid_size * _serial_vector_size). - * @note Parallel reduction is launched for width >= (_grid_size * _serial_vector_size) and vector_size is forced to 4. + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClSoftmaxKernel); + + /** Check if the kernel arguments are valid. * - * @param[in] size Size to check + * See @ref ClSoftmaxKernel::configure(). * - * @return A two-element tuple where the first element is a boolean specifying if a parallel reduction will be run, - * while the second element is the vector size of the execution. + * @return The status. */ - static ParallelReductionInfo is_parallel_reduction(size_t size); + static Status validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info); - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; -}; - -/** Interface for calculating the final step of the Softmax Layer where each logit value is multiplied by the inverse of the sum of the logits. */ -class ClLogits1DNormKernel : public IClKernel -{ -public: - ClLogits1DNormKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClLogits1DNormKernel); - - /** Set the input and output tensors. + /** Configure the kernel. * * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor. Data types supported: S32/F16/F32. If this kernel is used for log softmax, only F32/F16 is supported. - * @param[in] sum Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input - * @param[out] dst Destination tensor. Data types supported: QASYMM8/QASYMM8_SIGNED for S32 @p input, or same as @p input + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 for Softmax and F16/F32 for Log Softmax + * @param[out] dst Destination tensor info. Data types supported: same as @p src * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo. 
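// --- Editorial sketch (not part of the patch) --------------------------------
// What the configured kernel computes along the chosen axis is a numerically
// stable softmax scaled by BETA, or its log variant when IS_LOG is defined.
// reference_softmax() below is a plain C++ model for clarity; it mirrors the
// build options, not the OpenCL source itself.
#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

static std::vector<float> reference_softmax(std::vector<float> x, float beta, bool is_log)
{
    const float max_val = *std::max_element(x.begin(), x.end());

    float sum = 0.f;
    for (float &v : x)
    {
        v = beta * (v - max_val); // subtract the max first for numerical stability
        sum += std::exp(v);
    }

    for (float &v : x)
    {
        v = is_log ? v - std::log(sum) : std::exp(v) / sum;
    }
    return x;
}

int main()
{
    for (float v : reference_softmax({1.f, 2.f, 3.f}, /* beta */ 1.f, /* is_log */ false))
    {
        std::cout << v << ' ';
    }
    std::cout << '\n';
    return 0;
}
// ------------------------------------------------------------------------------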
*/ - void configure(const CLCompileContext &compile_context, const ITensorInfo &src, const ITensorInfo &sum, ITensorInfo &dst, const SoftmaxKernelInfo &info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClLogits1DNormKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo &src, + ITensorInfo &dst, + const SoftmaxKernelInfo &info); + + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; + /** Get the tensor info of the temporary tensor. */ + const TensorInfo &tmp_tensor_info() const; + +private: + bool _prepared{false}; + int32_t _axis{0}; + TensorInfo _tmp_info{}; }; + } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_SOFTMAX_KERNEL_H */ +#endif // ACL_SRC_GPU_CL_KERNELS_CLSOFTMAXKERNEL_H diff --git a/src/gpu/cl/kernels/ClTransposeKernel.cpp b/src/gpu/cl/kernels/ClTransposeKernel.cpp index 6450ffb5b22a39923b100b81e649f78db928bb68..f95a215107bda367bdaa33e372eab9cdbc113405 100644 --- a/src/gpu/cl/kernels/ClTransposeKernel.cpp +++ b/src/gpu/cl/kernels/ClTransposeKernel.cpp @@ -29,9 +29,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -57,14 +58,37 @@ void ClTransposeKernel::configure(const CLCompileContext &compile_context, const const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src); auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape)); + // Explicitly set the tensor shape to preserve dimensions + dst->set_tensor_shape(dst_shape); + ARM_COMPUTE_ERROR_THROW_ON(ClTransposeKernel::validate(src, dst)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); - // Create kernel - const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0)); - const int vec_size_x_leftovers = src->dimension(0) % vec_size_x; - const unsigned int vec_size_y = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(1)); - const int vec_size_y_leftovers = src->dimension(1) % vec_size_y; + unsigned int vec_size_x; + unsigned int vec_size_y; + + // Set the optimal tile size for each data type without register spilling + switch (src->element_size()) + { + case 1: + vec_size_x = adjust_vec_size(8, src->dimension(0)); + vec_size_y = adjust_vec_size(16, src->dimension(1)); + break; + case 2: + vec_size_x = adjust_vec_size(8, src->dimension(0)); + vec_size_y = adjust_vec_size(8, src->dimension(1)); + break; + case 4: + vec_size_x = adjust_vec_size(4, src->dimension(0)); + vec_size_y = adjust_vec_size(8, src->dimension(1)); + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type"); + break; + } + + const int vec_size_x_leftovers = src->dimension(0) % vec_size_x; + const int vec_size_y_leftovers = 
src->dimension(1) % vec_size_y; CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE_IN_BYTES=" + support::cpp11::to_string(src->element_size())); @@ -77,7 +101,7 @@ void ClTransposeKernel::configure(const CLCompileContext &compile_context, const // Configure kernel window Window win = calculate_max_window(*src, Steps(vec_size_x, vec_size_y)); - ICLKernel::configure_internal(win, cl::NDRange(2, 8)); + ICLKernel::configure_internal(win); ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } @@ -86,12 +110,12 @@ Status ClTransposeKernel::validate(const ITensorInfo *src, const ITensorInfo *ds ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 2, "Transpose up to 2-D src tensor is supported"); // Validate configured dst - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - const TensorInfo dst_info = src->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*src)); + const TensorInfo dst_info = + src->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*src)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &dst_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); @@ -106,20 +130,22 @@ void ClTransposeKernel::run_op(ITensorPack &tensors, const Window &window, cl::C ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - Window slice = window.first_slice_window_2D(); + // Collapse dimensions higher than width and height into the batch dimension + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); do { unsigned int idx = 0; - add_2D_tensor_argument(idx, src, slice); - add_2D_tensor_argument(idx, dst, slice); + add_3D_tensor_argument(idx, src, slice); + add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClTransposeKernel.h b/src/gpu/cl/kernels/ClTransposeKernel.h index b30d6f0281b6dc21b065e35aed1f6e2b3922a921..eaad38b20f5e83d70732108b8ac70050f7a7a9db 100644 --- a/src/gpu/cl/kernels/ClTransposeKernel.h +++ b/src/gpu/cl/kernels/ClTransposeKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_CL_TRANSPOSE_KERNEL_H -#define ARM_COMPUTE_CL_TRANSPOSE_KERNEL_H +#ifndef ACL_SRC_GPU_CL_KERNELS_CLTRANSPOSEKERNEL_H +#define ACL_SRC_GPU_CL_KERNELS_CLTRANSPOSEKERNEL_H #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" @@ -34,7 +34,7 @@ namespace opencl { namespace kernels { -/** OpenCL kernel to transpose a 2D tensor. */ +/** OpenCL kernel to transpose a tensor. Only the first two dimensions (width, height) are transposed. */ class ClTransposeKernel : public IClKernel { public: @@ -61,4 +61,4 @@ public: } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_TRANSPOSE_KERNEL_H */ +#endif // ACL_SRC_GPU_CL_KERNELS_CLTRANSPOSEKERNEL_H diff --git a/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp index ae825694c5dcfa6fc1c52fc201662c23c024e5a0..76f39ac500178eb632c15a1c20bcecd1298a8c7c 100644 --- a/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp +++ b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp @@ -26,14 +26,14 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" - namespace arm_compute { namespace opencl @@ -42,11 +42,15 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, const PadStrideInfo &deconv_info) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QASYMM8_SIGNED, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, + DataType::QASYMM8_SIGNED, DataType::QASYMM8); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(weights, DataLayout::NHWC); @@ -56,12 +60,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, constexpr unsigned int height_idx = 2; constexpr unsigned int batch_idx = 3; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != input->dimension(channel_idx), "Weights feature map dimension should match the respective src's one"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != input->dimension(channel_idx), + "Weights feature map dimension should match the respective src's one"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional"); - if(biases != nullptr) + if (biases != nullptr) { - if(is_data_type_quantized_asymmetric(input->data_type())) + if (is_data_type_quantized_asymmetric(input->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -77,15 +82,17 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, } // Checks performed when output 
is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { const size_t input_width = input->dimension(width_idx); const size_t input_height = input->dimension(height_idx); const size_t weights_width = weights->dimension(width_idx); const size_t weights_height = weights->dimension(height_idx); - auto out_dims = deconvolution_output_dimensions(input_width, input_height, weights_width, weights_height, deconv_info); - TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights); + auto out_dims = + deconvolution_output_dimensions(input_width, input_height, weights_width, weights_height, deconv_info); + TensorShape output_shape = + misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -96,8 +103,12 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, } } // namespace -void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &deconv_info) +void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *output, + const PadStrideInfo &deconv_info) { ARM_COMPUTE_UNUSED(biases, deconv_info); ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); @@ -119,7 +130,8 @@ void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_co const size_t output_channels = output->dimension(channel_idx); // Calculate output shape - auto out_dims = deconvolution_output_dimensions(input_width, input_height, weights_width, weights_height, deconv_info); + auto out_dims = + deconvolution_output_dimensions(input_width, input_height, weights_width, weights_height, deconv_info); TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights); auto_init_if_empty(*output, output_shape, 1, input->data_type(), input->quantization_info()); @@ -147,7 +159,7 @@ void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_co const DataType input_data_type = input->data_type(); const PaddingInfo strides = deconv_info.stride(); - if(biases != nullptr) + if (biases != nullptr) { build_options.add_option(std::string("-DHAS_BIAS")); build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type()))); @@ -180,7 +192,7 @@ void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_co build_options.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0)); build_options.add_option_if((input_channels % k0) != 0, "-DLEFTOVER_LOOP"); - if(is_data_type_quantized(output_data_type)) + if (is_data_type_quantized(output_data_type)) { const UniformQuantizationInfo iqinfo = input->quantization_info().uniform(); const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); @@ -210,7 +222,7 @@ void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_co build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(0)); } - if(compile_context.get_ddk_version() >= 30) + if (compile_context.get_ddk_version() >= 30) { build_options.add_option("-fregister-allocation=64"); } @@ -235,8 
+247,11 @@ void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_co _config_id += support::cpp11::to_string(n0); } -Status ClTransposedConvolutionKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, - const ITensorInfo *dst, const PadStrideInfo &deconv_info) +Status ClTransposedConvolutionKernel::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &deconv_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, deconv_info)); return Status{}; @@ -250,17 +265,20 @@ void ClTransposedConvolutionKernel::run_op(ITensorPack &tensors, const Window &w // Get initial windows Window slice = window.first_slice_window_3D(); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto weights = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto biases = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto weights = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto biases = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); unsigned int idx = 0; add_4d_tensor_nhwc_argument(idx, src); add_4d_tensor_nhwc_argument(idx, dst); add_4d_tensor_nhwc_argument(idx, weights); - if(biases != nullptr) + if (biases != nullptr) { add_1D_tensor_argument(idx, biases, slice); } diff --git a/src/gpu/cl/kernels/ClTransposedConvolutionKernel.h b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.h index d4350dda50564a0c2fcde5d3c58e7189748af490..44f6f56b7a2bdb0ef05b69d04ecd60a1cad08a03 100644 --- a/src/gpu/cl/kernels/ClTransposedConvolutionKernel.h +++ b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.h @@ -45,16 +45,23 @@ public: * Similar to @ref ClTransposedConvolution::configure() * */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &deconv_info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *output, + const PadStrideInfo &deconv_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClTransposedConvolution::configure() * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, - const ITensorInfo *output, const PadStrideInfo &deconv_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &deconv_info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; @@ -63,4 +70,4 @@ public: } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_KERNEL_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_KERNEL_H */ diff --git 
a/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp b/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp index 8f36345076ca969cd94a7b29867c21ede03d74e1..af80c4d7968dd68730aadd0527d8d5a377bdd18c 100644 --- a/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp +++ b/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp @@ -22,9 +22,11 @@ * SOFTWARE. */ #include "src/gpu/cl/kernels/ClWeightsReshapeKernel.h" + #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" @@ -39,7 +41,10 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *biases, + const ITensorInfo *output, + unsigned int num_groups) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); @@ -48,20 +53,24 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, c ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4 && num_groups > 1); ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(3) % num_groups) != 0); - if(biases != nullptr) + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(!is_data_type_float(input->data_type())); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->num_dimensions() != 1)); ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->num_dimensions() != 2)); - ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->dimension(0) != input->tensor_shape()[3])); - ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->dimension(0) != input->tensor_shape()[3] || biases->dimension(1) != input->tensor_shape()[4])); + ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && + (biases->dimension(0) != input->tensor_shape()[3])); + ARM_COMPUTE_RETURN_ERROR_ON( + (input->num_dimensions() == 5) && + (biases->dimension(0) != input->tensor_shape()[3] || biases->dimension(1) != input->tensor_shape()[4])); } // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_weights_reshaped_shape(*input, biases != nullptr, num_groups)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + output->tensor_shape(), compute_weights_reshaped_shape(*input, biases != nullptr, num_groups)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); } @@ -75,16 +84,21 @@ ClWeightsReshapeKernel::ClWeightsReshapeKernel() _type = CLKernelType::ELEMENTWISE; } -void ClWeightsReshapeKernel::configure(const ClCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst, unsigned int num_groups) +void ClWeightsReshapeKernel::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *biases, + ITensorInfo *dst, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_weights_reshaped_shape(*src, (biases != nullptr), num_groups))); + 
auto_init_if_empty( + *dst, src->clone()->set_tensor_shape(compute_weights_reshaped_shape(*src, (biases != nullptr), num_groups))); // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, biases, dst, num_groups)); - auto padding_info = get_padding_info({ src, biases, dst }); + auto padding_info = get_padding_info({src, biases, dst}); const DataType data_type = src->data_type(); @@ -104,7 +118,10 @@ void ClWeightsReshapeKernel::configure(const ClCompileContext &compile_context, ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClWeightsReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst, unsigned int num_groups) +Status ClWeightsReshapeKernel::validate(const ITensorInfo *src, + const ITensorInfo *biases, + const ITensorInfo *dst, + unsigned int num_groups) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, biases, dst, num_groups)); return Status{}; @@ -136,7 +153,7 @@ void ClWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window, _kernel.setArg(idx++, src->info()->dimension(3)); _kernel.setArg(idx++, dst->info()->strides_in_bytes().z()); - if(biases != nullptr) + if (biases != nullptr) { biases_window.use_tensor_dimensions(biases->info()->tensor_shape()); biases_slice = biases_window.first_slice_window_1D(); @@ -148,7 +165,7 @@ void ClWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window, unsigned idx = 0; add_3D_tensor_argument(idx, src, in_slice); add_2D_tensor_argument(idx, dst, out_slice); - if(biases != nullptr) + if (biases != nullptr) { add_1D_tensor_argument(idx, biases, biases_slice); ARM_COMPUTE_UNUSED(biases_window.slide_window_slice_1D(biases_slice)); @@ -156,8 +173,7 @@ void ClWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window, // Run kernel enqueue(queue, *this, in_slice, lws_hint()); - } - while(window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_2D(out_slice)); + } while (window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_2D(out_slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWeightsReshapeKernel.h b/src/gpu/cl/kernels/ClWeightsReshapeKernel.h index 7364eb97aee497c3cf73a4f0e8f4dd598a353ed3..5e05f8d006096de5fa9856be6deadece3779e892 100644 --- a/src/gpu/cl/kernels/ClWeightsReshapeKernel.h +++ b/src/gpu/cl/kernels/ClWeightsReshapeKernel.h @@ -75,14 +75,19 @@ public: * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout * Number of groups greater than one are only supported for NCHW data layout, and the number of weights must be a multiple of it. 
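For context on the num_groups constraint documented above: reshaping convolution weights for a GEMM-based path flattens each filter into one row of a per-group matrix, with the bias (when present) appended as one extra element per filter. The sketch below shows that idea in plain C++; it does not reproduce the exact dimension ordering of compute_weights_reshaped_shape(), and the [cout][cin][kh][kw] layout and helper names are assumptions of the sketch.

// Illustrative only: conceptual grouped weights reshape for an im2col + GEMM path.
#include <cstddef>
#include <vector>

struct ReshapedWeights
{
    std::vector<std::vector<float>> groups;        // one matrix per group
    std::size_t                     rows_per_group{0};
    std::size_t                     cols{0};       // kw*kh*cin (+1 if bias)
};

ReshapedWeights reshape_weights(const std::vector<float> &w, // [cout][cin][kh][kw], row-major
                                const std::vector<float> &bias,
                                std::size_t cout, std::size_t cin, std::size_t kh, std::size_t kw,
                                std::size_t num_groups)
{
    const bool        has_bias  = !bias.empty();
    const std::size_t per_group = cout / num_groups; // cout must be a multiple of num_groups
    const std::size_t cols      = cin * kh * kw + (has_bias ? 1 : 0);

    ReshapedWeights out;
    out.rows_per_group = per_group;
    out.cols           = cols;
    out.groups.assign(num_groups, std::vector<float>(per_group * cols));

    for (std::size_t oc = 0; oc < cout; ++oc)
    {
        const std::size_t g   = oc / per_group; // group index
        const std::size_t row = oc % per_group; // row inside the group matrix
        float            *dst = &out.groups[g][row * cols];
        const float      *src = &w[oc * cin * kh * kw];

        // Flatten the [cin x kh x kw] filter into one contiguous row.
        for (std::size_t i = 0; i < cin * kh * kw; ++i)
        {
            dst[i] = src[i];
        }
        if (has_bias)
        {
            dst[cols - 1] = bias[oc];
        }
    }
    return out;
}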
*/ - void configure(const ClCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst, unsigned int num_groups = 1); + void configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *biases, + ITensorInfo *dst, + unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration * * Similar to ClWeightsReshapeKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst, unsigned int num_groups = 1); + static Status + validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst, unsigned int num_groups = 1); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; @@ -90,4 +95,4 @@ public: } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /*ARM_COMPUTE_CL_WEIGHTSRESHAPE_KERNEL_H */ \ No newline at end of file +#endif /*ARM_COMPUTE_CL_WEIGHTSRESHAPE_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp index 0a9a3f021f5e1f3254732b0d7d02c9de0b4cd85c..15195025cee5351e3abcbf7d5a467f0bf963259a 100644 --- a/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp +++ b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp @@ -29,11 +29,11 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/helpers/tensor_info.h" #include "support/Cast.h" - #include "support/StringSupport.h" namespace arm_compute @@ -52,7 +52,7 @@ Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, cons ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2, dst); ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) > dst->dimension(0)); - for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(i) != dst->dimension(i)); ARM_COMPUTE_RETURN_ERROR_ON(src2->dimension(i) != dst->dimension(i)); @@ -63,7 +63,8 @@ Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, cons } } // namespace -Status ClWidthConcatenate2TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) +Status +ClWidthConcatenate2TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst)); return Status{}; @@ -74,12 +75,15 @@ ClWidthConcatenate2TensorsKernel::ClWidthConcatenate2TensorsKernel() _type = CLKernelType::ELEMENTWISE; } -void ClWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst) +void ClWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst)); - auto padding_info = get_padding_info({ src1, src2, dst }); + auto padding_info = get_padding_info({src1, src2, dst}); const unsigned int min_dimension = std::min(src1->dimension(0), 
src2->dimension(0)); const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension); @@ -91,11 +95,12 @@ void ClWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover)); build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src1->element_size())); - build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); + build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % + num_elems_processed_per_iteration)); // If input have different quantization info set quantization parameters needed for the re-quantization process const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2); - if(is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo) + if (is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo) { const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform(); const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform(); @@ -146,9 +151,11 @@ void ClWidthConcatenate2TensorsKernel::run_op(ITensorPack &tensors, const Window Window slice = window.first_slice_window_4D(); - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC)); + const auto src1 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); do { @@ -159,8 +166,7 @@ void ClWidthConcatenate2TensorsKernel::run_op(ITensorPack &tensors, const Window _kernel.setArg(idx++, _depth); _kernel.setArg(idx++, _input1_width); enqueue(queue, *this, window, lws_hint()); - } - while(window.slide_window_slice_4D(slice)); + } while (window.slide_window_slice_4D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h index 5c544790023d9fd965f91284c2096a1e0959c88e..8b53d6d66b77af41177a51ad680d267a6d74843e 100644 --- a/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h +++ b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h @@ -62,8 +62,8 @@ public: void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; private: - int32_t _depth{ 0 }; - int32_t _input1_width{ 0 }; + int32_t _depth{0}; + int32_t _input1_width{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp index 54f7ad344ac91fb43e623dc2d9d24ce8f4e7fc99..c4f84e3e4536a8d00f3066152e3b3728bb60d66e 100644 --- a/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp +++ b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp @@ -30,11 +30,11 @@ #include "arm_compute/core/Utils.h" #include 
"arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/helpers/tensor_info.h" #include "support/Cast.h" - #include "support/StringSupport.h" namespace arm_compute @@ -45,15 +45,20 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst) +Status validate_arguments(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *src3, + const ITensorInfo *src4, + const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, src3, src4, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1); ARM_COMPUTE_RETURN_ERROR_ON(src1->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2, src3, src4, dst); - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) + src3->dimension(0) + src4->dimension(0) > dst->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) + src3->dimension(0) + src4->dimension(0) > + dst->dimension(0)); - for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(i) != dst->dimension(i)); ARM_COMPUTE_RETURN_ERROR_ON(src2->dimension(i) != dst->dimension(i)); @@ -71,22 +76,29 @@ ClWidthConcatenate4TensorsKernel::ClWidthConcatenate4TensorsKernel() _type = CLKernelType::ELEMENTWISE; } -Status ClWidthConcatenate4TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst) +Status ClWidthConcatenate4TensorsKernel::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *src3, + const ITensorInfo *src4, + const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, src3, src4, dst)); return Status{}; } void ClWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile_context, - ITensorInfo *src1, ITensorInfo *src2, - ITensorInfo *src3, ITensorInfo *src4, - ITensorInfo *dst) + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *src3, + ITensorInfo *src4, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, src3, src4, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, src3, src4, dst)); - auto padding_info = get_padding_info({ src1, src2, src3, src4, dst }); - const unsigned int min_dimension = std::min(std::min(src1->dimension(0), src2->dimension(0)), std::min(src3->dimension(0), src4->dimension(0))); + auto padding_info = get_padding_info({src1, src2, src3, src4, dst}); + const unsigned int min_dimension = + std::min(std::min(src1->dimension(0), src2->dimension(0)), std::min(src3->dimension(0), src4->dimension(0))); const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension); const unsigned int vec_size_leftover = dst->dimension(0) % num_elems_processed_per_iteration; @@ -96,9 +108,14 @@ void ClWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover)); build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src1->element_size())); - build_opts.add_option("-DINPUT1_ROTATE_N=" + 
support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); - build_opts.add_option("-DINPUT2_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); - build_opts.add_option("-DINPUT3_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) + src3->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); + build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % + num_elems_processed_per_iteration)); + build_opts.add_option("-DINPUT2_ROTATE_N=" + + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) - vec_size_leftover) % + num_elems_processed_per_iteration)); + build_opts.add_option("-DINPUT3_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) + + src3->dimension(0) - vec_size_leftover) % + num_elems_processed_per_iteration)); _depth = src1->dimension(2); _input1_width = src1->dimension(0); @@ -106,8 +123,9 @@ void ClWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile _input3_width = src3->dimension(0); // If soources have different quantization info set quantization parameters needed for the re-quantization process - const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2, src3, src4); - if(is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo) + const bool have_different_qinfo = + helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2, src3, src4); + if (is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo) { const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform(); const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform(); @@ -166,11 +184,15 @@ void ClWidthConcatenate4TensorsKernel::run_op(ITensorPack &tensors, const Window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1)); - const auto src2 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 2)); - const auto src3 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 3)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC)); + const auto src1 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1)); + const auto src2 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 2)); + const auto src3 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 3)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); Window slice = window.first_slice_window_4D(); @@ -187,8 +209,7 @@ void ClWidthConcatenate4TensorsKernel::run_op(ITensorPack &tensors, const Window _kernel.setArg(idx++, _input2_width); _kernel.setArg(idx++, _input3_width); enqueue(queue, *this, window, lws_hint()); - } - while(window.slide_window_slice_4D(slice)); + } while (window.slide_window_slice_4D(slice)); } 
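The vectorisation-related build options set in configure() above are plain host-side arithmetic on the tensor widths. A small worked example is sketched below, with made-up widths, assuming adjust_vec_size(8, ...) keeps the full 8-wide vector because every input is at least 8 elements wide.

// Illustrative only: deriving the -DVEC_SIZE_LEFTOVER and -DINPUTn_ROTATE_N values
// for a four-tensor width concatenation, using example widths.
#include <cstdio>

int main()
{
    const unsigned int w1 = 19, w2 = 14, w3 = 10, w4 = 12; // example input widths
    const unsigned int dst_w    = w1 + w2 + w3 + w4;       // 55
    const unsigned int vec_size = 8;                       // assumed result of adjust_vec_size(8, ...)

    const unsigned int leftover = dst_w % vec_size;                     // -DVEC_SIZE_LEFTOVER=7
    const unsigned int rotate1  = (w1 - leftover) % vec_size;           // -DINPUT1_ROTATE_N=4
    const unsigned int rotate2  = (w1 + w2 - leftover) % vec_size;      // -DINPUT2_ROTATE_N=2
    const unsigned int rotate3  = (w1 + w2 + w3 - leftover) % vec_size; // -DINPUT3_ROTATE_N=4

    std::printf("-DVEC_SIZE=%u -DVEC_SIZE_LEFTOVER=%u -DINPUT1_ROTATE_N=%u -DINPUT2_ROTATE_N=%u -DINPUT3_ROTATE_N=%u\n",
                vec_size, leftover, rotate1, rotate2, rotate3);
    return 0;
}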
} // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h index baf8d381be0a7611638c757fa3ab04688bb0e629..f589b8ac1ab5922efaeffa90537a3e83f8658ad2 100644 --- a/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h +++ b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h @@ -52,23 +52,32 @@ public: * @param[in] src4 Fourth source tensor info. Data types supported: same as @p src1 * @param[out] dst Destination tensor info. Data types supported: same as @p src1. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *src3, ITensorInfo *src4, ITensorInfo *dst); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *src3, + ITensorInfo *src4, + ITensorInfo *dst); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClWidthConcatenate4TensorsKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *src3, + const ITensorInfo *src4, + const ITensorInfo *dst); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; private: - int32_t _depth{ 0 }; - int32_t _input1_width{ 0 }; - int32_t _input2_width{ 0 }; - int32_t _input3_width{ 0 }; + int32_t _depth{0}; + int32_t _input1_width{0}; + int32_t _input2_width{0}; + int32_t _input3_width{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp b/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp index 2dfe7fce5235454d592670b4f3488678f80a6f69..989de4a7b7491b3b37a8c61b552454af2a705668 100644 --- a/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp +++ b/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" - #include "support/StringSupport.h" namespace arm_compute @@ -53,7 +53,7 @@ Status validate_arguments(const ITensorInfo *src, unsigned int width_offset, con ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) + width_offset > dst->dimension(0)); - for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i)); } @@ -74,12 +74,15 @@ Status ClWidthConcatenateKernel::validate(const ITensorInfo *src, unsigned int w return Status{}; } -void ClWidthConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst) +void ClWidthConcatenateKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + unsigned int width_offset, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, width_offset, dst)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); const unsigned int num_elems_processed_per_iteration = 
adjust_vec_size(16, src->dimension(0)); @@ -87,10 +90,11 @@ void ClWidthConcatenateKernel::configure(const CLCompileContext &compile_context CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DWIDTH_OFFSET=" + support::cpp11::to_string(width_offset)); - if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) + if (is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) { const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); @@ -121,8 +125,9 @@ void ClWidthConcatenateKernel::run_op(ITensorPack &tensors, const Window &window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); unsigned int idx = 0; add_4D_tensor_argument(idx, src, window); diff --git a/src/gpu/cl/kernels/ClWidthConcatenateKernel.h b/src/gpu/cl/kernels/ClWidthConcatenateKernel.h index 3ace4400e6e28d026fd934cdbbceff8148564f56..c10d6a4dc636aa4e3ee5c105e19827d9fc9a6430 100644 --- a/src/gpu/cl/kernels/ClWidthConcatenateKernel.h +++ b/src/gpu/cl/kernels/ClWidthConcatenateKernel.h @@ -50,7 +50,8 @@ public: * @param[in,out] dst Destination tensor info. Data types supported: same as @p src. 
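When the source and destination of a concatenation carry different quantisation parameters, the kernels above pass both (scale, offset) pairs so values can be re-quantised on the fly. The sketch below shows the underlying arithmetic for QASYMM8; the rounding and saturation choices are assumptions of the sketch rather than a copy of the OpenCL code.

// Illustrative only: re-quantising a QASYMM8 value from one (scale, offset) pair to another.
#include <algorithm>
#include <cmath>
#include <cstdint>

struct UniformQuantization
{
    float   scale;
    int32_t offset;
};

uint8_t requantize_qasymm8(uint8_t q_in, const UniformQuantization &in, const UniformQuantization &out)
{
    // Dequantise with the input parameters, then quantise with the output parameters.
    const float   real = in.scale * (static_cast<int32_t>(q_in) - in.offset);
    const int32_t q    = static_cast<int32_t>(std::lround(real / out.scale)) + out.offset;
    return static_cast<uint8_t>(std::clamp(q, 0, 255)); // saturate to the QASYMM8 range
}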
* */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst); + void + configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClWidthConcatenateKernel::configure() @@ -63,7 +64,7 @@ public: void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; private: - int32_t _depth{ 0 }; + int32_t _depth{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp index 7148a4c85cb87d814e348e150ad7b32af9635a37..58c01d4da568fb4a0198a093a5dbf0bbc8775ca2 100644 --- a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp +++ b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp @@ -29,10 +29,11 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -60,14 +61,18 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), "Winograd filter transform not supported"); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != kernel_size.width || input->dimension(idx_h) != kernel_size.height); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), + "Winograd filter transform not supported"); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != kernel_size.width || + input->dimension(idx_h) != kernel_size.height); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input, winograd_info)); + const TensorInfo tensor_info_output = + input->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input, winograd_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -81,11 +86,15 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_UNUSED(output); - const unsigned int num_elems_processed_per_iteration_x = input->data_layout() == DataLayout::NCHW ? input->dimension(0) : 1; + const unsigned int num_elems_processed_per_iteration_x = + input->data_layout() == DataLayout::NCHW ? input->dimension(0) : 1; const unsigned int num_elems_processed_per_iteration_y = input->dimension(1); - const unsigned int num_elems_read_per_iteration_z = input->data_layout() == DataLayout::NCHW ? 
1 : input->dimension(2); + const unsigned int num_elems_read_per_iteration_z = + input->data_layout() == DataLayout::NCHW ? 1 : input->dimension(2); - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y, num_elems_read_per_iteration_z)); + Window win = + calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y, + num_elems_read_per_iteration_z)); Window win_collapsed = win.collapse(win, Window::DimZ); return std::make_pair(Status{}, win_collapsed); } @@ -96,21 +105,25 @@ ClWinogradFilterTransformKernel::ClWinogradFilterTransformKernel() _type = CLKernelType::WINOGRAD; } -void ClWinogradFilterTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info) +void ClWinogradFilterTransformKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const WinogradInfo &winograd_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // Output auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*src, winograd_info))); + auto_init_if_empty(*dst, + src->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*src, winograd_info))); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, winograd_info)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); // Set build options CLBuildOptions build_opts; // For NHWC layouts pass tensor dimesions at runtime - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { _src_dim_z = src->dimension(2); } @@ -125,7 +138,8 @@ void ClWinogradFilterTransformKernel::configure(const ClCompileContext &compile_ const Size2D output_tile_size = winograd_info.output_tile_size; // Create kernel - std::string kernel_name = "winograd_filter_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(src->data_layout())); + std::string kernel_name = "winograd_filter_transform_" + output_tile_size.to_string() + "_" + + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(src->data_layout())); // A macro guard to compile ONLY the kernel of interest build_opts.add_option("-D" + upper_string(kernel_name)); @@ -138,7 +152,9 @@ void ClWinogradFilterTransformKernel::configure(const ClCompileContext &compile_ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClWinogradFilterTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info) +Status ClWinogradFilterTransformKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const WinogradInfo &winograd_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, winograd_info)); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first); @@ -161,7 +177,7 @@ void ClWinogradFilterTransformKernel::run_op(ITensorPack &tensors, const Window unsigned int idx = 0; add_4D_tensor_argument(idx, src, window); add_3D_tensor_argument(idx, dst, window_out); - if(src->info()->data_layout() == DataLayout::NHWC) + if (src->info()->data_layout() == DataLayout::NHWC) { _kernel.setArg(idx++, _src_dim_z); } diff --git a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h index 
b2130304e6720478bef14ba36c0162ff2ba28136..6e439f0c99cee490904f3764f9750c8286233a55 100644 --- a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h +++ b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_WINOGRAD_FILTER_TRANSFORM_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -59,7 +60,10 @@ public: * @param[out] dst The output tensor info. The shape for this tensor can be calculated using the utility function @p compute_winograd_filter_transform_shape. Data types supported: Same as @p input * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const WinogradInfo &winograd_info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClWinogradFilterTransformKernel::configure() @@ -72,7 +76,7 @@ public: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - int32_t _src_dim_z{ 0 }; + int32_t _src_dim_z{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp index fab6c36032144fa05044be5c6cd0fa49ed85e665..54c48986fcd5476409f876360f69d5adaadb5c47 100644 --- a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp +++ b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp @@ -32,6 +32,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -55,17 +56,21 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c const PadStrideInfo conv_info = winograd_info.convolution_info; const Size2D output_tile_size = winograd_info.output_tile_size; const Size2D kernel_size = winograd_info.kernel_size; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd input transform only supports unit strides"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), "Winograd input transform not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, + "Winograd input transform only supports unit strides"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), + "Winograd input transform not supported"); ARM_COMPUTE_UNUSED(conv_info); ARM_COMPUTE_UNUSED(output_tile_size); ARM_COMPUTE_UNUSED(kernel_size); // Validate configured output - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info); + const TensorShape output_shape = + misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, 
output); @@ -74,7 +79,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info) +std::pair +validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info) { ARM_COMPUTE_UNUSED(output); ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -82,7 +88,7 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen bool window_changed = false; int num_elems_processed_per_iteration = 1; - if(input->data_layout() == DataLayout::NHWC) + if (input->data_layout() == DataLayout::NHWC) { // In the case of FP16 computation, we can perform more // output feature maps in a single work-item. @@ -94,9 +100,9 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen const size_t dim0 = input->dimension(0); const size_t k_sz = winograd_info.kernel_size.area(); const bool cond = dt == DataType::F16 && ((dim0 % 2) == 0); - if(cond) + if (cond) { - if(k_sz == 3 || k_sz == 9) + if (k_sz == 3 || k_sz == 9) { num_elems_processed_per_iteration = 2; } @@ -104,7 +110,7 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen } Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - if(input->data_layout() == DataLayout::NCHW) + if (input->data_layout() == DataLayout::NCHW) { const PadStrideInfo conv_info = winograd_info.convolution_info; const Size2D output_tile_size = winograd_info.output_tile_size; @@ -113,11 +119,13 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen unsigned int num_elems_read_per_iteration_x = output_tile_size.width + kernel_size.width - 1; unsigned int num_elems_read_per_iteration_y = output_tile_size.height + kernel_size.height - 1; - AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(), num_elems_read_per_iteration_x, num_elems_read_per_iteration_y); + AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(), + num_elems_read_per_iteration_x, num_elems_read_per_iteration_y); window_changed = update_window_and_padding(win, input_access); } - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace @@ -132,12 +140,15 @@ BorderSize ClWinogradInputTransformKernel::border_size() const return _border_size; } -void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info) +void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const WinogradInfo &winograd_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, winograd_info)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); const PadStrideInfo conv_info = winograd_info.convolution_info; const Size2D output_tile_size = winograd_info.output_tile_size; @@ -150,14 +161,13 @@ void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c // Compute the number of output tiles along the x and y direction of size "output_tile_size" const Size2D num_tiles = compute_winograd_convolution_tiles(Size2D(src->dimension(idx_w), src->dimension(idx_h)), - kernel_size, - output_tile_size, - conv_info); + kernel_size, output_tile_size, conv_info); _num_tiles_x = num_tiles.width; _num_tiles_y = num_tiles.height; - const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info); + const TensorShape output_shape = + misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info); // Output auto initialization if not yet initialized auto_init_if_empty(*dst, src->clone()->set_tensor_shape(output_shape)); @@ -174,7 +184,7 @@ void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c _src_height = src->dimension(idx_h); CLBuildOptions build_opts; - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { build_opts.add_option("-DNHWC"); build_opts.add_option("-DN0=" + support::cpp11::to_string(win_config.second.x().step())); @@ -201,13 +211,14 @@ void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c } // Create kernel - std::string kernel_name = "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string(); + std::string kernel_name = + "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string(); // Get the maximum dimension from the tile size const unsigned int tile_max_dim = std::max(output_tile_size.width, output_tile_size.height); // Check optimized kernel if output_dims == 2x2 - if((tile_max_dim == 2) && (_data_layout == DataLayout::NCHW)) + if ((tile_max_dim == 2) && (_data_layout == DataLayout::NCHW)) { _step_z = (src->dimension(2) % 2) != 0 ? 
1 : 2; } @@ -239,11 +250,14 @@ void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c _config_id += lower_string(string_from_data_layout(_data_layout)); } -Status ClWinogradInputTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info) +Status ClWinogradInputTransformKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const WinogradInfo &winograd_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, winograd_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), winograd_info).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(src->clone().get(), dst->clone().get(), winograd_info).first); return Status{}; } @@ -263,7 +277,7 @@ void ClWinogradInputTransformKernel::run_op(ITensorPack &tensors, const Window & // Collapse window Window window_collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ); - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { Window slice = window_collapsed.first_slice_window_3D(); slice.set(1, Window::Dimension(0, _num_tiles_x * _num_tiles_y, 1)); @@ -298,8 +312,7 @@ void ClWinogradInputTransformKernel::run_op(ITensorPack &tensors, const Window & add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); + } while (window_collapsed.slide_window_slice_3D(slice)); } } } // namespace kernels diff --git a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h index c10c528b9b97786122e766aab5b7cc2961c891e0..cebebea1d366fce13a9ea3f33674c15018297cdd 100644 --- a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h +++ b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_WINOGRAD_INPUT_TRANSFORM_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -59,7 +60,10 @@ public: * @param[in] dst The output tensor info. The shape for this tensor can be calculated using the utility function @p compute_winograd_input_transform_shape. Data types supported: Same as @p input * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const WinogradInfo &winograd_info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClWinogradInputTransformKernel::configure() @@ -69,19 +73,19 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; BorderSize border_size() const override; private: using WinogradKey = std::pair, std::pair>; - BorderSize _border_size{ 0 }; - DataLayout _data_layout{ DataLayout::UNKNOWN }; - int _num_tiles_x{ 0 }; - int _num_tiles_y{ 0 }; - unsigned int _step_z{ 1 }; - int32_t _src_width{ 0 }; - int32_t _src_height{ 0 }; + BorderSize _border_size{0}; + DataLayout _data_layout{DataLayout::UNKNOWN}; + int _num_tiles_x{0}; + int _num_tiles_y{0}; + unsigned int _step_z{1}; + int32_t _src_width{0}; + int32_t _src_height{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp index bf974d30d8649e6b64661a8038be25ca3fabfc98..89c80c55ef36c6b0b08fb55b322ca1932073ede4 100644 --- a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp +++ b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp @@ -23,7 +23,6 @@ */ #include "src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" @@ -31,10 +30,12 @@ #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -54,7 +55,11 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const WinogradInfo &winograd_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16); @@ -66,30 +71,32 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con const Size2D output_tile_size = winograd_info.output_tile_size; const Size2D kernel_size = winograd_info.kernel_size; const Size2D input_dimensions = winograd_info.input_dimensions; - const unsigned int num_channels = (winograd_info.kernel_size.width + winograd_info.output_tile_size.width - 1) * (winograd_info.kernel_size.height + winograd_info.output_tile_size.height - 1); + 
const unsigned int num_channels = (winograd_info.kernel_size.width + winograd_info.output_tile_size.width - 1) * + (winograd_info.kernel_size.height + winograd_info.output_tile_size.height - 1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, winograd_info.output_data_layout), "Winograd output transform not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, winograd_info.output_data_layout), + "Winograd output transform not supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != num_channels, "Wrong number of channels"); // Compute number of elements to process in the X and Y direction // Compute the number of output tiles along the x and y direction of size "output_tile_size" - const Size2D num_tiles = compute_winograd_convolution_tiles(input_dimensions, - kernel_size, - output_tile_size, - conv_info); + const Size2D num_tiles = + compute_winograd_convolution_tiles(input_dimensions, kernel_size, output_tile_size, conv_info); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != static_cast((num_tiles.area()))); - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0)); } // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input, winograd_info)); + const TensorInfo tensor_info_output = + input->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input, winograd_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -98,14 +105,17 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output, const Size2D &output_tile_size) +std::pair validate_and_configure_window(ITensorInfo *input, + ITensorInfo *bias, + ITensorInfo *output, + const Size2D &output_tile_size) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_UNUSED(bias); unsigned int num_elems_processed_per_iteration = 1; - if(input->data_layout() == DataLayout::NHWC) + if (input->data_layout() == DataLayout::NHWC) { // In the case of FP16 computation, we can perform more // output feature maps in a single work-item. 
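For reference, the channel and tile counts validated above follow from the Winograd transform geometry: the transformed tensor carries (kernel_w + tile_w - 1) * (kernel_h + tile_h - 1) channels, and compute_winograd_convolution_tiles() in effect ceil-divides the convolution output extent by the output tile size. Below is a minimal standalone sketch of that arithmetic, not part of the patch, assuming a hypothetical 3x3 kernel, 4x4 output tile and a 112x112 output plane (stride 1, "same" padding chosen purely for illustration):

#include <cstdio>

int main()
{
    // Hypothetical configuration (illustration only).
    const unsigned int kernel_w = 3, kernel_h = 3; // kernel size
    const unsigned int tile_w = 4, tile_h = 4;     // output tile size
    const unsigned int out_w = 112, out_h = 112;   // convolution output size (assumed)

    // One transformed channel per element of the (kernel + tile - 1)^2 input patch,
    // i.e. the num_channels value checked by validate_arguments() above.
    const unsigned int num_channels = (kernel_w + tile_w - 1) * (kernel_h + tile_h - 1); // 6 * 6 = 36

    // Tile grid covering the output plane, matching the num_tiles check above.
    const unsigned int num_tiles_x = (out_w + tile_w - 1) / tile_w; // ceil(112 / 4) = 28
    const unsigned int num_tiles_y = (out_h + tile_h - 1) / tile_h; // 28

    std::printf("num_channels=%u, tiles=%ux%u, tile area=%u\n",
                num_channels, num_tiles_x, num_tiles_y, num_tiles_x * num_tiles_y);
    return 0;
}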
@@ -115,7 +125,7 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen const DataType dt = input->data_type(); const size_t dim0 = input->dimension(0); const bool cond = dt == DataType::F16 && ((dim0 % 2) == 0); - if(cond) + if (cond) { num_elems_processed_per_iteration = 2; } @@ -124,17 +134,19 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); bool window_changed = false; - if(output->data_layout() == DataLayout::NCHW) + if (output->data_layout() == DataLayout::NCHW) { const int output_static_window_end_x = ceil_to_multiple(output->dimension(0), output_tile_size.width); const int output_static_window_end_y = ceil_to_multiple(output->dimension(1), output_tile_size.height); - AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration); + AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, + num_elems_processed_per_iteration); AccessWindowStatic output_access(output, 0, 0, output_static_window_end_x, output_static_window_end_y); window_changed = update_window_and_padding(win, input_access, output_access); } - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace @@ -144,13 +156,18 @@ ClWinogradOutputTransformKernel::ClWinogradOutputTransformKernel() _type = CLKernelType::WINOGRAD; } -void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const WinogradInfo &winograd_info, +void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*src, winograd_info))); + auto_init_if_empty(*dst, + src->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*src, winograd_info))); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, winograd_info, act_info)); @@ -159,7 +176,7 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ ARM_COMPUTE_ERROR_THROW_ON(win_config.first); IClKernel::configure_internal(win_config.second); - auto padding_info = get_padding_info({ src, bias, dst }); + auto padding_info = get_padding_info({src, bias, dst}); _is_nhwc = winograd_info.output_data_layout == DataLayout::NHWC; @@ -168,14 +185,13 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ const Size2D kernel_size = winograd_info.kernel_size; const Size2D output_tile_size = winograd_info.output_tile_size; const PadStrideInfo conv_info = winograd_info.convolution_info; - const int idx_width = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::HEIGHT); + const int idx_width = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::WIDTH); + const int idx_height = + 
get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::HEIGHT); // Compute the number of output tiles along the x and y direction of size "output_tile_size" - const Size2D num_tiles = compute_winograd_convolution_tiles(input_dimensions, - kernel_size, - output_tile_size, - conv_info); + const Size2D num_tiles = + compute_winograd_convolution_tiles(input_dimensions, kernel_size, output_tile_size, conv_info); const size_t total_batches = dst->tensor_shape().total_size_upper(3); // Set build options @@ -184,11 +200,11 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); - if((output_tile_size.x() == 2) || (output_tile_size.x() == 1 && output_tile_size.y() == 2)) + if ((output_tile_size.x() == 2) || (output_tile_size.x() == 1 && output_tile_size.y() == 2)) { build_opts.add_option("-DVEC_SIZE=2"); } - else if((output_tile_size.x() == 4) || (output_tile_size.x() == 1 && output_tile_size.y() == 4)) + else if ((output_tile_size.x() == 4) || (output_tile_size.x() == 1 && output_tile_size.y() == 4)) { build_opts.add_option("-DVEC_SIZE=4"); } @@ -200,9 +216,10 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ const auto act_function = act_info.activation(); const auto src_data_type = src->data_type(); - if((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) - && (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - && (src_data_type == DataType::F32 || src_data_type == DataType::F16)) + if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) && + (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || + act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) && + (src_data_type == DataType::F32 || src_data_type == DataType::F16)) { // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations @@ -213,7 +230,7 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ build_opts.add_option("-cl-fast-relaxed-math"); } - if(_is_nhwc) + if (_is_nhwc) { build_opts.add_option_if(bias != nullptr, std::string("-DHAS_BIAS")); build_opts.add_option("-DN0=" + support::cpp11::to_string(win_config.second.x().step())); @@ -247,7 +264,9 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ _dst_height = dst->dimension(idx_height); // Create kernel - std::string kernel_name = "winograd_output_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(winograd_info.output_data_layout)); + std::string kernel_name = "winograd_output_transform_" + output_tile_size.to_string() + "_" + + kernel_size.to_string() + "_" + + lower_string(string_from_data_layout(winograd_info.output_data_layout)); // A macro guard to compile ONLY the kernel of interest build_opts.add_option("-D" + upper_string(kernel_name)); @@ -271,10 +290,18 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info) && _is_nhwc); } 
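For reference, the kernel name assembled in configure() above selects one specialised OpenCL kernel per (output tile size, kernel size, data layout) combination, and the "-D" + upper_string(kernel_name) build option acts as a macro guard so only that variant is compiled. A small standalone sketch of the naming scheme, not part of the patch, assuming a hypothetical 4x4 output tile, 3x3 kernel and NHWC output (Size2D::to_string() is assumed to yield strings such as "4x4"):

#include <algorithm>
#include <cctype>
#include <iostream>
#include <string>

int main()
{
    // Hypothetical sizes (illustration only).
    const std::string output_tile = "4x4";
    const std::string kernel_size = "3x3";
    const std::string layout      = "nhwc";

    // Kernel name, following the pattern built in configure().
    const std::string kernel_name =
        "winograd_output_transform_" + output_tile + "_" + kernel_size + "_" + layout;

    // Macro guard: the upper-cased kernel name, passed as -D<NAME> so that only
    // the kernel of interest is compiled from the .cl source.
    std::string guard = kernel_name;
    std::transform(guard.begin(), guard.end(), guard.begin(),
                   [](unsigned char c) { return static_cast<char>(std::toupper(c)); });

    std::cout << kernel_name << '\n';   // winograd_output_transform_4x4_3x3_nhwc
    std::cout << "-D" << guard << '\n'; // -DWINOGRAD_OUTPUT_TRANSFORM_4X4_3X3_NHWC
    return 0;
}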
-Status ClWinogradOutputTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) +Status ClWinogradOutputTransformKernel::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const WinogradInfo &winograd_info, + const ActivationLayerInfo &act_info) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, (bias != nullptr ? bias->clone().get() : nullptr), dst, winograd_info, act_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), (bias != nullptr ? bias->clone().get() : nullptr), dst->clone().get(), winograd_info.output_tile_size).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(src, (bias != nullptr ? bias->clone().get() : nullptr), dst, winograd_info, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), + (bias != nullptr ? bias->clone().get() : nullptr), + dst->clone().get(), winograd_info.output_tile_size) + .first); return Status{}; } @@ -299,7 +326,7 @@ void ClWinogradOutputTransformKernel::run_op(ITensorPack &tensors, const Window slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - if(bias != nullptr) + if (bias != nullptr) { unsigned int idx1 = 2 * num_arguments_per_4D_tensor(); Window slice_biases; @@ -307,7 +334,7 @@ void ClWinogradOutputTransformKernel::run_op(ITensorPack &tensors, const Window add_1D_tensor_argument(idx1, bias, slice_biases); } - if(_is_nhwc) + if (_is_nhwc) { unsigned int idx2 = 2 * num_arguments_per_4D_tensor() + ((bias != nullptr) ? num_arguments_per_1D_tensor() : 0); _kernel.setArg(idx2++, static_cast(dst->info()->total_size() - dst->info()->strides_in_bytes().y())); @@ -322,8 +349,7 @@ void ClWinogradOutputTransformKernel::run_op(ITensorPack &tensors, const Window add_4D_tensor_argument(idx, src, slice); add_4D_tensor_argument(idx, dst, slice_out); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h index 6f018967d0b76e2350b6f27280ed2cdc7f4be4e7..65bb9630619416adb0bf40b83907af19740993c6 100644 --- a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h +++ b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_WINOGRAD_OUTPUT_TRANSFORM_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -61,7 +62,11 @@ public: * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const WinogradInfo &winograd_info, + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -69,7 +74,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const WinogradInfo &winograd_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; @@ -77,11 +86,11 @@ public: private: using WinogradKey = std::pair, std::pair>; - bool _is_nhwc{ false }; - int32_t _src_height{ 0 }; - int32_t _dst_width{ 0 }; - int32_t _dst_height{ 0 }; - int32_t _num_tiles_x{ 0 }; + bool _is_nhwc{false}; + int32_t _src_height{0}; + int32_t _dst_width{0}; + int32_t _dst_height{0}; + int32_t _num_tiles_x{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp b/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp index 9350bf74bbcd58166fbf7d90e0b256a5897bef92..b5ebac3b4927063ed16fac76f3b684ce48027202 100644 --- a/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp +++ b/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp @@ -39,14 +39,24 @@ namespace kernels { namespace gemm { -std::pair configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0, - bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose, bool export_to_cl_image) +std::pair configure_lhs_rhs_info(unsigned int m, + unsigned int n, + unsigned int m0, + unsigned int n0, + unsigned int k0, + unsigned int v0, + unsigned int h0, + bool lhs_interleave, + bool rhs_interleave, + bool lhs_transpose, + bool rhs_transpose, + bool export_to_cl_image) { ARM_COMPUTE_ERROR_ON(m0 == 0 || n0 == 0); ARM_COMPUTE_ERROR_ON(v0 == 0); v0 = std::max(std::min(static_cast(m / m0), static_cast(v0)), static_cast(1)); - if(h0 == 0) + if (h0 == 0) { // When h0 is 0, we should take the maximum H0 possible h0 = std::max(n / n0, 1U); @@ -62,17 +72,22 @@ std::pair configure_lhs_rhs_info(unsigned return std::make_pair(lhs_info, rhs_info); } -std::pair select_lhs_rhs_info(std::pair info_img, - std::pair info_buf, - unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair +select_lhs_rhs_info(std::pair info_img, + std::pair info_buf, + unsigned int n, + unsigned int k, + unsigned int b, + DataType data_type) { - ARM_COMPUTE_ERROR_ON_MSG(info_buf.second.export_to_cl_image == true, "The fallback GeMM configuration cannot have export_to_cl_image = true"); + ARM_COMPUTE_ERROR_ON_MSG(info_buf.second.export_to_cl_image == true, + "The fallback GeMM configuration cannot have export_to_cl_image = true"); const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, data_type); const TensorShape shape = misc::shape_calculator::compute_rhs_reshaped_shape(tensor_rhs_info, info_img.second); const TensorInfo tensor_reshaped_info(shape, 1, data_type); - 
if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, info_img.second))) + if (bool(validate_image2d_support_on_rhs(tensor_reshaped_info, info_img.second))) { return info_img; } @@ -90,42 +105,56 @@ void update_padding_for_cl_image(ITensorInfo *tensor) const unsigned int pixel_alignment = get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()); ARM_COMPUTE_ERROR_ON_MSG(pixel_alignment == 0, "Cannot retrieve cl_image pitch alignment"); - if(pixel_alignment == 0) + if (pixel_alignment == 0) { return; } const unsigned int row_pitch_alignment = pixel_alignment * num_floats_per_pixel; - const unsigned int round_up_width = ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment; - const unsigned int padding = round_up_width - stride_y_in_elements; + const unsigned int round_up_width = + ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment; + const unsigned int padding = round_up_width - stride_y_in_elements; tensor->extend_padding(PaddingSize(0, tensor->padding().right + padding, 0, 0)); } Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, const GEMMRHSMatrixInfo &rhs_info) { - if(rhs_info.export_to_cl_image) + if (rhs_info.export_to_cl_image) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 == 2) || (rhs_info.n0 == 3)) && rhs_info.transpose == false, "Export to cl_image only supported with n0 = 4, 8 or 16"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 == 2) || (rhs_info.k0 == 3)) && rhs_info.transpose == true, "Export to cl_image only supported with k0 = 4, 8 or 16"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 == 2) || (rhs_info.n0 == 3)) && rhs_info.transpose == false, + "Export to cl_image only supported with n0 = 4, 8 or 16"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 == 2) || (rhs_info.k0 == 3)) && rhs_info.transpose == true, + "Export to cl_image only supported with k0 = 4, 8 or 16"); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(&tensor_reshaped_info, DataType::F32, DataType::F16); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()), "The extension cl_khr_image2d_from_buffer is not supported on the target platform"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0, "Impossible to retrieve the cl_image pitch alignment"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !image2d_from_buffer_supported(CLKernelLibrary::get().get_device()), + "The extension cl_khr_image2d_from_buffer is not supported on the target platform"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0, + "Impossible to retrieve the cl_image pitch alignment"); // Check the width and height of the output tensor. 
// Since we cannot create a 3d image from a buffer, the third dimension is collapsed on the second dimension const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo(); const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[0] > max_image_w * 4, "Not supported width for cl_image"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[1] * tensor_reshaped_info.tensor_shape()[2] > max_image_h, "Not supported height for cl_image"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[0] > max_image_w * 4, + "Not supported width for cl_image"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + tensor_reshaped_info.tensor_shape()[1] * tensor_reshaped_info.tensor_shape()[2] > max_image_h, + "Not supported height for cl_image"); } return Status{}; } -bool is_mmul_kernel_preferred(const unsigned int m, const unsigned int n, const unsigned int k, const unsigned int b, - const DataType data_type, unsigned int &best_m0, unsigned int &best_n0) +bool is_mmul_kernel_preferred(const unsigned int m, + const unsigned int n, + const unsigned int k, + const unsigned int b, + const DataType data_type, + unsigned int &best_m0, + unsigned int &best_n0) { ARM_COMPUTE_UNUSED(n, k, b, data_type); @@ -141,7 +170,8 @@ bool is_mmul_kernel_preferred(const unsigned int m, const unsigned int n, const return ((k % mmul_k0) == 0) && (gws_y > 4); } -std::pair find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b) { size_t min_acc = std::numeric_limits::max(); size_t min_idx = 0; @@ -150,12 +180,13 @@ std::pair find_lhs_rhs_info(const GeMMConf const size_t num_rows = configs.size(); const size_t num_cols = configs[0].size(); - ARM_COMPUTE_ERROR_ON_MSG(num_cols != 14U, "The entry should have 14 integer values representing: M, N, K, B, M0, N0. K0, V0, H0, INT_LHS, INT_RHS, TRA_LHS, TRA_RHS, IMG_RHS"); + ARM_COMPUTE_ERROR_ON_MSG(num_cols != 14U, "The entry should have 14 integer values representing: M, N, K, B, M0, " + "N0. 
K0, V0, H0, INT_LHS, INT_RHS, TRA_LHS, TRA_RHS, IMG_RHS"); ARM_COMPUTE_UNUSED(num_cols); // Find nearest GeMM workload // Note: the workload does not depend on the K dimension - for(size_t y = 0; y < num_rows; ++y) + for (size_t y = 0; y < num_rows; ++y) { size_t mc0 = static_cast(configs[y][0]); size_t nc0 = static_cast(configs[y][1]); @@ -168,7 +199,7 @@ std::pair find_lhs_rhs_info(const GeMMConf acc += (k - kc0) * (k - kc0); acc += (b - bc0) * (b - bc0); acc = std::sqrt(acc); - if(acc < min_acc) + if (acc < min_acc) { min_acc = acc; min_idx = y; diff --git a/src/gpu/cl/kernels/gemm/ClGemmHelpers.h b/src/gpu/cl/kernels/gemm/ClGemmHelpers.h index 6689b10e696a724ea364d0bf6313554667500eb8..84776fb2071bda6ec00676cc7f20341903f28366 100644 --- a/src/gpu/cl/kernels/gemm/ClGemmHelpers.h +++ b/src/gpu/cl/kernels/gemm/ClGemmHelpers.h @@ -54,8 +54,18 @@ using GeMMConfigsMatrix = std::vector>; * * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo */ -std::pair configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0, - bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose, bool export_to_cl_image = false); +std::pair configure_lhs_rhs_info(unsigned int m, + unsigned int n, + unsigned int m0, + unsigned int n0, + unsigned int k0, + unsigned int v0, + unsigned int h0, + bool lhs_interleave, + bool rhs_interleave, + bool lhs_transpose, + bool rhs_transpose, + bool export_to_cl_image = false); /** Select @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo * @@ -72,9 +82,13 @@ std::pair configure_lhs_rhs_info(unsigned * * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo */ -std::pair select_lhs_rhs_info(std::pair info_img, - std::pair info_buf, - unsigned int n, unsigned int k, unsigned int b, DataType data_type); +std::pair +select_lhs_rhs_info(std::pair info_img, + std::pair info_buf, + unsigned int n, + unsigned int k, + unsigned int b, + DataType data_type); /** Update padding required to export the OpenCL buffer to OpenCL image2d * @@ -103,8 +117,13 @@ Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, * * @return true if MMUL kernel is preferred over kernels w/o MMUL, false otherwise */ -bool is_mmul_kernel_preferred(const unsigned int m, const unsigned int n, const unsigned int k, const unsigned int b, - const DataType data_type, unsigned int &best_m0, unsigned int &best_n0); +bool is_mmul_kernel_preferred(const unsigned int m, + const unsigned int n, + const unsigned int k, + const unsigned int b, + const DataType data_type, + unsigned int &best_m0, + unsigned int &best_n0); /** Find the preferred configurations for the LHS and RHS tensor using the GeMMConfigsMatrix provided by the user * @@ -116,7 +135,8 @@ bool is_mmul_kernel_preferred(const unsigned int m, const unsigned int n, const * * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo */ -std::pair find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b); +std::pair +find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b); } // namespace gemm } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h b/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h index a49836cfdaca8e38b6982318e56ffb2d1cedecb2..9d0863396343e9c62ada6a03ada0f8484747b3fe 100644 --- a/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h +++ 
b/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h @@ -26,6 +26,7 @@ #include "arm_compute/core/GPUTarget.h" #include "arm_compute/core/Types.h" + #include "src/core/common/Macros.h" #include @@ -56,8 +57,7 @@ public: * @param[in] func_int8 Function to call for GEMM Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL) * */ - CLGEMMConfigArray(T func_f32, T func_f16, T func_int8) - : _configs{ func_f32, func_f16, func_int8 } + CLGEMMConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8} { } @@ -69,7 +69,7 @@ public: */ T get_function(DataType data_type) { - switch(data_type) + switch (data_type) { case DataType::F32: return _configs.at(DT_F32); @@ -96,8 +96,7 @@ public: * * @param[in] arch GPU target */ - IClGemmKernelConfig(GPUTarget arch) - : _target(arch) + IClGemmKernelConfig(GPUTarget arch) : _target(arch) { } ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClGemmKernelConfig); @@ -111,7 +110,8 @@ public: * @param[in] b Batch size * @param[in] data_type Data type */ - virtual std::pair configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) = 0; + virtual std::pair + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) = 0; protected: GPUTarget _target; diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp index d74c7fac9bed6f3e8bb9e4b5a30cc5d1055847df..2f37eef31fcbd23921002a3f600691efb27d5054 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/GPUTarget.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include @@ -38,31 +39,34 @@ namespace kernels { namespace gemm { -ClGemmDefaultConfigNativeBifrost::ClGemmDefaultConfigNativeBifrost(GPUTarget gpu) - : IClGemmKernelConfig(gpu) +ClGemmDefaultConfigNativeBifrost::ClGemmDefaultConfigNativeBifrost(GPUTarget gpu) : IClGemmKernelConfig(gpu) { } -std::pair ClGemmDefaultConfigNativeBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair ClGemmDefaultConfigNativeBifrost::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair (ClGemmDefaultConfigNativeBifrost::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); + using ConfigurationFunctionExecutorPtr = std::pair ( + ClGemmDefaultConfigNativeBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - CLGEMMConfigArray configs_G71(&ClGemmDefaultConfigNativeBifrost::configure_G71_f32, - &ClGemmDefaultConfigNativeBifrost::configure_G71_f32, // We use the F32 heuristic - &ClGemmDefaultConfigNativeBifrost::configure_G71_u8); + CLGEMMConfigArray configs_G71( + &ClGemmDefaultConfigNativeBifrost::configure_G71_f32, + &ClGemmDefaultConfigNativeBifrost::configure_G71_f32, // We use the F32 heuristic + &ClGemmDefaultConfigNativeBifrost::configure_G71_u8); - CLGEMMConfigArray configs_G76(&ClGemmDefaultConfigNativeBifrost::configure_G76_f32, - &ClGemmDefaultConfigNativeBifrost::configure_G76_f32, // We use the F32 heuristic - &ClGemmDefaultConfigNativeBifrost::configure_G76_u8); + CLGEMMConfigArray configs_G76( + &ClGemmDefaultConfigNativeBifrost::configure_G76_f32, + 
&ClGemmDefaultConfigNativeBifrost::configure_G76_f32, // We use the F32 heuristic + &ClGemmDefaultConfigNativeBifrost::configure_G76_u8); - CLGEMMConfigArray configs_G7x(&ClGemmDefaultConfigNativeBifrost::configure_default_f32, - &ClGemmDefaultConfigNativeBifrost::configure_default_f32, // We use the F32 heuristic - &ClGemmDefaultConfigNativeBifrost::configure_default_u8); + CLGEMMConfigArray configs_G7x( + &ClGemmDefaultConfigNativeBifrost::configure_default_f32, + &ClGemmDefaultConfigNativeBifrost::configure_default_f32, // We use the F32 heuristic + &ClGemmDefaultConfigNativeBifrost::configure_default_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G76: func = configs_G76.get_function(data_type); @@ -79,18 +83,19 @@ std::pair ClGemmDefaultConfigNativeBifrost return (this->*func)(m, n, k, b); } -std::pair ClGemmDefaultConfigNativeBifrost::configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigNativeBifrost::configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n < 2048) + if (n < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); } - else if(n >= 2048 && n < 8192) + else if (n >= 2048 && n < 8192) { return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false); } @@ -105,20 +110,21 @@ std::pair ClGemmDefaultConfigNativeBifrost } } -std::pair ClGemmDefaultConfigNativeBifrost::configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigNativeBifrost::configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(dot8_supported(CLKernelLibrary::get().get_device())) + if (dot8_supported(CLKernelLibrary::get().get_device())) { - if(m == 1) + if (m == 1) { - if(n < 2048) + if (n < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false); } - else if(n >= 2048 && n < 16384) + else if (n >= 2048 && n < 16384) { return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); } @@ -129,7 +135,7 @@ std::pair ClGemmDefaultConfigNativeBifrost } else { - if(m < 64) + if (m < 64) { return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false); } @@ -141,9 +147,9 @@ std::pair ClGemmDefaultConfigNativeBifrost } else { - if(m == 1) + if (m == 1) { - if(n < 8192) + if (n < 8192) { return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); } @@ -159,24 +165,25 @@ std::pair ClGemmDefaultConfigNativeBifrost } } -std::pair ClGemmDefaultConfigNativeBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigNativeBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n > 4196) + if (n > 4196) { return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 1, false, false, false, false); } else { - if(k < 2048) + if (k < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 1, false, false, false, false); } - else if(k >= 2048 && k < 16384) + else if (k >= 2048 && k < 16384) { return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); } @@ -192,18 +199,19 @@ std::pair ClGemmDefaultConfigNativeBifrost } } -std::pair 
ClGemmDefaultConfigNativeBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigNativeBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n < 2048) + if (n < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false); } - else if(n >= 2048 && n < 16384) + else if (n >= 2048 && n < 16384) { return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); } @@ -214,7 +222,7 @@ std::pair ClGemmDefaultConfigNativeBifrost } else { - if(m < 64) + if (m < 64) { return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false); } @@ -225,7 +233,8 @@ std::pair ClGemmDefaultConfigNativeBifrost } } -std::pair ClGemmDefaultConfigNativeBifrost::configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigNativeBifrost::configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); @@ -233,7 +242,8 @@ std::pair ClGemmDefaultConfigNativeBifrost return configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 1, false, false, false, false); } -std::pair ClGemmDefaultConfigNativeBifrost::configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigNativeBifrost::configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); @@ -243,4 +253,4 @@ std::pair ClGemmDefaultConfigNativeBifrost } // namespace gemm } // namespace kernels } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h index 9af5dc413545e4064c705f06e1344d50cada8cdb..f822daae5310b4832ee9211f53442b6800ddd588 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h @@ -45,15 +45,22 @@ public: ClGemmDefaultConfigNativeBifrost(GPUTarget gpu); // Inherited overridden method - std::pair configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + 
configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp index b9f36c7210bd0dd5299186b503860ecd209bcd15..f87fb1b659b726a5ffec78d887ff8e0c66a22c49 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/GPUTarget.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include @@ -38,18 +39,17 @@ namespace kernels { namespace gemm { -ClGemmDefaultConfigNativeMidgard::ClGemmDefaultConfigNativeMidgard(GPUTarget gpu) - : IClGemmKernelConfig(gpu) +ClGemmDefaultConfigNativeMidgard::ClGemmDefaultConfigNativeMidgard(GPUTarget gpu) : IClGemmKernelConfig(gpu) { } -std::pair ClGemmDefaultConfigNativeMidgard::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair ClGemmDefaultConfigNativeMidgard::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair (ClGemmDefaultConfigNativeMidgard::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); + using ConfigurationFunctionExecutorPtr = std::pair ( + ClGemmDefaultConfigNativeMidgard::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - CLGEMMConfigArray configs_default(nullptr, - nullptr, + CLGEMMConfigArray configs_default(nullptr, nullptr, &ClGemmDefaultConfigNativeMidgard::default_q8); auto func = configs_default.get_function(data_type); @@ -57,7 +57,8 @@ std::pair ClGemmDefaultConfigNativeMidgard return (this->*func)(m, n, k, b); } -std::pair ClGemmDefaultConfigNativeMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigNativeMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); @@ -70,4 +71,4 @@ std::pair ClGemmDefaultConfigNativeMidgard } // namespace gemm } // namespace kernels } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h index c055753c48227bc3c51dff723af800e9fee5cf7b..fa76c5dba77ebdc54cf7595cda7a941d81c70595 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h @@ -45,10 +45,12 @@ public: ClGemmDefaultConfigNativeMidgard(GPUTarget gpu); // Inherited overridden method - std::pair configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git 
a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp index 95a4d2bd6999a757b60e9c29d381ef4ceaeb3037..97a1298b0a8dab38898569c757deca6eb40240ed 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/GPUTarget.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include @@ -38,37 +39,38 @@ namespace kernels { namespace gemm { -ClGemmDefaultConfigNativeValhall::ClGemmDefaultConfigNativeValhall(GPUTarget gpu) - : IClGemmKernelConfig(gpu) +ClGemmDefaultConfigNativeValhall::ClGemmDefaultConfigNativeValhall(GPUTarget gpu) : IClGemmKernelConfig(gpu) { } -std::pair ClGemmDefaultConfigNativeValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair ClGemmDefaultConfigNativeValhall::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair (ClGemmDefaultConfigNativeValhall::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); + using ConfigurationFunctionExecutorPtr = std::pair ( + ClGemmDefaultConfigNativeValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - CLGEMMConfigArray configs_default(&ClGemmDefaultConfigNativeValhall::configure_G77_f32, - &ClGemmDefaultConfigNativeValhall::configure_G77_f16, - &ClGemmDefaultConfigNativeValhall::configure_G77_u8); + CLGEMMConfigArray configs_default( + &ClGemmDefaultConfigNativeValhall::configure_G77_f32, &ClGemmDefaultConfigNativeValhall::configure_G77_f16, + &ClGemmDefaultConfigNativeValhall::configure_G77_u8); auto func = configs_default.get_function(data_type); ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM"); return (this->*func)(m, n, k, b); } -std::pair ClGemmDefaultConfigNativeValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigNativeValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n < 2048) + if (n < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); } - else if(n >= 2048 && n < 8192) + else if (n >= 2048 && n < 8192) { return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false); } @@ -83,18 +85,19 @@ std::pair ClGemmDefaultConfigNativeValhall } } -std::pair ClGemmDefaultConfigNativeValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigNativeValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n < 2048) + if (n < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); } - else if(n >= 2048 && n < 8192) + else if (n >= 2048 && n < 8192) { return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false); } @@ -109,20 +112,21 @@ std::pair ClGemmDefaultConfigNativeValhall } } -std::pair ClGemmDefaultConfigNativeValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair 
+ClGemmDefaultConfigNativeValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(dot8_supported(CLKernelLibrary::get().get_device())) + if (dot8_supported(CLKernelLibrary::get().get_device())) { - if(m == 1) + if (m == 1) { - if(n < 2048) + if (n < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false); } - else if(n >= 2048 && n < 16384) + else if (n >= 2048 && n < 16384) { return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); } @@ -133,7 +137,7 @@ std::pair ClGemmDefaultConfigNativeValhall } else { - if(m < 64) + if (m < 64) { return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false); } @@ -145,9 +149,9 @@ std::pair ClGemmDefaultConfigNativeValhall } else { - if(m == 1) + if (m == 1) { - if(n < 8192) + if (n < 8192) { return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); } @@ -165,4 +169,4 @@ std::pair ClGemmDefaultConfigNativeValhall } // namespace gemm } // namespace kernels } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h index f0f812fd46132e4ec24b741c690a1ac76fae6e71..c91b0952794b7ebe9665ab46cafbeeabf1cdcf38 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h @@ -45,12 +45,16 @@ public: ClGemmDefaultConfigNativeValhall(GPUTarget gpu); // Inherited overridden method - std::pair configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h b/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h index cf8412830bfe4c40e3d732b0ad2fd1a81d956635..955bb3c01a551a1fee1c27980c161edc97feb626 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h +++ b/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h @@ -51,7 +51,7 @@ public: */ static std::unique_ptr create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch (get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: return std::make_unique(gpu); diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp index 657018eb533b408823569b688ae6599227d795ee..c956c347ef8ef3356a1c5e3ecc3e044226e89d36 100644 --- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp +++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp @@ -29,6 +29,7 @@ #include 
"arm_compute/core/TensorInfo.h" #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include @@ -43,30 +44,31 @@ namespace gemm { using namespace arm_compute::misc::shape_calculator; -ClGemmDefaultConfigReshapedBifrost::ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu) - : IClGemmKernelConfig(gpu) +ClGemmDefaultConfigReshapedBifrost::ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu) : IClGemmKernelConfig(gpu) { } -std::pair ClGemmDefaultConfigReshapedBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair ClGemmDefaultConfigReshapedBifrost::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair (ClGemmDefaultConfigReshapedBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + using ConfigurationFunctionExecutorPtr = std::pair ( + ClGemmDefaultConfigReshapedBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - CLGEMMConfigArray configs_G7x(&ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32, - &ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16, - &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8); + CLGEMMConfigArray configs_G7x( + &ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32, &ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16, + &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8); - CLGEMMConfigArray configs_G52(&ClGemmDefaultConfigReshapedBifrost::configure_G52_f32, - &ClGemmDefaultConfigReshapedBifrost::configure_G52_f16, - &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8); + CLGEMMConfigArray configs_G52( + &ClGemmDefaultConfigReshapedBifrost::configure_G52_f32, &ClGemmDefaultConfigReshapedBifrost::configure_G52_f16, + &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8); - CLGEMMConfigArray configs_G76(&ClGemmDefaultConfigReshapedBifrost::configure_G76_f32, - &ClGemmDefaultConfigReshapedBifrost::configure_G76_f16, - &ClGemmDefaultConfigReshapedBifrost::configure_G76_u8); + CLGEMMConfigArray configs_G76( + &ClGemmDefaultConfigReshapedBifrost::configure_G76_f32, &ClGemmDefaultConfigReshapedBifrost::configure_G76_f16, + &ClGemmDefaultConfigReshapedBifrost::configure_G76_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G76: func = configs_G76.get_function(data_type); @@ -83,12 +85,13 @@ std::pair ClGemmDefaultConfigReshapedBifro return (this->*func)(m, n, k, b); } -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true); } @@ -98,12 +101,13 @@ std::pair ClGemmDefaultConfigReshapedBifro } } -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 8, 8, 2, true, true, true, false); } @@ -113,14 +117,15 @@ std::pair 
ClGemmDefaultConfigReshapedBifro } } -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(dot8_supported(CLKernelLibrary::get().get_device())) + if (dot8_supported(CLKernelLibrary::get().get_device())) { - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 16, 2, 2, true, false, false, true); } @@ -131,7 +136,7 @@ std::pair ClGemmDefaultConfigReshapedBifro } else { - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 8, 2, 2, true, false, false, true); } @@ -142,7 +147,8 @@ std::pair ClGemmDefaultConfigReshapedBifro } } -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigReshapedBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float r_mn = static_cast(m) / static_cast(n); const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; @@ -154,100 +160,108 @@ std::pair ClGemmDefaultConfigReshapedBifro GEMMLHSMatrixInfo lhs_info_img; GEMMRHSMatrixInfo rhs_info_img; - if(workload <= 274.4000f) + if (workload <= 274.4000f) { - if(r_nk <= 0.7461f) + if (r_nk <= 0.7461f) { - if(r_mn <= 21.1667f) + if (r_mn <= 21.1667f) { return configure_lhs_rhs_info(m, n, 4, 2, 4, 4, 4, false, true, true, false, false); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else { - if(r_mk <= 17.3926f) + if (r_mk <= 17.3926f) { - if(workload <= 542.4000f) + if (workload <= 542.4000f) { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, 
true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else { - if(r_nk <= 0.5463f) + if (r_nk <= 0.5463f) { - if(workload <= 11767.6001f) + if (workload <= 11767.6001f) { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } } } -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigReshapedBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - if(workload <= 323.4000f) + 
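// --- Illustrative sketch (not part of this patch) ---------------------------
// The configure_* heuristics in this file branch on a few scalar features of
// the GEMM shape: a "workload" proxy and the m/n, m/k, n/k ratios, compared
// against tuned thresholds such as 274.4f or 323.4f above. The stand-alone
// sketch below only mirrors that feature computation; GemmShapeFeatures and
// compute_features are hypothetical names, and float is assumed as the target
// type of the static_cast calls shown in the diff.
#include <iostream>

struct GemmShapeFeatures // hypothetical helper, not an Arm Compute Library type
{
    float workload; // (m * n * b) / 20.0f, as used by the heuristics above
    float r_mn;     // m / n
    float r_mk;     // m / k
    float r_nk;     // n / k
};

static GemmShapeFeatures compute_features(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
    GemmShapeFeatures f{};
    f.workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
    f.r_mn     = static_cast<float>(m) / static_cast<float>(n);
    f.r_mk     = static_cast<float>(m) / static_cast<float>(k);
    f.r_nk     = static_cast<float>(n) / static_cast<float>(k);
    return f;
}

int main()
{
    // Example shape: the heuristics above would compare these values against
    // thresholds like "workload <= 323.4000f" to pick one tile configuration.
    const GemmShapeFeatures f = compute_features(64, 64, 128, 1);
    std::cout << f.workload << ' ' << f.r_mn << ' ' << f.r_mk << ' ' << f.r_nk << '\n';
    return 0;
}
// --- End of illustrative sketch ----------------------------------------------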
if (workload <= 323.4000f) { return configure_lhs_rhs_info(m, n, 2, 2, 8, 4, 8, false, false, false, true, false); } @@ -257,7 +271,8 @@ std::pair ClGemmDefaultConfigReshapedBifro } } -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigReshapedBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); @@ -268,7 +283,7 @@ std::pair ClGemmDefaultConfigReshapedBifro GEMMRHSMatrixInfo rhs_info_img; // Get lhs_info/rhs_info in case of OpenCL buffer - if(n <= 4) + if (n <= 4) { std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true); } @@ -279,15 +294,17 @@ std::pair ClGemmDefaultConfigReshapedBifro // Get lhs_info/rhs_info in case of OpenCL image // Condition on the GPU workload - if((m / 4) * (n / 4) >= 2560) + if ((m / 4) * (n / 4) >= 2560) { // Big workload - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 8, true, true, true, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 8, true, true, true, false, true); } else { // Small workload - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 1, true, true, true, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 1, true, true, true, false, true); } const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32); @@ -297,7 +314,7 @@ std::pair ClGemmDefaultConfigReshapedBifro // In case of vector by matrix with few work-items, we use the OpenCL buffer rather than the OpenCL image2d const bool use_cl_image2d = (n <= 4) ? 
false : true; - if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d) + if (bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d) { return std::make_pair(lhs_info_img, rhs_info_img); } @@ -307,16 +324,17 @@ std::pair ClGemmDefaultConfigReshapedBifro } } -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigReshapedBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; const float r_mk = static_cast(m) / static_cast(k); - if(workload <= 1595.2000f) + if (workload <= 1595.2000f) { - if(r_mk <= 2.1044f) + if (r_mk <= 2.1044f) { - if(workload <= 870.4000f) + if (workload <= 870.4000f) { return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 2, true, false, true, false, false); } @@ -336,12 +354,13 @@ std::pair ClGemmDefaultConfigReshapedBifro } } -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigReshapedBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 16, 4, 1, false, false, false, true); } diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h index d86d1ba0a7cd5da1e8217ee77de6f57eccd04fba..9227ec25510da67be11b142e489e0586d5315ad9 100644 --- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h +++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h @@ -45,17 +45,26 @@ public: ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu); // Inherited overridden method - std::pair configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, 
unsigned int b); + std::pair + configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp index 58d0873b869049ddf424bfa8282a63fdaff5ba4b..70b324eb5aef84c6123aa4f0457d9eb6450da38b 100644 --- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp +++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/GPUTarget.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include @@ -38,26 +39,27 @@ namespace kernels { namespace gemm { -ClGemmDefaultConfigReshapedValhall::ClGemmDefaultConfigReshapedValhall(GPUTarget gpu) - : IClGemmKernelConfig(gpu) +ClGemmDefaultConfigReshapedValhall::ClGemmDefaultConfigReshapedValhall(GPUTarget gpu) : IClGemmKernelConfig(gpu) { } -std::pair ClGemmDefaultConfigReshapedValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair ClGemmDefaultConfigReshapedValhall::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair (ClGemmDefaultConfigReshapedValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + using ConfigurationFunctionExecutorPtr = std::pair ( + ClGemmDefaultConfigReshapedValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - CLGEMMConfigArray configs_G77(&ClGemmDefaultConfigReshapedValhall::configure_G77_f32, - &ClGemmDefaultConfigReshapedValhall::configure_G77_f16, - &ClGemmDefaultConfigReshapedValhall::configure_G77_u8); + CLGEMMConfigArray configs_G77( + &ClGemmDefaultConfigReshapedValhall::configure_G77_f32, &ClGemmDefaultConfigReshapedValhall::configure_G77_f16, + &ClGemmDefaultConfigReshapedValhall::configure_G77_u8); - CLGEMMConfigArray configs_G78(&ClGemmDefaultConfigReshapedValhall::configure_G78_f32, - &ClGemmDefaultConfigReshapedValhall::configure_G78_f16, - &ClGemmDefaultConfigReshapedValhall::configure_G77_u8); + CLGEMMConfigArray configs_G78( + &ClGemmDefaultConfigReshapedValhall::configure_G78_f32, &ClGemmDefaultConfigReshapedValhall::configure_G78_f16, + &ClGemmDefaultConfigReshapedValhall::configure_G77_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G78: func = configs_G78.get_function(data_type); @@ -72,12 +74,13 @@ std::pair ClGemmDefaultConfigReshapedValha return (this->*func)(m, n, k, b); } -std::pair ClGemmDefaultConfigReshapedValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigReshapedValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, 1, 0, 0, 1); } @@ -87,7 +90,8 @@ std::pair ClGemmDefaultConfigReshapedValha } } -std::pair ClGemmDefaultConfigReshapedValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigReshapedValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int 
k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); @@ -104,17 +108,17 @@ std::pair ClGemmDefaultConfigReshapedValha std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 0); - if(r_mk <= 0.11824845522642136) + if (r_mk <= 0.11824845522642136) { - if(workload <= 880.0) + if (workload <= 880.0) { return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0); } else { - if(r_nk <= 0.42521367967128754) + if (r_nk <= 0.42521367967128754) { - if(workload <= 1726.4000244140625) + if (workload <= 1726.4000244140625) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 0); } @@ -123,13 +127,12 @@ std::pair ClGemmDefaultConfigReshapedValha std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } else { - if(workload <= 1241.6000366210938) + if (workload <= 1241.6000366210938) { return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0); } @@ -142,17 +145,16 @@ std::pair ClGemmDefaultConfigReshapedValha } else { - if(workload <= 11404.7998046875) + if (workload <= 11404.7998046875) { - if(r_mk <= 1.0126488208770752) + if (r_mk <= 1.0126488208770752) { - if(r_mn <= 2.545312523841858) + if (r_mn <= 2.545312523841858) { std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } else { @@ -161,43 +163,39 @@ std::pair ClGemmDefaultConfigReshapedValha } else { - if(workload <= 2881.199951171875) + if (workload <= 2881.199951171875) { std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, 0, 0, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } else { std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } } else { - if(r_nk <= 0.5765306055545807) + if (r_nk <= 0.5765306055545807) { - if(r_mn <= 6.010416746139526) + if (r_mn <= 6.010416746139526) { std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } else { std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 1, 0, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } else @@ -205,27 +203,27 @@ std::pair ClGemmDefaultConfigReshapedValha std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 1, 0, 1, 0, 1); return 
select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } } } -std::pair ClGemmDefaultConfigReshapedValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigReshapedValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float r_mn = static_cast(m) / static_cast(n); const float r_mk = static_cast(m) / static_cast(k); const float r_nk = static_cast(n) / static_cast(k); const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - if(workload <= 1288.0000f) + if (workload <= 1288.0000f) { - if(workload <= 505.6000f) + if (workload <= 505.6000f) { - if(r_mn <= 0.4466f) + if (r_mn <= 0.4466f) { - if(r_nk <= 0.2384f) + if (r_nk <= 0.2384f) { return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); } @@ -241,9 +239,9 @@ std::pair ClGemmDefaultConfigReshapedValha } else { - if(r_mn <= 0.2250f) + if (r_mn <= 0.2250f) { - if(r_mn <= 0.1599f) + if (r_mn <= 0.1599f) { return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); } @@ -254,11 +252,11 @@ std::pair ClGemmDefaultConfigReshapedValha } else { - if(r_mk <= 0.7609f) + if (r_mk <= 0.7609f) { - if(r_mn <= 2.5453f) + if (r_mn <= 2.5453f) { - if(workload <= 1089.6000f) + if (workload <= 1089.6000f) { return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); } @@ -281,29 +279,29 @@ std::pair ClGemmDefaultConfigReshapedValha } else { - if(workload <= 5434.4001f) + if (workload <= 5434.4001f) { - if(workload <= 1603.2000f) + if (workload <= 1603.2000f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); } else { - if(r_nk <= 0.6192f) + if (r_nk <= 0.6192f) { - if(r_mn <= 16.1016f) + if (r_mn <= 16.1016f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); } else { - if(workload <= 2750.0000f) + if (workload <= 2750.0000f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); } else { - if(r_mk <= 6.3151f) + if (r_mk <= 6.3151f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1); } @@ -316,15 +314,15 @@ std::pair ClGemmDefaultConfigReshapedValha } else { - if(r_mk <= 0.0387f) + if (r_mk <= 0.0387f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1); } else { - if(r_mk <= 2.5859f) + if (r_mk <= 2.5859f) { - if(r_mk <= 0.2734f) + if (r_mk <= 0.2734f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1); } @@ -343,13 +341,13 @@ std::pair ClGemmDefaultConfigReshapedValha } else { - if(r_mk <= 25.7500f) + if (r_mk <= 25.7500f) { - if(r_mk <= 0.3615f) + if (r_mk <= 0.3615f) { - if(r_mn <= 0.0913f) + if (r_mn <= 0.0913f) { - if(r_mk <= 0.0683f) + if (r_mk <= 0.0683f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1); } @@ -365,15 +363,15 @@ std::pair ClGemmDefaultConfigReshapedValha } else { - if(workload <= 11174.3999f) + if (workload <= 11174.3999f) { - if(r_mk <= 0.8047f) + if (r_mk <= 0.8047f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); } else { - if(workload <= 7185.5999f) + if (workload <= 7185.5999f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1); } @@ -385,9 +383,9 @@ std::pair ClGemmDefaultConfigReshapedValha } else { - if(workload <= 17917.5000f) + if (workload <= 17917.5000f) { - if(r_mk <= 1.5078f) + if (r_mk <= 1.5078f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 
0, 1); } @@ -398,7 +396,7 @@ std::pair ClGemmDefaultConfigReshapedValha } else { - if(workload <= 34449.6016f) + if (workload <= 34449.6016f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); } @@ -412,11 +410,11 @@ std::pair ClGemmDefaultConfigReshapedValha } else { - if(r_mk <= 331.1111f) + if (r_mk <= 331.1111f) { - if(workload <= 53397.5996f) + if (workload <= 53397.5996f) { - if(r_mn <= 57.8063f) + if (r_mn <= 57.8063f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); } @@ -427,7 +425,7 @@ std::pair ClGemmDefaultConfigReshapedValha } else { - if(r_nk <= 0.9211f) + if (r_nk <= 0.9211f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1); } @@ -439,7 +437,7 @@ std::pair ClGemmDefaultConfigReshapedValha } else { - if(workload <= 38070.4004f) + if (workload <= 38070.4004f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1); } @@ -453,27 +451,28 @@ std::pair ClGemmDefaultConfigReshapedValha } } -std::pair ClGemmDefaultConfigReshapedValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigReshapedValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float r_mn = static_cast(m) / static_cast(n); const float r_nk = static_cast(n) / static_cast(k); const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - if(workload <= 801.6000f) + if (workload <= 801.6000f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1); } else { - if(r_mn <= 0.1211f) + if (r_mn <= 0.1211f) { - if(workload <= 3296.0000f) + if (workload <= 3296.0000f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); } else { - if(r_nk <= 1.0625f) + if (r_nk <= 1.0625f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); } @@ -485,15 +484,15 @@ std::pair ClGemmDefaultConfigReshapedValha } else { - if(workload <= 5068.8000f) + if (workload <= 5068.8000f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1); } else { - if(r_nk <= 0.2361f) + if (r_nk <= 0.2361f) { - if(workload <= 12630.0000f) + if (workload <= 12630.0000f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1); } @@ -504,7 +503,7 @@ std::pair ClGemmDefaultConfigReshapedValha } else { - if(workload <= 178790.3984f) + if (workload <= 178790.3984f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); } @@ -518,12 +517,13 @@ std::pair ClGemmDefaultConfigReshapedValha } } -std::pair ClGemmDefaultConfigReshapedValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigReshapedValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 16, 4, 1, 0, 0, 0, 1); } diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h index 466eda00a6f7206ec479eef03bcc584fe48817a7..5f62efb59e00898132a38f82317d6b9e3e5fefd1 100644 --- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h +++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h @@ -45,14 +45,20 @@ public: ClGemmDefaultConfigReshapedValhall(GPUTarget gpu); // Inherited overridden method - std::pair configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) 
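// --- Illustrative sketch (not part of this patch) ---------------------------
// The configure() methods of these kernel-config classes dispatch through
// pointers to member functions: one table per GPU target, one entry per data
// type, then "(this->*func)(m, n, k, b)". The minimal sketch below reproduces
// only that dispatch shape; DemoDataType, DemoConfig and ExampleHeuristic are
// hypothetical names, and the return value is a plain struct rather than the
// library's GEMMLHSMatrixInfo/GEMMRHSMatrixInfo pair.
#include <array>
#include <cstdio>

enum class DemoDataType { F32, F16, U8 };

struct DemoConfig { int m0, n0, k0; }; // stand-in for the LHS/RHS info pair

class ExampleHeuristic
{
public:
    DemoConfig configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DemoDataType dt)
    {
        using Fn = DemoConfig (ExampleHeuristic::*)(unsigned int, unsigned int, unsigned int, unsigned int);

        // One entry per data type, mirroring CLGEMMConfigArray(f32, f16, u8).
        const std::array<Fn, 3> table = {&ExampleHeuristic::configure_f32,
                                         &ExampleHeuristic::configure_f16,
                                         &ExampleHeuristic::configure_u8};

        const Fn func = table[static_cast<int>(dt)];
        return (this->*func)(m, n, k, b);
    }

private:
    DemoConfig configure_f32(unsigned int, unsigned int n, unsigned int, unsigned int) { return {4, n <= 4 ? 2 : 4, 4}; }
    DemoConfig configure_f16(unsigned int, unsigned int, unsigned int, unsigned int)   { return {4, 4, 8}; }
    DemoConfig configure_u8(unsigned int, unsigned int, unsigned int, unsigned int)    { return {4, 2, 16}; }
};

int main()
{
    ExampleHeuristic h;
    const DemoConfig c = h.configure(128, 128, 64, 1, DemoDataType::F16);
    std::printf("m0=%d n0=%d k0=%d\n", c.m0, c.n0, c.k0);
    return 0;
}
// --- End of illustrative sketch ----------------------------------------------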
override; + std::pair + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h index 1c32f1358b9e65d1c0daf1398359662dcc496761..83928b3f4fff994711cb946bf46de3f7a919ea0b 100644 --- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h +++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h @@ -50,7 +50,7 @@ public: */ static std::unique_ptr create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch (get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: case GPUTarget::BIFROST: diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp index 9c23d9c998bb589b69cd2e14accb4d384ef13dc5..c4825bfbebfb7e25c3ac12c192656a92453eaafe 100644 --- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp +++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp @@ -29,7 +29,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" + #include namespace arm_compute @@ -47,33 +49,39 @@ ClGemmDefaultConfigReshapedRhsOnlyBifrost::ClGemmDefaultConfigReshapedRhsOnlyBif { } -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair (ClGemmDefaultConfigReshapedRhsOnlyBifrost::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); - - CLGEMMConfigArray configs_G51(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8); - - CLGEMMConfigArray configs_G52(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8); - - CLGEMMConfigArray 
configs_G31(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8); - - CLGEMMConfigArray configs_G76(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8); - - CLGEMMConfigArray configs_G7x(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8); + using ConfigurationFunctionExecutorPtr = std::pair ( + ClGemmDefaultConfigReshapedRhsOnlyBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + + CLGEMMConfigArray configs_G51( + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8); + + CLGEMMConfigArray configs_G52( + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8); + + CLGEMMConfigArray configs_G31( + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8); + + CLGEMMConfigArray configs_G76( + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8); + + CLGEMMConfigArray configs_G7x( + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G76: func = configs_G76.get_function(data_type); @@ -96,14 +104,15 @@ std::pair ClGemmDefaultConfigReshapedRhsOn return (this->*func)(m, n, k, b); } -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n <= 2548) + if (n <= 2548) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, false, true, false, true, false); } @@ -118,12 +127,13 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } } -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { const unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, 0, 1, 0, 1); @@ -131,7 +141,7 @@ std::pair ClGemmDefaultConfigReshapedRhsOn else { const int h0 = std::max(std::min(static_cast(n / 4), static_cast(256)), static_cast(1)); - if(m >= 28) + if (m >= 28) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, 0, 1, 0, 1); } @@ -142,7 +152,8 @@ std::pair ClGemmDefaultConfigReshapedRhsOn 
} } -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); @@ -154,9 +165,9 @@ std::pair ClGemmDefaultConfigReshapedRhsOn const bool is_workload_big = ((m * n * b) / 16) >= 2048; - if(m == 1) + if (m == 1) { - if(n >= 8192) + if (n >= 8192) { const unsigned int h0 = std::max(n / 4, 1U); return configure_lhs_rhs_info(m, n, 1, 4, 8, 1, h0, false, true, false, true, false); @@ -164,7 +175,7 @@ std::pair ClGemmDefaultConfigReshapedRhsOn else { const unsigned int h0 = std::max(n / 2, 1U); - if(n <= 204) + if (n <= 204) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true, false); } @@ -177,25 +188,29 @@ std::pair ClGemmDefaultConfigReshapedRhsOn else { const int h0 = std::max(std::min(static_cast(n / 4), static_cast(16)), static_cast(1)); - if(is_workload_big) + if (is_workload_big) { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, true); } else { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true); } } // Get lhs_info/rhs_info in case of OpenCL image const int h0 = std::max(std::min(static_cast(n / 4), static_cast(16)), static_cast(1)); - if(is_workload_big) + if (is_workload_big) { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true, true); } const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32); @@ -205,7 +220,7 @@ std::pair ClGemmDefaultConfigReshapedRhsOn // In case of vector by matrix or small workloads, we use the OpenCL buffer rather than the OpenCL image2d const bool use_cl_image2d = ((m == 1) || ((((m * n * b) / 16) < 2048) && n < 128)) ? 
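// --- Illustrative sketch (not part of this patch) ---------------------------
// Several configure_* paths here build two candidate configurations, one for
// an OpenCL buffer and one for an OpenCL image2d (the trailing boolean of
// configure_lhs_rhs_info), then keep the image2d variant only when
// validate_image2d_support_on_rhs() accepts the reshaped RHS and the workload
// gate (use_cl_image2d) passes. The sketch below mirrors only that selection;
// Candidate, rhs_fits_in_image2d and pick_candidate are hypothetical, and the
// image-size bound is assumed, not the real OpenCL capability query.
#include <cstdio>

struct Candidate { bool export_to_cl_image; int h0; }; // stand-in config

// Stand-in for validate_image2d_support_on_rhs(): purely illustrative bound.
static bool rhs_fits_in_image2d(unsigned int n, unsigned int k, unsigned int max_image_width = 16384)
{
    return (n * k) / 4 <= max_image_width * 16; // assumed limit, for the sketch only
}

static Candidate pick_candidate(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
    const Candidate buf{false, 4};
    const Candidate img{true, 4};

    // Mirrors "use_cl_image2d": skip the image path for vector-by-matrix or
    // small workloads, as in configure_G76_f32 above.
    const bool use_cl_image2d = !((m == 1) || ((((m * n * b) / 16) < 2048) && n < 128));

    return (use_cl_image2d && rhs_fits_in_image2d(n, k)) ? img : buf;
}

int main()
{
    const Candidate c = pick_candidate(256, 256, 256, 1);
    std::printf("export_to_cl_image=%d h0=%d\n", c.export_to_cl_image ? 1 : 0, c.h0);
    return 0;
}
// --- End of illustrative sketch ----------------------------------------------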
false : true; - if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d) + if (bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d) { return std::make_pair(lhs_info_img, rhs_info_img); } @@ -215,7 +230,8 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } } -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; const float r_nk = static_cast(n) / static_cast(k); @@ -225,46 +241,49 @@ std::pair ClGemmDefaultConfigReshapedRhsOn GEMMLHSMatrixInfo lhs_info_img; GEMMRHSMatrixInfo rhs_info_img; - if(m == 1) + if (m == 1) { - if(r_nk <= 0.4664f) + if (r_nk <= 0.4664f) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 16, false, true, false, true, false); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else { - if(workload <= 274.4000f) + if (workload <= 274.4000f) { return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 16, false, false, false, true, false); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } } -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { const unsigned int n0 = n < 1280 ? 
2 : 4; const unsigned int h0 = std::max(n / n0, 1U); @@ -276,14 +295,15 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } } -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n > 2048) + if (n > 2048) { const unsigned int h0 = std::max(n / 4, 1U); return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, false, true, false, true); @@ -300,7 +320,8 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } } -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float r_mn = static_cast(m) / static_cast(n); const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; @@ -312,57 +333,59 @@ std::pair ClGemmDefaultConfigReshapedRhsOn GEMMLHSMatrixInfo lhs_info_img; GEMMRHSMatrixInfo rhs_info_img; - if(m == 1) + if (m == 1) { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, false); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, false); - if(r_mk <= 0.0026f) + if (r_mk <= 0.0026f) { - if(r_nk <= 0.4664f) + if (r_nk <= 0.4664f) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } else { - if(r_mk <= 0.0148f) + if (r_mk <= 0.0148f) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } } else { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 8, 4, 1, 2, false, false, false, false, false); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 5, 8, 4, 1, 2, false, false, false, false, false); - if(workload <= 362.6000f) + if (workload <= 362.6000f) { return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false); } else { - if(r_mn <= 22.6067f) + if (r_mn <= 22.6067f) { - if(workload <= 708.8000f) + if (workload <= 708.8000f) { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true); return 
select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } else { @@ -371,27 +394,28 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } else { - if(r_nk <= 0.0917f) + if (r_nk <= 0.0917f) { return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } } } } -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); - if(m == 1) + if (m == 1) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false); } @@ -400,15 +424,15 @@ std::pair ClGemmDefaultConfigReshapedRhsOn const float r_mn = static_cast(m) / static_cast(n); const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - if(workload <= 7449.60f) + if (workload <= 7449.60f) { - if(workload <= 691.60f) + if (workload <= 691.60f) { return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 8, false, false, false, false, false); } else { - if(workload <= 4155.20f) + if (workload <= 4155.20f) { return configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); } @@ -420,21 +444,22 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } else { - if(workload <= 16300.80f) + if (workload <= 16300.80f) { - if(r_mn <= 44.56f) + if (r_mn <= 44.56f) { GEMMLHSMatrixInfo lhs_info_buf; GEMMRHSMatrixInfo rhs_info_buf; GEMMLHSMatrixInfo lhs_info_img; GEMMRHSMatrixInfo rhs_info_img; - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, false, true, false, false, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, false, true, false, false, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } else { @@ -448,23 +473,25 @@ std::pair ClGemmDefaultConfigReshapedRhsOn GEMMLHSMatrixInfo lhs_info_img; GEMMRHSMatrixInfo rhs_info_img; - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, true, false, false, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, true, false, false, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); return 
select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } } } -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { const unsigned int n0 = n < 1280 ? 2 : 4; const unsigned int h0 = std::max(n / n0, 1U); @@ -476,14 +503,15 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } } -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(dot8_supported(CLKernelLibrary::get().get_device())) + if (dot8_supported(CLKernelLibrary::get().get_device())) { - if(m == 1) + if (m == 1) { const unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true); @@ -497,7 +525,7 @@ std::pair ClGemmDefaultConfigReshapedRhsOn else { const int h0 = std::max(std::min(static_cast(n / 2), static_cast(128)), static_cast(1)); - if(m == 1) + if (m == 1) { return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, h0, false, true, false, true); } @@ -508,12 +536,13 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } } -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { const unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true); @@ -524,12 +553,13 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } } -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { const unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, false, true, false, true); diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h index 321cbb5250ec38fc0493af9663b74e846b394542..77c0c8d500e3fef2a210483b7a6f74701a5bf5e1 100644 --- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h +++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h @@ -45,21 +45,34 @@ public: ClGemmDefaultConfigReshapedRhsOnlyBifrost(GPUTarget gpu); // Inherited overridden method - std::pair configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair configure_G7x_f32(unsigned int m, 
unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G31_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G31_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp index d08bf84c72ee4ee3b6b726e011ae25827752ece8..da3e2ec912aefdb0a3c40c6afd3ada6cdcb6f4c8 100644 --- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp +++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp @@ -50,30 +50,35 @@ ClGemmDefaultConfigReshapedRhsOnlyValhall::ClGemmDefaultConfigReshapedRhsOnlyVal { } -std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair (ClGemmDefaultConfigReshapedRhsOnlyValhall::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); + using ConfigurationFunctionExecutorPtr = std::pair ( + ClGemmDefaultConfigReshapedRhsOnlyValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - CLGEMMConfigArray 
configs_G77(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); + CLGEMMConfigArray configs_G77( + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); - CLGEMMConfigArray configs_G78(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); + CLGEMMConfigArray configs_G78( + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); - CLGEMMConfigArray configs_G710(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); + CLGEMMConfigArray configs_G710( + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); - CLGEMMConfigArray configs_G715(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); + CLGEMMConfigArray configs_G715( + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G78: func = configs_G78.get_function(data_type); @@ -96,29 +101,29 @@ std::pair ClGemmDefaultConfigReshapedRhsOn return (this->*func)(m, n, k, b); } -std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { - if(m == 1) + if (m == 1) { const float r_mn = static_cast(m) / static_cast(n); const float r_mk = static_cast(m) / static_cast(k); - if(r_mk <= 0.0064484127797186375) + if (r_mk <= 0.0064484127797186375) { - if(r_mn <= 0.0028273810748942196) + if (r_mn <= 0.0028273810748942196) { GEMMLHSMatrixInfo lhs_info_buf; GEMMRHSMatrixInfo rhs_info_buf; GEMMLHSMatrixInfo lhs_info_img; GEMMRHSMatrixInfo rhs_info_img; - const unsigned int h0 = std::max(n / 4, 1U); + const unsigned int h0 = std::max(n / 4, 1U); std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, 0, 1, 0, 0, 1); std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, 0, 1, 0, 1, 0); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } else { @@ -127,7 +132,7 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } else { - if(r_mk <= 0.020312500186264515) + if (r_mk <= 0.020312500186264515) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, 0, 1, 0, 0, 0); } @@ -143,9 +148,9 @@ std::pair ClGemmDefaultConfigReshapedRhsOn const float 
workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; const float r_mk = static_cast(m) / static_cast(k); - if(workload <= 1999.2000122070312) + if (workload <= 1999.2000122070312) { - if(workload <= 747.1999816894531) + if (workload <= 747.1999816894531) { return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); } @@ -159,15 +164,14 @@ std::pair ClGemmDefaultConfigReshapedRhsOn std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else { - if(r_mn <= 0.03348214365541935) + if (r_mn <= 0.03348214365541935) { - if(r_mk <= 0.028125000186264515) + if (r_mk <= 0.028125000186264515) { return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); } @@ -181,8 +185,7 @@ std::pair ClGemmDefaultConfigReshapedRhsOn std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else @@ -195,168 +198,112 @@ std::pair ClGemmDefaultConfigReshapedRhsOn std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 16, 0, 1, 0, 1, 0); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } } } -std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { - const GeMMConfigsMatrix configs_1nkb_best = - { - { 1, 8984, 640, 1, 1, 8, 8, 1, 0, 1, 1, 1, 1, 0 }, - { 1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 6512, 6404, 1, 1, 4, 8, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 5304, 640, 1, 1, 4, 4, 1, 0, 1, 0, 1, 1, 0 }, - { 1, 1352, 1520, 1, 1, 2, 8, 1, 0, 1, 1, 1, 1, 0 }, - { 1, 4096, 25088, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_n_small_best = - { - { 102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0 }, - { 102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1 }, - { 16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1 }, - { 16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 1 } - }; - - const GeMMConfigsMatrix configs_mnkb_n_small_fallback = - { - { 102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0 }, - { 102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0 }, - { 16384, 4, 128, 1, 2, 2, 16, 1, 2, 1, 1, 1, 1, 0 }, - { 16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_m_gt_n_best = - { - { 25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }, - { 25584, 16, 68, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1 }, - { 369664, 32, 28, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1 }, - { 65792, 44, 24, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 23036, 56, 736, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 90968, 40, 600, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 
}, - { 16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 12604, 60, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 29584, 32, 28, 1, 4, 4, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback = - { - { 25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }, - { 25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 0, 0 }, - { 369664, 32, 28, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0 }, - { 65792, 44, 24, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 23036, 56, 736, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 90968, 40, 600, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0 }, - { 50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0 }, - { 12604, 60, 160, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0 }, - { 29584, 32, 28, 1, 4, 4, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_n_gt_m_best = - { - { 24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0 }, - { 49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 1 }, - { 49, 1024, 1024, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - }; - - const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback = - { - { 24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0 }, - { 49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 49, 1024, 1024, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0 }, + const GeMMConfigsMatrix configs_1nkb_best = { + {1, 8984, 640, 1, 1, 8, 8, 1, 0, 1, 1, 1, 1, 0}, {1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}, + {1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}, {1, 6512, 6404, 1, 1, 4, 8, 1, 0, 1, 0, 1, 0, 0}, + {1, 5304, 640, 1, 1, 4, 4, 1, 0, 1, 0, 1, 1, 0}, {1, 1352, 1520, 1, 1, 2, 8, 1, 0, 1, 1, 1, 1, 0}, + {1, 4096, 25088, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}, {1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}}; + + const GeMMConfigsMatrix configs_mnkb_n_small_best = {{102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0}, + {102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1}, + {16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1}, + {16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 1}}; + + const GeMMConfigsMatrix configs_mnkb_n_small_fallback = {{102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0}, + {102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0}, + {16384, 4, 128, 1, 2, 2, 16, 1, 2, 1, 1, 1, 1, 0}, + {16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0}}; + + const GeMMConfigsMatrix configs_mnkb_m_gt_n_best = { + {25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, {25584, 16, 68, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1}, + {369664, 32, 28, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1}, {65792, 44, 24, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}, + {23036, 56, 736, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, {90968, 40, 600, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, + {8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, {50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}, + {16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, {12604, 60, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, + {29584, 32, 28, 1, 4, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0}, + {2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}}; + + const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback = { + {25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 
0, 0}, {25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 0, 0}, + {369664, 32, 28, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0}, {65792, 44, 24, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}, + {23036, 56, 736, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {90968, 40, 600, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, + {8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0}, {50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}, + {16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0}, {12604, 60, 160, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0}, + {29584, 32, 28, 1, 4, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0}, + {2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}}; + + const GeMMConfigsMatrix configs_mnkb_n_gt_m_best = { + {24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0}, + {49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 1}, + {49, 1024, 1024, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, }; - const GeMMConfigsMatrix configs_mnkb_squared_best = - { - { 72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0 }, - { 268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0 }, - { 180, 420, 952, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 196, 512, 512, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1 }, - { 24, 88, 236, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0 }, - { 24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0 } + const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback = { + {24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0}, + {49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, + {49, 1024, 1024, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0}, }; - const GeMMConfigsMatrix configs_mnkb_squared_fallback = - { - { 72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0 }, - { 268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0 }, - { 180, 420, 952, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 196, 512, 512, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0 }, - { 24, 88, 236, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0 }, - { 24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_best_batched = - { - { 3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 24, 464, 412, 24, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_fallback_batched = - { - { 3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 4096, 48, 32, 36, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 24, 464, 412, 24, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 } - }; + const GeMMConfigsMatrix configs_mnkb_squared_best = { + {72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0}, {268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0}, + {180, 420, 952, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, {1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, + {272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {196, 512, 512, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1}, + {24, 88, 236, 1, 2, 
2, 8, 1, 64, 1, 1, 1, 1, 0}, {24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0}}; + + const GeMMConfigsMatrix configs_mnkb_squared_fallback = { + {72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0}, {268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0}, + {180, 420, 952, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, {1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, + {272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {196, 512, 512, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0}, + {24, 88, 236, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0}, {24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0}}; + + const GeMMConfigsMatrix configs_mnkb_best_batched = { + {3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, + {688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {24, 464, 412, 24, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, + {112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, + {1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}}; + + const GeMMConfigsMatrix configs_mnkb_fallback_batched = { + {3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {4096, 48, 32, 36, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, + {688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {24, 464, 412, 24, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, + {112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, + {1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}}; const GeMMConfigsMatrix *configs_best_to_use = nullptr; const GeMMConfigsMatrix *configs_fallback_to_use = nullptr; - if(b == 1) + if (b == 1) { constexpr float ratio_m_gt_n = 10.f; constexpr float ratio_n_gt_m = 0.1f; constexpr unsigned int n_small_thr = 4; const float ratio = static_cast(m) / static_cast(n); - if(m == 1) + if (m == 1) { // We do not need fallback in this case, as we never use cl_image for the rhs tensor configs_best_to_use = &configs_1nkb_best; configs_fallback_to_use = &configs_1nkb_best; } - else if(n <= n_small_thr && ratio > ratio_m_gt_n) + else if (n <= n_small_thr && ratio > ratio_m_gt_n) { configs_best_to_use = &configs_mnkb_n_small_best; configs_fallback_to_use = &configs_mnkb_n_small_fallback; } - else if(ratio > ratio_m_gt_n) + else if (ratio > ratio_m_gt_n) { configs_best_to_use = &configs_mnkb_m_gt_n_best; configs_fallback_to_use = &configs_mnkb_m_gt_n_fallback; } - else if(ratio < ratio_n_gt_m) + else if (ratio < ratio_n_gt_m) { configs_best_to_use = &configs_mnkb_n_gt_m_best; configs_fallback_to_use = &configs_mnkb_n_gt_m_fallback; @@ -381,17 +328,17 @@ std::pair ClGemmDefaultConfigReshapedRhsOn std::tie(lhs_info0, rhs_info0) = find_lhs_rhs_info(*configs_best_to_use, m, n, k, b); std::tie(lhs_info1, rhs_info1) = find_lhs_rhs_info(*configs_fallback_to_use, m, n, k, b); - return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0), - std::make_pair(lhs_info1, rhs_info1), - n, k, b, DataType::F16); + return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0), std::make_pair(lhs_info1, rhs_info1), n, k, b, + DataType::F16); } -std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { const unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, 
0, 1, 0, 1); @@ -399,7 +346,7 @@ std::pair ClGemmDefaultConfigReshapedRhsOn else { const int h0 = std::max(std::min(static_cast(n / 4), static_cast(256)), static_cast(1)); - if(m >= 28) + if (m >= 28) { return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, h0, 0, 1, 0, 1); } @@ -410,30 +357,31 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } } -std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float r_mn = static_cast(m) / static_cast(n); const float r_mk = static_cast(m) / static_cast(k); const float r_nk = static_cast(n) / static_cast(k); const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - if(m == 1) + if (m == 1) { - if(workload <= 278.7000f) + if (workload <= 278.7000f) { - if(workload <= 7.5000f) + if (workload <= 7.5000f) { return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0); } else { - if(r_mn <= 0.0031f) + if (r_mn <= 0.0031f) { - if(workload <= 256.6000f) + if (workload <= 256.6000f) { - if(workload <= 16.7500f) + if (workload <= 16.7500f) { - if(r_nk <= 1.6671f) + if (r_nk <= 1.6671f) { return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); } @@ -454,15 +402,15 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } else { - if(r_mk <= 0.0027f) + if (r_mk <= 0.0027f) { - if(r_mk <= 0.0014f) + if (r_mk <= 0.0014f) { return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); } else { - if(workload <= 8.9500f) + if (workload <= 8.9500f) { return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0); } @@ -474,13 +422,13 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } else { - if(workload <= 14.1500f) + if (workload <= 14.1500f) { return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0); } else { - if(r_mk <= 0.0041f) + if (r_mk <= 0.0041f) { return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); } @@ -495,9 +443,9 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } else { - if(workload <= 363.7000f) + if (workload <= 363.7000f) { - if(r_mk <= 0.0031f) + if (r_mk <= 0.0031f) { return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 32, 0, 1, 0, 1, 0); } @@ -514,9 +462,9 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } else { - if(workload <= 1384.8000f) + if (workload <= 1384.8000f) { - if(workload <= 704.0000f) + if (workload <= 704.0000f) { return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 32, 0, 1, 0, 1, 0); } @@ -527,9 +475,9 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } else { - if(workload <= 16761.6006f) + if (workload <= 16761.6006f) { - if(r_mn <= 187.1250f) + if (r_mn <= 187.1250f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 16, 0, 0, 0, 1, 1); } @@ -540,7 +488,7 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } else { - if(r_mk <= 432.4630f) + if (r_mk <= 432.4630f) { return configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 16, 0, 0, 0, 1, 1); } @@ -553,42 +501,37 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } } -std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; const float r_mn = static_cast(m) / static_cast(n); const float r_mk = static_cast(m) / static_cast(k); const float 
r_nk = static_cast(n) / static_cast(k); - if(m == 1) + if (m == 1) { - const GeMMConfigsMatrix configs_mnkb_best = - { - { 1, 8984, 640, 1, 1, 4, 2, 1, 0, 1, 0, 1, 1, 0 }, - { 1, 420, 392, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 644, 5288, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 6512, 6404, 1, 1, 2, 2, 1, 0, 1, 0, 1, 1, 0 }, - { 1, 5304, 640, 1, 1, 2, 2, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 4096, 25088, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 732, 8988, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 } - }; + const GeMMConfigsMatrix configs_mnkb_best = { + {1, 8984, 640, 1, 1, 4, 2, 1, 0, 1, 0, 1, 1, 0}, {1, 420, 392, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, + {1, 644, 5288, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, {1, 6512, 6404, 1, 1, 2, 2, 1, 0, 1, 0, 1, 1, 0}, + {1, 5304, 640, 1, 1, 2, 2, 1, 0, 1, 0, 1, 0, 0}, {1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, + {1, 4096, 25088, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, {1, 732, 8988, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}}; return find_lhs_rhs_info(configs_mnkb_best, m, n, k, b); } else { - if(workload <= 1384.8000f) + if (workload <= 1384.8000f) { - if(r_nk <= 0.8333f) + if (r_nk <= 0.8333f) { - if(r_mk <= 0.9119f) + if (r_mk <= 0.9119f) { return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 4, 0, 1, 0, 1, 1); } else { - if(r_nk <= 0.1181f) + if (r_nk <= 0.1181f) { return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 32, 0, 0, 1, 0, 0); } @@ -600,7 +543,7 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } else { - if(r_mk <= 1.0013f) + if (r_mk <= 1.0013f) { return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 1); } @@ -612,11 +555,11 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } else { - if(workload <= 11404.7998f) + if (workload <= 11404.7998f) { - if(r_mk <= 2.2884f) + if (r_mk <= 2.2884f) { - if(r_nk <= 0.9286f) + if (r_nk <= 0.9286f) { return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 4, 0, 1, 1, 0, 1); } @@ -632,9 +575,9 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } else { - if(r_nk <= 1.1926f) + if (r_nk <= 1.1926f) { - if(r_mn <= 1385.7917f) + if (r_mn <= 1385.7917f) { return configure_lhs_rhs_info(m, n, 6, 4, 8, 1, 4, 0, 1, 1, 0, 1); } @@ -652,12 +595,13 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } } -std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { unsigned int best_m0; unsigned int best_n0; - if(is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0)) + if (is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0)) { return configure_lhs_rhs_info(m, n, best_m0, best_n0, 1, 1, 4, false, true, false, false, true); } @@ -667,153 +611,101 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } } -std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { - const GeMMConfigsMatrix configs_1nkb_best = - { - { 1, 8984, 640, 1, 1, 2, 2, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 6512, 6404, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 5304, 640, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 4096, 25088, 1, 1, 2, 8, 1, 0, 1, 
0, 1, 1, 0 }, - { 1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 } + const GeMMConfigsMatrix configs_1nkb_best = { + {1, 8984, 640, 1, 1, 2, 2, 1, 0, 1, 0, 1, 0, 0}, {1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}, + {1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}, {1, 6512, 6404, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, + {1, 5304, 640, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, {1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, + {1, 4096, 25088, 1, 1, 2, 8, 1, 0, 1, 0, 1, 1, 0}, {1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}}; + + const GeMMConfigsMatrix configs_mnkb_n_small_best = {{102400, 4, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}, + {102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}, + {16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}, + {16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}}; + + const GeMMConfigsMatrix configs_mnkb_m_gt_n_best = { + {25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0}, {25584, 16, 68, 1, 2, 4, 16, 1, 8, 1, 1, 1, 0, 1}, + {369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0}, {65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, + {23036, 56, 736, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {8944, 32, 776, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {2688, 136, 1492, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {50176, 64, 300, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1}, {16544, 104, 160, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {12604, 60, 160, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {3728, 96, 196, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0}, {12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0}, }; - const GeMMConfigsMatrix configs_mnkb_n_small_best = - { - { 102400, 4, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 }, - { 102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 }, - { 16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 }, - { 16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 } + const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback = { + {25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0}, {25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0}, + {369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0}, {65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, + {23036, 56, 736, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0}, {90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 0}, + {8944, 32, 776, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0}, {2688, 136, 1492, 1, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0}, + {50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}, {16544, 104, 160, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0}, + {12604, 60, 160, 1, 2, 8, 8, 1, 8, 1, 1, 1, 0, 0}, {3728, 96, 196, 1, 2, 8, 8, 1, 64, 1, 1, 1, 0, 0}, + {29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0}, {12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0}, }; - const GeMMConfigsMatrix configs_mnkb_m_gt_n_best = - { - { 25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0 }, - { 25584, 16, 68, 1, 2, 4, 16, 1, 8, 1, 1, 1, 0, 1 }, - { 369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }, - { 23036, 56, 736, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 8944, 32, 776, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 2688, 136, 1492, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 50176, 64, 300, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1 }, - { 16544, 104, 160, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 12604, 60, 160, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 3728, 96, 196, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0 }, - { 12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0 }, - }; + const GeMMConfigsMatrix configs_mnkb_n_gt_m_best = {{24, 488, 88, 1, 2, 2, 
8, 1, 8, 1, 1, 1, 1, 0}, + {49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0}, + {49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0}}; - const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback = - { - { 25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0 }, - { 25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0 }, - { 369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }, - { 23036, 56, 736, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0 }, - { 90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 0 }, - { 8944, 32, 776, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0 }, - { 2688, 136, 1492, 1, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0 }, - { 50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 16544, 104, 160, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0 }, - { 12604, 60, 160, 1, 2, 8, 8, 1, 8, 1, 1, 1, 0, 0 }, - { 3728, 96, 196, 1, 2, 8, 8, 1, 64, 1, 1, 1, 0, 0 }, - { 29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0 }, - { 12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0 }, - }; + const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback = {{24, 488, 88, 1, 2, 2, 8, 1, 8, 1, 1, 1, 1, 0}, + {49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0}, + {49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0}}; - const GeMMConfigsMatrix configs_mnkb_n_gt_m_best = - { - { 24, 488, 88, 1, 2, 2, 8, 1, 8, 1, 1, 1, 1, 0 }, - { 49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0 }, - { 49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0 } + const GeMMConfigsMatrix configs_mnkb_squared_best = { + {24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0}, {24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0}, + {72, 92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0}, {268, 824, 5076, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {180, 420, 952, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1}, {1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0}, + {272, 400, 2116, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {196, 512, 512, 1, 5, 2, 8, 1, 4, 1, 1, 1, 1, 1}, }; - const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback = - { - { 24, 488, 88, 1, 2, 2, 8, 1, 8, 1, 1, 1, 1, 0 }, - { 49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0 }, - { 49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_squared_best = - { - { 24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0 }, - { 24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0 }, - { 72, 92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0 }, - { 268, 824, 5076, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 180, 420, 952, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1 }, - { 1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0 }, - { 272, 400, 2116, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 196, 512, 512, 1, 5, 2, 8, 1, 4, 1, 1, 1, 1, 1 }, + const GeMMConfigsMatrix configs_mnkb_squared_fallback = { + {24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0}, {24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0}, + {72, 92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0}, {268, 824, 5076, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, + {180, 420, 952, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0}, {1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0}, + {272, 400, 2116, 1, 2, 8, 4, 1, 4, 1, 1, 1, 0, 0}, {196, 512, 512, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0}, }; - const GeMMConfigsMatrix configs_mnkb_squared_fallback = - { - { 24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0 }, - { 24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0 }, - { 72, 92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0 }, - { 268, 824, 5076, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }, - { 180, 420, 952, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0 }, - { 1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0 }, - { 272, 400, 2116, 1, 2, 8, 4, 1, 4, 1, 1, 1, 0, 0 }, - { 196, 512, 512, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0 }, - }; 
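These tables belong to configure_G710_f16 and, together with those of configure_G77_f16 above, feed the same selection pattern: classify the GEMM shape by the m/n ratio, look up a "best" table (whose rows may ask for the RHS tensor to be exported to cl_image) and a buffer-only "fallback" table, resolve both with find_lhs_rhs_info, and let select_lhs_rhs_info keep the image-based candidate only when it is actually usable. The sketch below restates that selection logic in isolation; the thresholds are the ones in the patch, while the ShapeClass/Config names and the choose() helper are illustrative stand-ins, not the library's GEMMLHSMatrixInfo/GEMMRHSMatrixInfo API.

// Minimal sketch of the per-shape table selection used by the f16 heuristics.
enum class ShapeClass
{
    M1,      // m == 1: the "1nkb" tables, no fallback needed (cl_image is never used for the RHS)
    NSmall,  // n <= 4 and m/n > 10
    MGtN,    // m/n > 10
    NGtM,    // m/n < 0.1
    Squared  // everything else
};

ShapeClass classify(unsigned int m, unsigned int n)
{
    constexpr float        ratio_m_gt_n = 10.f;
    constexpr float        ratio_n_gt_m = 0.1f;
    constexpr unsigned int n_small_thr  = 4;

    const float ratio = static_cast<float>(m) / static_cast<float>(n);

    if (m == 1)
    {
        return ShapeClass::M1;
    }
    if (n <= n_small_thr && ratio > ratio_m_gt_n)
    {
        return ShapeClass::NSmall;
    }
    if (ratio > ratio_m_gt_n)
    {
        return ShapeClass::MGtN;
    }
    if (ratio < ratio_n_gt_m)
    {
        return ShapeClass::NGtM;
    }
    return ShapeClass::Squared;
}

// Each class maps to a (best, fallback) pair of block-size configurations. "best" may
// require exporting the RHS tensor to cl_image; the fallback is buffer-only.
struct Config
{
    unsigned int m0, n0, k0, h0;
    bool         export_rhs_to_cl_image;
};

Config choose(const Config &best, const Config &fallback, bool cl_image_usable)
{
    return (best.export_rhs_to_cl_image && !cl_image_usable) ? fallback : best;
}

For example, m = 12544 and n = 32 give m/n = 392 > 10, so the m_gt_n tables are consulted; batched workloads (b != 1) are served by the *_batched tables instead of the ratio test.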
+ const GeMMConfigsMatrix configs_mnkb_best_batched = { + {3136, 64, 64, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 1}, {4096, 48, 32, 36, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 1}, {24, 464, 412, 24, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {112, 184, 144, 28, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {5776, 64, 32, 36, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {1568, 64, 40, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1}, {2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1}}; - const GeMMConfigsMatrix configs_mnkb_best_batched = - { - { 3136, 64, 64, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 1 }, - { 4096, 48, 32, 36, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 1 }, - { 24, 464, 412, 24, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 112, 184, 144, 28, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 5776, 64, 32, 36, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 1568, 64, 40, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1 }, - { 2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1 } - }; - - const GeMMConfigsMatrix configs_mnkb_fallback_batched = - { - { 3136, 64, 64, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }, - { 4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0 }, - { 688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0 }, - { 24, 464, 412, 24, 2, 8, 4, 1, 32, 1, 1, 1, 0, 0 }, - { 112, 184, 144, 28, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0 }, - { 5776, 64, 32, 36, 2, 8, 8, 1, 32, 1, 1, 1, 0, 0 }, - { 1568, 64, 40, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0 }, - { 2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 } - }; + const GeMMConfigsMatrix configs_mnkb_fallback_batched = { + {3136, 64, 64, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, {4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0}, + {688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0}, {24, 464, 412, 24, 2, 8, 4, 1, 32, 1, 1, 1, 0, 0}, + {112, 184, 144, 28, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0}, {5776, 64, 32, 36, 2, 8, 8, 1, 32, 1, 1, 1, 0, 0}, + {1568, 64, 40, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0}, {2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}}; const GeMMConfigsMatrix *configs_best_to_use = nullptr; const GeMMConfigsMatrix *configs_fallback_to_use = nullptr; - if(b == 1) + if (b == 1) { constexpr float ratio_m_gt_n = 10.f; constexpr float ratio_n_gt_m = 0.1f; constexpr unsigned int n_small_thr = 4; const float ratio = static_cast(m) / static_cast(n); - if(m == 1) + if (m == 1) { // We do not need fallback in this case, as we never use cl_image for the rhs tensor configs_best_to_use = &configs_1nkb_best; configs_fallback_to_use = &configs_1nkb_best; } - else if(n <= n_small_thr && ratio > ratio_m_gt_n) + else if (n <= n_small_thr && ratio > ratio_m_gt_n) { configs_best_to_use = &configs_mnkb_n_small_best; configs_fallback_to_use = &configs_mnkb_n_small_best; } - else if(ratio > ratio_m_gt_n) + else if (ratio > ratio_m_gt_n) { configs_best_to_use = &configs_mnkb_m_gt_n_best; configs_fallback_to_use = &configs_mnkb_m_gt_n_fallback; } - else if(ratio < ratio_n_gt_m) + else if (ratio < ratio_n_gt_m) { configs_best_to_use = &configs_mnkb_n_gt_m_best; configs_fallback_to_use = &configs_mnkb_n_gt_m_fallback; @@ -838,17 +730,17 @@ std::pair ClGemmDefaultConfigReshapedRhsOn std::tie(lhs_info0, rhs_info0) = find_lhs_rhs_info(*configs_best_to_use, m, n, k, b); std::tie(lhs_info1, rhs_info1) = find_lhs_rhs_info(*configs_fallback_to_use, m, n, k, b); - return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0), - std::make_pair(lhs_info1, rhs_info1), - n, k, b, DataType::F16); + return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0), std::make_pair(lhs_info1, rhs_info1), n, k, b, + DataType::F16); } -std::pair 
ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { unsigned int best_m0; unsigned int best_n0; - if(is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0)) + if (is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0)) { return configure_lhs_rhs_info(m, n, best_m0, best_n0, 1, 1, 4, false, true, false, false, true); } diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h index f2952a3d30f96564212c5d587cb6cb95dd13db1b..a0ea337eb17791805c9b75ce365c67ac12509125 100644 --- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h +++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h @@ -45,17 +45,26 @@ public: ClGemmDefaultConfigReshapedRhsOnlyValhall(GPUTarget gpu); // Inherited overridden method - std::pair configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h index 1503e74eb6b63ddd5c8959d68e7aef52174de3a4..e07ad993ed45f2621d75ccf53236669604c6cedf 100644 --- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h +++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h @@ -50,7 +50,7 @@ public: */ static std::unique_ptr create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch 
(get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: case GPUTarget::BIFROST: diff --git a/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp new file mode 100644 index 0000000000000000000000000000000000000000..689a743fdf18d08e3e281c25ff29d9e6b3833c4b --- /dev/null +++ b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h" + +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/utils/math/Math.h" + +#include "src/core/helpers/WindowHelpers.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +Status validate_matmul_input_shapes(const TensorShape &lhs_shape, + const TensorShape &rhs_shape, + const MatMulKernelInfo &matmul_kernel_info) +{ + const size_t lhs_k = matmul_kernel_info.adj_lhs ? lhs_shape.y() : lhs_shape.x(); + const size_t rhs_k = matmul_kernel_info.adj_rhs ? rhs_shape.x() : rhs_shape.y(); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_k != rhs_k, "K dimension in Lhs and Rhs matrices must match."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_shape.total_size() == 0, "Lhs tensor can't be empty"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_shape.total_size() == 0, "Rhs tensor can't be empty"); + + constexpr size_t batch_dim_start = 2; + for (size_t i = batch_dim_start; i < Coordinates::num_max_dimensions; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_shape[i] != rhs_shape[i], "Batch dimension broadcasting is not supported"); + } + + return Status{}; +} + +std::pair validate_and_configure_window_for_mmul_kernels(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, + int mmul_m0, + int mmul_n0) +{ + ARM_COMPUTE_UNUSED(lhs, rhs); + + const Window win = calculate_max_window(*dst, Steps(1, 1)); + + // Collapse along the Z direction + // This collapse needs to be here in order to tune the Z dimension of LWS + Window collapsed = win.collapse(win, Window::DimZ); + + // Reconfigure window size, one arm_matrix_multiply call needs 16 threads to finish. 
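// Worked example of the arithmetic below, with illustrative values that are not part of
// the patch: m = 100, n = 56, matmul_kernel_info.m0 = 4, matmul_kernel_info.n0 = 4 and an
// MMUL block of mmul_m0 = 4 by mmul_n0 = 4 threads (4 * 4 = 16, matching the comment above).
//   n_div_n0 = ceil_to_multiple(56, 4) / 4  = 14;  padded to a multiple of mmul_n0 -> 16
//   m_div_m0 = ceil_to_multiple(100, 4) / 4 = 25;  padded to a multiple of mmul_m0 -> 28
//   x end    = 16 * mmul_m0 = 64
//   y end    = 28 / mmul_m0 = 7
// The resulting 64 x 7 = 448 work-items divide evenly into 16-thread MMUL blocks, which is
// what this reconfiguration is meant to guarantee.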
+ Window::Dimension x_dimension = collapsed.x(); + Window::Dimension y_dimension = collapsed.y(); + + const int m = dst->dimension(1); + const int n = dst->dimension(0); + + const int m0 = std::min(matmul_kernel_info.m0, m); + const int n0 = adjust_vec_size(matmul_kernel_info.n0, n); + + // Make M and N multiple of M0 and N0 respectively + const unsigned int ceil_to_multiple_n_n0 = ceil_to_multiple(n, n0); + const unsigned int ceil_to_multiple_m_m0 = ceil_to_multiple(m, m0); + + // Divide M and N by M0 and N0 respectively + const unsigned int n_div_n0 = ceil_to_multiple_n_n0 / n0; + const unsigned int m_div_m0 = ceil_to_multiple_m_m0 / m0; + + // Make n_div_n0 and m_div_m0 multiple of mmul_n0 and mmul_m0 respectively + const unsigned int ceil_to_multiple_n_div_n0_mmul_n0 = ceil_to_multiple(n_div_n0, mmul_n0); + const unsigned int ceil_to_multiple_m_div_m0_mmul_m0 = ceil_to_multiple(m_div_m0, mmul_m0); + + // Ensure x_dimension is multiple of MMUL block size (mmul_m0 * mmul_n0) + x_dimension.set_end(ceil_to_multiple_n_div_n0_mmul_n0 * mmul_m0); + y_dimension.set_end(ceil_to_multiple_m_div_m0_mmul_m0 / mmul_m0); + + collapsed.set(Window::DimX, x_dimension); + collapsed.set(Window::DimY, y_dimension); + + return std::make_pair(Status{}, collapsed); +} + +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h new file mode 100644 index 0000000000000000000000000000000000000000..c2ae2a67f4782098da72e164a2ff87e654a4428a --- /dev/null +++ b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef ACL_SRC_GPU_CL_KERNELS_HELPERS_MATMULKERNELHELPERS_H +#define ACL_SRC_GPU_CL_KERNELS_HELPERS_MATMULKERNELHELPERS_H + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/Window.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** Validate the input shapes of Matmul operation + * + * @param[in] lhs_shape Lhs tensor shape + * @param[in] rhs_shape Rhs tensor shape + * @param[in] matmul_kernel_info Matmul kernel info + * + * @return true if the shapes and matmul kernel info matches + */ +Status validate_matmul_input_shapes(const TensorShape &lhs_shape, + const TensorShape &rhs_shape, + const MatMulKernelInfo &matmul_kernel_info); + +/** Validate and configure window for Matmul MMUL kernels + * + * @param[in] lhs Lhs tensor info + * @param[in] rhs Rhs tensor info + * @param[in] dst Dst tensor info + * @param[in] matmul_kernel_info Matmul kernel info + * @param[in] mmul_m0 Number of rows in the MMUL block + * @param[in] mmul_n0 Number of columns in the MMUL block + * + * @return a pair of Status and Window object + */ +std::pair validate_and_configure_window_for_mmul_kernels(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, + int mmul_m0, + int mmul_n0); +} // namespace kernels +} // namespace opencl +} // namespace arm_compute + +#endif // ACL_SRC_GPU_CL_KERNELS_HELPERS_MATMULKERNELHELPERS_H diff --git a/src/gpu/cl/operators/ClActivation.cpp b/src/gpu/cl/operators/ClActivation.cpp index 74a818d738dd97e7723cf811f304b271d38d1637..66877ebceca6d5a9dd3a7471befb4d02df9ad3b2 100644 --- a/src/gpu/cl/operators/ClActivation.cpp +++ b/src/gpu/cl/operators/ClActivation.cpp @@ -23,19 +23,21 @@ */ #include "src/gpu/cl/operators/ClActivation.h" -#include "src/gpu/cl/ClCompileContext.h" -#include "src/gpu/cl/kernels/ClActivationKernel.h" - #include "src/common/IOperator.h" #include "src/common/utils/LegacySupport.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/ClContext.h" +#include "src/gpu/cl/kernels/ClActivationKernel.h" namespace arm_compute { namespace opencl { -void ClActivation::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClActivation::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src, dst, act_info); auto k = std::make_unique(); @@ -53,13 +55,17 @@ namespace gpu { namespace opencl { -std::tuple ClContext::create_activation(const AclTensorDescriptor &src, const AclTensorDescriptor &dst, const AclActivationDescriptor &act, bool is_validate) +std::tuple ClContext::create_activation(const AclTensorDescriptor &src, + const AclTensorDescriptor &dst, + const AclActivationDescriptor &act, + bool is_validate) { TensorInfo src_info = detail::convert_to_legacy_tensor_info(src); TensorInfo dst_info = detail::convert_to_legacy_tensor_info(dst); auto info = detail::convert_to_activation_info(act); - if(is_validate && !bool(arm_compute::opencl::ClActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info))) + if (is_validate && !bool(arm_compute::opencl::ClActivation::validate(&src_info.set_is_resizable(false), + &dst_info.set_is_resizable(false), info))) { return std::make_tuple(nullptr, StatusCode::UnsupportedConfig); } @@ 
-68,7 +74,7 @@ std::tuple ClContext::create_activation(const AclTensor act_op->configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info, info); auto op = new arm_compute::IOperator(static_cast(this)); - if(op == nullptr) + if (op == nullptr) { ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources"); return std::make_tuple(nullptr, StatusCode::OutOfMemory); diff --git a/src/gpu/cl/operators/ClActivation.h b/src/gpu/cl/operators/ClActivation.h index 348dc279297798191fcf3f2387aa7c1bbac868fd..4f25bb5f24772d03f8b130554bdcd79cf971147a 100644 --- a/src/gpu/cl/operators/ClActivation.h +++ b/src/gpu/cl/operators/ClActivation.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_ACTIVATION_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -43,7 +44,10 @@ public: * @param[out] dst Destination tensor info. Data type supported: same as @p src * @param[in] activation_info Activation layer parameters. */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ActivationLayerInfo &activation_info); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const ActivationLayerInfo &activation_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClActivation::configure() diff --git a/src/gpu/cl/operators/ClAdd.cpp b/src/gpu/cl/operators/ClAdd.cpp index b9bf505bba0cb43b1a567161306471ed5c7b9a6b..b58d0df58d552fde882aa579243ec5943a631128 100644 --- a/src/gpu/cl/operators/ClAdd.cpp +++ b/src/gpu/cl/operators/ClAdd.cpp @@ -23,17 +23,20 @@ */ #include "src/gpu/cl/operators/ClAdd.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClElementwiseKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClAdd::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +void ClAdd::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, policy, act_info); auto k = std::make_unique(); @@ -41,8 +44,11 @@ void ClAdd::configure(const ClCompileContext &compile_context, ITensorInfo *src1 _kernel = std::move(k); } -Status ClAdd::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status ClAdd::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::ADD, src1, src2, dst, policy, act_info); } diff --git a/src/gpu/cl/operators/ClAdd.h b/src/gpu/cl/operators/ClAdd.h index a17ce7b5d6a84f2828f2a5df68b61b09a5a84d40..7aed902f5d486fcf6204b841245f338878963bb8 100644 --- a/src/gpu/cl/operators/ClAdd.h +++ b/src/gpu/cl/operators/ClAdd.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_ADD_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -65,7 +66,11 @@ public: * @param[in] policy Policy to use to handle overflow. 
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, ConvertPolicy policy, + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -73,7 +78,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, ConvertPolicy policy, + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; } // namespace opencl diff --git a/src/gpu/cl/operators/ClCast.cpp b/src/gpu/cl/operators/ClCast.cpp index 05ea21b734759852c1251ed0c05affa5ca33101d..8f26ef003d6dedd667f8eca30a3acffdd6bea3fc 100644 --- a/src/gpu/cl/operators/ClCast.cpp +++ b/src/gpu/cl/operators/ClCast.cpp @@ -23,16 +23,18 @@ */ #include "src/gpu/cl/operators/ClCast.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClCastKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClCast::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy) +void ClCast::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + ConvertPolicy policy) { ARM_COMPUTE_LOG_PARAMS(src, dst, policy); auto k = std::make_unique(); diff --git a/src/gpu/cl/operators/ClCast.h b/src/gpu/cl/operators/ClCast.h index 1b67ff7c8efa1a74fbf5a39ed00575a8ada152ed..25d22936739f63d5fc58f360ca318c8bf1f0e78d 100644 --- a/src/gpu/cl/operators/ClCast.h +++ b/src/gpu/cl/operators/ClCast.h @@ -58,7 +58,8 @@ public: * @param[out] dst The destinatio tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. * @param[in] policy Conversion policy. 
*/ - void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); + void + configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClCast::configure() diff --git a/src/gpu/cl/operators/ClConcatenate.cpp b/src/gpu/cl/operators/ClConcatenate.cpp index a27fc37cc41e2457d12dfc07841e58eea222b261..31018b976857a778dc5345270e87b5f0a6617476 100644 --- a/src/gpu/cl/operators/ClConcatenate.cpp +++ b/src/gpu/cl/operators/ClConcatenate.cpp @@ -23,9 +23,14 @@ */ #include "src/gpu/cl/operators/ClConcatenate.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" #include "src/gpu/cl/kernels/ClBatchConcatenateKernel.h" #include "src/gpu/cl/kernels/ClDepthConcatenateKernel.h" #include "src/gpu/cl/kernels/ClHeightConcatenateKernel.h" @@ -33,42 +38,39 @@ #include "src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h" #include "src/gpu/cl/kernels/ClWidthConcatenateKernel.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" - -#include "src/common/utils/Log.h" -#include "src/core/helpers/AutoConfiguration.h" - namespace arm_compute { namespace opencl { -void ClConcatenate::configure(const CLCompileContext &compile_context, const std::vector &src_vector, ITensorInfo *dst, size_t axis) +void ClConcatenate::configure(const CLCompileContext &compile_context, + const std::vector &src_vector, + ITensorInfo *dst, + size_t axis) { ARM_COMPUTE_ERROR_ON(dst == nullptr); ARM_COMPUTE_LOG_PARAMS(src_vector, dst, axis); _axis = axis; _num_inputs = src_vector.size(); - TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, _axis); + TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, _axis); std::vector const_src_vector(src_vector.size()); - std::transform(src_vector.begin(), src_vector.end(), const_src_vector.begin(), [](ITensorInfo * t) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(t); - return t; - }); + std::transform(src_vector.begin(), src_vector.end(), const_src_vector.begin(), + [](ITensorInfo *t) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(t); + return t; + }); // dst auto inizialitation if not yet initialized auto_init_if_empty(*dst, dst_shape, 1, src_vector[0]->data_type()); ARM_COMPUTE_ERROR_THROW_ON(ClConcatenate::validate(const_src_vector, dst, axis)); unsigned int offset = 0; - switch(_axis) + switch (_axis) { case Window::DimX: { - switch(_num_inputs) + switch (_num_inputs) { case 2: { @@ -82,14 +84,15 @@ void ClConcatenate::configure(const CLCompileContext &compile_context, const std { // Configure WidthConcatenate4Tensors kernel auto kernel = std::make_unique(); - kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), src_vector.at(2), src_vector.at(3), dst); + kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), src_vector.at(2), + src_vector.at(3), dst); _concat_kernels.emplace_back(std::move(kernel)); break; } default: { // Configure generic case WidthConcatenate kernels - for(unsigned int i = 0; i < _num_inputs; ++i) + for (unsigned int i = 0; i < _num_inputs; ++i) { 
auto kernel = std::make_unique(); kernel->configure(compile_context, src_vector.at(i), offset, dst); @@ -103,7 +106,7 @@ void ClConcatenate::configure(const CLCompileContext &compile_context, const std } case Window::DimY: { - for(unsigned int i = 0; i < _num_inputs; ++i) + for (unsigned int i = 0; i < _num_inputs; ++i) { auto kernel = std::make_unique(); kernel->configure(compile_context, src_vector.at(i), offset, dst); @@ -114,7 +117,7 @@ void ClConcatenate::configure(const CLCompileContext &compile_context, const std } case Window::DimZ: { - for(unsigned int i = 0; i < _num_inputs; ++i) + for (unsigned int i = 0; i < _num_inputs; ++i) { auto kernel = std::make_unique(); kernel->configure(compile_context, src_vector.at(i), offset, dst); @@ -125,7 +128,7 @@ void ClConcatenate::configure(const CLCompileContext &compile_context, const std } case 3: { - for(unsigned int i = 0; i < _num_inputs; ++i) + for (unsigned int i = 0; i < _num_inputs; ++i) { auto kernel = std::make_unique(); kernel->configure(compile_context, src_vector.at(i), offset, dst); @@ -148,25 +151,27 @@ Status ClConcatenate::validate(const std::vector &src_vecto ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2); unsigned int offset = 0; - switch(axis) + switch (axis) { case Window::DimX: { - switch(num_inputs) + switch (num_inputs) { case 2: // Validate WidthConcatenate2Tensors kernels if there are 2 inputs ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1]); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate2TensorsKernel::validate(src_vector[0], src_vector[1], dst)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClWidthConcatenate2TensorsKernel::validate(src_vector[0], src_vector[1], dst)); break; case 4: // Validate WidthConcatenate4Tensors kernels if there are 4 inputs ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1], src_vector[2], src_vector[3]); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate4TensorsKernel::validate(src_vector[0], src_vector[1], src_vector[2], src_vector[3], dst)); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate4TensorsKernel::validate( + src_vector[0], src_vector[1], src_vector[2], src_vector[3], dst)); break; default: // Validate generic case of WidthConcatenate kernel - for(const auto &src : src_vector) + for (const auto &src : src_vector) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenateKernel::validate(src, offset, dst)); @@ -178,7 +183,7 @@ Status ClConcatenate::validate(const std::vector &src_vecto } case Window::DimY: { - for(const auto &src : src_vector) + for (const auto &src : src_vector) { ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClHeightConcatenateKernel::validate(src, offset, dst)); offset += src->dimension(axis); @@ -187,7 +192,7 @@ Status ClConcatenate::validate(const std::vector &src_vecto } case Window::DimZ: { - for(const auto &src : src_vector) + for (const auto &src : src_vector) { ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDepthConcatenateKernel::validate(src, offset, dst)); offset += src->dimension(axis); @@ -196,7 +201,7 @@ Status ClConcatenate::validate(const std::vector &src_vecto } case 3: { - for(const auto &src : src_vector) + for (const auto &src : src_vector) { ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClBatchConcatenateKernel::validate(src, offset, dst)); offset += src->dimension(axis); @@ -207,7 +212,7 @@ Status ClConcatenate::validate(const std::vector &src_vecto ARM_COMPUTE_ERROR("Axis not supported"); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { TensorShape dst_shape 
= arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, axis); ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != dst->tensor_shape().total_size()); @@ -218,17 +223,17 @@ Status ClConcatenate::validate(const std::vector &src_vecto void ClConcatenate::run(ITensorPack &tensors) { - if(tensors.empty()) + if (tensors.empty()) { ARM_COMPUTE_ERROR("No inputs provided"); } - if(static_cast(tensors.size()) - 1 != static_cast(_num_inputs)) + if (static_cast(tensors.size()) - 1 != static_cast(_num_inputs)) { ARM_COMPUTE_ERROR("Configured with different number of inputs"); } - if(_axis == Window::DimX && (_num_inputs == 2 || _num_inputs == 4)) + if (_axis == Window::DimX && (_num_inputs == 2 || _num_inputs == 4)) { ARM_COMPUTE_ERROR_ON(_concat_kernels.empty()); CLScheduler::get().enqueue_op(*_concat_kernels.at(0), tensors, true); @@ -236,7 +241,7 @@ void ClConcatenate::run(ITensorPack &tensors) else { int i = 0; - for(auto &k : _concat_kernels) + for (auto &k : _concat_kernels) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i)); diff --git a/src/gpu/cl/operators/ClConcatenate.h b/src/gpu/cl/operators/ClConcatenate.h index de0cf84d2ca01c2190687d18a092fb1347815c4d..d8ce9d2a5c7ff400b0e472c18ceb922a72b079a3 100644 --- a/src/gpu/cl/operators/ClConcatenate.h +++ b/src/gpu/cl/operators/ClConcatenate.h @@ -57,7 +57,10 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src_vector. * @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3. */ - void configure(const ClCompileContext &compile_context, const std::vector &src_vector, ITensorInfo *dst, size_t axis); + void configure(const ClCompileContext &compile_context, + const std::vector &src_vector, + ITensorInfo *dst, + size_t axis); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClConcatenate::configure() @@ -71,8 +74,8 @@ public: private: std::vector> _concat_kernels{}; - unsigned int _num_inputs{ 0 }; - unsigned int _axis{ 0 }; + unsigned int _num_inputs{0}; + unsigned int _axis{0}; }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClConv2d.cpp b/src/gpu/cl/operators/ClConv2d.cpp index 51248d4a7ad9556e2ea897c8adf5cca87735ed2a..2c3b0214fac3d064db508be6d8d7e74331f452f6 100644 --- a/src/gpu/cl/operators/ClConv2d.cpp +++ b/src/gpu/cl/operators/ClConv2d.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,17 +23,17 @@ */ #include "src/gpu/cl/operators/ClConv2d.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h" + +#include "src/common/utils/Log.h" #include "src/gpu/cl/operators/ClDirectConv2d.h" #include "src/gpu/cl/operators/ClGemmConv2d.h" #include "src/gpu/cl/operators/ClIndirectConv2d.h" #include "src/gpu/cl/operators/ClWinogradConv2d.h" -#include "src/common/utils/Log.h" - #include namespace @@ -48,7 +48,7 @@ namespace */ size_t get_direct_conv_kernel_threshold_nhwc(arm_compute::GPUTarget gpu_target) { - switch(gpu_target) + switch (gpu_target) { case arm_compute::GPUTarget::G76: case arm_compute::GPUTarget::G77: @@ -71,35 +71,39 @@ namespace opencl { using namespace arm_compute::misc::shape_calculator; -ClConv2d::ClConv2d() - : _operator() +ClConv2d::ClConv2d() : _operator() { } ClConv2d::~ClConv2d() = default; -void ClConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &conv2d_info, - const WeightsInfo &weights_info) +void ClConv2d::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(ClConv2d::validate(src, weights, ((biases != nullptr) ? biases : nullptr), dst, conv2d_info, weights_info)); + ARM_COMPUTE_ERROR_THROW_ON( + ClConv2d::validate(src, weights, ((biases != nullptr) ? 
biases : nullptr), dst, conv2d_info, weights_info)); ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv2d_info, weights_info); - switch(ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, CLScheduler::get().target())) + switch (ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, CLScheduler::get().target())) { case ConvolutionMethod::WINOGRAD: { ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1); - ARM_COMPUTE_ERROR_ON(conv2d_info.post_ops.size() > 0); auto f = std::make_unique(); - f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, conv2d_info.enable_fast_math); + f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, + conv2d_info.enable_fast_math); _operator = std::move(f); break; } case ConvolutionMethod::DIRECT: { ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1); - ARM_COMPUTE_ERROR_ON(conv2d_info.post_ops.size() > 0); auto f = std::make_unique(); f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info); _operator = std::move(f); @@ -108,7 +112,6 @@ void ClConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *s case ConvolutionMethod::INDIRECT: { ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1); - ARM_COMPUTE_ERROR_ON(conv2d_info.post_ops.size() > 0); auto f = std::make_unique(); f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info); _operator = std::move(f); @@ -128,38 +131,46 @@ void ClConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *s _aux_mem = _operator->workspace(); } -Status ClConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &conv2d_info, +Status ClConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv2dInfo &conv2d_info, const WeightsInfo &weights_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), + "Grouping (num_groups != 1) with NHWC data layout is not supported"); const GPUTarget gpu_target = CLScheduler::get().target(); - switch(ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, gpu_target)) + switch (ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, gpu_target)) { case ConvolutionMethod::WINOGRAD: { //Validate Winograd - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClWinogradConv2d is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.post_ops.size() > 0, "ClWinogradConv2d does not support PostOps"); - ARM_COMPUTE_RETURN_ON_ERROR(ClWinogradConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, conv2d_info.enable_fast_math)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, + "Grouping (num_groups != 1) with ClWinogradConv2d is not supported"); + ARM_COMPUTE_RETURN_ON_ERROR(ClWinogradConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, + conv2d_info.act_info, conv2d_info.enable_fast_math)); break; } case ConvolutionMethod::DIRECT: { // Validate 
direct convolution layer - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClDirectConv2d is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.post_ops.size() > 0, "ClDirectConv2d does not support PostOps"); - ARM_COMPUTE_RETURN_ON_ERROR(ClDirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, + "Grouping (num_groups != 1) with ClDirectConv2d is not supported"); + ARM_COMPUTE_RETURN_ON_ERROR( + ClDirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info)); break; } case ConvolutionMethod::INDIRECT: { // Validate indirect convolution layer - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClIndirectConv2d is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.post_ops.size() > 0, "ClIndirectConv2d does not support PostOps"); - ARM_COMPUTE_RETURN_ON_ERROR(ClIndirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, + "Grouping (num_groups != 1) with ClIndirectConv2d is not supported"); + ARM_COMPUTE_RETURN_ON_ERROR( + ClIndirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info)); break; } case ConvolutionMethod::GEMM: @@ -176,8 +187,12 @@ Status ClConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, co return Status{}; } -ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dInfo &conv2d_info, - const WeightsInfo &weights_info, const GPUTarget gpu_target) +ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info, + const GPUTarget gpu_target) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_ERROR_ON_NULLPTR(dst); @@ -197,20 +212,35 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const using ConvolutionConfiguration = std::tuple; using ConfigurationMethod = std::pair; - const std::vector known_configs = - { + const std::vector known_configs = { // Alexnet - ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U), DataLayout::NCHW), ConvolutionMethod::DIRECT), + ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), + PadStrideInfo(1U, 1U, 2U, 2U), DataLayout::NCHW), + ConvolutionMethod::DIRECT), // VGG16 / VGG19 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U), DataLayout::NCHW), ConvolutionMethod::DIRECT), + ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), + PadStrideInfo(1U, 1U, 1U, 1U), DataLayout::NCHW), + ConvolutionMethod::DIRECT), // Mobilenet 224 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM), + ConfigurationMethod(ConvolutionConfiguration( + Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), + ConvolutionMethod::GEMM), // Mobilenet 
160 - ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM), + ConfigurationMethod(ConvolutionConfiguration( + Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), + ConvolutionMethod::GEMM), // Mobilenet 224 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM), + ConfigurationMethod(ConvolutionConfiguration( + Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), + ConvolutionMethod::GEMM), // Mobilenet 160 - ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM), + ConfigurationMethod(ConvolutionConfiguration( + Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), + ConvolutionMethod::GEMM), }; const auto find_config = [&](ConfigurationMethod c) @@ -219,76 +249,89 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const const PadStrideInfo info = std::get<3>(config); const DataLayout data_layout = std::get<4>(config); - return std::get<0>(config) == Size2D(src->dimension(idx_w), src->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) - && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() - && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride() && (data_layout == src->data_layout()); + return std::get<0>(config) == Size2D(src->dimension(idx_w), src->dimension(idx_h)) && + std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) && + std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && + info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() && + info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && + info.stride() == conv_info.stride() && (data_layout == src->data_layout()); }; std::vector::const_iterator found; - if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end()) + if ((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end()) { return (*found).second; } - if(dilation != Size2D(1U, 1U)) + if (dilation != Size2D(1U, 1U)) { return ConvolutionMethod::GEMM; } else { - if(src->data_layout() == DataLayout::NCHW) + if (src->data_layout() == DataLayout::NCHW) { // SRGAN - if((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv_info.pad_top() < 3) - && (ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info))) + if ((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && + (conv_info.pad_top() < 3) && + (ClDirectConv2d::validate(src, weights, nullptr, dst, 
conv_info, act_info))) { return ConvolutionMethod::DIRECT; } - if((weights->dimension(idx_h) > 5) && (src->dimension(idx_c) > dst->dimension(idx_c)) && (CLFFTConvolutionLayer::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math))) + if ((weights->dimension(idx_h) > 5) && (src->dimension(idx_c) > dst->dimension(idx_c)) && + (CLFFTConvolutionLayer::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math))) { return ConvolutionMethod::FFT; } - if(src->dimension(idx_c) < 16) + if (src->dimension(idx_c) < 16) { return ConvolutionMethod::GEMM; } - return bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM; + return bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)) + ? ConvolutionMethod::WINOGRAD + : ConvolutionMethod::GEMM; } else { - const bool is_direct_valid = bool(ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info)); - const bool is_wino_valid = bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)); + const bool is_direct_valid = + bool(ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info)); + const bool is_wino_valid = + bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)); const size_t kernel_sz_direct_conv_thr = get_direct_conv_kernel_threshold_nhwc(gpu_target); // SRGAN case - if((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv_info.pad_top() < 3) - && is_direct_valid) + if ((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && + (conv_info.pad_top() < 3) && is_direct_valid) { return ConvolutionMethod::DIRECT; } // Floating-point case: GeMM/Direct/Winograd - if(is_data_type_float(src->data_type())) + if (is_data_type_float(src->data_type())) { // Get dst shape - TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); - const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr); - const bool is_ifm_ge_8 = src->dimension(idx_c) >= 8; - const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16; - const bool is_ofm_lte_8 = weights->dimension(3U) <= 8; - const bool is_ofm_lt_64 = weights->dimension(3U) < 64; - const bool workload_gte_8192 = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 >= 8192; - const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U); - const bool is_m_one = output_shape[1] * output_shape[2] == 1; - const bool is_unit_stride = (conv2d_info.conv_info.stride().first == 1) && (conv2d_info.conv_info.stride().second == 1); - const int32_t kernel_sz = weights->dimension(idx_w) * weights->dimension(idx_h); + TensorShape output_shape = + misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); + const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && + (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr); + const bool is_ifm_ge_8 = src->dimension(idx_c) >= 8; + const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16; + const bool is_ofm_lte_8 = weights->dimension(3U) <= 8; + const bool is_ofm_lt_64 = weights->dimension(3U) < 64; + const bool workload_gte_8192 = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 
>= 8192; + const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U); + const bool is_m_one = output_shape[1] * output_shape[2] == 1; + const bool is_unit_stride = + (conv2d_info.conv_info.stride().first == 1) && (conv2d_info.conv_info.stride().second == 1); + const int32_t kernel_sz = weights->dimension(idx_w) * weights->dimension(idx_h); // Run Winograd if valid and IFM >= 8 - if(is_wino_valid && is_ifm_ge_8) + if (is_wino_valid && is_ifm_ge_8) { - if(is_ofm_lte_8) + if (is_ofm_lte_8) { - if(gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD) + if (gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || + get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD) { return ConvolutionMethod::WINOGRAD; } @@ -300,18 +343,19 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const } // Direct convolution case - if(is_direct_valid) + if (is_direct_valid) { - if((gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD)) + if ((gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || + get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD)) { - if(is_large_kernel_sz && is_ifm_ge_16 && is_ifm_gt_ofm) + if (is_large_kernel_sz && is_ifm_ge_16 && is_ifm_gt_ofm) { return ConvolutionMethod::DIRECT; } } - else if(gpu_target == arm_compute::GPUTarget::G76) + else if (gpu_target == arm_compute::GPUTarget::G76) { - if((is_large_kernel_sz && workload_gte_8192 && is_ifm_ge_16) || (is_ofm_lte_8 && is_ifm_ge_16)) + if ((is_large_kernel_sz && workload_gte_8192 && is_ifm_ge_16) || (is_ofm_lte_8 && is_ifm_ge_16)) { return ConvolutionMethod::DIRECT; } @@ -320,21 +364,24 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const { ConvolutionMethod preferred_conv_method = ConvolutionMethod::DIRECT; - const bool is_indirect_valid = bool(ClIndirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info)); + const bool is_indirect_valid = + bool(ClIndirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info)); // indirect conv2d should be called when: // 1- When the kernel size is greater than 1x1 and less than or equal to 9x9 (81) // 2- When the kernel size is odd // 3- When the Gpu target is Arm Mali-G77 - if(is_indirect_valid) + if (is_indirect_valid) { const bool is_kernel_sz_odd = kernel_sz % 2; const bool is_g77 = gpu_target == GPUTarget::G77; - preferred_conv_method = (kernel_sz > 1) && (kernel_sz <= 81) && is_kernel_sz_odd && is_g77? ConvolutionMethod::INDIRECT : ConvolutionMethod::DIRECT; + preferred_conv_method = (kernel_sz > 1) && (kernel_sz <= 81) && is_kernel_sz_odd && is_g77 + ? ConvolutionMethod::INDIRECT + : ConvolutionMethod::DIRECT; } // Direct/indirect convolution used for the first layer of the network - if(workload_gte_8192 && !is_ifm_ge_16 && !is_unit_stride && is_ofm_lt_64) + if (workload_gte_8192 && !is_ifm_ge_16 && !is_unit_stride && is_ofm_lt_64) { // In general, the question we should ask for the first convolution layer of a model is: // when the execution time of im2col + gemm < direct?. 
Since im2col does not depend on the OFM, it means that @@ -343,13 +390,13 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const return preferred_conv_method; } - if((is_large_kernel_sz || is_m_one) && workload_gte_8192 && is_ifm_ge_16) + if ((is_large_kernel_sz || is_m_one) && workload_gte_8192 && is_ifm_ge_16) { return preferred_conv_method; } // Direct convolution used for the last layer of the network - if(is_ofm_lte_8) + if (is_ofm_lte_8) { return preferred_conv_method; } diff --git a/src/gpu/cl/operators/ClConv2d.h b/src/gpu/cl/operators/ClConv2d.h index c6c366a7628e61056f72e99142b942407f4314ec..0cf3cbc1ceef91b660b69b7a10a8d82c70f87547 100644 --- a/src/gpu/cl/operators/ClConv2d.h +++ b/src/gpu/cl/operators/ClConv2d.h @@ -26,6 +26,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/FunctionDescriptors.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" #include "src/gpu/cl/IClOperator.h" @@ -112,15 +113,24 @@ public: * @param[in] conv2d_info Contains convolution 2d info described in @ref Conv2dInfo. * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p src. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &conv2d_info, - const WeightsInfo &weights_info = WeightsInfo()); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration of @ref ClConv2d * * Similar to ClConv2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &conv2d_info, + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv2dInfo &conv2d_info, const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will return the convolution called by @ref ClConv2d * @@ -137,11 +147,15 @@ public: * * @return the Convolution Method Hint */ - static ConvolutionMethod get_convolution_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dInfo &conv2d_info, - const WeightsInfo &weights_info, const GPUTarget gpu_target); + static ConvolutionMethod get_convolution_method(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info, + const GPUTarget gpu_target); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp index 08122b685278d0e25c2f417dab380a46ed1b0406..cf24c68d21810f2a4d3c9587dc7314c7203e15a2 100644 --- a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp +++ b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp @@ -23,16 +23,19 @@ */ #include 
"src/gpu/cl/operators/ClConvertFullyConnectedWeights.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClConvertFullyConnectedWeights::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout) +void ClConvertFullyConnectedWeights::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout) { ARM_COMPUTE_LOG_PARAMS(src, dst, original_src_shape, data_layout); auto k = std::make_unique(); @@ -40,9 +43,12 @@ void ClConvertFullyConnectedWeights::configure(const ClCompileContext &compile_c _kernel = std::move(k); } -Status ClConvertFullyConnectedWeights::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout) +Status ClConvertFullyConnectedWeights::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout) { return kernels::ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout); } } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h index 2794eb17b0ae54273863a34aab05d7f42c43e397..c46152081c1dbe873eef137a7f04516b42eb53c4 100644 --- a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h +++ b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h @@ -43,14 +43,21 @@ public: * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer). * @param[in] data_layout The data layout the weights have been trained in. 
*/ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClConvertFullyConnectedWeights::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout); }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClCopy.cpp b/src/gpu/cl/operators/ClCopy.cpp index d3b83040d083bfece057223d798d1878f4296823..e2be7cebd4d77018c5c95a3964141c55b8f12b6e 100644 --- a/src/gpu/cl/operators/ClCopy.cpp +++ b/src/gpu/cl/operators/ClCopy.cpp @@ -23,11 +23,10 @@ */ #include "src/gpu/cl/operators/ClCopy.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClCopyKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl @@ -45,4 +44,4 @@ Status ClCopy::validate(const ITensorInfo *src, const ITensorInfo *dst, Window * return kernels::ClCopyKernel::validate(src, dst, dst_window); } } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClCopy.h b/src/gpu/cl/operators/ClCopy.h index 9b427f967597029e6ea8d9a70286724c537de7fb..fe9b58c607a75a5f174b5044e7a334ce2df2c8b4 100644 --- a/src/gpu/cl/operators/ClCopy.h +++ b/src/gpu/cl/operators/ClCopy.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_COPY_H #include "arm_compute/core/Window.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -44,7 +45,10 @@ public: * @param[in] dst_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr. 
* */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, Window *dst_window = nullptr); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + Window *dst_window = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClCopy::configure() diff --git a/src/gpu/cl/operators/ClCrop.cpp b/src/gpu/cl/operators/ClCrop.cpp index cef9f14c7dc8caf1ed134586b0eb95bab1630a40..6313e4fbb527657faadd00fdeced62705982eab2 100644 --- a/src/gpu/cl/operators/ClCrop.cpp +++ b/src/gpu/cl/operators/ClCrop.cpp @@ -23,17 +23,22 @@ */ #include "src/gpu/cl/operators/ClCrop.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClCropKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClCrop::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, - Window *dst_window) +void ClCrop::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) { ARM_COMPUTE_LOG_PARAMS(src, dst, start, end, batch_index, extrapolation_value, dst_window); auto k = std::make_unique(); @@ -41,9 +46,15 @@ void ClCrop::configure(const ClCompileContext &compile_context, const ITensorInf _kernel = std::move(k); } -Status ClCrop::validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window) +Status ClCrop::validate(const ITensorInfo *src, + const ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) { return kernels::ClCropKernel::validate(src, dst, start, end, batch_index, extrapolation_value, dst_window); } } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClCrop.h b/src/gpu/cl/operators/ClCrop.h index 1cf1c9bff4938b33035f7b711b1d2e79d7706366..e845cf372c3535d98f8b19dc25d4152fdc5e343a 100644 --- a/src/gpu/cl/operators/ClCrop.h +++ b/src/gpu/cl/operators/ClCrop.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_CROP_H #include "arm_compute/core/Window.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -49,16 +50,27 @@ public: * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0. * @param[in] dst_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr. 
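Returning briefly to the ClConv2d::get_convolution_method() hunks reformatted earlier in this patch: the function first consults a hard-coded known_configs table, forces GEMM whenever dilation is not 1x1, and then applies layout-specific rules (for NCHW, GEMM below 16 input channels and Winograd when its validate() passes; for NHWC floating point, Winograd when valid and IFM >= 8, with further direct/indirect refinements keyed on GPU target, kernel size and workload). The sketch below paraphrases only that top-level decision order; every name in it is invented for illustration, and the NHWC direct/indirect refinements and the SRGAN/FFT special cases are deliberately left out.

#include <cstdint>

enum class ConvMethod { GEMM, DIRECT, WINOGRAD };

// Hypothetical, heavily simplified view of the inputs the heuristic looks at;
// the real function works on ITensorInfo, Conv2dInfo, WeightsInfo and GPUTarget.
struct ConvQuery
{
    bool       known_config_hit;    // found in the hard-coded known_configs table
    ConvMethod known_config_method; // method recorded for that table entry
    bool       has_dilation;        // dilation != 1x1
    bool       is_nchw;             // data layout
    uint32_t   ifm;                 // number of input channels
    bool       winograd_valid;      // ClWinogradConv2d::validate() succeeded
};

// Decision order only; per-GPU-target direct/indirect rules, workload thresholds
// and the SRGAN/FFT cases in the real heuristic are not modelled here.
ConvMethod select_conv_method(const ConvQuery &q)
{
    if (q.known_config_hit)
    {
        return q.known_config_method;
    }
    if (q.has_dilation)
    {
        return ConvMethod::GEMM;
    }
    if (q.is_nchw)
    {
        if (q.ifm < 16)
        {
            return ConvMethod::GEMM;
        }
        return q.winograd_valid ? ConvMethod::WINOGRAD : ConvMethod::GEMM;
    }
    // NHWC floating point: prefer Winograd when valid and there are enough input
    // channels; otherwise the omitted direct/indirect rules decide, with GEMM as
    // the final fallback.
    if (q.winograd_valid && q.ifm >= 8)
    {
        return ConvMethod::WINOGRAD;
    }
    return ConvMethod::GEMM;
}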
*/ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, - Window *dst_window = nullptr); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value = 0, + Window *dst_window = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClCrop::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, - Window *dst_window = nullptr); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value = 0, + Window *dst_window = nullptr); }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClDequantize.cpp b/src/gpu/cl/operators/ClDequantize.cpp index 0fccab63e04a0a95d111627256d65a2eb7b7c77c..eb6f9e7abb67290e051142f2f4ff61fd7ca0c591 100644 --- a/src/gpu/cl/operators/ClDequantize.cpp +++ b/src/gpu/cl/operators/ClDequantize.cpp @@ -25,10 +25,10 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/gpu/cl/ClCompileContext.h" -#include "src/gpu/cl/kernels/ClDequantizeKernel.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClDequantizeKernel.h" namespace arm_compute { diff --git a/src/gpu/cl/operators/ClDirectConv2d.cpp b/src/gpu/cl/operators/ClDirectConv2d.cpp index 0215dba422d24d3a3de13b66667bfa6f3832f629..17a196ce6b366891cbb61bdec087587f11a79db7 100644 --- a/src/gpu/cl/operators/ClDirectConv2d.cpp +++ b/src/gpu/cl/operators/ClDirectConv2d.cpp @@ -26,6 +26,8 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/gpu/cl/kernels/ClActivationKernel.h" @@ -35,8 +37,6 @@ #include "src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h" #include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h" -#include "src/common/utils/Log.h" - using namespace arm_compute::cl_direct_conv; namespace arm_compute @@ -53,7 +53,8 @@ ITensorPack select_activation_src_dst(ITensorPack &tensors) return pack; } -DirectConvComputeKernelInfo config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo +config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info) { // Get GPU target GPUTarget gpu_target = CLScheduler::get().target(); @@ -65,8 +66,13 @@ DirectConvComputeKernelInfo config_direct_convolution_nhwc(const ITensorInfo *sr } // namespace -void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void ClDirectConv2d::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo 
*dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info); @@ -75,15 +81,17 @@ void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorI const DirectConvComputeKernelInfo desc = config_direct_convolution_nhwc(src, weights, conv_info); // Configure direct convolution kernel - const ActivationLayerInfo conv2d_act_info = (src->data_layout() == DataLayout::NHWC && is_data_type_float(src->data_type())) ? act_info : ActivationLayerInfo(); - auto k = std::make_unique(); + const ActivationLayerInfo conv2d_act_info = + (src->data_layout() == DataLayout::NHWC && is_data_type_float(src->data_type())) ? act_info + : ActivationLayerInfo(); + auto k = std::make_unique(); k->set_target(CLScheduler::get().target()); k->configure(compile_context, src, weights, biases, dst, conv_info, conv2d_act_info, desc); _direct_conv_kernel = std::move(k); // Configure border handler PixelValue zero_value(0.f); - if(is_data_type_quantized_asymmetric(src->data_type())) + if (is_data_type_quantized_asymmetric(src->data_type())) { zero_value = PixelValue(0, src->data_type(), src->quantization_info()); } @@ -92,7 +100,7 @@ void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorI _src_border_handler = std::move(b); // Fused activation is currently supported for NHWC and floating point types - if(act_info.enabled() && !conv2d_act_info.enabled()) + if (act_info.enabled() && !conv2d_act_info.enabled()) { auto a = std::make_unique(); a->configure(compile_context, dst, dst, act_info); @@ -103,14 +111,19 @@ void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorI CLScheduler::get().tune_kernel_static(*_direct_conv_kernel); } -Status ClDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +Status ClDirectConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { // Initialize the direct convolution descriptor const DirectConvComputeKernelInfo desc = config_direct_convolution_nhwc(src, weights, conv_info); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConv2dKernel::validate(src, weights, biases, dst, conv_info, ActivationLayerInfo(), desc)); - if(act_info.enabled()) + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClDirectConv2dKernel::validate(src, weights, biases, dst, conv_info, ActivationLayerInfo(), desc)); + if (act_info.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, dst, act_info)); } @@ -124,7 +137,7 @@ void ClDirectConv2d::run(ITensorPack &tensors) // Run direct convolution CLScheduler::get().enqueue_op(*_direct_conv_kernel.get(), tensors, false); // Run activation kernel - if(_activation_kernel) + if (_activation_kernel) { auto act_pack = select_activation_src_dst(tensors); CLScheduler::get().enqueue_op(*_activation_kernel.get(), act_pack, false); diff --git a/src/gpu/cl/operators/ClDirectConv2d.h b/src/gpu/cl/operators/ClDirectConv2d.h index fedb9e971e325766f8337cc20453721465c1c759..0f18490814a72d639fd6f13b040b74e2f7b4b895 100644 --- a/src/gpu/cl/operators/ClDirectConv2d.h +++ b/src/gpu/cl/operators/ClDirectConv2d.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_DIRECT_CONV2D_H #include 
"arm_compute/function_info/ActivationLayerInfo.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" #include "src/gpu/cl/IClOperator.h" @@ -59,7 +60,12 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -67,16 +73,20 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited method overridden void run(ITensorPack &tensors) override; private: - std::unique_ptr _direct_conv_kernel{ nullptr }; - std::unique_ptr _src_border_handler{ nullptr }; - std::unique_ptr _activation_kernel{ nullptr }; + std::unique_ptr _direct_conv_kernel{nullptr}; + std::unique_ptr _src_border_handler{nullptr}; + std::unique_ptr _activation_kernel{nullptr}; }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClDirectConv3d.cpp b/src/gpu/cl/operators/ClDirectConv3d.cpp index 5d37f07f312c2582230c97c634ec40e63665965b..b08347936b38e61cdf0fa8df3c0de9bb6a3cd9ee 100644 --- a/src/gpu/cl/operators/ClDirectConv3d.cpp +++ b/src/gpu/cl/operators/ClDirectConv3d.cpp @@ -24,13 +24,19 @@ #include "src/gpu/cl/operators/ClDirectConv3d.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/gpu/cl/kernels/ClDirectConv3dKernel.h" namespace arm_compute { namespace opencl { -void ClDirectConv3d::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv3d_info) +void ClDirectConv3d::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + const Conv3dInfo &conv3d_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0); @@ -40,7 +46,11 @@ void ClDirectConv3d::configure(const CLCompileContext &compile_context, const IT _direct_conv3d_kernel = std::move(k); } -Status ClDirectConv3d::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info) +Status ClDirectConv3d::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv3d_info) { ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConv3dKernel::validate(src0, src1, src2, dst, conv3d_info)); return Status{}; diff --git a/src/gpu/cl/operators/ClDirectConv3d.h b/src/gpu/cl/operators/ClDirectConv3d.h index fa58b5aedd5d453ad6c05213b02614f24eef3dbf..5fb32460e25ab86b123f5a60da45db7b5da3e923 100644 --- a/src/gpu/cl/operators/ClDirectConv3d.h +++ b/src/gpu/cl/operators/ClDirectConv3d.h @@ -67,7 +67,12 @@ public: * @param[in] conv3d_info Contains strides, padding, rounding, 
activation, dilation and fast math information. Activation and fast math are currently unused. * */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv3d_info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + const Conv3dInfo &conv3d_info); /** Static function to check if given info will lead to a valid configuration * @@ -75,14 +80,18 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv3d_info); // Inherited method overridden void run(ITensorPack &tensors) override; private: - std::unique_ptr _direct_conv3d_kernel{ nullptr }; + std::unique_ptr _direct_conv3d_kernel{nullptr}; }; } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_DIRECT_CONV3D_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_CL_DIRECT_CONV3D_H */ diff --git a/src/gpu/cl/operators/ClElementwiseOperations.cpp b/src/gpu/cl/operators/ClElementwiseOperations.cpp index 32d2b88798b43296c77f04a9d6037a4158291fb0..1325371d19555836a83bc8b74a4ce2f0a98362df 100644 --- a/src/gpu/cl/operators/ClElementwiseOperations.cpp +++ b/src/gpu/cl/operators/ClElementwiseOperations.cpp @@ -23,15 +23,18 @@ */ #include "src/gpu/cl/operators/ClElementwiseOperations.h" -#include "src/gpu/cl/kernels/ClElementwiseKernel.h" - #include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" namespace arm_compute { namespace opencl { -void ClElementwiseDivision::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClElementwiseDivision::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); auto k = std::make_unique(); @@ -39,12 +42,19 @@ void ClElementwiseDivision::configure(const ClCompileContext &compile_context, I _kernel = std::move(k); } -Status ClElementwiseDivision::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClElementwiseDivision::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { return kernels::ClArithmeticKernel::validate(ArithmeticOperation::DIV, src1, src2, dst, act_info); } -void ClElementwiseMax::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClElementwiseMax::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); auto k = std::make_unique(); @@ -52,12 +62,19 @@ void ClElementwiseMax::configure(const ClCompileContext &compile_context, ITenso _kernel = std::move(k); } -Status ClElementwiseMax::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status 
ClElementwiseMax::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { return kernels::ClArithmeticKernel::validate(ArithmeticOperation::MAX, src1, src2, dst, act_info); } -void ClElementwiseMin::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClElementwiseMin::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); auto k = std::make_unique(); @@ -65,12 +82,19 @@ void ClElementwiseMin::configure(const ClCompileContext &compile_context, ITenso _kernel = std::move(k); } -Status ClElementwiseMin::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClElementwiseMin::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { return kernels::ClArithmeticKernel::validate(ArithmeticOperation::MIN, src1, src2, dst, act_info); } -void ClElementwiseSquaredDiff::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClElementwiseSquaredDiff::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); auto k = std::make_unique(); @@ -78,12 +102,19 @@ void ClElementwiseSquaredDiff::configure(const ClCompileContext &compile_context _kernel = std::move(k); } -Status ClElementwiseSquaredDiff::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClElementwiseSquaredDiff::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { return kernels::ClArithmeticKernel::validate(ArithmeticOperation::SQUARED_DIFF, src1, src2, dst, act_info); } -void ClElementwisePower::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClElementwisePower::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); auto k = std::make_unique(); @@ -91,7 +122,10 @@ void ClElementwisePower::configure(const ClCompileContext &compile_context, ITen _kernel = std::move(k); } -Status ClElementwisePower::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClElementwisePower::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { return kernels::ClArithmeticKernel::validate(ArithmeticOperation::POWER, src1, src2, dst, act_info); } diff --git a/src/gpu/cl/operators/ClElementwiseOperations.h b/src/gpu/cl/operators/ClElementwiseOperations.h index 120049cb7fb9154a52cebccd2b8307158ccff020..de7c018d7534b5a107323e3b8d0d50feaef4b3b2 100644 --- a/src/gpu/cl/operators/ClElementwiseOperations.h +++ b/src/gpu/cl/operators/ClElementwiseOperations.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_ELEMENTWISE_OPERATIONS_H #include 
"arm_compute/function_info/ActivationLayerInfo.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -48,14 +49,21 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClElementwiseDivision::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; /** Basic function to run @ref opencl::kernels::ClArithmeticKernel for max @@ -74,14 +82,21 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClElementwiseMax::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; /** Basic function to run @ref opencl::kernels::ClArithmeticKernel for min @@ -100,14 +115,21 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClElementwiseMin::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; /** Basic function to run @ref opencl::kernels::ClArithmeticKernel for squared difference @@ -126,14 +148,21 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClElementwiseSquaredDiff::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; /** Basic function to run @ref opencl::kernels::ClArithmeticKernel for power @@ -152,14 +181,21 @@ public: * @param[out] dst Destination tensor info. Data types supported:F16/F32. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
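The ClElementwiseOperations hunks in this file reformat five operators (division, max, min, squared difference, power) whose configure() and validate() all funnel into the same ClArithmeticKernel, differing only in the ArithmeticOperation tag they pass. A compressed, hypothetical illustration of that one-kernel/many-operators mapping:

// Operation tags mirroring the values passed in ClElementwiseOperations.cpp.
enum class ArithmeticOperation { DIV, MAX, MIN, SQUARED_DIFF, POWER };

// Hypothetical stand-in for kernels::ClArithmeticKernel::validate(); the real
// kernel also checks shapes, data types and the fused activation.
bool arithmetic_kernel_validate(ArithmeticOperation op)
{
    (void)op;
    return true;
}

// Each operator's validate() then reduces to a one-line forward with a fixed tag.
bool elementwise_division_validate() { return arithmetic_kernel_validate(ArithmeticOperation::DIV); }
bool elementwise_max_validate()      { return arithmetic_kernel_validate(ArithmeticOperation::MAX); }
bool elementwise_power_validate()    { return arithmetic_kernel_validate(ArithmeticOperation::POWER); }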
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClElementwisePower::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClElementwiseUnary.cpp b/src/gpu/cl/operators/ClElementwiseUnary.cpp index f94d402c0536a4148dd81b4de91e17ebeb77b1f1..914621183e1bdf5179306b5c7eb7ddaf22384b36 100644 --- a/src/gpu/cl/operators/ClElementwiseUnary.cpp +++ b/src/gpu/cl/operators/ClElementwiseUnary.cpp @@ -23,9 +23,8 @@ */ #include "src/gpu/cl/operators/ClElementwiseUnary.h" -#include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h" - #include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h" namespace arm_compute { diff --git a/src/gpu/cl/operators/ClFill.cpp b/src/gpu/cl/operators/ClFill.cpp index ad22b15cff959693e23c74628d1d3a3a12581693..817b15ab202261698550ece4332e40d87ae49c0e 100644 --- a/src/gpu/cl/operators/ClFill.cpp +++ b/src/gpu/cl/operators/ClFill.cpp @@ -23,16 +23,18 @@ */ #include "src/gpu/cl/operators/ClFill.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClFillKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClFill::configure(const ClCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *dst_window) +void ClFill::configure(const ClCompileContext &compile_context, + ITensorInfo *tensor, + const PixelValue &constant_value, + Window *dst_window) { ARM_COMPUTE_LOG_PARAMS(tensor, constant_value, dst_window); auto k = std::make_unique(); @@ -45,4 +47,4 @@ Status ClFill::validate(const ITensorInfo *tensor, const PixelValue &constant_va return kernels::ClFillKernel::validate(tensor, constant_value, dst_window); } } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClFill.h b/src/gpu/cl/operators/ClFill.h index 3bbe27ef71e42fcf4b09e075d6947eb91b1208b8..e13862aa6bec302b53ec16ca1960f353fb4ae790 100644 --- a/src/gpu/cl/operators/ClFill.h +++ b/src/gpu/cl/operators/ClFill.h @@ -26,6 +26,7 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Window.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -44,7 +45,10 @@ public: * @param[in] constant_value The value used to fill the planes of the tensor * @param[in] window Window to be used in case setting only part of a tensor. Default is nullptr. 
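ClFill's configure(), documented just above, takes an optional window so that only part of the destination tensor is written. The helper below is a plain-C++ paraphrase of that idea on a flat buffer, with no arm_compute types involved, purely to pin down what the default (whole tensor) versus windowed (sub-range) cases mean; the function name and signature are invented for this illustration.

#include <cstddef>
#include <vector>

// Conceptual equivalent of a constant fill: write `constant_value` over the whole
// buffer when no sub-range is given, or over [start, end) when one is.
void fill_constant(std::vector<float> &buffer,
                   float               constant_value,
                   const std::size_t  *start = nullptr,
                   const std::size_t  *end   = nullptr)
{
    const std::size_t first = start ? *start : 0;
    const std::size_t last  = end ? *end : buffer.size();
    for (std::size_t i = first; i < last && i < buffer.size(); ++i)
    {
        buffer[i] = constant_value;
    }
}

Calling fill_constant(buf, 0.f) fills everything; passing a start/end pair fills only that slice, which is the role the optional Window argument plays for the kernel.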
*/ - void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr); + void configure(const CLCompileContext &compile_context, + ITensorInfo *tensor, + const PixelValue &constant_value, + Window *window = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to ClFill::configure() diff --git a/src/gpu/cl/operators/ClFlatten.cpp b/src/gpu/cl/operators/ClFlatten.cpp index e277c0d7e4b7b29f4f6e3fbba734ded24926da73..7532532c94795bb9c65fcf11185e75181e031857 100644 --- a/src/gpu/cl/operators/ClFlatten.cpp +++ b/src/gpu/cl/operators/ClFlatten.cpp @@ -23,11 +23,10 @@ */ #include "src/gpu/cl/operators/ClFlatten.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClReshapeKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl diff --git a/src/gpu/cl/operators/ClFloor.cpp b/src/gpu/cl/operators/ClFloor.cpp index 84f685e381215849ec0cab040ac0211e3c87a97d..6790160172c62c1265ddb25e7cd10c90ce524aa9 100644 --- a/src/gpu/cl/operators/ClFloor.cpp +++ b/src/gpu/cl/operators/ClFloor.cpp @@ -23,11 +23,10 @@ */ #include "src/gpu/cl/operators/ClFloor.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClFloorKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl diff --git a/src/gpu/cl/operators/ClFullyConnected.cpp b/src/gpu/cl/operators/ClFullyConnected.cpp index 5845bbc69e9d41dbb280390f5a9da4270f2b3c7e..6969ac8ab34181e5cd598ffc4d1444cf006b7d76 100644 --- a/src/gpu/cl/operators/ClFullyConnected.cpp +++ b/src/gpu/cl/operators/ClFullyConnected.cpp @@ -24,12 +24,13 @@ #include "src/gpu/cl/operators/ClFullyConnected.h" #include "arm_compute/core/Size2D.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h" #include "src/gpu/cl/operators/ClFlatten.h" @@ -38,11 +39,8 @@ #include "src/gpu/cl/operators/ClMatMul.h" #include "src/gpu/cl/operators/ClTranspose.h" #include "src/gpu/cl/utils/ClAuxTensorHandler.h" - #include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h" #include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h" - -#include "src/common/utils/Log.h" #include "support/Cast.h" #include @@ -62,8 +60,11 @@ inline TensorShape get_reshaped_matmul_tensor(const TensorShape &src) return TensorShape(src.x(), 1, src.y(), src.collapsed_from(2).z()); // Return value optimisation } -Status construct_gemmlowp_output_stage(const ITensorInfo &src, const ITensorInfo &weights, const ITensorInfo &dst, - GEMMLowpOutputStageInfo &gemmlowp_output_stage, ActivationLayerInfo activation_info) +Status construct_gemmlowp_output_stage(const ITensorInfo &src, + const ITensorInfo &weights, + const ITensorInfo &dst, + GEMMLowpOutputStageInfo &gemmlowp_output_stage, + ActivationLayerInfo activation_info) { gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; gemmlowp_output_stage.gemmlowp_offset = 0; @@ -73,7 +74,7 @@ Status 
construct_gemmlowp_output_stage(const ITensorInfo &src, const ITensorInfo const auto data_type = src.data_type(); // Configure output stage for quantized case - if(is_data_type_quantized_asymmetric(data_type)) + if (is_data_type_quantized_asymmetric(data_type)) { const QuantizationInfo oq_info = dst.quantization_info(); const UniformQuantizationInfo iq_unif = src.quantization_info().uniform(); @@ -85,15 +86,17 @@ Status construct_gemmlowp_output_stage(const ITensorInfo &src, const ITensorInfo const float multiplier = (iq_unif.scale * wq_unif.scale) / output_quant_info.scale; int output_multiplier = 0; int output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); PixelValue type_min{}; PixelValue type_max{}; std::tie(type_min, type_max) = get_min_max(data_type); - if(activation_info.enabled()) + if (activation_info.enabled()) { - std::tie(type_min, type_max) = get_quantized_activation_min_max(activation_info, data_type, output_quant_info); + std::tie(type_min, type_max) = + get_quantized_activation_min_max(activation_info, data_type, output_quant_info); } // Set the GEMMLowp output stage info @@ -109,31 +112,41 @@ Status construct_gemmlowp_output_stage(const ITensorInfo &src, const ITensorInfo return Status{}; } -Status validate_mm(const ITensorInfo &src, const ITensorInfo &weights, const ITensorInfo *bias, const ITensorInfo &dst, const FullyConnectedLayerInfo &fc_info, bool use_matmul) +Status validate_mm(const ITensorInfo &src, + const ITensorInfo &weights, + const ITensorInfo *bias, + const ITensorInfo &dst, + const FullyConnectedLayerInfo &fc_info, + bool use_matmul) { // Note : If input is dynamic and data is not batched, use matmul, else use gemm const bool transpose_weights = fc_info.transpose_weights ? !fc_info.are_weights_reshaped : false; - const bool use_dynamic_gemm = !use_matmul && !weights.are_values_constant() && transpose_weights; // use dynamic gemm as fallback for matmul - const bool is_quantized = is_data_type_quantized_asymmetric(src.data_type()); + const bool use_dynamic_gemm = + !use_matmul && !weights.are_values_constant() && transpose_weights; // use dynamic gemm as fallback for matmul + const bool is_quantized = is_data_type_quantized_asymmetric(src.data_type()); - if(use_matmul) + if (use_matmul) { const MatMulInfo m_info = MatMulInfo().adj_rhs(transpose_weights); // Note: LHS is reshaped here to match ClMatMul expectations of batch index - From [M, B0, B1] to [M, 1, B0, B1] TensorInfo lhs_to_use = src.clone()->set_tensor_shape(get_reshaped_matmul_tensor(src.tensor_shape())); - const GPUTarget gpu_target = CLScheduler::get().target(); - std::unique_ptr t = cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target); - const MatMulKernelInfo kernel_info = t->configure(&lhs_to_use, &weights, m_info); + const GPUTarget gpu_target = CLScheduler::get().target(); + std::unique_ptr t = + cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target); + const MatMulKernelInfo kernel_info = t->configure(&lhs_to_use, &weights, m_info); - return is_quantized ? kernels::ClMatMulLowpNativeKernel::validate(&lhs_to_use, &weights, bias, &dst, kernel_info, fc_info.activation_info) : - kernels::ClMatMulNativeKernel::validate(&lhs_to_use, &weights, bias, &dst, kernel_info, fc_info.activation_info); + return is_quantized ? 
kernels::ClMatMulLowpNativeKernel::validate(&lhs_to_use, &weights, bias, &dst, + kernel_info, fc_info.activation_info) + : kernels::ClMatMulNativeKernel::validate(&lhs_to_use, &weights, bias, &dst, kernel_info, + fc_info.activation_info); } else { GEMMLowpOutputStageInfo gemmlowp_output_stage; - ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(src, weights, dst, gemmlowp_output_stage, fc_info.activation_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + construct_gemmlowp_output_stage(src, weights, dst, gemmlowp_output_stage, fc_info.activation_info)); const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped false, // is_b_reshaped @@ -147,7 +160,7 @@ Status validate_mm(const ITensorInfo &src, const ITensorInfo &weights, const ITe true, // broadcast_bias ActivationLayerInfo()); // activation_info - if(is_quantized) + if (is_quantized) { const UniformQuantizationInfo iq_info = src.quantization_info().uniform(); const UniformQuantizationInfo wq_info = weights.quantization_info().uniform(); @@ -158,11 +171,9 @@ Status validate_mm(const ITensorInfo &src, const ITensorInfo &weights, const ITe const QuantizationInfo weights_quantization_info(wq_info.scale, -wq_info.offset); // Validate gemmlowp function - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyCore::validate(&src.clone()->set_quantization_info(src_quantization_info), - &weights.clone()->set_quantization_info(weights_quantization_info), - bias, - &dst, - gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyCore::validate( + &src.clone()->set_quantization_info(src_quantization_info), + &weights.clone()->set_quantization_info(weights_quantization_info), bias, &dst, gemm_info)); } else { @@ -188,11 +199,15 @@ ClFullyConnected::ClFullyConnected() ClFullyConnected::~ClFullyConnected() = default; -void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, +void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info) { // If weights are dynamic and matmul is supported use matmul, else use gemm - if(_use_matmul) + if (_use_matmul) { // Specify whether transpose weights is necessary in matmul info const MatMulInfo mat_info = MatMulInfo().adj_rhs(_transpose_weights); @@ -202,22 +217,25 @@ void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITe _lhs_to_use = src->clone()->set_tensor_shape(get_reshaped_matmul_tensor(_lhs_to_use.tensor_shape())); // 2. Use heuristics to get kernel info object - const GPUTarget gpu_target = CLScheduler::get().target(); - std::unique_ptr kernel_config = cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target); - MatMulKernelInfo kernel_info = kernel_config->configure(src, weights, mat_info); + const GPUTarget gpu_target = CLScheduler::get().target(); + std::unique_ptr kernel_config = + cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target); + MatMulKernelInfo kernel_info = kernel_config->configure(src, weights, mat_info); // 3. 
Configure relevant matmul kernel - if(_is_quantized) + if (_is_quantized) { _matmul_lowp_native_kernel = std::make_unique(); _matmul_lowp_native_kernel->set_target(gpu_target); - _matmul_lowp_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info, fc_info.activation_info); + _matmul_lowp_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info, + fc_info.activation_info); } else { _matmul_native_kernel = std::make_unique(); _matmul_native_kernel->set_target(gpu_target); - _matmul_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info, fc_info.activation_info); + _matmul_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info, + fc_info.activation_info); } } else @@ -238,7 +256,7 @@ void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITe true, // broadcast_bias fc_info.activation_info); // activation_info - if(_is_quantized) + if (_is_quantized) { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset @@ -248,8 +266,10 @@ void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITe TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info); TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info); - src_info.set_quantization_info(QuantizationInfo(src_quantization_info.uniform().scale, -src_quantization_info.uniform().offset)); - weights_info.set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + src_info.set_quantization_info( + QuantizationInfo(src_quantization_info.uniform().scale, -src_quantization_info.uniform().offset)); + weights_info.set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, + -weights_quantization_info.uniform().offset)); // Configure gemmlowp function _mm_gemmlowp = std::make_unique(); @@ -264,16 +284,25 @@ void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITe } } -void ClFullyConnected::configure_conv_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, +void ClFullyConnected::configure_conv_fc(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info) { // MatMul fuses transpose operation, so we use the first dimension for comparison where appropriate. - ARM_COMPUTE_ERROR_ON((weights->dimension((_use_matmul && _transpose_weights) ? 0 : 1) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); + ARM_COMPUTE_ERROR_ON((weights->dimension((_use_matmul && _transpose_weights) ? 
0 : 1) != + (src->dimension(0) * src->dimension(1) * src->dimension(2)))); // If the fully connected layer is called after a convolution layer, the input tensor must be linearized // Initialize output tensor for flatten - _flattened_src = src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)).set_data_layout(DataLayout::NCHW); + _flattened_src = src->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(compute_flatten_shape(src)) + .set_data_layout(DataLayout::NCHW); // Configure flatten kernel _flatten = std::make_unique(); @@ -284,7 +313,11 @@ void ClFullyConnected::configure_conv_fc(const CLCompileContext &compile_context configure_mm(compile_context, &_flattened_src, weights, bias, dst, fc_info); } -void ClFullyConnected::configure_fc_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, +void ClFullyConnected::configure_fc_fc(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info) { // MatMul fuses transpose operation, so we use the first dimension for comparison where appropriate. @@ -294,7 +327,11 @@ void ClFullyConnected::configure_fc_fc(const CLCompileContext &compile_context, configure_mm(compile_context, src, weights, bias, dst, fc_info); } -void ClFullyConnected::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, +void ClFullyConnected::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, FullyConnectedLayerInfo fc_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); @@ -317,8 +354,9 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso // 2. MatMul does not support broadcasting batch dimension, and therefore is disabled if fc is batched. // 3. 
When FC is after convolution and src tensor data layout does not match weights trained data layout (weights conversion kernel is required) const bool is_batched_fc_layer = dst->dimension(1) > 1; - _use_matmul = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() && !is_batched_fc_layer && !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout)); - _dynamic_gemm = !weights->are_values_constant() && _transpose_weights && !_use_matmul; + _use_matmul = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() && !is_batched_fc_layer && + !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout)); + _dynamic_gemm = !weights->are_values_constant() && _transpose_weights && !_use_matmul; // With the Fully Connected layer we can have 4 different cases: // 1) Convolution layer -> Fully Connected layer without batches @@ -327,11 +365,11 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso // 4) Fully Connected layer -> Fully Connected layer with batches // Check if we have a fully connected layer with batches - if(is_batched_fc_layer) + if (is_batched_fc_layer) { - _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, - src->tensor_shape().cend(), - dst->tensor_shape().cbegin() + 1)); + _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), + dst->tensor_shape().cbegin() + 1)); } else { @@ -341,7 +379,7 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso ITensorInfo *weights_used = weights; // Reshape weights if needed - Not needed when matmul is in use as matmul fuses transpose op. 
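// In short: the matmul path fuses the transpose into its kernel, so the explicit ClTranspose
// below runs only on the gemm path (and only when transposition is requested). The weights
// conversion further down runs on either path, whenever the layer follows a convolution and
// the src data layout differs from the layout the weights were trained with.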
- if(_transpose_weights && !_use_matmul) + if (_transpose_weights && !_use_matmul) { // Reshape the weights _reshape_weights = std::make_unique(); @@ -351,14 +389,11 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso } // Convert weights if needed - if(_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) + if (_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) { // Convert weights _convert_weights = std::make_unique(); - _convert_weights->configure(compile_context, - weights_used, - &_converted_weights, - src->tensor_shape(), + _convert_weights->configure(compile_context, weights_used, &_converted_weights, src->tensor_shape(), fc_info.weights_trained_layout); weights_used = &_converted_weights; @@ -366,7 +401,7 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso _run_convert_weights = true; } - if(_is_fc_after_conv) + if (_is_fc_after_conv) { // Fully Connected layer after a Convolution Layer without batches configure_conv_fc(compile_context, src, weights_used, biases, dst, fc_info); @@ -379,60 +414,69 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso // Update TensorInfo of final weights used (Need to be done in the end due to padding expansion) _weights_to_use = *weights_used; - if(_use_matmul) + if (_use_matmul) { // Note : MatMul does not use transpose and does not need auxillary memory, so only converted weights are added to aux_mem - _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Temporary, _converted_weights.total_size()); + _aux_mem[ConvertedWeights] = + MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Temporary, _converted_weights.total_size()); } else { // Set auxiliary memory requirements for gemm operators auto gemm_mem_req = (_is_quantized) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace(); - for(unsigned int i = 0; i < gemm_mem_req.size(); ++i) + for (unsigned int i = 0; i < gemm_mem_req.size(); ++i) { _aux_mem[i] = gemm_mem_req[i]; } - if(_aux_mem[1].size > 0 || _aux_mem[2].size > 0) // Persistent weights memory on GEMMs + if (_aux_mem[1].size > 0 || _aux_mem[2].size > 0) // Persistent weights memory on GEMMs { // Release permuted weights at the of prepare as they are further transposed by the assembly dispatch // Keep all the auxiliary tensors in case of dynamic weights as they are recalculated every time _aux_mem[TransposedWeights] = MemoryInfo( - offset_int_vec(TransposedWeights), - _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare, - _reshaped_weights.total_size()); - _aux_mem[ConvertedWeights] = MemoryInfo( - offset_int_vec(ConvertedWeights), - _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare, - _converted_weights.total_size()); + offset_int_vec(TransposedWeights), _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare, + _reshaped_weights.total_size()); + _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), + _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare, + _converted_weights.total_size()); } else { // Release permuted weights at the of prepare as they are further transposed by the assembly dispatch - const auto transposed_wei_lft = (_weights_to_use_idx == offset_int_vec(TransposedWeights)) ? MemoryLifetime::Persistent : MemoryLifetime::Prepare; - const auto converted_wei_lft = (_weights_to_use_idx == offset_int_vec(ConvertedWeights)) ? 
MemoryLifetime::Persistent : MemoryLifetime::Prepare; - - _aux_mem[TransposedWeights] = MemoryInfo( - offset_int_vec(TransposedWeights), - _dynamic_gemm ? MemoryLifetime::Temporary : transposed_wei_lft, - _reshaped_weights.total_size()); - _aux_mem[ConvertedWeights] = MemoryInfo( - offset_int_vec(ConvertedWeights), - _dynamic_gemm ? MemoryLifetime::Temporary : converted_wei_lft, - _converted_weights.total_size()); + const auto transposed_wei_lft = (_weights_to_use_idx == offset_int_vec(TransposedWeights)) + ? MemoryLifetime::Persistent + : MemoryLifetime::Prepare; + const auto converted_wei_lft = (_weights_to_use_idx == offset_int_vec(ConvertedWeights)) + ? MemoryLifetime::Persistent + : MemoryLifetime::Prepare; + + _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), + _dynamic_gemm ? MemoryLifetime::Temporary : transposed_wei_lft, + _reshaped_weights.total_size()); + _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), + _dynamic_gemm ? MemoryLifetime::Temporary : converted_wei_lft, + _converted_weights.total_size()); } } - _aux_mem[FlattenedSrc] = MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size()); + _aux_mem[FlattenedSrc] = + MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size()); } -Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, +Status ClFullyConnected::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, FullyConnectedLayerInfo fc_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights, dst); ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU - && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); + ARM_COMPUTE_RETURN_ERROR_ON( + fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); const GPUTarget gpu_target = get_arch_from_target(CLScheduler::get().target()); const bool transpose_weights = fc_info.transpose_weights ? !fc_info.are_weights_reshaped : false; @@ -441,11 +485,20 @@ Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *wei // When using dynamic weights - use matmul kernels. // Note: MatMul does not support broadcasting so fallback with batched cases. 
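// Matmul is therefore selected only when all of the following hold: the target is not Midgard,
// the weights are dynamic (non-constant), the output is not batched, and there is no data-layout
// mismatch between a multi-dimensional src and the trained weights layout.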
const bool is_batched_fc_layer = dst->dimension(1) > 1; - const bool use_matmul = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() && !is_batched_fc_layer && !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout)); - - const ITensorInfo &flatten_src = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)).set_data_layout(DataLayout::NCHW)); - const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); - const ITensorInfo &converted_weights = (transpose_weights && !use_matmul) ? TensorInfo(*reshaped_weights.clone()) : TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()); + const bool use_matmul = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() && + !is_batched_fc_layer && + !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout)); + + const ITensorInfo &flatten_src = TensorInfo(src->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(compute_flatten_shape(src)) + .set_data_layout(DataLayout::NCHW)); + const ITensorInfo &reshaped_weights = TensorInfo( + weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); + const ITensorInfo &converted_weights = (transpose_weights && !use_matmul) + ? TensorInfo(*reshaped_weights.clone()) + : TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()); // With the Fully Connected layer we can have 4 different cases: // 1) Convolution layer -> Fully Connected layer without batches @@ -456,10 +509,10 @@ Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *wei const ITensorInfo *src_to_use = src; const ITensorInfo *weights_to_use = weights; - if(biases != nullptr) + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - if(is_data_type_quantized(src->data_type())) + if (is_data_type_quantized(src->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -470,11 +523,11 @@ Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *wei } // Check if FC is after conv (flatten kernel is run in case where FC is after conv.) - if(is_batched_fc_layer) + if (is_batched_fc_layer) { - is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, - src->tensor_shape().cend(), - dst->tensor_shape().cbegin() + 1)); + is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), + dst->tensor_shape().cbegin() + 1)); } else { @@ -482,29 +535,28 @@ Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *wei } // Transpose kernel does not run when matmul is supported as matmul fuses transpose op. 
- if(transpose_weights && !use_matmul) + if (transpose_weights && !use_matmul) { // Validate reshape weights kernel ARM_COMPUTE_RETURN_ON_ERROR(ClTranspose::validate(weights, &reshaped_weights)); weights_to_use = &reshaped_weights; } - if(is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) + if (is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) { // Validate convert weights kernel - ARM_COMPUTE_RETURN_ON_ERROR(ClConvertFullyConnectedWeights::validate(weights_to_use, - &converted_weights, - src->tensor_shape(), - fc_info.weights_trained_layout)); + ARM_COMPUTE_RETURN_ON_ERROR(ClConvertFullyConnectedWeights::validate( + weights_to_use, &converted_weights, src->tensor_shape(), fc_info.weights_trained_layout)); weights_to_use = &converted_weights; } - if(is_fc_after_conv) + if (is_fc_after_conv) { // Fully Connected layer after a Convolution Layer without batches // K Index of matrix multiplication. MatMul performs transpose in kernel, so index is 0 when matmul and transpose enabled const int weight_idx = (use_matmul && transpose_weights) ? 0 : 1; - ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(weight_idx) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); + ARM_COMPUTE_RETURN_ERROR_ON( + (weights_to_use->dimension(weight_idx) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); // Validate flatten kernel ARM_COMPUTE_RETURN_ON_ERROR(ClFlatten::validate(src, &flatten_src)); @@ -539,24 +591,24 @@ void ClFullyConnected::run(ITensorPack &tensors) CLAuxTensorHandler weights(_weights_to_use_idx, _weights_to_use, tensors, false); // Linearize input if it comes from a convolutional layer - if(_is_fc_after_conv) + if (_is_fc_after_conv) { - ITensorPack flatten_pack{ { ACL_SRC, src }, { ACL_DST, flattened_src.get() } }; + ITensorPack flatten_pack{{ACL_SRC, src}, {ACL_DST, flattened_src.get()}}; _flatten->run(flatten_pack); } ITensorPack gemm_pack = tensors; gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? flattened_src.get() : src); - if(_weights_to_use_idx != ACL_SRC_1) + if (_weights_to_use_idx != ACL_SRC_1) { gemm_pack.add_const_tensor(ACL_SRC_1, weights.get()); } // Run MatMul Op - if(_use_matmul) + if (_use_matmul) { // Run matmul kernels for matrix multiplication - if(_is_quantized) + if (_is_quantized) { CLScheduler::get().enqueue_op(*_matmul_lowp_native_kernel, gemm_pack, true); } @@ -568,7 +620,7 @@ void ClFullyConnected::run(ITensorPack &tensors) else { // Run matrix multiply - if(_is_quantized) + if (_is_quantized) { _mm_gemmlowp->run(gemm_pack); } @@ -582,7 +634,7 @@ void ClFullyConnected::run(ITensorPack &tensors) void ClFullyConnected::prepare(ITensorPack &tensors) { // Note : Running prepare() each run when _use_matmul is true is unnecessary unless weights conversion is needed. - if(!_is_prepared || _dynamic_gemm) + if (!_is_prepared || _dynamic_gemm) { #ifdef ARM_COMPUTE_ASSERTS_ENABLED ++_asrt_prepare_count; @@ -598,10 +650,10 @@ void ClFullyConnected::prepare(ITensorPack &tensors) const ITensor *cur_weights = weights; // Reshape weights if needed. Disabled when matmul kernels are enabled as matmul fuses transpose. 
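// Except for the dynamic-gemm fallback, which re-enters this block on every run, the weight
// reshape and conversion below are one-off steps; the original weights are marked as unused
// once their reshaped/converted copies have been produced.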
- if(_transpose_weights && !_use_matmul) + if (_transpose_weights && !_use_matmul) { // Run reshape weights kernel and mark weights as unused - ITensorPack transpose_pack{ { ACL_SRC, weights }, { ACL_DST, reshaped_weights.get() } }; + ITensorPack transpose_pack{{ACL_SRC, weights}, {ACL_DST, reshaped_weights.get()}}; _reshape_weights->run(transpose_pack); cur_weights->mark_as_unused(); @@ -609,9 +661,9 @@ void ClFullyConnected::prepare(ITensorPack &tensors) } // Convert weights if needed - if(_run_convert_weights) + if (_run_convert_weights) { - ITensorPack convert_pack{ { ACL_SRC, cur_weights }, { ACL_DST, converted_weights.get() } }; + ITensorPack convert_pack{{ACL_SRC, cur_weights}, {ACL_DST, converted_weights.get()}}; _convert_weights->run(convert_pack); cur_weights->mark_as_unused(); @@ -622,9 +674,9 @@ void ClFullyConnected::prepare(ITensorPack &tensors) gemm_pack.add_const_tensor(ACL_SRC_1, cur_weights); // Prepare GEMM prepare and release unused weights - if(_dynamic_gemm || !_use_matmul) + if (_dynamic_gemm || !_use_matmul) { - if(!_is_quantized) + if (!_is_quantized) { _mm_gemm->prepare(gemm_pack); } diff --git a/src/gpu/cl/operators/ClFullyConnected.h b/src/gpu/cl/operators/ClFullyConnected.h index d975859d87fa428ad5015278eb11e85610a2cc1d..72884ff7ad1f895262c300c3f98dc8d3a5a13902 100644 --- a/src/gpu/cl/operators/ClFullyConnected.h +++ b/src/gpu/cl/operators/ClFullyConnected.h @@ -21,14 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CL_FULLY_CONNECTED_H -#define ARM_COMPUTE_CL_FULLY_CONNECTED_H +#ifndef ACL_SRC_GPU_CL_OPERATORS_CLFULLYCONNECTED_H +#define ACL_SRC_GPU_CL_OPERATORS_CLFULLYCONNECTED_H #include "arm_compute/core/TensorInfo.h" #include "arm_compute/function_info/FullyConnectedLayerInfo.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" +#include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h" +#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" #include @@ -47,7 +49,7 @@ namespace kernels { class ClMatMulNativeKernel; class ClMatMulLowpNativeKernel; -} +} // namespace kernels /** Basic function to compute a Fully Connected layer on OpenCL. This function calls the following OpenCL kernels: * * -# @ref opencl::kernels::ClIm2ColKernel (called when the input comes from a convolutional layer) @@ -88,7 +90,11 @@ public: * Data type supported: Same as @p src. 
* @param[in] fc_info (Optional) Fully connected layer additional info */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -96,18 +102,36 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); // Inherited methods overriden - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: - void configure_fc_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info); - void configure_conv_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info); - void configure_mm(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info); + void configure_fc_fc(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, + const FullyConnectedLayerInfo &fc_info); + void configure_conv_fc(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, + const FullyConnectedLayerInfo &fc_info); + void configure_mm(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, + const FullyConnectedLayerInfo &fc_info); private: enum AuxTensorIdx @@ -134,22 +158,22 @@ private: TensorInfo _reshaped_weights{}; TensorInfo _lhs_to_use{}; TensorInfo _weights_to_use{}; - int _weights_to_use_idx{ ACL_SRC_1 }; + int _weights_to_use_idx{ACL_SRC_1}; - bool _run_convert_weights{ false }; - bool _transpose_weights{ false }; - bool _dynamic_gemm{ false }; - bool _use_matmul{ false }; + bool _run_convert_weights{false}; + bool _transpose_weights{false}; + bool _dynamic_gemm{false}; + bool _use_matmul{false}; - bool _is_fc_after_conv{ true }; - bool _is_quantized{ false }; - bool _is_prepared{ false }; + bool _is_fc_after_conv{true}; + bool _is_quantized{false}; + bool _is_prepared{false}; #ifdef ARM_COMPUTE_ASSERTS_ENABLED - int _asrt_run_count {}; + int _asrt_run_count{}; int _asrt_prepare_count{}; #endif // ARM_COMPUTE_ASSERTS_ENABLED }; } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_FULLY_CONNECTED_H */ +#endif // ACL_SRC_GPU_CL_OPERATORS_CLFULLYCONNECTED_H diff --git a/src/gpu/cl/operators/ClGemm.cpp b/src/gpu/cl/operators/ClGemm.cpp index 8db6dabe580db97c2d82c50e9a5f322af1b5aa71..815c254c69d563509976c87d8de149cb61c3bdf0 100644 --- a/src/gpu/cl/operators/ClGemm.cpp +++ b/src/gpu/cl/operators/ClGemm.cpp @@ -33,12 +33,12 @@ #include 
"arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/ITensorAllocator.h" -#include "arm_compute/core/experimental/IPostOp.h" +#include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/core/utils/helpers/float_ops.h" @@ -46,8 +46,6 @@ #include "src/gpu/cl/utils/ClAuxTensorHandler.h" #include "src/runtime/CL/gemm/CLGEMMKernelSelection.h" #include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h" - -#include "src/common/utils/Log.h" #include "support/Cast.h" #include "utils/TypePrinter.h" @@ -68,35 +66,43 @@ inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type) return kernel_type == CLGEMMKernelType::NATIVE ? false : true; } //Automatically select between mlgo (prioritized) and default heuristics for gemm kernel type -inline CLGEMMKernelType auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run, bool constant_weights) +inline CLGEMMKernelType +auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run, bool constant_weights) { - if(!constant_weights) + if (!constant_weights) { return CLGEMMKernelType::NATIVE; } auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run); - if(bool(gemm_kernel)) + if (bool(gemm_kernel)) { - if(validate_gemm_kernel(gemm_kernel.gemm_type)) + if (validate_gemm_kernel(gemm_kernel.gemm_type)) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", + to_string(gemm_kernel.gemm_type).c_str()); return gemm_kernel.gemm_type; } } gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", + to_string(gemm_kernel.gemm_type).c_str()); return gemm_kernel.gemm_type; } // Validate lhs_info and rhs_info for reshaped only rhs kernel -inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, - const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info) +inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + GEMMKernelInfo gemm_kernel_info) { // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel TensorInfo tmp_b_info{}; // Validate reshape RHS kernel auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) + if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) { return false; } @@ -104,12 +110,14 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs gemm_kernel_info.lhs_info = lhs_info; 
gemm_kernel_info.rhs_info = rhs_info; gemm_kernel_info.has_pad_y = false; - if(!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info))) + if (!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, + rhs_info, gemm_kernel_info))) { return false; } gemm_kernel_info.has_pad_y = true; - if(!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info))) + if (!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, + rhs_info, gemm_kernel_info))) { return false; } @@ -117,49 +125,65 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs } //Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs -inline std::pair auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a, - const ITensorInfo *b, - const ITensorInfo *c, const ITensorInfo *output) +inline std::pair +auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, + GEMMKernelInfo kernel_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output) { auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query); - if(config) + if (config) { - if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info)) + if (validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info)) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } } config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } // Validate lhs_info and rhs_info for reshaped kernel -inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, - const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info, bool reinterpret_input_as_3d) +inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + GEMMKernelInfo gemm_kernel_info, + bool reinterpret_input_as_3d) { // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped kernel TensorInfo tmp_a_info{}; TensorInfo tmp_b_info{}; // Validate reshape LHS 
kernel - auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d))); - if(!bool(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d))) + auto_init_if_empty(tmp_a_info, + a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d))); + if (!bool(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d))) { return false; } // Validate reshape RHS kernel auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) + if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) { return false; } // Validate mm kernel gemm_kernel_info.lhs_info = lhs_info; gemm_kernel_info.rhs_info = rhs_info; - if(!bool(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info))) + if (!bool(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, + rhs_info, gemm_kernel_info))) { return false; } @@ -167,21 +191,32 @@ inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info, co } //Automatically select between mlgo (prioritized) and default heuristics for reshaped kernel configs -inline std::pair auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a, const ITensorInfo *b, - const ITensorInfo *c, const ITensorInfo *output, bool reinterpret_input_as_3d) +inline std::pair +auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery query, + GEMMKernelInfo kernel_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + bool reinterpret_input_as_3d) { auto config = auto_heuristics::select_mlgo_gemm_config_reshaped(query); - if(config) + if (config) { - if(validate_lhs_rhs_info_reshaped(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info, reinterpret_input_as_3d)) + if (validate_lhs_rhs_info_reshaped(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info, + reinterpret_input_as_3d)) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped config from mlgo heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } } config = auto_heuristics::select_default_gemm_config_reshaped(query); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), + to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } } // namespace @@ -201,18 +236,24 @@ ClGemm::ClGemm() { } -void ClGemm::configure_native(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, - const GEMMInfo 
&gemm_info) +void ClGemm::configure_native(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const GPUTarget gpu_target = CLScheduler::get().target(); - bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const GPUTarget gpu_target = CLScheduler::get().target(); + bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -222,29 +263,36 @@ void ClGemm::configure_native(const CLCompileContext &compile_context, ITensorIn kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; kernel_info.broadcast_bias = broadcast_bias; kernel_info.activation_info = gemm_info.activation_info(); - kernel_info.post_ops = gemm_info.post_ops(); // Set the target for the kernels _mm_native_kernel->set_target(gpu_target); - auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); + auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); // Configure and tune matrix multiply kernel - _mm_native_kernel->configure(compile_context, a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info, kernel_info); + _mm_native_kernel->configure(compile_context, a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info, + kernel_info); } -void ClGemm::configure_reshaped(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, - const GEMMInfo &gemm_info) +void ClGemm::configure_reshaped(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const GPUTarget gpu_target = CLScheduler::get().target(); - bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const GPUTarget gpu_target = CLScheduler::get().target(); + bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -254,7 +302,6 @@ void ClGemm::configure_reshaped(const CLCompileContext &compile_context, ITensor kernel_info.reinterpret_input_as_3d = false; kernel_info.broadcast_bias = broadcast_bias; kernel_info.activation_info = gemm_info.activation_info(); - kernel_info.post_ops = gemm_info.post_ops(); // Set the target for the kernels _reshape_lhs_kernel->set_target(gpu_target); @@ -264,32 +311,42 @@ void ClGemm::configure_reshaped(const CLCompileContext &compile_context, ITensor GEMMRHSMatrixInfo rhs_info{}; // Pick up the GEMM configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a, b, - c, output, gemm_info.reinterpret_input_as_3d()); + std::tie(lhs_info, rhs_info) = + auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}, + kernel_info, a, b, c, output, gemm_info.reinterpret_input_as_3d()); _reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d()); _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info); // Configure and tune matrix multiply kernel - _mm_reshaped_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); + _mm_reshaped_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, + kernel_info); // Request memory for LHS and RHS reshape matrix _aux_mem[LhsReshape] = MemoryInfo(offset_int_vec(LhsReshape), MemoryLifetime::Temporary, _tmp_a.total_size()); - _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); + _aux_mem[RhsReshape] = MemoryInfo( + offset_int_vec(RhsReshape), + _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); } -void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, - const GEMMInfo &gemm_info) +void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const GPUTarget gpu_target = CLScheduler::get().target(); - bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const GPUTarget gpu_target = CLScheduler::get().target(); + bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -299,7 +356,6 @@ void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; kernel_info.broadcast_bias = broadcast_bias; kernel_info.activation_info = gemm_info.activation_info(); - kernel_info.post_ops = gemm_info.post_ops(); // Set the target for the kernels _mm_reshaped_only_rhs_kernel->set_target(gpu_target); @@ -308,7 +364,8 @@ void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context GEMMRHSMatrixInfo rhs_info{}; // Pick up the GEMM configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a, b, c, output); + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}, kernel_info, a, b, c, output); // Transpose matrix _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info); @@ -319,24 +376,33 @@ void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context // Configure matrix multiply kernel with no y padding support kernel_info.has_pad_y = false; - _mm_reshaped_only_rhs_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); + _mm_reshaped_only_rhs_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, + kernel_info); // Request memory for RHS reshape matrix - _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); + _aux_mem[RhsReshape] = MemoryInfo( + offset_int_vec(RhsReshape), + _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); } -void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, - const GEMMInfo &gemm_info) +void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const GPUTarget gpu_target = CLScheduler::get().target(); - bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const GPUTarget gpu_target = CLScheduler::get().target(); + bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -346,7 +412,6 @@ void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_co kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; kernel_info.broadcast_bias = broadcast_bias; kernel_info.activation_info = gemm_info.activation_info(); - kernel_info.post_ops = gemm_info.post_ops(); // Set the target for the kernels _mm_reshaped_only_rhs_mmul_kernel->set_target(gpu_target); @@ -355,9 +420,10 @@ void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_co GEMMRHSMatrixInfo rhs_info{}; // Pick up the GEMM configuration - auto gemm_config = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); - lhs_info = gemm_config.lhs_info; - rhs_info = gemm_config.rhs_info; + auto gemm_config = select_default_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); + lhs_info = gemm_config.lhs_info; + rhs_info = gemm_config.rhs_info; // Force H0 to 4 in order to use the MMUL extension rhs_info.h0 = 4; @@ -366,13 +432,22 @@ void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_co // Configure matrix multiply kernel with no y padding support kernel_info.has_pad_y = false; - _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); + _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, + rhs_info, kernel_info); // Request memory for RHS reshape matrix - _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); + _aux_mem[RhsReshape] = MemoryInfo( + offset_int_vec(RhsReshape), + _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); } -Status ClGemm::validate_native(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status ClGemm::validate_native(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_UNUSED(output); @@ -381,12 +456,12 @@ Status ClGemm::validate_native(const ITensorInfo *a, const ITensorInfo *b, const const GPUTarget gpu_target = CLScheduler::get().target(); DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -396,17 +471,24 @@ Status ClGemm::validate_native(const ITensorInfo *a, const ITensorInfo *b, const kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; kernel_info.broadcast_bias = broadcast_bias; kernel_info.activation_info = gemm_info.activation_info(); - kernel_info.post_ops = gemm_info.post_ops(); - auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); + auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyNativeKernel::validate(a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info, kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyNativeKernel::validate( + a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info, kernel_info)); return Status{}; } -Status ClGemm::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status ClGemm::validate_reshaped(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_UNUSED(output); @@ -418,12 +500,12 @@ Status ClGemm::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, con const GPUTarget gpu_target = CLScheduler::get().target(); DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -433,30 +515,39 @@ Status ClGemm::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, con kernel_info.reinterpret_input_as_3d = false; kernel_info.broadcast_bias = broadcast_bias; kernel_info.activation_info = gemm_info.activation_info(); - kernel_info.post_ops = gemm_info.post_ops(); GEMMLHSMatrixInfo lhs_info; GEMMRHSMatrixInfo rhs_info; // Pick up the GEMM configuration // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails - const auto gemm_config = select_default_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); - lhs_info = gemm_config.lhs_info; - rhs_info = gemm_config.rhs_info; + const auto gemm_config = + select_default_gemm_config_reshaped(auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); + lhs_info = gemm_config.lhs_info; + rhs_info = gemm_config.rhs_info; - auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d()))); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d())); + auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape( + compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d()))); + ARM_COMPUTE_RETURN_ON_ERROR( + ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d())); auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)); // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, + beta, lhs_info, rhs_info, kernel_info)); return Status{}; } -Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_UNUSED(output); @@ -467,12 +558,12 @@ Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInf const GPUTarget gpu_target = CLScheduler::get().target(); const DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? 
(a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -482,31 +573,39 @@ Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInf kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; kernel_info.broadcast_bias = broadcast_bias; kernel_info.activation_info = gemm_info.activation_info(); - kernel_info.post_ops = gemm_info.post_ops(); GEMMLHSMatrixInfo lhs_info; GEMMRHSMatrixInfo rhs_info; // Pick up the GEMM configuration // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails - const auto gemm_config = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); - lhs_info = gemm_config.lhs_info; - rhs_info = gemm_config.rhs_info; + const auto gemm_config = select_default_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); + lhs_info = gemm_config.lhs_info; + rhs_info = gemm_config.rhs_info; auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)); // Validate matrix multiply kernel_info.has_pad_y = false; - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate( + a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); kernel_info.has_pad_y = true; - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate( + a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); return Status{}; } -Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_UNUSED(output); @@ -516,12 +615,12 @@ Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITens const GPUTarget gpu_target = CLScheduler::get().target(); const DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? 
(a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -531,16 +630,16 @@ Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITens kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; kernel_info.broadcast_bias = broadcast_bias; kernel_info.activation_info = gemm_info.activation_info(); - kernel_info.post_ops = gemm_info.post_ops(); GEMMLHSMatrixInfo lhs_info; GEMMRHSMatrixInfo rhs_info; // Pick up the GEMM configuration // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails - const auto gemm_config = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); - lhs_info = gemm_config.lhs_info; - rhs_info = gemm_config.rhs_info; + const auto gemm_config = select_default_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); + lhs_info = gemm_config.lhs_info; + rhs_info = gemm_config.rhs_info; // Force H0 to 4 in order to use the MMUL extension rhs_info.h0 = 4; @@ -549,12 +648,20 @@ Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITens // Validate matrix multiply kernel_info.has_pad_y = false; - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate( + a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); return Status{}; } -void ClGemm::configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +void ClGemm::configure(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); @@ -567,20 +674,21 @@ void ClGemm::configure(const CLCompileContext &compile_context, ITensorInfo *a, _is_prepared = gemm_info.retain_internal_weights(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); // Select GEMMType - _gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery{ CLScheduler::get().target(), a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run, - b->are_values_constant()); + _gemm_kernel_type = auto_select_gemm_kernel( + auto_heuristics::CommonQuery{CLScheduler::get().target(), a->data_type(), m, n, k, batch_size}, + _reshape_b_only_on_first_run, b->are_values_constant()); const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr); ITensorInfo *c_to_use = fuse_add_c ? c : nullptr; - switch(_gemm_kernel_type) + switch (_gemm_kernel_type) { case CLGEMMKernelType::NATIVE: { @@ -609,30 +717,41 @@ void ClGemm::configure(const CLCompileContext &compile_context, ITensorInfo *a, } } -Status ClGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status ClGemm::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { // Get the GPU target bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); // Check data type early because the auto_select_gemm_kernel has assertions on supported data types ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16); // Select GEMMType - CLGEMMKernelType gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery - { - CLScheduler::get().target(), a->data_type(), m, n, k, batch_size, - }, - gemm_info.reshape_b_only_on_first_run(), b->are_values_constant()); + CLGEMMKernelType gemm_kernel_type = auto_select_gemm_kernel( + auto_heuristics::CommonQuery{ + CLScheduler::get().target(), + a->data_type(), + m, + n, + k, + batch_size, + }, + gemm_info.reshape_b_only_on_first_run(), b->are_values_constant()); const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr); const ITensorInfo *c_to_use = fuse_add_c ? 
c : nullptr; - switch(gemm_kernel_type) + switch (gemm_kernel_type) { case CLGEMMKernelType::NATIVE: { @@ -651,7 +770,8 @@ Status ClGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso } case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL: { - ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs_mmul(a, b, c_to_use, output, alpha, beta, gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_reshaped_only_rhs_mmul(a, b, c_to_use, output, alpha, beta, gemm_info)); break; } default: @@ -678,7 +798,7 @@ void ClGemm::run(ITensorPack &tensors) prepare(tensors); // Run matrix multiply kernel - switch(_gemm_kernel_type) + switch (_gemm_kernel_type) { case CLGEMMKernelType::NATIVE: { @@ -688,13 +808,13 @@ void ClGemm::run(ITensorPack &tensors) case CLGEMMKernelType::RESHAPED: { // Run interleave kernel - ITensorPack reshape_lhs_pack{ { ACL_SRC, lhs }, { ACL_DST, lhs_reshaped.get() } }; + ITensorPack reshape_lhs_pack{{ACL_SRC, lhs}, {ACL_DST, lhs_reshaped.get()}}; CLScheduler::get().enqueue_op(*_reshape_lhs_kernel, reshape_lhs_pack, false); - if(!_reshape_b_only_on_first_run) + if (!_reshape_b_only_on_first_run) { // Run transpose kernel - ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } }; + ITensorPack reshape_rhs_pack{{ACL_SRC, rhs}, {ACL_DST, rhs_reshaped.get()}}; CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false); } // Copy original tensor pack and overwrite lhs and rhs with reshaped counterparts @@ -702,7 +822,7 @@ void ClGemm::run(ITensorPack &tensors) gemm_reshaped_pack.add_const_tensor(ACL_SRC_0, lhs_reshaped.get()); gemm_reshaped_pack.add_const_tensor(ACL_SRC_1, rhs_reshaped.get()); - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED) { CLScheduler::get().enqueue_op(*_mm_reshaped_kernel, gemm_reshaped_pack, true); } @@ -710,10 +830,10 @@ void ClGemm::run(ITensorPack &tensors) } case CLGEMMKernelType::RESHAPED_ONLY_RHS: { - if(!_reshape_b_only_on_first_run) + if (!_reshape_b_only_on_first_run) { // Run transpose kernel - ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } }; + ITensorPack reshape_rhs_pack{{ACL_SRC, rhs}, {ACL_DST, rhs_reshaped.get()}}; CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false); } // In case of RESHAPED_ONLY_RHS, we need to check the padding requirement @@ -726,7 +846,7 @@ void ClGemm::run(ITensorPack &tensors) ITensorPack gemm_reshaped_onlyrhs_pack(tensors); gemm_reshaped_onlyrhs_pack.add_const_tensor(ACL_SRC_1, rhs_reshaped.get()); - if(has_pad_y) + if (has_pad_y) { ARM_COMPUTE_ERROR_ON(has_pad_y); } @@ -738,10 +858,10 @@ void ClGemm::run(ITensorPack &tensors) } case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL: { - if(!_reshape_b_only_on_first_run) + if (!_reshape_b_only_on_first_run) { // Run transpose kernel - ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } }; + ITensorPack reshape_rhs_pack{{ACL_SRC, rhs}, {ACL_DST, rhs_reshaped.get()}}; CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false); } // In case of RESHAPED_ONLY_RHS, we need to check the padding requirement @@ -754,7 +874,7 @@ void ClGemm::run(ITensorPack &tensors) ITensorPack gemm_reshaped_onlyrhs_pack(tensors); gemm_reshaped_onlyrhs_pack.add_const_tensor(ACL_SRC_1, rhs_reshaped.get()); - if(has_pad_y) + if (has_pad_y) { ARM_COMPUTE_ERROR_ON(has_pad_y); } @@ -773,20 +893,22 @@ void ClGemm::run(ITensorPack &tensors) void ClGemm::prepare(ITensorPack &constants) { - 
if(!_is_prepared) + if (!_is_prepared) { - const ITensor *src1 = constants.get_const_tensor(ACL_SRC_1); - ICLTensor *rhs_aux = utils::cast::polymorphic_downcast(constants.get_tensor(offset_int_vec(RhsReshape))); + const ITensor *src1 = constants.get_const_tensor(ACL_SRC_1); + ICLTensor *rhs_aux = + utils::cast::polymorphic_downcast(constants.get_tensor(offset_int_vec(RhsReshape))); // If memory for RHS is persistent and src1 is provided re-transform else assume that RHS is transformed - if((_aux_mem[AuxTensorIdx::RhsReshape].lifetime == MemoryLifetime::Persistent) && (src1 != nullptr && rhs_aux != nullptr) && rhs_aux) + if ((_aux_mem[AuxTensorIdx::RhsReshape].lifetime == MemoryLifetime::Persistent) && + (src1 != nullptr && rhs_aux != nullptr) && rhs_aux) { ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Transforming RHS Matrix!"); CLAuxTensorHandler rhs_reshaped(_tmp_b, *rhs_aux); ARM_COMPUTE_ERROR_ON(rhs_reshaped.get()->cl_buffer().get() == nullptr); - ITensorPack reshape_rhs_pack{ { ACL_SRC, src1 }, { ACL_DST, rhs_reshaped.get() } }; + ITensorPack reshape_rhs_pack{{ACL_SRC, src1}, {ACL_DST, rhs_reshaped.get()}}; CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, true); } _is_prepared = true; diff --git a/src/gpu/cl/operators/ClGemm.h b/src/gpu/cl/operators/ClGemm.h index 11f9f2b3d8ab30381a33d4690bab43b1f204fbf1..85dc1d6c8fd01d5f643b8a602f4ff51582ca6419 100644 --- a/src/gpu/cl/operators/ClGemm.h +++ b/src/gpu/cl/operators/ClGemm.h @@ -90,30 +90,95 @@ public: * if the reshape of matrix B should happen only for the first run. GEMMInfo also contains information about the reshaping * in case matrix A and matrix B have been already transformed. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); + void configure(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClGemm::configure() * * @return a status */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; private: - void configure_native(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); - void configure_reshaped(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); - void configure_reshaped_only_rhs(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); - void configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo 
*output, float alpha, float beta, const GEMMInfo &gemm_info); + void configure_native(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); + void configure_reshaped(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); + void configure_reshaped_only_rhs(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); + void configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); - static Status validate_native(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); - static Status validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); - static Status validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); - static Status validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); + static Status validate_native(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); + static Status validate_reshaped(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); + static Status validate_reshaped_only_rhs(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); + static Status validate_reshaped_only_rhs_mmul(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); private: enum AuxTensorIdx diff --git a/src/gpu/cl/operators/ClGemmConv2d.cpp b/src/gpu/cl/operators/ClGemmConv2d.cpp index 682477e4eaad396778d38138732c50f80e7ef6f3..55d815a1ef7cef18e7b40511c45a6550ae672a3c 100644 --- a/src/gpu/cl/operators/ClGemmConv2d.cpp +++ b/src/gpu/cl/operators/ClGemmConv2d.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -28,10 +28,12 @@ #include "arm_compute/core/Size2D.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/kernels/ClActivationKernel.h" @@ -41,8 +43,6 @@ #include "src/gpu/cl/operators/ClGemm.h" #include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h" #include "src/gpu/cl/utils/ClAuxTensorHandler.h" - -#include "src/common/utils/Log.h" #include "support/Cast.h" namespace arm_compute @@ -53,18 +53,38 @@ using namespace utils::cast; namespace opencl { ClGemmConv2d::ClGemmConv2d() - : _weights_reshape_kernel(nullptr), _im2col_kernel(nullptr), _mm_gemm(nullptr), _mm_gemmlowp(nullptr), _col2im_kernel(nullptr), _activation_kernel(nullptr), _im2col_output(), _weights_reshaped(), - _gemm_output(), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _append_bias(false), _is_prepared(false), _use_post_ops(false), _aux_mem(AuxTensorIdx::Count) + : _weights_reshape_kernel(nullptr), + _im2col_kernel(nullptr), + _mm_gemm(nullptr), + _mm_gemmlowp(nullptr), + _col2im_kernel(nullptr), + _activation_kernel(nullptr), + _im2col_output(), + _weights_reshaped(), + _gemm_output(), + _skip_im2col(false), + _skip_col2im(false), + _is_quantized(false), + _fuse_activation(true), + _append_bias(false), + _is_prepared(false), + _aux_mem(AuxTensorIdx::Count) { } ClGemmConv2d::~ClGemmConv2d() = default; -void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, +void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, const GEMMLowpOutputStageInfo &gemmlowp_output_stage, - int gemm_3d_depth, const ActivationLayerInfo &act_info, const experimental::PostOpList &post_ops) + int gemm_3d_depth, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights); - ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, act_info)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_mm(src, weights, biases, dst, gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, act_info)); const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped false, // is_b_reshaped @@ -76,21 +96,21 @@ void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const I false, // fast_math false, // fp_mixed_precision true, // broadcast_bias - act_info, // activation_info - post_ops // post ops - ); + act_info // activation_info + ); - TensorInfo tmp_src{ *src }; - if(_is_quantized) + TensorInfo tmp_src{*src}; + if (_is_quantized) { - ARM_COMPUTE_ERROR_ON_MSG(post_ops.size() > 0, "ClGemmConv2d quantized types do not support post ops"); // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset const QuantizationInfo input_quantization_info = src->quantization_info(); const QuantizationInfo weights_quantization_info = weights->quantization_info(); - 
tmp_src.set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); - weights->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + tmp_src.set_quantization_info( + QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); + weights->set_quantization_info( + QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); _mm_gemmlowp = std::make_unique<ClGemmLowpMatrixMultiplyCore>(); _mm_gemmlowp->configure(compile_context, &tmp_src, weights, biases, dst, gemm_info); @@ -99,7 +119,7 @@ void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const I weights->set_quantization_info(weights_quantization_info); auto mm_mem_req = _mm_gemmlowp->workspace(); - for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) + for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) { _aux_mem[cont] = mm_mem_req[cont]; } @@ -110,15 +130,21 @@ void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const I _mm_gemm = std::make_unique<ClGemm>(); _mm_gemm->configure(compile_context, &tmp_src, weights, biases, dst, 1.0f, 1.0f, gemm_info); auto mm_mem_req = _mm_gemm->workspace(); - for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) + for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) { _aux_mem[cont] = mm_mem_req[cont]; } } } -Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info, const experimental::PostOpList<ITensorInfo *> &post_ops) +Status ClGemmConv2d::validate_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &gemmlowp_output_stage, + int gemm_3d_depth, + bool skip_im2col, + const ActivationLayerInfo &act_info) { const bool is_quantized = is_data_type_quantized_asymmetric(src->data_type()); @@ -132,13 +158,11 @@ Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weig false, // fast_math false, // fp_mixed_precision true, // broadcast_bias - act_info, // activation_info - post_ops // post ops - ); + act_info // activation_info + ); - if(is_quantized) + if (is_quantized) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(post_ops.size() > 0, "ClGemmConv2d quantized types do not support post ops"); // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset const QuantizationInfo input_quantization_info = src->quantization_info(); @@ -146,8 +170,10 @@ Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weig std::unique_ptr<ITensorInfo> src_qa = src->clone(); std::unique_ptr<ITensorInfo> weights_qa = weights->clone(); - src_qa->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); - weights_qa->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + src_qa->set_quantization_info( + QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); + weights_qa->set_quantization_info( + QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); // Perform 
validation step on GEMMLowp return ClGemmLowpMatrixMultiplyCore::validate(src_qa.get(), weights_qa.get(), biases, dst, gemm_info); @@ -159,14 +185,17 @@ Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weig } } -void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const Conv2dInfo &conv2d_info, const WeightsInfo &weights_info) +void ClGemmConv2d::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(ClGemmConv2d::validate(src, weights, biases, dst, - conv2d_info, - weights_info)); + ARM_COMPUTE_ERROR_THROW_ON(ClGemmConv2d::validate(src, weights, biases, dst, conv2d_info, weights_info)); ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv2d_info, weights_info); const DataType data_type = src->data_type(); @@ -184,30 +213,26 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf _is_prepared = weights_info.retain_internal_weights(); _is_quantized = is_data_type_quantized_asymmetric(src->data_type()); - _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv2d_info.conv_info.stride().first == 1 && conv2d_info.conv_info.stride().second == 1); + _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && + conv2d_info.conv_info.stride().first == 1 && conv2d_info.conv_info.stride().second == 1); _skip_col2im = data_layout == DataLayout::NHWC; // Only for quantize there are few cases where we cannot fuse the activation function in GEMM _fuse_activation = true; - _use_post_ops = conv2d_info.post_ops.size() > 0; const ITensorInfo *gemm_input_to_use = src; ITensorInfo *gemm_output_to_use = dst; // Get parameters from conv_info - unsigned int stride_x = 0; - unsigned int stride_y = 0; + unsigned int stride_x = 0; + unsigned int stride_y = 0; std::tie(stride_x, stride_y) = conv2d_info.conv_info.stride(); // Get convolved dimensions - unsigned int conv_w = 0; - unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), - src->dimension(idx_height), - kernel_width, - kernel_height, - conv2d_info.conv_info, - conv2d_info.dilation); + unsigned int conv_w = 0; + unsigned int conv_h = 0; + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv2d_info.conv_info, conv2d_info.dilation); unsigned int mat_weights_cols = num_kernels / conv2d_info.num_groups; @@ -215,28 +240,31 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf _append_bias = false; _weights_reshape_kernel = std::make_unique(); - if(conv2d_info.num_groups != 1 && biases != nullptr) + if (conv2d_info.num_groups != 1 && biases != nullptr) { // num_groups != 1 can only be for NCHW // Since it is missing an utility function to reshape the biases, we append the biases into the weights tensor biases_to_use = nullptr; _append_bias = true; - _weights_reshape_kernel->configure(compile_context, weights, biases, &_weights_reshaped, conv2d_info.num_groups); + _weights_reshape_kernel->configure(compile_context, weights, biases, &_weights_reshaped, + conv2d_info.num_groups); } else { - _weights_reshape_kernel->configure(compile_context, weights, 
nullptr, &_weights_reshaped, conv2d_info.num_groups); + _weights_reshape_kernel->configure(compile_context, weights, nullptr, &_weights_reshaped, + conv2d_info.num_groups); } // Create tensor to store im2col reshaped inputs - if(!_skip_im2col) + if (!_skip_im2col) { // Configure and tune im2col. im2col output shape is auto-initialized _im2col_kernel = std::make_unique<opencl::kernels::ClIm2ColKernel>(); // Set the GPU target for im2col _im2col_kernel->set_target(CLScheduler::get().target()); - _im2col_kernel->configure(compile_context, src, &_im2col_output, Size2D(kernel_width, kernel_height), conv2d_info.conv_info, _append_bias, conv2d_info.dilation, conv2d_info.num_groups); + _im2col_kernel->configure(compile_context, src, &_im2col_output, Size2D(kernel_width, kernel_height), + conv2d_info.conv_info, _append_bias, conv2d_info.dilation, conv2d_info.num_groups); // Set quantization info _im2col_output.set_quantization_info(src->quantization_info()); @@ -247,7 +275,7 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf } // Create GEMM output tensor - if(!_skip_col2im) + if (!_skip_col2im) { TensorShape shape_gemm; @@ -268,7 +296,7 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf gemmlowp_output_stage.gemmlowp_offset = 0; // Configure output stage for quantized case - if(_is_quantized) + if (_is_quantized) { const auto output_quant_info = (dst->total_size() == 0) ? iq_info : oq_info; const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type()); @@ -291,16 +319,16 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf auto min_activation = min_val.get<int32_t>(); auto max_activation = max_val.get<int32_t>(); - const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; + const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; - if(conv2d_info.act_info.enabled()) + if (conv2d_info.act_info.enabled()) { - if(supported_acts.count(conv2d_info.act_info.activation()) != 0) + if (supported_acts.count(conv2d_info.act_info.activation()) != 0) { - std::tie(min_activation, max_activation) = get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info); + std::tie(min_activation, max_activation) = + get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info); } else { @@ -318,50 +346,60 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? 
conv_h : 0; - configure_mm(compile_context, gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, conv2d_info.act_info, conv2d_info.post_ops); + configure_mm(compile_context, gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use, + gemmlowp_output_stage, gemm_3d_depth, conv2d_info.act_info); - if(!_skip_col2im) + if (!_skip_col2im) { - ARM_COMPUTE_ERROR_ON_MSG(conv2d_info.post_ops.size() > 0, "ClGemmConv2d does not support post ops with col2im operation"); // Post ops must be performed after every other op // Set the GPU target for col2im _col2im_kernel = std::make_unique(); _col2im_kernel->set_target(CLScheduler::get().target()); // Configure and tune Col2Im - _col2im_kernel->configure(compile_context, gemm_output_to_use, dst, Size2D(conv_w, conv_h), conv2d_info.num_groups); + _col2im_kernel->configure(compile_context, gemm_output_to_use, dst, Size2D(conv_w, conv_h), + conv2d_info.num_groups); CLScheduler::get().tune_kernel_static(*_col2im_kernel.get()); } ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h), "Output shape does not match the expected one"); - // Disable running of activation kernel if post ops are used - if(!_fuse_activation && !_use_post_ops) + if (!_fuse_activation) { _activation_kernel = std::make_unique(); _activation_kernel->configure(compile_context, dst, nullptr, conv2d_info.act_info); } - _aux_mem[Im2ColOutput] = MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size()); - _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), MemoryLifetime::Persistent, _weights_reshaped.total_size()); - _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size()); + _aux_mem[Im2ColOutput] = + MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size()); + _aux_mem[WeightsReshaped] = + MemoryInfo(offset_int_vec(WeightsReshaped), MemoryLifetime::Persistent, _weights_reshaped.total_size()); + _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size()); } -Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &conv2d_info, +Status ClGemmConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv2dInfo &conv2d_info, const WeightsInfo &weights_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!"); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type()); - if(!is_quantized_per_channel) + if (!is_quantized_per_channel) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); } ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported"); - 
ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_type() == DataType::QASYMM8), "Grouping (num_groups != 1) is not supported with QASYMM8"); - ARM_COMPUTE_RETURN_ERROR_ON(((src->dimension(2) / weights->dimension(2)) != conv2d_info.num_groups) && (src->data_layout() == DataLayout::NCHW)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), + "Grouping (num_groups != 1) with NHWC data layout is not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_type() == DataType::QASYMM8), + "Grouping (num_groups != 1) is not supported with QASYMM8"); + ARM_COMPUTE_RETURN_ERROR_ON(((src->dimension(2) / weights->dimension(2)) != conv2d_info.num_groups) && + (src->data_layout() == DataLayout::NCHW)); const DataLayout data_layout = src->data_layout(); const DataType data_type = src->data_type(); @@ -381,22 +419,19 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights const ITensorInfo *gemm_output_to_use = dst; const ITensorInfo *weights_to_use = weights; const bool is_quantized = is_data_type_quantized_asymmetric(data_type); - const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv2d_info.conv_info.stride().first == 1 - && conv2d_info.conv_info.stride().second == 1); - const bool skip_col2im = data_layout == DataLayout::NHWC; - bool fuse_activation = true; - bool use_post_ops = conv2d_info.post_ops.size() > 0; + const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && + conv2d_info.conv_info.stride().first == 1 && conv2d_info.conv_info.stride().second == 1); + const bool skip_col2im = data_layout == DataLayout::NHWC; + bool fuse_activation = true; - ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * conv2d_info.num_groups) != src->dimension(idx_channel)); + ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * conv2d_info.num_groups) != + src->dimension(idx_channel)); ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!skip_im2col - && conv2d_info.post_ops.size() > 0, - "ClGemmConv2d does not support post ops with col2im or im2col operation"); // Post ops must be performed after every other op // Validate biases - if(biases != nullptr) + if (biases != nullptr) { - if(is_quantized) + if (is_quantized) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -408,7 +443,7 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); } - if(conv2d_info.act_info.enabled()) + if (conv2d_info.act_info.enabled()) { ARM_COMPUTE_ERROR_ON(conv2d_info.act_info.b() > conv2d_info.act_info.a()); } @@ -417,48 +452,50 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights unsigned int conv_w = 0; unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), - src->dimension(idx_height), - kernel_width, - kernel_height, - conv2d_info.conv_info, - conv2d_info.dilation); + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv2d_info.conv_info, conv2d_info.dilation); unsigned int mat_weights_cols = num_kernels / conv2d_info.num_groups; const ITensorInfo *biases_to_use = biases; bool append_bias = false; - if(conv2d_info.num_groups != 1 && biases != nullptr) + 
if (conv2d_info.num_groups != 1 && biases != nullptr) { // num_groups != 1 can only be for NCHW // Since it is missing an utility function to reshape the biases, we append the biases into the weights tensor - biases_to_use = nullptr; - append_bias = true; - weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, true, conv2d_info.num_groups), 1, data_type); + biases_to_use = nullptr; + append_bias = true; + weights_reshaped_info = + TensorInfo(compute_weights_reshaped_shape(*weights, true, conv2d_info.num_groups), 1, data_type); } else { - weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, false, conv2d_info.num_groups), 1, data_type); + weights_reshaped_info = + TensorInfo(compute_weights_reshaped_shape(*weights, false, conv2d_info.num_groups), 1, data_type); } weights_to_use = &weights_reshaped_info; - if(!skip_im2col) + if (!skip_im2col) { const Size2D kernel_dims(kernel_width, kernel_height); // Output tensor auto initialization if not yet initialized - TensorShape expected_output_shape = compute_im2col_conv_shape(src, kernel_dims, conv2d_info.conv_info, append_bias, conv2d_info.dilation, conv2d_info.num_groups == 1, conv2d_info.num_groups); + TensorShape expected_output_shape = + compute_im2col_conv_shape(src, kernel_dims, conv2d_info.conv_info, append_bias, conv2d_info.dilation, + conv2d_info.num_groups == 1, conv2d_info.num_groups); auto_init_if_empty(im2col_reshaped_info, src->clone()->set_tensor_shape(expected_output_shape)); - ARM_COMPUTE_RETURN_ON_ERROR(opencl::kernels::ClIm2ColKernel::validate(src, &im2col_reshaped_info, kernel_dims, conv2d_info.conv_info, append_bias, conv2d_info.dilation, conv2d_info.num_groups)); + ARM_COMPUTE_RETURN_ON_ERROR( + opencl::kernels::ClIm2ColKernel::validate(src, &im2col_reshaped_info, kernel_dims, conv2d_info.conv_info, + append_bias, conv2d_info.dilation, conv2d_info.num_groups)); gemm_input_to_use = &im2col_reshaped_info; } // Create GEMM output tensor - if(!skip_col2im) + if (!skip_col2im) { TensorShape shape_gemm; @@ -476,7 +513,7 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights gemmlowp_output_stage.gemmlowp_offset = 0; gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel; - if(is_quantized) + if (is_quantized) { const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); @@ -494,16 +531,16 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights int min_activation = 0; int max_activation = 0; - const std::set supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; + const std::set supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; - if(conv2d_info.act_info.enabled()) + if (conv2d_info.act_info.enabled()) { - if(supported_acts.count(conv2d_info.act_info.activation()) != 0) + if (supported_acts.count(conv2d_info.act_info.activation()) != 0) { - std::tie(min_activation, max_activation) = get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info); + std::tie(min_activation, max_activation) = + get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info); } else { @@ -520,18 +557,18 @@ Status ClGemmConv2d::validate(const 
ITensorInfo *src, const ITensorInfo *weights // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0; - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, conv2d_info.act_info, - conv2d_info.post_ops)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, + gemmlowp_output_stage, gemm_3d_depth, skip_im2col, conv2d_info.act_info)); // Validate Col2Im - if(!skip_col2im) + if (!skip_col2im) { - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h), conv2d_info.num_groups)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h), conv2d_info.num_groups)); } // Validate Activation Layer - // Disable running (thus validation) of activation kernel if post ops are used - if(!fuse_activation && !use_post_ops) + if (!fuse_activation) { ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, nullptr, conv2d_info.act_info)); } @@ -554,30 +591,26 @@ void ClGemmConv2d::run(ITensorPack &tensors) CLAuxTensorHandler weights_reshaped(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors, false); // Run im2col - if(!_skip_im2col) + if (!_skip_im2col) { - ITensorPack pack = - { - { TensorType::ACL_SRC, src }, - { TensorType::ACL_DST, im2col_output.get() } - }; + ITensorPack pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, im2col_output.get()}}; CLScheduler::get().enqueue_op(*_im2col_kernel, pack, false); gemm_input_to_use = im2col_output.get(); } - if(!_skip_col2im) + if (!_skip_col2im) { gemm_output_to_use = gemm_output.get(); } ITensorPack pack_mm = tensors; pack_mm.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use); pack_mm.add_const_tensor(TensorType::ACL_SRC_1, weights_reshaped.get()); - if(!_append_bias) + if (!_append_bias) { pack_mm.add_const_tensor(TensorType::ACL_SRC_2, biases); } pack_mm.add_tensor(TensorType::ACL_DST, gemm_output_to_use); // Runs ClGemm or ClGemmLowpMatrixMultiplyCore functions - if(_is_quantized) + if (_is_quantized) { // Run gemmlowp _mm_gemmlowp->run(pack_mm); @@ -589,44 +622,32 @@ void ClGemmConv2d::run(ITensorPack &tensors) } // Reshape output matrix - if(!_skip_col2im) + if (!_skip_col2im) { - ITensorPack pack = - { - { TensorType::ACL_SRC, gemm_output_to_use }, - { TensorType::ACL_DST, dst } - }; + ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}}; CLScheduler::get().enqueue_op(*_col2im_kernel.get(), pack, false); } //Run Activation Layer if we cannot fuse in GEMM - // Disable running of activation kernel if post ops are used - if(!_fuse_activation && !_use_post_ops) + if (!_fuse_activation) { - ITensorPack pack = - { - { TensorType::ACL_SRC, dst }, - { TensorType::ACL_DST, dst } - }; + ITensorPack pack = {{TensorType::ACL_SRC, dst}, {TensorType::ACL_DST, dst}}; CLScheduler::get().enqueue_op(*_activation_kernel.get(), pack, false); } } void ClGemmConv2d::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { // Run weights reshaping and mark original weights tensor as unused - ICLTensor *weights_reshaped_p = utils::cast::polymorphic_downcast(tensors.get_tensor(offset_int_vec(WeightsReshaped))); + ICLTensor *weights_reshaped_p = + 
utils::cast::polymorphic_downcast(tensors.get_tensor(offset_int_vec(WeightsReshaped))); CLAuxTensorHandler weights_reshaped(_weights_reshaped, *weights_reshaped_p); auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - ITensorPack pack = - { - { TensorType::ACL_SRC, weights }, - { TensorType::ACL_DST, weights_reshaped.get() } - }; + ITensorPack pack = {{TensorType::ACL_SRC, weights}, {TensorType::ACL_DST, weights_reshaped.get()}}; - if(_append_bias) + if (_append_bias) { const auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2); pack.add_const_tensor(TensorType::ACL_BIAS, biases); diff --git a/src/gpu/cl/operators/ClGemmConv2d.h b/src/gpu/cl/operators/ClGemmConv2d.h index afde7c511d9915a90ce0110bae208e456887cc44..e8f3147ac30188213f5b57b6543dba16515b6def 100644 --- a/src/gpu/cl/operators/ClGemmConv2d.h +++ b/src/gpu/cl/operators/ClGemmConv2d.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,13 +21,13 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CL_GEMM_CONV2D_H -#define ARM_COMPUTE_CL_GEMM_CONV2D_H +#ifndef ACL_SRC_GPU_CL_OPERATORS_CLGEMMCONV2D_H +#define ACL_SRC_GPU_CL_OPERATORS_CLGEMMCONV2D_H #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/experimental/IPostOp.h" #include "arm_compute/runtime/FunctionDescriptors.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -101,20 +101,29 @@ public: * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. If this is not part of the fully connected layer the weights * tensor has also been transposed with CLGEMMReshapeRHSMatrixKernel. Data type supported: Same as @p input. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &conv2d_info, - const WeightsInfo &weights_info = WeightsInfo()); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to ClGemmConvolution::configure() * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &conv2d_info, + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const Conv2dInfo &conv2d_info, const WeightsInfo &weights_info = WeightsInfo()); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; private: @@ -131,9 +140,14 @@ private: * @param[in] gemm_3d_depth Depth of GEMM 3D * @param[in] act_info Activation to apply after the matrix multiplication */ - void configure_mm(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, + void configure_mm(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, const GEMMLowpOutputStageInfo &gemmlowp_output_stage, - int gemm_3d_depth, const ActivationLayerInfo &act_info, const experimental::PostOpList &post_ops = experimental::PostOpList {}); + int gemm_3d_depth, + const ActivationLayerInfo &act_info); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMConvolutionLayer matrix multiply routines * * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. 
@@ -149,8 +163,14 @@ private: * * @return a status */ - static Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &gemmlowp_output_stage, - int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info, const experimental::PostOpList &post_ops = experimental::PostOpList {}); + static Status validate_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &gemmlowp_output_stage, + int gemm_3d_depth, + bool skip_im2col, + const ActivationLayerInfo &act_info); enum AuxTensorIdx { @@ -178,10 +198,9 @@ private: bool _fuse_activation; bool _append_bias; bool _is_prepared; - bool _use_post_ops; experimental::MemoryRequirements _aux_mem; }; } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_CONV2D_H */ +#endif // ACL_SRC_GPU_CL_OPERATORS_CLGEMMCONV2D_H diff --git a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp index 2622274587352eeb244ecf9fab3fe7bf78451231..71c247de79850db8f93535d084b03e06e9c9626b 100644 --- a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp +++ b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp @@ -52,7 +52,7 @@ namespace { inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type) { - switch(kernel_type) + switch (kernel_type) { case CLGEMMKernelType::NATIVE: case CLGEMMKernelType::RESHAPED_ONLY_RHS: @@ -71,32 +71,41 @@ inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type) inline CLGEMMKernelType auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run) { auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run); - if(bool(gemm_kernel)) + if (bool(gemm_kernel)) { - if(validate_gemm_kernel(gemm_kernel.gemm_type)) + if (validate_gemm_kernel(gemm_kernel.gemm_type)) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", + to_string(gemm_kernel.gemm_type).c_str()); return gemm_kernel.gemm_type; } } gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", + to_string(gemm_kernel.gemm_type).c_str()); return gemm_kernel.gemm_type; } // Validate lhs_info and rhs_info for native kernel -inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const GEMMReshapeInfo &reshape_info) +inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const ITensorInfo *a, + const ITensorInfo *b, + const GEMMReshapeInfo &reshape_info) { // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel TensorInfo mm_result_s32_info{}; // Output tensor auto initialization if not yet initialized - auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*a, *b, false, reshape_info)).set_data_type(DataType::S32)); + auto_init_if_empty( + mm_result_s32_info, + 
a->clone()->set_tensor_shape(compute_mm_shape(*a, *b, false, reshape_info)).set_data_type(DataType::S32)); // Validate mm kernel // NOTE: Ignore all other parameters (eg. output stage etc.) and only validate lhs and rhs info // NOTE: This assumes: // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_arguments). // 2. lhs and rhs info does not cause window and padding issues through side effects (in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_and_configure_window). - if(!bool(ClGemmLowpMatrixMultiplyNativeKernel::validate(a, b, &mm_result_s32_info, lhs_info, rhs_info, reshape_info))) + if (!bool(ClGemmLowpMatrixMultiplyNativeKernel::validate(a, b, &mm_result_s32_info, lhs_info, rhs_info, + reshape_info))) { return false; } @@ -104,31 +113,45 @@ inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info, cons } // Automatically select between mlgo (prioritized) and default heuristics for native kernel configs -std::pair auto_select_gemm_config_native(auto_heuristics::CommonQuery query, const ITensorInfo *a, const ITensorInfo *b, const GEMMReshapeInfo &reshape_info) +std::pair auto_select_gemm_config_native(auto_heuristics::CommonQuery query, + const ITensorInfo *a, + const ITensorInfo *b, + const GEMMReshapeInfo &reshape_info) { auto config = auto_heuristics::select_mlgo_gemm_config_native(query); - if(config) + if (config) { - if(validate_lhs_rhs_info_native(config.lhs_info, config.rhs_info, a, b, reshape_info)) + if (validate_lhs_rhs_info_native(config.lhs_info, config.rhs_info, a, b, reshape_info)) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use native config from mlgo heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } } config = auto_heuristics::select_default_gemm_config_native(query); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from default heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } // Validate lhs_info and rhs_info for reshaped only rhs kernel -inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, - unsigned int m, unsigned int n, unsigned int k, bool reinterpret_input_as_3d, int depth_output_gemm3d) +inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *output, + unsigned int m, + unsigned int n, + unsigned int k, + bool reinterpret_input_as_3d, + int depth_output_gemm3d) { // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel TensorInfo tmp_b_info{}; // Validate reshape RHS kernel auto_init_if_empty(tmp_b_info, 
b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) + if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) { return false; } @@ -148,7 +171,8 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs // Since we ignore the output stage, output data type has to be S32 to pass the validation TensorInfo output_info_copy(*output); output_info_copy.set_data_type(DataType::S32); - if(!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info))) + if (!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, &output_info_copy, + gemm_kernel_info))) { return false; } @@ -156,14 +180,22 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs } // Validate lhs_info and rhs_info for reshaped only rhs kernel -inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, - unsigned int m, unsigned int n, unsigned int k, bool reinterpret_input_as_3d, int depth_output_gemm3d) +inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *output, + unsigned int m, + unsigned int n, + unsigned int k, + bool reinterpret_input_as_3d, + int depth_output_gemm3d) { // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel TensorInfo tmp_b_info{}; // Validate reshape RHS kernel auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) + if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) { return false; } @@ -183,7 +215,8 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo // Since we ignore the output stage, output data type has to be S32 to pass the validation TensorInfo output_info_copy(*output); output_info_copy.set_data_type(DataType::S32); - if(!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info))) + if (!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(a, &tmp_b_info, &output_info_copy, + gemm_kernel_info))) { return false; } @@ -191,40 +224,55 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo } // Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs -std::pair auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d, - const ITensorInfo *a, - const ITensorInfo *b, const ITensorInfo *output) +std::pair +auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, + bool reinterpret_input_as_3d, + int depth_output_gemm3d, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *output) { auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query); - if(config) + if (config) { - if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, query.k, reinterpret_input_as_3d, depth_output_gemm3d)) + if (validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, 
config.rhs_info, a, b, output, query.m, query.n, + query.k, reinterpret_input_as_3d, depth_output_gemm3d)) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } } config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } // Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs -std::pair auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d, - const ITensorInfo *a, - const ITensorInfo *b, const ITensorInfo *output) +std::pair +auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery query, + bool reinterpret_input_as_3d, + int depth_output_gemm3d, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *output) { ARM_COMPUTE_UNUSED(a, b, output, reinterpret_input_as_3d, depth_output_gemm3d); auto config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query); - validate_lhs_rhs_info_reshaped_only_rhs_mmul(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, query.k, reinterpret_input_as_3d, depth_output_gemm3d); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs_mmul config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), - to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + validate_lhs_rhs_info_reshaped_only_rhs_mmul(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, + query.k, reinterpret_input_as_3d, depth_output_gemm3d); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped_only_rhs_mmul config from default heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } inline bool is_gemm_reshaped(CLGEMMKernelType kernel_type) { - switch(kernel_type) + switch (kernel_type) { case CLGEMMKernelType::NATIVE: return false; @@ -254,8 +302,11 @@ ClGemmLowpMatrixMultiplyCore::ClGemmLowpMatrixMultiplyCore() ClGemmLowpMatrixMultiplyCore::~ClGemmLowpMatrixMultiplyCore() = default; void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context, - ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, - const GEMMInfo &gemm_info) + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); ARM_COMPUTE_ERROR_THROW_ON(ClGemmLowpMatrixMultiplyCore::validate(a, b, c, output, gemm_info)); @@ -263,8 +314,8 @@ void 
ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); _a_offset = a->quantization_info().uniform().offset; - _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type()) - && a->data_type() == DataType::QASYMM8; + _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && + is_data_type_quantized_symmetric(b->data_type()) && a->data_type() == DataType::QASYMM8; _b_offset = _convert_to_qasymm8 ? -128 : b->quantization_info().uniform().offset; _gemm_info = gemm_info; @@ -282,17 +333,18 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con // Arguments used by GEMMReshapeInfo // in order to know how the matrices have been reshaped bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); const auto reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d); - _gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run); + _gemm_kernel_type = auto_select_gemm_kernel( + auto_heuristics::CommonQuery{gpu_target, a->data_type(), m, n, k, batch_size}, _reshape_b_only_on_first_run); - if(_convert_to_qasymm8) + if (_convert_to_qasymm8) { // Set data type for converted weights _qasymm8_weights = *b; @@ -301,47 +353,50 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con } ITensorInfo *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b; - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) { matrix_b = &_tmp_b; // Pick up the GEMM configuration // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, reinterpret_input_as_3d, - depth_output_gemm3d, - a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output); + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, reinterpret_input_as_3d, + depth_output_gemm3d, a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output); // Configure reshape RHS kernel - _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info); + _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? 
&_qasymm8_weights : b, &_tmp_b, + rhs_info); } - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) { matrix_b = &_tmp_b; // Pick up the GEMM configuration // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, reinterpret_input_as_3d, - depth_output_gemm3d, - a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output); + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs_mmul( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, reinterpret_input_as_3d, + depth_output_gemm3d, a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output); // Configure reshape RHS kernel - _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info); + _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, + rhs_info); } // Using default reduction info - const GEMMLowpReductionKernelInfo reduction_info {}; + const GEMMLowpReductionKernelInfo reduction_info{}; // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0) + if (_a_offset != 0) { _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32); // Configure Matrix B reduction kernel - _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info); + _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, + &_vector_sum_col, reduction_info); } // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) + if (_b_offset != 0) { _vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); @@ -360,17 +415,19 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con gemm_kernel_info.a_offset = _a_offset; gemm_kernel_info.b_offset = _b_offset; // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage - if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) + if (gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) { // Configure offset contribution kernel - const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1; + const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) + ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() + : 1; _gemm_output_stage_multipliers = TensorInfo(TensorShape(num_filters), 1, DataType::S32); _gemm_output_stage_shifts = TensorInfo(TensorShape(num_filters), 1, DataType::S32); GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage(); gemmlowp_output_stage.output_data_type = a->data_type(); - if(num_filters == 1) + if (num_filters == 1) { // Per-channel quantization with OFM == 1 is equivalent to uniform quantization. 
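The num_filters expression above determines how many requantization multiplier/shift pairs the fused output stage must store: one pair per output channel when the output stage is quantized per channel, otherwise a single pair, and the per-channel case with a single output channel collapses to the uniform one. A standalone sketch of that decision, using a simplified stand-in rather than the library's GEMMLowpOutputStageInfo:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative stand-in for the GEMMLowp output-stage metadata.
struct OutputStageInfo
{
    bool                 is_quantized_per_channel{false};
    std::vector<int32_t> multipliers{};
    std::vector<int32_t> shifts{};
};

// Returns how many multiplier/shift pairs the fused output stage must store.
std::size_t num_output_stage_filters(const OutputStageInfo &info)
{
    return info.is_quantized_per_channel ? info.multipliers.size() : 1;
}

int main()
{
    OutputStageInfo uniform{};                                   // one multiplier/shift pair
    OutputStageInfo per_channel{true, {10, 20, 30}, {1, 2, 3}};  // one pair per output channel

    std::cout << num_output_stage_filters(uniform) << '\n';      // prints 1
    std::cout << num_output_stage_filters(per_channel) << '\n';  // prints 3
}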
// Setting this flag to false prevents the kernel from adding useless padding to the output multipliers and shifts @@ -379,55 +436,67 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con gemm_kernel_info.output_stage = gemmlowp_output_stage; - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS && + gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { // Configure and tune matrix multiply kernel with fused output stage - _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, - _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); + _mm_reshaped_only_rhs_kernel->configure( + compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, + &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); } - else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + else if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL && + gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { // Configure and tune matrix multiply kernel with fused output stage - _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, - _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); + _mm_reshaped_only_rhs_mmul_kernel->configure( + compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, + &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); } else { _run_output_stage = true; - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) { - _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info); + _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, + gemm_kernel_info); } - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) { - _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info); + _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, + gemm_kernel_info); } else { // Pick up the GEMM configuration // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, - a, _convert_to_qasymm8 ? &_qasymm8_weights : matrix_b, reshape_info); + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, a, + _convert_to_qasymm8 ? 
&_qasymm8_weights : matrix_b, reshape_info); // Configure matrix multiply kernel - _mm_native_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, reshape_info); - - _offset_contribution_output_stage_kernel->configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, - c != nullptr ? c : nullptr, output, a->dimension(0), _a_offset, _b_offset, gemmlowp_output_stage, - &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); + _mm_native_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, + reshape_info); + + _offset_contribution_output_stage_kernel->configure( + compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, output, a->dimension(0), + _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers, + &_gemm_output_stage_shifts); } } } else { _run_offset_contribution = true; - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) { // Configure and tune matrix multiply kernel _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info); } - else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) + else if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) { // Configure and tune matrix multiply kernel _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info); @@ -436,44 +505,65 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con { // Pick up the GEMM configuration // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, - a, _convert_to_qasymm8 ? &_qasymm8_weights : b, reshape_info); + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, a, + _convert_to_qasymm8 ? &_qasymm8_weights : b, reshape_info); // Configure matrix multiply kernel _mm_native_kernel->configure(compile_context, a, matrix_b, output, lhs_info, rhs_info, reshape_info); } // Configure offset contribution kernel - _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, - c != nullptr ? c : nullptr, a->dimension(0), _a_offset, _b_offset); + _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, + a->dimension(0), _a_offset, _b_offset); } // Request memory - _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _qasymm8_weights.total_size()); - if(is_gemm_reshaped(_gemm_kernel_type)) + _aux_mem[RhsQAsymm8] = + MemoryInfo(offset_int_vec(RhsQAsymm8), + _reshape_b_only_on_first_run ? 
MemoryLifetime::Persistent : MemoryLifetime::Temporary, + _qasymm8_weights.total_size()); + if (is_gemm_reshaped(_gemm_kernel_type)) { // Overwrite Rhs as prepare if gemm is reshaped as there will be a two-step transformation - _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Prepare : MemoryLifetime::Temporary, _qasymm8_weights.total_size()); - _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); - } - if(_a_offset != 0) - { - _aux_mem[VecSumCol] = MemoryInfo(offset_int_vec(VecSumCol), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _vector_sum_col.total_size()); - } - if(_b_offset != 0) - { - _aux_mem[VecSumRow] = MemoryInfo(offset_int_vec(VecSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size()); - } - _aux_mem[ResultS32] = MemoryInfo(offset_int_vec(ResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size()); - _aux_mem[Multipliers] = MemoryInfo(offset_int_vec(Multipliers), MemoryLifetime::Persistent, _gemm_output_stage_multipliers.total_size()); - _aux_mem[Shifts] = MemoryInfo(offset_int_vec(Shifts), MemoryLifetime::Persistent, _gemm_output_stage_shifts.total_size()); + _aux_mem[RhsQAsymm8] = + MemoryInfo(offset_int_vec(RhsQAsymm8), + _reshape_b_only_on_first_run ? MemoryLifetime::Prepare : MemoryLifetime::Temporary, + _qasymm8_weights.total_size()); + _aux_mem[RhsReshape] = MemoryInfo( + offset_int_vec(RhsReshape), + _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); + } + if (_a_offset != 0) + { + _aux_mem[VecSumCol] = + MemoryInfo(offset_int_vec(VecSumCol), + _reshape_b_only_on_first_run ? 
MemoryLifetime::Persistent : MemoryLifetime::Temporary, + _vector_sum_col.total_size()); + } + if (_b_offset != 0) + { + _aux_mem[VecSumRow] = + MemoryInfo(offset_int_vec(VecSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size()); + } + _aux_mem[ResultS32] = MemoryInfo(offset_int_vec(ResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size()); + _aux_mem[Multipliers] = MemoryInfo(offset_int_vec(Multipliers), MemoryLifetime::Persistent, + _gemm_output_stage_multipliers.total_size()); + _aux_mem[Shifts] = + MemoryInfo(offset_int_vec(Shifts), MemoryLifetime::Persistent, _gemm_output_stage_shifts.total_size()); } -Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info) +Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8 && b->data_type() == DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8_SIGNED && b->data_type() == DataType::QASYMM8); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); @@ -492,39 +582,44 @@ Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso const GPUTarget gpu_target = CLScheduler::get().target(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - bool reshape_matrix_b = is_gemm_reshaped(auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, gemm_info.reshape_b_only_on_first_run())); + bool reshape_matrix_b = is_gemm_reshaped( + auto_select_gemm_kernel(auto_heuristics::CommonQuery{gpu_target, a->data_type(), m, n, k, batch_size}, + gemm_info.reshape_b_only_on_first_run())); const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d); - bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type()) - && is_data_type_quantized_asymmetric(a->data_type()); + bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && + is_data_type_quantized_symmetric(b->data_type()) && + is_data_type_quantized_asymmetric(a->data_type()); TensorInfo weights_info(*b); - if(convert_to_qasymm8) + if (convert_to_qasymm8) { b_offset = -128; weights_info.set_data_type(DataType::QASYMM8); ARM_COMPUTE_RETURN_ON_ERROR(ClCastKernel::validate(b, &weights_info, ConvertPolicy::WRAP)); } const ITensorInfo *matrix_b_info = &weights_info; - if(reshape_matrix_b) + if (reshape_matrix_b) { matrix_b_info = &tmp_b_info; // Pick up the GEMM configuration // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - const auto res = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }); - lhs_info = res.lhs_info; - rhs_info = res.rhs_info; + const auto res = select_default_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}); + lhs_info = res.lhs_info; + rhs_info = res.rhs_info; // Validate reshape RHS kernel - auto_init_if_empty(tmp_b_info, weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info))); + auto_init_if_empty(tmp_b_info, + weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info))); ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info)); } @@ -533,21 +628,23 @@ Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso const GEMMLowpReductionKernelInfo reduction_info; // Validate matrix B reduction kernel only if _a_offset is not equal to 0 - if(a_offset != 0) + if (a_offset != 0) { info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32); // Configure Matrix B reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + ClGemmLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info)); } // Validate Matrix A reduction kernel only if _b_offset is not equal to 0 - if(b_offset != 0) + if (b_offset != 0) { info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); // Configure matrix A reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + ClGemmLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, 
reduction_info)); } GEMMKernelInfo gemm_kernel_info; @@ -560,92 +657,99 @@ Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso gemm_kernel_info.rhs_info = rhs_info; gemm_kernel_info.a_offset = a_offset; gemm_kernel_info.b_offset = b_offset; - if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) + if (gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) { - const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1; + const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) + ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() + : 1; - const TensorInfo gemm_output_stage_multipliers_shifts_info(TensorInfo(TensorShape(num_filters), 1, DataType::S32)); + const TensorInfo gemm_output_stage_multipliers_shifts_info( + TensorInfo(TensorShape(num_filters), 1, DataType::S32)); GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage(); gemmlowp_output_stage.output_data_type = a->data_type(); gemm_kernel_info.output_stage = gemmlowp_output_stage; - if(reshape_matrix_b && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (reshape_matrix_b && + gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - c, - &gemm_output_stage_multipliers_shifts_info, - &gemm_output_stage_multipliers_shifts_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate( + matrix_a_info, matrix_b_info, output, gemm_kernel_info, a_offset == 0 ? nullptr : &info_vector_sum_col, + b_offset == 0 ? 
nullptr : &info_vector_sum_row, c, &gemm_output_stage_multipliers_shifts_info, + &gemm_output_stage_multipliers_shifts_info)); } else { TensorInfo mm_result_s32_info{}; - if(reshape_matrix_b) + if (reshape_matrix_b) { // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32)); + auto_init_if_empty(mm_result_s32_info, a->clone() + ->set_tensor_shape(compute_mm_shape( + *matrix_a_info, *matrix_b_info, reshape_info)) + .set_data_type(DataType::S32)); // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate( + matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info)); } else { // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32)); + auto_init_if_empty(mm_result_s32_info, a->clone() + ->set_tensor_shape(compute_mm_shape( + *matrix_a_info, *matrix_b_info, false, reshape_info)) + .set_data_type(DataType::S32)); // Pick up the GEMM configuration // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }); - lhs_info = res.lhs_info; - rhs_info = res.rhs_info; + const auto res = select_default_gemm_config_native( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}); + lhs_info = res.lhs_info; + rhs_info = res.rhs_info; // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate( + matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info)); } // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - c, - output, - a_offset, b_offset, - gemmlowp_output_stage, - &gemm_output_stage_multipliers_shifts_info, - &gemm_output_stage_multipliers_shifts_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionOutputStageKernel::validate( + &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col, + b_offset == 0 ? 
nullptr : &info_vector_sum_row, c, output, a_offset, b_offset, gemmlowp_output_stage, + &gemm_output_stage_multipliers_shifts_info, &gemm_output_stage_multipliers_shifts_info)); } } else { - if(reshape_matrix_b) + if (reshape_matrix_b) { // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate( + matrix_a_info, matrix_b_info, output, gemm_kernel_info)); } else { // Pick up the GEMM configuration // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }); - lhs_info = res.lhs_info; - rhs_info = res.rhs_info; + const auto res = select_default_gemm_config_native( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}); + lhs_info = res.lhs_info; + rhs_info = res.rhs_info; // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate( + matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info)); } - if(output->total_size() != 0) + if (output->total_size() != 0) { // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionKernel::validate(output, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - c, - a_offset, b_offset)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionKernel::validate( + output, a_offset == 0 ? nullptr : &info_vector_sum_col, b_offset == 0 ? nullptr : &info_vector_sum_row, + c, a_offset, b_offset)); } } @@ -675,73 +779,61 @@ void ClGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) const ITensor *matrix_a = a; const ITensor *matrix_b = _convert_to_qasymm8 ? rhs_qasymm8.get() : b; - if(is_gemm_reshaped(_gemm_kernel_type)) + if (is_gemm_reshaped(_gemm_kernel_type)) { matrix_b = tmp_b.get(); - if(!_reshape_b_only_on_first_run) + if (!_reshape_b_only_on_first_run) { // Run reshape matrix B - ITensorPack mtx_b_reshape_pack = - { - { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b }, - { TensorType::ACL_DST, tmp_b.get() } - }; + ITensorPack mtx_b_reshape_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b}, + {TensorType::ACL_DST, tmp_b.get()}}; CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_reshape_pack, false); } } // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0 && !_reshape_b_only_on_first_run) + if (_a_offset != 0 && !_reshape_b_only_on_first_run) { - ITensorPack mtx_b_red_pack = - { - { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b }, - { TensorType::ACL_DST, vec_sum_col.get() } - }; + ITensorPack mtx_b_red_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? 
rhs_qasymm8.get() : b}, + {TensorType::ACL_DST, vec_sum_col.get()}}; CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false); } // Run matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) + if (_b_offset != 0) { - ITensorPack mtx_a_red_pack = - { - { TensorType::ACL_SRC, matrix_a }, - { TensorType::ACL_DST, vec_sum_row.get() } - }; + ITensorPack mtx_a_red_pack = {{TensorType::ACL_SRC, matrix_a}, {TensorType::ACL_DST, vec_sum_row.get()}}; CLScheduler::get().enqueue_op(*_mtx_a_reduction_kernel, mtx_a_red_pack, false); } // Run matrix multiply - if(is_gemm_reshaped(_gemm_kernel_type)) + if (is_gemm_reshaped(_gemm_kernel_type)) { ITensorPack gemm_reshaped_pack; - if(_run_offset_contribution) + if (_run_offset_contribution) { - gemm_reshaped_pack = ITensorPack({ { TensorType::ACL_SRC_0, matrix_a }, - { TensorType::ACL_SRC_1, matrix_b }, - { TensorType::ACL_DST, _run_output_stage ? res32.get() : dst } - }); + gemm_reshaped_pack = ITensorPack({{TensorType::ACL_SRC_0, matrix_a}, + {TensorType::ACL_SRC_1, matrix_b}, + {TensorType::ACL_DST, _run_output_stage ? res32.get() : dst}}); } else { - gemm_reshaped_pack = ITensorPack( - { - { TensorType::ACL_SRC, matrix_a }, - { TensorType::ACL_SRC_1, matrix_b }, - { TensorType::ACL_BIAS, c }, - { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() }, - { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() }, - { TensorType::ACL_SHIFTS, shifts.get() }, - { TensorType::ACL_MULTIPLIERS, multipliers.get() }, - { TensorType::ACL_DST, dst }, + gemm_reshaped_pack = ITensorPack({ + {TensorType::ACL_SRC, matrix_a}, + {TensorType::ACL_SRC_1, matrix_b}, + {TensorType::ACL_BIAS, c}, + {TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get()}, + {TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get()}, + {TensorType::ACL_SHIFTS, shifts.get()}, + {TensorType::ACL_MULTIPLIERS, multipliers.get()}, + {TensorType::ACL_DST, dst}, }); } - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) { CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_pack, false); } - else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) + else if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) { CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_mmul_kernel, gemm_reshaped_pack, false); } @@ -752,46 +844,39 @@ void ClGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) } else { - ITensorPack gemm_native_pack = - { - { TensorType::ACL_SRC_0, matrix_a }, - { TensorType::ACL_SRC_1, matrix_b }, - { TensorType::ACL_DST, _run_offset_contribution ? dst : res32.get() } - }; + ITensorPack gemm_native_pack = {{TensorType::ACL_SRC_0, matrix_a}, + {TensorType::ACL_SRC_1, matrix_b}, + {TensorType::ACL_DST, _run_offset_contribution ? dst : res32.get()}}; CLScheduler::get().enqueue_op(*_mm_native_kernel, gemm_native_pack, false); } - if(_run_output_stage) + if (_run_output_stage) { // Run offset contribution/output stage kernel - ITensorPack output_stage_pack = - { - { TensorType::ACL_SRC, res32.get() }, - { TensorType::ACL_BIAS, c }, - { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() }, - { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? 
nullptr : vec_sum_col.get() }, - { TensorType::ACL_SHIFTS, shifts.get() }, - { TensorType::ACL_MULTIPLIERS, multipliers.get() }, - { TensorType::ACL_DST, dst }, + ITensorPack output_stage_pack = { + {TensorType::ACL_SRC, res32.get()}, + {TensorType::ACL_BIAS, c}, + {TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get()}, + {TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get()}, + {TensorType::ACL_SHIFTS, shifts.get()}, + {TensorType::ACL_MULTIPLIERS, multipliers.get()}, + {TensorType::ACL_DST, dst}, }; CLScheduler::get().enqueue_op(*_offset_contribution_output_stage_kernel, output_stage_pack, true); } - if(_run_offset_contribution) + if (_run_offset_contribution) { // Run offset contribution kernel - ITensorPack offset_contrib_pack = - { - { TensorType::ACL_SRC_DST, dst }, - { TensorType::ACL_BIAS, c }, - { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() }, - { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() } - }; + ITensorPack offset_contrib_pack = {{TensorType::ACL_SRC_DST, dst}, + {TensorType::ACL_BIAS, c}, + {TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get()}, + {TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get()}}; CLScheduler::get().enqueue_op(*_offset_contribution_kernel, offset_contrib_pack, true); } } void ClGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1); CLAuxTensorHandler tmp_b(offset_int_vec(RhsReshape), _tmp_b, tensors, true); @@ -800,56 +885,55 @@ void ClGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors) ARM_COMPUTE_ERROR_ON_NULLPTR(b); - if(_convert_to_qasymm8) + if (_convert_to_qasymm8) { - ITensorPack convert_to_qs8_pack = { { ACL_SRC, b }, { ACL_DST, rhs_qasymm8.get() } }; + ITensorPack convert_to_qs8_pack = {{ACL_SRC, b}, {ACL_DST, rhs_qasymm8.get()}}; CLScheduler::get().enqueue_op(*_weights_to_qasymm8, convert_to_qs8_pack, false); b->mark_as_unused(); } - if(is_gemm_reshaped(_gemm_kernel_type) && _reshape_b_only_on_first_run) + if (is_gemm_reshaped(_gemm_kernel_type) && _reshape_b_only_on_first_run) { // Run reshape kernel and mark original weights tensor as unused - ITensorPack mtx_b_pack = - { - { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b }, - { TensorType::ACL_DST, tmp_b.get() } - }; + ITensorPack mtx_b_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b}, + {TensorType::ACL_DST, tmp_b.get()}}; CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_pack, false); b->mark_as_unused(); } // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0 && _reshape_b_only_on_first_run) + if (_a_offset != 0 && _reshape_b_only_on_first_run) { - ITensorPack mtx_b_red_pack = - { - { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b }, - { TensorType::ACL_DST, vec_sum_col.get() } - }; + ITensorPack mtx_b_red_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b}, + {TensorType::ACL_DST, vec_sum_col.get()}}; CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false); } // Compute GEMM output multipliers and shifts for output stage { - const size_t num_filters = (_gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1; + const size_t num_filters = (_gemm_info.gemmlowp_output_stage().is_quantized_per_channel) + ? 
_gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() + : 1; CLAuxTensorHandler multipliers(offset_int_vec(Multipliers), _gemm_output_stage_multipliers, tensors, false); CLAuxTensorHandler shifts(offset_int_vec(Shifts), _gemm_output_stage_shifts, tensors, false); ICLTensor *multiplier_tensor = multipliers.get(); - if(multiplier_tensor != nullptr && multiplier_tensor->info()->total_size() > 0) + if (multiplier_tensor != nullptr && multiplier_tensor->info()->total_size() > 0) { multiplier_tensor->map(CLScheduler::get().queue(), true); - std::memcpy(multiplier_tensor->ptr_to_element(Coordinates(0)), _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), num_filters * sizeof(int32_t)); + std::memcpy(multiplier_tensor->ptr_to_element(Coordinates(0)), + _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), + num_filters * sizeof(int32_t)); multiplier_tensor->unmap(CLScheduler::get().queue()); } ICLTensor *shifts_tensor = shifts.get(); - if(shifts.get() != nullptr && shifts_tensor->info()->total_size() > 0) + if (shifts.get() != nullptr && shifts_tensor->info()->total_size() > 0) { shifts_tensor->map(CLScheduler::get().queue(), true); - std::memcpy(shifts_tensor->ptr_to_element(Coordinates(0)), _gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t)); + std::memcpy(shifts_tensor->ptr_to_element(Coordinates(0)), + _gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t)); shifts_tensor->unmap(CLScheduler::get().queue()); } } diff --git a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h index 6e32a90fc47978c8c23f7009c472c9051cf9c7ce..c80dc3a18225d60ca36b33ac0ce8585b73ae25aa 100644 --- a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h +++ b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h @@ -93,18 +93,27 @@ public: * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and * if the reshape of matrix B should be executed only for the first run */ - void configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo()); + void configure(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + const GEMMInfo &gemm_info = GEMMInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to ClGemmLowpMatrixMultiplyCore::configure() * * @return a status */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo()); + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info = GEMMInfo()); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; private: @@ -130,7 +139,7 @@ private: std::unique_ptr _mtx_a_reduction_kernel; std::unique_ptr _mtx_b_reduction_kernel; std::unique_ptr _offset_contribution_kernel; - std::unique_ptr _offset_contribution_output_stage_kernel; + std::unique_ptr _offset_contribution_output_stage_kernel; // Temporary tensors TensorInfo _qasymm8_weights{}; @@ -141,13 +150,13 @@ private: TensorInfo 
_gemm_output_stage_multipliers{}; TensorInfo _gemm_output_stage_shifts{}; - int32_t _a_offset{ 0 }; - int32_t _b_offset{ 0 }; - bool _reshape_b_only_on_first_run{ false }; - bool _run_output_stage{ false }; - bool _convert_to_qasymm8{ false }; - bool _run_offset_contribution{ false }; - bool _is_prepared{ false }; + int32_t _a_offset{0}; + int32_t _b_offset{0}; + bool _reshape_b_only_on_first_run{false}; + bool _run_output_stage{false}; + bool _convert_to_qasymm8{false}; + bool _run_offset_contribution{false}; + bool _is_prepared{false}; GEMMInfo _gemm_info{}; CLGEMMKernelType _gemm_kernel_type{}; diff --git a/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp b/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp index a61b11a3b15baf5c0635416be111300b72fc76d5..e3363e36857e0d8bb94e96e390649b09f36212b1 100644 --- a/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp +++ b/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp @@ -27,22 +27,25 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h" #include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h" #include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClGemmLowpOutputStage::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info) +void ClGemmLowpOutputStage::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_LOG_PARAMS(src, bias, dst, info); - switch(info.type) + switch (info.type) { case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: { @@ -70,12 +73,16 @@ void ClGemmLowpOutputStage::configure(const CLCompileContext &compile_context, c } } -Status ClGemmLowpOutputStage::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info) +Status ClGemmLowpOutputStage::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM16); - switch(info.type) + switch (info.type) { case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(src, bias, dst, &info); @@ -94,7 +101,7 @@ void ClGemmLowpOutputStage::run(ITensorPack &tensors) const ITensor *bias = tensors.get_const_tensor(ACL_BIAS); ITensor *dst = tensors.get_tensor(ACL_DST); - ITensorPack pack{ { ACL_SRC, src }, { ACL_BIAS, bias }, { ACL_DST, dst } }; + ITensorPack pack{{ACL_SRC, src}, {ACL_BIAS, bias}, {ACL_DST, dst}}; CLScheduler::get().enqueue_op(*_kernel, pack, true); } } // namespace opencl diff --git a/src/gpu/cl/operators/ClGemmLowpOutputStage.h b/src/gpu/cl/operators/ClGemmLowpOutputStage.h index 3f1b04dcce9d3e52dd94bcffa608139b56680832..6357e0200be30084b6dc3db9dd79f6465c557730 100644 --- a/src/gpu/cl/operators/ClGemmLowpOutputStage.h +++ 
b/src/gpu/cl/operators/ClGemmLowpOutputStage.h @@ -71,14 +71,21 @@ public: * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED * @param[in] info GEMMLowp output stage metadata. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClGemmLowpOutputStage::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info); // Inherited methods overridden: void run(ITensorPack &tensors) override; diff --git a/src/gpu/cl/operators/ClIndirectConv2d.cpp b/src/gpu/cl/operators/ClIndirectConv2d.cpp index b90097457474bdbc25bda0b8f011d922ef95d825..777fc9e5e14c556ac65587d5d15288cfb3ad8c66 100644 --- a/src/gpu/cl/operators/ClIndirectConv2d.cpp +++ b/src/gpu/cl/operators/ClIndirectConv2d.cpp @@ -27,16 +27,15 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h" #include "src/gpu/cl/kernels/ClIndirectConv2dKernel.h" +#include "src/gpu/cl/utils/ClAuxTensorHandler.h" #include "src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h" #include "src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h" -#include "src/core/helpers/MemoryHelpers.h" -#include "src/gpu/cl/utils/ClAuxTensorHandler.h" - -#include "src/common/utils/Log.h" - using namespace arm_compute::cl_indirect_conv; namespace arm_compute @@ -47,7 +46,8 @@ using namespace arm_compute::experimental; namespace { -DirectConvComputeKernelInfo config_indirect_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo +config_indirect_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info) { // Get GPU target GPUTarget gpu_target = CLScheduler::get().target(); @@ -59,8 +59,13 @@ DirectConvComputeKernelInfo config_indirect_convolution_nhwc(const ITensorInfo * } // namespace -void ClIndirectConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void ClIndirectConv2d::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info); @@ -86,25 +91,29 @@ void ClIndirectConv2d::configure(const CLCompileContext &compile_context, ITenso CLScheduler::get().tune_kernel_static(*_indirect_conv_kernel); // Request memory for the indirect buffer - _aux_mem[IndirectBuffer] = MemoryInfo(offset_int_vec(IndirectBuffer), MemoryLifetime::Persistent, 
_indirect_buffer.total_size()); + _aux_mem[IndirectBuffer] = + MemoryInfo(offset_int_vec(IndirectBuffer), MemoryLifetime::Persistent, _indirect_buffer.total_size()); } -Status ClIndirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +Status ClIndirectConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { // Initialize the direct convolution descriptor const DirectConvComputeKernelInfo desc = config_indirect_convolution_nhwc(src, weights, conv_info); - TensorShape ind_buffer_shape = misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), - src->data_layout(), - weights->tensor_shape(), - conv_info, - desc); + TensorShape ind_buffer_shape = misc::shape_calculator::compute_indirect_buffer_shape( + src->tensor_shape(), src->data_layout(), weights->tensor_shape(), conv_info, desc); TensorInfo indirect_buffer(ind_buffer_shape, 1, DataType::S32); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dAddressPrecalculationKernel::validate(src, weights, &indirect_buffer, conv_info, desc)); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dKernel::validate(src, weights, biases, &indirect_buffer, dst, conv_info, act_info, desc)); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dAddressPrecalculationKernel::validate( + src, weights, &indirect_buffer, conv_info, desc)); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dKernel::validate(src, weights, biases, &indirect_buffer, dst, + conv_info, act_info, desc)); return Status{}; } @@ -124,9 +133,10 @@ void ClIndirectConv2d::run(ITensorPack &tensors) void ClIndirectConv2d::prepare(ITensorPack &constants) { - if(!_is_prepared) + if (!_is_prepared) { - ICLTensor *indirect_buffer_aux = utils::cast::polymorphic_downcast(constants.get_tensor(offset_int_vec(IndirectBuffer))); + ICLTensor *indirect_buffer_aux = + utils::cast::polymorphic_downcast(constants.get_tensor(offset_int_vec(IndirectBuffer))); ARM_COMPUTE_ERROR_ON(indirect_buffer_aux == nullptr); ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Preparing indirect buffer"); @@ -134,7 +144,7 @@ void ClIndirectConv2d::prepare(ITensorPack &constants) CLAuxTensorHandler indirect_buffer(_indirect_buffer, *indirect_buffer_aux); ARM_COMPUTE_ERROR_ON(indirect_buffer.get()->cl_buffer().get() == nullptr); - ITensorPack indirect_buffer_pack{ { ACL_DST, indirect_buffer.get() } }; + ITensorPack indirect_buffer_pack{{ACL_DST, indirect_buffer.get()}}; CLScheduler::get().enqueue_op(*_addr_precalculation_kernel, indirect_buffer_pack, true); _is_prepared = true; diff --git a/src/gpu/cl/operators/ClIndirectConv2d.h b/src/gpu/cl/operators/ClIndirectConv2d.h index e50fa25069c63d82002a4ce50997eebdceaa1817..29e796efd967616b8da43f85392d6b864da89c42 100644 --- a/src/gpu/cl/operators/ClIndirectConv2d.h +++ b/src/gpu/cl/operators/ClIndirectConv2d.h @@ -77,7 +77,12 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
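Note on the prepare() path above: the indirect-addressing buffer is filled exactly once (guarded by _is_prepared) by the address pre-calculation kernel and is then reused by every subsequent run(), which is why its auxiliary memory is requested with MemoryLifetime::Persistent. The following standalone sketch only illustrates that one-shot preparation pattern; the class and member names are hypothetical and a plain std::vector stands in for the library's tensor and workspace types.

#include <cstddef>
#include <cstdint>
#include <vector>

// Illustrative only: a persistent auxiliary buffer that is computed on the
// first run and reused afterwards, mirroring the _is_prepared guard above.
class IndirectConvSketch
{
public:
    explicit IndirectConvSketch(std::size_t indirect_buffer_size) : _indirect_buffer(indirect_buffer_size)
    {
    }

    void run()
    {
        prepare();
        // ... enqueue the convolution kernel, which reads _indirect_buffer ...
    }

private:
    void prepare()
    {
        if (_is_prepared)
        {
            return;
        }
        // One-time address pre-calculation; the real operator enqueues
        // ClIndirectConv2dAddressPrecalculationKernel here and keeps the
        // result in a persistent auxiliary tensor rather than a std::vector.
        for (std::size_t i = 0; i < _indirect_buffer.size(); ++i)
        {
            _indirect_buffer[i] = static_cast<std::int32_t>(i);
        }
        _is_prepared = true;
    }

    std::vector<std::int32_t> _indirect_buffer;
    bool                      _is_prepared{false};
};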
* */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -85,12 +90,16 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; private: @@ -100,11 +109,11 @@ private: Count }; - std::unique_ptr _indirect_conv_kernel{ nullptr }; - std::unique_ptr _addr_precalculation_kernel{ nullptr }; + std::unique_ptr _indirect_conv_kernel{nullptr}; + std::unique_ptr _addr_precalculation_kernel{nullptr}; TensorInfo _indirect_buffer{}; - bool _is_prepared{ false }; - experimental::MemoryRequirements _aux_mem{ Count }; + bool _is_prepared{false}; + experimental::MemoryRequirements _aux_mem{Count}; }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClLogicalNot.cpp b/src/gpu/cl/operators/ClLogicalNot.cpp index b2eb89b320fe4f43bc7c34c3aa469203f13d269d..d8d4186d009e33afcfa0df3c110db4c96a5e7a17 100644 --- a/src/gpu/cl/operators/ClLogicalNot.cpp +++ b/src/gpu/cl/operators/ClLogicalNot.cpp @@ -23,11 +23,10 @@ */ #include "src/gpu/cl/operators/ClLogicalNot.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl diff --git a/src/gpu/cl/operators/ClMatMul.cpp b/src/gpu/cl/operators/ClMatMul.cpp index 49d14127cadb8a048346334c1f2ff090530e21e5..9962ee550acdd599f48754566d3cdd8d0cb87fa3 100644 --- a/src/gpu/cl/operators/ClMatMul.cpp +++ b/src/gpu/cl/operators/ClMatMul.cpp @@ -28,7 +28,10 @@ #include "arm_compute/runtime/CL/CLScheduler.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h" +#include "src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h" #include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" +#include "src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h" #include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h" #include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h" #include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h" @@ -39,19 +42,76 @@ namespace arm_compute { namespace opencl { +namespace +{ +enum class MatMulKernelType +{ + /** Native matrix multiplication for FP types */ + NATIVE_FP, + + /** Native matrix multiplication for quantized types */ + NATIVE_QUANTIZED, + + /** Native matrix multiplication using MMUL extension for FP types */ + NATIVE_MMUL_FP, + + /** Native matrix multiplication using MMUL extension for Quantized types */ + 
NATIVE_MMUL_QUANTIZED +}; + +MatMulKernelType get_matmul_kernel(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const MatMulInfo &matmul_info, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_UNUSED(lhs, rhs, matmul_info, act_info); + + const bool is_quantized = is_data_type_quantized_asymmetric(lhs->data_type()); + const bool is_mmul_supported = arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()); + + const int k = matmul_info.adj_lhs() ? lhs->tensor_shape().y() : lhs->tensor_shape().x(); + + if (is_quantized) + { + // MMUL kernel works only when K is a multiple of 16 + if (is_mmul_supported && !act_info.enabled() && k % 16 == 0) + { + return MatMulKernelType::NATIVE_MMUL_QUANTIZED; + } + + return MatMulKernelType::NATIVE_QUANTIZED; + } + else + { + // MMUL kernel works only when K is a multiple of 4 + if (is_mmul_supported && !act_info.enabled() && k % 4 == 0) + { + return MatMulKernelType::NATIVE_MMUL_FP; + } + + return MatMulKernelType::NATIVE_FP; + } + + return is_quantized ? MatMulKernelType::NATIVE_QUANTIZED : MatMulKernelType::NATIVE_FP; +} +} // namespace using namespace arm_compute::opencl::kernels; ClMatMul::ClMatMul() - : _matmul_native_kernel(std::make_unique()), - _matmul_lowp_native_kernel(std::make_unique()) { } -Status ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info) +Status ClMatMul::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulInfo &matmul_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); const GPUTarget gpu_target = CLScheduler::get().target(); @@ -59,13 +119,27 @@ Status ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const const MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info); - const bool is_quantized = is_data_type_quantized_asymmetric(lhs->data_type()); - - return is_quantized ? 
ClMatMulLowpNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info) : - ClMatMulNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info); + switch (get_matmul_kernel(lhs, rhs, matmul_info, act_info)) + { + case MatMulKernelType::NATIVE_FP: + return ClMatMulNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info); + case MatMulKernelType::NATIVE_MMUL_FP: + return ClMatMulNativeMMULKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info); + case MatMulKernelType::NATIVE_QUANTIZED: + return ClMatMulLowpNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info); + case MatMulKernelType::NATIVE_MMUL_QUANTIZED: + return ClMatMulLowpNativeMMULKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info); + default: + ARM_COMPUTE_ERROR("Unsupported MatMul Kernel!"); + } } -void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info) +void ClMatMul::configure(const CLCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *dst, + const MatMulInfo &matmul_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_info); @@ -73,40 +147,56 @@ void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *l // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, dst, matmul_info)); - _is_quantized = is_data_type_quantized_asymmetric(lhs->data_type()); - - const GPUTarget gpu_target = CLScheduler::get().target(); - - std::unique_ptr t = ClMatMulNativeKernelConfigurationFactory::create(gpu_target); - - MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info); - - if(_is_quantized) - { - _matmul_lowp_native_kernel->set_target(gpu_target); + const GPUTarget gpu_target = CLScheduler::get().target(); + const auto kernel_config = ClMatMulNativeKernelConfigurationFactory::create(gpu_target); + const MatMulKernelInfo kernel_info = kernel_config->configure(lhs, rhs, matmul_info); - // Configure the low-precision native matrix multiply kernel - _matmul_lowp_native_kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info); - } - else + switch (get_matmul_kernel(lhs, rhs, matmul_info, act_info)) { - _matmul_native_kernel->set_target(gpu_target); - - // Configure the native matrix multiply kernel - _matmul_native_kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info); + case MatMulKernelType::NATIVE_FP: + { + auto kernel = std::make_unique(); + kernel->set_target(gpu_target); + + kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info); + _matmul_kernel = std::move(kernel); + } + break; + case MatMulKernelType::NATIVE_MMUL_FP: + { + auto kernel = std::make_unique(); + kernel->set_target(gpu_target); + + kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info); + _matmul_kernel = std::move(kernel); + } + break; + case MatMulKernelType::NATIVE_QUANTIZED: + { + auto kernel = std::make_unique(); + kernel->set_target(gpu_target); + + kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info); + _matmul_kernel = std::move(kernel); + } + break; + case MatMulKernelType::NATIVE_MMUL_QUANTIZED: + { + auto kernel = std::make_unique(); + kernel->set_target(gpu_target); + + 
kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info); + _matmul_kernel = std::move(kernel); + } + break; + default: + ARM_COMPUTE_ERROR("Unsupported MatMul Kernel!"); } } void ClMatMul::run(ITensorPack &tensors) { - if(_is_quantized) - { - CLScheduler::get().enqueue_op(*_matmul_lowp_native_kernel, tensors, true); - } - else - { - CLScheduler::get().enqueue_op(*_matmul_native_kernel, tensors, true); - } + CLScheduler::get().enqueue_op(*_matmul_kernel, tensors, /* flush */ true); } } // namespace opencl diff --git a/src/gpu/cl/operators/ClMatMul.h b/src/gpu/cl/operators/ClMatMul.h index abbb75239a143323ae1d8119ad1e3f33f550090d..1733def21c14169e3a4bf6c986e78021bdd3914e 100644 --- a/src/gpu/cl/operators/ClMatMul.h +++ b/src/gpu/cl/operators/ClMatMul.h @@ -21,14 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ACL_SRC_GPU_CL_OPERATORS_CLMATMUL -#define ACL_SRC_GPU_CL_OPERATORS_CLMATMUL +#ifndef ACL_SRC_GPU_CL_OPERATORS_CLMATMUL_H +#define ACL_SRC_GPU_CL_OPERATORS_CLMATMUL_H #include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/function_info/MatMulInfo.h" + +#include "src/gpu/cl/IClKernel.h" #include "src/gpu/cl/IClOperator.h" -#include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h" -#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" #include @@ -73,7 +73,11 @@ public: * @param[in] matmul_info Contains MatMul operation information described in @ref MatMulInfo. * @param[in] act_info Class containing information about fused activation function. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info, + void configure(const CLCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *dst, + const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -81,16 +85,17 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulInfo &matmul_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run(ITensorPack &tensors) override; private: - std::unique_ptr _matmul_native_kernel{ nullptr }; - std::unique_ptr _matmul_lowp_native_kernel{ nullptr }; - - bool _is_quantized{ false }; + std::unique_ptr _matmul_kernel{nullptr}; }; } // namespace opencl } // namespace arm_compute -#endif /* ACL_SRC_GPU_CL_OPERATORS_CLMATMUL */ +#endif // ACL_SRC_GPU_CL_OPERATORS_CLMATMUL_H diff --git a/src/gpu/cl/operators/ClMul.cpp b/src/gpu/cl/operators/ClMul.cpp index 2066f0cfaa9e748fccc59e28494bb4f051cce898..10cf8a6a38fe0b8afef9765338c58573933f75e8 100644 --- a/src/gpu/cl/operators/ClMul.cpp +++ b/src/gpu/cl/operators/ClMul.cpp @@ -24,17 +24,23 @@ #include "src/gpu/cl/operators/ClMul.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/gpu/cl/ClCompileContext.h" -#include "src/gpu/cl/kernels/ClMulKernel.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClMulKernel.h" namespace arm_compute { namespace opencl { -void ClMul::configure(const CLCompileContext 
&compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +void ClMul::configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info); auto k = std::make_unique(); @@ -42,22 +48,34 @@ void ClMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1 _kernel = std::move(k); } -Status ClMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +Status ClMul::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { return kernels::ClMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info); } -void ClComplexMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClComplexMul::configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { auto k = std::make_unique(); k->configure(compile_context, src1, src2, dst, act_info); _kernel = std::move(k); } -Status ClComplexMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClComplexMul::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { return kernels::ClComplexMulKernel::validate(src1, src2, dst, act_info); } } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClMul.h b/src/gpu/cl/operators/ClMul.h index 6086bc9d524638392db17cccc3eb3b9a016b281e..1cf4d68d4cd953490ee448e0528cfd5a1321ec43 100644 --- a/src/gpu/cl/operators/ClMul.h +++ b/src/gpu/cl/operators/ClMul.h @@ -66,16 +66,27 @@ public: * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
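The dispatch added to ClMatMul.cpp above selects between the plain native kernels and their MMUL variants: the MMUL path is only taken when the arm_matrix_multiply extension is reported, no activation is fused, and the reduction dimension K is a multiple of 16 (quantized) or 4 (floating point). A self-contained restatement of that heuristic, with a hypothetical enum and plain flags standing in for the library types:

#include <cassert>

enum class MatMulKernelChoice
{
    NATIVE_FP,
    NATIVE_QUANTIZED,
    NATIVE_MMUL_FP,
    NATIVE_MMUL_QUANTIZED
};

// Mirrors get_matmul_kernel(): k is the reduction dimension of the product
// (lhs columns, or lhs rows when the lhs is transposed via adj_lhs).
MatMulKernelChoice select_matmul_kernel(bool is_quantized, bool mmul_supported, bool has_fused_activation, int k)
{
    assert(k > 0);
    if (is_quantized)
    {
        // The quantized MMUL kernel requires K to be a multiple of 16.
        if (mmul_supported && !has_fused_activation && k % 16 == 0)
        {
            return MatMulKernelChoice::NATIVE_MMUL_QUANTIZED;
        }
        return MatMulKernelChoice::NATIVE_QUANTIZED;
    }
    // The floating-point MMUL kernel requires K to be a multiple of 4.
    if (mmul_supported && !has_fused_activation && k % 4 == 0)
    {
        return MatMulKernelChoice::NATIVE_MMUL_FP;
    }
    return MatMulKernelChoice::NATIVE_FP;
}

Falling back to the non-MMUL kernels keeps the operator usable on devices or shapes that do not satisfy those constraints.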
*/ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClMul::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; /** Basic function to run @ref opencl::kernels::ClComplexMulKernel */ @@ -92,14 +103,21 @@ public: * @param[out] dst The dst tensor info, Data types supported: same as @p src1. Number of channels supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClComplexMul::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClPRelu.cpp b/src/gpu/cl/operators/ClPRelu.cpp index cf4ebe6083740af545d54b6b876da6be40065ba8..f3efd00bba9cba5ad1e8c3fb3244f7c3946fef14 100644 --- a/src/gpu/cl/operators/ClPRelu.cpp +++ b/src/gpu/cl/operators/ClPRelu.cpp @@ -23,16 +23,18 @@ */ #include "src/gpu/cl/operators/ClPRelu.h" -#include "src/gpu/cl/kernels/ClElementwiseKernel.h" - #include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" namespace arm_compute { namespace opencl { using KernelType = kernels::ClArithmeticKernel; -void ClPRelu::configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output) +void ClPRelu::configure(const CLCompileContext &compile_context, + ITensorInfo *input, + ITensorInfo *alpha, + ITensorInfo *output) { ARM_COMPUTE_LOG_PARAMS(input, alpha, output); auto k = std::make_unique(); @@ -49,7 +51,7 @@ void ClPRelu::run(ITensorPack &tensors) { // Output tensor can be given as nullptr for in-place computation. // In this case, get the input tensor and use it as the output tensor. 
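The comment above describes ClPRelu's in-place fallback: when no destination tensor is registered in the pack, the first source tensor is re-registered as the destination before delegating to the base operator. A reduced sketch of that pattern, using a hypothetical map-based pack instead of ITensorPack:

#include <cassert>
#include <unordered_map>

struct Tensor
{
    // ... payload omitted ...
};

using TensorPack = std::unordered_map<int, Tensor *>;

constexpr int ACL_SRC_0 = 0;
constexpr int ACL_DST   = 1;

// If no destination is registered, reuse the first source as the destination
// so the operator writes its result in place.
void resolve_in_place(TensorPack &pack)
{
    if (pack.count(ACL_DST) == 0 || pack[ACL_DST] == nullptr)
    {
        Tensor *src = pack[ACL_SRC_0];
        assert(src != nullptr && "invalid source tensor is given for in-place computation");
        pack[ACL_DST] = src;
    }
}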
- if(tensors.get_tensor(TensorType::ACL_DST) == nullptr) + if (tensors.get_tensor(TensorType::ACL_DST) == nullptr) { auto src_tensor = const_cast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); ARM_COMPUTE_ERROR_ON_MSG(src_tensor == nullptr, "invalid source tensor is given for in-place computation"); @@ -58,4 +60,4 @@ void ClPRelu::run(ITensorPack &tensors) IClOperator::run(tensors); } } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClPRelu.h b/src/gpu/cl/operators/ClPRelu.h index 8084ab86cd26b29e2cbc825f4489a5d76bc10144..45ce858fb00eb237e617d7dcc1e19ec943fefdb7 100644 --- a/src/gpu/cl/operators/ClPRelu.h +++ b/src/gpu/cl/operators/ClPRelu.h @@ -47,7 +47,8 @@ public: * @param[in] alpha PRelu layer parameters. Data types supported: same of @p input. * @param[out] output Destination tensor. Data type supported: same as @p input */ - void configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output); + void + configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output); /** Static function to check if given info will lead to a valid configuration * * Similar to ClPRelu::configure() diff --git a/src/gpu/cl/operators/ClPermute.cpp b/src/gpu/cl/operators/ClPermute.cpp index ed56f97bfea1c04411797c18b06c3835e4843081..3851e22b6ae0488022798b8549737bb990364352 100644 --- a/src/gpu/cl/operators/ClPermute.cpp +++ b/src/gpu/cl/operators/ClPermute.cpp @@ -23,16 +23,18 @@ */ #include "src/gpu/cl/operators/ClPermute.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClPermuteKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClPermute::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm) +void ClPermute::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const PermutationVector &perm) { ARM_COMPUTE_LOG_PARAMS(src, dst, perm); auto k = std::make_unique(); @@ -45,4 +47,4 @@ Status ClPermute::validate(const ITensorInfo *src, const ITensorInfo *dst, const return kernels::ClPermuteKernel::validate(src, dst, perm); } } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClPermute.h b/src/gpu/cl/operators/ClPermute.h index 3e87329f9bd10e22e3fee45f7fe7488e16b4f3ca..6349358a187de5a9eab99fe2befc18641b2dc5e6 100644 --- a/src/gpu/cl/operators/ClPermute.h +++ b/src/gpu/cl/operators/ClPermute.h @@ -44,7 +44,10 @@ public: * @param[in] dst The dst tensor info. 
Data types supported: Same as @p src * @param[in] perm Permutation vector */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const PermutationVector &perm); /** Static function to check if given info will lead to a valid configuration * * Similar to ClPermute::configure() @@ -55,4 +58,4 @@ public: }; } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_PERMUTE_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_CL_PERMUTE_H */ diff --git a/src/gpu/cl/operators/ClPool2d.cpp b/src/gpu/cl/operators/ClPool2d.cpp index 3da90b8ced0f2d5801761e5e9c4d8084331d517f..e4507dc1a173aa8557e58ad80769aa702a5bce25 100644 --- a/src/gpu/cl/operators/ClPool2d.cpp +++ b/src/gpu/cl/operators/ClPool2d.cpp @@ -25,16 +25,19 @@ #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClPool2dKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClPool2d::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices) +void ClPool2d::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &info, + ITensorInfo *indices) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, dst, info, indices); @@ -49,7 +52,10 @@ void ClPool2d::configure(const ClCompileContext &compile_context, ITensorInfo *s CLScheduler::get().tune_kernel_static(*_kernel); } -Status ClPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices) +Status ClPool2d::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &info, + const ITensorInfo *indices) { return kernels::ClPool2dKernel::validate(src, dst, info, indices); } diff --git a/src/gpu/cl/operators/ClPool2d.h b/src/gpu/cl/operators/ClPool2d.h index f353ba262ebb7d8e93c7455b055ca57006134f07..9c2fd1c3f2ea7a76415359d44ec479095043c8c5 100644 --- a/src/gpu/cl/operators/ClPool2d.h +++ b/src/gpu/cl/operators/ClPool2d.h @@ -50,14 +50,21 @@ public: * @param[in] info Pooling layer parameters. * @param[out] indices (optional) The indices info of the maximal values. Data type supported: U32. 
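For context on the optional indices output documented above: max pooling can additionally emit, for every output element, the position of the maximum it selected, stored as U32. A plain, illustrative 2x2/stride-2 sketch of that idea follows; the exact index layout used by the CL kernels may differ, so the flattened row-major index here is only one plausible convention.

#include <cstddef>
#include <cstdint>
#include <vector>

// Illustrative 2x2 max pooling (stride 2, no padding) over a single-channel
// row-major plane that also records a flattened index for each maximum.
void max_pool_2x2_with_indices(const std::vector<float> &src, int width, int height,
                               std::vector<float> &dst, std::vector<std::uint32_t> &indices)
{
    const int out_w = width / 2;
    const int out_h = height / 2;
    dst.assign(static_cast<std::size_t>(out_w) * out_h, 0.0f);
    indices.assign(static_cast<std::size_t>(out_w) * out_h, 0U);

    for (int oy = 0; oy < out_h; ++oy)
    {
        for (int ox = 0; ox < out_w; ++ox)
        {
            std::uint32_t best_idx = static_cast<std::uint32_t>((2 * oy) * width + (2 * ox));
            float         best_val = src[best_idx];
            for (int ky = 0; ky < 2; ++ky)
            {
                for (int kx = 0; kx < 2; ++kx)
                {
                    const std::uint32_t idx = static_cast<std::uint32_t>((2 * oy + ky) * width + (2 * ox + kx));
                    if (src[idx] > best_val)
                    {
                        best_val = src[idx];
                        best_idx = idx;
                    }
                }
            }
            dst[static_cast<std::size_t>(oy) * out_w + ox]     = best_val;
            indices[static_cast<std::size_t>(oy) * out_w + ox] = best_idx;
        }
    }
}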
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices = nullptr); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &info, + ITensorInfo *indices = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to ClPool2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices = nullptr); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &info, + const ITensorInfo *indices = nullptr); }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClPool3d.cpp b/src/gpu/cl/operators/ClPool3d.cpp index 7dec6c5958651a63ad15c407af3832924630f92b..d230413659c8b56f237d1950ec12d22f193321cf 100644 --- a/src/gpu/cl/operators/ClPool3d.cpp +++ b/src/gpu/cl/operators/ClPool3d.cpp @@ -25,16 +25,18 @@ #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClPool3dKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClPool3d::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &info) +void ClPool3d::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const Pooling3dLayerInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, dst, info); diff --git a/src/gpu/cl/operators/ClPool3d.h b/src/gpu/cl/operators/ClPool3d.h index 7d994fd19462ca512d83c6a4768b8428d9d1f322..9fd78bfd693c38663e90da0fb25787de89a35fa0 100644 --- a/src/gpu/cl/operators/ClPool3d.h +++ b/src/gpu/cl/operators/ClPool3d.h @@ -51,7 +51,10 @@ public: * @param[out] dst Destination tensor info. * @param[in] info 3d Pooling layer parameters. 
*/ - void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &info); + void configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const Pooling3dLayerInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClPool3d::configure() diff --git a/src/gpu/cl/operators/ClQuantize.cpp b/src/gpu/cl/operators/ClQuantize.cpp index 47ae5cea4715a0388689b393e4288d72d670c1dd..8560b5553e9b6d6c54df03ce703e21885840eb4d 100644 --- a/src/gpu/cl/operators/ClQuantize.cpp +++ b/src/gpu/cl/operators/ClQuantize.cpp @@ -25,10 +25,10 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/gpu/cl/ClCompileContext.h" -#include "src/gpu/cl/kernels/ClQuantizeKernel.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClQuantizeKernel.h" namespace arm_compute { diff --git a/src/gpu/cl/operators/ClReshape.cpp b/src/gpu/cl/operators/ClReshape.cpp index 560966f4fcb7c1742a98b33a3ffe62ffbfeb8691..1dd5b760cb591cd8e5c3f99c4716a0a82a916ce5 100644 --- a/src/gpu/cl/operators/ClReshape.cpp +++ b/src/gpu/cl/operators/ClReshape.cpp @@ -23,11 +23,10 @@ */ #include "src/gpu/cl/operators/ClReshape.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClReshapeKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl @@ -45,4 +44,4 @@ Status ClReshape::validate(const ITensorInfo *src, const ITensorInfo *dst) return kernels::ClReshapeKernel::validate(src, dst); } } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClScale.cpp b/src/gpu/cl/operators/ClScale.cpp index 0798b19ca02b32474ea3bf191e76c8a1c3a23b57..184e2aa00623b08e59a64decd5df4e3a10ed2192 100644 --- a/src/gpu/cl/operators/ClScale.cpp +++ b/src/gpu/cl/operators/ClScale.cpp @@ -25,17 +25,20 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClScaleKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClScale::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info) +void ClScale::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const ScaleKernelInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, dst, info); @@ -61,4 +64,4 @@ void ClScale::run(ITensorPack &tensors) CLScheduler::get().enqueue_op(*_kernel.get(), tensors); } } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClScale.h b/src/gpu/cl/operators/ClScale.h index af97cf23e73fa6d8b8fcd079e85fa6703e13f5e2..1427bb4fdceafe5c95f1e71f88851120d47e5800 100644 --- a/src/gpu/cl/operators/ClScale.h +++ b/src/gpu/cl/operators/ClScale.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_SCALE_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -49,7 +50,8 @@ public: * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. 
* @param[in] info @ref ScaleKernelInfo descriptor to be used to configure */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info); + void + configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClScale::configure() diff --git a/src/gpu/cl/operators/ClSoftmax.cpp b/src/gpu/cl/operators/ClSoftmax.cpp index 03809553a396a542ff450f9e59c8c232abfbb74e..427f6b4f9214eebad51a6370155d933ddf29d2c2 100644 --- a/src/gpu/cl/operators/ClSoftmax.cpp +++ b/src/gpu/cl/operators/ClSoftmax.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,15 +22,15 @@ * SOFTWARE. */ #include "src/gpu/cl/operators/ClSoftmax.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/helpers/MemoryHelpers.h" -#include "src/core/helpers/SoftmaxHelpers.h" #include "src/gpu/cl/kernels/ClSoftmaxKernel.h" -#include "src/gpu/cl/operators/ClPermute.h" #include "src/gpu/cl/utils/ClAuxTensorHandler.h" -#include "support/Cast.h" - -#include "src/common/utils/Log.h" using namespace arm_compute::experimental; @@ -38,152 +38,47 @@ namespace arm_compute { namespace opencl { -ClSoftmax::ClSoftmax() - : _permute_input(std::make_unique()), - _permute_output(std::make_unique()), - _max_shift_exp_sum_kernel(std::make_unique()), - _norm_kernel(std::make_unique()), - _max_info(), - _sum_info(), - _tmp_info(), - _permuted_src_info(), - _permuted_dst_info(), - _aux_mem(InternalTensorIdx::COUNT) + +ClSoftmax::ClSoftmax() : _aux_mem(InternalTensorIdx::COUNT) { } -void ClSoftmax::configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &dst, const SoftmaxKernelInfo &info) +void ClSoftmax::configure(const CLCompileContext &compile_context, + const ITensorInfo &src, + ITensorInfo &dst, + const SoftmaxKernelInfo &info) { - ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst, info)); ARM_COMPUTE_LOG_PARAMS(src, dst, info); - const size_t actual_axis = static_cast(wrap_around(info.axis, static_cast(src.num_dimensions()))); - - _needs_permute = actual_axis != 0; - - const ITensorInfo &tmp_input_info = _needs_permute ? _permuted_src_info : src; - ITensorInfo &tmp_output_info = _needs_permute ? _permuted_dst_info : dst; + auto k = std::make_unique(); + k->configure(compile_context, src, dst, info); - if(_needs_permute) - { - const auto perm_info = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); - _permute_input->configure(compile_context, &src, &_permuted_src_info, perm_info); - } + _tmp_info = k->tmp_tensor_info(); - DataType tmp_data_type = is_data_type_quantized_asymmetric(tmp_input_info.data_type()) ? 
DataType::S32 : tmp_input_info.data_type(); - _tmp_info = tmp_input_info.clone()->set_data_type(tmp_data_type); + _kernel = std::move(k); - TensorShape max_sum_shape = tmp_input_info.tensor_shape(); - _max_info = tmp_input_info.clone()->set_tensor_shape(max_sum_shape); - _sum_info = tmp_input_info.clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type); - - // Set GPU target to kernels - _max_shift_exp_sum_kernel->set_target(CLScheduler::get().target()); - - _max_shift_exp_sum_kernel->configure(compile_context, tmp_input_info, _max_info, _tmp_info, _sum_info, info); - _norm_kernel->configure(compile_context, _tmp_info, _sum_info, tmp_output_info, info); - - if(_needs_permute) - { - const auto perm_info = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); - _permute_output->configure(compile_context, &_permuted_dst_info, &dst, perm_info); - } - - _aux_mem[InternalTensorIdx::SUM] = MemoryInfo(offset_int_vec(InternalTensorIdx::SUM), MemoryLifetime::Temporary, _sum_info.total_size()); - _aux_mem[InternalTensorIdx::TMP] = MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp_info.total_size()); - _aux_mem[InternalTensorIdx::MAX] = MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max_info.total_size()); - - _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), MemoryLifetime::Temporary, _permuted_src_info.total_size()); - _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), MemoryLifetime::Temporary, _permuted_dst_info.total_size()); + _aux_mem[InternalTensorIdx::TMP] = + MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp_info.total_size()); } Status ClSoftmax::validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src.num_dimensions() > 4, "Only up to 4 dimensions are supported"); - ARM_COMPUTE_UNUSED(info.beta); - ARM_COMPUTE_RETURN_ERROR_ON(info.axis < static_cast(-src.num_dimensions()) || static_cast(src.num_dimensions()) <= info.axis); - - const size_t actual_axis = static_cast(wrap_around(info.axis, static_cast(src.num_dimensions()))); - const bool needs_permute = actual_axis != 0; - if(needs_permute) - { - const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); - const TensorShape permuted_shape = misc::shape_calculator::compute_permutation_output_shape(src, permutation_vector); - TensorInfo input_permuted(src.clone()->set_tensor_shape(permuted_shape)); - ARM_COMPUTE_RETURN_ON_ERROR(ClPermute::validate(&src, &input_permuted, permutation_vector)); - TensorInfo output_permuted(dst.clone()->set_tensor_shape(permuted_shape)); - ARM_COMPUTE_RETURN_ON_ERROR(ClPermute::validate(&output_permuted, &dst, permutation_vector)); - } - - // Create intermediate tensor info - DataType tmp_data_type = is_data_type_quantized_asymmetric(src.data_type()) ? 
DataType::S32 : src.data_type(); - TensorInfo tensor_info_tmp(src.clone()->set_data_type(tmp_data_type).set_is_resizable(true)); - - TensorShape max_sum_shape = src.tensor_shape(); - max_sum_shape.set(0, 1); - TensorInfo tensor_info_max(src.clone()->set_tensor_shape(max_sum_shape).set_is_resizable(true)); - TensorInfo tensor_info_sum(src.clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(QuantizationInfo()).set_is_resizable(true)); - - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClLogits1DMaxShiftExpSumKernel::validate(src, tensor_info_max, tensor_info_tmp, tensor_info_sum)); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClLogits1DNormKernel::validate(tensor_info_tmp, tensor_info_sum, dst, info)); - - return Status{}; + return kernels::ClSoftmaxKernel::validate(src, dst, info); } void ClSoftmax::run(ITensorPack &tensors) { - auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - CLAuxTensorHandler sum(offset_int_vec(InternalTensorIdx::SUM), _sum_info, tensors, false); - CLAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp_info, tensors, false); - CLAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max_info, tensors, false); - - CLAuxTensorHandler permuted_src(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _permuted_src_info, tensors, false); - CLAuxTensorHandler permuted_dst(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _permuted_dst_info, tensors, false); - - if(_needs_permute) - { - ITensorPack pack; - pack.add_const_tensor(TensorType::ACL_SRC, src); - pack.add_tensor(TensorType::ACL_DST, permuted_src.get()); - _permute_input.get()->run(pack); - } - - ITensorPack sum_pack; - ITensorPack norm_pack; - if(_needs_permute) - { - sum_pack.add_const_tensor(TensorType::ACL_SRC, permuted_src.get()); - norm_pack.add_tensor(TensorType::ACL_DST, permuted_dst.get()); - } - else - { - sum_pack.add_const_tensor(TensorType::ACL_SRC, src); - norm_pack.add_tensor(TensorType::ACL_DST, dst); - } - sum_pack.add_tensor(TensorType::ACL_DST, tmp.get()); - sum_pack.add_tensor(TensorType::ACL_INT_0, max.get()); - sum_pack.add_tensor(TensorType::ACL_INT_1, sum.get()); - - norm_pack.add_const_tensor(TensorType::ACL_SRC, tmp.get()); - norm_pack.add_tensor(TensorType::ACL_INT_0, sum.get()); - - CLScheduler::get().enqueue_op(*_max_shift_exp_sum_kernel.get(), sum_pack, false); - CLScheduler::get().enqueue_op(*_norm_kernel.get(), norm_pack, false); - - if(_needs_permute) - { - ITensorPack pack; - pack.add_const_tensor(TensorType::ACL_SRC, permuted_dst.get()); - pack.add_tensor(TensorType::ACL_DST, dst); - _permute_output.get()->run(pack); - } + CLAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp_info, tensors); + + tensors.add_tensor(TensorType::ACL_INT_0, tmp.get()); + + CLScheduler::get().enqueue_op(*_kernel, tensors, false); } experimental::MemoryRequirements ClSoftmax::workspace() const { return _aux_mem; } + } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClSoftmax.h b/src/gpu/cl/operators/ClSoftmax.h index 6c9af585d627df7cc6614b4dc7beb89e44982382..232fcfebd128fea4408ef8e049137ed8fc8b5786 100644 --- a/src/gpu/cl/operators/ClSoftmax.h +++ b/src/gpu/cl/operators/ClSoftmax.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. 
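The rewrite above folds the previous permute / max-shift-exp-sum / normalisation pipeline into a single ClSoftmaxKernel that only needs one temporary tensor. For reference, both versions compute the usual numerically stable softmax, sketched below over a single row in plain C++; beta corresponds to the scaling factor carried in SoftmaxKernelInfo.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Numerically stable softmax over one row: shift by the row maximum before
// exponentiating (the "max shift"), then normalise by the running sum.
std::vector<float> softmax_row(const std::vector<float> &x, float beta = 1.0f)
{
    if (x.empty())
    {
        return {};
    }

    const float max_val = *std::max_element(x.begin(), x.end());

    std::vector<float> y(x.size());
    float              sum = 0.0f;
    for (std::size_t i = 0; i < x.size(); ++i)
    {
        y[i] = std::exp(beta * (x[i] - max_val));
        sum += y[i];
    }
    for (float &v : y)
    {
        v /= sum;
    }
    return y;
}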
* * SPDX-License-Identifier: MIT * @@ -21,24 +21,26 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CL_SOFTMAX_H -#define ARM_COMPUTE_CL_SOFTMAX_H +#ifndef ACL_SRC_GPU_CL_OPERATORS_CLSOFTMAX_H +#define ACL_SRC_GPU_CL_OPERATORS_CLSOFTMAX_H +#include "arm_compute/core/experimental/Types.h" #include "arm_compute/runtime/CL/CLTensor.h" -#include "src/gpu/cl/ClCompileContext.h" + #include "src/gpu/cl/IClOperator.h" namespace arm_compute { +class CLCompileContext; +class ITensorInfo; +class ITensorPack; struct SoftmaxKernelInfo; namespace opencl { -class ClPermute; namespace kernels { -class ClLogits1DMaxShiftExpSumKernel; -class ClLogits1DNormKernel; +class ClSoftmaxKernel; } // namespace kernels class ClSoftmax : public IClOperator { @@ -52,7 +54,10 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &dst, const SoftmaxKernelInfo &info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo &src, + ITensorInfo &dst, + const SoftmaxKernelInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClSoftmax::configure() @@ -60,36 +65,22 @@ public: * @return a status */ static Status validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info); - // Inherited methods overridden: + void run(ITensorPack &tensors) override; + experimental::MemoryRequirements workspace() const override; private: enum InternalTensorIdx { - MAX = 0, - SUM, - TMP, - PERMUTED_SRC, - PERMUTED_DST, - COUNT + TMP = 0, + COUNT, }; - std::unique_ptr _permute_input; - std::unique_ptr _permute_output; - std::unique_ptr _max_shift_exp_sum_kernel; - std::unique_ptr _norm_kernel; - bool _needs_permute{ false }; - - TensorInfo _max_info; - TensorInfo _sum_info; - TensorInfo _tmp_info; - TensorInfo _permuted_src_info; - TensorInfo _permuted_dst_info; - - experimental::MemoryRequirements _aux_mem{}; + TensorInfo _tmp_info{}; + experimental::MemoryRequirements _aux_mem; }; -} // opencl -} // arm_compute -#endif /* ARM_COMPUTE_CL_SOFTMAX_H */ \ No newline at end of file +} // namespace opencl +} // namespace arm_compute +#endif // ACL_SRC_GPU_CL_OPERATORS_CLSOFTMAX_H diff --git a/src/gpu/cl/operators/ClSub.cpp b/src/gpu/cl/operators/ClSub.cpp index 53be04a70f037071a1a32ccd291c48b30c66fb06..5c6d0c31848b6bcb779e60ddc7482dc42da39c44 100644 --- a/src/gpu/cl/operators/ClSub.cpp +++ b/src/gpu/cl/operators/ClSub.cpp @@ -23,17 +23,20 @@ */ #include "src/gpu/cl/operators/ClSub.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClElementwiseKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClSub::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +void ClSub::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, policy, act_info); auto k = std::make_unique(); @@ -41,8 +44,11 @@ void ClSub::configure(const ClCompileContext &compile_context, ITensorInfo *src1 _kernel 
= std::move(k); } -Status ClSub::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status ClSub::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::SUB, src1, src2, dst, policy, act_info); } diff --git a/src/gpu/cl/operators/ClSub.h b/src/gpu/cl/operators/ClSub.h index 7eac4371434b34609c2d7886f847bd9518b8bed1..6a97275b86cc2e077ec18901cf57652a8a0879d7 100644 --- a/src/gpu/cl/operators/ClSub.h +++ b/src/gpu/cl/operators/ClSub.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_SUB_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -65,7 +66,11 @@ public: * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, ConvertPolicy policy, + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -73,7 +78,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, ConvertPolicy policy, + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; } // namespace opencl diff --git a/src/gpu/cl/operators/ClTranspose.cpp b/src/gpu/cl/operators/ClTranspose.cpp index 26feffe2b9a0191621bbae07e34c8302d1d0e4c0..28da0d640aaea78a1eb1b8bdd9e9008904a1cbab 100644 --- a/src/gpu/cl/operators/ClTranspose.cpp +++ b/src/gpu/cl/operators/ClTranspose.cpp @@ -23,11 +23,10 @@ */ #include "src/gpu/cl/operators/ClTranspose.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClTransposeKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl @@ -45,4 +44,4 @@ Status ClTranspose::validate(const ITensorInfo *src, const ITensorInfo *dst) return kernels::ClTransposeKernel::validate(src, dst); } } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClTransposedConvolution.cpp b/src/gpu/cl/operators/ClTransposedConvolution.cpp index 90dbe7f291bec32ed65afcec49b44646d1407535..cec438faeb7cd577e43d117efd7e6a77700503fd 100644 --- a/src/gpu/cl/operators/ClTransposedConvolution.cpp +++ b/src/gpu/cl/operators/ClTransposedConvolution.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/common/utils/Log.h" #include "src/gpu/cl/kernels/ClTransposedConvolutionKernel.h" @@ -32,8 +33,12 @@ namespace arm_compute { namespace opencl { -void ClTransposedConvolution::configure(const CLCompileContext &compile_context, const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &deconv_info) +void ClTransposedConvolution::configure(const CLCompileContext 
&compile_context, + const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *output, + const PadStrideInfo &deconv_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, deconv_info); @@ -43,10 +48,14 @@ void ClTransposedConvolution::configure(const CLCompileContext &compile_context, _transposed_conv_kernel = std::move(kernel_object); } -Status ClTransposedConvolution::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, - const ITensorInfo *output, const PadStrideInfo &deconv_info) +Status ClTransposedConvolution::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &deconv_info) { - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClTransposedConvolutionKernel::validate(input, weights, biases, output, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClTransposedConvolutionKernel::validate(input, weights, biases, output, deconv_info)); return Status{}; } diff --git a/src/gpu/cl/operators/ClTransposedConvolution.h b/src/gpu/cl/operators/ClTransposedConvolution.h index 58ebc689ed66bb7f85eb823dc774e351e13036fe..660c4f85c14e28efc10ba92a753d41f9ad7b771b 100644 --- a/src/gpu/cl/operators/ClTransposedConvolution.h +++ b/src/gpu/cl/operators/ClTransposedConvolution.h @@ -68,23 +68,30 @@ public: * @param[in] deconv_info Contains padding and stride information described in @ref PadStrideInfo. * */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &deconv_info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *output, + const PadStrideInfo &deconv_info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClTransposedConvolution::configure() * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, - const ITensorInfo *output, const PadStrideInfo &deconv_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &deconv_info); // Inherited method overridden void run(ITensorPack &tensors) override; private: - std::unique_ptr _transposed_conv_kernel{ nullptr }; + std::unique_ptr _transposed_conv_kernel{nullptr}; }; } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_H */ diff --git a/src/gpu/cl/operators/ClWinogradConv2d.cpp b/src/gpu/cl/operators/ClWinogradConv2d.cpp index b4163a59863ee1e54df621f0130c795170715c8b..8ec96b247e118cc1a001c54bd45581c8a06f990d 100644 --- a/src/gpu/cl/operators/ClWinogradConv2d.cpp +++ b/src/gpu/cl/operators/ClWinogradConv2d.cpp @@ -24,20 +24,19 @@ #include "src/gpu/cl/operators/ClWinogradConv2d.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include 
"src/core/CL/kernels/CLFillBorderKernel.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h" #include "src/gpu/cl/kernels/ClWinogradInputTransformKernel.h" #include "src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h" #include "src/gpu/cl/utils/ClAuxTensorHandler.h" - -#include "src/common/utils/Log.h" #include "support/Cast.h" using namespace arm_compute::experimental; @@ -55,15 +54,16 @@ Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, const unsigned int kernel_max_dim = std::max(kernel_dims.width, kernel_dims.height); // Check if the input spatial dimensions are smaller than 4 - const bool is_input_lt4_nchw = (input_dims.width <= 4 && input_dims.height <= 4) && (data_layout == DataLayout::NCHW); + const bool is_input_lt4_nchw = + (input_dims.width <= 4 && input_dims.height <= 4) && (data_layout == DataLayout::NCHW); - if(kernel_max_dim == 3U) + if (kernel_max_dim == 3U) { - if(kernel_dims == Size2D(3U, 3U)) + if (kernel_dims == Size2D(3U, 3U)) { output_tile = is_input_lt4_nchw ? Size2D(2U, 2U) : Size2D(4U, 4U); } - else if(kernel_dims == Size2D(3U, 1U)) + else if (kernel_dims == Size2D(3U, 1U)) { output_tile = is_input_lt4_nchw ? Size2D(2U, 1U) : Size2D(4U, 1U); } @@ -72,15 +72,13 @@ Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, output_tile = is_input_lt4_nchw ? Size2D(1U, 2U) : Size2D(1U, 4U); } } - else if(kernel_max_dim == 5U) + else if (kernel_max_dim == 5U) { - output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U, - kernel_dims.height == 1 ? 1U : 4U); + output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U, kernel_dims.height == 1 ? 1U : 4U); } - else if(kernel_max_dim == 7U) + else if (kernel_max_dim == 7U) { - output_tile = Size2D(kernel_dims.width == 1 ? 1U : 2U, - kernel_dims.height == 1 ? 1U : 2U); + output_tile = Size2D(kernel_dims.width == 1 ? 1U : 2U, kernel_dims.height == 1 ? 
1U : 2U); } return output_tile; @@ -91,11 +89,9 @@ bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_siz // Check if we want to configure a Winograd configuration which requires fast math using WinogradConfiguration = std::pair, std::pair>; - std::vector fast_math_winograd = - { + std::vector fast_math_winograd = { WinogradConfiguration(std::pair(4, 4), std::pair(5, 5)), - WinogradConfiguration(std::pair(2, 2), std::pair(7, 7)) - }; + WinogradConfiguration(std::pair(2, 2), std::pair(7, 7))}; auto p = std::make_pair(std::pair(output_tile.width, output_tile.height), std::pair(kernel_size.width, kernel_size.height)); @@ -103,8 +99,13 @@ bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_siz return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end(); } -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { // Get indeces for the width and height const size_t idx_width = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH); @@ -115,41 +116,49 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]); const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, src->data_layout()); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_left() > (kernel_size.x() / 2u)) || (conv_info.pad_right() > (kernel_size.x() / 2u))), "Winograd only supports padding up to half kernel size"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_top() > (kernel_size.y() / 2u)) || (conv_info.pad_bottom() > (kernel_size.y() / 2u))), "Winograd only supports padding up to half kernel size"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + ((conv_info.pad_left() > (kernel_size.x() / 2u)) || (conv_info.pad_right() > (kernel_size.x() / 2u))), + "Winograd only supports padding up to half kernel size"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + ((conv_info.pad_top() > (kernel_size.y() / 2u)) || (conv_info.pad_bottom() > (kernel_size.y() / 2u))), + "Winograd only supports padding up to half kernel size"); // Check if the Winograd configuration requires fast math - if(!enable_fast_math) + if (!enable_fast_math) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); //disable winograd for fp16 if fast math is false. - ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + src, 1, DataType::F32); //disable winograd for fp16 if fast math is false. 
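winograd_output_tile() and check_support_fast_math() above encode the supported Winograd configurations: 3x3 kernels map to 4x4 output tiles (2x2 for NCHW inputs no larger than 4x4), 5x5 kernels to 4x4 tiles, 7x7 kernels to 2x2 tiles, 1xN/Nx1 kernels to the corresponding one-dimensional tiles, and F(4x4, 5x5) as well as F(2x2, 7x7) additionally require enable_fast_math. A condensed, standalone restatement of the tile choice (illustrative only; std::pair stands in for Size2D, and unsupported kernel sizes return a zero tile here):

#include <algorithm>
#include <utility>

using Size2D = std::pair<unsigned int, unsigned int>; // (width, height)

// Condensed restatement of winograd_output_tile(): pick the output tile from
// the kernel size, falling back to 2x2-style tiles for small NCHW inputs with
// 3x3-family kernels.
Size2D winograd_output_tile_sketch(Size2D input_dims, Size2D kernel_dims, bool is_nchw)
{
    const unsigned int kernel_max_dim    = std::max(kernel_dims.first, kernel_dims.second);
    const bool         is_input_lt4_nchw = is_nchw && input_dims.first <= 4 && input_dims.second <= 4;

    if (kernel_max_dim == 3U)
    {
        if (kernel_dims == Size2D{3U, 3U})
        {
            return is_input_lt4_nchw ? Size2D{2U, 2U} : Size2D{4U, 4U};
        }
        if (kernel_dims == Size2D{3U, 1U})
        {
            return is_input_lt4_nchw ? Size2D{2U, 1U} : Size2D{4U, 1U};
        }
        return is_input_lt4_nchw ? Size2D{1U, 2U} : Size2D{1U, 4U};
    }
    if (kernel_max_dim == 5U)
    {
        return {kernel_dims.first == 1U ? 1U : 4U, kernel_dims.second == 1U ? 1U : 4U};
    }
    if (kernel_max_dim == 7U)
    {
        return {kernel_dims.first == 1U ? 1U : 2U, kernel_dims.second == 1U ? 1U : 2U};
    }
    return {0U, 0U};
}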
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), + "This Winograd configuration requires enable_fast_math=true"); } - const WinogradInfo winograd_info = WinogradInfo(output_tile, - kernel_size, - input_dims, - conv_info, - src->data_layout()); + const WinogradInfo winograd_info = + WinogradInfo(output_tile, kernel_size, input_dims, conv_info, src->data_layout()); // Validate input transform - const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info); - const TensorInfo input0 = src->clone()->set_tensor_shape(input0_shape); + const TensorShape input0_shape = + misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info); + const TensorInfo input0 = src->clone()->set_tensor_shape(input0_shape); ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradInputTransformKernel::validate(src, &input0, winograd_info)); // Validate filter transform - const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info); - const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape); + const TensorShape input1_shape = + misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info); + const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape); ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradFilterTransformKernel::validate(weights, &input1, winograd_info)); // Validate batched matrix multiply TensorShape batched_mm_output_shape = input0.tensor_shape(); batched_mm_output_shape[0] = input1.tensor_shape()[0]; const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemm::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false, - GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16)))); + ARM_COMPUTE_RETURN_ON_ERROR( + ClGemm::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, + GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false, + GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16)))); // Configure output transform - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradOutputTransformKernel::validate(&batched_mm_output, biases, dst, winograd_info, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClWinogradOutputTransformKernel::validate(&batched_mm_output, biases, dst, winograd_info, act_info)); return Status{}; } @@ -171,8 +180,14 @@ ClWinogradConv2d::ClWinogradConv2d() ClWinogradConv2d::~ClWinogradConv2d() = default; -void ClWinogradConv2d::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math) +void ClWinogradConv2d::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, conv_info, act_info, enable_fast_math)); ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info, enable_fast_math); @@ -187,50 +202,53 @@ void ClWinogradConv2d::configure(const ClCompileContext &compile_context, ITenso const Size2D output_tile = 
winograd_output_tile(input_dims, kernel_size, src->data_layout()); // Check if the Winograd configuration requires fast math - if(!enable_fast_math) + if (!enable_fast_math) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); //disable winograd for fp16 if fast math is false. - ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true"); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, + DataType::F32); //disable winograd for fp16 if fast math is false. + ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), + "This Winograd configuration requires enable_fast_math=true"); } - const WinogradInfo winograd_info = WinogradInfo(output_tile, - kernel_size, - input_dims, - conv_info, - src->data_layout()); + const WinogradInfo winograd_info = + WinogradInfo(output_tile, kernel_size, input_dims, conv_info, src->data_layout()); _is_prepared = false; // Configure input transform _input_transform->configure(compile_context, src, &_input0, winograd_info); - _border_handler.configure(compile_context, src, _input_transform->border_size(), BorderMode::CONSTANT, PixelValue()); + _border_handler.configure(compile_context, src, _input_transform->border_size(), BorderMode::CONSTANT, + PixelValue()); // Configure filter transform _filter_transform->configure(compile_context, weights, &_input1, winograd_info); // Configure batched matrix multiply - _batched_mm.configure(compile_context, &_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, - false, false, - GEMMLowpOutputStageInfo(), - (src->data_type() == DataType::F16))); + _batched_mm.configure(compile_context, &_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, + GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false, + GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16))); // Configure output transform _output_transform->set_target(CLScheduler::get().target()); _output_transform->configure(compile_context, &_batched_mm_output, biases, dst, winograd_info, act_info); - _aux_mem = _batched_mm.workspace(); - const MemoryLifetime wino_wei_lifetm = std::any_of(std::begin(_aux_mem), std::end(_aux_mem), [](const auto & r) - { - return (r.lifetime == MemoryLifetime::Persistent) && (r.size > 0); - }) ? - MemoryLifetime::Prepare : - MemoryLifetime::Persistent; + _aux_mem = _batched_mm.workspace(); + const MemoryLifetime wino_wei_lifetm = + std::any_of(std::begin(_aux_mem), std::end(_aux_mem), + [](const auto &r) { return (r.lifetime == MemoryLifetime::Persistent) && (r.size > 0); }) + ? 
MemoryLifetime::Prepare + : MemoryLifetime::Persistent; _aux_mem.push_back(MemoryInfo(offset_int_vec(2), MemoryLifetime::Temporary, _input0.total_size())); _aux_mem.push_back(MemoryInfo(offset_int_vec(3), wino_wei_lifetm, _input1.total_size())); _aux_mem.push_back(MemoryInfo(offset_int_vec(4), MemoryLifetime::Temporary, _batched_mm_output.total_size())); } -Status ClWinogradConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +Status ClWinogradConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info, enable_fast_math)); return Status{}; @@ -251,10 +269,9 @@ void ClWinogradConv2d::run(ITensorPack &tensors) prepare(tensors); // Run input transform - ITensorPack pack_it - { - { TensorType::ACL_SRC, src }, - { TensorType::ACL_DST, input0.get() }, + ITensorPack pack_it{ + {TensorType::ACL_SRC, src}, + {TensorType::ACL_DST, input0.get()}, }; CLScheduler::get().enqueue_op(_border_handler, pack_it, false); CLScheduler::get().enqueue_op(*_input_transform, pack_it, false); @@ -263,31 +280,31 @@ void ClWinogradConv2d::run(ITensorPack &tensors) ITensorPack pack_mm = tensors; pack_mm.add_const_tensor(TensorType::ACL_SRC_0, input0.get()); pack_mm.add_tensor(TensorType::ACL_DST, batched_mm_output.get()); - is_gemm_reshaped ? pack_mm.remove_tensor(TensorType::ACL_SRC_1) : pack_mm.add_const_tensor(TensorType::ACL_SRC_1, input1.get()); + is_gemm_reshaped ? 
pack_mm.remove_tensor(TensorType::ACL_SRC_1) + : pack_mm.add_const_tensor(TensorType::ACL_SRC_1, input1.get()); _batched_mm.run(pack_mm); // Run output transform - ITensorPack pack_ot - { - { TensorType::ACL_SRC_0, batched_mm_output.get() }, - { TensorType::ACL_SRC_1, biases }, - { TensorType::ACL_DST, dst }, + ITensorPack pack_ot{ + {TensorType::ACL_SRC_0, batched_mm_output.get()}, + {TensorType::ACL_SRC_1, biases}, + {TensorType::ACL_DST, dst}, }; CLScheduler::get().enqueue_op(*_output_transform, pack_ot); } void ClWinogradConv2d::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { - auto weights = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto weights = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); ICLTensor *in1_aux = utils::cast::polymorphic_downcast(tensors.get_tensor(offset_int_vec(3))); CLAuxTensorHandler input1(_input1, *in1_aux); - ITensorPack pack_ft - { - { TensorType::ACL_SRC, weights }, - { TensorType::ACL_DST, input1.get() }, + ITensorPack pack_ft{ + {TensorType::ACL_SRC, weights}, + {TensorType::ACL_DST, input1.get()}, }; // Run filter transform and mark original weights as unused CLScheduler::get().enqueue_op(*_filter_transform, pack_ft, false); @@ -308,4 +325,4 @@ experimental::MemoryRequirements ClWinogradConv2d::workspace() const return _aux_mem; } } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClWinogradConv2d.h b/src/gpu/cl/operators/ClWinogradConv2d.h index eb2f7a72b2c18f0bbb892bea9afb0c4773a1fd63..54ec1a173780175e1b30a65ab9a1a108dde5f363 100644 --- a/src/gpu/cl/operators/ClWinogradConv2d.h +++ b/src/gpu/cl/operators/ClWinogradConv2d.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_WINOGRADCONV2D_H #include "arm_compute/runtime/CL/CLTensor.h" + #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -41,7 +42,7 @@ namespace kernels class ClWinogradInputTransformKernel; class ClWinogradFilterTransformKernel; class ClWinogradOutputTransformKernel; -} // kernels +} // namespace kernels /** Basic function to execute Winograd-based convolution on OpenCL. This function calls the following OpenCL functions/kernels: * * -# @ref kernels::ClWinogradInputTransformKernel @@ -93,20 +94,31 @@ public: * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation * available which may introduce a drop of accuracy as well. 
Default is false */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); /** Static function to check if given info will lead to a valid configuration * * Similar to ClWinogradConv2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); // Inherited method overridden - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/gpu/cl/utils/ClAuxTensorHandler.h b/src/gpu/cl/utils/ClAuxTensorHandler.h index af383489a1a6f7d35f3d78c3dcba071c64602690..81dc3baef46d23d15bfb2e9ec236fc7b320dc89e 100644 --- a/src/gpu/cl/utils/ClAuxTensorHandler.h +++ b/src/gpu/cl/utils/ClAuxTensorHandler.h @@ -39,25 +39,26 @@ namespace opencl class CLAuxTensorHandler { public: - CLAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false) + CLAuxTensorHandler( + int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false) : _tensor() { - if(info.total_size() == 0) + if (info.total_size() == 0) { return; } _tensor.allocator()->soft_init(info); ICLTensor *packed_tensor = utils::cast::polymorphic_downcast(pack.get_tensor(slot_id)); - if((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size())) + if ((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size())) { - if(!bypass_alloc) + if (!bypass_alloc) { _tensor.allocator()->allocate(); ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Allocating auxiliary tensor"); } - if(pack_inject) + if (pack_inject) { pack.add_tensor(slot_id, &_tensor); _injected_tensor_pack = &pack; @@ -70,22 +71,21 @@ public: } } - CLAuxTensorHandler(TensorInfo &info, ICLTensor &tensor) - : _tensor() + CLAuxTensorHandler(TensorInfo &info, ICLTensor &tensor) : _tensor() { _tensor.allocator()->soft_init(info); - if(info.total_size() <= tensor.info()->total_size()) + if (info.total_size() <= tensor.info()->total_size()) { _tensor.allocator()->import_memory(tensor.cl_buffer()); } } - CLAuxTensorHandler(const CLAuxTensorHandler &) = delete; + CLAuxTensorHandler(const CLAuxTensorHandler &) = delete; CLAuxTensorHandler &operator=(const CLAuxTensorHandler) = delete; ~CLAuxTensorHandler() { - if(_injected_tensor_pack) + if (_injected_tensor_pack) { _injected_tensor_pack->remove_tensor(_injected_slot_id); } @@ -103,9 +103,9 @@ public: private: CLTensor _tensor{}; - ITensorPack 
*_injected_tensor_pack{ nullptr }; - int _injected_slot_id{ TensorType::ACL_UNKNOWN }; + ITensorPack *_injected_tensor_pack{nullptr}; + int _injected_slot_id{TensorType::ACL_UNKNOWN}; }; } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_UTILS_CL_AUX_TENSOR_HANDLER_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_CL_UTILS_CL_AUX_TENSOR_HANDLER_H */ diff --git a/src/graph/DataLayerVisitor.cpp b/src/graph/DataLayerVisitor.cpp index 85d24b46546b0ec025b8c11dae101ab9a0d65683..f0fac25577e0cd82bbd6d76448a1ed0e962b0e2b 100644 --- a/src/graph/DataLayerVisitor.cpp +++ b/src/graph/DataLayerVisitor.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,8 +25,8 @@ #include "arm_compute/core/Error.h" #include "arm_compute/graph/Graph.h" -#include "arm_compute/graph/TypePrinter.h" #include "arm_compute/graph/nodes/Nodes.h" +#include "arm_compute/graph/TypePrinter.h" namespace arm_compute { @@ -43,17 +43,14 @@ void add_convolution_layer_data(DataLayerVisitor::LayerData &layer_data, T &node layer_data["data_layout"] = to_string(layout); // Add padding info std::ostringstream padding; - padding << "[" << to_string(ps_info.pad_left()) << "," - << to_string(ps_info.pad_top()) << "," - << to_string(ps_info.pad_bottom()) << "," - << to_string(ps_info.pad_right()) << "]"; + padding << "[" << to_string(ps_info.pad_left()) << "," << to_string(ps_info.pad_top()) << "," + << to_string(ps_info.pad_bottom()) << "," << to_string(ps_info.pad_right()) << "]"; layer_data["pad"] = padding.str(); // Add stride info std::ostringstream stride; - stride << "[" << to_string(ps_info.stride().first) << "," - << to_string(ps_info.stride().second) << "]"; + stride << "[" << to_string(ps_info.stride().first) << "," << to_string(ps_info.stride().second) << "]"; layer_data["stride"] = stride.str(); @@ -68,12 +65,12 @@ void add_convolution_layer_data(DataLayerVisitor::LayerData &layer_data, T &node // Change input names for weights / bias (if applicable) // Assumes input(1) is weights and input(2) is bias - if(layer_data.count("input_shape1")) + if (layer_data.count("input_shape1")) { layer_data["weights_shape"] = layer_data["input_shape1"]; layer_data.erase("input_shape1"); } - if(layer_data.count("input_shape2")) + if (layer_data.count("input_shape2")) { layer_data["bias_shape"] = layer_data["input_shape2"]; layer_data.erase("input_shape2"); @@ -92,16 +89,17 @@ template void add_generic_layer_data(DataLayerVisitor::LayerData &layer_data, T &node) { // Loop over each input tensor - for(size_t tensor_no = 0; tensor_no < node.num_inputs(); ++tensor_no) + for (size_t tensor_no = 0; tensor_no < node.num_inputs(); ++tensor_no) { // Add input tensor shapes - if(node.input(tensor_no) != nullptr) + if (node.input(tensor_no) != nullptr) { - layer_data["input_shape" + to_string(tensor_no)] = "[" + to_string(node.input(tensor_no)->desc().shape) + "]"; + layer_data["input_shape" + to_string(tensor_no)] = + "[" + to_string(node.input(tensor_no)->desc().shape) + "]"; } } // Add output tensor shape - if(node.output(0) != nullptr) + if (node.output(0) != nullptr) { layer_data["output_shape0"] = "[" + to_string(node.output(0)->desc().shape) + "]"; } @@ -131,14 +129,6 @@ void DataLayerVisitor::visit(FusedConvolutionBatchNormalizationNode &n) add_convolution_layer_method(_layer_data, n); } -void DataLayerVisitor::visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n) -{ - _layer_data.clear(); - add_generic_layer_data(_layer_data, 
n); - add_convolution_layer_data(_layer_data, n); - add_convolution_layer_method(_layer_data, n); -} - void DataLayerVisitor::visit(FusedDepthwiseConvolutionBatchNormalizationNode &n) { _layer_data.clear(); diff --git a/src/graph/Graph.cpp b/src/graph/Graph.cpp index 4ce53589d4f290a8019748856d40bc8f32e41e8a..3ae83f2e80f6243a6d6f026d29bec9abae11e6ff 100644 --- a/src/graph/Graph.cpp +++ b/src/graph/Graph.cpp @@ -34,24 +34,24 @@ Graph::Graph(GraphID id, std::string name) bool Graph::remove_node(NodeID nid) { - if(nid >= _nodes.size()) + if (nid >= _nodes.size()) { return false; } std::unique_ptr &node = _nodes[nid]; - if(node) + if (node) { // Remove input connections - for(auto &input_eid : node->_input_edges) + for (auto &input_eid : node->_input_edges) { remove_connection(input_eid); } // Remove output connections std::set output_edges_copy = node->output_edges(); - for(auto &output_eid : output_edges_copy) + for (auto &output_eid : output_edges_copy) { remove_connection(output_eid); } @@ -71,8 +71,10 @@ EdgeID Graph::add_connection(NodeID source, size_t source_idx, NodeID sink, size arm_compute::lock_guard lock(_mtx); // Check if node index is valid, if node exists and finally if the connection index is valid - ARM_COMPUTE_ERROR_ON((source >= _nodes.size()) || (_nodes[source] == nullptr) || (source_idx >= _nodes[source]->num_outputs())); - ARM_COMPUTE_ERROR_ON((sink >= _nodes.size()) || (_nodes[sink] == nullptr) || (sink_idx >= _nodes[sink]->num_inputs())); + ARM_COMPUTE_ERROR_ON((source >= _nodes.size()) || (_nodes[source] == nullptr) || + (source_idx >= _nodes[source]->num_outputs())); + ARM_COMPUTE_ERROR_ON((sink >= _nodes.size()) || (_nodes[sink] == nullptr) || + (sink_idx >= _nodes[sink]->num_inputs())); // Get nodes std::unique_ptr &source_node = _nodes[source]; @@ -80,23 +82,25 @@ EdgeID Graph::add_connection(NodeID source, size_t source_idx, NodeID sink, size // Check for duplicate connections (Check only sink node) Edge *sink_node_edge = sink_node->input_edge(sink_idx); - if((sink_node_edge != nullptr) && (sink_node_edge->producer_id() == source) && (sink_node_edge->producer_idx() == source_idx) - && (sink_node_edge->consumer_id() == sink) && (sink_node_edge->consumer_idx() == sink_idx)) + if ((sink_node_edge != nullptr) && (sink_node_edge->producer_id() == source) && + (sink_node_edge->producer_idx() == source_idx) && (sink_node_edge->consumer_id() == sink) && + (sink_node_edge->consumer_idx() == sink_idx)) { return sink_node_edge->id(); } // Check if there is already a tensor associated with output if not create one TensorID tid = source_node->output_id(source_idx); - if(tid == NullTensorID) + if (tid == NullTensorID) { tid = create_tensor(); } std::unique_ptr &tensor = _tensors[tid]; // Create connections - EdgeID eid = _edges.size(); - auto connection = std::make_unique(eid, source_node.get(), source_idx, sink_node.get(), sink_idx, tensor.get()); + EdgeID eid = _edges.size(); + auto connection = + std::make_unique(eid, source_node.get(), source_idx, sink_node.get(), sink_idx, tensor.get()); _edges.push_back(std::move(connection)); // Add connections to source and sink nodes @@ -117,7 +121,7 @@ EdgeID Graph::add_connection(NodeID source, size_t source_idx, NodeID sink, size bool Graph::remove_connection(EdgeID eid) { - if(eid >= _edges.size()) + if (eid >= _edges.size()) { return false; } @@ -125,22 +129,22 @@ bool Graph::remove_connection(EdgeID eid) std::unique_ptr &edge = _edges[eid]; // Remove node connections - if(edge != nullptr) + if (edge != nullptr) { // Get tensor 
bound to the edge - if(edge->tensor() != nullptr) + if (edge->tensor() != nullptr) { edge->tensor()->unbind_edge(eid); } // Remove edges from source node - if(edge->producer() != nullptr) + if (edge->producer() != nullptr) { edge->producer()->_output_edges.erase(eid); } // Remove edges from sink node - if((edge->consumer() != nullptr) && (edge->consumer_idx() < edge->consumer()->_input_edges.size())) + if ((edge->consumer() != nullptr) && (edge->consumer_idx() < edge->consumer()->_input_edges.size())) { edge->consumer()->_input_edges[edge->consumer_idx()] = EmptyEdgeID; } @@ -231,4 +235,4 @@ Tensor *Graph::tensor(TensorID id) return (id >= _tensors.size()) ? nullptr : _tensors[id].get(); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/GraphBuilder.cpp b/src/graph/GraphBuilder.cpp index 7e5d3133d174caba58e4ff8effb54f3a304cbc97..eab91b2347965521c26d43dc1c2c05238cdc46c9 100644 --- a/src/graph/GraphBuilder.cpp +++ b/src/graph/GraphBuilder.cpp @@ -24,10 +24,10 @@ #include "arm_compute/graph/GraphBuilder.h" #include "arm_compute/core/utils/DataTypeUtils.h" -#include "arm_compute/graph/Graph.h" -#include "arm_compute/graph/Utils.h" #include "arm_compute/graph/algorithms/TopologicalSort.h" +#include "arm_compute/graph/Graph.h" #include "arm_compute/graph/nodes/Nodes.h" +#include "arm_compute/graph/Utils.h" #include "support/ToolchainSupport.h" @@ -41,7 +41,8 @@ inline void check_nodeidx_pair(const NodeIdxPair &pair, const Graph &g) { ARM_COMPUTE_UNUSED(pair); ARM_COMPUTE_UNUSED(g); - ARM_COMPUTE_ERROR_ON((pair.node_id >= g.nodes().size()) || (g.node((pair).node_id) == nullptr) || (pair.index >= g.node(pair.node_id)->num_outputs())); + ARM_COMPUTE_ERROR_ON((pair.node_id >= g.nodes().size()) || (g.node((pair).node_id) == nullptr) || + (pair.index >= g.node(pair.node_id)->num_outputs())); } Status set_node_params(Graph &g, NodeID nid, NodeParams ¶ms) @@ -67,7 +68,8 @@ Status set_accessor_on_node(Graph &g, NodeID nid, bool is_output, size_t idx, IT return Status{}; } -NodeID add_const_node_with_name(Graph &g, NodeParams params, const std::string &name, const TensorDescriptor &desc, ITensorAccessorUPtr accessor) +NodeID add_const_node_with_name( + Graph &g, NodeParams params, const std::string &name, const TensorDescriptor &desc, ITensorAccessorUPtr accessor) { params.name = params.name.empty() ? "" : params.name + name; auto nid = GraphBuilder::add_const_node(g, params, desc, std::move(accessor)); @@ -76,7 +78,7 @@ NodeID add_const_node_with_name(Graph &g, NodeParams params, const std::string & } template -NodeID create_simple_single_input_output_node(Graph &g, NodeParams ¶ms, NodeIdxPair input, Args &&... args) +NodeID create_simple_single_input_output_node(Graph &g, NodeParams ¶ms, NodeIdxPair input, Args &&...args) { check_nodeidx_pair(input, g); @@ -88,14 +90,17 @@ NodeID create_simple_single_input_output_node(Graph &g, NodeParams ¶ms, Node } template -NodeID create_simple_multiple_input_single_output_node(Graph &g, NodeParams ¶ms, const std::vector &inputs, Args &&... 
args) +NodeID create_simple_multiple_input_single_output_node(Graph &g, + NodeParams ¶ms, + const std::vector &inputs, + Args &&...args) { ARM_COMPUTE_ERROR_ON(inputs.size() == 0); NodeID nid = g.add_node(std::forward(args)...); unsigned int i = 0; - for(const auto &input : inputs) + for (const auto &input : inputs) { check_nodeidx_pair(input, g); g.add_connection(input.node_id, input.index, nid, i++); @@ -106,7 +111,8 @@ NodeID create_simple_multiple_input_single_output_node(Graph &g, NodeParams &par } } // namespace -NodeID GraphBuilder::add_const_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor) +NodeID +GraphBuilder::add_const_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor) { auto nid = g.add_node(desc); set_node_params(g, nid, params); @@ -114,7 +120,8 @@ NodeID GraphBuilder::add_const_node(Graph &g, NodeParams params, const TensorDes return nid; } -NodeID GraphBuilder::add_input_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor) +NodeID +GraphBuilder::add_input_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor) { auto nid = g.add_node(desc); set_node_params(g, nid, params); @@ -134,21 +141,35 @@ NodeID GraphBuilder::add_output_node(Graph &g, NodeParams params, NodeIdxPair in return nid; } -NodeID GraphBuilder::add_activation_node(Graph &g, NodeParams params, NodeIdxPair input, ActivationLayerInfo act_info, +NodeID GraphBuilder::add_activation_node(Graph &g, + NodeParams params, + NodeIdxPair input, + ActivationLayerInfo act_info, const QuantizationInfo &out_quant_info) { return create_simple_single_input_output_node(g, params, input, act_info, out_quant_info); } -NodeID GraphBuilder::add_arg_min_max_node(Graph &g, NodeParams params, NodeIdxPair input, ReductionOperation op, unsigned int axis, - DataType out_data_type, const QuantizationInfo &out_quant_info) +NodeID GraphBuilder::add_arg_min_max_node(Graph &g, + NodeParams params, + NodeIdxPair input, + ReductionOperation op, + unsigned int axis, + DataType out_data_type, + const QuantizationInfo &out_quant_info) { - return create_simple_single_input_output_node(g, params, input, op, axis, out_data_type, out_quant_info); + return create_simple_single_input_output_node(g, params, input, op, axis, out_data_type, + out_quant_info); } -NodeID GraphBuilder::add_batch_normalization_node(Graph &g, NodeParams params, NodeIdxPair input, float epsilon, - ITensorAccessorUPtr mean_accessor, ITensorAccessorUPtr var_accessor, - ITensorAccessorUPtr beta_accessor, ITensorAccessorUPtr gamma_accessor) +NodeID GraphBuilder::add_batch_normalization_node(Graph &g, + NodeParams params, + NodeIdxPair input, + float epsilon, + ITensorAccessorUPtr mean_accessor, + ITensorAccessorUPtr var_accessor, + ITensorAccessorUPtr beta_accessor, + ITensorAccessorUPtr gamma_accessor) { check_nodeidx_pair(input, g); @@ -168,14 +189,14 @@ NodeID GraphBuilder::add_batch_normalization_node(Graph &g, NodeParams params, N // Create beta node NodeID beta_nid = EmptyNodeID; - if(has_beta) + if (has_beta) { beta_nid = add_const_node_with_name(g, params, "Beta", common_desc, std::move(beta_accessor)); } // Create gamma node NodeID gamma_nid = EmptyNodeID; - if(has_gamma) + if (has_gamma) { gamma_nid = add_const_node_with_name(g, params, "Gamma", common_desc, std::move(gamma_accessor)); } @@ -185,11 +206,11 @@ NodeID GraphBuilder::add_batch_normalization_node(Graph &g, NodeParams params, N 
g.add_connection(input.node_id, input.index, batch_norm_nid, 0); g.add_connection(mean_nid, 0, batch_norm_nid, 1); g.add_connection(var_nid, 0, batch_norm_nid, 2); - if(has_beta) + if (has_beta) { g.add_connection(beta_nid, 0, batch_norm_nid, 3); } - if(has_gamma) + if (has_gamma) { g.add_connection(gamma_nid, 0, batch_norm_nid, 4); } @@ -198,7 +219,8 @@ NodeID GraphBuilder::add_batch_normalization_node(Graph &g, NodeParams params, N return batch_norm_nid; } -NodeID GraphBuilder::add_bounding_box_transform_node(Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair deltas, BoundingBoxTransformInfo info) +NodeID GraphBuilder::add_bounding_box_transform_node( + Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair deltas, BoundingBoxTransformInfo info) { check_nodeidx_pair(input, g); check_nodeidx_pair(deltas, g); @@ -217,10 +239,17 @@ NodeID GraphBuilder::add_channel_shuffle_node(Graph &g, NodeParams params, NodeI return create_simple_single_input_output_node(g, params, input, num_groups); } -NodeID GraphBuilder::add_convolution_node(Graph &g, NodeParams params, NodeIdxPair input, - Size2D kernel_spatial_extend, unsigned int depth, PadStrideInfo conv_info, - unsigned int num_groups, ConvolutionMethod method, FastMathHint fast_math_hint, - ITensorAccessorUPtr weights_accessor, ITensorAccessorUPtr bias_accessor, +NodeID GraphBuilder::add_convolution_node(Graph &g, + NodeParams params, + NodeIdxPair input, + Size2D kernel_spatial_extend, + unsigned int depth, + PadStrideInfo conv_info, + unsigned int num_groups, + ConvolutionMethod method, + FastMathHint fast_math_hint, + ITensorAccessorUPtr weights_accessor, + ITensorAccessorUPtr bias_accessor, const QuantizationInfo &weights_quant_info, const QuantizationInfo &out_quant_info) { @@ -241,7 +270,7 @@ NodeID GraphBuilder::add_convolution_node(Graph &g, NodeParams params, NodeIdxPa w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::CHANNEL), get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL) / num_groups); w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::BATCHES), depth); - if(!weights_quant_info.empty()) + if (!weights_quant_info.empty()) { w_desc.quant_info = weights_quant_info; } @@ -250,11 +279,11 @@ NodeID GraphBuilder::add_convolution_node(Graph &g, NodeParams params, NodeIdxPa // Create bias nodes NodeID b_nid = EmptyNodeID; - if(has_bias) + if (has_bias) { TensorDescriptor b_desc = input_tensor_desc; b_desc.shape = TensorShape(depth); - if(is_data_type_quantized_asymmetric(input_tensor_desc.data_type)) + if (is_data_type_quantized_asymmetric(input_tensor_desc.data_type)) { b_desc.data_type = DataType::S32; } @@ -265,7 +294,7 @@ NodeID GraphBuilder::add_convolution_node(Graph &g, NodeParams params, NodeIdxPa NodeID conv_nid = g.add_node(conv_info, num_groups, method, fast_math_hint, out_quant_info); g.add_connection(input.node_id, input.index, conv_nid, 0); g.add_connection(w_nid, 0, conv_nid, 1); - if(has_bias) + if (has_bias) { g.add_connection(b_nid, 0, conv_nid, 2); } @@ -274,8 +303,12 @@ NodeID GraphBuilder::add_convolution_node(Graph &g, NodeParams params, NodeIdxPa return conv_nid; } -NodeID GraphBuilder::add_deconvolution_node(Graph &g, NodeParams params, NodeIdxPair input, - Size2D kernel_spatial_extend, unsigned int depth, PadStrideInfo deconv_info, +NodeID GraphBuilder::add_deconvolution_node(Graph &g, + NodeParams params, + NodeIdxPair input, + Size2D kernel_spatial_extend, + unsigned int depth, + PadStrideInfo deconv_info, ITensorAccessorUPtr 
weights_accessor, ITensorAccessorUPtr bias_accessor) { @@ -301,11 +334,11 @@ NodeID GraphBuilder::add_deconvolution_node(Graph &g, NodeParams params, NodeIdx // Create bias nodes NodeID b_nid = EmptyNodeID; - if(has_bias) + if (has_bias) { TensorDescriptor b_desc = input_tensor_desc; b_desc.shape = TensorShape(depth); - if(is_data_type_quantized_asymmetric(input_tensor_desc.data_type)) + if (is_data_type_quantized_asymmetric(input_tensor_desc.data_type)) { b_desc.data_type = DataType::S32; } @@ -313,10 +346,10 @@ NodeID GraphBuilder::add_deconvolution_node(Graph &g, NodeParams params, NodeIdx } // Create convolution node and connect - NodeID deconv_nid = g.add_node(descriptors::DeconvolutionLayerDescriptor{ deconv_info }); + NodeID deconv_nid = g.add_node(descriptors::DeconvolutionLayerDescriptor{deconv_info}); g.add_connection(input.node_id, input.index, deconv_nid, 0); g.add_connection(w_nid, 0, deconv_nid, 1); - if(has_bias) + if (has_bias) { g.add_connection(b_nid, 0, deconv_nid, 2); } @@ -325,14 +358,26 @@ NodeID GraphBuilder::add_deconvolution_node(Graph &g, NodeParams params, NodeIdx return deconv_nid; } -NodeID GraphBuilder::add_concatenate_node(Graph &g, NodeParams params, const std::vector &inputs, const descriptors::ConcatLayerDescriptor &concat_descriptor) +NodeID GraphBuilder::add_concatenate_node(Graph &g, + NodeParams params, + const std::vector &inputs, + const descriptors::ConcatLayerDescriptor &concat_descriptor) { - return create_simple_multiple_input_single_output_node(g, params, inputs, inputs.size(), concat_descriptor); + return create_simple_multiple_input_single_output_node(g, params, inputs, inputs.size(), + concat_descriptor); } -NodeID GraphBuilder::add_depthwise_convolution_node(Graph &g, NodeParams params, NodeIdxPair input, Size2D kernel_spatial_extend, - PadStrideInfo conv_info, int depth_multiplier, DepthwiseConvolutionMethod method, - ITensorAccessorUPtr weights_accessor, ITensorAccessorUPtr bias_accessor, const QuantizationInfo &quant_info, const QuantizationInfo &out_quant_info) +NodeID GraphBuilder::add_depthwise_convolution_node(Graph &g, + NodeParams params, + NodeIdxPair input, + Size2D kernel_spatial_extend, + PadStrideInfo conv_info, + int depth_multiplier, + DepthwiseConvolutionMethod method, + ITensorAccessorUPtr weights_accessor, + ITensorAccessorUPtr bias_accessor, + const QuantizationInfo &quant_info, + const QuantizationInfo &out_quant_info) { check_nodeidx_pair(input, g); ARM_COMPUTE_ERROR_ON((kernel_spatial_extend.width == 0) || (kernel_spatial_extend.height == 0)); @@ -349,7 +394,7 @@ NodeID GraphBuilder::add_depthwise_convolution_node(Graph &g, NodeParams params, w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::HEIGHT), kernel_spatial_extend.height); w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::CHANNEL), get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL) * depth_multiplier); - if(!quant_info.empty()) + if (!quant_info.empty()) { w_desc.quant_info = quant_info; } @@ -358,12 +403,13 @@ NodeID GraphBuilder::add_depthwise_convolution_node(Graph &g, NodeParams params, // Create bias nodes NodeID b_nid = EmptyNodeID; - if(has_bias) + if (has_bias) { TensorDescriptor b_desc = input_tensor_desc; - b_desc.shape = TensorShape(get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL) * depth_multiplier); + b_desc.shape = + TensorShape(get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL) * depth_multiplier); - 
if(is_data_type_quantized_asymmetric(b_desc.data_type)) + if (is_data_type_quantized_asymmetric(b_desc.data_type)) { b_desc.data_type = DataType::S32; } @@ -375,7 +421,7 @@ NodeID GraphBuilder::add_depthwise_convolution_node(Graph &g, NodeParams params, NodeID conv_nid = g.add_node(conv_info, depth_multiplier, method, out_quant_info); g.add_connection(input.node_id, input.index, conv_nid, 0); g.add_connection(w_nid, 0, conv_nid, 1); - if(has_bias) + if (has_bias) { g.add_connection(b_nid, 0, conv_nid, 2); } @@ -394,7 +440,12 @@ NodeID GraphBuilder::add_dequantization_node(Graph &g, NodeParams params, NodeId return create_simple_single_input_output_node(g, params, input); } -NodeID GraphBuilder::add_detection_output_node(Graph &g, NodeParams params, NodeIdxPair input_loc, NodeIdxPair input_conf, NodeIdxPair input_priorbox, const DetectionOutputLayerInfo &detect_info) +NodeID GraphBuilder::add_detection_output_node(Graph &g, + NodeParams params, + NodeIdxPair input_loc, + NodeIdxPair input_conf, + NodeIdxPair input_priorbox, + const DetectionOutputLayerInfo &detect_info) { check_nodeidx_pair(input_loc, g); check_nodeidx_pair(input_conf, g); @@ -411,18 +462,24 @@ NodeID GraphBuilder::add_detection_output_node(Graph &g, NodeParams params, Node return detect_nid; } -NodeID GraphBuilder::add_detection_post_process_node(Graph &g, NodeParams params, NodeIdxPair input_box_encoding, NodeIdxPair input_class_prediction, const DetectionPostProcessLayerInfo &detect_info, - ITensorAccessorUPtr anchors_accessor, const QuantizationInfo &anchor_quant_info) +NodeID GraphBuilder::add_detection_post_process_node(Graph &g, + NodeParams params, + NodeIdxPair input_box_encoding, + NodeIdxPair input_class_prediction, + const DetectionPostProcessLayerInfo &detect_info, + ITensorAccessorUPtr anchors_accessor, + const QuantizationInfo &anchor_quant_info) { check_nodeidx_pair(input_box_encoding, g); check_nodeidx_pair(input_class_prediction, g); // Get input tensor descriptor - const TensorDescriptor input_box_encoding_tensor_desc = get_tensor_descriptor(g, g.node(input_box_encoding.node_id)->outputs()[0]); + const TensorDescriptor input_box_encoding_tensor_desc = + get_tensor_descriptor(g, g.node(input_box_encoding.node_id)->outputs()[0]); // Calculate anchor descriptor TensorDescriptor anchor_desc = input_box_encoding_tensor_desc; - if(!anchor_quant_info.empty()) + if (!anchor_quant_info.empty()) { anchor_desc.quant_info = anchor_quant_info; } @@ -446,12 +503,13 @@ NodeID GraphBuilder::add_dummy_node(Graph &g, NodeParams params, NodeIdxPair inp return create_simple_single_input_output_node(g, params, input, shape); } -NodeID GraphBuilder::add_elementwise_node(Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, EltwiseOperation operation) +NodeID GraphBuilder::add_elementwise_node( + Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, EltwiseOperation operation) { check_nodeidx_pair(input0, g); check_nodeidx_pair(input1, g); - NodeID nid = g.add_node(descriptors::EltwiseLayerDescriptor{ operation }); + NodeID nid = g.add_node(descriptors::EltwiseLayerDescriptor{operation}); g.add_connection(input0.node_id, input0.index, nid, 0); g.add_connection(input1.node_id, input1.index, nid, 1); @@ -466,9 +524,15 @@ NodeID GraphBuilder::add_flatten_node(Graph &g, NodeParams params, NodeIdxPair i return create_simple_single_input_output_node(g, params, input); } -NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_outputs, - NodeID 
weights_nid, NodeID bias_nid, - const FullyConnectedLayerInfo fc_info, const QuantizationInfo &out_quant_info, FastMathHint fast_math_hint) +NodeID GraphBuilder::add_fully_connected_layer(Graph &g, + NodeParams params, + NodeIdxPair input, + unsigned int num_outputs, + NodeID weights_nid, + NodeID bias_nid, + const FullyConnectedLayerInfo fc_info, + const QuantizationInfo &out_quant_info, + FastMathHint fast_math_hint) { check_nodeidx_pair(input, g); ARM_COMPUTE_ERROR_ON(num_outputs == 0); @@ -483,7 +547,7 @@ NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, Node NodeID fc_nid = g.add_node(num_outputs, out_quant_info, fc_info, fast_math_hint); g.add_connection(input.node_id, input.index, fc_nid, 0); g.add_connection(weights_nid, 0, fc_nid, 1); - if(has_bias) + if (has_bias) { g.add_connection(bias_nid, 0, fc_nid, 2); } @@ -493,10 +557,16 @@ NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, Node return fc_nid; } -NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_outputs, - ITensorAccessorUPtr weights_accessor, ITensorAccessorUPtr bias_accessor, +NodeID GraphBuilder::add_fully_connected_layer(Graph &g, + NodeParams params, + NodeIdxPair input, + unsigned int num_outputs, + ITensorAccessorUPtr weights_accessor, + ITensorAccessorUPtr bias_accessor, const FullyConnectedLayerInfo fc_info, - const QuantizationInfo &weights_quant_info, const QuantizationInfo &out_quant_info, FastMathHint fast_math_hint) + const QuantizationInfo &weights_quant_info, + const QuantizationInfo &out_quant_info, + FastMathHint fast_math_hint) { check_nodeidx_pair(input, g); ARM_COMPUTE_ERROR_ON(num_outputs == 0); @@ -507,16 +577,17 @@ NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, Node const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]); // Create weights node - TensorDescriptor w_desc = FullyConnectedLayerNode::compute_weights_descriptor(input_tensor_desc, num_outputs, fc_info, weights_quant_info); + TensorDescriptor w_desc = FullyConnectedLayerNode::compute_weights_descriptor(input_tensor_desc, num_outputs, + fc_info, weights_quant_info); NodeID w_nid = add_const_node_with_name(g, params, "Weights", w_desc, std::move(weights_accessor)); // Create bias nodes NodeID b_nid = EmptyNodeID; - if(has_bias) + if (has_bias) { TensorDescriptor b_desc = input_tensor_desc; b_desc.shape = TensorShape(num_outputs); - if(is_data_type_quantized_asymmetric(input_tensor_desc.data_type)) + if (is_data_type_quantized_asymmetric(input_tensor_desc.data_type)) { b_desc.data_type = DataType::S32; } @@ -527,7 +598,7 @@ NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, Node NodeID fc_nid = g.add_node(num_outputs, out_quant_info, fc_info, fast_math_hint); g.add_connection(input.node_id, input.index, fc_nid, 0); g.add_connection(w_nid, 0, fc_nid, 1); - if(has_bias) + if (has_bias) { g.add_connection(b_nid, 0, fc_nid, 2); } @@ -537,7 +608,12 @@ NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, Node return fc_nid; } -NodeID GraphBuilder::add_generate_proposals_node(Graph &g, NodeParams params, NodeIdxPair scores, NodeIdxPair deltas, NodeIdxPair anchors, GenerateProposalsInfo info) +NodeID GraphBuilder::add_generate_proposals_node(Graph &g, + NodeParams params, + NodeIdxPair scores, + NodeIdxPair deltas, + NodeIdxPair anchors, + GenerateProposalsInfo info) { check_nodeidx_pair(scores, g); 
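// Editor's note, an illustrative usage sketch and not part of this patch: the GraphBuilder entry
// points being reformatted in this file all follow the same recipe: validate the incoming
// NodeIdxPair(s), create any constant nodes (weights/bias), add the layer node, then wire the
// connections. A caller typically chains them roughly as below; the descriptors, accessors and
// layer parameters are placeholders, and the trailing quantization arguments are assumed to keep
// their defaults from GraphBuilder.h.
//
//   Graph      g;
//   NodeParams params{ "example", Target::NEON };
//
//   NodeID in   = GraphBuilder::add_input_node(g, params, input_descriptor, std::move(input_accessor));
//   NodeID conv = GraphBuilder::add_convolution_node(g, params, NodeIdxPair{ in, 0 },
//                                                    Size2D(3, 3) /* kernel */, 32 /* output feature maps */,
//                                                    PadStrideInfo(1, 1, 1, 1), 1 /* num_groups */,
//                                                    ConvolutionMethod::Default, FastMathHint::Disabled,
//                                                    std::move(weights_accessor), std::move(bias_accessor));
//   GraphBuilder::add_output_node(g, params, NodeIdxPair{ conv, 0 }, std::move(output_accessor));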
check_nodeidx_pair(deltas, g); @@ -558,13 +634,14 @@ NodeID GraphBuilder::add_l2_normalize_node(Graph &g, NodeParams params, NodeIdxP return create_simple_single_input_output_node(g, params, input, axis, epsilon); } -NodeID GraphBuilder::add_normalization_node(Graph &g, NodeParams params, NodeIdxPair input, NormalizationLayerInfo norm_info) +NodeID +GraphBuilder::add_normalization_node(Graph &g, NodeParams params, NodeIdxPair input, NormalizationLayerInfo norm_info) { return create_simple_single_input_output_node(g, params, input, norm_info); } -NodeID GraphBuilder::add_normalize_planar_yuv_node(Graph &g, NodeParams params, NodeIdxPair input, - ITensorAccessorUPtr mean_accessor, ITensorAccessorUPtr std_accessor) +NodeID GraphBuilder::add_normalize_planar_yuv_node( + Graph &g, NodeParams params, NodeIdxPair input, ITensorAccessorUPtr mean_accessor, ITensorAccessorUPtr std_accessor) { check_nodeidx_pair(input, g); @@ -589,12 +666,14 @@ NodeID GraphBuilder::add_normalize_planar_yuv_node(Graph &g, NodeParams params, return norm_planar_yuv_nid; } -NodeID GraphBuilder::add_pad_node(Graph &g, NodeParams params, NodeIdxPair input, const PaddingList &paddings, PixelValue pad_value) +NodeID GraphBuilder::add_pad_node( + Graph &g, NodeParams params, NodeIdxPair input, const PaddingList &paddings, PixelValue pad_value) { return create_simple_single_input_output_node(g, params, input, paddings, pad_value); } -NodeID GraphBuilder::add_permute_node(Graph &g, NodeParams params, NodeIdxPair input, PermutationVector perm, DataLayout layout) +NodeID GraphBuilder::add_permute_node( + Graph &g, NodeParams params, NodeIdxPair input, PermutationVector perm, DataLayout layout) { return create_simple_single_input_output_node(g, params, input, perm, layout); } @@ -618,12 +697,18 @@ NodeID GraphBuilder::add_pooling_node(Graph &g, NodeParams params, NodeIdxPair i return create_simple_single_input_output_node(g, params, input, pool_info); } -NodeID GraphBuilder::add_print_node(Graph &g, NodeParams params, NodeIdxPair input, std::ostream &stream, const IOFormatInfo &format_info, const std::function transform) +NodeID GraphBuilder::add_print_node(Graph &g, + NodeParams params, + NodeIdxPair input, + std::ostream &stream, + const IOFormatInfo &format_info, + const std::function transform) { return create_simple_single_input_output_node(g, params, input, stream, format_info, transform); } -NodeID GraphBuilder::add_priorbox_node(Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, const PriorBoxLayerInfo &prior_info) +NodeID GraphBuilder::add_priorbox_node( + Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, const PriorBoxLayerInfo &prior_info) { check_nodeidx_pair(input0, g); check_nodeidx_pair(input1, g); @@ -638,12 +723,16 @@ NodeID GraphBuilder::add_priorbox_node(Graph &g, NodeParams params, NodeIdxPair return prior_nid; } -NodeID GraphBuilder::add_quantization_node(Graph &g, NodeParams params, NodeIdxPair input, const QuantizationInfo &out_quant_info) +NodeID GraphBuilder::add_quantization_node(Graph &g, + NodeParams params, + NodeIdxPair input, + const QuantizationInfo &out_quant_info) { return create_simple_single_input_output_node(g, params, input, out_quant_info); } -NodeID GraphBuilder::add_reduction_operation_node(Graph &g, NodeParams params, NodeIdxPair input, ReductionOperation op, int axis, bool keep_dims) +NodeID GraphBuilder::add_reduction_operation_node( + Graph &g, NodeParams params, NodeIdxPair input, ReductionOperation op, int axis, bool keep_dims) { return 
create_simple_single_input_output_node(g, params, input, op, axis, keep_dims); } @@ -658,13 +747,14 @@ NodeID GraphBuilder::add_reshape_node(Graph &g, NodeParams params, NodeIdxPair i return create_simple_single_input_output_node(g, params, input, shape); } -NodeID GraphBuilder::add_resize_node(Graph &g, NodeParams params, NodeIdxPair input, InterpolationPolicy policy, - float width_scale, float height_scale) +NodeID GraphBuilder::add_resize_node( + Graph &g, NodeParams params, NodeIdxPair input, InterpolationPolicy policy, float width_scale, float height_scale) { return create_simple_single_input_output_node(g, params, input, policy, width_scale, height_scale); } -NodeID GraphBuilder::add_roi_align_node(Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair rois, ROIPoolingLayerInfo pool_info) +NodeID GraphBuilder::add_roi_align_node( + Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair rois, ROIPoolingLayerInfo pool_info) { check_nodeidx_pair(input, g); check_nodeidx_pair(rois, g); @@ -678,7 +768,11 @@ NodeID GraphBuilder::add_roi_align_node(Graph &g, NodeParams params, NodeIdxPair return nid; } -NodeID GraphBuilder::add_scale_layer(Graph &g, const NodeParams ¶ms, NodeIdxPair input, ITensorAccessorUPtr mul_accessor, ITensorAccessorUPtr add_accessor) +NodeID GraphBuilder::add_scale_layer(Graph &g, + const NodeParams ¶ms, + NodeIdxPair input, + ITensorAccessorUPtr mul_accessor, + ITensorAccessorUPtr add_accessor) { check_nodeidx_pair(input, g); @@ -688,22 +782,23 @@ NodeID GraphBuilder::add_scale_layer(Graph &g, const NodeParams ¶ms, NodeIdx // Create mul node TensorDescriptor mul_desc = input_tensor_desc; - const size_t C = input_tensor_desc.shape[get_dimension_idx(input_data_layout, DataLayoutDimension::CHANNEL)]; + const size_t C = input_tensor_desc.shape[get_dimension_idx(input_data_layout, DataLayoutDimension::CHANNEL)]; mul_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::WIDTH), 1); mul_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::HEIGHT), 1); mul_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::CHANNEL), C); NodeID mul_const_nid = add_const_node_with_name(g, params, "Mul", mul_desc, std::move(mul_accessor)); - NodeIdxPair mul_const_nidxp = { mul_const_nid, 0 }; + NodeIdxPair mul_const_nidxp = {mul_const_nid, 0}; // Create add node TensorDescriptor add_desc = mul_desc; NodeID add_const_nid = add_const_node_with_name(g, params, "Add", add_desc, std::move(add_accessor)); - NodeIdxPair add_const_nidxp = { add_const_nid, 0 }; + NodeIdxPair add_const_nidxp = {add_const_nid, 0}; // Create node and connect - NodeID mul_node = GraphBuilder::add_elementwise_node(g, params, input, mul_const_nidxp, EltwiseOperation::Mul); - NodeIdxPair mulnode_nidxp = { mul_node, 0 }; - NodeID add_node = GraphBuilder::add_elementwise_node(g, params, mulnode_nidxp, add_const_nidxp, EltwiseOperation::Add); + NodeID mul_node = GraphBuilder::add_elementwise_node(g, params, input, mul_const_nidxp, EltwiseOperation::Mul); + NodeIdxPair mulnode_nidxp = {mul_node, 0}; + NodeID add_node = + GraphBuilder::add_elementwise_node(g, params, mulnode_nidxp, add_const_nidxp, EltwiseOperation::Add); return add_node; } @@ -713,17 +808,25 @@ NodeID GraphBuilder::add_softmax_node(Graph &g, NodeParams params, NodeIdxPair i return create_simple_single_input_output_node(g, params, input, beta); } -NodeID GraphBuilder::add_slice_node(Graph &g, NodeParams params, NodeIdxPair input, Coordinates &starts, Coordinates &ends) +NodeID 
+GraphBuilder::add_slice_node(Graph &g, NodeParams params, NodeIdxPair input, Coordinates &starts, Coordinates &ends) { return create_simple_single_input_output_node(g, params, input, starts, ends); } -NodeID GraphBuilder::add_split_node(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_splits, unsigned int axis) +NodeID +GraphBuilder::add_split_node(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_splits, unsigned int axis) { return create_simple_single_input_output_node(g, params, input, num_splits, axis); } -NodeID GraphBuilder::add_strided_slice_node(Graph &g, NodeParams params, NodeIdxPair input, Coordinates &starts, Coordinates &ends, BiStrides &strides, StridedSliceLayerInfo info) +NodeID GraphBuilder::add_strided_slice_node(Graph &g, + NodeParams params, + NodeIdxPair input, + Coordinates &starts, + Coordinates &ends, + BiStrides &strides, + StridedSliceLayerInfo info) { return create_simple_single_input_output_node(g, params, input, starts, ends, strides, info); } @@ -770,7 +873,8 @@ NodeID GraphBuilder::add_yolo_node(Graph &g, NodeParams params, NodeIdxPair inpu g.add_connection(input.node_id, input.index, cls, 0); g.add_connection(cls, 0, cls_act, 0); - NodeID concat = g.add_node(3, descriptors::ConcatLayerDescriptor(DataLayoutDimension::CHANNEL)); + NodeID concat = + g.add_node(3, descriptors::ConcatLayerDescriptor(DataLayoutDimension::CHANNEL)); set_node_params(g, concat, params); g.add_connection(act_box, 0, concat, 0); g.add_connection(imm, 0, concat, 1); diff --git a/src/graph/GraphContext.cpp b/src/graph/GraphContext.cpp index 7b74c2fe0e563a7bc91536146eedd1b726a9995b..10850aa2598cd34b50da4b36c4ee59a789f2a410 100644 --- a/src/graph/GraphContext.cpp +++ b/src/graph/GraphContext.cpp @@ -24,15 +24,14 @@ #include "arm_compute/graph/GraphContext.h" #include "arm_compute/graph.h" -#include "arm_compute/graph/Utils.h" #include "arm_compute/graph/backends/BackendRegistry.h" +#include "arm_compute/graph/Utils.h" namespace arm_compute { namespace graph { -GraphContext::GraphContext() - : _config(), _memory_managers(), _weights_managers() +GraphContext::GraphContext() : _config(), _memory_managers(), _weights_managers() { } @@ -56,7 +55,7 @@ void GraphContext::set_config(const GraphConfig &config) bool GraphContext::insert_memory_management_ctx(MemoryManagerContext &&memory_ctx) { Target target = memory_ctx.target; - if(target == Target::UNSPECIFIED || _memory_managers.find(target) != std::end(_memory_managers)) + if (target == Target::UNSPECIFIED || _memory_managers.find(target) != std::end(_memory_managers)) { return false; } @@ -79,7 +78,7 @@ bool GraphContext::insert_weights_management_ctx(WeightsManagerContext &&weights { Target target = weights_managers.target; - if(_weights_managers.find(target) != std::end(_weights_managers)) + if (_weights_managers.find(target) != std::end(_weights_managers)) { return false; } @@ -102,17 +101,17 @@ std::map &GraphContext::weights_managers() void GraphContext::finalize() { const size_t num_pools = 1; - for(auto &mm_obj : _memory_managers) + for (auto &mm_obj : _memory_managers) { ARM_COMPUTE_ERROR_ON(!mm_obj.second.allocator); // Finalize intra layer memory manager - if(mm_obj.second.intra_mm != nullptr) + if (mm_obj.second.intra_mm != nullptr) { mm_obj.second.intra_mm->populate(*mm_obj.second.allocator, num_pools); } // Finalize cross layer memory manager - if(mm_obj.second.cross_mm != nullptr) + if (mm_obj.second.cross_mm != nullptr) { mm_obj.second.cross_mm->populate(*mm_obj.second.allocator, num_pools); } 
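Reviewer orientation for the remaining graph runtime files: GraphContext::finalize() above populates the intra- and cross-layer memory managers, while GraphManager (next file) registers, finalizes and executes the workload built from a Graph. The sketch below is a simplified outline, not code from this patch; it assumes the finalize_graph(graph, ctx, pass_manager, target) signature (the target parameter is inferred, the hunk below only shows the first arguments) and the umbrella header arm_compute/graph.h.

#include "arm_compute/graph.h"

using namespace arm_compute::graph;

// Minimal driver sketch: lower a graph once, then run it until an accessor
// signals end-of-stream (see GraphManager::execute_graph() in the next hunk).
void finalize_and_run(Graph &graph, GraphContext &ctx, PassManager &pm, Target target)
{
    GraphManager manager;

    // Runs the mutating passes, configures backend functions and allocates tensors;
    // the memory managers registered with the context are populated during this
    // step (GraphContext::finalize(), reformatted above).
    manager.finalize_graph(graph, ctx, pm, target);

    // Calls input accessors, executes all tasks, then output accessors, looping
    // until an input or output accessor returns false.
    manager.execute_graph(graph);

    // Drop the registered workload once the graph is no longer needed.
    manager.invalidate_graph(graph);
}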
diff --git a/src/graph/GraphManager.cpp b/src/graph/GraphManager.cpp index 45b608c70a82970cbe3a176a15227087d0cf8512..58ae60d4cc1686b85e88d696fa3e386d78eada33 100644 --- a/src/graph/GraphManager.cpp +++ b/src/graph/GraphManager.cpp @@ -23,15 +23,15 @@ */ #include "arm_compute/graph/GraphManager.h" +#include "arm_compute/graph/algorithms/TopologicalSort.h" +#include "arm_compute/graph/detail/CrossLayerMemoryManagerHelpers.h" +#include "arm_compute/graph/detail/ExecutionHelpers.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/GraphContext.h" #include "arm_compute/graph/Logger.h" #include "arm_compute/graph/PassManager.h" #include "arm_compute/graph/TypePrinter.h" #include "arm_compute/graph/Utils.h" -#include "arm_compute/graph/algorithms/TopologicalSort.h" -#include "arm_compute/graph/detail/CrossLayerMemoryManagerHelpers.h" -#include "arm_compute/graph/detail/ExecutionHelpers.h" #include "src/common/utils/Log.h" @@ -39,8 +39,7 @@ namespace arm_compute { namespace graph { -GraphManager::GraphManager() - : _workloads() +GraphManager::GraphManager() : _workloads() { } @@ -49,7 +48,7 @@ void GraphManager::finalize_graph(Graph &graph, GraphContext &ctx, PassManager & ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Initiate graph configuration!"); // Check if graph has been registered - if(_workloads.find(graph.id()) != std::end(_workloads)) + if (_workloads.find(graph.id()) != std::end(_workloads)) { ARM_COMPUTE_ERROR("Graph is already registered!"); } @@ -62,7 +61,7 @@ void GraphManager::finalize_graph(Graph &graph, GraphContext &ctx, PassManager & // In case CLVK is selected, use the CL backend and // update config - if(target == Target::CLVK) + if (target == Target::CLVK) { forced_target = Target::CL; GraphConfig config = ctx.config(); @@ -71,7 +70,7 @@ void GraphManager::finalize_graph(Graph &graph, GraphContext &ctx, PassManager & ctx.set_config(config); } - if(!is_target_supported(target)) + if (!is_target_supported(target)) { forced_target = get_default_target(); ARM_COMPUTE_LOG_GRAPH_INFO("Switching target from " << target << " to " << forced_target << std::endl); @@ -105,7 +104,7 @@ void GraphManager::finalize_graph(Graph &graph, GraphContext &ctx, PassManager & detail::prepare_all_tasks(workload); // Setup tensor memory (Allocate all tensors or setup transition manager) - if(ctx.config().use_transition_memory_manager) + if (ctx.config().use_transition_memory_manager) { detail::configure_transition_manager(graph, ctx, workload); } @@ -130,10 +129,10 @@ void GraphManager::execute_graph(Graph &graph) auto it = _workloads.find(graph.id()); ARM_COMPUTE_ERROR_ON_MSG(it == std::end(_workloads), "Graph is not registered!"); - while(true) + while (true) { // Call input accessors - if(!detail::call_all_input_node_accessors(it->second)) + if (!detail::call_all_input_node_accessors(it->second)) { return; } @@ -142,7 +141,7 @@ void GraphManager::execute_graph(Graph &graph) detail::call_all_tasks(it->second); // Call output accessors - if(!detail::call_all_output_node_accessors(it->second)) + if (!detail::call_all_output_node_accessors(it->second)) { return; } @@ -157,4 +156,4 @@ void GraphManager::invalidate_graph(Graph &graph) _workloads.erase(it); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/INode.cpp b/src/graph/INode.cpp index e5b4adda26ffcb925aa5687c865fc132ffe64a01..83c3ef7e37f5ff542a0b3ddfe3ffa10337928be2 100644 --- a/src/graph/INode.cpp +++ b/src/graph/INode.cpp @@ -1,5 +1,5 @@ /* - * Copyright 
(c) 2018,2021 Arm Limited. + * Copyright (c) 2018,2021,2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -37,7 +37,6 @@ namespace graph INode::INode() : _graph(nullptr), _id(EmptyNodeID), _common_params({ "", Target::UNSPECIFIED}), _outputs(), _input_edges(), _output_edges(), _assigned_target(Target::UNSPECIFIED) - ,_post_op_info_list(std::list> {}) { } // clang-format on @@ -76,17 +75,17 @@ void INode::set_assigned_target(Target target) void INode::set_output_tensor(TensorID tid, size_t idx) { - if(tid != NullTensorID && (idx < _outputs.size()) && (_graph->tensor(tid) != nullptr)) + if (tid != NullTensorID && (idx < _outputs.size()) && (_graph->tensor(tid) != nullptr)) { ARM_COMPUTE_ERROR_ON(_graph == nullptr); Tensor *updated_tensor = _graph->tensor(tid); _outputs[idx] = tid; // Set tensor to all output edges of the node - for(auto &output_edge_id : _output_edges) + for (auto &output_edge_id : _output_edges) { auto output_edge = _graph->edge(output_edge_id); - if(output_edge != nullptr) + if (output_edge != nullptr) { // Unbind edge from current tensor auto current_output_tensor = output_edge->tensor(); @@ -200,15 +199,5 @@ Target INode::assigned_target() const { return _assigned_target; } - -const std::list> &INode::post_op_info_list() const -{ - return _post_op_info_list; -} - -std::list> &INode::post_op_info_list() -{ - return _post_op_info_list; -} } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/INodeVisitor.cpp b/src/graph/INodeVisitor.cpp index f067d618bd9c2cb429b14e430c9df0490d30a8e2..90b2e3327fb96af51f29688e4325912233ab5ac9 100644 --- a/src/graph/INodeVisitor.cpp +++ b/src/graph/INodeVisitor.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/graph/INodeVisitor.h" + #include "arm_compute/graph/nodes/Nodes.h" namespace arm_compute @@ -85,14 +86,6 @@ void DefaultNodeVisitor::visit(FusedConvolutionBatchNormalizationNode &n) { default_visit(n); } -void DefaultNodeVisitor::visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n) -{ - default_visit(n); -} -void DefaultNodeVisitor::visit(FusedConvolutionWithPostOpNode &n) -{ - default_visit(n); -} void DefaultNodeVisitor::visit(FusedDepthwiseConvolutionBatchNormalizationNode &n) { default_visit(n); diff --git a/src/graph/PassManager.cpp b/src/graph/PassManager.cpp index f7e214c1b45b92887fa456b49d08adb2d38152c4..9a889e1da34522109816adb0e5fa460d527fd5e8 100644 --- a/src/graph/PassManager.cpp +++ b/src/graph/PassManager.cpp @@ -29,8 +29,7 @@ namespace arm_compute { namespace graph { -PassManager::PassManager() - : _passes() +PassManager::PassManager() : _passes() { } @@ -46,7 +45,7 @@ IGraphMutator *PassManager::pass(size_t index) void PassManager::append(std::unique_ptr pass, bool conditional) { - if(pass && conditional) + if (pass && conditional) { ARM_COMPUTE_LOG_GRAPH_VERBOSE("Appending mutating pass : " << pass->name() << std::endl); _passes.push_back(std::move(pass)); @@ -60,9 +59,9 @@ void PassManager::clear() void PassManager::run_all(Graph &g) { - for(auto &pass : _passes) + for (auto &pass : _passes) { - if(pass) + if (pass) { ARM_COMPUTE_LOG_GRAPH_INFO("Running mutating pass : " << pass->name() << std::endl); pass->mutate(g); @@ -72,9 +71,9 @@ void PassManager::run_all(Graph &g) void PassManager::run_type(Graph &g, IGraphMutator::MutationType type) { - for(auto &pass : _passes) + for (auto &pass : _passes) { - if(pass && (pass->type() == type)) + if (pass && (pass->type() == type)) { ARM_COMPUTE_LOG_GRAPH_INFO("Running mutating pass : " << pass->name() << std::endl); pass->mutate(g); @@ -84,17 +83,17 @@ void PassManager::run_type(Graph &g, IGraphMutator::MutationType type) void PassManager::run_index(Graph &g, size_t index) { - if(index >= _passes.size()) + if (index >= _passes.size()) { return; } auto &pass = _passes.at(index); - if(pass != nullptr) + if (pass != nullptr) { ARM_COMPUTE_LOG_GRAPH_INFO("Running mutating pass : " << pass->name() << std::endl); pass->mutate(g); } } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/Tensor.cpp b/src/graph/Tensor.cpp index 3d4723430f832998fd10a95f7afff1c0485e57f9..72679c4ea496ff29c73d527efeb233f823947daa 100644 --- a/src/graph/Tensor.cpp +++ b/src/graph/Tensor.cpp @@ -75,20 +75,20 @@ std::unique_ptr Tensor::extract_accessor() bool Tensor::call_accessor() { // Early exit guard - if(!_accessor || !_handle) + if (!_accessor || !_handle) { return false; } const bool access_data = _accessor->access_tensor_data(); - if(access_data) + if (access_data) { // Map tensor _handle->map(true); // Return in case of null backend buffer - if(_handle->tensor().buffer() == nullptr) + if (_handle->tensor().buffer() == nullptr) { return false; } @@ -97,7 +97,7 @@ bool Tensor::call_accessor() // Call accessor bool retval = _accessor->access_tensor(_handle->tensor()); - if(access_data) + if (access_data) { // Unmap tensor _handle->unmap(); diff --git a/src/graph/TypeLoader.cpp b/src/graph/TypeLoader.cpp index 3c51289dbac8e60d219e0eaf87285929011aac55..e1248fbb6bdedad84a0096ac16ca2e931693f334 100644 --- a/src/graph/TypeLoader.cpp +++ b/src/graph/TypeLoader.cpp @@ -31,10 +31,9 @@ namespace arm_compute { arm_compute::DataLayout data_layout_from_name(const 
std::string &name) { - static const std::map data_layouts = - { - { "nhwc", DataLayout::NHWC }, - { "nchw", DataLayout::NCHW }, + static const std::map data_layouts = { + {"nhwc", DataLayout::NHWC}, + {"nchw", DataLayout::NCHW}, }; #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED @@ -45,7 +44,7 @@ arm_compute::DataLayout data_layout_from_name(const std::string &name) #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED } - catch(const std::out_of_range &) + catch (const std::out_of_range &) { throw std::invalid_argument(name); } @@ -55,11 +54,10 @@ namespace graph { Target target_from_name(const std::string &name) { - static const std::map targets = - { - { "neon", Target::NEON }, - { "cl", Target::CL }, - { "clvk", Target::CLVK }, + static const std::map targets = { + {"neon", Target::NEON}, + {"cl", Target::CL}, + {"clvk", Target::CLVK}, }; #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED @@ -70,7 +68,7 @@ Target target_from_name(const std::string &name) #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED } - catch(const std::out_of_range &) + catch (const std::out_of_range &) { throw std::invalid_argument(name); } @@ -79,12 +77,11 @@ Target target_from_name(const std::string &name) ConvolutionMethod Convolution_method_from_name(const std::string &name) { - static const std::map methods = - { - { "default", ConvolutionMethod::Default }, - { "direct", ConvolutionMethod::Direct }, - { "gemm", ConvolutionMethod::GEMM }, - { "winograd", ConvolutionMethod::Winograd }, + static const std::map methods = { + {"default", ConvolutionMethod::Default}, + {"direct", ConvolutionMethod::Direct}, + {"gemm", ConvolutionMethod::GEMM}, + {"winograd", ConvolutionMethod::Winograd}, }; #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED @@ -95,7 +92,7 @@ ConvolutionMethod Convolution_method_from_name(const std::string &name) #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED } - catch(const std::out_of_range &) + catch (const std::out_of_range &) { throw std::invalid_argument(name); } @@ -104,10 +101,9 @@ ConvolutionMethod Convolution_method_from_name(const std::string &name) DepthwiseConvolutionMethod depthwise_convolution_method_from_name(const std::string &name) { - static const std::map methods = - { - { "default", DepthwiseConvolutionMethod::Default }, - { "optimized3x3", DepthwiseConvolutionMethod::Optimized3x3 }, + static const std::map methods = { + {"default", DepthwiseConvolutionMethod::Default}, + {"optimized3x3", DepthwiseConvolutionMethod::Optimized3x3}, }; #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED @@ -118,7 +114,7 @@ DepthwiseConvolutionMethod depthwise_convolution_method_from_name(const std::str #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED } - catch(const std::out_of_range &) + catch (const std::out_of_range &) { throw std::invalid_argument(name); } diff --git a/src/graph/Utils.cpp b/src/graph/Utils.cpp index dcab177a3bb05fd4dc8cffed92e1247fcaed5ea5..452d8ec7b2b064c48259f4e89d24ec6512abe86b 100644 --- a/src/graph/Utils.cpp +++ b/src/graph/Utils.cpp @@ -23,8 +23,8 @@ */ #include "arm_compute/graph/Utils.h" -#include "arm_compute/graph/GraphContext.h" #include "arm_compute/graph/backends/BackendRegistry.h" +#include "arm_compute/graph/GraphContext.h" #include "arm_compute/graph/mutators/GraphMutators.h" namespace arm_compute @@ -33,16 +33,17 @@ namespace graph { bool is_target_supported(Target target) { - return backends::BackendRegistry::get().contains(target) && backends::BackendRegistry::get().find_backend(target)->is_backend_supported(); + return backends::BackendRegistry::get().contains(target) && + 
backends::BackendRegistry::get().find_backend(target)->is_backend_supported(); } Target get_default_target() { - if(is_target_supported(Target::NEON)) + if (is_target_supported(Target::NEON)) { return Target::NEON; } - if(is_target_supported(Target::CL)) + if (is_target_supported(Target::CL)) { return Target::CL; } @@ -52,18 +53,18 @@ Target get_default_target() void force_target_to_graph(Graph &g, Target target) { auto &nodes = g.nodes(); - for(auto &node : nodes) + for (auto &node : nodes) { - if(node) + if (node) { node->set_assigned_target(target); } } auto &tensors = g.tensors(); - for(auto &tensor : tensors) + for (auto &tensor : tensors) { - if(tensor) + if (tensor) { tensor->desc().target = target; } @@ -76,9 +77,9 @@ PassManager create_default_pass_manager(Target target, const GraphConfig &cfg) PassManager pm; // Passes that mutate graph IR - if(cfg.use_synthetic_type) + if (cfg.use_synthetic_type) { - switch(cfg.synthetic_type) + switch (cfg.synthetic_type) { case DataType::QASYMM8: case DataType::QASYMM8_SIGNED: @@ -107,9 +108,9 @@ PassManager create_default_pass_manager(Target target, const GraphConfig &cfg) void release_default_graph_context(GraphContext &ctx) { - for(const auto &backend : backends::BackendRegistry::get().backends()) + for (const auto &backend : backends::BackendRegistry::get().backends()) { - if(backend.second->is_backend_supported()) + if (backend.second->is_backend_supported()) { backend.second->release_backend_context(ctx); } @@ -118,9 +119,9 @@ void release_default_graph_context(GraphContext &ctx) void sync_backends() { - for(const auto &backend : backends::BackendRegistry::get().backends()) + for (const auto &backend : backends::BackendRegistry::get().backends()) { - if(backend.second->backend_allocator()) + if (backend.second->backend_allocator()) { backend.second->sync(); } @@ -129,10 +130,10 @@ void sync_backends() void setup_requested_backend_context(GraphContext &ctx, Target target) { - if(backends::BackendRegistry::get().contains(target)) + if (backends::BackendRegistry::get().contains(target)) { const auto &backend = backends::BackendRegistry::get().find_backend(target); - if(backend->is_backend_supported()) + if (backend->is_backend_supported()) { backend->setup_backend_context(ctx); } @@ -141,20 +142,22 @@ void setup_requested_backend_context(GraphContext &ctx, Target target) size_t get_dimension_size(const TensorDescriptor &descriptor, const DataLayoutDimension data_layout_dimension) { - ARM_COMPUTE_ERROR_ON_MSG(descriptor.layout == DataLayout::UNKNOWN, "Cannot retrieve the dimension index for an unknown layout!"); + ARM_COMPUTE_ERROR_ON_MSG(descriptor.layout == DataLayout::UNKNOWN, + "Cannot retrieve the dimension index for an unknown layout!"); return descriptor.shape[get_dimension_idx(descriptor.layout, data_layout_dimension)]; } size_t get_dimension_idx(DataLayout data_layout, const DataLayoutDimension data_layout_dimension) { - ARM_COMPUTE_ERROR_ON_MSG(data_layout == DataLayout::UNKNOWN, "Cannot retrieve the dimension index for an unknown layout!"); + ARM_COMPUTE_ERROR_ON_MSG(data_layout == DataLayout::UNKNOWN, + "Cannot retrieve the dimension index for an unknown layout!"); /* Return the index based on the data layout * [N C H W] * [3 2 1 0] * [N H W C] */ - switch(data_layout_dimension) + switch (data_layout_dimension) { case DataLayoutDimension::CHANNEL: return (data_layout == DataLayout::NCHW) ? 
2 : 0; @@ -181,13 +184,13 @@ std::vector get_driving_nodes(const INode &node) const Graph *g = node.graph(); ARM_COMPUTE_ERROR_ON(g == nullptr); - for(auto &output_edge_id : node.output_edges()) + for (auto &output_edge_id : node.output_edges()) { auto output_edge = g->edge(output_edge_id); - if(output_edge != nullptr) + if (output_edge != nullptr) { ARM_COMPUTE_ERROR_ON(output_edge->consumer() == nullptr); - driving_nodes.push_back({ output_edge->consumer_id(), output_edge->consumer_idx() }); + driving_nodes.push_back({output_edge->consumer_id(), output_edge->consumer_idx()}); } } @@ -201,13 +204,13 @@ std::vector get_driver_nodes(const INode &node) const Graph *g = node.graph(); ARM_COMPUTE_ERROR_ON(g == nullptr); - for(auto &input_edge_id : node.input_edges()) + for (auto &input_edge_id : node.input_edges()) { auto input_edge = g->edge(input_edge_id); - if(input_edge != nullptr) + if (input_edge != nullptr) { ARM_COMPUTE_ERROR_ON(input_edge->producer() == nullptr); - driver_nodes.push_back({ input_edge->producer_id(), input_edge->producer_idx() }); + driver_nodes.push_back({input_edge->producer_id(), input_edge->producer_idx()}); } } @@ -216,7 +219,7 @@ std::vector get_driver_nodes(const INode &node) void configure_tensor(Tensor *tensor) { - if(tensor != nullptr && tensor->handle() == nullptr) + if (tensor != nullptr && tensor->handle() == nullptr) { Target target = tensor->desc().target; backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(target); diff --git a/src/graph/Workload.cpp b/src/graph/Workload.cpp index b9d57295b007406dc05c1a26d28a02da5f834e1e..9dddad7cbd14d362578ce4e390905d02180c3254 100644 --- a/src/graph/Workload.cpp +++ b/src/graph/Workload.cpp @@ -40,12 +40,12 @@ void ExecutionTask::operator()() void execute_task(ExecutionTask &task) { - if(task.task) + if (task.task) { task.task->run(); } #ifdef ARM_COMPUTE_ASSERTS_ENABLED - else if(task.node->type() == NodeType::PrintLayer) + else if (task.node->type() == NodeType::PrintLayer) { auto print_node = utils::cast::polymorphic_downcast(task.node); auto input_handle = print_node->input(0)->handle(); @@ -61,14 +61,13 @@ void execute_task(ExecutionTask &task) void ExecutionTask::prepare() { - if(task) + if (task) { task->prepare(); } } -TaskExecutor::TaskExecutor() - : execute_function(execute_task) +TaskExecutor::TaskExecutor() : execute_function(execute_task) { } @@ -78,4 +77,4 @@ TaskExecutor &TaskExecutor::get() return executor; } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/algorithms/TopologicalSort.cpp b/src/graph/algorithms/TopologicalSort.cpp index 3a693524710acf40045a19b31c6b00079f662902..08e14e1657107a83dff260f8010116cfa89dbde8 100644 --- a/src/graph/algorithms/TopologicalSort.cpp +++ b/src/graph/algorithms/TopologicalSort.cpp @@ -50,14 +50,14 @@ inline bool all_inputs_are_visited(const INode *node, const std::vector &v ARM_COMPUTE_ERROR_ON(graph == nullptr); bool are_all_visited = true; - for(const auto &input_edge_id : node->input_edges()) + for (const auto &input_edge_id : node->input_edges()) { - if(input_edge_id != EmptyNodeID) + if (input_edge_id != EmptyNodeID) { const Edge *input_edge = graph->edge(input_edge_id); ARM_COMPUTE_ERROR_ON(input_edge == nullptr); ARM_COMPUTE_ERROR_ON(input_edge->producer() == nullptr); - if(!visited[input_edge->producer_id()]) + if (!visited[input_edge->producer_id()]) { are_all_visited = false; break; @@ -80,9 +80,9 @@ std::vector bfs(Graph &g) std::list queue; // Push 
inputs and mark as visited - for(auto &input : g.nodes(NodeType::Input)) + for (auto &input : g.nodes(NodeType::Input)) { - if(input != EmptyNodeID) + if (input != EmptyNodeID) { visited[input] = true; queue.push_back(input); @@ -90,9 +90,9 @@ std::vector bfs(Graph &g) } // Push const nodes and mark as visited - for(auto &const_node : g.nodes(NodeType::Const)) + for (auto &const_node : g.nodes(NodeType::Const)) { - if(const_node != EmptyNodeID) + if (const_node != EmptyNodeID) { visited[const_node] = true; queue.push_back(const_node); @@ -100,7 +100,7 @@ std::vector bfs(Graph &g) } // Iterate over vector and edges - while(!queue.empty()) + while (!queue.empty()) { // Dequeue a node from queue and process NodeID n = queue.front(); @@ -109,11 +109,11 @@ std::vector bfs(Graph &g) const INode *node = g.node(n); ARM_COMPUTE_ERROR_ON(node == nullptr); - for(const auto &eid : node->output_edges()) + for (const auto &eid : node->output_edges()) { const Edge *e = g.edge(eid); ARM_COMPUTE_ERROR_ON(e == nullptr); - if(!visited[e->consumer_id()] && detail::all_inputs_are_visited(e->consumer(), visited)) + if (!visited[e->consumer_id()] && detail::all_inputs_are_visited(e->consumer(), visited)) { visited[e->consumer_id()] = true; queue.push_back(e->consumer_id()); @@ -135,9 +135,9 @@ std::vector dfs(Graph &g) std::stack stack; // Push inputs and mark as visited - for(auto &input : g.nodes(NodeType::Input)) + for (auto &input : g.nodes(NodeType::Input)) { - if(input != EmptyNodeID) + if (input != EmptyNodeID) { visited[input] = true; stack.push(input); @@ -145,9 +145,9 @@ std::vector dfs(Graph &g) } // Push const nodes and mark as visited - for(auto &const_node : g.nodes(NodeType::Const)) + for (auto &const_node : g.nodes(NodeType::Const)) { - if(const_node != EmptyNodeID) + if (const_node != EmptyNodeID) { visited[const_node] = true; stack.push(const_node); @@ -155,7 +155,7 @@ std::vector dfs(Graph &g) } // Iterate over vector and edges - while(!stack.empty()) + while (!stack.empty()) { // Pop a node from stack and process NodeID n = stack.top(); @@ -163,7 +163,7 @@ std::vector dfs(Graph &g) stack.pop(); // Mark node as visited - if(!visited[n]) + if (!visited[n]) { visited[n] = true; } @@ -171,11 +171,11 @@ std::vector dfs(Graph &g) const INode *node = g.node(n); ARM_COMPUTE_ERROR_ON(node == nullptr); // Reverse iterate to push branches from right to left and pop on the opposite order - for(const auto &eid : arm_compute::utils::iterable::reverse_iterate(node->output_edges())) + for (const auto &eid : arm_compute::utils::iterable::reverse_iterate(node->output_edges())) { const Edge *e = g.edge(eid); ARM_COMPUTE_ERROR_ON(e == nullptr); - if(!visited[e->consumer_id()] && detail::all_inputs_are_visited(e->consumer(), visited)) + if (!visited[e->consumer_id()] && detail::all_inputs_are_visited(e->consumer(), visited)) { stack.push(e->consumer_id()); } diff --git a/src/graph/backends/BackendRegistry.cpp b/src/graph/backends/BackendRegistry.cpp index 46b4f99e23f37d58cd8eda7ef34f2a5b14fbefc3..bb6af79f8bfa5eb5a9092a36976fca3bdb0da0d0 100644 --- a/src/graph/backends/BackendRegistry.cpp +++ b/src/graph/backends/BackendRegistry.cpp @@ -31,8 +31,7 @@ namespace graph { namespace backends { -BackendRegistry::BackendRegistry() - : _registered_backends() +BackendRegistry::BackendRegistry() : _registered_backends() { } diff --git a/src/graph/backends/CL/CLDeviceBackend.cpp b/src/graph/backends/CL/CLDeviceBackend.cpp index 01e5ab173096c0e09cdd51219a6dbf0c91f51f8e..e27a4109d13f3d2d4c28a54be0bfbddba95dd8d1 100644 --- 
a/src/graph/backends/CL/CLDeviceBackend.cpp +++ b/src/graph/backends/CL/CLDeviceBackend.cpp @@ -23,18 +23,17 @@ */ #include "arm_compute/graph/backends/CL/CLDeviceBackend.h" -#include "arm_compute/graph/Graph.h" -#include "arm_compute/graph/GraphContext.h" -#include "arm_compute/graph/INode.h" -#include "arm_compute/graph/Logger.h" -#include "arm_compute/graph/Tensor.h" +#include "arm_compute/core/TensorInfo.h" #include "arm_compute/graph/backends/BackendRegistrar.h" #include "arm_compute/graph/backends/CL/CLFunctionFactory.h" #include "arm_compute/graph/backends/CL/CLNodeValidator.h" #include "arm_compute/graph/backends/CL/CLSubTensorHandle.h" #include "arm_compute/graph/backends/CL/CLTensorHandle.h" - -#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/graph/Graph.h" +#include "arm_compute/graph/GraphContext.h" +#include "arm_compute/graph/INode.h" +#include "arm_compute/graph/Logger.h" +#include "arm_compute/graph/Tensor.h" #include "arm_compute/runtime/BlobLifetimeManager.h" #include "arm_compute/runtime/CL/CLBufferAllocator.h" #include "arm_compute/runtime/CL/CLScheduler.h" @@ -64,7 +63,12 @@ bool file_exists(const std::string &filename) static detail::BackendRegistrar CLDeviceBackend_registrar(Target::CL); CLDeviceBackend::CLDeviceBackend() - : _context_count(0), _tuner(), _gemm_heuristics(), _allocator(nullptr), _tuner_file(), _backend_type(CLBackendType::Native) + : _context_count(0), + _tuner(), + _gemm_heuristics(), + _allocator(nullptr), + _tuner_file(), + _backend_type(CLBackendType::Native) { } @@ -95,7 +99,7 @@ void CLDeviceBackend::release_backend_context(GraphContext &ctx) { ARM_COMPUTE_UNUSED(ctx); _context_count--; - if(_context_count == 0) // No more context using the backend: free resources + if (_context_count == 0) // No more context using the backend: free resources { _allocator = nullptr; } @@ -105,7 +109,7 @@ void CLDeviceBackend::setup_backend_context(GraphContext &ctx) { // Force backend initialization _context_count++; - if(_context_count == 1) + if (_context_count == 1) { _backend_type = ctx.config().backend_type; initialize_backend(); @@ -115,7 +119,7 @@ void CLDeviceBackend::setup_backend_context(GraphContext &ctx) _tuner_file = ctx.config().tuner_file; // Load tuner data if available - if(file_exists(_tuner_file)) + if (file_exists(_tuner_file)) { _tuner.load_from_file(_tuner_file); } @@ -128,7 +132,7 @@ void CLDeviceBackend::setup_backend_context(GraphContext &ctx) CLScheduler::get().gemm_heuristics()->reload_from_file(ctx.config().mlgo_file); // Setup a management backend - if(ctx.memory_management_ctx(Target::CL) == nullptr) + if (ctx.memory_management_ctx(Target::CL) == nullptr) { MemoryManagerContext mm_ctx; mm_ctx.target = Target::CL; @@ -141,7 +145,7 @@ void CLDeviceBackend::setup_backend_context(GraphContext &ctx) } // Create function level weights manager - if(ctx.weights_management_ctx(Target::CL) == nullptr) + if (ctx.weights_management_ctx(Target::CL) == nullptr) { WeightsManagerContext wm_ctx; wm_ctx.target = Target::CL; @@ -174,9 +178,10 @@ std::unique_ptr CLDeviceBackend::create_tensor(const Tensor &tens return std::make_unique(info); } -std::unique_ptr CLDeviceBackend::create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent) +std::unique_ptr +CLDeviceBackend::create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent) { - if(parent == nullptr) + if (parent == nullptr) { return nullptr; } @@ -203,7 +208,7 @@ arm_compute::Status 
CLDeviceBackend::validate_node(INode &node) std::shared_ptr CLDeviceBackend::create_memory_manager(MemoryManagerAffinity affinity) { - if(affinity == MemoryManagerAffinity::Offset) + if (affinity == MemoryManagerAffinity::Offset) { ARM_COMPUTE_LOG_GRAPH_WARNING("CL Backend does not support offset affinity memory management!"); return nullptr; diff --git a/src/graph/backends/CL/CLFunctionsFactory.cpp b/src/graph/backends/CL/CLFunctionsFactory.cpp index c67f6a538b5883d2c05c8ed8e307e2e38596cfa1..d4e1aa880f450326d0a326e8e80d0c57a72140d5 100644 --- a/src/graph/backends/CL/CLFunctionsFactory.cpp +++ b/src/graph/backends/CL/CLFunctionsFactory.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,12 +22,12 @@ * SOFTWARE. */ #include "arm_compute/graph/backends/CL/CLFunctionFactory.h" - +#include "arm_compute/graph/backends/FunctionHelpers.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/GraphContext.h" -#include "arm_compute/graph/backends/FunctionHelpers.h" #include "arm_compute/runtime/CL/CLFunctions.h" #include "arm_compute/runtime/CPP/CPPFunctions.h" + #include "src/core/CL/CLKernels.h" #include "support/Cast.h" @@ -89,20 +89,19 @@ class CPPWrapperFunction : public IFunction { public: /* Default constructor */ - CPPWrapperFunction() - : _tensors(), _func(nullptr) + CPPWrapperFunction() : _tensors(), _func(nullptr) { } void run() override { - for(auto &tensor : _tensors) + for (auto &tensor : _tensors) { tensor->map(CLScheduler::get().queue()); } _func->run(); - for(auto &tensor : _tensors) + for (auto &tensor : _tensors) { tensor->unmap(CLScheduler::get().queue()); } @@ -127,7 +126,8 @@ namespace detail { // Specialized functions template <> -std::unique_ptr create_detection_output_layer(DetectionOutputLayerNode &node) +std::unique_ptr +create_detection_output_layer(DetectionOutputLayerNode &node) { validate_node(node, 3 /* expected inputs */, 1 /* expected outputs */); @@ -149,16 +149,12 @@ std::unique_ptr create_detection_output_layerinfo()->data_type() - << " Input0 shape: " << input0->info()->tensor_shape() - << " Input1 shape: " << input1->info()->tensor_shape() + << node.name() << " Type: " << node.type() << " Target: " << CLTargetInfo::TargetType + << " Data Type: " << input0->info()->data_type() << " Input0 shape: " + << input0->info()->tensor_shape() << " Input1 shape: " << input1->info()->tensor_shape() << " Input2 shape: " << input2->info()->tensor_shape() << " Output shape: " << output->info()->tensor_shape() - << " DetectionOutputLayer info: " << detect_info - << std::endl); + << " DetectionOutputLayer info: " << detect_info << std::endl); auto wrap_function = std::make_unique(); @@ -171,7 +167,8 @@ std::unique_ptr create_detection_output_layer -std::unique_ptr create_detection_post_process_layer(DetectionPostProcessLayerNode &node) +std::unique_ptr +create_detection_post_process_layer(DetectionPostProcessLayerNode &node) { validate_node(node, 3 /* expected inputs */, 4 /* expected outputs */); @@ -199,19 +196,15 @@ std::unique_ptr create_detection_post_process_layerinfo()->data_type() - << " Input0 shape: " << input0->info()->tensor_shape() - << " Input1 shape: " << input1->info()->tensor_shape() + << node.name() << " Type: " << node.type() << " Target: " << CLTargetInfo::TargetType + << " Data Type: " << input0->info()->data_type() << " Input0 shape: " + << input0->info()->tensor_shape() << " Input1 shape: " << input1->info()->tensor_shape() << " Input2 shape: " 
<< input2->info()->tensor_shape() << " Output0 shape: " << output0->info()->tensor_shape() << " Output1 shape: " << output1->info()->tensor_shape() << " Output2 shape: " << output2->info()->tensor_shape() << " Output3 shape: " << output3->info()->tensor_shape() - << " DetectionPostProcessLayer info: " << detect_info - << std::endl); + << " DetectionPostProcessLayer info: " << detect_info << std::endl); auto wrap_function = std::make_unique(); @@ -230,96 +223,128 @@ std::unique_ptr create_detection_post_process_layer CLFunctionFactory::create(INode *node, GraphContext &ctx) { - if(node == nullptr) + if (node == nullptr) { return nullptr; } NodeType type = node->type(); - switch(type) + switch (type) { case NodeType::ActivationLayer: - return detail::create_activation_layer(*polymorphic_downcast(node)); + return detail::create_activation_layer( + *polymorphic_downcast(node)); case NodeType::ArgMinMaxLayer: - return detail::create_arg_min_max_layer(*polymorphic_downcast(node)); + return detail::create_arg_min_max_layer( + *polymorphic_downcast(node)); case NodeType::BatchNormalizationLayer: - return detail::create_batch_normalization_layer(*polymorphic_downcast(node)); + return detail::create_batch_normalization_layer( + *polymorphic_downcast(node)); case NodeType::BoundingBoxTransformLayer: - return detail::create_bounding_box_transform_layer(*polymorphic_downcast(node)); + return detail::create_bounding_box_transform_layer( + *polymorphic_downcast(node)); case NodeType::ChannelShuffleLayer: - return detail::create_channel_shuffle_layer(*polymorphic_downcast(node)); + return detail::create_channel_shuffle_layer( + *polymorphic_downcast(node)); case NodeType::ConvolutionLayer: - return detail::create_convolution_layer(*polymorphic_downcast(node), ctx); + return detail::create_convolution_layer( + *polymorphic_downcast(node), ctx); case NodeType::DeconvolutionLayer: - return detail::create_deconvolution_layer(*polymorphic_downcast(node), ctx); + return detail::create_deconvolution_layer( + *polymorphic_downcast(node), ctx); case NodeType::ConcatenateLayer: - return detail::create_concatenate_layer(*polymorphic_downcast(node)); + return detail::create_concatenate_layer( + *polymorphic_downcast(node)); case NodeType::DepthToSpaceLayer: - return detail::create_depth_to_space_layer(*polymorphic_downcast(node)); + return detail::create_depth_to_space_layer( + *polymorphic_downcast(node)); case NodeType::DepthwiseConvolutionLayer: - return detail::create_depthwise_convolution_layer(*polymorphic_downcast(node)); + return detail::create_depthwise_convolution_layer( + *polymorphic_downcast(node)); case NodeType::DequantizationLayer: - return detail::create_dequantization_layer(*polymorphic_downcast(node)); + return detail::create_dequantization_layer( + *polymorphic_downcast(node)); case NodeType::DetectionOutputLayer: - return detail::create_detection_output_layer(*polymorphic_downcast(node)); + return detail::create_detection_output_layer( + *polymorphic_downcast(node)); case NodeType::DetectionPostProcessLayer: - return detail::create_detection_post_process_layer(*polymorphic_downcast(node)); + return detail::create_detection_post_process_layer( + *polymorphic_downcast(node)); case NodeType::EltwiseLayer: - return detail::create_eltwise_layer(*polymorphic_downcast(node)); + return detail::create_eltwise_layer( + *polymorphic_downcast(node)); case NodeType::UnaryEltwiseLayer: - return detail::create_unary_eltwise_layer(*polymorphic_downcast(node)); + return detail::create_unary_eltwise_layer( + 
*polymorphic_downcast(node)); case NodeType::FlattenLayer: - return detail::create_flatten_layer(*polymorphic_downcast(node)); + return detail::create_flatten_layer( + *polymorphic_downcast(node)); case NodeType::FullyConnectedLayer: - return detail::create_fully_connected_layer(*polymorphic_downcast(node), ctx); + return detail::create_fully_connected_layer( + *polymorphic_downcast(node), ctx); case NodeType::FusedConvolutionBatchNormalizationLayer: - return detail::create_fused_convolution_batch_normalization_layer(*polymorphic_downcast(node), ctx); - case NodeType::FusedConvolutionWithPostOp: - return detail::create_fused_convolution_with_post_op(*polymorphic_downcast(node), ctx); + return detail::create_fused_convolution_batch_normalization_layer( + *polymorphic_downcast(node), ctx); case NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer: - return detail::create_fused_depthwise_convolution_batch_normalization_layer(*polymorphic_downcast(node), ctx); + return detail::create_fused_depthwise_convolution_batch_normalization_layer( + *polymorphic_downcast(node), ctx); case NodeType::GenerateProposalsLayer: - return detail::create_generate_proposals_layer(*polymorphic_downcast(node), ctx); + return detail::create_generate_proposals_layer( + *polymorphic_downcast(node), ctx); case NodeType::L2NormalizeLayer: - return detail::create_l2_normalize_layer(*polymorphic_downcast(node), ctx); + return detail::create_l2_normalize_layer( + *polymorphic_downcast(node), ctx); case NodeType::NormalizationLayer: - return detail::create_normalization_layer(*polymorphic_downcast(node), ctx); + return detail::create_normalization_layer( + *polymorphic_downcast(node), ctx); case NodeType::NormalizePlanarYUVLayer: - return detail::create_normalize_planar_yuv_layer(*polymorphic_downcast(node)); + return detail::create_normalize_planar_yuv_layer( + *polymorphic_downcast(node)); case NodeType::PadLayer: return detail::create_pad_layer(*polymorphic_downcast(node)); case NodeType::PermuteLayer: - return detail::create_permute_layer(*polymorphic_downcast(node)); + return detail::create_permute_layer( + *polymorphic_downcast(node)); case NodeType::PoolingLayer: - return detail::create_pooling_layer(*polymorphic_downcast(node)); + return detail::create_pooling_layer( + *polymorphic_downcast(node)); case NodeType::PReluLayer: - return detail::create_prelu_layer(*polymorphic_downcast(node)); + return detail::create_prelu_layer( + *polymorphic_downcast(node)); case NodeType::PrintLayer: return detail::create_print_layer(*polymorphic_downcast(node)); case NodeType::PriorBoxLayer: - return detail::create_priorbox_layer(*polymorphic_downcast(node)); + return detail::create_priorbox_layer( + *polymorphic_downcast(node)); case NodeType::QuantizationLayer: - return detail::create_quantization_layer(*polymorphic_downcast(node)); + return detail::create_quantization_layer( + *polymorphic_downcast(node)); case NodeType::ReductionOperationLayer: - return detail::create_reduction_operation_layer(*polymorphic_downcast(node), ctx); + return detail::create_reduction_operation_layer( + *polymorphic_downcast(node), ctx); case NodeType::ReorgLayer: - return detail::create_reorg_layer(*polymorphic_downcast(node)); + return detail::create_reorg_layer( + *polymorphic_downcast(node)); case NodeType::ReshapeLayer: - return detail::create_reshape_layer(*polymorphic_downcast(node)); + return detail::create_reshape_layer( + *polymorphic_downcast(node)); case NodeType::ResizeLayer: return 
detail::create_resize_layer(*polymorphic_downcast(node)); case NodeType::ROIAlignLayer: - return detail::create_roi_align_layer(*polymorphic_downcast(node)); + return detail::create_roi_align_layer( + *polymorphic_downcast(node)); case NodeType::SliceLayer: return detail::create_slice_layer(*polymorphic_downcast(node)); case NodeType::SoftmaxLayer: - return detail::create_softmax_layer(*polymorphic_downcast(node), ctx); + return detail::create_softmax_layer( + *polymorphic_downcast(node), ctx); case NodeType::StackLayer: - return detail::create_stack_layer(*polymorphic_downcast(node)); + return detail::create_stack_layer( + *polymorphic_downcast(node)); case NodeType::StridedSliceLayer: - return detail::create_strided_slice_layer(*polymorphic_downcast(node)); - case NodeType::FusedConvolutionBatchNormalizationLayerWithPostOpsLayer: - return detail::create_fused_convolution_batch_normalization_with_post_op(*polymorphic_downcast(node), ctx); + return detail::create_strided_slice_layer( + *polymorphic_downcast(node)); default: return nullptr; } diff --git a/src/graph/backends/CL/CLNodeValidator.cpp b/src/graph/backends/CL/CLNodeValidator.cpp index c50782db48bed4decd0b8a825c05474cc9ebb007..510eda7935854a82444bb82f7763ac0cffd9765b 100644 --- a/src/graph/backends/CL/CLNodeValidator.cpp +++ b/src/graph/backends/CL/CLNodeValidator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,7 +25,6 @@ #include "arm_compute/graph/backends/ValidateHelpers.h" #include "arm_compute/graph/nodes/Nodes.h" - #include "arm_compute/runtime/CL/CLFunctions.h" #include "arm_compute/runtime/CPP/CPPFunctions.h" @@ -57,43 +56,51 @@ struct CLUnaryEltwiseLayerFunctions Status CLNodeValidator::validate(INode *node) { - if(node == nullptr) + if (node == nullptr) { return Status{}; } NodeType type = node->type(); - switch(type) + switch (type) { case NodeType::ArgMinMaxLayer: - return detail::validate_arg_min_max_layer(*polymorphic_downcast(node)); + return detail::validate_arg_min_max_layer( + *polymorphic_downcast(node)); case NodeType::BoundingBoxTransformLayer: - return detail::validate_bounding_box_transform_layer(*polymorphic_downcast(node)); + return detail::validate_bounding_box_transform_layer( + *polymorphic_downcast(node)); case NodeType::ChannelShuffleLayer: - return detail::validate_channel_shuffle_layer(*polymorphic_downcast(node)); + return detail::validate_channel_shuffle_layer( + *polymorphic_downcast(node)); case NodeType::ConvolutionLayer: - return detail::validate_convolution_layer(*polymorphic_downcast(node)); - case NodeType::FusedConvolutionWithPostOp: - return detail::validate_fused_convolution_with_post_op(*polymorphic_downcast(node)); + return detail::validate_convolution_layer( + *polymorphic_downcast(node)); case NodeType::DepthToSpaceLayer: - return detail::validate_depth_to_space_layer(*polymorphic_downcast(node)); + return detail::validate_depth_to_space_layer( + *polymorphic_downcast(node)); case NodeType::DepthwiseConvolutionLayer: - return detail::validate_depthwise_convolution_layer(*polymorphic_downcast(node)); + return detail::validate_depthwise_convolution_layer( + *polymorphic_downcast(node)); case NodeType::DequantizationLayer: - return detail::validate_dequantization_layer(*polymorphic_downcast(node)); + return detail::validate_dequantization_layer( + *polymorphic_downcast(node)); case NodeType::DetectionOutputLayer: - return 
detail::validate_detection_output_layer(*polymorphic_downcast(node)); + return detail::validate_detection_output_layer( + *polymorphic_downcast(node)); case NodeType::DetectionPostProcessLayer: - return detail::validate_detection_post_process_layer(*polymorphic_downcast(node)); + return detail::validate_detection_post_process_layer( + *polymorphic_downcast(node)); case NodeType::GenerateProposalsLayer: - return detail::validate_generate_proposals_layer(*polymorphic_downcast(node)); + return detail::validate_generate_proposals_layer( + *polymorphic_downcast(node)); case NodeType::L2NormalizeLayer: - return detail::validate_l2_normalize_layer(*polymorphic_downcast(node)); + return detail::validate_l2_normalize_layer( + *polymorphic_downcast(node)); case NodeType::NormalizePlanarYUVLayer: - return detail::validate_normalize_planar_yuv_layer(*polymorphic_downcast(node)); + return detail::validate_normalize_planar_yuv_layer( + *polymorphic_downcast(node)); case NodeType::PadLayer: return detail::validate_pad_layer(*polymorphic_downcast(node)); case NodeType::PermuteLayer: @@ -103,9 +110,11 @@ Status CLNodeValidator::validate(INode *node) case NodeType::PriorBoxLayer: return detail::validate_priorbox_layer(*polymorphic_downcast(node)); case NodeType::QuantizationLayer: - return detail::validate_quantization_layer(*polymorphic_downcast(node)); + return detail::validate_quantization_layer( + *polymorphic_downcast(node)); case NodeType::ReductionOperationLayer: - return detail::validate_reduction_operation_layer(*polymorphic_downcast(node)); + return detail::validate_reduction_operation_layer( + *polymorphic_downcast(node)); case NodeType::ReorgLayer: return detail::validate_reorg_layer(*polymorphic_downcast(node)); case NodeType::ReshapeLayer: @@ -115,11 +124,14 @@ Status CLNodeValidator::validate(INode *node) case NodeType::SliceLayer: return detail::validate_slice_layer(*polymorphic_downcast(node)); case NodeType::StridedSliceLayer: - return detail::validate_strided_slice_layer(*polymorphic_downcast(node)); + return detail::validate_strided_slice_layer( + *polymorphic_downcast(node)); case NodeType::EltwiseLayer: - return detail::validate_eltwise_Layer(*polymorphic_downcast(node)); + return detail::validate_eltwise_Layer( + *polymorphic_downcast(node)); case NodeType::UnaryEltwiseLayer: - return detail::validate_unary_eltwise_layer(*polymorphic_downcast(node)); + return detail::validate_unary_eltwise_layer( + *polymorphic_downcast(node)); default: return Status{}; } diff --git a/src/graph/backends/CL/CLSubTensorHandle.cpp b/src/graph/backends/CL/CLSubTensorHandle.cpp index b97d25890aba99db429d8242e8a08c38c4e7029b..ccdc877a1895aa597524ce557a778a32d6022a05 100644 --- a/src/graph/backends/CL/CLSubTensorHandle.cpp +++ b/src/graph/backends/CL/CLSubTensorHandle.cpp @@ -31,7 +31,10 @@ namespace graph { namespace backends { -CLSubTensorHandle::CLSubTensorHandle(ITensorHandle *parent_handle, const TensorShape &shape, const Coordinates &coords, bool extend_parent) +CLSubTensorHandle::CLSubTensorHandle(ITensorHandle *parent_handle, + const TensorShape &shape, + const Coordinates &coords, + bool extend_parent) : _sub_tensor(), _parent_handle(nullptr) { ARM_COMPUTE_ERROR_ON(!parent_handle); @@ -98,4 +101,4 @@ Target CLSubTensorHandle::target() const } } // namespace backends } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/backends/CL/CLTensorHandle.cpp b/src/graph/backends/CL/CLTensorHandle.cpp index 
a496c2ce477df28f1671ff89cd631bb894cda6b7..1b69f9dede5411cb013a0854a0440f804d9d5824 100644 --- a/src/graph/backends/CL/CLTensorHandle.cpp +++ b/src/graph/backends/CL/CLTensorHandle.cpp @@ -31,8 +31,7 @@ namespace graph { namespace backends { -CLTensorHandle::CLTensorHandle(const ITensorInfo &info) - : _tensor() +CLTensorHandle::CLTensorHandle(const ITensorInfo &info) : _tensor() { _tensor.allocator()->init(info); } @@ -49,7 +48,7 @@ void CLTensorHandle::free() void CLTensorHandle::manage(IMemoryGroup *mg) { - if(mg != nullptr) + if (mg != nullptr) { mg->manage(&_tensor); } @@ -68,7 +67,7 @@ void CLTensorHandle::unmap() void CLTensorHandle::release_if_unused() { // TODO (geopin01): Release tensor only if all sub-tensors are marked as not used - if(!_tensor.is_used()) + if (!_tensor.is_used()) { _tensor.allocator()->free(); } @@ -100,4 +99,4 @@ Target CLTensorHandle::target() const } } // namespace backends } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/backends/NEON/NEDeviceBackend.cpp b/src/graph/backends/NEON/NEDeviceBackend.cpp index 18456538da4de49a6c0224c43dda5dd297e0c27c..fc7b3098030121166795daf228b30ae305494da0 100644 --- a/src/graph/backends/NEON/NEDeviceBackend.cpp +++ b/src/graph/backends/NEON/NEDeviceBackend.cpp @@ -23,18 +23,17 @@ */ #include "arm_compute/graph/backends/NEON/NEDeviceBackend.h" -#include "arm_compute/graph/Graph.h" -#include "arm_compute/graph/GraphContext.h" -#include "arm_compute/graph/INode.h" -#include "arm_compute/graph/Logger.h" -#include "arm_compute/graph/Tensor.h" +#include "arm_compute/core/TensorInfo.h" #include "arm_compute/graph/backends/BackendRegistrar.h" #include "arm_compute/graph/backends/NEON/NEFunctionFactory.h" #include "arm_compute/graph/backends/NEON/NENodeValidator.h" #include "arm_compute/graph/backends/NEON/NESubTensorHandle.h" #include "arm_compute/graph/backends/NEON/NETensorHandle.h" - -#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/graph/Graph.h" +#include "arm_compute/graph/GraphContext.h" +#include "arm_compute/graph/INode.h" +#include "arm_compute/graph/Logger.h" +#include "arm_compute/graph/Tensor.h" #include "arm_compute/runtime/Allocator.h" #include "arm_compute/runtime/BlobLifetimeManager.h" #include "arm_compute/runtime/IWeightsManager.h" @@ -53,8 +52,7 @@ namespace backends /** Register CPU backend */ static detail::BackendRegistrar NEDeviceBackend_registrar(Target::NEON); -NEDeviceBackend::NEDeviceBackend() - : _allocator() +NEDeviceBackend::NEDeviceBackend() : _allocator() { } @@ -72,13 +70,13 @@ void NEDeviceBackend::release_backend_context(GraphContext &ctx) void NEDeviceBackend::setup_backend_context(GraphContext &ctx) { // Set number of threads - if(ctx.config().num_threads >= 0) + if (ctx.config().num_threads >= 0) { Scheduler::get().set_num_threads(ctx.config().num_threads); } // Create function level memory manager - if(ctx.memory_management_ctx(Target::NEON) == nullptr) + if (ctx.memory_management_ctx(Target::NEON) == nullptr) { MemoryManagerContext mm_ctx; mm_ctx.target = Target::NEON; @@ -91,7 +89,7 @@ void NEDeviceBackend::setup_backend_context(GraphContext &ctx) } // Create function level weights manager - if(ctx.weights_management_ctx(Target::NEON) == nullptr) + if (ctx.weights_management_ctx(Target::NEON) == nullptr) { WeightsManagerContext wm_ctx; wm_ctx.target = Target::NEON; @@ -124,9 +122,10 @@ std::unique_ptr NEDeviceBackend::create_tensor(const Tensor &tens return std::make_unique(info); } -std::unique_ptr 
NEDeviceBackend::create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent) +std::unique_ptr +NEDeviceBackend::create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent) { - if(parent == nullptr) + if (parent == nullptr) { return nullptr; } @@ -154,7 +153,7 @@ arm_compute::Status NEDeviceBackend::validate_node(INode &node) std::shared_ptr NEDeviceBackend::create_memory_manager(MemoryManagerAffinity affinity) { std::shared_ptr lifetime_mgr = nullptr; - if(affinity == MemoryManagerAffinity::Buffer) + if (affinity == MemoryManagerAffinity::Buffer) { lifetime_mgr = std::make_shared(); } diff --git a/src/graph/backends/NEON/NEFunctionFactory.cpp b/src/graph/backends/NEON/NEFunctionFactory.cpp index 57c64475ecea1f2bb3a33222838d4f553cf11b30..fe15d4cec1ee2b47bc36c74925e920fe6bf443b3 100644 --- a/src/graph/backends/NEON/NEFunctionFactory.cpp +++ b/src/graph/backends/NEON/NEFunctionFactory.cpp @@ -23,13 +23,13 @@ */ #include "arm_compute/graph/backends/NEON/NEFunctionFactory.h" +#include "arm_compute/graph/backends/FunctionHelpers.h" +#include "arm_compute/graph/backends/Utils.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/GraphContext.h" #include "arm_compute/graph/Logger.h" -#include "arm_compute/graph/TypePrinter.h" -#include "arm_compute/graph/backends/FunctionHelpers.h" -#include "arm_compute/graph/backends/Utils.h" #include "arm_compute/graph/nodes/Nodes.h" +#include "arm_compute/graph/TypePrinter.h" #include "arm_compute/runtime/CPP/CPPFunctions.h" #include "arm_compute/runtime/NEON/NEFunctions.h" @@ -88,7 +88,8 @@ struct NEFusedLayerTypes namespace detail { template <> -std::unique_ptr create_normalization_layer(NormalizationLayerNode &node, GraphContext &ctx) +std::unique_ptr create_normalization_layer(NormalizationLayerNode &node, + GraphContext &ctx) { validate_node(node, 1 /* expected inputs */, 1 /* expected outputs */); @@ -105,99 +106,127 @@ std::unique_ptr create_normalization_layerinfo()->data_type() - << " Input shape: " << input->info()->tensor_shape() - << " Output shape: " << output->info()->tensor_shape() - << " Normalization info: " << norm_info.type() - << std::endl); + << node.name() << " Type: " << node.type() << " Target: " << NETargetInfo::TargetType + << " Data Type: " << input->info()->data_type() << " Input shape: " + << input->info()->tensor_shape() << " Output shape: " << output->info()->tensor_shape() + << " Normalization info: " << norm_info.type() << std::endl); - return std::move(func); + return func; } } // namespace detail std::unique_ptr NEFunctionFactory::create(INode *node, GraphContext &ctx) { - if(node == nullptr) + if (node == nullptr) { return nullptr; } NodeType type = node->type(); - switch(type) + switch (type) { case NodeType::ActivationLayer: - return detail::create_activation_layer(*polymorphic_downcast(node)); + return detail::create_activation_layer( + *polymorphic_downcast(node)); case NodeType::ArgMinMaxLayer: - return detail::create_arg_min_max_layer(*polymorphic_downcast(node)); + return detail::create_arg_min_max_layer( + *polymorphic_downcast(node)); case NodeType::BatchNormalizationLayer: - return detail::create_batch_normalization_layer(*polymorphic_downcast(node)); + return detail::create_batch_normalization_layer( + *polymorphic_downcast(node)); case NodeType::ChannelShuffleLayer: - return detail::create_channel_shuffle_layer(*polymorphic_downcast(node)); + return detail::create_channel_shuffle_layer( + *polymorphic_downcast(node)); case 
NodeType::ConvolutionLayer: - return detail::create_convolution_layer(*polymorphic_downcast(node), ctx); + return detail::create_convolution_layer( + *polymorphic_downcast(node), ctx); case NodeType::DepthToSpaceLayer: - return detail::create_depth_to_space_layer(*polymorphic_downcast(node)); + return detail::create_depth_to_space_layer( + *polymorphic_downcast(node)); case NodeType::DeconvolutionLayer: - return detail::create_deconvolution_layer(*polymorphic_downcast(node), ctx); + return detail::create_deconvolution_layer( + *polymorphic_downcast(node), ctx); case NodeType::ConcatenateLayer: - return detail::create_concatenate_layer(*polymorphic_downcast(node)); + return detail::create_concatenate_layer( + *polymorphic_downcast(node)); case NodeType::DepthwiseConvolutionLayer: - return detail::create_depthwise_convolution_layer(*polymorphic_downcast(node)); + return detail::create_depthwise_convolution_layer( + *polymorphic_downcast(node)); case NodeType::DequantizationLayer: - return detail::create_dequantization_layer(*polymorphic_downcast(node)); + return detail::create_dequantization_layer( + *polymorphic_downcast(node)); case NodeType::DetectionOutputLayer: - return detail::create_detection_output_layer(*polymorphic_downcast(node)); + return detail::create_detection_output_layer( + *polymorphic_downcast(node)); case NodeType::DetectionPostProcessLayer: - return detail::create_detection_post_process_layer(*polymorphic_downcast(node)); + return detail::create_detection_post_process_layer( + *polymorphic_downcast(node)); case NodeType::EltwiseLayer: - return detail::create_eltwise_layer(*polymorphic_downcast(node)); + return detail::create_eltwise_layer( + *polymorphic_downcast(node)); case NodeType::UnaryEltwiseLayer: - return detail::create_unary_eltwise_layer(*polymorphic_downcast(node)); + return detail::create_unary_eltwise_layer( + *polymorphic_downcast(node)); case NodeType::FlattenLayer: - return detail::create_flatten_layer(*polymorphic_downcast(node)); + return detail::create_flatten_layer( + *polymorphic_downcast(node)); case NodeType::FullyConnectedLayer: - return detail::create_fully_connected_layer(*polymorphic_downcast(node), ctx); + return detail::create_fully_connected_layer( + *polymorphic_downcast(node), ctx); case NodeType::FusedConvolutionBatchNormalizationLayer: - return detail::create_fused_convolution_batch_normalization_layer(*polymorphic_downcast(node), ctx); + return detail::create_fused_convolution_batch_normalization_layer( + *polymorphic_downcast(node), ctx); case NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer: - return detail::create_fused_depthwise_convolution_batch_normalization_layer(*polymorphic_downcast(node), ctx); + return detail::create_fused_depthwise_convolution_batch_normalization_layer( + *polymorphic_downcast(node), ctx); case NodeType::L2NormalizeLayer: - return detail::create_l2_normalize_layer(*polymorphic_downcast(node), ctx); + return detail::create_l2_normalize_layer( + *polymorphic_downcast(node), ctx); case NodeType::NormalizationLayer: - return detail::create_normalization_layer(*polymorphic_downcast(node), ctx); + return detail::create_normalization_layer( + *polymorphic_downcast(node), ctx); case NodeType::PadLayer: return detail::create_pad_layer(*polymorphic_downcast(node)); case NodeType::PermuteLayer: - return detail::create_permute_layer(*polymorphic_downcast(node)); + return detail::create_permute_layer( + *polymorphic_downcast(node)); case NodeType::PoolingLayer: - return 
detail::create_pooling_layer(*polymorphic_downcast(node)); + return detail::create_pooling_layer( + *polymorphic_downcast(node)); case NodeType::PReluLayer: - return detail::create_prelu_layer(*polymorphic_downcast(node)); + return detail::create_prelu_layer( + *polymorphic_downcast(node)); case NodeType::PrintLayer: return detail::create_print_layer(*polymorphic_downcast(node)); case NodeType::PriorBoxLayer: - return detail::create_priorbox_layer(*polymorphic_downcast(node)); + return detail::create_priorbox_layer( + *polymorphic_downcast(node)); case NodeType::QuantizationLayer: - return detail::create_quantization_layer(*polymorphic_downcast(node)); + return detail::create_quantization_layer( + *polymorphic_downcast(node)); case NodeType::ReductionOperationLayer: - return detail::create_reduction_operation_layer(*polymorphic_downcast(node), ctx); + return detail::create_reduction_operation_layer( + *polymorphic_downcast(node), ctx); case NodeType::ReorgLayer: - return detail::create_reorg_layer(*polymorphic_downcast(node)); + return detail::create_reorg_layer( + *polymorphic_downcast(node)); case NodeType::ReshapeLayer: - return detail::create_reshape_layer(*polymorphic_downcast(node)); + return detail::create_reshape_layer( + *polymorphic_downcast(node)); case NodeType::ResizeLayer: return detail::create_resize_layer(*polymorphic_downcast(node)); case NodeType::SliceLayer: return detail::create_slice_layer(*polymorphic_downcast(node)); case NodeType::SoftmaxLayer: - return detail::create_softmax_layer(*polymorphic_downcast(node), ctx); + return detail::create_softmax_layer( + *polymorphic_downcast(node), ctx); case NodeType::StackLayer: - return detail::create_stack_layer(*polymorphic_downcast(node)); + return detail::create_stack_layer( + *polymorphic_downcast(node)); case NodeType::StridedSliceLayer: - return detail::create_strided_slice_layer(*polymorphic_downcast(node)); + return detail::create_strided_slice_layer( + *polymorphic_downcast(node)); default: return nullptr; } diff --git a/src/graph/backends/NEON/NENodeValidator.cpp b/src/graph/backends/NEON/NENodeValidator.cpp index a485e5d23532f6118e475bb5b9d5bab3ff32d181..a97806f92cb046b42644d8c4fc4d7bbf7a41a373 100644 --- a/src/graph/backends/NEON/NENodeValidator.cpp +++ b/src/graph/backends/NEON/NENodeValidator.cpp @@ -25,9 +25,9 @@ #include "arm_compute/graph/backends/ValidateHelpers.h" #include "arm_compute/graph/nodes/Nodes.h" - #include "arm_compute/runtime/CPP/CPPFunctions.h" #include "arm_compute/runtime/NEON/NEFunctions.h" + #include "support/Cast.h" using namespace arm_compute::utils::cast; @@ -56,41 +56,51 @@ struct NEUnaryEltwiseLayerFunctions Status NENodeValidator::validate(INode *node) { - if(node == nullptr) + if (node == nullptr) { return Status{}; } NodeType type = node->type(); - switch(type) + switch (type) { case NodeType::ArgMinMaxLayer: - return detail::validate_arg_min_max_layer(*polymorphic_downcast(node)); + return detail::validate_arg_min_max_layer( + *polymorphic_downcast(node)); case NodeType::BoundingBoxTransformLayer: - return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : BoundingBoxTransformLayer"); + return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, + "Unsupported operation : BoundingBoxTransformLayer"); case NodeType::ChannelShuffleLayer: - return detail::validate_channel_shuffle_layer(*polymorphic_downcast(node)); + return detail::validate_channel_shuffle_layer( + *polymorphic_downcast(node)); case NodeType::ConvolutionLayer: - return 
detail::validate_convolution_layer(*polymorphic_downcast(node)); + return detail::validate_convolution_layer( + *polymorphic_downcast(node)); case NodeType::DepthToSpaceLayer: - return detail::validate_depth_to_space_layer(*polymorphic_downcast(node)); + return detail::validate_depth_to_space_layer( + *polymorphic_downcast(node)); case NodeType::DepthwiseConvolutionLayer: - return detail::validate_depthwise_convolution_layer(*polymorphic_downcast(node)); + return detail::validate_depthwise_convolution_layer( + *polymorphic_downcast(node)); case NodeType::DequantizationLayer: - return detail::validate_dequantization_layer(*polymorphic_downcast(node)); + return detail::validate_dequantization_layer( + *polymorphic_downcast(node)); case NodeType::DetectionOutputLayer: - return detail::validate_detection_output_layer(*polymorphic_downcast(node)); + return detail::validate_detection_output_layer( + *polymorphic_downcast(node)); case NodeType::DetectionPostProcessLayer: - return detail::validate_detection_post_process_layer(*polymorphic_downcast(node)); + return detail::validate_detection_post_process_layer( + *polymorphic_downcast(node)); case NodeType::GenerateProposalsLayer: - return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : GenerateProposalsLayer"); + return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, + "Unsupported operation : GenerateProposalsLayer"); case NodeType::L2NormalizeLayer: - return detail::validate_l2_normalize_layer(*polymorphic_downcast(node)); + return detail::validate_l2_normalize_layer( + *polymorphic_downcast(node)); case NodeType::NormalizePlanarYUVLayer: - return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : NormalizePlanarYUVLayer"); + return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, + "Unsupported operation : NormalizePlanarYUVLayer"); case NodeType::PadLayer: return detail::validate_pad_layer(*polymorphic_downcast(node)); case NodeType::PermuteLayer: @@ -100,23 +110,29 @@ Status NENodeValidator::validate(INode *node) case NodeType::PriorBoxLayer: return detail::validate_priorbox_layer(*polymorphic_downcast(node)); case NodeType::QuantizationLayer: - return detail::validate_quantization_layer(*polymorphic_downcast(node)); + return detail::validate_quantization_layer( + *polymorphic_downcast(node)); case NodeType::ReductionOperationLayer: - return detail::validate_reduction_operation_layer(*polymorphic_downcast(node)); + return detail::validate_reduction_operation_layer( + *polymorphic_downcast(node)); case NodeType::ReorgLayer: return detail::validate_reorg_layer(*polymorphic_downcast(node)); case NodeType::ReshapeLayer: return detail::validate_reshape_layer(*polymorphic_downcast(node)); case NodeType::ROIAlignLayer: - return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : ROIAlignLayer"); + return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, + "Unsupported operation : ROIAlignLayer"); case NodeType::SliceLayer: return detail::validate_slice_layer(*polymorphic_downcast(node)); case NodeType::StridedSliceLayer: - return detail::validate_strided_slice_layer(*polymorphic_downcast(node)); + return detail::validate_strided_slice_layer( + *polymorphic_downcast(node)); case NodeType::EltwiseLayer: - return detail::validate_eltwise_Layer(*polymorphic_downcast(node)); + return detail::validate_eltwise_Layer( + *polymorphic_downcast(node)); case NodeType::UnaryEltwiseLayer: - return 
detail::validate_unary_eltwise_layer(*polymorphic_downcast(node)); + return detail::validate_unary_eltwise_layer( + *polymorphic_downcast(node)); default: return Status{}; } diff --git a/src/graph/backends/NEON/NESubTensorHandle.cpp b/src/graph/backends/NEON/NESubTensorHandle.cpp index 36f29d0d100c5a974874d156b6cd556419c99d83..8964a00c5ef1b7c5ff7a2e11735821fd14d8761f 100644 --- a/src/graph/backends/NEON/NESubTensorHandle.cpp +++ b/src/graph/backends/NEON/NESubTensorHandle.cpp @@ -29,7 +29,10 @@ namespace graph { namespace backends { -NESubTensorHandle::NESubTensorHandle(ITensorHandle *parent_handle, const TensorShape &shape, const Coordinates &coords, bool extend_parent) +NESubTensorHandle::NESubTensorHandle(ITensorHandle *parent_handle, + const TensorShape &shape, + const Coordinates &coords, + bool extend_parent) : _sub_tensor(), _parent_handle(nullptr) { ARM_COMPUTE_ERROR_ON(!parent_handle); @@ -95,4 +98,4 @@ Target NESubTensorHandle::target() const } } // namespace backends } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/backends/NEON/NETensorHandle.cpp b/src/graph/backends/NEON/NETensorHandle.cpp index 4393156e8ab17934e60b541e36995b85fdf43251..dabf67060dabc3b83eac34c5a24750a3ef5957b3 100644 --- a/src/graph/backends/NEON/NETensorHandle.cpp +++ b/src/graph/backends/NEON/NETensorHandle.cpp @@ -24,6 +24,7 @@ #include "arm_compute/graph/backends/NEON/NETensorHandle.h" #include "arm_compute/runtime/MemoryGroup.h" + #include "support/Cast.h" namespace arm_compute @@ -32,8 +33,7 @@ namespace graph { namespace backends { -NETensorHandle::NETensorHandle(const ITensorInfo &info) - : _tensor() +NETensorHandle::NETensorHandle(const ITensorInfo &info) : _tensor() { _tensor.allocator()->init(info); } @@ -50,7 +50,7 @@ void NETensorHandle::free() void NETensorHandle::manage(IMemoryGroup *mg) { - if(mg != nullptr) + if (mg != nullptr) { mg->manage(&_tensor); } @@ -68,7 +68,7 @@ void NETensorHandle::unmap() void NETensorHandle::release_if_unused() { // TODO (geopin01): Release tensor only if all sub-tensors are marked as not used - if(!_tensor.is_used()) + if (!_tensor.is_used()) { _tensor.allocator()->free(); } @@ -100,4 +100,4 @@ Target NETensorHandle::target() const } } // namespace backends } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp b/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp index b45f453f23c17263d82f1cc2837d3006df43d870..1e813dc678f7f26b565ab59d8758955f200a5112 100644 --- a/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp +++ b/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp @@ -23,6 +23,8 @@ */ #include "arm_compute/graph/detail/CrossLayerMemoryManagerHelpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/graph/backends/BackendRegistry.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/GraphContext.h" #include "arm_compute/graph/GraphManager.h" @@ -30,9 +32,7 @@ #include "arm_compute/graph/Tensor.h" #include "arm_compute/graph/Types.h" #include "arm_compute/graph/Utils.h" -#include "arm_compute/graph/backends/BackendRegistry.h" -#include "arm_compute/core/ITensor.h" #include "support/Cast.h" #include @@ -78,28 +78,28 @@ IMemoryGroup *get_memory_group_from_handle(GraphContext &ctx, ITensorHandle *han */ std::set get_const_handles(const Graph &g) { - std::set const_node_types = { NodeType::Input, NodeType::Output, NodeType::Const }; + std::set 
const_node_types = {NodeType::Input, NodeType::Output, NodeType::Const}; std::set const_tensors; auto &nodes = g.nodes(); - for(auto &node : nodes) + for (auto &node : nodes) { // If its a const node: - if(node != nullptr && const_node_types.find(node->type()) != std::end(const_node_types)) + if (node != nullptr && const_node_types.find(node->type()) != std::end(const_node_types)) { // TODO (geopin01) : Create IO iterator wrappers // Add all its inputs / outputs to the list of constant handles - for(unsigned int i = 0; i < node->num_inputs(); ++i) + for (unsigned int i = 0; i < node->num_inputs(); ++i) { - if(node->input(i) != nullptr) + if (node->input(i) != nullptr) { const_tensors.insert(node->input(i)->handle()->parent_handle()); } } - for(unsigned int i = 0; i < node->num_outputs(); ++i) + for (unsigned int i = 0; i < node->num_outputs(); ++i) { - if(node->output(i) != nullptr) + if (node->output(i) != nullptr) { const_tensors.insert(node->output(i)->handle()->parent_handle()); } @@ -118,9 +118,8 @@ std::set get_const_handles(const Graph &g) * * @return List of transition handles */ -TaskHandles get_transition_handles(GraphContext &ctx, - ExecutionTask &task, - const std::set &const_tensors) +TaskHandles +get_transition_handles(GraphContext &ctx, ExecutionTask &task, const std::set &const_tensors) { ARM_COMPUTE_ERROR_ON(task.node == nullptr || (task.task == nullptr && !is_utility_node(task.node))); INode &node = *task.node; @@ -128,28 +127,30 @@ TaskHandles get_transition_handles(GraphContext &ctx, TaskHandles transition_handles; // Add input handles - for(unsigned int i = 0; i < node.input_edges().size(); ++i) + for (unsigned int i = 0; i < node.input_edges().size(); ++i) { Edge *input_edge = node.input_edge(i); // If this input is the output of another node - if(input_edge != nullptr && input_edge->tensor() != nullptr && const_tensors.find(input_edge->tensor()->handle()->parent_handle()) == std::end(const_tensors)) + if (input_edge != nullptr && input_edge->tensor() != nullptr && + const_tensors.find(input_edge->tensor()->handle()->parent_handle()) == std::end(const_tensors)) { // Then add it to the list of transition buffers ITensorHandle *tensor_handle = input_edge->tensor()->handle()->parent_handle(); - IMemoryGroup *mm_group = get_memory_group_from_handle(ctx, tensor_handle); + IMemoryGroup *mm_group = get_memory_group_from_handle(ctx, tensor_handle); transition_handles.input_handles.emplace_back(std::make_pair(tensor_handle, mm_group)); } } // Add output handles - for(unsigned int i = 0; i < node.num_outputs(); ++i) + for (unsigned int i = 0; i < node.num_outputs(); ++i) { Tensor *output_tensor = node.output(i); // If this output is used as an input for another node - if(output_tensor != nullptr && const_tensors.find(output_tensor->handle()->parent_handle()) == std::end(const_tensors)) + if (output_tensor != nullptr && + const_tensors.find(output_tensor->handle()->parent_handle()) == std::end(const_tensors)) { ITensorHandle *tensor_handle = output_tensor->handle()->parent_handle(); - IMemoryGroup *mm_group = get_memory_group_from_handle(ctx, tensor_handle); + IMemoryGroup *mm_group = get_memory_group_from_handle(ctx, tensor_handle); transition_handles.output_handles.emplace_back(std::make_pair(tensor_handle, mm_group)); } } @@ -164,11 +165,11 @@ TaskHandles get_transition_handles(GraphContext &ctx, */ void count_input_handles_per_target(const TaskHandles &task_handles, TargetHandleCounter &handle_counter) { - for(const auto &handle : task_handles.input_handles) + for (const 
auto &handle : task_handles.input_handles) { ITensorHandle *key = handle.first; HandleCounter &target_counter = handle_counter[key->target()]; - if(target_counter.find(key) == std::end(target_counter)) + if (target_counter.find(key) == std::end(target_counter)) { target_counter.emplace(std::make_pair(key, 1)); } @@ -192,12 +193,12 @@ void configure_handle_lifetime(std::vector &tasks_handles, const Ha // Acquires the given handles and sets them as in flight if they aren't already auto acquire = [&](std::vector> &handles) { - for(auto &handle : handles) + for (auto &handle : handles) { ITensorHandle *parent_handle = handle.first; ARM_COMPUTE_ERROR_ON(parent_handle == nullptr); // If the tensor is not already in flight: - if(tensors_in_flight.find(parent_handle) == std::end(tensors_in_flight)) + if (tensors_in_flight.find(parent_handle) == std::end(tensors_in_flight)) { ARM_COMPUTE_ERROR_ON(hc.find(parent_handle) == std::end(hc)); // Then add it to the list of in flight tensors @@ -208,20 +209,20 @@ void configure_handle_lifetime(std::vector &tasks_handles, const Ha } }; - for(auto &task_handle : tasks_handles) + for (auto &task_handle : tasks_handles) { // Marking all the input and output tensors of the task as in flight acquire(task_handle.input_handles); acquire(task_handle.output_handles); // Releasing the input tensors - for(auto &input_handle : task_handle.input_handles) + for (auto &input_handle : task_handle.input_handles) { ITensorHandle *ihandle = input_handle.first; ARM_COMPUTE_ERROR_ON(ihandle == nullptr); ARM_COMPUTE_ERROR_ON(tensors_in_flight.find(ihandle) == std::end(tensors_in_flight)); --tensors_in_flight[ihandle]; - if(tensors_in_flight[ihandle] <= 0) + if (tensors_in_flight[ihandle] <= 0) { // Remove tensor for tensors in flight tensors_in_flight.erase(ihandle); @@ -242,7 +243,7 @@ void configure_transition_manager(Graph &g, GraphContext &ctx, ExecutionWorkload TargetHandleCounter target_handle_count; // Count handles - for(auto &task : workload.tasks) + for (auto &task : workload.tasks) { // Populates IO handles tasks_handles.push_back(get_transition_handles(ctx, task, const_tensors)); @@ -252,12 +253,12 @@ void configure_transition_manager(Graph &g, GraphContext &ctx, ExecutionWorkload } // Setup memory managers - for(auto &hc : target_handle_count) + for (auto &hc : target_handle_count) { MemoryManagerContext *mm_ctx = ctx.memory_management_ctx(hc.first); - if(mm_ctx != nullptr) + if (mm_ctx != nullptr) { - if(mm_ctx->cross_mm != nullptr && mm_ctx->cross_group != nullptr) + if (mm_ctx->cross_mm != nullptr && mm_ctx->cross_group != nullptr) { // Manage and allocate tensors configure_handle_lifetime(tasks_handles, hc.second); diff --git a/src/graph/detail/ExecutionHelpers.cpp b/src/graph/detail/ExecutionHelpers.cpp index ac800df76ce4da008ef14bab063fcc0c75636e10..870d24a6c77080bbfbdbd9bb225998bdec13a20f 100644 --- a/src/graph/detail/ExecutionHelpers.cpp +++ b/src/graph/detail/ExecutionHelpers.cpp @@ -23,12 +23,12 @@ */ #include "arm_compute/graph/detail/ExecutionHelpers.h" +#include "arm_compute/graph/backends/BackendRegistry.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/GraphContext.h" #include "arm_compute/graph/GraphManager.h" #include "arm_compute/graph/Tensor.h" #include "arm_compute/graph/Utils.h" -#include "arm_compute/graph/backends/BackendRegistry.h" namespace arm_compute { @@ -41,9 +41,9 @@ void validate_all_nodes(Graph &g) auto &nodes = g.nodes(); // Create tasks - for(auto &node : nodes) + for (auto &node : nodes) { - if(node != nullptr) + 
if (node != nullptr) { Target assigned_target = node->assigned_target(); backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(assigned_target); @@ -57,9 +57,9 @@ void configure_all_tensors(Graph &g) { auto &tensors = g.tensors(); - for(auto &tensor : tensors) + for (auto &tensor : tensors) { - if(tensor && tensor->handle() == nullptr) + if (tensor && tensor->handle() == nullptr) { Target target = tensor->desc().target; backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(target); @@ -72,10 +72,10 @@ void configure_all_tensors(Graph &g) void allocate_all_input_tensors(INode &node) { - for(unsigned int i = 0; i < node.num_inputs(); ++i) + for (unsigned int i = 0; i < node.num_inputs(); ++i) { Tensor *tensor = node.input(i); - if(tensor != nullptr && !tensor->bound_edges().empty()) + if (tensor != nullptr && !tensor->bound_edges().empty()) { ARM_COMPUTE_ERROR_ON_MSG(!tensor->handle(), "Tensor handle is not configured!"); tensor->handle()->allocate(); @@ -85,10 +85,10 @@ void allocate_all_input_tensors(INode &node) void allocate_all_output_tensors(INode &node) { - for(unsigned int i = 0; i < node.num_outputs(); ++i) + for (unsigned int i = 0; i < node.num_outputs(); ++i) { Tensor *tensor = node.output(i); - if(tensor != nullptr && !tensor->bound_edges().empty()) + if (tensor != nullptr && !tensor->bound_edges().empty()) { ARM_COMPUTE_ERROR_ON_MSG(!tensor->handle(), "Tensor handle is not configured!"); tensor->handle()->allocate(); @@ -98,11 +98,11 @@ void allocate_all_output_tensors(INode &node) void allocate_const_tensors(Graph &g) { - for(auto &node : g.nodes()) + for (auto &node : g.nodes()) { - if(node != nullptr) + if (node != nullptr) { - switch(node->type()) + switch (node->type()) { case NodeType::Const: case NodeType::Input: @@ -121,9 +121,10 @@ void allocate_all_tensors(Graph &g) { auto &tensors = g.tensors(); - for(auto &tensor : tensors) + for (auto &tensor : tensors) { - if(tensor && !tensor->bound_edges().empty() && tensor->handle() != nullptr && tensor->handle()->tensor().info()->is_resizable() && tensor->handle()->tensor().is_used()) + if (tensor && !tensor->bound_edges().empty() && tensor->handle() != nullptr && + tensor->handle()->tensor().info()->is_resizable() && tensor->handle()->tensor().is_used()) { tensor->handle()->allocate(); } @@ -140,15 +141,15 @@ ExecutionWorkload configure_all_nodes(Graph &g, GraphContext &ctx, const std::ve workload.tasks.reserve(node_order.size()); // Create tasks - for(auto &node_id : node_order) + for (auto &node_id : node_order) { auto node = g.node(node_id); - if(node != nullptr) + if (node != nullptr) { Target assigned_target = node->assigned_target(); - backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(assigned_target); + backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(assigned_target); std::unique_ptr func = backend.configure_node(*node, ctx); - if(func != nullptr || is_utility_node(node)) + if (func != nullptr || is_utility_node(node)) { workload.tasks.emplace_back(ExecutionTask(std::move(func), node)); } @@ -156,14 +157,14 @@ ExecutionWorkload configure_all_nodes(Graph &g, GraphContext &ctx, const std::ve } // Add inputs and outputs - for(auto &node : g.nodes()) + for (auto &node : g.nodes()) { - if(node != nullptr && node->type() == NodeType::Input) + if (node != nullptr && node->type() == NodeType::Input) { workload.inputs.push_back(node->output(0)); } - if(node != nullptr && node->type() == NodeType::Output) + if 
(node != nullptr && node->type() == NodeType::Output) { workload.outputs.push_back(node->input(0)); continue; @@ -175,9 +176,9 @@ ExecutionWorkload configure_all_nodes(Graph &g, GraphContext &ctx, const std::ve void release_unused_tensors(Graph &g) { - for(auto &tensor : g.tensors()) + for (auto &tensor : g.tensors()) { - if(tensor != nullptr && tensor->handle() != nullptr) + if (tensor != nullptr && tensor->handle() != nullptr) { tensor->handle()->release_if_unused(); } @@ -194,11 +195,11 @@ void call_all_const_node_accessors(Graph &g) { auto &nodes = g.nodes(); - for(auto &node : nodes) + for (auto &node : nodes) { - if(node != nullptr && node->type() == NodeType::Const && node->num_outputs()) + if (node != nullptr && node->type() == NodeType::Const && node->num_outputs()) { - if(!node->output(0)->bound_edges().empty()) + if (!node->output(0)->bound_edges().empty()) { call_tensor_accessor(node->output(0)); } @@ -209,18 +210,19 @@ void call_all_const_node_accessors(Graph &g) bool call_all_input_node_accessors(ExecutionWorkload &workload) { bool is_valid = true; - std::for_each(std::begin(workload.inputs), std::end(workload.inputs), [&](Tensor * input_tensor) - { - bool valid_input = (input_tensor != nullptr) && input_tensor->call_accessor(); - is_valid = is_valid && valid_input; - }); + std::for_each(std::begin(workload.inputs), std::end(workload.inputs), + [&](Tensor *input_tensor) + { + bool valid_input = (input_tensor != nullptr) && input_tensor->call_accessor(); + is_valid = is_valid && valid_input; + }); return is_valid; } void prepare_all_tasks(ExecutionWorkload &workload) { ARM_COMPUTE_ERROR_ON(workload.graph == nullptr); - for(auto &task : workload.tasks) + for (auto &task : workload.tasks) { task.prepare(); release_unused_tensors(*workload.graph); @@ -232,24 +234,24 @@ void call_all_tasks(ExecutionWorkload &workload) ARM_COMPUTE_ERROR_ON(workload.ctx == nullptr); // Acquire memory for the transition buffers - for(auto &mm_ctx : workload.ctx->memory_managers()) + for (auto &mm_ctx : workload.ctx->memory_managers()) { - if(mm_ctx.second.cross_group != nullptr) + if (mm_ctx.second.cross_group != nullptr) { mm_ctx.second.cross_group->acquire(); } } // Execute tasks - for(auto &task : workload.tasks) + for (auto &task : workload.tasks) { task(); } // Release memory for the transition buffers - for(auto &mm_ctx : workload.ctx->memory_managers()) + for (auto &mm_ctx : workload.ctx->memory_managers()) { - if(mm_ctx.second.cross_group != nullptr) + if (mm_ctx.second.cross_group != nullptr) { mm_ctx.second.cross_group->release(); } @@ -259,11 +261,12 @@ void call_all_tasks(ExecutionWorkload &workload) bool call_all_output_node_accessors(ExecutionWorkload &workload) { bool is_valid = true; - std::for_each(std::begin(workload.outputs), std::end(workload.outputs), [&](Tensor * output_tensor) - { - bool valid_output = (output_tensor != nullptr) && output_tensor->call_accessor(); - is_valid = is_valid && valid_output; - }); + std::for_each(std::begin(workload.outputs), std::end(workload.outputs), + [&](Tensor *output_tensor) + { + bool valid_output = (output_tensor != nullptr) && output_tensor->call_accessor(); + is_valid = is_valid && valid_output; + }); sync_backends(); diff --git a/src/graph/frontend/Stream.cpp b/src/graph/frontend/Stream.cpp index 44c8400874795cb626287343db56a8828f6eb45a..383a6dc67f34f80da307235ad2993bfd000336ba 100644 --- a/src/graph/frontend/Stream.cpp +++ b/src/graph/frontend/Stream.cpp @@ -23,8 +23,8 @@ */ #include "arm_compute/graph/frontend/Stream.h" -#include 
"arm_compute/graph/Utils.h" #include "arm_compute/graph/frontend/ILayer.h" +#include "arm_compute/graph/Utils.h" namespace arm_compute { @@ -32,8 +32,7 @@ namespace graph { namespace frontend { -Stream::Stream(size_t id, std::string name) - : _ctx(), _manager(), _g(id, std::move(name)) +Stream::Stream(size_t id, std::string name) : _ctx(), _manager(), _g(id, std::move(name)) { } diff --git a/src/graph/frontend/SubStream.cpp b/src/graph/frontend/SubStream.cpp index 4b42207e80fdc2d55fd47137dabd65f0e301af54..8596aaa1a3a333968d1f2db5bdd1f458011d803b 100644 --- a/src/graph/frontend/SubStream.cpp +++ b/src/graph/frontend/SubStream.cpp @@ -23,8 +23,8 @@ */ #include "arm_compute/graph/frontend/SubStream.h" -#include "arm_compute/graph/Graph.h" #include "arm_compute/graph/frontend/ILayer.h" +#include "arm_compute/graph/Graph.h" namespace arm_compute { @@ -32,8 +32,7 @@ namespace graph { namespace frontend { -SubStream::SubStream(IStream &s) - : _s(s) +SubStream::SubStream(IStream &s) : _s(s) { _hints = s.hints(); _tail_node = s.tail_node(); diff --git a/src/graph/mutators/DepthConcatSubTensorMutator.cpp b/src/graph/mutators/DepthConcatSubTensorMutator.cpp index 963b94843294631cfb777c3d0665472aefeeed33..1b7ee3c4a4c793f1313c8d1dadee15a359cd3efb 100644 --- a/src/graph/mutators/DepthConcatSubTensorMutator.cpp +++ b/src/graph/mutators/DepthConcatSubTensorMutator.cpp @@ -23,12 +23,12 @@ */ #include "arm_compute/graph/mutators/DepthConcatSubTensorMutator.h" -#include "arm_compute/graph/Graph.h" -#include "arm_compute/graph/Logger.h" -#include "arm_compute/graph/Utils.h" #include "arm_compute/graph/algorithms/TopologicalSort.h" #include "arm_compute/graph/backends/BackendRegistry.h" +#include "arm_compute/graph/Graph.h" +#include "arm_compute/graph/Logger.h" #include "arm_compute/graph/nodes/ConcatenateLayerNode.h" +#include "arm_compute/graph/Utils.h" #include "support/Cast.h" #include "support/Iterable.h" @@ -50,7 +50,7 @@ IGraphMutator::MutationType DepthConcatSubTensorMutator::type() const void DepthConcatSubTensorMutator::mutate(Graph &g) { // Early exit if no Concatenation layers exist in graph - if(g.nodes(NodeType::ConcatenateLayer).empty()) + if (g.nodes(NodeType::ConcatenateLayer).empty()) { return; } @@ -59,43 +59,48 @@ void DepthConcatSubTensorMutator::mutate(Graph &g) std::vector topological_sorted_node_ids = dfs(g); // Should be in reverse order of execution - for(auto &node_id : arm_compute::utils::iterable::reverse_iterate(topological_sorted_node_ids)) + for (auto &node_id : arm_compute::utils::iterable::reverse_iterate(topological_sorted_node_ids)) { INode *node = g.node(node_id); - if(node != nullptr && node->type() == NodeType::ConcatenateLayer && node->output(0) != nullptr) + if (node != nullptr && node->type() == NodeType::ConcatenateLayer && node->output(0) != nullptr) { // Get output tensor auto output_tensor = node->output(0); // Check concatenation axis (Sub-tensor optimization is supported for concatenation axis >=2) auto *concat_node = arm_compute::utils::cast::polymorphic_downcast(node); - if(output_tensor == nullptr || get_dimension_idx(output_tensor->desc().layout, concat_node->concatenation_axis()) < 2) + if (output_tensor == nullptr || + get_dimension_idx(output_tensor->desc().layout, concat_node->concatenation_axis()) < 2) { continue; } // Check that all tensor have the same target, valid inputs and same quantization info - bool is_valid = std::all_of(node->input_edges().cbegin(), node->input_edges().cend(), - [&](const EdgeID & eid) - { - return (g.edge(eid) != nullptr) && 
(g.edge(eid)->tensor() != nullptr) && (g.edge(eid)->tensor()->desc().target == output_tensor->desc().target) - && (g.edge(eid)->tensor()->desc().quant_info == output_tensor->desc().quant_info); - }); + bool is_valid = + std::all_of(node->input_edges().cbegin(), node->input_edges().cend(), + [&](const EdgeID &eid) + { + return (g.edge(eid) != nullptr) && (g.edge(eid)->tensor() != nullptr) && + (g.edge(eid)->tensor()->desc().target == output_tensor->desc().target) && + (g.edge(eid)->tensor()->desc().quant_info == output_tensor->desc().quant_info); + }); // Create subtensors - if(is_valid && is_target_supported(output_tensor->desc().target)) + if (is_valid && is_target_supported(output_tensor->desc().target)) { ARM_COMPUTE_LOG_GRAPH_VERBOSE("Using sub-tensors for the node with ID : " << node->id() << " and name : " << node->name() << std::endl); // Create sub-tensor handles unsigned depth = 0; - for(unsigned int i = 0; i < node->input_edges().size(); ++i) + for (unsigned int i = 0; i < node->input_edges().size(); ++i) { auto input_tensor = node->input(i); const auto input_shape = input_tensor->desc().shape; - backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(input_tensor->desc().target); - std::unique_ptr handle = backend.create_subtensor(output_tensor->handle(), input_shape, Coordinates(0, 0, depth), false); + backends::IDeviceBackend &backend = + backends::BackendRegistry::get().get_backend(input_tensor->desc().target); + std::unique_ptr handle = + backend.create_subtensor(output_tensor->handle(), input_shape, Coordinates(0, 0, depth), false); input_tensor->set_handle(std::move(handle)); depth += input_shape.z(); diff --git a/src/graph/mutators/GroupedConvolutionMutator.cpp b/src/graph/mutators/GroupedConvolutionMutator.cpp index b7c551ce8b6c27a2c2ef7a7967cb2219d9e62361..31efba6bb1a8230112e5c23bf94bd5e9c99dd517 100644 --- a/src/graph/mutators/GroupedConvolutionMutator.cpp +++ b/src/graph/mutators/GroupedConvolutionMutator.cpp @@ -23,15 +23,14 @@ */ #include "arm_compute/graph/mutators/GroupedConvolutionMutator.h" +#include "arm_compute/graph/backends/BackendRegistry.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/GraphBuilder.h" #include "arm_compute/graph/Logger.h" -#include "arm_compute/graph/Utils.h" -#include "arm_compute/graph/backends/BackendRegistry.h" #include "arm_compute/graph/nodes/Nodes.h" +#include "arm_compute/graph/Utils.h" #include "support/Cast.h" - #include "support/StringSupport.h" #include @@ -42,43 +41,51 @@ namespace graph { namespace { -NodeID create_grouped_convolution(Graph &g, const NodeParams ¶ms, NodeIdxPair input, NodeID weights, NodeID bias, - PadStrideInfo conv_info, ConvolutionMethod method, ActivationLayerInfo fused_act, FastMathHint fast_math_hint, unsigned int num_groups) +NodeID create_grouped_convolution(Graph &g, + const NodeParams ¶ms, + NodeIdxPair input, + NodeID weights, + NodeID bias, + PadStrideInfo conv_info, + ConvolutionMethod method, + ActivationLayerInfo fused_act, + FastMathHint fast_math_hint, + unsigned int num_groups) { bool has_bias = (bias != EmptyNodeID); // Split input const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]); - const unsigned int input_idx = get_dimension_idx(input_tensor_desc.layout, DataLayoutDimension::CHANNEL); - NodeID input_split = GraphBuilder::add_split_node(g, params, input, num_groups, input_idx); + const unsigned int input_idx = get_dimension_idx(input_tensor_desc.layout, DataLayoutDimension::CHANNEL); + 
NodeID input_split = GraphBuilder::add_split_node(g, params, input, num_groups, input_idx); // Split weights const TensorDescriptor weights_tensor_desc = get_tensor_descriptor(g, g.node(weights)->outputs()[0]); - const unsigned int batch_idx = get_dimension_idx(weights_tensor_desc.layout, DataLayoutDimension::BATCHES); - NodeID weights_split = GraphBuilder::add_split_node(g, params, { weights, 0 }, num_groups, batch_idx); + const unsigned int batch_idx = get_dimension_idx(weights_tensor_desc.layout, DataLayoutDimension::BATCHES); + NodeID weights_split = GraphBuilder::add_split_node(g, params, {weights, 0}, num_groups, batch_idx); // Split bias NodeID bias_split = EmptyNodeID; - if(has_bias) + if (has_bias) { // Split bias - bias_split = GraphBuilder::add_split_node(g, params, { bias, 0 }, num_groups, 0); + bias_split = GraphBuilder::add_split_node(g, params, {bias, 0}, num_groups, 0); } std::vector convolution_outputs; - for(unsigned int i = 0; i < num_groups; ++i) + for (unsigned int i = 0; i < num_groups; ++i) { NodeParams group_params = params; NodeID conv_nid = g.add_node(conv_info, 1, method, fast_math_hint); g.add_connection(input_split, i, conv_nid, 0); g.add_connection(weights_split, i, conv_nid, 1); - if(has_bias) + if (has_bias) { g.add_connection(bias_split, i, conv_nid, 2); } // Add group name - if(!group_params.name.empty()) + if (!group_params.name.empty()) { group_params.name.append("_g" + arm_compute::support::cpp11::to_string(i)); } @@ -92,7 +99,7 @@ NodeID create_grouped_convolution(Graph &g, const NodeParams ¶ms, NodeIdxPai auto *conv_node = arm_compute::utils::cast::polymorphic_downcast(node); conv_node->set_fused_activation(fused_act); - convolution_outputs.push_back({ conv_nid, 0 }); + convolution_outputs.push_back({conv_nid, 0}); } // Depth concatenate output @@ -113,7 +120,7 @@ IGraphMutator::MutationType GroupedConvolutionMutator::type() const void GroupedConvolutionMutator::mutate(Graph &g) { // Early exit if no Convolution layers exist in graph - if(g.nodes(NodeType::ConvolutionLayer).empty()) + if (g.nodes(NodeType::ConvolutionLayer).empty()) { return; } @@ -122,17 +129,18 @@ void GroupedConvolutionMutator::mutate(Graph &g) size_t total_nodes = g.nodes().size(); // Iterate over convolution nodes - for(unsigned int i = 0; i < total_nodes; ++i) + for (unsigned int i = 0; i < total_nodes; ++i) { INode *node = g.node(i); - if(node != nullptr && node->type() == NodeType::ConvolutionLayer && arm_compute::utils::cast::polymorphic_downcast(node)->num_groups() != 1) + if (node != nullptr && node->type() == NodeType::ConvolutionLayer && + arm_compute::utils::cast::polymorphic_downcast(node)->num_groups() != 1) { // Validate node backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(node->assigned_target()); Status status = backend.validate_node(*node); // If grouped convolution is not supported - if(!bool(status)) + if (!bool(status)) { // Down-cast node auto *conv_node = arm_compute::utils::cast::polymorphic_downcast(node); @@ -151,7 +159,8 @@ void GroupedConvolutionMutator::mutate(Graph &g) ARM_COMPUTE_ERROR_ON(conv_node->input_edge(0) == nullptr || conv_node->input_edge(1) == nullptr); const NodeID input_id = conv_node->input_edge(0)->producer()->id(); const NodeID weights_id = conv_node->input_edge(1)->producer()->id(); - const NodeID bias_id = (conv_node->input_edge(2) != nullptr) ? conv_node->input_edge(2)->producer()->id() : EmptyNodeID; + const NodeID bias_id = + (conv_node->input_edge(2) != nullptr) ? 
conv_node->input_edge(2)->producer()->id() : EmptyNodeID; // Get driving nodes std::vector driving_nodes = get_driving_nodes(*node); @@ -164,14 +173,15 @@ void GroupedConvolutionMutator::mutate(Graph &g) NodeID latest_nid = g.nodes().size(); // Create grouped convolution node - NodeID grouped_conv_id = create_grouped_convolution(g, params, { input_id, 0 }, weights_id, bias_id, - conv_info, conv_method, fused_act_info, fast_math_hint, num_groups); + NodeID grouped_conv_id = + create_grouped_convolution(g, params, {input_id, 0}, weights_id, bias_id, conv_info, conv_method, + fused_act_info, fast_math_hint, num_groups); // Remove convolution node g.remove_node(node->id()); // Update batch normalization node outputs - for(auto &driving_node : driving_nodes) + for (auto &driving_node : driving_nodes) { g.add_connection(grouped_conv_id, 0, driving_node.node_id, driving_node.index); } @@ -180,17 +190,16 @@ void GroupedConvolutionMutator::mutate(Graph &g) g.node(grouped_conv_id)->output(0)->set_accessor(std::move(node_accessor)); // Configure new tensors and nodes - std::for_each(g.tensors().begin() + latest_tid, g.tensors().end(), [](std::unique_ptr &t) - { - configure_tensor(t.get()); - }); - std::for_each(g.nodes().begin() + latest_nid, g.nodes().end(), [&assigned_target](std::unique_ptr &n) - { - if(n != nullptr) - { - n->set_assigned_target(assigned_target); - } - }); + std::for_each(g.tensors().begin() + latest_tid, g.tensors().end(), + [](std::unique_ptr &t) { configure_tensor(t.get()); }); + std::for_each(g.nodes().begin() + latest_nid, g.nodes().end(), + [&assigned_target](std::unique_ptr &n) + { + if (n != nullptr) + { + n->set_assigned_target(assigned_target); + } + }); } } } diff --git a/src/graph/mutators/InPlaceOperationMutator.cpp b/src/graph/mutators/InPlaceOperationMutator.cpp index d3ea940895cac769cb451dbdfb5fcdb4de81176d..a51dcc4f428b4bdb7163c962d4b687bd495b3d65 100644 --- a/src/graph/mutators/InPlaceOperationMutator.cpp +++ b/src/graph/mutators/InPlaceOperationMutator.cpp @@ -29,6 +29,7 @@ #include "arm_compute/graph/Logger.h" #include "arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h" #include "arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h" + #include "support/Cast.h" using namespace arm_compute::utils::cast; @@ -48,7 +49,7 @@ bool output_edges_are_separate_tensors(Graph &g, const Edge *input_edge) const auto input_tensor = input_edge->tensor(); const auto input_edge_id = input_edge->id(); - if(parent_node == nullptr) + if (parent_node == nullptr) { return false; } @@ -57,24 +58,23 @@ bool output_edges_are_separate_tensors(Graph &g, const Edge *input_edge) // If the output is connected to only one edge, then computations can // be done in-place. 
- if(output_edges.size() == 1) + if (output_edges.size() == 1) { return true; } - return std::all_of(output_edges.begin(), - output_edges.end(), - [&](const EdgeID & edge_id) - { - // Skip check on current input edge - if(edge_id == input_edge_id) - { - return true; - } - - auto edge = g.edge(edge_id); - return edge->tensor() != input_tensor; - }); + return std::all_of(output_edges.begin(), output_edges.end(), + [&](const EdgeID &edge_id) + { + // Skip check on current input edge + if (edge_id == input_edge_id) + { + return true; + } + + auto edge = g.edge(edge_id); + return edge->tensor() != input_tensor; + }); } // If do in-place calculation, then need to use the new output and inherit original output's accessor @@ -109,12 +109,14 @@ void try_in_place_depthwiseconv(std::unique_ptr &node) // Extract PadStrideInfo and depth multiplier PadStrideInfo conv_info{}; unsigned int depth_multiplier{}; - if(node->type() == NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer) + if (node->type() == NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer) { - conv_info = polymorphic_downcast(node.get())->convolution_info(); - depth_multiplier = polymorphic_downcast(node.get())->depth_multiplier(); + conv_info = + polymorphic_downcast(node.get())->convolution_info(); + depth_multiplier = + polymorphic_downcast(node.get())->depth_multiplier(); } - else if(node->type() == NodeType::DepthwiseConvolutionLayer) + else if (node->type() == NodeType::DepthwiseConvolutionLayer) { conv_info = polymorphic_downcast(node.get())->convolution_info(); depth_multiplier = polymorphic_downcast(node.get())->depth_multiplier(); @@ -126,7 +128,8 @@ void try_in_place_depthwiseconv(std::unique_ptr &node) const auto out_shape = current_output_tensor->desc().shape; const auto qinfo_out = current_output_tensor->desc().quant_info; - bool input_can_in_place = !arm_compute::detail::have_different_dimensions(out_shape, input_shape, 0) && (qinfo_input == qinfo_out) && (input_tensor->accessor() == nullptr); + bool input_can_in_place = !arm_compute::detail::have_different_dimensions(out_shape, input_shape, 0) && + (qinfo_input == qinfo_out) && (input_tensor->accessor() == nullptr); // Specify conditions with which input can be in-placed input_can_in_place &= weight_layout == input_tensor->desc().layout && weight_layout == DataLayout::NHWC; @@ -141,13 +144,14 @@ void try_in_place_depthwiseconv(std::unique_ptr &node) input_can_in_place &= !conv_info.has_padding(); // NOTE: Dilation should also be (1, 1). 
However currently dilation is not supported in the depthwise conv node - if(input_can_in_place) + if (input_can_in_place) { set_new_output_and_inherit_accessor(node, current_output_tensor, input_tensor); } else { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to the input tensor or the quantization info are different.\n"); + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to the input tensor " + "or the quantization info are different.\n"); } } @@ -170,7 +174,7 @@ void try_in_place_elementwise(std::unique_ptr &node) const TensorShape out_shape = TensorShape::broadcast_shape(shape0, shape1); // Inputs are not broadcast compatible - if(out_shape.total_size() == 0) + if (out_shape.total_size() == 0) { return; } @@ -181,22 +185,27 @@ void try_in_place_elementwise(std::unique_ptr &node) const auto qinfo_out = current_output_tensor->desc().quant_info; // Can do in place, if the input has same shape as output, has same quntisation info as output, has same data type as output and input doesn't have accessor. - bool input0_can_in_place = !arm_compute::detail::have_different_dimensions(out_shape, shape0, 0) && (qinfo0 == qinfo_out) - && (input0_tensor->desc().data_type == current_output_tensor->desc().data_type) && (input0_tensor->accessor() == nullptr); - bool input1_can_in_place = !arm_compute::detail::have_different_dimensions(out_shape, shape1, 0) && (qinfo1 == qinfo_out) - && (input1_tensor->desc().data_type == current_output_tensor->desc().data_type) && (input1_tensor->accessor() == nullptr); - - if(input0_can_in_place) + bool input0_can_in_place = !arm_compute::detail::have_different_dimensions(out_shape, shape0, 0) && + (qinfo0 == qinfo_out) && + (input0_tensor->desc().data_type == current_output_tensor->desc().data_type) && + (input0_tensor->accessor() == nullptr); + bool input1_can_in_place = !arm_compute::detail::have_different_dimensions(out_shape, shape1, 0) && + (qinfo1 == qinfo_out) && + (input1_tensor->desc().data_type == current_output_tensor->desc().data_type) && + (input1_tensor->accessor() == nullptr); + + if (input0_can_in_place) { set_new_output_and_inherit_accessor(node, current_output_tensor, input0_tensor); } - else if(input1_can_in_place) + else if (input1_can_in_place) { set_new_output_and_inherit_accessor(node, current_output_tensor, input1_tensor); } else { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to the input tensor or the quantization info are different.\n"); + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to the input tensor " + "or the quantization info are different.\n"); } } } // namespace @@ -213,33 +222,31 @@ IGraphMutator::MutationType InPlaceOperationMutator::type() const void InPlaceOperationMutator::mutate(Graph &g) { - std::set in_place_nodes = - { - NodeType::ActivationLayer, - NodeType::BatchNormalizationLayer, - NodeType::EltwiseLayer, - NodeType::UnaryEltwiseLayer, - NodeType::DepthwiseConvolutionLayer, - NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer, - NodeType::PrintLayer - }; + std::set in_place_nodes = {NodeType::ActivationLayer, + NodeType::BatchNormalizationLayer, + NodeType::EltwiseLayer, + NodeType::UnaryEltwiseLayer, + NodeType::DepthwiseConvolutionLayer, + NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer, + NodeType::PrintLayer}; // Not interested in the order of nodes - for(auto &node : g.nodes()) + for (auto &node : g.nodes()) { - if(node && 
in_place_nodes.find(node->type()) != std::end(in_place_nodes)) + if (node && in_place_nodes.find(node->type()) != std::end(in_place_nodes)) { // Get input edge Edge *input_edge = node->input_edge(0); // Check if parent has a single output if yes then force in place calculation else not - if((input_edge != nullptr) && output_edges_are_separate_tensors(g, input_edge)) + if ((input_edge != nullptr) && output_edges_are_separate_tensors(g, input_edge)) { - if(node->type() == NodeType::EltwiseLayer) + if (node->type() == NodeType::EltwiseLayer) { try_in_place_elementwise(node); } - else if(node->type() == NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer || node->type() == NodeType::DepthwiseConvolutionLayer) + else if (node->type() == NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer || + node->type() == NodeType::DepthwiseConvolutionLayer) { try_in_place_depthwiseconv(node); } @@ -252,9 +259,11 @@ void InPlaceOperationMutator::mutate(Graph &g) ARM_COMPUTE_ERROR_ON(current_output_tensor == nullptr || new_output_tensor == nullptr); // Prevent in-place operation if there is an accessor bound to the in-place tensor or quantization info are different - if(new_output_tensor->accessor() != nullptr || current_output_tensor->desc().quant_info != new_output_tensor->desc().quant_info) + if (new_output_tensor->accessor() != nullptr || + current_output_tensor->desc().quant_info != new_output_tensor->desc().quant_info) { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to the input tensor or the quantization info are different.\n"); + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to " + "the input tensor or the quantization info are different.\n"); } else { diff --git a/src/graph/mutators/MutatorUtils.cpp b/src/graph/mutators/MutatorUtils.cpp index c8f38f34e7746703ac0da2ed0d52c659da762fac..f47240eaddda844bfe69211e3237fdd3c2104406 100644 --- a/src/graph/mutators/MutatorUtils.cpp +++ b/src/graph/mutators/MutatorUtils.cpp @@ -29,14 +29,14 @@ namespace graph { bool is_padding_in_height_or_width(const DataLayout &layout, const PaddingList &padding_list) { - if(layout == DataLayout::NCHW || layout == DataLayout::NHWC) + if (layout == DataLayout::NCHW || layout == DataLayout::NHWC) { const unsigned int height_index = get_dimension_idx(layout, DataLayoutDimension::HEIGHT); const unsigned int width_index = get_dimension_idx(layout, DataLayoutDimension::WIDTH); - for(unsigned int i = 0; i < padding_list.size(); ++i) + for (unsigned int i = 0; i < padding_list.size(); ++i) { - if(i != height_index && i != width_index && padding_list[i] != PaddingInfo(0, 0)) + if (i != height_index && i != width_index && padding_list[i] != PaddingInfo(0, 0)) { // if the index is not either height or width, don't fuse return false; @@ -49,4 +49,4 @@ bool is_padding_in_height_or_width(const DataLayout &layout, const PaddingList & return false; } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/mutators/NodeExecutionMethodMutator.cpp b/src/graph/mutators/NodeExecutionMethodMutator.cpp index 09a3cf50c0b40f6557197d890b1a42bd98fb4b54..588befecae7d55d807b1f538052157e7463cdc3b 100644 --- a/src/graph/mutators/NodeExecutionMethodMutator.cpp +++ b/src/graph/mutators/NodeExecutionMethodMutator.cpp @@ -23,11 +23,11 @@ */ #include "arm_compute/graph/mutators/NodeExecutionMethodMutator.h" +#include "arm_compute/graph/backends/BackendRegistry.h" #include 
"arm_compute/graph/Graph.h" #include "arm_compute/graph/Logger.h" -#include "arm_compute/graph/Utils.h" -#include "arm_compute/graph/backends/BackendRegistry.h" #include "arm_compute/graph/nodes/Nodes.h" +#include "arm_compute/graph/Utils.h" #include "support/Cast.h" @@ -49,17 +49,17 @@ template void set_default_on_invalid_method(Graph &g, NodeType node_type, Setter &&setter) { const std::vector &node_ids = g.nodes(node_type); - for(auto &node_id : node_ids) + for (auto &node_id : node_ids) { INode *node = g.node(node_id); - if(node != nullptr) + if (node != nullptr) { // Validate node backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(node->assigned_target()); Status status = backend.validate_node(*node); // Set default execution method in case of failure - if(!bool(status)) + if (!bool(status)) { setter(node); } @@ -81,22 +81,26 @@ IGraphMutator::MutationType NodeExecutionMethodMutator::type() const void NodeExecutionMethodMutator::mutate(Graph &g) { // Convolution Layer - set_default_on_invalid_method(g, NodeType::ConvolutionLayer, [](INode * n) - { - ARM_COMPUTE_LOG_GRAPH_INFO("Switched ConvolutionLayer method of node with ID : " - << n->id() << " and Name: " << n->name() << std::endl); - auto *casted_node = arm_compute::utils::cast::polymorphic_downcast(n); - casted_node->set_convolution_method(ConvolutionMethod::Default); - }); + set_default_on_invalid_method(g, NodeType::ConvolutionLayer, + [](INode *n) + { + ARM_COMPUTE_LOG_GRAPH_INFO("Switched ConvolutionLayer method of node with ID : " + << n->id() << " and Name: " << n->name() << std::endl); + auto *casted_node = + arm_compute::utils::cast::polymorphic_downcast(n); + casted_node->set_convolution_method(ConvolutionMethod::Default); + }); // Depthwise Convolution Layer - set_default_on_invalid_method(g, NodeType::DepthwiseConvolutionLayer, [](INode * n) - { - ARM_COMPUTE_LOG_GRAPH_INFO("Switched Depthwise ConvolutionLayer method of node with ID : " - << n->id() << " and Name: " << n->name() << std::endl); - auto *casted_node = arm_compute::utils::cast::polymorphic_downcast(n); - casted_node->set_depthwise_convolution_method(DepthwiseConvolutionMethod::Default); - }); + set_default_on_invalid_method( + g, NodeType::DepthwiseConvolutionLayer, + [](INode *n) + { + ARM_COMPUTE_LOG_GRAPH_INFO("Switched Depthwise ConvolutionLayer method of node with ID : " + << n->id() << " and Name: " << n->name() << std::endl); + auto *casted_node = arm_compute::utils::cast::polymorphic_downcast(n); + casted_node->set_depthwise_convolution_method(DepthwiseConvolutionMethod::Default); + }); } } // namespace graph } // namespace arm_compute diff --git a/src/graph/mutators/NodeFusionMutator.cpp b/src/graph/mutators/NodeFusionMutator.cpp index 8eb3e4cb71bce3baee458f5cd845fcea12a510e4..998a4a05c7b20ca594da81a82e4c6d1d8de99769 100644 --- a/src/graph/mutators/NodeFusionMutator.cpp +++ b/src/graph/mutators/NodeFusionMutator.cpp @@ -24,17 +24,14 @@ #include "arm_compute/graph/mutators/NodeFusionMutator.h" #include "arm_compute/core/utils/DataTypeUtils.h" +#include "arm_compute/graph/backends/BackendRegistry.h" #include "arm_compute/graph/GraphBuilder.h" #include "arm_compute/graph/Logger.h" -#include "arm_compute/graph/Utils.h" -#include "arm_compute/graph/backends/BackendRegistry.h" #include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h" -#include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.h" -#include "arm_compute/graph/nodes/FusedConvolutionWithPostOpNode.h" #include 
"arm_compute/graph/nodes/Nodes.h" +#include "arm_compute/graph/Utils.h" #include "src/graph/mutators/MutatorUtils.h" - #include "support/Cast.h" #include @@ -48,7 +45,7 @@ namespace detail { void transfer_driving_nodes_and_remove_old_node(Graph &g, INode *new_node, INode *old_node, bool add_output_tensor) { - if(new_node == nullptr || old_node == nullptr) + if (new_node == nullptr || old_node == nullptr) { return; } @@ -57,7 +54,7 @@ void transfer_driving_nodes_and_remove_old_node(Graph &g, INode *new_node, INode std::vector last_driving_nodes = get_driving_nodes(*old_node); // Extract last fusable node accessor if any - if(old_node->output(0) == nullptr) + if (old_node->output(0) == nullptr) { return; } @@ -67,10 +64,10 @@ void transfer_driving_nodes_and_remove_old_node(Graph &g, INode *new_node, INode g.remove_node(old_node->id()); // Update fused node outputs - for(auto &driving_node : last_driving_nodes) + for (auto &driving_node : last_driving_nodes) { g.add_connection(new_node->id(), 0, driving_node.node_id, driving_node.index); - if(add_output_tensor) + if (add_output_tensor) { configure_tensor(new_node->output(0)); } @@ -85,19 +82,21 @@ void fuse_convolution_with_batch_normalization(Graph &g, const Edge *output_edge ARM_COMPUTE_ERROR_ON(output_edge == nullptr); auto *conv_node = arm_compute::utils::cast::polymorphic_downcast(output_edge->producer()); - auto *bn_node = arm_compute::utils::cast::polymorphic_downcast(output_edge->consumer()); + auto *bn_node = + arm_compute::utils::cast::polymorphic_downcast(output_edge->consumer()); // Not fusing if number of groups is greater than 1 - if(conv_node->num_groups() > 1) + if (conv_node->num_groups() > 1) { return; } - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing convolution node with ID : " << output_edge->producer_id() - << " with BatchNormalization Layer node with ID : " << output_edge->consumer_id() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing convolution node with ID : " + << output_edge->producer_id() << " with BatchNormalization Layer node with ID : " + << output_edge->consumer_id() << std::endl); // Prevent fusion if fused node has an output accessor - if(conv_node->output(0)->accessor() == nullptr) + if (conv_node->output(0)->accessor() == nullptr) { const Target assigned_target = conv_node->assigned_target(); @@ -117,9 +116,10 @@ void fuse_convolution_with_batch_normalization(Graph &g, const Edge *output_edge const auto epsilon = bn_node->epsilon(); // Create the fused node - const NodeID fused_id = g.add_node(epsilon, conv_info, num_groups, conv_method, fast_math_hint, act_info); + const NodeID fused_id = g.add_node( + epsilon, conv_info, num_groups, conv_method, fast_math_hint, act_info); - if(conv_node->input_edge(2) != nullptr) + if (conv_node->input_edge(2) != nullptr) { auto conv_bias_id = conv_node->input_edge(2)->producer_id(); g.add_connection(conv_bias_id, 0, fused_id, 2); @@ -131,13 +131,13 @@ void fuse_convolution_with_batch_normalization(Graph &g, const Edge *output_edge g.add_connection(bn_mean_id, 0, fused_id, 3); g.add_connection(bn_var_id, 0, fused_id, 4); - if(bn_node->input_edge(3) != nullptr) + if (bn_node->input_edge(3) != nullptr) { const auto bn_beta_id = bn_node->input_edge(3)->producer_id(); g.add_connection(bn_beta_id, 0, fused_id, 5); } - if(bn_node->input_edge(4) != nullptr) + if (bn_node->input_edge(4) != nullptr) { const auto bn_gamma_id = bn_node->input_edge(4)->producer_id(); g.add_connection(bn_gamma_id, 0, fused_id, 6); @@ -149,14 +149,15 @@ void 
fuse_convolution_with_batch_normalization(Graph &g, const Edge *output_edge transfer_driving_nodes_and_remove_old_node(g, fused_node, bn_node, true); fused_node->set_assigned_target(assigned_target); - fused_node->set_common_node_parameters(NodeParams{ conv_node->name() + "+" + bn_node_name, assigned_target }); + fused_node->set_common_node_parameters(NodeParams{conv_node->name() + "+" + bn_node_name, assigned_target}); // Remove convolution node g.remove_node(conv_node->id()); } else { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution with batch normalization due to the presence of an output accessor\n"); + ARM_COMPUTE_LOG_GRAPH_VERBOSE( + "Prevented fusion of convolution with batch normalization due to the presence of an output accessor\n"); } } @@ -164,14 +165,17 @@ void fuse_depthwise_convolution_with_batch_normalization(Graph &g, const Edge *o { ARM_COMPUTE_ERROR_ON(output_edge == nullptr); - auto *depth_conv_node = arm_compute::utils::cast::polymorphic_downcast(output_edge->producer()); - auto *bn_node = arm_compute::utils::cast::polymorphic_downcast(output_edge->consumer()); + auto *depth_conv_node = + arm_compute::utils::cast::polymorphic_downcast(output_edge->producer()); + auto *bn_node = + arm_compute::utils::cast::polymorphic_downcast(output_edge->consumer()); - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing depthwise convolution node with ID : " << output_edge->producer_id() - << " with BatchNormalization Layer node with ID : " << output_edge->consumer_id() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing depthwise convolution node with ID : " + << output_edge->producer_id() << " with BatchNormalization Layer node with ID : " + << output_edge->consumer_id() << std::endl); // Prevent fusion if fused node has an output accessor - if(depth_conv_node->output(0)->accessor() == nullptr) + if (depth_conv_node->output(0)->accessor() == nullptr) { const Target assigned_target = depth_conv_node->assigned_target(); @@ -191,9 +195,10 @@ void fuse_depthwise_convolution_with_batch_normalization(Graph &g, const Edge *o const auto epsilon = bn_node->epsilon(); // Create the fused node - const NodeID fused_id = g.add_node(epsilon, conv_info, depth_multiplier, depth_conv_method, act_info); + const NodeID fused_id = g.add_node( + epsilon, conv_info, depth_multiplier, depth_conv_method, act_info); - if(depth_conv_node->input_edge(2) != nullptr) + if (depth_conv_node->input_edge(2) != nullptr) { const auto conv_bias_id = depth_conv_node->input_edge(2)->producer_id(); g.add_connection(conv_bias_id, 0, fused_id, 2); @@ -213,19 +218,23 @@ void fuse_depthwise_convolution_with_batch_normalization(Graph &g, const Edge *o transfer_driving_nodes_and_remove_old_node(g, fused_node, bn_node, true); fused_node->set_assigned_target(assigned_target); - fused_node->set_common_node_parameters(NodeParams{ depth_conv_node->name() + "+" + bn_node_name, assigned_target }); + fused_node->set_common_node_parameters( + NodeParams{depth_conv_node->name() + "+" + bn_node_name, assigned_target}); // Remove convolution node g.remove_node(depth_conv_node->id()); } else { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of depthwise convolution with batch normalization due to the presence of an output accessor\n"); + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of depthwise convolution with batch normalization due to the " + "presence of an output accessor\n"); } } template -void fuse_node_with_activation(Graph &g, const Edge *output_edge, const std::set &supported_fused_activations) +void 
fuse_node_with_activation(Graph &g, + const Edge *output_edge, + const std::set &supported_fused_activations) { ARM_COMPUTE_ERROR_ON(output_edge == nullptr); @@ -235,22 +244,23 @@ void fuse_node_with_activation(Graph &g, const Edge *output_edge, const std::set ARM_COMPUTE_ERROR_ON(act_node->output(0) == nullptr || n_node->output(0) == nullptr); // Check if activation is supported for fusion - if(supported_fused_activations.count(act_node->activation_info().activation()) == 0) + if (supported_fused_activations.count(act_node->activation_info().activation()) == 0) { return; } // EltwiseLayerNode can only be fused when dataype is float - if(n_node->type() == NodeType::EltwiseLayer && !is_data_type_float(n_node->output(0)->desc().data_type)) + if (n_node->type() == NodeType::EltwiseLayer && !is_data_type_float(n_node->output(0)->desc().data_type)) { return; } ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing node with ID : " << output_edge->producer_id() - << " with Activation Layer node with ID : " << output_edge->consumer_id() << std::endl); + << " with Activation Layer node with ID : " + << output_edge->consumer_id() << std::endl); // Prevent fusion if fused node has an output accessor - if(n_node->output(0)->accessor() == nullptr) + if (n_node->output(0)->accessor() == nullptr) { // Set activation info to fused node n_node->set_fused_activation(act_node->activation_info()); @@ -259,7 +269,8 @@ void fuse_node_with_activation(Graph &g, const Edge *output_edge, const std::set } else { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of node with activation due to the presence of an output accessor\n"); + ARM_COMPUTE_LOG_GRAPH_VERBOSE( + "Prevented fusion of node with activation due to the presence of an output accessor\n"); } } @@ -270,8 +281,8 @@ void fuse_pad_with_convolution(Graph &g, const Edge *output_edge) auto *conv_node = arm_compute::utils::cast::polymorphic_downcast(output_edge->consumer()); const Edge *input_edge = pad_node->input_edge(0); - if(input_edge != nullptr && input_edge->tensor() != nullptr && pad_node->output(0)->accessor() == nullptr - && pad_node->pad_value().get() == 0.0) + if (input_edge != nullptr && input_edge->tensor() != nullptr && pad_node->output(0)->accessor() == nullptr && + pad_node->pad_value().get() == 0.0) { const DataLayout layout = input_edge->tensor()->desc().layout; const PaddingList padding_list = pad_node->padding(); @@ -282,18 +293,14 @@ void fuse_pad_with_convolution(Graph &g, const Edge *output_edge) const PaddingInfo pad_w = width_index < padding_list.size() ? padding_list[width_index] : PaddingInfo(0, 0); const PaddingInfo pad_h = height_index < padding_list.size() ? 
padding_list[height_index] : PaddingInfo(0, 0); - if(is_padding_in_height_or_width(layout, padding_list)) + if (is_padding_in_height_or_width(layout, padding_list)) { // Add paddings to the convolution node const PadStrideInfo conv_info = conv_node->convolution_info(); - const PadStrideInfo new_conv_info( - conv_info.stride().first, - conv_info.stride().second, - conv_info.pad_left() + pad_w.first, - conv_info.pad_right() + pad_w.second, - conv_info.pad_top() + pad_h.first, - conv_info.pad_bottom() + pad_h.second, - conv_info.round()); + const PadStrideInfo new_conv_info(conv_info.stride().first, conv_info.stride().second, + conv_info.pad_left() + pad_w.first, conv_info.pad_right() + pad_w.second, + conv_info.pad_top() + pad_h.first, conv_info.pad_bottom() + pad_h.second, + conv_info.round()); conv_node->set_convolution_info(new_conv_info); // Update drivers of the convolution node @@ -301,7 +308,7 @@ void fuse_pad_with_convolution(Graph &g, const Edge *output_edge) g.remove_node(pad_node->id()); // Update fused node inputs - for(auto &driver_node : pad_driver_nodes) + for (auto &driver_node : pad_driver_nodes) { g.add_connection(driver_node.node_id, driver_node.index, conv_node->id(), 0); } @@ -310,22 +317,23 @@ void fuse_pad_with_convolution(Graph &g, const Edge *output_edge) } template -void fuse_layer(Graph &g, std::function const &prec, const F fuse_fcn, Args &&... optional_arguments) +void fuse_layer(Graph &g, std::function const &prec, const F fuse_fcn, Args &&...optional_arguments) { // Note that fused nodes may be added to the end of the node list. // Instead of only looping over the original list of nodes, we loop over the current node list which could be growing. // This is intentional as it probes the newly added fused nodes for further fusing opportunities. 
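(Aside on the comment above about fused nodes being appended while fusing: a tiny, self-contained illustration of why fuse_layer indexes the node list by position and re-reads its size on every pass, so nodes added during the walk are also probed for further fusion. The std::vector<int> is a hypothetical stand-in for g.nodes(); illustrative only.)

    // Illustrative sketch only - the vector is a hypothetical stand-in for the graph's node list.
    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main()
    {
        std::vector<int> nodes = {1, 2, 3};
        for (std::size_t i = 0; i < nodes.size(); ++i) // size() is re-evaluated each iteration
        {
            if (nodes[i] == 2)
            {
                nodes.push_back(4); // a "fused node" appended mid-loop is still visited later
            }
            std::cout << nodes[i] << '\n'; // prints 1 2 3 4
        }
        return 0;
    }

Indexing by position (rather than a range-for over iterators) also sidesteps the iterator invalidation that push_back can cause when the vector reallocates.
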
- for(unsigned int i = 0; i < g.nodes().size(); ++i) + for (unsigned int i = 0; i < g.nodes().size(); ++i) { auto node = g.node(i); // Check if the node is of type N1 and not a branching node - if(node && node->type() == N1::node_type && node->output_edges().size() == 1) + if (node && node->type() == N1::node_type && node->output_edges().size() == 1) { const auto output_edge_id = *node->output_edges().begin(); const auto output_edge = g.edge(output_edge_id); // Check if following node is a type N2 node - if((output_edge != nullptr) && (output_edge->consumer() != nullptr) && (output_edge->consumer()->type() == N2::node_type) && prec(*output_edge->producer())) + if ((output_edge != nullptr) && (output_edge->consumer() != nullptr) && + (output_edge->consumer()->type() == N2::node_type) && prec(*output_edge->producer())) { fuse_fcn(g, output_edge, optional_arguments...); } @@ -333,458 +341,23 @@ void fuse_layer(Graph &g, std::function const &prec, const F fuse } } -/** Check valid combinations: - * - * | Main operator | Post operators | - * |:--------------|:---------------------------| - * |conv | add | - * |conv | act + add | - * |conv | add + act | - * |conv | act + add + act | - * -*/ -#define MAX_VALIDE_COMBINATION 4 -#define MAX_POST_OP_NUM 3 -NodeType valide_post_op_type[MAX_VALIDE_COMBINATION][MAX_POST_OP_NUM] = { { EltwiseLayerNode::node_type }, - { EltwiseLayerNode::node_type, ActivationLayerNode::node_type }, - { ActivationLayerNode::node_type, EltwiseLayerNode::node_type }, - { ActivationLayerNode::node_type, EltwiseLayerNode::node_type, ActivationLayerNode::node_type } -}; - -bool check_post_op_type(NodeType *post_op_type, int len) -{ - if(len > MAX_POST_OP_NUM || len <= 0) - { - return false; - } - - bool found = false; - for(int i = 0; i < MAX_VALIDE_COMBINATION; ++i) - { - for(int j = 0; j < len; ++j) - { - if(post_op_type[j] != valide_post_op_type[i][j]) - { - found = false; - break; - } - found = true; - } - if(found) - break; - } - - return found; -} - -void fuse_convolution_with_post_op(Graph &g, INode *fused_node, std::list post_op_node_list, int prev_op_dst_pos) -{ - unsigned int op_idx = 0; - // Fuse post operators with conv - for(const auto &post_op : post_op_node_list) - { - switch(post_op->type()) - { - case EltwiseLayerNode::node_type: - { - auto *eltwise_node = arm_compute::utils::cast::polymorphic_downcast(post_op); - ARM_COMPUTE_ERROR_ON(eltwise_node->output(0) == nullptr); - - fused_node->post_op_info_list().push_back(std::make_unique(prev_op_dst_pos, eltwise_node->convert_policy())); - ARM_COMPUTE_LOG_GRAPH_VERBOSE(" with Elementwise Layer node with ID : " << post_op->id()); - break; - } - case ActivationLayerNode::node_type: - { - auto *act_node = arm_compute::utils::cast::polymorphic_downcast(post_op); - ARM_COMPUTE_ERROR_ON(act_node->output(0) == nullptr); - - fused_node->post_op_info_list().push_back(std::make_unique(act_node->activation_info())); - ARM_COMPUTE_LOG_GRAPH_VERBOSE(" with Activation Layer node with ID : " << post_op->id()); - break; - } - default: - { - break; - } - } - - if(op_idx == post_op_node_list.size() - 1) // last fusable node - { - transfer_driving_nodes_and_remove_old_node(g, fused_node, post_op, true); - } - else - { - // Remove node - g.remove_node(post_op->id()); - } - op_idx++; - } -} - -std::list get_post_op_list(Graph &g, int &eltwise_operand_id, int &prev_op_dst_pos, unsigned int conv_node_id, const std::set &supported_fused_activations) -{ - std::list post_op_node_list = {}; - NodeID prev_op_dst_id = conv_node_id; - NodeType 
post_op_type_list[3] = { NodeType::Dummy, NodeType::Dummy, NodeType::Dummy }; - int post_op_idx = 0; - - // Get list of the connected nodes - auto current_node = g.node(conv_node_id); - - while(post_op_node_list.size() < 3) - { - // This convolution node must have only one output edge, otherwise this function would not have been called - - auto current_output_edge_id = current_node->output_edges().begin(); - auto current_output_edge = g.edge(*current_output_edge_id); - auto post_op_node = current_output_edge->consumer(); - - bool fusable_post_op = false; - if(post_op_node != nullptr && post_op_node->output_edges().size() > 0) - { - switch(post_op_node->type()) - { - case EltwiseLayerNode::node_type: - { - auto *eltwise_node = arm_compute::utils::cast::polymorphic_downcast(post_op_node); - ARM_COMPUTE_ERROR_ON(eltwise_node->output(0) == nullptr); - if(eltwise_node->output(0)->accessor() == nullptr) - { - post_op_node_list.push_back(post_op_node); - fusable_post_op = true; - post_op_type_list[post_op_idx++] = eltwise_node->type(); - - // Extract elementwise inputs - const auto eltwise_input_id_0 = eltwise_node->input_edge(0)->producer_id(); - const auto eltwise_input_id_1 = eltwise_node->input_edge(1)->producer_id(); - if(eltwise_input_id_0 == prev_op_dst_id) - { - eltwise_operand_id = eltwise_input_id_1; - prev_op_dst_pos = 0; - } - else if(eltwise_input_id_1 == prev_op_dst_id) - { - eltwise_operand_id = eltwise_input_id_0; - prev_op_dst_pos = 1; - } - } - else - { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with elementwise due to the presence of an output accessor\n"); - } - break; - } - case ActivationLayerNode::node_type: - { - auto *act_node = arm_compute::utils::cast::polymorphic_downcast(post_op_node); - ARM_COMPUTE_ERROR_ON(act_node->output(0) == nullptr); - // Check if activation is supported for fusion - if(supported_fused_activations.count(act_node->activation_info().activation()) == 0) - { - break; - } - if(act_node->output(0)->accessor() == nullptr) - { - post_op_node_list.push_back(post_op_node); - fusable_post_op = true; - post_op_type_list[post_op_idx++] = act_node->type(); - prev_op_dst_id = act_node->id(); - } - else - { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to the presence of an output accessor\n"); - } - break; - } - default: - { - break; - } - } - - // Check if the node is not a branching node and current node is fusable - if(post_op_node->output_edges().size() == 1 && fusable_post_op == true) - { - current_node = post_op_node; - } - else - { - break; - } - } - } - - // Check whether it's valid post op list - if(post_op_node_list.size() > 0) - { - bool fuse_with_post_op = check_post_op_type(post_op_type_list, post_op_node_list.size()); - if(!fuse_with_post_op) - { - post_op_node_list.clear(); - } - } - - return post_op_node_list; -} - -/** Fuse below operators: - * - * | Main operator | Post operators | - * |:--------------|:---------------------------| - * |conv | add | - * |conv | act + add | - * |conv | add + act | - * |conv | act + add + act | - * - * Notes: currently, only GEMM supports fusion with post operator -*/ -void fuse_convolution_with_post_ops(Graph &g, const Edge *output_edge, unsigned int conv_node_id, const std::set &supported_fused_activations) -{ - ARM_COMPUTE_ERROR_ON(output_edge == nullptr); - - auto *conv_node = arm_compute::utils::cast::polymorphic_downcast(output_edge->producer()); - ARM_COMPUTE_ERROR_ON(conv_node->output(0) == nullptr); - - const ConvolutionMethod 
conv_algorithm = conv_node->convolution_method(); - if(conv_algorithm != ConvolutionMethod::GEMM) - { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to non GEMM convolution\n"); - return; - } - - // Prevent fusion if fused node has an output accessor - if(conv_node->output(0)->accessor() == nullptr) - { - // If data type is FP32/FP16, data layout is NHWC, and filter size is 1x1, fuse convolution with post op, as Conv1x1 always leads to GEMM. - const Edge *input_edge = conv_node->input_edge(1); - if(input_edge != nullptr && input_edge->tensor() != nullptr) - { - const DataLayout data_layout = input_edge->tensor()->desc().layout; - const DataType data_type = input_edge->tensor()->desc().data_type; - const TensorShape tensor_shape = input_edge->tensor()->desc().shape; - if((data_layout != DataLayout::NHWC) || (is_data_type_float(data_type) == false) || (tensor_shape.y() != 1) || (tensor_shape.z() != 1)) - { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to non GEMM convolution\n"); - return; - } - } - else - { - return; - } - - // Get post op list - int eltwise_operand_id = 0; - int prev_op_dst_pos = 0; // Previous operator dst's postion in current operator - std::list post_op_node_list = get_post_op_list(g, eltwise_operand_id, prev_op_dst_pos, conv_node_id, supported_fused_activations); - - if(post_op_node_list.size() == 0) - { - return; - } - else // Do convolution fusion with post op if there're one(elementwise), two or more operators - { - const Target assigned_target = conv_node->assigned_target(); - - // Extract conv inputs - const auto conv_input_id = conv_node->input_edge(0)->producer_id(); - const auto conv_weights_id = conv_node->input_edge(1)->producer_id(); - const auto conv_info = conv_node->convolution_info(); - const auto conv_method = conv_node->convolution_method(); - const auto num_groups = conv_node->num_groups(); - FastMathHint fast_math_hint = conv_node->fast_math_hint(); - - // Create the fused node - const NodeID fused_id = g.add_node(conv_info, num_groups, conv_method, fast_math_hint); - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing convolution node with ID : " << conv_node->id()); - - // Add connections from the conv inputs to the fused node - g.add_connection(conv_input_id, 0, fused_id, 0); - g.add_connection(conv_weights_id, 0, fused_id, 1); - if(conv_node->input_edge(2) != nullptr) - { - auto conv_bias_id = conv_node->input_edge(2)->producer_id(); - g.add_connection(conv_bias_id, 0, fused_id, 2); - } - // Adding the Element wise operand in case the post op is element wise operation - auto it = std::find_if(post_op_node_list.begin(), - post_op_node_list.end(), - [&](const INode * nd) - { - return (nd->type() == graph::NodeType::EltwiseLayer); - }); - - if(it != post_op_node_list.end()) - { - g.add_connection(eltwise_operand_id, 0, fused_id, 3); - } - g.remove_node(conv_node->id()); - - // Update fused node outputs - auto fused_node = g.node(fused_id); - fused_node->set_assigned_target(assigned_target); - - // Fuse convolution with post op - fuse_convolution_with_post_op(g, fused_node, post_op_node_list, prev_op_dst_pos); - - post_op_node_list.clear(); - ARM_COMPUTE_LOG_GRAPH_VERBOSE(std::endl); - } - } - else - { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to the presence of an output accessor\n"); - } -} - -void fuse_convolution_batch_normalization_with_post_ops(Graph &g, const Edge *output_edge, unsigned int conv_node_id, const std::set 
&supported_fused_activations) -{ - ARM_COMPUTE_ERROR_ON(output_edge == nullptr); - - auto *conv_node = arm_compute::utils::cast::polymorphic_downcast(output_edge->producer()); - ARM_COMPUTE_ERROR_ON(conv_node->output(0) == nullptr); - const ConvolutionMethod conv_algorithm = conv_node->convolution_method(); - if(conv_algorithm != ConvolutionMethod::GEMM) - { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to non GEMM convolution\n"); - return; - } - - // Prevent fusion if fused node has an output accessor - if(conv_node->output(0)->accessor() == nullptr) - { - // If data type is FP32/FP16, data layout is NHWC, and filter size is 1x1, fuse convolution with post op, as Conv1x1 always leads to GEMM. - const Edge *input_edge = conv_node->input_edge(1); - if(input_edge != nullptr && input_edge->tensor() != nullptr) - { - const DataLayout data_layout = input_edge->tensor()->desc().layout; - const DataType data_type = input_edge->tensor()->desc().data_type; - const TensorShape tensor_shape = input_edge->tensor()->desc().shape; - if((data_layout != DataLayout::NHWC) || (is_data_type_float(data_type) == false) || (tensor_shape.y() != 1) || (tensor_shape.z() != 1)) - { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to non GEMM convolution\n"); - return; - } - } - else - { - return; - } - - // Get post op list - int eltwise_operand_id = 0; - int prev_op_dst_pos = 0; // Previous operator dst's postion in current operator - std::list post_op_node_list = get_post_op_list(g, eltwise_operand_id, prev_op_dst_pos, conv_node_id, supported_fused_activations); - - if(post_op_node_list.size() == 0) - { - return; - } - else // Do convolution fusion with post op if there're one(elementwise), two or more operators - { - const Target assigned_target = conv_node->assigned_target(); - - // Extract conv inputs - const auto conv_input_id = conv_node->input_edge(0)->producer_id(); - const auto conv_weights_id = conv_node->input_edge(1)->producer_id(); - const auto bn_mean_id = conv_node->input_edge(3)->producer_id(); - const auto bn_var_id = conv_node->input_edge(4)->producer_id(); - const auto conv_info = conv_node->convolution_info(); - const auto conv_method = conv_node->convolution_method(); - const auto num_groups = conv_node->num_groups(); - FastMathHint fast_math_hint = conv_node->fast_math_hint(); - - // Create the fused node - - const float epsilon = conv_node->epsilon(); - const NodeID fused_id = g.add_node(epsilon, conv_info, num_groups, conv_method, fast_math_hint); - - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing FusedConvolutionBatchNormalization node with ID : " << conv_node->id()); - - // Add connections from the conv inputs to the fused node - g.add_connection(conv_input_id, 0, fused_id, 0); - g.add_connection(conv_weights_id, 0, fused_id, 1); - - if(conv_node->input_edge(2) != nullptr) - { - auto conv_bias_id = conv_node->input_edge(2)->producer_id(); - g.add_connection(conv_bias_id, 0, fused_id, 2); - } - g.add_connection(bn_mean_id, 0, fused_id, 3); - g.add_connection(bn_var_id, 0, fused_id, 4); - - // Move connections of old FusedConvolutionBatchNormalization to the fused node - if(conv_node->input_edge(5) != nullptr) - { - const auto bn_beta_id = conv_node->input_edge(5)->producer_id(); - g.add_connection(bn_beta_id, 0, fused_id, 5); - } - - if(conv_node->input_edge(6) != nullptr) - { - const auto bn_gamma_id = conv_node->input_edge(6)->producer_id(); - g.add_connection(bn_gamma_id, 0, fused_id, 6); - } - - // Adding the 
Element wise operand in case the post op is element wise operation
-            auto it = std::find_if(post_op_node_list.begin(),
-                                   post_op_node_list.end(),
-                                   [&](const INode * nd)
-            {
-                return (nd->type() == graph::NodeType::EltwiseLayer);
-            });
-
-            if(it != post_op_node_list.end())
-            {
-                g.add_connection(eltwise_operand_id, 0, fused_id, 7);
-            }
-
-            // Update fused node outputs
-            auto fused_node = g.node(fused_id);
-            fused_node->set_assigned_target(assigned_target);
-
-            auto conv_node_name = conv_node->name();
-
-            // collect the post ops names
-            std::string post_ops_name = "";
-            for(auto &post_op : post_op_node_list)
-            {
-                post_ops_name += post_op->name();
-            }
-            fused_node->set_common_node_parameters(NodeParams{ conv_node->name() + "+" + post_ops_name, assigned_target });
-
-            // Fuse convolution with post op
-            fuse_convolution_with_post_op(g, fused_node, post_op_node_list, prev_op_dst_pos);
-
-            post_op_node_list.clear();
-            g.remove_node(conv_node->id());
-            ARM_COMPUTE_LOG_GRAPH_VERBOSE(std::endl);
-        }
-    }
-    else
-    {
-        ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to the presence of an output accessor\n");
-    }
-}
-
 template <typename N1, typename N2, typename F, typename... Args>
-void fuse_layer(Graph &g, std::function<bool(INode &)> const &prec, const F fuse_fcn, Args &&... optional_arguments)
+void fuse_layer(Graph &g, std::function<bool(INode &)> const &prec, const F fuse_fcn, Args &&...optional_arguments)
 {
     // Note that fused nodes may be added to the end of the node list.
     // Instead of only looping over the original list of nodes, we loop over the current node list which could be growing.
     // This is intentional as it probes the newly added fused nodes for further fusing opportunities.
-    for(unsigned int i = 0; i < g.nodes().size(); ++i)
+    for (unsigned int i = 0; i < g.nodes().size(); ++i)
     {
         auto node = g.node(i);
         // Check if the node is of type N1 and not a branching node
-        if(node && node->type() == N1::node_type && node->output_edges().size() == 1)
+        if (node && node->type() == N1::node_type && node->output_edges().size() == 1)
         {
             const auto output_edge_id = *node->output_edges().begin();
             const auto output_edge = g.edge(output_edge_id);
             // Check if it's the correct target
-            if((output_edge != nullptr) && (output_edge->consumer() != nullptr) && prec(*output_edge->producer()))
+            if ((output_edge != nullptr) && (output_edge->consumer() != nullptr) && prec(*output_edge->producer()))
             {
                 fuse_fcn(g, output_edge, i, optional_arguments...);
             }
@@ -806,30 +379,24 @@ IGraphMutator::MutationType NodeFusionMutator::type() const
 void NodeFusionMutator::mutate(Graph &g)
 {
     // Supported activations when fusing
-    const std::set<Activation> supported_fused_activations = { Activation::ABS, Activation::BOUNDED_RELU, Activation::ELU,
-                                                               Activation::HARD_SWISH, Activation::IDENTITY, Activation::LEAKY_RELU,
-                                                               Activation::LINEAR, Activation::LOGISTIC, Activation::LU_BOUNDED_RELU,
-                                                               Activation::RELU, Activation::SOFT_RELU, Activation::SQRT,
-                                                               Activation::SQUARE, Activation::TANH
-                                                             };
+    const std::set<Activation> supported_fused_activations = {
+        Activation::ABS, Activation::BOUNDED_RELU, Activation::ELU,
+        Activation::HARD_SWISH, Activation::IDENTITY, Activation::LEAKY_RELU,
+        Activation::LINEAR, Activation::LOGISTIC, Activation::LU_BOUNDED_RELU,
+        Activation::RELU, Activation::SOFT_RELU, Activation::SQRT,
+        Activation::SQUARE, Activation::TANH};
     // Preconditions
-    auto empty_prec = [](INode &)
-    {
-        return true;
-    };
-    auto cl_target_prec = [](INode & n)
-    {
-        return n.assigned_target() == Target::CL;
-    };
-    auto qs8_prec = [&g](INode & n)
+    auto empty_prec = [](INode &) { return true; };
+    auto cl_target_prec = [](INode &n) { return n.assigned_target() == Target::CL; };
+    auto qs8_prec = [&g](INode &n)
     {
         ARM_COMPUTE_ERROR_ON(n.output(0) == nullptr);
         const auto output_edge_id = *n.output_edges().begin();
         const auto output_edge = g.edge(output_edge_id);
         // To perform fusion the two nodes must have same output quantization information
-        const bool same_qinfo     = n.output(0)->desc().quant_info == output_edge->producer()->output(0)->desc().quant_info;
+        const bool same_qinfo = n.output(0)->desc().quant_info == output_edge->producer()->output(0)->desc().quant_info;
         const bool output_qasymm8 = n.output(0)->desc().data_type == DataType::QASYMM8;
         return (output_qasymm8 && same_qinfo) || !output_qasymm8;
@@ -837,21 +404,25 @@ void NodeFusionMutator::mutate(Graph &g)
     // Fusion mutations
-    detail::fuse_layer<PadLayerNode, ConvolutionLayerNode>(g, empty_prec, detail::fuse_pad_with_convolution<ConvolutionLayerNode>);
-    detail::fuse_layer<PadLayerNode, DepthwiseConvolutionLayerNode>(g, empty_prec, detail::fuse_pad_with_convolution<DepthwiseConvolutionLayerNode>);
-    // The fusion of PostOps to ConvolutionLayer:
-    //  It must occur after the fusion of PadLayer into ConvolutionLayer
-    //  It must occur before the fusion of normal ActivationLayer into ConvolutionLayer as it takes precedence
-    detail::fuse_layer<ConvolutionLayerNode, EltwiseLayerNode>(g, cl_target_prec, detail::fuse_convolution_with_post_ops, supported_fused_activations);
-    detail::fuse_layer<BatchNormalizationLayerNode, ActivationLayerNode>(g, empty_prec, detail::fuse_node_with_activation<BatchNormalizationLayerNode>, supported_fused_activations);
-    detail::fuse_layer<ConvolutionLayerNode, ActivationLayerNode>(g, empty_prec, detail::fuse_node_with_activation<ConvolutionLayerNode>, supported_fused_activations);
-    detail::fuse_layer<DepthwiseConvolutionLayerNode, ActivationLayerNode>(g, qs8_prec, detail::fuse_node_with_activation<DepthwiseConvolutionLayerNode>, supported_fused_activations);
-    detail::fuse_layer<FullyConnectedLayerNode, ActivationLayerNode>(g, empty_prec, detail::fuse_node_with_activation<FullyConnectedLayerNode>, supported_fused_activations);
-    detail::fuse_layer<EltwiseLayerNode, ActivationLayerNode>(g, cl_target_prec, detail::fuse_node_with_activation<EltwiseLayerNode>, supported_fused_activations);
+    detail::fuse_layer<PadLayerNode, ConvolutionLayerNode>(g, empty_prec,
+                                                           detail::fuse_pad_with_convolution<ConvolutionLayerNode>);
+    detail::fuse_layer<PadLayerNode, DepthwiseConvolutionLayerNode>(
+        g, empty_prec, detail::fuse_pad_with_convolution<DepthwiseConvolutionLayerNode>);
+    detail::fuse_layer<BatchNormalizationLayerNode, ActivationLayerNode>(
+        g, empty_prec, detail::fuse_node_with_activation<BatchNormalizationLayerNode>, supported_fused_activations);
+    detail::fuse_layer<ConvolutionLayerNode, ActivationLayerNode>(
+        g, empty_prec, detail::fuse_node_with_activation<ConvolutionLayerNode>, supported_fused_activations);
+    detail::fuse_layer<DepthwiseConvolutionLayerNode, ActivationLayerNode>(
+        g, qs8_prec, detail::fuse_node_with_activation<DepthwiseConvolutionLayerNode>, supported_fused_activations);
+    detail::fuse_layer<FullyConnectedLayerNode, ActivationLayerNode>(
+        g, empty_prec, detail::fuse_node_with_activation<FullyConnectedLayerNode>, supported_fused_activations);
+    detail::fuse_layer<EltwiseLayerNode, ActivationLayerNode>(
+        g, cl_target_prec, detail::fuse_node_with_activation<EltwiseLayerNode>, supported_fused_activations);
     // The fusion of BatchNormalizationLayer must occur after the fusion of ActivationLayer.
Because FusedConvolutionBatchNormalizationNode assumes the BatchNormalization is already fused with activation, if any - detail::fuse_layer(g, empty_prec, detail::fuse_convolution_with_batch_normalization); - detail::fuse_layer(g, empty_prec, detail::fuse_depthwise_convolution_with_batch_normalization); - detail::fuse_layer(g, cl_target_prec, detail::fuse_convolution_batch_normalization_with_post_ops, supported_fused_activations); + detail::fuse_layer( + g, empty_prec, detail::fuse_convolution_with_batch_normalization); + detail::fuse_layer( + g, empty_prec, detail::fuse_depthwise_convolution_with_batch_normalization); } } // namespace graph } // namespace arm_compute diff --git a/src/graph/mutators/SplitLayerSubTensorMutator.cpp b/src/graph/mutators/SplitLayerSubTensorMutator.cpp index 2c28a1a2d129ba82cdb838b728d87f34405898fd..533f8944cf4d9185c2978c06cc9437952bbd9974 100644 --- a/src/graph/mutators/SplitLayerSubTensorMutator.cpp +++ b/src/graph/mutators/SplitLayerSubTensorMutator.cpp @@ -23,12 +23,12 @@ */ #include "arm_compute/graph/mutators/SplitLayerSubTensorMutator.h" -#include "arm_compute/graph/Graph.h" -#include "arm_compute/graph/Logger.h" -#include "arm_compute/graph/Utils.h" #include "arm_compute/graph/algorithms/TopologicalSort.h" #include "arm_compute/graph/backends/BackendRegistry.h" +#include "arm_compute/graph/Graph.h" +#include "arm_compute/graph/Logger.h" #include "arm_compute/graph/nodes/SplitLayerNode.h" +#include "arm_compute/graph/Utils.h" #include "support/Cast.h" #include "support/Iterable.h" @@ -50,7 +50,7 @@ IGraphMutator::MutationType SplitLayerSubTensorMutator::type() const void SplitLayerSubTensorMutator::mutate(Graph &g) { // Early exit if no Split layers exist in graph - if(g.nodes(NodeType::SplitLayer).empty()) + if (g.nodes(NodeType::SplitLayer).empty()) { return; } @@ -59,23 +59,23 @@ void SplitLayerSubTensorMutator::mutate(Graph &g) std::vector topological_sorted_node_ids = dfs(g); // Should be in reverse order of execution - for(auto &node_id : arm_compute::utils::iterable::reverse_iterate(topological_sorted_node_ids)) + for (auto &node_id : arm_compute::utils::iterable::reverse_iterate(topological_sorted_node_ids)) { INode *node = g.node(node_id); - if(node != nullptr && node->type() == NodeType::SplitLayer && node->input(0) != nullptr) + if (node != nullptr && node->type() == NodeType::SplitLayer && node->input(0) != nullptr) { // Get output tensor Tensor *input_tensor = node->input(0); // Check that all tensor have the same target and are valid bool is_valid = std::all_of(node->outputs().cbegin(), node->outputs().cend(), - [&](const TensorID & tid) - { - return (g.tensor(tid) != nullptr) && (g.tensor(tid)->desc().target == input_tensor->desc().target); - }); + [&](const TensorID &tid) { + return (g.tensor(tid) != nullptr) && + (g.tensor(tid)->desc().target == input_tensor->desc().target); + }); // Create subtensors - if(is_valid && is_target_supported(input_tensor->desc().target)) + if (is_valid && is_target_supported(input_tensor->desc().target)) { ARM_COMPUTE_LOG_GRAPH_VERBOSE("Using sub-tensors for the node with ID : " << node->id() << " and name : " << node->name() << std::endl); @@ -87,15 +87,18 @@ void SplitLayerSubTensorMutator::mutate(Graph &g) const bool extend_parent = (axis < 2); // Create sub-tensor handles - for(unsigned int i = 0; i < node->outputs().size(); ++i) + for (unsigned int i = 0; i < node->outputs().size(); ++i) { Tensor *output_tensor = node->output(i); const TensorShape output_shape = output_tensor->desc().shape; Coordinates 
coords; - std::tie(std::ignore, coords) = split_node->compute_output_descriptor(input_tensor->desc(), num_splits, axis, i); + std::tie(std::ignore, coords) = + split_node->compute_output_descriptor(input_tensor->desc(), num_splits, axis, i); - backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(output_tensor->desc().target); - std::unique_ptr handle = backend.create_subtensor(input_tensor->handle(), output_shape, coords, extend_parent); + backends::IDeviceBackend &backend = + backends::BackendRegistry::get().get_backend(output_tensor->desc().target); + std::unique_ptr handle = + backend.create_subtensor(input_tensor->handle(), output_shape, coords, extend_parent); output_tensor->set_handle(std::move(handle)); } } diff --git a/src/graph/mutators/SyntheticDataTypeMutator.cpp b/src/graph/mutators/SyntheticDataTypeMutator.cpp index 74d040b81d837213e7acf4c3612cb38246e3a334..3dc2480e8525504b3400f9907ec4dced4fafbba4 100644 --- a/src/graph/mutators/SyntheticDataTypeMutator.cpp +++ b/src/graph/mutators/SyntheticDataTypeMutator.cpp @@ -26,8 +26,8 @@ #include "arm_compute/graph/GraphBuilder.h" #include "arm_compute/graph/ITensorAccessor.h" #include "arm_compute/graph/Logger.h" -#include "arm_compute/graph/Utils.h" #include "arm_compute/graph/nodes/Nodes.h" +#include "arm_compute/graph/Utils.h" #include "support/Cast.h" @@ -62,14 +62,12 @@ public: */ bool is_mutation_supported(Graph &g) { - const std::set unsupported_node_types = { NodeType::DetectionOutputLayer, - NodeType::NormalizationLayer, - NodeType::PriorBoxLayer - }; + const std::set unsupported_node_types = {NodeType::DetectionOutputLayer, NodeType::NormalizationLayer, + NodeType::PriorBoxLayer}; - for(const auto &utype : unsupported_node_types) + for (const auto &utype : unsupported_node_types) { - if(!g.nodes(utype).empty()) + if (!g.nodes(utype).empty()) { return false; } @@ -83,12 +81,12 @@ bool is_mutation_supported(Graph &g) */ void remove_optimized_nodes(Graph &g) { - const std::set optimized_node_types = { NodeType::BatchNormalizationLayer }; + const std::set optimized_node_types = {NodeType::BatchNormalizationLayer}; - for(const auto &opt_type : optimized_node_types) + for (const auto &opt_type : optimized_node_types) { const std::vector opt_nodes_ids = g.nodes(opt_type); - for(const auto &node_id : opt_nodes_ids) + for (const auto &node_id : opt_nodes_ids) { INode *node = g.node(node_id); @@ -108,7 +106,7 @@ void remove_optimized_nodes(Graph &g) g.remove_node(node->id()); // Update connections - for(auto &driving_node : driving_nodes) + for (auto &driving_node : driving_nodes) { g.add_connection(producer->id(), producer_edge_id, driving_node.node_id, driving_node.index); } @@ -123,11 +121,11 @@ void remove_optimized_nodes(Graph &g) void convert_tensors(Graph &g, DataType data_type) { auto &tensors = g.tensors(); - for(auto &tensor : tensors) + for (auto &tensor : tensors) { - if(tensor != nullptr) + if (tensor != nullptr) { - switch(data_type) + switch (data_type) { case DataType::QASYMM8: case DataType::QASYMM8_SIGNED: @@ -156,7 +154,7 @@ template void convert_special_node(Graph &g, std::function const &f) { const std::vector nodes_ids = g.nodes(NT::node_type); - for(const auto &nodes_id : nodes_ids) + for (const auto &nodes_id : nodes_ids) { INode *node = arm_compute::utils::cast::polymorphic_downcast(g.node(nodes_id)); ARM_COMPUTE_ERROR_ON(node == nullptr); @@ -174,41 +172,41 @@ void convert_special_node(Graph &g, std::function const */ void convert_special_tensors(Graph &g) { - auto softmax_func = 
[](INode * node, Tensor * tensor) + auto softmax_func = [](INode *node, Tensor *tensor) { ARM_COMPUTE_UNUSED(node); - if(tensor->desc().data_type == DataType::QASYMM8) + if (tensor->desc().data_type == DataType::QASYMM8) { tensor->desc().quant_info = QuantizationInfo(1.f / 256.f, 0); } - else if(tensor->desc().data_type == DataType::QASYMM8_SIGNED) + else if (tensor->desc().data_type == DataType::QASYMM8_SIGNED) { tensor->desc().quant_info = QuantizationInfo(1.f / 256.f, -128); } return true; }; - auto act_func = [](INode * node, Tensor * tensor) + auto act_func = [](INode *node, Tensor *tensor) { auto *act_node = arm_compute::utils::cast::polymorphic_downcast(node); - if(tensor->desc().data_type == DataType::QASYMM8) + if (tensor->desc().data_type == DataType::QASYMM8) { - if(act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::TANH) + if (act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::TANH) { tensor->desc().quant_info = QuantizationInfo(1.f / 128.f, 128); } - else if(act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC) + else if (act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC) { tensor->desc().quant_info = QuantizationInfo(1.f / 256.f, 0); } } - else if(tensor->desc().data_type == DataType::QASYMM8_SIGNED) + else if (tensor->desc().data_type == DataType::QASYMM8_SIGNED) { - if(act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::TANH) + if (act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::TANH) { tensor->desc().quant_info = QuantizationInfo(1.f / 128.f, 0); } - else if(act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC) + else if (act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC) { tensor->desc().quant_info = QuantizationInfo(1.f / 256.f, -128); } @@ -228,22 +226,19 @@ void convert_special_tensors(Graph &g) */ void handle_nodes_with_bias(Graph &g) { - const std::set special_node_types = { NodeType::ConvolutionLayer, - NodeType::DeconvolutionLayer, - NodeType::DepthwiseConvolutionLayer, - NodeType::FullyConnectedLayer - }; + const std::set special_node_types = {NodeType::ConvolutionLayer, NodeType::DeconvolutionLayer, + NodeType::DepthwiseConvolutionLayer, NodeType::FullyConnectedLayer}; - for(const auto &spc_type : special_node_types) + for (const auto &spc_type : special_node_types) { const std::vector scp_nodes_ids = g.nodes(spc_type); - for(const auto &node_id : scp_nodes_ids) + for (const auto &node_id : scp_nodes_ids) { INode *node = g.node(node_id); - if(node != nullptr) + if (node != nullptr) { Tensor *tensor = node->input(2); - if(tensor != nullptr) + if (tensor != nullptr) { tensor->desc().data_type = DataType::S32; } @@ -253,8 +248,8 @@ void handle_nodes_with_bias(Graph &g) params.name = params.name.empty() ? 
"" : params.name + "Bias"; TensorDescriptor b_desc = node->input(1)->desc(); - auto depth = b_desc.shape[get_dimension_idx(b_desc.layout, DataLayoutDimension::BATCHES)]; - b_desc.shape = TensorShape(depth); + auto depth = b_desc.shape[get_dimension_idx(b_desc.layout, DataLayoutDimension::BATCHES)]; + b_desc.shape = TensorShape(depth); auto accessor = std::make_unique(); auto b_nid = GraphBuilder::add_const_node(g, params, b_desc, std::move(accessor)); @@ -266,8 +261,7 @@ void handle_nodes_with_bias(Graph &g) } } // namespace -SyntheticDataTypeMutator::SyntheticDataTypeMutator(DataType mutate_type) - : _mutate_type{ mutate_type } +SyntheticDataTypeMutator::SyntheticDataTypeMutator(DataType mutate_type) : _mutate_type{mutate_type} { } @@ -283,7 +277,7 @@ IGraphMutator::MutationType SyntheticDataTypeMutator::type() const void SyntheticDataTypeMutator::mutate(Graph &g) { - if(is_mutation_supported(g)) + if (is_mutation_supported(g)) { // Remove nodes that get optimized out (e.g. BatchNorm) remove_optimized_nodes(g); diff --git a/src/graph/nodes/ActivationLayerNode.cpp b/src/graph/nodes/ActivationLayerNode.cpp index cf65d83a5e46f11a18a49786d6c5526348b3286d..1773afcb16d9fcfcec224e4c52c82fba8391f4cc 100644 --- a/src/graph/nodes/ActivationLayerNode.cpp +++ b/src/graph/nodes/ActivationLayerNode.cpp @@ -44,7 +44,7 @@ ActivationLayerInfo ActivationLayerNode::activation_info() const bool ActivationLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -63,7 +63,7 @@ TensorDescriptor ActivationLayerNode::configure_output(size_t idx) const ARM_COMPUTE_ERROR_ON(src == nullptr); TensorDescriptor output_info = src->desc(); - if(!_out_quant_info.empty()) + if (!_out_quant_info.empty()) { output_info.quant_info = _out_quant_info; } diff --git a/src/graph/nodes/ArgMinMaxLayerNode.cpp b/src/graph/nodes/ArgMinMaxLayerNode.cpp index 63163b9e2c50489ed8c577e32fb653623a3057d7..5adebc950a167c7000c69afc8269ffd101942c00 100644 --- a/src/graph/nodes/ArgMinMaxLayerNode.cpp +++ b/src/graph/nodes/ArgMinMaxLayerNode.cpp @@ -23,16 +23,18 @@ */ #include "arm_compute/graph/nodes/ArgMinMaxLayerNode.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/INodeVisitor.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" - namespace arm_compute { namespace graph { -ArgMinMaxLayerNode::ArgMinMaxLayerNode(ReductionOperation op, unsigned int axis, DataType out_data_type, QuantizationInfo out_quant_info) +ArgMinMaxLayerNode::ArgMinMaxLayerNode(ReductionOperation op, + unsigned int axis, + DataType out_data_type, + QuantizationInfo out_quant_info) : _op(op), _axis(axis), _out_data_type(out_data_type), _out_quant_info(std::move(out_quant_info)) { _input_edges.resize(1, EmptyEdgeID); @@ -56,7 +58,7 @@ DataType ArgMinMaxLayerNode::out_data_type() const bool ArgMinMaxLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -75,17 +77,18 @@ TensorDescriptor ArgMinMaxLayerNode::configure_output(size_t idx) const ARM_COMPUTE_ERROR_ON(src == nullptr); TensorDescriptor output_info = src->desc(); - if(!_out_quant_info.empty()) + if (!_out_quant_info.empty()) { output_info.quant_info = 
_out_quant_info; } - if(_out_data_type != DataType::UNKNOWN) + if (_out_data_type != DataType::UNKNOWN) { output_info.data_type = _out_data_type; } - TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(output_info.shape, _axis, false); + TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(output_info.shape, _axis, false); output_info.set_shape(output_shape); return output_info; diff --git a/src/graph/nodes/BatchNormalizationLayerNode.cpp b/src/graph/nodes/BatchNormalizationLayerNode.cpp index ceca0e2715ae5a9bdd190bf4a8b4c8a4590b8937..c317123e8d5c366659a46e51cf60d394866f6966 100644 --- a/src/graph/nodes/BatchNormalizationLayerNode.cpp +++ b/src/graph/nodes/BatchNormalizationLayerNode.cpp @@ -55,7 +55,7 @@ void BatchNormalizationLayerNode::set_fused_activation(ActivationLayerInfo fused bool BatchNormalizationLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -86,4 +86,4 @@ void BatchNormalizationLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/BoundingBoxTransformLayerNode.cpp b/src/graph/nodes/BoundingBoxTransformLayerNode.cpp index f3f4f91075ae1f2a96685151eb9ef026f385ad94..8e521746396a1ebdd774e770ec76ceb22514fa93 100644 --- a/src/graph/nodes/BoundingBoxTransformLayerNode.cpp +++ b/src/graph/nodes/BoundingBoxTransformLayerNode.cpp @@ -23,17 +23,15 @@ */ #include "arm_compute/graph/nodes/BoundingBoxTransformLayerNode.h" +#include "arm_compute/core/Helpers.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/INodeVisitor.h" -#include "arm_compute/core/Helpers.h" - namespace arm_compute { namespace graph { -BoundingBoxTransformLayerNode::BoundingBoxTransformLayerNode(BoundingBoxTransformInfo &info) - : _bbox_info(info) +BoundingBoxTransformLayerNode::BoundingBoxTransformLayerNode(BoundingBoxTransformInfo &info) : _bbox_info(info) { _input_edges.resize(2, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -46,7 +44,7 @@ const BoundingBoxTransformInfo &BoundingBoxTransformLayerNode::info() const bool BoundingBoxTransformLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); diff --git a/src/graph/nodes/ChannelShuffleLayerNode.cpp b/src/graph/nodes/ChannelShuffleLayerNode.cpp index 5102e4b6da9b0faff9a672a5657419bc110fcb2d..3cb9e23eca3cdc644980eb7f253224fddb147b73 100644 --- a/src/graph/nodes/ChannelShuffleLayerNode.cpp +++ b/src/graph/nodes/ChannelShuffleLayerNode.cpp @@ -30,8 +30,7 @@ namespace arm_compute { namespace graph { -ChannelShuffleLayerNode::ChannelShuffleLayerNode(unsigned int num_groups) - : _num_groups(num_groups) +ChannelShuffleLayerNode::ChannelShuffleLayerNode(unsigned int num_groups) : _num_groups(num_groups) { _input_edges.resize(1, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -44,7 +43,7 @@ unsigned int ChannelShuffleLayerNode::num_groups() const bool ChannelShuffleLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && 
(output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -75,4 +74,4 @@ void ChannelShuffleLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/ConcatenateLayerNode.cpp b/src/graph/nodes/ConcatenateLayerNode.cpp index 3f3c70f3bb50cc023892f7060e2f3055ad3f47f2..8e5393a5e49e85f9c4db5a82284c93424c46b2ad 100644 --- a/src/graph/nodes/ConcatenateLayerNode.cpp +++ b/src/graph/nodes/ConcatenateLayerNode.cpp @@ -24,17 +24,17 @@ #include "arm_compute/graph/nodes/ConcatenateLayerNode.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/INodeVisitor.h" #include "arm_compute/graph/Utils.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" - namespace arm_compute { namespace graph { -ConcatenateLayerNode::ConcatenateLayerNode(unsigned int total_nodes, descriptors::ConcatLayerDescriptor concat_descriptor) +ConcatenateLayerNode::ConcatenateLayerNode(unsigned int total_nodes, + descriptors::ConcatLayerDescriptor concat_descriptor) : _total_nodes(total_nodes), _concat_descriptor(std::move(concat_descriptor)), _is_enabled(true) { _input_edges.resize(_total_nodes, EmptyEdgeID); @@ -73,7 +73,7 @@ TensorDescriptor ConcatenateLayerNode::compute_output_descriptor(const std::vect // Extract shapes std::vector shapes; shapes.reserve(input_descriptors.size()); - for(auto &input_descriptor : input_descriptors) + for (auto &input_descriptor : input_descriptors) { shapes.emplace_back(&input_descriptor.shape); } @@ -85,7 +85,7 @@ TensorDescriptor ConcatenateLayerNode::compute_output_descriptor(const std::vect bool ConcatenateLayerNode::forward_descriptors() { - if(_outputs[0] != NullTensorID) + if (_outputs[0] != NullTensorID) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -101,24 +101,22 @@ TensorDescriptor ConcatenateLayerNode::configure_output(size_t idx) const ARM_COMPUTE_ERROR_ON(idx >= _outputs.size()); // Check if all input tensors are set - bool are_all_inputs_set = std::all_of(std::begin(_input_edges), std::end(_input_edges), [](const EdgeID & eid) - { - return eid != EmptyEdgeID; - }); + bool are_all_inputs_set = std::all_of(std::begin(_input_edges), std::end(_input_edges), + [](const EdgeID &eid) { return eid != EmptyEdgeID; }); TensorDescriptor output_info = {}; - if(are_all_inputs_set) + if (are_all_inputs_set) { std::vector inputs_descriptors; - for(unsigned int i = 0; i < _input_edges.size(); ++i) + for (unsigned int i = 0; i < _input_edges.size(); ++i) { const Tensor *t = _graph->tensor(input_id(i)); ARM_COMPUTE_ERROR_ON(t == nullptr); inputs_descriptors.push_back(t->desc()); } output_info = compute_output_descriptor(inputs_descriptors, _concat_descriptor.axis); - if(!_concat_descriptor.output_qinfo.empty()) + if (!_concat_descriptor.output_qinfo.empty()) { output_info.quant_info = _concat_descriptor.output_qinfo; } diff --git a/src/graph/nodes/ConstNode.cpp b/src/graph/nodes/ConstNode.cpp index eb96d638880ab9c27420f437c7c389549d67a399..6e8fbff71a7ba9b2771db8235dd901909925e5b7 100644 --- a/src/graph/nodes/ConstNode.cpp +++ b/src/graph/nodes/ConstNode.cpp @@ -30,15 +30,14 @@ namespace arm_compute { namespace graph { -ConstNode::ConstNode(TensorDescriptor desc) - : _desc(std::move(desc)) +ConstNode::ConstNode(TensorDescriptor desc) : _desc(std::move(desc)) { _outputs.resize(1, NullTensorID); } bool 
ConstNode::forward_descriptors() { - if(output_id(0) != NullTensorID) + if (output_id(0) != NullTensorID) { Tensor *t = output(0); ARM_COMPUTE_ERROR_ON(t == nullptr); diff --git a/src/graph/nodes/ConvolutionLayerNode.cpp b/src/graph/nodes/ConvolutionLayerNode.cpp index ee9dde91d581d4b5d355b0d1b30cf167ac548716..f0263fc84a00f9218fdbfd352af1a34ae7bfd27b 100644 --- a/src/graph/nodes/ConvolutionLayerNode.cpp +++ b/src/graph/nodes/ConvolutionLayerNode.cpp @@ -37,7 +37,12 @@ ConvolutionLayerNode::ConvolutionLayerNode(PadStrideInfo info, ConvolutionMethod method, FastMathHint fast_math_hint, QuantizationInfo out_quant_info) - : _info(std::move(info)), _num_groups(num_groups), _method(method), _fast_math_hint(fast_math_hint), _out_quant_info(std::move(out_quant_info)), _fused_activation() + : _info(std::move(info)), + _num_groups(num_groups), + _method(method), + _fast_math_hint(fast_math_hint), + _out_quant_info(std::move(out_quant_info)), + _fused_activation() { _input_edges.resize(3, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -100,20 +105,22 @@ TensorDescriptor ConvolutionLayerNode::compute_output_descriptor(const TensorDes const unsigned int kernel_width = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH); const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT); - std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info); + std::tie(output_width, output_height) = + scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info); const DataLayout data_layout = input_descriptor.layout; TensorDescriptor output_descriptor = input_descriptor; output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width); output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height); - output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]); + output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), + weights_descriptor.shape[3]); return output_descriptor; } bool ConvolutionLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -132,7 +139,7 @@ TensorDescriptor ConvolutionLayerNode::configure_output(size_t idx) const ARM_COMPUTE_ERROR_ON(src == nullptr || weights == nullptr); TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), _info); - if(!_out_quant_info.empty()) + if (!_out_quant_info.empty()) { output_info.quant_info = _out_quant_info; } diff --git a/src/graph/nodes/DeconvolutionLayerNode.cpp b/src/graph/nodes/DeconvolutionLayerNode.cpp index 3542d5ad10b79a677355e04881e0eecbf953460a..2058ab21e58c02b73a7f4b98e4c37d403ea3c519 100644 --- a/src/graph/nodes/DeconvolutionLayerNode.cpp +++ b/src/graph/nodes/DeconvolutionLayerNode.cpp @@ -56,20 +56,22 @@ TensorDescriptor DeconvolutionLayerNode::compute_output_descriptor(const TensorD const unsigned int kernel_width = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH); const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT); - std::tie(output_width, output_height) = 
deconvolution_output_dimensions(input_width, input_height, kernel_width, kernel_height, info); + std::tie(output_width, output_height) = + deconvolution_output_dimensions(input_width, input_height, kernel_width, kernel_height, info); const DataLayout data_layout = input_descriptor.layout; TensorDescriptor output_descriptor = input_descriptor; output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width); output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height); - output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]); + output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), + weights_descriptor.shape[3]); return output_descriptor; } bool DeconvolutionLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -89,7 +91,7 @@ TensorDescriptor DeconvolutionLayerNode::configure_output(size_t idx) const TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), descriptor.info); - if(!descriptor.out_quant_info.empty()) + if (!descriptor.out_quant_info.empty()) { output_info.set_quantization_info(descriptor.out_quant_info); } diff --git a/src/graph/nodes/DepthToSpaceLayerNode.cpp b/src/graph/nodes/DepthToSpaceLayerNode.cpp index b70ac56a07d924282eab502d97c6a00ea5d6d0bb..0b914a0e569378885ea43c4e26d740d31b089c64 100644 --- a/src/graph/nodes/DepthToSpaceLayerNode.cpp +++ b/src/graph/nodes/DepthToSpaceLayerNode.cpp @@ -32,8 +32,7 @@ namespace arm_compute { namespace graph { -DepthToSpaceLayerNode::DepthToSpaceLayerNode(int block_shape) - : _block_shape(block_shape) +DepthToSpaceLayerNode::DepthToSpaceLayerNode(int block_shape) : _block_shape(block_shape) { _input_edges.resize(1, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -44,7 +43,8 @@ int DepthToSpaceLayerNode::block_shape() const return _block_shape; } -TensorDescriptor DepthToSpaceLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor, int block_shape) +TensorDescriptor DepthToSpaceLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor, + int block_shape) { using namespace arm_compute::helpers::tensor_transform; @@ -53,14 +53,15 @@ TensorDescriptor DepthToSpaceLayerNode::compute_output_descriptor(const TensorDe // Set descriptor shape TensorDescriptor output_descriptor = input_descriptor; - output_descriptor.shape = misc::shape_calculator::compute_depth_to_space_shape(input_shape, data_layout, block_shape); + output_descriptor.shape = + misc::shape_calculator::compute_depth_to_space_shape(input_shape, data_layout, block_shape); return output_descriptor; } bool DepthToSpaceLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); diff --git a/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp b/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp index 7de20165cb2701a895ee028b26ef266a99f8f5ff..92d7266088fea9cebcc2eb002880921aba353d18 100644 --- a/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp +++ b/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp @@ -32,9 
+32,15 @@ namespace arm_compute { namespace graph { -DepthwiseConvolutionLayerNode::DepthwiseConvolutionLayerNode(PadStrideInfo info, int depth_multiplier, DepthwiseConvolutionMethod method, - QuantizationInfo out_quant_info) - : _info(std::move(info)), _depth_multiplier(depth_multiplier), _method(method), _out_quant_info(std::move(out_quant_info)), _fused_activation() +DepthwiseConvolutionLayerNode::DepthwiseConvolutionLayerNode(PadStrideInfo info, + int depth_multiplier, + DepthwiseConvolutionMethod method, + QuantizationInfo out_quant_info) + : _info(std::move(info)), + _depth_multiplier(depth_multiplier), + _method(method), + _out_quant_info(std::move(out_quant_info)), + _fused_activation() { _input_edges.resize(3, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -89,20 +95,22 @@ TensorDescriptor DepthwiseConvolutionLayerNode::compute_output_descriptor(const const unsigned int kernel_width = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH); const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT); - std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info); + std::tie(output_width, output_height) = + scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info); const DataLayout data_layout = input_descriptor.layout; TensorDescriptor output_descriptor = input_descriptor; output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width); output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height); - output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), input_channels * depth_multiplier); + output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), + input_channels * depth_multiplier); return output_descriptor; } bool DepthwiseConvolutionLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -121,7 +129,7 @@ TensorDescriptor DepthwiseConvolutionLayerNode::configure_output(size_t idx) con ARM_COMPUTE_ERROR_ON(src == nullptr || weights == nullptr); TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), _info, _depth_multiplier); - if(!_out_quant_info.empty()) + if (!_out_quant_info.empty()) { output_info.quant_info = _out_quant_info; } @@ -139,4 +147,4 @@ void DepthwiseConvolutionLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/DequantizationLayerNode.cpp b/src/graph/nodes/DequantizationLayerNode.cpp index 14c4752f1273446b75a878e9ac6d6df0a79bee91..3ea000852ad2beff7fa1ef9ff8623a3d001a7b6c 100644 --- a/src/graph/nodes/DequantizationLayerNode.cpp +++ b/src/graph/nodes/DequantizationLayerNode.cpp @@ -40,7 +40,7 @@ DequantizationLayerNode::DequantizationLayerNode() bool DequantizationLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -74,4 +74,4 @@ void DequantizationLayerNode::accept(INodeVisitor &v) 
v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/DetectionOutputLayerNode.cpp b/src/graph/nodes/DetectionOutputLayerNode.cpp index fc6f531ee0bf0033dc36a3d32ba1403fa3716bcb..65ddd2f5bc2bbee5579812ef6c7cb756c3d37f43 100644 --- a/src/graph/nodes/DetectionOutputLayerNode.cpp +++ b/src/graph/nodes/DetectionOutputLayerNode.cpp @@ -32,8 +32,7 @@ namespace arm_compute { namespace graph { -DetectionOutputLayerNode::DetectionOutputLayerNode(DetectionOutputLayerInfo detection_info) - : _info(detection_info) +DetectionOutputLayerNode::DetectionOutputLayerNode(DetectionOutputLayerInfo detection_info) : _info(detection_info) { _input_edges.resize(3, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -47,7 +46,8 @@ DetectionOutputLayerInfo DetectionOutputLayerNode::detection_output_info() const TensorDescriptor DetectionOutputLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor, const DetectionOutputLayerInfo &info) { - const unsigned int max_size = info.keep_top_k() * ((input_descriptor.shape.num_dimensions() > 1) ? input_descriptor.shape[1] : 1); + const unsigned int max_size = + info.keep_top_k() * ((input_descriptor.shape.num_dimensions() > 1) ? input_descriptor.shape[1] : 1); TensorDescriptor output_descriptor = input_descriptor; output_descriptor.shape.set(0, detection_size); @@ -58,7 +58,8 @@ TensorDescriptor DetectionOutputLayerNode::compute_output_descriptor(const Tenso bool DetectionOutputLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (input_id(2) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (input_id(2) != NullTensorID) && + (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); diff --git a/src/graph/nodes/DetectionPostProcessLayerNode.cpp b/src/graph/nodes/DetectionPostProcessLayerNode.cpp index 2c5005af3035ae2b4b3bdb08d91865025858e72b..af3fc03d6765dd577587636ba790f45c91a5bf7d 100644 --- a/src/graph/nodes/DetectionPostProcessLayerNode.cpp +++ b/src/graph/nodes/DetectionPostProcessLayerNode.cpp @@ -46,10 +46,11 @@ DetectionPostProcessLayerInfo DetectionPostProcessLayerNode::detection_post_proc bool DetectionPostProcessLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (input_id(2) != NullTensorID) && (output_id(0) != NullTensorID) && (output_id(1) != NullTensorID) - && (output_id(2) != NullTensorID) && (output_id(3) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (input_id(2) != NullTensorID) && + (output_id(0) != NullTensorID) && (output_id(1) != NullTensorID) && (output_id(2) != NullTensorID) && + (output_id(3) != NullTensorID)) { - for(unsigned int i = 0; i < 4; ++i) + for (unsigned int i = 0; i < 4; ++i) { Tensor *dst = output(i); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -68,7 +69,7 @@ TensorDescriptor DetectionPostProcessLayerNode::configure_output(size_t idx) con TensorDescriptor output_desc; const unsigned int num_detected_box = _info.max_detections() * _info.max_classes_per_detection(); - switch(idx) + switch (idx) { case 0: // Configure boxes output @@ -101,4 +102,4 @@ void DetectionPostProcessLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git 
a/src/graph/nodes/DummyNode.cpp b/src/graph/nodes/DummyNode.cpp index 6fa9fbaf5682052b282fcacd4462ba3a689b230c..b5f37bd79ba542d6004d8556327ad6a91c24d805 100644 --- a/src/graph/nodes/DummyNode.cpp +++ b/src/graph/nodes/DummyNode.cpp @@ -32,8 +32,7 @@ namespace arm_compute { namespace graph { -DummyNode::DummyNode(TensorShape shape) - : _shape(shape) +DummyNode::DummyNode(TensorShape shape) : _shape(shape) { _input_edges.resize(1, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -41,7 +40,7 @@ DummyNode::DummyNode(TensorShape shape) bool DummyNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -75,4 +74,4 @@ void DummyNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/EltwiseLayerNode.cpp b/src/graph/nodes/EltwiseLayerNode.cpp index 4426e953ee91e3885484e5961a12bdd89db124d3..3f7a08e64dd737b263a655a81bfc66dff854a8e1 100644 --- a/src/graph/nodes/EltwiseLayerNode.cpp +++ b/src/graph/nodes/EltwiseLayerNode.cpp @@ -31,8 +31,7 @@ namespace arm_compute { namespace graph { -EltwiseLayerNode::EltwiseLayerNode(const descriptors::EltwiseLayerDescriptor &descriptor) - : descriptor(descriptor) +EltwiseLayerNode::EltwiseLayerNode(const descriptors::EltwiseLayerDescriptor &descriptor) : descriptor(descriptor) { _input_edges.resize(2, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -70,7 +69,7 @@ void EltwiseLayerNode::set_fused_activation(ActivationLayerInfo fused_activation bool EltwiseLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -97,7 +96,7 @@ TensorDescriptor EltwiseLayerNode::configure_output(size_t idx) const output_info.set_shape(out_shape); - if(!descriptor.out_quant_info.empty()) + if (!descriptor.out_quant_info.empty()) { output_info.set_quantization_info(descriptor.out_quant_info); } @@ -134,7 +133,7 @@ void UnaryEltwiseLayerNode::set_fused_activation(ActivationLayerInfo fused_activ bool UnaryEltwiseLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -153,7 +152,7 @@ TensorDescriptor UnaryEltwiseLayerNode::configure_output(size_t idx) const auto output_info = src->desc(); - if(!descriptor.out_quant_info.empty()) + if (!descriptor.out_quant_info.empty()) { output_info.set_quantization_info(descriptor.out_quant_info); } diff --git a/src/graph/nodes/FlattenLayerNode.cpp b/src/graph/nodes/FlattenLayerNode.cpp index 48519a16952fac8473ea4bf77205b0080b4efd24..952df2f3ec2d4ac221997712ef27515ce2fc707a 100644 --- a/src/graph/nodes/FlattenLayerNode.cpp +++ b/src/graph/nodes/FlattenLayerNode.cpp @@ -38,7 +38,7 @@ FlattenLayerNode::FlattenLayerNode() bool FlattenLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -72,4 +72,4 @@ void 
FlattenLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/FullyConnectedLayer.cpp b/src/graph/nodes/FullyConnectedLayer.cpp index 6278227878a49e527b5b392c8490091f55fbf6a5..1eed69ddaf5a3407adb0c64ca8e5b1f490227cec 100644 --- a/src/graph/nodes/FullyConnectedLayer.cpp +++ b/src/graph/nodes/FullyConnectedLayer.cpp @@ -21,18 +21,23 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/graph/nodes/FullyConnectedLayerNode.h" - #include "arm_compute/core/Utils.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/INodeVisitor.h" +#include "arm_compute/graph/nodes/FullyConnectedLayerNode.h" namespace arm_compute { namespace graph { -FullyConnectedLayerNode::FullyConnectedLayerNode(unsigned int num_outputs, QuantizationInfo out_quant_info, FullyConnectedLayerInfo fc_info, FastMathHint fast_math_hint) - : _num_outputs(num_outputs), _out_quant_info(std::move(out_quant_info)), _info(fc_info), _fast_math_hint(fast_math_hint) +FullyConnectedLayerNode::FullyConnectedLayerNode(unsigned int num_outputs, + QuantizationInfo out_quant_info, + FullyConnectedLayerInfo fc_info, + FastMathHint fast_math_hint) + : _num_outputs(num_outputs), + _out_quant_info(std::move(out_quant_info)), + _info(fc_info), + _fast_math_hint(fast_math_hint) { _input_edges.resize(3, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -60,11 +65,11 @@ TensorDescriptor FullyConnectedLayerNode::compute_weights_descriptor(const Tenso unsigned int num_weights = 1; unsigned int num_dimensions = input_descriptor.shape.num_dimensions(); // Ignore the batch dimension if there is one: - if(num_dimensions == 2 || num_dimensions == 4) + if (num_dimensions == 2 || num_dimensions == 4) { num_dimensions--; } - for(unsigned int i = 0; i < num_dimensions; i++) + for (unsigned int i = 0; i < num_dimensions; i++) { num_weights *= input_descriptor.shape[i]; } @@ -73,13 +78,13 @@ TensorDescriptor FullyConnectedLayerNode::compute_weights_descriptor(const Tenso weights_descriptor.shape = TensorShape(num_weights, num_outputs); // If weights are tranposed, use tranposed shape - if(!fc_info.transpose_weights) + if (!fc_info.transpose_weights) { weights_descriptor.shape = TensorShape(num_outputs, num_weights); } // Set quantization info if present - if(!weights_quant_info.empty()) + if (!weights_quant_info.empty()) { weights_descriptor.quant_info = weights_quant_info; } @@ -93,7 +98,7 @@ TensorDescriptor FullyConnectedLayerNode::compute_output_descriptor(const Tensor { // Note: Only 1D batch space is supported at the moment unsigned int batches = input_descriptor.shape[1]; - if(input_descriptor.shape.num_dimensions() > 2) + if (input_descriptor.shape.num_dimensions() > 2) { batches = input_descriptor.shape[3]; } @@ -103,7 +108,7 @@ TensorDescriptor FullyConnectedLayerNode::compute_output_descriptor(const Tensor output_descriptor.shape = TensorShape(num_outputs, batches); // Set quantization info if present - if(!out_quant_info.empty()) + if (!out_quant_info.empty()) { output_descriptor.quant_info = out_quant_info; } @@ -118,7 +123,7 @@ FullyConnectedLayerInfo FullyConnectedLayerNode::info() const bool FullyConnectedLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); 
@@ -147,4 +152,4 @@ void FullyConnectedLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/FusedConvolutionBatchNormalizationNode.cpp b/src/graph/nodes/FusedConvolutionBatchNormalizationNode.cpp index de995ebee9b87795b7f943fc36f56ac68ccdfa61..9d37e84acfa064c11d9a6b27ebac39fc13bb068a 100644 --- a/src/graph/nodes/FusedConvolutionBatchNormalizationNode.cpp +++ b/src/graph/nodes/FusedConvolutionBatchNormalizationNode.cpp @@ -32,12 +32,18 @@ namespace arm_compute { namespace graph { -FusedConvolutionBatchNormalizationNode::FusedConvolutionBatchNormalizationNode(float epsilon, PadStrideInfo info, - unsigned int num_groups, - ConvolutionMethod method, - FastMathHint fast_math_hint, +FusedConvolutionBatchNormalizationNode::FusedConvolutionBatchNormalizationNode(float epsilon, + PadStrideInfo info, + unsigned int num_groups, + ConvolutionMethod method, + FastMathHint fast_math_hint, ActivationLayerInfo fused_activation) - : _epsilon(epsilon), _info(std::move(info)), _num_groups(num_groups), _method(method), _fast_math_hint(fast_math_hint), _fused_activation(fused_activation) + : _epsilon(epsilon), + _info(std::move(info)), + _num_groups(num_groups), + _method(method), + _fast_math_hint(fast_math_hint), + _fused_activation(fused_activation) { _input_edges.resize(7, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -88,9 +94,8 @@ void FusedConvolutionBatchNormalizationNode::set_fused_activation(ActivationLaye _fused_activation = fused_activation; } -TensorDescriptor FusedConvolutionBatchNormalizationNode::compute_output_descriptor(const TensorDescriptor &input_descriptor, - const TensorDescriptor &weights_descriptor, - const PadStrideInfo &info) +TensorDescriptor FusedConvolutionBatchNormalizationNode::compute_output_descriptor( + const TensorDescriptor &input_descriptor, const TensorDescriptor &weights_descriptor, const PadStrideInfo &info) { unsigned int output_width = 0; unsigned int output_height = 0; @@ -100,20 +105,22 @@ TensorDescriptor FusedConvolutionBatchNormalizationNode::compute_output_descript const unsigned int kernel_width = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH); const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT); - std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info); + std::tie(output_width, output_height) = + scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info); const DataLayout data_layout = input_descriptor.layout; TensorDescriptor output_descriptor = input_descriptor; output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width); output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height); - output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]); + output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), + weights_descriptor.shape[3]); return output_descriptor; } bool FusedConvolutionBatchNormalizationNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); 
diff --git a/src/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.cpp b/src/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.cpp deleted file mode 100644 index af81f0369a6e89e117d40e6c0d4ef53935188829..0000000000000000000000000000000000000000 --- a/src/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.cpp +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.h" - -#include "arm_compute/core/Utils.h" -#include "arm_compute/graph/Graph.h" -#include "arm_compute/graph/INodeVisitor.h" -#include "arm_compute/graph/Utils.h" - -namespace arm_compute -{ -namespace graph -{ -FusedConvolutionBatchNormalizationWithPostOpsNode::FusedConvolutionBatchNormalizationWithPostOpsNode(float epsilon, PadStrideInfo info, - unsigned int num_groups, - ConvolutionMethod method, - FastMathHint fast_math_hint) - : _epsilon(epsilon), _info(std::move(info)), _num_groups(num_groups), _method(method), _fast_math_hint(fast_math_hint) -{ - _input_edges.resize(8, EmptyEdgeID); - _outputs.resize(1, NullTensorID); -} - -void FusedConvolutionBatchNormalizationWithPostOpsNode::set_convolution_method(ConvolutionMethod method) -{ - _method = method; -} - -float FusedConvolutionBatchNormalizationWithPostOpsNode::epsilon() const -{ - return _epsilon; -} - -ConvolutionMethod FusedConvolutionBatchNormalizationWithPostOpsNode::convolution_method() const -{ - return _method; -} - -void FusedConvolutionBatchNormalizationWithPostOpsNode::set_fast_math_hint(FastMathHint hint) -{ - _fast_math_hint = hint; -} - -FastMathHint FusedConvolutionBatchNormalizationWithPostOpsNode::fast_math_hint() const -{ - return _fast_math_hint; -} - -PadStrideInfo FusedConvolutionBatchNormalizationWithPostOpsNode::convolution_info() const -{ - return _info; -} - -unsigned int FusedConvolutionBatchNormalizationWithPostOpsNode::num_groups() const -{ - return _num_groups; -} - -TensorDescriptor FusedConvolutionBatchNormalizationWithPostOpsNode::compute_output_descriptor(const TensorDescriptor &input_descriptor, - const TensorDescriptor &weights_descriptor, - const PadStrideInfo &info) -{ - unsigned int output_width = 0; - unsigned int output_height = 0; - - const unsigned int input_width = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH); - const unsigned int input_height = 
get_dimension_size(input_descriptor, DataLayoutDimension::HEIGHT); - const unsigned int kernel_width = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH); - const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT); - - std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info); - - const DataLayout data_layout = input_descriptor.layout; - TensorDescriptor output_descriptor = input_descriptor; - output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width); - output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height); - output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]); - - return output_descriptor; -} - -bool FusedConvolutionBatchNormalizationWithPostOpsNode::forward_descriptors() -{ - if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) - { - Tensor *dst = output(0); - ARM_COMPUTE_ERROR_ON(dst == nullptr); - dst->desc() = configure_output(0); - return true; - } - return false; -} - -TensorDescriptor FusedConvolutionBatchNormalizationWithPostOpsNode::configure_output(size_t idx) const -{ - ARM_COMPUTE_UNUSED(idx); - const Tensor *src = input(0); - const Tensor *weights = input(1); - - ARM_COMPUTE_ERROR_ON(src == nullptr || weights == nullptr); - - TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), _info); - - return output_info; -} - -NodeType FusedConvolutionBatchNormalizationWithPostOpsNode::type() const -{ - return FusedConvolutionBatchNormalizationWithPostOpsNode::node_type; -} - -void FusedConvolutionBatchNormalizationWithPostOpsNode::accept(INodeVisitor &v) -{ - v.visit(*this); -} -} // namespace graph -} // namespace arm_compute diff --git a/src/graph/nodes/FusedConvolutionWithPostOpNode.cpp b/src/graph/nodes/FusedConvolutionWithPostOpNode.cpp deleted file mode 100644 index 63341e2760c32bc28d786063083f7e678f0f1567..0000000000000000000000000000000000000000 --- a/src/graph/nodes/FusedConvolutionWithPostOpNode.cpp +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/graph/nodes/FusedConvolutionWithPostOpNode.h" - -#include "arm_compute/core/Utils.h" -#include "arm_compute/graph/Graph.h" -#include "arm_compute/graph/INodeVisitor.h" -#include "arm_compute/graph/Utils.h" - -namespace arm_compute -{ -namespace graph -{ -FusedConvolutionWithPostOpNode::FusedConvolutionWithPostOpNode(PadStrideInfo info, - unsigned int num_groups, - ConvolutionMethod method, - FastMathHint fast_math_hint, - QuantizationInfo out_quant_info) - : _info(std::move(info)), _num_groups(num_groups), _method(method), _fast_math_hint(fast_math_hint), _out_quant_info(std::move(out_quant_info)), _fused_activation() -{ - _input_edges.resize(4, EmptyEdgeID); - _outputs.resize(1, NullTensorID); -} - -void FusedConvolutionWithPostOpNode::set_convolution_method(ConvolutionMethod method) -{ - _method = method; -} - -ConvolutionMethod FusedConvolutionWithPostOpNode::convolution_method() const -{ - return _method; -} - -void FusedConvolutionWithPostOpNode::set_fast_math_hint(FastMathHint hint) -{ - _fast_math_hint = hint; -} - -FastMathHint FusedConvolutionWithPostOpNode::fast_math_hint() const -{ - return _fast_math_hint; -} - -PadStrideInfo FusedConvolutionWithPostOpNode::convolution_info() const -{ - return _info; -} - -unsigned int FusedConvolutionWithPostOpNode::num_groups() const -{ - return _num_groups; -} - -ActivationLayerInfo FusedConvolutionWithPostOpNode::fused_activation() const -{ - return _fused_activation; -} - -void FusedConvolutionWithPostOpNode::set_fused_activation(ActivationLayerInfo fused_activation) -{ - _fused_activation = fused_activation; -} - -void FusedConvolutionWithPostOpNode::set_convolution_info(PadStrideInfo info) -{ - _info = info; -} - -TensorDescriptor FusedConvolutionWithPostOpNode::compute_output_descriptor(const TensorDescriptor &input_descriptor, - const TensorDescriptor &weights_descriptor, - const PadStrideInfo &info) -{ - unsigned int output_width = 0; - unsigned int output_height = 0; - - const unsigned int input_width = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH); - const unsigned int input_height = get_dimension_size(input_descriptor, DataLayoutDimension::HEIGHT); - const unsigned int kernel_width = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH); - const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT); - - std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info); - - const DataLayout data_layout = input_descriptor.layout; - TensorDescriptor output_descriptor = input_descriptor; - output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width); - output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height); - output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]); - - return output_descriptor; -} - -bool FusedConvolutionWithPostOpNode::forward_descriptors() -{ - if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) - { - Tensor *dst = output(0); - ARM_COMPUTE_ERROR_ON(dst == nullptr); - dst->desc() = configure_output(0); - return true; - } - return false; -} - -TensorDescriptor FusedConvolutionWithPostOpNode::configure_output(size_t idx) const -{ - ARM_COMPUTE_UNUSED(idx); - const Tensor *src = input(0); - const Tensor *weights = input(1); - - ARM_COMPUTE_ERROR_ON(src == nullptr || 
weights == nullptr); - - TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), _info); - if(!_out_quant_info.empty()) - { - output_info.quant_info = _out_quant_info; - } - - return output_info; -} - -NodeType FusedConvolutionWithPostOpNode::type() const -{ - return FusedConvolutionWithPostOpNode::node_type; -} - -void FusedConvolutionWithPostOpNode::accept(INodeVisitor &v) -{ - v.visit(*this); -} -} // namespace graph -} // namespace arm_compute diff --git a/src/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.cpp b/src/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.cpp index c022450b9d1f2a17202f79f9ed1dfec5ed5eb5aa..c51641d64cbac4975dc87b3d37b37c6a5e006f62 100644 --- a/src/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.cpp +++ b/src/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.cpp @@ -32,18 +32,24 @@ namespace arm_compute { namespace graph { -FusedDepthwiseConvolutionBatchNormalizationNode::FusedDepthwiseConvolutionBatchNormalizationNode(float epsilon, - PadStrideInfo info, - unsigned int depth_multiplier, - DepthwiseConvolutionMethod method, - ActivationLayerInfo fused_activation) - : _epsilon(epsilon), _info(std::move(info)), _depth_multiplier(depth_multiplier), _method(method), _fused_activation(fused_activation) +FusedDepthwiseConvolutionBatchNormalizationNode::FusedDepthwiseConvolutionBatchNormalizationNode( + float epsilon, + PadStrideInfo info, + unsigned int depth_multiplier, + DepthwiseConvolutionMethod method, + ActivationLayerInfo fused_activation) + : _epsilon(epsilon), + _info(std::move(info)), + _depth_multiplier(depth_multiplier), + _method(method), + _fused_activation(fused_activation) { _input_edges.resize(7, EmptyEdgeID); _outputs.resize(1, NullTensorID); } -void FusedDepthwiseConvolutionBatchNormalizationNode::set_depthwise_convolution_method(DepthwiseConvolutionMethod method) +void FusedDepthwiseConvolutionBatchNormalizationNode::set_depthwise_convolution_method( + DepthwiseConvolutionMethod method) { _method = method; } @@ -78,10 +84,11 @@ void FusedDepthwiseConvolutionBatchNormalizationNode::set_fused_activation(Activ _fused_activation = fused_activation; } -TensorDescriptor FusedDepthwiseConvolutionBatchNormalizationNode::compute_output_descriptor(const TensorDescriptor &input_descriptor, - const TensorDescriptor &weights_descriptor, - const PadStrideInfo &info, - int depth_multiplier) +TensorDescriptor +FusedDepthwiseConvolutionBatchNormalizationNode::compute_output_descriptor(const TensorDescriptor &input_descriptor, + const TensorDescriptor &weights_descriptor, + const PadStrideInfo &info, + int depth_multiplier) { unsigned int output_width = 0; unsigned int output_height = 0; @@ -92,19 +99,22 @@ TensorDescriptor FusedDepthwiseConvolutionBatchNormalizationNode::compute_output const unsigned int kernel_width = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH); const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT); - std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info); + std::tie(output_width, output_height) = + scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info); TensorDescriptor output_descriptor = input_descriptor; output_descriptor.shape.set(get_dimension_idx(output_descriptor.layout, DataLayoutDimension::WIDTH), output_width); - output_descriptor.shape.set(get_dimension_idx(output_descriptor.layout, 
DataLayoutDimension::HEIGHT), output_height); - output_descriptor.shape.set(get_dimension_idx(output_descriptor.layout, DataLayoutDimension::CHANNEL), input_channels * depth_multiplier); + output_descriptor.shape.set(get_dimension_idx(output_descriptor.layout, DataLayoutDimension::HEIGHT), + output_height); + output_descriptor.shape.set(get_dimension_idx(output_descriptor.layout, DataLayoutDimension::CHANNEL), + input_channels * depth_multiplier); return output_descriptor; } bool FusedDepthwiseConvolutionBatchNormalizationNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); diff --git a/src/graph/nodes/GenerateProposalsLayerNode.cpp b/src/graph/nodes/GenerateProposalsLayerNode.cpp index 9f368628188816f6ba66679fcec37763bcdd58ae..1671a47a95accdb58b4f8e6627744a3eb8ae0dac 100644 --- a/src/graph/nodes/GenerateProposalsLayerNode.cpp +++ b/src/graph/nodes/GenerateProposalsLayerNode.cpp @@ -23,17 +23,15 @@ */ #include "arm_compute/graph/nodes/GenerateProposalsLayerNode.h" +#include "arm_compute/core/Helpers.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/INodeVisitor.h" -#include "arm_compute/core/Helpers.h" - namespace arm_compute { namespace graph { -GenerateProposalsLayerNode::GenerateProposalsLayerNode(GenerateProposalsInfo &info) - : _info(info) +GenerateProposalsLayerNode::GenerateProposalsLayerNode(GenerateProposalsInfo &info) : _info(info) { _input_edges.resize(3, EmptyEdgeID); _outputs.resize(3, NullTensorID); @@ -46,10 +44,10 @@ const GenerateProposalsInfo &GenerateProposalsLayerNode::info() const bool GenerateProposalsLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (input_id(2) != NullTensorID) && (output_id(0) != NullTensorID) && (output_id(1) != NullTensorID) - && (output_id(2) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (input_id(2) != NullTensorID) && + (output_id(0) != NullTensorID) && (output_id(1) != NullTensorID) && (output_id(2) != NullTensorID)) { - for(unsigned int i = 0; i < 3; ++i) + for (unsigned int i = 0; i < 3; ++i) { Tensor *dst = output(i); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -68,7 +66,7 @@ TensorDescriptor GenerateProposalsLayerNode::configure_output(size_t idx) const ARM_COMPUTE_ERROR_ON(src == nullptr); TensorDescriptor output_desc = src->desc(); - switch(idx) + switch (idx) { case 0: // Configure proposals output diff --git a/src/graph/nodes/InputNode.cpp b/src/graph/nodes/InputNode.cpp index 072281f259eb99122d523b0208cf52cc79a705de..7408bc265dc06469df9496991f6fdcfd25c4c09e 100644 --- a/src/graph/nodes/InputNode.cpp +++ b/src/graph/nodes/InputNode.cpp @@ -30,15 +30,14 @@ namespace arm_compute { namespace graph { -InputNode::InputNode(TensorDescriptor desc) - : _desc(std::move(desc)) +InputNode::InputNode(TensorDescriptor desc) : _desc(std::move(desc)) { _outputs.resize(1, NullTensorID); } bool InputNode::forward_descriptors() { - if(output_id(0) != NullTensorID) + if (output_id(0) != NullTensorID) { Tensor *t = output(0); ARM_COMPUTE_ERROR_ON(t == nullptr); diff --git a/src/graph/nodes/L2NormalizeLayerNode.cpp b/src/graph/nodes/L2NormalizeLayerNode.cpp index 0c35a335fafb67764994af516a17215a16914985..1a57cf0199a5929b610dfb64381e94c7361fc2bc 100644 --- a/src/graph/nodes/L2NormalizeLayerNode.cpp 
+++ b/src/graph/nodes/L2NormalizeLayerNode.cpp @@ -30,18 +30,15 @@ namespace arm_compute { namespace graph { -L2NormalizeLayerNode::L2NormalizeLayerNode() - : L2NormalizeLayerNode(0, 1e-12f) +L2NormalizeLayerNode::L2NormalizeLayerNode() : L2NormalizeLayerNode(0, 1e-12f) { } -L2NormalizeLayerNode::L2NormalizeLayerNode(int axis) - : L2NormalizeLayerNode(axis, 1e-12f) +L2NormalizeLayerNode::L2NormalizeLayerNode(int axis) : L2NormalizeLayerNode(axis, 1e-12f) { } -L2NormalizeLayerNode::L2NormalizeLayerNode(int axis, float epsilon) - : _axis(axis), _epsilon(epsilon) +L2NormalizeLayerNode::L2NormalizeLayerNode(int axis, float epsilon) : _axis(axis), _epsilon(epsilon) { _input_edges.resize(1, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -49,7 +46,7 @@ L2NormalizeLayerNode::L2NormalizeLayerNode(int axis, float epsilon) bool L2NormalizeLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -92,4 +89,4 @@ void L2NormalizeLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/NormalizationLayerNode.cpp b/src/graph/nodes/NormalizationLayerNode.cpp index eaa1bcf9248e1b164374bd4ed7ad675c2ebde9bb..b18bb7dd9331c69c3f1d00fa5647281351780e16 100644 --- a/src/graph/nodes/NormalizationLayerNode.cpp +++ b/src/graph/nodes/NormalizationLayerNode.cpp @@ -31,8 +31,7 @@ namespace arm_compute { namespace graph { -NormalizationLayerNode::NormalizationLayerNode(NormalizationLayerInfo norm_info) - : _info(norm_info) +NormalizationLayerNode::NormalizationLayerNode(NormalizationLayerInfo norm_info) : _info(norm_info) { _input_edges.resize(1, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -45,7 +44,7 @@ NormalizationLayerInfo NormalizationLayerNode::normalization_info() const bool NormalizationLayerNode::forward_descriptors() { - if(input_id(0) != NullTensorID && (output_id(0) != NullTensorID)) + if (input_id(0) != NullTensorID && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -76,4 +75,4 @@ void NormalizationLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/NormalizePlanarYUVLayerNode.cpp b/src/graph/nodes/NormalizePlanarYUVLayerNode.cpp index 113d0a541fd2b3d3ed04befefe7714657a1f2a5c..cac96606ea95889bb37c67fb987546e3005dbaaa 100644 --- a/src/graph/nodes/NormalizePlanarYUVLayerNode.cpp +++ b/src/graph/nodes/NormalizePlanarYUVLayerNode.cpp @@ -39,7 +39,7 @@ NormalizePlanarYUVLayerNode::NormalizePlanarYUVLayerNode() bool NormalizePlanarYUVLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); diff --git a/src/graph/nodes/PReluLayerNode.cpp b/src/graph/nodes/PReluLayerNode.cpp index 378c18e3bb3fecf7d09e98cc9ad4f30211ce8c12..2b50fe9234aaaa8565799cca10b9f768cc6a9b15 100644 --- a/src/graph/nodes/PReluLayerNode.cpp +++ b/src/graph/nodes/PReluLayerNode.cpp @@ -38,7 +38,7 @@ PReluLayerNode::PReluLayerNode() bool PReluLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) 
!= NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); diff --git a/src/graph/nodes/PadLayerNode.cpp b/src/graph/nodes/PadLayerNode.cpp index 6424370d417a43bea9372b5ef8eebd3e882f34a2..336e7de05a9bec476c7a1ad0e6a6d6863ddba734 100644 --- a/src/graph/nodes/PadLayerNode.cpp +++ b/src/graph/nodes/PadLayerNode.cpp @@ -23,17 +23,15 @@ */ #include "arm_compute/graph/nodes/PadLayerNode.h" +#include "arm_compute/core/Helpers.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/INodeVisitor.h" -#include "arm_compute/core/Helpers.h" - namespace arm_compute { namespace graph { -PadLayerNode::PadLayerNode(const PaddingList &padding, PixelValue pad_value) - : _padding(padding), _pad_value(pad_value) +PadLayerNode::PadLayerNode(const PaddingList &padding, PixelValue pad_value) : _padding(padding), _pad_value(pad_value) { _input_edges.resize(1, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -51,7 +49,7 @@ PixelValue PadLayerNode::pad_value() const bool PadLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -71,7 +69,7 @@ TensorDescriptor PadLayerNode::configure_output(size_t idx) const TensorDescriptor output_desc = src->desc(); const TensorShape input_shape = src->desc().shape; - for(size_t dim = 0; dim < _padding.size(); ++dim) + for (size_t dim = 0; dim < _padding.size(); ++dim) { output_desc.shape.set(dim, _padding[dim].first + input_shape[dim] + _padding[dim].second); } diff --git a/src/graph/nodes/PermuteLayerNode.cpp b/src/graph/nodes/PermuteLayerNode.cpp index b311ee1301959da3532efe48dd67e67b2e9637d5..db53722363d051e77aa92794c3c52d56ab95ce4e 100644 --- a/src/graph/nodes/PermuteLayerNode.cpp +++ b/src/graph/nodes/PermuteLayerNode.cpp @@ -23,17 +23,15 @@ */ #include "arm_compute/graph/nodes/PermuteLayerNode.h" +#include "arm_compute/core/Helpers.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/INodeVisitor.h" -#include "arm_compute/core/Helpers.h" - namespace arm_compute { namespace graph { -PermuteLayerNode::PermuteLayerNode(PermutationVector perm, DataLayout layout) - : _perm(perm), _layout(layout) +PermuteLayerNode::PermuteLayerNode(PermutationVector perm, DataLayout layout) : _perm(perm), _layout(layout) { _input_edges.resize(1, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -46,7 +44,7 @@ const PermutationVector &PermuteLayerNode::permutation_vector() const bool PermuteLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -66,7 +64,7 @@ TensorDescriptor PermuteLayerNode::configure_output(size_t idx) const TensorDescriptor output_desc = src->desc(); permute(output_desc.shape, _perm); - if(_layout != DataLayout::UNKNOWN) + if (_layout != DataLayout::UNKNOWN) { output_desc.layout = _layout; } @@ -84,4 +82,4 @@ void PermuteLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/PoolingLayerNode.cpp b/src/graph/nodes/PoolingLayerNode.cpp index 4ecf924a5e05927d6a65ebec337b88191e31a409..ac954acbe3b686a142161f5568f433c54d204f43 100644 --- a/src/graph/nodes/PoolingLayerNode.cpp +++ 
b/src/graph/nodes/PoolingLayerNode.cpp @@ -32,8 +32,7 @@ namespace arm_compute { namespace graph { -PoolingLayerNode::PoolingLayerNode(PoolingLayerInfo pool_info) - : _info(std::move(pool_info)) +PoolingLayerNode::PoolingLayerNode(PoolingLayerInfo pool_info) : _info(std::move(pool_info)) { _input_edges.resize(1, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -55,7 +54,8 @@ TensorDescriptor PoolingLayerNode::compute_output_descriptor(const TensorDescrip const unsigned int pool_size_x = info.is_global_pooling ? input_width : info.pool_size.width; const unsigned int pool_size_y = info.is_global_pooling ? input_height : info.pool_size.height; - std::tie(pooled_width, pooled_height) = scaled_dimensions(input_width, input_height, pool_size_x, pool_size_y, info.pad_stride_info); + std::tie(pooled_width, pooled_height) = + scaled_dimensions(input_width, input_height, pool_size_x, pool_size_y, info.pad_stride_info); const DataLayout data_layout = input_descriptor.layout; TensorDescriptor output_descriptor = input_descriptor; @@ -67,7 +67,7 @@ TensorDescriptor PoolingLayerNode::compute_output_descrip bool PoolingLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -98,4 +98,4 @@ void PoolingLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/PrintLayerNode.cpp b/src/graph/nodes/PrintLayerNode.cpp index da408d8c4da32139323cc68680ee71013901105a..82a340005b5abb69805a373ae48fdf4623df27bc 100644 --- a/src/graph/nodes/PrintLayerNode.cpp +++ b/src/graph/nodes/PrintLayerNode.cpp @@ -32,7 +32,9 @@ namespace arm_compute { namespace graph { -PrintLayerNode::PrintLayerNode(std::ostream &stream, const IOFormatInfo &format_info, const std::function<ITensor *(ITensor *)> transform) +PrintLayerNode::PrintLayerNode(std::ostream &stream, + const IOFormatInfo &format_info, + const std::function<ITensor *(ITensor *)> transform) : _stream(stream), _format_info(format_info), _transform(transform) { _input_edges.resize(1, EmptyEdgeID); @@ -56,7 +58,7 @@ const std::function<ITensor *(ITensor *)> PrintLayerNode::transform() const bool PrintLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -88,4 +90,4 @@ void PrintLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/PriorBoxLayerNode.cpp b/src/graph/nodes/PriorBoxLayerNode.cpp index f017ead88063cf8619ac93aa60dec9cfad6e454e..5ffb1733331258dcb105dbbed42f748ed60feeac 100644 --- a/src/graph/nodes/PriorBoxLayerNode.cpp +++ b/src/graph/nodes/PriorBoxLayerNode.cpp @@ -32,8 +32,7 @@ namespace arm_compute { namespace graph { -PriorBoxLayerNode::PriorBoxLayerNode(PriorBoxLayerInfo prior_info) - : _info(std::move(prior_info)) +PriorBoxLayerNode::PriorBoxLayerNode(PriorBoxLayerInfo prior_info) : _info(std::move(prior_info)) { _input_edges.resize(2, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -44,7 +43,7 @@ PriorBoxLayerInfo PriorBoxLayerNode::priorbox_info() const return _info; } -TensorDescriptor PriorBoxLayerNode::compute_output_descriptor(const TensorDescriptor
&input_descriptor, +TensorDescriptor PriorBoxLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor, const PriorBoxLayerInfo &info) { const unsigned int layer_width = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH); @@ -61,7 +60,7 @@ TensorDescriptor PriorBoxLayerNode::compute_output_descriptor(const TensorDescri bool PriorBoxLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); diff --git a/src/graph/nodes/QuantizationLayerNode.cpp b/src/graph/nodes/QuantizationLayerNode.cpp index 4906808dae3089b63b1725ea39fa25bb3dae7177..0dd2da919ddf4c94fe47d71ca6011fecd2e274b2 100644 --- a/src/graph/nodes/QuantizationLayerNode.cpp +++ b/src/graph/nodes/QuantizationLayerNode.cpp @@ -47,7 +47,7 @@ QuantizationLayerNode::QuantizationLayerNode(QuantizationInfo out_quant_info, Da bool QuantizationLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); diff --git a/src/graph/nodes/ROIAlignLayerNode.cpp b/src/graph/nodes/ROIAlignLayerNode.cpp index 62891811f3091b46e0e0e8dd7240c75292d9604f..59093358267db1f0c5099d3d3a7e146f74766e7a 100644 --- a/src/graph/nodes/ROIAlignLayerNode.cpp +++ b/src/graph/nodes/ROIAlignLayerNode.cpp @@ -24,17 +24,15 @@ #include "arm_compute/graph/nodes/ROIAlignLayerNode.h" +#include "arm_compute/core/Helpers.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/INodeVisitor.h" -#include "arm_compute/core/Helpers.h" - namespace arm_compute { namespace graph { -ROIAlignLayerNode::ROIAlignLayerNode(ROIPoolingLayerInfo &pool_info) - : _pool_info(pool_info) +ROIAlignLayerNode::ROIAlignLayerNode(ROIPoolingLayerInfo &pool_info) : _pool_info(pool_info) { _input_edges.resize(2, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -47,7 +45,7 @@ const ROIPoolingLayerInfo &ROIAlignLayerNode::pooling_info() const bool ROIAlignLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -92,4 +90,4 @@ void ROIAlignLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/ReductionLayerNode.cpp b/src/graph/nodes/ReductionLayerNode.cpp index 0e9303989474654773f1eb5561d320267ecda629..965c1ba0a5d6ac3f269100fe51706383dde01ae4 100644 --- a/src/graph/nodes/ReductionLayerNode.cpp +++ b/src/graph/nodes/ReductionLayerNode.cpp @@ -56,7 +56,7 @@ bool ReductionLayerNode::keep_dims() const bool ReductionLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -74,8 +74,9 @@ TensorDescriptor ReductionLayerNode::configure_output(size_t idx) const const Tensor *src = input(0); ARM_COMPUTE_ERROR_ON(src == nullptr); - TensorDescriptor output_info = src->desc(); - TensorShape output_shape = 
arm_compute::misc::shape_calculator::compute_reduced_shape(output_info.shape, _axis, _keep_dims); + TensorDescriptor output_info = src->desc(); + TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(output_info.shape, _axis, _keep_dims); output_info.set_shape(output_shape); return output_info; @@ -91,4 +92,4 @@ void ReductionLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/ReorgLayerNode.cpp b/src/graph/nodes/ReorgLayerNode.cpp index e693e4b931469c35c5ba95ac0f3f5693e73a4ba7..251a4ea1b201ce3b6567632e87b0d92e29d5ed63 100644 --- a/src/graph/nodes/ReorgLayerNode.cpp +++ b/src/graph/nodes/ReorgLayerNode.cpp @@ -31,8 +31,7 @@ namespace arm_compute { namespace graph { -ReorgLayerNode::ReorgLayerNode(int stride) - : _stride(stride) +ReorgLayerNode::ReorgLayerNode(int stride) : _stride(stride) { _input_edges.resize(1, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -51,20 +50,22 @@ TensorDescriptor ReorgLayerNode::compute_output_descriptor(const TensorDescripto ARM_COMPUTE_ERROR_ON(stride <= 0); ARM_COMPUTE_ERROR_ON_MSG((input_width % stride != 0), "The width of the input tensor must be a multiple of stride"); - ARM_COMPUTE_ERROR_ON_MSG((input_height % stride != 0), "The height of the input tensor must be a multiple of stride"); + ARM_COMPUTE_ERROR_ON_MSG((input_height % stride != 0), + "The height of the input tensor must be a multiple of stride"); const DataLayout data_layout = input_descriptor.layout; TensorDescriptor output_descriptor = input_descriptor; output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), input_width / stride); output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), input_height / stride); - output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), input_channel * stride * stride); + output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), + input_channel * stride * stride); return output_descriptor; } bool ReorgLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -95,4 +96,4 @@ void ReorgLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/ReshapeLayer.cpp b/src/graph/nodes/ReshapeLayer.cpp index a6354d03ed0bb0add4d7c293439419b7afcb34db..ce6bf9b803d32de2b1b5a8d22260c7b98e3e0086 100644 --- a/src/graph/nodes/ReshapeLayer.cpp +++ b/src/graph/nodes/ReshapeLayer.cpp @@ -21,17 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/graph/nodes/ReshapeLayerNode.h" - #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/INodeVisitor.h" +#include "arm_compute/graph/nodes/ReshapeLayerNode.h" namespace arm_compute { namespace graph { -ReshapeLayerNode::ReshapeLayerNode(TensorShape shape) - : _shape(shape) +ReshapeLayerNode::ReshapeLayerNode(TensorShape shape) : _shape(shape) { _input_edges.resize(1, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -39,7 +37,7 @@ ReshapeLayerNode::ReshapeLayerNode(TensorShape shape) bool ReshapeLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -73,4 +71,4 @@ void ReshapeLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/ResizeLayerNode.cpp b/src/graph/nodes/ResizeLayerNode.cpp index 2a94bf60630a81801b11e2753042f47bc0c34480..292b2c643e038887d321c841cc02ffa88e001ab1 100644 --- a/src/graph/nodes/ResizeLayerNode.cpp +++ b/src/graph/nodes/ResizeLayerNode.cpp @@ -50,7 +50,7 @@ std::pair ResizeLayerNode::scaling_factor() const bool ResizeLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -88,4 +88,4 @@ void ResizeLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/SliceLayerNode.cpp b/src/graph/nodes/SliceLayerNode.cpp index b7655b9eae9bbf4bbc5ef6e4e02bd32c327adf7f..eb877d9a24e32b7c07db720956f86145559a25f2 100644 --- a/src/graph/nodes/SliceLayerNode.cpp +++ b/src/graph/nodes/SliceLayerNode.cpp @@ -32,8 +32,7 @@ namespace arm_compute { namespace graph { -SliceLayerNode::SliceLayerNode(const Coordinates &starts, const Coordinates &ends) - : _starts(starts), _ends(ends) +SliceLayerNode::SliceLayerNode(const Coordinates &starts, const Coordinates &ends) : _starts(starts), _ends(ends) { _input_edges.resize(1, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -50,19 +49,20 @@ Coordinates SliceLayerNode::ends() const } TensorDescriptor SliceLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor, - const Coordinates &starts, const Coordinates &ends) + const Coordinates &starts, + const Coordinates &ends) { using namespace arm_compute::helpers::tensor_transform; TensorDescriptor output_desc = input_descriptor; - output_desc.shape = arm_compute::misc::shape_calculator::compute_slice_shape(input_descriptor.shape, starts, ends); + output_desc.shape = arm_compute::misc::shape_calculator::compute_slice_shape(input_descriptor.shape, starts, ends); return output_desc; } bool SliceLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); diff --git a/src/graph/nodes/SoftmaxLayerNode.cpp b/src/graph/nodes/SoftmaxLayerNode.cpp index 031166993a53fb1da3b7359590d6d4fe9d2ef01b..4beac81b1f7ad05efcebf7f3d9193a2bcb3d3fa0 100644 --- a/src/graph/nodes/SoftmaxLayerNode.cpp +++ b/src/graph/nodes/SoftmaxLayerNode.cpp @@ -31,8 +31,7 @@ 
namespace arm_compute { namespace graph { -SoftmaxLayerNode::SoftmaxLayerNode(float beta) - : _beta(beta) +SoftmaxLayerNode::SoftmaxLayerNode(float beta) : _beta(beta) { _input_edges.resize(1, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -45,7 +44,7 @@ float SoftmaxLayerNode::beta() const bool SoftmaxLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -79,4 +78,4 @@ void SoftmaxLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/SplitLayerNode.cpp b/src/graph/nodes/SplitLayerNode.cpp index 31931c3a79e7c725f68a12ff4e718e3adedb7279..dfb6624f807d1a71a963f3b5f59dccc98043c2bb 100644 --- a/src/graph/nodes/SplitLayerNode.cpp +++ b/src/graph/nodes/SplitLayerNode.cpp @@ -49,8 +49,8 @@ unsigned int SplitLayerNode::axis() const return _axis; } -std::pair<TensorDescriptor, Coordinates> SplitLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor, - unsigned int num_splits, int axis, unsigned int idx) +std::pair<TensorDescriptor, Coordinates> SplitLayerNode::compute_output_descriptor( + const TensorDescriptor &input_descriptor, unsigned int num_splits, int axis, unsigned int idx) { // Handle negative axis, negative index is used to specify axis from the end (e.g. -1 for the last axis). int num_dimension = static_cast<int>(input_descriptor.shape.num_dimensions()); @@ -58,7 +58,7 @@ std::pair<TensorDescriptor, Coordinates> SplitLayerNode::compute_output_descript Coordinates coords; TensorDescriptor output_descriptor = input_descriptor; int split_size = input_descriptor.shape[tmp_axis] / num_splits; - if(_size_splits.empty()) + if (_size_splits.empty()) { output_descriptor.shape.set(tmp_axis, split_size); coords.set(tmp_axis, idx * split_size); @@ -66,15 +66,15 @@ std::pair<TensorDescriptor, Coordinates> SplitLayerNode::compute_output_descript else { int split_size = _size_splits[idx]; - if(split_size == -1) + if (split_size == -1) { split_size = input_descriptor.shape[tmp_axis]; - for(unsigned int i = 0; i < _size_splits.size() - 1; ++i) + for (unsigned int i = 0; i < _size_splits.size() - 1; ++i) split_size -= _size_splits[i]; } output_descriptor.shape.set(tmp_axis, split_size); int coord_value = 0; - for(unsigned int i = 0; i < idx; ++i) + for (unsigned int i = 0; i < idx; ++i) coord_value += _size_splits[i]; coords.set(tmp_axis, coord_value); } @@ -84,12 +84,12 @@ std::pair<TensorDescriptor, Coordinates> SplitLayerNode::compute_output_descript bool SplitLayerNode::forward_descriptors() { - if(input_id(0) != NullTensorID) + if (input_id(0) != NullTensorID) { validate(); - for(unsigned int i = 0; i < _outputs.size(); ++i) + for (unsigned int i = 0; i < _outputs.size(); ++i) { - if(output_id(i) != NullTensorID) + if (output_id(i) != NullTensorID) { Tensor *dst_i = output(i); ARM_COMPUTE_ERROR_ON(dst_i == nullptr); @@ -117,10 +117,10 @@ TensorDescriptor SplitLayerNode::configure_output(size_t idx) const int tmp_axis = wrap_around(_axis, num_dimension); int split_size = (_size_splits.empty()) ? 
(input_descriptor.shape[tmp_axis] / _num_splits) : _size_splits[idx]; - if(split_size == -1) + if (split_size == -1) { split_size = input_descriptor.shape[tmp_axis]; - for(unsigned int i = 0; i < _size_splits.size() - 1; ++i) + for (unsigned int i = 0; i < _size_splits.size() - 1; ++i) split_size -= _size_splits[i]; } output_descriptor.shape.set(tmp_axis, split_size); @@ -138,7 +138,7 @@ Status SplitLayerNode::validate() const // Handle negative axis, negative index is used to specify axis from the end (e.g. -1 for the last axis). int tmp_axis = wrap_around(_axis, num_dimension); - if(_size_splits.empty()) + if (_size_splits.empty()) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->desc().shape[tmp_axis] % _num_splits, "Split should be exact"); } @@ -156,4 +156,4 @@ void SplitLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/StackLayerNode.cpp b/src/graph/nodes/StackLayerNode.cpp index f292b33ad0df7fdc4bca93de9d7ed21cb6742a2f..031d8fc73954f205fd62fa43b60605bcf94174c4 100644 --- a/src/graph/nodes/StackLayerNode.cpp +++ b/src/graph/nodes/StackLayerNode.cpp @@ -25,18 +25,16 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/INodeVisitor.h" #include "arm_compute/graph/Utils.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" - namespace arm_compute { namespace graph { -StackLayerNode::StackLayerNode(unsigned int total_nodes, int axis) - : _total_nodes(total_nodes), _axis(axis) +StackLayerNode::StackLayerNode(unsigned int total_nodes, int axis) : _total_nodes(total_nodes), _axis(axis) { _input_edges.resize(_total_nodes, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -64,7 +62,7 @@ TensorDescriptor StackLayerNode::compute_output_descriptor(const std::vector= _outputs.size()); // Check if all input tensors are set - bool are_all_inputs_set = std::all_of(std::begin(_input_edges), std::end(_input_edges), [](const EdgeID & eid) - { - return eid != EmptyEdgeID; - }); + bool are_all_inputs_set = std::all_of(std::begin(_input_edges), std::end(_input_edges), + [](const EdgeID &eid) { return eid != EmptyEdgeID; }); TensorDescriptor output_info = {}; - if(are_all_inputs_set) + if (are_all_inputs_set) { std::vector inputs_descriptors; - for(unsigned int i = 0; i < _input_edges.size(); ++i) + for (unsigned int i = 0; i < _input_edges.size(); ++i) { const Tensor *t = _graph->tensor(input_id(i)); ARM_COMPUTE_ERROR_ON(t == nullptr); diff --git a/src/graph/nodes/StridedSliceLayerNode.cpp b/src/graph/nodes/StridedSliceLayerNode.cpp index 6a1a724bb34eabe8753a987d9fd91955c4a43f02..fc9f72204c3944c3d809a287c60bc67ac6ddc40f 100644 --- a/src/graph/nodes/StridedSliceLayerNode.cpp +++ b/src/graph/nodes/StridedSliceLayerNode.cpp @@ -79,7 +79,7 @@ TensorDescriptor StridedSliceLayerNode::compute_output_descriptor(const TensorDe bool StridedSliceLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); diff --git a/src/graph/printers/DotGraphPrinter.cpp b/src/graph/printers/DotGraphPrinter.cpp index 1071d50197a58d5ca30702b821d2661b028a2302..5587ed23f0f767a140688e4428ab35d92c96a72d 100644 --- a/src/graph/printers/DotGraphPrinter.cpp +++ 
b/src/graph/printers/DotGraphPrinter.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,9 +25,9 @@ #include "arm_compute/core/Error.h" #include "arm_compute/graph/Graph.h" +#include "arm_compute/graph/nodes/Nodes.h" #include "arm_compute/graph/Tensor.h" #include "arm_compute/graph/TypePrinter.h" -#include "arm_compute/graph/nodes/Nodes.h" namespace arm_compute { @@ -85,22 +85,6 @@ void DotGraphVisitor::visit(FusedConvolutionBatchNormalizationNode &n) _info = ss.str(); } -void DotGraphVisitor::visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n) -{ - ARM_COMPUTE_UNUSED(n); - std::stringstream ss; - ss << "FusedConvolutionBatchNormalizationWithPostOpsNode"; - _info = ss.str(); -} - -void DotGraphVisitor::visit(FusedConvolutionWithPostOpNode &n) -{ - ARM_COMPUTE_UNUSED(n); - std::stringstream ss; - ss << "FusedConvolutionWithPostOpNode"; - _info = ss.str(); -} - void DotGraphVisitor::visit(FusedDepthwiseConvolutionBatchNormalizationNode &n) { ARM_COMPUTE_UNUSED(n); @@ -168,9 +152,9 @@ void DotGraphPrinter::print_footer(const Graph &g, std::ostream &os) void DotGraphPrinter::print_nodes(const Graph &g, std::ostream &os) { - for(const auto &n : g.nodes()) + for (const auto &n : g.nodes()) { - if(n) + if (n) { // Output node id std::string node_id = std::string("n") + support::cpp11::to_string(n->id()); @@ -182,7 +166,8 @@ void DotGraphPrinter::print_nodes(const Graph &g, std::ostream &os) std::string name = n->name().empty() ? node_id : n->name(); auto node_description = _dot_node_visitor.info(); - os << R"([label = ")" << name << R"( \n )" << n->assigned_target() << R"( \n )" << node_description << R"("])"; + os << R"([label = ")" << name << R"( \n )" << n->assigned_target() << R"( \n )" << node_description + << R"("])"; os << ";\n"; } } @@ -190,16 +175,17 @@ void DotGraphPrinter::print_nodes(const Graph &g, std::ostream &os) void DotGraphPrinter::print_edges(const Graph &g, std::ostream &os) { - for(const auto &e : g.edges()) + for (const auto &e : g.edges()) { - if(e) + if (e) { std::string source_node_id = std::string("n") + support::cpp11::to_string(e->producer_id()); std::string sink_node_id = std::string("n") + support::cpp11::to_string(e->consumer_id()); os << source_node_id << " -> " << sink_node_id << " "; const Tensor *t = e->tensor(); ARM_COMPUTE_ERROR_ON(t == nullptr); - os << R"([label = ")" << t->desc().shape << R"( \n )" << t->desc().data_type << R"( \n )" << t->desc().layout << R"("])"; + os << R"([label = ")" << t->desc().shape << R"( \n )" << t->desc().data_type << R"( \n )" + << t->desc().layout << R"("])"; os << ";\n"; } } diff --git a/src/runtime/Allocator.cpp b/src/runtime/Allocator.cpp index ef7c62d64bb55b1549f50d53b7256f332019136f..eca712dbf0d8eb030ce5c02a07b0dcb24cc1dcfd 100644 --- a/src/runtime/Allocator.cpp +++ b/src/runtime/Allocator.cpp @@ -22,9 +22,9 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/Allocator.h" -#include "arm_compute/runtime/MemoryRegion.h" #include "arm_compute/core/Error.h" +#include "arm_compute/runtime/MemoryRegion.h" #include diff --git a/src/runtime/BlobLifetimeManager.cpp b/src/runtime/BlobLifetimeManager.cpp index bea55d8eb9444bc9a16f03b8b53533ac151eec5e..8a0fc05c39ebf0240f44ead02282d07f2f05683e 100644 --- a/src/runtime/BlobLifetimeManager.cpp +++ b/src/runtime/BlobLifetimeManager.cpp @@ -35,8 +35,7 @@ namespace arm_compute { -BlobLifetimeManager::BlobLifetimeManager() - : _blobs() +BlobLifetimeManager::BlobLifetimeManager() : _blobs() { } @@ -62,33 +61,32 @@ void BlobLifetimeManager::update_blobs_and_mappings() ARM_COMPUTE_ERROR_ON(_active_group == nullptr); // Sort free blobs requirements in descending order. - _free_blobs.sort([](const Blob & ba, const Blob & bb) - { - return ba.max_size > bb.max_size; - }); + _free_blobs.sort([](const Blob &ba, const Blob &bb) { return ba.max_size > bb.max_size; }); // Create group sizes vector std::vector group_sizes; - std::transform(std::begin(_free_blobs), std::end(_free_blobs), std::back_inserter(group_sizes), [](const Blob & b) - { - return BlobInfo{ b.max_size, b.max_alignment, b.bound_elements.size() }; - }); + std::transform(std::begin(_free_blobs), std::end(_free_blobs), std::back_inserter(group_sizes), + [](const Blob &b) { + return BlobInfo{b.max_size, b.max_alignment, b.bound_elements.size()}; + }); // Update blob sizes size_t max_size = std::max(_blobs.size(), group_sizes.size()); _blobs.resize(max_size); group_sizes.resize(max_size); - std::transform(std::begin(_blobs), std::end(_blobs), std::begin(group_sizes), std::begin(_blobs), [](BlobInfo lhs, BlobInfo rhs) - { - return BlobInfo{ std::max(lhs.size, rhs.size), std::max(lhs.alignment, rhs.alignment), std::max(lhs.owners, rhs.owners) }; - }); + std::transform(std::begin(_blobs), std::end(_blobs), std::begin(group_sizes), std::begin(_blobs), + [](BlobInfo lhs, BlobInfo rhs) + { + return BlobInfo{std::max(lhs.size, rhs.size), std::max(lhs.alignment, rhs.alignment), + std::max(lhs.owners, rhs.owners)}; + }); // Calculate group mappings auto &group_mappings = _active_group->mappings(); int blob_idx = 0; - for(auto &free_blob : _free_blobs) + for (auto &free_blob : _free_blobs) { - for(auto &bound_element_id : free_blob.bound_elements) + for (auto &bound_element_id : free_blob.bound_elements) { ARM_COMPUTE_ERROR_ON(_active_elements.find(bound_element_id) == std::end(_active_elements)); Element &bound_element = _active_elements[bound_element_id]; diff --git a/src/runtime/BlobMemoryPool.cpp b/src/runtime/BlobMemoryPool.cpp index 88e280537c4190c859f0a06fac4966d65711b7e4..a2f63ef52b936e81453e3d9fa7b35ecd72003e1e 100644 --- a/src/runtime/BlobMemoryPool.cpp +++ b/src/runtime/BlobMemoryPool.cpp @@ -47,7 +47,7 @@ BlobMemoryPool::~BlobMemoryPool() void BlobMemoryPool::acquire(MemoryMappings &handles) { // Set memory to handlers - for(auto &handle : handles) + for (auto &handle : handles) { ARM_COMPUTE_ERROR_ON(handle.first == nullptr); handle.first->set_region(_blobs[handle.second].get()); @@ -56,7 +56,7 @@ void BlobMemoryPool::acquire(MemoryMappings &handles) void BlobMemoryPool::release(MemoryMappings &handles) { - for(auto &handle : handles) + for (auto &handle : handles) { ARM_COMPUTE_ERROR_ON(handle.first == nullptr); handle.first->set_region(nullptr); @@ -78,7 +78,7 @@ void BlobMemoryPool::allocate_blobs(const std::vector &blob_info) { ARM_COMPUTE_ERROR_ON(!_allocator); - for(const auto &bi : blob_info) + for (const auto &bi : 
blob_info) { _blobs.push_back(_allocator->make_region(bi.size, bi.alignment)); } diff --git a/src/runtime/CL/CLBufferAllocator.cpp b/src/runtime/CL/CLBufferAllocator.cpp index e06ef3d37d5cc87b5361226e2047419858197e6d..b4545b93bffe6ec15738a6948d4e5e79f717d902 100644 --- a/src/runtime/CL/CLBufferAllocator.cpp +++ b/src/runtime/CL/CLBufferAllocator.cpp @@ -35,7 +35,8 @@ namespace arm_compute void *CLBufferAllocator::allocate(size_t size, size_t alignment) { ARM_COMPUTE_UNUSED(alignment); - cl_mem buf{ clCreateBuffer(CLScheduler::get().context().get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size, nullptr, nullptr) }; + cl_mem buf{clCreateBuffer(CLScheduler::get().context().get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size, + nullptr, nullptr)}; return static_cast<void *>(buf); } diff --git a/src/runtime/CL/CLGEMMHeuristicsHandle.cpp b/src/runtime/CL/CLGEMMHeuristicsHandle.cpp index 7168259fcd169b3615eb86d85c9945a6990c1ccf..d680dc08bb7f74103a7b6e46ddb4179abd1b4f44 100644 --- a/src/runtime/CL/CLGEMMHeuristicsHandle.cpp +++ b/src/runtime/CL/CLGEMMHeuristicsHandle.cpp @@ -27,8 +27,7 @@ namespace arm_compute { -CLGEMMHeuristicsHandle::CLGEMMHeuristicsHandle() - : _heuristics(std::make_unique<mlgo::MLGOHeuristics>()) +CLGEMMHeuristicsHandle::CLGEMMHeuristicsHandle() : _heuristics(std::make_unique<mlgo::MLGOHeuristics>()) { } CLGEMMHeuristicsHandle::~CLGEMMHeuristicsHandle() = default; diff --git a/src/runtime/CL/CLHelpers.cpp b/src/runtime/CL/CLHelpers.cpp index 5b4bbbcde057b6fbf2b71cf1d577277b711b9a0a..eb28ecbf8d7d5674107cd4b45493d1e2e3b31ef4 100644 --- a/src/runtime/CL/CLHelpers.cpp +++ b/src/runtime/CL/CLHelpers.cpp @@ -50,34 +50,30 @@ void printf_callback(const char *buffer, unsigned int len, size_t complete, void * @return A pointer to the context properties which can be used to create an opencl context */ -void initialise_context_properties(const cl::Platform &platform, const cl::Device &device, std::array<cl_context_properties, 7> &prop) +void initialise_context_properties(const cl::Platform &platform, + const cl::Device &device, + std::array<cl_context_properties, 7> &prop) { ARM_COMPUTE_UNUSED(device); #if defined(ARM_COMPUTE_ASSERTS_ENABLED) // Query devices in the context for cl_arm_printf support - if(arm_compute::device_supports_extension(device, "cl_arm_printf")) + if (arm_compute::device_supports_extension(device, "cl_arm_printf")) { // Create a cl_context with a printf_callback and user specified buffer size. - std::array<cl_context_properties, 7> properties_printf = - { + std::array<cl_context_properties, 7> properties_printf = { CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()), // Enable a printf callback function for this context. CL_PRINTF_CALLBACK_ARM, reinterpret_cast<cl_context_properties>(printf_callback), // Request a minimum printf buffer size of 4MB for devices in the // context that support this extension. - CL_PRINTF_BUFFERSIZE_ARM, 0x1000, - 0 - }; + CL_PRINTF_BUFFERSIZE_ARM, 0x1000, 0}; prop = properties_printf; } else #endif // defined(ARM_COMPUTE_ASSERTS_ENABLED) { - std::array<cl_context_properties, 3> properties = - { - CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()), - 0 - }; + std::array<cl_context_properties, 3> properties = {CL_CONTEXT_PLATFORM, + reinterpret_cast<cl_context_properties>(platform()), 0}; std::copy(properties.begin(), properties.end(), prop.begin()); }; } @@ -91,19 +87,19 @@ cl::Platform select_preferable_platform(CLBackendType cl_backend_type) cl::Platform::get(&platforms); ARM_COMPUTE_ERROR_ON_MSG(platforms.size() == 0, "Couldn't find any OpenCL platform"); - cl::Platform selected_platform{ nullptr }; + cl::Platform selected_platform{nullptr}; // If the user has selected the Native platform, return the first available.
- switch(cl_backend_type) + switch (cl_backend_type) { case CLBackendType::Native: selected_platform = platforms[0]; break; case CLBackendType::Clvk: - for(auto p : platforms) + for (auto p : platforms) { std::string res = p.getInfo<CL_PLATFORM_NAME>(); - if(res.find("clvk") != std::string::npos) + if (res.find("clvk") != std::string::npos) { selected_platform = p; break; @@ -114,7 +110,7 @@ cl::Platform select_preferable_platform(CLBackendType cl_backend_type) ARM_COMPUTE_ERROR("Unsupported backend type"); } - if(!selected_platform()) + if (!selected_platform()) { ARM_COMPUTE_ERROR("No valid platform found"); } @@ -122,8 +118,7 @@ cl::Platform select_preferable_platform(CLBackendType cl_backend_type) return selected_platform; } -std::tuple<cl::Context, cl::Device, cl_int> -create_opencl_context_and_device(CLBackendType cl_backend_type) +std::tuple<cl::Context, cl::Device, cl_int> create_opencl_context_and_device(CLBackendType cl_backend_type) { ARM_COMPUTE_ERROR_ON(!opencl_is_available()); cl::Platform p = select_preferable_platform(cl_backend_type); @@ -131,9 +126,9 @@ create_opencl_context_and_device(CLBackendType cl_backend_type) std::vector<cl::Device> platform_devices; p.getDevices(CL_DEVICE_TYPE_DEFAULT, &platform_devices); ARM_COMPUTE_ERROR_ON_MSG(platform_devices.size() == 0, "Couldn't find any OpenCL device"); - device = platform_devices[0]; - cl_int err = CL_SUCCESS; - std::array<cl_context_properties, 7> properties = { 0, 0, 0, 0, 0, 0, 0 }; + device = platform_devices[0]; + cl_int err = CL_SUCCESS; + std::array<cl_context_properties, 7> properties = {0, 0, 0, 0, 0, 0, 0}; initialise_context_properties(p, device, properties); cl::Context cl_context = cl::Context(device, properties.data(), nullptr, nullptr, &err); ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Failed to create OpenCL context"); @@ -143,7 +138,7 @@ create_opencl_context_and_device(CLBackendType cl_backend_type) void schedule_kernel_on_ctx(CLRuntimeContext *ctx, ICLKernel *kernel, bool flush) { ARM_COMPUTE_ERROR_ON_NULLPTR(kernel); - if(ctx) + if (ctx) { ARM_COMPUTE_ERROR_ON(ctx->gpu_scheduler() == nullptr); ctx->gpu_scheduler()->enqueue(*kernel, flush); diff --git a/src/runtime/CL/CLMemory.cpp b/src/runtime/CL/CLMemory.cpp index a1743c56e6c6f9f5a1e327794219f3f106cf0572..c6ee6fde83d9e7bc1addb416f02fc806b6186f51 100644 --- a/src/runtime/CL/CLMemory.cpp +++ b/src/runtime/CL/CLMemory.cpp @@ -24,24 +24,22 @@ #include "arm_compute/runtime/CL/CLMemory.h" #include "arm_compute/core/Error.h" + #include "support/Cast.h" namespace arm_compute { -CLMemory::CLMemory() - : _region(nullptr), _region_owned(nullptr) +CLMemory::CLMemory() : _region(nullptr), _region_owned(nullptr) { } -CLMemory::CLMemory(const std::shared_ptr<ICLMemoryRegion> &memory) - : _region(nullptr), _region_owned(memory) +CLMemory::CLMemory(const std::shared_ptr<ICLMemoryRegion> &memory) : _region(nullptr), _region_owned(memory) { _region_owned = memory; _region = _region_owned.get(); } -CLMemory::CLMemory(ICLMemoryRegion *memory) - : _region(memory), _region_owned(nullptr) +CLMemory::CLMemory(ICLMemoryRegion *memory) : _region(memory), _region_owned(nullptr) { _region = memory; } @@ -78,4 +76,4 @@ void CLMemory::set_owned_region(std::unique_ptr<IMemoryRegion> region) _region_owned = utils::cast::polymorphic_downcast_unique_ptr<ICLMemoryRegion>(std::move(region)); _region = _region_owned.get(); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/CLMemoryRegion.cpp b/src/runtime/CL/CLMemoryRegion.cpp index 00f91a0ffb07fa2cc8aab64eec182def8386aee5..835958b8165d6c5bf3759ea4a75878f6eed87b32 100644 --- a/src/runtime/CL/CLMemoryRegion.cpp +++ b/src/runtime/CL/CLMemoryRegion.cpp @@ -29,10 +29,7 @@ namespace
arm_compute { ICLMemoryRegion::ICLMemoryRegion(size_t size) - : IMemoryRegion(size), - _ctx(CLScheduler::get().context()), - _mapping(nullptr), - _mem() + : IMemoryRegion(size), _ctx(CLScheduler::get().context()), _mapping(nullptr), _mem() { } @@ -57,17 +54,15 @@ std::unique_ptr ICLMemoryRegion::extract_subregion(size_t offset, return nullptr; } -CLBufferMemoryRegion::CLBufferMemoryRegion(cl_mem_flags flags, size_t size) - : ICLMemoryRegion(size) +CLBufferMemoryRegion::CLBufferMemoryRegion(cl_mem_flags flags, size_t size) : ICLMemoryRegion(size) { - if(_size != 0) + if (_size != 0) { _mem = cl::Buffer(CLScheduler::get().context(), flags, _size); } } -CLBufferMemoryRegion::CLBufferMemoryRegion(const cl::Buffer &buffer) - : ICLMemoryRegion(buffer.getInfo()) +CLBufferMemoryRegion::CLBufferMemoryRegion(const cl::Buffer &buffer) : ICLMemoryRegion(buffer.getInfo()) { _mem = buffer; } @@ -102,10 +97,10 @@ void CLBufferMemoryRegion::unmap(cl::CommandQueue &q) ICLSVMMemoryRegion::ICLSVMMemoryRegion(cl_mem_flags flags, size_t size, size_t alignment) : ICLMemoryRegion(size), _ptr(nullptr) { - if(size != 0) + if (size != 0) { _ptr = clSVMAlloc(CLScheduler::get().context().get(), flags, size, alignment); - if(_ptr != nullptr) + if (_ptr != nullptr) { _mem = cl::Buffer(CLScheduler::get().context(), CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, _size, _ptr); } @@ -114,7 +109,7 @@ ICLSVMMemoryRegion::ICLSVMMemoryRegion(cl_mem_flags flags, size_t size, size_t a ICLSVMMemoryRegion::~ICLSVMMemoryRegion() { - if(_ptr != nullptr) + if (_ptr != nullptr) { try { @@ -125,7 +120,7 @@ ICLSVMMemoryRegion::~ICLSVMMemoryRegion() _mem = cl::Buffer(); clSVMFree(_ctx.get(), _ptr); } - catch(...) + catch (...) { } } @@ -144,7 +139,8 @@ CLCoarseSVMMemoryRegion::CLCoarseSVMMemoryRegion(cl_mem_flags flags, size_t size void *CLCoarseSVMMemoryRegion::map(cl::CommandQueue &q, bool blocking) { ARM_COMPUTE_ERROR_ON(_ptr == nullptr); - clEnqueueSVMMap(q.get(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, _ptr, _size, 0, nullptr, nullptr); + clEnqueueSVMMap(q.get(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, _ptr, _size, 0, nullptr, + nullptr); _mapping = _ptr; return _mapping; } @@ -163,7 +159,7 @@ CLFineSVMMemoryRegion::CLFineSVMMemoryRegion(cl_mem_flags flags, size_t size, si void *CLFineSVMMemoryRegion::map(cl::CommandQueue &q, bool blocking) { - if(blocking) + if (blocking) { clFinish(q.get()); } diff --git a/src/runtime/CL/CLOperator.cpp b/src/runtime/CL/CLOperator.cpp index 075a544077509bffd9c0a5c8127322151d13596d..89d452003892c421c029d9d29aa3c501c412d0be 100644 --- a/src/runtime/CL/CLOperator.cpp +++ b/src/runtime/CL/CLOperator.cpp @@ -30,14 +30,13 @@ namespace arm_compute { namespace experimental { -ICLOperator::ICLOperator(IRuntimeContext *ctx) - : _kernel(), _ctx(ctx), _workspace() +ICLOperator::ICLOperator(IRuntimeContext *ctx) : _kernel(), _ctx(ctx), _workspace() { } void ICLOperator::run(ITensorPack &tensors) { - if(tensors.empty()) + if (tensors.empty()) { ARM_COMPUTE_ERROR("No inputs provided"); } diff --git a/src/runtime/CL/CLRuntimeContext.cpp b/src/runtime/CL/CLRuntimeContext.cpp index 5083b4b0c58c889826df9c262ef3e36133c23914..b426b8c3045d4bef60f42768e00b59f91fe21515 100644 --- a/src/runtime/CL/CLRuntimeContext.cpp +++ b/src/runtime/CL/CLRuntimeContext.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/CLRuntimeContext.h" + #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" @@ -29,7 +30,10 @@ namespace arm_compute { CLRuntimeContext::CLRuntimeContext() - : _gpu_owned_scheduler(std::make_unique()), _gpu_scheduler(_gpu_owned_scheduler.get()), _symbols(), _backend_type() + : _gpu_owned_scheduler(std::make_unique()), + _gpu_scheduler(_gpu_owned_scheduler.get()), + _symbols(), + _backend_type() { _symbols.load_default(); auto ctx_dev_err = create_opencl_context_and_device(_backend_type); diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp index 49fb724cdb576f5b2eb1de154fca4f9de7db9505..f0a42f55fdf141493873949c48d5ccd6044f9412 100644 --- a/src/runtime/CL/CLScheduler.cpp +++ b/src/runtime/CL/CLScheduler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2022 Arm Limited. + * Copyright (c) 2016-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/runtime/CL/CLTuner.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -81,7 +82,7 @@ cl::Event CLScheduler::enqueue_sync_event() void CLScheduler::tune_kernel_static(ICLKernel &kernel) { - if(_cl_tuner != nullptr) + if (_cl_tuner != nullptr) { _cl_tuner->tune_kernel_static(kernel); } @@ -95,8 +96,16 @@ bool CLScheduler::is_initialised() const std::once_flag CLScheduler::_initialize_symbols; CLScheduler::CLScheduler() - : _context(), _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false), _cl_tuner(nullptr), _gemm_heuristics(nullptr), _backend_type(CLBackendType::Native), _job_chaining_enabled(false), - _job_chaining_size(), _job_chaining_count(0) + : _context(), + _queue(), + _target(GPUTarget::MIDGARD), + _is_initialised(false), + _cl_tuner(nullptr), + _gemm_heuristics(nullptr), + _backend_type(CLBackendType::Native), + _job_chaining_enabled(true), + _job_chaining_size(1), + _job_chaining_count(0) { } @@ -107,9 +116,12 @@ CLScheduler &CLScheduler::get() return scheduler; } -void CLScheduler::default_init_with_context(cl::Device &device, cl::Context &ctx, ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h) +void CLScheduler::default_init_with_context(cl::Device &device, + cl::Context &ctx, + ICLTuner *cl_tuner, + CLGEMMHeuristicsHandle *gemm_h) { - if(!_is_initialised) + if (!_is_initialised) { const std::string cl_kernels_folder("./cl_kernels/"); cl::CommandQueue queue = cl::CommandQueue(ctx, device); @@ -121,7 +133,7 @@ void CLScheduler::default_init_with_context(cl::Device &device, cl::Context &ctx void CLScheduler::default_init(ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h, CLBackendType cl_backend_type) { - if(!_is_initialised) + if (!_is_initialised) { cl::Context ctx; cl::Device dev; @@ -151,7 +163,12 @@ void CLScheduler::set_context(cl::Context context) CLKernelLibrary::get().set_context(_context); } -void CLScheduler::init(cl::Context context, cl::CommandQueue queue, const cl::Device &device, ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h, CLBackendType cl_backend_type) +void CLScheduler::init(cl::Context context, + cl::CommandQueue queue, + const cl::Device &device, + ICLTuner *cl_tuner, + CLGEMMHeuristicsHandle *gemm_h, + CLBackendType cl_backend_type) { set_context(std::move(context)); _queue = std::move(queue); @@ -164,21 +181,21 @@ void CLScheduler::init(cl::Context context, cl::CommandQueue queue, const cl::De void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, 
bool flush) { - ARM_COMPUTE_ERROR_ON_MSG(!_is_initialised, - "The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \ + ARM_COMPUTE_ERROR_ON_MSG( + !_is_initialised, "The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \ or CLScheduler::get()::init() and CLKernelLibrary::get()::init() function before running functions!"); const bool inject_memory = !tensors.empty(); // Tune the kernel if the CLTuner has been provided - if(_cl_tuner != nullptr) + if (_cl_tuner != nullptr) { inject_memory ? _cl_tuner->tune_kernel_dynamic(kernel, tensors) : _cl_tuner->tune_kernel_dynamic(kernel); } // Run kernel inject_memory ? kernel.run_op(tensors, kernel.window(), _queue) : kernel.run(kernel.window(), _queue); - if(_job_chaining_enabled) + if (_job_chaining_enabled) { ++_job_chaining_count; } @@ -188,15 +205,25 @@ void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool f void CLScheduler::flush_queue(bool flush) { - if(_job_chaining_enabled) + if (_job_chaining_enabled) { - if(_job_chaining_count >= _job_chaining_size) + if (_job_chaining_count >= _job_chaining_size) { _job_chaining_count = 0; + /* + Optimisation note: Flush the queue at the first enqueue to start the GPU + execution and then incrementally saturate the clFlush calls to minimize + the CPU activity for job-scheduling. + For eg. job-chain size goes from 1, 2, 4, 8 and 16 + */ + if (_job_chaining_size < 16) + { + _job_chaining_size <<= 1; + } _queue.flush(); } } - else if(flush) + else if (flush) { _queue.flush(); } diff --git a/src/runtime/CL/CLSubTensor.cpp b/src/runtime/CL/CLSubTensor.cpp index 14936ae23c9ffe5a25ebf7f1cb8385a107aecb08..ace820bbb7afd7771ad5050d1fe6b6236441eaef 100644 --- a/src/runtime/CL/CLSubTensor.cpp +++ b/src/runtime/CL/CLSubTensor.cpp @@ -29,12 +29,14 @@ using namespace arm_compute; -CLSubTensor::CLSubTensor() - : _parent(nullptr), _info() +CLSubTensor::CLSubTensor() : _parent(nullptr), _info() { } -CLSubTensor::CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords, bool extend_parent) +CLSubTensor::CLSubTensor(ICLTensor *parent, + const TensorShape &tensor_shape, + const Coordinates &coords, + bool extend_parent) : _parent(nullptr), _info() { ARM_COMPUTE_ERROR_ON(parent == nullptr); @@ -81,7 +83,7 @@ void CLSubTensor::unmap() uint8_t *CLSubTensor::do_map(cl::CommandQueue &q, bool blocking) { ARM_COMPUTE_ERROR_ON(cl_buffer().get() == nullptr); - if(_parent->buffer() == nullptr) + if (_parent->buffer() == nullptr) { _parent->map(q, blocking); } diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp index f85b8ae777ecf17711b7a61748edcce3f1e831c9..e6457218c7342d0e5e93b8a133ace809080c6177 100644 --- a/src/runtime/CL/CLTensorAllocator.cpp +++ b/src/runtime/CL/CLTensorAllocator.cpp @@ -46,17 +46,16 @@ static IAllocator *static_global_cl_allocator = nullptr; std::unique_ptr allocate_region(size_t size, cl_uint alignment) { // Try fine-grain SVM - std::unique_ptr region = std::make_unique(CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, - size, - alignment); + std::unique_ptr region = + std::make_unique(CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, size, alignment); // Try coarse-grain SVM in case of failure - if(region != nullptr && region->ptr() == nullptr) + if (region != nullptr && region->ptr() == nullptr) { region = std::make_unique(CL_MEM_READ_WRITE, size, alignment); } // Try legacy buffer memory in case of failure - if(region != nullptr && 
region->ptr() == nullptr) + if (region != nullptr && region->ptr() == nullptr) { region = std::make_unique(CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size); } @@ -80,7 +79,10 @@ void clear_quantization_arrays(CLFloatArray &scale, CLInt32Array &offset) * @param[in] qinfo Quantization info * @param[in] pad_size Pad size to use in case array needs to be padded for computation purposes */ -void populate_quantization_info(CLFloatArray &scale, CLInt32Array &offset, const QuantizationInfo &qinfo, size_t pad_size) +void populate_quantization_info(CLFloatArray &scale, + CLInt32Array &offset, + const QuantizationInfo &qinfo, + size_t pad_size) { clear_quantization_arrays(scale, offset); @@ -90,16 +92,18 @@ void populate_quantization_info(CLFloatArray &scale, CLInt32Array &offset, const const size_t element_size = sizeof(std::remove_reference::type::value_type); scale = CLFloatArray(num_elements + pad_size); scale.resize(num_elements); - CLScheduler::get().queue().enqueueWriteBuffer(scale.cl_buffer(), CL_TRUE, 0, num_elements * element_size, qinfo.scale().data()); + CLScheduler::get().queue().enqueueWriteBuffer(scale.cl_buffer(), CL_TRUE, 0, num_elements * element_size, + qinfo.scale().data()); - if(!qinfo.offset().empty()) + if (!qinfo.offset().empty()) { // Create offset array - const std::vector &qoffset = qinfo.offset(); - const size_t offset_element_size = sizeof(std::remove_reference::type::value_type); - offset = CLInt32Array(num_elements + pad_size); + const std::vector &qoffset = qinfo.offset(); + const size_t offset_element_size = sizeof(std::remove_reference::type::value_type); + offset = CLInt32Array(num_elements + pad_size); offset.resize(num_elements); - CLScheduler::get().queue().enqueueWriteBuffer(offset.cl_buffer(), CL_TRUE, 0, num_elements * offset_element_size, qinfo.offset().data()); + CLScheduler::get().queue().enqueueWriteBuffer(offset.cl_buffer(), CL_TRUE, 0, + num_elements * offset_element_size, qinfo.offset().data()); } } } // namespace @@ -111,7 +115,7 @@ CLTensorAllocator::CLTensorAllocator(IMemoryManageable *owner, CLRuntimeContext CLQuantization CLTensorAllocator::quantization() const { - return { &_scale, &_offset }; + return {&_scale, &_offset}; } uint8_t *CLTensorAllocator::data() @@ -127,10 +131,10 @@ const cl::Buffer &CLTensorAllocator::cl_data() const void CLTensorAllocator::allocate() { // Allocate tensor backing memory - if(_associated_memory_group == nullptr) + if (_associated_memory_group == nullptr) { // Perform memory allocation - if(static_global_cl_allocator != nullptr) + if (static_global_cl_allocator != nullptr) { _memory.set_owned_region(static_global_cl_allocator->make_region(info().total_size(), 0)); } @@ -146,7 +150,7 @@ void CLTensorAllocator::allocate() } // Allocate and fill the quantization parameter arrays - if(is_data_type_quantized_per_channel(info().data_type())) + if (is_data_type_quantized_per_channel(info().data_type())) { const size_t pad_size = 0; populate_quantization_info(_scale, _offset, info().quantization_info(), pad_size); @@ -193,7 +197,7 @@ void CLTensorAllocator::set_global_allocator(IAllocator *allocator) uint8_t *CLTensorAllocator::lock() { - if(_ctx) + if (_ctx) { return map(_ctx->gpu_scheduler()->queue(), true); } @@ -206,7 +210,7 @@ uint8_t *CLTensorAllocator::lock() void CLTensorAllocator::unlock() { ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr); - if(_ctx) + if (_ctx) { unmap(_ctx->gpu_scheduler()->queue(), reinterpret_cast(_memory.region()->buffer())); } diff --git a/src/runtime/CL/CLTuner.cpp 
b/src/runtime/CL/CLTuner.cpp index 445638f01fa7c33cb1c8c926e2f215b93cded177..0d62fe3afe2e69411a5b7581ba4a1c63d40ffb15 100644 --- a/src/runtime/CL/CLTuner.cpp +++ b/src/runtime/CL/CLTuner.cpp @@ -22,10 +22,11 @@ * SOFTWARE. */ #include "arm_compute/runtime/CL/CLTuner.h" -#include "arm_compute/runtime/CL/tuners/CLTuningParametersList.h" #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/tuners/CLTuningParametersList.h" + #include "src/common/utils/Log.h" #include "src/core/CL/ICLKernel.h" #include "support/StringSupport.h" @@ -37,19 +38,23 @@ namespace arm_compute { CLTuner::CLTuner(bool tune_new_kernels, CLTuningInfo tuning_info) - : real_clEnqueueNDRangeKernel(nullptr), _tuning_params_table(), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels), _tuning_info(tuning_info) + : real_clEnqueueNDRangeKernel(nullptr), + _tuning_params_table(), + _lws_table(), + _kernel_event(), + _tune_new_kernels(tune_new_kernels), + _tuning_info(tuning_info) { } struct CLTuner::IKernelData { - virtual ~IKernelData() = default; + virtual ~IKernelData() = default; virtual void do_run(ICLKernel &kernel, cl::CommandQueue &queue) = 0; }; struct DefaultKernelData : public CLTuner::IKernelData { - DefaultKernelData(ITensorPack &tensors) - : _tensors{ tensors } + DefaultKernelData(ITensorPack &tensors) : _tensors{tensors} { } ~DefaultKernelData() override = default; @@ -100,16 +105,17 @@ void CLTuner::tune_kernel_dynamic(ICLKernel &kernel) void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data) { // Get the configuration ID from the kernel and append GPU target name and number of available compute units - const std::string config_id = kernel.config_id() + "_" + string_from_target(kernel.get_target()) + "_MP" + support::cpp11::to_string(CLKernelLibrary::get().get_num_compute_units()); + const std::string config_id = kernel.config_id() + "_" + string_from_target(kernel.get_target()) + "_MP" + + support::cpp11::to_string(CLKernelLibrary::get().get_num_compute_units()); // Check if we need to find the Optimal LWS. 
If the kernel's config_id is equal to default_config_id, the kernel does not require to be tuned - if(kernel.config_id() != arm_compute::default_config_id) + if (kernel.config_id() != arm_compute::default_config_id) { auto p = _tuning_params_table.find(config_id); - if(p == _tuning_params_table.end()) + if (p == _tuning_params_table.end()) { - if(_tune_new_kernels) + if (_tune_new_kernels) { // Find the optimal LWS for the kernel CLTuningParams opt_tuning_params = find_optimal_tuning_params(kernel, data); @@ -119,7 +125,7 @@ void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data) // Set Local-Workgroup-Size kernel.set_lws_hint(opt_tuning_params.get_lws()); - if(_tuning_info.tune_wbsm) + if (_tuning_info.tune_wbsm) { kernel.set_wbsm_hint(opt_tuning_params.get_wbsm()); } @@ -129,7 +135,7 @@ void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data) { // Set Local-Workgroup-Size kernel.set_lws_hint(p->second.get_lws()); - if(_tuning_info.tune_wbsm) + if (_tuning_info.tune_wbsm) { kernel.set_wbsm_hint(p->second.get_wbsm()); } @@ -138,7 +144,7 @@ void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data) } void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) { - DefaultKernelData data{ tensors }; + DefaultKernelData data{tensors}; do_tune_kernel_dynamic(kernel, &data); } @@ -154,7 +160,7 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat cl::CommandQueue queue_profiler; // Extract real OpenCL function to intercept - if(real_clEnqueueNDRangeKernel == nullptr) + if (real_clEnqueueNDRangeKernel == nullptr) { real_clEnqueueNDRangeKernel = CLSymbols::get().clEnqueueNDRangeKernel_ptr; } @@ -165,7 +171,7 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat // Check if we can use the OpenCL timer with the default queue cl_command_queue_properties props = default_queue.getInfo(); - if((props & CL_QUEUE_PROFILING_ENABLE) == 0) + if ((props & CL_QUEUE_PROFILING_ENABLE) == 0) { // Set the queue for profiling queue_profiler = cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE); @@ -176,21 +182,23 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat } // Start intercepting enqueues: - auto interceptor = [this](cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, cl_event * event) + auto interceptor = [this](cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, + const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event) { - if(this->kernel_event_is_set()) + if (this->kernel_event_is_set()) { // If the event is already set it means the kernel enqueue is sliced: given that we only time the first slice we can save time by skipping the other enqueues. 
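// [Editor's note — illustrative sketch, not part of the patch.] The interceptor above replaces
// clEnqueueNDRangeKernel so the tuner can capture the cl_event of the first slice of each
// candidate enqueue; the execution time compared later in this function ("if (diff < min_exec_time)")
// comes from that profiled event. Assuming the command queue was created with
// CL_QUEUE_PROFILING_ENABLE (as the code above arranges), reading the elapsed time typically looks
// like the sketch below; the helper name profiled_time_ns_sketch and its use of the raw C API are
// illustrative only, not a claim about this file's exact implementation.
cl_ulong profiled_time_ns_sketch(cl_event ev)
{
    cl_ulong t_start = 0, t_end = 0;
    // Standard OpenCL event-profiling queries; valid only on queues created with profiling enabled.
    clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START, sizeof(t_start), &t_start, nullptr);
    clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END, sizeof(t_end), &t_end, nullptr);
    return t_end - t_start; // nanoseconds; the LWS/WBSM candidate with the smallest value wins
}
// [End of editor's sketch.]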
return CL_SUCCESS; } cl_event tmp; - cl_int retval = this->real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp); + cl_int retval = this->real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws, + num_events_in_wait_list, event_wait_list, &tmp); // Set OpenCL event this->set_cl_kernel_event(tmp); - if(event != nullptr) + if (event != nullptr) { //return cl_event from the intercepted call clRetainEvent(tmp); @@ -209,9 +217,10 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat /// This is only a temporary workaround. An ideal solution involves decoupling the execution window from run() / run_op() /// Please see COMPMID-5934 cl::NDRange gws = kernel.get_cached_gws(); - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, - "[CLTuner] Kernel with config_id '%s' uses %s as the upper-bound for lws search", - kernel.config_id().c_str(), to_string(gws).c_str()); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL( + arm_compute::logging::LogLevel::INFO, + "[CLTuner] Kernel with config_id '%s' uses %s as the upper-bound for lws search", kernel.config_id().c_str(), + to_string(gws).c_str()); queue_profiler.finish(); @@ -224,7 +233,7 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat // Construct the list of tuning parameters values to be tested based on the tuner mode. auto tuning_list = cl_tuner::get_tuning_parameters_list(_tuning_info, gws); - for(size_t i = 0; i < tuning_list->size(); ++i) + for (size_t i = 0; i < tuning_list->size(); ++i) { CLTuningParams tuning_test = (*tuning_list)[i]; // Setting the lws @@ -234,19 +243,18 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat auto z = lws_test[2]; const bool invalid_lws = (x * y * z > kernel.get_max_workgroup_size()) || (x == 1 && y == 1 && z == 1); - if(invalid_lws) + if (invalid_lws) { continue; } kernel.set_lws_hint(lws_test); - if(_tuning_info.tune_wbsm && CLKernelLibrary::get().is_wbsm_supported()) + if (_tuning_info.tune_wbsm && CLKernelLibrary::get().is_wbsm_supported()) { cl_int wbsm_test = tuning_test.get_wbsm(); kernel.set_wbsm_hint(wbsm_test); } - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, - "[CLTuner] Trying LWS: %s, WBSM: %d", + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "[CLTuner] Trying LWS: %s, WBSM: %d", to_string(kernel.lws_hint()).c_str(), kernel.wbsm_hint()); // Run the kernel @@ -260,11 +268,11 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat _kernel_event = nullptr; // Check the execution time - if(diff < min_exec_time) + if (diff < min_exec_time) { min_exec_time = diff; opt_tuning_params.set_lws(tuning_test.get_lws()); - if(_tuning_info.tune_wbsm) + if (_tuning_info.tune_wbsm) { opt_tuning_params.set_wbsm(tuning_test.get_wbsm()); } @@ -292,30 +300,30 @@ void CLTuner::load_from_file(const std::string &filename) std::ifstream fs; fs.exceptions(std::ifstream::badbit); fs.open(filename, std::ios::in); - if(!fs.is_open()) + if (!fs.is_open()) { ARM_COMPUTE_ERROR_VAR("Failed to open '%s' (%s [%d])", filename.c_str(), strerror(errno), errno); } std::string line; bool header_line = true; - while(!std::getline(fs, line).fail()) + while (!std::getline(fs, line).fail()) { - if(header_line) + if (header_line) { header_line = false; size_t pos_lws = line.find("lws"); size_t pos_wbsm = line.find("wbsm"); _tuning_info.tune_wbsm = false; - if(pos_lws != 
std::string::npos || pos_wbsm != std::string::npos) + if (pos_lws != std::string::npos || pos_wbsm != std::string::npos) { // The file has in the first line the parameters it has been tuned on - if(pos_wbsm != std::string::npos) + if (pos_wbsm != std::string::npos) { _tuning_info.tune_wbsm = true; } // Once the line with the tuning parameter is read we can // read the next one to start collecting the values - if(std::getline(fs, line).fail()) + if (std::getline(fs, line).fail()) { break; } @@ -324,13 +332,13 @@ void CLTuner::load_from_file(const std::string &filename) CLTuningParams tuning_params; size_t pos = line.find(";"); - if(pos == std::string::npos) + if (pos == std::string::npos) { ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str()); } std::string kernel_id = line.substr(0, pos); line.erase(0, pos + 1); - if(!tuning_params.from_string(_tuning_info, line)) + if (!tuning_params.from_string(_tuning_info, line)) { ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str()); } @@ -341,7 +349,7 @@ void CLTuner::load_from_file(const std::string &filename) bool CLTuner::save_to_file(const std::string &filename) const { - if(!_tune_new_kernels || _tuning_params_table.empty() || filename.empty()) + if (!_tune_new_kernels || _tuning_params_table.empty() || filename.empty()) { return false; } @@ -350,16 +358,16 @@ bool CLTuner::save_to_file(const std::string &filename) const fs.open(filename, std::ios::out); std::string header_string = ""; header_string += "lws"; - if(_tuning_info.tune_wbsm) + if (_tuning_info.tune_wbsm) { - if(!header_string.empty()) + if (!header_string.empty()) { header_string += " "; } header_string += "wbsm"; } fs << header_string << std::endl; - for(auto const &kernel_data : _tuning_params_table) + for (auto const &kernel_data : _tuning_params_table) { CLTuningParams tun_pams(kernel_data.second); fs << kernel_data.first << tun_pams.to_string(_tuning_info) << std::endl; diff --git a/src/runtime/CL/ICLSimpleFunction.cpp b/src/runtime/CL/ICLSimpleFunction.cpp index 45305377892e3ed2e84739dcbd2d220f5aa789ee..bc782c3a2cc111c5022b2ea83ffa24e68298e146 100644 --- a/src/runtime/CL/ICLSimpleFunction.cpp +++ b/src/runtime/CL/ICLSimpleFunction.cpp @@ -26,15 +26,14 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/core/CL/ICLKernel.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" using namespace arm_compute; ICLSimpleFunction::ICLSimpleFunction(CLRuntimeContext *ctx) // NOLINT - : _kernel(), - _border_handler(std::make_unique()), - _ctx(ctx) + : _kernel(), _border_handler(std::make_unique()), _ctx(ctx) { } diff --git a/src/runtime/CL/Utils.cpp b/src/runtime/CL/Utils.cpp index da3d4850bf7bf514d6a90581f1aa962ab2dfd874..294396c28a0ed75c568309fd884448ce09d2951a 100644 --- a/src/runtime/CL/Utils.cpp +++ b/src/runtime/CL/Utils.cpp @@ -35,20 +35,20 @@ namespace arm_compute void restore_program_cache_from_file(const std::string &filename) { std::ifstream cache_file(filename, std::ios::binary); - if(cache_file.is_open()) + if (cache_file.is_open()) { - if(!CLScheduler::get().is_initialised()) + if (!CLScheduler::get().is_initialised()) { arm_compute::CLScheduler::get().default_init(); } - while(!cache_file.eof()) + while (!cache_file.eof()) { size_t name_len = 0; size_t binary_len = 0; cache_file.read(reinterpret_cast(&name_len), sizeof(size_t)); cache_file.read(reinterpret_cast(&binary_len), sizeof(size_t)); - if(name_len == 0 || 
binary_len == 0) + if (name_len == 0 || binary_len == 0) { break; } @@ -60,7 +60,7 @@ void restore_program_cache_from_file(const std::string &filename) tmp.resize(binary_len); cache_file.read(reinterpret_cast(binary.data()), binary_len); cl::Context context = arm_compute::CLScheduler::get().context(); - cl::Program::Binaries binaries{ binary }; + cl::Program::Binaries binaries{binary}; std::vector devices = context.getInfo(); cl::Program program(context, devices, binaries); program.build(); @@ -72,12 +72,12 @@ void restore_program_cache_from_file(const std::string &filename) void save_program_cache_to_file(const std::string &filename) { - if(CLScheduler::get().is_initialised()) + if (CLScheduler::get().is_initialised()) { std::ofstream cache_file(filename, std::ios::binary); - if(cache_file.is_open()) + if (cache_file.is_open()) { - for(const auto &it : CLKernelLibrary::get().get_built_programs()) + for (const auto &it : CLKernelLibrary::get().get_built_programs()) { std::vector> binaries = it.second.getInfo(); ARM_COMPUTE_ERROR_ON(binaries.size() != 1); diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp index f324b1a68ce5d8563c65c1ad4482a673d9a45476..c035644e4ad3c3a9f6c56be0dedc73312b1733b9 100644 --- a/src/runtime/CL/functions/CLActivationLayer.cpp +++ b/src/runtime/CL/functions/CLActivationLayer.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/CL/CLRuntimeContext.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClActivation.h" @@ -35,18 +36,17 @@ namespace arm_compute { struct CLActivationLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - CLRuntimeContext *ctx{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + CLRuntimeContext *ctx{nullptr}; + std::unique_ptr op{nullptr}; }; -CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx) - : _impl(std::make_unique()) +CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx) : _impl(std::make_unique()) { _impl->ctx = ctx; } -CLActivationLayer::CLActivationLayer(CLActivationLayer &&) = default; +CLActivationLayer::CLActivationLayer(CLActivationLayer &&) = default; CLActivationLayer &CLActivationLayer::operator=(CLActivationLayer &&) = default; CLActivationLayer::~CLActivationLayer() = default; @@ -55,7 +55,10 @@ void CLActivationLayer::configure(ICLTensor *input, ICLTensor *output, Activatio configure(CLKernelLibrary::get().get_compile_context(), input, output, act_info); } -void CLActivationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info) +void CLActivationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + ActivationLayerInfo act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); @@ -66,7 +69,8 @@ void CLActivationLayer::configure(const CLCompileContext &compile_context, ICLTe _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), act_info); } -Status CLActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status +CLActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) { return opencl::ClActivation::validate(input, output, act_info); } diff --git a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp 
b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp index b30d73902537ced5df25b64844e0face7a5207e9..f9bbd31e8a7a392ef3571bd0f0910c52b495c125 100644 --- a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp +++ b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp @@ -27,31 +27,39 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/CLValidate.h" #include "src/core/CL/kernels/CLArgMinMaxLayerKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/runtime/Utils.h" -#include "src/common/utils/Log.h" - namespace arm_compute { CLArgMinMaxLayer::CLArgMinMaxLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _not_reshaped_output(), _arg_min_max_kernel(), _reshape(), _reduction_axis() + : _memory_group(std::move(memory_manager)), + _not_reshaped_output(), + _arg_min_max_kernel(), + _reshape(), + _reduction_axis() { } CLArgMinMaxLayer::~CLArgMinMaxLayer() = default; -Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op) +Status +CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid reduction operation"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast(TensorShape::num_max_dimensions), "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, + "Invalid reduction operation"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast(TensorShape::num_max_dimensions), + "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); DataType output_data_type = DataType::S32; @@ -59,17 +67,18 @@ Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITen const auto input_num_channles = input->num_channels(); const auto input_qinfo = input->quantization_info(); - if(output->total_size() != 0) + if (output->total_size() != 0) { output_data_type = output->data_type(); - const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, false)); + const TensorInfo expected_output_shape = output->clone()->set_tensor_shape( + arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, false)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output); } auto shape_before_reshape = input->tensor_shape(); shape_before_reshape.set(axis, 1); - auto initialize_tensorinfo = [](TensorInfo & ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo) - { + auto initialize_tensorinfo = [](TensorInfo &ti, 
TensorShape shape, DataType data_type, int num_channels, + QuantizationInfo qinfo) { ti.set_data_type(data_type).set_tensor_shape(shape).set_num_channels(num_channels).set_quantization_info(qinfo); }; @@ -85,20 +94,36 @@ void CLArgMinMaxLayer::configure(const ICLTensor *input, int axis, ICLTensor *ou configure(CLKernelLibrary::get().get_compile_context(), input, axis, output, op); } -void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op) +void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + int axis, + ICLTensor *output, + const ReductionOperation &op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, axis, output, op); _reduction_axis = axis; - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); - DataType output_data_type = (output->info()->data_type() == DataType::UNKNOWN) ? DataType::S32 : output->info()->data_type(); - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true)); - - TensorShape not_reshaped_output_shape{ input->info()->tensor_shape() }; + const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); + DataType output_data_type = + (output->info()->data_type() == DataType::UNKNOWN) ? DataType::S32 : output->info()->data_type(); + auto_init_if_empty(*output->info(), input->info() + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); + + TensorShape not_reshaped_output_shape{input->info()->tensor_shape()}; not_reshaped_output_shape.set(axis, 1); - auto_init_if_empty(*_not_reshaped_output.info(), input->info()->clone()->set_tensor_shape(not_reshaped_output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true)); + auto_init_if_empty(*_not_reshaped_output.info(), input->info() + ->clone() + ->set_tensor_shape(not_reshaped_output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); _arg_min_max_kernel = std::make_unique(); _arg_min_max_kernel->configure(compile_context, input, &_not_reshaped_output, axis, op); diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp index e8affc0853c2e4e4b22e70165aba993e069c41be..0c371c41717b717e969865ad88e9170a031480f7 100644 --- a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp @@ -30,9 +30,8 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h" namespace arm_compute { @@ -43,24 +42,40 @@ CLBatchNormalizationLayer::CLBatchNormalizationLayer() CLBatchNormalizationLayer::~CLBatchNormalizationLayer() = default; -void CLBatchNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon, +void CLBatchNormalizationLayer::configure(ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor 
*beta, + const ICLTensor *gamma, + float epsilon, ActivationLayerInfo act_info) { configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, var, beta, gamma, epsilon, act_info); } -void CLBatchNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, - const ICLTensor *gamma, float epsilon, - ActivationLayerInfo act_info) +void CLBatchNormalizationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor *beta, + const ICLTensor *gamma, + float epsilon, + ActivationLayerInfo act_info) { ARM_COMPUTE_LOG_PARAMS(input, output, mean, var, beta, gamma, epsilon, act_info); _norm_kernel->configure(compile_context, input, output, mean, var, beta, gamma, epsilon, act_info); } -Status CLBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta, const ITensorInfo *gamma, - float epsilon, ActivationLayerInfo act_info) +Status CLBatchNormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta, + const ITensorInfo *gamma, + float epsilon, + ActivationLayerInfo act_info) { return CLBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info); } @@ -69,4 +84,4 @@ void CLBatchNormalizationLayer::run() { CLScheduler::get().enqueue(*_norm_kernel, true); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp index d7a409128d86458ce2826027d33b8661f83ffbce..a3798daf61bd3047c988516e382b1c96bbab6d8e 100644 --- a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp +++ b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp @@ -30,14 +30,12 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h" namespace arm_compute { -CLBatchToSpaceLayer::CLBatchToSpaceLayer() - : _batch_to_space_kernel(std::make_unique()) +CLBatchToSpaceLayer::CLBatchToSpaceLayer() : _batch_to_space_kernel(std::make_unique()) { } @@ -49,29 +47,43 @@ void CLBatchToSpaceLayer::configure(const ICLTensor *input, const ICLTensor *blo _batch_to_space_kernel->configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output); } -void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output) +void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + ICLTensor *output) { ARM_COMPUTE_LOG_PARAMS(input, block_shape, output); _batch_to_space_kernel->configure(compile_context, input, block_shape, output); } -void CLBatchToSpaceLayer::configure(const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info) +void CLBatchToSpaceLayer::configure( + const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info) { configure(CLKernelLibrary::get().get_compile_context(), input, 
block_shape_x, block_shape_y, output, crop_info); } -void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info) +void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + int32_t block_shape_x, + int32_t block_shape_y, + ICLTensor *output, + const CropInfo &crop_info) { ARM_COMPUTE_LOG_PARAMS(input, block_shape_x, block_shape_y, output); _batch_to_space_kernel->configure(compile_context, input, block_shape_x, block_shape_y, output, crop_info); } -Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) +Status +CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) { return CLBatchToSpaceLayerKernel::validate(input, block_shape, output); } -Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info) +Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, + int32_t block_shape_x, + int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info) { return CLBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output, crop_info); } diff --git a/src/runtime/CL/functions/CLBitwiseAnd.cpp b/src/runtime/CL/functions/CLBitwiseAnd.cpp index a4712ed3f1c2f568c875c04e7e7ccc1fe9fe0ce5..7bfd0e3677f0b9b39a288e00d6cb5efc1a3c0ea0 100644 --- a/src/runtime/CL/functions/CLBitwiseAnd.cpp +++ b/src/runtime/CL/functions/CLBitwiseAnd.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h" -#include "src/core/CL/kernels/CLBitwiseKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBitwiseKernel.h" #include @@ -36,11 +35,14 @@ void CLBitwiseAnd::configure(const ICLTensor *input1, const ICLTensor *input2, I configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); } -void CLBitwiseAnd::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +void CLBitwiseAnd::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output) { ARM_COMPUTE_LOG_PARAMS(input1, input2, output); auto k = std::make_unique(); k->configure(compile_context, input1, input2, output, BitwiseOperation::AND); _kernel = std::move(k); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBitwiseNot.cpp b/src/runtime/CL/functions/CLBitwiseNot.cpp index 5964b9244759893187eeac6961f58ba77aec8b6e..9763915c02e173b078f944d72f05d7d6df448352 100644 --- a/src/runtime/CL/functions/CLBitwiseNot.cpp +++ b/src/runtime/CL/functions/CLBitwiseNot.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseNot.h" -#include "src/core/CL/kernels/CLBitwiseKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBitwiseKernel.h" #include @@ -43,4 +42,4 @@ void CLBitwiseNot::configure(const CLCompileContext &compile_context, const ICLT k->configure(compile_context, input, nullptr, output, BitwiseOperation::NOT); _kernel = std::move(k); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBitwiseOr.cpp b/src/runtime/CL/functions/CLBitwiseOr.cpp index 
a07bf17bb29e9b16619b498f637d0e0dcd509863..dd3171b982cdb6c50d37a4119ebf721fff7c4502 100644 --- a/src/runtime/CL/functions/CLBitwiseOr.cpp +++ b/src/runtime/CL/functions/CLBitwiseOr.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseOr.h" -#include "src/core/CL/kernels/CLBitwiseKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBitwiseKernel.h" #include @@ -36,11 +35,14 @@ void CLBitwiseOr::configure(const ICLTensor *input1, const ICLTensor *input2, IC configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); } -void CLBitwiseOr::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +void CLBitwiseOr::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output) { ARM_COMPUTE_LOG_PARAMS(input1, input2, output); auto k = std::make_unique(); k->configure(compile_context, input1, input2, output, BitwiseOperation::OR); _kernel = std::move(k); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBitwiseXor.cpp b/src/runtime/CL/functions/CLBitwiseXor.cpp index f65e2e406c54ccc3978251b70e2fbda4b1adcf18..5bee4b37ec6a7d7d7029cbe9ecffb29b1700088a 100644 --- a/src/runtime/CL/functions/CLBitwiseXor.cpp +++ b/src/runtime/CL/functions/CLBitwiseXor.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseXor.h" -#include "src/core/CL/kernels/CLBitwiseKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBitwiseKernel.h" #include @@ -36,7 +35,10 @@ void CLBitwiseXor::configure(const ICLTensor *input1, const ICLTensor *input2, I configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); } -void CLBitwiseXor::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +void CLBitwiseXor::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output) { ARM_COMPUTE_LOG_PARAMS(input1, input2, output); auto k = std::make_unique(); diff --git a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp index 48583bfaf3bbb8e446d91e85ddb336e4e899f350..76e626fd75fb5dfe390991beccdfc9d385c0efc0 100644 --- a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp +++ b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp @@ -23,18 +23,24 @@ */ #include "arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h" -#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h" namespace arm_compute { -void CLBoundingBoxTransform::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info) +void CLBoundingBoxTransform::configure(const ICLTensor *boxes, + ICLTensor *pred_boxes, + const ICLTensor *deltas, + const BoundingBoxTransformInfo &info) { configure(CLKernelLibrary::get().get_compile_context(), boxes, pred_boxes, deltas, info); } -void CLBoundingBoxTransform::configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info) +void CLBoundingBoxTransform::configure(const CLCompileContext &compile_context, + const ICLTensor *boxes, + ICLTensor *pred_boxes, + const 
ICLTensor *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_LOG_PARAMS(boxes, pred_boxes, deltas, info); @@ -44,7 +50,10 @@ void CLBoundingBoxTransform::configure(const CLCompileContext &compile_context, _kernel = std::move(k); } -Status CLBoundingBoxTransform::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info) +Status CLBoundingBoxTransform::validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info) { return CLBoundingBoxTransformKernel::validate(boxes, pred_boxes, deltas, info); } diff --git a/src/runtime/CL/functions/CLCast.cpp b/src/runtime/CL/functions/CLCast.cpp index 10f7cc2065e1b22c2ab0b0432900214d02844d95..42ec8f7ee0827863460d0c418e39b4b465a70768 100644 --- a/src/runtime/CL/functions/CLCast.cpp +++ b/src/runtime/CL/functions/CLCast.cpp @@ -26,10 +26,10 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Validate.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClCast.h" #include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClCast.h" #include @@ -37,16 +37,15 @@ namespace arm_compute { struct CLCast::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLCast::CLCast() - : _impl(std::make_unique()) +CLCast::CLCast() : _impl(std::make_unique()) { } -CLCast::CLCast(CLCast &&) = default; +CLCast::CLCast(CLCast &&) = default; CLCast &CLCast::operator=(CLCast &&) = default; CLCast::~CLCast() = default; @@ -55,7 +54,10 @@ void CLCast::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy configure(CLKernelLibrary::get().get_compile_context(), input, output, policy); } -void CLCast::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy) +void CLCast::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + ConvertPolicy policy) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, output, policy); @@ -74,7 +76,7 @@ Status CLCast::validate(const ITensorInfo *input, const ITensorInfo *output, Con void CLCast::run() { - ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } }; + ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}}; _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp index 021f28f238fd734d588a1559a883d72b7068868d..1ee4789816c7063f2a5719b7f695f949de68539a 100644 --- a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp +++ b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp @@ -24,9 +24,9 @@ #include "arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h" #include "arm_compute/core/Types.h" -#include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h" namespace arm_compute { @@ -35,7 +35,10 @@ void CLChannelShuffleLayer::configure(const ICLTensor *input, ICLTensor *output, configure(CLKernelLibrary::get().get_compile_context(), input, output, num_groups); } -void CLChannelShuffleLayer::configure(const CLCompileContext &compile_context, const ICLTensor 
*input, ICLTensor *output, unsigned int num_groups) +void CLChannelShuffleLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + unsigned int num_groups) { ARM_COMPUTE_LOG_PARAMS(input, output, num_groups); auto k = std::make_unique(); diff --git a/src/runtime/CL/functions/CLComparison.cpp b/src/runtime/CL/functions/CLComparison.cpp index 192a266f0fb419bf968aa89e6bc76191480ddd9a..2f54371e88d30f78c72b1f8831e5dece3fdb8adf 100644 --- a/src/runtime/CL/functions/CLComparison.cpp +++ b/src/runtime/CL/functions/CLComparison.cpp @@ -25,10 +25,10 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" -#include "src/core/CL/kernels/CLComparisonKernel.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLComparisonKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" namespace arm_compute { @@ -37,25 +37,33 @@ void CLComparison::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *ou configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, operation); } -void CLComparison::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ComparisonOperation operation) +void CLComparison::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + ComparisonOperation operation) { ARM_COMPUTE_LOG_PARAMS(input2, input2, output, operation); auto k = std::make_unique(); k->configure(compile_context, input1, input2, output, operation); _kernel = std::move(k); - if(output->info()->dimension(0) > 1) + if (output->info()->dimension(0) > 1) { ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; - if(broadcasted_info->info()->dimension(0) == 1) + if (broadcasted_info->info()->dimension(0) == 1) { - _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), + BorderMode::REPLICATE); } } } -Status CLComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation) +Status CLComparison::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ComparisonOperation operation) { return CLComparisonKernel::validate(input1, input2, output, operation); } @@ -67,25 +75,30 @@ void CLComparisonStatic::configure(ICLTensor *input1, ICLTensor *input2, IC } template -void CLComparisonStatic::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output) +void CLComparisonStatic::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output) { auto k = std::make_unique(); k->configure(compile_context, input1, input2, output, COP); _kernel = std::move(k); - if(output->info()->dimension(0) > 1) + if (output->info()->dimension(0) > 1) { ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? 
input1 : input2; - if(broadcasted_info->info()->dimension(0) == 1) + if (broadcasted_info->info()->dimension(0) == 1) { - _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), + BorderMode::REPLICATE); } } } template -Status CLComparisonStatic::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +Status +CLComparisonStatic::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) { return CLComparisonKernel::validate(input1, input2, output, COP); } diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp index 0a8884f4e3692d70124777dd6026d8757a25ecca..9df1c34593581f8cb7aa4312a0de83e271998999 100644 --- a/src/runtime/CL/functions/CLConcatenateLayer.cpp +++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp @@ -24,24 +24,23 @@ #include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClConcatenate.h" #include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClConcatenate.h" namespace arm_compute { struct CLConcatenateLayer::Impl { std::vector srcs{}; - ICLTensor *dst{ nullptr }; - unsigned int num_inputs{ 0 }; - unsigned int axis{ 0 }; - std::unique_ptr op{ nullptr }; + ICLTensor *dst{nullptr}; + unsigned int num_inputs{0}; + unsigned int axis{0}; + std::unique_ptr op{nullptr}; }; -CLConcatenateLayer::CLConcatenateLayer() - : _impl(std::make_unique()) +CLConcatenateLayer::CLConcatenateLayer() : _impl(std::make_unique()) { } @@ -56,7 +55,10 @@ void CLConcatenateLayer::configure(std::vector &inputs_vector configure(CLKernelLibrary::get().get_compile_context(), inputs_vector, output, axis); } -void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std::vector &inputs_vector, ICLTensor *output, size_t axis) +void CLConcatenateLayer::configure(const CLCompileContext &compile_context, + std::vector &inputs_vector, + ICLTensor *output, + size_t axis) { ARM_COMPUTE_ERROR_ON(output == nullptr); ARM_COMPUTE_LOG_PARAMS(inputs_vector, output, axis); @@ -68,7 +70,7 @@ void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std: _impl->op = std::make_unique(); std::vector inputs_vector_info; - for(unsigned int i = 0; i < inputs_vector.size(); ++i) + for (unsigned int i = 0; i < inputs_vector.size(); ++i) { ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i)); inputs_vector_info.emplace_back(inputs_vector.at(i)->info()); @@ -76,7 +78,9 @@ void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std: _impl->op->configure(compile_context, inputs_vector_info, _impl->dst->info(), axis); } -Status CLConcatenateLayer::validate(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis) +Status CLConcatenateLayer::validate(const std::vector &inputs_vector, + const ITensorInfo *output, + size_t axis) { return opencl::ClConcatenate::validate(inputs_vector, output, axis); } @@ -84,7 +88,7 @@ Status CLConcatenateLayer::validate(const std::vector &inpu void CLConcatenateLayer::run() { ITensorPack pack; - for(unsigned i = 0; i < _impl->num_inputs; ++i) + for (unsigned i = 0; i < _impl->num_inputs; ++i) { pack.add_tensor(TensorType::ACL_SRC_VEC + i, _impl->srcs.at(i)); } diff --git a/src/runtime/CL/functions/CLConv3D.cpp 
b/src/runtime/CL/functions/CLConv3D.cpp index 729b973b6a3fdb5a959ab5dd2dde70caaaf05fee..9d1b368f72d6d61b391fe963bbc17e62d1a4e5cf 100644 --- a/src/runtime/CL/functions/CLConv3D.cpp +++ b/src/runtime/CL/functions/CLConv3D.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/CL/functions/CLConv3D.h" #include "arm_compute/core/CL/ICLTensor.h" + #include "src/gpu/cl/operators/ClDirectConv3d.h" namespace arm_compute @@ -32,29 +33,38 @@ using namespace arm_compute::experimental; struct CLConv3D::Impl { - const ICLTensor *src{ nullptr }; - const ICLTensor *weights{ nullptr }; - const ICLTensor *biases{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLConv3D::CLConv3D() - : _impl(std::make_unique()) +CLConv3D::CLConv3D() : _impl(std::make_unique()) { } CLConv3D::~CLConv3D() = default; -void CLConv3D::configure(const ICLTensor *src, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *dst, const Conv3dInfo &conv3d_info) +void CLConv3D::configure(const ICLTensor *src, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *dst, + const Conv3dInfo &conv3d_info) { configure(CLKernelLibrary::get().get_compile_context(), src, weights, biases, dst, conv3d_info); } -void CLConv3D::configure(const CLCompileContext &compile_context, const ICLTensor *src, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *dst, const Conv3dInfo &conv3d_info) +void CLConv3D::configure(const CLCompileContext &compile_context, + const ICLTensor *src, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *dst, + const Conv3dInfo &conv3d_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(CLConv3D::validate(src->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), dst->info(), conv3d_info)); + ARM_COMPUTE_ERROR_THROW_ON(CLConv3D::validate( + src->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), dst->info(), conv3d_info)); _impl->src = src; _impl->weights = weights; @@ -62,10 +72,15 @@ void CLConv3D::configure(const CLCompileContext &compile_context, const ICLTenso _impl->dst = dst; _impl->op = std::make_unique(); - _impl->op->configure(compile_context, _impl->src->info(), _impl->weights->info(), _impl->biases ? _impl->biases->info() : nullptr, _impl->dst->info(), conv3d_info); + _impl->op->configure(compile_context, _impl->src->info(), _impl->weights->info(), + _impl->biases ? 
_impl->biases->info() : nullptr, _impl->dst->info(), conv3d_info); } -Status CLConv3D::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv3dInfo &conv3d_info) +Status CLConv3D::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv3dInfo &conv3d_info) { return opencl::ClDirectConv3d::validate(src, weights, biases, dst, conv3d_info); } diff --git a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp index b3efe5c8a0b086beb24306d9ec9cf320da863887..2298f2a6699b4650d477a654f82721c6b9ee4d68 100644 --- a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp +++ b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp @@ -27,33 +27,37 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h" #include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h" namespace arm_compute { struct CLConvertFullyConnectedWeights::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLConvertFullyConnectedWeights::CLConvertFullyConnectedWeights() - : _impl(std::make_unique()) +CLConvertFullyConnectedWeights::CLConvertFullyConnectedWeights() : _impl(std::make_unique()) { } CLConvertFullyConnectedWeights::~CLConvertFullyConnectedWeights() = default; -void CLConvertFullyConnectedWeights::configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, - DataLayout data_layout) +void CLConvertFullyConnectedWeights::configure(const ICLTensor *input, + ICLTensor *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { configure(CLKernelLibrary::get().get_compile_context(), input, output, original_input_shape, data_layout); } -void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, - DataLayout data_layout) +void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, output, original_input_shape, data_layout); @@ -63,8 +67,10 @@ void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_c _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), original_input_shape, data_layout); } -Status CLConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, - DataLayout data_layout) +Status CLConvertFullyConnectedWeights::validate(const ITensorInfo *input, + const ITensorInfo *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { return opencl::ClConvertFullyConnectedWeights::validate(input, output, original_input_shape, data_layout); } diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp index 
476bf274231c6e3e87915e1379dadcb49f51de9e..7767b45a01da1b84bfb9bbfb3604ef18bac23c86 100644 --- a/src/runtime/CL/functions/CLConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp @@ -28,12 +28,11 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/ICLKernel.h" -#include "src/core/experimental/PostOpUtils.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/operators/ClConv2d.h" - -#include "src/common/utils/Log.h" #include "support/Cast.h" namespace arm_compute @@ -44,46 +43,59 @@ struct CLConvolutionLayer::Impl { MemoryGroup memory_group{}; std::shared_ptr memory_manager{}; - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; ITensorPack prep_pack{}; WorkspaceData workspace{}; experimental::MemoryRequirements aux_mem_req{}; - std::unique_ptr func{ nullptr }; + std::unique_ptr func{nullptr}; }; -CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr memory_manager) - : _impl(std::make_unique()) +CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr memory_manager) : _impl(std::make_unique()) { _impl->memory_manager = std::move(memory_manager); } CLConvolutionLayer::~CLConvolutionLayer() = default; -void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups, const experimental::PostOpList &post_ops) +void CLConvolutionLayer::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups, post_ops); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, + dilation, act_info, enable_fast_math, num_groups); } -void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups, const experimental::PostOpList &post_ops) +void CLConvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? 
biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, - enable_fast_math, num_groups)); - ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups, post_ops); + ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate( + input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, + weights_info, dilation, act_info, enable_fast_math, num_groups)); + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, + enable_fast_math, num_groups); - // Convert post op arguments to ITensorInfo - auto transformed_post_ops = experimental::transform_post_op_list_arguments(post_ops, [](auto tensor) - { - return tensor->info(); - }); - const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups, transformed_post_ops); + const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups); - switch(opencl::ClConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv2d_info, - weights_info, CLScheduler::get().target())) + switch (opencl::ClConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv2d_info, + weights_info, CLScheduler::get().target())) { case ConvolutionMethod::WINOGRAD: case ConvolutionMethod::DIRECT: @@ -91,13 +103,13 @@ void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLT case ConvolutionMethod::GEMM: { auto f = std::make_unique(); - f->configure(compile_context, input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv2d_info, weights_info); + f->configure(compile_context, input->info(), weights->info(), + ((biases != nullptr) ? 
biases->info() : nullptr), output->info(), conv2d_info, weights_info); _impl->op = std::move(f); break; } case ConvolutionMethod::FFT: { - ARM_COMPUTE_ERROR_ON_MSG(post_ops.size() > 0, "CLFFTConvolutionLayer does not support post ops"); auto f = std::make_unique(_impl->memory_manager); f->configure(compile_context, input, weights, biases, output, conv_info, act_info, enable_fast_math); _impl->func = std::move(f); @@ -108,49 +120,52 @@ void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLT break; } - if(_impl->op) + if (_impl->op) { - _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager)); - _impl->aux_mem_req = _impl->op->workspace(); - _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } }; - size_t post_op_tensor_index = 0; - for(const auto &op : post_ops.get_list()) - { - for(auto &tensor : op->arguments()) - { - _impl->run_pack.add_const_tensor(experimental::get_post_op_arg_type(post_op_tensor_index++), *tensor); - } - } - _impl->prep_pack = { { ACL_SRC_1, weights }, { ACL_SRC_2, biases } }; - _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager)); + _impl->aux_mem_req = _impl->op->workspace(); + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}}; + _impl->workspace = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } } -Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups, const experimental::PostOpList &post_ops) +Status CLConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), + "Grouping (num_groups != 1) with NHWC data layout is not supported"); const GPUTarget gpu_target = CLScheduler::get().target(); - const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups, post_ops); + const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups); - switch(opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target)) + switch (opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target)) { case ConvolutionMethod::WINOGRAD: case ConvolutionMethod::DIRECT: case ConvolutionMethod::INDIRECT: case ConvolutionMethod::GEMM: { - ARM_COMPUTE_RETURN_ON_ERROR(opencl::ClConv2d::validate(input, weights, 
biases, output, conv2d_info, weights_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + opencl::ClConv2d::validate(input, weights, biases, output, conv2d_info, weights_info)); break; } case ConvolutionMethod::FFT: { // Validate FFT-based convolution layer - ARM_COMPUTE_RETURN_ERROR_ON_MSG(post_ops.size() > 0, "CLFFTConvolutionLayer does not support post ops"); - ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)); + ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, + act_info, enable_fast_math)); break; } default: @@ -161,8 +176,15 @@ Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo return Status{}; } -ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation, bool enable_fast_math) +ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const ActivationLayerInfo &act_info, + const GPUTarget gpu_target, + const Size2D &dilation, + bool enable_fast_math) { const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, 1); return opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target); @@ -174,7 +196,7 @@ void CLConvolutionLayer::run() MemoryGroupResourceScope scope_mg(_impl->memory_group); - if(_impl->func) + if (_impl->func) { _impl->func->run(); } @@ -186,7 +208,7 @@ void CLConvolutionLayer::run() void CLConvolutionLayer::prepare() { - if(_impl->func) + if (_impl->func) { _impl->func->prepare(); } diff --git a/src/runtime/CL/functions/CLCopy.cpp b/src/runtime/CL/functions/CLCopy.cpp index 56400b67a01171f24b218fdb2e926d83ce8d384a..a4f2b0634f22e8f3833a07714e379b557cd8f213 100644 --- a/src/runtime/CL/functions/CLCopy.cpp +++ b/src/runtime/CL/functions/CLCopy.cpp @@ -27,10 +27,10 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClCopy.h" #include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClCopy.h" #include @@ -38,16 +38,15 @@ namespace arm_compute { struct CLCopy::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLCopy::CLCopy() - : _impl(std::make_unique()) +CLCopy::CLCopy() : _impl(std::make_unique()) { } -CLCopy::CLCopy(CLCopy &&) = default; +CLCopy::CLCopy(CLCopy &&) = default; CLCopy &CLCopy::operator=(CLCopy &&) = default; CLCopy::~CLCopy() = default; diff --git a/src/runtime/CL/functions/CLCrop.cpp b/src/runtime/CL/functions/CLCrop.cpp index 35ea17cfc2f0dd1c42e73680e3b0891410810ec3..fc29c43827a0ed6119aa6c33f2edc599949ce8f5 100644 --- a/src/runtime/CL/functions/CLCrop.cpp +++ b/src/runtime/CL/functions/CLCrop.cpp @@ -27,10 +27,10 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClCrop.h" 
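
[Editor's note, illustrative only — not part of the patch.] The CLConvolutionLayer change above drops the experimental::PostOpList parameter from configure()/validate(), so fused post-ops can no longer be passed in and activation fusion goes through the ActivationLayerInfo argument alone. The sketch below shows what a call against the new signature could look like; the tensor set-up, shapes and the helper name are hypothetical, and the tensors are assumed to be initialised and allocated elsewhere.

#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"

using namespace arm_compute;

// Configure a convolution with the post-ops-free signature introduced by this patch.
void configure_conv_without_post_ops(CLTensor &src, CLTensor &weights, CLTensor &biases, CLTensor &dst)
{
    CLConvolutionLayer conv;
    conv.configure(&src, &weights, &biases, &dst,
                   PadStrideInfo(1, 1, 1, 1), // stride 1x1, padding 1x1
                   WeightsInfo(),             // weights are not pre-reshaped
                   Size2D(1U, 1U),            // no dilation
                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
                   false,                     // enable_fast_math
                   1);                        // num_groups
}
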
#include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClCrop.h" #include @@ -38,27 +38,38 @@ namespace arm_compute { struct CLCrop::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLCrop::CLCrop() - : _impl(std::make_unique()) +CLCrop::CLCrop() : _impl(std::make_unique()) { } -CLCrop::CLCrop(CLCrop &&) = default; +CLCrop::CLCrop(CLCrop &&) = default; CLCrop &CLCrop::operator=(CLCrop &&) = default; CLCrop::~CLCrop() = default; -void CLCrop::configure(const ICLTensor *src, ICLTensor *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, - Window *dst_window) +void CLCrop::configure(const ICLTensor *src, + ICLTensor *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) { - configure(CLKernelLibrary::get().get_compile_context(), src, dst, start, end, batch_index, extrapolation_value, dst_window); + configure(CLKernelLibrary::get().get_compile_context(), src, dst, start, end, batch_index, extrapolation_value, + dst_window); } -void CLCrop::configure(const CLCompileContext &compile_context, const ICLTensor *src, ICLTensor *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, - Window *dst_window) +void CLCrop::configure(const CLCompileContext &compile_context, + const ICLTensor *src, + ICLTensor *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_LOG_PARAMS(src, dst, start, end, batch_index, extrapolation_value, dst_window); @@ -67,10 +78,17 @@ void CLCrop::configure(const CLCompileContext &compile_context, const ICLTensor _impl->dst = dst; _impl->op = std::make_unique(); - _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), start, end, batch_index, extrapolation_value, dst_window); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), start, end, batch_index, + extrapolation_value, dst_window); } -Status CLCrop::validate(const ITensorInfo *input, const ITensorInfo *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window) +Status CLCrop::validate(const ITensorInfo *input, + const ITensorInfo *output, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) { return opencl::ClCrop::validate(input, output, start, end, batch_index, extrapolation_value, dst_window); } diff --git a/src/runtime/CL/functions/CLCropResize.cpp b/src/runtime/CL/functions/CLCropResize.cpp index d8fc38d99e705eaeeee761d91344f2ecb3fe7e72..821412b14959abef4a7e13cf5b4c8f0c9d9e018f 100644 --- a/src/runtime/CL/functions/CLCropResize.cpp +++ b/src/runtime/CL/functions/CLCropResize.cpp @@ -25,19 +25,26 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" -#include "src/common/utils/Log.h" - #include namespace arm_compute { namespace { -inline void configure_crop(const ICLTensor *input, ICLTensor *crop_boxes, ICLTensor *box_ind, ICLTensor *output, uint32_t crop_box_ind, 
Coordinates &start, Coordinates &end, uint32_t &batch_index) +inline void configure_crop(const ICLTensor *input, + ICLTensor *crop_boxes, + ICLTensor *box_ind, + ICLTensor *output, + uint32_t crop_box_ind, + Coordinates &start, + Coordinates &end, + uint32_t &batch_index) { batch_index = *(reinterpret_cast(box_ind->ptr_to_element(Coordinates(crop_box_ind)))); @@ -50,30 +57,48 @@ inline void configure_crop(const ICLTensor *input, ICLTensor *crop_boxes, ICLTen // The normalized coordinates are scaled to retrieve the floating point image coordinates which are rounded to integers. start = Coordinates(std::floor(x0 * (input->info()->tensor_shape()[1] - 1) + 0.5f), std::floor(y0 * (input->info()->tensor_shape()[2] - 1) + 0.5f)); - end = Coordinates(std::floor(x1 * (input->info()->tensor_shape()[1] - 1) + 0.5f), - std::floor(y1 * (input->info()->tensor_shape()[2] - 1) + 0.5f)); - const TensorShape out_shape(input->info()->tensor_shape()[0], static_cast(abs(end[0] - start[0])) + 1, static_cast(abs(end[1] - start[1])) + 1); + end = Coordinates(std::floor(x1 * (input->info()->tensor_shape()[1] - 1) + 0.5f), + std::floor(y1 * (input->info()->tensor_shape()[2] - 1) + 0.5f)); + const TensorShape out_shape(input->info()->tensor_shape()[0], static_cast(abs(end[0] - start[0])) + 1, + static_cast(abs(end[1] - start[1])) + 1); output->info()->set_tensor_shape(out_shape); } } // namespace CLCropResize::CLCropResize() - : _input(nullptr), _boxes(nullptr), _box_ind(nullptr), _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _scale(), _copy(), _crop_results(), _scaled_results(), _internal_functions() + : _input(nullptr), + _boxes(nullptr), + _box_ind(nullptr), + _output(nullptr), + _num_boxes(0), + _method(), + _extrapolation_value(0), + _scale(), + _copy(), + _crop_results(), + _scaled_results(), + _internal_functions() { } CLCropResize::~CLCropResize() = default; -Status CLCropResize::validate(const ITensorInfo *input, ITensorInfo *boxes, ITensorInfo *box_ind, const ITensorInfo *output, - Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value) +Status CLCropResize::validate(const ITensorInfo *input, + ITensorInfo *boxes, + ITensorInfo *box_ind, + const ITensorInfo *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value) { ARM_COMPUTE_RETURN_ERROR_ON(crop_size.x <= 0 || crop_size.y <= 0); ARM_COMPUTE_RETURN_ERROR_ON(method == InterpolationPolicy::AREA); ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[0] != 4); ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[1] != box_ind->tensor_shape()[0]); TensorInfo temp_info; - ARM_COMPUTE_RETURN_ON_ERROR(CLCrop::validate(input->clone().get(), &temp_info, { 0, 0 }, { 1, 1 }, input->dimension(3) - 1, extrapolation_value)); - if(output->total_size() > 0) + ARM_COMPUTE_RETURN_ON_ERROR(CLCrop::validate(input->clone().get(), &temp_info, {0, 0}, {1, 1}, + input->dimension(3) - 1, extrapolation_value)); + if (output->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -83,20 +108,34 @@ Status CLCropResize::validate(const ITensorInfo *input, ITensorInfo *boxes, ITen return Status{}; } -void CLCropResize::configure(const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size, - InterpolationPolicy method, float extrapolation_value) +void CLCropResize::configure(const ICLTensor *input, + ICLTensor *boxes, + ICLTensor *box_ind, + ICLTensor *output, + 
Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value) { - configure(CLKernelLibrary::get().get_compile_context(), input, boxes, box_ind, output, crop_size, method, extrapolation_value); + configure(CLKernelLibrary::get().get_compile_context(), input, boxes, box_ind, output, crop_size, method, + extrapolation_value); } -void CLCropResize::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size, - InterpolationPolicy method, float extrapolation_value) +void CLCropResize::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *boxes, + ICLTensor *box_ind, + ICLTensor *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, boxes, box_ind); - ARM_COMPUTE_ERROR_THROW_ON(CLCropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value)); + ARM_COMPUTE_ERROR_THROW_ON(CLCropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), + crop_size, method, extrapolation_value)); ARM_COMPUTE_LOG_PARAMS(input, boxes, box_ind, output, crop_size, method, extrapolation_value); - TensorShape output_shape = TensorShape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y, boxes->info()->tensor_shape()[1]); + TensorShape output_shape = + TensorShape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y, boxes->info()->tensor_shape()[1]); auto_init_if_empty(*output->info(), output_shape, 1, DataType::F32); _num_boxes = boxes->info()->tensor_shape()[1]; @@ -122,7 +161,7 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT // kernels used for cropping and scaling. _boxes->map(CLScheduler::get().queue()); _box_ind->map(CLScheduler::get().queue()); - for(unsigned int num_box = 0; num_box < _num_boxes; ++num_box) + for (unsigned int num_box = 0; num_box < _num_boxes; ++num_box) { auto crop_tensor = std::make_unique(); TensorInfo crop_result_info(1, DataType::F32); @@ -143,7 +182,9 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT configure_crop(_input, _boxes, _box_ind, _crop_results[num_box].get(), num_box, start, end, batch_index); auto scale_kernel = std::make_unique(); - scale_kernel->configure(compile_context, _crop_results[num_box].get(), _scaled_results[num_box].get(), ScaleKernelInfo{ _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT }); + scale_kernel->configure( + compile_context, _crop_results[num_box].get(), _scaled_results[num_box].get(), + ScaleKernelInfo{_method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT}); _scale.emplace_back(std::move(scale_kernel)); Window win = calculate_max_window(*_output->info()); @@ -159,28 +200,50 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT bool is_width_flipped = end[0] < start[0]; bool is_height_flipped = end[1] < start[1]; /** The number of rows out of bounds at the start and end of _crop_results[num_box].get(). */ - std::array rows_out_of_bounds{ 0 }; + std::array rows_out_of_bounds{0}; /** The number of columns out of bounds at the start and end of _crop_results[num_box].get(). 
*/ - std::array cols_out_of_bounds{ 0 }; - if(is_height_flipped) + std::array cols_out_of_bounds{0}; + if (is_height_flipped) { - rows_out_of_bounds[0] = start[1] >= static_cast(_input->info()->dimension(2)) ? std::min(start[1] - _input->info()->dimension(2) + 1, _crop_results[num_box].get()->info()->dimension(2)) : 0; - rows_out_of_bounds[1] = end[1] < 0 ? std::min(-end[1], static_cast(_crop_results[num_box].get()->info()->dimension(2))) : 0; + rows_out_of_bounds[0] = start[1] >= static_cast(_input->info()->dimension(2)) + ? std::min(start[1] - _input->info()->dimension(2) + 1, + _crop_results[num_box].get()->info()->dimension(2)) + : 0; + rows_out_of_bounds[1] = + end[1] < 0 ? std::min(-end[1], static_cast(_crop_results[num_box].get()->info()->dimension(2))) + : 0; } else { - rows_out_of_bounds[0] = start[1] < 0 ? std::min(-start[1], static_cast(_crop_results[num_box].get()->info()->dimension(2))) : 0; - rows_out_of_bounds[1] = end[1] >= static_cast(_input->info()->dimension(2)) ? std::min(end[1] - _input->info()->dimension(2) + 1, _crop_results[num_box].get()->info()->dimension(2)) : 0; + rows_out_of_bounds[0] = + start[1] < 0 + ? std::min(-start[1], static_cast(_crop_results[num_box].get()->info()->dimension(2))) + : 0; + rows_out_of_bounds[1] = end[1] >= static_cast(_input->info()->dimension(2)) + ? std::min(end[1] - _input->info()->dimension(2) + 1, + _crop_results[num_box].get()->info()->dimension(2)) + : 0; } - if(is_width_flipped) + if (is_width_flipped) { - cols_out_of_bounds[0] = start[0] >= static_cast(_input->info()->dimension(1)) ? std::min(start[0] - _input->info()->dimension(1) + 1, _crop_results[num_box].get()->info()->dimension(1)) : 0; - cols_out_of_bounds[1] = end[0] < 0 ? std::min(-end[0], static_cast(_crop_results[num_box].get()->info()->dimension(1))) : 0; + cols_out_of_bounds[0] = start[0] >= static_cast(_input->info()->dimension(1)) + ? std::min(start[0] - _input->info()->dimension(1) + 1, + _crop_results[num_box].get()->info()->dimension(1)) + : 0; + cols_out_of_bounds[1] = + end[0] < 0 ? std::min(-end[0], static_cast(_crop_results[num_box].get()->info()->dimension(1))) + : 0; } else { - cols_out_of_bounds[0] = start[0] < 0 ? std::min(-start[0], static_cast(_crop_results[num_box].get()->info()->dimension(1))) : 0; - cols_out_of_bounds[1] = end[0] >= static_cast(_input->info()->dimension(1)) ? std::min(end[0] - _input->info()->dimension(1) + 1, _crop_results[num_box].get()->info()->dimension(1)) : 0; + cols_out_of_bounds[0] = + start[0] < 0 + ? std::min(-start[0], static_cast(_crop_results[num_box].get()->info()->dimension(1))) + : 0; + cols_out_of_bounds[1] = end[0] >= static_cast(_input->info()->dimension(1)) + ? std::min(end[0] - _input->info()->dimension(1) + 1, + _crop_results[num_box].get()->info()->dimension(1)) + : 0; } Window full_window = calculate_max_window(*_crop_results[num_box].get()->info()); @@ -203,67 +266,84 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT // Fill all _crop_results[num_box].get() rows that have no elements that are within the input bounds // with the extrapolation value using memset. // First for the rows before the in bounds rows. 
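
[Editor's note, illustrative only — not part of the patch.] The normalized-to-pixel mapping used by configure_crop() earlier in this file is start/end = floor(coord * (dim - 1) + 0.5), and the crop shape adds 1 because both ends are inclusive. The standalone sketch below works that arithmetic through on hypothetical values; the input size and box coordinates are made up for illustration.

#include <cmath>
#include <cstdio>
#include <cstdlib>

int main()
{
    // Hypothetical 32x32 input and a normalized box covering its right half.
    const int   width = 32, height = 32;
    const float x0 = 0.5f, y0 = 0.0f, x1 = 1.0f, y1 = 1.0f;

    // Same rounding as configure_crop(): scale to the last valid index, round to nearest.
    const int start_x = static_cast<int>(std::floor(x0 * (width - 1) + 0.5f));  // 16
    const int start_y = static_cast<int>(std::floor(y0 * (height - 1) + 0.5f)); // 0
    const int end_x   = static_cast<int>(std::floor(x1 * (width - 1) + 0.5f));  // 31
    const int end_y   = static_cast<int>(std::floor(y1 * (height - 1) + 0.5f)); // 31

    // Both ends are included in the crop, hence the "+ 1" in the output shape.
    std::printf("crop region: %d x %d\n",
                std::abs(end_x - start_x) + 1,  // 16
                std::abs(end_y - start_y) + 1); // 32
    return 0;
}
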
- if(rows_out_of_bounds[0] > 0) + if (rows_out_of_bounds[0] > 0) { Window slice_fill_rows_before(full_window); slice_fill_rows_before.set(2, Window::Dimension(0, rows_out_of_bounds[0], 1)); auto kernel = std::make_unique(); - kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_rows_before); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, + &slice_fill_rows_before); //_internal_functions.emplace_back(std::move(kernel)); _internal_functions.push_back(std::move(kernel)); } Window slice_in(full_window); - slice_in.set(2, Window::Dimension(rows_out_of_bounds[0], _crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], 1)); - slice_in.set(1, Window::Dimension(cols_out_of_bounds[0], _crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], 1)); - - int rows_in_bounds = static_cast(_crop_results[num_box].get()->info()->dimension(2)) - rows_out_of_bounds[0] - rows_out_of_bounds[1]; - if(rows_in_bounds > 0) + slice_in.set(2, + Window::Dimension(rows_out_of_bounds[0], + _crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], 1)); + slice_in.set(1, + Window::Dimension(cols_out_of_bounds[0], + _crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], 1)); + + int rows_in_bounds = static_cast(_crop_results[num_box].get()->info()->dimension(2)) - + rows_out_of_bounds[0] - rows_out_of_bounds[1]; + if (rows_in_bounds > 0) { // Fill all elements that share a row with an in bounds element with the extrapolation value. - if(cols_out_of_bounds[0] > 0) + if (cols_out_of_bounds[0] > 0) { Window slice_fill_cols_before(slice_in); slice_fill_cols_before.set(1, Window::Dimension(0, cols_out_of_bounds[0], 1)); auto kernel = std::make_unique(); - kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_cols_before); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, + &slice_fill_cols_before); //_internal_functions.emplace_back(std::move(kernel)); _internal_functions.push_back(std::move(kernel)); } - if(cols_out_of_bounds[1] > 0) + if (cols_out_of_bounds[1] > 0) { Window slice_fill_cols_after(slice_in); - slice_fill_cols_after.set(1, Window::Dimension(_crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], _crop_results[num_box].get()->info()->dimension(1), 1)); + slice_fill_cols_after.set( + 1, Window::Dimension(_crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], + _crop_results[num_box].get()->info()->dimension(1), 1)); auto kernel = std::make_unique(); - kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_cols_after); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, + &slice_fill_cols_after); //_internal_functions.emplace_back(std::move(kernel)); _internal_functions.push_back(std::move(kernel)); } // Copy all elements within the input bounds from the input tensor. - int cols_in_bounds = static_cast(_crop_results[num_box].get()->info()->dimension(1)) - cols_out_of_bounds[0] - cols_out_of_bounds[1]; - if(cols_in_bounds > 0) + int cols_in_bounds = static_cast(_crop_results[num_box].get()->info()->dimension(1)) - + cols_out_of_bounds[0] - cols_out_of_bounds[1]; + if (cols_in_bounds > 0) { - Coordinates2D start_in{ is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0], - is_height_flipped ? 
start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0] }; - Coordinates2D end_in{ is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1, - is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1 }; + Coordinates2D start_in{ + is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0], + is_height_flipped ? start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0]}; + Coordinates2D end_in{ + is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1, + is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1}; auto kernel = std::make_unique(); - kernel->configure(compile_context, _input, _crop_results[num_box].get(), start_in, end_in, batch_index, extrapolation_value, &slice_in); + kernel->configure(compile_context, _input, _crop_results[num_box].get(), start_in, end_in, batch_index, + extrapolation_value, &slice_in); //_internal_functions.emplace_back(std::move(kernel)); _internal_functions.push_back(std::move(kernel)); } } // Fill all rows after the in bounds elements with the extrapolation value. - if(rows_out_of_bounds[1] > 0) + if (rows_out_of_bounds[1] > 0) { Window slice_fill_rows_after(full_window); - slice_fill_rows_after.set(2, Window::Dimension(_crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], _crop_results[num_box].get()->info()->dimension(2), 1)); + slice_fill_rows_after.set( + 2, Window::Dimension(_crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], + _crop_results[num_box].get()->info()->dimension(2), 1)); auto kernel = std::make_unique(); - kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_rows_after); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, + &slice_fill_rows_after); //_internal_functions.emplace_back(std::move(kernel)); _internal_functions.push_back(std::move(kernel)); } @@ -277,18 +357,18 @@ void CLCropResize::run() { ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function"); - for(unsigned int i = 0; i < _internal_functions.size(); ++i) + for (unsigned int i = 0; i < _internal_functions.size(); ++i) { _internal_functions[i]->run(); } CLScheduler::get().sync(); - for(auto &kernel : _scale) + for (auto &kernel : _scale) { kernel->run(); } CLScheduler::get().sync(); - for(auto &kernel : _copy) + for (auto &kernel : _copy) { kernel->run(); } diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp index 4421a18f2a7f39f5ce7ea5fdbd520718a627894e..4e0d1501ba60cf0c00bb92115ac3e7567b2d4a1c 100644 --- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp @@ -25,16 +25,16 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/IClOperator.h" #include "src/gpu/cl/operators/ClTransposedConvolution.h" -#include "src/common/utils/Log.h" - #include #include #include @@ -44,11 +44,11 @@ using namespace arm_compute::misc::shape_calculator; struct CLDeconvolutionLayer::Impl { - const ICLTensor *src{ nullptr }; 
- const ICLTensor *weights{ nullptr }; - const ICLTensor *biases{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; CLDeconvolutionLayer::~CLDeconvolutionLayer() = default; @@ -58,24 +58,35 @@ CLDeconvolutionLayer::CLDeconvolutionLayer(std::shared_ptr memor { } -void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info, - const WeightsInfo &weights_info) +void CLDeconvolutionLayer::configure(ICLTensor *input, + ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info, weights_info); } -void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info, - const WeightsInfo &weights_info) +void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, deconv_info, weights_info); - switch(CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(), deconv_info, weights_info)) + switch (CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(), + deconv_info, weights_info)) { case DeconvolutionMethod::DIRECT: { auto op = std::make_unique(); - op->configure(compile_context, input->info(), weights->info(), bias != nullptr ? bias->info() : nullptr, output->info(), deconv_info); + op->configure(compile_context, input->info(), weights->info(), bias != nullptr ? 
bias->info() : nullptr, + output->info(), deconv_info); _impl->src = input; _impl->weights = weights; @@ -105,22 +116,28 @@ void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, IC } } -Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info, - const WeightsInfo &weights_info) +Status CLDeconvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - switch(CLDeconvolutionLayer::get_deconvolution_method(input, weights, bias, output, deconv_info, weights_info)) + switch (CLDeconvolutionLayer::get_deconvolution_method(input, weights, bias, output, deconv_info, weights_info)) { case DeconvolutionMethod::DIRECT: { // Validate transposed convolution operator - ARM_COMPUTE_RETURN_ON_ERROR(opencl::ClTransposedConvolution::validate(input, weights, bias, output, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + opencl::ClTransposedConvolution::validate(input, weights, bias, output, deconv_info)); break; } case DeconvolutionMethod::UPSCALE_CONV2D: { // Validate direct convolution layer - ARM_COMPUTE_RETURN_ON_ERROR(CLDirectDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, weights_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLDirectDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, weights_info)); break; } case DeconvolutionMethod::GEMM: @@ -137,12 +154,16 @@ Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf return Status{}; } -DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info, - const WeightsInfo &weights_info) +DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_UNUSED(output, bias, weights_info); - if(is_data_type_quantized_per_channel(weights->data_type())) + if (is_data_type_quantized_per_channel(weights->data_type())) { return DeconvolutionMethod::UPSCALE_CONV2D; } @@ -154,11 +175,12 @@ DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensor const size_t idx_n = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); const size_t ofm = weights->tensor_shape()[idx_n]; - if(weights->dimension(idx_w) != deconv_info.stride().first || weights->dimension(idx_h) != deconv_info.stride().second) + if (weights->dimension(idx_w) != deconv_info.stride().first || + weights->dimension(idx_h) != deconv_info.stride().second) { - // We observe better performance for FP32 types only when ofm <= 16. - // A better heuristic is required for selecting the method for FP16 data types. - if(input->data_layout() == DataLayout::NHWC && !((input->data_type() == DataType::F32) && (ofm > 16))) + // We observe better performance for FP32 types only when ofm <= 16, and for FP16 only when ofm <= 32. 
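
[Editor's note, illustrative only — not part of the patch.] The hunk above tightens the DIRECT (transposed convolution) heuristic: in addition to the existing FP32 cut-off at ofm <= 16, FP16 now falls back when ofm exceeds 32. The helper below merely restates the added condition with hypothetical names so the decision table is easier to read; it ignores the surrounding kernel-size-versus-stride check and the earlier per-channel-quantized early-out shown in the diff.

#include <cstddef>

enum class SketchDataType { F32, F16, Other };

// Paraphrase of the updated branch: DIRECT is only considered for NHWC, and the number of
// output feature maps (ofm, taken from the weights' batch dimension) must stay small enough
// for the given data type.
bool prefers_direct_deconvolution(bool is_nhwc, SketchDataType dt, std::size_t ofm)
{
    if (!is_nhwc)
        return false; // non-NHWC layouts do not take the DIRECT path in this branch
    if (dt == SketchDataType::F32)
        return ofm <= 16; // FP32: DIRECT only observed to pay off for ofm <= 16
    if (dt == SketchDataType::F16)
        return ofm <= 32; // FP16: the new cut-off added by this patch
    return true; // other data types are not restricted by ofm in this branch
}
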
+ if (input->data_layout() == DataLayout::NHWC && !((input->data_type() == DataType::F32) && (ofm > 16)) && + !((input->data_type() == DataType::F16) && (ofm > 32))) { return DeconvolutionMethod::DIRECT; } @@ -175,7 +197,7 @@ void CLDeconvolutionLayer::run() { prepare(); - if(_impl->op != nullptr) + if (_impl->op != nullptr) { // Optimized Operator will be used ITensorPack pack; @@ -195,7 +217,7 @@ void CLDeconvolutionLayer::run() void CLDeconvolutionLayer::prepare() { - if(_impl->op == nullptr) + if (_impl->op == nullptr) { _function->prepare(); } diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp index 0b428f5b178adf31d998ddb13d2d267a5f9658d1..b92bf903a6cdaaeb03eab4438ae230e616ea4c72 100644 --- a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp +++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp @@ -27,22 +27,21 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTensor.h" -#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h" namespace arm_compute { CLDeconvolutionLayerUpsample::CLDeconvolutionLayerUpsample() // NOLINT - : _upsample(std::make_unique()), - _fill(), - _output(nullptr) + : _upsample(std::make_unique()), _fill(), _output(nullptr) { } CLDeconvolutionLayerUpsample::~CLDeconvolutionLayerUpsample() = default; -Status CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info) +Status +CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info) { return CLDeconvolutionLayerUpsampleKernel::validate(input, output, info); } @@ -52,13 +51,17 @@ void CLDeconvolutionLayerUpsample::configure(ICLTensor *input, ICLTensor *output configure(CLKernelLibrary::get().get_compile_context(), input, output, info); } -void CLDeconvolutionLayerUpsample::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PadStrideInfo &info) +void CLDeconvolutionLayerUpsample::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const PadStrideInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, output, info); _output = output; - _fill.configure(compile_context, _output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info())); + _fill.configure(compile_context, _output, + PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info())); _upsample->configure(compile_context, input, _output, info); } diff --git a/src/runtime/CL/functions/CLDepthConvertLayer.cpp b/src/runtime/CL/functions/CLDepthConvertLayer.cpp index cac3f5101300fb2ff79a8ff83490c513c4394984..6d2fea974e4661c9e128b2851f37861cdaf4fa8b 100644 --- a/src/runtime/CL/functions/CLDepthConvertLayer.cpp +++ b/src/runtime/CL/functions/CLDepthConvertLayer.cpp @@ -26,10 +26,10 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Validate.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClCast.h" #include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClCast.h" #include @@ -37,16 +37,15 @@ namespace arm_compute { struct CLDepthConvertLayer::Impl { - const ICLTensor *src{ 
nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLDepthConvertLayer::CLDepthConvertLayer() - : _impl(std::make_unique()) +CLDepthConvertLayer::CLDepthConvertLayer() : _impl(std::make_unique()) { } -CLDepthConvertLayer::CLDepthConvertLayer(CLDepthConvertLayer &&) = default; +CLDepthConvertLayer::CLDepthConvertLayer(CLDepthConvertLayer &&) = default; CLDepthConvertLayer &CLDepthConvertLayer::operator=(CLDepthConvertLayer &&) = default; CLDepthConvertLayer::~CLDepthConvertLayer() = default; @@ -55,7 +54,11 @@ void CLDepthConvertLayer::configure(const ICLTensor *input, ICLTensor *output, C configure(CLKernelLibrary::get().get_compile_context(), input, output, policy, shift); } -void CLDepthConvertLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift) +void CLDepthConvertLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + ConvertPolicy policy, + uint32_t shift) { ARM_COMPUTE_UNUSED(shift); ARM_COMPUTE_LOG_PARAMS(input, output, policy, shift); @@ -70,7 +73,8 @@ void CLDepthConvertLayer::configure(const CLCompileContext &compile_context, con _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), policy); } -Status CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift) +Status +CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift) { ARM_COMPUTE_RETURN_ERROR_ON(shift != 0); return opencl::ClCast::validate(input, output, policy); @@ -78,7 +82,7 @@ Status CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo void CLDepthConvertLayer::run() { - ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } }; + ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}}; _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp index 98531e7cac5561bafcc517148c45c579c633dd19..9477c7f81dfb4e231ff60b36d3893d44ade43c58 100644 --- a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp +++ b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h" -#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h" #include @@ -36,7 +35,10 @@ void CLDepthToSpaceLayer::configure(const ICLTensor *input, ICLTensor *output, i configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape); } -void CLDepthToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape) +void CLDepthToSpaceLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + int32_t block_shape) { ARM_COMPUTE_LOG_PARAMS(input, output, block_shape); auto k = std::make_unique(); diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp index dcb982fa565ecb45957e3b8ee17118956eccdd8f..873601bb113cd74353c52ba61eb409921cd1ce94 100644 --- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp +++ 
b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp @@ -29,12 +29,12 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h" #include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h" #include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" -#include "src/common/utils/Log.h" - namespace arm_compute { using namespace arm_compute::misc; @@ -63,25 +63,33 @@ CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer(std::shared_ptrinfo(), - weights->info(), - biases != nullptr ? biases->info() : nullptr, - output != nullptr ? output->info() : input->info(), - conv_info, - depth_multiplier, - act_info, - dilation)); + ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer::validate( + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, + output != nullptr ? output->info() : input->info(), conv_info, depth_multiplier, act_info, dilation)); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); _is_quantized = is_data_type_quantized(input->info()->data_type()); @@ -96,7 +104,7 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont ICLTensor *input_to_use = input; const ICLTensor *weights_to_use = weights; ICLTensor *output_to_use = output; - if(_needs_permute) + if (_needs_permute) { _memory_group.manage(&_permuted_input); _memory_group.manage(&_permuted_output); @@ -119,10 +127,12 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont CLTensor *output_multipliers_to_use = nullptr; CLTensor *output_shifts_to_use = nullptr; - if(_is_quantized) + if (_is_quantized) { - const size_t idx_c = get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL); - const size_t num_filters = (is_data_type_quantized_per_channel(weights->info()->data_type())) ? weights->info()->dimension(idx_c) : 1; + const size_t idx_c = + get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL); + const size_t num_filters = + (is_data_type_quantized_per_channel(weights->info()->data_type())) ? 
weights->info()->dimension(idx_c) : 1; _output_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32)); _output_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32)); @@ -132,16 +142,18 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont } // Get the depthwise convolution compute parameters - auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target); - const DWCComputeKernelInfo dwc_native_compute_info = t->configure(input_to_use->info(), weights_to_use->info(), conv_info, dilation, depth_multiplier); + auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target); + const DWCComputeKernelInfo dwc_native_compute_info = + t->configure(input_to_use->info(), weights_to_use->info(), conv_info, dilation, depth_multiplier); - const ConvolutionInfo conv_kernel_info{ conv_info, depth_multiplier, act_info, dilation }; + const ConvolutionInfo conv_kernel_info{conv_info, depth_multiplier, act_info, dilation}; _dwc_native_kernel->set_target(gpu_target); _dwc_native_kernel->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use, - dwc_native_compute_info, conv_kernel_info, output_multipliers_to_use, output_shifts_to_use); + dwc_native_compute_info, conv_kernel_info, output_multipliers_to_use, + output_shifts_to_use); - if(_needs_permute) + if (_needs_permute) { _permuted_input.allocator()->allocate(); @@ -151,22 +163,27 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont _permuted_output.allocator()->allocate(); } - if(_is_quantized) + if (_is_quantized) { _output_multipliers.allocator()->allocate(); _output_shifts.allocator()->allocate(); } } -Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, +Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation) + unsigned int depth_multiplier, + ActivationLayerInfo act_info, + const Size2D &dilation) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights); ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported"); const bool in_place = input == output || output == nullptr; - if(in_place) + if (in_place) { output = input; } @@ -174,21 +191,23 @@ Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right()); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom()); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > + input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right()); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > + input->dimension(idx_h) + conv_info.pad_top() + 
conv_info.pad_bottom()); const GPUTarget gpu_target = CLScheduler::get().target(); - const ConvolutionInfo conv_kernel_info{ conv_info, depth_multiplier, act_info, dilation }; + const ConvolutionInfo conv_kernel_info{conv_info, depth_multiplier, act_info, dilation}; const bool needs_permute = input->data_layout() == DataLayout::NCHW; const bool is_quantized = is_data_type_quantized(input->data_type()); TensorInfo output_multipliers_shifts_info(TensorInfo(TensorShape(1U), 1, DataType::S32)); - if(is_quantized) + if (is_quantized) { - if(is_data_type_quantized_per_channel(weights->data_type())) + if (is_data_type_quantized_per_channel(weights->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); @@ -201,40 +220,57 @@ Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe } } - if(needs_permute) + if (needs_permute) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(in_place, "In-place is supported only with NHWC data layout"); TensorShape permuted_input_shape = input->tensor_shape(); TensorShape permuted_weights_shape = weights->tensor_shape(); - const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation }; - TensorShape permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info); + const ConvolutionInfo info{conv_info, depth_multiplier, ActivationLayerInfo(), dilation}; + TensorShape permuted_output_shape = + shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info); permute(permuted_input_shape, PermutationVector(2U, 0U, 1U)); permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U)); permute(permuted_output_shape, PermutationVector(2U, 0U, 1U)); - const TensorInfo permuted_input = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC); - const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC); - const TensorInfo permuted_output = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NHWC); + const TensorInfo permuted_input = input->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(permuted_input_shape) + .set_data_layout(DataLayout::NHWC); + const TensorInfo permuted_weights = weights->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(permuted_weights_shape) + .set_data_layout(DataLayout::NHWC); + const TensorInfo permuted_output = output->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(permuted_output_shape) + .set_data_layout(DataLayout::NHWC); ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(input, &permuted_input, PermutationVector(2U, 0U, 1U))); ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U))); // Get the depthwise convolution compute parameters - auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target); - const DWCComputeKernelInfo dwc_native_compute_info = t->configure(&permuted_input, &permuted_weights, conv_info, dilation, depth_multiplier); + auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target); + const DWCComputeKernelInfo dwc_native_compute_info = + t->configure(&permuted_input, &permuted_weights, conv_info, dilation, depth_multiplier); - 
ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, - dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info, &output_multipliers_shifts_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate( + &permuted_input, &permuted_weights, biases, &permuted_output, dwc_native_compute_info, conv_kernel_info, + &output_multipliers_shifts_info, &output_multipliers_shifts_info)); ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(&permuted_output, output, PermutationVector(1U, 2U, 0U))); } else { // Get the depthwise convolution compute parameters - auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target); - const DWCComputeKernelInfo dwc_native_compute_info = t->configure(input, weights, conv_info, dilation, depth_multiplier); - ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(input, weights, biases, output, dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info, - &output_multipliers_shifts_info)); + auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target); + const DWCComputeKernelInfo dwc_native_compute_info = + t->configure(input, weights, conv_info, dilation, depth_multiplier); + ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate( + input, weights, biases, output, dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info, + &output_multipliers_shifts_info)); } return Status{}; } @@ -245,12 +281,12 @@ void CLDepthwiseConvolutionLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); - if(_needs_permute) + if (_needs_permute) { _permute_input_to_nhwc.run(); } CLScheduler::get().enqueue(*_dwc_native_kernel); - if(_needs_permute) + if (_needs_permute) { _permute_output_to_nchw.run(); } @@ -258,22 +294,21 @@ void CLDepthwiseConvolutionLayer::run() void CLDepthwiseConvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { - if(_is_quantized) + if (_is_quantized) { _output_multipliers.map(); _output_shifts.map(); - quantization::compute_quantized_multipliers_and_shifts(_input->info(), - _original_weights->info(), - _output != nullptr ? _output->info() : _input->info(), - reinterpret_cast(_output_multipliers.ptr_to_element(Coordinates(0))), - reinterpret_cast(_output_shifts.ptr_to_element(Coordinates(0)))); + quantization::compute_quantized_multipliers_and_shifts( + _input->info(), _original_weights->info(), _output != nullptr ? 
_output->info() : _input->info(), + reinterpret_cast(_output_multipliers.ptr_to_element(Coordinates(0))), + reinterpret_cast(_output_shifts.ptr_to_element(Coordinates(0)))); _output_multipliers.unmap(); _output_shifts.unmap(); } - if(_needs_permute) + if (_needs_permute) { ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp index 64c6b5d91cb47e740f2de8bc9e098d4d60d2fd5a..20162a03db2b9ce9ca47184b73d6e61fd0af2b54 100644 --- a/src/runtime/CL/functions/CLDequantizationLayer.cpp +++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp @@ -26,22 +26,21 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/KernelDescriptors.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClDequantize.h" #include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClDequantize.h" namespace arm_compute { struct CLDequantizationLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLDequantizationLayer::CLDequantizationLayer() - : _impl(std::make_unique()) +CLDequantizationLayer::CLDequantizationLayer() : _impl(std::make_unique()) { } CLDequantizationLayer::~CLDequantizationLayer() = default; @@ -51,7 +50,9 @@ void CLDequantizationLayer::configure(const ICLTensor *input, ICLTensor *output) configure(CLKernelLibrary::get().get_compile_context(), input, output); } -void CLDequantizationLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) +void CLDequantizationLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output) { ARM_COMPUTE_LOG_PARAMS(input, output); _impl->src = input; diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp index 752e0e4a605112081b71e0c33bd9433c1dd73bae..d6dae0d7323eac9fca6bd87c4fc8214665fcbd52 100644 --- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp @@ -28,37 +28,46 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/gpu/cl/operators/ClActivation.h" -#include "src/gpu/cl/operators/ClDirectConv2d.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/operators/ClActivation.h" +#include "src/gpu/cl/operators/ClDirectConv2d.h" namespace arm_compute { struct CLDirectConvolutionLayer::Impl { - const ICLTensor *src{ nullptr }; - const ICLTensor *weights{ nullptr }; - const ICLTensor *biases{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLDirectConvolutionLayer::CLDirectConvolutionLayer() - : _impl(std::make_unique()) +CLDirectConvolutionLayer::CLDirectConvolutionLayer() : _impl(std::make_unique()) { } -CLDirectConvolutionLayer::CLDirectConvolutionLayer(CLDirectConvolutionLayer &&) = default; +CLDirectConvolutionLayer::CLDirectConvolutionLayer(CLDirectConvolutionLayer &&) = default; CLDirectConvolutionLayer &CLDirectConvolutionLayer::operator=(CLDirectConvolutionLayer &&) = default; 
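The wrappers in this hunk and the ones below (CLDequantizationLayer, CLDirectConvolutionLayer, the CLElementwise* functions, the unary layers) all share the same pimpl shape: an `Impl` struct caching the user-facing tensors plus a `std::unique_ptr` to the backing `opencl::Cl*` operator, a `configure()` that forwards `ITensorInfo` pointers, and a `run()` that rebuilds an `ITensorPack` on every call. The following is a minimal sketch of that pattern, not library code: `CLExampleActivation` is a hypothetical name, and `opencl::ClActivation` is used only because this diff already includes it.

```cpp
// Sketch of the wrapper pattern used by the CL runtime functions in this diff.
// CLExampleActivation is hypothetical; opencl::ClActivation is the operator
// already included by CLDirectConvolutionLayer.cpp above.
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/ITensorPack.h"

#include "src/gpu/cl/operators/ClActivation.h"

#include <memory>

namespace arm_compute
{
class CLExampleActivation
{
public:
    void configure(const CLCompileContext &compile_context,
                   ICLTensor                 *input,
                   ICLTensor                 *output,
                   const ActivationLayerInfo &act_info)
    {
        _impl->src = input;
        _impl->dst = output;
        _impl->op  = std::make_unique<opencl::ClActivation>();
        // The wrapper only forwards tensor metadata; the operator owns the kernel.
        _impl->op->configure(compile_context, input->info(), output->info(), act_info);
    }

    void run()
    {
        // The pack is rebuilt per call, so the wrapper holds no per-run state.
        ITensorPack pack;
        pack.add_tensor(TensorType::ACL_SRC, _impl->src);
        pack.add_tensor(TensorType::ACL_DST, _impl->dst);
        _impl->op->run(pack);
    }

private:
    struct Impl
    {
        ICLTensor                            *src{nullptr};
        ICLTensor                            *dst{nullptr};
        std::unique_ptr<opencl::ClActivation> op{nullptr};
    };
    std::unique_ptr<Impl> _impl{std::make_unique<Impl>()};
};
} // namespace arm_compute
```

Keeping the operator behind the pimpl keeps kernel headers out of the public interface and leaves the wrapper trivially movable, which is why the move constructor and move assignment in these hunks are simply defaulted.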
CLDirectConvolutionLayer::~CLDirectConvolutionLayer() = default; -void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void CLDirectConvolutionLayer::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info); } -void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info); @@ -69,10 +78,15 @@ void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context _impl->dst = output; _impl->op = std::make_unique(); - _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info); + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info); } -Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, +Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) { return opencl::ClDirectConv2d::validate(input, weights, biases, output, conv_info, act_info); @@ -87,4 +101,4 @@ void CLDirectConvolutionLayer::run() pack.add_tensor(TensorType::ACL_DST, _impl->dst); _impl->op->run(pack); } -} +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp index 88c3c6193c3c5fe5943bdb2727f90ec602ab3833..7cd268ab0bef384ef532ab565a3b7e83e153e0cf 100644 --- a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp @@ -26,15 +26,15 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/helpers/AutoConfiguration.h" -#include "src/common/utils/Log.h" - #include #include @@ -55,11 +55,16 @@ CLDirectDeconvolutionLayer::CLDirectDeconvolutionLayer(std::shared_ptrdata_layout(); @@ -70,20 +75,22 @@ Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITen ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 
1); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) < 1); - auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), info); + auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), + weights->dimension(idx_w), weights->dimension(idx_h), info); const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - if(input->data_type() != weights->data_type()) + if (input->data_type() != weights->data_type()) { - ARM_COMPUTE_RETURN_ERROR_ON(weights->data_type() != DataType::QSYMM8_PER_CHANNEL || !is_data_type_quantized_asymmetric(input->data_type())); + ARM_COMPUTE_RETURN_ERROR_ON(weights->data_type() != DataType::QSYMM8_PER_CHANNEL || + !is_data_type_quantized_asymmetric(input->data_type())); } - if(bias != nullptr) + if (bias != nullptr) { - if(is_data_type_quantized_asymmetric(input->data_type())) + if (is_data_type_quantized_asymmetric(input->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); } @@ -102,24 +109,39 @@ Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITen unsigned int deconv_pad_y = 0; const unsigned int stride_x = info.stride().first; const unsigned int stride_y = info.stride().second; - const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y); - TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape).set_data_layout(data_layout)); + const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, + out_dims, deconv_pad_x, deconv_pad_y); + TensorInfo scale_out_info(input->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(scale_out_shape) + .set_data_layout(data_layout)); const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info)); return Status{}; } -void CLDirectDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info, - const WeightsInfo &weights_info) +void CLDirectDeconvolutionLayer::configure(ICLTensor *input, + ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &info, + const WeightsInfo &weights_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, info, weights_info); } -void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info, - const WeightsInfo &weights_info) +void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &info, + const WeightsInfo &weights_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_LOG_PARAMS(input, weights, bias, 
output, info, weights_info); @@ -139,17 +161,21 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte _original_weights = weights; _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); - _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis); + _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis, /* use_inverted_axis */ false); - auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info); + auto out_dims = + deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), + weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info); const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info()); // Output auto initialization if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); + auto_init_if_empty(*output->info(), + input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(CLDirectDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(CLDirectDeconvolutionLayer::validate( + input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info)); _is_prepared = weights_info.retain_internal_weights(); @@ -158,7 +184,8 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape unsigned int deconv_pad_x = 0; unsigned int deconv_pad_y = 0; - const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y); + const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape( + *input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y); unsigned int deconv_pad_left = pad_right > pad_left ? pad_right - pad_left : 0; unsigned int deconv_pad_right = pad_left > pad_right ? 
pad_left - pad_right : 0; @@ -179,7 +206,8 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte _scaled_output.allocator()->init(scale_out_info); // configure scale function - const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR); + const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, + deconv_pad_bottom, DimensionRoundingType::FLOOR); _scale_f.configure(compile_context, input, &_scaled_output, upsample_info); // Setup the function to convolve the upscaled output @@ -191,7 +219,7 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte _flip_axis.allocator()->allocate(); _flip_axis.map(true); auto axis_data = reinterpret_cast(_flip_axis.buffer()); - if(weights->info()->data_layout() == DataLayout::NHWC) + if (weights->info()->data_layout() == DataLayout::NHWC) { axis_data[0] = 1; axis_data[1] = 2; @@ -216,7 +244,7 @@ void CLDirectDeconvolutionLayer::run() void CLDirectDeconvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); @@ -229,7 +257,7 @@ void CLDirectDeconvolutionLayer::prepare() _conv_f.prepare(); // Free flipped weights - if(!_weights_flipped.is_used()) + if (!_weights_flipped.is_used()) { _weights_flipped.allocator()->free(); } diff --git a/src/runtime/CL/functions/CLElementwiseOperations.cpp b/src/runtime/CL/functions/CLElementwiseOperations.cpp index 936b37fb316e316af772bfac024e8f04efb9dda1..d9529f0b7ffc62c1270acbae3d35c1b6bb61e724 100644 --- a/src/runtime/CL/functions/CLElementwiseOperations.cpp +++ b/src/runtime/CL/functions/CLElementwiseOperations.cpp @@ -26,8 +26,8 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" -#include "src/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClAdd.h" #include "src/gpu/cl/operators/ClElementwiseOperations.h" #include "src/gpu/cl/operators/ClSub.h" @@ -36,26 +36,30 @@ namespace arm_compute { struct CLArithmeticAddition::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLArithmeticAddition::CLArithmeticAddition() - : _impl(std::make_unique()) +CLArithmeticAddition::CLArithmeticAddition() : _impl(std::make_unique()) { } -CLArithmeticAddition::CLArithmeticAddition(CLArithmeticAddition &&) = default; +CLArithmeticAddition::CLArithmeticAddition(CLArithmeticAddition &&) = default; CLArithmeticAddition &CLArithmeticAddition::operator=(CLArithmeticAddition &&) = default; CLArithmeticAddition::~CLArithmeticAddition() = default; -void CLArithmeticAddition::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +void CLArithmeticAddition::configure( + ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info); } -void CLArithmeticAddition::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, +void 
CLArithmeticAddition::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + ConvertPolicy policy, const ActivationLayerInfo &act_info) { _impl->src_0 = input1; @@ -65,7 +69,11 @@ void CLArithmeticAddition::configure(const CLCompileContext &compile_context, co _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), policy, act_info); } -Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status CLArithmeticAddition::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return opencl::ClAdd::validate(input1, input2, output, policy, act_info); } @@ -82,26 +90,33 @@ void CLArithmeticAddition::run() struct CLArithmeticSubtraction::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLArithmeticSubtraction::CLArithmeticSubtraction() - : _impl(std::make_unique()) +CLArithmeticSubtraction::CLArithmeticSubtraction() : _impl(std::make_unique()) { } -CLArithmeticSubtraction::CLArithmeticSubtraction(CLArithmeticSubtraction &&) = default; +CLArithmeticSubtraction::CLArithmeticSubtraction(CLArithmeticSubtraction &&) = default; CLArithmeticSubtraction &CLArithmeticSubtraction::operator=(CLArithmeticSubtraction &&) = default; CLArithmeticSubtraction::~CLArithmeticSubtraction() = default; -void CLArithmeticSubtraction::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +void CLArithmeticSubtraction::configure(const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info); } -void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, +void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + ConvertPolicy policy, const ActivationLayerInfo &act_info) { _impl->src_0 = input1; @@ -111,7 +126,11 @@ void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), policy, act_info); } -Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return opencl::ClSub::validate(input1, input2, output, policy, act_info); } @@ -128,26 +147,32 @@ void CLArithmeticSubtraction::run() struct CLArithmeticDivision::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const 
ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLArithmeticDivision::CLArithmeticDivision() - : _impl(std::make_unique()) +CLArithmeticDivision::CLArithmeticDivision() : _impl(std::make_unique()) { } -CLArithmeticDivision::CLArithmeticDivision(CLArithmeticDivision &&) = default; +CLArithmeticDivision::CLArithmeticDivision(CLArithmeticDivision &&) = default; CLArithmeticDivision &CLArithmeticDivision::operator=(CLArithmeticDivision &&) = default; CLArithmeticDivision::~CLArithmeticDivision() = default; -void CLArithmeticDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLArithmeticDivision::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLArithmeticDivision::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLArithmeticDivision::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; @@ -156,7 +181,10 @@ void CLArithmeticDivision::configure(const CLCompileContext &compile_context, co _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLArithmeticDivision::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return opencl::ClElementwiseDivision::validate(input1, input2, output, act_info); } @@ -173,26 +201,32 @@ void CLArithmeticDivision::run() struct CLElementwiseMax::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLElementwiseMax::CLElementwiseMax() - : _impl(std::make_unique()) +CLElementwiseMax::CLElementwiseMax() : _impl(std::make_unique()) { } -CLElementwiseMax::CLElementwiseMax(CLElementwiseMax &&) = default; +CLElementwiseMax::CLElementwiseMax(CLElementwiseMax &&) = default; CLElementwiseMax &CLElementwiseMax::operator=(CLElementwiseMax &&) = default; CLElementwiseMax::~CLElementwiseMax() = default; -void CLElementwiseMax::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseMax::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLElementwiseMax::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseMax::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; @@ -201,7 +235,10 @@ void CLElementwiseMax::configure(const CLCompileContext &compile_context, ICLTen 
_impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLElementwiseMax::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return opencl::ClElementwiseMax::validate(input1, input2, output, act_info); } @@ -218,26 +255,32 @@ void CLElementwiseMax::run() struct CLElementwiseMin::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLElementwiseMin::CLElementwiseMin() - : _impl(std::make_unique()) +CLElementwiseMin::CLElementwiseMin() : _impl(std::make_unique()) { } -CLElementwiseMin::CLElementwiseMin(CLElementwiseMin &&) = default; +CLElementwiseMin::CLElementwiseMin(CLElementwiseMin &&) = default; CLElementwiseMin &CLElementwiseMin::operator=(CLElementwiseMin &&) = default; CLElementwiseMin::~CLElementwiseMin() = default; -void CLElementwiseMin::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseMin::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLElementwiseMin::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseMin::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; @@ -246,7 +289,10 @@ void CLElementwiseMin::configure(const CLCompileContext &compile_context, ICLTen _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLElementwiseMin::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return opencl::ClElementwiseMin::validate(input1, input2, output, act_info); } @@ -263,26 +309,32 @@ void CLElementwiseMin::run() struct CLElementwiseSquaredDiff::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLElementwiseSquaredDiff::CLElementwiseSquaredDiff() - : _impl(std::make_unique()) +CLElementwiseSquaredDiff::CLElementwiseSquaredDiff() : _impl(std::make_unique()) { } -CLElementwiseSquaredDiff::CLElementwiseSquaredDiff(CLElementwiseSquaredDiff &&) = default; +CLElementwiseSquaredDiff::CLElementwiseSquaredDiff(CLElementwiseSquaredDiff &&) = default; CLElementwiseSquaredDiff &CLElementwiseSquaredDiff::operator=(CLElementwiseSquaredDiff &&) = default; CLElementwiseSquaredDiff::~CLElementwiseSquaredDiff() = default; -void CLElementwiseSquaredDiff::configure(ICLTensor *input1, ICLTensor *input2, 
ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseSquaredDiff::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; @@ -291,7 +343,10 @@ void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return opencl::ClElementwiseSquaredDiff::validate(input1, input2, output, act_info); } @@ -308,26 +363,32 @@ void CLElementwiseSquaredDiff::run() struct CLElementwisePower::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLElementwisePower::CLElementwisePower() - : _impl(std::make_unique()) +CLElementwisePower::CLElementwisePower() : _impl(std::make_unique()) { } -CLElementwisePower::CLElementwisePower(CLElementwisePower &&) = default; +CLElementwisePower::CLElementwisePower(CLElementwisePower &&) = default; CLElementwisePower &CLElementwisePower::operator=(CLElementwisePower &&) = default; CLElementwisePower::~CLElementwisePower() = default; -void CLElementwisePower::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwisePower::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLElementwisePower::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwisePower::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; @@ -336,7 +397,10 @@ void CLElementwisePower::configure(const CLCompileContext &compile_context, ICLT _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLElementwisePower::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return opencl::ClElementwisePower::validate(input1, input2, output, act_info); } diff --git a/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp 
b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp index 9dcd2d18915edce2948ac2b8e95661bee6cfec56..3043c26febdd58e9e095c5b220220750a0268212 100644 --- a/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp +++ b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClElementwiseUnary.h" @@ -32,17 +33,16 @@ namespace arm_compute { struct CLRsqrtLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLRsqrtLayer::CLRsqrtLayer() - : _impl(std::make_unique()) +CLRsqrtLayer::CLRsqrtLayer() : _impl(std::make_unique()) { } -CLRsqrtLayer::CLRsqrtLayer(CLRsqrtLayer &&) = default; +CLRsqrtLayer::CLRsqrtLayer(CLRsqrtLayer &&) = default; CLRsqrtLayer &CLRsqrtLayer::operator=(CLRsqrtLayer &&) = default; CLRsqrtLayer::~CLRsqrtLayer() = default; @@ -74,17 +74,16 @@ void CLRsqrtLayer::run() struct CLExpLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLExpLayer::CLExpLayer() - : _impl(std::make_unique()) +CLExpLayer::CLExpLayer() : _impl(std::make_unique()) { } -CLExpLayer::CLExpLayer(CLExpLayer &&) = default; +CLExpLayer::CLExpLayer(CLExpLayer &&) = default; CLExpLayer &CLExpLayer::operator=(CLExpLayer &&) = default; CLExpLayer::~CLExpLayer() = default; @@ -116,17 +115,16 @@ void CLExpLayer::run() struct CLNegLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLNegLayer::CLNegLayer() - : _impl(std::make_unique()) +CLNegLayer::CLNegLayer() : _impl(std::make_unique()) { } -CLNegLayer::CLNegLayer(CLNegLayer &&) = default; +CLNegLayer::CLNegLayer(CLNegLayer &&) = default; CLNegLayer &CLNegLayer::operator=(CLNegLayer &&) = default; CLNegLayer::~CLNegLayer() = default; @@ -157,17 +155,16 @@ void CLNegLayer::run() struct CLSinLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLSinLayer::CLSinLayer() - : _impl(std::make_unique()) +CLSinLayer::CLSinLayer() : _impl(std::make_unique()) { } -CLSinLayer::CLSinLayer(CLSinLayer &&) = default; +CLSinLayer::CLSinLayer(CLSinLayer &&) = default; CLSinLayer &CLSinLayer::operator=(CLSinLayer &&) = default; CLSinLayer::~CLSinLayer() = default; @@ -198,17 +195,16 @@ void CLSinLayer::run() struct CLAbsLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLAbsLayer::CLAbsLayer() - : _impl(std::make_unique()) +CLAbsLayer::CLAbsLayer() : _impl(std::make_unique()) { } -CLAbsLayer::CLAbsLayer(CLAbsLayer &&) = default; +CLAbsLayer::CLAbsLayer(CLAbsLayer &&) = default; CLAbsLayer &CLAbsLayer::operator=(CLAbsLayer &&) = default; CLAbsLayer::~CLAbsLayer() = default; @@ -239,17 +235,16 @@ void CLAbsLayer::run() struct CLLogLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor 
*src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLLogLayer::CLLogLayer() - : _impl(std::make_unique()) +CLLogLayer::CLLogLayer() : _impl(std::make_unique()) { } -CLLogLayer::CLLogLayer(CLLogLayer &&) = default; +CLLogLayer::CLLogLayer(CLLogLayer &&) = default; CLLogLayer &CLLogLayer::operator=(CLLogLayer &&) = default; CLLogLayer::~CLLogLayer() = default; @@ -280,17 +275,16 @@ void CLLogLayer::run() struct CLRoundLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLRoundLayer::CLRoundLayer() - : _impl(std::make_unique()) +CLRoundLayer::CLRoundLayer() : _impl(std::make_unique()) { } -CLRoundLayer::CLRoundLayer(CLRoundLayer &&) = default; +CLRoundLayer::CLRoundLayer(CLRoundLayer &&) = default; CLRoundLayer &CLRoundLayer::operator=(CLRoundLayer &&) = default; CLRoundLayer::~CLRoundLayer() = default; diff --git a/src/runtime/CL/functions/CLFFT1D.cpp b/src/runtime/CL/functions/CLFFT1D.cpp index bd0966b65fa1249507973ddc73b34635943fe3bf..48e9ae824ad8a349001702b188d212b2bc18a5b5 100644 --- a/src/runtime/CL/functions/CLFFT1D.cpp +++ b/src/runtime/CL/functions/CLFFT1D.cpp @@ -26,13 +26,13 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFFTDigitReverseKernel.h" #include "src/core/CL/kernels/CLFFTRadixStageKernel.h" #include "src/core/CL/kernels/CLFFTScaleKernel.h" #include "src/core/utils/helpers/fft.h" -#include "src/common/utils/Log.h" - namespace arm_compute { CLFFT1D::CLFFT1D(std::shared_ptr memory_manager) @@ -54,7 +54,10 @@ void CLFFT1D::configure(const ICLTensor *input, ICLTensor *output, const FFT1DIn configure(CLKernelLibrary::get().get_compile_context(), input, output, config); } -void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config) +void CLFFT1D::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const FFT1DInfo &config) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(CLFFT1D::validate(input->info(), output->info(), config)); @@ -77,13 +80,14 @@ void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32); _digit_reverse_indices.allocator()->init(digit_reverse_indices_info); _memory_group.manage(&_digit_reversed_input); - _digit_reverse_kernel->configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config); + _digit_reverse_kernel->configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices, + digit_reverse_config); // Create and configure FFT kernels unsigned int Nx = 1; _num_ffts = decomposed_vector.size(); _fft_kernels.reserve(_num_ffts); - for(unsigned int i = 0; i < _num_ffts; ++i) + for (unsigned int i = 0; i < _num_ffts; ++i) { const unsigned int radix_for_stage = decomposed_vector.at(i); @@ -93,18 +97,20 @@ void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor fft_kernel_info.Nx = Nx; fft_kernel_info.is_first_stage = (i == 0); _fft_kernels.emplace_back(std::make_unique()); - _fft_kernels.back()->configure(compile_context, &_digit_reversed_input, ((i == (_num_ffts - 1)) && 
!is_c2r) ? output : nullptr, fft_kernel_info); + _fft_kernels.back()->configure(compile_context, &_digit_reversed_input, + ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info); Nx *= radix_for_stage; } // Configure scale kernel - if(_run_scale) + if (_run_scale) { FFTScaleKernelInfo scale_config; scale_config.scale = static_cast(N); scale_config.conjugate = config.direction == FFTDirection::Inverse; - is_c2r ? _scale_kernel->configure(compile_context, &_digit_reversed_input, output, scale_config) : _scale_kernel->configure(output, nullptr, scale_config); + is_c2r ? _scale_kernel->configure(compile_context, &_digit_reversed_input, output, scale_config) + : _scale_kernel->configure(output, nullptr, scale_config); } // Allocate tensors @@ -123,7 +129,7 @@ Status CLFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != 1 && input->num_channels() != 2); - ARM_COMPUTE_RETURN_ERROR_ON(std::set({ 0, 1 }).count(config.axis) == 0); + ARM_COMPUTE_RETURN_ERROR_ON(std::set({0, 1}).count(config.axis) == 0); // Check if FFT is decomposable const auto supported_radix = CLFFTRadixStageKernel::supported_radix(); @@ -132,7 +138,7 @@ Status CLFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty()); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() == 1 && input->num_channels() == 1); ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2); @@ -151,13 +157,13 @@ void CLFFT1D::run() CLScheduler::get().enqueue(*_digit_reverse_kernel, false); // Run radix kernels - for(unsigned int i = 0; i < _num_ffts; ++i) + for (unsigned int i = 0; i < _num_ffts; ++i) { CLScheduler::get().enqueue(*_fft_kernels[i], i == (_num_ffts - 1) && !_run_scale); } // Run output scaling - if(_run_scale) + if (_run_scale) { CLScheduler::get().enqueue(*_scale_kernel, true); } diff --git a/src/runtime/CL/functions/CLFFT2D.cpp b/src/runtime/CL/functions/CLFFT2D.cpp index 94fc41135577eabd0712012505197799351e26cf..3857046719d8e34f43336d7d6da1ab637b12c591 100644 --- a/src/runtime/CL/functions/CLFFT2D.cpp +++ b/src/runtime/CL/functions/CLFFT2D.cpp @@ -26,16 +26,19 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFFTDigitReverseKernel.h" #include "src/core/CL/kernels/CLFFTRadixStageKernel.h" #include "src/core/CL/kernels/CLFFTScaleKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { CLFFT2D::CLFFT2D(std::shared_ptr memory_manager) - : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor() + : _memory_group(memory_manager), + _first_pass_func(memory_manager), + _second_pass_func(memory_manager), + _first_pass_tensor() { } @@ -46,7 +49,10 @@ void CLFFT2D::configure(const ICLTensor *input, ICLTensor *output, const FFT2DIn configure(CLKernelLibrary::get().get_compile_context(), input, output, config); } -void CLFFT2D::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config) 
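CLFFT1D above only accepts lengths that `helpers::fft::decompose_stages()` can split into the radixes reported by `CLFFTRadixStageKernel::supported_radix()`, and the FFT convolution's `pad_decomposable()` further down pads each spatial dimension until such a split exists. Below is a rough, illustrative stand-in for both helpers under the assumption of a caller-supplied radix set; the real helpers may choose stages differently.

```cpp
// Illustrative only: a greedy stand-in for helpers::fft::decompose_stages() and the
// pad_decomposable() search shown in this diff. The real supported radix set comes
// from CLFFTRadixStageKernel::supported_radix(); the set is a parameter here.
#include <set>
#include <vector>

std::vector<unsigned int> decompose_stages_sketch(unsigned int N, const std::set<unsigned int> &supported_radix)
{
    std::vector<unsigned int> stages;
    // Peel off the largest supported radix that divides the remaining length.
    for (auto r = supported_radix.rbegin(); r != supported_radix.rend() && N > 1;)
    {
        if (N % *r == 0)
        {
            stages.push_back(*r);
            N /= *r;
        }
        else
        {
            ++r;
        }
    }
    // An empty vector signals "not decomposable", mirroring the validate() check above.
    return (N == 1) ? stages : std::vector<unsigned int>{};
}

int pad_decomposable_sketch(int N, const std::set<unsigned int> &supported_radix)
{
    // Grow N until it decomposes; the returned pad is what the FFT-based convolution
    // adds to each dimension before transforming.
    int pad = 0;
    while (decompose_stages_sketch(static_cast<unsigned int>(N), supported_radix).empty())
    {
        ++N;
        ++pad;
    }
    return pad;
}
```

In CLFFT1D::configure each returned stage becomes one CLFFTRadixStageKernel, with `Nx` accumulating the product of the radixes already processed.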
+void CLFFT2D::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const FFT2DInfo &config) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(CLFFT2D::validate(input->info(), output->info(), config)); @@ -88,7 +94,7 @@ Status CLFFT2D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ON_ERROR(CLFFT1D::validate(&first_pass_tensor, output, second_pass_config)); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); diff --git a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp index d12e2de3bf87d4d77a31a477d44e6f5330cc757a..2a73517549be3af9d5ee0f5a9dcb1a5be49dc7a8 100644 --- a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,10 +25,12 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CPP/CPPScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFFTDigitReverseKernel.h" #include "src/core/CL/kernels/CLFFTRadixStageKernel.h" #include "src/core/CL/kernels/CLFFTScaleKernel.h" @@ -38,8 +40,6 @@ #include "src/core/helpers/AutoConfiguration.h" #include "src/core/utils/helpers/fft.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace @@ -50,11 +50,11 @@ int pad_decomposable(int N) int pad = 0; bool is_decomposed = false; - while(!is_decomposed) + while (!is_decomposed) { const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N++, supported_radix); is_decomposed = !decomposed_vector.empty(); - if(!is_decomposed) + if (!is_decomposed) { ++pad; } @@ -104,17 +104,31 @@ CLFFTConvolutionLayer::CLFFTConvolutionLayer(std::shared_ptr mem { } -void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +void CLFFTConvolutionLayer::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info, enable_fast_math); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info, + enable_fast_math); } -void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo 
&conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_UNUSED(enable_fast_math); - ARM_COMPUTE_ERROR_THROW_ON(CLFFTConvolutionLayer::validate(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info, act_info, enable_fast_math)); + ARM_COMPUTE_ERROR_THROW_ON(CLFFTConvolutionLayer::validate(input->info(), weights->info(), + biases != nullptr ? biases->info() : nullptr, + output->info(), conv_info, act_info, enable_fast_math)); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info, enable_fast_math); _original_weights = weights; @@ -124,21 +138,24 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I _has_bias = biases != nullptr; // Get indices for the width and height - const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); + const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_height = + get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); // Input shape, kernel size and output tile - const Size2D input_dims = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]); - const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]); - const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1), - pad_decomposable(input_dims.y() + kernel_size.y() - 1)); + const Size2D input_dims = + Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]); + const Size2D kernel_size = + Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]); + const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1), + pad_decomposable(input_dims.y() + kernel_size.y() - 1)); // Tensors to use ICLTensor *input_to_use = input; const ICLTensor *weights_to_use = weights; ICLTensor *output_to_use = _has_bias ? 
&_bias_output : output; // Permute bias - if(biases != nullptr) + if (biases != nullptr) { _permute_bias_func.configure(compile_context, biases, &_permuted_bias, PermutationVector(1U, 2U, 0U)); _permuted_bias.info()->set_data_layout(DataLayout::NCHW); @@ -146,7 +163,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I // Permute input if needed _needs_permute = input->info()->data_layout() == DataLayout::NHWC; - if(_needs_permute) + if (_needs_permute) { _memory_group.manage(&_permuted_input); // Configure the function to transform the input tensor from NHWC -> NCHW @@ -164,10 +181,11 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I // Flip weights _flipped_weights.allocator()->init(weights_to_use->info()->clone()->set_is_resizable(true).reset_padding()); _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); - _flip_weights_func.configure(compile_context, weights_to_use, &_flipped_weights, &_flip_axis); + _flip_weights_func.configure(compile_context, weights_to_use, &_flipped_weights, &_flip_axis, + /* use_inverted_axis */ false); // Pad weights - const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } }; + const PaddingList padding_w = {{0, input_dims.x() + pad_valid.x() - 1}, {0, input_dims.y() + pad_valid.y() - 1}}; _pad_weights_func.configure(compile_context, &_flipped_weights, &_padded_weights, padding_w); // Transform weights @@ -175,10 +193,10 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I _transform_weights_func->configure(compile_context, &_padded_weights, &_transformed_weights, FFT2DInfo()); // Pad input - const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } }; + const PaddingList padding_in = {{0, kernel_size.x() + pad_valid.x() - 1}, {0, kernel_size.y() + pad_valid.y() - 1}}; _memory_group.manage(&_padded_input); _pad_input_func.configure(compile_context, input_to_use, &_padded_input, padding_in); - if(_needs_permute) + if (_needs_permute) { _permuted_input.allocator()->allocate(); } @@ -202,7 +220,8 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I _memory_group.manage(&_itransformed_output); FFT2DInfo itranform_info; itranform_info.direction = FFTDirection::Inverse; - _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding()); + _itransformed_output.allocator()->init( + _output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding()); _itransform_output_func.configure(compile_context, &_output_reduced, &_itransformed_output, itranform_info); _output_reduced.allocator()->allocate(); @@ -214,25 +233,28 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I // Extract correct region const int start_left = kernel_size.x() - conv_info.pad_left() - 1; const int start_top = kernel_size.y() - conv_info.pad_top() - 1; - const int end_right = _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x(); - const int end_botton = _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y(); - if(_has_bias) + const int end_right = + _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x(); + const int end_botton = + 
_reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y(); + if (_has_bias) { _memory_group.manage(&_bias_output); } - else if(_needs_permute) + else if (_needs_permute) { output_to_use = &_permuted_output; _memory_group.manage(&_permuted_output); } - _extract_output_func.configure(compile_context, &_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_botton)); + _extract_output_func.configure(compile_context, &_reshaped_output, output_to_use, + Coordinates(start_left, start_top), Coordinates(end_right, end_botton)); _itransformed_output.allocator()->allocate(); // Add bias - if(biases != nullptr) + if (biases != nullptr) { output_to_use = output; - if(_needs_permute) + if (_needs_permute) { output_to_use = &_permuted_output; _memory_group.manage(&_permuted_output); @@ -243,7 +265,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I } // Permute output - if(_needs_permute) + if (_needs_permute) { // Configure the function to transform the convoluted output to ACL's native ordering format NCHW _permuted_output.info()->set_data_layout(DataLayout::NCHW); @@ -255,7 +277,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I // Configure Activation Layer _is_activationlayer_enabled = act_info.enabled(); - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activation_layer_func.configure(compile_context, output, nullptr, act_info); } @@ -269,8 +291,13 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I _flip_axis.unmap(); } -Status CLFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +Status CLFFTConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON((input->data_type() == DataType::F16) && !enable_fast_math); @@ -287,24 +314,27 @@ Status CLFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorIn const auto strides = conv_info.stride(); ARM_COMPUTE_RETURN_ERROR_ON(strides.first != strides.second && strides.first != 1); ARM_COMPUTE_RETURN_ERROR_ON(kernel_size.x() != kernel_size.y()); - ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || conv_info.pad_right() != (kernel_size.x() / 2)); - ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || conv_info.pad_bottom() != (kernel_size.y() / 2)); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || + conv_info.pad_right() != (kernel_size.x() / 2)); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || + conv_info.pad_bottom() != (kernel_size.y() / 2)); // Validate biases - if(biases != nullptr) + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[3] != biases->tensor_shape().x()); } // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width])); + ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || + (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width])); // Validate Activation Layer - if(act_info.enabled()) + if (act_info.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info)); } @@ -320,7 +350,7 @@ void CLFFTConvolutionLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Transform input - if(_needs_permute) + if (_needs_permute) { _permute_input_func.run(); } @@ -336,17 +366,17 @@ void CLFFTConvolutionLayer::run() _reshaped_output.allocator()->import_memory(_itransformed_output.cl_buffer()); _extract_output_func.run(); // Add bias - if(_has_bias) + if (_has_bias) { _bias_add_func.run(); } - if(_needs_permute) + if (_needs_permute) { _permute_output_func.run(); } // Run activation layer - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activation_layer_func.run(); } @@ -354,10 +384,10 @@ void CLFFTConvolutionLayer::run() void CLFFTConvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { // Permute bias to NCHW - if(_original_bias != nullptr) + if (_original_bias != nullptr) { _permuted_bias.allocator()->allocate(); _permute_bias_func.run(); @@ -366,7 +396,7 @@ void CLFFTConvolutionLayer::prepare() const ICLTensor *cur_weights = _original_weights; // Permute weights - if(_needs_permute) + if (_needs_permute) { ARM_COMPUTE_ERROR_ON(!cur_weights->is_used()); diff --git a/src/runtime/CL/functions/CLFill.cpp b/src/runtime/CL/functions/CLFill.cpp index 6019a84aba2b6e150a35b0475f4f95b9ca66b938..9bd96a975eda2740799041b7a2e5f1e029621342 100644 --- a/src/runtime/CL/functions/CLFill.cpp +++ b/src/runtime/CL/functions/CLFill.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClFill.h" @@ -36,16 +37,15 @@ namespace arm_compute { struct CLFill::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLFill::CLFill() - : _impl(std::make_unique()) +CLFill::CLFill() : _impl(std::make_unique()) { } -CLFill::CLFill(CLFill &&) = default; +CLFill::CLFill(CLFill &&) = default; CLFill &CLFill::operator=(CLFill &&) = default; CLFill::~CLFill() = default; @@ -54,7 +54,10 @@ void CLFill::configure(ICLTensor *tensor, const PixelValue &constant_value, Wind configure(CLKernelLibrary::get().get_compile_context(), tensor, constant_value, dst_window); } -void CLFill::configure(const CLCompileContext &compile_context, ICLTensor *tensor, const PixelValue &constant_value, Window *dst_window) +void CLFill::configure(const CLCompileContext &compile_context, + ICLTensor *tensor, + const PixelValue &constant_value, + Window *dst_window) { ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); diff --git a/src/runtime/CL/functions/CLFlattenLayer.cpp b/src/runtime/CL/functions/CLFlattenLayer.cpp index 32fc37552ca1a835caf742d2ce0c62bb46b5588f..ba1b5372d341b4b3f59336a4947793dfe1cf94cf 100644 --- a/src/runtime/CL/functions/CLFlattenLayer.cpp +++ b/src/runtime/CL/functions/CLFlattenLayer.cpp 
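// Editor's sketch (not taken from this patch): the CLFill hunks above only reformat the
// function, its public configure()/run() API is unchanged. The snippet below is a minimal,
// hypothetical caller-side example; the tensor shape, the float PixelValue constructor and
// the defaulted Window* argument of CLFill::configure() are assumptions, not facts stated
// by the patch.
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLFill.h"

int main()
{
    using namespace arm_compute;
    CLScheduler::get().default_init();   // create a default CL context and command queue

    // Illustrative 8x8 F32 tensor; init() and allocate() follow the allocator pattern used
    // throughout the functions in this patch.
    CLTensor tensor;
    tensor.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));
    tensor.allocator()->allocate();

    CLFill fill;
    fill.configure(&tensor, PixelValue(0.0f));   // fill the whole tensor with zeros
    fill.run();
    CLScheduler::get().sync();                   // wait for the enqueued kernel to finish
    return 0;
}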
@@ -26,8 +26,9 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CL/ICLKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/gpu/cl/operators/ClFlatten.h" @@ -36,16 +37,15 @@ namespace arm_compute { struct CLFlattenLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLFlattenLayer::CLFlattenLayer() - : _impl(std::make_unique()) +CLFlattenLayer::CLFlattenLayer() : _impl(std::make_unique()) { } -CLFlattenLayer::CLFlattenLayer(CLFlattenLayer &&) = default; +CLFlattenLayer::CLFlattenLayer(CLFlattenLayer &&) = default; CLFlattenLayer &CLFlattenLayer::operator=(CLFlattenLayer &&) = default; CLFlattenLayer::~CLFlattenLayer() = default; @@ -59,7 +59,8 @@ void CLFlattenLayer::configure(const CLCompileContext &compile_context, const IC ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); _impl->src = input; _impl->dst = output; - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input->info()))); + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape( + misc::shape_calculator::compute_flatten_shape(input->info()))); _impl->op = std::make_unique(); _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info()); @@ -68,9 +69,10 @@ void CLFlattenLayer::configure(const CLCompileContext &compile_context, const IC Status CLFlattenLayer::validate(const ITensorInfo *input, const ITensorInfo *output) { // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input)); + const TensorInfo tensor_info_output = + input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); } return opencl::ClFlatten::validate(input, output); @@ -83,4 +85,4 @@ void CLFlattenLayer::run() pack.add_tensor(TensorType::ACL_DST, _impl->dst); _impl->op->run(pack); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLFloor.cpp b/src/runtime/CL/functions/CLFloor.cpp index 8739e1803efa8262a6f349e8b1949e12b80eaffb..4322219dd902689efdd1f972e6cfc531f091a523 100644 --- a/src/runtime/CL/functions/CLFloor.cpp +++ b/src/runtime/CL/functions/CLFloor.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClFloor.h" @@ -34,16 +35,15 @@ namespace arm_compute { struct CLFloor::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLFloor::CLFloor() - : _impl(std::make_unique()) +CLFloor::CLFloor() : _impl(std::make_unique()) { } -CLFloor::CLFloor(CLFloor &&) = default; +CLFloor::CLFloor(CLFloor &&) = default; CLFloor &CLFloor::operator=(CLFloor &&) = default; CLFloor::~CLFloor() = default; diff --git 
a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp index 1c162db79a2a1acba424f137328d262ab9d8323d..b30f9e701ffb35072445d122c7815c3fa1104115 100644 --- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp +++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/operators/ClFullyConnected.h" @@ -35,21 +36,22 @@ using namespace arm_compute::experimental; struct CLFullyConnectedLayer::Impl { MemoryGroup memory_group{}; - IWeightsManager *weights_manager{ nullptr }; + IWeightsManager *weights_manager{nullptr}; - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; - const ITensor *original_weights{ nullptr }; + const ITensor *original_weights{nullptr}; ITensorPack run_pack{}; WorkspaceData workspace{}; experimental::MemoryRequirements aux_mem_req{}; - bool is_prepared{ false }; - bool dynamic_weights{ false }; + bool is_prepared{false}; + bool dynamic_weights{false}; }; -CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr memory_manager, IWeightsManager *weights_manager) +CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr memory_manager, + IWeightsManager *weights_manager) : _impl(std::make_unique()) { _impl->memory_group = MemoryGroup(std::move(memory_manager)); @@ -58,39 +60,45 @@ CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr mem CLFullyConnectedLayer::~CLFullyConnectedLayer() = default; -void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, +void CLFullyConnectedLayer::configure(const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, FullyConnectedLayerInfo fc_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, fc_info); } -void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, +void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, FullyConnectedLayerInfo fc_info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayer::validate(input->info(), - weights->info(), - biases != nullptr ? biases->info() : nullptr, - output->info(), - fc_info)); + ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayer::validate( + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), fc_info)); _impl->op = std::make_unique(); _impl->original_weights = weights; _impl->is_prepared = fc_info.retain_internal_weights; - _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), fc_info); + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr) ? 
biases->info() : nullptr, output->info(), fc_info); - if(_impl->weights_manager != nullptr) + if (_impl->weights_manager != nullptr) { _impl->weights_manager->manage(_impl->original_weights); } - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->aux_mem_req = _impl->op->workspace(); - _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } }; - _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->workspace = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); } else { @@ -98,14 +106,14 @@ void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, c _impl->run_pack.add_tensor(ACL_DST, output); } - _impl->dynamic_weights = - !weights->info()->are_values_constant() && - fc_info.transpose_weights && - !fc_info.are_weights_reshaped && - !fc_info.retain_internal_weights; + _impl->dynamic_weights = !weights->info()->are_values_constant() && fc_info.transpose_weights && + !fc_info.are_weights_reshaped && !fc_info.retain_internal_weights; } -Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, +Status CLFullyConnectedLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, FullyConnectedLayerInfo fc_info) { return opencl::ClFullyConnected::validate(input, weights, biases, output, fc_info); @@ -113,7 +121,7 @@ Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn void CLFullyConnectedLayer::run() { - if(!_impl->dynamic_weights) + if (!_impl->dynamic_weights) { prepare(); } @@ -124,7 +132,7 @@ void CLFullyConnectedLayer::run() void CLFullyConnectedLayer::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->run_pack); @@ -133,13 +141,13 @@ void CLFullyConnectedLayer::prepare() _impl->is_prepared = true; // Handle weights managed infrastructure - if(_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights)) + if (_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights)) { // Ensure that b gets marked as unused (memory released) only after the last function which uses b also finishes its prepare // This is for cases where multiple functions share the same b (weights) // Therefore when a function marks original b as unused, we pre-mark it in weights manager, and mark it back to used so that it doesn't get released before its last reference const ITensor *original_b = _impl->original_weights; - if(!original_b->is_used()) + if (!original_b->is_used()) { _impl->weights_manager->pre_mark_as_unused(original_b); } diff --git a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp index 7379e9d9fee759b757ca7af08a499fe0f33d5864..e4fbf78e13b2822ae8e162f664834e2709d783ab 100644 --- a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp +++ b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp @@ -28,9 +28,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h" #include "src/common/utils/Log.h" +#include 
"src/core/CL/kernels/CLFuseBatchNormalizationKernel.h" namespace arm_compute { @@ -41,29 +41,52 @@ CLFuseBatchNormalization::CLFuseBatchNormalization() CLFuseBatchNormalization::~CLFuseBatchNormalization() = default; -void CLFuseBatchNormalization::configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, - ICLTensor *fused_weights, ICLTensor *fused_bias, - const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +void CLFuseBatchNormalization::configure(const ICLTensor *input_weights, + const ICLTensor *bn_mean, + const ICLTensor *bn_var, + ICLTensor *fused_weights, + ICLTensor *fused_bias, + const ICLTensor *input_bias, + const ICLTensor *bn_beta, + const ICLTensor *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, + input_bias, bn_beta, bn_gamma, epsilon, fbn_type); } -void CLFuseBatchNormalization::configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, - ICLTensor *fused_weights, ICLTensor *fused_bias, - const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +void CLFuseBatchNormalization::configure(const CLCompileContext &compile_context, + const ICLTensor *input_weights, + const ICLTensor *bn_mean, + const ICLTensor *bn_var, + ICLTensor *fused_weights, + ICLTensor *fused_bias, + const ICLTensor *input_bias, + const ICLTensor *bn_beta, + const ICLTensor *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); - _fuse_bn_kernel->configure(compile_context, input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, + epsilon, fbn_type); + _fuse_bn_kernel->configure(compile_context, input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, + bn_beta, bn_gamma, epsilon, fbn_type); } -Status CLFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +Status CLFuseBatchNormalization::validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias, + const ITensorInfo *bn_beta, + const ITensorInfo *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - return CLFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + return CLFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, + input_bias, bn_beta, bn_gamma, epsilon, fbn_type); } void 
CLFuseBatchNormalization::run() diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp index 427ea51ab99e461c89a3d6aa6786c7beaba70725..871a1d6e2790fff70ff68488e8bed65b81a511cc 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/operators/ClGemm.h" @@ -40,15 +41,15 @@ using OperatorType = opencl::ClGemm; struct CLGEMM::Impl { - const ICLTensor *b{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *b{nullptr}; + std::unique_ptr op{nullptr}; MemoryGroup memory_group{}; - IWeightsManager *weights_manager{ nullptr }; + IWeightsManager *weights_manager{nullptr}; ITensorPack run_pack{}; ITensorPack prep_pack{}; MemoryRequirements aux_mem_req{}; WorkspaceData workspace_tensors{}; - bool is_prepared{ false }; + bool is_prepared{false}; }; CLGEMM::CLGEMM(std::shared_ptr memory_manager, IWeightsManager *weights_manager) @@ -60,12 +61,25 @@ CLGEMM::CLGEMM(std::shared_ptr memory_manager, IWeightsManager * CLGEMM::~CLGEMM() = default; -void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info) +void CLGEMM::configure(const ICLTensor *a, + const ICLTensor *b, + const ICLTensor *c, + ICLTensor *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, alpha, beta, gemm_info); } -void CLGEMM::configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info) +void CLGEMM::configure(const CLCompileContext &compile_context, + const ICLTensor *a, + const ICLTensor *b, + const ICLTensor *c, + ICLTensor *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); @@ -73,25 +87,33 @@ void CLGEMM::configure(const CLCompileContext &compile_context, const ICLTensor _impl->op = std::make_unique(); _impl->is_prepared = gemm_info.retain_internal_weights(); - _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info); + _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? 
c->info() : nullptr, output->info(), + alpha, beta, gemm_info); _impl->aux_mem_req = _impl->op->workspace(); // Manage/allocate auxilairy tensors - if(_impl->is_prepared) + if (_impl->is_prepared) { _impl->run_pack.add_const_tensor(ACL_SRC_0, a); _impl->run_pack.add_tensor(ACL_DST, output); } else { - _impl->run_pack = { { ACL_SRC_0, a }, { ACL_SRC_2, c }, { ACL_DST, output } }; - _impl->prep_pack = { { ACL_SRC_1, _impl->b } }; + _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_2, c}, {ACL_DST, output}}; + _impl->prep_pack = {{ACL_SRC_1, _impl->b}}; - _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->workspace_tensors = + manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); } } -Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status CLGEMM::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { return OperatorType::validate(a, b, c, output, alpha, beta, gemm_info); } @@ -107,15 +129,15 @@ void CLGEMM::run() void CLGEMM::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->prep_pack); - auto has_reshape = std::find_if(_impl->aux_mem_req.begin(), - _impl->aux_mem_req.end(), - [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); - if(has_reshape != std::end(_impl->aux_mem_req)) + if (has_reshape != std::end(_impl->aux_mem_req)) { _impl->b->mark_as_unused(); } diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp index ad5bfd8dd21487dedeca57601482e5b1e8d64568..aef7cddd7a4e9676261b1170ea56f8472684cf81 100644 --- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -27,11 +27,11 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Size2D.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/experimental/PostOpUtils.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/operators/ClGemmConv2d.h" #include "support/Cast.h" @@ -48,18 +48,19 @@ using namespace arm_compute::experimental; struct CLGEMMConvolutionLayer::Impl { - const ITensor *weights{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *weights{nullptr}; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; ITensorPack prep_pack{}; MemoryGroup memory_group{}; - IWeightsManager *weights_manager{ nullptr }; + IWeightsManager *weights_manager{nullptr}; MemoryRequirements aux_mem_req{}; WorkspaceData workspace_tensors{}; - bool is_prepared{ false }; + bool is_prepared{false}; }; -CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr memory_manager, IWeightsManager *weights_manager) +CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr memory_manager, + IWeightsManager *weights_manager) : _impl(std::make_unique()) { _impl->memory_group = MemoryGroup(memory_manager); @@ -68,56 +69,62 @@ CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr m CLGEMMConvolutionLayer::~CLGEMMConvolutionLayer() = default; -void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups, const experimental::PostOpList &post_ops) +void CLGEMMConvolutionLayer::configure(const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + unsigned int num_groups) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups, post_ops); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, + dilation, act_info, num_groups); } -void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups, const experimental::PostOpList &post_ops) +void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - _impl->weights = weights; - _impl->op = std::make_unique(); - // Convert post op arguments to ITensorInfo - auto transformed_post_ops = experimental::transform_post_op_list_arguments(post_ops, [](auto tensor) - { - return tensor->info(); - }); - const Conv2dInfo 
conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups, transformed_post_ops); - _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), conv2d_info, weights_info); + _impl->weights = weights; + _impl->op = std::make_unique(); + const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups); + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr ? biases->info() : nullptr), output->info(), conv2d_info, weights_info); - _impl->run_pack = - { - { TensorType::ACL_SRC_0, input }, - { TensorType::ACL_SRC_1, weights }, - { TensorType::ACL_SRC_2, biases }, - { TensorType::ACL_DST, output } - }; - // Add post op tensors - size_t post_op_tensor_index = 0; - for(const auto &op : post_ops.get_list()) - { - for(auto &tensor : op->arguments()) - { - _impl->run_pack.add_const_tensor(experimental::get_post_op_arg_type(post_op_tensor_index++), *tensor); - } - } - _impl->prep_pack = - { - { TensorType::ACL_SRC_1, weights }, - { TensorType::ACL_SRC_2, biases }, + _impl->run_pack = {{TensorType::ACL_SRC_0, input}, + {TensorType::ACL_SRC_1, weights}, + {TensorType::ACL_SRC_2, biases}, + {TensorType::ACL_DST, output}}; + _impl->prep_pack = { + {TensorType::ACL_SRC_1, weights}, + {TensorType::ACL_SRC_2, biases}, }; - _impl->aux_mem_req = _impl->op->workspace(); - _impl->workspace_tensors = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->aux_mem_req = _impl->op->workspace(); + _impl->workspace_tensors = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } -Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups, const experimental::PostOpList &post_ops) +Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + unsigned int num_groups) { - const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups, post_ops); + const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups); return opencl::ClGemmConv2d::validate(input, weights, biases, output, conv2d_info, weights_info); } @@ -130,14 +137,14 @@ void CLGEMMConvolutionLayer::run() void CLGEMMConvolutionLayer::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->prep_pack); - auto has_reshape = std::find_if(_impl->aux_mem_req.begin(), - _impl->aux_mem_req.end(), - [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); - if(has_reshape != std::end(_impl->aux_mem_req)) + if (has_reshape != std::end(_impl->aux_mem_req)) { _impl->weights->mark_as_unused(); } diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp index 
9fc81c11dad818fd99563520bed3e964d6b0f1f7..7d40cf18294833bcd89aa2852a6eeeb77bc9d61e 100644 --- a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp @@ -24,15 +24,15 @@ #include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include @@ -40,12 +40,13 @@ namespace arm_compute { namespace { -std::pair compute_start_end_slice_coordinates(const ITensorInfo &output_info, const PadStrideInfo &deconv_info, bool is_nchw) +std::pair +compute_start_end_slice_coordinates(const ITensorInfo &output_info, const PadStrideInfo &deconv_info, bool is_nchw) { Coordinates start; Coordinates end; - if(is_nchw) + if (is_nchw) { start.set(0, deconv_info.pad_left()); start.set(1, deconv_info.pad_top()); @@ -63,13 +64,16 @@ std::pair compute_start_end_slice_coordinates(const IT end.set(2, output_info.dimension(2) - deconv_info.pad_bottom()); } - return { start, end }; + return {start, end}; } -Status construct_gemmlowp_output_stage(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, GEMMLowpOutputStageInfo &output_stage_info) +Status construct_gemmlowp_output_stage(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *output, + GEMMLowpOutputStageInfo &output_stage_info) { const auto data_type = input->data_type(); - if(is_data_type_quantized_asymmetric(data_type)) + if (is_data_type_quantized_asymmetric(data_type)) { const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); const UniformQuantizationInfo wq_info = weights->quantization_info().uniform(); @@ -78,7 +82,8 @@ Status construct_gemmlowp_output_stage(const ITensorInfo *input, const ITensorIn float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; int output_multiplier(0); int output_shift(0); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); output_stage_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; output_stage_info.gemmlowp_multiplier = output_multiplier; @@ -122,15 +127,21 @@ CLGEMMDeconvolutionLayer::CLGEMMDeconvolutionLayer(std::shared_ptrdata_layout(); - const bool padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0; + const bool padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || + deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0; const bool is_nchw = input->data_layout() == DataLayout::NCHW; const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); @@ -144,21 +155,31 @@ Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITenso TensorShape nhwc_weights_shape = weights->tensor_shape(); TensorShape 
nhwc_input_shape = input->tensor_shape(); - if(is_nchw) + if (is_nchw) { permute(nhwc_weights_shape, PermutationVector(2, 0, 1)); permute(nhwc_input_shape, PermutationVector(2, 0, 1)); - TensorInfo nhwc_input_info = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_input_shape).set_data_layout(DataLayout::NCHW); + TensorInfo nhwc_input_info = input->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(nhwc_input_shape) + .set_data_layout(DataLayout::NCHW); - TensorInfo nhwc_weights_info = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_weights_shape).set_data_layout(DataLayout::NCHW); + TensorInfo nhwc_weights_info = weights->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(nhwc_weights_shape) + .set_data_layout(DataLayout::NCHW); CLPermute::validate(weights, &nhwc_weights_info, PermutationVector(2, 0, 1)); CLPermute::validate(input, &nhwc_input_info, PermutationVector(2, 0, 1)); } - const TensorShape reshaped_shape = TensorShape(nhwc_weights_shape[0], nhwc_weights_shape[1] * nhwc_weights_shape[2] * nhwc_weights_shape[3]); - const TensorInfo reshaped_info = weights->clone()->set_tensor_shape(reshaped_shape).set_data_layout(DataLayout::NCHW).set_is_resizable(true); + const TensorShape reshaped_shape = + TensorShape(nhwc_weights_shape[0], nhwc_weights_shape[1] * nhwc_weights_shape[2] * nhwc_weights_shape[3]); + const TensorInfo reshaped_info = + weights->clone()->set_tensor_shape(reshaped_shape).set_data_layout(DataLayout::NCHW).set_is_resizable(true); ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(weights, &reshaped_info)); TensorShape transposed_shape(reshaped_shape[1], reshaped_shape[0]); @@ -166,77 +187,95 @@ Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITenso ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(&reshaped_info, &reshaped_t_info)); TensorShape gemm_output_shape(weights->dimension(idx_w) * weights->dimension(idx_h) * weights->dimension(idx_b), - input->dimension(idx_w), - input->dimension(idx_h), - input->dimension(idx_b)); + input->dimension(idx_w), input->dimension(idx_h), input->dimension(idx_b)); TensorInfo gemm_output_info = reshaped_t_info.clone()->set_tensor_shape(gemm_output_shape).set_is_resizable(true); GEMMInfo gemm_info(false, false, true, input->dimension(idx_h), true); GEMMLowpOutputStageInfo output_stage_info; - if(is_quantized) + if (is_quantized) { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input->clone()->set_tensor_shape(nhwc_input_shape), &reshaped_t_info, nullptr, &gemm_output_info.set_data_type(DataType::S32), - gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate( + &input->clone()->set_tensor_shape(nhwc_input_shape), &reshaped_t_info, nullptr, + &gemm_output_info.set_data_type(DataType::S32), gemm_info)); ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(input, weights, output, output_stage_info)); } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input->clone()->set_tensor_shape(nhwc_input_shape).set_is_resizable(true), &reshaped_t_info, nullptr, &gemm_output_info, 1.0f, 0.0f, gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMM::validate(&input->clone()->set_tensor_shape(nhwc_input_shape).set_is_resizable(true), + &reshaped_t_info, nullptr, &gemm_output_info, 1.0f, 0.0f, gemm_info)); } const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second); - auto out_dims = 
deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), stride_info); - const TensorShape deconv_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights); - TensorInfo col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true); + auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), + weights->dimension(idx_w), weights->dimension(idx_h), stride_info); + const TensorShape deconv_shape = + misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights); + TensorInfo col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true); - if(padded_input && is_quantized) + if (padded_input && is_quantized) { const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw); - ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, &col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output_stage_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output, start_end.first, start_end.second)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate( + &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate( + &col2im_output_info, nullptr, + &col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output_stage_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), + output, start_end.first, start_end.second)); } - else if(padded_input) + else if (padded_input) { const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw); - ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate( + &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info, output, start_end.first, start_end.second)); } - else if(is_quantized) + else if (is_quantized) { - ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, output, output_stage_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate( + &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, output, output_stage_info)); } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, output, input, weights, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, output, input, weights, deconv_info)); } return Status{}; } -void 
CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info) +void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info); } -void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, - const PadStrideInfo &deconv_info) +void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(CLGEMMDeconvolutionLayer::validate(input->info(), - weights->info(), - bias != nullptr ? bias->info() : nullptr, - output->info(), - deconv_info)); + ARM_COMPUTE_ERROR_THROW_ON(CLGEMMDeconvolutionLayer::validate( + input->info(), weights->info(), bias != nullptr ? bias->info() : nullptr, output->info(), deconv_info)); ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, deconv_info); _original_weights = weights; - _padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0; - _is_nchw = input->info()->data_layout() == DataLayout::NCHW; - _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); + _padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || + deconv_info.pad_top() > 0; + _is_nchw = input->info()->data_layout() == DataLayout::NCHW; + _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); const ICLTensor *input_to_use = input; const ICLTensor *weights_to_use = weights; @@ -245,7 +284,7 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context // do an outer product in NCHW and then an accumulation through a reduction. This would have two // drawbacks: first, the outer product is less efficient than a full GEMM. Second, the reduction // might be slower than GEMM. - if(_is_nchw) + if (_is_nchw) { _memory_group.manage(&_permuted_input); _permute_input_to_nhwc.configure(compile_context, input, &_permuted_input, PermutationVector(2U, 0U, 1U)); @@ -257,10 +296,11 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context } // Reshape the input weights. 
The weights will be reshaped only once during the call to prepare() - _reshaped_weights.allocator()->init(TensorInfo(TensorShape(weights_to_use->info()->dimension(0), - weights_to_use->info()->dimension(1) * weights_to_use->info()->dimension(2) * weights_to_use->info()->dimension(3)), - 1, - input->info()->data_type(), weights->info()->quantization_info())); + _reshaped_weights.allocator()->init( + TensorInfo(TensorShape(weights_to_use->info()->dimension(0), weights_to_use->info()->dimension(1) * + weights_to_use->info()->dimension(2) * + weights_to_use->info()->dimension(3)), + 1, input->info()->data_type(), weights->info()->quantization_info())); _reshape_weights.configure(compile_context, weights_to_use, &_reshaped_weights); _transpose_weights.configure(compile_context, &_reshaped_weights, &_reshaped_weights_t); @@ -269,15 +309,17 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context GEMMInfo gemm_info(false, false, true, input->info()->dimension(idx_h), true); // Configure output stage for asymmetric quantized types - if(_is_quantized) + if (_is_quantized) { // gemmlowp adds the offsets (instead of subtracting them). Thus, we need to negate the original // and restore them back to make it work properly. QuantizationInfo iq_info = input->info()->quantization_info(); QuantizationInfo wq_info = weights->info()->quantization_info(); - input_to_use->info()->set_quantization_info(QuantizationInfo(iq_info.uniform().scale, -iq_info.uniform().offset)); - _reshaped_weights_t.info()->set_quantization_info(QuantizationInfo(wq_info.uniform().scale, -wq_info.uniform().offset)); + input_to_use->info()->set_quantization_info( + QuantizationInfo(iq_info.uniform().scale, -iq_info.uniform().offset)); + _reshaped_weights_t.info()->set_quantization_info( + QuantizationInfo(wq_info.uniform().scale, -wq_info.uniform().offset)); _mm_gemmlowp.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, gemm_info); @@ -286,10 +328,11 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context } else { - _mm_gemm.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f, gemm_info); + _mm_gemm.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f, + gemm_info); } - if(_is_nchw) + if (_is_nchw) { _permuted_input.allocator()->allocate(); } @@ -298,7 +341,7 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context ICLTensor *slice_output = nullptr; ICLTensor *output_stage_output = nullptr; - if(_padded_input && _is_quantized) + if (_padded_input && _is_quantized) { _memory_group.manage(&_slice_gemm_input); _memory_group.manage(&_gemmlowp_final); @@ -306,13 +349,13 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context output_stage_output = &_slice_gemm_input; slice_output = output; } - else if(_padded_input) + else if (_padded_input) { _memory_group.manage(&_slice_gemm_input); deconv_reshape_output = &_slice_gemm_input; slice_output = output; } - else if(_is_quantized) + else if (_is_quantized) { _memory_group.manage(&_gemmlowp_final); deconv_reshape_output = &_gemmlowp_final; @@ -324,21 +367,24 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context } // Configure a Col2Im call to reshape the output of GEMM - _deconv_reshape->configure(compile_context, &_gemm_output, bias, deconv_reshape_output, input->info(), weights->info(), deconv_info); + 
_deconv_reshape->configure(compile_context, &_gemm_output, bias, deconv_reshape_output, input->info(), + weights->info(), deconv_info); _gemm_output.allocator()->allocate(); - if(_is_quantized) + if (_is_quantized) { GEMMLowpOutputStageInfo output_stage_info; construct_gemmlowp_output_stage(input->info(), weights->info(), output->info(), output_stage_info); - _gemmlowp_output_stage.configure(compile_context, &_gemmlowp_final, nullptr, output_stage_output, output_stage_info); + _gemmlowp_output_stage.configure(compile_context, &_gemmlowp_final, nullptr, output_stage_output, + output_stage_info); _gemmlowp_final.allocator()->allocate(); } // If the input was padded, the output needs to be sliced. - if(_padded_input) + if (_padded_input) { - const auto start_end = compute_start_end_slice_coordinates(*deconv_reshape_output->info(), deconv_info, _is_nchw); + const auto start_end = + compute_start_end_slice_coordinates(*deconv_reshape_output->info(), deconv_info, _is_nchw); _slice_gemm.configure(compile_context, &_slice_gemm_input, slice_output, start_end.first, start_end.second); _slice_gemm_input.allocator()->allocate(); } @@ -350,12 +396,12 @@ void CLGEMMDeconvolutionLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); - if(_is_nchw) + if (_is_nchw) { _permute_input_to_nhwc.run(); } - if(_is_quantized) + if (_is_quantized) { _mm_gemmlowp.run(); } @@ -366,12 +412,12 @@ void CLGEMMDeconvolutionLayer::run() CLScheduler::get().enqueue(*_deconv_reshape, false); - if(_is_quantized) + if (_is_quantized) { _gemmlowp_output_stage.run(); } - if(_padded_input) + if (_padded_input) { _slice_gemm.run(); } @@ -379,11 +425,11 @@ void CLGEMMDeconvolutionLayer::run() void CLGEMMDeconvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - if(_is_nchw) + if (_is_nchw) { _permuted_weights.allocator()->allocate(); _permute_weights_to_nhwc.run(); @@ -392,7 +438,7 @@ void CLGEMMDeconvolutionLayer::prepare() _reshaped_weights.allocator()->allocate(); _reshape_weights.run(); - if(_is_nchw) + if (_is_nchw) { _permuted_weights.allocator()->free(); } @@ -401,7 +447,7 @@ void CLGEMMDeconvolutionLayer::prepare() _transpose_weights.run(); // Prepare gemm - if(!_is_quantized) + if (!_is_quantized) { _mm_gemm.prepare(); } @@ -411,7 +457,7 @@ void CLGEMMDeconvolutionLayer::prepare() } // Free resources - if(!_reshaped_weights_t.is_used()) + if (!_reshaped_weights_t.is_used()) { _reshaped_weights_t.allocator()->free(); } diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp index d9029478a1d02b73e2f7fee6eed1b41efffecb11..8bad19865805890b3f5e1ffcedaa34ed13ecacdf 100644 --- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp @@ -31,12 +31,12 @@ #include "arm_compute/core/Log.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/IMemoryManager.h" -#include "src/core/helpers/MemoryHelpers.h" +#include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h" namespace arm_compute @@ -46,13 +46,13 @@ using OperatorType = opencl::ClGemmLowpMatrixMultiplyCore; struct CLGEMMLowpMatrixMultiplyCore::Impl { - const ICLTensor *b{ 
nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *b{nullptr}; + std::unique_ptr op{nullptr}; MemoryGroup memory_group{}; ITensorPack run_pack{}; MemoryRequirements aux_mem_req{}; WorkspaceData workspace_tensors{}; - bool is_prepared{ false }; + bool is_prepared{false}; }; CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager) @@ -63,12 +63,18 @@ CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptrop = std::make_unique(); _impl->is_prepared = gemm_info.retain_internal_weights(); - _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info); + _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), + gemm_info); _impl->aux_mem_req = _impl->op->workspace(); // Manage/allocate auxilairy tensors - if(_impl->is_prepared) + if (_impl->is_prepared) { _impl->run_pack.add_const_tensor(ACL_SRC_0, a); _impl->run_pack.add_tensor(ACL_DST, output); } else { - _impl->run_pack = { { ACL_SRC_0, a }, { ACL_SRC_1, _impl->b }, { ACL_SRC_2, c }, { ACL_DST, output } }; - _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack); + _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_1, _impl->b}, {ACL_SRC_2, c}, {ACL_DST, output}}; + _impl->workspace_tensors = + manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack); } } -Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info) +Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info) { return OperatorType::validate(a, b, c, output, gemm_info); } @@ -108,7 +120,7 @@ void CLGEMMLowpMatrixMultiplyCore::run() void CLGEMMLowpMatrixMultiplyCore::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->run_pack); diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp index 6feed0d7130cb5a59af4bb4bcda99f15a5986c5a..3dd8c5f1012037f221e7c9a68ce2853c4c1b760d 100644 --- a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp @@ -40,27 +40,33 @@ namespace arm_compute { struct CLGEMMLowpOutputStage::Impl { - const ICLTensor *src{ nullptr }; - const ICLTensor *bias{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + const ICLTensor *bias{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; }; -CLGEMMLowpOutputStage::CLGEMMLowpOutputStage() - : _impl(std::make_unique()) +CLGEMMLowpOutputStage::CLGEMMLowpOutputStage() : _impl(std::make_unique()) { } -CLGEMMLowpOutputStage::CLGEMMLowpOutputStage(CLGEMMLowpOutputStage &&) = default; +CLGEMMLowpOutputStage::CLGEMMLowpOutputStage(CLGEMMLowpOutputStage &&) = default; CLGEMMLowpOutputStage &CLGEMMLowpOutputStage::operator=(CLGEMMLowpOutputStage &&) = default; CLGEMMLowpOutputStage::~CLGEMMLowpOutputStage() = default; -void CLGEMMLowpOutputStage::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info) +void CLGEMMLowpOutputStage::configure(const ICLTensor *input, + const ICLTensor *bias, + ICLTensor *output, + const 
GEMMLowpOutputStageInfo &info) { configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, info); } -void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info) +void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *bias, + ICLTensor *output, + const GEMMLowpOutputStageInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -69,11 +75,15 @@ void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, c _impl->dst = output; _impl->op = std::make_unique(); - _impl->op->configure(compile_context, input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), info); - _impl->run_pack = { { ACL_SRC, _impl->src }, { ACL_BIAS, _impl->bias }, { ACL_DST, _impl->dst } }; + _impl->op->configure(compile_context, input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), + info); + _impl->run_pack = {{ACL_SRC, _impl->src}, {ACL_BIAS, _impl->bias}, {ACL_DST, _impl->dst}}; } -Status CLGEMMLowpOutputStage::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info) +Status CLGEMMLowpOutputStage::validate(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const GEMMLowpOutputStageInfo &info) { return opencl::ClGemmLowpOutputStage::validate(input, bias, output, info); } diff --git a/src/runtime/CL/functions/CLGather.cpp b/src/runtime/CL/functions/CLGather.cpp index 033c117cecc521cd6bcc0311152b0cdcbc640619..2610cb1a3ba47159a673b5c130f7c86c38c8a3b7 100644 --- a/src/runtime/CL/functions/CLGather.cpp +++ b/src/runtime/CL/functions/CLGather.cpp @@ -24,9 +24,9 @@ #include "arm_compute/runtime/CL/functions/CLGather.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "src/core/CL/kernels/CLGatherKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLGatherKernel.h" namespace arm_compute { @@ -35,7 +35,11 @@ void CLGather::configure(const ICLTensor *input, const ICLTensor *indices, ICLTe configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, axis); } -void CLGather::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis) +void CLGather::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *indices, + ICLTensor *output, + int axis) { ARM_COMPUTE_LOG_PARAMS(input, indices, output, axis); auto k = std::make_unique(); diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp index 9cb7d618cfc276ee0a3734116f78fcdb619a5322..b2c1d2631e7c2820b50bef6fb34236c35ad0a247 100644 --- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp +++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp @@ -27,13 +27,13 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h" #include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h" #include "src/core/CL/kernels/CLGenerateProposalsLayerKernel.h" #include "src/core/CL/kernels/CLPadLayerKernel.h" #include "src/core/helpers/AutoConfiguration.h" -#include "src/common/utils/Log.h" - namespace arm_compute { 
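// Editor's sketch (not taken from this patch): the quantized paths above build a
// GEMMLowpOutputStageInfo from the float rescale factor input_scale * weights_scale /
// output_scale and split it into gemmlowp_multiplier / gemmlowp_shift via
// quantization::calculate_quantized_multiplier(). The stand-alone program below roughly
// mirrors that decomposition for the multiplier < 1 case; the scale values are made up and
// the library helper additionally handles rounding corner cases this sketch ignores.
#include <cmath>
#include <cstdint>
#include <cstdio>

// Split a real multiplier in (0, 1) into a Q31 fixed-point multiplier and a right shift.
static void decompose_multiplier(float multiplier, std::int32_t *quantized_multiplier, int *right_shift)
{
    int exponent = 0;
    // multiplier = significand * 2^exponent, with significand in [0.5, 1)
    const double significand = std::frexp(static_cast<double>(multiplier), &exponent);
    *right_shift             = -exponent;
    *quantized_multiplier    = static_cast<std::int32_t>(std::llround(significand * (1ll << 31)));
}

int main()
{
    // Illustrative input, weights and output quantization scales only.
    const float  in_scale = 0.02f, w_scale = 0.005f, out_scale = 0.1f;
    std::int32_t q_mult = 0;
    int          shift  = 0;
    decompose_multiplier(in_scale * w_scale / out_scale, &q_mult, &shift);
    std::printf("gemmlowp_multiplier=%d gemmlowp_shift=%d\n", q_mult, shift);
    return 0;
}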
CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptr memory_manager) @@ -71,48 +71,67 @@ CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptrinfo(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), + proposals->info(), scores_out->info(), + num_valid_proposals->info(), info)); ARM_COMPUTE_LOG_PARAMS(scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info); _is_nhwc = scores->info()->data_layout() == DataLayout::NHWC; const DataType scores_data_type = scores->info()->data_type(); _is_qasymm8 = scores_data_type == DataType::QASYMM8; - const int num_anchors = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL)); - const int feat_width = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH)); - const int feat_height = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT)); - const int total_num_anchors = num_anchors * feat_width * feat_height; - const int pre_nms_topN = info.pre_nms_topN(); - const int post_nms_topN = info.post_nms_topN(); - const size_t values_per_roi = info.values_per_roi(); + const int num_anchors = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL)); + const int feat_width = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH)); + const int feat_height = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT)); + const int total_num_anchors = num_anchors * feat_width * feat_height; + const int pre_nms_topN = info.pre_nms_topN(); + const int post_nms_topN = info.post_nms_topN(); + const size_t values_per_roi = info.values_per_roi(); const QuantizationInfo scores_qinfo = scores->info()->quantization_info(); const DataType rois_data_type = (_is_qasymm8) ? DataType::QASYMM16 : scores_data_type; - const QuantizationInfo rois_qinfo = (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info(); + const QuantizationInfo rois_qinfo = + (_is_qasymm8) ? 
QuantizationInfo(0.125f, 0) : scores->info()->quantization_info(); // Compute all the anchors _memory_group.manage(&_all_anchors); - _compute_anchors_kernel->configure(compile_context, anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); + _compute_anchors_kernel->configure(compile_context, anchors, &_all_anchors, + ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors); - _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); + _deltas_flattened.allocator()->init( + TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); // Permute and reshape deltas _memory_group.manage(&_deltas_flattened); - if(!_is_nhwc) + if (!_is_nhwc) { _memory_group.manage(&_deltas_permuted); - _permute_deltas.configure(compile_context, deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 }); + _permute_deltas.configure(compile_context, deltas, &_deltas_permuted, PermutationVector{2, 0, 1}); _flatten_deltas.configure(compile_context, &_deltas_permuted, &_deltas_flattened); _deltas_permuted.allocator()->allocate(); } @@ -126,10 +145,10 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context // Permute and reshape scores _memory_group.manage(&_scores_flattened); - if(!_is_nhwc) + if (!_is_nhwc) { _memory_group.manage(&_scores_permuted); - _permute_scores.configure(compile_context, scores, &_scores_permuted, PermutationVector{ 2, 0, 1 }); + _permute_scores.configure(compile_context, scores, &_scores_permuted, PermutationVector{2, 0, 1}); _flatten_scores.configure(compile_context, &_scores_permuted, &_scores_flattened); _scores_permuted.allocator()->allocate(); } @@ -140,7 +159,7 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context CLTensor *anchors_to_use = &_all_anchors; CLTensor *deltas_to_use = &_deltas_flattened; - if(_is_qasymm8) + if (_is_qasymm8) { _all_anchors_f32.allocator()->init(TensorInfo(_all_anchors.info()->tensor_shape(), 1, DataType::F32)); _deltas_flattened_f32.allocator()->init(TensorInfo(_deltas_flattened.info()->tensor_shape(), 1, DataType::F32)); @@ -163,11 +182,12 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context anchors_to_use->allocator()->allocate(); _all_proposals_to_use = &_all_proposals; - if(_is_qasymm8) + if (_is_qasymm8) { _memory_group.manage(&_all_proposals_quantized); // Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset - _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0))); + _all_proposals_quantized.allocator()->init( + TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0))); _quantize_all_proposals->configure(compile_context, &_all_proposals, &_all_proposals_quantized); _all_proposals.allocator()->allocate(); _all_proposals_to_use = &_all_proposals_quantized; @@ -183,7 +203,8 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context // Note that NMS needs outputs preinitialized. 
auto_init_if_empty(*scores_out->info(), TensorShape(scores_nms_size), 1, scores_data_type, scores_qinfo); - auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, rois_qinfo); + auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, + rois_qinfo); auto_init_if_empty(*num_valid_proposals->info(), TensorShape(1), 1, DataType::U32); // Initialize temporaries (unused) outputs @@ -195,20 +216,27 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context _num_valid_proposals = num_valid_proposals; _memory_group.manage(&_proposals_4_roi_values); - _cpp_nms.configure(&_scores_flattened, _all_proposals_to_use, nullptr, scores_out, &_proposals_4_roi_values, &_classes_nms_unused, nullptr, &_keeps_nms_unused, num_valid_proposals, - BoxNMSLimitInfo(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, true, min_size_scaled, info.im_width(), info.im_height())); + _cpp_nms.configure(&_scores_flattened, _all_proposals_to_use, nullptr, scores_out, &_proposals_4_roi_values, + &_classes_nms_unused, nullptr, &_keeps_nms_unused, num_valid_proposals, + BoxNMSLimitInfo(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, + true, min_size_scaled, info.im_width(), info.im_height())); _keeps_nms_unused.allocator()->allocate(); _classes_nms_unused.allocator()->allocate(); _all_proposals_to_use->allocator()->allocate(); _scores_flattened.allocator()->allocate(); // Add the first column that represents the batch id. This will be all zeros, as we don't support multiple images - _pad_kernel->configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } }); + _pad_kernel->configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{{1, 0}}); _proposals_4_roi_values.allocator()->allocate(); } -Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out, - const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info) +Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, + const ITensorInfo *deltas, + const ITensorInfo *anchors, + const ITensorInfo *proposals, + const ITensorInfo *scores_out, + const ITensorInfo *num_valid_proposals, + const GenerateProposalsInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::QASYMM8, DataType::F16, DataType::F32); @@ -216,9 +244,12 @@ Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(scores, deltas); - const int num_anchors = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL)); - const int feat_width = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH)); - const int feat_height = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT)); + const int num_anchors = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL)); + const int feat_width = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), 
DataLayoutDimension::WIDTH)); + const int feat_height = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT)); const int num_images = scores->dimension(3); const int total_num_anchors = num_anchors * feat_width * feat_height; const int values_per_roi = info.values_per_roi(); @@ -227,76 +258,101 @@ Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens ARM_COMPUTE_RETURN_ERROR_ON(num_images > 1); - if(is_qasymm8) + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(anchors, 1, DataType::QSYMM16); const UniformQuantizationInfo anchors_qinfo = anchors->quantization_info().uniform(); ARM_COMPUTE_RETURN_ERROR_ON(anchors_qinfo.scale != 0.125f); } - TensorInfo all_anchors_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()))); - - TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true); - TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true); - if(scores->data_layout() == DataLayout::NHWC) + TensorInfo all_anchors_info( + anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate( + anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()))); + + TensorInfo deltas_permuted_info = + deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)) + .set_is_resizable(true); + TensorInfo scores_permuted_info = + scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true); + if (scores->data_layout() == DataLayout::NHWC) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(deltas, &deltas_permuted_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(scores, &scores_permuted_info); } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 })); - ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 })); + ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(deltas, &deltas_permuted_info, PermutationVector{2, 0, 1})); + ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(scores, &scores_permuted_info, PermutationVector{2, 0, 1})); } - TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + TensorInfo deltas_flattened_info( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&deltas_permuted_info, &deltas_flattened_info)); - TensorInfo scores_flattened_info(scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true)); - TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + TensorInfo scores_flattened_info( + scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true)); + TensorInfo proposals_4_roi_values( + 
deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&scores_permuted_info, &scores_flattened_info)); TensorInfo *proposals_4_roi_values_to_use = &proposals_4_roi_values; - TensorInfo proposals_4_roi_values_quantized(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16).set_quantization_info(QuantizationInfo(0.125f, 0)); - if(is_qasymm8) + TensorInfo proposals_4_roi_values_quantized( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16) + .set_quantization_info(QuantizationInfo(0.125f, 0)); + if (is_qasymm8) { - TensorInfo all_anchors_f32_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); + TensorInfo all_anchors_f32_info(anchors->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayer::validate(&all_anchors_info, &all_anchors_f32_info)); - TensorInfo deltas_flattened_f32_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info)); - - TensorInfo proposals_4_roi_values_f32(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info, - BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); - - ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized)); + TensorInfo deltas_flattened_f32_info(deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info)); + + TensorInfo proposals_4_roi_values_f32(deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate( + &all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info, + BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); + + ARM_COMPUTE_RETURN_ON_ERROR( + CLQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized)); proposals_4_roi_values_to_use = &proposals_4_roi_values_quantized; } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, - BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, + BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); } - 
ARM_COMPUTE_RETURN_ON_ERROR(CLPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{ { 1, 0 } }));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{{1, 0}}));
-    if(num_valid_proposals->total_size() > 0)
+    if (num_valid_proposals->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->dimension(0) > 1);
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_valid_proposals, 1, DataType::U32);
     }
-    if(proposals->total_size() > 0)
+    if (proposals->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(proposals->num_dimensions() > 2);
         ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(0) != size_t(values_per_roi) + 1);
         ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(1) != size_t(total_num_anchors));
-        if(is_qasymm8)
+        if (is_qasymm8)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(proposals, 1, DataType::QASYMM16);
             const UniformQuantizationInfo proposals_qinfo = proposals->quantization_info().uniform();
@@ -309,7 +365,7 @@ Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens
         }
     }
-    if(scores_out->total_size() > 0)
+    if (scores_out->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(scores_out->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(scores_out->dimension(0) != size_t(total_num_anchors));
@@ -356,7 +412,7 @@ void CLGenerateProposalsLayer::run()
     CLScheduler::get().enqueue(*_compute_anchors_kernel, false);
     // Transpose and reshape the inputs
-    if(!_is_nhwc)
+    if (!_is_nhwc)
     {
         _permute_deltas.run();
         _permute_scores.run();
@@ -364,7 +420,7 @@ void CLGenerateProposalsLayer::run()
     _flatten_deltas.run();
     _flatten_scores.run();
-    if(_is_qasymm8)
+    if (_is_qasymm8)
     {
         _dequantize_anchors->run();
         _dequantize_deltas->run();
@@ -373,7 +429,7 @@ void CLGenerateProposalsLayer::run()
     // Build the boxes
     CLScheduler::get().enqueue(*_bounding_box_kernel, false);
-    if(_is_qasymm8)
+    if (_is_qasymm8)
     {
         _quantize_all_proposals->run();
     }
diff --git a/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp
index 90af36aa777d5b030eb5e6b55e40db291d99d224..1a2369c5c235c9639924d3e68c0807206899b9d4 100644
--- a/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp
@@ -26,36 +26,45 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "src/gpu/cl/operators/ClIndirectConv2d.h"
 #include "src/common/utils/Log.h"
+#include "src/gpu/cl/operators/ClIndirectConv2d.h"
 namespace arm_compute
 {
 struct CLIndirectConvolutionLayer::Impl
 {
-    const ICLTensor *src{ nullptr };
-    const ICLTensor *weights{ nullptr };
-    const ICLTensor *biases{ nullptr };
-    ICLTensor *dst{ nullptr };
-    std::unique_ptr<opencl::ClIndirectConv2d> op{ nullptr };
+    const ICLTensor *src{nullptr};
+    const ICLTensor *weights{nullptr};
+    const ICLTensor *biases{nullptr};
+    ICLTensor *dst{nullptr};
+    std::unique_ptr<opencl::ClIndirectConv2d> op{nullptr};
 };
-CLIndirectConvolutionLayer::CLIndirectConvolutionLayer()
-    : _impl(std::make_unique<Impl>())
+CLIndirectConvolutionLayer::CLIndirectConvolutionLayer() : _impl(std::make_unique<Impl>())
 {
 }
-CLIndirectConvolutionLayer::CLIndirectConvolutionLayer(CLIndirectConvolutionLayer &&) = default;
+CLIndirectConvolutionLayer::CLIndirectConvolutionLayer(CLIndirectConvolutionLayer &&) = default;
 CLIndirectConvolutionLayer &CLIndirectConvolutionLayer::operator=(CLIndirectConvolutionLayer &&)
= default; CLIndirectConvolutionLayer::~CLIndirectConvolutionLayer() = default; -void CLIndirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void CLIndirectConvolutionLayer::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info); } -void CLIndirectConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void CLIndirectConvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info); @@ -65,10 +74,15 @@ void CLIndirectConvolutionLayer::configure(const CLCompileContext &compile_conte _impl->biases = biases; _impl->dst = output; _impl->op = std::make_unique(); - _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info); + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info); } -Status CLIndirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, +Status CLIndirectConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) { return opencl::ClIndirectConv2d::validate(input, weights, biases, output, conv_info, act_info); @@ -83,4 +97,4 @@ void CLIndirectConvolutionLayer::run() pack.add_tensor(TensorType::ACL_DST, _impl->dst); _impl->op->run(pack); } -} +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp index 5feafe19dbb2dd26e625a14812ccee852173e13c..0e994e1aee611c74dc1c8d2454a4476fd9557899 100644 --- a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp @@ -27,50 +27,62 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/ICLKernel.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { CLInstanceNormalizationLayer::CLInstanceNormalizationLayer(CLRuntimeContext *ctx) // NOLINT - : _inst_norm_kernel(), - _mean_var_kernel(), - _mean_var_tensor(), - _ctx(ctx) + : _inst_norm_kernel(), _mean_var_kernel(), _mean_var_tensor(), _ctx(ctx) { } CLInstanceNormalizationLayer::~CLInstanceNormalizationLayer() { } -void CLInstanceNormalizationLayer::configure(ICLTensor *input, 
ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision) +void CLInstanceNormalizationLayer::configure( + ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision) { configure(CLKernelLibrary::get().get_compile_context(), input, output, gamma, beta, epsilon, use_mixed_precision); } -void CLInstanceNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision) +void CLInstanceNormalizationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + float gamma, + float beta, + float epsilon, + bool use_mixed_precision) { ARM_COMPUTE_LOG_PARAMS(input, output, gamma, beta, epsilon, use_mixed_precision); auto w = std::make_unique(); w->configure(compile_context, input, &_mean_var_tensor, use_mixed_precision); _mean_var_kernel = std::move(w); auto k = std::make_unique(); - k->configure(compile_context, input, &_mean_var_tensor, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision)); + k->configure(compile_context, input, &_mean_var_tensor, output, + InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision)); _inst_norm_kernel = std::move(k); _mean_var_tensor.allocator()->allocate(); } -Status CLInstanceNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon, bool use_mixed_precision) +Status CLInstanceNormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + float gamma, + float beta, + float epsilon, + bool use_mixed_precision) { - return CLInstanceNormalizationLayerKernel::validate(input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision)); + return CLInstanceNormalizationLayerKernel::validate( + input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision)); } void CLInstanceNormalizationLayer::run() { - ARM_COMPUTE_ERROR_ON_MSG(!_inst_norm_kernel, "The child class didn't set the CL kernel or function isn't configured"); + ARM_COMPUTE_ERROR_ON_MSG(!_inst_norm_kernel, + "The child class didn't set the CL kernel or function isn't configured"); schedule_kernel_on_ctx(_ctx, _mean_var_kernel.get()); schedule_kernel_on_ctx(_ctx, _inst_norm_kernel.get()); } diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp index 1278385f534068247ce2d2f987b53e94d61eb4d8..4fe1d9b20b675e56f57e6a297904bfac885fcdc5 100644 --- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp +++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp @@ -29,12 +29,12 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/CL/kernels/CLL2NormalizeLayerKernel.h" #include "src/core/CL/kernels/CLReductionOperationKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace @@ -57,7 +57,8 @@ void CLL2NormalizeLayer::configure(ICLTensor *input, ICLTensor *output, int axis configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, epsilon); } -void CLL2NormalizeLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon) +void CLL2NormalizeLayer::configure( + const 
CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon) { ARM_COMPUTE_LOG_PARAMS(input, output, axis, epsilon); @@ -86,7 +87,8 @@ Status CLL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo sum_sq.set_tensor_shape(shape); const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim); - ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE)); // Reduce shape on axis shape.set(actual_axis, 1); diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp index ea08beca759a453797cd3950d566d3f1f262270b..3b50234c77cdca52b75244a967d31740c404b81a 100644 --- a/src/runtime/CL/functions/CLLSTMLayer.cpp +++ b/src/runtime/CL/functions/CLLSTMLayer.cpp @@ -24,15 +24,15 @@ #include "arm_compute/runtime/CL/functions/CLLSTMLayer.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/gpu/cl/kernels/ClTransposeKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/gpu/cl/kernels/ClTransposeKernel.h" namespace arm_compute { @@ -40,54 +40,155 @@ using namespace arm_compute::misc::shape_calculator; using namespace arm_compute::utils::info_helpers; CLLSTMLayer::CLLSTMLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(), - _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), - _transpose_cell_state(std::make_unique()), _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), - _pixelwise_mul_cell_state2(), _fully_connected_output(), _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), - _fully_connected_output_state(), _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), - _concat_weights_input_gate(), _concat_weights_output(), _ones_fill(), _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(), - _pixelwise_mul_forget_gate_coeff(), _accum_forget_gate_bias(), _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(), - _pixelwise_mul_output_gate_coeff(), _accum_output_gate_bias(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), - _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), - _output2(), _output3(), _output4(), _cell_state_activation(), _output_state1(), _ones(), _input_layer_norm_out1(), 
_input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(), - _cell_layer_norm_out1(), _cell_layer_norm_out2(), _output_layer_norm_out1(), _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), - _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false), _is_layer_norm_lstm(false) + : _memory_group(std::move(memory_manager)), + _fully_connected_input_gate(), + _accum_input_gate1(), + _subtract_input_gate(), + _pixelwise_mul_input_gate(), + _activation_input_gate(), + _fully_connected_forget_gate(), + _accum_forget_gate1(), + _pixelwise_mul_forget_gate(), + _activation_forget_gate(), + _fully_connected_cell_state(), + _gemm_cell_state1(), + _transpose_cell_state(std::make_unique()), + _accum_cell_state1(), + _accum_cell_state2(), + _pixelwise_mul_cell_state1(), + _activation_cell_state(), + _cell_clip(), + _pixelwise_mul_cell_state2(), + _fully_connected_output(), + _pixelwise_mul_output_state1(), + _accum_output1(), + _activation_output(), + _activation_output_state(), + _pixelwise_mul_output_state2(), + _fully_connected_output_state(), + _projection_clip(), + _copy_cell_state(), + _copy_output(), + _concat_scratch_buffer(), + _concat_inputs_forget_gate(), + _concat_weights_forget_gate(), + _concat_weights_input_gate(), + _concat_weights_output(), + _ones_fill(), + _mean_std_norm_input_gate(), + _pixelwise_mul_input_gate_coeff(), + _accum_input_gate_bias(), + _mean_std_norm_forget_gate(), + _pixelwise_mul_forget_gate_coeff(), + _accum_forget_gate_bias(), + _mean_std_norm_cell_gate(), + _pixelwise_mul_cell_gate_coeff(), + _accum_cell_gate_bias(), + _mean_std_norm_output_gate(), + _pixelwise_mul_output_gate_coeff(), + _accum_output_gate_bias(), + _input_gate_out1(), + _input_gate_out2(), + _input_gate_out3(), + _input_gate_out4(), + _forget_gate_out1(), + _forget_gate_out2(), + _forget_gate_out3(), + _forget_gate_out4(), + _forget_gate_out5(), + _forget_gate_out6(), + _cell_state_out1(), + _cell_state_out2(), + _cell_state_out3(), + _cell_state_out4(), + _cell_state_out5(), + _output1(), + _output2(), + _output3(), + _output4(), + _cell_state_activation(), + _output_state1(), + _ones(), + _input_layer_norm_out1(), + _input_layer_norm_out2(), + _forget_layer_norm_out1(), + _forget_layer_norm_out2(), + _cell_layer_norm_out1(), + _cell_layer_norm_out2(), + _output_layer_norm_out1(), + _output_layer_norm_out2(), + _run_peephole_opt(false), + _run_cifg_opt(false), + _perform_cell_clipping(false), + _has_projection_weights(false), + _perform_projection_clipping(false), + _is_prepared(false), + _is_layer_norm_lstm(false) { } CLLSTMLayer::~CLLSTMLayer() = default; -void CLLSTMLayer::configure(const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - const ICLTensor *output_state_in, ICLTensor *cell_state_in, - ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output, - const LSTMParams &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +void CLLSTMLayer::configure(const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + 
const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + const ICLTensor *output_state_in, + ICLTensor *cell_state_in, + ICLTensor *scratch_buffer, + ICLTensor *output_state_out, + ICLTensor *cell_state_out, + ICLTensor *output, + const LSTMParams &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, - recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info, + configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in, + cell_state_in, scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info, cell_threshold, projection_threshold); } -void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - const ICLTensor *output_state_in, ICLTensor *cell_state_in, - ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output, - const LSTMParams &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +void CLLSTMLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + const ICLTensor *output_state_in, + ICLTensor *cell_state_in, + ICLTensor *scratch_buffer, + ICLTensor *output_state_out, + ICLTensor *cell_state_out, + ICLTensor *output, + const LSTMParams &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, + forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); - ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, 
input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, - recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, - output, lstm_params, activation_info, cell_threshold, projection_threshold); + ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, + forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, + scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info, + cell_threshold, projection_threshold); _is_layer_norm_lstm = lstm_params.use_layer_norm(); @@ -96,13 +197,12 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe build_lstm_params_tensor_info(lstm_params, &lstm_params_info); // Validate - ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayer::validate(input->info(), input_to_forget_weights->info(), - input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), - output_state_in->info(), cell_state_in->info(), - scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(), - lstm_params_info, activation_info, cell_threshold, projection_threshold)); + ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayer::validate( + input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), + recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), output_state_in->info(), + cell_state_in->info(), scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(), + lstm_params_info, activation_info, cell_threshold, projection_threshold)); const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape(); // Configure block that calculates the forget gate @@ -126,26 +226,31 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe weights_vector.emplace_back(input_to_forget_weights); weights_vector.emplace_back(recurrent_to_forget_weights); - const TensorShape weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0); + const TensorShape weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0); _forget_gate_out6.allocator()->init(TensorInfo(weights_concat_shape, 1, input->info()->data_type())); _concat_weights_forget_gate.configure(compile_context, weights_vector, &_forget_gate_out6, Window::DimX); _memory_group.manage(&_forget_gate_out5); - _fully_connected_forget_gate.configure(compile_context, &_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5); + _fully_connected_forget_gate.configure(compile_context, &_forget_gate_out2, &_forget_gate_out6, + (_is_layer_norm_lstm) ? 
nullptr : forget_gate_bias, &_forget_gate_out5); _memory_group.manage(&_forget_gate_out1); _memory_group.manage(&_forget_gate_out3); _forget_gate_out6.allocator()->allocate(); CLTensor *forget_gate_out = &_forget_gate_out5; - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { _forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _run_peephole_opt = true; _memory_group.manage(&_forget_gate_out4); - _pixelwise_mul_forget_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); - _accum_forget_gate1.configure(compile_context, &_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE); + _pixelwise_mul_forget_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), + &_forget_gate_out4, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN); + _accum_forget_gate1.configure(compile_context, &_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, + ConvertPolicy::SATURATE); _forget_gate_out4.allocator()->allocate(); _forget_gate_out5.allocator()->allocate(); forget_gate_out = &_forget_gate_out3; @@ -154,22 +259,25 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe { _forget_gate_out3.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _forget_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _forget_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_forget_layer_norm_out1); _memory_group.manage(&_forget_layer_norm_out2); _mean_std_norm_forget_gate.configure(compile_context, forget_gate_out); - _pixelwise_mul_forget_gate_coeff.configure(compile_context, forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_forget_gate_coeff.configure(compile_context, forget_gate_out, + lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); // forget_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before forget_gate_out->allocator()->allocate(); - _accum_forget_gate_bias.configure(compile_context, &_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_forget_gate_bias.configure(compile_context, &_forget_layer_norm_out1, forget_gate_bias, + &_forget_layer_norm_out2, ConvertPolicy::SATURATE); _forget_layer_norm_out1.allocator()->allocate(); forget_gate_out = &_forget_layer_norm_out2; } - _activation_forget_gate.configure(compile_context, forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_forget_gate.configure(compile_context, forget_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the input gate // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG @@ -178,12 +286,13 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe // input_gate = Activation((input,output_state) * (input_to_input_weights,recurrent_to_input_weights) + 
PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); CLTensor *input_gate_out = &_input_gate_out1; - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { _memory_group.manage(&_input_gate_out1); _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _ones_fill.configure(compile_context, &_ones, PixelValue(1, _ones.info()->data_type())); - _subtract_input_gate.configure(compile_context, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE); + _subtract_input_gate.configure(compile_context, &_ones, forget_gate_out, &_input_gate_out1, + ConvertPolicy::SATURATE); _ones.allocator()->allocate(); _run_cifg_opt = true; } @@ -195,7 +304,8 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe std::vector lstm_weights; lstm_weights.emplace_back(lstm_params.input_to_input_weights()); lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights()); - TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); + TensorShape lstm_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); _input_gate_out2.allocator()->init(TensorInfo(lstm_weights_concat_shape, 1, input->info()->data_type())); _concat_weights_input_gate.configure(compile_context, lstm_weights, &_input_gate_out2, Window::DimX); @@ -203,15 +313,20 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _memory_group.manage(&_input_gate_out1); _memory_group.manage(&_input_gate_out3); - _fully_connected_input_gate.configure(compile_context, &_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), &_input_gate_out3); + _fully_connected_input_gate.configure(compile_context, &_forget_gate_out2, &_input_gate_out2, + (_is_layer_norm_lstm) ? 
nullptr : lstm_params.input_gate_bias(), + &_input_gate_out3); _input_gate_out2.allocator()->allocate(); input_gate_out = &_input_gate_out3; - if(_run_peephole_opt) + if (_run_peephole_opt) { _memory_group.manage(&_input_gate_out4); - _pixelwise_mul_input_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); - _accum_input_gate1.configure(compile_context, &_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE); + _pixelwise_mul_input_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), + &_input_gate_out4, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN); + _accum_input_gate1.configure(compile_context, &_input_gate_out3, &_input_gate_out4, &_input_gate_out1, + ConvertPolicy::SATURATE); _input_gate_out3.allocator()->allocate(); _input_gate_out4.allocator()->allocate(); input_gate_out = &_input_gate_out1; @@ -221,22 +336,25 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _input_gate_out1.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _input_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _input_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_input_layer_norm_out1); _memory_group.manage(&_input_layer_norm_out2); _mean_std_norm_input_gate.configure(compile_context, input_gate_out); - _pixelwise_mul_input_gate_coeff.configure(compile_context, input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_input_gate_coeff.configure(compile_context, input_gate_out, + lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); // input_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before input_gate_out->allocator()->allocate(); - _accum_input_gate_bias.configure(compile_context, &_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_input_gate_bias.configure(compile_context, &_input_layer_norm_out1, lstm_params.input_gate_bias(), + &_input_layer_norm_out2, ConvertPolicy::SATURATE); _input_layer_norm_out1.allocator()->allocate(); input_gate_out = &_input_layer_norm_out2; } - _activation_input_gate.configure(compile_context, input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_input_gate.configure(compile_context, input_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); } // Configure block that calculates the cell state @@ -249,44 +367,54 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_state_out1); - _fully_connected_cell_state.configure(compile_context, input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1); + _fully_connected_cell_state.configure(compile_context, input, input_to_cell_weights, + (_is_layer_norm_lstm) ? 
nullptr : cell_bias, &_cell_state_out1); _memory_group.manage(&_cell_state_out2); _transpose_cell_state->configure(compile_context, recurrent_to_cell_weights->info(), _cell_state_out2.info()); _recurrent_to_cell_weights = recurrent_to_cell_weights; _memory_group.manage(&_cell_state_out3); - _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f); + _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, + 0.f); _cell_state_out2.allocator()->allocate(); _memory_group.manage(&_cell_state_out4); - _accum_cell_state1.configure(compile_context, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE); + _accum_cell_state1.configure(compile_context, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, + ConvertPolicy::SATURATE); CLTensor *cell_state_out_ptr = &_cell_state_out4; - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _cell_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _cell_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_layer_norm_out1); _memory_group.manage(&_cell_layer_norm_out2); _mean_std_norm_cell_gate.configure(compile_context, cell_state_out_ptr); - _pixelwise_mul_cell_gate_coeff.configure(compile_context, cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_cell_gate_coeff.configure(compile_context, cell_state_out_ptr, + lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); // cell_state_out_ptr is going to be reassigned, so allocate the tensor that it was assigned to before cell_state_out_ptr->allocator()->allocate(); - _accum_cell_gate_bias.configure(compile_context, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_cell_gate_bias.configure(compile_context, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, + ConvertPolicy::SATURATE); _cell_layer_norm_out1.allocator()->allocate(); cell_state_out_ptr = &_cell_layer_norm_out2; } _activation_cell_state.configure(compile_context, cell_state_out_ptr, nullptr, activation_info); _memory_group.manage(&_cell_state_out5); - _pixelwise_mul_cell_state1.configure(compile_context, cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_cell_state1.configure(compile_context, cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); cell_state_out_ptr->allocator()->allocate(); - _pixelwise_mul_cell_state2.configure(compile_context, forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); - _accum_cell_state2.configure(compile_context, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE); + _pixelwise_mul_cell_state2.configure(compile_context, forget_gate_out, cell_state_in, &_cell_state_out3, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _accum_cell_state2.configure(compile_context, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, + ConvertPolicy::SATURATE); _cell_state_out3.allocator()->allocate(); _cell_state_out5.allocator()->allocate(); // Perform clipping - if(cell_threshold != 
0.f) + if (cell_threshold != 0.f) { _perform_cell_clipping = true; - _cell_clip.configure(compile_context, &_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, cell_threshold, -cell_threshold)); + _cell_clip.configure(compile_context, &_cell_state_out1, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + cell_threshold, -cell_threshold)); } // Configure block that calculates the output @@ -298,7 +426,8 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe std::vector in_out_weights; in_out_weights.emplace_back(input_to_output_weights); in_out_weights.emplace_back(recurrent_to_output_weights); - TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); + TensorShape in_out_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); _output2.allocator()->init(TensorInfo(in_out_weights_concat_shape, 1, input->info()->data_type())); _concat_weights_output.configure(compile_context, in_out_weights, &_output2, Window::DimX); @@ -306,18 +435,20 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _memory_group.manage(&_output1); _memory_group.manage(&_output4); - _fully_connected_output.configure(compile_context, &_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4); + _fully_connected_output.configure(compile_context, &_forget_gate_out2, &_output2, + (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4); _output2.allocator()->allocate(); _forget_gate_out2.allocator()->allocate(); CLTensor *output_gate_out = &_output4; - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { _output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type())); _memory_group.manage(&_output3); - _pixelwise_mul_output_state1.configure(compile_context, &_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_output_state1.configure(compile_context, &_cell_state_out1, lstm_params.cell_to_output_weights(), + &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); _accum_output1.configure(compile_context, &_output4, &_output3, &_output1, ConvertPolicy::SATURATE); _output4.allocator()->allocate(); output_gate_out = &_output1; @@ -329,22 +460,25 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe { _output1.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _output_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _output_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_output_layer_norm_out1); _memory_group.manage(&_output_layer_norm_out2); _mean_std_norm_output_gate.configure(compile_context, output_gate_out); - _pixelwise_mul_output_gate_coeff.configure(compile_context, output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_output_gate_coeff.configure(compile_context, output_gate_out, + lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); // output_gate_out is going to be 
reassigned, so allocate the tensor that it was assigned to before output_gate_out->allocator()->allocate(); - _accum_output_gate_bias.configure(compile_context, &_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_output_gate_bias.configure(compile_context, &_output_layer_norm_out1, output_gate_bias, + &_output_layer_norm_out2, ConvertPolicy::SATURATE); _output_layer_norm_out1.allocator()->allocate(); output_gate_out = &_output_layer_norm_out2; } - _activation_output.configure(compile_context, output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_output.configure(compile_context, output_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the output state /** lstm_res = PixelwiseMul(output, Activation(cell_state)) @@ -361,19 +495,24 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _memory_group.manage(&_cell_state_activation); _activation_output_state.configure(compile_context, &_cell_state_out1, &_cell_state_activation, activation_info); - _pixelwise_mul_output_state2.configure(compile_context, &_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_output_state2.configure(compile_context, &_cell_state_activation, output_gate_out, + output_state_out_tmp, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN); _cell_state_activation.allocator()->allocate(); - if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { _has_projection_weights = true; - _fully_connected_output_state.configure(compile_context, output_state_out_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out); + _fully_connected_output_state.configure(compile_context, output_state_out_tmp, lstm_params.projection_weights(), + lstm_params.projection_bias(), output_state_out); _output_state1.allocator()->allocate(); // Perform clipping - if(projection_threshold != 0.f) + if (projection_threshold != 0.f) { _perform_projection_clipping = true; - _projection_clip.configure(compile_context, output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)); + _projection_clip.configure(compile_context, output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -projection_threshold, projection_threshold)); } } @@ -383,7 +522,7 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe // Vector for holding the tensors to store in scratch buffer std::vector scratch_inputs; - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { scratch_inputs.emplace_back(input_gate_out); } @@ -397,29 +536,38 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe output_gate_out->allocator()->allocate(); } -Status CLLSTMLayer::validate(const ITensorInfo *input, - const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *output_state_in, const ITensorInfo 
*cell_state_in, - const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output, - const LSTMParams &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +Status CLLSTMLayer::validate(const ITensorInfo *input, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_in, + const ITensorInfo *scratch_buffer, + const ITensorInfo *output_state_out, + const ITensorInfo *cell_state_out, + const ITensorInfo *output, + const LSTMParams &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, - scratch_buffer, output_state_out, cell_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR( + input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); // Check data types ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, - scratch_buffer, output_state_out, cell_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES( + input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); // Check dimensions ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); @@ -438,16 +586,16 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(output_state_out->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(cell_state_out->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) - && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) && + cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0)); const unsigned int num_batches = input->dimension(1); const unsigned int num_cells = input_to_output_weights->dimension(1); - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { // If CIFG is used, input layer normalization weights tensor is omitted 
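For reference, the scratch-buffer check above (cell_bias->dimension(0) * 4 / * 3 against scratch_buffer->dimension(0)) encodes the gate layout used by the scratch concatenation at the end of configure(): the gate tensors are stacked along dimension 0, so the buffer is 4 * num_cells wide in the general case and 3 * num_cells when the CIFG optimisation drops the explicit input gate. A minimal illustrative helper (not part of this patch) expressing the same condition:

#include <cstddef>

// Mirrors the ARM_COMPUTE_RETURN_ERROR_ON condition above; num_cells is taken
// from cell_bias->dimension(0).
inline bool scratch_buffer_width_is_valid(std::size_t scratch_width, std::size_t num_cells)
{
    // 4 gates (input, forget, cell, output), or 3 when CIFG removes the input gate.
    return scratch_width == 4 * num_cells || scratch_width == 3 * num_cells;
}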
- if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights() != nullptr); } @@ -459,8 +607,12 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.input_layer_norm_weights()); } - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), + lstm_params.cell_layer_norm_weights(), + lstm_params.output_layer_norm_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), + lstm_params.cell_layer_norm_weights(), + lstm_params.output_layer_norm_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->num_dimensions() > 1); @@ -470,7 +622,7 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, } // Check peephole optimization - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() > 1); @@ -488,36 +640,42 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, TensorInfo cell_state_tmp = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type()); // Validate forget gate - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate)); + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate( + input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? 
nullptr : forget_gate_bias, &forget_gate)); std::vector inputs_vector; inputs_vector.emplace_back(input); inputs_vector.emplace_back(output_state_in); - const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0); + const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0); TensorInfo forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(inputs_vector, &forget_gate_concat, Window::DimX)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&forget_gate)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + &forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Validate input gate - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), - lstm_params.recurrent_to_input_weights(), - lstm_params.input_gate_bias()); + lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1); @@ -525,88 +683,121 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, std::vector lstm_weights; lstm_weights.emplace_back(lstm_params.input_to_input_weights()); lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights()); - TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); - TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type()); + TensorShape 
lstm_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); + TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(lstm_weights, &lstm_gate_concat, Window::DimX)); - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate)); + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate( + input, lstm_params.input_to_input_weights(), + (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&input_gate)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), &input_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), + &input_gate, ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + &input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); } // Validate cell state - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? 
nullptr : cell_bias, &cell_state_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo())); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); - if(lstm_params.use_layer_norm()) + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate( + input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo())); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&cell_state_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE)); } ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, nullptr, activation_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); - if(cell_threshold != 0.f) + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); + if (cell_threshold != 0.f) { - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, cell_threshold, - -cell_threshold))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&cell_state_tmp, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + cell_threshold, -cell_threshold))); } std::vector in_out_weights; in_out_weights.emplace_back(input_to_output_weights); in_out_weights.emplace_back(recurrent_to_output_weights); - TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); - TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type()); + 
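Taken together, the cell-state sequence validated above (fully connected layer, GEMM, addition, optional layer normalisation, activation, the two pixel-wise multiplications, the final addition and the optional LU_BOUNDED_RELU clip) corresponds per element to the usual LSTM cell update. A float sketch of the intended arithmetic, for orientation only; the library runs these steps as CL kernels on whole tensors:

#include <algorithm>

// c_prev: previous cell state; f, i: forget and input gate activations;
// g: candidate value after the cell activation (activation_info above).
inline float lstm_cell_update(float c_prev, float f, float i, float g, float cell_threshold)
{
    float c = f * c_prev + i * g;
    if (cell_threshold != 0.f)
    {
        // LU_BOUNDED_RELU(a = cell_threshold, b = -cell_threshold) clamps to [b, a],
        // i.e. [-cell_threshold, cell_threshold].
        c = std::min(cell_threshold, std::max(-cell_threshold, c));
    }
    return c;
}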
TensorShape in_out_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); + TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(in_out_weights, &in_out_gate_concat, Window::DimX)); // Validate output gate tmp - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp)); + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate( + input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, + ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&output_gate_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, + ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + &output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Validate output state ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, &cell_state_tmp, activation_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - if(lstm_params.has_projection()) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out)); - if(projection_threshold != 0.f) + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, + 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN)); + if (lstm_params.has_projection()) + { + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(&output_gate_tmp, 
lstm_params.projection_weights(), + lstm_params.projection_bias(), output_state_out)); + if (projection_threshold != 0.f) { - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output_state_out, output_state_out, - ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + output_state_out, output_state_out, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, + projection_threshold))); } } @@ -616,7 +807,7 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, // Validate scratch concatenation std::vector inputs_vector_info_raw; - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { inputs_vector_info_raw.push_back(&input_gate); } @@ -638,12 +829,12 @@ void CLLSTMLayer::run() _fully_connected_forget_gate.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { _pixelwise_mul_forget_gate.run(); _accum_forget_gate1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_forget_gate.run(); _pixelwise_mul_forget_gate_coeff.run(); @@ -651,7 +842,7 @@ void CLLSTMLayer::run() } _activation_forget_gate.run(); - if(_run_cifg_opt) + if (_run_cifg_opt) { _ones_fill.run(); _subtract_input_gate.run(); @@ -660,13 +851,13 @@ void CLLSTMLayer::run() { _fully_connected_input_gate.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { _pixelwise_mul_input_gate.run(); _accum_input_gate1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_input_gate.run(); _pixelwise_mul_input_gate_coeff.run(); @@ -679,12 +870,10 @@ void CLLSTMLayer::run() ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, _recurrent_to_cell_weights); pack.add_tensor(TensorType::ACL_DST, &_cell_state_out2); - CLScheduler::get().enqueue_op(*_transpose_cell_state, - pack, - false); + CLScheduler::get().enqueue_op(*_transpose_cell_state, pack, false); _gemm_cell_state1.run(); _accum_cell_state1.run(); - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_cell_gate.run(); _pixelwise_mul_cell_gate_coeff.run(); @@ -695,19 +884,19 @@ void CLLSTMLayer::run() _pixelwise_mul_cell_state2.run(); _accum_cell_state2.run(); - if(_perform_cell_clipping) + if (_perform_cell_clipping) { _cell_clip.run(); } _fully_connected_output.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { _pixelwise_mul_output_state1.run(); _accum_output1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_output_gate.run(); _pixelwise_mul_output_gate_coeff.run(); @@ -718,10 +907,10 @@ void CLLSTMLayer::run() _activation_output_state.run(); _pixelwise_mul_output_state2.run(); - if(_has_projection_weights) + if (_has_projection_weights) { _fully_connected_output_state.run(); - if(_perform_projection_clipping) + if (_perform_projection_clipping) { _projection_clip.run(); } @@ -735,10 +924,10 @@ void CLLSTMLayer::run() void CLLSTMLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _concat_weights_forget_gate.run(); - if(!_run_cifg_opt) + if (!_run_cifg_opt) { _concat_weights_input_gate.run(); } diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp index d14c6102d5bd36c2c715dba331f08e1a454ae602..ea64eda023313a9f46c2527accc2003300add380 100644 --- a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp +++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp @@ -25,12 +25,12 @@ #include 
"arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/helpers/AutoConfiguration.h" +#include "arm_compute/core/Validate.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/helpers/AutoConfiguration.h" #include @@ -46,48 +46,129 @@ const QuantizationInfo qsymm_0(1.f / 32768.f, 0); // qsymm16 with 0 integer bit } // namespace CLLSTMLayerQuantized::CLLSTMLayerQuantized(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _gemmlowp(), _output_stage(), _transpose_weights(), _concat_input_weights(), _concat_recurrent_weights(), _concat_weights(), _concat_inputs(), - _concat_bias(), _sigmoid_forget_gate(), _sigmoid_input_gate(), _sigmoid_output_gate(), _tanh_modulation_gate(), _tanh_output_state(), _add_cell_state_tmps(), _add2(), _mul_forget_gate_cell_state(), - _mul_input_gate_input_mod_gate(), _mul_output_state_tmp_output_gate(), _slice_input_tensor(), _slice_forget_tensor(), _slice_cell_tensor(), _slice_output_tensor(), _dequantize(), _quantize(), - _input_to_input_weights(nullptr), _input_to_forget_weights(nullptr), _input_to_cell_weights(nullptr), _input_to_output_weights(nullptr), _recurrent_to_input_weights(nullptr), - _recurrent_to_forget_weights(nullptr), _recurrent_to_cell_weights(nullptr), _recurrent_to_output_weights(nullptr), _input_gate_bias(nullptr), _forget_gate_bias(nullptr), _cell_bias(nullptr), - _output_gate_bias(nullptr), _recurrent_weights(), _input_weights(), _weights(), _input(), _weights_transposed(), _output_highp(), _output_lowp(), _bias(), _forget_gate_input(), _input_gate_input(), - _output_gate_input(), _input_modulation_gate_input(), _forget_gate_output(), _input_gate_output(), _output_gate_output(), _input_modulation_gate_output(), _cell_state_tmp1(), _cell_state_tmp2(), - _output_state_tmp(), _output_state_out_symm(), _output_state_out_f32(), _is_prepared(false) + : _memory_group(std::move(memory_manager)), + _gemmlowp(), + _output_stage(), + _transpose_weights(), + _concat_input_weights(), + _concat_recurrent_weights(), + _concat_weights(), + _concat_inputs(), + _concat_bias(), + _sigmoid_forget_gate(), + _sigmoid_input_gate(), + _sigmoid_output_gate(), + _tanh_modulation_gate(), + _tanh_output_state(), + _add_cell_state_tmps(), + _add2(), + _mul_forget_gate_cell_state(), + _mul_input_gate_input_mod_gate(), + _mul_output_state_tmp_output_gate(), + _slice_input_tensor(), + _slice_forget_tensor(), + _slice_cell_tensor(), + _slice_output_tensor(), + _dequantize(), + _quantize(), + _input_to_input_weights(nullptr), + _input_to_forget_weights(nullptr), + _input_to_cell_weights(nullptr), + _input_to_output_weights(nullptr), + _recurrent_to_input_weights(nullptr), + _recurrent_to_forget_weights(nullptr), + _recurrent_to_cell_weights(nullptr), + _recurrent_to_output_weights(nullptr), + _input_gate_bias(nullptr), + _forget_gate_bias(nullptr), + _cell_bias(nullptr), + _output_gate_bias(nullptr), + _recurrent_weights(), + _input_weights(), + _weights(), + _input(), + _weights_transposed(), + _output_highp(), + _output_lowp(), + _bias(), + _forget_gate_input(), + _input_gate_input(), + _output_gate_input(), + _input_modulation_gate_input(), + _forget_gate_output(), + _input_gate_output(), + _output_gate_output(), + _input_modulation_gate_output(), + _cell_state_tmp1(), + 
_cell_state_tmp2(), + _output_state_tmp(), + _output_state_out_symm(), + _output_state_out_f32(), + _is_prepared(false) { } void CLLSTMLayerQuantized::configure(const ICLTensor *input, - const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - ICLTensor *cell_state_in, const ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out) + const ICLTensor *input_to_input_weights, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_input_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *input_gate_bias, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, + const ICLTensor *output_state_in, + ICLTensor *cell_state_out, + ICLTensor *output_state_out) { - configure(CLKernelLibrary::get().get_compile_context(), input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, - output_state_out); + configure(CLKernelLibrary::get().get_compile_context(), input, input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); } -void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, const ICLTensor *input, - const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - ICLTensor *cell_state_in, const ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out) +void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *input_to_input_weights, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_input_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *input_gate_bias, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, 
+ ICLTensor *cell_state_in, + const ICLTensor *output_state_in, + ICLTensor *cell_state_out, + ICLTensor *output_state_out) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); - - ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, + forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, + cell_state_out, output_state_out); + + ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, + cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); - ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayerQuantized::validate(input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), - input_to_output_weights->info(), - recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - input_gate_bias->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info())); + ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayerQuantized::validate( + input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), + input_to_output_weights->info(), recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), + recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), input_gate_bias->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), + output_state_in->info(), cell_state_out->info(), output_state_out->info())); const int input_size = input->info()->dimension(0); const int batch_size = input->info()->dimension(1); @@ -95,8 +176,10 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co const QuantizationInfo qweights = input_to_input_weights->info()->quantization_info(); // Weights quantization - auto_init_if_empty(*cell_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4)); - auto_init_if_empty(*output_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm)); + auto_init_if_empty(*cell_state_out->info(), + TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4)); + auto_init_if_empty(*output_state_out->info(), + TensorInfo(TensorShape(batch_size, output_size), 1, 
DataType::QASYMM8, qasymm)); _input_to_input_weights = input_to_input_weights; _input_to_forget_weights = input_to_forget_weights; @@ -124,17 +207,20 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co recurrent_weights_vector.emplace_back(recurrent_to_cell_weights); recurrent_weights_vector.emplace_back(recurrent_to_output_weights); - _input_weights.allocator()->init(TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _input_weights.allocator()->init( + TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_input_weights.configure(compile_context, inputs_weights_vector, &_input_weights, Window::DimY); - _recurrent_weights.allocator()->init(TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _recurrent_weights.allocator()->init( + TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_recurrent_weights.configure(compile_context, recurrent_weights_vector, &_recurrent_weights, Window::DimY); std::vector weights_vector; weights_vector.emplace_back(&_recurrent_weights); weights_vector.emplace_back(&_input_weights); - _weights.allocator()->init(TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _weights.allocator()->init( + TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_weights.configure(compile_context, weights_vector, &_weights, Window::DimX); _transpose_weights.configure(compile_context, &_weights, &_weights_transposed); @@ -144,7 +230,8 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co input_vector.emplace_back(output_state_in); _memory_group.manage(&_input); - _input.allocator()->init(TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); + _input.allocator()->init( + TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); _concat_inputs.configure(compile_context, input_vector, &_input, Window::DimX); // Bias concatenation @@ -159,7 +246,8 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co // Invert the offset for gemmlowp _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset)); - _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); + _weights_transposed.info()->set_quantization_info( + QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); // Run gemmlowp _memory_group.manage(&_output_highp); @@ -169,7 +257,8 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co // Set the offset back _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); - _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); + _weights_transposed.info()->set_quantization_info( + QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); // multiplier = (input_scale * weights_scale) / output_scale (2 ^ (-12)) _output_lowp.allocator()->init(TensorInfo(_output_highp.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_3)); @@ -191,85 +280,111 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co _bias.allocator()->allocate(); // Get the gate tensors - if(batch_size 
> 1) + if (batch_size > 1) { _memory_group.manage(&_input_gate_input); - _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size }); + _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, {0, 0}, + {output_size, batch_size}); _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }); + _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, {output_size, 0}, + {2 * output_size, batch_size}); _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }); + _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, + {2 * output_size, 0}, {3 * output_size, batch_size}); _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }); + _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, {3 * output_size, 0}, + {4 * output_size, batch_size}); _output_lowp.allocator()->allocate(); } else { _memory_group.manage(&_input_gate_input); - _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, { 0 }, { output_size }); + _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, {0}, {output_size}); _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size }); + _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, {output_size}, + {2 * output_size}); _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size }); + _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, {2 * output_size}, + {3 * output_size}); _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size }); + _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, {3 * output_size}, + {4 * output_size}); _output_lowp.allocator()->allocate(); } // Forget gate _memory_group.manage(&_forget_gate_output); - _forget_gate_output.allocator()->init(TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_forget_gate.configure(compile_context, &_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _forget_gate_output.allocator()->init( + TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_forget_gate.configure(compile_context, &_forget_gate_input, &_forget_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _forget_gate_input.allocator()->allocate(); // Input gate _memory_group.manage(&_input_gate_output); - _input_gate_output.allocator()->init(TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_input_gate.configure(compile_context, &_input_gate_input, &_input_gate_output, 
ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _input_gate_output.allocator()->init( + TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_input_gate.configure(compile_context, &_input_gate_input, &_input_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _input_gate_input.allocator()->allocate(); // Input modulation gate equation _memory_group.manage(&_input_modulation_gate_output); - _input_modulation_gate_output.allocator()->init(TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_modulation_gate.configure(compile_context, &_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _input_modulation_gate_output.allocator()->init( + TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_modulation_gate.configure(compile_context, &_input_modulation_gate_input, &_input_modulation_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); _input_modulation_gate_input.allocator()->allocate(); // Output gate _memory_group.manage(&_output_gate_output); - _output_gate_output.allocator()->init(TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_output_gate.configure(compile_context, &_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _output_gate_output.allocator()->init( + TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_output_gate.configure(compile_context, &_output_gate_input, &_output_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _output_gate_input.allocator()->allocate(); // Long term memory _memory_group.manage(&_cell_state_tmp1); - _cell_state_tmp1.allocator()->init(TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul_forget_gate_cell_state.configure(compile_context, &_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _cell_state_tmp1.allocator()->init( + TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); + _mul_forget_gate_cell_state.configure(compile_context, &_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _forget_gate_output.allocator()->allocate(); _memory_group.manage(&_cell_state_tmp2); - _cell_state_tmp2.allocator()->init(TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul_input_gate_input_mod_gate.configure(compile_context, &_input_gate_output, &_input_modulation_gate_output, &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _cell_state_tmp2.allocator()->init( + TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); + _mul_input_gate_input_mod_gate.configure(compile_context, &_input_gate_output, &_input_modulation_gate_output, + &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _input_modulation_gate_output.allocator()->allocate(); _input_gate_output.allocator()->allocate(); - _add_cell_state_tmps.configure(compile_context, &_cell_state_tmp1, &_cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE); + 
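The slice/activation/multiply chain configured above is the gate combination of the quantized LSTM; the tensors are QSYMM16 (qsymm_0/qsymm_4) with saturating arithmetic, but per element the intent is the standard update. A float-domain sketch for orientation (the struct and function names are illustrative, not from the patch):

#include <cmath>

struct LstmStep
{
    float cell_state;   // corresponds to cell_state_out (QSYMM16, qsymm_4)
    float output_state; // corresponds to the output state before requantization
};

// f, i, g, o: forget, input, modulation (tanh) and output gate values after the
// sigmoid/tanh activations configured above; c_prev: incoming cell state.
inline LstmStep quantized_lstm_element(float c_prev, float f, float i, float g, float o)
{
    LstmStep s{};
    // _mul_forget_gate_cell_state + _mul_input_gate_input_mod_gate + _add_cell_state_tmps
    s.cell_state = f * c_prev + i * g;
    // _tanh_output_state followed by _mul_output_state_tmp_output_gate
    s.output_state = std::tanh(s.cell_state) * o;
    return s;
}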
_add_cell_state_tmps.configure(compile_context, &_cell_state_tmp1, &_cell_state_tmp2, cell_state_out, + ConvertPolicy::SATURATE); _cell_state_tmp1.allocator()->allocate(); _cell_state_tmp2.allocator()->allocate(); // Short term memory _memory_group.manage(&_output_state_tmp); - _output_state_tmp.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_output_state.configure(compile_context, cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _output_state_tmp.allocator()->init( + TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_output_state.configure(compile_context, cell_state_out, &_output_state_tmp, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); _memory_group.manage(&_output_state_out_symm); - _output_state_out_symm.allocator()->init(TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _mul_output_state_tmp_output_gate.configure(compile_context, &_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _output_state_out_symm.allocator()->init( + TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _mul_output_state_tmp_output_gate.configure(compile_context, &_output_state_tmp, &_output_gate_output, + &_output_state_out_symm, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _output_gate_output.allocator()->allocate(); _output_state_tmp.allocator()->allocate(); // Requantize the output state from QSYMM16 to QASYMM8 _memory_group.manage(&_output_state_out_f32); - _output_state_out_f32.allocator()->init(TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); + _output_state_out_f32.allocator()->init( + TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); _dequantize.configure(compile_context, &_output_state_out_symm, &_output_state_out_f32); _output_state_out_symm.allocator()->allocate(); @@ -278,15 +393,28 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co } Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, - const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out) + const ITensorInfo *input_to_input_weights, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_input_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *input_gate_bias, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo 
*cell_state_out, + const ITensorInfo *output_state_out) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, - output_state_in, cell_state_out, output_state_out); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR( + input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, + input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, + output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::QASYMM8); const int input_size = input->dimension(0); @@ -299,29 +427,51 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(input_gate_bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2); - TensorInfo input_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(input_size, output_size)).set_data_type(DataType::QASYMM8)); - TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8)); - TensorInfo bias_info(input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); - TensorInfo output_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm)); - TensorInfo cell_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QSYMM16).set_quantization_info(qsymm_4)); + TensorInfo input_weights_info(input_to_input_weights->clone() + ->set_tensor_shape(TensorShape(input_size, output_size)) + .set_data_type(DataType::QASYMM8)); + TensorInfo recurrent_weights_info(input_to_input_weights->clone() + ->set_tensor_shape(TensorShape(output_size, output_size)) + .set_data_type(DataType::QASYMM8)); + TensorInfo bias_info( + input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); + TensorInfo output_state_info(cell_state_in->clone() + ->set_tensor_shape(TensorShape(output_size, batch_size)) + .set_data_type(DataType::QASYMM8) + .set_quantization_info(qasymm)); + TensorInfo cell_state_info(cell_state_in->clone() + ->set_tensor_shape(TensorShape(output_size, batch_size)) + .set_data_type(DataType::QSYMM16) + .set_quantization_info(qsymm_4)); // Shape checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, 
recurrent_to_input_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_in); // Data type checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, + input_to_forget_weights, input_to_cell_weights, + input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&recurrent_weights_info, recurrent_to_input_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_in); // Quantization checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in); @@ -343,7 +493,8 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, recurrent_weights_vector.emplace_back(recurrent_to_cell_weights); recurrent_weights_vector.emplace_back(recurrent_to_output_weights); const TensorInfo recurrent_weights(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights); - ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY)); // _concat_weights std::vector weights_vector; @@ -353,7 +504,7 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(weights_vector, &weights, Window::DimX)); // _transpose_weights const TensorShape weights_transposed_shape(weights.tensor_shape()[1], weights.tensor_shape()[0]); - TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape); + TensorInfo 
weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape); ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(&weights, &weights_transposed)); // _concat_inputs @@ -379,7 +530,8 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, // _gemmlowp const TensorInfo output_highp(TensorShape(4 * output_size, batch_size), 1, DataType::S32); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp)); // Set the offset back input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); @@ -390,7 +542,8 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale; int output_multiplier = 0; int output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); // _output_stage GEMMLowpOutputStageInfo info{}; @@ -405,68 +558,91 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, TensorInfo input_modulation_gate_input; TensorInfo output_gate_input; - if(batch_size > 1) + if (batch_size > 1) { // _slice_input_tensor input_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, { 0, 0 }, { output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &input_gate_input, {0, 0}, {output_size, batch_size})); // _slice_forget_tensor forget_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &forget_gate_input, {output_size, 0}, {2 * output_size, batch_size})); // _slice_cell_tensor input_modulation_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size, 0}, + {3 * output_size, batch_size})); // _slice_output_tensor output_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &output_gate_input, {3 * output_size, 0}, {4 * output_size, batch_size})); } else { // _slice_input_tensor input_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, { 0 }, { output_size })); + ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, {0}, {output_size})); // _slice_forget_tensor forget_gate_input = TensorInfo(TensorShape(output_size), 1, 
DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &forget_gate_input, { output_size }, { 2 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &forget_gate_input, {output_size}, {2 * output_size})); // _slice_cell_tensor input_modulation_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size }, { 3 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size}, {3 * output_size})); // _slice_output_tensor output_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &output_gate_input, { 3 * output_size }, { 4 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &output_gate_input, {3 * output_size}, {4 * output_size})); } // _sigmoid_forget_gate const TensorInfo forget_gate_output(forget_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_gate_input, &forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&forget_gate_input, &forget_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _sigmoid_input_gate const TensorInfo input_gate_output(input_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + &input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _tanh_modulation_gate - const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, + qsymm_0); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); // _sigmoid_output_gate const TensorInfo output_gate_output(output_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_gate_input, &output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&output_gate_input, &output_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _mul_forget_gate_cell_state const TensorInfo cell_state_tmp1(forget_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &forget_gate_output, cell_state_in, 
&cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); // _mul_input_gate_input_mod_gate const TensorInfo cell_state_tmp2(input_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, &cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, + &cell_state_tmp2, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); // _add_cell_state_tmps - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE)); // _tanh_modulation_gate const TensorInfo output_state_tmp(cell_state_out->tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, &output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(cell_state_out, &output_state_tmp, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); // _mul_output_state_tmp_output_gate const TensorInfo output_state_out_symm(output_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, &output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, + &output_state_out_symm, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); // _dequantize const TensorInfo output_state_out_f32(output_state_out_symm.tensor_shape(), 1, DataType::F32); @@ -475,14 +651,14 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, // _quantize ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayer::validate(&output_state_out_f32, output_state_out)); - if(cell_state_out->total_size() != 0) + if (cell_state_out->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_out); } - if(output_state_out->total_size() != 0) + if (output_state_out->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_out); @@ -541,7 +717,7 @@ void CLLSTMLayerQuantized::run() void CLLSTMLayerQuantized::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _input_weights.allocator()->allocate(); _concat_input_weights.run(); diff --git a/src/runtime/CL/functions/CLLogicalAnd.cpp b/src/runtime/CL/functions/CLLogicalAnd.cpp index 696191c485998e2a17c7852212dc68efd1a23a33..ea21c54bc346687bed91d517220d029edc81265f 100644 --- a/src/runtime/CL/functions/CLLogicalAnd.cpp +++ b/src/runtime/CL/functions/CLLogicalAnd.cpp @@ -22,10 +22,11 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/functions/CLLogicalAnd.h" + #include "arm_compute/core/CL/ICLTensor.h" -#include "src/gpu/cl/kernels/ClElementwiseKernel.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" #include @@ -33,7 +34,10 @@ namespace arm_compute { namespace experimental { -void CLLogicalAnd::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output) +void CLLogicalAnd::configure(const CLCompileContext &compile_context, + ITensorInfo *input1, + ITensorInfo *input2, + ITensorInfo *output) { ARM_COMPUTE_LOG_PARAMS(input1, input2, output); auto k = std::make_unique(); @@ -54,17 +58,16 @@ void CLLogicalAnd::run(ITensorPack &tensors) struct CLLogicalAnd::Impl { - const ICLTensor *src0{ nullptr }; - const ICLTensor *src1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src0{nullptr}; + const ICLTensor *src1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLLogicalAnd::CLLogicalAnd() - : _impl(std::make_unique()) +CLLogicalAnd::CLLogicalAnd() : _impl(std::make_unique()) { } -CLLogicalAnd::CLLogicalAnd(CLLogicalAnd &&) = default; +CLLogicalAnd::CLLogicalAnd(CLLogicalAnd &&) = default; CLLogicalAnd &CLLogicalAnd::operator=(CLLogicalAnd &&) = default; CLLogicalAnd::~CLLogicalAnd() = default; @@ -73,7 +76,10 @@ void CLLogicalAnd::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *ou configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); } -void CLLogicalAnd::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output) +void CLLogicalAnd::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output) { _impl->src0 = input1; _impl->src1 = input2; diff --git a/src/runtime/CL/functions/CLLogicalNot.cpp b/src/runtime/CL/functions/CLLogicalNot.cpp index a0504d785203ce20ce105c756b849ba8ecba46b7..71f9cce54f7cd5a4c4672b1ddf1cbc2352629317 100644 --- a/src/runtime/CL/functions/CLLogicalNot.cpp +++ b/src/runtime/CL/functions/CLLogicalNot.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClLogicalNot.h" @@ -32,16 +33,15 @@ namespace arm_compute { struct CLLogicalNot::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLLogicalNot::CLLogicalNot() - : _impl(std::make_unique()) +CLLogicalNot::CLLogicalNot() : _impl(std::make_unique()) { } -CLLogicalNot::CLLogicalNot(CLLogicalNot &&) = default; +CLLogicalNot::CLLogicalNot(CLLogicalNot &&) = default; CLLogicalNot &CLLogicalNot::operator=(CLLogicalNot &&) = default; CLLogicalNot::~CLLogicalNot() = default; @@ -72,4 +72,4 @@ void CLLogicalNot::run() _impl->op->run(pack); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLLogicalOr.cpp b/src/runtime/CL/functions/CLLogicalOr.cpp index f9a606e8a5d38c3b04998aa7777bd67a4e713b7c..3db4fdae84bcb62ae1c1b0e534ee97530f99dfb2 100644 --- a/src/runtime/CL/functions/CLLogicalOr.cpp +++ b/src/runtime/CL/functions/CLLogicalOr.cpp @@ -22,10 +22,11 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/functions/CLLogicalOr.h" + #include "arm_compute/core/CL/ICLTensor.h" -#include "src/gpu/cl/kernels/ClElementwiseKernel.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" #include @@ -33,7 +34,10 @@ namespace arm_compute { namespace experimental { -void CLLogicalOr::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output) +void CLLogicalOr::configure(const CLCompileContext &compile_context, + ITensorInfo *input1, + ITensorInfo *input2, + ITensorInfo *output) { ARM_COMPUTE_LOG_PARAMS(input1, input2, output); auto k = std::make_unique(); @@ -54,17 +58,16 @@ void CLLogicalOr::run(ITensorPack &tensors) struct CLLogicalOr::Impl { - const ICLTensor *src0{ nullptr }; - const ICLTensor *src1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src0{nullptr}; + const ICLTensor *src1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLLogicalOr::CLLogicalOr() - : _impl(std::make_unique()) +CLLogicalOr::CLLogicalOr() : _impl(std::make_unique()) { } -CLLogicalOr::CLLogicalOr(CLLogicalOr &&) = default; +CLLogicalOr::CLLogicalOr(CLLogicalOr &&) = default; CLLogicalOr &CLLogicalOr::operator=(CLLogicalOr &&) = default; CLLogicalOr::~CLLogicalOr() = default; @@ -73,7 +76,10 @@ void CLLogicalOr::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *out configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); } -void CLLogicalOr::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output) +void CLLogicalOr::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output) { _impl->src0 = input1; _impl->src1 = input2; diff --git a/src/runtime/CL/functions/CLMatMul.cpp b/src/runtime/CL/functions/CLMatMul.cpp index bef422fca1e7bea7866e4476510a609d27e49d17..e8bdad706b011e2a30ec6af7bbc1f9b7fbeb1b21 100644 --- a/src/runtime/CL/functions/CLMatMul.cpp +++ b/src/runtime/CL/functions/CLMatMul.cpp @@ -22,8 +22,10 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/functions/CLMatMul.h" + #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTypes.h" + #include "src/gpu/cl/operators/ClMatMul.h" namespace arm_compute @@ -32,23 +34,32 @@ using OperatorType = opencl::ClMatMul; struct CLMatMul::Impl { - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; }; -CLMatMul::CLMatMul() - : _impl(std::make_unique()) +CLMatMul::CLMatMul() : _impl(std::make_unique()) { } CLMatMul::~CLMatMul() = default; -void CLMatMul::configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings, const ActivationLayerInfo &act_info) +void CLMatMul::configure(ICLTensor *lhs, + ICLTensor *rhs, + ICLTensor *output, + const MatMulInfo &matmul_info, + const GpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(settings); configure(CLKernelLibrary::get().get_compile_context(), lhs, rhs, output, matmul_info, settings, act_info); } -void CLMatMul::configure(const CLCompileContext &compile_context, ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings, +void CLMatMul::configure(const CLCompileContext &compile_context, + ICLTensor *lhs, + ICLTensor *rhs, + ICLTensor *output, + const MatMulInfo &matmul_info, + const GpuMatMulSettings &settings, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output); @@ -56,10 +67,14 @@ void CLMatMul::configure(const CLCompileContext &compile_context, ICLTensor *lhs _impl->op = std::make_unique(); _impl->op->configure(compile_context, lhs->info(), rhs->info(), output->info(), matmul_info, act_info); - _impl->run_pack = { { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs }, { ACL_DST, output } }; + _impl->run_pack = {{ACL_SRC_0, lhs}, {ACL_SRC_1, rhs}, {ACL_DST, output}}; } -Status CLMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info) +Status CLMatMul::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *output, + const MatMulInfo &matmul_info, + const ActivationLayerInfo &act_info) { return OperatorType::validate(lhs, rhs, output, matmul_info, act_info); } diff --git a/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp index 2786d32d3337ffe701be537e28f250059e941a37..7494f379b91de4b317693a694c68f60baf91be34 100644 --- a/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp +++ b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp @@ -27,26 +27,32 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h" namespace arm_compute { CLMaxUnpoolingLayer::CLMaxUnpoolingLayer() - : _fill(), - _unpooling_layer_kernel(std::make_unique()) + : _fill(), _unpooling_layer_kernel(std::make_unique()) { } CLMaxUnpoolingLayer::~CLMaxUnpoolingLayer() = default; -void CLMaxUnpoolingLayer::configure(ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info) +void CLMaxUnpoolingLayer::configure(ICLTensor *input, + ICLTensor *indices, + ICLTensor *output, + const PoolingLayerInfo &pool_info) { configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, pool_info); } 
-void CLMaxUnpoolingLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info) +void CLMaxUnpoolingLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *indices, + ICLTensor *output, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_LOG_PARAMS(input, indices, output, pool_info); const PixelValue zero_value(0.f); @@ -55,7 +61,10 @@ void CLMaxUnpoolingLayer::configure(const CLCompileContext &compile_context, ICL _unpooling_layer_kernel->configure(compile_context, input, indices, output, pool_info); } -Status CLMaxUnpoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info) +Status CLMaxUnpoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *indices, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info) { return CLMaxUnpoolingLayerKernel::validate(input, indices, output, pool_info); } diff --git a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp index a81cbca1b00b958ffb92f533269292b669450c9c..5892c0e840aa99a28ca55c7fa27c54d372e83e67 100644 --- a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp @@ -24,9 +24,9 @@ #include "arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h" #include "arm_compute/core/Types.h" -#include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h" namespace arm_compute { @@ -35,7 +35,10 @@ void CLMeanStdDevNormalizationLayer::configure(ICLTensor *input, ICLTensor *outp configure(CLKernelLibrary::get().get_compile_context(), input, output, epsilon); } -void CLMeanStdDevNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float epsilon) +void CLMeanStdDevNormalizationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + float epsilon) { ARM_COMPUTE_LOG_PARAMS(input, output, epsilon); auto k = std::make_unique(); diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp index c0cc5184e6d9601484d6aa325b31f2e3422ec594..f93f82f1a2280928a931a52dc9ef3a9864671326 100644 --- a/src/runtime/CL/functions/CLNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLNormalizationLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLNormalizationLayerKernel.h" namespace arm_compute { @@ -50,7 +50,10 @@ void CLNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const configure(CLKernelLibrary::get().get_compile_context(), input, output, norm_info); } -void CLNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info) +void CLNormalizationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const NormalizationLayerInfo &norm_info) { ARM_COMPUTE_ERROR_ON(input == 
nullptr); ARM_COMPUTE_LOG_PARAMS(input, output, norm_info); @@ -58,21 +61,24 @@ void CLNormalizationLayer::configure(const CLCompileContext &compile_context, IC // Configure normalization kernel _norm_kernel->configure(compile_context, input, output, norm_info); - if(!_norm_kernel->border_size().empty()) + if (!_norm_kernel->border_size().empty()) { // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel - _border_handler->configure(compile_context, input, _norm_kernel->border_size(), BorderMode::CONSTANT, PixelValue()); + _border_handler->configure(compile_context, input, _norm_kernel->border_size(), BorderMode::CONSTANT, + PixelValue()); } } -Status CLNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info) +Status CLNormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const NormalizationLayerInfo &norm_info) { return CLNormalizationLayerKernel::validate(input, output, norm_info); } void CLNormalizationLayer::run() { - if(!_norm_kernel->border_size().empty()) + if (!_norm_kernel->border_size().empty()) { // Run border handler CLScheduler::get().enqueue(*_border_handler, false); diff --git a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp index 63c9164a9412b094ad122733b4b4664f4da09f28..939c95bd454ad530e0ae87d9efa405789d2cba9a 100644 --- a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp +++ b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp @@ -24,20 +24,26 @@ #include "arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h" -#include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h" #include namespace arm_compute { -void CLNormalizePlanarYUVLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std) +void CLNormalizePlanarYUVLayer::configure(const ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *std) { configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, std); } -void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std) +void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *std) { ARM_COMPUTE_LOG_PARAMS(input, output, mean, std); auto k = std::make_unique(); @@ -45,8 +51,10 @@ void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_contex _kernel = std::move(k); } -Status CLNormalizePlanarYUVLayer::validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *std) +Status CLNormalizePlanarYUVLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *std) { return CLNormalizePlanarYUVLayerKernel::validate(input, output, mean, std); } diff --git a/src/runtime/CL/functions/CLPReluLayer.cpp b/src/runtime/CL/functions/CLPReluLayer.cpp index 186e7b4ba2b72c35e268ebf2b6b05e75c8263a1b..ce6d285ebe354a5480e5fad915b6bf38b63355e2 100644 --- a/src/runtime/CL/functions/CLPReluLayer.cpp +++ b/src/runtime/CL/functions/CLPReluLayer.cpp @@ -22,8 +22,10 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/functions/CLPReluLayer.h" + #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" + #include "src/gpu/cl/IClKernel.h" #include "src/gpu/cl/operators/ClPRelu.h" @@ -33,17 +35,16 @@ using OperatorType = opencl::ClPRelu; struct CLPReluLayer::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLPReluLayer::CLPReluLayer() - : _impl(std::make_unique()) +CLPReluLayer::CLPReluLayer() : _impl(std::make_unique()) { } -CLPReluLayer::CLPReluLayer(CLPReluLayer &&) = default; +CLPReluLayer::CLPReluLayer(CLPReluLayer &&) = default; CLPReluLayer &CLPReluLayer::operator=(CLPReluLayer &&) = default; CLPReluLayer::~CLPReluLayer() = default; @@ -52,13 +53,17 @@ void CLPReluLayer::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *outp configure(CLKernelLibrary::get().get_compile_context(), input, alpha, output); } -void CLPReluLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *alpha, ICLTensor *output) +void CLPReluLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *alpha, + ICLTensor *output) { _impl->src_0 = input; _impl->src_1 = alpha; _impl->dst = output; _impl->op = std::make_unique(); - _impl->op->configure(compile_context, input->info(), alpha->info(), (output == nullptr ? input->info() : output->info())); + _impl->op->configure(compile_context, input->info(), alpha->info(), + (output == nullptr ? input->info() : output->info())); } Status CLPReluLayer::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output) diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp index 0ed8f03d64518795c67ff31737595869c4c191fd..e788ded512ce57f105ffa1d7d5bd208c927cc6df 100644 --- a/src/runtime/CL/functions/CLPadLayer.cpp +++ b/src/runtime/CL/functions/CLPadLayer.cpp @@ -22,37 +22,38 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/functions/CLPadLayer.h" -#include "src/core/CL/kernels/CLPadLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLPadLayerKernel.h" namespace arm_compute { -CLPadLayer::CLPadLayer() - : _pad_kernel(std::make_unique()), - _copy(), - _perform_pad(false) +CLPadLayer::CLPadLayer() : _pad_kernel(std::make_unique()), _copy(), _perform_pad(false) { } CLPadLayer::~CLPadLayer() = default; -void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +void CLPadLayer::configure( + ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) { configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, mode); } -void CLPadLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +void CLPadLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const PaddingList &padding, + PixelValue constant_value, + PaddingMode mode) { ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode)); ARM_COMPUTE_LOG_PARAMS(input, output, padding, constant_value, mode); - _perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) - { - return info.first > 0 || info.second > 0; - }); + _perform_pad = + std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) { return info.first > 0 || info.second > 0; }); - if(_perform_pad) + if (_perform_pad) { _pad_kernel->configure(compile_context, input, output, padding, constant_value, mode); } @@ -62,14 +63,16 @@ void CLPadLayer::configure(const CLCompileContext &compile_context, ICLTensor *i _copy.configure(compile_context, input, output); } } -Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +Status CLPadLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + PixelValue constant_value, + PaddingMode mode) { - bool perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) - { - return info.first > 0 || info.second > 0; - }); + bool perform_pad = + std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) { return info.first > 0 || info.second > 0; }); - if(perform_pad) + if (perform_pad) { ARM_COMPUTE_RETURN_ON_ERROR(CLPadLayerKernel::validate(input, output, padding, constant_value, mode)); } @@ -81,7 +84,7 @@ Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, } void CLPadLayer::run() { - if(_perform_pad) + if (_perform_pad) { CLScheduler::get().enqueue(*_pad_kernel); } diff --git a/src/runtime/CL/functions/CLPermute.cpp b/src/runtime/CL/functions/CLPermute.cpp index a56afff7df255b40b9ad9fb1cfdb3843898408c4..7f97eed98a9d5b88a17fe502fa2a0790d4c1e9e7 100644 --- a/src/runtime/CL/functions/CLPermute.cpp +++ b/src/runtime/CL/functions/CLPermute.cpp @@ -27,22 +27,21 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClPermute.h" #include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClPermute.h" namespace arm_compute { struct CLPermute::Impl { - const 
ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLPermute::CLPermute() - : _impl(std::make_unique()) +CLPermute::CLPermute() : _impl(std::make_unique()) { } @@ -53,7 +52,10 @@ void CLPermute::configure(const ICLTensor *input, ICLTensor *output, const Permu configure(CLKernelLibrary::get().get_compile_context(), input, output, perm); } -void CLPermute::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PermutationVector &perm) +void CLPermute::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const PermutationVector &perm) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, output, perm); diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp index 9d91e583674fd8e5b6113b2c510f78539e98e011..6aa9d9cbb3c84449146173409ca8df3f73bd22f7 100644 --- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp +++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClMul.h" @@ -34,38 +35,55 @@ namespace arm_compute { struct CLPixelWiseMultiplication::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLPixelWiseMultiplication::CLPixelWiseMultiplication() - : _impl(std::make_unique()) +CLPixelWiseMultiplication::CLPixelWiseMultiplication() : _impl(std::make_unique()) { } -CLPixelWiseMultiplication::CLPixelWiseMultiplication(CLPixelWiseMultiplication &&) = default; +CLPixelWiseMultiplication::CLPixelWiseMultiplication(CLPixelWiseMultiplication &&) = default; CLPixelWiseMultiplication &CLPixelWiseMultiplication::operator=(CLPixelWiseMultiplication &&) = default; CLPixelWiseMultiplication::~CLPixelWiseMultiplication() = default; -void CLPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +void CLPixelWiseMultiplication::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { - configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy, rounding_policy, act_info); + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy, + rounding_policy, act_info); } -void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = 
input2; _impl->dst = output; _impl->op = std::make_unique(); - _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy, act_info); + _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), scale, overflow_policy, + rounding_policy, act_info); } -Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { return opencl::ClMul::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info); } @@ -82,26 +100,33 @@ void CLPixelWiseMultiplication::run() struct CLComplexPixelWiseMultiplication::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication() - : _impl(std::make_unique()) +CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication() : _impl(std::make_unique()) { } CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication(CLComplexPixelWiseMultiplication &&) = default; -CLComplexPixelWiseMultiplication &CLComplexPixelWiseMultiplication::operator=(CLComplexPixelWiseMultiplication &&) = default; -CLComplexPixelWiseMultiplication::~CLComplexPixelWiseMultiplication() = default; - -void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +CLComplexPixelWiseMultiplication & +CLComplexPixelWiseMultiplication::operator=(CLComplexPixelWiseMultiplication &&) = default; +CLComplexPixelWiseMultiplication::~CLComplexPixelWiseMultiplication() = default; + +void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; @@ -110,7 +135,10 @@ void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return opencl::ClComplexMul::validate(input1, input2, output, act_info); } diff --git 
a/src/runtime/CL/functions/CLPooling3dLayer.cpp b/src/runtime/CL/functions/CLPooling3dLayer.cpp index 11ae1d0fe6fc027549269c72a4bf3c95f5f39ed0..ce1092a7cc7932eb559ef7a603f22486673bbf28 100644 --- a/src/runtime/CL/functions/CLPooling3dLayer.cpp +++ b/src/runtime/CL/functions/CLPooling3dLayer.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClPool3d.h" @@ -32,14 +33,13 @@ namespace arm_compute { struct CLPooling3dLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - ICLTensor *indices{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + ICLTensor *indices{nullptr}; + std::unique_ptr op{nullptr}; }; -CLPooling3dLayer::CLPooling3dLayer() - : _impl(std::make_unique()) +CLPooling3dLayer::CLPooling3dLayer() : _impl(std::make_unique()) { } CLPooling3dLayer::~CLPooling3dLayer() = default; @@ -49,7 +49,10 @@ void CLPooling3dLayer::configure(const ICLTensor *input, ICLTensor *output, cons configure(CLKernelLibrary::get().get_compile_context(), input, output, pool_info); } -void CLPooling3dLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Pooling3dLayerInfo &pool_info) +void CLPooling3dLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Pooling3dLayerInfo &pool_info) { _impl->src = input; _impl->dst = output; @@ -58,7 +61,8 @@ void CLPooling3dLayer::configure(const CLCompileContext &compile_context, const _impl->op->configure(compile_context, input->info(), output->info(), pool_info); } -Status CLPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info) +Status +CLPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info) { return opencl::ClPool3d::validate(input, output, pool_info); } diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp index 0ebce318faa0fe582771e1b90abe6cf54f167d48..65e53b9be3c4e56ea466886e3c091681ba3c013f 100644 --- a/src/runtime/CL/functions/CLPoolingLayer.cpp +++ b/src/runtime/CL/functions/CLPoolingLayer.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClPool2d.h" @@ -32,34 +33,44 @@ namespace arm_compute { struct CLPoolingLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - ICLTensor *indices{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + ICLTensor *indices{nullptr}; + std::unique_ptr op{nullptr}; }; -CLPoolingLayer::CLPoolingLayer() - : _impl(std::make_unique()) +CLPoolingLayer::CLPoolingLayer() : _impl(std::make_unique()) { } CLPoolingLayer::~CLPoolingLayer() = default; -void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices) +void CLPoolingLayer::configure(ICLTensor *input, + ICLTensor *output, + const PoolingLayerInfo &pool_info, + ICLTensor *indices) { configure(CLKernelLibrary::get().get_compile_context(), input, output, pool_info, indices); } -void CLPoolingLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices) 
+void CLPoolingLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const PoolingLayerInfo &pool_info, + ICLTensor *indices) { _impl->src = input; _impl->dst = output; _impl->indices = indices; _impl->op = std::make_unique(); - _impl->op->configure(compile_context, input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr); + _impl->op->configure(compile_context, input->info(), output->info(), pool_info, + (indices) ? indices->info() : nullptr); } -Status CLPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +Status CLPoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) { return opencl::ClPool2d::validate(input, output, pool_info, indices); } diff --git a/src/runtime/CL/functions/CLPriorBoxLayer.cpp b/src/runtime/CL/functions/CLPriorBoxLayer.cpp index 019f0a7e6162624008bc6b06463da16f451bc1bd..cfd0ec4fbf600c6d7e0e3dc7c83d107f3e89869f 100644 --- a/src/runtime/CL/functions/CLPriorBoxLayer.cpp +++ b/src/runtime/CL/functions/CLPriorBoxLayer.cpp @@ -29,31 +29,40 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLPriorBoxLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLPriorBoxLayerKernel.h" using namespace arm_compute; -CLPriorBoxLayer::CLPriorBoxLayer() - : _min(nullptr), _max(nullptr), _aspect_ratios(nullptr) +CLPriorBoxLayer::CLPriorBoxLayer() : _min(nullptr), _max(nullptr), _aspect_ratios(nullptr) { } -void CLPriorBoxLayer::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info) +void CLPriorBoxLayer::configure(const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const PriorBoxLayerInfo &info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, info); } -void CLPriorBoxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info) +void CLPriorBoxLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const PriorBoxLayerInfo &info) { ARM_COMPUTE_LOG_PARAMS(input1, input2, output, info); - _min = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.min_sizes().size() * sizeof(float)); - _aspect_ratios = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.aspect_ratios().size() * sizeof(float)); - if(!info.max_sizes().empty()) + _min = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + info.min_sizes().size() * sizeof(float)); + _aspect_ratios = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + info.aspect_ratios().size() * sizeof(float)); + if (!info.max_sizes().empty()) { - _max = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.max_sizes().size() * sizeof(float)); + _max = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + info.max_sizes().size() * sizeof(float)); } auto k = std::make_unique(); @@ -61,7 +70,10 @@ 
void CLPriorBoxLayer::configure(const CLCompileContext &compile_context, const I _kernel = std::move(k); } -Status CLPriorBoxLayer::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info) +Status CLPriorBoxLayer::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info) { return CLPriorBoxLayerKernel::validate(input1, input2, output, info); } diff --git a/src/runtime/CL/functions/CLQLSTMLayer.cpp b/src/runtime/CL/functions/CLQLSTMLayer.cpp index 7fbb866fa99a1b500a6d2ed3fed31e8347d59fed..12f6f8929034c0bd26092ec200b66a6eb59689a1 100644 --- a/src/runtime/CL/functions/CLQLSTMLayer.cpp +++ b/src/runtime/CL/functions/CLQLSTMLayer.cpp @@ -26,29 +26,36 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/QuantizationInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h" #include "src/core/helpers/WindowHelpers.h" #include "src/gpu/cl/kernels/ClGemmLowpReductionKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { using namespace arm_compute::utils::info_helpers; using namespace arm_compute::opencl::kernels; namespace { -Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, const ITensorInfo *mm_input, const ITensorInfo *mm_weights, const ITensorInfo *bias, - float gemmlowp_scale, const TensorInfo *mm_res_info, const TensorInfo *outstage_tensor_info) +Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, + const ITensorInfo *mm_input, + const ITensorInfo *mm_weights, + const ITensorInfo *bias, + float gemmlowp_scale, + const TensorInfo *mm_res_info, + const TensorInfo *outstage_tensor_info) { ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(mm_input, mm_weights, nullptr, mm_res_info)); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info)); return Status{}; } } // namespace @@ -78,14 +85,12 @@ void CLQLSTMLayer::TensorCopyKernel::run() _src->map(q, true); _dst->map(q, true); - Iterator input_iter{ _src, _window }; - Iterator output_iter{ _dst, _window }; + Iterator input_iter{_src, _window}; + Iterator output_iter{_dst, _window}; - execute_window_loop(_window, [&](const Coordinates &) - { - memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); - }, - input_iter, output_iter); + execute_window_loop( + _window, [&](const Coordinates &) { memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); }, input_iter, + output_iter); _src->unmap(q); _dst->unmap(q); @@ -104,7 +109,7 @@ CLQLSTMLayer::CLQLSTMLayer(std::shared_ptr memory_manager) _layer_norms(), _copy_output() { - for(auto &norm : _layer_norms) + for (auto &norm : 
_layer_norms) { norm = std::make_unique(); } @@ -129,17 +134,22 @@ Status CLQLSTMLayer::validate_layer_norm(const ITensorInfo &in, const ITensorInf { // Output quantization scale will be different, but ignored here // since it will be configured at configure() stage. - const TensorInfo out - { - in - }; + const TensorInfo out{in}; return CLQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias); } -void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info, - const ICLTensor *mm_input, const ICLTensor *mm_weights, const ICLTensor *bias, - CLTensor *mm_res, CLTensor *outstage_res, float gemmlowp_scale, - const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info) +void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, + CLGEMMLowpMatrixMultiplyCore &mm, + CLGEMMLowpOutputStage &outstage, + GEMMLowpOutputStageInfo &gemmlowp_info, + const ICLTensor *mm_input, + const ICLTensor *mm_weights, + const ICLTensor *bias, + CLTensor *mm_res, + CLTensor *outstage_res, + float gemmlowp_scale, + const TensorInfo &mm_res_info, + const TensorInfo &outstage_tensor_info) { _memory_group.manage(mm_res); _memory_group.manage(outstage_res); @@ -151,30 +161,51 @@ void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, CLGEMML mm.configure(compile_context, mm_input, mm_weights, nullptr, mm_res); // Configure output stage - quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); + quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); outstage.configure(compile_context, mm_res, bias, outstage_res, gemmlowp_info); mm_res->allocator()->allocate(); } -void CLQLSTMLayer::configure(const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - ICLTensor *cell_state_in, ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output, +void CLQLSTMLayer::configure(const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, + ICLTensor *output_state_in, + ICLTensor *cell_state_out, + ICLTensor *output_state_out, + ICLTensor *output, const LSTMParams &lstm_params) { - configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, - cell_state_in, output_state_in, cell_state_out, output_state_out, output, lstm_params); + configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, 
recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, + output_state_in, cell_state_out, output_state_out, output, lstm_params); } -void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - ICLTensor *cell_state_in, ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output, +void CLQLSTMLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, + ICLTensor *output_state_in, + ICLTensor *cell_state_out, + ICLTensor *output_state_out, + ICLTensor *output, const LSTMParams &lstm_params) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, @@ -191,11 +222,11 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT build_lstm_params_tensor_info(lstm_params, &lstm_params_info); // Validate - ARM_COMPUTE_ERROR_THROW_ON(CLQLSTMLayer::validate(input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), - cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(), - lstm_params_info)); + ARM_COMPUTE_ERROR_THROW_ON(CLQLSTMLayer::validate( + input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), + recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), + output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(), lstm_params_info)); const int batch_size = input->info()->dimension(1); const int num_units = input_to_output_weights->info()->dimension(1); @@ -216,7 +247,7 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT // Layer normalization _has_layer_norm = lstm_params.use_layer_norm(); - if(_has_layer_norm) + if (_has_layer_norm) { set_layer_norm_weight(lstm_params.forget_layer_norm_weights(), LayerNormGate::Forget); set_layer_norm_weight(lstm_params.cell_layer_norm_weights(), LayerNormGate::Cell); @@ -238,53 +269,75 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT // Calculate quantized parameters for clipping. 
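Illustrative aside on the clipping parameters computed just below (the values here are assumptions, not taken from the patch): the float cell_clip threshold is converted once into the QSYMM16 domain of the cell state via quantize_qsymm16, so later clamping can happen directly on quantized values. A self-contained sketch with a hypothetical cell-state scale of 1/2048:

    #include <cstdint>
    #include <iostream>

    #include "arm_compute/core/QuantizationInfo.h"

    int main()
    {
        using namespace arm_compute;

        // Hypothetical QSYMM16 cell-state quantization: scale 1/2048, zero offset.
        const UniformQuantizationInfo qcell_state_in(1.0f / 2048.0f, 0);

        const float   cell_clip           = 10.0f; // hypothetical lstm_params.cell_clip()
        const int16_t quantized_cell_clip = quantize_qsymm16(cell_clip, qcell_state_in);

        // 10.0 / (1/2048) = 20480, still inside the int16_t range.
        std::cout << "quantized_cell_clip = " << quantized_cell_clip << "\n";
        return 0;
    }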
int16_t quantized_cell_clip = 0; - if(lstm_params.cell_clip() > 0.0f) + if (lstm_params.cell_clip() > 0.0f) { quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in); } _has_cell_clipping = quantized_cell_clip > 0; // Precompute effective bias for optimizing the matmul computations. - if(!_has_cifg) + if (!_has_cifg) { _input_to_input_weights = lstm_params.input_to_input_weights(); _recurrent_to_input_weights = lstm_params.recurrent_to_input_weights(); - _input_to_input_reduction->configure(compile_context, _input_to_input_weights->info(), _input_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_input_reduction->configure(compile_context, _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, - -qoutput_state_in.offset, true)); + _input_to_input_reduction->configure(compile_context, _input_to_input_weights->info(), + _input_to_input_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_input_reduction->configure( + compile_context, _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); } - _input_to_forget_reduction->configure(compile_context, input_to_forget_weights->info(), _input_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_forget_reduction->configure(compile_context, recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, - -qoutput_state_in.offset, true)); - _input_to_cell_reduction->configure(compile_context, input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_cell_reduction->configure(compile_context, recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, - true)); - _input_to_output_reduction->configure(compile_context, input_to_output_weights->info(), _input_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_output_reduction->configure(compile_context, recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, - -qoutput_state_in.offset, true)); - if(_has_projection) + _input_to_forget_reduction->configure(compile_context, input_to_forget_weights->info(), + _input_to_forget_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_forget_reduction->configure( + compile_context, recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_cell_reduction->configure(compile_context, input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_cell_reduction->configure( + compile_context, recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_output_reduction->configure(compile_context, input_to_output_weights->info(), + _input_to_output_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, 
-qinput.offset, true)); + _recurrent_to_output_reduction->configure( + compile_context, recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + if (_has_projection) { - _projection_reduction->configure(compile_context, _projection_weights->info(), _projection_eff_bias.info(), GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); - if(_projection_bias != nullptr) + _projection_reduction->configure( + compile_context, _projection_weights->info(), _projection_eff_bias.info(), + GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); + if (_projection_bias != nullptr) { - _projection_bias_add.configure(compile_context, _projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE); + _projection_bias_add.configure(compile_context, _projection_bias, &_projection_eff_bias, + &_projection_eff_bias, ConvertPolicy::SATURATE); } } // Pre-transpose weights to be used in GEMM. - _transpose_input_to_forget_weights.configure(compile_context, input_to_forget_weights, &_input_to_forget_weights_transposed); - _transpose_input_to_cell_weights.configure(compile_context, input_to_cell_weights, &_input_to_cell_weights_transposed); - _transpose_input_to_output_weights.configure(compile_context, input_to_output_weights, &_input_to_output_weights_transposed); - _transpose_recurrent_to_forget_weights.configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed); - _transpose_recurrent_to_cell_weights.configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed); - _transpose_recurrent_to_output_weights.configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_weights_transposed); - if(!_has_cifg) + _transpose_input_to_forget_weights.configure(compile_context, input_to_forget_weights, + &_input_to_forget_weights_transposed); + _transpose_input_to_cell_weights.configure(compile_context, input_to_cell_weights, + &_input_to_cell_weights_transposed); + _transpose_input_to_output_weights.configure(compile_context, input_to_output_weights, + &_input_to_output_weights_transposed); + _transpose_recurrent_to_forget_weights.configure(compile_context, recurrent_to_forget_weights, + &_recurrent_to_forget_weights_transposed); + _transpose_recurrent_to_cell_weights.configure(compile_context, recurrent_to_cell_weights, + &_recurrent_to_cell_weights_transposed); + _transpose_recurrent_to_output_weights.configure(compile_context, recurrent_to_output_weights, + &_recurrent_to_output_weights_transposed); + if (!_has_cifg) { - _transpose_input_to_input_weights.configure(compile_context, lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed); - _transpose_recurrent_to_input_weights.configure(compile_context, lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed); + _transpose_input_to_input_weights.configure(compile_context, lstm_params.input_to_input_weights(), + &_input_to_input_weights_transposed); + _transpose_recurrent_to_input_weights.configure(compile_context, lstm_params.recurrent_to_input_weights(), + &_recurrent_to_input_weights_transposed); } - if(_has_projection) + if (_has_projection) { _transpose_projection_weights.configure(compile_context, _projection_weights, &_projection_weights_transposed); } @@ -297,42 +350,55 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const 
ICLT const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32); // Forget gate. - const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); - const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale(); - configure_mm(compile_context, _mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, - input, &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, - &_mm_input_to_forget_res, &_input_to_forget_outstage_res, input_to_forget_scale, - mm_out_info, forget_gate_outstage_info); - - const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); + const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); + const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.forget_intermediate_scale(); + configure_mm(compile_context, _mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, input, + &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, &_mm_input_to_forget_res, + &_input_to_forget_outstage_res, input_to_forget_scale, mm_out_info, forget_gate_outstage_info); + + const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); configure_mm(compile_context, _mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info, output_state_in, &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias, &_mm_recurrent_to_forget_res, &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale, mm_out_info, forget_gate_outstage_info); - _accumulate_input_recurrent_forget.configure(compile_context, &_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, + _accumulate_input_recurrent_forget.configure(compile_context, &_input_to_forget_outstage_res, + &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); _input_to_forget_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { _mul_cell_to_forget_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_forget_res); - _pixelwise_mul_cell_to_forget.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - _cell_to_forget_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_forget.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), + &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + _cell_to_forget_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_forget_outstage_res); - const float cell_to_forget_scale = std::pow(2, 
cell_shift) * lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_forget_outstage.configure(compile_context, &_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info); + const float cell_to_forget_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / + lstm_params.forget_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_forget_outstage.configure(compile_context, &_mul_cell_to_forget_res, nullptr, + &_cell_to_forget_outstage_res, gemmlowp_info); _mul_cell_to_forget_res.allocator()->allocate(); - _accumulate_cell_forget.configure(compile_context, &_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, + _accumulate_cell_forget.configure(compile_context, &_recurrent_to_forget_outstage_res, + &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); _cell_to_forget_outstage_res.allocator()->allocate(); } CLTensor *forget_activation_input = &_recurrent_to_forget_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Forget, &_recurrent_to_forget_outstage_res); _recurrent_to_forget_outstage_res.allocator()->allocate(); @@ -345,30 +411,33 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _memory_group.manage(&_forget_gate); _forget_gate.allocator()->init(forget_gate_info); - _forget_gate_sigmoid.configure(compile_context, forget_activation_input, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _forget_gate_sigmoid.configure(compile_context, forget_activation_input, &_forget_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); forget_activation_input->allocator()->allocate(); // Modulation gate. 
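// The modulation (cell) gate repeats the requantisation pattern used for the forget gate
// above: the S32 accumulator of each quantized matmul carries scale w_scale * x_scale, and
// the GEMMLowp output stage rescales it to the gate's intermediate QSYMM16 scale, so the
// effective multiplier is w_scale * x_scale / intermediate_scale. A simplified sketch of how
// such a float scale becomes the fixed-point multiplier/shift pair that
// quantization::calculate_quantized_multiplier() is asked for above (illustrative only;
// shift sign conventions vary between implementations):
#include <cmath>
#include <cstdint>

static void decompose_scale_sketch(double effective_scale, int32_t &multiplier, int32_t &shift)
{
    int          exponent = 0;
    const double mantissa = std::frexp(effective_scale, &exponent); // mantissa in [0.5, 1)
    int64_t      fixed    = static_cast<int64_t>(std::llround(mantissa * (1ll << 31)));
    if (fixed == (1ll << 31)) // rounding pushed the mantissa up to 1.0
    {
        fixed /= 2;
        ++exponent;
    }
    multiplier = static_cast<int32_t>(fixed); // effective_scale ~= multiplier * 2^(exponent - 31)
    shift      = exponent;
}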
- const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); - const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale(); - configure_mm(compile_context, _mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, - input, &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias, - &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale, - mm_out_info, cell_outstage_info); - - const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); - configure_mm(compile_context, _mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, - &_mm_recurrent_to_cell_res, &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, - mm_out_info, cell_outstage_info); - - _accumulate_input_recurrent_modulation.configure(compile_context, &_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, + const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); + const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.cell_intermediate_scale(); + configure_mm(compile_context, _mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, input, + &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias, &_mm_input_to_cell_res, + &_input_to_cell_outstage_res, input_to_cell_scale, mm_out_info, cell_outstage_info); + + const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); + configure_mm(compile_context, _mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, &_mm_recurrent_to_cell_res, + &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, mm_out_info, cell_outstage_info); + + _accumulate_input_recurrent_modulation.configure(compile_context, &_input_to_cell_outstage_res, + &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE); _input_to_cell_outstage_res.allocator()->allocate(); CLTensor *cell_activation_input = &_recurrent_to_cell_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Cell, &_recurrent_to_cell_outstage_res); _recurrent_to_cell_outstage_res.allocator()->allocate(); @@ -378,14 +447,15 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _memory_group.manage(&_cell_gate); _cell_gate.allocator()->init(cell_gate_info); - _cell_gate_tanh.configure(compile_context, cell_activation_input, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); + _cell_gate_tanh.configure(compile_context, cell_activation_input, &_cell_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); cell_activation_input->allocator()->allocate(); // Input gate. 
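// With the CIFG optimisation the input gate is not computed from its own weights: it is
// coupled to the forget gate as i_t = 1 - f_t. Both gates live on the sigmoid output grid
// (QSYMM16, scale 1/32768), where the closest representable value to 1.0 is 32767, so the
// coupling reduces to a saturating element-wise subtraction against the tensor of quantized
// ones (_ones). A scalar sketch of that arithmetic (illustrative):
#include <algorithm>
#include <cstdint>

static inline int16_t cifg_input_gate_sketch(int16_t forget_gate_q15)
{
    const int32_t one_q15 = 32767; // ~1.0 in Q0.15
    const int32_t diff    = one_q15 - static_cast<int32_t>(forget_gate_q15);
    return static_cast<int16_t>(std::max(-32768, std::min(32767, diff)));
}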
const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _input_gate.allocator()->init(input_gate_info); _memory_group.manage(&_input_gate); - if(_has_cifg) + if (_has_cifg) { _ones.allocator()->init(*_forget_gate.info()); _input_gate_sub.configure(compile_context, &_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE); @@ -393,107 +463,142 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT } else { - const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); - const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale(); - configure_mm(compile_context, _mm_input_to_input, _input_to_input_outstage, gemmlowp_info, - input, &_input_to_input_weights_transposed, &_input_to_input_eff_bias, - &_mm_input_to_input_res, &_input_to_input_outstage_res, input_to_input_scale, - mm_out_info, input_outstage_info); - - const float recurrent_to_input_scale = _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale(); + const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); + const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.input_intermediate_scale(); + configure_mm(compile_context, _mm_input_to_input, _input_to_input_outstage, gemmlowp_info, input, + &_input_to_input_weights_transposed, &_input_to_input_eff_bias, &_mm_input_to_input_res, + &_input_to_input_outstage_res, input_to_input_scale, mm_out_info, input_outstage_info); + + const float recurrent_to_input_scale = + _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / + lstm_params.input_intermediate_scale(); configure_mm(compile_context, _mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info, output_state_in, &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias, &_mm_recurrent_to_input_res, &_recurrent_to_input_outstage_res, recurrent_to_input_scale, mm_out_info, input_outstage_info); - _accumulate_input_recurrent_input.configure(compile_context, &_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res, - ConvertPolicy::SATURATE); + _accumulate_input_recurrent_input.configure(compile_context, &_input_to_input_outstage_res, + &_recurrent_to_input_outstage_res, + &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); _input_to_input_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { - _mul_cell_to_input_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); + _mul_cell_to_input_res.allocator()->init( + TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_input_res); - _pixelwise_mul_cell_to_input.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale(); - 
quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_input_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_input.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), + &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + const float cell_to_input_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / + lstm_params.input_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_input_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_input_outstage_res); - _cell_to_input_outstage.configure(compile_context, &_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info); + _cell_to_input_outstage.configure(compile_context, &_mul_cell_to_input_res, nullptr, + &_cell_to_input_outstage_res, gemmlowp_info); _mul_cell_to_input_res.allocator()->allocate(); - _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); + _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, + &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); _cell_to_input_outstage_res.allocator()->allocate(); } CLTensor *input_activation_input = &_recurrent_to_input_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Input, &_recurrent_to_input_outstage_res); _recurrent_to_input_outstage_res.allocator()->allocate(); input_activation_input = &get_layer_norm_output(LayerNormGate::Input); } - _input_gate_sigmoid.configure(compile_context, input_activation_input, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _input_gate_sigmoid.configure(compile_context, input_activation_input, &_input_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); input_activation_input->allocator()->allocate(); } // Cell. 
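// Cell-state update wired up below, in the standard LSTM form
//   c_t = f_t * c_{t-1} + i_t * g_t,   optionally clipped to [-cell_clip, cell_clip].
// All operands are QSYMM16, and the output quantization of each multiplication is chosen so
// the intermediate products stay representable in 16 bits. A float-domain scalar sketch of
// the same step (illustrative):
#include <algorithm>

static inline float cell_update_sketch(float f_t, float c_prev, float i_t, float g_t, float cell_clip)
{
    float c_t = f_t * c_prev + i_t * g_t;
    if (cell_clip > 0.0f)
    {
        c_t = std::max(-cell_clip, std::min(cell_clip, c_t));
    }
    return c_t;
}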
// TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication - _pixelwise_mul_forget_cell.configure(compile_context, &_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_forget_cell.configure(compile_context, &_forget_gate, cell_state_in, &_forget_gate, 1.f, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); const float cell_gate_scale = _cell_gate.info()->quantization_info().uniform().scale; const float mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift); - const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(mul_input_cell_scale, 0)); + const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(mul_input_cell_scale, 0)); _memory_group.manage(&_mul_input_cell_res); _mul_input_cell_res.allocator()->init(mul_input_cell_info); - _pixelwise_mul_input_cell.configure(compile_context, &_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_input_cell.configure(compile_context, &_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _cell_gate.allocator()->allocate(); - _add_forget_cell.configure(compile_context, &_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE); + _add_forget_cell.configure(compile_context, &_forget_gate, &_mul_input_cell_res, cell_state_out, + ConvertPolicy::SATURATE); _mul_input_cell_res.allocator()->allocate(); _forget_gate.allocator()->allocate(); - if(_has_cell_clipping) + if (_has_cell_clipping) { - _cell_clip.configure(compile_context, cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip)); + _cell_clip.configure(compile_context, cell_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_cell_clip, quantized_cell_clip)); } // Output gate. 
- const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); - const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale(); - configure_mm(compile_context, _mm_input_to_output, _input_to_output_outstage, gemmlowp_info, - input, &_input_to_output_weights_transposed, &_input_to_output_eff_bias, - &_mm_input_to_output_res, &_input_to_output_outstage_res, input_to_output_scale, - mm_out_info, output_outstage_info); - - const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale(); + const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); + const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.output_intermediate_scale(); + configure_mm(compile_context, _mm_input_to_output, _input_to_output_outstage, gemmlowp_info, input, + &_input_to_output_weights_transposed, &_input_to_output_eff_bias, &_mm_input_to_output_res, + &_input_to_output_outstage_res, input_to_output_scale, mm_out_info, output_outstage_info); + + const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.output_intermediate_scale(); configure_mm(compile_context, _mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info, output_state_in, &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias, &_mm_recurrent_to_output_res, &_recurrent_to_output_outstage_res, recurrent_to_output_scale, mm_out_info, output_outstage_info); - _accumulate_input_recurrent_output.configure(compile_context, &_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res, + _accumulate_input_recurrent_output.configure(compile_context, &_recurrent_to_output_outstage_res, + &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); _input_to_output_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication // Here we are not using the output stage because all operations are done in float _mul_cell_to_output_res.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_output_res); - _pixelwise_mul_cell_to_output.configure(compile_context, cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - - const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_output_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_output.configure(compile_context, cell_state_out, 
lstm_params.cell_to_output_weights(), + &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + + const float cell_to_output_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / + lstm_params.output_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_output_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_output_outstage_res); - _cell_to_output_outstage.configure(compile_context, &_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, gemmlowp_info); + _cell_to_output_outstage.configure(compile_context, &_mul_cell_to_output_res, nullptr, + &_cell_to_output_outstage_res, gemmlowp_info); _mul_cell_to_output_res.allocator()->allocate(); - _accumulate_cell_to_output.configure(compile_context, &_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res, + _accumulate_cell_to_output.configure(compile_context, &_recurrent_to_output_outstage_res, + &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); _cell_to_output_outstage_res.allocator()->allocate(); } CLTensor *output_activation_input = &_recurrent_to_output_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Output, &_recurrent_to_output_outstage_res); _recurrent_to_output_outstage_res.allocator()->allocate(); @@ -503,20 +608,24 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _memory_group.manage(&_output_gate); _output_gate.allocator()->init(output_gate_info); - _output_gate_sigmoid.configure(compile_context, output_activation_input, &_output_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _output_gate_sigmoid.configure(compile_context, output_activation_input, &_output_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); output_activation_input->allocator()->allocate(); // Hidden. 
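// Hidden state computed below: h_t = o_t * tanh(c_t). Both factors come out of the
// activations on the Q0.15 grid (scale 2^-15), so their S32 product carries scale 2^-30;
// the output stage then rescales that product onto the 8-bit hidden-state grid, which is
// why the effective scale used below is (2^-15 * 2^-15) / hidden_state_scale. A scalar
// sketch of that requantisation (illustrative; ignores the exact rounding of the output stage):
#include <algorithm>
#include <cmath>
#include <cstdint>

static inline int8_t requantize_hidden_sketch(int32_t product_s32, float hidden_state_scale, int32_t hidden_state_zero)
{
    const float   effective_scale = (1.0f / 32768.0f) * (1.0f / 32768.0f) / hidden_state_scale;
    const int32_t q = static_cast<int32_t>(std::lround(product_s32 * effective_scale)) + hidden_state_zero;
    return static_cast<int8_t>(std::max(-128, std::min(127, q)));
}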
- _hidden_tanh.configure(compile_context, cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); + _hidden_tanh.configure(compile_context, cell_state_out, &_input_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication _memory_group.manage(&_hidden_mul_res); const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32); _hidden_mul_res.allocator()->init(hidden_mul_res); - _pixelwise_mul_hidden.configure(compile_context, &_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_hidden.configure(compile_context, &_output_gate, &_input_gate, &_hidden_mul_res, 1.f, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _output_gate.allocator()->allocate(); _input_gate.allocator()->allocate(); const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15); - quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true); + quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true); gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); gemmlowp_info.output_data_type = output_state_in->info()->data_type(); @@ -525,7 +634,7 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT _memory_group.manage(&_hidden_gate); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_gate.allocator()->init(*output_state_out->info()); _hidden_gate.info()->set_tensor_shape(_hidden_mul_res.info()->tensor_shape()); @@ -536,27 +645,26 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT _hidden_mul_res.allocator()->allocate(); // Projection. 
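// Projection (when present) maps the 8-bit hidden gate back onto output_size units:
// output_state_out = W_proj * h plus the effective projection bias, requantised with
// projection_scale = w_scale * hidden_state_scale / output_state_scale and then optionally
// clipped. The clip threshold itself is quantised, mirroring the utility::clamp call below;
// a scalar sketch (illustrative, rounding details differ):
#include <algorithm>
#include <cstdint>

static inline int8_t quantize_projection_clip_sketch(float projection_clip, float qprojection_scale)
{
    const auto q = static_cast<int32_t>(projection_clip / qprojection_scale);
    return static_cast<int8_t>(std::max(-128, std::min(127, q)));
}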
- if(_has_projection) + if (_has_projection) { const TensorInfo projection_outstage_info(*output_state_out->info()); - const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform(); - const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; - gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; - gemmlowp_info.gemmlowp_min_bound = std::numeric_limits::lowest(); - gemmlowp_info.gemmlowp_max_bound = std::numeric_limits::max(); - gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; - - TensorInfo projection_mm_out_info{ mm_out_info }; + const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform(); + const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; + gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; + gemmlowp_info.gemmlowp_min_bound = std::numeric_limits::lowest(); + gemmlowp_info.gemmlowp_max_bound = std::numeric_limits::max(); + gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; + + TensorInfo projection_mm_out_info{mm_out_info}; projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size)); - configure_mm(compile_context, _mm_projection, _projection_outstage, gemmlowp_info, - hidden_gate_result, &_projection_weights_transposed, &_projection_eff_bias, - &_mm_projection_res, &_projection_outstage_res, projection_scale, - projection_mm_out_info, projection_outstage_info); + configure_mm(compile_context, _mm_projection, _projection_outstage, gemmlowp_info, hidden_gate_result, + &_projection_weights_transposed, &_projection_eff_bias, &_mm_projection_res, + &_projection_outstage_res, projection_scale, projection_mm_out_info, projection_outstage_info); ICLTensor *accumulate_destination = output_state_out; - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_gate.allocator()->allocate(); _projection_accumulate_res.allocator()->init(*output_state_in->info()); @@ -565,31 +673,34 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT accumulate_destination = &_projection_accumulate_res; } - _accumulate_projection.configure(compile_context, &_projection_outstage_res, accumulate_destination, accumulate_destination, ConvertPolicy::SATURATE); + _accumulate_projection.configure(compile_context, &_projection_outstage_res, accumulate_destination, + accumulate_destination, ConvertPolicy::SATURATE); _projection_outstage_res.allocator()->allocate(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_accumulate_to_output_copy.configure(_projection_accumulate_res, *output_state_out); _projection_accumulate_res.allocator()->allocate(); } - int8_t quantized_projection_clip{ 0 }; - if(lstm_params.projection_clip() > 0.0f) + int8_t quantized_projection_clip{0}; + if (lstm_params.projection_clip() > 0.0f) { - quantized_projection_clip = utility::clamp(lstm_params.projection_clip() / qprojection.scale, -128, 127); + quantized_projection_clip = + utility::clamp(lstm_params.projection_clip() / qprojection.scale, -128, 127); } - if(quantized_projection_clip > 0) + if (quantized_projection_clip > 0) { - _projection_clip.configure(compile_context, output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, - quantized_projection_clip)); + _projection_clip.configure(compile_context, output_state_out, nullptr, + 
ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_projection_clip, quantized_projection_clip)); _has_projection_clipping = true; } } else { - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_to_output_copy.configure(_hidden_gate, *output_state_out); _hidden_gate.allocator()->allocate(); @@ -600,17 +711,27 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT _copy_output.configure(compile_context, output_state_out, output); } -Status CLQLSTMLayer::validate(const ITensorInfo *input, - const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output, +Status CLQLSTMLayer::validate(const ITensorInfo *input, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out, + const ITensorInfo *output, const LSTMParams &lstm_params) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, - recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, - cell_state_out, output_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + cell_state_in, output_state_in, cell_state_out, output_state_out, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != 2, "Input must have exactly 2 dimensions"); @@ -622,13 +743,16 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2); ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->dimension(0) != input_size); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, input_to_cell_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, + input_to_cell_weights); ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2); ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->dimension(1) != num_units); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, 
recurrent_to_forget_weights, + recurrent_to_cell_weights); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QSYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1); ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->dimension(0) != num_units); @@ -647,20 +771,25 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_in); // Check whether peephole weights are all there or none - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, + DataType::QSYMM16); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->dimension(0) != num_units); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_output_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_output_weights()); - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_input_weights()); } } @@ -674,7 +803,7 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, // Calculate quantized parameters for clipping. int16_t quantized_cell_clip = 0; - if(lstm_params.cell_clip() > 0.0f) + if (lstm_params.cell_clip() > 0.0f) { quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in); } @@ -682,33 +811,50 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, // Precompute effective bias for optimizing the matmul computations. 
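// The "effective bias" folds the input zero-point into the bias once so the run-time GEMMs
// can ignore it: for a quantized matmul
//   acc[n] = sum_k (x[k] - x_zp) * w[n][k]
// the input-independent term -x_zp * sum_k w[n][k] is what the GEMMLowp matrix-A reduction
// kernels compute over the weight matrices (hence the -qinput.offset and
// -qoutput_state_in.offset scalars), and it is the bias consumed by
// configure_mm()/validate_mm(). A reference sketch of that precomputation (illustrative,
// row-major weights):
#include <cstdint>
#include <vector>

static std::vector<int32_t> effective_bias_sketch(const std::vector<int8_t> &weights,
                                                  int num_units, int input_size, int32_t input_offset)
{
    std::vector<int32_t> eff_bias(num_units, 0);
    for (int u = 0; u < num_units; ++u)
    {
        int32_t row_sum = 0;
        for (int k = 0; k < input_size; ++k)
        {
            row_sum += weights[u * input_size + k];
        }
        eff_bias[u] = -input_offset * row_sum;
    }
    return eff_bias;
}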
const TensorInfo eff_bias_info(TensorShape(num_units), 1, DataType::S32); const TensorInfo projection_eff_bias_info(TensorShape(output_size), 1, DataType::S32); - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, - true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + lstm_params.input_to_input_weights(), &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + lstm_params.recurrent_to_input_weights(), &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); } - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); - if(lstm_params.has_projection()) + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + recurrent_to_forget_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + recurrent_to_cell_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + recurrent_to_output_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + if (lstm_params.has_projection()) { - 
ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false, - lstm_params.hidden_state_zero(), - true))); - if(lstm_params.projection_bias() != nullptr) + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + lstm_params.projection_weights(), &projection_eff_bias_info, + GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true))); + if (lstm_params.projection_bias() != nullptr) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.projection_bias(), 1, DataType::S32); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info, - &projection_eff_bias_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info, + &projection_eff_bias_info, ConvertPolicy::SATURATE)); } } - const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_forget_weights->data_type(), input_to_forget_weights->quantization_info()); - const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info()); + const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, + input_to_forget_weights->data_type(), + input_to_forget_weights->quantization_info()); + const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_forget_weights->data_type(), + recurrent_to_forget_weights->quantization_info()); // Validate weights transpose ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(input_to_forget_weights, &input_weights_transposed)); @@ -717,15 +863,20 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_forget_weights, &recurrent_weights_transposed)); ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_cell_weights, &recurrent_weights_transposed)); ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_output_weights, &recurrent_weights_transposed)); - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLTranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLTranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed)); } - if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { - const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info()); - ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed)); + const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, + lstm_params.projection_weights()->data_type(), + lstm_params.projection_weights()->quantization_info()); + ARM_COMPUTE_RETURN_ON_ERROR( + CLTranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed)); } 
GEMMLowpOutputStageInfo gemmlowp_info; @@ -738,28 +889,42 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, // Forget gate. ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_intermediate_scale() == 0); - const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); + const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32); - const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_forget_scale, &mm_out_info, &forget_outstage_info)); + const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_forget_scale, &mm_out_info, &forget_outstage_info)); - const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info)); + const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, + &forget_outstage_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, + &forget_outstage_info, ConvertPolicy::SATURATE)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, + DataType::QSYMM16); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(cell_state_in, 
lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + const float cell_to_forget_scale = std::pow(2, cell_shift) * + lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / + lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, + &forget_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.forget_layer_norm_weights(); const ITensorInfo *b_info = forget_gate_bias; @@ -770,20 +935,29 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0); const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_outstage_info, &forget_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&forget_outstage_info, &forget_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Modulation gate. ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_intermediate_scale() == 0); - const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); - const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_cell_scale, &mm_out_info, &cell_outstage_info)); - - const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE)); - - if(has_layer_norm) + const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); + const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.cell_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_cell_scale, &mm_out_info, &cell_outstage_info)); + + const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, + &cell_outstage_info)); + + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_outstage_info, 
&cell_outstage_info, + &cell_outstage_info, ConvertPolicy::SATURATE)); + + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.cell_layer_norm_weights(); const ITensorInfo *b_info = cell_bias; @@ -791,85 +965,123 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, } const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_outstage_info, &cell_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&cell_outstage_info, &cell_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); // Input gate. const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used"); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, + "Input gate bias must not be present when CIFG is used"); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, + &forget_gate_info, ConvertPolicy::SATURATE)); } else { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), + lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES( + input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_forget_weights, lstm_params.input_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, lstm_params.recurrent_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, + lstm_params.recurrent_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_intermediate_scale() == 0); - const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); - const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_input_scale, &mm_out_info, &input_outstage_info)); - - const float recurrent_to_input_scale = lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, 
&recurrent_weights_transposed, &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE)); - - if(lstm_params.has_peephole_opt()) + const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); + const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * + qinput.scale / lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_input_scale, &mm_out_info, &input_outstage_info)); + + const float recurrent_to_input_scale = + lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / + lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_input_scale, &mm_out_info, + &input_outstage_info)); + + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, + &input_outstage_info, ConvertPolicy::SATURATE)); + + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, + 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + const float cell_to_input_scale = std::pow(2, cell_shift) * + lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / + lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, + &input_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.input_layer_norm_weights(); const ITensorInfo *b_info = lstm_params.input_gate_bias(); ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(cell_outstage_info, *w_info, *b_info)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_outstage_info, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + 
&input_outstage_info, &input_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 1.f, 1.f))); } // Cell. - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE)); - if(quantized_cell_clip > 0) + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE)); + if (quantized_cell_clip > 0) { - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, - quantized_cell_clip))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(cell_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_cell_clip, quantized_cell_clip))); } // Output gate. ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_intermediate_scale() == 0); - const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); - const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_output_scale, &mm_out_info, &output_outstage_info)); - - const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE)); - if(lstm_params.has_peephole_opt()) + const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); + const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.output_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_output_scale, &mm_out_info, &output_outstage_info)); + + const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.output_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, 
recurrent_to_output_scale, &mm_out_info, + &output_outstage_info)); + + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, + &output_outstage_info, ConvertPolicy::SATURATE)); + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, + DataType::QSYMM16); // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel // Here we are not using the output stage because all operations are done in float // const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale(); // ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, + &output_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.output_layer_norm_weights(); const ITensorInfo *b_info = output_gate_bias; @@ -877,85 +1089,103 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, } const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_outstage_info, &output_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&output_outstage_info, &output_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Hidden. 
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(cell_state_out, &input_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32); const TensorInfo hidden_out_info(TensorShape(num_units, batch_size), 1, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.hidden_state_scale() == 0); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true)); gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); gemmlowp_info.output_data_type = hidden_out_info.data_type(); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info)); const bool projection_tensor_copy_required = num_units != output_size; // Projection. 
- if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, lstm_params.projection_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, + lstm_params.projection_weights()); ARM_COMPUTE_RETURN_ERROR_ON(qoutput_state_in.scale == 0); - const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform(); - const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform(); + const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest(); gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max(); gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; const TensorInfo projection_outstage_info(*output_state_out); - const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info()); + const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, + lstm_params.projection_weights()->data_type(), + lstm_params.projection_weights()->quantization_info()); - TensorInfo projection_mm_out_info{ mm_out_info }; + TensorInfo projection_mm_out_info{mm_out_info}; projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, &projection_eff_bias_info, projection_scale, &projection_mm_out_info, + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, + &projection_eff_bias_info, projection_scale, &projection_mm_out_info, &projection_outstage_info)); - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { - ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, + ConvertPolicy::SATURATE)); - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { - ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out)); } - int8_t quantized_projection_clip{ 0 }; - if(lstm_params.projection_clip() > 0.0f) + int8_t quantized_projection_clip{0}; + if (lstm_params.projection_clip() > 0.0f) { quantized_projection_clip = 
quantize_qasymm8_signed(lstm_params.projection_clip(), qprojection); } - if(quantized_projection_clip > 0) + if (quantized_projection_clip > 0) { - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, - quantized_projection_clip))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_projection_clip, quantized_projection_clip))); } } else { - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(hidden_out_info, *output_state_out)); } } - if(cell_state_out->total_size() > 0) + if (cell_state_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(cell_state_in, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(cell_state_in, cell_state_out); } - if(output_state_out->total_size() > 0) + if (output_state_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output_state_in, output_state_out); @@ -980,14 +1210,14 @@ void CLQLSTMLayer::run() _recurrent_to_forget_outstage.run(); _accumulate_input_recurrent_forget.run(); - if(_has_peephole) + if (_has_peephole) { _pixelwise_mul_cell_to_forget.run(); _cell_to_forget_outstage.run(); _accumulate_cell_forget.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Forget)); } @@ -1002,7 +1232,7 @@ void CLQLSTMLayer::run() _recurrent_to_cell_outstage.run(); _accumulate_input_recurrent_modulation.run(); - if(_has_layer_norm) + if (_has_layer_norm) { CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Cell)); } @@ -1010,7 +1240,7 @@ void CLQLSTMLayer::run() _cell_gate_tanh.run(); // Input gate - if(_has_cifg) + if (_has_cifg) { _input_gate_sub.run(); } @@ -1022,14 +1252,14 @@ void CLQLSTMLayer::run() _recurrent_to_input_outstage.run(); _accumulate_input_recurrent_input.run(); - if(_has_peephole) + if (_has_peephole) { _pixelwise_mul_cell_to_input.run(); _cell_to_input_outstage.run(); _accumulate_cell_input.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Input)); } @@ -1041,7 +1271,7 @@ void CLQLSTMLayer::run() _pixelwise_mul_forget_cell.run(); _pixelwise_mul_input_cell.run(); _add_forget_cell.run(); - if(_has_cell_clipping) + if (_has_cell_clipping) { _cell_clip.run(); } @@ -1052,14 +1282,14 @@ void CLQLSTMLayer::run() _mm_recurrent_to_output.run(); _recurrent_to_output_outstage.run(); _accumulate_input_recurrent_output.run(); - if(_has_peephole) + if (_has_peephole) { _pixelwise_mul_cell_to_output.run(); _cell_to_output_outstage.run(); _accumulate_cell_to_output.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Output)); } @@ -1072,31 +1302,31 @@ void CLQLSTMLayer::run() _hidden_outstage.run(); // Projection. 
- if(_has_projection) + if (_has_projection) { _mm_projection.run(); _projection_outstage.run(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_output_to_accumulate_copy.run(); } _accumulate_projection.run(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_accumulate_to_output_copy.run(); } - if(_has_projection_clipping) + if (_has_projection_clipping) { _projection_clip.run(); } } else { - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_to_output_copy.run(); } @@ -1108,7 +1338,7 @@ void CLQLSTMLayer::run() void CLQLSTMLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { // Pre-transpose weights to be used in GEMM. _input_to_forget_weights_transposed.allocator()->allocate(); @@ -1125,10 +1355,11 @@ void CLQLSTMLayer::prepare() _transpose_recurrent_to_output_weights.run(); // Precompute effective biases - if(_has_cifg) + if (_has_cifg) { _ones.map(true); - std::fill_n(reinterpret_cast(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 32767); + std::fill_n(reinterpret_cast(_ones.buffer()), + _ones.info()->total_size() / _ones.info()->element_size(), 32767); _ones.unmap(); } else @@ -1136,10 +1367,12 @@ void CLQLSTMLayer::prepare() _input_to_input_eff_bias.allocator()->allocate(); _recurrent_to_input_eff_bias.allocator()->allocate(); - ITensorPack input_to_input_red_pack = { { ACL_SRC, _input_to_input_weights }, { ACL_DST, &_input_to_input_eff_bias } }; + ITensorPack input_to_input_red_pack = {{ACL_SRC, _input_to_input_weights}, + {ACL_DST, &_input_to_input_eff_bias}}; CLScheduler::get().enqueue_op(*_input_to_input_reduction, input_to_input_red_pack, false); - ITensorPack rec_to_input_red_pack = { { ACL_SRC, _recurrent_to_input_weights }, { ACL_DST, &_recurrent_to_input_eff_bias } }; + ITensorPack rec_to_input_red_pack = {{ACL_SRC, _recurrent_to_input_weights}, + {ACL_DST, &_recurrent_to_input_eff_bias}}; CLScheduler::get().enqueue_op(*_recurrent_to_input_reduction, rec_to_input_red_pack, false); _input_to_input_weights_transposed.allocator()->allocate(); @@ -1156,30 +1389,35 @@ void CLQLSTMLayer::prepare() _input_to_output_eff_bias.allocator()->allocate(); _recurrent_to_output_eff_bias.allocator()->allocate(); - ITensorPack input_to_forget_red_pack = { { ACL_SRC, _input_to_forget_weights }, { ACL_DST, &_input_to_forget_eff_bias } }; + ITensorPack input_to_forget_red_pack = {{ACL_SRC, _input_to_forget_weights}, + {ACL_DST, &_input_to_forget_eff_bias}}; CLScheduler::get().enqueue_op(*_input_to_forget_reduction, input_to_forget_red_pack, false); - ITensorPack rec_to_forget_red_pack = { { ACL_SRC, _recurrent_to_forget_weights }, { ACL_DST, &_recurrent_to_forget_eff_bias } }; + ITensorPack rec_to_forget_red_pack = {{ACL_SRC, _recurrent_to_forget_weights}, + {ACL_DST, &_recurrent_to_forget_eff_bias}}; CLScheduler::get().enqueue_op(*_recurrent_to_forget_reduction, rec_to_forget_red_pack, false); - ITensorPack input_to_cell_red_pack = { { ACL_SRC, _input_to_cell_weights }, { ACL_DST, &_input_to_cell_eff_bias } }; + ITensorPack input_to_cell_red_pack = {{ACL_SRC, _input_to_cell_weights}, {ACL_DST, &_input_to_cell_eff_bias}}; CLScheduler::get().enqueue_op(*_input_to_cell_reduction, input_to_cell_red_pack, false); - ITensorPack rec_to_cell_red_pack = { { ACL_SRC, _recurrent_to_cell_weights }, { ACL_DST, &_recurrent_to_cell_eff_bias } }; + ITensorPack rec_to_cell_red_pack = {{ACL_SRC, _recurrent_to_cell_weights}, + {ACL_DST, 
&_recurrent_to_cell_eff_bias}}; CLScheduler::get().enqueue_op(*_recurrent_to_cell_reduction, rec_to_cell_red_pack, false); - ITensorPack input_to_output_red_pack = { { ACL_SRC, _input_to_output_weights }, { ACL_DST, &_input_to_output_eff_bias } }; + ITensorPack input_to_output_red_pack = {{ACL_SRC, _input_to_output_weights}, + {ACL_DST, &_input_to_output_eff_bias}}; CLScheduler::get().enqueue_op(*_input_to_output_reduction, input_to_output_red_pack, false); - ITensorPack rec_to_output_red_pack = { { ACL_SRC, _recurrent_to_output_weights }, { ACL_DST, &_recurrent_to_output_eff_bias } }; + ITensorPack rec_to_output_red_pack = {{ACL_SRC, _recurrent_to_output_weights}, + {ACL_DST, &_recurrent_to_output_eff_bias}}; CLScheduler::get().enqueue_op(*_recurrent_to_output_reduction, rec_to_output_red_pack, false); - if(_has_projection) + if (_has_projection) { _projection_eff_bias.allocator()->allocate(); - ITensorPack proj_red_pack{ { ACL_SRC, _projection_weights }, { ACL_DST, &_projection_eff_bias } }; + ITensorPack proj_red_pack{{ACL_SRC, _projection_weights}, {ACL_DST, &_projection_eff_bias}}; CLScheduler::get().enqueue_op(*_projection_reduction, proj_red_pack, false); - if(_projection_bias != nullptr) + if (_projection_bias != nullptr) { _projection_bias_add.run(); _projection_bias->mark_as_unused(); @@ -1189,7 +1427,7 @@ void CLQLSTMLayer::prepare() _transpose_projection_weights.run(); _projection_weights->mark_as_unused(); - if(!_projection_tensor_copy_required) + if (!_projection_tensor_copy_required) { _hidden_gate.mark_as_unused(); _projection_accumulate_res.mark_as_unused(); diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp index b249bdd1db6e687ef68391a68c53fd114e45e835..6edef299921eab91dd489f7bebce2c3e42e22dfb 100644 --- a/src/runtime/CL/functions/CLQuantizationLayer.cpp +++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClQuantize.h" @@ -32,13 +33,12 @@ namespace arm_compute { struct CLQuantizationLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClQuantize> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClQuantize> op{nullptr}; }; -CLQuantizationLayer::CLQuantizationLayer() - : _impl(std::make_unique<Impl>()) +CLQuantizationLayer::CLQuantizationLayer() : _impl(std::make_unique<Impl>()) { } CLQuantizationLayer::~CLQuantizationLayer() = default; diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp index 6f122866b27d1049553b650d40ed14d26fafcbc8..34b78eefa7fea511897814b96a4ca82fb9e8a7ec 100644 --- a/src/runtime/CL/functions/CLRNNLayer.cpp +++ b/src/runtime/CL/functions/CLRNNLayer.cpp @@ -28,24 +28,37 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" namespace arm_compute { using namespace arm_compute::misc::shape_calculator; CLRNNLayer::CLRNNLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation(), _fully_connected_kernel(), _copy(), _fully_connected_out(), _gemm_output(), _add_output(), + : _memory_group(std::move(memory_manager)), + _gemm_state_f(), + 
_add_kernel(), + _activation(), + _fully_connected_kernel(), + _copy(), + _fully_connected_out(), + _gemm_output(), + _add_output(), _is_prepared(false) { } CLRNNLayer::~CLRNNLayer() = default; -Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state, - const ITensorInfo *output, const ActivationLayerInfo &info) +Status CLRNNLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *recurrent_weights, + const ITensorInfo *bias, + const ITensorInfo *hidden_state, + const ITensorInfo *output, + const ActivationLayerInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); @@ -63,28 +76,42 @@ Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), hidden_state->tensor_shape()); - auto shape_info = TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type()); + auto shape_info = + TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info)); ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&shape_info, &shape_info, info)); return Status{}; } -void CLRNNLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, ICLTensor *output, +void CLRNNLayer::configure(const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *recurrent_weights, + const ICLTensor *bias, + ICLTensor *hidden_state, + ICLTensor *output, ActivationLayerInfo &info) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, recurrent_weights, bias, hidden_state, output, info); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, recurrent_weights, bias, hidden_state, + output, info); } -void CLRNNLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, - ICLTensor *hidden_state, - ICLTensor *output, ActivationLayerInfo &info) +void CLRNNLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *recurrent_weights, + const ICLTensor *bias, + ICLTensor *hidden_state, + ICLTensor *output, + ActivationLayerInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); - ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayer::validate(input->info(), weights->info(), 
recurrent_weights->info(), + bias->info(), hidden_state->info(), output->info(), info)); ARM_COMPUTE_LOG_PARAMS(input, weights, recurrent_weights, bias, hidden_state, output, info); const int idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); @@ -133,7 +160,7 @@ void CLRNNLayer::run() void CLRNNLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _fully_connected_kernel.prepare(); _gemm_state_f.prepare(); diff --git a/src/runtime/CL/functions/CLROIAlignLayer.cpp b/src/runtime/CL/functions/CLROIAlignLayer.cpp index 867ef7c7acfbc55b8c08c78f9e47ffae5cee138b..1939d1d0ba509b72b2a98e8270d0e1b06d3b494f 100644 --- a/src/runtime/CL/functions/CLROIAlignLayer.cpp +++ b/src/runtime/CL/functions/CLROIAlignLayer.cpp @@ -24,26 +24,36 @@ #include "arm_compute/runtime/CL/functions/CLROIAlignLayer.h" #include "arm_compute/core/CL/ICLArray.h" -#include "src/core/CL/kernels/CLROIAlignLayerKernel.h" -#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLROIAlignLayerKernel.h" +#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h" namespace arm_compute { -Status CLROIAlignLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status CLROIAlignLayer::validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ON_ERROR(CLROIAlignLayerKernel::validate(input, rois, output, pool_info)); return Status{}; } -void CLROIAlignLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIAlignLayer::configure(const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info); } -void CLROIAlignLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIAlignLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info); diff --git a/src/runtime/CL/functions/CLROIPoolingLayer.cpp b/src/runtime/CL/functions/CLROIPoolingLayer.cpp index 239a1c6bb2d248fb7f848df9eba0b9cdcf41c695..0d2eab0c76ccc4a94452d43161aad9bc5e3bc11a 100644 --- a/src/runtime/CL/functions/CLROIPoolingLayer.cpp +++ b/src/runtime/CL/functions/CLROIPoolingLayer.cpp @@ -22,24 +22,35 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h" + #include "arm_compute/core/CL/ICLArray.h" -#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h" using namespace arm_compute; -Status CLROIPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status CLROIPoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { return CLROIPoolingLayerKernel::validate(input, rois, output, pool_info); } -void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIPoolingLayer::configure(const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info); } -void CLROIPoolingLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, const ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIPoolingLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *rois, + const ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info); diff --git a/src/runtime/CL/functions/CLRange.cpp b/src/runtime/CL/functions/CLRange.cpp index 3fbbd5f952e491b1d1b830a75657700ac89b3e7f..5c3f7f9c8cbeefd51339e56c259b5a319944a607 100644 --- a/src/runtime/CL/functions/CLRange.cpp +++ b/src/runtime/CL/functions/CLRange.cpp @@ -27,9 +27,9 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLRangeKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLRangeKernel.h" using namespace arm_compute; @@ -38,7 +38,8 @@ void CLRange::configure(ICLTensor *output, const float start, const float end, c configure(CLKernelLibrary::get().get_compile_context(), output, start, end, step); } -void CLRange::configure(const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step) +void CLRange::configure( + const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step) { ARM_COMPUTE_LOG_PARAMS(output, start, end, step); auto k = std::make_unique(); diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp index cddbf77d7cf9130901d53ae335671fd0a9d3cd4c..6c6daff5ba12c6b0a79ab7b75aac155deeac38d9 100644 --- a/src/runtime/CL/functions/CLReduceMean.cpp +++ b/src/runtime/CL/functions/CLReduceMean.cpp @@ -27,23 +27,25 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/CLValidate.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/CL/kernels/CLReductionOperationKernel.h" #include "src/core/helpers/AutoConfiguration.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace { -Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) +Status +validate_config(const ITensorInfo *input, const Coordinates 
&reduction_axis, bool keep_dims, const ITensorInfo *output) { ARM_COMPUTE_UNUSED(keep_dims); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1); ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); @@ -51,29 +53,29 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax const int input_dims = input->num_dimensions(); Coordinates axis_local = reduction_axis; - for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i) + for (unsigned int i = 0; i < axis_local.num_dimensions(); ++i) { //axis: The dimensions to reduce. Must be in the range [-rank(input_tensor), rank(input_tensor)). ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast<int>(input->num_dimensions()))); ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast<int>(input->num_dimensions())); } - if(output->tensor_shape().total_size() != 0) + if (output->tensor_shape().total_size() != 0) { // Only validate if not using auto_init for the output tensor TensorShape out_shape = input->tensor_shape(); // Validate output_shape only if not using auto_init convert_negative_axis(axis_local, input_dims); std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); - for(unsigned int i = 0; i < reduction_ops; ++i) + for (unsigned int i = 0; i < reduction_ops; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1); - if(output->total_size() > 0 && keep_dims) + if (output->total_size() > 0 && keep_dims) { ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); } - if(keep_dims) + if (keep_dims) { out_shape.set(axis_local[i], 1); } @@ -87,8 +89,9 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax } const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); - const bool requant = is_data_type_quantized(input->data_type()) && input->quantization_info() != output->quantization_info(); - if(requant) + const bool requant = + is_data_type_quantized(input->data_type()) && input->quantization_info() != output->quantization_info(); + if (requant) { TensorInfo input_no_quant(input->clone()->set_data_type(DataType::F32)); CLDequantizationLayer::validate(input, &input_no_quant); @@ -98,10 +101,19 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax } return Status{}; } -} +} // namespace CLReduceMean::CLReduceMean(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _dequant(), _requant(), _reduction_ops(), _keep_dims(), _do_requant(), _input_no_quant(), + : _memory_group(std::move(memory_manager)), + _reduction_kernels(), + _reduced_outs(), + _reshape(), + _dequant(), + _requant(), + _reduction_ops(), + _keep_dims(), + _do_requant(), + _input_no_quant(), _output_no_quant() { } @@ -111,17 +123,23 @@ void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis configure(CLKernelLibrary::get().get_compile_context(), input, reduction_axis, keep_dims, output); } -void 
CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output) +void CLReduceMean::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const Coordinates &reduction_axis, + bool keep_dims, + ICLTensor *output) { // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(CLReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info())); ARM_COMPUTE_LOG_PARAMS(input, reduction_axis, keep_dims, output); // Output auto inizialitation if not yet initialized - const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims); auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); - _do_requant = is_data_type_quantized(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info(); + _do_requant = is_data_type_quantized(input->info()->data_type()) && + input->info()->quantization_info() != output->info()->quantization_info(); _reduction_ops = reduction_axis.num_dimensions(); _reduction_kernels.resize(_reduction_ops); _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0)); @@ -129,7 +147,7 @@ void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor ICLTensor *tmp_input = input; ICLTensor *tmp_output = output; - if(_do_requant) + if (_do_requant) { _memory_group.manage(&_input_no_quant); _memory_group.manage(&_output_no_quant); @@ -148,46 +166,51 @@ void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor convert_negative_axis(axis_local, input_dims); // Perform reduction for every axis - for(int i = 0; i < _reduction_ops; ++i) + for (int i = 0; i < _reduction_ops; ++i) { - TensorShape out_shape = i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); + TensorShape out_shape = + i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); out_shape.set(axis_local[i], 1); auto in = (i == 0) ? tmp_input : (&_reduced_outs[i - 1]); - if(i == _reduction_ops - 1 && keep_dims) + if (i == _reduction_ops - 1 && keep_dims) { - _reduction_kernels[i].configure(compile_context, in, tmp_output, axis_local[i], ReductionOperation::MEAN_SUM); + _reduction_kernels[i].configure(compile_context, in, tmp_output, axis_local[i], + ReductionOperation::MEAN_SUM); } else { - _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_input->info()->num_channels(), tmp_input->info()->data_type(), tmp_input->info()->quantization_info())); + _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_input->info()->num_channels(), + tmp_input->info()->data_type(), + tmp_input->info()->quantization_info())); _memory_group.manage(&_reduced_outs[i]); - _reduction_kernels[i].configure(compile_context, in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM); + _reduction_kernels[i].configure(compile_context, in, &_reduced_outs[i], axis_local[i], + ReductionOperation::MEAN_SUM); } } // Allocate intermediate tensors - for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i) + for (int i = 0; i < _reduction_ops - (keep_dims ? 
1 : 0); ++i) { _reduced_outs[i].allocator()->allocate(); } // Configure reshape layer if we want to drop the dimensions - if(!_keep_dims) + if (!_keep_dims) { TensorShape out_shape = tmp_input->info()->tensor_shape(); // We have to sort the reduction axis vectors in order for remove_dimension // to work properly std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); - for(int i = 0; i < _reduction_ops; ++i) + for (int i = 0; i < _reduction_ops; ++i) { out_shape.remove_dimension(axis_local[i] - i, false); } auto_init_if_empty(*tmp_output->info(), tmp_input->info()->clone()->set_tensor_shape(out_shape)); _reshape.configure(compile_context, &_reduced_outs[_reduction_ops - 1], tmp_output); } - if(_do_requant) + if (_do_requant) { _requant.configure(compile_context, &_output_no_quant, output); _input_no_quant.allocator()->allocate(); @@ -195,7 +218,10 @@ void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor } } -Status CLReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) +Status CLReduceMean::validate(const ITensorInfo *input, + const Coordinates &reduction_axis, + bool keep_dims, + const ITensorInfo *output) { return validate_config(input, reduction_axis, keep_dims, output); } @@ -204,19 +230,19 @@ void CLReduceMean::run() { MemoryGroupResourceScope scope_mg(_memory_group); - if(_do_requant) + if (_do_requant) { _dequant.run(); } - for(auto &kernel : _reduction_kernels) + for (auto &kernel : _reduction_kernels) { kernel.run(); } - if(!_keep_dims) + if (!_keep_dims) { _reshape.run(); } - if(_do_requant) + if (_do_requant) { _requant.run(); } diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp index cdc7fec51b7581bbfb52411757bdf5c65715ee94..ba5489018e9a48ce6bcbb2a576c793a2563a77df 100644 --- a/src/runtime/CL/functions/CLReductionOperation.cpp +++ b/src/runtime/CL/functions/CLReductionOperation.cpp @@ -27,35 +27,43 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLReductionOperationKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/runtime/Utils.h" -#include "src/common/utils/Log.h" - namespace arm_compute { CLReductionOperation::CLReductionOperation(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _unreshaped_output(), _reduction_kernel(), _reshape(), _reduction_axis(), _is_reshape_required(false) + : _memory_group(std::move(memory_manager)), + _unreshaped_output(), + _reduction_kernel(), + _reshape(), + _reduction_axis(), + _is_reshape_required(false) { } CLReductionOperation::~CLReductionOperation() = default; -Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims) +Status CLReductionOperation::validate( + const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= 
TensorShape::num_max_dimensions, + "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); const bool is_reshape_required = !keep_dims; - if(is_reshape_required && output->total_size() != 0) + if (is_reshape_required && output->total_size() != 0) { - const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims)); + const TensorInfo expected_output_shape = output->clone()->set_tensor_shape( + arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output); } @@ -67,22 +75,23 @@ Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInf const auto input_qinfo = input->quantization_info(); const auto output_data_type = output->data_type(); - auto initialize_tensorinfo = [](TensorInfo & ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo) - { + auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type, int num_channels, + QuantizationInfo qinfo) { ti.set_data_type(data_type).set_tensor_shape(shape).set_num_channels(num_channels).set_quantization_info(qinfo); }; - if(is_reshape_required) + if (is_reshape_required) { auto shape_before_reshape = input_shape; shape_before_reshape.set(axis, 1); - initialize_tensorinfo(output_before_reshape, shape_before_reshape, output_data_type, input_num_channles, input_qinfo); + initialize_tensorinfo(output_before_reshape, shape_before_reshape, output_data_type, input_num_channles, + input_qinfo); output_internal = &output_before_reshape; } ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output_internal, axis, op)); - if(is_reshape_required) + if (is_reshape_required) { ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(output_internal, output)); } @@ -92,7 +101,7 @@ Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInf ICLTensor *CLReductionOperation::configure_intermediate_result_vector(ICLTensor *input, ICLTensor *output) { - if(!_is_reshape_required) + if (!_is_reshape_required) { return output; } @@ -103,12 +112,18 @@ ICLTensor *CLReductionOperation::configure_intermediate_result_vector(ICLTensor return &_unreshaped_output; } -void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) +void CLReductionOperation::configure( + ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) { configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op, keep_dims); } -void CLReductionOperation::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) +void CLReductionOperation::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op, + bool keep_dims) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, output, axis, op, keep_dims); @@ -117,11 +132,17 @@ void CLReductionOperation::configure(const CLCompileContext &compile_context, IC auto *output_internal = configure_intermediate_result_vector(input, output); - if(_is_reshape_required) + if (_is_reshape_required) { - const TensorShape output_shape = 
arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); - const auto output_data_type = input->info()->data_type(); - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true)); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); + const auto output_data_type = input->info()->data_type(); + auto_init_if_empty(*output->info(), input->info() + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); _memory_group.manage(&_unreshaped_output); } @@ -129,7 +150,7 @@ void CLReductionOperation::configure(const CLCompileContext &compile_context, IC _reduction_kernel = std::make_unique(); _reduction_kernel->configure(compile_context, input, output_internal, axis, op); - if(_is_reshape_required) + if (_is_reshape_required) { _reshape.configure(compile_context, &_unreshaped_output, output); _unreshaped_output.allocator()->allocate(); @@ -142,7 +163,7 @@ void CLReductionOperation::run() CLScheduler::get().enqueue(*_reduction_kernel, false); - if(_is_reshape_required) + if (_is_reshape_required) { _reshape.run(); } diff --git a/src/runtime/CL/functions/CLReorgLayer.cpp b/src/runtime/CL/functions/CLReorgLayer.cpp index 15de959225072016f579c197588d6eaee2cb2da2..156e9b90c1d368df291c3c511b4d0b480f3d7649 100644 --- a/src/runtime/CL/functions/CLReorgLayer.cpp +++ b/src/runtime/CL/functions/CLReorgLayer.cpp @@ -27,9 +27,9 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" -#include "src/core/CL/kernels/CLReorgLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLReorgLayerKernel.h" #include @@ -40,7 +40,10 @@ void CLReorgLayer::configure(ICLTensor *input, ICLTensor *output, int32_t stride configure(CLKernelLibrary::get().get_compile_context(), input, output, stride); } -void CLReorgLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int32_t stride) +void CLReorgLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + int32_t stride) { ARM_COMPUTE_LOG_PARAMS(input, output, stride); auto k = std::make_unique(); diff --git a/src/runtime/CL/functions/CLReshapeLayer.cpp b/src/runtime/CL/functions/CLReshapeLayer.cpp index c51a3298c14fb23fbf302c48048b7ab00d28a471..3d6349fb25650b095a8598ab5b2b159388df3d23 100644 --- a/src/runtime/CL/functions/CLReshapeLayer.cpp +++ b/src/runtime/CL/functions/CLReshapeLayer.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClReshape.h" @@ -35,17 +36,16 @@ namespace arm_compute { struct CLReshapeLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLReshapeLayer::CLReshapeLayer() - : _impl(std::make_unique()) +CLReshapeLayer::CLReshapeLayer() : _impl(std::make_unique()) { } -CLReshapeLayer::CLReshapeLayer(CLReshapeLayer &&) = default; +CLReshapeLayer::CLReshapeLayer(CLReshapeLayer &&) = default; CLReshapeLayer &CLReshapeLayer::operator=(CLReshapeLayer &&) = default; 
CLReshapeLayer::~CLReshapeLayer() = default; @@ -78,4 +78,4 @@ void CLReshapeLayer::run() _impl->op->run(pack); } } // namespace arm_compute -/** [CLReshapeLayer snippet] **/ \ No newline at end of file + /** [CLReshapeLayer snippet] **/ diff --git a/src/runtime/CL/functions/CLReverse.cpp b/src/runtime/CL/functions/CLReverse.cpp index 1fc93571d91c6e65ea096d20aa9c048e92662a41..a20be2335a46fb3a5a0d981ff8152e74434829ae 100644 --- a/src/runtime/CL/functions/CLReverse.cpp +++ b/src/runtime/CL/functions/CLReverse.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,27 +24,34 @@ #include "arm_compute/runtime/CL/functions/CLReverse.h" #include "arm_compute/core/Types.h" -#include "src/core/CL/kernels/CLReverseKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLReverseKernel.h" namespace arm_compute { -void CLReverse::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis) +void CLReverse::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis, bool use_inverted_axis) { - configure(CLKernelLibrary::get().get_compile_context(), input, output, axis); + configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, use_inverted_axis); } -void CLReverse::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis) +void CLReverse::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *axis, + bool use_inverted_axis) { ARM_COMPUTE_LOG_PARAMS(input, output, axis); auto k = std::make_unique(); - k->configure(compile_context, input, output, axis); + k->configure(compile_context, input, output, axis, use_inverted_axis); _kernel = std::move(k); } -Status CLReverse::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis) +Status CLReverse::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *axis, + bool use_inverted_axis) { - return CLReverseKernel::validate(input, output, axis); + return CLReverseKernel::validate(input, output, axis, use_inverted_axis); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp index 5b78989bfa8c659c7cd2ba0bf9be7f96ef7a526c..abff0724e403a706d268d93d01f6e7d5f48049e6 100644 --- a/src/runtime/CL/functions/CLScale.cpp +++ b/src/runtime/CL/functions/CLScale.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClScale.h" @@ -33,13 +34,12 @@ namespace arm_compute { struct CLScale::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLScale::CLScale() - : _impl(std::make_unique()) +CLScale::CLScale() : _impl(std::make_unique()) { } CLScale::~CLScale() = default; @@ -49,7 +49,10 @@ void CLScale::configure(ICLTensor *input, ICLTensor *output, const ScaleKernelIn configure(CLKernelLibrary::get().get_compile_context(), input, output, info); } -void CLScale::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info) +void CLScale::configure(const CLCompileContext &compile_context, + ICLTensor *input, 
+ ICLTensor *output, + const ScaleKernelInfo &info) { _impl->src = input; _impl->dst = output; diff --git a/src/runtime/CL/functions/CLSelect.cpp b/src/runtime/CL/functions/CLSelect.cpp index c4ab3dc67af7995b1fd6efaeb9e2dc8f774d7bfa..b4897d9e621c4c57f42cb7d0035c93ecaf50d5ec 100644 --- a/src/runtime/CL/functions/CLSelect.cpp +++ b/src/runtime/CL/functions/CLSelect.cpp @@ -25,9 +25,9 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLSelectKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLSelectKernel.h" using namespace arm_compute; @@ -38,7 +38,11 @@ void CLSelect::configure(const ICLTensor *c, const ICLTensor *x, const ICLTensor configure(CLKernelLibrary::get().get_compile_context(), c, x, y, output); } -void CLSelect::configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output) +void CLSelect::configure(const CLCompileContext &compile_context, + const ICLTensor *c, + const ICLTensor *x, + const ICLTensor *y, + ICLTensor *output) { ARM_COMPUTE_LOG_PARAMS(c, x, y, output); auto k = std::make_unique<CLSelectKernel>(); diff --git a/src/runtime/CL/functions/CLSlice.cpp b/src/runtime/CL/functions/CLSlice.cpp index 7e3ac7d76985abed18be1cabfc91faff0465bd1a..f79c6a12351e06e2897911329f5324e83d595afb 100644 --- a/src/runtime/CL/functions/CLSlice.cpp +++ b/src/runtime/CL/functions/CLSlice.cpp @@ -26,15 +26,19 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" -#include "src/core/CL/kernels/CLStridedSliceKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLStridedSliceKernel.h" namespace arm_compute { namespace experimental { -void CLSlice::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +void CLSlice::configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends); @@ -47,15 +51,16 @@ void CLSlice::configure(const CLCompileContext &compile_context, const ITensorIn _kernel = std::move(k); } -Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +Status CLSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); // Check start dimensions for being non-negative - ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) - { - return i < 0; - })); + ARM_COMPUTE_RETURN_ERROR_ON( + std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) { return i < 0; })); // Get absolute end coordinates const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends); @@ -66,20 +71,22 @@ Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, co struct CLSlice::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<experimental::CLSlice> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<experimental::CLSlice> op{nullptr}; }; -CLSlice::CLSlice() - : _impl(std::make_unique<Impl>()) +CLSlice::CLSlice() : _impl(std::make_unique<Impl>()) { } 
-CLSlice::CLSlice(CLSlice &&) = default; +CLSlice::CLSlice(CLSlice &&) = default; CLSlice &CLSlice::operator=(CLSlice &&) = default; CLSlice::~CLSlice() = default; -Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +Status CLSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { return experimental::CLSlice::validate(input, output, starts, ends); } @@ -89,7 +96,11 @@ void CLSlice::configure(const ICLTensor *input, ICLTensor *output, const Coordin configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends); } -void CLSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends) +void CLSlice::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Coordinates &starts, + const Coordinates &ends) { _impl->src = input; _impl->dst = output; diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp index d52352fc8d5b607655890f1af3e1d152121e260e..2e70e2aa08b0a299454bfcd85327da7c98081d55 100644 --- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp +++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp @@ -22,12 +22,14 @@ * SOFTWARE. */ #include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h" + #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/kernels/ClSoftmaxKernel.h" #include "src/gpu/cl/operators/ClPermute.h" @@ -40,9 +42,9 @@ using OperatorType = opencl::ClSoftmax; template struct CLSoftmaxLayerGeneric::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; MemoryGroup memory_group{}; ITensorPack run_pack{}; WorkspaceData workspace_tensors{}; @@ -65,28 +67,30 @@ void CLSoftmaxLayerGeneric::configure(const ICLTensor *input, ICLTensor } template -void CLSoftmaxLayerGeneric::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta, int32_t axis) +void CLSoftmaxLayerGeneric::configure( + const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta, int32_t axis) { _impl->src = input; _impl->dst = output; _impl->op = std::make_unique(); - SoftmaxKernelInfo softmax_info{ beta, IS_LOG, input->info()->data_type(), axis }; + SoftmaxKernelInfo softmax_info{beta, IS_LOG, input->info()->data_type(), axis}; _impl->op->configure(compile_context, *input->info(), *output->info(), softmax_info); - _impl->run_pack = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST, _impl->dst } }; + _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST, _impl->dst}}; _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } template -Status CLSoftmaxLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) +Status +CLSoftmaxLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) { - SoftmaxKernelInfo softmax_info{ beta, 
IS_LOG, input->data_type(), axis }; + SoftmaxKernelInfo softmax_info{beta, IS_LOG, input->data_type(), axis}; return OperatorType::validate(*input, *output, softmax_info); } template -void CLSoftmaxLayerGeneric::run() +void CLSoftmaxLayerGeneric::run() { // Acquire all the temporaries MemoryGroupResourceScope scope_mg(_impl->memory_group); diff --git a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp index 3b7083400b096cb151dd98170de9e8e2acf762ee..37f728895f694f5f562bd084a91ff35dc151c6f5 100644 --- a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp +++ b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp @@ -29,71 +29,100 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h" namespace arm_compute { CLSpaceToBatchLayer::CLSpaceToBatchLayer() - : _space_to_batch_kernel(std::make_unique()), - _fill(), - _has_padding(false) + : _space_to_batch_kernel(std::make_unique()), _fill(), _has_padding(false) { } CLSpaceToBatchLayer::~CLSpaceToBatchLayer() = default; -void CLSpaceToBatchLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output) +void CLSpaceToBatchLayer::configure(const ICLTensor *input, + const ICLTensor *block_shape, + const ICLTensor *paddings, + ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, paddings, output); } -void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output) +void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + const ICLTensor *paddings, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); ARM_COMPUTE_LOG_PARAMS(input, block_shape, paddings, output); - if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; - _fill.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); + _fill.configure(compile_context, output, + PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); } _space_to_batch_kernel->configure(compile_context, input, block_shape, paddings, output); } -void CLSpaceToBatchLayer::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output) +void CLSpaceToBatchLayer::configure(const ICLTensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output) { - configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, padding_right, output); + configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, + padding_right, output); } -void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, - const Size2D &padding_right, ICLTensor 
*output) +void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, block_shape_x, block_shape_y, padding_left, padding_right, output); - if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; - _fill.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); + _fill.configure(compile_context, output, + PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); } - _space_to_batch_kernel->configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right, output); + _space_to_batch_kernel->configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right, + output); } -Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output) +Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ON_ERROR(CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info()))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info()))); ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape, paddings, output)); return Status{}; } -Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, +Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ON_ERROR(CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info()))); - ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info()))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); return Status{}; } @@ -101,7 +130,7 @@ Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_s void CLSpaceToBatchLayer::run() { // Zero out output only if we have paddings - if(_has_padding) + if (_has_padding) { //CLScheduler::get().enqueue(*_fill, true); _fill.run(); diff --git a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp index 67dafff47f3f02d9c6955e92d6db5aed6905d642..22695c9ef3e3efbbf711f16916d3d16e1998d5b4 100644 --- a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp +++ b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp @@ -29,14 +29,13 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h" #include 
"src/common/utils/Log.h" +#include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h" namespace arm_compute { -CLSpaceToDepthLayer::CLSpaceToDepthLayer() - : _space_to_depth_kernel(std::make_unique()) +CLSpaceToDepthLayer::CLSpaceToDepthLayer() : _space_to_depth_kernel(std::make_unique()) { } @@ -47,7 +46,10 @@ void CLSpaceToDepthLayer::configure(const ICLTensor *input, ICLTensor *output, i configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape); } -void CLSpaceToDepthLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape) +void CLSpaceToDepthLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + int32_t block_shape) { ARM_COMPUTE_LOG_PARAMS(input, output, block_shape); _space_to_depth_kernel->configure(compile_context, input, output, block_shape); diff --git a/src/runtime/CL/functions/CLSplit.cpp b/src/runtime/CL/functions/CLSplit.cpp index 0b27371e3f1c8e4ae9b05e0207187adb2d666f9f..6be43cc5cd0accade0d6940a7ad6733a92f88162 100644 --- a/src/runtime/CL/functions/CLSplit.cpp +++ b/src/runtime/CL/functions/CLSplit.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/core/helpers/AutoConfiguration.h" namespace arm_compute @@ -38,7 +39,7 @@ void CLSplit::run() { cl::CommandQueue q = CLScheduler::get().queue(); - for(unsigned i = 0; i < _num_outputs; ++i) + for (unsigned i = 0; i < _num_outputs; ++i) { _slice_functions[i].run(); } diff --git a/src/runtime/CL/functions/CLStackLayer.cpp b/src/runtime/CL/functions/CLStackLayer.cpp index 6a335da00c38ebfe052b44cd842de1dee0b55f0a..c15496fc31466562f53d4f4596aa7b66908473c7 100644 --- a/src/runtime/CL/functions/CLStackLayer.cpp +++ b/src/runtime/CL/functions/CLStackLayer.cpp @@ -21,8 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include - #include "arm_compute/runtime/CL/functions/CLStackLayer.h" #include "arm_compute/core/CL/ICLTensor.h" @@ -32,16 +30,16 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLStackLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLStackLayerKernel.h" + +#include namespace arm_compute { CLStackLayer::CLStackLayer() // NOLINT - : _input(), - _stack_kernels(), - _num_inputs(0) + : _input(), _stack_kernels(), _num_inputs(0) { } @@ -52,7 +50,10 @@ void CLStackLayer::configure(const std::vector &input, int axis, IC configure(CLKernelLibrary::get().get_compile_context(), input, axis, output); } -void CLStackLayer::configure(const CLCompileContext &compile_context, const std::vector &input, int axis, ICLTensor *output) +void CLStackLayer::configure(const CLCompileContext &compile_context, + const std::vector &input, + int axis, + ICLTensor *output) { ARM_COMPUTE_LOG_PARAMS(input, axis, output); _num_inputs = input.size(); @@ -61,7 +62,7 @@ void CLStackLayer::configure(const CLCompileContext &compile_context, const std: // Wrap around negative values const unsigned int axis_u = wrap_around(axis, static_cast(input[0]->info()->num_dimensions() + 1)); - for(unsigned int i = 0; i < _num_inputs; i++) + for (unsigned int i = 0; i < _num_inputs; i++) { _stack_kernels.emplace_back(std::make_unique()); _stack_kernels.back()->configure(compile_context, input[i], axis_u, i, _num_inputs, output); @@ -79,7 +80,7 @@ Status CLStackLayer::validate(const std::vector &input, int axis, const unsigned int num_inputs = input.size(); - for(unsigned int i = 0; i < num_inputs; i++) + for (unsigned int i = 0; i < num_inputs; i++) { // All the tensors must have the same rank ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank); @@ -92,7 +93,7 @@ Status CLStackLayer::validate(const std::vector &input, int axis, void CLStackLayer::run() { - for(unsigned i = 0; i < _num_inputs; i++) + for (unsigned i = 0; i < _num_inputs; i++) { CLScheduler::get().enqueue(*_stack_kernels[i], false); } diff --git a/src/runtime/CL/functions/CLStridedSlice.cpp b/src/runtime/CL/functions/CLStridedSlice.cpp index 261bdc13d10a99da1a14d8c40404c2e4e070a9f5..c1953cc41536d0e5363ef73e80d3518fe09c2d43 100644 --- a/src/runtime/CL/functions/CLStridedSlice.cpp +++ b/src/runtime/CL/functions/CLStridedSlice.cpp @@ -25,17 +25,23 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" -#include "src/core/CL/kernels/CLStridedSliceKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLStridedSliceKernel.h" namespace arm_compute { namespace experimental { -void CLStridedSlice::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void CLStridedSlice::configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); auto k = std::make_unique(); @@ -43,9 +49,14 @@ void CLStridedSlice::configure(const CLCompileContext &compile_context, const IT _kernel = std::move(k); } 
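// Illustrative sketch, not part of the patch: the experimental configure()
// bodies in this file follow one three-step shape, create the kernel, configure
// it against the compile context, then move ownership into the _kernel member
// that is run later. Stand-in types below; this is not the arm_compute kernel
// interface itself.
#include <memory>
#include <utility>

struct Kernel
{
    void configure(int window_size)
    {
        (void)window_size; // placeholder for the real kernel window setup
    }
};

struct Operator
{
    void configure(int window_size)
    {
        auto k = std::make_unique<Kernel>(); // 1. create the kernel
        k->configure(window_size);           // 2. configure it
        _kernel = std::move(k);              // 3. hand ownership to the runner
    }

    std::unique_ptr<Kernel> _kernel{nullptr};
};

int main()
{
    Operator op;
    op.configure(64);
    return 0;
}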
-Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status CLStridedSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { return CLStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); } @@ -53,32 +64,43 @@ Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *out struct CLStridedSlice::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - CLRuntimeContext *ctx{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + CLRuntimeContext *ctx{nullptr}; + std::unique_ptr op{nullptr}; }; -CLStridedSlice::CLStridedSlice(CLRuntimeContext *ctx) - : _impl(std::make_unique()) +CLStridedSlice::CLStridedSlice(CLRuntimeContext *ctx) : _impl(std::make_unique()) { _impl->ctx = ctx; } -CLStridedSlice::CLStridedSlice(CLStridedSlice &&) = default; +CLStridedSlice::CLStridedSlice(CLStridedSlice &&) = default; CLStridedSlice &CLStridedSlice::operator=(CLStridedSlice &&) = default; CLStridedSlice::~CLStridedSlice() = default; -void CLStridedSlice::configure(const ICLTensor *input, ICLTensor *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void CLStridedSlice::configure(const ICLTensor *input, + ICLTensor *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { - configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask, + shrink_axis_mask); } -void CLStridedSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void CLStridedSlice::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); @@ -86,14 +108,21 @@ void CLStridedSlice::configure(const CLCompileContext &compile_context, const IC _impl->dst = output; _impl->op = std::make_unique(); - _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), starts, ends, strides, begin_mask, + end_mask, shrink_axis_mask); } -Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status CLStridedSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + 
const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { - return experimental::CLStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + return experimental::CLStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, + shrink_axis_mask); } void CLStridedSlice::run() diff --git a/src/runtime/CL/functions/CLTile.cpp b/src/runtime/CL/functions/CLTile.cpp index ef790995f9d68e82ac1caea50dd09ea53871b2d0..4f86c4adfa6bf02642f1b15b1db99914035c4d9b 100644 --- a/src/runtime/CL/functions/CLTile.cpp +++ b/src/runtime/CL/functions/CLTile.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLTile.h" -#include "src/core/CL/kernels/CLTileKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLTileKernel.h" namespace arm_compute { @@ -34,7 +33,10 @@ void CLTile::configure(const ICLTensor *input, ICLTensor *output, const Multiple configure(CLKernelLibrary::get().get_compile_context(), input, output, multiples); } -void CLTile::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples) +void CLTile::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Multiples &multiples) { ARM_COMPUTE_LOG_PARAMS(input, output, multiples); auto k = std::make_unique(); diff --git a/src/runtime/CL/functions/CLTranspose.cpp b/src/runtime/CL/functions/CLTranspose.cpp index e63c92eeb43fef6bd278281b8bc242601d3514a5..5a738f47cea89ee4068c8e3a918a10abfb0e4b71 100644 --- a/src/runtime/CL/functions/CLTranspose.cpp +++ b/src/runtime/CL/functions/CLTranspose.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClTranspose.h" @@ -34,12 +35,11 @@ namespace arm_compute { struct CLTranspose::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLTranspose::CLTranspose() - : _impl(std::make_unique()) +CLTranspose::CLTranspose() : _impl(std::make_unique()) { } CLTranspose::~CLTranspose() = default; @@ -70,4 +70,4 @@ void CLTranspose::run() pack.add_tensor(TensorType::ACL_DST, _impl->dst); _impl->op->run(pack); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLUnstack.cpp b/src/runtime/CL/functions/CLUnstack.cpp index 98d47810abd68e25841c2300d3f7f7b57e8ea378..ddd83e78243beee612283ef916f20064be05a5ee 100644 --- a/src/runtime/CL/functions/CLUnstack.cpp +++ b/src/runtime/CL/functions/CLUnstack.cpp @@ -40,13 +40,15 @@ inline unsigned int wrap_axis(int axis, const ITensorInfo *const tensor) return wrap_around(axis, static_cast(tensor->num_dimensions())); } -inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions) +inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, + int32_t &slice_end_mask, + const unsigned int input_num_dimensions) { // Setups up coordinates to slice the input tensor: start coordinates to all 0s and the unstacking axis of both Start/End to slice just one 2d tensor at a time. 
Coordinates slice_end; slice_start.set_num_dimensions(input_num_dimensions); slice_end.set_num_dimensions(input_num_dimensions); - for(size_t k = 0; k < input_num_dimensions; ++k) + for (size_t k = 0; k < input_num_dimensions; ++k) { slice_start.set(k, 0); slice_end.set(k, -1); @@ -56,8 +58,7 @@ inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t & } // namespace CLUnstack::CLUnstack() // NOLINT - : _num_slices(0), - _strided_slice_vector() + : _num_slices(0), _strided_slice_vector() { } @@ -66,15 +67,19 @@ void CLUnstack::configure(const ICLTensor *input, const std::vector configure(CLKernelLibrary::get().get_compile_context(), input, output_vector, axis); } -void CLUnstack::configure(const CLCompileContext &compile_context, const ICLTensor *input, const std::vector &output_vector, int axis) +void CLUnstack::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const std::vector &output_vector, + int axis) { ARM_COMPUTE_LOG_PARAMS(input, output_vector, axis); std::vector outputs_vector_info(output_vector.size()); - std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ICLTensor * t) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(t); - return t->info(); - }); + std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), + [](ICLTensor *t) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(t); + return t->info(); + }); ARM_COMPUTE_ERROR_ON_NULLPTR(input); ARM_COMPUTE_ERROR_THROW_ON(CLUnstack::validate(input->info(), outputs_vector_info, axis)); @@ -87,11 +92,12 @@ void CLUnstack::configure(const CLCompileContext &compile_context, const ICLTens Coordinates slice_start; int32_t slice_end_mask; setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions()); - for(unsigned int slice = 0; slice < _num_slices; ++slice) + for (unsigned int slice = 0; slice < _num_slices; ++slice) { // Adjusts start and end coordinates to take a 2D slice at a time slice_start.set(axis_u, slice); - _strided_slice_vector[slice].configure(compile_context, input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u)); + _strided_slice_vector[slice].configure(compile_context, input, output_vector[slice], slice_start, Coordinates(), + BiStrides(), 0, slice_end_mask, (1 << axis_u)); } } @@ -106,18 +112,20 @@ Status CLUnstack::validate(const ITensorInfo *input, const std::vector output_vector.size()); Coordinates slice_start; int32_t slice_end_mask; - for(size_t k = 0; k < num_slices; ++k) + for (size_t k = 0; k < num_slices; ++k) { slice_start.set(wrap_axis(axis, input), k); setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->tensor_shape().num_dimensions()); - ARM_COMPUTE_RETURN_ON_ERROR(CLStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input)))); + ARM_COMPUTE_RETURN_ON_ERROR(CLStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), + BiStrides(), 0, slice_end_mask, + (1 << wrap_axis(axis, input)))); } return Status{}; } void CLUnstack::run() { - for(unsigned i = 0; i < _num_slices; ++i) + for (unsigned i = 0; i < _num_slices; ++i) { _strided_slice_vector[i].run(); } diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp index b416d0fcf106cf29cbad0192001d7760a791d609..645f817030c76d983518ce53f0c97b2555b99081 100644 --- 
a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/CL/ICLKernel.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/operators/ClWinogradConv2d.h" @@ -35,15 +36,15 @@ namespace arm_compute { struct CLWinogradConvolutionLayer::Impl { - const ICLTensor *src{ nullptr }; - const ICLTensor *weights{ nullptr }; - const ICLTensor *biases{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; MemoryGroup memory_group{}; WorkspaceData workspace_tensors{}; - bool is_prepared{ false }; + bool is_prepared{false}; }; CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr memory_manager) @@ -54,15 +55,26 @@ CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptrsrc = input; _impl->weights = weights; @@ -70,20 +82,25 @@ void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_conte _impl->dst = output; _impl->op = std::make_unique(); - _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), conv_info, act_info, enable_fast_math); + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr ? biases->info() : nullptr), output->info(), conv_info, act_info, + enable_fast_math); - _impl->run_pack = - { - { TensorType::ACL_SRC_0, _impl->src }, - { TensorType::ACL_SRC_1, _impl->weights }, - { TensorType::ACL_SRC_2, _impl->biases }, - { TensorType::ACL_DST, _impl->dst } - }; - _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack); + _impl->run_pack = {{TensorType::ACL_SRC_0, _impl->src}, + {TensorType::ACL_SRC_1, _impl->weights}, + {TensorType::ACL_SRC_2, _impl->biases}, + {TensorType::ACL_DST, _impl->dst}}; + _impl->workspace_tensors = + manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack); } -Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { return opencl::ClWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math); } @@ -97,7 +114,7 @@ void CLWinogradConvolutionLayer::run() void CLWinogradConvolutionLayer::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->run_pack); @@ -107,4 +124,4 @@ void CLWinogradConvolutionLayer::prepare() _impl->is_prepared = true; } } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp index 18ade978858165ff3f3cdd599e57a06bf8315a11..4270165ab4b98dd5ef36db24dff35987980daf85 100644 --- 
a/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp +++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include @@ -34,8 +35,7 @@ namespace arm_compute { namespace cl_gemm { -CLGEMMDefaultTypeBifrost::CLGEMMDefaultTypeBifrost(GPUTarget gpu) - : ICLGEMMKernelSelection(gpu) +CLGEMMDefaultTypeBifrost::CLGEMMDefaultTypeBifrost(GPUTarget gpu) : ICLGEMMKernelSelection(gpu) { } @@ -44,109 +44,109 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::select_kernel(const CLGEMMKernelSelec // _target could be used in the future to have a dedicated heuristic for each GPU IP ARM_COMPUTE_UNUSED(_target); - using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeBifrost::*)( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); // Default configurations for Bifrost architectures - static std::map gemm_default_configs = - { - { DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32 }, - { DataType::F16, &CLGEMMDefaultTypeBifrost::default_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 } - }; + static std::map gemm_default_configs = { + {DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeBifrost::default_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}}; // Mali-G71 configurations - static std::map gemm_g71_configs = - { - { DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32 }, - { DataType::F16, &CLGEMMDefaultTypeBifrost::g71_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 } - }; + static std::map gemm_g71_configs = { + {DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeBifrost::g71_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}}; // Mali-G52 configurations - static std::map gemm_g52_configs = - { - { DataType::F32, &CLGEMMDefaultTypeBifrost::g52_f32 }, - { DataType::F16, &CLGEMMDefaultTypeBifrost::g52_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 } - }; + static std::map gemm_g52_configs = { + {DataType::F32, &CLGEMMDefaultTypeBifrost::g52_f32}, + {DataType::F16, &CLGEMMDefaultTypeBifrost::g52_f16}, + {DataType::QASYMM8, 
&CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}}; // Mali-G76 configurations - static std::map gemm_g76_configs = - { - { DataType::F32, &CLGEMMDefaultTypeBifrost::g76_f32 }, - { DataType::F16, &CLGEMMDefaultTypeBifrost::g76_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 } - }; + static std::map gemm_g76_configs = { + {DataType::F32, &CLGEMMDefaultTypeBifrost::g76_f32}, + {DataType::F16, &CLGEMMDefaultTypeBifrost::g76_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}}; const DataType data_type = params.data_type; - switch(_target) + switch (_target) { case GPUTarget::G71: - if(gemm_g71_configs.find(data_type) != gemm_g71_configs.end()) + if (gemm_g71_configs.find(data_type) != gemm_g71_configs.end()) { - return (this->*gemm_g71_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g71_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); case GPUTarget::G76: - if(gemm_g76_configs.find(data_type) != gemm_g76_configs.end()) + if (gemm_g76_configs.find(data_type) != gemm_g76_configs.end()) { - return (this->*gemm_g76_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g76_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); case GPUTarget::G52: - if(gemm_g52_configs.find(data_type) != gemm_g52_configs.end()) + if (gemm_g52_configs.find(data_type) != gemm_g52_configs.end()) { - return (this->*gemm_g52_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g52_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); default: - if(gemm_default_configs.find(data_type) != gemm_default_configs.end()) + if (gemm_default_configs.find(data_type) != gemm_default_configs.end()) { - return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(b); CLGEMMKernelType gemm_type = CLGEMMKernelType::NATIVE; - if(is_rhs_constant) + if (is_rhs_constant) { - if((m > 1) && (n < 16)) + if ((m > 1) && (n < 16)) { gemm_type = CLGEMMKernelType::RESHAPED; } - else if(m == 1) + else if (m == 1) { gemm_type = CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if((k > 256) && (m > 
4)) + if ((k > 256) && (m > 4)) { constexpr float alpha = 3.2f; constexpr float fact0 = 1.51f; constexpr float fact1 = 1.66f; constexpr float ops = 12.0f; const float scale = k > 1024 ? 1.07f : 1.0f; - gemm_type = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) ? CLGEMMKernelType::RESHAPED : CLGEMMKernelType::RESHAPED_ONLY_RHS; + gemm_type = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) + ? CLGEMMKernelType::RESHAPED + : CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { @@ -156,19 +156,21 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f32(unsigned int m, unsigned const auto workload = static_cast((m * n) / 20.0f); - gemm_type = ((workload > 1600.0f) && (gemm_type == CLGEMMKernelType::RESHAPED)) ? CLGEMMKernelType::RESHAPED : gemm_type; + gemm_type = ((workload > 1600.0f) && (gemm_type == CLGEMMKernelType::RESHAPED)) ? CLGEMMKernelType::RESHAPED + : gemm_type; } return gemm_type; } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(n, k, b); - if(is_rhs_constant) + if (is_rhs_constant) { - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -183,11 +185,12 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f16(unsigned int m, unsigned } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_q8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); - if(is_rhs_constant) + if (is_rhs_constant) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -197,21 +200,22 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_q8(unsigned int m, unsigned i } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(b); - if(!is_rhs_constant) + if (!is_rhs_constant) { return CLGEMMKernelType::NATIVE; } - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } - if(k <= 496) + if (k <= 496) { - if(n <= 544) + if (n <= 544) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -222,17 +226,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int } else { - if(k <= 588) + if (k <= 588) { - if(k <= 552) + if (k <= 552) { - if(m <= 148) + if (m <= 148) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(m <= 278) + if (m <= 278) { return CLGEMMKernelType::RESHAPED; } @@ -254,16 +258,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(b); - if(!is_rhs_constant) + if (!is_rhs_constant) { return CLGEMMKernelType::NATIVE; } - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -273,13 +278,13 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, 
unsigned int const float r_nk = static_cast(n) / static_cast(k); const float r_mnk = static_cast(m) / (static_cast(n) * static_cast(k)); - if(r_mn <= 1.5469f) + if (r_mn <= 1.5469f) { - if(r_mk <= 0.8766f) + if (r_mk <= 0.8766f) { - if(r_mk <= 0.0211f) + if (r_mk <= 0.0211f) { - if(r_mnk <= 77.5833f) + if (r_mnk <= 77.5833f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -290,7 +295,7 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int } else { - if(r_nk <= 0.0832f) + if (r_nk <= 0.0832f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -302,11 +307,11 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int } else { - if(r_mnk <= 193.0000f) + if (r_mnk <= 193.0000f) { - if(r_mn <= 0.9948f) + if (r_mn <= 0.9948f) { - if(r_mk <= 2.5453f) + if (r_mk <= 2.5453f) { return CLGEMMKernelType::RESHAPED; } @@ -328,17 +333,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int } else { - if(r_mn <= 17.7370f) + if (r_mn <= 17.7370f) { - if(r_mnk <= 1391.2875f) + if (r_mnk <= 1391.2875f) { - if(r_mk <= 2.9724f) + if (r_mk <= 2.9724f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(r_mnk <= 470.0000f) + if (r_mnk <= 470.0000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -350,9 +355,9 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int } else { - if(r_nk <= 0.1381f) + if (r_nk <= 0.1381f) { - if(r_mnk <= 9040.5000f) + if (r_mnk <= 9040.5000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -363,7 +368,7 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int } else { - if(r_mn <= 5.6790f) + if (r_mn <= 5.6790f) { return CLGEMMKernelType::RESHAPED; } @@ -381,16 +386,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(b); - if(!is_rhs_constant) + if (!is_rhs_constant) { return CLGEMMKernelType::NATIVE; } - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -398,21 +404,21 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int const float r_mn = static_cast(m) / static_cast(n); const float r_nk = static_cast(n) / static_cast(k); - if(k <= 212) + if (k <= 212) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(r_nk <= 0.4990234375f) + if (r_nk <= 0.4990234375f) { - if(k <= 1392) + if (k <= 1392) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(m <= 325) + if (m <= 325) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -424,13 +430,13 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int } else { - if(k <= 471) + if (k <= 471) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(r_mn <= 0.04475911520421505f) + if (r_mn <= 0.04475911520421505f) { return CLGEMMKernelType::RESHAPED; } @@ -443,37 +449,38 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { - if(!is_rhs_constant) + if 
(!is_rhs_constant) { return CLGEMMKernelType::NATIVE; } - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } - if(n <= 127.0000f) + if (n <= 127.0000f) { - if(n <= 63.5000f) + if (n <= 63.5000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(m <= 3616.0000f) + if (m <= 3616.0000f) { - if(b <= 18.5000f) + if (b <= 18.5000f) { - if(m <= 2970.5000f) + if (m <= 2970.5000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(k <= 104.0000f) + if (k <= 104.0000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -496,19 +503,19 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int } else { - if(m <= 12.5000f) + if (m <= 12.5000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(k <= 104.0000f) + if (k <= 104.0000f) { - if(b <= 18.5000f) + if (b <= 18.5000f) { - if(m <= 490.0000f) + if (m <= 490.0000f) { - if(n <= 272.0000f) + if (n <= 272.0000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -529,11 +536,11 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int } else { - if(m <= 226.0000f) + if (m <= 226.0000f) { - if(n <= 140.0000f) + if (n <= 140.0000f) { - if(m <= 179.5000f) + if (m <= 179.5000f) { return CLGEMMKernelType::RESHAPED; } @@ -556,15 +563,16 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::g71_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g71_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(b); ARM_COMPUTE_UNUSED(n); ARM_COMPUTE_UNUSED(k); - if(is_rhs_constant) + if (is_rhs_constant) { - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp index ef30b28f963e74b4fe2f6d582101c15b829b6b9a..673038a8db8c07a4deb8de8a50461ffb3aa68c5a 100644 --- a/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp +++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/GPUTarget.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include @@ -35,8 +36,7 @@ namespace arm_compute { namespace cl_gemm { -CLGEMMDefaultTypeMidgard::CLGEMMDefaultTypeMidgard(GPUTarget gpu) - : ICLGEMMKernelSelection(gpu) +CLGEMMDefaultTypeMidgard::CLGEMMDefaultTypeMidgard(GPUTarget gpu) : ICLGEMMKernelSelection(gpu) { } @@ -45,22 +45,21 @@ CLGEMMKernelType CLGEMMDefaultTypeMidgard::select_kernel(const CLGEMMKernelSelec // _target could be used in the future to have a dedicated heuristic for each GPU IP ARM_COMPUTE_UNUSED(_target); - using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeMidgard::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeMidgard::*)( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); // Configurations for Midgard architectures - static std::map gemm_configs = - { - { DataType::F32, &CLGEMMDefaultTypeMidgard::default_f32 }, - { DataType::F16, &CLGEMMDefaultTypeMidgard::default_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeMidgard::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeMidgard::default_q8 }, - { DataType::QSYMM8, 
&CLGEMMDefaultTypeMidgard::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeMidgard::default_q8 } - }; + static std::map gemm_configs = { + {DataType::F32, &CLGEMMDefaultTypeMidgard::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeMidgard::default_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeMidgard::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeMidgard::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeMidgard::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeMidgard::default_q8}}; const DataType data_type = params.data_type; - if(gemm_configs.find(data_type) != gemm_configs.end()) + if (gemm_configs.find(data_type) != gemm_configs.end()) { return (this->*gemm_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); } @@ -68,7 +67,8 @@ CLGEMMKernelType CLGEMMDefaultTypeMidgard::select_kernel(const CLGEMMKernelSelec ARM_COMPUTE_ERROR("Not supported data type"); } -CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(n, k, b); @@ -76,7 +76,8 @@ CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f32(unsigned int m, unsigned return ((m != 1) && is_rhs_constant) ? CLGEMMKernelType::RESHAPED : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(n, k, b); @@ -84,7 +85,8 @@ CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f16(unsigned int m, unsigned return ((m != 1) && is_rhs_constant) ? 
CLGEMMKernelType::RESHAPED : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_q8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b, is_rhs_constant); diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp index 9e779d3752f4f89585530f657ad3f2a12f35a7b1..851e23bc84545d1c348bbec3c3ebbf7bc0cec901 100644 --- a/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp +++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include @@ -34,8 +35,7 @@ namespace arm_compute { namespace cl_gemm { -CLGEMMDefaultTypeValhall::CLGEMMDefaultTypeValhall(GPUTarget gpu) - : ICLGEMMKernelSelection(gpu) +CLGEMMDefaultTypeValhall::CLGEMMDefaultTypeValhall(GPUTarget gpu) : ICLGEMMKernelSelection(gpu) { } @@ -44,135 +44,136 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::select_kernel(const CLGEMMKernelSelec // _target could be used in the future to have a dedicated heuristic for each GPU IP ARM_COMPUTE_UNUSED(_target); - using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeValhall::*)( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); // Default configurations for Valhall architectures - static std::map gemm_default_configs = - { - { DataType::F32, &CLGEMMDefaultTypeValhall::default_f32 }, - { DataType::F16, &CLGEMMDefaultTypeValhall::default_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 } - }; + static std::map gemm_default_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::default_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; // Mali-G77 configurations - static std::map gemm_g77_configs = - { - { DataType::F32, &CLGEMMDefaultTypeValhall::default_f32 }, - { DataType::F16, &CLGEMMDefaultTypeValhall::g77_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 } - }; + static std::map gemm_g77_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::g77_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; // Mali-G78 configurations - static std::map 
gemm_g78_configs = - { - { DataType::F32, &CLGEMMDefaultTypeValhall::g78_f32 }, - { DataType::F16, &CLGEMMDefaultTypeValhall::g78_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 } - }; + static std::map gemm_g78_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::g78_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::g78_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; // Mali-G710 and Mali-G610 configurations - static std::map gemm_g710_configs = - { - { DataType::F32, &CLGEMMDefaultTypeValhall::default_f32 }, - { DataType::F16, &CLGEMMDefaultTypeValhall::g710_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 } - }; + static std::map gemm_g710_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::g710_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; // Mali-G715 and Mali-G615 configurations - static std::map gemm_g715_configs = - { - { DataType::F32, &CLGEMMDefaultTypeValhall::g715_f32 }, - { DataType::F16, &CLGEMMDefaultTypeValhall::g715_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 } - }; + static std::map gemm_g715_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::g715_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::g715_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; const DataType data_type = params.data_type; - switch(_target) + switch (_target) { case GPUTarget::G710: case GPUTarget::G610: - if(gemm_g710_configs.find(data_type) != gemm_g710_configs.end()) + if (gemm_g710_configs.find(data_type) != gemm_g710_configs.end()) { - return (this->*gemm_g710_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g710_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); case GPUTarget::G715: case GPUTarget::G615: - if(gemm_g715_configs.find(data_type) != gemm_g715_configs.end()) + if (gemm_g715_configs.find(data_type) != gemm_g715_configs.end()) { - return (this->*gemm_g715_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g715_configs[data_type])(params.m, params.n, params.k, params.b, + 
params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); case GPUTarget::G78: - if(gemm_g78_configs.find(data_type) != gemm_g78_configs.end()) + if (gemm_g78_configs.find(data_type) != gemm_g78_configs.end()) { - return (this->*gemm_g78_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g78_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); case GPUTarget::G77: - if(gemm_g77_configs.find(data_type) != gemm_g77_configs.end()) + if (gemm_g77_configs.find(data_type) != gemm_g77_configs.end()) { - return (this->*gemm_g77_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g77_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); default: - if(gemm_default_configs.find(data_type) != gemm_default_configs.end()) + if (gemm_default_configs.find(data_type) != gemm_default_configs.end()) { - return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); } } -CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMDefaultTypeValhall::g77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMDefaultTypeValhall::g710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); return is_rhs_constant ? 
CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMDefaultTypeValhall::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeValhall::default_q8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); - if(is_rhs_constant) + if (is_rhs_constant) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -182,47 +183,48 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::default_q8(unsigned int m, unsigned i } } -CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(b); - if(!is_rhs_constant) + if (!is_rhs_constant) { return CLGEMMKernelType::NATIVE; } - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } - if(n <= 272.0000f) + if (n <= 272.0000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(k <= 471.0000f) + if (k <= 471.0000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(m <= 72.5000f) + if (m <= 72.5000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(m <= 90.5000f) + if (m <= 90.5000f) { return CLGEMMKernelType::RESHAPED; } else { - if(k <= 2448.0000f) + if (k <= 2448.0000f) { - if(n <= 756.0000f) + if (n <= 756.0000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -241,11 +243,12 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f32(unsigned int m, unsigned int } } -CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); - if(!is_rhs_constant) + if (!is_rhs_constant) { return CLGEMMKernelType::NATIVE; } @@ -253,9 +256,10 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f16(unsigned int m, unsigned int return CLGEMMKernelType::RESHAPED_ONLY_RHS; } -CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { - if(!is_rhs_constant) + if (!is_rhs_constant) { return default_f32(m, n, k, b, is_rhs_constant); } @@ -263,7 +267,7 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f32(unsigned int m, unsigned int unsigned int best_m0; unsigned int best_n0; - if(opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0)) + if (opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0)) { return CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL; } @@ -273,9 +277,10 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f32(unsigned int m, unsigned int } } -CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { - if(!is_rhs_constant) + if (!is_rhs_constant) { return g78_f16(m, n, k, b, is_rhs_constant); } @@ -283,7 +288,7 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f16(unsigned 
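The select_kernel() dispatch touched above is table-driven: each per-GPU map keys a DataType to a pointer-to-member heuristic (FunctionExecutorPtr), and the chosen entry is invoked with the GEMM shape (m, n, k, b) plus the is_rhs_constant flag. A minimal standalone sketch of that pattern follows, using made-up names (MiniSelector, KernelType) rather than the library's classes, so it compiles on its own:

// Standalone sketch of the dispatch pattern used by select_kernel():
// a map from data type to a pointer-to-member heuristic, invoked with the
// GEMM shape. All names here are illustrative, not the library's API.
#include <cstdio>
#include <map>

enum class DataType { F32, F16 };
enum class KernelType { NATIVE, RESHAPED_ONLY_RHS };

class MiniSelector
{
public:
    KernelType select(DataType dt, unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_constant)
    {
        using Fn = KernelType (MiniSelector::*)(unsigned int, unsigned int, unsigned int, unsigned int, bool);
        static const std::map<DataType, Fn> configs = {
            {DataType::F32, &MiniSelector::default_f32},
            {DataType::F16, &MiniSelector::default_f16},
        };
        // Same shape as the real code: look up the heuristic for the data type
        // and call it through the member-function pointer.
        return (this->*configs.at(dt))(m, n, k, b, rhs_constant);
    }

private:
    KernelType default_f32(unsigned int, unsigned int, unsigned int, unsigned int, bool rhs_constant)
    {
        return rhs_constant ? KernelType::RESHAPED_ONLY_RHS : KernelType::NATIVE;
    }
    KernelType default_f16(unsigned int, unsigned int, unsigned int, unsigned int, bool rhs_constant)
    {
        return rhs_constant ? KernelType::RESHAPED_ONLY_RHS : KernelType::NATIVE;
    }
};

int main()
{
    MiniSelector s;
    std::printf("%d\n", static_cast<int>(s.select(DataType::F32, 64, 64, 64, 1, true)));
}

The real selectors differ only in the per-target tables and in the per-type heuristics (for example, g78_f32 walks the shape thresholds shown in this hunk instead of returning a fixed choice).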
int m, unsigned int unsigned int best_m0; unsigned int best_n0; - if(opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0)) + if (opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0)) { return CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL; } diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelection.h b/src/runtime/CL/gemm/CLGEMMKernelSelection.h index 6189a324cf8506511b93af6317a2c222ac0b732c..c528dbcac48d4afe095bc3b22a5711a9b820cc82 100644 --- a/src/runtime/CL/gemm/CLGEMMKernelSelection.h +++ b/src/runtime/CL/gemm/CLGEMMKernelSelection.h @@ -25,6 +25,7 @@ #define SRC_CLGEMMKERNELSELECTION_H #include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h" + #include "src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.h" #include "src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.h" #include "src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.h" @@ -45,7 +46,7 @@ public: */ static std::unique_ptr create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch (get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: return std::make_unique(gpu); diff --git a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp index b06c3b0f8e20b94b7e2f1ebc58824323ba4017c0..8df57197e2ee58755f1bbd7babcaa1aa2873ac44 100644 --- a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp +++ b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" #include "src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h" @@ -51,13 +52,15 @@ GEMMTypeResult select_mlgo_gemm_kernel(const CommonQuery &query, bool reshape_b_ bool valid = false; CLGEMMKernelType gemm_type{}; const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics(); - if(mlgo_heuristics != nullptr) + if (mlgo_heuristics != nullptr) { - std::tie(valid, gemm_type) = mlgo_heuristics->get()->query_gemm_type(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b }); + std::tie(valid, gemm_type) = mlgo_heuristics->get()->query_gemm_type( + mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b}); } - if(valid) + if (valid) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm type: %s.", to_string(gemm_type).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm type: %s.", + to_string(gemm_type).c_str()); } else { @@ -87,10 +90,11 @@ GEMMConfigResult select_default_gemm_config_reshaped_only_rhs(const CommonQuery { GEMMLHSMatrixInfo lhs_info; GEMMRHSMatrixInfo rhs_info; - std::unique_ptr gemm_config = ClGemmReshapedOnlyRhsKernelConfigurationFactory::create(query.gpu_target); + std::unique_ptr gemm_config = + ClGemmReshapedOnlyRhsKernelConfigurationFactory::create(query.gpu_target); ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get()); std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type); - return GEMMConfigResult{ true, lhs_info, rhs_info }; + return GEMMConfigResult{true, lhs_info, rhs_info}; } GEMMConfigResult select_mlgo_gemm_config_reshaped_only_rhs(const CommonQuery &query) @@ -100,32 +104,36 @@ GEMMConfigResult 
select_mlgo_gemm_config_reshaped_only_rhs(const CommonQuery &qu GEMMRHSMatrixInfo rhs_info; mlgo::GEMMConfigReshapedOnlyRHS config{}; const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics(); - if(mlgo_heuristics != nullptr) + if (mlgo_heuristics != nullptr) { - std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped_only_rhs(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b }); + std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped_only_rhs( + mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b}); } - if(valid) + if (valid) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", to_string(config).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", + to_string(config).c_str()); // Setting irrelevant unsigned int parameters to 1 and bool parameters to false as they do no matter - std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info(query.m, query.n, config.m0, config.n0, config.k0, 1, config.h0, false, config.interleave_rhs, !config.transpose_rhs, config.transpose_rhs, - config.export_cl_image); + std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info( + query.m, query.n, config.m0, config.n0, config.k0, 1, config.h0, false, config.interleave_rhs, + !config.transpose_rhs, config.transpose_rhs, config.export_cl_image); } else { ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed"); } - return GEMMConfigResult{ valid, lhs_info, rhs_info }; + return GEMMConfigResult{valid, lhs_info, rhs_info}; } GEMMConfigResult select_default_gemm_config_reshaped(const CommonQuery &query) { GEMMLHSMatrixInfo lhs_info; GEMMRHSMatrixInfo rhs_info; - std::unique_ptr gemm_config = ClGemmReshapedKernelConfigurationFactory::create(query.gpu_target); + std::unique_ptr gemm_config = + ClGemmReshapedKernelConfigurationFactory::create(query.gpu_target); ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get()); std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type); - return GEMMConfigResult{ true, lhs_info, rhs_info }; + return GEMMConfigResult{true, lhs_info, rhs_info}; } GEMMConfigResult select_mlgo_gemm_config_reshaped(const CommonQuery &query) @@ -135,21 +143,24 @@ GEMMConfigResult select_mlgo_gemm_config_reshaped(const CommonQuery &query) GEMMRHSMatrixInfo rhs_info; mlgo::GEMMConfigReshaped config{}; const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics(); - if(mlgo_heuristics != nullptr) + if (mlgo_heuristics != nullptr) { - std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b }); + std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped( + mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b}); } - if(valid) + if (valid) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", to_string(config).c_str()); - std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info(query.m, query.n, config.m0, config.n0, config.k0, config.v0, config.h0, config.interleave_lhs, config.interleave_rhs, !config.transpose_rhs, - config.transpose_rhs, config.export_cl_image); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", + to_string(config).c_str()); + 
std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info( + query.m, query.n, config.m0, config.n0, config.k0, config.v0, config.h0, config.interleave_lhs, + config.interleave_rhs, !config.transpose_rhs, config.transpose_rhs, config.export_cl_image); } else { ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed"); } - return GEMMConfigResult{ valid, lhs_info, rhs_info }; + return GEMMConfigResult{valid, lhs_info, rhs_info}; } GEMMConfigResult select_default_gemm_config_native(const CommonQuery &query) @@ -159,7 +170,7 @@ GEMMConfigResult select_default_gemm_config_native(const CommonQuery &query) std::unique_ptr gemm_config = ClGemmNativeKernelConfigurationFactory::create(query.gpu_target); ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get()); std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type); - return GEMMConfigResult{ true, lhs_info, rhs_info }; + return GEMMConfigResult{true, lhs_info, rhs_info}; } GEMMConfigResult select_mlgo_gemm_config_native(const CommonQuery &query) @@ -169,23 +180,26 @@ GEMMConfigResult select_mlgo_gemm_config_native(const CommonQuery &query) GEMMRHSMatrixInfo rhs_info; mlgo::GEMMConfigNative config{}; const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics(); - if(mlgo_heuristics != nullptr) + if (mlgo_heuristics != nullptr) { - std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_native(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b }); + std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_native( + mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b}); } - if(valid) + if (valid) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", to_string(config).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", + to_string(config).c_str()); // Setting irrelevant unsigned int parameters to 1 and bool parameters to false as they do no matter - std::tie(lhs_info, rhs_info) = opencl::kernels::gemm::configure_lhs_rhs_info(query.m, query.n, config.m0, config.n0, config.k0, 1, 1, false, false, false, false, false); + std::tie(lhs_info, rhs_info) = opencl::kernels::gemm::configure_lhs_rhs_info( + query.m, query.n, config.m0, config.n0, config.k0, 1, 1, false, false, false, false, false); } else { ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed"); } - return GEMMConfigResult{ valid, lhs_info, rhs_info }; + return GEMMConfigResult{valid, lhs_info, rhs_info}; } } // namespace auto_heuristics } // namespace cl_gemm -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h index 020237b7f4616a502baca47b0b71fc2e33257fde..f544715e03a43da6a71586fdec488b0ccb10e2b8 100644 --- a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h +++ b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h @@ -50,8 +50,7 @@ struct CommonQuery /** Result of querying about GEMM type ( @ref CLGEMMKernelType) */ struct GEMMTypeResult { - GEMMTypeResult(bool valid, CLGEMMKernelType gemm_type) - : valid{ valid }, gemm_type{ gemm_type } + GEMMTypeResult(bool valid, CLGEMMKernelType gemm_type) : valid{valid}, gemm_type{gemm_type} { } /** Test if the result is valid */ @@ -67,7 +66,7 @@ struct GEMMTypeResult struct GEMMConfigResult { 
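Each select_mlgo_gemm_config_* function above follows the same shape: query MLGOHeuristics and report the outcome through the valid flag of GEMMConfigResult, so that the caller can fall back to the corresponding select_default_gemm_config_* path. A compressed, self-contained sketch of that control flow, with hypothetical stand-in types (Config, ConfigResult) and stubbed selectors in place of the real configurators:

// Sketch of the query-then-fallback pattern around GEMMConfigResult.
// select_mlgo_config / select_default_config are hypothetical stand-ins for
// the select_mlgo_gemm_config_* / select_default_gemm_config_* functions.
#include <cstdio>

struct Config
{
    unsigned int m0 = 1, n0 = 1, k0 = 1; // block sizes, as in GEMMConfigNative
};

struct ConfigResult
{
    bool   valid;
    Config config;
};

ConfigResult select_mlgo_config()
{
    // Pretend MLGO had no answer (no .mlgo file loaded, or the query failed).
    return {false, {}};
}

ConfigResult select_default_config()
{
    // The built-in configurator always produces something usable.
    return {true, {4, 4, 4}};
}

Config choose_config()
{
    ConfigResult r = select_mlgo_config();
    if (!r.valid)
    {
        r = select_default_config(); // fall back to the default heuristic
    }
    return r.config;
}

int main()
{
    const Config c = choose_config();
    std::printf("m0=%u n0=%u k0=%u\n", c.m0, c.n0, c.k0);
}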
GEMMConfigResult(bool valid, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info) - : valid{ valid }, lhs_info{ lhs_info }, rhs_info{ rhs_info } + : valid{valid}, lhs_info{lhs_info}, rhs_info{rhs_info} { } /** Test if the result is valid */ @@ -134,4 +133,4 @@ GEMMConfigResult select_default_gemm_config_native(const CommonQuery &query); } // namespace cl_gemm } // namespace arm_compute -#endif // SRC_RUNTIME_CL_GEMM_AUTO_HEURISTICS_CL_GEMM_AUTO_HEURISTICS_H \ No newline at end of file +#endif // SRC_RUNTIME_CL_GEMM_AUTO_HEURISTICS_CL_GEMM_AUTO_HEURISTICS_H diff --git a/src/runtime/CL/mlgo/Common.h b/src/runtime/CL/mlgo/Common.h index c451bd9062133d4a46f9ed82fd5a4132a881f12a..08a7ee8c18302a8a7602b880b6e377f23b6aff2e 100644 --- a/src/runtime/CL/mlgo/Common.h +++ b/src/runtime/CL/mlgo/Common.h @@ -45,37 +45,37 @@ using GEMMType = CLGEMMKernelType; /** GEMM Configuration for Native kernel */ struct GEMMConfigNative { - unsigned int m0{ 1 }; /**< Number of rows processed by the matrix multiplication */ - unsigned int n0{ 1 }; /**< Number of columns processed by the matrix multiplication */ - unsigned int k0{ 1 }; /**< Number of partial accumulations performed by the matrix multiplication */ + unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */ + unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */ + unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */ }; /** GEMM Configuration for Reshaped Only RHS kernel */ struct GEMMConfigReshapedOnlyRHS { - unsigned int m0{ 1 }; /**< Number of rows processed by the matrix multiplication */ - unsigned int n0{ 1 }; /**< Number of columns processed by the matrix multiplication */ - unsigned int k0{ 1 }; /**< Number of partial accumulations performed by the matrix multiplication */ - unsigned int h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ - bool interleave_rhs{ false }; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */ - bool transpose_rhs{ false }; /**< True if the (k0xn0) block has to be transposed before been stored */ - bool export_cl_image{ false }; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */ + unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */ + unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */ + unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */ + unsigned int h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ + bool interleave_rhs{false}; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */ + bool transpose_rhs{false}; /**< True if the (k0xn0) block has to be transposed before been stored */ + bool export_cl_image{false}; /**< True if the reshaped rhs has to be exported to cl_image. 
n0 must be equal to 4 */ }; /** GEMM Configuration for Reshaped kernel */ struct GEMMConfigReshaped { - unsigned int m0{ 1 }; /**< Number of rows processed by the matrix multiplication */ - unsigned int n0{ 1 }; /**< Number of columns processed by the matrix multiplication */ - unsigned int k0{ 1 }; /**< Number of partial accumulations performed by the matrix multiplication */ - unsigned int v0{ 1 }; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */ - unsigned int h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ - bool interleave_lhs{ false }; /**< True if the v0 (m0xk0) blocks have to be interleaved in the output row */ - bool interleave_rhs{ false }; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */ - bool transpose_rhs{ false }; /**< True if the (k0xn0) block has to be transposed before been stored */ - bool export_cl_image{ false }; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */ + unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */ + unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */ + unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */ + unsigned int v0{1}; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */ + unsigned int h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ + bool interleave_lhs{false}; /**< True if the v0 (m0xk0) blocks have to be interleaved in the output row */ + bool interleave_rhs{false}; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */ + bool transpose_rhs{false}; /**< True if the (k0xn0) block has to be transposed before been stored */ + bool export_cl_image{false}; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */ }; } // namespace mlgo } // namespace arm_compute -#endif // SRC_RUNTIME_CL_MLGO_COMMON_H \ No newline at end of file +#endif // SRC_RUNTIME_CL_MLGO_COMMON_H diff --git a/src/runtime/CL/mlgo/HeuristicTree.cpp b/src/runtime/CL/mlgo/HeuristicTree.cpp index 1c75cdc4279f3c1d49c1f1b4b8e180fc3ac3103b..f7b706902b091559b55e10319a356ada4c10168d 100644 --- a/src/runtime/CL/mlgo/HeuristicTree.cpp +++ b/src/runtime/CL/mlgo/HeuristicTree.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "src/runtime/CL/mlgo/HeuristicTree.h" + #include "arm_compute/core/Log.h" #include "support/Cast.h" @@ -40,27 +41,23 @@ bool evaluate(GEMMShape shape, Condition cond) // PRE: all features and ConditionalOps are valid constexpr float eps = 0.0001f; // Calculate all secondary features - std::vector> cond_values - { - { "m", static_cast(shape.m) }, - { "n", static_cast(shape.n) }, - { "k", static_cast(shape.k) }, - { "b", static_cast(shape.b) }, - { "r_mn", static_cast(shape.m) / shape.n }, - { "r_mk", static_cast(shape.m) / shape.k }, - { "r_nk", static_cast(shape.n) / shape.k }, - { "r_mnk", static_cast(shape.m) / (static_cast(shape.n) / shape.k) }, - { "workload", (static_cast(shape.m) * shape.n * shape.b) / 20.0 } - }; - auto cond_value_pair_it = std::find_if(cond_values.begin(), cond_values.end(), - [&cond](decltype(*cond_values.begin()) it) - { - return it.first == cond.feature; - }); + std::vector> cond_values{ + {"m", static_cast(shape.m)}, + {"n", static_cast(shape.n)}, + {"k", static_cast(shape.k)}, + {"b", static_cast(shape.b)}, + {"r_mn", static_cast(shape.m) / shape.n}, + {"r_mk", static_cast(shape.m) / shape.k}, + {"r_nk", static_cast(shape.n) / shape.k}, + {"r_mnk", static_cast(shape.m) / (static_cast(shape.n) / shape.k)}, + {"workload", (static_cast(shape.m) * shape.n * shape.b) / 20.0}}; + auto cond_value_pair_it = + std::find_if(cond_values.begin(), cond_values.end(), + [&cond](decltype(*cond_values.begin()) it) { return it.first == cond.feature; }); ARM_COMPUTE_ERROR_ON(cond_value_pair_it == cond_values.end()); const float cond_value = cond_value_pair_it->second; - switch(cond.op) + switch (cond.op) { case ConditionalOp::LT: { @@ -92,13 +89,12 @@ constexpr size_t HeuristicTree::_max_num_nodes; constexpr size_t HeuristicTree::_max_query_depth; constexpr HeuristicTree::NodeID HeuristicTree::_root; -HeuristicTree::HeuristicTree() - : HeuristicTree(0, HeuristicType::GEMM_Type, "", DataType::F32) +HeuristicTree::HeuristicTree() : HeuristicTree(0, HeuristicType::GEMM_Type, "", DataType::F32) { } HeuristicTree::HeuristicTree(TreeID id, HeuristicType h_type, const std::string &ip_target, DataType data_type) - : _id{ id }, _heuristic_type{ h_type }, _ip_target{ ip_target }, _data_type{ data_type }, _tree{} + : _id{id}, _heuristic_type{h_type}, _ip_target{ip_target}, _data_type{data_type}, _tree{} { } @@ -108,16 +104,17 @@ std::pair HeuristicTree::query(GEMMShape shape) const // Root ID = 0; auto cur_node = _tree.at(_root).get(); size_t depth = 0; - while(cur_node->type() != NodeType::Leaf) + while (cur_node->type() != NodeType::Leaf) { - if(depth > _max_query_depth) + if (depth > _max_query_depth) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding max query depth: %zu. Is the tree too deep?", _max_query_depth); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding max query depth: %zu. 
Is the tree too deep?", + _max_query_depth); return std::make_pair(false, T{}); } ARM_COMPUTE_ERROR_ON_MSG(cur_node->type() != NodeType::Branch, "Unexpected NodeType"); auto br_node = utils::cast::polymorphic_downcast(cur_node); - if(evaluate(shape, br_node->condition)) + if (evaluate(shape, br_node->condition)) { cur_node = _tree.at(br_node->true_node).get(); } @@ -135,12 +132,12 @@ std::pair HeuristicTree::query(GEMMShape shape) const template bool HeuristicTree::add_leaf(NodeID id, T val) { - if(_tree.size() >= _max_num_nodes) + if (_tree.size() >= _max_num_nodes) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the maximum number of nodes allowed %zu", _max_num_nodes); return false; } - if(_tree.find(id) != _tree.end()) + if (_tree.find(id) != _tree.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add node; node id %zu already exists", id); return false; @@ -151,28 +148,23 @@ bool HeuristicTree::add_leaf(NodeID id, T val) bool HeuristicTree::add_branch(NodeID id, Condition cond, NodeID t_node, NodeID f_node) { - if(_tree.size() >= _max_num_nodes) + if (_tree.size() >= _max_num_nodes) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the maximum number of nodes allowed %zu", _max_num_nodes); return false; } - const std::set supported_features = - { - "m", "n", "k", "b", "r_mn", "r_mk", "r_nk", "r_mnk", "workload" - }; - const auto orig_feature = cond.feature; - std::transform(cond.feature.begin(), cond.feature.end(), cond.feature.begin(), [](char c) - { - return std::tolower(c); - }); - if(supported_features.find(cond.feature) == supported_features.end()) + const std::set supported_features = {"m", "n", "k", "b", "r_mn", "r_mk", "r_nk", "r_mnk", "workload"}; + const auto orig_feature = cond.feature; + std::transform(cond.feature.begin(), cond.feature.end(), cond.feature.begin(), + [](char c) { return std::tolower(c); }); + if (supported_features.find(cond.feature) == supported_features.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Unsupported feature %s", orig_feature.c_str()); return false; } - if(_tree.find(id) != _tree.end()) + if (_tree.find(id) != _tree.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add node; node id %zu already exists", id); return false; @@ -184,32 +176,32 @@ bool HeuristicTree::add_branch(NodeID id, Condition cond, NodeID t_node, NodeID bool HeuristicTree::check_if_structurally_correct() const { std::set visited; - std::deque to_visit{ _root }; + std::deque to_visit{_root}; - while(!to_visit.empty()) + while (!to_visit.empty()) { auto id = to_visit.front(); to_visit.pop_front(); - if(_tree.find(id) == _tree.end()) + if (_tree.find(id) == _tree.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Missing node %zu", id); return false; } auto not_seen_before = visited.insert(id); - if(!not_seen_before.second) + if (!not_seen_before.second) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Not a tree; contains cycles or loops"); return false; } auto cur_node = _tree.at(id).get(); - if(cur_node->type() == NodeType::Branch) + if (cur_node->type() == NodeType::Branch) { auto br_node = utils::cast::polymorphic_downcast(cur_node); to_visit.push_back(br_node->true_node); to_visit.push_back(br_node->false_node); } } - if(visited.size() != _tree.size()) + if (visited.size() != _tree.size()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Contains disjoint nodes"); return false; @@ -219,12 +211,12 @@ bool HeuristicTree::check_if_structurally_correct() const bool HeuristicTree::check() { - if(_tree.empty()) + if (_tree.empty()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Empty 
tree encountered"); return false; } - if(_tree.find(_root) == _tree.end()) + if (_tree.find(_root) == _tree.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Missing root. Root must have a Node ID of %zu", _root); return false; @@ -237,7 +229,8 @@ template std::pair HeuristicTree::query(GEMMShape shap /** Explicit template instantiation @relates HeuristicTree */ template std::pair HeuristicTree::query(GEMMShape shape) const; /** Explicit template instantiation @relates HeuristicTree */ -template std::pair HeuristicTree::query(GEMMShape shape) const; +template std::pair +HeuristicTree::query(GEMMShape shape) const; /** Explicit template instantiation @relates HeuristicTree */ template std::pair HeuristicTree::query(GEMMShape shape) const; diff --git a/src/runtime/CL/mlgo/HeuristicTree.h b/src/runtime/CL/mlgo/HeuristicTree.h index d5c7de22155c2ec40b528e92fc7c22d56b906abe..a4f8c116b9d23c5b7a170b59723de41f1af74dac 100644 --- a/src/runtime/CL/mlgo/HeuristicTree.h +++ b/src/runtime/CL/mlgo/HeuristicTree.h @@ -25,6 +25,7 @@ #define SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H #include "arm_compute/core/Types.h" + #include "src/runtime/CL/mlgo/Common.h" #include @@ -84,7 +85,7 @@ public: struct BranchNode : public Node { BranchNode(NodeID id, Condition cond, NodeID t_node, NodeID f_node) - : id{ id }, condition{ cond }, true_node{ t_node }, false_node{ f_node } + : id{id}, condition{cond}, true_node{t_node}, false_node{f_node} { } NodeType type() const override @@ -100,8 +101,7 @@ public: template struct LeafNode : public Node { - LeafNode(NodeID id, T val) - : id{ id }, value{ val } + LeafNode(NodeID id, T val) : id{id}, value{val} { } NodeType type() const override @@ -177,22 +177,22 @@ public: bool check(); private: - static constexpr size_t _max_query_depth{ 1000 }; // Maximum depth of query - static constexpr size_t _max_num_nodes{ 100000 }; // Maximum number of nodes contained by the tree - static constexpr NodeID _root{ 0 }; // Root tree ID + static constexpr size_t _max_query_depth{1000}; // Maximum depth of query + static constexpr size_t _max_num_nodes{100000}; // Maximum number of nodes contained by the tree + static constexpr NodeID _root{0}; // Root tree ID private: bool check_if_structurally_correct() const; private: - TreeID _id; /**< Heuristic tree ID */ - HeuristicType _heuristic_type; /**< Heuristic type */ - std::string _ip_target; /**< IP target associated with the tree */ - DataType _data_type; /**< Data type associated with the tree */ - std::map> _tree; /**< Tree representation */ + TreeID _id; /**< Heuristic tree ID */ + HeuristicType _heuristic_type; /**< Heuristic type */ + std::string _ip_target; /**< IP target associated with the tree */ + DataType _data_type; /**< Data type associated with the tree */ + std::map> _tree; /**< Tree representation */ }; } // namespace mlgo } // namespace arm_compute -#endif //SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H \ No newline at end of file +#endif //SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H diff --git a/src/runtime/CL/mlgo/MLGOHeuristics.cpp b/src/runtime/CL/mlgo/MLGOHeuristics.cpp index 80f3bb85e900655f5e034540e1bd67eae15f989e..aed46cd80f5e890e97c596802b956e8c807899c6 100644 --- a/src/runtime/CL/mlgo/MLGOHeuristics.cpp +++ b/src/runtime/CL/mlgo/MLGOHeuristics.cpp @@ -24,6 +24,7 @@ #include "src/runtime/CL/mlgo/MLGOHeuristics.h" #include "arm_compute/core/Log.h" + #include "src/runtime/CL/mlgo/MLGOParser.h" #include "src/runtime/CL/mlgo/Utils.h" @@ -39,19 +40,19 @@ bool operator==(const GEMMConfigNative &lhs, const GEMMConfigNative &rhs) } bool 
operator==(const GEMMConfigReshapedOnlyRHS &lhs, const GEMMConfigReshapedOnlyRHS &rhs) { - return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.h0, lhs.interleave_rhs, lhs.transpose_rhs, lhs.export_cl_image) == std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.h0, rhs.interleave_rhs, rhs.transpose_rhs, - rhs.export_cl_image); + return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.h0, lhs.interleave_rhs, lhs.transpose_rhs, lhs.export_cl_image) == + std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.h0, rhs.interleave_rhs, rhs.transpose_rhs, rhs.export_cl_image); } bool operator==(const GEMMConfigReshaped &lhs, const GEMMConfigReshaped &rhs) { - return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.v0, lhs.h0, lhs.interleave_lhs, lhs.interleave_rhs, lhs.transpose_rhs, lhs.export_cl_image) == std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.v0, rhs.h0, - rhs.interleave_lhs, rhs.interleave_rhs, rhs.transpose_rhs, rhs.export_cl_image); + return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.v0, lhs.h0, lhs.interleave_lhs, lhs.interleave_rhs, lhs.transpose_rhs, + lhs.export_cl_image) == std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.v0, rhs.h0, rhs.interleave_lhs, + rhs.interleave_rhs, rhs.transpose_rhs, rhs.export_cl_image); } constexpr size_t MLGOHeuristics::_max_num_trees; -MLGOHeuristics::MLGOHeuristics() - : _indices{}, _trees{}, _tree_valid{}, _valid{ false } +MLGOHeuristics::MLGOHeuristics() : _indices{}, _trees{}, _tree_valid{}, _valid{false} { } @@ -59,71 +60,74 @@ std::pair MLGOHeuristics::query_gemm_type(const Query &query) co { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm type. %s.", to_string(query).c_str()); const auto invalid = GEMMType::RESHAPED; - if(!_valid) + if (!_valid) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead"); - return { false, invalid }; + return {false, invalid}; } auto index = std::make_tuple(HeuristicType::GEMM_Type, query.ip_target, query.data_type); - GEMMShape shape_query{ query.m, query.n, query.k, query.b }; - if(_trees.find(index) == _trees.end()) + GEMMShape shape_query{query.m, query.n, query.k, query.b}; + if (_trees.find(index) == _trees.end()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); - return { false, invalid }; + return {false, invalid}; } return _trees.at(index).query(shape_query); } std::pair MLGOHeuristics::query_gemm_config_native(const Query &query) const { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config native. %s.", to_string(query).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config native. %s.", + to_string(query).c_str()); const auto invalid = GEMMConfigNative{}; - if(!_valid) + if (!_valid) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead"); - return { false, invalid }; + return {false, invalid}; } auto index = std::make_tuple(HeuristicType::GEMM_Config_Native, query.ip_target, query.data_type); - GEMMShape shape_query{ query.m, query.n, query.k, query.b }; - if(_trees.find(index) == _trees.end()) + GEMMShape shape_query{query.m, query.n, query.k, query.b}; + if (_trees.find(index) == _trees.end()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); - return { false, invalid }; + return {false, invalid}; } return _trees.at(index).query(shape_query); } std::pair MLGOHeuristics::query_gemm_config_reshaped_only_rhs(const Query &query) const { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped only rhs. 
%s.", to_string(query).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped only rhs. %s.", + to_string(query).c_str()); const auto invalid = GEMMConfigReshapedOnlyRHS{}; - if(!_valid) + if (!_valid) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead"); - return { false, invalid }; + return {false, invalid}; } auto index = std::make_tuple(HeuristicType::GEMM_Config_Reshaped_Only_RHS, query.ip_target, query.data_type); - GEMMShape shape_query{ query.m, query.n, query.k, query.b }; - if(_trees.find(index) == _trees.end()) + GEMMShape shape_query{query.m, query.n, query.k, query.b}; + if (_trees.find(index) == _trees.end()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); - return { false, invalid }; + return {false, invalid}; } return _trees.at(index).query(shape_query); } std::pair MLGOHeuristics::query_gemm_config_reshaped(const Query &query) const { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped. %s.", to_string(query).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped. %s.", + to_string(query).c_str()); const auto invalid = GEMMConfigReshaped{}; - if(!_valid) + if (!_valid) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead"); - return { false, invalid }; + return {false, invalid}; } auto index = std::make_tuple(HeuristicType::GEMM_Config_Reshaped, query.ip_target, query.data_type); - GEMMShape shape_query{ query.m, query.n, query.k, query.b }; - if(_trees.find(index) == _trees.end()) + GEMMShape shape_query{query.m, query.n, query.k, query.b}; + if (_trees.find(index) == _trees.end()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); - return { false, invalid }; + return {false, invalid}; } return _trees.at(index).query(shape_query); } @@ -131,14 +135,14 @@ std::pair MLGOHeuristics::query_gemm_config_reshaped(c bool MLGOHeuristics::check_heuristic_tree(HeuristicTree::TreeID id) { bool status; - HeuristicTree *tree{ nullptr }; + HeuristicTree *tree{nullptr}; std::tie(status, tree) = get_heuristic_tree(id); - if(!status) + if (!status) { return status; } status = tree->check(); - if(!status) + if (!status) { return status; } @@ -149,14 +153,12 @@ bool MLGOHeuristics::check_heuristic_tree(HeuristicTree::TreeID id) bool MLGOHeuristics::check_all() const { // Tree validities are already checked and cached. - bool all_trees_are_checked = std::find_if(_tree_valid.begin(), _tree_valid.end(), [](auto v) - { - return !v.second; - }) - == _tree_valid.end(); - if(!all_trees_are_checked) + bool all_trees_are_checked = + std::find_if(_tree_valid.begin(), _tree_valid.end(), [](auto v) { return !v.second; }) == _tree_valid.end(); + if (!all_trees_are_checked) { - ARM_COMPUTE_LOG_INFO_MSG_CORE("Missing checks on some trees. Make sure to call check_heuristic_tree after each tree is completed. This could also indicate there are no trees in the dotmlgo"); + ARM_COMPUTE_LOG_INFO_MSG_CORE("Missing checks on some trees. Make sure to call check_heuristic_tree after each " + "tree is completed. 
This could also indicate there are no trees in the dotmlgo"); return false; } @@ -167,14 +169,14 @@ bool MLGOHeuristics::check_all() const std::pair MLGOHeuristics::get_heuristic_tree(HeuristicTree::TreeID id) { - if(_indices.find(id) == _indices.end()) + if (_indices.find(id) == _indices.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot find tree with id %zu", id); return std::make_pair(false, nullptr); } const auto index = _indices[id]; - if(_trees.find(index) == _trees.end()) + if (_trees.find(index) == _trees.end()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); return std::make_pair(false, nullptr); @@ -186,7 +188,7 @@ std::pair MLGOHeuristics::get_heuristic_tree(HeuristicTre bool MLGOHeuristics::add_heuristic_tree(HeuristicTree &&t) { - if(_indices.size() >= _max_num_trees) + if (_indices.size() >= _max_num_trees) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the max number of trees allowed: %zu", _max_num_trees); return false; @@ -194,7 +196,7 @@ bool MLGOHeuristics::add_heuristic_tree(HeuristicTree &&t) // PRE: correctness of t is guaranteed by the tree construction process // Ensure unique id const auto id = t.id(); - if(_indices.find(id) != _indices.end()) + if (_indices.find(id) != _indices.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add redundant trees; tree id %zu already exists", id); return false; @@ -202,7 +204,7 @@ bool MLGOHeuristics::add_heuristic_tree(HeuristicTree &&t) // Ensure unique index const auto index = t.index(); - if(_trees.find(index) != _trees.end()) + if (_trees.find(index) != _trees.end()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot add redundant trees; tree index already exists"); return false; @@ -219,9 +221,10 @@ bool MLGOHeuristics::reload_from_file(const std::string &filename) std::ifstream fs; fs.exceptions(std::ifstream::badbit); fs.open(filename, std::ios::in); - if(!fs.is_open()) + if (!fs.is_open()) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot open DotMLGO file %s. Use default heuristics instead", filename.c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot open DotMLGO file %s. Use default heuristics instead", + filename.c_str()); return _valid = false; } return reload_from_stream(fs); @@ -230,7 +233,7 @@ bool MLGOHeuristics::reload_from_file(const std::string &filename) bool MLGOHeuristics::reload_from_stream(std::istream &in) { auto parsed = parser::parse_mlgo(in); - if(!parsed.first) + if (!parsed.first) { ARM_COMPUTE_LOG_INFO_MSG_CORE("DotMLGO parsing failed. 
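End to end, MLGOHeuristics is populated from a DotMLGO file and then answers shape-specific queries, signalling through the bool of the returned pair when no tree matches so that callers fall back to the default heuristics. A usage sketch assuming the in-tree headers; the file path, the "G77" ip string and the GEMM shape are examples, not values taken from this patch:

// Usage sketch for MLGOHeuristics: load a DotMLGO file, then ask which GEMM
// kernel type to use for a given shape. Query's fields follow the
// construction used in CLGEMMAutoHeuristics.cpp:
// (ip target string, data type, m, n, k, b).
#include "src/runtime/CL/mlgo/MLGOHeuristics.h"

#include <cstdio>
#include <tuple>

using namespace arm_compute;
using namespace arm_compute::mlgo;

int main()
{
    MLGOHeuristics heuristics;
    if (!heuristics.reload_from_file("/path/to/heuristics.mlgo")) // hypothetical path
    {
        std::puts("no DotMLGO loaded; default heuristics would be used");
        return 0;
    }

    bool     valid = false;
    GEMMType type{};
    std::tie(valid, type) = heuristics.query_gemm_type(Query{"G77", DataType::F32, 1024, 1024, 256, 1});
    std::printf("valid=%d type=%d\n", static_cast<int>(valid), static_cast<int>(type));
}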
Use default heuristics instead"); return _valid = false; @@ -241,4 +244,4 @@ bool MLGOHeuristics::reload_from_stream(std::istream &in) } } // namespace mlgo -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/mlgo/MLGOHeuristics.h b/src/runtime/CL/mlgo/MLGOHeuristics.h index aa212259595bb52ac9d834f43275ec9a540cc349..6a491c550327bb11bb99376d394a812d345f1328 100644 --- a/src/runtime/CL/mlgo/MLGOHeuristics.h +++ b/src/runtime/CL/mlgo/MLGOHeuristics.h @@ -135,16 +135,16 @@ public: bool check_all() const; private: - static constexpr size_t _max_num_trees{ 100 }; /**< Max number of trees that can be added*/ + static constexpr size_t _max_num_trees{100}; /**< Max number of trees that can be added*/ private: // There exists a one-to-one mappipng between TreeID and Index, either can be used to identify a @ref HeuristicTree std::map _indices; /**< A mapping from TreeID to Index */ std::map _trees; /**< A mapping from Index to HeuristicTree */ std::map _tree_valid; /**< Result cache of the tree validity checks */ - bool _valid; /**< Overall validity */ + bool _valid; /**< Overall validity */ }; } // namespace mlgo } // namespace arm_compute -#endif //SRC_RUNTIME_CL_MLGO_MLGO_HEURISTICS_H \ No newline at end of file +#endif //SRC_RUNTIME_CL_MLGO_MLGO_HEURISTICS_H diff --git a/src/runtime/CL/mlgo/MLGOParser.cpp b/src/runtime/CL/mlgo/MLGOParser.cpp index 625739e45038edefa86cb1d57ec339f15c91f9b3..893daf2ed904c08ffafa97a3b320692467306749 100644 --- a/src/runtime/CL/mlgo/MLGOParser.cpp +++ b/src/runtime/CL/mlgo/MLGOParser.cpp @@ -22,19 +22,21 @@ * SOFTWARE. */ #include "src/runtime/CL/mlgo/MLGOParser.h" + #include "arm_compute/core/Log.h" + #include "src/runtime/CL/mlgo/Utils.h" #include #define CHECK(parser_expr, valid_var) \ (parser_expr); \ - if(!valid_var) \ + if (!valid_var) \ return; #define CHECK_DEFAULT(parser_expr, valid_var, default_val) \ (parser_expr); \ - if(!valid_var) \ + if (!valid_var) \ return default_val; #ifdef ARM_COMPUTE_LOGGING_ENABLED @@ -53,8 +55,7 @@ valid_var = false; \ return default_val; -#define LOG_TOKEN_POS(tokens, pos_var) \ - const auto pos_var = tokens.current_pos(); +#define LOG_TOKEN_POS(tokens, pos_var) const auto pos_var = tokens.current_pos(); #else // ARM_COMPUTE_LOGGING_ENABLED @@ -73,19 +74,12 @@ namespace { void ltrim(std::string &str) { - str.erase(str.begin(), std::find_if(str.begin(), str.end(), [](char ch) - { - return !std::isspace(ch); - })); + str.erase(str.begin(), std::find_if(str.begin(), str.end(), [](char ch) { return !std::isspace(ch); })); } void rtrim(std::string &str) { - str.erase(std::find_if(str.rbegin(), str.rend(), [](char ch) - { - return !std::isspace(ch); - }).base(), - str.end()); + str.erase(std::find_if(str.rbegin(), str.rend(), [](char ch) { return !std::isspace(ch); }).base(), str.end()); } void trim(std::string &str) @@ -109,7 +103,7 @@ enum class ComparatorType }; TokenStream::TokenStream(std::istream &s, const std::string &delims) - : _delims{ delims }, _istream{ s }, _tokens{}, _lookahead_pos{} + : _delims{delims}, _istream{s}, _tokens{}, _lookahead_pos{} { read(); } @@ -125,7 +119,7 @@ Token TokenStream::take() ARM_COMPUTE_ERROR_ON_MSG(_tokens.empty(), "TokenStream can never be empty"); Token t = _tokens.front(); _tokens.pop_front(); - if(_tokens.empty()) + if (_tokens.empty()) { read(); } @@ -136,7 +130,7 @@ Token TokenStream::peek(size_t i) ARM_COMPUTE_ERROR_ON_MSG(_tokens.empty(), "TokenStream can never be empty"); ARM_COMPUTE_ERROR_ON_MSG(i >= max_look_ahead, 
"TokenStream: Exceeding max look ahead"); // NOTE: If i exceeds the stream (_istream.eof()), read() automatically appends a End token at the end - while(_istream && _tokens.size() <= i) + while (_istream && _tokens.size() <= i) { read(); } @@ -146,7 +140,7 @@ Token TokenStream::peek(size_t i) void advance(CharPosition &pos, char ch) { - if(ch == '\n') + if (ch == '\n') { pos.ln += 1; pos.col = 0; @@ -167,17 +161,16 @@ void TokenStream::read() do { // Reached eof - if(!_istream.get(ch)) + if (!_istream.get(ch)) { - if(!reached_end()) + if (!reached_end()) { _tokens.emplace_back(TokenType::End, "", _lookahead_pos); } return; } advance(_lookahead_pos, ch); - } - while(std::isspace(ch) || is_delim(ch)); + } while (std::isspace(ch) || is_delim(ch)); // Read chars until we hit a delim or eof auto orig_pos = _lookahead_pos; auto tok = recognize_tok(ch); @@ -190,41 +183,41 @@ void TokenStream::read() Token TokenStream::recognize_tok(char ch) { - if(ch == '[') + if (ch == '[') { - return Token{ TokenType::L_List, "", _lookahead_pos }; + return Token{TokenType::L_List, "", _lookahead_pos}; } - else if(ch == ']') + else if (ch == ']') { - return Token{ TokenType::R_List, "", _lookahead_pos }; + return Token{TokenType::R_List, "", _lookahead_pos}; } - else if(ch == '.') + else if (ch == '.') { - return float_after_dp_st(std::string{ ch }); + return float_after_dp_st(std::string{ch}); } - else if(std::isdigit(ch)) + else if (std::isdigit(ch)) { - return num_st(std::string{ ch }); + return num_st(std::string{ch}); } else { - return text_st(std::string{ ch }); + return text_st(std::string{ch}); } } Token TokenStream::num_st(std::string value) { char ch{}; - while(_istream.get(ch)) + while (_istream.get(ch)) { advance(_lookahead_pos, ch); - if(ch == '.') + if (ch == '.') { return float_after_dp_st(value + ch); } - else if(!std::isdigit(ch)) + else if (!std::isdigit(ch)) { - if(!is_delim(ch) && !std::isspace(ch)) + if (!is_delim(ch) && !std::isspace(ch)) { rewind(_lookahead_pos); _istream.unget(); @@ -233,18 +226,18 @@ Token TokenStream::num_st(std::string value) } value += ch; } - return Token{ TokenType::Int, value, _lookahead_pos }; + return Token{TokenType::Int, value, _lookahead_pos}; } Token TokenStream::float_after_dp_st(std::string value) { char ch{}; - while(_istream.get(ch)) + while (_istream.get(ch)) { advance(_lookahead_pos, ch); - if(!std::isdigit(ch)) + if (!std::isdigit(ch)) { - if(!is_delim(ch) && !std::isspace(ch)) + if (!is_delim(ch) && !std::isspace(ch)) { rewind(_lookahead_pos); _istream.unget(); @@ -253,20 +246,20 @@ Token TokenStream::float_after_dp_st(std::string value) } value += ch; } - return Token{ TokenType::Float, value, _lookahead_pos }; + return Token{TokenType::Float, value, _lookahead_pos}; } Token TokenStream::text_st(std::string value) { char ch{}; - while(_istream.get(ch)) + while (_istream.get(ch)) { advance(_lookahead_pos, ch); - if(is_delim(ch)) + if (is_delim(ch)) { break; } - if(ch == '[' || ch == ']') + if (ch == '[' || ch == ']') { rewind(_lookahead_pos); _istream.unget(); @@ -274,7 +267,7 @@ Token TokenStream::text_st(std::string value) } value += ch; } - return Token{ TokenType::Text, value, _lookahead_pos }; + return Token{TokenType::Text, value, _lookahead_pos}; } bool TokenStream::reached_end() const @@ -291,7 +284,7 @@ void end(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); auto tok = in.take(); - if(tok.type != TokenType::End) + if (tok.type != TokenType::End) { FAIL_WITH_MSG(valid, pos, "Unexpected token at the end of stream"); } @@ -301,7 +294,7 
@@ bool bool_val(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); auto tok = in.take(); - if(tok.type != TokenType::Int) + if (tok.type != TokenType::Int) { FAIL_WITH_MSG_DEFAULT(valid, false, pos, "Expect bool or int token"); } @@ -314,7 +307,7 @@ int int_val(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); auto tok = in.take(); - if(tok.type != TokenType::Int) + if (tok.type != TokenType::Int) { FAIL_WITH_MSG_DEFAULT(valid, -1, pos, "Expect int token"); } @@ -327,7 +320,7 @@ unsigned int uint_val(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); int val = CHECK_DEFAULT(int_val(in, valid), valid, 0); - if(val < 0) + if (val < 0) { FAIL_WITH_MSG_DEFAULT(valid, 0, pos, "Expect unsigned int token"); } @@ -338,7 +331,7 @@ float float_val(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); auto tok = in.take(); - if(tok.type != TokenType::Float) + if (tok.type != TokenType::Float) { FAIL_WITH_MSG_DEFAULT(valid, 0.f, pos, "Expect float token"); } @@ -351,7 +344,7 @@ std::string text_val(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); auto tok = in.take(); - if(tok.type != TokenType::Text || tok.value.empty()) + if (tok.type != TokenType::Text || tok.value.empty()) { FAIL_WITH_MSG_DEFAULT(valid, "", pos, "Expect a non-empty text token"); } @@ -361,9 +354,9 @@ std::string text_val(TokenStream &in, bool &valid) bool accept_text(TokenStream &in, const std::string &c_str, bool take = true) { auto tok = in.peek(); - if(tok.type == TokenType::Text && tok.value == c_str) + if (tok.type == TokenType::Text && tok.value == c_str) { - if(take) + if (take) { in.take(); } @@ -375,7 +368,7 @@ bool accept_text(TokenStream &in, const std::string &c_str, bool take = true) void expect_text(TokenStream &in, const std::string &str, bool &valid) { LOG_TOKEN_POS(in, pos); - if(!accept_text(in, str)) + if (!accept_text(in, str)) { FAIL_WITH_MSG(valid, pos, std::string("Expect text token: ") + str); } @@ -384,7 +377,7 @@ void expect_text(TokenStream &in, const std::string &str, bool &valid) bool accept_l_list(TokenStream &in) { auto tok = in.peek(); - if(tok.type == TokenType::L_List) + if (tok.type == TokenType::L_List) { in.take(); return true; @@ -395,7 +388,7 @@ bool accept_l_list(TokenStream &in) void expect_l_list(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(!accept_l_list(in)) + if (!accept_l_list(in)) { FAIL_WITH_MSG(valid, pos, "Expect '['"); } @@ -404,7 +397,7 @@ void expect_l_list(TokenStream &in, bool &valid) bool accept_r_list(TokenStream &in) { auto tok = in.peek(); - if(tok.type == TokenType::R_List) + if (tok.type == TokenType::R_List) { in.take(); return true; @@ -415,7 +408,7 @@ bool accept_r_list(TokenStream &in) void expect_r_list(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(!accept_r_list(in)) + if (!accept_r_list(in)) { FAIL_WITH_MSG(valid, pos, "Expect ']'"); } @@ -424,23 +417,23 @@ void expect_r_list(TokenStream &in, bool &valid) ConditionalOp conditional_op(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "<=")) + if (accept_text(in, "<=")) { return ConditionalOp::LE; } - else if(accept_text(in, ">=")) + else if (accept_text(in, ">=")) { return ConditionalOp::GE; } - else if(accept_text(in, "==")) + else if (accept_text(in, "==")) { return ConditionalOp::EQ; } - else if(accept_text(in, "<")) + else if (accept_text(in, "<")) { return ConditionalOp::LT; } - else if(accept_text(in, ">")) + else if (accept_text(in, ">")) { return ConditionalOp::GT; } @@ -464,11 +457,11 @@ void ip_type(TokenStream &in, bool 
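The TokenStream and recursive-descent helpers above are normally driven through parser::parse_mlgo(), which consumes any std::istream and returns a (success, MLGOHeuristics) pair; MLGOHeuristics::reload_from_stream() is a thin wrapper over it. A small sketch assuming the in-tree header; the inline document text is a placeholder, so this particular parse is expected to report failure:

// Sketch: feeding an in-memory DotMLGO document to parser::parse_mlgo().
#include "src/runtime/CL/mlgo/MLGOParser.h"

#include <cstdio>
#include <sstream>

int main()
{
    std::istringstream in("...dotmlgo text..."); // placeholder, not a valid DotMLGO document
    auto parsed = arm_compute::mlgo::parser::parse_mlgo(in);
    // parsed.first reports success; parsed.second holds the populated heuristics on success.
    std::printf("parsed=%d\n", static_cast<int>(parsed.first));
}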
&valid) { CHECK(expect_text(in, "ip-type", valid), valid); LOG_TOKEN_POS(in, pos); - if(accept_text(in, "gpu")) + if (accept_text(in, "gpu")) { ; } - else if(accept_text(in, "cpu")) + else if (accept_text(in, "cpu")) { ; } @@ -489,15 +482,15 @@ void header(TokenStream &in, bool &valid) DataType data_type(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "f16")) + if (accept_text(in, "f16")) { return DataType::F16; } - else if(accept_text(in, "f32")) + else if (accept_text(in, "f32")) { return DataType::F32; } - else if(accept_text(in, "qasymm8")) + else if (accept_text(in, "qasymm8")) { return DataType::QASYMM8; } @@ -510,15 +503,15 @@ DataType data_type(TokenStream &in, bool &valid) ComparatorType comparator_type(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "var")) + if (accept_text(in, "var")) { return ComparatorType::Var; } - else if(accept_text(in, "num")) + else if (accept_text(in, "num")) { return ComparatorType::Num; } - else if(accept_text(in, "enum")) + else if (accept_text(in, "enum")) { return ComparatorType::Enum; } @@ -531,19 +524,19 @@ ComparatorType comparator_type(TokenStream &in, bool &valid) HeuristicType heuristic_type(TokenStream &in, bool &valid, bool take = true) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "gemm-type", take)) + if (accept_text(in, "gemm-type", take)) { return HeuristicType::GEMM_Type; } - else if(accept_text(in, "gemm-config-native", take)) + else if (accept_text(in, "gemm-config-native", take)) { return HeuristicType::GEMM_Config_Native; } - else if(accept_text(in, "gemm-config-reshaped-only-rhs", take)) + else if (accept_text(in, "gemm-config-reshaped-only-rhs", take)) { return HeuristicType::GEMM_Config_Reshaped_Only_RHS; } - else if(accept_text(in, "gemm-config-reshaped", take)) + else if (accept_text(in, "gemm-config-reshaped", take)) { return HeuristicType::GEMM_Config_Reshaped; } @@ -557,7 +550,7 @@ void expect_heuristic_type(TokenStream &in, HeuristicType expected_ht, bool &val { LOG_TOKEN_POS(in, pos); auto ht = CHECK(heuristic_type(in, valid, false), valid); - if(ht != expected_ht) + if (ht != expected_ht) { FAIL_WITH_MSG(valid, pos, "Unexpected heuristic type"); } @@ -567,15 +560,15 @@ void expect_heuristic_type(TokenStream &in, HeuristicType expected_ht, bool &val GEMMType gemm_type(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "native")) + if (accept_text(in, "native")) { return GEMMType::NATIVE; } - else if(accept_text(in, "reshaped-only-rhs")) + else if (accept_text(in, "reshaped-only-rhs")) { return GEMMType::RESHAPED_ONLY_RHS; } - else if(accept_text(in, "reshaped")) + else if (accept_text(in, "reshaped")) { return GEMMType::RESHAPED; } @@ -593,7 +586,7 @@ GEMMConfigNative gemm_config_native(TokenStream &in, bool &valid) const auto n0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); const auto k0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val); - return GEMMConfigNative{ m0, n0, k0 }; + return GEMMConfigNative{m0, n0, k0}; } GEMMConfigReshapedOnlyRHS gemm_config_reshaped_only_rhs(TokenStream &in, bool &valid) @@ -608,7 +601,7 @@ GEMMConfigReshapedOnlyRHS gemm_config_reshaped_only_rhs(TokenStream &in, bool &v const auto tr = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); const auto ex = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val); - return GEMMConfigReshapedOnlyRHS{ m0, n0, k0, h0, ir, 
tr, ex }; + return GEMMConfigReshapedOnlyRHS{m0, n0, k0, h0, ir, tr, ex}; } GEMMConfigReshaped gemm_config_reshaped(TokenStream &in, bool &valid) @@ -625,17 +618,17 @@ GEMMConfigReshaped gemm_config_reshaped(TokenStream &in, bool &valid) const auto tr = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); const auto ex = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val); - return GEMMConfigReshaped{ m0, n0, k0, v0, h0, il, ir, tr, ex }; + return GEMMConfigReshaped{m0, n0, k0, v0, h0, il, ir, tr, ex}; } void gpu_priority(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "best-performance")) + if (accept_text(in, "best-performance")) { ; } - else if(accept_text(in, "best-memory-usage")) + else if (accept_text(in, "best-memory-usage")) { ; } @@ -648,11 +641,11 @@ void gpu_priority(TokenStream &in, bool &valid) void gpu_behavior(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "static")) + if (accept_text(in, "static")) { ; } - else if(accept_text(in, "dynamic")) + else if (accept_text(in, "dynamic")) { ; } @@ -665,7 +658,7 @@ void gpu_behavior(TokenStream &in, bool &valid) void free_vars(TokenStream &in, bool &valid) { CHECK(expect_l_list(in, valid), valid); - while(!accept_r_list(in)) + while (!accept_r_list(in)) { CHECK(text_val(in, valid), valid); } @@ -688,7 +681,7 @@ void heuristics_table_entry(TokenStream &in, MLGOHeuristics &h, bool &valid) void heuristics_table(TokenStream &in, MLGOHeuristics &h, bool &valid) { CHECK(expect_text(in, "", valid), valid); - while(!accept_text(in, "")) + while (!accept_text(in, "")) { CHECK(heuristics_table_entry(in, h, valid), valid); } @@ -705,11 +698,12 @@ Condition condition(TokenStream &in, bool &valid) const auto c_o = CHECK_DEFAULT(conditional_op(in, valid), valid, invalid_val); const auto r_t = CHECK_DEFAULT(comparator_type(in, valid), valid, invalid_val); const auto r_v = CHECK_DEFAULT(float_val(in, valid), valid, invalid_val); - if(l_t != ComparatorType::Var || r_t != ComparatorType::Num) + if (l_t != ComparatorType::Var || r_t != ComparatorType::Num) { - FAIL_WITH_MSG_DEFAULT(valid, invalid_val, pos, "Only accept LHS type to be Var (string) and RHS type to be Num (float)"); + FAIL_WITH_MSG_DEFAULT(valid, invalid_val, pos, + "Only accept LHS type to be Var (string) and RHS type to be Num (float)"); } - return Condition{ l_v, c_o, r_v }; + return Condition{l_v, c_o, r_v}; } void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid) @@ -717,13 +711,13 @@ void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid) CHECK(expect_text(in, "", valid), valid); - HeuristicTree *t = nullptr; - std::tie(valid, t) = CHECK(h.get_heuristic_tree(tree_id), valid); + HeuristicTree *t = nullptr; + std::tie(valid, t) = CHECK(h.get_heuristic_tree(tree_id), valid); const HeuristicType t_heuristic_type = std::get<0>(t->index()); - while(!accept_text(in, "")) + while (!accept_text(in, "")) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "b")) + if (accept_text(in, "b")) { // Branch node const auto id = CHECK(uint_val(in, valid), valid); @@ -732,7 +726,7 @@ void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid) const auto f_id = CHECK(uint_val(in, valid), valid); valid = CHECK(t->add_branch(id, cond, t_id, f_id), valid); } - else if(accept_text(in, "l")) + else if (accept_text(in, "l")) { // Leaf node const auto id = CHECK(uint_val(in, valid), valid); @@ -740,7 +734,7 @@ void heuristic_tree(TokenStream &in, 
MLGOHeuristics &h, bool &valid) // heuristic table). For now it remains as a step for validation. LOG_TOKEN_POS(in, pos); CHECK(expect_heuristic_type(in, t_heuristic_type, valid), valid); - switch(t_heuristic_type) + switch (t_heuristic_type) { case HeuristicType::GEMM_Type: { @@ -786,7 +780,7 @@ MLGOHeuristics mlgo(TokenStream &in, bool &valid) MLGOHeuristics h; CHECK_DEFAULT(header(in, valid), valid, h); CHECK_DEFAULT(heuristics_table(in, h, valid), valid, h); - while(accept_text(in, " parse_mlgo(std::istream &in) #undef CHECK #undef CHECK_DEFAULT #undef FAIL_WITH_MSG -#undef FAIL_WITH_MSG_DEFAULT \ No newline at end of file +#undef FAIL_WITH_MSG_DEFAULT diff --git a/src/runtime/CL/mlgo/MLGOParser.h b/src/runtime/CL/mlgo/MLGOParser.h index 49d8b9c644e25b64583d77c869b3de1334b37f69..cffce8d6a1654434f8c96cb92b0c375397f12abc 100644 --- a/src/runtime/CL/mlgo/MLGOParser.h +++ b/src/runtime/CL/mlgo/MLGOParser.h @@ -98,15 +98,14 @@ struct CharPosition return ln == other.ln && col == other.col; } - size_t ln{ 0 }; - size_t col{ 0 }; + size_t ln{0}; + size_t col{0}; }; /** Token */ struct Token { - Token(TokenType t, std::string v, CharPosition pos) - : type{ t }, value{ v }, pos{ pos } + Token(TokenType t, std::string v, CharPosition pos) : type{t}, value{v}, pos{pos} { } @@ -196,4 +195,4 @@ std::pair parse_mlgo(std::istream &in); } // namespace parser } // namespace mlgo } // namespace arm_compute -#endif //SRC_RUNTIME_CL_MLGO_MLGO_PARSER_H \ No newline at end of file +#endif //SRC_RUNTIME_CL_MLGO_MLGO_PARSER_H diff --git a/src/runtime/CL/mlgo/Utils.cpp b/src/runtime/CL/mlgo/Utils.cpp index 81d418c28ec70ac045a7068ffc02517b5e24978e..c7e0100b3cfa9c1679db6395df0ee80144495dc3 100644 --- a/src/runtime/CL/mlgo/Utils.cpp +++ b/src/runtime/CL/mlgo/Utils.cpp @@ -43,40 +43,38 @@ inline std::string to_str(const T &val) std::ostream &operator<<(std::ostream &os, const GEMMConfigNative &config) { return os << "Native:{" - << "m0: " << config.m0 << ", " - << "n0: " << config.n0 << ", " - << "k0: " << config.k0 << ", " - << "}"; + << "m0: " << config.m0 << ", " + << "n0: " << config.n0 << ", " + << "k0: " << config.k0 << ", " + << "}"; } std::ostream &operator<<(std::ostream &os, const GEMMConfigReshapedOnlyRHS &config) { return os << "ReshapedOnlyRHS:{" - << "m0: " << config.m0 << ", " - << "n0: " << config.n0 << ", " - << "k0: " << config.k0 << ", " - << "h0: " << config.h0 << ", " - << "interleave_rhs: " << config.interleave_rhs << ", " - << "transpose_rhs: " << config.transpose_rhs << ", " - << "export_cl_image: " << config.export_cl_image - << "}"; + << "m0: " << config.m0 << ", " + << "n0: " << config.n0 << ", " + << "k0: " << config.k0 << ", " + << "h0: " << config.h0 << ", " + << "interleave_rhs: " << config.interleave_rhs << ", " + << "transpose_rhs: " << config.transpose_rhs << ", " + << "export_cl_image: " << config.export_cl_image << "}"; } std::ostream &operator<<(std::ostream &os, const GEMMConfigReshaped &config) { return os << "Reshaped:{" - << "m0: " << config.m0 << ", " - << "n0: " << config.n0 << ", " - << "k0: " << config.k0 << ", " - << "v0: " << config.v0 << ", " - << "h0: " << config.h0 << ", " - << "interleave_lhs: " << config.interleave_lhs << ", " - << "interleave_rhs: " << config.interleave_rhs << ", " - << "transpose_rhs: " << config.transpose_rhs << ", " - << "export_cl_image: " << config.export_cl_image - << "}"; + << "m0: " << config.m0 << ", " + << "n0: " << config.n0 << ", " + << "k0: " << config.k0 << ", " + << "v0: " << config.v0 << ", " + << "h0: " << config.h0 << ", " + << 
"interleave_lhs: " << config.interleave_lhs << ", " + << "interleave_rhs: " << config.interleave_rhs << ", " + << "transpose_rhs: " << config.transpose_rhs << ", " + << "export_cl_image: " << config.export_cl_image << "}"; } std::ostream &operator<<(std::ostream &os, HeuristicType ht) { - switch(ht) + switch (ht) { case HeuristicType::GEMM_Type: { @@ -103,7 +101,7 @@ std::ostream &operator<<(std::ostream &os, HeuristicType ht) } std::ostream &operator<<(std::ostream &os, DataType dt) { - switch(dt) + switch (dt) { case DataType::F32: { @@ -184,4 +182,4 @@ std::ostream &operator<<(std::ostream &os, const CharPosition &pos) } // namespace mlgo -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/mlgo/Utils.h b/src/runtime/CL/mlgo/Utils.h index c634a887e949b344470037087dcb10f2b2cb4363..73b537f476bc4d1f5c4fc9bf25c4a9098f1f2b0e 100644 --- a/src/runtime/CL/mlgo/Utils.h +++ b/src/runtime/CL/mlgo/Utils.h @@ -43,10 +43,10 @@ std::ostream &operator<<(std::ostream &os, HeuristicType ht); std::ostream &operator<<(std::ostream &os, DataType dt); std::ostream &operator<<(std::ostream &os, const HeuristicTree::Index &index); std::ostream &operator<<(std::ostream &os, const Query &query); -std::string to_string(const GEMMConfigNative &config); -std::string to_string(const GEMMConfigReshapedOnlyRHS &config); -std::string to_string(const GEMMConfigReshaped &config); -std::string to_string(const Query &query); +std::string to_string(const GEMMConfigNative &config); +std::string to_string(const GEMMConfigReshapedOnlyRHS &config); +std::string to_string(const GEMMConfigReshaped &config); +std::string to_string(const Query &query); namespace parser { std::ostream &operator<<(std::ostream &os, const CharPosition &pos); @@ -54,4 +54,4 @@ std::ostream &operator<<(std::ostream &os, const CharPosition &pos); } // namespace mlgo } // namespace arm_compute -#endif //SRC_RUNTIME_CL_MLGO_UTILS_H \ No newline at end of file +#endif //SRC_RUNTIME_CL_MLGO_UTILS_H diff --git a/src/runtime/CL/tuners/CLTuningParametersList.cpp b/src/runtime/CL/tuners/CLTuningParametersList.cpp index 6f3e32491a873c652b104afdd14c97327079aa73..5e3907f1ea924d831e5b48767fcf58addd210cc1 100644 --- a/src/runtime/CL/tuners/CLTuningParametersList.cpp +++ b/src/runtime/CL/tuners/CLTuningParametersList.cpp @@ -27,20 +27,20 @@ namespace arm_compute { namespace cl_tuner { -constexpr unsigned int max_lws_supported_x{ 64u }; -constexpr unsigned int max_lws_supported_y{ 32u }; -constexpr unsigned int max_lws_supported_z{ 32u }; +constexpr unsigned int max_lws_supported_x{64u}; +constexpr unsigned int max_lws_supported_y{32u}; +constexpr unsigned int max_lws_supported_z{32u}; /** Non instantiable base class for Tuning parameters combinations that use Index2Coord mapping */ class CLTuningParametersList : public ICLTuningParametersList { protected: /* Shape of 4-D search space */ - TensorShape search_space_shape{ 0, 0, 0, 0 }; - std::vector _lws_x{ 0 }; - std::vector _lws_y{ 0 }; - std::vector _lws_z{ 0 }; - std::vector _wbsm{ 0 }; /* Modify the batches size of workgroups distributed to compute units. + TensorShape search_space_shape{0, 0, 0, 0}; + std::vector _lws_x{0}; + std::vector _lws_y{0}; + std::vector _lws_z{0}; + std::vector _wbsm{0}; /* Modify the batches size of workgroups distributed to compute units. The value is in the range [-31,+31]. When 0, the runtime-selected wbs used is unmodified. 
*/ @@ -116,7 +116,8 @@ private: * @param[in] lws_max Max LWS value allowed to be tested * @param[in] mod_let_one True if the results of the modulo operation between gws and the lws can be less than one. */ - void initialize_lws_values(std::vector &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one); + void + initialize_lws_values(std::vector &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one); }; /** A minimal subset of LWS values that only have 1,2 and 4/8 */ @@ -170,9 +171,9 @@ CLTuningParametersListExhaustive::CLTuningParametersListExhaustive(const cl::NDR search_space_shape[1] = lws_y_max; search_space_shape[2] = lws_z_max; search_space_shape[3] = 1; - if(tuning_info.tune_wbsm) + if (tuning_info.tune_wbsm) { - _wbsm = { -3, -2, -1, 0, 1, 2, 3 }; + _wbsm = {-3, -2, -1, 0, 1, 2, 3}; search_space_shape[3] = _wbsm.size(); } } @@ -194,26 +195,31 @@ CLTuningParametersListNormal::CLTuningParametersListNormal(const cl::NDRange &gw _lws_x = {}; _lws_y = {}; _lws_z = {}; - initialize_lws_values(_lws_x, gws[0], lws_x_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16 - initialize_lws_values(_lws_y, gws[1], lws_y_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16 + initialize_lws_values(_lws_x, gws[0], lws_x_max, + gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16 + initialize_lws_values(_lws_y, gws[1], lws_y_max, + gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16 initialize_lws_values(_lws_z, gws[2], lws_z_max, false); search_space_shape[0] = _lws_x.size(); search_space_shape[1] = _lws_y.size(); search_space_shape[2] = _lws_z.size(); search_space_shape[3] = 1; - if(tuning_info.tune_wbsm) + if (tuning_info.tune_wbsm) { - _wbsm = { -2, -1, 0, 1, 2 }; + _wbsm = {-2, -1, 0, 1, 2}; search_space_shape[3] = _wbsm.size(); } } -void CLTuningParametersListNormal::initialize_lws_values(std::vector &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one) +void CLTuningParametersListNormal::initialize_lws_values(std::vector &lws, + unsigned int gws, + unsigned int lws_max, + bool mod_let_one) { lws.push_back(1); - for(unsigned int i = 2; i <= lws_max; ++i) + for (unsigned int i = 2; i <= lws_max; ++i) { // Power of two condition const bool is_power_of_two = (i & (i - 1)) == 0; @@ -221,7 +227,7 @@ void CLTuningParametersListNormal::initialize_lws_values(std::vector get_tuning_parameters_list(CLTuningInfo tuning_info, const cl::NDRange &gws) { - switch(tuning_info.tuner_mode) + switch (tuning_info.tuner_mode) { case CLTunerMode::EXHAUSTIVE: return std::make_unique(gws, tuning_info); diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp index 45e872428f832c82b85eb77a8e38c369e0288613..9fbdc3a4ddd789266e0caec28dec85fec0345f59 100644 --- a/src/runtime/CPP/CPPScheduler.cpp +++ b/src/runtime/CPP/CPPScheduler.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/Log.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/Utility.h" + #include "support/Mutex.h" #include @@ -53,8 +54,7 @@ public: * @param[in] start First value that will be returned by the feeder * @param[in] end End condition (The last value returned by get_next() will be end - 1) */ - explicit ThreadFeeder(unsigned int start = 0, unsigned int end = 0) - : _atomic_counter(start), _end(end) + explicit ThreadFeeder(unsigned int start = 0, unsigned int end = 0) : _atomic_counter(start), _end(end) { } /** Return the next element in the range if there is one. 
@@ -89,8 +89,7 @@ void process_workloads(std::vector &workloads, ThreadFeede { ARM_COMPUTE_ERROR_ON(workload_index >= workloads.size()); workloads[workload_index](info); - } - while(feeder.get_next(workload_index)); + } while (feeder.get_next(workload_index)); } /** Set thread affinity. Pin current thread to a particular core @@ -99,7 +98,7 @@ void process_workloads(std::vector &workloads, ThreadFeede */ void set_thread_affinity(int core_id) { - if(core_id < 0) + if (core_id < 0) { return; } @@ -150,10 +149,10 @@ public: */ explicit Thread(int core_pin = -1); - Thread(const Thread &) = delete; + Thread(const Thread &) = delete; Thread &operator=(const Thread &) = delete; Thread(Thread &&) = delete; - Thread &operator=(Thread &&) = delete; + Thread &operator=(Thread &&) = delete; /** Destructor. Make the thread join. */ ~Thread(); @@ -196,21 +195,20 @@ public: private: std::thread _thread{}; ThreadInfo _info{}; - std::vector *_workloads{ nullptr }; - ThreadFeeder *_feeder{ nullptr }; + std::vector *_workloads{nullptr}; + ThreadFeeder *_feeder{nullptr}; std::mutex _m{}; std::condition_variable _cv{}; - bool _wait_for_work{ false }; - bool _job_complete{ true }; - std::exception_ptr _current_exception{ nullptr }; - int _core_pin{ -1 }; - std::list *_thread_pool{ nullptr }; - unsigned int _wake_beg{ 0 }; - unsigned int _wake_end{ 0 }; + bool _wait_for_work{false}; + bool _job_complete{true}; + std::exception_ptr _current_exception{nullptr}; + int _core_pin{-1}; + std::list *_thread_pool{nullptr}; + unsigned int _wake_beg{0}; + unsigned int _wake_end{0}; }; -Thread::Thread(int core_pin) - : _core_pin(core_pin) +Thread::Thread(int core_pin) : _core_pin(core_pin) { _thread = std::thread(&Thread::worker_thread, this); } @@ -218,7 +216,7 @@ Thread::Thread(int core_pin) Thread::~Thread() { // Make sure worker thread has ended - if(_thread.joinable()) + if (_thread.joinable()) { ThreadFeeder feeder; set_workload(nullptr, feeder, ThreadInfo()); @@ -257,7 +255,7 @@ void Thread::worker_thread() { set_thread_affinity(_core_pin); - while(true) + while (true) { std::unique_lock lock(_m); _cv.wait(lock, [&] { return _wait_for_work; }); @@ -266,18 +264,18 @@ void Thread::worker_thread() _current_exception = nullptr; // Exit if the worker thread has not been fed with workloads - if(_workloads == nullptr || _feeder == nullptr) + if (_workloads == nullptr || _feeder == nullptr) { return; } // Wake up more peer threads from thread pool if this job has been delegated to the current thread - if(_thread_pool != nullptr) + if (_thread_pool != nullptr) { auto thread_it = _thread_pool->begin(); std::advance(thread_it, std::min(static_cast(_thread_pool->size()), _wake_beg)); auto wake_end = std::min(_wake_end, static_cast(_info.num_threads - 1)); - for(unsigned int t = _wake_beg; t < wake_end; ++t, ++thread_it) + for (unsigned int t = _wake_beg; t < wake_end; ++t, ++thread_it) { thread_it->start(); } @@ -291,7 +289,7 @@ void Thread::worker_thread() #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED } - catch(...) + catch (...) 
{ _current_exception = std::current_exception(); } @@ -322,11 +320,11 @@ struct CPPScheduler::Impl final : _num_threads(thread_hint), _threads(_num_threads - 1), _mode(Mode::Linear), _wake_fanout(0U) { const auto mode_env_v = utility::tolower(utility::getenv("ARM_COMPUTE_CPP_SCHEDULER_MODE")); - if(mode_env_v == "linear") + if (mode_env_v == "linear") { _forced_mode = ModeToggle::Linear; } - else if(mode_env_v == "fanout") + else if (mode_env_v == "fanout") { _forced_mode = ModeToggle::Fanout; } @@ -350,7 +348,7 @@ struct CPPScheduler::Impl final // Set affinity on worked threads _threads.clear(); - for(auto i = 1U; i < _num_threads; ++i) + for (auto i = 1U; i < _num_threads; ++i) { _threads.emplace_back(func(i, thread_hint)); } @@ -359,20 +357,23 @@ struct CPPScheduler::Impl final void auto_switch_mode(unsigned int num_threads_to_use) { // If the environment variable is set to any of the modes, it overwrites the mode selected over num_threads_to_use - if(_forced_mode == ModeToggle::Fanout || (_forced_mode == ModeToggle::None && num_threads_to_use > 8)) + if (_forced_mode == ModeToggle::Fanout || (_forced_mode == ModeToggle::None && num_threads_to_use > 8)) { set_fanout_mode(m_default_wake_fanout, num_threads_to_use); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Set CPPScheduler to Fanout mode, with wake up fanout : %d and %d threads to use\n", this->wake_fanout(), num_threads_to_use); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Set CPPScheduler to Fanout mode, with wake up fanout : %d and %d threads to use\n", + this->wake_fanout(), num_threads_to_use); } else // Equivalent to (_forced_mode == ModeToggle::Linear || (_forced_mode == ModeToggle::None && num_threads_to_use <= 8)) { set_linear_mode(); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Set CPPScheduler to Linear mode, with %d threads to use\n", num_threads_to_use); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Set CPPScheduler to Linear mode, with %d threads to use\n", + num_threads_to_use); } } void set_linear_mode() { - for(auto &thread : _threads) + for (auto &thread : _threads) { thread.set_linear_mode(); } @@ -384,14 +385,14 @@ struct CPPScheduler::Impl final ARM_COMPUTE_ERROR_ON(num_threads_to_use > _threads.size() + 1); const auto actual_wake_fanout = std::max(2U, std::min(wake_fanout, num_threads_to_use - 1)); auto thread_it = _threads.begin(); - for(auto i = 1U; i < num_threads_to_use; ++i, ++thread_it) + for (auto i = 1U; i < num_threads_to_use; ++i, ++thread_it) { const auto wake_begin = i * actual_wake_fanout - 1; const auto wake_end = std::min((i + 1) * actual_wake_fanout - 1, num_threads_to_use - 1); thread_it->set_fanout_mode(&_threads, wake_begin, wake_end); } // Reset the remaining threads's wake up schedule - while(thread_it != _threads.end()) + while (thread_it != _threads.end()) { thread_it->set_fanout_mode(&_threads, 0U, 0U); ++thread_it; @@ -417,9 +418,9 @@ struct CPPScheduler::Impl final unsigned int _num_threads; std::list _threads; arm_compute::Mutex _run_workloads_mutex{}; - Mode _mode{ Mode::Linear }; - ModeToggle _forced_mode{ ModeToggle::None }; - unsigned int _wake_fanout{ 0 }; + Mode _mode{Mode::Linear}; + ModeToggle _forced_mode{ModeToggle::None}; + unsigned int _wake_fanout{0}; }; /* @@ -431,8 +432,7 @@ CPPScheduler &CPPScheduler::get() return scheduler; } -CPPScheduler::CPPScheduler() - : _impl(std::make_unique(num_threads_hint())) +CPPScheduler::CPPScheduler() : _impl(std::make_unique(num_threads_hint())) { } @@ -465,15 +465,15 @@ void CPPScheduler::run_workloads(std::vector &workloads) // This 
is not great because different threads workloads won't run in parallel but at least they // won't interfere each other and deadlock. arm_compute::lock_guard lock(_impl->_run_workloads_mutex); - const unsigned int num_threads_to_use = std::min(_impl->num_threads(), static_cast(workloads.size())); - if(num_threads_to_use < 1) + const unsigned int num_threads_to_use = std::min(_impl->num_threads(), static_cast(workloads.size())); + if (num_threads_to_use < 1) { return; } // Re-adjust the mode if the actual number of threads to use is different from the number of threads created _impl->auto_switch_mode(num_threads_to_use); int num_threads_to_start = 0; - switch(_impl->mode()) + switch (_impl->mode()) { case CPPScheduler::Impl::Mode::Fanout: { @@ -494,22 +494,22 @@ void CPPScheduler::run_workloads(std::vector &workloads) unsigned int t = 0; auto thread_it = _impl->_threads.begin(); // Set num_threads_to_use - 1 workloads to the threads as the remaining 1 is left to the main thread - for(; t < num_threads_to_use - 1; ++t, ++thread_it) + for (; t < num_threads_to_use - 1; ++t, ++thread_it) { info.thread_id = t; thread_it->set_workload(&workloads, feeder, info); } thread_it = _impl->_threads.begin(); - for(int i = 0; i < num_threads_to_start; ++i, ++thread_it) + for (int i = 0; i < num_threads_to_start; ++i, ++thread_it) { thread_it->start(); } - info.thread_id = t; // Set main thread's thread_id + info.thread_id = t; // Set main thread's thread_id std::exception_ptr last_exception = nullptr; #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED try { -#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */ +#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */ process_workloads(workloads, feeder, info); // Main thread processes workloads #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED } @@ -522,7 +522,7 @@ void CPPScheduler::run_workloads(std::vector &workloads) { #endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */ thread_it = _impl->_threads.begin(); - for(unsigned int i = 0; i < num_threads_to_use - 1; ++i, ++thread_it) + for (unsigned int i = 0; i < num_threads_to_use - 1; ++i, ++thread_it) { std::exception_ptr current_exception = thread_it->wait(); if (current_exception) @@ -536,7 +536,7 @@ void CPPScheduler::run_workloads(std::vector &workloads) } #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED } - catch(const std::system_error &e) + catch (const std::system_error &e) { std::cerr << "Caught system_error with code " << e.code() << " meaning " << e.what() << '\n'; } diff --git a/src/runtime/CPP/SingleThreadScheduler.cpp b/src/runtime/CPP/SingleThreadScheduler.cpp index 5890553f6f907d130f66e943871d298c611fd2bc..c46a2731d8a0e71eaae49ef33e0f1743f8f91015 100644 --- a/src/runtime/CPP/SingleThreadScheduler.cpp +++ b/src/runtime/CPP/SingleThreadScheduler.cpp @@ -39,10 +39,10 @@ void SingleThreadScheduler::schedule(ICPPKernel *kernel, const Hints &hints) { const Window &max_window = kernel->window(); - if(hints.split_dimension() != IScheduler::split_dimensions_all) + if (hints.split_dimension() != IScheduler::split_dimensions_all) { const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); - if(num_iterations < 1) + if (num_iterations < 1) { return; } @@ -53,7 +53,10 @@ void SingleThreadScheduler::schedule(ICPPKernel *kernel, const Hints &hints) kernel->run(kernel->window(), info); } -void SingleThreadScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) +void SingleThreadScheduler::schedule_op(ICPPKernel *kernel, + const Hints &hints, + const Window &window, + ITensorPack 
&tensors) { ARM_COMPUTE_UNUSED(hints); ThreadInfo info; @@ -65,7 +68,7 @@ void SingleThreadScheduler::run_workloads(std::vector &workloads) { ThreadInfo info; info.cpu_info = &cpu_info(); - for(auto &wl : workloads) + for (auto &wl : workloads) { wl(info); } diff --git a/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp index dccbe4045d27bece722cd6cbfc46998b2b318ca0..94a1673d59d89c2474e801304cee0f8466e90897 100644 --- a/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp +++ b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp @@ -42,28 +42,37 @@ void dequantize_tensor(const ITensor *input, ITensor *output) Iterator input_it(input, window); Iterator output_it(output, window); - switch(data_type) + switch (data_type) { case DataType::QASYMM8: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast(output_it.ptr()) = dequantize(*reinterpret_cast(input_it.ptr()), qinfo.scale, qinfo.offset); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast(output_it.ptr()) = + dequantize(*reinterpret_cast(input_it.ptr()), qinfo.scale, qinfo.offset); + }, + input_it, output_it); break; case DataType::QASYMM8_SIGNED: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast(output_it.ptr()) = dequantize_qasymm8_signed(*reinterpret_cast(input_it.ptr()), qinfo); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast(output_it.ptr()) = + dequantize_qasymm8_signed(*reinterpret_cast(input_it.ptr()), qinfo); + }, + input_it, output_it); break; case DataType::QASYMM16: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast(output_it.ptr()) = dequantize(*reinterpret_cast(input_it.ptr()), qinfo.scale, qinfo.offset); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast(output_it.ptr()) = + dequantize(*reinterpret_cast(input_it.ptr()), qinfo.scale, qinfo.offset); + }, + input_it, output_it); break; default: ARM_COMPUTE_ERROR("Unsupported data type"); @@ -80,28 +89,37 @@ void quantize_tensor(const ITensor *input, ITensor *output) Iterator input_it(input, window); Iterator output_it(output, window); - switch(data_type) + switch (data_type) { case DataType::QASYMM8: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast(output_it.ptr()) = quantize_qasymm8(*reinterpret_cast(input_it.ptr()), qinfo); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast(output_it.ptr()) = + quantize_qasymm8(*reinterpret_cast(input_it.ptr()), qinfo); + }, + input_it, output_it); break; case DataType::QASYMM8_SIGNED: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast(output_it.ptr()) = quantize_qasymm8_signed(*reinterpret_cast(input_it.ptr()), qinfo); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast(output_it.ptr()) = + quantize_qasymm8_signed(*reinterpret_cast(input_it.ptr()), qinfo); + }, + input_it, output_it); break; case DataType::QASYMM16: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast(output_it.ptr()) = quantize_qasymm16(*reinterpret_cast(input_it.ptr()), qinfo); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + 
*reinterpret_cast(output_it.ptr()) = + quantize_qasymm16(*reinterpret_cast(input_it.ptr()), qinfo); + }, + input_it, output_it); break; default: ARM_COMPUTE_ERROR("Unsupported data type"); @@ -132,14 +150,23 @@ CPPBoxWithNonMaximaSuppressionLimit::CPPBoxWithNonMaximaSuppressionLimit(std::sh { } -void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, - ITensor *scores_out, ITensor *boxes_out, ITensor *classes, ITensor *batch_splits_out, - ITensor *keeps, ITensor *keeps_size, const BoxNMSLimitInfo info) +void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, + const ITensor *boxes_in, + const ITensor *batch_splits_in, + ITensor *scores_out, + ITensor *boxes_out, + ITensor *classes, + ITensor *batch_splits_out, + ITensor *keeps, + ITensor *keeps_size, + const BoxNMSLimitInfo info) { ARM_COMPUTE_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes); - ARM_COMPUTE_LOG_PARAMS(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out, keeps, keeps_size, info); + ARM_COMPUTE_LOG_PARAMS(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out, + keeps, keeps_size, info); - _is_qasymm8 = scores_in->info()->data_type() == DataType::QASYMM8 || scores_in->info()->data_type() == DataType::QASYMM8_SIGNED; + _is_qasymm8 = scores_in->info()->data_type() == DataType::QASYMM8 || + scores_in->info()->data_type() == DataType::QASYMM8_SIGNED; _scores_in = scores_in; _boxes_in = boxes_in; @@ -150,7 +177,7 @@ void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, co _batch_splits_out = batch_splits_out; _keeps = keeps; - if(_is_qasymm8) + if (_is_qasymm8) { // Manage intermediate buffers _memory_group.manage(&_scores_in_f32); @@ -160,7 +187,7 @@ void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, co _memory_group.manage(&_classes_f32); _scores_in_f32.allocator()->init(scores_in->info()->clone()->set_data_type(DataType::F32)); _boxes_in_f32.allocator()->init(boxes_in->info()->clone()->set_data_type(DataType::F32)); - if(batch_splits_in != nullptr) + if (batch_splits_in != nullptr) { _memory_group.manage(&_batch_splits_in_f32); _batch_splits_in_f32.allocator()->init(batch_splits_in->info()->clone()->set_data_type(DataType::F32)); @@ -168,58 +195,70 @@ void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, co _scores_out_f32.allocator()->init(scores_out->info()->clone()->set_data_type(DataType::F32)); _boxes_out_f32.allocator()->init(boxes_out->info()->clone()->set_data_type(DataType::F32)); _classes_f32.allocator()->init(classes->info()->clone()->set_data_type(DataType::F32)); - if(batch_splits_out != nullptr) + if (batch_splits_out != nullptr) { _memory_group.manage(&_batch_splits_out_f32); _batch_splits_out_f32.allocator()->init(batch_splits_out->info()->clone()->set_data_type(DataType::F32)); } - if(keeps != nullptr) + if (keeps != nullptr) { _memory_group.manage(&_keeps_f32); _keeps_f32.allocator()->init(keeps->info()->clone()->set_data_type(DataType::F32)); } - _box_with_nms_limit_kernel.configure(&_scores_in_f32, &_boxes_in_f32, (batch_splits_in != nullptr) ? &_batch_splits_in_f32 : nullptr, + _box_with_nms_limit_kernel.configure(&_scores_in_f32, &_boxes_in_f32, + (batch_splits_in != nullptr) ? &_batch_splits_in_f32 : nullptr, &_scores_out_f32, &_boxes_out_f32, &_classes_f32, - (batch_splits_out != nullptr) ? &_batch_splits_out_f32 : nullptr, (keeps != nullptr) ? 
&_keeps_f32 : nullptr, - keeps_size, info); + (batch_splits_out != nullptr) ? &_batch_splits_out_f32 : nullptr, + (keeps != nullptr) ? &_keeps_f32 : nullptr, keeps_size, info); } else { - _box_with_nms_limit_kernel.configure(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out, keeps, keeps_size, info); + _box_with_nms_limit_kernel.configure(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, + batch_splits_out, keeps, keeps_size, info); } - if(_is_qasymm8) + if (_is_qasymm8) { _scores_in_f32.allocator()->allocate(); _boxes_in_f32.allocator()->allocate(); - if(_batch_splits_in != nullptr) + if (_batch_splits_in != nullptr) { _batch_splits_in_f32.allocator()->allocate(); } _scores_out_f32.allocator()->allocate(); _boxes_out_f32.allocator()->allocate(); _classes_f32.allocator()->allocate(); - if(batch_splits_out != nullptr) + if (batch_splits_out != nullptr) { _batch_splits_out_f32.allocator()->allocate(); } - if(keeps != nullptr) + if (keeps != nullptr) { _keeps_f32.allocator()->allocate(); } } } -Status validate(const ITensorInfo *scores_in, const ITensorInfo *boxes_in, const ITensorInfo *batch_splits_in, const ITensorInfo *scores_out, const ITensorInfo *boxes_out, const ITensorInfo *classes, - const ITensorInfo *batch_splits_out, const ITensorInfo *keeps, const ITensorInfo *keeps_size, const BoxNMSLimitInfo info) +Status validate(const ITensorInfo *scores_in, + const ITensorInfo *boxes_in, + const ITensorInfo *batch_splits_in, + const ITensorInfo *scores_out, + const ITensorInfo *boxes_out, + const ITensorInfo *classes, + const ITensorInfo *batch_splits_out, + const ITensorInfo *keeps, + const ITensorInfo *keeps_size, + const BoxNMSLimitInfo info) { ARM_COMPUTE_UNUSED(batch_splits_in, batch_splits_out, keeps, keeps_size, info); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); - const bool is_qasymm8 = scores_in->data_type() == DataType::QASYMM8 || scores_in->data_type() == DataType::QASYMM8_SIGNED; - if(is_qasymm8) + const bool is_qasymm8 = + scores_in->data_type() == DataType::QASYMM8 || scores_in->data_type() == DataType::QASYMM8_SIGNED; + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(boxes_in, 1, DataType::QASYMM16); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes_in, boxes_out); @@ -237,11 +276,11 @@ void CPPBoxWithNonMaximaSuppressionLimit::run() // Acquire all the temporaries MemoryGroupResourceScope scope_mg(_memory_group); - if(_is_qasymm8) + if (_is_qasymm8) { dequantize_tensor(_scores_in, &_scores_in_f32); dequantize_tensor(_boxes_in, &_boxes_in_f32); - if(_batch_splits_in != nullptr) + if (_batch_splits_in != nullptr) { dequantize_tensor(_batch_splits_in, &_batch_splits_in_f32); } @@ -249,16 +288,16 @@ void CPPBoxWithNonMaximaSuppressionLimit::run() Scheduler::get().schedule(&_box_with_nms_limit_kernel, Window::DimY); - if(_is_qasymm8) + if (_is_qasymm8) { quantize_tensor(&_scores_out_f32, _scores_out); quantize_tensor(&_boxes_out_f32, _boxes_out); quantize_tensor(&_classes_f32, _classes); - if(_batch_splits_out != nullptr) + if (_batch_splits_out != nullptr) { quantize_tensor(&_batch_splits_out_f32, _batch_splits_out); } - if(_keeps != nullptr) + if (_keeps != 
nullptr) { quantize_tensor(&_keeps_f32, _keeps); } diff --git a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp index 41d875eb97973e7608f51467143ce8b785c3c2eb..e6291f973e092c140259dff8e2e7ba2beee228eb 100644 --- a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp +++ b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp @@ -26,9 +26,9 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" -#include "src/core/helpers/AutoConfiguration.h" #include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" #include @@ -36,25 +36,35 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info) +Status validate_arguments(const ITensorInfo *input_loc, + const ITensorInfo *input_conf, + const ITensorInfo *input_priorbox, + const ITensorInfo *output, + DetectionOutputLayerInfo info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_loc, input_conf, input_priorbox, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_loc, 1, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_loc, input_conf, input_priorbox); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_loc->num_dimensions() > 2, "The location input tensor should be [C1, N]."); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_conf->num_dimensions() > 2, "The location input tensor should be [C2, N]."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_priorbox->num_dimensions() > 3, "The priorbox input tensor should be [C3, 2, N]."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_priorbox->num_dimensions() > 3, + "The priorbox input tensor should be [C3, 2, N]."); ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.eta() <= 0.f && info.eta() > 1.f, "Eta should be between 0 and 1"); const int num_priors = input_priorbox->tensor_shape()[0] / 4; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast((num_priors * info.num_loc_classes() * 4)) != input_loc->tensor_shape()[0], "Number of priors must match number of location predictions."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast((num_priors * info.num_classes())) != input_conf->tensor_shape()[0], "Number of priors must match number of confidence predictions."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast((num_priors * info.num_loc_classes() * 4)) != + input_loc->tensor_shape()[0], + "Number of priors must match number of location predictions."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast((num_priors * info.num_classes())) != + input_conf->tensor_shape()[0], + "Number of priors must match number of confidence predictions."); // Validate configured output - if(output->total_size() != 0) + if (output->total_size() != 0) { - const unsigned int max_size = info.keep_top_k() * (input_loc->num_dimensions() > 1 ? input_loc->dimension(1) : 1); + const unsigned int max_size = + info.keep_top_k() * (input_loc->num_dimensions() > 1 ? input_loc->dimension(1) : 1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), TensorShape(7U, max_size)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_loc, output); } @@ -65,8 +75,7 @@ Status validate_arguments(const ITensorInfo *input_loc, const ITensorInfo *input /** Function used to sort pair in descend order based on the score (first) value. 
*/ template -bool SortScorePairDescend(const std::pair &pair1, - const std::pair &pair2) +bool SortScorePairDescend(const std::pair &pair1, const std::pair &pair2) { return pair1.first > pair2.first; } @@ -82,16 +91,19 @@ bool SortScorePairDescend(const std::pair &pair1, * @param[out] all_location_predictions All the location predictions. * */ -void retrieve_all_loc_predictions(const ITensor *input_loc, const int num, - const int num_priors, const int num_loc_classes, - const bool share_location, std::vector &all_location_predictions) +void retrieve_all_loc_predictions(const ITensor *input_loc, + const int num, + const int num_priors, + const int num_loc_classes, + const bool share_location, + std::vector &all_location_predictions) { - for(int i = 0; i < num; ++i) + for (int i = 0; i < num; ++i) { - for(int c = 0; c < num_loc_classes; ++c) + for (int c = 0; c < num_loc_classes; ++c) { int label = share_location ? -1 : c; - if(all_location_predictions[i].find(label) == all_location_predictions[i].end()) + if (all_location_predictions[i].find(label) == all_location_predictions[i].end()) { all_location_predictions[i][label].resize(num_priors); } @@ -102,19 +114,23 @@ void retrieve_all_loc_predictions(const ITensor *input_loc, const int num, } } } - for(int i = 0; i < num; ++i) + for (int i = 0; i < num; ++i) { - for(int p = 0; p < num_priors; ++p) + for (int p = 0; p < num_priors; ++p) { - for(int c = 0; c < num_loc_classes; ++c) + for (int c = 0; c < num_loc_classes; ++c) { const int label = share_location ? -1 : c; const int base_ptr = i * num_priors * num_loc_classes * 4 + p * num_loc_classes * 4 + c * 4; //xmin, ymin, xmax, ymax - all_location_predictions[i][label][p][0] = *reinterpret_cast(input_loc->ptr_to_element(Coordinates(base_ptr))); - all_location_predictions[i][label][p][1] = *reinterpret_cast(input_loc->ptr_to_element(Coordinates(base_ptr + 1))); - all_location_predictions[i][label][p][2] = *reinterpret_cast(input_loc->ptr_to_element(Coordinates(base_ptr + 2))); - all_location_predictions[i][label][p][3] = *reinterpret_cast(input_loc->ptr_to_element(Coordinates(base_ptr + 3))); + all_location_predictions[i][label][p][0] = + *reinterpret_cast(input_loc->ptr_to_element(Coordinates(base_ptr))); + all_location_predictions[i][label][p][1] = + *reinterpret_cast(input_loc->ptr_to_element(Coordinates(base_ptr + 1))); + all_location_predictions[i][label][p][2] = + *reinterpret_cast(input_loc->ptr_to_element(Coordinates(base_ptr + 2))); + all_location_predictions[i][label][p][3] = + *reinterpret_cast(input_loc->ptr_to_element(Coordinates(base_ptr + 3))); } } } @@ -130,26 +146,28 @@ void retrieve_all_loc_predictions(const ITensor *input_loc, const int num, * @param[out] all_location_predictions All the location predictions. 
* */ -void retrieve_all_conf_scores(const ITensor *input_conf, const int num, - const int num_priors, const int num_classes, +void retrieve_all_conf_scores(const ITensor *input_conf, + const int num, + const int num_priors, + const int num_classes, std::vector>> &all_confidence_scores) { std::vector tmp_buffer; tmp_buffer.resize(num * num_priors * num_classes); - for(int i = 0; i < num; ++i) + for (int i = 0; i < num; ++i) { - for(int c = 0; c < num_classes; ++c) + for (int c = 0; c < num_classes; ++c) { - for(int p = 0; p < num_priors; ++p) + for (int p = 0; p < num_priors; ++p) { - tmp_buffer[i * num_classes * num_priors + c * num_priors + p] = - *reinterpret_cast(input_conf->ptr_to_element(Coordinates(i * num_classes * num_priors + p * num_classes + c))); + tmp_buffer[i * num_classes * num_priors + c * num_priors + p] = *reinterpret_cast( + input_conf->ptr_to_element(Coordinates(i * num_classes * num_priors + p * num_classes + c))); } } } - for(int i = 0; i < num; ++i) + for (int i = 0; i < num; ++i) { - for(int c = 0; c < num_classes; ++c) + for (int c = 0; c < num_classes; ++c) { all_confidence_scores[i][c].resize(num_priors); all_confidence_scores[i][c].assign(&tmp_buffer[i * num_classes * num_priors + c * num_priors], @@ -168,28 +186,23 @@ void retrieve_all_conf_scores(const ITensor *input_conf, const int num, * @param[out] all_location_predictions All the location predictions. * */ -void retrieve_all_priorbox(const ITensor *input_priorbox, - const int num_priors, - std::vector &all_prior_bboxes, +void retrieve_all_priorbox(const ITensor *input_priorbox, + const int num_priors, + std::vector &all_prior_bboxes, std::vector> &all_prior_variances) { - for(int i = 0; i < num_priors; ++i) + for (int i = 0; i < num_priors; ++i) { - all_prior_bboxes[i] = - { - { - *reinterpret_cast(input_priorbox->ptr_to_element(Coordinates(i * 4))), - *reinterpret_cast(input_priorbox->ptr_to_element(Coordinates(i * 4 + 1))), - *reinterpret_cast(input_priorbox->ptr_to_element(Coordinates(i * 4 + 2))), - *reinterpret_cast(input_priorbox->ptr_to_element(Coordinates(i * 4 + 3))) - } - }; + all_prior_bboxes[i] = {{*reinterpret_cast(input_priorbox->ptr_to_element(Coordinates(i * 4))), + *reinterpret_cast(input_priorbox->ptr_to_element(Coordinates(i * 4 + 1))), + *reinterpret_cast(input_priorbox->ptr_to_element(Coordinates(i * 4 + 2))), + *reinterpret_cast(input_priorbox->ptr_to_element(Coordinates(i * 4 + 3)))}}; } - std::array var({ { 0, 0, 0, 0 } }); - for(int i = 0; i < num_priors; ++i) + std::array var({{0, 0, 0, 0}}); + for (int i = 0; i < num_priors; ++i) { - for(int j = 0; j < 4; ++j) + for (int j = 0; j < 4; ++j) { var[j] = *reinterpret_cast(input_priorbox->ptr_to_element(Coordinates((num_priors + i) * 4 + j))); } @@ -208,13 +221,17 @@ void retrieve_all_priorbox(const ITensor *input_priorbox, * @param[out] decode_bbox The decoded bboxes. * */ -void DecodeBBox(const BBox &prior_bbox, const std::array &prior_variance, - const DetectionOutputLayerCodeType code_type, const bool variance_encoded_in_target, - const bool clip_bbox, const BBox &bbox, BBox &decode_bbox) +void DecodeBBox(const BBox &prior_bbox, + const std::array &prior_variance, + const DetectionOutputLayerCodeType code_type, + const bool variance_encoded_in_target, + const bool clip_bbox, + const BBox &bbox, + BBox &decode_bbox) { // if the variance is encoded in target, we simply need to add the offset predictions // otherwise we need to scale the offset accordingly. 
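    // Illustrative sketch of the decode performed below; the shorthand names here (b = bbox,
    // v = prior_variance, enc = variance_encoded_in_target, pw/ph = prior width/height,
    // pcx = prior centre x) are hypothetical and are not identifiers from this patch.
    // For the centre-size coding the computation reduces to:
    //   cx = (enc ? b[0] : v[0] * b[0]) * pw + pcx;
    //   w  = std::exp(enc ? b[2] : v[2] * b[2]) * pw;   // cy and h are analogous
    //   xmin = cx - w / 2.f;                            // and symmetrically for the remaining corners
    // The corner-based codings instead add the (optionally variance-scaled) offsets directly to the
    // prior corners, additionally scaled by pw/ph in the corner-size variant.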
- switch(code_type) + switch (code_type) { case DetectionOutputLayerCodeType::CORNER: { @@ -237,10 +254,14 @@ void DecodeBBox(const BBox &prior_bbox, const std::array &prior_varian const float prior_center_x = (prior_bbox[0] + prior_bbox[2]) / 2.; const float prior_center_y = (prior_bbox[1] + prior_bbox[3]) / 2.; - const float decode_bbox_center_x = (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width + prior_center_x; - const float decode_bbox_center_y = (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height + prior_center_y; - const float decode_bbox_width = (variance_encoded_in_target ? std::exp(bbox[2]) : std::exp(prior_variance[2] * bbox[2])) * prior_width; - const float decode_bbox_height = (variance_encoded_in_target ? std::exp(bbox[3]) : std::exp(prior_variance[3] * bbox[3])) * prior_height; + const float decode_bbox_center_x = + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width + prior_center_x; + const float decode_bbox_center_y = + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height + prior_center_y; + const float decode_bbox_width = + (variance_encoded_in_target ? std::exp(bbox[2]) : std::exp(prior_variance[2] * bbox[2])) * prior_width; + const float decode_bbox_height = + (variance_encoded_in_target ? std::exp(bbox[3]) : std::exp(prior_variance[3] * bbox[3])) * prior_height; decode_bbox[0] = (decode_bbox_center_x - decode_bbox_width / 2.f); decode_bbox[1] = (decode_bbox_center_y - decode_bbox_height / 2.f); @@ -258,10 +279,14 @@ void DecodeBBox(const BBox &prior_bbox, const std::array &prior_varian ARM_COMPUTE_ERROR_ON(prior_width <= 0.f); ARM_COMPUTE_ERROR_ON(prior_height <= 0.f); - decode_bbox[0] = prior_bbox[0] + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width; - decode_bbox[1] = prior_bbox[1] + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height; - decode_bbox[2] = prior_bbox[2] + (variance_encoded_in_target ? bbox[2] : prior_variance[2] * bbox[2]) * prior_width; - decode_bbox[3] = prior_bbox[3] + (variance_encoded_in_target ? bbox[3] : prior_variance[3] * bbox[3]) * prior_height; + decode_bbox[0] = + prior_bbox[0] + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width; + decode_bbox[1] = + prior_bbox[1] + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height; + decode_bbox[2] = + prior_bbox[2] + (variance_encoded_in_target ? bbox[2] : prior_variance[2] * bbox[2]) * prior_width; + decode_bbox[3] = + prior_bbox[3] + (variance_encoded_in_target ? bbox[3] : prior_variance[3] * bbox[3]) * prior_height; break; } @@ -269,9 +294,9 @@ void DecodeBBox(const BBox &prior_bbox, const std::array &prior_varian ARM_COMPUTE_ERROR("Unsupported Detection Output Code Type."); } - if(clip_bbox) + if (clip_bbox) { - for(auto &d_bbox : decode_bbox) + for (auto &d_bbox : decode_bbox) { d_bbox = utility::clamp(d_bbox, 0.f, 1.f); } @@ -289,10 +314,13 @@ void DecodeBBox(const BBox &prior_bbox, const std::array &prior_varian * @param[out] indices The kept indices of bboxes after nms. 
* */ -void ApplyNMSFast(const std::vector &bboxes, - const std::vector &scores, const float score_threshold, - const float nms_threshold, const float eta, const int top_k, - std::vector &indices) +void ApplyNMSFast(const std::vector &bboxes, + const std::vector &scores, + const float score_threshold, + const float nms_threshold, + const float eta, + const int top_k, + std::vector &indices) { ARM_COMPUTE_ERROR_ON_MSG(bboxes.size() != scores.size(), "bboxes and scores have different size."); @@ -300,9 +328,9 @@ void ApplyNMSFast(const std::vector &bboxes, std::list> score_index_vec; // Generate index score pairs. - for(size_t i = 0; i < scores.size(); ++i) + for (size_t i = 0; i < scores.size(); ++i) { - if(scores[i] > score_threshold) + if (scores[i] > score_threshold) { score_index_vec.emplace_back(std::make_pair(scores[i], i)); } @@ -313,7 +341,7 @@ void ApplyNMSFast(const std::vector &bboxes, // Keep top_k scores if needed. const int score_index_vec_size = score_index_vec.size(); - if(top_k > -1 && top_k < score_index_vec_size) + if (top_k > -1 && top_k < score_index_vec_size) { score_index_vec.resize(top_k); } @@ -322,46 +350,45 @@ void ApplyNMSFast(const std::vector &bboxes, float adaptive_threshold = nms_threshold; indices.clear(); - while(!score_index_vec.empty()) + while (!score_index_vec.empty()) { const int idx = score_index_vec.front().second; bool keep = true; - for(int kept_idx : indices) + for (int kept_idx : indices) { - if(keep) + if (keep) { // Compute the jaccard (intersection over union IoU) overlap between two bboxes. - BBox intersect_bbox = std::array({ 0, 0, 0, 0 }); - if(bboxes[kept_idx][0] > bboxes[idx][2] || bboxes[kept_idx][2] < bboxes[idx][0] || bboxes[kept_idx][1] > bboxes[idx][3] || bboxes[kept_idx][3] < bboxes[idx][1]) + BBox intersect_bbox = std::array({0, 0, 0, 0}); + if (bboxes[kept_idx][0] > bboxes[idx][2] || bboxes[kept_idx][2] < bboxes[idx][0] || + bboxes[kept_idx][1] > bboxes[idx][3] || bboxes[kept_idx][3] < bboxes[idx][1]) { - intersect_bbox = std::array({ { 0, 0, 0, 0 } }); + intersect_bbox = std::array({{0, 0, 0, 0}}); } else { - intersect_bbox = std::array({ { - std::max(bboxes[idx][0], bboxes[kept_idx][0]), - std::max(bboxes[idx][1], bboxes[kept_idx][1]), - std::min(bboxes[idx][2], bboxes[kept_idx][2]), - std::min(bboxes[idx][3], bboxes[kept_idx][3]) - } - }); + intersect_bbox = std::array( + {{std::max(bboxes[idx][0], bboxes[kept_idx][0]), std::max(bboxes[idx][1], bboxes[kept_idx][1]), + std::min(bboxes[idx][2], bboxes[kept_idx][2]), + std::min(bboxes[idx][3], bboxes[kept_idx][3])}}); } float intersect_width = intersect_bbox[2] - intersect_bbox[0]; float intersect_height = intersect_bbox[3] - intersect_bbox[1]; float overlap = 0.f; - if(intersect_width > 0 && intersect_height > 0) + if (intersect_width > 0 && intersect_height > 0) { float intersect_size = intersect_width * intersect_height; - float bbox1_size = (bboxes[idx][2] < bboxes[idx][0] - || bboxes[idx][3] < bboxes[idx][1]) ? - 0.f : - (bboxes[idx][2] - bboxes[idx][0]) * (bboxes[idx][3] - bboxes[idx][1]); //BBoxSize(bboxes[idx]); - float bbox2_size = (bboxes[kept_idx][2] < bboxes[kept_idx][0] - || bboxes[kept_idx][3] < bboxes[kept_idx][1]) ? - 0.f : - (bboxes[kept_idx][2] - bboxes[kept_idx][0]) * (bboxes[kept_idx][3] - bboxes[kept_idx][1]); // BBoxSize(bboxes[kept_idx]); + float bbox1_size = (bboxes[idx][2] < bboxes[idx][0] || bboxes[idx][3] < bboxes[idx][1]) + ? 
0.f + : (bboxes[idx][2] - bboxes[idx][0]) * + (bboxes[idx][3] - bboxes[idx][1]); //BBoxSize(bboxes[idx]); + float bbox2_size = + (bboxes[kept_idx][2] < bboxes[kept_idx][0] || bboxes[kept_idx][3] < bboxes[kept_idx][1]) + ? 0.f + : (bboxes[kept_idx][2] - bboxes[kept_idx][0]) * + (bboxes[kept_idx][3] - bboxes[kept_idx][1]); // BBoxSize(bboxes[kept_idx]); overlap = intersect_size / (bbox1_size + bbox2_size - intersect_size); } keep = (overlap <= adaptive_threshold); @@ -371,12 +398,12 @@ void ApplyNMSFast(const std::vector &bboxes, break; } } - if(keep) + if (keep) { indices.push_back(idx); } score_index_vec.erase(score_index_vec.begin()); - if(keep && eta < 1.f && adaptive_threshold > 0.5f) + if (keep && eta < 1.f && adaptive_threshold > 0.5f) { adaptive_threshold *= eta; } @@ -385,13 +412,27 @@ void ApplyNMSFast(const std::vector &bboxes, } // namespace CPPDetectionOutputLayer::CPPDetectionOutputLayer() - : _input_loc(nullptr), _input_conf(nullptr), _input_priorbox(nullptr), _output(nullptr), _info(), _num_priors(), _num(), _all_location_predictions(), _all_confidence_scores(), _all_prior_bboxes(), - _all_prior_variances(), _all_decode_bboxes(), _all_indices() + : _input_loc(nullptr), + _input_conf(nullptr), + _input_priorbox(nullptr), + _output(nullptr), + _info(), + _num_priors(), + _num(), + _all_location_predictions(), + _all_confidence_scores(), + _all_prior_bboxes(), + _all_prior_variances(), + _all_decode_bboxes(), + _all_indices() { } -void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor *input_conf, const ITensor *input_priorbox, - ITensor *output, DetectionOutputLayerInfo info) +void CPPDetectionOutputLayer::configure(const ITensor *input_loc, + const ITensor *input_conf, + const ITensor *input_priorbox, + ITensor *output, + DetectionOutputLayerInfo info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input_loc, input_conf, input_priorbox, output); ARM_COMPUTE_LOG_PARAMS(input_loc, input_conf, input_priorbox, output, info); @@ -400,11 +441,13 @@ void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor // Since the number of bboxes to kept is unknown before nms, the shape is set to the maximum // The maximum is keep_top_k * input_loc_size[1] // Each row is a 7 dimension std::vector, which stores [image_id, label, confidence, xmin, ymin, xmax, ymax] - const unsigned int max_size = info.keep_top_k() * (input_loc->info()->num_dimensions() > 1 ? input_loc->info()->dimension(1) : 1); + const unsigned int max_size = + info.keep_top_k() * (input_loc->info()->num_dimensions() > 1 ? input_loc->info()->dimension(1) : 1); auto_init_if_empty(*output->info(), input_loc->info()->clone()->set_tensor_shape(TensorShape(7U, max_size))); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_loc->info(), input_conf->info(), input_priorbox->info(), output->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input_loc->info(), input_conf->info(), input_priorbox->info(), output->info(), info)); _input_loc = input_loc; _input_conf = input_conf; @@ -420,12 +463,12 @@ void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor _all_prior_variances.resize(_num_priors); _all_decode_bboxes.resize(_num); - for(int i = 0; i < _num; ++i) + for (int i = 0; i < _num; ++i) { - for(int c = 0; c < _info.num_loc_classes(); ++c) + for (int c = 0; c < _info.num_loc_classes(); ++c) { const int label = _info.share_location() ? 
-1 : c; - if(label == _info.background_label_id()) + if (label == _info.background_label_id()) { // Ignore background class. continue; @@ -440,7 +483,11 @@ void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); } -Status CPPDetectionOutputLayer::validate(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info) +Status CPPDetectionOutputLayer::validate(const ITensorInfo *input_loc, + const ITensorInfo *input_conf, + const ITensorInfo *input_priorbox, + const ITensorInfo *output, + DetectionOutputLayerInfo info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_loc, input_conf, input_priorbox, output, info)); return Status{}; @@ -449,7 +496,8 @@ Status CPPDetectionOutputLayer::validate(const ITensorInfo *input_loc, const ITe void CPPDetectionOutputLayer::run() { // Retrieve all location predictions. - retrieve_all_loc_predictions(_input_loc, _num, _num_priors, _info.num_loc_classes(), _info.share_location(), _all_location_predictions); + retrieve_all_loc_predictions(_input_loc, _num, _num_priors, _info.num_loc_classes(), _info.share_location(), + _all_location_predictions); // Retrieve all confidences. retrieve_all_conf_scores(_input_conf, _num, _num_priors, _info.num_classes(), _all_confidence_scores); @@ -459,75 +507,79 @@ void CPPDetectionOutputLayer::run() // Decode all loc predictions to bboxes const bool clip_bbox = false; - for(int i = 0; i < _num; ++i) + for (int i = 0; i < _num; ++i) { - for(int c = 0; c < _info.num_loc_classes(); ++c) + for (int c = 0; c < _info.num_loc_classes(); ++c) { const int label = _info.share_location() ? -1 : c; - if(label == _info.background_label_id()) + if (label == _info.background_label_id()) { // Ignore background class. continue; } - ARM_COMPUTE_ERROR_ON_MSG_VAR(_all_location_predictions[i].find(label) == _all_location_predictions[i].end(), "Could not find location predictions for label %d.", label); + ARM_COMPUTE_ERROR_ON_MSG_VAR(_all_location_predictions[i].find(label) == _all_location_predictions[i].end(), + "Could not find location predictions for label %d.", label); const std::vector &label_loc_preds = _all_location_predictions[i].find(label)->second; const int num_bboxes = _all_prior_bboxes.size(); ARM_COMPUTE_ERROR_ON(_all_prior_variances[i].size() != 4); - for(int j = 0; j < num_bboxes; ++j) + for (int j = 0; j < num_bboxes; ++j) { - DecodeBBox(_all_prior_bboxes[j], _all_prior_variances[j], _info.code_type(), _info.variance_encoded_in_target(), clip_bbox, label_loc_preds[j], _all_decode_bboxes[i][label][j]); + DecodeBBox(_all_prior_bboxes[j], _all_prior_variances[j], _info.code_type(), + _info.variance_encoded_in_target(), clip_bbox, label_loc_preds[j], + _all_decode_bboxes[i][label][j]); } } } int num_kept = 0; - for(int i = 0; i < _num; ++i) + for (int i = 0; i < _num; ++i) { - const LabelBBox &decode_bboxes = _all_decode_bboxes[i]; - const std::map> &conf_scores = _all_confidence_scores[i]; + const LabelBBox &decode_bboxes = _all_decode_bboxes[i]; + const std::map> &conf_scores = _all_confidence_scores[i]; std::map> indices; - int num_det = 0; - for(int c = 0; c < _info.num_classes(); ++c) + int num_det = 0; + for (int c = 0; c < _info.num_classes(); ++c) { - if(c == _info.background_label_id()) + if (c == _info.background_label_id()) { // Ignore background class continue; } const int label = _info.share_location() ? 
-1 : c; - if(conf_scores.find(c) == conf_scores.end() || decode_bboxes.find(label) == decode_bboxes.end()) + if (conf_scores.find(c) == conf_scores.end() || decode_bboxes.find(label) == decode_bboxes.end()) { ARM_COMPUTE_ERROR_VAR("Could not find predictions for label %d.", label); } const std::vector &scores = conf_scores.find(c)->second; - const std::vector &bboxes = decode_bboxes.find(label)->second; + const std::vector &bboxes = decode_bboxes.find(label)->second; - ApplyNMSFast(bboxes, scores, _info.confidence_threshold(), _info.nms_threshold(), _info.eta(), _info.top_k(), indices[c]); + ApplyNMSFast(bboxes, scores, _info.confidence_threshold(), _info.nms_threshold(), _info.eta(), + _info.top_k(), indices[c]); num_det += indices[c].size(); } int num_to_add = 0; - if(_info.keep_top_k() > -1 && num_det > _info.keep_top_k()) + if (_info.keep_top_k() > -1 && num_det > _info.keep_top_k()) { std::vector>> score_index_pairs; - for(auto const &it : indices) + for (auto const &it : indices) { const int label = it.first; const std::vector &label_indices = it.second; - if(conf_scores.find(label) == conf_scores.end()) + if (conf_scores.find(label) == conf_scores.end()) { ARM_COMPUTE_ERROR_VAR("Could not find predictions for label %d.", label); } const std::vector &scores = conf_scores.find(label)->second; - for(auto idx : label_indices) + for (auto idx : label_indices) { ARM_COMPUTE_ERROR_ON(idx > static_cast(scores.size())); score_index_pairs.emplace_back(std::make_pair(scores[idx], std::make_pair(label, idx))); @@ -541,7 +593,7 @@ void CPPDetectionOutputLayer::run() // Store the new indices. std::map> new_indices; - for(auto score_index_pair : score_index_pairs) + for (auto score_index_pair : score_index_pairs) { int label = score_index_pair.second.first; int idx = score_index_pair.second.second; @@ -562,25 +614,25 @@ void CPPDetectionOutputLayer::run() _output->info()->set_valid_region(ValidRegion(Coordinates(0, 0), TensorShape(7, num_kept))); int count = 0; - for(int i = 0; i < _num; ++i) + for (int i = 0; i < _num; ++i) { - const std::map> &conf_scores = _all_confidence_scores[i]; - const LabelBBox &decode_bboxes = _all_decode_bboxes[i]; - for(auto &it : _all_indices[i]) + const std::map> &conf_scores = _all_confidence_scores[i]; + const LabelBBox &decode_bboxes = _all_decode_bboxes[i]; + for (auto &it : _all_indices[i]) { const int label = it.first; const std::vector &scores = conf_scores.find(label)->second; const int loc_label = _info.share_location() ? -1 : label; - if(conf_scores.find(label) == conf_scores.end() || decode_bboxes.find(loc_label) == decode_bboxes.end()) + if (conf_scores.find(label) == conf_scores.end() || decode_bboxes.find(loc_label) == decode_bboxes.end()) { // Either if there are no confidence predictions // or there are no location predictions for current label. 
ARM_COMPUTE_ERROR_VAR("Could not find predictions for the label %d.", label); } const std::vector &bboxes = decode_bboxes.find(loc_label)->second; - const std::vector &indices = it.second; + const std::vector &indices = it.second; - for(auto idx : indices) + for (auto idx : indices) { *(reinterpret_cast(_output->ptr_to_element(Coordinates(count * 7)))) = i; *(reinterpret_cast(_output->ptr_to_element(Coordinates(count * 7 + 1)))) = label; diff --git a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp index ecbc49b3c16917ac59084509fc0a89a0e6a410c6..2861d6cacb8bb5ee072e0dcfe774bf7511f77b45 100644 --- a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp +++ b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp @@ -26,9 +26,9 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" -#include "src/core/helpers/AutoConfiguration.h" #include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" #include #include @@ -38,53 +38,76 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors, - ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, - DetectionPostProcessLayerInfo info, const unsigned int kBatchSize, const unsigned int kNumCoordBox) +Status validate_arguments(const ITensorInfo *input_box_encoding, + const ITensorInfo *input_class_score, + const ITensorInfo *input_anchors, + ITensorInfo *output_boxes, + ITensorInfo *output_classes, + ITensorInfo *output_scores, + ITensorInfo *num_detection, + DetectionPostProcessLayerInfo info, + const unsigned int kBatchSize, + const unsigned int kNumCoordBox) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_box_encoding, input_class_score, input_anchors); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_box_encoding, 1, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_box_encoding, 1, DataType::F32, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_box_encoding, input_anchors); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_box_encoding->num_dimensions() > 3, "The location input tensor shape should be [4, N, kBatchSize]."); - if(input_box_encoding->num_dimensions() > 2) + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_box_encoding->num_dimensions() > 3, + "The location input tensor shape should be [4, N, kBatchSize]."); + if (input_box_encoding->num_dimensions() > 2) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_box_encoding->dimension(2) != kBatchSize, "The third dimension of the input box_encoding tensor should be equal to %d.", kBatchSize); + ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR( + input_box_encoding->dimension(2) != kBatchSize, + "The third dimension of the input box_encoding tensor should be equal to %d.", kBatchSize); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_box_encoding->dimension(0) != kNumCoordBox, "The first dimension of the input box_encoding tensor should be equal to %d.", kNumCoordBox); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_class_score->dimension(0) != (info.num_classes() + 1), - "The first dimension of the input class_prediction should be equal to the number of classes plus one."); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_anchors->num_dimensions() > 3, "The anchors input tensor shape 
should be [4, N, kBatchSize]."); - if(input_anchors->num_dimensions() > 2) + ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_box_encoding->dimension(0) != kNumCoordBox, + "The first dimension of the input box_encoding tensor should be equal to %d.", + kNumCoordBox); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + input_class_score->dimension(0) != (info.num_classes() + 1), + "The first dimension of the input class_prediction should be equal to the number of classes plus one."); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_anchors->num_dimensions() > 3, + "The anchors input tensor shape should be [4, N, kBatchSize]."); + if (input_anchors->num_dimensions() > 2) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_anchors->dimension(0) != kNumCoordBox, "The first dimension of the input anchors tensor should be equal to %d.", kNumCoordBox); + ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_anchors->dimension(0) != kNumCoordBox, + "The first dimension of the input anchors tensor should be equal to %d.", + kNumCoordBox); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG((input_box_encoding->dimension(1) != input_class_score->dimension(1)) - || (input_box_encoding->dimension(1) != input_anchors->dimension(1)), + ARM_COMPUTE_RETURN_ERROR_ON_MSG((input_box_encoding->dimension(1) != input_class_score->dimension(1)) || + (input_box_encoding->dimension(1) != input_anchors->dimension(1)), "The second dimension of the inputs should be the same."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_detection->num_dimensions() > 1, "The num_detection output tensor shape should be [M]."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.iou_threshold() <= 0.0f) || (info.iou_threshold() > 1.0f), "The intersection over union should be positive and less than 1."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_classes_per_detection() <= 0, "The number of max classes per detection should be positive."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_detection->num_dimensions() > 1, + "The num_detection output tensor shape should be [M]."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.iou_threshold() <= 0.0f) || (info.iou_threshold() > 1.0f), + "The intersection over union should be positive and less than 1."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_classes_per_detection() <= 0, + "The number of max classes per detection should be positive."); const unsigned int num_detected_boxes = info.max_detections() * info.max_classes_per_detection(); // Validate configured outputs - if(output_boxes->total_size() != 0) + if (output_boxes->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_boxes->tensor_shape(), TensorShape(4U, num_detected_boxes, 1U)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_boxes->tensor_shape(), + TensorShape(4U, num_detected_boxes, 1U)); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_boxes, 1, DataType::F32); } - if(output_classes->total_size() != 0) + if (output_classes->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_classes->tensor_shape(), TensorShape(num_detected_boxes, 1U)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_classes->tensor_shape(), + TensorShape(num_detected_boxes, 1U)); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_classes, 1, DataType::F32); } - if(output_scores->total_size() != 0) + if (output_scores->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_scores->tensor_shape(), TensorShape(num_detected_boxes, 1U)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_scores->tensor_shape(), + 
TensorShape(num_detected_boxes, 1U)); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_scores, 1, DataType::F32); } - if(num_detection->total_size() != 0) + if (num_detection->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(num_detection->tensor_shape(), TensorShape(1U)); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_detection, 1, DataType::F32); @@ -93,15 +116,18 @@ Status validate_arguments(const ITensorInfo *input_box_encoding, const ITensorIn return Status{}; } -inline void DecodeBoxCorner(BBox &box_centersize, BBox &anchor, Iterator &decoded_it, DetectionPostProcessLayerInfo info) +inline void +DecodeBoxCorner(BBox &box_centersize, BBox &anchor, Iterator &decoded_it, DetectionPostProcessLayerInfo info) { const float half_factor = 0.5f; // BBox is equavalent to CenterSizeEncoding [y,x,h,w] const float y_center = box_centersize[0] / info.scale_value_y() * anchor[2] + anchor[0]; const float x_center = box_centersize[1] / info.scale_value_x() * anchor[3] + anchor[1]; - const float half_h = half_factor * static_cast(std::exp(box_centersize[2] / info.scale_value_h())) * anchor[2]; - const float half_w = half_factor * static_cast(std::exp(box_centersize[3] / info.scale_value_w())) * anchor[3]; + const float half_h = + half_factor * static_cast(std::exp(box_centersize[2] / info.scale_value_h())) * anchor[2]; + const float half_w = + half_factor * static_cast(std::exp(box_centersize[3] / info.scale_value_w())) * anchor[3]; // Box Corner encoding boxes are saved as [xmin, ymin, xmax, ymax] auto decoded_ptr = reinterpret_cast(decoded_it.ptr()); @@ -118,12 +144,15 @@ inline void DecodeBoxCorner(BBox &box_centersize, BBox &anchor, Iterator &decode * @param[in] info The detection informations * @param[out] decoded_boxes The decoded bboxes. 
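For reference, DecodeBoxCorner (reformatted just above) applies the familiar center-size decode used by SSD-style detection heads: the encoded [y, x, h, w] values are divided by the per-coordinate scales from DetectionPostProcessLayerInfo, applied to the anchor's center and size, and written back as a [xmin, ymin, xmax, ymax] corner box. A minimal standalone sketch of that arithmetic on plain floats (no tensors or quantization; the function name and signature are illustrative, not ACL API):

    #include <array>
    #include <cmath>

    // box and anchor use the CenterSizeEncoding layout [y_center, x_center, h, w].
    // Returns a corner box [xmin, ymin, xmax, ymax], matching the layout DecodeBoxCorner writes.
    std::array<float, 4> decode_center_size(const std::array<float, 4> &box,
                                            const std::array<float, 4> &anchor,
                                            float scale_y, float scale_x, float scale_h, float scale_w)
    {
        const float y_center = box[0] / scale_y * anchor[2] + anchor[0];
        const float x_center = box[1] / scale_x * anchor[3] + anchor[1];
        const float half_h   = 0.5f * std::exp(box[2] / scale_h) * anchor[2];
        const float half_w   = 0.5f * std::exp(box[3] / scale_w) * anchor[3];
        return {x_center - half_w, y_center - half_h, x_center + half_w, y_center + half_h};
    }

The quantized paths in DecodeCenterSizeBoxes below only add a dequantize step on the box and anchor values before this arithmetic.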
*/ -void DecodeCenterSizeBoxes(const ITensor *input_box_encoding, const ITensor *input_anchors, DetectionPostProcessLayerInfo info, Tensor *decoded_boxes) +void DecodeCenterSizeBoxes(const ITensor *input_box_encoding, + const ITensor *input_anchors, + DetectionPostProcessLayerInfo info, + Tensor *decoded_boxes) { const QuantizationInfo &qi_box = input_box_encoding->info()->quantization_info(); const QuantizationInfo &qi_anchors = input_anchors->info()->quantization_info(); - BBox box_centersize{ {} }; - BBox anchor{ {} }; + BBox box_centersize{{}}; + BBox anchor{{}}; Window win; win.use_tensor_dimensions(input_box_encoding->info()->tensor_shape()); @@ -133,107 +162,155 @@ void DecodeCenterSizeBoxes(const ITensor *input_box_encoding, const ITensor *inp Iterator anchor_it(input_anchors, win); Iterator decoded_it(decoded_boxes, win); - if(input_box_encoding->info()->data_type() == DataType::QASYMM8) + if (input_box_encoding->info()->data_type() == DataType::QASYMM8) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto box_ptr = reinterpret_cast(box_it.ptr()); - const auto anchor_ptr = reinterpret_cast(anchor_it.ptr()); - box_centersize = BBox({ dequantize_qasymm8(*box_ptr, qi_box), dequantize_qasymm8(*(box_ptr + 1), qi_box), - dequantize_qasymm8(*(2 + box_ptr), qi_box), dequantize_qasymm8(*(3 + box_ptr), qi_box) - }); - anchor = BBox({ dequantize_qasymm8(*anchor_ptr, qi_anchors), dequantize_qasymm8(*(anchor_ptr + 1), qi_anchors), - dequantize_qasymm8(*(2 + anchor_ptr), qi_anchors), dequantize_qasymm8(*(3 + anchor_ptr), qi_anchors) - }); - DecodeBoxCorner(box_centersize, anchor, decoded_it, info); - }, - box_it, anchor_it, decoded_it); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto box_ptr = reinterpret_cast(box_it.ptr()); + const auto anchor_ptr = reinterpret_cast(anchor_it.ptr()); + box_centersize = + BBox({dequantize_qasymm8(*box_ptr, qi_box), dequantize_qasymm8(*(box_ptr + 1), qi_box), + dequantize_qasymm8(*(2 + box_ptr), qi_box), dequantize_qasymm8(*(3 + box_ptr), qi_box)}); + anchor = BBox({dequantize_qasymm8(*anchor_ptr, qi_anchors), + dequantize_qasymm8(*(anchor_ptr + 1), qi_anchors), + dequantize_qasymm8(*(2 + anchor_ptr), qi_anchors), + dequantize_qasymm8(*(3 + anchor_ptr), qi_anchors)}); + DecodeBoxCorner(box_centersize, anchor, decoded_it, info); + }, + box_it, anchor_it, decoded_it); } - else if(input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED) + else if (input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto box_ptr = reinterpret_cast(box_it.ptr()); - const auto anchor_ptr = reinterpret_cast(anchor_it.ptr()); - box_centersize = BBox({ dequantize_qasymm8_signed(*box_ptr, qi_box), dequantize_qasymm8_signed(*(box_ptr + 1), qi_box), - dequantize_qasymm8_signed(*(2 + box_ptr), qi_box), dequantize_qasymm8_signed(*(3 + box_ptr), qi_box) - }); - anchor = BBox({ dequantize_qasymm8_signed(*anchor_ptr, qi_anchors), dequantize_qasymm8_signed(*(anchor_ptr + 1), qi_anchors), - dequantize_qasymm8_signed(*(2 + anchor_ptr), qi_anchors), dequantize_qasymm8_signed(*(3 + anchor_ptr), qi_anchors) - }); - DecodeBoxCorner(box_centersize, anchor, decoded_it, info); - }, - box_it, anchor_it, decoded_it); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto box_ptr = reinterpret_cast(box_it.ptr()); + const auto anchor_ptr = reinterpret_cast(anchor_it.ptr()); + box_centersize = BBox({dequantize_qasymm8_signed(*box_ptr, qi_box), + 
dequantize_qasymm8_signed(*(box_ptr + 1), qi_box), + dequantize_qasymm8_signed(*(2 + box_ptr), qi_box), + dequantize_qasymm8_signed(*(3 + box_ptr), qi_box)}); + anchor = BBox({dequantize_qasymm8_signed(*anchor_ptr, qi_anchors), + dequantize_qasymm8_signed(*(anchor_ptr + 1), qi_anchors), + dequantize_qasymm8_signed(*(2 + anchor_ptr), qi_anchors), + dequantize_qasymm8_signed(*(3 + anchor_ptr), qi_anchors)}); + DecodeBoxCorner(box_centersize, anchor, decoded_it, info); + }, + box_it, anchor_it, decoded_it); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto box_ptr = reinterpret_cast(box_it.ptr()); - const auto anchor_ptr = reinterpret_cast(anchor_it.ptr()); - box_centersize = BBox({ *box_ptr, *(box_ptr + 1), *(2 + box_ptr), *(3 + box_ptr) }); - anchor = BBox({ *anchor_ptr, *(anchor_ptr + 1), *(2 + anchor_ptr), *(3 + anchor_ptr) }); - DecodeBoxCorner(box_centersize, anchor, decoded_it, info); - }, - box_it, anchor_it, decoded_it); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto box_ptr = reinterpret_cast(box_it.ptr()); + const auto anchor_ptr = reinterpret_cast(anchor_it.ptr()); + box_centersize = BBox({*box_ptr, *(box_ptr + 1), *(2 + box_ptr), *(3 + box_ptr)}); + anchor = BBox({*anchor_ptr, *(anchor_ptr + 1), *(2 + anchor_ptr), *(3 + anchor_ptr)}); + DecodeBoxCorner(box_centersize, anchor, decoded_it, info); + }, + box_it, anchor_it, decoded_it); } } -void SaveOutputs(const Tensor *decoded_boxes, const std::vector &result_idx_boxes_after_nms, const std::vector &result_scores_after_nms, const std::vector &result_classes_after_nms, - std::vector &sorted_indices, const unsigned int num_output, const unsigned int max_detections, ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, - ITensor *num_detection) +void SaveOutputs(const Tensor *decoded_boxes, + const std::vector &result_idx_boxes_after_nms, + const std::vector &result_scores_after_nms, + const std::vector &result_classes_after_nms, + std::vector &sorted_indices, + const unsigned int num_output, + const unsigned int max_detections, + ITensor *output_boxes, + ITensor *output_classes, + ITensor *output_scores, + ITensor *num_detection) { // xmin,ymin,xmax,ymax -> ymin,xmin,ymax,xmax unsigned int i = 0; - for(; i < num_output; ++i) + for (; i < num_output; ++i) { const unsigned int box_in_idx = result_idx_boxes_after_nms[sorted_indices[i]]; - *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(0, i)))) = *(reinterpret_cast(decoded_boxes->ptr_to_element(Coordinates(1, box_in_idx)))); - *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(1, i)))) = *(reinterpret_cast(decoded_boxes->ptr_to_element(Coordinates(0, box_in_idx)))); - *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(2, i)))) = *(reinterpret_cast(decoded_boxes->ptr_to_element(Coordinates(3, box_in_idx)))); - *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(3, i)))) = *(reinterpret_cast(decoded_boxes->ptr_to_element(Coordinates(2, box_in_idx)))); - *(reinterpret_cast(output_classes->ptr_to_element(Coordinates(i)))) = static_cast(result_classes_after_nms[sorted_indices[i]]); - *(reinterpret_cast(output_scores->ptr_to_element(Coordinates(i)))) = result_scores_after_nms[sorted_indices[i]]; + *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(0, i)))) = + *(reinterpret_cast(decoded_boxes->ptr_to_element(Coordinates(1, box_in_idx)))); + *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(1, i)))) = + *(reinterpret_cast(decoded_boxes->ptr_to_element(Coordinates(0, 
box_in_idx)))); + *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(2, i)))) = + *(reinterpret_cast(decoded_boxes->ptr_to_element(Coordinates(3, box_in_idx)))); + *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(3, i)))) = + *(reinterpret_cast(decoded_boxes->ptr_to_element(Coordinates(2, box_in_idx)))); + *(reinterpret_cast(output_classes->ptr_to_element(Coordinates(i)))) = + static_cast(result_classes_after_nms[sorted_indices[i]]); + *(reinterpret_cast(output_scores->ptr_to_element(Coordinates(i)))) = + result_scores_after_nms[sorted_indices[i]]; } - for(; i < max_detections; ++i) + for (; i < max_detections; ++i) { *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(1, i)))) = 0.0f; *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(0, i)))) = 0.0f; *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(3, i)))) = 0.0f; *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(2, i)))) = 0.0f; - *(reinterpret_cast(output_classes->ptr_to_element(Coordinates(i)))) = 0.0f; - *(reinterpret_cast(output_scores->ptr_to_element(Coordinates(i)))) = 0.0f; + *(reinterpret_cast(output_classes->ptr_to_element(Coordinates(i)))) = 0.0f; + *(reinterpret_cast(output_scores->ptr_to_element(Coordinates(i)))) = 0.0f; } *(reinterpret_cast(num_detection->ptr_to_element(Coordinates(0)))) = num_output; } } // namespace CPPDetectionPostProcessLayer::CPPDetectionPostProcessLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _nms(), _input_box_encoding(nullptr), _input_scores(nullptr), _input_anchors(nullptr), _output_boxes(nullptr), _output_classes(nullptr), - _output_scores(nullptr), _num_detection(nullptr), _info(), _num_boxes(), _num_classes_with_background(), _num_max_detected_boxes(), _dequantize_scores(false), _decoded_boxes(), _decoded_scores(), - _selected_indices(), _class_scores(), _input_scores_to_use(nullptr) + : _memory_group(std::move(memory_manager)), + _nms(), + _input_box_encoding(nullptr), + _input_scores(nullptr), + _input_anchors(nullptr), + _output_boxes(nullptr), + _output_classes(nullptr), + _output_scores(nullptr), + _num_detection(nullptr), + _info(), + _num_boxes(), + _num_classes_with_background(), + _num_max_detected_boxes(), + _dequantize_scores(false), + _decoded_boxes(), + _decoded_scores(), + _selected_indices(), + _class_scores(), + _input_scores_to_use(nullptr) { } -void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, const ITensor *input_scores, - const ITensor *input_anchors, ITensor *output_boxes, ITensor *output_classes, - ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info) +void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, + const ITensor *input_scores, + const ITensor *input_anchors, + ITensor *output_boxes, + ITensor *output_classes, + ITensor *output_scores, + ITensor *num_detection, + DetectionPostProcessLayerInfo info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores); + ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, + output_scores); ARM_COMPUTE_LOG_PARAMS(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores, num_detection, info); _num_max_detected_boxes = info.max_detections() * info.max_classes_per_detection(); - auto_init_if_empty(*output_boxes->info(), TensorInfo(TensorShape(_kNumCoordBox, _num_max_detected_boxes, _kBatchSize), 
1, DataType::F32)); - auto_init_if_empty(*output_classes->info(), TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); - auto_init_if_empty(*output_scores->info(), TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); + auto_init_if_empty(*output_boxes->info(), + TensorInfo(TensorShape(_kNumCoordBox, _num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); + auto_init_if_empty(*output_classes->info(), + TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); + auto_init_if_empty(*output_scores->info(), + TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); auto_init_if_empty(*num_detection->info(), TensorInfo(TensorShape(1U), 1, DataType::F32)); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), output_classes->info(), output_scores->info(), - num_detection->info(), - info, _kBatchSize, _kNumCoordBox)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments( + input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), + output_classes->info(), output_scores->info(), num_detection->info(), info, _kBatchSize, _kNumCoordBox)); _input_box_encoding = input_box_encoding; _input_scores = input_scores; @@ -245,13 +322,24 @@ void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, _info = info; _num_boxes = input_box_encoding->info()->dimension(1); _num_classes_with_background = _input_scores->info()->dimension(0); - _dequantize_scores = (info.dequantize_scores() && is_data_type_quantized(input_box_encoding->info()->data_type())); - - auto_init_if_empty(*_decoded_boxes.info(), TensorInfo(TensorShape(_kNumCoordBox, _input_box_encoding->info()->dimension(1), _kBatchSize), 1, DataType::F32)); - auto_init_if_empty(*_decoded_scores.info(), TensorInfo(TensorShape(_input_scores->info()->dimension(0), _input_scores->info()->dimension(1), _kBatchSize), 1, DataType::F32)); - auto_init_if_empty(*_selected_indices.info(), TensorInfo(TensorShape(info.use_regular_nms() ? info.detection_per_class() : info.max_detections()), 1, DataType::S32)); + _dequantize_scores = (info.dequantize_scores() && is_data_type_quantized(input_box_encoding->info()->data_type())); + + auto_init_if_empty(*_decoded_boxes.info(), + TensorInfo(TensorShape(_kNumCoordBox, _input_box_encoding->info()->dimension(1), _kBatchSize), 1, + DataType::F32)); + auto_init_if_empty( + *_decoded_scores.info(), + TensorInfo(TensorShape(_input_scores->info()->dimension(0), _input_scores->info()->dimension(1), _kBatchSize), + 1, DataType::F32)); + auto_init_if_empty( + *_selected_indices.info(), + TensorInfo(TensorShape(info.use_regular_nms() ? info.detection_per_class() : info.max_detections()), 1, + DataType::S32)); const unsigned int num_classes_per_box = std::min(info.max_classes_per_detection(), info.num_classes()); - auto_init_if_empty(*_class_scores.info(), TensorInfo(info.use_regular_nms() ? TensorShape(_num_boxes) : TensorShape(_num_boxes * num_classes_per_box), 1, DataType::F32)); + auto_init_if_empty( + *_class_scores.info(), + TensorInfo(info.use_regular_nms() ? TensorShape(_num_boxes) : TensorShape(_num_boxes * num_classes_per_box), 1, + DataType::F32)); _input_scores_to_use = _dequantize_scores ? 
&_decoded_scores : _input_scores; @@ -260,7 +348,9 @@ void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, _memory_group.manage(&_decoded_scores); _memory_group.manage(&_selected_indices); _memory_group.manage(&_class_scores); - _nms.configure(&_decoded_boxes, &_class_scores, &_selected_indices, info.use_regular_nms() ? info.detection_per_class() : info.max_detections(), info.nms_score_threshold(), info.iou_threshold()); + _nms.configure(&_decoded_boxes, &_class_scores, &_selected_indices, + info.use_regular_nms() ? info.detection_per_class() : info.max_detections(), + info.nms_score_threshold(), info.iou_threshold()); // Allocate and reserve intermediate tensors and vectors _decoded_boxes.allocator()->allocate(); @@ -269,18 +359,28 @@ void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, _class_scores.allocator()->allocate(); } -Status CPPDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors, - ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, DetectionPostProcessLayerInfo info) +Status CPPDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, + const ITensorInfo *input_class_score, + const ITensorInfo *input_anchors, + ITensorInfo *output_boxes, + ITensorInfo *output_classes, + ITensorInfo *output_scores, + ITensorInfo *num_detection, + DetectionPostProcessLayerInfo info) { - constexpr unsigned int kBatchSize = 1; - constexpr unsigned int kNumCoordBox = 4; - const TensorInfo _decoded_boxes_info = TensorInfo(TensorShape(kNumCoordBox, input_box_encoding->dimension(1)), 1, DataType::F32); - const TensorInfo _decoded_scores_info = TensorInfo(TensorShape(input_box_encoding->dimension(1)), 1, DataType::F32); - const TensorInfo _selected_indices_info = TensorInfo(TensorShape(info.max_detections()), 1, DataType::S32); - - ARM_COMPUTE_RETURN_ON_ERROR(CPPNonMaximumSuppression::validate(&_decoded_boxes_info, &_decoded_scores_info, &_selected_indices_info, info.max_detections(), info.nms_score_threshold(), - info.iou_threshold())); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_box_encoding, input_class_score, input_anchors, output_boxes, output_classes, output_scores, num_detection, info, kBatchSize, kNumCoordBox)); + constexpr unsigned int kBatchSize = 1; + constexpr unsigned int kNumCoordBox = 4; + const TensorInfo _decoded_boxes_info = + TensorInfo(TensorShape(kNumCoordBox, input_box_encoding->dimension(1)), 1, DataType::F32); + const TensorInfo _decoded_scores_info = TensorInfo(TensorShape(input_box_encoding->dimension(1)), 1, DataType::F32); + const TensorInfo _selected_indices_info = TensorInfo(TensorShape(info.max_detections()), 1, DataType::S32); + + ARM_COMPUTE_RETURN_ON_ERROR(CPPNonMaximumSuppression::validate(&_decoded_boxes_info, &_decoded_scores_info, + &_selected_indices_info, info.max_detections(), + info.nms_score_threshold(), info.iou_threshold())); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_box_encoding, input_class_score, input_anchors, output_boxes, + output_classes, output_scores, num_detection, info, kBatchSize, + kNumCoordBox)); return Status{}; } @@ -293,62 +393,69 @@ void CPPDetectionPostProcessLayer::run() DecodeCenterSizeBoxes(_input_box_encoding, _input_anchors, _info, &_decoded_boxes); // Decode scores if necessary - if(_dequantize_scores) + if (_dequantize_scores) { - if(_input_box_encoding->info()->data_type() == 
DataType::QASYMM8) + if (_input_box_encoding->info()->data_type() == DataType::QASYMM8) { - for(unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c) + for (unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c) { - for(unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b) + for (unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b) { *(reinterpret_cast(_decoded_scores.ptr_to_element(Coordinates(idx_c, idx_b)))) = - dequantize_qasymm8(*(reinterpret_cast(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), _input_scores->info()->quantization_info()); + dequantize_qasymm8( + *(reinterpret_cast(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), + _input_scores->info()->quantization_info()); } } } - else if(_input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED) + else if (_input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED) { - for(unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c) + for (unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c) { - for(unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b) + for (unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b) { *(reinterpret_cast(_decoded_scores.ptr_to_element(Coordinates(idx_c, idx_b)))) = - dequantize_qasymm8_signed(*(reinterpret_cast(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), _input_scores->info()->quantization_info()); + dequantize_qasymm8_signed(*(reinterpret_cast( + _input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), + _input_scores->info()->quantization_info()); } } } } // Regular NMS - if(_info.use_regular_nms()) + if (_info.use_regular_nms()) { std::vector result_idx_boxes_after_nms; std::vector result_classes_after_nms; std::vector result_scores_after_nms; std::vector sorted_indices; - for(unsigned int c = 0; c < num_classes; ++c) + for (unsigned int c = 0; c < num_classes; ++c) { // For each boxes get scores of the boxes for the class c - for(unsigned int i = 0; i < _num_boxes; ++i) + for (unsigned int i = 0; i < _num_boxes; ++i) { *(reinterpret_cast(_class_scores.ptr_to_element(Coordinates(i)))) = - *(reinterpret_cast(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, i)))); // i * _num_classes_with_background + c + 1 + *(reinterpret_cast(_input_scores_to_use->ptr_to_element( + Coordinates(c + 1, i)))); // i * _num_classes_with_background + c + 1 } // Run Non-maxima Suppression _nms.run(); - for(unsigned int i = 0; i < _info.detection_per_class(); ++i) + for (unsigned int i = 0; i < _info.detection_per_class(); ++i) { - const auto selected_index = *(reinterpret_cast(_selected_indices.ptr_to_element(Coordinates(i)))); - if(selected_index == -1) + const auto selected_index = + *(reinterpret_cast(_selected_indices.ptr_to_element(Coordinates(i)))); + if (selected_index == -1) { // Nms will return -1 for all the last M-elements not valid break; } result_idx_boxes_after_nms.emplace_back(selected_index); - result_scores_after_nms.emplace_back((reinterpret_cast(_class_scores.buffer()))[selected_index]); + result_scores_after_nms.emplace_back( + (reinterpret_cast(_class_scores.buffer()))[selected_index]); result_classes_after_nms.emplace_back(c); } } @@ -360,49 +467,46 @@ void CPPDetectionPostProcessLayer::run() // Sort selected indices based on result scores sorted_indices.resize(num_selected); std::iota(sorted_indices.begin(), sorted_indices.end(), 0); - std::partial_sort(sorted_indices.data(), - sorted_indices.data() + num_output, + std::partial_sort(sorted_indices.data(), 
sorted_indices.data() + num_output, sorted_indices.data() + num_selected, [&](unsigned int first, unsigned int second) - { - - return result_scores_after_nms[first] > result_scores_after_nms[second]; - }); + { return result_scores_after_nms[first] > result_scores_after_nms[second]; }); - SaveOutputs(&_decoded_boxes, result_idx_boxes_after_nms, result_scores_after_nms, result_classes_after_nms, sorted_indices, - num_output, max_detections, _output_boxes, _output_classes, _output_scores, _num_detection); + SaveOutputs(&_decoded_boxes, result_idx_boxes_after_nms, result_scores_after_nms, result_classes_after_nms, + sorted_indices, num_output, max_detections, _output_boxes, _output_classes, _output_scores, + _num_detection); } // Fast NMS else { - const unsigned int num_classes_per_box = std::min(_info.max_classes_per_detection(), _info.num_classes()); + const unsigned int num_classes_per_box = + std::min(_info.max_classes_per_detection(), _info.num_classes()); std::vector max_scores; std::vector box_indices; std::vector max_score_classes; - for(unsigned int b = 0; b < _num_boxes; ++b) + for (unsigned int b = 0; b < _num_boxes; ++b) { std::vector box_scores; - for(unsigned int c = 0; c < num_classes; ++c) + for (unsigned int c = 0; c < num_classes; ++c) { - box_scores.emplace_back(*(reinterpret_cast(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, b))))); + box_scores.emplace_back( + *(reinterpret_cast(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, b))))); } std::vector max_score_indices; max_score_indices.resize(_info.num_classes()); std::iota(max_score_indices.data(), max_score_indices.data() + _info.num_classes(), 0); - std::partial_sort(max_score_indices.data(), - max_score_indices.data() + num_classes_per_box, + std::partial_sort(max_score_indices.data(), max_score_indices.data() + num_classes_per_box, max_score_indices.data() + num_classes, [&](unsigned int first, unsigned int second) - { - return box_scores[first] > box_scores[second]; - }); + { return box_scores[first] > box_scores[second]; }); - for(unsigned int i = 0; i < num_classes_per_box; ++i) + for (unsigned int i = 0; i < num_classes_per_box; ++i) { - const float score_to_add = box_scores[max_score_indices[i]]; - *(reinterpret_cast(_class_scores.ptr_to_element(Coordinates(b * num_classes_per_box + i)))) = score_to_add; + const float score_to_add = box_scores[max_score_indices[i]]; + *(reinterpret_cast(_class_scores.ptr_to_element(Coordinates(b * num_classes_per_box + i)))) = + score_to_add; max_scores.emplace_back(score_to_add); box_indices.emplace_back(b); max_score_classes.emplace_back(max_score_indices[i]); @@ -412,10 +516,10 @@ void CPPDetectionPostProcessLayer::run() // Run Non-maxima Suppression _nms.run(); std::vector selected_indices; - for(unsigned int i = 0; i < max_detections; ++i) + for (unsigned int i = 0; i < max_detections; ++i) { // NMS returns M valid indices, the not valid tail is filled with -1 - if(*(reinterpret_cast(_selected_indices.ptr_to_element(Coordinates(i)))) == -1) + if (*(reinterpret_cast(_selected_indices.ptr_to_element(Coordinates(i)))) == -1) { // Nms will return -1 for all the last M-elements not valid break; @@ -425,8 +529,8 @@ void CPPDetectionPostProcessLayer::run() // We select the max detection numbers of the highest score of all classes const auto num_output = std::min(_info.max_detections(), selected_indices.size()); - SaveOutputs(&_decoded_boxes, box_indices, max_scores, max_score_classes, selected_indices, - num_output, max_detections, _output_boxes, _output_classes, 
_output_scores, _num_detection); + SaveOutputs(&_decoded_boxes, box_indices, max_scores, max_score_classes, selected_indices, num_output, + max_detections, _output_boxes, _output_classes, _output_scores, _num_detection); } } } // namespace arm_compute diff --git a/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp b/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp index 6d01b127c0c019a3910d07a3c686c12682be78d4..3217742c6b467d0ea02789b418a30ea977a5732c 100644 --- a/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp +++ b/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp @@ -29,9 +29,12 @@ namespace arm_compute { -void CPPNonMaximumSuppression::configure( - const ITensor *bboxes, const ITensor *scores, ITensor *indices, unsigned int max_output_size, - const float score_threshold, const float nms_threshold) +void CPPNonMaximumSuppression::configure(const ITensor *bboxes, + const ITensor *scores, + ITensor *indices, + unsigned int max_output_size, + const float score_threshold, + const float nms_threshold) { ARM_COMPUTE_LOG_PARAMS(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold); @@ -40,10 +43,14 @@ void CPPNonMaximumSuppression::configure( _kernel = std::move(k); } -Status CPPNonMaximumSuppression::validate( - const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *indices, unsigned int max_output_size, - const float score_threshold, const float nms_threshold) +Status CPPNonMaximumSuppression::validate(const ITensorInfo *bboxes, + const ITensorInfo *scores, + const ITensorInfo *indices, + unsigned int max_output_size, + const float score_threshold, + const float nms_threshold) { - return CPPNonMaximumSuppressionKernel::validate(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold); + return CPPNonMaximumSuppressionKernel::validate(bboxes, scores, indices, max_output_size, score_threshold, + nms_threshold); } } // namespace arm_compute diff --git a/src/runtime/CPP/functions/CPPTopKV.cpp b/src/runtime/CPP/functions/CPPTopKV.cpp index 62a74735a2bc7df07b0461f4c5c376288d709d97..3d64def804df02b949d7de6cc361116e9f412cde 100644 --- a/src/runtime/CPP/functions/CPPTopKV.cpp +++ b/src/runtime/CPP/functions/CPPTopKV.cpp @@ -38,7 +38,10 @@ void CPPTopKV::configure(const ITensor *predictions, const ITensor *targets, ITe _kernel = std::move(kernel); } -Status CPPTopKV::validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k) +Status CPPTopKV::validate(const ITensorInfo *predictions, + const ITensorInfo *targets, + ITensorInfo *output, + const unsigned int k) { return CPPTopKVKernel::validate(predictions, targets, output, k); } diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp index 436fd9ca16f71ad339355551ba4723d7f4e5af0f..ecf84abd2c48e3f6733e3da299a272eb0e70e53a 100644 --- a/src/runtime/IScheduler.cpp +++ b/src/runtime/IScheduler.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Log.h" #include "arm_compute/core/Window.h" + #include "src/common/cpuinfo/CpuInfo.h" #include "src/runtime/SchedulerUtils.h" @@ -59,7 +60,7 @@ void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const W ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel"); #ifndef BARE_METAL const Window &max_window = window; - if(hints.split_dimension() == IScheduler::split_dimensions_all) + if (hints.split_dimension() == IScheduler::split_dimensions_all) { /* * if the split dim is size_t max then this signals we 
should parallelise over @@ -73,27 +74,27 @@ void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const W std::tie(m_threads, n_threads) = scheduler_utils::split_2d(this->num_threads(), m, n); std::vector workloads; - for(unsigned int ni = 0; ni != n_threads; ++ni) + for (unsigned int ni = 0; ni != n_threads; ++ni) { - for(unsigned int mi = 0; mi != m_threads; ++mi) + for (unsigned int mi = 0; mi != m_threads; ++mi) { workloads.push_back( - [ni, mi, m_threads, n_threads, &max_window, &kernel](const ThreadInfo & info) - { - //narrow the window to our mi-ni workload - Window win = max_window.split_window(Window::DimX, mi, m_threads) - .split_window(Window::DimY, ni, n_threads); + [ni, mi, m_threads, n_threads, &max_window, &kernel](const ThreadInfo &info) + { + //narrow the window to our mi-ni workload + Window win = max_window.split_window(Window::DimX, mi, m_threads) + .split_window(Window::DimY, ni, n_threads); - win.validate(); + win.validate(); - Window thread_locator; - thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads)); - thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads)); + Window thread_locator; + thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads)); + thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads)); - thread_locator.validate(); + thread_locator.validate(); - kernel->run_nd(win, info, thread_locator); - }); + kernel->run_nd(win, info, thread_locator); + }); } } run_workloads(workloads); @@ -103,16 +104,16 @@ void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const W const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); const unsigned int num_threads = std::min(num_iterations, this->num_threads()); - if(num_iterations == 0) + if (num_iterations == 0) { return; } - if(!kernel->is_parallelisable() || num_threads == 1) + if (!kernel->is_parallelisable() || num_threads == 1) { ThreadInfo info; info.cpu_info = &cpu_info(); - if(tensors.empty()) + if (tensors.empty()) { kernel->run(max_window, info); } @@ -124,14 +125,15 @@ void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const W else { unsigned int num_windows = 0; - switch(hints.strategy()) + switch (hints.strategy()) { case StrategyHint::STATIC: num_windows = num_threads; break; case StrategyHint::DYNAMIC: { - const unsigned int granule_threshold = (hints.threshold() <= 0) ? num_threads : static_cast(hints.threshold()); + const unsigned int granule_threshold = + (hints.threshold() <= 0) ? num_threads : static_cast(hints.threshold()); // Make sure we don't use some windows which are too small as this might create some contention on the ThreadFeeder num_windows = num_iterations > granule_threshold ? 
granule_threshold : num_iterations; break; @@ -143,15 +145,15 @@ void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const W num_windows = adjust_num_of_windows(max_window, hints.split_dimension(), num_windows, *kernel, cpu_info()); std::vector workloads(num_windows); - for(unsigned int t = 0; t < num_windows; ++t) + for (unsigned int t = 0; t < num_windows; ++t) { //Capture 't' by copy, all the other variables by reference: - workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo & info) + workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo &info) { Window win = max_window.split_window(hints.split_dimension(), t, num_windows); win.validate(); - if(tensors.empty()) + if (tensors.empty()) { kernel->run(win, info); } @@ -175,36 +177,43 @@ void IScheduler::run_tagged_workloads(std::vector &workloads, const ch run_workloads(workloads); } -std::size_t IScheduler::adjust_num_of_windows(const Window &window, std::size_t split_dimension, std::size_t init_num_windows, const ICPPKernel &kernel, const CPUInfo &cpu_info) +std::size_t IScheduler::adjust_num_of_windows(const Window &window, + std::size_t split_dimension, + std::size_t init_num_windows, + const ICPPKernel &kernel, + const CPUInfo &cpu_info) { // Mitigation of the narrow split issue, which occurs when the split dimension is too small to split (hence "narrow"). - if(window.num_iterations(split_dimension) < init_num_windows) + if (window.num_iterations(split_dimension) < init_num_windows) { auto recommended_split_dim = Window::DimX; - for(std::size_t dims = Window::DimY; dims <= Window::DimW; ++dims) + for (std::size_t dims = Window::DimY; dims <= Window::DimW; ++dims) { - if(window.num_iterations(recommended_split_dim) < window.num_iterations(dims)) + if (window.num_iterations(recommended_split_dim) < window.num_iterations(dims)) { recommended_split_dim = dims; } } - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("%zu dimension is not a suitable dimension to split the workload. Recommended: %zu recommended_split_dim", split_dimension, - recommended_split_dim); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "%zu dimension is not a suitable dimension to split the workload. Recommended: %zu recommended_split_dim", + split_dimension, recommended_split_dim); } - for(auto t = init_num_windows; t > 0; --t) // Trying the highest number of windows ,init_num_windows, first + for (auto t = init_num_windows; t > 0; --t) // Trying the highest number of windows ,init_num_windows, first { // Try splitting the workload into t, subject to each subworkload size <= mws. 
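As context for the adjust_num_of_windows logic being reformatted here: the loop walks t down from the requested window count and returns the first t for which every sub-window still covers at least the kernel's minimum workload size (mws); if no t qualifies, the workload runs as a single window. A minimal sketch of that selection, where iterations, init and mws stand in for window.num_iterations(split_dimension), init_num_windows and kernel.get_mws(cpu_info, t) (illustrative helper, not ACL API):

    #include <cstddef>
    #include <functional>

    // Largest t <= init such that splitting `iterations` into t windows leaves each
    // window at least mws(t) iterations; 1 when the workload cannot be split at all.
    // mws(t) is assumed to return at least 1.
    std::size_t pick_num_windows(std::size_t iterations,
                                 std::size_t init,
                                 const std::function<std::size_t(std::size_t)> &mws)
    {
        for (std::size_t t = init; t > 0; --t)
        {
            if (iterations / mws(t) >= t)
            {
                return t;
            }
        }
        return 1;
    }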
- if((window.num_iterations(split_dimension) / kernel.get_mws(cpu_info, t)) >= t) + if ((window.num_iterations(split_dimension) / kernel.get_mws(cpu_info, t)) >= t) { - if(t != init_num_windows) + if (t != init_num_windows) { - ARM_COMPUTE_LOG_INFO_MSG_CORE("The scheduler is using a different thread count than the one assigned by the user."); + ARM_COMPUTE_LOG_INFO_MSG_CORE( + "The scheduler is using a different thread count than the one assigned by the user."); } return t; } } - ARM_COMPUTE_LOG_INFO_MSG_CORE("The scheduler is using single thread instead of the thread count assigned by the user."); + ARM_COMPUTE_LOG_INFO_MSG_CORE( + "The scheduler is using single thread instead of the thread count assigned by the user."); return 1; // If the workload is so small that it can't be split, we should run a single thread } diff --git a/src/runtime/ISimpleLifetimeManager.cpp b/src/runtime/ISimpleLifetimeManager.cpp index a6bc950644d81ea707719929c72a8d7e60f11834..8e5b62ae7d5742481544e0e2bc015ddf1d7f3f19 100644 --- a/src/runtime/ISimpleLifetimeManager.cpp +++ b/src/runtime/ISimpleLifetimeManager.cpp @@ -43,7 +43,7 @@ ISimpleLifetimeManager::ISimpleLifetimeManager() void ISimpleLifetimeManager::register_group(IMemoryGroup *group) { - if(_active_group == nullptr) + if (_active_group == nullptr) { ARM_COMPUTE_ERROR_ON(group == nullptr); _active_group = group; @@ -52,12 +52,12 @@ void ISimpleLifetimeManager::register_group(IMemoryGroup *group) bool ISimpleLifetimeManager::release_group(IMemoryGroup *group) { - if(group == nullptr) + if (group == nullptr) { return false; } const bool status = bool(_finalized_groups.erase(group)); - if(status) + if (status) { group->mappings().clear(); } @@ -67,12 +67,13 @@ bool ISimpleLifetimeManager::release_group(IMemoryGroup *group) void ISimpleLifetimeManager::start_lifetime(void *obj) { ARM_COMPUTE_ERROR_ON(obj == nullptr); - ARM_COMPUTE_ERROR_ON_MSG(_active_elements.find(obj) != std::end(_active_elements), "Memory object is already registered!"); + ARM_COMPUTE_ERROR_ON_MSG(_active_elements.find(obj) != std::end(_active_elements), + "Memory object is already registered!"); // Check if there is a free blob - if(_free_blobs.empty()) + if (_free_blobs.empty()) { - _occupied_blobs.emplace_front(Blob{ obj, 0, 0, { obj } }); + _occupied_blobs.emplace_front(Blob{obj, 0, 0, {obj}}); } else { @@ -100,10 +101,8 @@ void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t el.status = true; // Find object in the occupied lists - auto occupied_blob_it = std::find_if(std::begin(_occupied_blobs), std::end(_occupied_blobs), [&obj](const Blob & b) - { - return obj == b.id; - }); + auto occupied_blob_it = std::find_if(std::begin(_occupied_blobs), std::end(_occupied_blobs), + [&obj](const Blob &b) { return obj == b.id; }); ARM_COMPUTE_ERROR_ON(occupied_blob_it == std::end(_occupied_blobs)); // Update occupied blob and return as free @@ -114,7 +113,7 @@ void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t _free_blobs.splice(std::begin(_free_blobs), _occupied_blobs, occupied_blob_it); // Check if all objects are finalized and reset active group - if(are_all_finalized()) + if (are_all_finalized()) { ARM_COMPUTE_ERROR_ON(!_occupied_blobs.empty()); @@ -133,9 +132,7 @@ void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t bool ISimpleLifetimeManager::are_all_finalized() const { - return !std::any_of(std::begin(_active_elements), std::end(_active_elements), [](const std::pair &e) - { - return !e.second.status; - 
}); + return !std::any_of(std::begin(_active_elements), std::end(_active_elements), + [](const std::pair &e) { return !e.second.status; }); } } // namespace arm_compute diff --git a/src/runtime/IWeightsManager.cpp b/src/runtime/IWeightsManager.cpp index 373c50c73d2a998f3c840b7dba8789cf39ac713e..96287dcc49034564b66fe6070f5b01a66428983c 100644 --- a/src/runtime/IWeightsManager.cpp +++ b/src/runtime/IWeightsManager.cpp @@ -25,14 +25,13 @@ namespace arm_compute { -IWeightsManager::IWeightsManager() - : _managed_weights(), _managed_counter(), _managed_weights_parents() +IWeightsManager::IWeightsManager() : _managed_weights(), _managed_counter(), _managed_weights_parents() { } void IWeightsManager::manage(const ITensor *weights, ITransformWeights *parent) { - if(!are_weights_managed(weights)) + if (!are_weights_managed(weights)) { _managed_weights[weights]; _managed_counter[weights]; @@ -44,9 +43,9 @@ void IWeightsManager::manage(const ITensor *weights, ITransformWeights *parent) // In case the weights are an output of a previous reshape function // store the parent's link - if(parent != nullptr) + if (parent != nullptr) { - if(_managed_weights_parents.find(weights) == _managed_weights_parents.end()) + if (_managed_weights_parents.find(weights) == _managed_weights_parents.end()) { _managed_weights_parents[weights] = parent; } @@ -59,13 +58,13 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights // Find if I have the same weights with weights transform. If I do, don't run the reshape auto item = _managed_weights.find(weights); - bool perform_run{ true }; - ITensor *weights_tensor{ nullptr }; + bool perform_run{true}; + ITensor *weights_tensor{nullptr}; // Check if I already have the requested transform and I have run the reshape function - for(auto it : item->second) + for (auto it : item->second) { - if(it->is_reshape_run() && (it->uid() == weights_transform->uid())) + if (it->is_reshape_run() && (it->uid() == weights_transform->uid())) { weights_tensor = it->get_weights(); perform_run = false; @@ -73,7 +72,7 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights } } - if(perform_run) + if (perform_run) { weights_transform->run(); weights_tensor = weights_transform->get_weights(); @@ -81,10 +80,10 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights // Check if we can release memory from parent auto parent_item = _managed_weights_parents.find(weights); - if(parent_item != _managed_weights_parents.end()) + if (parent_item != _managed_weights_parents.end()) { int32_t refcount = parent_item->second->decrease_refcount(); - if(refcount == 0) + if (refcount == 0) { parent_item->second->release(); } @@ -92,20 +91,20 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights // Check top level weights. 
If all the transformations are done // mark the weights as unused - if(_managed_weights_parents.find(weights) == _managed_weights_parents.end()) + if (_managed_weights_parents.find(weights) == _managed_weights_parents.end()) { auto item = _managed_weights.find(weights); bool mark_as_unused = true; - for(auto it : item->second) + for (auto it : item->second) { - if(!it->is_reshape_run()) + if (!it->is_reshape_run()) { mark_as_unused = false; break; } } - if(mark_as_unused) + if (mark_as_unused) { weights->mark_as_unused(); } @@ -123,15 +122,15 @@ ITensor *IWeightsManager::acquire(const ITensor *weights, ITransformWeights *wei { ARM_COMPUTE_ERROR_ON_MSG(!are_weights_managed(weights), "Cannot acquire weights. Weights are not managed"); - ITensor *transformed_weights{ nullptr }; + ITensor *transformed_weights{nullptr}; auto item = _managed_weights.find(weights); // Check if I already have the requested transform. If I do, // increase the refcount of the transformed weights object and // reuse the tensor - for(auto it : item->second) + for (auto it : item->second) { - if(it->uid() == weights_transform->uid()) + if (it->uid() == weights_transform->uid()) { transformed_weights = it->get_weights(); it->increase_refcount(); @@ -139,7 +138,7 @@ ITensor *IWeightsManager::acquire(const ITensor *weights, ITransformWeights *wei } } - if(transformed_weights == nullptr) + if (transformed_weights == nullptr) { transformed_weights = weights_transform->get_weights(); weights_transform->increase_refcount(); @@ -154,13 +153,13 @@ ITensor *IWeightsManager::acquire(const ITensor *weights, ITransformWeights *wei void IWeightsManager::release(const ITensor *weights) { - if(weights == nullptr || !are_weights_managed(weights)) + if (weights == nullptr || !are_weights_managed(weights)) { return; } _managed_counter[weights].counter--; - if(_managed_counter[weights].counter == 0 && _managed_counter[weights].is_unused) + if (_managed_counter[weights].counter == 0 && _managed_counter[weights].is_unused) { weights->mark_as_unused(); } @@ -168,7 +167,7 @@ void IWeightsManager::release(const ITensor *weights) void IWeightsManager::pre_mark_as_unused(const ITensor *weights) { - if(weights == nullptr || !are_weights_managed(weights)) + if (weights == nullptr || !are_weights_managed(weights)) { return; } diff --git a/src/runtime/Memory.cpp b/src/runtime/Memory.cpp index ac0a32539e53c3b7bfb9dafc014ebb654a6e356c..90fd025eb7fca131b4ba35ff2506f2bcbe375c12 100644 --- a/src/runtime/Memory.cpp +++ b/src/runtime/Memory.cpp @@ -27,20 +27,17 @@ namespace arm_compute { -Memory::Memory() - : _region(nullptr), _region_owned(nullptr) +Memory::Memory() : _region(nullptr), _region_owned(nullptr) { } -Memory::Memory(const std::shared_ptr &memory) - : _region(nullptr), _region_owned(memory) +Memory::Memory(const std::shared_ptr &memory) : _region(nullptr), _region_owned(memory) { _region_owned = memory; _region = _region_owned.get(); } -Memory::Memory(IMemoryRegion *memory) - : _region(memory), _region_owned(nullptr) +Memory::Memory(IMemoryRegion *memory) : _region(memory), _region_owned(nullptr) { _region = memory; } diff --git a/src/runtime/MemoryManagerOnDemand.cpp b/src/runtime/MemoryManagerOnDemand.cpp index 2e418ae9e345b331c4231ca1994ef1f2398946a0..5fa9ea47e9afcf5c4e49cba9baf37709aac6d16b 100644 --- a/src/runtime/MemoryManagerOnDemand.cpp +++ b/src/runtime/MemoryManagerOnDemand.cpp @@ -31,7 +31,8 @@ namespace arm_compute { -MemoryManagerOnDemand::MemoryManagerOnDemand(std::shared_ptr lifetime_manager, std::shared_ptr pool_manager) 
+MemoryManagerOnDemand::MemoryManagerOnDemand(std::shared_ptr lifetime_manager, + std::shared_ptr pool_manager) : _lifetime_mgr(std::move(lifetime_manager)), _pool_mgr(std::move(pool_manager)) { ARM_COMPUTE_ERROR_ON_MSG(!_lifetime_mgr, "Lifetime manager not specified correctly!"); @@ -57,7 +58,7 @@ void MemoryManagerOnDemand::populate(arm_compute::IAllocator &allocator, size_t // Create pools auto pool_template = _lifetime_mgr->create_pool(&allocator); - for(int i = num_pools; i > 1; --i) + for (int i = num_pools; i > 1; --i) { auto pool = pool_template->duplicate(); _pool_mgr->register_pool(std::move(pool)); diff --git a/src/runtime/NEON/INEOperator.cpp b/src/runtime/NEON/INEOperator.cpp index a5fc0a2726e4fd86a16244e360c114ab6e935b6b..fcfd3251ff8c748d5ea4012a0bc84a30fd302568 100644 --- a/src/runtime/NEON/INEOperator.cpp +++ b/src/runtime/NEON/INEOperator.cpp @@ -22,8 +22,10 @@ * SOFTWARE. */ #include "arm_compute/runtime/NEON/INEOperator.h" + #include "arm_compute/core/Window.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -32,14 +34,13 @@ namespace experimental { INEOperator::~INEOperator() = default; -INEOperator::INEOperator(IRuntimeContext *ctx) - : _kernel(), _ctx(ctx), _workspace() +INEOperator::INEOperator(IRuntimeContext *ctx) : _kernel(), _ctx(ctx), _workspace() { } void INEOperator::run(ITensorPack &tensors) { - if(tensors.empty()) + if (tensors.empty()) { ARM_COMPUTE_ERROR("No inputs provided"); } diff --git a/src/runtime/NEON/INESimpleFunction.cpp b/src/runtime/NEON/INESimpleFunction.cpp index 5438bce62a17619204dcc882168316d5fd1d7e31..b6977221b949ae26b9fd6b0cec6373e2052d1921 100644 --- a/src/runtime/NEON/INESimpleFunction.cpp +++ b/src/runtime/NEON/INESimpleFunction.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CPP/ICPPKernel.h" #include "arm_compute/core/Window.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/core/NEON/kernels/NEFillBorderKernel.h" namespace arm_compute @@ -33,8 +34,7 @@ namespace arm_compute INESimpleFunction::~INESimpleFunction() = default; INESimpleFunction::INESimpleFunction() // NOLINT - : _kernel(), - _border_handler() + : _kernel(), _border_handler() { } diff --git a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp index 21dd58e378321b7df9ef06bdc8693baff39be150..04bff9fa4b837848066f6d89697ccde1cc02248b 100644 --- a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp +++ b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Window.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/core/NEON/INEKernel.h" #include "src/runtime/Utils.h" @@ -32,9 +33,7 @@ namespace arm_compute { INESimpleFunctionNoBorder::~INESimpleFunctionNoBorder() = default; -INESimpleFunctionNoBorder::INESimpleFunctionNoBorder(IRuntimeContext *ctx) - : _kernel(), - _ctx(ctx) +INESimpleFunctionNoBorder::INESimpleFunctionNoBorder(IRuntimeContext *ctx) : _kernel(), _ctx(ctx) { } diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp index e48aede590137a3c2c363ee15ab888508f47da51..59199452cef451650508542d7ad4856f174b5eae 100644 --- a/src/runtime/NEON/functions/NEActivationLayer.cpp +++ b/src/runtime/NEON/functions/NEActivationLayer.cpp @@ -24,24 +24,24 @@ #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuActivation.h" namespace arm_compute { struct 
NEActivationLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - IRuntimeContext *ctx{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + IRuntimeContext *ctx{nullptr}; + std::unique_ptr op{nullptr}; }; -NEActivationLayer::NEActivationLayer(IRuntimeContext *ctx) - : _impl(std::make_unique()) +NEActivationLayer::NEActivationLayer(IRuntimeContext *ctx) : _impl(std::make_unique()) { _impl->ctx = ctx; } -NEActivationLayer::NEActivationLayer(NEActivationLayer &&) = default; +NEActivationLayer::NEActivationLayer(NEActivationLayer &&) = default; NEActivationLayer &NEActivationLayer::operator=(NEActivationLayer &&) = default; NEActivationLayer::~NEActivationLayer() = default; @@ -56,7 +56,8 @@ void NEActivationLayer::configure(ITensor *input, ITensor *output, ActivationLay _impl->op->configure(_impl->src->info(), _impl->dst->info(), activation_info); } -Status NEActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status +NEActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) { return cpu::CpuActivation::validate(input, output, act_info); } diff --git a/src/runtime/NEON/functions/NEAddMulAdd.cpp b/src/runtime/NEON/functions/NEAddMulAdd.cpp index cfeaefc4fd251f976ec2540308ff827f6058dffc..a72364791caa0cf31df9c37a29d796b96cc0554e 100644 --- a/src/runtime/NEON/functions/NEAddMulAdd.cpp +++ b/src/runtime/NEON/functions/NEAddMulAdd.cpp @@ -25,6 +25,7 @@ #include "arm_compute/runtime/NEON/functions/NEAddMulAdd.h" #include "arm_compute/runtime/Tensor.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuAddMulAdd.h" @@ -33,45 +34,50 @@ namespace arm_compute { struct NEAddMulAdd::Impl { - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; WorkspaceData workspace_tensors{}; ITensorPack run_pack{}; MemoryGroup memory_group{}; }; -NEAddMulAdd::NEAddMulAdd(std::shared_ptr memory_manager) - : _impl(std::make_unique()) +NEAddMulAdd::NEAddMulAdd(std::shared_ptr memory_manager) : _impl(std::make_unique()) { _impl->memory_group = MemoryGroup(std::move(memory_manager)); } NEAddMulAdd::~NEAddMulAdd() = default; -void NEAddMulAdd::configure(ITensor *input1, ITensor *input2, ITensor *bn_mul, ITensor *bn_add, ITensor *add_output, - ITensor *final_output, const ConvertPolicy policy, const ActivationLayerInfo &act_info) +void NEAddMulAdd::configure(ITensor *input1, + ITensor *input2, + ITensor *bn_mul, + ITensor *bn_add, + ITensor *add_output, + ITensor *final_output, + const ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info); - _impl->op = std::make_unique(); - _impl->op->configure(input1->info(), input2->info(), bn_mul->info(), - bn_add->info(), add_output != nullptr ? add_output->info() : nullptr, final_output->info(), policy, act_info); + _impl->op = std::make_unique(); + _impl->op->configure(input1->info(), input2->info(), bn_mul->info(), bn_add->info(), + add_output != nullptr ? 
 
-    _impl->run_pack =
-    {
-        { TensorType::ACL_SRC_0, input1 },
-        { TensorType::ACL_SRC_1, input2 },
-        { TensorType::ACL_SRC_2, bn_mul },
-        { TensorType::ACL_SRC_3, bn_add },
-        { TensorType::ACL_DST_0, add_output },
-        { TensorType::ACL_DST_1, final_output },
+    _impl->run_pack = {
+        {TensorType::ACL_SRC_0, input1}, {TensorType::ACL_SRC_1, input2}, {TensorType::ACL_SRC_2, bn_mul},
+        {TensorType::ACL_SRC_3, bn_add}, {TensorType::ACL_DST_0, add_output}, {TensorType::ACL_DST_1, final_output},
     };
 
     _impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
 }
 
-Status NEAddMulAdd::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *bn_mul,
-                             const ITensorInfo *bn_add, const ITensorInfo *add_output, const ITensorInfo *final_output,
-                             ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status NEAddMulAdd::validate(const ITensorInfo         *input1,
+                             const ITensorInfo         *input2,
+                             const ITensorInfo         *bn_mul,
+                             const ITensorInfo         *bn_add,
+                             const ITensorInfo         *add_output,
+                             const ITensorInfo         *final_output,
+                             ConvertPolicy              policy,
+                             const ActivationLayerInfo &act_info)
 {
     return cpu::CpuAddMulAdd::validate(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info);
 }
diff --git a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
index 3876ae6e872b0a7d4c1997305e36103b0a43f02e..fbaf1a96e7e6eaa9c5c1d023f20679791f7fe9c6 100644
--- a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
+++ b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,33 +29,68 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/functions/NECast.h"
+#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
+#include "arm_compute/runtime/Tensor.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/NEON/kernels/NEReductionOperationKernel.h"
 
 namespace arm_compute
 {
+struct NEArgMinMaxLayer::Impl
+{
+    MemoryGroup                           memory_group{};
+    std::shared_ptr<IMemoryManager>       memory_manager{};
+    std::unique_ptr<NEReductionOperation> reduction_function{};
+    std::unique_ptr<NECast>               cast_function{};
+    std::unique_ptr<Tensor>               tmp_reduction_result{};
+};
+
 NEArgMinMaxLayer::~NEArgMinMaxLayer() = default;
 
-NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _reduction_function(std::make_unique<NEReductionOperation>())
+NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>())
 {
-    ARM_COMPUTE_UNUSED(memory_manager);
+    _impl->memory_manager = std::move(memory_manager);
 }
+
 void NEArgMinMaxLayer::configure(ITensor *input, int axis, ITensor *output, const ReductionOperation &op)
 {
     ARM_COMPUTE_LOG_PARAMS(input, axis, output, op);
-    _reduction_function->configure(input, output, axis, op, false);
+    _impl->reduction_function = std::make_unique<NEReductionOperation>();
+    if (output->info() &&
+        (output->info()->data_type() == DataType::S64 || output->info()->data_type() == DataType::U64))
+    {
+        _impl->memory_group         = MemoryGroup(std::move(_impl->memory_manager));
+        _impl->cast_function        = std::make_unique<NECast>();
+        _impl->tmp_reduction_result = std::make_unique<Tensor>();
+        _impl->reduction_function->configure(input, _impl->tmp_reduction_result.get(), axis, op, false);
+        _impl->cast_function->configure(_impl->tmp_reduction_result.get(), output, ConvertPolicy::SATURATE);
+        _impl->memory_group.manage(_impl->tmp_reduction_result.get());
+        _impl->tmp_reduction_result->allocator()->allocate();
+    }
+    else
+    {
+        _impl->reduction_function->configure(input, output, axis, op, false);
+    }
 }
 
-Status NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
+Status
+NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid operation");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN,
+                                    "Invalid operation");
     return NEReductionOperation::validate(input, output, axis, op, false);
 }
 
 void NEArgMinMaxLayer::run()
 {
-    _reduction_function->run();
+    MemoryGroupResourceScope scope_mg(_impl->memory_group);
+    _impl->reduction_function->run();
+    if (_impl->tmp_reduction_result != nullptr)
+    {
+        _impl->cast_function->run();
+    }
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
index a7581ca9f4dfd67a25b424460fb160036d45ae49..aff16ae9d1a5cf1747367e2a1b7a4c96f0fd35df 100644
--- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
 
 #include "arm_compute/core/Validate.h"
+
 #include "src/cpu/operators/CpuAdd.h"
 
 #include <utility>
@@ -32,26 +33,33 @@ namespace arm_compute
 {
 struct NEArithmeticAddition::Impl
 {
-    const ITensor *src_0{ nullptr };
-    const ITensor *src_1{ nullptr };
-    ITensor       *dst{ nullptr };
-    std::unique_ptr<cpu::CpuAdd> op{ nullptr };
+    const ITensor               *src_0{nullptr};
+    const ITensor               *src_1{nullptr};
+    ITensor                     *dst{nullptr};
+    std::unique_ptr<cpu::CpuAdd> op{nullptr};
 };
 
-NEArithmeticAddition::NEArithmeticAddition()
-    : _impl(std::make_unique<Impl>())
+NEArithmeticAddition::NEArithmeticAddition() : _impl(std::make_unique<Impl>())
 {
 }
-NEArithmeticAddition::NEArithmeticAddition(NEArithmeticAddition &&) = default;
+NEArithmeticAddition::NEArithmeticAddition(NEArithmeticAddition &&)            = default;
 NEArithmeticAddition &NEArithmeticAddition::operator=(NEArithmeticAddition &&) = default;
 NEArithmeticAddition::~NEArithmeticAddition() = default;
 
-Status NEArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status NEArithmeticAddition::validate(const ITensorInfo         *input1,
+                                      const ITensorInfo         *input2,
+                                      const ITensorInfo         *output,
+                                      ConvertPolicy              policy,
+                                      const ActivationLayerInfo &act_info)
 {
     return cpu::CpuAdd::validate(input1, input2, output, policy, act_info);
 }
 
-void NEArithmeticAddition::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void NEArithmeticAddition::configure(const ITensor             *input1,
+                                     const ITensor             *input2,
+                                     ITensor                   *output,
+                                     ConvertPolicy              policy,
+                                     const ActivationLayerInfo &act_info)
 {
     _impl->src_0 = input1;
     _impl->src_1 = input2;
diff --git a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
index 6fdd4267bf511bbbc8cff8ed9a6c5799017040d8..097525c1a8247dcd5e35c70577c342cda544810c 100644
--- 
a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp +++ b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" #include "arm_compute/core/ITensor.h" + #include "src/cpu/operators/CpuSub.h" #include @@ -32,26 +33,33 @@ namespace arm_compute { struct NEArithmeticSubtraction::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEArithmeticSubtraction::NEArithmeticSubtraction() - : _impl(std::make_unique()) +NEArithmeticSubtraction::NEArithmeticSubtraction() : _impl(std::make_unique()) { } -NEArithmeticSubtraction::NEArithmeticSubtraction(NEArithmeticSubtraction &&) = default; +NEArithmeticSubtraction::NEArithmeticSubtraction(NEArithmeticSubtraction &&) = default; NEArithmeticSubtraction &NEArithmeticSubtraction::operator=(NEArithmeticSubtraction &&) = default; NEArithmeticSubtraction::~NEArithmeticSubtraction() = default; -Status NEArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status NEArithmeticSubtraction::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return cpu::CpuSub::validate(input1, input2, output, policy, act_info); } -void NEArithmeticSubtraction::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +void NEArithmeticSubtraction::configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; diff --git a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp index db49f4c1a0e630a18e890bebd7174e27f213bc15..d491f0aafc55408f6cbc1710e127fd7e78258c3f 100644 --- a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h" @@ -36,12 +37,17 @@ namespace arm_compute { NEBatchNormalizationLayer::~NEBatchNormalizationLayer() = default; -NEBatchNormalizationLayer::NEBatchNormalizationLayer() - : _norm_kernel() +NEBatchNormalizationLayer::NEBatchNormalizationLayer() : _norm_kernel() { } -void NEBatchNormalizationLayer::configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, +void NEBatchNormalizationLayer::configure(ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, ActivationLayerInfo act_info) { ARM_COMPUTE_LOG_PARAMS(input, output, mean, var, beta, gamma, epsilon, act_info); @@ -50,10 +56,17 @@ void NEBatchNormalizationLayer::configure(ITensor *input, ITensor *output, const _norm_kernel->configure(input, output, mean, var, beta, gamma, epsilon, act_info); } -Status NEBatchNormalizationLayer::validate(const ITensorInfo 
*input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var, const ITensorInfo *beta, const ITensorInfo *gamma, - float epsilon, ActivationLayerInfo act_info) +Status NEBatchNormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta, + const ITensorInfo *gamma, + float epsilon, + ActivationLayerInfo act_info) { - ARM_COMPUTE_RETURN_ON_ERROR(NEBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info)); return Status{}; } diff --git a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp index e258028d05c844c64f393812fe2d594f6ebab8ec..5d711c5ddf4de1a46b35e84062f8ee1dab414986 100644 --- a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp +++ b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h" @@ -41,19 +42,25 @@ void NEBatchToSpaceLayer::configure(const ITensor *input, const ITensor *block_s _kernel = std::move(k); } -void NEBatchToSpaceLayer::configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info) +void NEBatchToSpaceLayer::configure( + const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info) { auto k = std::make_unique(); k->configure(input, block_shape_x, block_shape_y, output, crop_info); _kernel = std::move(k); } -Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) +Status +NEBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) { return NEBatchToSpaceLayerKernel::validate(input, block_shape, output); } -Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info) +Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, + int32_t block_shape_x, + int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info) { return NEBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output, crop_info); } diff --git a/src/runtime/NEON/functions/NEBitwiseAnd.cpp b/src/runtime/NEON/functions/NEBitwiseAnd.cpp index 90eb72706e7695e78b4551ccb7feb6bf7a08232a..89ce2087beb151681393dcf4cde709e4f3407a2c 100644 --- a/src/runtime/NEON/functions/NEBitwiseAnd.cpp +++ b/src/runtime/NEON/functions/NEBitwiseAnd.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h" -#include "src/core/NEON/kernels/NEBitwiseAndKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEBitwiseAndKernel.h" #include diff --git a/src/runtime/NEON/functions/NEBitwiseNot.cpp b/src/runtime/NEON/functions/NEBitwiseNot.cpp index 69e5288b8835b3a60ecce31ec1d819687a20308e..eda59cd3e93ff039ebc8ce88c7193fc2fb3fb852 100644 --- a/src/runtime/NEON/functions/NEBitwiseNot.cpp +++ b/src/runtime/NEON/functions/NEBitwiseNot.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseNot.h" -#include 
"src/core/NEON/kernels/NEBitwiseNotKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEBitwiseNotKernel.h" #include diff --git a/src/runtime/NEON/functions/NEBitwiseOr.cpp b/src/runtime/NEON/functions/NEBitwiseOr.cpp index 0b19e919eee1e7edab7b7c2595acb94a20cb1ca6..3d6f30b0fed77e5ca379869036fdebe3f52a3c6e 100644 --- a/src/runtime/NEON/functions/NEBitwiseOr.cpp +++ b/src/runtime/NEON/functions/NEBitwiseOr.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseOr.h" -#include "src/core/NEON/kernels/NEBitwiseOrKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEBitwiseOrKernel.h" #include diff --git a/src/runtime/NEON/functions/NEBitwiseXor.cpp b/src/runtime/NEON/functions/NEBitwiseXor.cpp index cc9df9f1c41af4b7e293a8a119b2f7d00b7692c9..f0cf3d3e5ca8561e76adcea35365ef7c189c85de 100644 --- a/src/runtime/NEON/functions/NEBitwiseXor.cpp +++ b/src/runtime/NEON/functions/NEBitwiseXor.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseXor.h" -#include "src/core/NEON/kernels/NEBitwiseXorKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEBitwiseXorKernel.h" #include diff --git a/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp index af00171be652cc69179914a25c3ab7bc4905455a..adf891e417cf6190a81e2acdb75d76bf086a121d 100644 --- a/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp +++ b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp @@ -22,12 +22,16 @@ * SOFTWARE. */ #include "arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEBoundingBoxTransformKernel.h" namespace arm_compute { -void NEBoundingBoxTransform::configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info) +void NEBoundingBoxTransform::configure(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_LOG_PARAMS(boxes, pred_boxes, deltas, info); // Configure Bounding Box kernel @@ -36,7 +40,10 @@ void NEBoundingBoxTransform::configure(const ITensor *boxes, ITensor *pred_boxes _kernel = std::move(k); } -Status NEBoundingBoxTransform::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info) +Status NEBoundingBoxTransform::validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info) { return NEBoundingBoxTransformKernel::validate(boxes, pred_boxes, deltas, info); } diff --git a/src/runtime/NEON/functions/NECast.cpp b/src/runtime/NEON/functions/NECast.cpp index f93a6ea74554c9bc39a84705f9a0c22b0670e66b..1fd172a7301c008fe3daf839daaee13ea3944fbb 100644 --- a/src/runtime/NEON/functions/NECast.cpp +++ b/src/runtime/NEON/functions/NECast.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NECast.h" #include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include "src/cpu/operators/CpuCast.h" @@ -31,16 +32,15 @@ namespace arm_compute { struct NECast::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NECast::NECast() - : _impl(std::make_unique()) +NECast::NECast() : _impl(std::make_unique()) { } -NECast::NECast(NECast &&) = default; 
+NECast::NECast(NECast &&) = default; NECast &NECast::operator=(NECast &&) = default; NECast::~NECast() = default; @@ -62,7 +62,7 @@ Status NECast::validate(const ITensorInfo *input, const ITensorInfo *output, Con void NECast::run() { - ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } }; + ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}}; _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp index 8b96fadb74f1a6c8279ac03fb13b5c1024a8b607..86bee4dd43b3fe9d73b7dae911b6398c00597711 100644 --- a/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp +++ b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h" #include "arm_compute/core/Types.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEChannelShuffleLayerKernel.h" diff --git a/src/runtime/NEON/functions/NEConcatenateLayer.cpp b/src/runtime/NEON/functions/NEConcatenateLayer.cpp index ceb697aad6b3182ddeb0dc909f684cb62956cf45..59a0892f1f11f93eba11496678b045afc769cc8b 100644 --- a/src/runtime/NEON/functions/NEConcatenateLayer.cpp +++ b/src/runtime/NEON/functions/NEConcatenateLayer.cpp @@ -23,33 +23,31 @@ */ #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h" -#include "src/cpu/operators/CpuConcatenate.h" - -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - #include "arm_compute/core/Error.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/core/helpers/AutoConfiguration.h" +#include "src/cpu/operators/CpuConcatenate.h" namespace arm_compute { struct NEConcatenateLayer::Impl { std::vector srcs{}; - ITensor *dst{ nullptr }; - unsigned int num_inputs{ 0 }; - unsigned int axis{ 0 }; - std::unique_ptr op{ nullptr }; + ITensor *dst{nullptr}; + unsigned int num_inputs{0}; + unsigned int axis{0}; + std::unique_ptr op{nullptr}; }; -NEConcatenateLayer::NEConcatenateLayer() - : _impl(std::make_unique()) +NEConcatenateLayer::NEConcatenateLayer() : _impl(std::make_unique()) { } -NEConcatenateLayer::NEConcatenateLayer(NEConcatenateLayer &&) = default; +NEConcatenateLayer::NEConcatenateLayer(NEConcatenateLayer &&) = default; NEConcatenateLayer &NEConcatenateLayer::operator=(NEConcatenateLayer &&) = default; NEConcatenateLayer::~NEConcatenateLayer() = default; @@ -64,7 +62,7 @@ void NEConcatenateLayer::configure(std::vector inputs_vector, I _impl->op = std::make_unique(); std::vector inputs_vector_info; - for(unsigned int i = 0; i < inputs_vector.size(); ++i) + for (unsigned int i = 0; i < inputs_vector.size(); ++i) { ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i)); inputs_vector_info.emplace_back(inputs_vector.at(i)->info()); @@ -72,7 +70,9 @@ void NEConcatenateLayer::configure(std::vector inputs_vector, I _impl->op->configure(inputs_vector_info, _impl->dst->info(), axis); } -Status NEConcatenateLayer::validate(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis) +Status NEConcatenateLayer::validate(const std::vector &inputs_vector, + const ITensorInfo *output, + size_t axis) { return cpu::CpuConcatenate::validate(inputs_vector, output, axis); } @@ -80,7 +80,7 @@ Status NEConcatenateLayer::validate(const std::vector 
&inpu void NEConcatenateLayer::run() { ITensorPack pack; - for(unsigned i = 0; i < _impl->num_inputs; ++i) + for (unsigned i = 0; i < _impl->num_inputs; ++i) { pack.add_tensor(TensorType::ACL_SRC_VEC + i, _impl->srcs.at(i)); } diff --git a/src/runtime/NEON/functions/NEConv3D.cpp b/src/runtime/NEON/functions/NEConv3D.cpp index 3bb66c44b0fe57f9e282bd3552418d3ef2bb60da..8f41151d6c4e567ccc6f14c9ff4f18c18237517c 100644 --- a/src/runtime/NEON/functions/NEConv3D.cpp +++ b/src/runtime/NEON/functions/NEConv3D.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include "src/cpu/operators/CpuDirectConv3d.h" @@ -35,35 +36,41 @@ using namespace arm_compute::experimental; struct NEConv3D::Impl { - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; }; -NEConv3D::NEConv3D() - : _impl(std::make_unique()) +NEConv3D::NEConv3D() : _impl(std::make_unique()) { } NEConv3D::~NEConv3D() = default; -void NEConv3D::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv3dInfo &conv_info) +void NEConv3D::configure( + ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv3dInfo &conv_info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuDirectConv3d::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info)); + ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuDirectConv3d::validate( + input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info)); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info); auto f = std::make_unique(); - f->configure(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info); + f->configure(input->info(), weights->info(), ((biases != nullptr) ? 
biases->info() : nullptr), output->info(), + conv_info); _impl->op = std::move(f); - if(_impl->op != nullptr) + if (_impl->op != nullptr) { - _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } }; + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; } } -Status NEConv3D::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv3dInfo &conv_info) +Status NEConv3D::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const Conv3dInfo &conv_info) { ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuDirectConv3d::validate(input, weights, biases, output, conv_info)); @@ -72,7 +79,7 @@ Status NEConv3D::validate(const ITensorInfo *input, const ITensorInfo *weights, void NEConv3D::run() { - if(_impl->op != nullptr) + if (_impl->op != nullptr) { _impl->op->run(_impl->run_pack); } diff --git a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp index 535ac9900188ec6187f0279a0496b754218be3a8..84e8565aaf4ce3eeb2d80a035b0a76764f16d19a 100644 --- a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp +++ b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp @@ -24,24 +24,26 @@ #include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuConvertFullyConnectedWeights.h" namespace arm_compute { struct NEConvertFullyConnectedWeights::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEConvertFullyConnectedWeights::NEConvertFullyConnectedWeights() - : _impl(std::make_unique()) +NEConvertFullyConnectedWeights::NEConvertFullyConnectedWeights() : _impl(std::make_unique()) { } NEConvertFullyConnectedWeights::~NEConvertFullyConnectedWeights() = default; -void NEConvertFullyConnectedWeights::configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape, - DataLayout data_layout) +void NEConvertFullyConnectedWeights::configure(const ITensor *input, + ITensor *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -51,8 +53,10 @@ void NEConvertFullyConnectedWeights::configure(const ITensor *input, ITensor *ou _impl->op->configure(_impl->src->info(), _impl->dst->info(), original_input_shape, data_layout); } -Status NEConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, - DataLayout data_layout) +Status NEConvertFullyConnectedWeights::validate(const ITensorInfo *input, + const ITensorInfo *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { return cpu::CpuConvertFullyConnectedWeights::validate(input, output, original_input_shape, data_layout); } @@ -64,4 +68,4 @@ void NEConvertFullyConnectedWeights::run() pack.add_tensor(TensorType::ACL_DST, _impl->dst); _impl->op->run(pack); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp index 89e0e498c99121a4aeda0540659db20a54224c53..8efebbbb1ab55b08a847866d50335920b31a64e2 100644 
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp @@ -25,8 +25,10 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/DataTypeUtils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuConv2d.h" @@ -43,34 +45,44 @@ struct NEConvolutionLayer::Impl { MemoryGroup memory_group{}; std::shared_ptr memory_manager{}; - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; ITensorPack prep_pack{}; WorkspaceData workspace{}; experimental::MemoryRequirements aux_mem_req{}; - std::unique_ptr func{ nullptr }; + std::unique_ptr func{nullptr}; }; -NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr memory_manager) - : _impl(std::make_unique()) +NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr memory_manager) : _impl(std::make_unique()) { _impl->memory_manager = std::move(memory_manager); } NEConvolutionLayer::~NEConvolutionLayer() = default; -void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +void NEConvolutionLayer::configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_UNUSED(num_groups); - ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, - enable_fast_math, num_groups)); - ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate( + input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, + weights_info, dilation, act_info, enable_fast_math, num_groups)); + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, + enable_fast_math, num_groups); const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups); - switch(cpu::CpuConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math)) + switch (cpu::CpuConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, + weights_info, dilation, act_info, enable_fast_math)) { case ConvolutionMethod::WINOGRAD: case ConvolutionMethod::GEMM: @@ -78,7 +90,8 @@ void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const case ConvolutionMethod::DIRECT: { auto f = std::make_unique(); - f->configure(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + f->configure(input->info(), weights->info(), ((biases != nullptr) ? 
biases->info() : nullptr), + output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); _impl->op = std::move(f); break; } @@ -94,33 +107,53 @@ void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const break; } - if(_impl->op) + if (_impl->op) { _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager)); _impl->aux_mem_req = _impl->op->workspace(); - _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } }; - _impl->prep_pack = { { ACL_SRC_1, weights }, { ACL_SRC_2, biases } }; - _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}}; + _impl->workspace = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } } -Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +Status NEConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups); ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported"); - switch(cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math)) + // Biases with dynamic values are not supported with quantized inputs. 
+ if (biases) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((!biases->are_values_constant() && is_data_type_quantized(input->data_type())), + "Dynamic Biases are not supported with quantized input data."); + } + + switch (cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, + enable_fast_math)) { case ConvolutionMethod::WINOGRAD: case ConvolutionMethod::GEMM: case ConvolutionMethod::GEMM_CONV2D: case ConvolutionMethod::DIRECT: - ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups)); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuConv2d::validate(input, weights, biases, output, conv_info, + weights_info, dilation, act_info, enable_fast_math, + num_groups)); break; case ConvolutionMethod::FFT: - ARM_COMPUTE_RETURN_ON_ERROR(NEFFTConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEFFTConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info)); break; default: ARM_COMPUTE_ERROR("Not supported."); @@ -129,12 +162,17 @@ Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo return Status{}; } -ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, - const ActivationLayerInfo &act_info, bool enable_fast_math) +ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { - return cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math); + return cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, + enable_fast_math); } void NEConvolutionLayer::run() @@ -143,7 +181,7 @@ void NEConvolutionLayer::run() MemoryGroupResourceScope scope_mg(_impl->memory_group); - if(_impl->func) + if (_impl->func) { _impl->func->run(); } @@ -155,7 +193,7 @@ void NEConvolutionLayer::run() void NEConvolutionLayer::prepare() { - if(_impl->func) + if (_impl->func) { _impl->func->prepare(); } diff --git a/src/runtime/NEON/functions/NECopy.cpp b/src/runtime/NEON/functions/NECopy.cpp index c2059e8e985b826adc3327868d429043d08f85df..c975d3a5b517a8a2e387ad4111fc0f1e7361b5ec 100644 --- a/src/runtime/NEON/functions/NECopy.cpp +++ b/src/runtime/NEON/functions/NECopy.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NECopy.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuCopy.h" #include @@ -32,16 +33,15 @@ namespace arm_compute { struct NECopy::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NECopy::NECopy() - : _impl(std::make_unique()) +NECopy::NECopy() : _impl(std::make_unique()) { } -NECopy::NECopy(NECopy &&) = default; +NECopy::NECopy(NECopy &&) = default; NECopy &NECopy::operator=(NECopy &&) = default; NECopy::~NECopy() = default; diff --git a/src/runtime/NEON/functions/NECropResize.cpp b/src/runtime/NEON/functions/NECropResize.cpp index 
cca8b400eed5649d91d9b3f9a1d702650ea426f9..a94b0882da45b3a73a810541394b5fa594341f8e 100644 --- a/src/runtime/NEON/functions/NECropResize.cpp +++ b/src/runtime/NEON/functions/NECropResize.cpp @@ -21,10 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/runtime/NEON/NEScheduler.h" - #include "arm_compute/runtime/NEON/functions/NECropResize.h" + +#include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/Tensor.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NECropKernel.h" @@ -35,18 +36,32 @@ namespace arm_compute NECropResize::~NECropResize() = default; NECropResize::NECropResize() - : _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _crop(), _scale(), _crop_results(), _scaled_results() + : _output(nullptr), + _num_boxes(0), + _method(), + _extrapolation_value(0), + _crop(), + _scale(), + _crop_results(), + _scaled_results() { } -Status NECropResize::validate(const ITensorInfo *input, const ITensorInfo *boxes, const ITensorInfo *box_ind, const ITensorInfo *output, - Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value) +Status NECropResize::validate(const ITensorInfo *input, + const ITensorInfo *boxes, + const ITensorInfo *box_ind, + const ITensorInfo *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value) { ARM_COMPUTE_RETURN_ERROR_ON(crop_size.x <= 0 || crop_size.y <= 0); ARM_COMPUTE_RETURN_ERROR_ON(method == InterpolationPolicy::AREA); TensorInfo temp_info; - ARM_COMPUTE_RETURN_ON_ERROR(NECropKernel::validate(input->clone().get(), boxes->clone().get(), box_ind->clone().get(), &temp_info, boxes->tensor_shape()[1] - 1, extrapolation_value)); - if(output->total_size() > 0) + ARM_COMPUTE_RETURN_ON_ERROR(NECropKernel::validate(input->clone().get(), boxes->clone().get(), + box_ind->clone().get(), &temp_info, boxes->tensor_shape()[1] - 1, + extrapolation_value)); + if (output->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -56,11 +71,17 @@ Status NECropResize::validate(const ITensorInfo *input, const ITensorInfo *boxes return Status{}; } -void NECropResize::configure(const ITensor *input, const ITensor *boxes, const ITensor *box_ind, ITensor *output, Coordinates2D crop_size, - InterpolationPolicy method, float extrapolation_value) +void NECropResize::configure(const ITensor *input, + const ITensor *boxes, + const ITensor *box_ind, + ITensor *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(NECropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value)); + ARM_COMPUTE_ERROR_THROW_ON(NECropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), + crop_size, method, extrapolation_value)); ARM_COMPUTE_LOG_PARAMS(input, boxes, box_ind, output, crop_size, method, extrapolation_value); _num_boxes = boxes->info()->tensor_shape()[1]; @@ -81,7 +102,7 @@ void NECropResize::configure(const ITensor *input, const ITensor *boxes, const I _scaled_results.reserve(_num_boxes); _scale.reserve(_num_boxes); - for(unsigned int i = 0; i < _num_boxes; ++i) + for (unsigned int i = 0; i < _num_boxes; ++i) { auto crop_tensor = std::make_unique(); TensorInfo crop_result_info(1, DataType::F32); @@ -108,7 
+129,7 @@ void NECropResize::run() { ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function"); - for(unsigned int i = 0; i < _num_boxes; ++i) + for (unsigned int i = 0; i < _num_boxes; ++i) { // Size of the crop box in _boxes and thus the shape of _crop_results[i] // may not be known until run-time and so the kernels cannot be configured until then. @@ -117,12 +138,15 @@ void NECropResize::run() NEScheduler::get().schedule(_crop[i].get(), Window::DimZ); // Scale the cropped image. - _scale[i]->configure(_crop_results[i].get(), _scaled_results[i].get(), ScaleKernelInfo{ _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT, false }); + _scale[i]->configure(_crop_results[i].get(), _scaled_results[i].get(), + ScaleKernelInfo{_method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), + SamplingPolicy::TOP_LEFT, false}); _scaled_results[i]->allocator()->allocate(); _scale[i]->run(); // Copy scaled image into output. - std::copy_n(_scaled_results[i]->buffer(), _scaled_results[i]->info()->total_size(), _output->ptr_to_element(Coordinates(0, 0, 0, i))); + std::copy_n(_scaled_results[i]->buffer(), _scaled_results[i]->info()->total_size(), + _output->ptr_to_element(Coordinates(0, 0, 0, i))); } } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp index 439aff0840eb395e2726672afdc0c267bfa1bbaf..081c7cc538c26f33dd589682be5d82817322b674 100644 --- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp @@ -25,9 +25,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" @@ -61,9 +62,9 @@ PadStrideInfo compute_upsample_info(const PadStrideInfo &info, uint32_t deconv_p deconv_pad_top += deconv_pad_y / 2; deconv_pad_bottom += deconv_pad_y / 2; - return PadStrideInfo(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR); + return PadStrideInfo(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, + DimensionRoundingType::FLOOR); } - } // namespace NEDeconvolutionLayer::NEDeconvolutionLayer(std::shared_ptr memory_manager) // NOLINT @@ -82,17 +83,24 @@ NEDeconvolutionLayer::NEDeconvolutionLayer(std::shared_ptr memor { } -Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &info, - bool enable_fast_math, const WeightsInfo &weights_info) +Status NEDeconvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *output, + const PadStrideInfo &info, + bool enable_fast_math, + const WeightsInfo &weights_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - const unsigned int width_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); - const unsigned int height_idx = 
get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); + const unsigned int width_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); + const unsigned int height_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) < 1); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(height_idx) < 1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input); - if(is_data_type_quantized_per_channel(weights->data_type()) && is_data_type_quantized(input->data_type())) + if (is_data_type_quantized_per_channel(weights->data_type()) && is_data_type_quantized(input->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); } @@ -101,11 +109,23 @@ Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); } - auto out_dims = deconvolution_output_dimensions(input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx), weights->dimension(height_idx), info); + const unsigned int pad_left = info.pad_left(); + const unsigned int pad_top = info.pad_top(); + const unsigned int pad_right = info.pad_right(); + const unsigned int pad_bottom = info.pad_bottom(); + + ARM_COMPUTE_RETURN_ERROR_ON(((input->dimension(width_idx) - 1) * info.stride().first + + weights->dimension(width_idx)) < (pad_left + pad_right)); + ARM_COMPUTE_RETURN_ERROR_ON(((input->dimension(height_idx) - 1) * info.stride().second + + weights->dimension(height_idx)) < (pad_top + pad_bottom)); - if(bias != nullptr) + auto out_dims = + deconvolution_output_dimensions(input->dimension(width_idx), input->dimension(height_idx), + weights->dimension(width_idx), weights->dimension(height_idx), info); + + if (bias != nullptr) { - if(is_data_type_quantized_asymmetric(input->data_type())) + if (is_data_type_quantized_asymmetric(input->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); } @@ -115,70 +135,84 @@ Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf } } - if(output->tensor_shape().total_size() > 0) + if (output->tensor_shape().total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), + "Output's width is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), + "Output's height is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), + "Output's depth is invalid."); } - uint32_t deconv_pad_x = 0; - uint32_t deconv_pad_y = 0; - const unsigned int stride_x = info.stride().first; - const unsigned int stride_y = info.stride().second; - // Guard against overflows 
in compute_deconvolution_upsampled_shape() - const DataLayout data_layout = input->data_layout(); - const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const unsigned int out_x = (input->dimension(idx_w) - 1) * stride_x + 1; - const unsigned int out_y = (input->dimension(idx_h) - 1) * stride_y + 1; - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) > out_x); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) > out_y); - ARM_COMPUTE_RETURN_ERROR_ON((out_x - weights->dimension(idx_w) + 1) > out_dims.first); - ARM_COMPUTE_RETURN_ERROR_ON((out_y - weights->dimension(idx_h) + 1) > out_dims.second); - - const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y); - TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); + uint32_t deconv_pad_x = 0; + uint32_t deconv_pad_y = 0; + const uint32_t stride_x = info.stride().first; + const uint32_t stride_y = info.stride().second; + const auto deconv_padding = compute_deconvolution_padding(*input, *weights, static_cast(stride_x), + static_cast(stride_y), out_dims); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(deconv_padding.first < 0 || deconv_padding.second < 0, + "Negative padding not supported"); + + const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, + out_dims, deconv_pad_x, deconv_pad_y); + TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); const PadStrideInfo upsample_info = compute_upsample_info(info, deconv_pad_x, deconv_pad_y); // Do not perform upsampling when the operation uses unit stride in all dimensions const bool do_upsampling = stride_x != 1 || stride_y != 1; - const unsigned int batches_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); - const unsigned int channel_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); + const unsigned int batches_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); + const unsigned int channel_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(batches_idx) != scale_out_info.dimension(batches_idx)); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != scale_out_info.dimension(channel_idx)); - if(do_upsampling) + if (do_upsampling) { const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); - ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info, Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math)); + ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, + weights_info, Size2D(1U, 1U), ActivationLayerInfo(), + enable_fast_math)); } else { - const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(), upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL); - ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(input, weights, bias, output, conv_info, weights_info, Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math)); + const PadStrideInfo conv_info(1, 1, 
upsample_info.pad_left(), upsample_info.pad_right(), + upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL); + ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(input, weights, bias, output, conv_info, weights_info, + Size2D(1U, 1U), ActivationLayerInfo(), + enable_fast_math)); } return Status{}; } -void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info, bool enable_fast_math, const WeightsInfo &weights_info) +void NEDeconvolutionLayer::configure(ITensor *input, + const ITensor *weights, + const ITensor *bias, + ITensor *output, + const PadStrideInfo &info, + bool enable_fast_math, + const WeightsInfo &weights_info) { // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(NEDeconvolutionLayer::validate(input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(), info, enable_fast_math, weights_info)); + ARM_COMPUTE_ERROR_THROW_ON(NEDeconvolutionLayer::validate(input->info(), weights->info(), + (bias == nullptr) ? nullptr : bias->info(), + output->info(), info, enable_fast_math, weights_info)); ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, info, enable_fast_math, weights_info); const DataLayout data_layout = input->info()->data_layout(); const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - auto out_dims = deconvolution_output_dimensions(input->info()->dimension(width_idx), input->info()->dimension(height_idx), - weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info); + auto out_dims = deconvolution_output_dimensions( + input->info()->dimension(width_idx), input->info()->dimension(height_idx), + weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info); const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info()); @@ -191,7 +225,8 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con const unsigned int stride_y = info.stride().second; // Output auto initialization if not yet initialized - auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info()); + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); @@ -199,12 +234,12 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con _flip_weights.configure(weights, &_weights_flipped, &_flip_axis); // setup the function to convolve the upscaled output - uint32_t deconv_pad_x = 0; - uint32_t deconv_pad_y = 0; - const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), - stride_x, stride_y, - out_dims, deconv_pad_x, deconv_pad_y); - const PadStrideInfo upsample_info = compute_upsample_info(info, deconv_pad_x, deconv_pad_y); + uint32_t deconv_pad_x = 0; + uint32_t deconv_pad_y = 0; + const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape( + *input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y); + + const PadStrideInfo upsample_info = compute_upsample_info(info, deconv_pad_x, deconv_pad_y); // Do not perform upsampling 
when the operation uses unit stride in all dimensions _do_upsampling = stride_x != 1 || stride_y != 1; @@ -216,12 +251,12 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con axis_data[1] = static_cast(height_idx); // Setup convolution and upsampling, if needed - if(_do_upsampling) + if (_do_upsampling) { _memory_group.manage(&_scaled_output); const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); - TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info()); + TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info()); scale_out_info.set_data_layout(data_layout); _scaled_output.allocator()->init(scale_out_info); @@ -229,14 +264,17 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con // The padding amount can be given as input to the convolution layer. _upsample_f.configure(input, &_scaled_output, upsample_info); - _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math); + _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U), + ActivationLayerInfo(), enable_fast_math); _scaled_output.allocator()->allocate(); } else { - const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(), upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL); - _conv_f.configure(input, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math); + const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(), + upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL); + _conv_f.configure(input, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U), + ActivationLayerInfo(), enable_fast_math); } } @@ -246,7 +284,7 @@ void NEDeconvolutionLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); - if(_do_upsampling) + if (_do_upsampling) { _upsample_f.run(); } @@ -255,7 +293,7 @@ void NEDeconvolutionLayer::run() void NEDeconvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); diff --git a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp index 1ec32074a5e69356bfe3798f531cdfe41b607655..766635dfa1a4371e0661cdfe9f767e6a27c75abe 100644 --- a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuCast.h" #include @@ -32,16 +33,15 @@ namespace arm_compute { struct NEDepthConvertLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEDepthConvertLayer::NEDepthConvertLayer() - : _impl(std::make_unique()) +NEDepthConvertLayer::NEDepthConvertLayer() : _impl(std::make_unique()) { } -NEDepthConvertLayer::NEDepthConvertLayer(NEDepthConvertLayer &&) = default; +NEDepthConvertLayer::NEDepthConvertLayer(NEDepthConvertLayer &&) = default; NEDepthConvertLayer &NEDepthConvertLayer::operator=(NEDepthConvertLayer &&) = default; 
NEDepthConvertLayer::~NEDepthConvertLayer() = default; @@ -59,7 +59,8 @@ void NEDepthConvertLayer::configure(const ITensor *input, ITensor *output, Conve _impl->op->configure(_impl->src->info(), _impl->dst->info(), policy); } -Status NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift) +Status +NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift) { ARM_COMPUTE_RETURN_ERROR_ON(shift != 0); return cpu::CpuCast::validate(input, output, policy); @@ -67,7 +68,7 @@ Status NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo void NEDepthConvertLayer::run() { - ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } }; + ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}}; _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp index f4a8a17e05642b2190c411593b72c385b829c513..47564059ec8fd04b51cc194a21c3cb8d5feec316 100644 --- a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h" diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp index 4dabef3bd7f6357a643abe4efab464bf20df8875..6c085645dbfab85ae8e5e3b879f98f2560c58546 100644 --- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/operators/CpuDepthwiseConv2d.h" @@ -39,38 +40,35 @@ NEDepthwiseConvolutionLayer::~NEDepthwiseConvolutionLayer() = default; struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::Impl { - ITensor *src{ nullptr }; // SRC_0 - ITensor *dst{ nullptr }; // DST_0 - const ITensor *weights - { - nullptr - }; // SRC_1 - const ITensor *biases - { - nullptr - }; // SRC_2 + ITensor *src{nullptr}; // SRC_0 + ITensor *dst{nullptr}; // DST_0 + const ITensor *weights{nullptr}; // SRC_1 + const ITensor *biases{nullptr}; // SRC_2 Tensor permuted_input{}; // INT_0 Tensor permuted_weights{}; // INT_1 Tensor permuted_output{}; // INT_2 Tensor workspace{}; // INT_3 Tensor packed_weights{}; // INT_4 - std::shared_ptr op{ nullptr }; - bool is_prepared{ false }; - bool permute{ false }; + std::shared_ptr op{nullptr}; + bool is_prepared{false}; + bool permute{false}; }; -NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(std::shared_ptr memory_manager) +NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal( + std::shared_ptr memory_manager) : _memory_group(memory_manager), _impl(std::make_unique()) { } -void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure(ITensor *input, - const ITensor *weights, - const ITensor *biases, - ITensor *output, const PadStrideInfo &conv_info, - unsigned int 
depth_multiplier, - const ActivationLayerInfo &act_info, - const Size2D &dilation) +void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure( + ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); @@ -82,9 +80,9 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal:: _impl->permute = is_nhwc; _impl->op = std::make_unique(); - ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; - _impl->op->configure(_impl->src->info(), _impl->weights->info(), _impl->biases == nullptr ? nullptr : _impl->biases->info(), - _impl->dst->info(), info); + ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; + _impl->op->configure(_impl->src->info(), _impl->weights->info(), + _impl->biases == nullptr ? nullptr : _impl->biases->info(), _impl->dst->info(), info); // Configure pipeline ActivationLayerInfo act_info_to_use = ActivationLayerInfo(); @@ -92,15 +90,15 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal:: const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info); bool is_activationlayer_enabled = act_info.enabled() && !(is_relu || is_relu6); - if(!is_activationlayer_enabled) + if (!is_activationlayer_enabled) { act_info_to_use = act_info; } - info = ConvolutionInfo{ conv_info, depth_multiplier, act_info_to_use, dilation }; + info = ConvolutionInfo{conv_info, depth_multiplier, act_info_to_use, dilation}; auto dwc_optimized_func = std::make_unique(); - if(is_nhwc) + if (is_nhwc) { auto permute_input = std::make_unique(); auto permute_weights = std::make_unique(); @@ -122,7 +120,9 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal:: _impl->permuted_output.info()->set_quantization_info(output->info()->quantization_info()); // Configure optimized depthwise - dwc_optimized_func->configure(_impl->permuted_input.info(), _impl->permuted_weights.info(), biases == nullptr ? nullptr : biases->info(), _impl->permuted_output.info(), info); + dwc_optimized_func->configure(_impl->permuted_input.info(), _impl->permuted_weights.info(), + biases == nullptr ? nullptr : biases->info(), _impl->permuted_output.info(), + info); // Configure the function to transform the convoluted output to ACL's native ordering format NCHW _impl->permuted_output.info()->set_data_layout(DataLayout::NHWC); @@ -133,29 +133,33 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal:: } else { - dwc_optimized_func->configure(_impl->src->info(), _impl->weights->info(), biases == nullptr ? nullptr : biases->info(), _impl->dst->info(), info); + dwc_optimized_func->configure(_impl->src->info(), _impl->weights->info(), + biases == nullptr ? 
nullptr : biases->info(), _impl->dst->info(), info); } // Allocate memory based on the internal memory requirements experimental::MemoryRequirements mem_req = dwc_optimized_func->workspace(); - _impl->workspace.allocator()->init(TensorInfo(TensorShape{ mem_req[0].size + mem_req[0].alignment }, 1, DataType::S8), mem_req[0].alignment); - _impl->packed_weights.allocator()->init(TensorInfo(TensorShape{ mem_req[1].size + mem_req[1].alignment }, 1, DataType::S8), mem_req[1].alignment); + _impl->workspace.allocator()->init(TensorInfo(TensorShape{mem_req[0].size + mem_req[0].alignment}, 1, DataType::S8), + mem_req[0].alignment); + _impl->packed_weights.allocator()->init( + TensorInfo(TensorShape{mem_req[1].size + mem_req[1].alignment}, 1, DataType::S8), mem_req[1].alignment); _memory_group.manage(&_impl->workspace); _memory_group.manage(&_impl->packed_weights); _impl->workspace.allocator()->allocate(); _impl->packed_weights.allocator()->allocate(); } -Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo *input, - const ITensorInfo *weights, - const ITensorInfo *biases, - const ITensorInfo *output, - const PadStrideInfo &conv_info, - unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, - const Size2D &dilation) +Status +NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { - ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; + ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info); } @@ -180,15 +184,15 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal:: void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { // Permute weights - if(_impl->permute) + if (_impl->permute) { _impl->permuted_weights.allocator()->allocate(); } - if(!_impl->permuted_weights.is_used()) + if (!_impl->permuted_weights.is_used()) { _impl->permuted_weights.allocator()->free(); } @@ -202,14 +206,14 @@ struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::Impl Tensor permuted_input{}; Tensor permuted_weights{}; Tensor permuted_output{}; - bool is_prepared{ false }; - bool is_nchw{ false }; - bool is_activationlayer_enabled{ false }; - const ITensor *weights{ nullptr }; - const ITensor *biases{ nullptr }; - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::shared_ptr op{ nullptr }; + bool is_prepared{false}; + bool is_nchw{false}; + bool is_activationlayer_enabled{false}; + const ITensor *weights{nullptr}; + const ITensor *biases{nullptr}; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::shared_ptr op{nullptr}; }; NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConvolutionLayerGeneric() @@ -217,14 +221,21 @@ NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConv { } -void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) 
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; + const ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; _impl->op = std::make_unique(); - _impl->op->configure(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), info); + _impl->op->configure(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), + info); _impl->src = input; _impl->dst = output; @@ -236,7 +247,7 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure( ITensor *input_to_use = input; const ITensor *weights_to_use = weights; ITensor *output_to_use = output; - if(_impl->is_nchw) + if (_impl->is_nchw) { auto permute_input = std::make_unique(); auto permute_weights = std::make_unique(); @@ -249,14 +260,16 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure( _impl->permuted_weights.info()->set_data_layout(DataLayout::NHWC); weights_to_use = &_impl->permuted_weights; - _impl->permuted_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape())); + _impl->permuted_output.allocator()->init( + output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape())); output_to_use = &_impl->permuted_output; } auto depthwise_conv_kernel = std::make_unique(); - depthwise_conv_kernel->configure(input_to_use->info(), weights_to_use->info(), biases == nullptr ? nullptr : biases->info(), output_to_use->info(), info); + depthwise_conv_kernel->configure(input_to_use->info(), weights_to_use->info(), + biases == nullptr ? 
nullptr : biases->info(), output_to_use->info(), info); - if(_impl->is_nchw) + if (_impl->is_nchw) { auto permute_output = std::make_unique(); permute_output->configure(_impl->permuted_output.info(), output->info(), PermutationVector(1U, 2U, 0U)); @@ -268,11 +281,16 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure( } } -Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, +Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { - ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; + ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info); } @@ -298,49 +316,64 @@ NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer(std::shared_ptr op{ nullptr }; + std::shared_ptr op{nullptr}; }; #endif // DOXYGEN_SKIP_THIS -void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, const Size2D &dilation) +void NEDepthwiseConvolutionLayer::configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_LOG_PARAMS(input, weights, output, conv_info, depth_multiplier, biases, act_info, dilation); - ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), - output->info(), conv_info, depth_multiplier, act_info, dilation)); + ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate( + input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), output->info(), conv_info, + depth_multiplier, act_info, dilation)); - const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; + const ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; _impl->op = std::make_shared(); - _impl->depth_conv_func = _impl->op->get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), - info); - switch(_impl->depth_conv_func) + _impl->depth_conv_func = _impl->op->get_depthwiseconvolution_function( + input->info(), weights->info(), (biases != nullptr) ? 
biases->info() : nullptr, output->info(), info); + switch (_impl->depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: - _impl->func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); + _impl->func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, + dilation); break; case DepthwiseConvolutionFunction::GENERIC: - _impl->func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); + _impl->func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, + dilation); break; default: ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction"); } } -Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) +Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { - ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; + ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info); } void NEDepthwiseConvolutionLayer::run() { - switch(_impl->depth_conv_func) + switch (_impl->depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: _impl->func_optimized.run(); @@ -355,7 +388,7 @@ void NEDepthwiseConvolutionLayer::run() void NEDepthwiseConvolutionLayer::prepare() { - switch(_impl->depth_conv_func) + switch (_impl->depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: _impl->func_optimized.prepare(); diff --git a/src/runtime/NEON/functions/NEDequantizationLayer.cpp b/src/runtime/NEON/functions/NEDequantizationLayer.cpp index 83e0131c8301ca2ee8b51a534b03dde36958ceeb..28d19d2950493cc103f36194a61ee98940dda6e4 100644 --- a/src/runtime/NEON/functions/NEDequantizationLayer.cpp +++ b/src/runtime/NEON/functions/NEDequantizationLayer.cpp @@ -26,19 +26,19 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Tensor.h" + #include "src/cpu/operators/CpuDequantize.h" namespace arm_compute { struct NEDequantizationLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEDequantizationLayer::NEDequantizationLayer() - : _impl(std::make_unique()) +NEDequantizationLayer::NEDequantizationLayer() : _impl(std::make_unique()) { } NEDequantizationLayer::~NEDequantizationLayer() = default; diff --git a/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp b/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp index 1da8b012b3630ab7b3b4585232384bfc02f47c13..b347390162d4947d6111835e2f6c9a2a73eda57f 100644 --- a/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp +++ b/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include @@ -35,24 +36,36 @@ namespace arm_compute { 
NEDetectionPostProcessLayer::NEDetectionPostProcessLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _dequantize(), _detection_post_process(), _decoded_scores(), _run_dequantize(false) + : _memory_group(std::move(memory_manager)), + _dequantize(), + _detection_post_process(), + _decoded_scores(), + _run_dequantize(false) { } -void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, const ITensor *input_scores, const ITensor *input_anchors, - ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info) +void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, + const ITensor *input_scores, + const ITensor *input_anchors, + ITensor *output_boxes, + ITensor *output_classes, + ITensor *output_scores, + ITensor *num_detection, + DetectionPostProcessLayerInfo info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores); - ARM_COMPUTE_ERROR_THROW_ON(NEDetectionPostProcessLayer::validate(input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), output_classes->info(), - output_scores->info(), - num_detection->info(), info)); - ARM_COMPUTE_LOG_PARAMS(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores, num_detection, info); + ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, + output_scores); + ARM_COMPUTE_ERROR_THROW_ON(NEDetectionPostProcessLayer::validate( + input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), + output_classes->info(), output_scores->info(), num_detection->info(), info)); + ARM_COMPUTE_LOG_PARAMS(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores, + num_detection, info); const ITensor *input_scores_to_use = input_scores; DetectionPostProcessLayerInfo info_to_use = info; _run_dequantize = is_data_type_quantized(input_box_encoding->info()->data_type()); - if(_run_dequantize) + if (_run_dequantize) { _memory_group.manage(&_decoded_scores); @@ -61,26 +74,37 @@ void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, c input_scores_to_use = &_decoded_scores; // Create a new info struct to avoid dequantizing in the CPP layer - std::array scales_values{ info.scale_value_y(), info.scale_value_x(), info.scale_value_h(), info.scale_value_w() }; - DetectionPostProcessLayerInfo info_quantized(info.max_detections(), info.max_classes_per_detection(), info.nms_score_threshold(), info.iou_threshold(), info.num_classes(), - scales_values, info.use_regular_nms(), info.detection_per_class(), false); + std::array scales_values{info.scale_value_y(), info.scale_value_x(), info.scale_value_h(), + info.scale_value_w()}; + DetectionPostProcessLayerInfo info_quantized( + info.max_detections(), info.max_classes_per_detection(), info.nms_score_threshold(), info.iou_threshold(), + info.num_classes(), scales_values, info.use_regular_nms(), info.detection_per_class(), false); info_to_use = info_quantized; } - _detection_post_process.configure(input_box_encoding, input_scores_to_use, input_anchors, output_boxes, output_classes, output_scores, num_detection, info_to_use); + _detection_post_process.configure(input_box_encoding, input_scores_to_use, input_anchors, output_boxes, + output_classes, output_scores, num_detection, info_to_use); 
_decoded_scores.allocator()->allocate(); } -Status NEDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_scores, const ITensorInfo *input_anchors, - ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, DetectionPostProcessLayerInfo info) +Status NEDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, + const ITensorInfo *input_scores, + const ITensorInfo *input_anchors, + ITensorInfo *output_boxes, + ITensorInfo *output_classes, + ITensorInfo *output_scores, + ITensorInfo *num_detection, + DetectionPostProcessLayerInfo info) { bool run_dequantize = is_data_type_quantized(input_box_encoding->data_type()); - if(run_dequantize) + if (run_dequantize) { TensorInfo decoded_classes_info = input_scores->clone()->set_is_resizable(true).set_data_type(DataType::F32); ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(input_scores, &decoded_classes_info)); } - ARM_COMPUTE_RETURN_ON_ERROR(CPPDetectionPostProcessLayer::validate(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores, num_detection, info)); + ARM_COMPUTE_RETURN_ON_ERROR(CPPDetectionPostProcessLayer::validate(input_box_encoding, input_scores, input_anchors, + output_boxes, output_classes, output_scores, + num_detection, info)); return Status{}; } @@ -90,7 +114,7 @@ void NEDetectionPostProcessLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Decode scores if necessary - if(_run_dequantize) + if (_run_dequantize) { _dequantize.run(); } diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp index ef3d3d6055b8fcb7895e2793a0d694116b5c4d07..f1c2cf969f58b3ab77c1022136e439a460345738 100644 --- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp @@ -27,17 +27,18 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/cpu/operators/CpuDirectConv2d.h" namespace arm_compute { struct NEDirectConvolutionLayer::Impl { - ITensor *src{ nullptr }; - const ITensor *weights{ nullptr }; - const ITensor *bias{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + ITensor *src{nullptr}; + const ITensor *weights{nullptr}; + const ITensor *bias{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr memory_manager) @@ -46,17 +47,27 @@ NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptrsrc = input; _impl->weights = weights; _impl->bias = bias; _impl->dst = output; _impl->op = std::make_unique(_memory_manager); - _impl->op->configure(input->info(), weights->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), conv_info, act_info); + _impl->op->configure(input->info(), weights->info(), (bias != nullptr ? 
bias->info() : nullptr), output->info(), + conv_info, act_info); } -Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info, +Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *output, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) { return cpu::CpuDirectConv2d::validate(input, weights, bias, output, conv_info, act_info); diff --git a/src/runtime/NEON/functions/NEElementwiseOperations.cpp b/src/runtime/NEON/functions/NEElementwiseOperations.cpp index c958adf97c1a21292fbd9b80b9e6db9e97b16dc2..685ef2d4d74bfc28bb2ee929e9fa241328a1fb21 100644 --- a/src/runtime/NEON/functions/NEElementwiseOperations.cpp +++ b/src/runtime/NEON/functions/NEElementwiseOperations.cpp @@ -22,10 +22,11 @@ * SOFTWARE. */ #include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h" -#include "arm_compute/core/Validate.h" -#include "src/cpu/operators/CpuElementwise.h" #include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Validate.h" + +#include "src/cpu/operators/CpuElementwise.h" #include @@ -33,17 +34,16 @@ namespace arm_compute { struct NEElementwiseMax::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEElementwiseMax::NEElementwiseMax() - : _impl(std::make_unique()) +NEElementwiseMax::NEElementwiseMax() : _impl(std::make_unique()) { } -NEElementwiseMax::NEElementwiseMax(NEElementwiseMax &&) = default; +NEElementwiseMax::NEElementwiseMax(NEElementwiseMax &&) = default; NEElementwiseMax &NEElementwiseMax::operator=(NEElementwiseMax &&) = default; NEElementwiseMax::~NEElementwiseMax() = default; @@ -57,7 +57,10 @@ void NEElementwiseMax::configure(ITensor *input1, ITensor *input2, ITensor *outp _impl->op->configure(input1->info(), input2->info(), output->info()); } -Status NEElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status NEElementwiseMax::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return cpu::CpuElementwiseMax::validate(input1, input2, output); @@ -74,17 +77,16 @@ void NEElementwiseMax::run() struct NEElementwiseMin::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEElementwiseMin::NEElementwiseMin() - : _impl(std::make_unique()) +NEElementwiseMin::NEElementwiseMin() : _impl(std::make_unique()) { } -NEElementwiseMin::NEElementwiseMin(NEElementwiseMin &&) = default; +NEElementwiseMin::NEElementwiseMin(NEElementwiseMin &&) = default; NEElementwiseMin &NEElementwiseMin::operator=(NEElementwiseMin &&) = default; NEElementwiseMin::~NEElementwiseMin() = default; @@ -98,7 +100,10 @@ void NEElementwiseMin::configure(ITensor *input1, ITensor *input2, ITensor *outp _impl->op->configure(input1->info(), input2->info(), output->info()); } -Status NEElementwiseMin::validate(const ITensorInfo *input1, const 
ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status NEElementwiseMin::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return cpu::CpuElementwiseMin::validate(input1, input2, output); @@ -115,21 +120,23 @@ void NEElementwiseMin::run() struct NEElementwiseSquaredDiff::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEElementwiseSquaredDiff::NEElementwiseSquaredDiff() - : _impl(std::make_unique()) +NEElementwiseSquaredDiff::NEElementwiseSquaredDiff() : _impl(std::make_unique()) { } -NEElementwiseSquaredDiff::NEElementwiseSquaredDiff(NEElementwiseSquaredDiff &&) = default; +NEElementwiseSquaredDiff::NEElementwiseSquaredDiff(NEElementwiseSquaredDiff &&) = default; NEElementwiseSquaredDiff &NEElementwiseSquaredDiff::operator=(NEElementwiseSquaredDiff &&) = default; NEElementwiseSquaredDiff::~NEElementwiseSquaredDiff() = default; -void NEElementwiseSquaredDiff::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info) +void NEElementwiseSquaredDiff::configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); _impl->src_0 = input1; @@ -139,7 +146,10 @@ void NEElementwiseSquaredDiff::configure(ITensor *input1, ITensor *input2, ITens _impl->op->configure(input1->info(), input2->info(), output->info()); } -Status NEElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status NEElementwiseSquaredDiff::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return cpu::CpuElementwiseSquaredDiff::validate(input1, input2, output); @@ -156,21 +166,23 @@ void NEElementwiseSquaredDiff::run() struct NEElementwiseDivision::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEElementwiseDivision::NEElementwiseDivision() - : _impl(std::make_unique()) +NEElementwiseDivision::NEElementwiseDivision() : _impl(std::make_unique()) { } -NEElementwiseDivision::NEElementwiseDivision(NEElementwiseDivision &&) = default; +NEElementwiseDivision::NEElementwiseDivision(NEElementwiseDivision &&) = default; NEElementwiseDivision &NEElementwiseDivision::operator=(NEElementwiseDivision &&) = default; NEElementwiseDivision::~NEElementwiseDivision() = default; -void NEElementwiseDivision::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info) +void NEElementwiseDivision::configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); _impl->src_0 = input1; @@ -180,7 +192,10 @@ void NEElementwiseDivision::configure(ITensor *input1, ITensor *input2, ITensor _impl->op->configure(input1->info(), input2->info(), output->info()); } -Status NEElementwiseDivision::validate(const ITensorInfo 
*input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status NEElementwiseDivision::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return cpu::CpuElementwiseDivision::validate(input1, input2, output); @@ -197,21 +212,23 @@ void NEElementwiseDivision::run() struct NEElementwisePower::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEElementwisePower::NEElementwisePower() - : _impl(std::make_unique()) +NEElementwisePower::NEElementwisePower() : _impl(std::make_unique()) { } -NEElementwisePower::NEElementwisePower(NEElementwisePower &&) = default; +NEElementwisePower::NEElementwisePower(NEElementwisePower &&) = default; NEElementwisePower &NEElementwisePower::operator=(NEElementwisePower &&) = default; NEElementwisePower::~NEElementwisePower() = default; -void NEElementwisePower::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info) +void NEElementwisePower::configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); _impl->src_0 = input1; @@ -221,7 +238,10 @@ void NEElementwisePower::configure(ITensor *input1, ITensor *input2, ITensor *ou _impl->op->configure(input1->info(), input2->info(), output->info()); } -Status NEElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status NEElementwisePower::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return cpu::CpuElementwisePower::validate(input1, input2, output); @@ -239,22 +259,22 @@ void NEElementwisePower::run() template struct NEElementwiseComparisonStatic::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr> op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr> op{nullptr}; }; template -NEElementwiseComparisonStatic::NEElementwiseComparisonStatic() - : _impl(std::make_unique()) +NEElementwiseComparisonStatic::NEElementwiseComparisonStatic() : _impl(std::make_unique()) { } template NEElementwiseComparisonStatic::NEElementwiseComparisonStatic(NEElementwiseComparisonStatic &&) = default; -template -NEElementwiseComparisonStatic &NEElementwiseComparisonStatic::operator=(NEElementwiseComparisonStatic &&) = default; -template +template +NEElementwiseComparisonStatic & +NEElementwiseComparisonStatic::operator=(NEElementwiseComparisonStatic &&) = default; +template NEElementwiseComparisonStatic::~NEElementwiseComparisonStatic() = default; template @@ -268,13 +288,15 @@ void NEElementwiseComparisonStatic::configure(ITensor *input1, ITensor *inp } template -Status NEElementwiseComparisonStatic::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +Status NEElementwiseComparisonStatic::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output) { return cpu::CpuElementwiseComparisonStatic::validate(input1, 
input2, output); } template -void NEElementwiseComparisonStatic::run() +void NEElementwiseComparisonStatic::run() { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0); @@ -285,17 +307,16 @@ void NEElementwiseComparisonStatic::run() struct NEElementwiseComparison::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEElementwiseComparison::NEElementwiseComparison() - : _impl(std::make_unique()) +NEElementwiseComparison::NEElementwiseComparison() : _impl(std::make_unique()) { } -NEElementwiseComparison::NEElementwiseComparison(NEElementwiseComparison &&) = default; +NEElementwiseComparison::NEElementwiseComparison(NEElementwiseComparison &&) = default; NEElementwiseComparison &NEElementwiseComparison::operator=(NEElementwiseComparison &&) = default; NEElementwiseComparison::~NEElementwiseComparison() = default; @@ -308,7 +329,10 @@ void NEElementwiseComparison::configure(ITensor *input1, ITensor *input2, ITenso _impl->op->configure(input1->info(), input2->info(), output->info(), op); } -Status NEElementwiseComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op) +Status NEElementwiseComparison::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ComparisonOperation op) { return cpu::CpuElementwiseComparison::validate(input1, input2, output, op); } diff --git a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp index a0674ec320c6670f8c64ae1442cba0fb6f302ef7..23a092c4076ee96f16b5af14dfae1f885d8f0c2b 100644 --- a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp +++ b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp @@ -22,7 +22,9 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h" + #include "src/cpu/operators/CpuElementwiseUnary.h" + #include namespace arm_compute @@ -32,21 +34,20 @@ using OperatorType = cpu::CpuElementwiseUnary; template struct NEElementwiseUnaryLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr cpu_op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr cpu_op{nullptr}; }; template -NEElementwiseUnaryLayer::NEElementwiseUnaryLayer() - : _impl(std::make_unique()) +NEElementwiseUnaryLayer::NEElementwiseUnaryLayer() : _impl(std::make_unique()) { } template NEElementwiseUnaryLayer::~NEElementwiseUnaryLayer() = default; template NEElementwiseUnaryLayer::NEElementwiseUnaryLayer(NEElementwiseUnaryLayer &&) = default; -template +template NEElementwiseUnaryLayer &NEElementwiseUnaryLayer::operator=(NEElementwiseUnaryLayer &&) = default; template @@ -65,7 +66,7 @@ Status NEElementwiseUnaryLayer::validate(const ITensorInfo *input, const ITe } template -void NEElementwiseUnaryLayer::run() +void NEElementwiseUnaryLayer::run() { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, _impl->src); diff --git a/src/runtime/NEON/functions/NEFFT1D.cpp b/src/runtime/NEON/functions/NEFFT1D.cpp index 343b817ebabb964f19887805f9e6abd48ec96f69..fb75f9da29c7ba9bfdb8d999eb1d71692492bcd5 100644 --- a/src/runtime/NEON/functions/NEFFT1D.cpp +++ b/src/runtime/NEON/functions/NEFFT1D.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h" #include "src/core/NEON/kernels/NEFFTRadixStageKernel.h" @@ -37,7 +38,15 @@ namespace arm_compute NEFFT1D::~NEFFT1D() = default; NEFFT1D::NEFFT1D(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _digit_reverse_kernel(), _fft_kernels(), _scale_kernel(), _digit_reversed_input(), _digit_reverse_indices(), _num_ffts(0), _axis(0), _run_scale(false) + : _memory_group(std::move(memory_manager)), + _digit_reverse_kernel(), + _fft_kernels(), + _scale_kernel(), + _digit_reversed_input(), + _digit_reverse_indices(), + _num_ffts(0), + _axis(0), + _run_scale(false) { } @@ -74,7 +83,7 @@ void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo & _fft_kernels.resize(_num_ffts); _axis = config.axis; - for(unsigned int i = 0; i < _num_ffts; ++i) + for (unsigned int i = 0; i < _num_ffts; ++i) { const unsigned int radix_for_stage = decomposed_vector.at(i); @@ -84,19 +93,21 @@ void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo & fft_kernel_info.Nx = Nx; fft_kernel_info.is_first_stage = (i == 0); _fft_kernels[i] = std::make_unique(); - _fft_kernels[i]->configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info); + _fft_kernels[i]->configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, + fft_kernel_info); Nx *= radix_for_stage; } // Configure scale kernel - if(_run_scale) + if (_run_scale) { FFTScaleKernelInfo scale_config; scale_config.scale = static_cast(N); scale_config.conjugate = config.direction == FFTDirection::Inverse; _scale_kernel = std::make_unique(); - is_c2r ? _scale_kernel->configure(&_digit_reversed_input, output, scale_config) : _scale_kernel->configure(output, nullptr, scale_config); + is_c2r ? 
_scale_kernel->configure(&_digit_reversed_input, output, scale_config) + : _scale_kernel->configure(output, nullptr, scale_config); } // Allocate tensors @@ -113,7 +124,7 @@ Status NEFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(std::set({ 0, 1 }).count(config.axis) == 0); + ARM_COMPUTE_RETURN_ERROR_ON(std::set({0, 1}).count(config.axis) == 0); // Check if FFT is decomposable const auto supported_radix = NEFFTRadixStageKernel::supported_radix(); @@ -122,7 +133,7 @@ Status NEFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty()); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { // All combinations are supported except real input with real output (i.e., both input channels set to 1) ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() == 1 && input->num_channels() == 1); @@ -140,13 +151,13 @@ void NEFFT1D::run() NEScheduler::get().schedule(_digit_reverse_kernel.get(), (_axis == 0 ? Window::DimY : Window::DimZ)); - for(unsigned int i = 0; i < _num_ffts; ++i) + for (unsigned int i = 0; i < _num_ffts; ++i) { NEScheduler::get().schedule(_fft_kernels[i].get(), (_axis == 0 ? Window::DimY : Window::DimX)); } // Run output scaling - if(_run_scale) + if (_run_scale) { NEScheduler::get().schedule(_scale_kernel.get(), Window::DimY); } diff --git a/src/runtime/NEON/functions/NEFFT2D.cpp b/src/runtime/NEON/functions/NEFFT2D.cpp index ab422bd2ae0667b0b5af6c2f40ac8b9fcab2fb04..066909221d40aff274560b376273ef6a98fb70f3 100644 --- a/src/runtime/NEON/functions/NEFFT2D.cpp +++ b/src/runtime/NEON/functions/NEFFT2D.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Scheduler.h" + #include "src/common/utils/Log.h" namespace arm_compute @@ -33,7 +34,10 @@ namespace arm_compute NEFFT2D::~NEFFT2D() = default; NEFFT2D::NEFFT2D(std::shared_ptr memory_manager) - : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor() + : _memory_group(memory_manager), + _first_pass_func(memory_manager), + _second_pass_func(memory_manager), + _first_pass_tensor() { } @@ -78,7 +82,7 @@ Status NEFFT2D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ON_ERROR(NEFFT1D::validate(&first_pass_tensor, output, second_pass_config)); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); diff --git a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp index 0551d756fb06981e526f25223bc3ed1751a78d03..94f85e5ffaa2ee2d27c185a439e991f2ad2bdde9 100644 --- a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp @@ -25,15 +25,16 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include 
"arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" #include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h" #include "src/core/NEON/kernels/NEFFTRadixStageKernel.h" #include "src/core/NEON/kernels/NEFFTScaleKernel.h" #include "src/core/NEON/kernels/NEPadLayerKernel.h" #include "src/core/NEON/kernels/NEReductionOperationKernel.h" -#include "src/core/helpers/AutoConfiguration.h" #include "src/core/utils/helpers/fft.h" namespace arm_compute @@ -46,11 +47,11 @@ int pad_decomposable(int N) int pad = 0; bool is_decomposed = false; - while(!is_decomposed) + while (!is_decomposed) { const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N++, supported_radix); is_decomposed = !decomposed_vector.empty(); - if(!is_decomposed) + if (!is_decomposed) { ++pad; } @@ -102,8 +103,13 @@ NEFFTConvolutionLayer::NEFFTConvolutionLayer(std::shared_ptr mem } NEFFTConvolutionLayer::~NEFFTConvolutionLayer() = default; -void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +void NEFFTConvolutionLayer::configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_UNUSED(enable_fast_math); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info, enable_fast_math); @@ -115,21 +121,24 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co _has_bias = biases != nullptr; // Get indices for the width and height - const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); + const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_height = + get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); // Input shape, kernel size and output tile - const Size2D input_dims = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]); - const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]); - const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1), - pad_decomposable(input_dims.y() + kernel_size.y() - 1)); + const Size2D input_dims = + Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]); + const Size2D kernel_size = + Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]); + const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1), + pad_decomposable(input_dims.y() + kernel_size.y() - 1)); // Tensors to use ITensor *input_to_use = input; const ITensor *weights_to_use = weights; ITensor *output_to_use = _has_bias ? 
&_bias_output : output; // Permute bias - if(biases != nullptr) + if (biases != nullptr) { _permute_bias_func.configure(biases, &_permuted_bias, PermutationVector(1U, 2U, 0U)); _permuted_bias.info()->set_data_layout(DataLayout::NCHW); @@ -137,7 +146,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co // Permute input if needed _needs_permute = input->info()->data_layout() == DataLayout::NHWC; - if(_needs_permute) + if (_needs_permute) { _memory_group.manage(&_permuted_input); // Configure the function to transform the input tensor from NHWC -> NCHW @@ -158,7 +167,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co _flip_weights_func.configure(weights_to_use, &_flipped_weights, &_flip_axis); // Pad weights - const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } }; + const PaddingList padding_w = {{0, input_dims.x() + pad_valid.x() - 1}, {0, input_dims.y() + pad_valid.y() - 1}}; _pad_weights_func.configure(&_flipped_weights, &_padded_weights, padding_w); // Transform weights @@ -166,10 +175,10 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co _transform_weights_func->configure(&_padded_weights, &_transformed_weights, FFT2DInfo()); // Pad input - const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } }; + const PaddingList padding_in = {{0, kernel_size.x() + pad_valid.x() - 1}, {0, kernel_size.y() + pad_valid.y() - 1}}; _memory_group.manage(&_padded_input); _pad_input_func.configure(input_to_use, &_padded_input, padding_in); - if(_needs_permute) + if (_needs_permute) { _permuted_input.allocator()->allocate(); } @@ -193,7 +202,8 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co _memory_group.manage(&_itransformed_output); FFT2DInfo itranform_info; itranform_info.direction = FFTDirection::Inverse; - _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding()); + _itransformed_output.allocator()->init( + _output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding()); _itransform_output_func.configure(&_output_reduced, &_itransformed_output, itranform_info); _output_reduced.allocator()->allocate(); @@ -205,26 +215,29 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co // Extract correct region const int start_left = kernel_size.x() - conv_info.pad_left() - 1; const int start_top = kernel_size.y() - conv_info.pad_top() - 1; - const int end_right = _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x(); - const int end_botton = _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y(); - if(_has_bias) + const int end_right = + _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x(); + const int end_botton = + _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y(); + if (_has_bias) { _memory_group.manage(&_bias_output); } - else if(_needs_permute) + else if (_needs_permute) { output_to_use = &_permuted_output; _memory_group.manage(&_permuted_output); } - _extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_botton)); + 
_extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top), + Coordinates(end_right, end_botton)); _reshaped_output.allocator()->allocate(); _itransformed_output.allocator()->allocate(); // Add bias - if(biases != nullptr) + if (biases != nullptr) { output_to_use = output; - if(_needs_permute) + if (_needs_permute) { output_to_use = &_permuted_output; _memory_group.manage(&_permuted_output); @@ -235,7 +248,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co } // Permute output - if(_needs_permute) + if (_needs_permute) { // Configure the function to transform the convoluted output to ACL's native ordering format NCHW _permuted_output.info()->set_data_layout(DataLayout::NCHW); @@ -247,7 +260,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co // Configure Activation Layer _is_activationlayer_enabled = act_info.enabled(); - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activation_layer_func.configure(output, nullptr, act_info); } @@ -260,8 +273,13 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co axis_data[1] = 1; } -Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_UNUSED(enable_fast_math); @@ -279,11 +297,13 @@ Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorIn const auto strides = conv_info.stride(); ARM_COMPUTE_RETURN_ERROR_ON(strides.first != strides.second && strides.first != 1); ARM_COMPUTE_RETURN_ERROR_ON(kernel_size.x() != kernel_size.y()); - ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || conv_info.pad_right() != (kernel_size.x() / 2)); - ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || conv_info.pad_bottom() != (kernel_size.y() / 2)); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || + conv_info.pad_right() != (kernel_size.x() / 2)); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || + conv_info.pad_bottom() != (kernel_size.y() / 2)); // Validate biases - if(biases != nullptr) + if (biases != nullptr) { const size_t idx_channels = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); @@ -291,13 +311,14 @@ Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorIn } // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width])); + ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || + (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width])); // Validate Activation Layer - if(act_info.enabled()) + if 
(act_info.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info)); } @@ -313,7 +334,7 @@ void NEFFTConvolutionLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Transform input - if(_needs_permute) + if (_needs_permute) { _permute_input_func.run(); } @@ -331,17 +352,17 @@ void NEFFTConvolutionLayer::run() _extract_output_func.run(); // Add bias - if(_has_bias) + if (_has_bias) { _bias_add_func.run(); } - if(_needs_permute) + if (_needs_permute) { _permute_output_func.run(); } // Run activation layer - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activation_layer_func.run(); } @@ -349,10 +370,10 @@ void NEFFTConvolutionLayer::run() void NEFFTConvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { // Permute bias to NCHW - if(_original_bias != nullptr) + if (_original_bias != nullptr) { _permuted_bias.allocator()->allocate(); _permute_bias_func.run(); @@ -362,7 +383,7 @@ void NEFFTConvolutionLayer::prepare() const ITensor *cur_weights = _original_weights; // Permute weights - if(_needs_permute) + if (_needs_permute) { ARM_COMPUTE_ERROR_ON(!cur_weights->is_used()); diff --git a/src/runtime/NEON/functions/NEFill.cpp b/src/runtime/NEON/functions/NEFill.cpp index 43667783bfbc218c3ad39815decdd529479cfc08..bc1d5b7f5cde354a63dae13985d24a63acca2561 100644 --- a/src/runtime/NEON/functions/NEFill.cpp +++ b/src/runtime/NEON/functions/NEFill.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEFill.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuFill.h" #include @@ -32,15 +33,14 @@ namespace arm_compute { struct NEFill::Impl { - ITensor *tensor{ nullptr }; - std::unique_ptr op{ nullptr }; + ITensor *tensor{nullptr}; + std::unique_ptr op{nullptr}; }; -NEFill::NEFill() - : _impl(std::make_unique()) +NEFill::NEFill() : _impl(std::make_unique()) { } -NEFill::NEFill(NEFill &&) = default; +NEFill::NEFill(NEFill &&) = default; NEFill &NEFill::operator=(NEFill &&) = default; NEFill::~NEFill() = default; diff --git a/src/runtime/NEON/functions/NEFillBorder.cpp b/src/runtime/NEON/functions/NEFillBorder.cpp index d633e340f87002bc84cf362d9d9afcfcd8db43a8..a3ab9c3db4a1e009297354844db422c272c0480e 100644 --- a/src/runtime/NEON/functions/NEFillBorder.cpp +++ b/src/runtime/NEON/functions/NEFillBorder.cpp @@ -25,17 +25,20 @@ #include "arm_compute/core/Window.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEFillBorderKernel.h" namespace arm_compute { -NEFillBorder::NEFillBorder() - : _border_handler(nullptr) +NEFillBorder::NEFillBorder() : _border_handler(nullptr) { } -void NEFillBorder::configure(ITensor *input, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value) +void NEFillBorder::configure(ITensor *input, + unsigned int border_width, + BorderMode border_mode, + const PixelValue &constant_border_value) { ARM_COMPUTE_LOG_PARAMS(input, border_width, border_mode, constant_border_value); _border_handler = std::make_unique(); diff --git a/src/runtime/NEON/functions/NEFlattenLayer.cpp b/src/runtime/NEON/functions/NEFlattenLayer.cpp index f4358426344be3ed0a9b721da8f5d91daa1897a9..56db2be3fa22c7948e0b7adb585c9e3e0ff2fcc2 100644 --- a/src/runtime/NEON/functions/NEFlattenLayer.cpp +++ b/src/runtime/NEON/functions/NEFlattenLayer.cpp @@ -24,8 +24,9 @@ #include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h" #include "arm_compute/core/TensorInfo.h" -#include 
"arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/cpu/operators/CpuFlatten.h" @@ -33,16 +34,15 @@ namespace arm_compute { struct NEFlattenLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEFlattenLayer::NEFlattenLayer() - : _impl(std::make_unique()) +NEFlattenLayer::NEFlattenLayer() : _impl(std::make_unique()) { } -NEFlattenLayer::NEFlattenLayer(NEFlattenLayer &&) = default; +NEFlattenLayer::NEFlattenLayer(NEFlattenLayer &&) = default; NEFlattenLayer &NEFlattenLayer::operator=(NEFlattenLayer &&) = default; NEFlattenLayer::~NEFlattenLayer() = default; @@ -51,7 +51,8 @@ void NEFlattenLayer::configure(const ITensor *input, ITensor *output) ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); _impl->src = input; _impl->dst = output; - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input->info()))); + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape( + misc::shape_calculator::compute_flatten_shape(input->info()))); _impl->op = std::make_unique(); _impl->op->configure(_impl->src->info(), _impl->dst->info()); @@ -60,9 +61,10 @@ void NEFlattenLayer::configure(const ITensor *input, ITensor *output) Status NEFlattenLayer::validate(const ITensorInfo *input, const ITensorInfo *output) { // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input)); + const TensorInfo tensor_info_output = + input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); } return cpu::CpuFlatten::validate(input, output); diff --git a/src/runtime/NEON/functions/NEFloor.cpp b/src/runtime/NEON/functions/NEFloor.cpp index d2dc48a15951ed0be85320e23da1388c4b7377a1..112c93c478cbc21714042307cc8dd510573447a9 100644 --- a/src/runtime/NEON/functions/NEFloor.cpp +++ b/src/runtime/NEON/functions/NEFloor.cpp @@ -24,22 +24,22 @@ #include "arm_compute/runtime/NEON/functions/NEFloor.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuFloor.h" namespace arm_compute { struct NEFloor::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEFloor::NEFloor() - : _impl(std::make_unique()) +NEFloor::NEFloor() : _impl(std::make_unique()) { } -NEFloor::NEFloor(NEFloor &&) = default; +NEFloor::NEFloor(NEFloor &&) = default; NEFloor &NEFloor::operator=(NEFloor &&) = default; NEFloor::~NEFloor() = default; diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp index 891487efd3e915e1107f9f7ff13020f173f32b1d..2656d0fa0fec959b63d9f171cf3a5b4968214d96 100644 --- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp +++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h" + #include 
"src/common/utils/Log.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuFullyConnected.h" @@ -38,80 +39,90 @@ using namespace arm_compute::experimental; struct NEFullyConnectedLayer::Impl { MemoryGroup memory_group{}; - IWeightsManager *weights_manager{ nullptr }; + IWeightsManager *weights_manager{nullptr}; - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; - const ITensor *original_weights{ nullptr }; + const ITensor *original_weights{nullptr}; ITensorPack run_pack{}; WorkspaceData workspace{}; experimental::MemoryRequirements aux_mem_req{}; - bool is_prepared{ false }; - bool dynamic_weights{ false }; + bool is_prepared{false}; + bool dynamic_weights{false}; }; NEFullyConnectedLayer::~NEFullyConnectedLayer() = default; -NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr memory_manager, IWeightsManager *weights_manager) +NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr memory_manager, + IWeightsManager *weights_manager) : _impl(std::make_unique()) { _impl->memory_group = MemoryGroup(std::move(memory_manager)); _impl->weights_manager = weights_manager; } -void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, - FullyConnectedLayerInfo fc_info, const WeightsInfo &weights_info) +void NEFullyConnectedLayer::configure(const ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + FullyConnectedLayerInfo fc_info, + const WeightsInfo &weights_info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayer::validate(input->info(), - weights->info(), + ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayer::validate(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, - output->info(), - fc_info, - weights_info)); + output->info(), fc_info, weights_info)); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, fc_info); _impl->op = std::make_unique(); _impl->original_weights = weights; _impl->is_prepared = false; - _impl->op->configure(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), fc_info, weights_info); + _impl->op->configure(input->info(), weights->info(), (biases != nullptr) ? 
biases->info() : nullptr, output->info(), + fc_info, weights_info); - if(_impl->weights_manager != nullptr) + if (_impl->weights_manager != nullptr) { _impl->weights_manager->manage(_impl->original_weights); } _impl->aux_mem_req = _impl->op->workspace(); - _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } }; - _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); - - _impl->dynamic_weights = - !weights->info()->are_values_constant() && - fc_info.transpose_weights && - !fc_info.are_weights_reshaped && - !fc_info.retain_internal_weights; + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->workspace = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); + + _impl->dynamic_weights = !weights->info()->are_values_constant() && fc_info.transpose_weights && + !fc_info.are_weights_reshaped && !fc_info.retain_internal_weights; } -Status NEFullyConnectedLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *biases, const ITensorInfo *output, const FullyConnectedLayerInfo &fc_info, - const WeightsInfo &weights_info) +Status NEFullyConnectedLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const FullyConnectedLayerInfo &fc_info, + const WeightsInfo &weights_info) { - return cpu::CpuFullyConnected::has_opt_impl(expected_weight_format, input, weights, biases, output, fc_info, weights_info); + return cpu::CpuFullyConnected::has_opt_impl(expected_weight_format, input, weights, biases, output, fc_info, + weights_info); } -Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - FullyConnectedLayerInfo fc_info, const WeightsInfo &weights_info) +Status NEFullyConnectedLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + FullyConnectedLayerInfo fc_info, + const WeightsInfo &weights_info) { return cpu::CpuFullyConnected::validate(input, weights, biases, output, fc_info, weights_info); } void NEFullyConnectedLayer::run() { - if(!_impl->dynamic_weights) + if (!_impl->dynamic_weights) { prepare(); } @@ -122,7 +133,7 @@ void NEFullyConnectedLayer::run() void NEFullyConnectedLayer::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->run_pack); @@ -131,13 +142,13 @@ void NEFullyConnectedLayer::prepare() _impl->is_prepared = true; // Handle weights managed infrastructure - if(_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights)) + if (_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights)) { // Ensure that b gets marked as unused (memory released) only after the last function which uses b also finishes its prepare // This is for cases where multiple functions share the same b (weights) // Therefore when a function marks original b as unused, we pre-mark it in weights manager, and mark it back to used so that it doesn't get released before its last reference const ITensor *original_b = _impl->original_weights; - if(!original_b->is_used()) + if (!original_b->is_used()) 
{ _impl->weights_manager->pre_mark_as_unused(original_b); } diff --git a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp index 6612845d86825da16edff7f2b91d15d6c4069108..f5b8b57dacf0aeea71c24a4ee0f86d45a670192d 100644 --- a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp +++ b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h" @@ -35,29 +36,42 @@ namespace arm_compute { NEFuseBatchNormalization::~NEFuseBatchNormalization() = default; -NEFuseBatchNormalization::NEFuseBatchNormalization() - : _fuse_bn_kernel() +NEFuseBatchNormalization::NEFuseBatchNormalization() : _fuse_bn_kernel() { } -void NEFuseBatchNormalization::configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var, - ITensor *fused_weights, ITensor *fused_bias, - const ITensor *input_bias, const ITensor *bn_beta, const ITensor *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +void NEFuseBatchNormalization::configure(const ITensor *input_weights, + const ITensor *bn_mean, + const ITensor *bn_var, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *input_bias, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, - bn_beta, bn_gamma, epsilon, fbn_type); + ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, + epsilon, fbn_type); _fuse_bn_kernel = std::make_unique(); - _fuse_bn_kernel->configure(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + _fuse_bn_kernel->configure(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, + epsilon, fbn_type); } -Status NEFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +Status NEFuseBatchNormalization::validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias, + const ITensorInfo *bn_beta, + const ITensorInfo *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - return NEFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + return NEFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, + input_bias, bn_beta, bn_gamma, epsilon, fbn_type); } void NEFuseBatchNormalization::run() diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp index e51f2f9eb609873fbac41edc3fed1757c29d2ef6..934a8250cccc81ec743fb58c65b4ec8b60346d26 100644 --- a/src/runtime/NEON/functions/NEGEMM.cpp +++ b/src/runtime/NEON/functions/NEGEMM.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/Types.h" #include 
"arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/CPP/Validate.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuGemm.h" @@ -39,12 +40,12 @@ namespace arm_compute struct NEGEMM::Impl { MemoryGroup memory_group{}; - IWeightsManager *weights_manager{ nullptr }; + IWeightsManager *weights_manager{nullptr}; - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; - const ITensor *original_b{ nullptr }; - bool is_prepared{ false }; + const ITensor *original_b{nullptr}; + bool is_prepared{false}; ITensorPack run_pack{}; ITensorPack prep_pack{}; @@ -61,10 +62,17 @@ NEGEMM::NEGEMM(std::shared_ptr memory_manager, IWeightsManager * NEGEMM::~NEGEMM() = default; -void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info) +void NEGEMM::configure(const ITensor *a, + const ITensor *b, + const ITensor *c, + ITensor *d, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); - ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuGemm::validate(a->info(), b->info(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, gemm_info)); + ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuGemm::validate(a->info(), b->info(), (c != nullptr) ? c->info() : nullptr, + d->info(), alpha, beta, gemm_info)); // Check if we need to reshape the matrix B only on the first run _impl->is_prepared = false; @@ -73,24 +81,32 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe // Make the B matrix dynamic values. auto b_info_to_use = b->info()->clone(); - if(!gemm_info.reshape_b_only_on_first_run()) + if (!gemm_info.reshape_b_only_on_first_run()) { b_info_to_use->set_are_values_constant(false); } - _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, gemm_info); + _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, + gemm_info); _impl->aux_mem_req = _impl->op->workspace(); - _impl->run_pack = { { ACL_SRC_0, a }, { ACL_SRC_1, b }, { ACL_SRC_2, c }, { ACL_DST, d } }; - _impl->prep_pack = { { ACL_SRC_1, b }, { ACL_SRC_2, c } }; - _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_1, b}, {ACL_SRC_2, c}, {ACL_DST, d}}; + _impl->prep_pack = {{ACL_SRC_1, b}, {ACL_SRC_2, c}}; + _impl->workspace = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } -Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status NEGEMM::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { // Make the B matrix dynamic values. 
auto b_to_use = b->clone(); - if(!gemm_info.reshape_b_only_on_first_run()) + if (!gemm_info.reshape_b_only_on_first_run()) { b_to_use->set_are_values_constant(false); } @@ -98,8 +114,14 @@ Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso return cpu::CpuGemm::validate(a, b_to_use.get(), c, output, alpha, beta, gemm_info); } -Status NEGEMM::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, - float alpha, float beta, const GEMMInfo &gemm_info) +Status NEGEMM::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha, beta); return cpu::CpuGemm::has_opt_impl(expected_weight_format, a, b, c, output, gemm_info); @@ -115,15 +137,15 @@ void NEGEMM::run() void NEGEMM::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->prep_pack); - auto has_reshape = std::find_if(_impl->aux_mem_req.begin(), - _impl->aux_mem_req.end(), - [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); - if(has_reshape != std::end(_impl->aux_mem_req)) + if (has_reshape != std::end(_impl->aux_mem_req)) { _impl->original_b->mark_as_unused(); } diff --git a/src/runtime/NEON/functions/NEGEMMConv2d.cpp b/src/runtime/NEON/functions/NEGEMMConv2d.cpp index 42b8b704053987588a0fff9892c3d996d511e862..6cca02eea9a03018f32f1ddaea6681780ce7cda2 100644 --- a/src/runtime/NEON/functions/NEGEMMConv2d.cpp +++ b/src/runtime/NEON/functions/NEGEMMConv2d.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuGemmDirectConv2d.h" @@ -35,25 +36,25 @@ using namespace arm_compute::experimental; struct NEGEMMConv2d::Impl { - const ITensor *weights{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *weights{nullptr}; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; ITensorPack prep_pack{}; WorkspaceData workspace{}; MemoryGroup memory_group{}; - bool is_prepared{ false }; + bool is_prepared{false}; experimental::MemoryRequirements aux_mem_req{}; }; -NEGEMMConv2d::NEGEMMConv2d(const std::shared_ptr &memory_manager) - : _impl(std::make_unique()) +NEGEMMConv2d::NEGEMMConv2d(const std::shared_ptr &memory_manager) : _impl(std::make_unique()) { _impl->memory_group = MemoryGroup(memory_manager); } NEGEMMConv2d::~NEGEMMConv2d() = default; -void NEGEMMConv2d::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info) +void NEGEMMConv2d::configure( + ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); @@ -61,15 +62,21 @@ void NEGEMMConv2d::configure(ITensor *input, const ITensor *weights, const ITens _impl->is_prepared = false; _impl->op = std::make_unique(); - _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), info); + _impl->op->configure(input->info(), weights->info(), biases != nullptr ? 
biases->info() : nullptr, output->info(), + info); _impl->aux_mem_req = _impl->op->workspace(); - _impl->run_pack = { { TensorType::ACL_SRC_0, input }, { TensorType::ACL_SRC_2, biases }, { TensorType::ACL_DST, output } }; - _impl->prep_pack = { { TensorType::ACL_SRC_1, weights }, { TensorType::ACL_SRC_2, biases } }; - _impl->workspace = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->run_pack = {{TensorType::ACL_SRC_0, input}, {TensorType::ACL_SRC_2, biases}, {TensorType::ACL_DST, output}}; + _impl->prep_pack = {{TensorType::ACL_SRC_1, weights}, {TensorType::ACL_SRC_2, biases}}; + _impl->workspace = + manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); } -Status NEGEMMConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &info) +Status NEGEMMConv2d::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const Conv2dInfo &info) { return OperatorType::validate(input, weights, biases, output, info); } @@ -84,15 +91,15 @@ void NEGEMMConv2d::run() void NEGEMMConv2d::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->prep_pack); - auto has_reshape = std::find_if(_impl->aux_mem_req.begin(), - _impl->aux_mem_req.end(), - [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); - if(has_reshape != std::end(_impl->aux_mem_req)) + if (has_reshape != std::end(_impl->aux_mem_req)) { _impl->weights->mark_as_unused(); } diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp index fe3ea6a76742c4f9e7feba1de26032a00341eff9..c8f65d2fd972c317d1c5318ca43079a33cba37c6 100644 --- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuGemmConv2d.h" @@ -36,17 +37,18 @@ namespace arm_compute { struct NEGEMMConvolutionLayer::Impl { - const ITensor *weights{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *weights{nullptr}; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; MemoryGroup memory_group{}; - IWeightsManager *weights_manager{ nullptr }; + IWeightsManager *weights_manager{nullptr}; MemoryRequirements aux_mem_req{}; WorkspaceData workspace_tensors{}; - bool is_prepared{ false }; + bool is_prepared{false}; }; -NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr &memory_manager, IWeightsManager *weights_manager) +NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr &memory_manager, + IWeightsManager *weights_manager) : _impl(std::make_unique()) { _impl->weights_manager = weights_manager; @@ -54,37 +56,61 @@ NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptrweights = weights; _impl->op = std::make_unique(); - _impl->op->configure(input->info(), weights->info(), (biases != nullptr ? 
biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + _impl->op->configure(input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), + conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); - _impl->run_pack = - { - { TensorType::ACL_SRC_0, input }, - { TensorType::ACL_SRC_1, weights }, - { TensorType::ACL_SRC_2, biases }, - { TensorType::ACL_DST, output } - }; - _impl->aux_mem_req = _impl->op->workspace(); - _impl->workspace_tensors = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); + _impl->run_pack = {{TensorType::ACL_SRC_0, input}, + {TensorType::ACL_SRC_1, weights}, + {TensorType::ACL_SRC_2, biases}, + {TensorType::ACL_DST, output}}; + _impl->aux_mem_req = _impl->op->workspace(); + _impl->workspace_tensors = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); } -Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { - return cpu::CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + return cpu::CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, + enable_fast_math, num_groups); } -Status NEGEMMConvolutionLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, const bool enable_fast_math) +Status NEGEMMConvolutionLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + const bool enable_fast_math) { - return cpu::CpuGemmConv2d::has_opt_impl(expected_weight_format, src, weights, biases, dst, conv_info, weights_info, dilation, act_info, enable_fast_math); + return cpu::CpuGemmConv2d::has_opt_impl(expected_weight_format, src, weights, biases, dst, conv_info, weights_info, + dilation, act_info, enable_fast_math); } void NEGEMMConvolutionLayer::run() @@ -96,7 +122,7 @@ void NEGEMMConvolutionLayer::run() void NEGEMMConvolutionLayer::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->run_pack); diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp index 453d3cedef738f2b81e4dcf7a54311d9bfe212ca..44bfc6a51e114aec01b82cdab9871b54f2ab24ca 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp +++ 
b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp @@ -29,8 +29,8 @@ #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/Tensor.h" -#include "src/core/helpers/MemoryHelpers.h" +#include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h" using namespace arm_compute::experimental; @@ -39,18 +39,19 @@ namespace arm_compute { struct NEGEMMLowpMatrixMultiplyCore::Impl { - const ITensor *b{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *b{nullptr}; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; ITensorPack prep_pack{}; MemoryGroup memory_group{}; - IWeightsManager *weights_manager{ nullptr }; + IWeightsManager *weights_manager{nullptr}; MemoryRequirements aux_mem_req{}; WorkspaceData workspace_tensors{}; - bool is_prepared{ false }; + bool is_prepared{false}; }; -NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager, IWeightsManager *weights_manager) +NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager, + IWeightsManager *weights_manager) : _impl(std::make_unique()) { _impl->weights_manager = weights_manager; @@ -58,41 +59,41 @@ NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptrinfo()->clone(); - if(!gemm_info.reshape_b_only_on_first_run()) + if (!gemm_info.reshape_b_only_on_first_run()) { b_info_to_use->set_are_values_constant(false); } _impl->b = b; _impl->op = std::make_unique(); - _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr ? c->info() : nullptr), output->info(), gemm_info); - _impl->run_pack = - { - { TensorType::ACL_SRC_0, a }, - { TensorType::ACL_SRC_1, b }, - { TensorType::ACL_SRC_2, c }, - { TensorType::ACL_DST, output } - }; - _impl->prep_pack = - { - { TensorType::ACL_SRC_1, b }, - { TensorType::ACL_SRC_2, c } - }; - _impl->aux_mem_req = _impl->op->workspace(); - _impl->workspace_tensors = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr ? c->info() : nullptr), output->info(), + gemm_info); + _impl->run_pack = {{TensorType::ACL_SRC_0, a}, + {TensorType::ACL_SRC_1, b}, + {TensorType::ACL_SRC_2, c}, + {TensorType::ACL_DST, output}}; + _impl->prep_pack = {{TensorType::ACL_SRC_1, b}, {TensorType::ACL_SRC_2, c}}; + _impl->aux_mem_req = _impl->op->workspace(); + _impl->workspace_tensors = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } -Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info) +Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info) { // Make the B matrix dynamic values. 
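The prepare() hunks for NEGEMM, NEGEMMConv2d and, just below, NEGEMMLowpMatrixMultiplyCore all re-wrap the same std::find_if search for a persistent workspace entry. The following self-contained sketch shows that idiom with stand-in types; MemoryInfo and MemoryRequirements here are simplified substitutes for the library's own definitions, used only to illustrate the reformatted lambda.

#include <algorithm>
#include <iostream>
#include <vector>

// Stand-ins for the library's MemoryLifetime / MemoryInfo / MemoryRequirements.
enum class MemoryLifetime { Temporary, Persistent, Prepare };
struct MemoryInfo { int slot; MemoryLifetime lifetime; };
using MemoryRequirements = std::vector<MemoryInfo>;

int main()
{
    const MemoryRequirements aux_mem_req{
        {0, MemoryLifetime::Temporary}, {1, MemoryLifetime::Persistent}, {2, MemoryLifetime::Prepare}};

    // Same shape as the lambda in the prepare() hunks: look for an auxiliary buffer
    // that outlives prepare() (the reshaped weights); if one exists, the original
    // weight tensor can safely be marked as unused afterwards.
    const auto has_reshape =
        std::find_if(aux_mem_req.begin(), aux_mem_req.end(),
                     [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });

    if (has_reshape != aux_mem_req.end())
    {
        std::cout << "persistent workspace found in slot " << has_reshape->slot << '\n';
    }
    return 0;
}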
auto b_info_to_use = b->clone(); - if(!gemm_info.reshape_b_only_on_first_run()) + if (!gemm_info.reshape_b_only_on_first_run()) { b_info_to_use->set_are_values_constant(false); } @@ -109,15 +110,15 @@ void NEGEMMLowpMatrixMultiplyCore::run() void NEGEMMLowpMatrixMultiplyCore::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->prep_pack); - auto has_reshape = std::find_if(_impl->aux_mem_req.begin(), - _impl->aux_mem_req.end(), - [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); - if(has_reshape != std::end(_impl->aux_mem_req)) + if (has_reshape != std::end(_impl->aux_mem_req)) { _impl->b->mark_as_unused(); } diff --git a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp index 7e1de3c2573d1982a18361b174fed7013eb41114..8178003b5e9c0d5b7ce89e18f1b9bcb463aefda4 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp @@ -25,45 +25,48 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuGemmLowpOutputStage.h" namespace arm_compute { struct NEGEMMLowpOutputStage::Impl { - const ITensor *src{ nullptr }; - const ITensor *bias{ nullptr }; - ITensor *dst{ nullptr }; + const ITensor *src{nullptr}; + const ITensor *bias{nullptr}; + ITensor *dst{nullptr}; ITensorPack run_pack{}; - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; }; -NEGEMMLowpOutputStage::NEGEMMLowpOutputStage() - : _impl(std::make_unique()) +NEGEMMLowpOutputStage::NEGEMMLowpOutputStage() : _impl(std::make_unique()) { } NEGEMMLowpOutputStage::~NEGEMMLowpOutputStage() = default; -void NEGEMMLowpOutputStage::configure(const ITensor *input, const ITensor *bias, ITensor *output, const GEMMLowpOutputStageInfo &info) +void NEGEMMLowpOutputStage::configure(const ITensor *input, + const ITensor *bias, + ITensor *output, + const GEMMLowpOutputStageInfo &info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpOutputStage::validate(input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON( + NEGEMMLowpOutputStage::validate(input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), info)); _impl->src = input; _impl->bias = bias; _impl->dst = output; _impl->op = std::make_unique(); _impl->op->configure(input->info(), (bias == nullptr) ? 
nullptr : bias->info(), output->info(), info); - _impl->run_pack = - { - { TensorType::ACL_SRC, _impl->src }, - { TensorType::ACL_BIAS, _impl->bias }, - { TensorType::ACL_DST, _impl->dst } - }; + _impl->run_pack = { + {TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_BIAS, _impl->bias}, {TensorType::ACL_DST, _impl->dst}}; } -Status NEGEMMLowpOutputStage::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info) +Status NEGEMMLowpOutputStage::validate(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const GEMMLowpOutputStageInfo &info) { return cpu::CpuGemmLowpOutputStage::validate(input, bias, output, info); } diff --git a/src/runtime/NEON/functions/NEGather.cpp b/src/runtime/NEON/functions/NEGather.cpp index f5d19c769e17843298c7a780e19c0819c830bafc..62b8cfa48bbba60573c22d1aff1f79a0fb844ea8 100644 --- a/src/runtime/NEON/functions/NEGather.cpp +++ b/src/runtime/NEON/functions/NEGather.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEGather.h" -#include "src/core/NEON/kernels/NEGatherKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEGatherKernel.h" #include diff --git a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp index 1c0e736766eaf8800b10d9b6fdeff60bb5657ba8..1022b4153eb4fbf4a78c7c56c4c195890c687eb7 100644 --- a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp +++ b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp @@ -25,11 +25,12 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" #include "src/core/NEON/kernels/NEFillBorderKernel.h" #include "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h" #include "src/core/NEON/kernels/NEPadLayerKernel.h" -#include "src/core/helpers/AutoConfiguration.h" namespace arm_compute { @@ -68,42 +69,55 @@ NEGenerateProposalsLayer::NEGenerateProposalsLayer(std::shared_ptrinfo(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(NEGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), + proposals->info(), scores_out->info(), + num_valid_proposals->info(), info)); ARM_COMPUTE_LOG_PARAMS(scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info); _is_nhwc = scores->info()->data_layout() == DataLayout::NHWC; const DataType scores_data_type = scores->info()->data_type(); _is_qasymm8 = scores_data_type == DataType::QASYMM8; - const int num_anchors = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL)); - const int feat_width = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH)); - const int feat_height = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT)); - const int total_num_anchors = num_anchors * feat_width * feat_height; - const int pre_nms_topN = info.pre_nms_topN(); - const int post_nms_topN = info.post_nms_topN(); - const size_t values_per_roi = info.values_per_roi(); + const int num_anchors = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL)); + const int feat_width = scores->info()->dimension( + 
get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH)); + const int feat_height = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT)); + const int total_num_anchors = num_anchors * feat_width * feat_height; + const int pre_nms_topN = info.pre_nms_topN(); + const int post_nms_topN = info.post_nms_topN(); + const size_t values_per_roi = info.values_per_roi(); const QuantizationInfo scores_qinfo = scores->info()->quantization_info(); const DataType rois_data_type = (_is_qasymm8) ? DataType::QASYMM16 : scores_data_type; - const QuantizationInfo rois_qinfo = (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info(); + const QuantizationInfo rois_qinfo = + (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info(); // Compute all the anchors _memory_group.manage(&_all_anchors); _compute_anchors = std::make_unique(); - _compute_anchors->configure(anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); + _compute_anchors->configure(anchors, &_all_anchors, + ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors); - _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); + _deltas_flattened.allocator()->init( + TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); // Permute and reshape deltas _memory_group.manage(&_deltas_flattened); - if(!_is_nhwc) + if (!_is_nhwc) { _memory_group.manage(&_deltas_permuted); - _permute_deltas.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 }); + _permute_deltas.configure(deltas, &_deltas_permuted, PermutationVector{2, 0, 1}); _flatten_deltas.configure(&_deltas_permuted, &_deltas_flattened); _deltas_permuted.allocator()->allocate(); } @@ -117,10 +131,10 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d // Permute and reshape scores _memory_group.manage(&_scores_flattened); - if(!_is_nhwc) + if (!_is_nhwc) { _memory_group.manage(&_scores_permuted); - _permute_scores.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 }); + _permute_scores.configure(scores, &_scores_permuted, PermutationVector{2, 0, 1}); _flatten_scores.configure(&_scores_permuted, &_scores_flattened); _scores_permuted.allocator()->allocate(); } @@ -131,7 +145,7 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d Tensor *anchors_to_use = &_all_anchors; Tensor *deltas_to_use = &_deltas_flattened; - if(_is_qasymm8) + if (_is_qasymm8) { _all_anchors_f32.allocator()->init(TensorInfo(_all_anchors.info()->tensor_shape(), 1, DataType::F32)); _deltas_flattened_f32.allocator()->init(TensorInfo(_deltas_flattened.info()->tensor_shape(), 1, DataType::F32)); @@ -154,11 +168,12 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d anchors_to_use->allocator()->allocate(); _all_proposals_to_use = &_all_proposals; - if(_is_qasymm8) + if (_is_qasymm8) { _memory_group.manage(&_all_proposals_quantized); // Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset - _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0))); + _all_proposals_quantized.allocator()->init( + TensorInfo(_all_proposals.info()->tensor_shape(), 1, 
DataType::QASYMM16, QuantizationInfo(0.125f, 0))); _quantize_all_proposals.configure(&_all_proposals, &_all_proposals_quantized); _all_proposals.allocator()->allocate(); _all_proposals_to_use = &_all_proposals_quantized; @@ -174,7 +189,8 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d // Note that NMS needs outputs preinitialized. auto_init_if_empty(*scores_out->info(), TensorShape(scores_nms_size), 1, scores_data_type, scores_qinfo); - auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, rois_qinfo); + auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, + rois_qinfo); auto_init_if_empty(*num_valid_proposals->info(), TensorShape(1), 1, DataType::U32); // Initialize temporaries (unused) outputs @@ -187,17 +203,12 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d _memory_group.manage(&_proposals_4_roi_values); - const BoxNMSLimitInfo box_nms_info(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, true, min_size_scaled, info.im_width(), info.im_height()); - _cpp_nms.configure(&_scores_flattened /*scores_in*/, - _all_proposals_to_use /*boxes_in,*/, - nullptr /* batch_splits_in*/, - scores_out /* scores_out*/, - &_proposals_4_roi_values /*boxes_out*/, - &_classes_nms_unused /*classes*/, - nullptr /*batch_splits_out*/, - &_keeps_nms_unused /*keeps*/, - num_valid_proposals /* keeps_size*/, - box_nms_info); + const BoxNMSLimitInfo box_nms_info(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, + true, min_size_scaled, info.im_width(), info.im_height()); + _cpp_nms.configure(&_scores_flattened /*scores_in*/, _all_proposals_to_use /*boxes_in,*/, + nullptr /* batch_splits_in*/, scores_out /* scores_out*/, &_proposals_4_roi_values /*boxes_out*/, + &_classes_nms_unused /*classes*/, nullptr /*batch_splits_out*/, &_keeps_nms_unused /*keeps*/, + num_valid_proposals /* keeps_size*/, box_nms_info); _keeps_nms_unused.allocator()->allocate(); _classes_nms_unused.allocator()->allocate(); @@ -205,12 +216,17 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d _scores_flattened.allocator()->allocate(); // Add the first column that represents the batch id. 
This will be all zeros, as we don't support multiple images - _pad.configure(&_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } }); + _pad.configure(&_proposals_4_roi_values, proposals, PaddingList{{1, 0}}); _proposals_4_roi_values.allocator()->allocate(); } -Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out, - const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info) +Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, + const ITensorInfo *deltas, + const ITensorInfo *anchors, + const ITensorInfo *proposals, + const ITensorInfo *scores_out, + const ITensorInfo *num_valid_proposals, + const GenerateProposalsInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::QASYMM8, DataType::F16, DataType::F32); @@ -218,9 +234,12 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(scores, deltas); - const int num_anchors = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL)); - const int feat_width = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH)); - const int feat_height = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT)); + const int num_anchors = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL)); + const int feat_width = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH)); + const int feat_height = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT)); const int num_images = scores->dimension(3); const int total_num_anchors = num_anchors * feat_width * feat_height; const int values_per_roi = info.values_per_roi(); @@ -229,76 +248,100 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens ARM_COMPUTE_RETURN_ERROR_ON(num_images > 1); - if(is_qasymm8) + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(anchors, 1, DataType::QSYMM16); const UniformQuantizationInfo anchors_qinfo = anchors->quantization_info().uniform(); ARM_COMPUTE_RETURN_ERROR_ON(anchors_qinfo.scale != 0.125f); } - TensorInfo all_anchors_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - ARM_COMPUTE_RETURN_ON_ERROR(NEComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()))); - - TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true); - TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true); - if(scores->data_layout() == DataLayout::NHWC) + TensorInfo all_anchors_info( + anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + ARM_COMPUTE_RETURN_ON_ERROR(NEComputeAllAnchorsKernel::validate( + anchors, &all_anchors_info, 
ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()))); + + TensorInfo deltas_permuted_info = + deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)) + .set_is_resizable(true); + TensorInfo scores_permuted_info = + scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true); + if (scores->data_layout() == DataLayout::NHWC) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(deltas, &deltas_permuted_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(scores, &scores_permuted_info); } else { - ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 })); - ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 })); + ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(deltas, &deltas_permuted_info, PermutationVector{2, 0, 1})); + ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(scores, &scores_permuted_info, PermutationVector{2, 0, 1})); } - TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + TensorInfo deltas_flattened_info( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(&deltas_permuted_info, &deltas_flattened_info)); - TensorInfo scores_flattened_info(scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true)); - TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + TensorInfo scores_flattened_info( + scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true)); + TensorInfo proposals_4_roi_values( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(&scores_permuted_info, &scores_flattened_info)); TensorInfo *proposals_4_roi_values_to_use = &proposals_4_roi_values; - TensorInfo proposals_4_roi_values_quantized(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16).set_quantization_info(QuantizationInfo(0.125f, 0)); - if(is_qasymm8) + TensorInfo proposals_4_roi_values_quantized( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16) + .set_quantization_info(QuantizationInfo(0.125f, 0)); + if (is_qasymm8) { - TensorInfo all_anchors_f32_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); + TensorInfo all_anchors_f32_info(anchors->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(&all_anchors_info, &all_anchors_f32_info)); - TensorInfo deltas_flattened_f32_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info)); - - TensorInfo 
proposals_4_roi_values_f32(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate(&all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info, - BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); - - ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized)); + TensorInfo deltas_flattened_f32_info(deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info)); + + TensorInfo proposals_4_roi_values_f32(deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate( + &all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info, + BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); + + ARM_COMPUTE_RETURN_ON_ERROR( + NEQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized)); proposals_4_roi_values_to_use = &proposals_4_roi_values_quantized; } else { - ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, - BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEBoundingBoxTransform::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, + BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); } - ARM_COMPUTE_RETURN_ON_ERROR(NEPadLayer::validate(proposals_4_roi_values_to_use, proposals, PaddingList{ { 1, 0 } })); + ARM_COMPUTE_RETURN_ON_ERROR(NEPadLayer::validate(proposals_4_roi_values_to_use, proposals, PaddingList{{1, 0}})); - if(num_valid_proposals->total_size() > 0) + if (num_valid_proposals->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->dimension(0) > 1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_valid_proposals, 1, DataType::U32); } - if(proposals->total_size() > 0) + if (proposals->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(proposals->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(0) != size_t(values_per_roi) + 1); ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(1) != size_t(total_num_anchors)); - if(is_qasymm8) + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(proposals, 1, DataType::QASYMM16); const UniformQuantizationInfo proposals_qinfo = proposals->quantization_info().uniform(); @@ -311,7 +354,7 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens } } - if(scores_out->total_size() > 0) + if (scores_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(scores_out->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(scores_out->dimension(0) != size_t(total_num_anchors)); @@ -330,7 +373,7 @@ void NEGenerateProposalsLayer::run() NEScheduler::get().schedule(_compute_anchors.get(), Window::DimY); // Transpose and reshape the inputs - if(!_is_nhwc) + if (!_is_nhwc) { _permute_deltas.run(); _permute_scores.run(); @@ -339,7 +382,7 @@ void NEGenerateProposalsLayer::run() _flatten_deltas.run(); 
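Several of the NEGenerateProposalsLayer hunks above pin the proposals to QASYMM16 with QuantizationInfo(0.125f, 0). As a numerical reminder of what that mapping does, here is a small self-contained sketch of the usual asymmetric quantization q = round(x / scale) + offset; the helper name and the unsigned 16-bit clamping range are assumptions for illustration, not library code.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

// Illustrative helper (not part of the library): asymmetric quantization,
// q = round(x / scale) + offset, clamped to an unsigned 16-bit range.
uint16_t quantize_qasymm16(float x, float scale, int32_t offset)
{
    const int32_t q = static_cast<int32_t>(std::lround(x / scale)) + offset;
    return static_cast<uint16_t>(std::min(std::max(q, 0), 65535));
}

int main()
{
    // QuantizationInfo(0.125f, 0) as used for the QASYMM16 proposals above:
    // one quantized unit corresponds to 1/8 of a box coordinate.
    std::cout << quantize_qasymm16(10.5f, 0.125f, 0) << '\n'; // 84
    std::cout << quantize_qasymm16(0.0f, 0.125f, 0) << '\n';  // 0
    return 0;
}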
_flatten_scores.run(); - if(_is_qasymm8) + if (_is_qasymm8) { _dequantize_anchors.run(); _dequantize_deltas.run(); @@ -348,7 +391,7 @@ void NEGenerateProposalsLayer::run() // Build the boxes _bounding_box.run(); - if(_is_qasymm8) + if (_is_qasymm8) { _quantize_all_proposals.run(); } diff --git a/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp index 822dcf491c42ee7d38bdc560b5cacec9c617e194..78218cbdee1834c8bbf2ece444ffcf91ee1abd7d 100644 --- a/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h" @@ -34,7 +35,13 @@ namespace arm_compute NEInstanceNormalizationLayer::~NEInstanceNormalizationLayer() = default; NEInstanceNormalizationLayer::NEInstanceNormalizationLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false), _permute_input(), _permute_output(), _permuted_input(), _permuted_output() + : _memory_group(std::move(memory_manager)), + _normalization_kernel(), + _is_nchw(false), + _permute_input(), + _permute_output(), + _permuted_input(), + _permuted_output() { } @@ -43,14 +50,14 @@ void NEInstanceNormalizationLayer::configure(ITensor *input, ITensor *output, fl ARM_COMPUTE_LOG_PARAMS(input, output, gamma, beta, epsilon); const DataLayout data_layout = input->info()->data_layout(); - const auto kernel_descriptor = InstanceNormalizationLayerKernelInfo{ gamma, beta, epsilon, true }; + const auto kernel_descriptor = InstanceNormalizationLayerKernelInfo{gamma, beta, epsilon, true}; // Configure Kernels _is_nchw = data_layout == DataLayout::NCHW; _normalization_kernel = std::make_unique(); - if(!_is_nchw) + if (!_is_nchw) { _memory_group.manage(&_permuted_input); _memory_group.manage(&_permuted_output); @@ -72,11 +79,12 @@ void NEInstanceNormalizationLayer::configure(ITensor *input, ITensor *output, fl } } -Status NEInstanceNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon) +Status NEInstanceNormalizationLayer::validate( + const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon) { - return NEInstanceNormalizationLayerKernel::validate(&input->clone()->set_data_layout(DataLayout::NCHW), - &output->clone()->set_data_layout(DataLayout::NCHW), - InstanceNormalizationLayerKernelInfo{ gamma, beta, epsilon, true }); + return NEInstanceNormalizationLayerKernel::validate( + &input->clone()->set_data_layout(DataLayout::NCHW), &output->clone()->set_data_layout(DataLayout::NCHW), + InstanceNormalizationLayerKernelInfo{gamma, beta, epsilon, true}); } void NEInstanceNormalizationLayer::run() @@ -84,7 +92,7 @@ void NEInstanceNormalizationLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Permute input - if(!_is_nchw) + if (!_is_nchw) { _permute_input.run(); } @@ -92,7 +100,7 @@ void NEInstanceNormalizationLayer::run() NEScheduler::get().schedule(_normalization_kernel.get(), Window::DimZ); // Permute output - if(!_is_nchw) + if (!_is_nchw) { _permute_output.run(); } diff --git a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp index 
c3ecfb430f70df14d2b9c31e0b7a8cdbb71da30a..b7f6203efd7a89ee7e5138a4a9a9125dd7b429bd 100644 --- a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp +++ b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEL2NormalizeLayerKernel.h" #include "src/core/NEON/kernels/NEReductionOperationKernel.h" @@ -69,7 +70,8 @@ Status NEL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo sum_sq.set_tensor_shape(shape); const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim); - ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE)); // Reduce shape on axis shape.set(actual_axis, 1); diff --git a/src/runtime/NEON/functions/NELSTMLayer.cpp b/src/runtime/NEON/functions/NELSTMLayer.cpp index 428cdf8c0426de338eb08b7f38e3092789674c58..1a08cdeb06e28945912af19964f2f7022541589f 100644 --- a/src/runtime/NEON/functions/NELSTMLayer.cpp +++ b/src/runtime/NEON/functions/NELSTMLayer.cpp @@ -24,11 +24,12 @@ #include "arm_compute/runtime/NEON/functions/NELSTMLayer.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/common/LSTMParams.h" + #include "src/common/utils/Log.h" namespace arm_compute @@ -39,42 +40,122 @@ using namespace arm_compute::utils::info_helpers; NELSTMLayer::~NELSTMLayer() = default; NELSTMLayer::NELSTMLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(), - _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _transpose_cell_state(), - _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), - _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _projection_clip(), - _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(), - _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(), _pixelwise_mul_forget_gate_coeff(), _accum_forget_gate_bias(), - _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(), _pixelwise_mul_output_gate_coeff(), _accum_output_gate_bias(), _input_gate_out1(), - _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _forget_gate_out6(), - _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), 
_cell_state_activation(), _output_state1(), _ones(), - _input_layer_norm_out1(), _input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(), _cell_layer_norm_out1(), _cell_layer_norm_out2(), _output_layer_norm_out1(), - _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false), + : _memory_group(std::move(memory_manager)), + _fully_connected_input_gate(), + _accum_input_gate1(), + _subtract_input_gate(), + _pixelwise_mul_input_gate(), + _activation_input_gate(), + _fully_connected_forget_gate(), + _accum_forget_gate1(), + _pixelwise_mul_forget_gate(), + _activation_forget_gate(), + _fully_connected_cell_state(), + _gemm_cell_state1(), + _transpose_cell_state(), + _accum_cell_state1(), + _accum_cell_state2(), + _pixelwise_mul_cell_state1(), + _activation_cell_state(), + _cell_clip(), + _pixelwise_mul_cell_state2(), + _fully_connected_output(), + _pixelwise_mul_output_state1(), + _accum_output1(), + _activation_output(), + _activation_output_state(), + _pixelwise_mul_output_state2(), + _fully_connected_output_state(), + _projection_clip(), + _copy_cell_state(), + _copy_output(), + _concat_scratch_buffer(), + _concat_inputs_forget_gate(), + _concat_weights_forget_gate(), + _concat_weights_input_gate(), + _concat_weights_output(), + _mean_std_norm_input_gate(), + _pixelwise_mul_input_gate_coeff(), + _accum_input_gate_bias(), + _mean_std_norm_forget_gate(), + _pixelwise_mul_forget_gate_coeff(), + _accum_forget_gate_bias(), + _mean_std_norm_cell_gate(), + _pixelwise_mul_cell_gate_coeff(), + _accum_cell_gate_bias(), + _mean_std_norm_output_gate(), + _pixelwise_mul_output_gate_coeff(), + _accum_output_gate_bias(), + _input_gate_out1(), + _input_gate_out2(), + _input_gate_out3(), + _input_gate_out4(), + _forget_gate_out1(), + _forget_gate_out2(), + _forget_gate_out3(), + _forget_gate_out4(), + _forget_gate_out5(), + _forget_gate_out6(), + _cell_state_out1(), + _cell_state_out2(), + _cell_state_out3(), + _cell_state_out4(), + _cell_state_out5(), + _output1(), + _output2(), + _output3(), + _output4(), + _cell_state_activation(), + _output_state1(), + _ones(), + _input_layer_norm_out1(), + _input_layer_norm_out2(), + _forget_layer_norm_out1(), + _forget_layer_norm_out2(), + _cell_layer_norm_out1(), + _cell_layer_norm_out2(), + _output_layer_norm_out1(), + _output_layer_norm_out2(), + _run_peephole_opt(false), + _run_cifg_opt(false), + _perform_cell_clipping(false), + _has_projection_weights(false), + _perform_projection_clipping(false), + _is_prepared(false), _is_layer_norm_lstm(false) { } -void NELSTMLayer::configure(const ITensor *input, - const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights, - const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights, - const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias, - const ITensor *output_state_in, const ITensor *cell_state_in, - ITensor *scratch_buffer, ITensor *output_state_out, ITensor *cell_state_out, ITensor *output, - const LSTMParams &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +void NELSTMLayer::configure(const ITensor *input, + const ITensor *input_to_forget_weights, + const ITensor *input_to_cell_weights, + const ITensor *input_to_output_weights, + const ITensor 
*recurrent_to_forget_weights, + const ITensor *recurrent_to_cell_weights, + const ITensor *recurrent_to_output_weights, + const ITensor *forget_gate_bias, + const ITensor *cell_bias, + const ITensor *output_gate_bias, + const ITensor *output_state_in, + const ITensor *cell_state_in, + ITensor *scratch_buffer, + ITensor *output_state_out, + ITensor *cell_state_out, + ITensor *output, + const LSTMParams &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, + forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); - ARM_COMPUTE_LOG_PARAMS(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, - scratch_buffer, output_state_out, cell_state_out, output, - lstm_params, activation_info, cell_threshold, projection_threshold); + forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, + scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info, + cell_threshold, projection_threshold); _is_layer_norm_lstm = lstm_params.use_layer_norm(); @@ -83,13 +164,12 @@ void NELSTMLayer::configure(const ITensor *input, build_lstm_params_tensor_info(lstm_params, &lstm_params_info); // Validate - ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayer::validate(input->info(), input_to_forget_weights->info(), - input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), - output_state_in->info(), cell_state_in->info(), - scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(), - lstm_params_info, activation_info, cell_threshold, projection_threshold)); + ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayer::validate( + input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), + recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), output_state_in->info(), + cell_state_in->info(), scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(), + lstm_params_info, activation_info, cell_threshold, projection_threshold)); const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape(); @@ -116,20 +196,23 @@ void NELSTMLayer::configure(const ITensor *input, _concat_weights_forget_gate.configure(weights_vector, &_forget_gate_out6, Window::DimX); _memory_group.manage(&_forget_gate_out5); - _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? 
nullptr : forget_gate_bias, &_forget_gate_out5); + _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, + (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5); _memory_group.manage(&_forget_gate_out1); _memory_group.manage(&_forget_gate_out3); _forget_gate_out6.allocator()->allocate(); Tensor *forget_gate_out = &_forget_gate_out5; - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { _forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _run_peephole_opt = true; _memory_group.manage(&_forget_gate_out4); - _pixelwise_mul_forget_gate.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - _accum_forget_gate1.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE); + _pixelwise_mul_forget_gate.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _accum_forget_gate1.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, + ConvertPolicy::SATURATE); _forget_gate_out4.allocator()->allocate(); _forget_gate_out5.allocator()->allocate(); forget_gate_out = &_forget_gate_out3; @@ -138,21 +221,25 @@ void NELSTMLayer::configure(const ITensor *input, { _forget_gate_out3.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _forget_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _forget_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_forget_layer_norm_out1); _memory_group.manage(&_forget_layer_norm_out2); _mean_std_norm_forget_gate.configure(forget_gate_out); - _pixelwise_mul_forget_gate_coeff.configure(forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_forget_gate_coeff.configure(forget_gate_out, lstm_params.forget_layer_norm_weights(), + &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); // forget_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before forget_gate_out->allocator()->allocate(); - _accum_forget_gate_bias.configure(&_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_forget_gate_bias.configure(&_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, + ConvertPolicy::SATURATE); _forget_layer_norm_out1.allocator()->allocate(); forget_gate_out = &_forget_layer_norm_out2; } - _activation_forget_gate.configure(forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_forget_gate.configure(forget_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the input gate // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG @@ -161,7 +248,7 @@ void NELSTMLayer::configure(const ITensor *input, // input_gate = Activation((input,output_state) * (input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, 
input->info()->data_type())); Tensor *input_gate_out = &_input_gate_out1; - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { _memory_group.manage(&_input_gate_out1); _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); @@ -183,15 +270,19 @@ void NELSTMLayer::configure(const ITensor *input, _memory_group.manage(&_input_gate_out1); _memory_group.manage(&_input_gate_out4); - _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), &_input_gate_out3); + _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, + (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), + &_input_gate_out3); _input_gate_out2.allocator()->allocate(); input_gate_out = &_input_gate_out3; - if(_run_peephole_opt) + if (_run_peephole_opt) { _memory_group.manage(&_input_gate_out4); - _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - _accum_input_gate1.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE); + _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _accum_input_gate1.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, + ConvertPolicy::SATURATE); _input_gate_out3.allocator()->allocate(); _input_gate_out4.allocator()->allocate(); input_gate_out = &_input_gate_out1; @@ -201,21 +292,25 @@ void NELSTMLayer::configure(const ITensor *input, _input_gate_out1.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _input_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _input_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_input_layer_norm_out1); _memory_group.manage(&_input_layer_norm_out2); _mean_std_norm_input_gate.configure(input_gate_out); - _pixelwise_mul_input_gate_coeff.configure(input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_input_gate_coeff.configure(input_gate_out, lstm_params.input_layer_norm_weights(), + &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); // input_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before input_gate_out->allocator()->allocate(); - _accum_input_gate_bias.configure(&_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_input_gate_bias.configure(&_input_layer_norm_out1, lstm_params.input_gate_bias(), + &_input_layer_norm_out2, ConvertPolicy::SATURATE); _input_layer_norm_out1.allocator()->allocate(); input_gate_out = &_input_layer_norm_out2; } - _activation_input_gate.configure(input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_input_gate.configure(input_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); } // Configure block that calculates the cell state @@ -228,7 +323,8 @@ void NELSTMLayer::configure(const ITensor *input, _cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_state_out1); - 
_fully_connected_cell_state.configure(input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1); + _fully_connected_cell_state.configure(input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, + &_cell_state_out1); _memory_group.manage(&_cell_state_out2); _transpose_cell_state.configure(recurrent_to_cell_weights, &_cell_state_out2); _memory_group.manage(&_cell_state_out3); @@ -237,33 +333,40 @@ void NELSTMLayer::configure(const ITensor *input, _memory_group.manage(&_cell_state_out4); _accum_cell_state1.configure(&_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE); Tensor *cell_state_out_ptr = &_cell_state_out4; - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _cell_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _cell_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_layer_norm_out1); _memory_group.manage(&_cell_layer_norm_out2); _mean_std_norm_cell_gate.configure(cell_state_out_ptr); - _pixelwise_mul_cell_gate_coeff.configure(cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_cell_gate_coeff.configure(cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), + &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); // cell_state_out_ptr is going to be reassigned, so allocate the tensor that it was assigned to before cell_state_out_ptr->allocator()->allocate(); - _accum_cell_gate_bias.configure(&_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_cell_gate_bias.configure(&_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, + ConvertPolicy::SATURATE); _cell_layer_norm_out1.allocator()->allocate(); cell_state_out_ptr = &_cell_layer_norm_out2; } _activation_cell_state.configure(cell_state_out_ptr, nullptr, activation_info); _memory_group.manage(&_cell_state_out5); - _pixelwise_mul_cell_state1.configure(cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_cell_state1.configure(cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); cell_state_out_ptr->allocator()->allocate(); - _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _accum_cell_state2.configure(&_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE); _cell_state_out3.allocator()->allocate(); _cell_state_out5.allocator()->allocate(); // Perform clipping - if(cell_threshold != 0.f) + if (cell_threshold != 0.f) { _perform_cell_clipping = true; - _cell_clip.configure(&_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, cell_threshold, -cell_threshold)); + _cell_clip.configure(&_cell_state_out1, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + cell_threshold, -cell_threshold)); } // Configure block that calculates the output @@ -281,18 +384,20 @@ void NELSTMLayer::configure(const ITensor *input, _memory_group.manage(&_output1); _memory_group.manage(&_output4); - 
_fully_connected_output.configure(&_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4); + _fully_connected_output.configure(&_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, + &_output4); _output2.allocator()->allocate(); _forget_gate_out2.allocator()->allocate(); Tensor *output_gate_out = &_output4; - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { _output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type())); _memory_group.manage(&_output3); - _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _accum_output1.configure(&_output4, &_output3, &_output1, ConvertPolicy::SATURATE); _output4.allocator()->allocate(); output_gate_out = &_output1; @@ -304,21 +409,25 @@ void NELSTMLayer::configure(const ITensor *input, { _output1.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _output_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _output_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_output_layer_norm_out1); _memory_group.manage(&_output_layer_norm_out2); _mean_std_norm_output_gate.configure(output_gate_out); - _pixelwise_mul_output_gate_coeff.configure(output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_output_gate_coeff.configure(output_gate_out, lstm_params.output_layer_norm_weights(), + &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); // output_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before output_gate_out->allocator()->allocate(); - _accum_output_gate_bias.configure(&_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_output_gate_bias.configure(&_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, + ConvertPolicy::SATURATE); _output_layer_norm_out1.allocator()->allocate(); output_gate_out = &_output_layer_norm_out2; } - _activation_output.configure(output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_output.configure(output_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the output state /** lstm_res = PixelwiseMul(output, Activation(cell_state)) @@ -335,20 +444,24 @@ void NELSTMLayer::configure(const ITensor *input, _memory_group.manage(&_cell_state_activation); _activation_output_state.configure(&_cell_state_out1, &_cell_state_activation, activation_info); - _pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _cell_state_activation.allocator()->allocate(); output_gate_out->allocator()->allocate(); - if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { 
_has_projection_weights = true; - _fully_connected_output_state.configure(output_state_out_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out); + _fully_connected_output_state.configure(output_state_out_tmp, lstm_params.projection_weights(), + lstm_params.projection_bias(), output_state_out); _output_state1.allocator()->allocate(); // Perform clipping - if(projection_threshold != 0.f) + if (projection_threshold != 0.f) { _perform_projection_clipping = true; - _projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)); + _projection_clip.configure(output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -projection_threshold, projection_threshold)); } } @@ -358,7 +471,7 @@ void NELSTMLayer::configure(const ITensor *input, // Vector for holding the tensors to store in scratch buffer std::vector<const ITensor *> scratch_inputs; - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { scratch_inputs.emplace_back(input_gate_out); } @@ -372,29 +485,38 @@ void NELSTMLayer::configure(const ITensor *input, output_gate_out->allocator()->allocate(); } -Status NELSTMLayer::validate(const ITensorInfo *input, - const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in, - const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output, - const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +Status NELSTMLayer::validate(const ITensorInfo *input, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_in, + const ITensorInfo *scratch_buffer, + const ITensorInfo *output_state_out, + const ITensorInfo *cell_state_out, + const ITensorInfo *output, + const LSTMParams<ITensorInfo> &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, - scratch_buffer, output_state_out, cell_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR( + input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); // Check data types
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, - scratch_buffer, output_state_out, cell_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES( + input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); // Check dimensions ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); @@ -413,16 +535,16 @@ Status NELSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(output_state_out->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(cell_state_out->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) - && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) && + cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0)); const unsigned int num_batches = input->dimension(1); const unsigned int num_cells = input_to_output_weights->dimension(1); - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { // If CIFG is used, input layer normalization weights tensor is omitted - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights() != nullptr); } @@ -434,8 +556,12 @@ Status NELSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.input_layer_norm_weights()); } - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), + lstm_params.cell_layer_norm_weights(), + lstm_params.output_layer_norm_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), + lstm_params.cell_layer_norm_weights(), + lstm_params.output_layer_norm_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->num_dimensions() > 1); @@ -445,7 +571,7 @@ Status NELSTMLayer::validate(const ITensorInfo *input, } // Check peephole optimization - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() > 1); @@ -465,33 +591,39 @@ Status NELSTMLayer::validate(const ITensorInfo *input, std::vector inputs_vector; 
inputs_vector.emplace_back(input); inputs_vector.emplace_back(output_state_in); - const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0); + const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0); TensorInfo forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(inputs_vector, &forget_gate_concat, Window::DimX)); // Validate forget gate - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate)); + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate( + input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&forget_gate)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + &forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Validate input gate - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), - lstm_params.recurrent_to_input_weights(), - lstm_params.input_gate_bias()); + lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1); @@ -499,88 +631,120 @@ Status NELSTMLayer::validate(const ITensorInfo *input, std::vector lstm_weights; lstm_weights.emplace_back(lstm_params.input_to_input_weights()); 
lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights()); - TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); - TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type()); + TensorShape lstm_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); + TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(lstm_weights, &lstm_gate_concat, Window::DimX)); - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate)); + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate( + input, lstm_params.input_to_input_weights(), + (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&input_gate)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), &input_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), + &input_gate, ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + &input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); } else { - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); } // Validate cell state - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? 
nullptr : cell_bias, &cell_state_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo())); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); - if(lstm_params.use_layer_norm()) + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate( + input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo())); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&cell_state_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE)); } ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, nullptr, activation_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); - if(cell_threshold != 0.f) + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); + if (cell_threshold != 0.f) { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, cell_threshold, - -cell_threshold))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&cell_state_tmp, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + cell_threshold, -cell_threshold))); } // Validate output gate tmp std::vector in_out_weights; in_out_weights.emplace_back(input_to_output_weights); in_out_weights.emplace_back(recurrent_to_output_weights); - TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); - TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type()); + TensorShape 
in_out_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); + TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(in_out_weights, &in_out_gate_concat, Window::DimX)); - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp)); + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate( + input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, + ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&output_gate_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), + &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, + ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + &output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Validate output state ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, &cell_state_tmp, activation_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - if(lstm_params.has_projection()) + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + if (lstm_params.has_projection()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out)); - if(projection_threshold != 0.f) + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), + lstm_params.projection_bias(), output_state_out)); + if 
(projection_threshold != 0.f) { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output_state_out, output_state_out, - ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + output_state_out, output_state_out, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, + projection_threshold))); } } @@ -590,7 +754,7 @@ Status NELSTMLayer::validate(const ITensorInfo *input, // Validate scratch concatenation std::vector<const ITensorInfo *> inputs_vector_info_raw; - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { inputs_vector_info_raw.push_back(&input_gate); } @@ -611,12 +775,12 @@ void NELSTMLayer::run() _concat_inputs_forget_gate.run(); _fully_connected_forget_gate.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { _pixelwise_mul_forget_gate.run(); _accum_forget_gate1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_forget_gate.run(); _pixelwise_mul_forget_gate_coeff.run(); @@ -624,15 +788,17 @@ void NELSTMLayer::run() } _activation_forget_gate.run(); - if(_run_cifg_opt) + if (_run_cifg_opt) { - if(_ones.info()->data_type() == DataType::F16) + if (_ones.info()->data_type() == DataType::F16) { - std::fill_n(reinterpret_cast<half *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1); + std::fill_n(reinterpret_cast<half *>(_ones.buffer()), + _ones.info()->total_size() / _ones.info()->element_size(), 1); } else { - std::fill_n(reinterpret_cast<float *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1); + std::fill_n(reinterpret_cast<float *>(_ones.buffer()), + _ones.info()->total_size() / _ones.info()->element_size(), 1); } _subtract_input_gate.run(); } @@ -640,13 +806,13 @@ void NELSTMLayer::run() { _fully_connected_input_gate.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { _pixelwise_mul_input_gate.run(); _accum_input_gate1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_input_gate.run(); _pixelwise_mul_input_gate_coeff.run(); @@ -659,7 +825,7 @@ void NELSTMLayer::run() _transpose_cell_state.run(); _gemm_cell_state1.run(); _accum_cell_state1.run(); - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_cell_gate.run(); _pixelwise_mul_cell_gate_coeff.run(); @@ -671,18 +837,18 @@ void NELSTMLayer::run() _pixelwise_mul_cell_state2.run(); _accum_cell_state2.run(); - if(_perform_cell_clipping) + if (_perform_cell_clipping) { _cell_clip.run(); } _fully_connected_output.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { _pixelwise_mul_output_state1.run(); _accum_output1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_output_gate.run(); _pixelwise_mul_output_gate_coeff.run(); @@ -693,10 +859,10 @@ void NELSTMLayer::run() _activation_output_state.run(); _pixelwise_mul_output_state2.run(); - if(_has_projection_weights) + if (_has_projection_weights) { _fully_connected_output_state.run(); - if(_perform_projection_clipping) + if (_perform_projection_clipping) { _projection_clip.run(); } @@ -710,10 +876,10 @@ void NELSTMLayer::run() void NELSTMLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _concat_weights_forget_gate.run(); - if(!_run_cifg_opt) + if (!_run_cifg_opt) { _concat_weights_input_gate.run(); } diff --git a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp index
cfdeb000e00b8c9e3b1912a003088d62fe5bec16..41f9c3d70071db3fd9e63d317a84ea88a9d696f1 100644 --- a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp +++ b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp @@ -24,8 +24,9 @@ #include "arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" @@ -46,36 +47,104 @@ const QuantizationInfo qsymm_0(1.f / 32768.f, 0); // qsymm16 with 0 integer bit NELSTMLayerQuantized::~NELSTMLayerQuantized() = default; NELSTMLayerQuantized::NELSTMLayerQuantized(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _gemmlowp(), _output_stage(), _transpose_weights(), _concat_input_weights(), _concat_recurrent_weights(), _concat_weights(), _concat_inputs(), - _concat_bias(), _sigmoid_forget_gate(), _sigmoid_input_gate(), _sigmoid_output_gate(), _tanh_modulation_gate(), _tanh_output_state(), _add1(), _add2(), _mul1(), _mul2(), _mul3(), - _slice_input_tensor(), _slice_forget_tensor(), _slice_cell_tensor(), _slice_output_tensor(), _dequantize(), _quantize(), _input_to_input_weights(nullptr), _input_to_forget_weights(nullptr), - _input_to_cell_weights(nullptr), _input_to_output_weights(nullptr), _recurrent_to_input_weights(nullptr), _recurrent_to_forget_weights(nullptr), _recurrent_to_cell_weights(nullptr), - _recurrent_to_output_weights(nullptr), _input_gate_bias(nullptr), _forget_gate_bias(nullptr), _cell_bias(nullptr), _output_gate_bias(nullptr), _recurrent_weights(), _input_weights(), _weights(), - _input(), _weights_transposed(), _output_highp(), _output_lowp(), _bias(), _forget_gate_input(), _input_gate_input(), _output_gate_input(), _input_modulation_gate_input(), _forget_gate_output(), - _input_gate_output(), _output_gate_output(), _input_modulation_gate_output(), _cell_state1(), _cell_state2(), _output_state_tmp(), _output_state_out_symm(), _output_state_out_f32(), + : _memory_group(std::move(memory_manager)), + _gemmlowp(), + _output_stage(), + _transpose_weights(), + _concat_input_weights(), + _concat_recurrent_weights(), + _concat_weights(), + _concat_inputs(), + _concat_bias(), + _sigmoid_forget_gate(), + _sigmoid_input_gate(), + _sigmoid_output_gate(), + _tanh_modulation_gate(), + _tanh_output_state(), + _add1(), + _add2(), + _mul1(), + _mul2(), + _mul3(), + _slice_input_tensor(), + _slice_forget_tensor(), + _slice_cell_tensor(), + _slice_output_tensor(), + _dequantize(), + _quantize(), + _input_to_input_weights(nullptr), + _input_to_forget_weights(nullptr), + _input_to_cell_weights(nullptr), + _input_to_output_weights(nullptr), + _recurrent_to_input_weights(nullptr), + _recurrent_to_forget_weights(nullptr), + _recurrent_to_cell_weights(nullptr), + _recurrent_to_output_weights(nullptr), + _input_gate_bias(nullptr), + _forget_gate_bias(nullptr), + _cell_bias(nullptr), + _output_gate_bias(nullptr), + _recurrent_weights(), + _input_weights(), + _weights(), + _input(), + _weights_transposed(), + _output_highp(), + _output_lowp(), + _bias(), + _forget_gate_input(), + _input_gate_input(), + _output_gate_input(), + _input_modulation_gate_input(), + _forget_gate_output(), + _input_gate_output(), + _output_gate_output(), + _input_modulation_gate_output(), + _cell_state1(), + _cell_state2(), + _output_state_tmp(), + _output_state_out_symm(), + _output_state_out_f32(),
_is_prepared(false) { } void NELSTMLayerQuantized::configure(const ITensor *input, - const ITensor *input_to_input_weights, const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights, - const ITensor *recurrent_to_input_weights, const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights, - const ITensor *input_gate_bias, const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias, - ITensor *cell_state_in, const ITensor *output_state_in, - ITensor *cell_state_out, ITensor *output_state_out) + const ITensor *input_to_input_weights, + const ITensor *input_to_forget_weights, + const ITensor *input_to_cell_weights, + const ITensor *input_to_output_weights, + const ITensor *recurrent_to_input_weights, + const ITensor *recurrent_to_forget_weights, + const ITensor *recurrent_to_cell_weights, + const ITensor *recurrent_to_output_weights, + const ITensor *input_gate_bias, + const ITensor *forget_gate_bias, + const ITensor *cell_bias, + const ITensor *output_gate_bias, + ITensor *cell_state_in, + const ITensor *output_state_in, + ITensor *cell_state_out, + ITensor *output_state_out) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); - - ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayerQuantized::validate(input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), - input_to_output_weights->info(), - recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - input_gate_bias->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info())); - - ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, + forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, + cell_state_out, output_state_out); + + ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayerQuantized::validate( + input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), + input_to_output_weights->info(), recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), + recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), input_gate_bias->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), + output_state_in->info(), cell_state_out->info(), output_state_out->info())); + + ARM_COMPUTE_LOG_PARAMS(input, 
input_to_input_weights, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, + cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, + output_state_out); const int input_size = input->info()->dimension(0); const int batch_size = input->info()->dimension(1); @@ -83,8 +152,10 @@ void NELSTMLayerQuantized::configure(const ITensor *input, const QuantizationInfo qweights = input_to_input_weights->info()->quantization_info(); // Weights quantization - auto_init_if_empty(*cell_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4)); - auto_init_if_empty(*output_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm)); + auto_init_if_empty(*cell_state_out->info(), + TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4)); + auto_init_if_empty(*output_state_out->info(), + TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm)); _input_to_input_weights = input_to_input_weights; _input_to_forget_weights = input_to_forget_weights; @@ -100,34 +171,41 @@ void NELSTMLayerQuantized::configure(const ITensor *input, _output_gate_bias = output_gate_bias; // Weights concatenation - std::vector<const ITensor *> inputs_weights_vector{ input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights }; - std::vector<const ITensor *> recurrent_weights_vector{ recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights }; + std::vector<const ITensor *> inputs_weights_vector{input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights}; + std::vector<const ITensor *> recurrent_weights_vector{recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights}; - _input_weights.allocator()->init(TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _input_weights.allocator()->init( + TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_input_weights.configure(inputs_weights_vector, &_input_weights, Window::DimY); - _recurrent_weights.allocator()->init(TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _recurrent_weights.allocator()->init( + TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_recurrent_weights.configure(recurrent_weights_vector, &_recurrent_weights, Window::DimY); - std::vector<const ITensor *> weights_vector{ &_recurrent_weights, &_input_weights }; - _weights.allocator()->init(TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + std::vector<const ITensor *> weights_vector{&_recurrent_weights, &_input_weights}; + _weights.allocator()->init( + TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_weights.configure(weights_vector, &_weights, Window::DimX); _transpose_weights.configure(&_weights, &_weights_transposed); // Input concatenation - std::vector<const ITensor *> input_vector{ input, output_state_in }; + std::vector<const ITensor *> input_vector{input, output_state_in}; _memory_group.manage(&_input); - _input.allocator()->init(TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); + _input.allocator()->init( +
TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); _concat_inputs.configure(input_vector, &_input, Window::DimX); // Bias concatenation - std::vector<const ITensor *> bias_vector{ input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias }; + std::vector<const ITensor *> bias_vector{input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias}; _bias.allocator()->init(TensorInfo(TensorShape(4 * output_size), 1, DataType::S32)); _concat_bias.configure(bias_vector, &_bias, Window::DimX); // Invert the offset for gemmlowp _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset)); - _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); + _weights_transposed.info()->set_quantization_info( + QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); // Run gemmlowp _memory_group.manage(&_output_highp); @@ -137,7 +215,8 @@ void NELSTMLayerQuantized::configure(const ITensor *input, // Set the offset back _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); - _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); + _weights_transposed.info()->set_quantization_info( + QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); // multiplier = (input_scale * weights_scale) / output_scale (2 ^ (-12)) _output_lowp.allocator()->init(TensorInfo(_output_highp.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_3)); @@ -159,64 +238,80 @@ void NELSTMLayerQuantized::configure(const ITensor *input, _bias.allocator()->allocate(); // Get the gate tensors - if(batch_size > 1) + if (batch_size > 1) { _memory_group.manage(&_input_gate_input); - _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size }); + _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, {0, 0}, {output_size, batch_size}); _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }); + _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, {output_size, 0}, + {2 * output_size, batch_size}); _memory_group.manage(&_input_modulation_gate_input); -
_slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size }); + _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, {2 * output_size}, + {3 * output_size}); _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size }); + _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, {3 * output_size}, {4 * output_size}); _output_lowp.allocator()->allocate(); } // Forget gate _memory_group.manage(&_forget_gate_output); - _forget_gate_output.allocator()->init(TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_forget_gate.configure(&_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _forget_gate_output.allocator()->init( + TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_forget_gate.configure(&_forget_gate_input, &_forget_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _forget_gate_input.allocator()->allocate(); // Input gate _memory_group.manage(&_input_gate_output); - _input_gate_output.allocator()->init(TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_input_gate.configure(&_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _input_gate_output.allocator()->init( + TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_input_gate.configure(&_input_gate_input, &_input_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _input_gate_input.allocator()->allocate(); // Input modulation gate equation _memory_group.manage(&_input_modulation_gate_output); - _input_modulation_gate_output.allocator()->init(TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_modulation_gate.configure(&_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _input_modulation_gate_output.allocator()->init( + TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_modulation_gate.configure(&_input_modulation_gate_input, &_input_modulation_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); _input_modulation_gate_input.allocator()->allocate(); // Output gate _memory_group.manage(&_output_gate_output); - _output_gate_output.allocator()->init(TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_output_gate.configure(&_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _output_gate_output.allocator()->init( + TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_output_gate.configure(&_output_gate_input, &_output_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _output_gate_input.allocator()->allocate(); // Long term memory _memory_group.manage(&_cell_state1); - _cell_state1.allocator()->init(TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul1.configure(&_forget_gate_output, cell_state_in, &_cell_state1, 1, 
ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _cell_state1.allocator()->init( + TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); + _mul1.configure(&_forget_gate_output, cell_state_in, &_cell_state1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _forget_gate_output.allocator()->allocate(); _memory_group.manage(&_cell_state2); - _cell_state2.allocator()->init(TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul2.configure(&_input_gate_output, &_input_modulation_gate_output, &_cell_state2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _cell_state2.allocator()->init( + TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); + _mul2.configure(&_input_gate_output, &_input_modulation_gate_output, &_cell_state2, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _input_modulation_gate_output.allocator()->allocate(); _input_gate_output.allocator()->allocate(); @@ -226,18 +321,23 @@ void NELSTMLayerQuantized::configure(const ITensor *input, // Short term memory _memory_group.manage(&_output_state_tmp); - _output_state_tmp.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_output_state.configure(cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _output_state_tmp.allocator()->init( + TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_output_state.configure(cell_state_out, &_output_state_tmp, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); _memory_group.manage(&_output_state_out_symm); - _output_state_out_symm.allocator()->init(TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _mul3.configure(&_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _output_state_out_symm.allocator()->init( + TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _mul3.configure(&_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _output_gate_output.allocator()->allocate(); _output_state_tmp.allocator()->allocate(); // Requantize the output state from QSYMM16 to QASYMM8 _memory_group.manage(&_output_state_out_f32); - _output_state_out_f32.allocator()->init(TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); + _output_state_out_f32.allocator()->init( + TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); _dequantize.configure(&_output_state_out_symm, &_output_state_out_f32); _output_state_out_symm.allocator()->allocate(); @@ -246,15 +346,28 @@ void NELSTMLayerQuantized::configure(const ITensor *input, } Status NELSTMLayerQuantized::validate(const ITensorInfo *input, - const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo 
*output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out) + const ITensorInfo *input_to_input_weights, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_input_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *input_gate_bias, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, - output_state_in, cell_state_out, output_state_out); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR( + input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, + input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, + output_state_out); const int input_size = input->dimension(0); const int batch_size = input->dimension(1); @@ -266,29 +379,51 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(input_gate_bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2); - TensorInfo input_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(input_size, output_size)).set_data_type(DataType::QASYMM8)); - TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8)); - TensorInfo bias_info(input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); - TensorInfo output_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm)); - TensorInfo cell_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QSYMM16).set_quantization_info(qsymm_4)); + TensorInfo input_weights_info(input_to_input_weights->clone() + ->set_tensor_shape(TensorShape(input_size, output_size)) + .set_data_type(DataType::QASYMM8)); + TensorInfo recurrent_weights_info(input_to_input_weights->clone() + ->set_tensor_shape(TensorShape(output_size, output_size)) + .set_data_type(DataType::QASYMM8)); + TensorInfo bias_info( + input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); + TensorInfo output_state_info(cell_state_in->clone() + ->set_tensor_shape(TensorShape(output_size, batch_size)) + .set_data_type(DataType::QASYMM8) + .set_quantization_info(qasymm)); + TensorInfo cell_state_info(cell_state_in->clone() + ->set_tensor_shape(TensorShape(output_size, batch_size)) + .set_data_type(DataType::QSYMM16) + .set_quantization_info(qsymm_4)); // Shape checks - 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_in); // Data type checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, + input_to_forget_weights, input_to_cell_weights, + input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_in); // Quantization checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input_weights_info, input_to_input_weights, + input_to_forget_weights, input_to_cell_weights, + input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in); @@ -310,7 +445,8 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, recurrent_weights_vector.emplace_back(recurrent_to_cell_weights); recurrent_weights_vector.emplace_back(recurrent_to_output_weights); const TensorInfo recurrent_weights(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights); - 
ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY)); // _concat_weights std::vector weights_vector; @@ -320,7 +456,7 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(weights_vector, &weights, Window::DimX)); // _transpose_weights const TensorShape weights_transposed_shape(weights.tensor_shape()[1], weights.tensor_shape()[0]); - TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape); + TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape); ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(&weights, &weights_transposed)); // _concat_inputs @@ -346,7 +482,8 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, // _gemmlowp const TensorInfo output_highp(TensorShape(4 * output_size, batch_size), 1, DataType::S32); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp)); // Set the offset back input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); @@ -357,7 +494,8 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale; int32_t output_multiplier = 0; int32_t output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); // _output_stage GEMMLowpOutputStageInfo info; @@ -372,68 +510,91 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, TensorInfo input_modulation_gate_input; TensorInfo output_gate_input; - if(batch_size > 1) + if (batch_size > 1) { // _slice_input_tensor input_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, { 0, 0 }, { output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &input_gate_input, {0, 0}, {output_size, batch_size})); // _slice_forget_tensor forget_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &forget_gate_input, {output_size, 0}, {2 * output_size, batch_size})); // _slice_cell_tensor input_modulation_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size, 0}, + {3 * output_size, batch_size})); // _slice_output_tensor output_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, 
DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &output_gate_input, {3 * output_size, 0}, {4 * output_size, batch_size})); } else { // _slice_input_tensor input_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, { 0 }, { output_size })); + ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, {0}, {output_size})); // _slice_forget_tensor forget_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &forget_gate_input, { output_size }, { 2 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &forget_gate_input, {output_size}, {2 * output_size})); // _slice_cell_tensor input_modulation_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size }, { 3 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size}, {3 * output_size})); // _slice_output_tensor output_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &output_gate_input, { 3 * output_size }, { 4 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &output_gate_input, {3 * output_size}, {4 * output_size})); } // _sigmoid_forget_gate const TensorInfo forget_gate_output(forget_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_gate_input, &forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&forget_gate_input, &forget_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _sigmoid_input_gate const TensorInfo input_gate_output(input_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + &input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _tanh_modulation_gate - const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, + qsymm_0); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); // _sigmoid_output_gate const TensorInfo output_gate_output(output_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - 
ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_gate_input, &output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&output_gate_input, &output_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _mul_forget_gate_cell_state const TensorInfo cell_state_tmp1(forget_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); // _mul_input_gate_input_mod_gate const TensorInfo cell_state_tmp2(input_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, &cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, + &cell_state_tmp2, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); // _add_cell_state_tmps - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE)); // _tanh_modulation_gate const TensorInfo output_state_tmp(cell_state_out->tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, &output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(cell_state_out, &output_state_tmp, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); // _mul_output_state_tmp_output_gate const TensorInfo output_state_out_symm(output_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, &output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, + &output_state_out_symm, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); // _dequantize const TensorInfo output_state_out_f32(output_state_out_symm.tensor_shape(), 1, DataType::F32); @@ -442,14 +603,14 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, // _quantize ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayer::validate(&output_state_out_f32, output_state_out)); - if(cell_state_out->total_size() != 0) + if (cell_state_out->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_out); } - if(output_state_out->total_size() != 0) + if (output_state_out->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, 
output_state_out); @@ -508,7 +669,7 @@ void NELSTMLayerQuantized::run() void NELSTMLayerQuantized::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _input_weights.allocator()->allocate(); _concat_input_weights.run(); diff --git a/src/runtime/NEON/functions/NELogical.cpp b/src/runtime/NEON/functions/NELogical.cpp index 92dcf15791db360e7103ba0a9ce4de0a31243baf..0013a521d13103952ff84fd1ab71d2392f239a22 100644 --- a/src/runtime/NEON/functions/NELogical.cpp +++ b/src/runtime/NEON/functions/NELogical.cpp @@ -25,6 +25,7 @@ #include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/Tensor.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NELogicalKernel.h" @@ -32,15 +33,14 @@ namespace arm_compute { struct LogicalArgs { - std::unique_ptr kernel{ nullptr }; + std::unique_ptr kernel{nullptr}; ITensorPack pack{}; }; struct NELogicalAnd::Impl : public LogicalArgs { }; -NELogicalAnd::NELogicalAnd() - : _impl(std::make_unique()) +NELogicalAnd::NELogicalAnd() : _impl(std::make_unique()) { } NELogicalAnd::~NELogicalAnd() = default; @@ -72,8 +72,7 @@ void NELogicalAnd::run() struct NELogicalOr::Impl : public LogicalArgs { }; -NELogicalOr::NELogicalOr() - : _impl(std::make_unique()) +NELogicalOr::NELogicalOr() : _impl(std::make_unique()) { } NELogicalOr::~NELogicalOr() = default; @@ -105,8 +104,7 @@ void NELogicalOr::run() struct NELogicalNot::Impl : public LogicalArgs { }; -NELogicalNot::NELogicalNot() - : _impl(std::make_unique()) +NELogicalNot::NELogicalNot() : _impl(std::make_unique()) { } NELogicalNot::~NELogicalNot() = default; diff --git a/src/runtime/NEON/functions/NEMatMul.cpp b/src/runtime/NEON/functions/NEMatMul.cpp index 58640f40eaf1449318f31945569b5232574ef393..31898bafc4c7c7c198853c510a6a9260586244c2 100644 --- a/src/runtime/NEON/functions/NEMatMul.cpp +++ b/src/runtime/NEON/functions/NEMatMul.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuMatMul.h" @@ -33,23 +34,27 @@ namespace arm_compute { struct NEMatMul::Impl { - const ITensor *lhs{ nullptr }; - const ITensor *rhs{ nullptr }; - ITensor *output{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *lhs{nullptr}; + const ITensor *rhs{nullptr}; + ITensor *output{nullptr}; + std::unique_ptr op{nullptr}; MemoryGroup memory_group{}; WorkspaceData workspace_tensors{}; ITensorPack run_pack{}; }; -NEMatMul::NEMatMul() - : _impl(std::make_unique()) +NEMatMul::NEMatMul() : _impl(std::make_unique()) { } NEMatMul::~NEMatMul() = default; -void NEMatMul::configure(ITensor *lhs, ITensor *rhs, ITensor *output, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info) +void NEMatMul::configure(ITensor *lhs, + ITensor *rhs, + ITensor *output, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) { _impl->lhs = lhs; _impl->rhs = rhs; @@ -58,11 +63,16 @@ void NEMatMul::configure(ITensor *lhs, ITensor *rhs, ITensor *output, const MatM ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->lhs, _impl->rhs, _impl->output); _impl->op = std::make_unique(); _impl->op->configure(lhs->info(), rhs->info(), output->info(), info, settings, act_info); - _impl->run_pack = { { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs }, { ACL_DST, output } }; + _impl->run_pack = {{ACL_SRC_0, lhs}, {ACL_SRC_1, rhs}, {ACL_DST, output}}; _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), 
_impl->memory_group, _impl->run_pack); } -Status NEMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info) +Status NEMatMul::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *output, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) { return cpu::CpuMatMul::validate(lhs, rhs, output, info, settings, act_info); } diff --git a/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp b/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp index 97ddaea41d9f006e55fb1b62f0cb94c07ee7210a..c3861afd2c3fbe2da3265d54411523c789727b99 100644 --- a/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp +++ b/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp @@ -25,8 +25,9 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/NEON/functions/NEFill.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h" #include "src/cpu/operators/CpuMaxUnpooling.h" @@ -35,20 +36,22 @@ namespace arm_compute { struct NEMaxUnpoolingLayer::Impl { - const ITensor *src{ nullptr }; - const ITensor *indices{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + const ITensor *indices{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; NEMaxUnpoolingLayer::~NEMaxUnpoolingLayer() = default; -NEMaxUnpoolingLayer::NEMaxUnpoolingLayer() - : _fill_func(), _impl() +NEMaxUnpoolingLayer::NEMaxUnpoolingLayer() : _fill_func(), _impl() { } -void NEMaxUnpoolingLayer::configure(ITensor *input, ITensor *indices, ITensor *output, const PoolingLayerInfo &pool_info) +void NEMaxUnpoolingLayer::configure(ITensor *input, + ITensor *indices, + ITensor *output, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_LOG_PARAMS(input, indices, output, pool_info); @@ -64,7 +67,10 @@ void NEMaxUnpoolingLayer::configure(ITensor *input, ITensor *indices, ITensor *o _impl->op->configure(input->info(), indices->info(), output->info(), pool_info); } -Status NEMaxUnpoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info) +Status NEMaxUnpoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *indices, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, indices); ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuMaxUnpooling::validate(input, indices, output, pool_info)); diff --git a/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp b/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp index 7626aa0db2d0d4b99bca0270af09ee73ec79607c..dec0dde56d4198b43e76070edd5d7df95381b8ca 100644 --- a/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h" -#include "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h" namespace arm_compute { diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp index 
d3b169633565ccc3453748110c408808d176bf8a..d6d2e9dc460a599b244cd32d33fd1b088ffcce8e 100644 --- a/src/runtime/NEON/functions/NENormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NENormalizationLayerKernel.h" @@ -61,13 +62,16 @@ void NENormalizationLayer::configure(const ITensor *input, ITensor *output, cons _input_squared.allocator()->allocate(); } -Status NENormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info) +Status NENormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const NormalizationLayerInfo &norm_info) { // Perform validation step ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ON_ERROR(NENormalizationLayerKernel::validate(input, input, output, norm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); return Status{}; } @@ -78,4 +82,4 @@ void NENormalizationLayer::run() _multiply_f.run(); NEScheduler::get().schedule(_norm_kernel.get(), Window::DimY); } -} \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEPReluLayer.cpp b/src/runtime/NEON/functions/NEPReluLayer.cpp index 80c5690a4eee935a04eae9b5ccf5e5280b449491..963e68bac7fe9897666235e4d700e00318b251da 100644 --- a/src/runtime/NEON/functions/NEPReluLayer.cpp +++ b/src/runtime/NEON/functions/NEPReluLayer.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEPReluLayer.h" #include "arm_compute/core/ITensor.h" + #include "src/cpu/operators/CpuPRelu.h" namespace arm_compute @@ -32,17 +33,16 @@ using OperatorType = cpu::CpuPRelu; struct NEPReluLayer::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEPReluLayer::NEPReluLayer() - : _impl(std::make_unique()) +NEPReluLayer::NEPReluLayer() : _impl(std::make_unique()) { } -NEPReluLayer::NEPReluLayer(NEPReluLayer &&) = default; +NEPReluLayer::NEPReluLayer(NEPReluLayer &&) = default; NEPReluLayer &NEPReluLayer::operator=(NEPReluLayer &&) = default; NEPReluLayer::~NEPReluLayer() = default; diff --git a/src/runtime/NEON/functions/NEPadLayer.cpp b/src/runtime/NEON/functions/NEPadLayer.cpp index 8bacdd300268cc75a20a98ef64f4021e71eb8ab9..253566df0f93a0135e256b2c76bc4404c6944ae4 100644 --- a/src/runtime/NEON/functions/NEPadLayer.cpp +++ b/src/runtime/NEON/functions/NEPadLayer.cpp @@ -23,13 +23,13 @@ */ #include "arm_compute/runtime/NEON/functions/NEPadLayer.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" -#include "src/core/NEON/kernels/NEPadLayerKernel.h" #include "src/core/helpers/AutoConfiguration.h" +#include "src/core/NEON/kernels/NEPadLayerKernel.h" namespace arm_compute { @@ -38,9 +38,9 @@ namespace uint32_t 
last_padding_dimension(const PaddingList &padding) { int last_padding_dim = padding.size() - 1; - for(; last_padding_dim >= 0; --last_padding_dim) + for (; last_padding_dim >= 0; --last_padding_dim) { - if(padding[last_padding_dim].first > 0 || padding[last_padding_dim].second > 0) + if (padding[last_padding_dim].first > 0 || padding[last_padding_dim].second > 0) { break; } @@ -52,11 +52,22 @@ uint32_t last_padding_dimension(const PaddingList &padding) NEPadLayer::~NEPadLayer() = default; NEPadLayer::NEPadLayer() - : _copy_function(), _pad_kernel(), _mode(), _padding(), _num_dimensions(0), _slice_functions(), _concat_functions(), _slice_results(), _concat_results() + : _copy_function(), + _pad_kernel(), + _mode(), + _padding(), + _num_dimensions(0), + _slice_functions(), + _concat_functions(), + _slice_results(), + _concat_results() { } -void NEPadLayer::configure_constant_mode(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value) +void NEPadLayer::configure_constant_mode(ITensor *input, + ITensor *output, + const PaddingList &padding, + const PixelValue constant_value) { _pad_kernel = std::make_unique(); _pad_kernel->configure(input, output, padding, constant_value, PaddingMode::CONSTANT); @@ -85,20 +96,20 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu Coordinates ends_after{}; Coordinates strides{}; ITensor *prev = input; - for(uint32_t i = 0; i < _num_dimensions; ++i) + for (uint32_t i = 0; i < _num_dimensions; ++i) { // Values in strides from the previous dimensions need to be set to 1 to avoid reversing again. - if(i > 0) + if (i > 0) { strides.set(i - 1, 1); } - if(_padding[i].first > 0 || _padding[i].second > 0) + if (_padding[i].first > 0 || _padding[i].second > 0) { // Set the starts, ends, and strides values for the current dimension. // Due to the bit masks passed to strided slice, the values below the current dimension in // starts and ends will be ignored so do not need to be modified. - if(_mode == PaddingMode::REFLECT) + if (_mode == PaddingMode::REFLECT) { starts_before.set(i, _padding[i].first); ends_before.set(i, 0); @@ -124,11 +135,12 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu // Reflect the input values for the padding before and after the input. 
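// A minimal, self-contained sketch (editorial illustration, assuming the standard
// REFLECT/SYMMETRIC semantics) of what the per-dimension slice + concatenate pairs
// configured below produce; reflect_pad_1d is a hypothetical helper for the 1-D case,
// not an Arm Compute Library API.
#include <cstddef>
#include <vector>

// reflect_pad_1d({1,2,3,4}, 2, 1, /*symmetric=*/false) -> {3,2,1,2,3,4,3}
// reflect_pad_1d({1,2,3,4}, 2, 1, /*symmetric=*/true)  -> {2,1,1,2,3,4,4}
std::vector<int> reflect_pad_1d(const std::vector<int> &in, size_t before, size_t after, bool symmetric)
{
    const size_t off = symmetric ? 0 : 1; // REFLECT mirrors around the border element, SYMMETRIC repeats it
    std::vector<int> out;
    for (size_t i = 0; i < before; ++i)
    {
        out.push_back(in[before - 1 - i + off]); // reversed band taken from the start of the input
    }
    out.insert(out.end(), in.begin(), in.end()); // the input itself, unchanged
    for (size_t i = 0; i < after; ++i)
    {
        out.push_back(in[in.size() - 1 - i - off]); // reversed band taken from the end of the input
    }
    return out; // padding amounts must not exceed the input extent, cf. the checks in NEPadLayer::validate()
}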
std::vector concat_vector; - if(_padding[i].first > 0) + if (_padding[i].first > 0) { - if(i < prev->info()->num_dimensions()) + if (i < prev->info()->num_dimensions()) { - _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides, begin_mask_before, end_mask_before); + _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides, + begin_mask_before, end_mask_before); concat_vector.emplace_back(&_slice_results[2 * i]); } else @@ -138,11 +150,12 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu } } concat_vector.push_back(prev); - if(_padding[i].second > 0) + if (_padding[i].second > 0) { - if(i < prev->info()->num_dimensions()) + if (i < prev->info()->num_dimensions()) { - _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after, strides, begin_mask_after, end_mask_after); + _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after, + strides, begin_mask_after, end_mask_after); concat_vector.emplace_back(&_slice_results[2 * i + 1]); } else @@ -154,12 +167,12 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu // Concatenate the padding before and after with the input. ITensor *out = (i == _num_dimensions - 1) ? output : &_concat_results[i]; out->info()->set_quantization_info(output->info()->quantization_info()); - for(auto &v : concat_vector) + for (auto &v : concat_vector) { v->info()->set_quantization_info(input->info()->quantization_info()); } _concat_functions[i].configure(concat_vector, out, i); - if(i != _num_dimensions - 1) + if (i != _num_dimensions - 1) { _concat_results[i].allocator()->allocate(); } @@ -170,7 +183,11 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu } } -void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode) +void NEPadLayer::configure(ITensor *input, + ITensor *output, + const PaddingList &padding, + const PixelValue constant_value, + const PaddingMode mode) { ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode)); ARM_COMPUTE_LOG_PARAMS(input, output, padding, constant_value, mode); @@ -178,15 +195,16 @@ void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &p _padding = padding; _mode = mode; - const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding); + const TensorShape padded_shape = + misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding); auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(padded_shape)); // Find the last dimension requiring padding so that it is known when to write to output and whether any padding is applied. 
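// Worked example (values are illustrative): with padding = { {0, 0}, {2, 1}, {0, 0} }
// only dimension 1 carries non-zero padding, so last_padding_dimension() returns 1,
// _num_dimensions becomes 2, and the per-dimension slice/concat loops in
// configure_reflect_symmetric_mode() and run() iterate over dimensions 0 and 1 only.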
_num_dimensions = last_padding_dimension(padding) + 1; - if(_num_dimensions > 0) + if (_num_dimensions > 0) { - switch(_mode) + switch (_mode) { case PaddingMode::CONSTANT: { @@ -210,19 +228,23 @@ void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &p } } -Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode) +Status NEPadLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + const PixelValue constant_value, + const PaddingMode mode) { ARM_COMPUTE_UNUSED(constant_value); const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding); - if(output->total_size() > 0) + if (output->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), padded_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } - switch(mode) + switch (mode) { case PaddingMode::CONSTANT: { @@ -231,9 +253,9 @@ Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, case PaddingMode::REFLECT: case PaddingMode::SYMMETRIC: { - for(uint32_t i = 0; i < padding.size(); ++i) + for (uint32_t i = 0; i < padding.size(); ++i) { - if(mode == PaddingMode::REFLECT) + if (mode == PaddingMode::REFLECT) { ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first >= input->dimension(i)); ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second >= input->dimension(i)); @@ -256,9 +278,9 @@ Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, void NEPadLayer::run() { - if(_num_dimensions > 0) + if (_num_dimensions > 0) { - switch(_mode) + switch (_mode) { case PaddingMode::CONSTANT: { @@ -268,15 +290,15 @@ void NEPadLayer::run() case PaddingMode::REFLECT: case PaddingMode::SYMMETRIC: { - for(uint32_t i = 0; i < _num_dimensions; ++i) + for (uint32_t i = 0; i < _num_dimensions; ++i) { - if(_padding[i].first > 0 || _padding[i].second > 0) + if (_padding[i].first > 0 || _padding[i].second > 0) { - if(_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0) + if (_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0) { _slice_functions[2 * i].run(); } - if(_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0) + if (_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0) { _slice_functions[2 * i + 1].run(); } diff --git a/src/runtime/NEON/functions/NEPermute.cpp b/src/runtime/NEON/functions/NEPermute.cpp index 517b86a1cb58c17e7e85e4e85c0ce2aceb16f9c9..80cd04ce6c9943cdfbc0c99b5c4c00692c447f70 100644 --- a/src/runtime/NEON/functions/NEPermute.cpp +++ b/src/runtime/NEON/functions/NEPermute.cpp @@ -24,19 +24,19 @@ #include "arm_compute/runtime/NEON/functions/NEPermute.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuPermute.h" namespace arm_compute { struct NEPermute::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEPermute::NEPermute() - : _impl(std::make_unique()) +NEPermute::NEPermute() : _impl(std::make_unique()) { } diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp index ad83a26beba37a3ebae484f87bca5b52e09fd583..97155a9e74fab7d1bafd5ecb608748fada420fba 100644 --- 
a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp +++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h" #include "arm_compute/core/ITensor.h" + #include "src/cpu/operators/CpuMul.h" #include @@ -32,32 +33,42 @@ namespace arm_compute { struct NEPixelWiseMultiplication::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEPixelWiseMultiplication::NEPixelWiseMultiplication() - : _impl(std::make_unique()) +NEPixelWiseMultiplication::NEPixelWiseMultiplication() : _impl(std::make_unique()) { } NEPixelWiseMultiplication::~NEPixelWiseMultiplication() = default; -Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, +Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) { return cpu::CpuMul::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info); } -void NEPixelWiseMultiplication::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, +void NEPixelWiseMultiplication::configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; _impl->dst = output; _impl->op = std::make_unique(); - _impl->op->configure(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy, act_info); + _impl->op->configure(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy, + act_info); } void NEPixelWiseMultiplication::run() @@ -71,24 +82,29 @@ void NEPixelWiseMultiplication::run() struct NEComplexPixelWiseMultiplication::Impl { - ITensor *src_0{ nullptr }; - ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + ITensor *src_0{nullptr}; + ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEComplexPixelWiseMultiplication::NEComplexPixelWiseMultiplication() - : _impl(std::make_unique()) +NEComplexPixelWiseMultiplication::NEComplexPixelWiseMultiplication() : _impl(std::make_unique()) { } NEComplexPixelWiseMultiplication::~NEComplexPixelWiseMultiplication() = default; -Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return cpu::CpuComplexMul::validate(input1, input2, output, act_info); } -void NEComplexPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info) +void NEComplexPixelWiseMultiplication::configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo 
&act_info) { _impl->src_0 = input1; _impl->src_1 = input2; diff --git a/src/runtime/NEON/functions/NEPooling3dLayer.cpp b/src/runtime/NEON/functions/NEPooling3dLayer.cpp index 53f9dbf0a22b3d28daddb8da4ae09d7d652ae5c8..e017e8c21dff280e6696aa8fe9c59f3e322cb089 100644 --- a/src/runtime/NEON/functions/NEPooling3dLayer.cpp +++ b/src/runtime/NEON/functions/NEPooling3dLayer.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuPool3d.h" @@ -33,9 +34,9 @@ namespace arm_compute { struct NEPooling3dLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; MemoryGroup memory_group{}; ITensorPack run_pack{}; WorkspaceData workspace_tensors{}; @@ -43,8 +44,7 @@ struct NEPooling3dLayer::Impl NEPooling3dLayer::~NEPooling3dLayer() = default; -NEPooling3dLayer::NEPooling3dLayer(std::shared_ptr memory_manager) - : _impl(std::make_unique()) +NEPooling3dLayer::NEPooling3dLayer(std::shared_ptr memory_manager) : _impl(std::make_unique()) { _impl->memory_group = MemoryGroup(std::move(memory_manager)); } @@ -56,11 +56,12 @@ void NEPooling3dLayer::configure(const ITensor *input, ITensor *output, const Po _impl->op = std::make_unique(); _impl->op->configure(input->info(), output->info(), pool_info); - _impl->run_pack = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST_0, _impl->dst } }; + _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST_0, _impl->dst}}; _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } -Status NEPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info) +Status +NEPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info) { return cpu::CpuPool3d::validate(input, output, pool_info); } @@ -72,4 +73,4 @@ void NEPooling3dLayer::run() _impl->op->run(_impl->run_pack); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp index 5a3b9c5e7edee8d17ca764301f075139f06dcf04..eb9125be3c6ddb3d6ee68da14a57b5b6f219c4b3 100644 --- a/src/runtime/NEON/functions/NEPoolingLayer.cpp +++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuPool2d.h" @@ -33,10 +34,10 @@ namespace arm_compute { struct NEPoolingLayer::Impl { - ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - ITensor *indices{ nullptr }; - std::unique_ptr op{ nullptr }; + ITensor *src{nullptr}; + ITensor *dst{nullptr}; + ITensor *indices{nullptr}; + std::unique_ptr op{nullptr}; MemoryGroup memory_group{}; ITensorPack run_pack{}; WorkspaceData workspace_tensors{}; @@ -44,8 +45,7 @@ struct NEPoolingLayer::Impl NEPoolingLayer::~NEPoolingLayer() = default; -NEPoolingLayer::NEPoolingLayer(std::shared_ptr memory_manager) - : _impl(std::make_unique()) +NEPoolingLayer::NEPoolingLayer(std::shared_ptr memory_manager) : _impl(std::make_unique()) { _impl->memory_group = MemoryGroup(std::move(memory_manager)); } @@ -58,11 
+58,16 @@ void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLay _impl->op = std::make_unique(); _impl->op->configure(input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr); - _impl->run_pack = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST_0, _impl->dst }, { TensorType::ACL_DST_1, _impl->indices } }; + _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, + {TensorType::ACL_DST_0, _impl->dst}, + {TensorType::ACL_DST_1, _impl->indices}}; _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } -Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +Status NEPoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) { return cpu::CpuPool2d::validate(input, output, pool_info, indices); } diff --git a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp index aba09239cf00d9ece25044dd0bb45caabb040c30..dbb6bf9df1260e9ae14f23139590c939512ef252 100644 --- a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp +++ b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp @@ -27,15 +27,19 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEPriorBoxLayerKernel.h" namespace arm_compute { -void NEPriorBoxLayer::configure(const ITensor *input1, const ITensor *input2, ITensor *output, const PriorBoxLayerInfo &info) +void NEPriorBoxLayer::configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + const PriorBoxLayerInfo &info) { ARM_COMPUTE_LOG_PARAMS(input1, input2, output, info); @@ -44,7 +48,10 @@ void NEPriorBoxLayer::configure(const ITensor *input1, const ITensor *input2, IT _kernel = std::move(k); } -Status NEPriorBoxLayer::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info) +Status NEPriorBoxLayer::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info) { return NEPriorBoxLayerKernel::validate(input1, input2, output, info); } diff --git a/src/runtime/NEON/functions/NEQLSTMLayer.cpp b/src/runtime/NEON/functions/NEQLSTMLayer.cpp index 2caaea02d88fa8307d909d94c788da0f445fac72..dd78d10d164c1a01eb3dc03925dd8e97465798ef 100644 --- a/src/runtime/NEON/functions/NEQLSTMLayer.cpp +++ b/src/runtime/NEON/functions/NEQLSTMLayer.cpp @@ -27,13 +27,14 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/QuantizationInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" -#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h" #include "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h" namespace arm_compute @@ 
-41,12 +42,19 @@ namespace arm_compute using namespace arm_compute::utils::info_helpers; namespace { -Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, const ITensorInfo *mm_input, const ITensorInfo *mm_weights, const ITensorInfo *bias, - float gemmlowp_scale, const TensorInfo *mm_res_info, const TensorInfo *outstage_tensor_info) +Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, + const ITensorInfo *mm_input, + const ITensorInfo *mm_weights, + const ITensorInfo *bias, + float gemmlowp_scale, + const TensorInfo *mm_res_info, + const TensorInfo *outstage_tensor_info) { ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(mm_input, mm_weights, nullptr, mm_res_info)); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info)); return Status{}; } } // namespace @@ -55,10 +63,7 @@ Status NEQLSTMLayer::validate_layer_norm(const ITensorInfo &in, const ITensorInf { // Output quantization scale will be different, but ignored here // since it will be configured at configure() stage. - const TensorInfo out - { - in - }; + const TensorInfo out{in}; return NEQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias); } @@ -98,14 +103,12 @@ void NEQLSTMLayer::TensorCopyKernel::configure(ITensor &src, ITensor &dst) void NEQLSTMLayer::TensorCopyKernel::run() { - Iterator input_iter{ _src, _window }; - Iterator output_iter{ _dst, _window }; + Iterator input_iter{_src, _window}; + Iterator output_iter{_dst, _window}; - execute_window_loop(_window, [&](const Coordinates &) - { - memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); - }, - input_iter, output_iter); + execute_window_loop( + _window, [&](const Coordinates &) { memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); }, input_iter, + output_iter); } NEQLSTMLayer::~NEQLSTMLayer() = default; @@ -191,10 +194,17 @@ NEQLSTMLayer::NEQLSTMLayer(std::shared_ptr memory_manager) _memory_group = MemoryGroup(std::move(memory_manager)); } -void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, NEGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info, - const ITensor *mm_input, const ITensor *mm_weights, const ITensor *bias, - Tensor *mm_res, Tensor *outstage_res, float gemmlowp_scale, - const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info) +void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, + NEGEMMLowpOutputStage &outstage, + GEMMLowpOutputStageInfo &gemmlowp_info, + const ITensor *mm_input, + const ITensor *mm_weights, + const ITensor *bias, + Tensor *mm_res, + Tensor *outstage_res, + float gemmlowp_scale, + const TensorInfo &mm_res_info, + const TensorInfo &outstage_tensor_info) { _memory_group.manage(mm_res); _memory_group.manage(outstage_res); @@ -206,66 +216,87 @@ void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, NEGEMMLowpOutp mm.configure(mm_input, mm_weights, nullptr, mm_res); // Configure output stage - quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); + 
quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); outstage.configure(mm_res, bias, outstage_res, gemmlowp_info); mm_res->allocator()->allocate(); } -void NEQLSTMLayer::configure(const ITensor *input, - const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights, - const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights, - const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias, - const ITensor *cell_state_in, ITensor *output_state_in, - ITensor *cell_state_out, ITensor *output_state_out, ITensor *output, +void NEQLSTMLayer::configure(const ITensor *input, + const ITensor *input_to_forget_weights, + const ITensor *input_to_cell_weights, + const ITensor *input_to_output_weights, + const ITensor *recurrent_to_forget_weights, + const ITensor *recurrent_to_cell_weights, + const ITensor *recurrent_to_output_weights, + const ITensor *forget_gate_bias, + const ITensor *cell_bias, + const ITensor *output_gate_bias, + const ITensor *cell_state_in, + ITensor *output_state_in, + ITensor *cell_state_out, + ITensor *output_state_out, + ITensor *output, const LSTMParams &lstm_params) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); + forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, + cell_state_out, output_state_out); ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); + forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, + cell_state_out, output_state_out); // Set lstm parameters LSTMParams lstm_params_info{}; build_lstm_params_tensor_info(lstm_params, &lstm_params_info); - _input_to_forget_weights_transposed.info()->set_quantization_info(input_to_forget_weights->info()->quantization_info()); + _input_to_forget_weights_transposed.info()->set_quantization_info( + input_to_forget_weights->info()->quantization_info()); _input_to_cell_weights_transposed.info()->set_quantization_info(input_to_cell_weights->info()->quantization_info()); - _input_to_output_weights_transposed.info()->set_quantization_info(input_to_output_weights->info()->quantization_info()); - _recurrent_to_forget_weights_transposed.info()->set_quantization_info(recurrent_to_forget_weights->info()->quantization_info()); - _recurrent_to_cell_weights_transposed.info()->set_quantization_info(recurrent_to_cell_weights->info()->quantization_info()); - _recurrent_to_output_weights_transposed.info()->set_quantization_info(recurrent_to_output_weights->info()->quantization_info()); - - if(input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED) + _input_to_output_weights_transposed.info()->set_quantization_info( + input_to_output_weights->info()->quantization_info()); + _recurrent_to_forget_weights_transposed.info()->set_quantization_info( + recurrent_to_forget_weights->info()->quantization_info()); + 
_recurrent_to_cell_weights_transposed.info()->set_quantization_info( + recurrent_to_cell_weights->info()->quantization_info()); + _recurrent_to_output_weights_transposed.info()->set_quantization_info( + recurrent_to_output_weights->info()->quantization_info()); + + if (input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED) { _convert_input_to_forget_weights_to_qsymm8 = true; // Setup dequantize output tensor to go from QASYMM8_SIGNED -> F32 - _input_to_forget_weights_f32.allocator()->init(TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::F32) - .set_data_layout(input_to_forget_weights->info()->data_layout())); + _input_to_forget_weights_f32.allocator()->init( + TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::F32) + .set_data_layout(input_to_forget_weights->info()->data_layout())); // Setup the quantize output tensor to go from F32 -> QSYMM8 - _input_to_forget_weights_symm8.allocator()->init((TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::QSYMM8) - .set_data_layout(input_to_forget_weights->info()->data_layout()) - .set_quantization_info(input_to_forget_weights->info()->quantization_info()))); + _input_to_forget_weights_symm8.allocator()->init( + (TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::QSYMM8) + .set_data_layout(input_to_forget_weights->info()->data_layout()) + .set_quantization_info(input_to_forget_weights->info()->quantization_info()))); _dequantize_input_to_forget_weights.configure(input_to_forget_weights, &_input_to_forget_weights_f32); _quantize_input_to_forget_weights.configure(&_input_to_forget_weights_f32, &_input_to_forget_weights_symm8); - ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(input->info(), _input_to_forget_weights_symm8.info(), input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), - cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(), - lstm_params_info)); + ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate( + input->info(), _input_to_forget_weights_symm8.info(), input_to_cell_weights->info(), + input_to_output_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), + recurrent_to_output_weights->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), + cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), + output->info(), lstm_params_info)); } else { - ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), - cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(), - lstm_params_info)); + ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate( + input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), + input_to_output_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), + recurrent_to_output_weights->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), + 
cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), + output->info(), lstm_params_info)); } const int batch_size = input->info()->dimension(1); @@ -277,7 +308,9 @@ void NEQLSTMLayer::configure(const ITensor *input, const UniformQuantizationInfo qoutput_state_in = output_state_in->info()->quantization_info().uniform(); _projection_bias = lstm_params.projection_bias(); - _input_to_forget_weights = (input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED) ? &_input_to_forget_weights_symm8 : input_to_forget_weights; + _input_to_forget_weights = (input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED) + ? &_input_to_forget_weights_symm8 + : input_to_forget_weights; _input_to_cell_weights = input_to_cell_weights; _input_to_output_weights = input_to_output_weights; _recurrent_to_forget_weights = recurrent_to_forget_weights; @@ -287,7 +320,7 @@ void NEQLSTMLayer::configure(const ITensor *input, // Layer normalization _has_layer_norm = lstm_params.use_layer_norm(); - if(_has_layer_norm) + if (_has_layer_norm) { set_layer_norm_weight(lstm_params.forget_layer_norm_weights(), LayerNormGate::Forget); set_layer_norm_weight(lstm_params.cell_layer_norm_weights(), LayerNormGate::Cell); @@ -309,22 +342,25 @@ void NEQLSTMLayer::configure(const ITensor *input, // Calculate quantized parameters for clipping. int16_t quantized_cell_clip = 0; - if(lstm_params.cell_clip() > 0.0f) + if (lstm_params.cell_clip() > 0.0f) { quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in); } _has_cell_clipping = quantized_cell_clip > 0; // Precompute effective bias for optimizing the matmul computations. - if(!_has_cifg) + if (!_has_cifg) { _input_to_input_weights = lstm_params.input_to_input_weights(); _recurrent_to_input_weights = lstm_params.recurrent_to_input_weights(); _input_to_input_reduction = std::make_unique(); _recurrent_to_input_reduction = std::make_unique(); - _input_to_input_reduction->configure(_input_to_input_weights->info(), _input_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_input_reduction->configure(_recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_input_reduction->configure(_input_to_input_weights->info(), _input_to_input_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_input_reduction->configure( + _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); } _input_to_forget_reduction = std::make_unique(); @@ -334,19 +370,31 @@ void NEQLSTMLayer::configure(const ITensor *input, _input_to_output_reduction = std::make_unique(); _recurrent_to_output_reduction = std::make_unique(); - _input_to_forget_reduction->configure(input_to_forget_weights->info(), _input_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_forget_reduction->configure(recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_cell_reduction->configure(input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - 
_recurrent_to_cell_reduction->configure(recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_output_reduction->configure(input_to_output_weights->info(), _input_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_output_reduction->configure(recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - if(_has_projection) + _input_to_forget_reduction->configure(input_to_forget_weights->info(), _input_to_forget_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_forget_reduction->configure( + recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_cell_reduction->configure(input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_cell_reduction->configure( + recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_output_reduction->configure(input_to_output_weights->info(), _input_to_output_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_output_reduction->configure( + recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + if (_has_projection) { _projection_reduction = std::make_unique(); - _projection_reduction->configure(_projection_weights->info(), _projection_eff_bias.info(), GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); - if(_projection_bias != nullptr) + _projection_reduction->configure( + _projection_weights->info(), _projection_eff_bias.info(), + GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); + if (_projection_bias != nullptr) { - _projection_bias_add.configure(_projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE); + _projection_bias_add.configure(_projection_bias, &_projection_eff_bias, &_projection_eff_bias, + ConvertPolicy::SATURATE); } } @@ -354,15 +402,19 @@ void NEQLSTMLayer::configure(const ITensor *input, _transpose_input_to_forget_weights.configure(input_to_forget_weights, &_input_to_forget_weights_transposed); _transpose_input_to_cell_weights.configure(input_to_cell_weights, &_input_to_cell_weights_transposed); _transpose_input_to_output_weights.configure(input_to_output_weights, &_input_to_output_weights_transposed); - _transpose_recurrent_to_forget_weights.configure(recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed); + _transpose_recurrent_to_forget_weights.configure(recurrent_to_forget_weights, + &_recurrent_to_forget_weights_transposed); _transpose_recurrent_to_cell_weights.configure(recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed); - _transpose_recurrent_to_output_weights.configure(recurrent_to_output_weights, &_recurrent_to_output_weights_transposed); - if(!_has_cifg) + _transpose_recurrent_to_output_weights.configure(recurrent_to_output_weights, + &_recurrent_to_output_weights_transposed); + if (!_has_cifg) { - 
_transpose_input_to_input_weights.configure(lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed); - _transpose_recurrent_to_input_weights.configure(lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed); + _transpose_input_to_input_weights.configure(lstm_params.input_to_input_weights(), + &_input_to_input_weights_transposed); + _transpose_recurrent_to_input_weights.configure(lstm_params.recurrent_to_input_weights(), + &_recurrent_to_input_weights_transposed); } - if(_has_projection) + if (_has_projection) { _transpose_projection_weights.configure(_projection_weights, &_projection_weights_transposed); } @@ -375,40 +427,52 @@ void NEQLSTMLayer::configure(const ITensor *input, const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32); // Forget gate. - const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); - const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale(); - configure_mm(_mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, - input, &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, - &_mm_input_to_forget_res, &_input_to_forget_outstage_res, input_to_forget_scale, - mm_out_info, forget_gate_outstage_info); - - const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); - configure_mm(_mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias, - &_mm_recurrent_to_forget_res, &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale, - mm_out_info, forget_gate_outstage_info); - - _accumulate_input_recurrent_forget.configure(&_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); + const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); + const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.forget_intermediate_scale(); + configure_mm(_mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, input, + &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, &_mm_input_to_forget_res, + &_input_to_forget_outstage_res, input_to_forget_scale, mm_out_info, forget_gate_outstage_info); + + const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); + configure_mm(_mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias, &_mm_recurrent_to_forget_res, + &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale, mm_out_info, forget_gate_outstage_info); + + _accumulate_input_recurrent_forget.configure(&_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, + &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); _input_to_forget_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { 
_mul_cell_to_forget_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_forget_res); - _pixelwise_mul_cell_to_forget.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - _cell_to_forget_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_forget.configure(cell_state_in, lstm_params.cell_to_forget_weights(), + &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + _cell_to_forget_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_forget_outstage_res); - const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_forget_outstage.configure(&_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info); + const float cell_to_forget_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / + lstm_params.forget_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_forget_outstage.configure(&_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, + gemmlowp_info); _mul_cell_to_forget_res.allocator()->allocate(); - _accumulate_cell_forget.configure(&_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); + _accumulate_cell_forget.configure(&_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, + &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); _cell_to_forget_outstage_res.allocator()->allocate(); } Tensor *forget_activation_input = &_recurrent_to_forget_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Forget, forget_activation_input); forget_activation_input->allocator()->allocate(); @@ -417,33 +481,36 @@ void NEQLSTMLayer::configure(const ITensor *input, // Output quantization info of Sigmoid and Tanh activations const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0); - const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); + const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _memory_group.manage(&_forget_gate); _forget_gate.allocator()->init(forget_gate_info); - _forget_gate_sigmoid.configure(forget_activation_input, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _forget_gate_sigmoid.configure(forget_activation_input, &_forget_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); forget_activation_input->allocator()->allocate(); // Modulation gate. 
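
The forget-gate hunks above keep producing effective rescale factors of the form weight_scale * input_scale / intermediate_scale and handing them to quantization::calculate_quantized_multiplier, which folds the float into an integer multiplier plus a power-of-two shift for the GEMMLowp output stage. Below is a minimal standalone sketch of that kind of decomposition, for orientation only; the library helper's exact rounding and sign conventions may differ, and the scale values are invented.

#include <cmath>
#include <cstdint>
#include <cstdio>

// Decompose a positive rescale factor into a Q0.31 fixed-point multiplier
// and a power-of-two exponent so that scale ~= multiplier * 2^(exponent - 31).
static void decompose_scale(float scale, int32_t *multiplier, int *exponent)
{
    const double mantissa = std::frexp(scale, exponent); // mantissa in [0.5, 1)
    int64_t      q        = std::llround(mantissa * (1ll << 31));
    if (q == (1ll << 31)) // rounding can push the mantissa up to 1.0
    {
        q /= 2;
        ++(*exponent);
    }
    *multiplier = static_cast<int32_t>(q);
}

int main()
{
    // Invented example: weight scale * input scale / forget_intermediate_scale.
    const float input_to_forget_scale = 0.004f * 0.05f / 0.0009765625f;
    int32_t     multiplier            = 0;
    int         exponent              = 0;
    decompose_scale(input_to_forget_scale, &multiplier, &exponent);
    std::printf("scale=%f -> multiplier=%d, exponent=%d\n", input_to_forget_scale, multiplier, exponent);
    return 0;
}

The same multiplier/shift pattern recurs below for the cell, input and output gates, the peephole terms, the hidden state and the projection.
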
- const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); - const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale(); - configure_mm(_mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, - input, &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias, - &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale, + const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); + const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.cell_intermediate_scale(); + configure_mm(_mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, input, &_input_to_cell_weights_transposed, + &_input_to_cell_eff_bias, &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale, mm_out_info, cell_outstage_info); - const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); - configure_mm(_mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, - &_mm_recurrent_to_cell_res, &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, - mm_out_info, cell_outstage_info); + const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); + configure_mm(_mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, &_mm_recurrent_to_cell_res, + &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, mm_out_info, cell_outstage_info); - _accumulate_input_recurrent_modulation.configure(&_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE); + _accumulate_input_recurrent_modulation.configure(&_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, + &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE); _input_to_cell_outstage_res.allocator()->allocate(); Tensor *cell_activation_input = &_recurrent_to_cell_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Cell, cell_activation_input); cell_activation_input->allocator()->allocate(); @@ -454,14 +521,15 @@ void NEQLSTMLayer::configure(const ITensor *input, _memory_group.manage(&_cell_gate); _cell_gate.allocator()->init(cell_gate_info); - _cell_gate_tanh.configure(cell_activation_input, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); + _cell_gate_tanh.configure(cell_activation_input, &_cell_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); cell_activation_input->allocator()->allocate(); // Input gate. 
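
The input-gate code that follows has two paths: with the CIFG optimisation the gate is never computed from weights but coupled to the forget gate as i_t = 1 - f_t through a saturating subtraction against a tensor of ones, while the non-CIFG path runs the same matmul / output-stage / sigmoid pipeline as the forget gate. A scalar sketch of the CIFG coupling in the Q0.15 gate domain, assuming "one" is represented as 32767 (the library does this on whole tensors with NEArithmeticSubtraction):

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Coupled input gate under CIFG: i_t = 1 - f_t in QSYMM16 with scale 1/32768.
static int16_t cifg_input_gate(int16_t forget_gate_q15)
{
    const int32_t one_q15 = 32767; // closest representable value to 1.0
    const int32_t diff    = one_q15 - static_cast<int32_t>(forget_gate_q15);
    return static_cast<int16_t>(std::min<int32_t>(32767, std::max<int32_t>(-32768, diff)));
}

int main()
{
    // f_t = 0.25 in Q0.15 is 8192; the coupled input gate comes out near 0.75.
    std::printf("i_t = %f\n", cifg_input_gate(8192) / 32768.f);
    return 0;
}
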
const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _input_gate.allocator()->init(input_gate_info); _memory_group.manage(&_input_gate); - if(_has_cifg) + if (_has_cifg) { _ones.allocator()->init(*_forget_gate.info()); _input_gate_sub.configure(&_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE); @@ -469,104 +537,137 @@ void NEQLSTMLayer::configure(const ITensor *input, } else { - const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); - const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale(); - configure_mm(_mm_input_to_input, _input_to_input_outstage, gemmlowp_info, - input, &_input_to_input_weights_transposed, &_input_to_input_eff_bias, - &_mm_input_to_input_res, &_input_to_input_outstage_res, input_to_input_scale, - mm_out_info, input_outstage_info); - - const float recurrent_to_input_scale = _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale(); - configure_mm(_mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias, + const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); + const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.input_intermediate_scale(); + configure_mm(_mm_input_to_input, _input_to_input_outstage, gemmlowp_info, input, + &_input_to_input_weights_transposed, &_input_to_input_eff_bias, &_mm_input_to_input_res, + &_input_to_input_outstage_res, input_to_input_scale, mm_out_info, input_outstage_info); + + const float recurrent_to_input_scale = + _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / + lstm_params.input_intermediate_scale(); + configure_mm(_mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias, &_mm_recurrent_to_input_res, &_recurrent_to_input_outstage_res, recurrent_to_input_scale, mm_out_info, input_outstage_info); - _accumulate_input_recurrent_input.configure(&_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); + _accumulate_input_recurrent_input.configure(&_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, + &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); _input_to_input_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { - _mul_cell_to_input_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); + _mul_cell_to_input_res.allocator()->init( + TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_input_res); - _pixelwise_mul_cell_to_input.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / 
lstm_params.input_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_input_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_input.configure(cell_state_in, lstm_params.cell_to_input_weights(), + &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + const float cell_to_input_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / + lstm_params.input_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_input_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_input_outstage_res); - _cell_to_input_outstage.configure(&_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info); + _cell_to_input_outstage.configure(&_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, + gemmlowp_info); _mul_cell_to_input_res.allocator()->allocate(); - _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); + _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, + &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); _cell_to_input_outstage_res.allocator()->allocate(); } Tensor *input_activation_input = &_recurrent_to_input_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Input, input_activation_input); input_activation_input->allocator()->allocate(); input_activation_input = &get_layer_norm_output(LayerNormGate::Input); } - _input_gate_sigmoid.configure(input_activation_input, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _input_gate_sigmoid.configure(input_activation_input, &_input_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); input_activation_input->allocator()->allocate(); } // Cell. 
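
The cell-state hunks below realise the recurrence c_t = f_t * c_{t-1} + i_t * g_t with NEPixelWiseMultiplication and a saturating NEArithmeticAddition, followed by an optional clip implemented as a bounded ReLU. For reference, the real-valued form of that update is sketched here; the actual kernels operate on QSYMM16 data using the scales computed in this diff.

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

// Real-valued cell-state update: c_t = clip(f_t * c_{t-1} + i_t * g_t).
static void update_cell_state(std::vector<float>       &cell_state,
                              const std::vector<float> &forget_gate,
                              const std::vector<float> &input_gate,
                              const std::vector<float> &cell_gate,
                              float                     cell_clip)
{
    for (std::size_t i = 0; i < cell_state.size(); ++i)
    {
        float c = forget_gate[i] * cell_state[i] + input_gate[i] * cell_gate[i];
        if (cell_clip > 0.f)
        {
            c = std::max(-cell_clip, std::min(cell_clip, c));
        }
        cell_state[i] = c;
    }
}

int main()
{
    std::vector<float> c{0.5f, -0.25f};
    update_cell_state(c, {0.9f, 0.1f}, {0.2f, 0.8f}, {0.7f, -0.6f}, 1.0f);
    std::printf("c = [%f, %f]\n", c[0], c[1]);
    return 0;
}
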
// TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication - _pixelwise_mul_forget_cell.configure(&_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_forget_cell.configure(&_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); const float cell_gate_scale = _cell_gate.info()->quantization_info().uniform().scale; const float mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift); - const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(mul_input_cell_scale, 0)); + const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(mul_input_cell_scale, 0)); _memory_group.manage(&_mul_input_cell_res); _mul_input_cell_res.allocator()->init(mul_input_cell_info); - _pixelwise_mul_input_cell.configure(&_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_input_cell.configure(&_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _cell_gate.allocator()->allocate(); _add_forget_cell.configure(&_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE); _mul_input_cell_res.allocator()->allocate(); _forget_gate.allocator()->allocate(); - if(_has_cell_clipping) + if (_has_cell_clipping) { - _cell_clip.configure(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip)); + _cell_clip.configure(cell_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_cell_clip, quantized_cell_clip)); } // Output gate. 
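
The clip configured just above operates on raw int16 cell-state values, which is why the float cell_clip threshold was mapped into the cell state's QSYMM16 domain earlier in configure() via quantize_qsymm16. A sketch of that symmetric 16-bit quantisation follows; the rounding mode and the example scale are assumptions, not the library helper itself.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Symmetric 16-bit quantisation: q = clamp(round(value / scale), -32768, 32767).
static int16_t quantize_symm16(float value, float scale)
{
    const int32_t q = static_cast<int32_t>(std::lround(value / scale));
    return static_cast<int16_t>(std::min<int32_t>(32767, std::max<int32_t>(-32768, q)));
}

int main()
{
    const float cell_state_scale = std::pow(2.f, -11); // illustrative power-of-two cell scale
    std::printf("cell_clip of 8.0 -> %d\n", quantize_symm16(8.f, cell_state_scale));
    return 0;
}
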
- const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); - const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale(); - configure_mm(_mm_input_to_output, _input_to_output_outstage, gemmlowp_info, - input, &_input_to_output_weights_transposed, &_input_to_output_eff_bias, - &_mm_input_to_output_res, &_input_to_output_outstage_res, input_to_output_scale, - mm_out_info, output_outstage_info); - - const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale(); - configure_mm(_mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias, - &_mm_recurrent_to_output_res, &_recurrent_to_output_outstage_res, recurrent_to_output_scale, - mm_out_info, output_outstage_info); - - _accumulate_input_recurrent_output.configure(&_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); + const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); + const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.output_intermediate_scale(); + configure_mm(_mm_input_to_output, _input_to_output_outstage, gemmlowp_info, input, + &_input_to_output_weights_transposed, &_input_to_output_eff_bias, &_mm_input_to_output_res, + &_input_to_output_outstage_res, input_to_output_scale, mm_out_info, output_outstage_info); + + const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.output_intermediate_scale(); + configure_mm(_mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias, &_mm_recurrent_to_output_res, + &_recurrent_to_output_outstage_res, recurrent_to_output_scale, mm_out_info, output_outstage_info); + + _accumulate_input_recurrent_output.configure(&_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, + &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); _input_to_output_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication // Here we are not using the output stage because all operations are done in float _mul_cell_to_output_res.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_output_res); - _pixelwise_mul_cell_to_output.configure(cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - - const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - 
_cell_to_output_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_output.configure(cell_state_out, lstm_params.cell_to_output_weights(), + &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + + const float cell_to_output_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / + lstm_params.output_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_output_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_output_outstage_res); - _cell_to_output_outstage.configure(&_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, gemmlowp_info); + _cell_to_output_outstage.configure(&_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, + gemmlowp_info); _mul_cell_to_output_res.allocator()->allocate(); - _accumulate_cell_to_output.configure(&_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); + _accumulate_cell_to_output.configure(&_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, + &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); _cell_to_output_outstage_res.allocator()->allocate(); } Tensor *output_activation_input = &_recurrent_to_output_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Output, output_activation_input); output_activation_input->allocator()->allocate(); @@ -576,20 +677,24 @@ void NEQLSTMLayer::configure(const ITensor *input, _memory_group.manage(&_output_gate); _output_gate.allocator()->init(output_gate_info); - _output_gate_sigmoid.configure(output_activation_input, &_output_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _output_gate_sigmoid.configure(output_activation_input, &_output_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); output_activation_input->allocator()->allocate(); // Hidden. 
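
The hidden-state computation that follows multiplies two Q0.15 operands, the output gate and tanh(c_t), so the integer product carries a scale of 2^-15 * 2^-15; dividing that by the hidden state's own scale gives the factor this diff passes to calculate_quantized_multiplier (with ignore_epsilon set). A few lines showing the arithmetic with an invented hidden-state scale:

#include <cmath>
#include <cstdio>

int main()
{
    const float hidden_scale  = 0.007f; // invented QASYMM8_SIGNED output scale
    const float product_scale = std::pow(2.f, -15) * std::pow(2.f, -15); // two Q0.15 operands
    const float rescale       = product_scale / hidden_scale;            // factor fed to the output stage
    std::printf("rescale factor = %g\n", rescale);
    return 0;
}
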
- _hidden_tanh.configure(cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); + _hidden_tanh.configure(cell_state_out, &_input_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication _memory_group.manage(&_hidden_mul_res); const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32); _hidden_mul_res.allocator()->init(hidden_mul_res); - _pixelwise_mul_hidden.configure(&_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_hidden.configure(&_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _output_gate.allocator()->allocate(); _input_gate.allocator()->allocate(); const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15); - quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true); + quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true); gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); gemmlowp_info.output_data_type = output_state_in->info()->data_type(); @@ -598,7 +703,7 @@ void NEQLSTMLayer::configure(const ITensor *input, _memory_group.manage(&_hidden_gate); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_gate.allocator()->init(*output_state_out->info()); _hidden_gate.info()->set_tensor_shape(_hidden_mul_res.info()->tensor_shape()); @@ -609,27 +714,26 @@ void NEQLSTMLayer::configure(const ITensor *input, _hidden_mul_res.allocator()->allocate(); // Projection. 
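
In the projection branch below, the optional projection clip is handled like the cell clip but in the 8-bit output domain: the float threshold is divided by the projection weights' scale and clamped to [-128, 127] before being handed to a bounded ReLU on output_state_out. A scalar sketch, with an invented scale value:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Quantise the projection clip threshold into the int8 range.
static int8_t quantize_projection_clip(float clip, float projection_scale)
{
    const float q = clip / projection_scale;
    return static_cast<int8_t>(std::max(-128.f, std::min(127.f, q)));
}

int main()
{
    std::printf("projection clip -> %d\n", quantize_projection_clip(3.f, 0.05f));
    return 0;
}
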
- if(_has_projection) + if (_has_projection) { const TensorInfo projection_outstage_info(*output_state_out->info()); - const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform(); - const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; - gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; - gemmlowp_info.gemmlowp_min_bound = std::numeric_limits::lowest(); - gemmlowp_info.gemmlowp_max_bound = std::numeric_limits::max(); - gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; - - TensorInfo projection_mm_out_info{ mm_out_info }; + const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform(); + const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; + gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; + gemmlowp_info.gemmlowp_min_bound = std::numeric_limits::lowest(); + gemmlowp_info.gemmlowp_max_bound = std::numeric_limits::max(); + gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; + + TensorInfo projection_mm_out_info{mm_out_info}; projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size)); - configure_mm(_mm_projection, _projection_outstage, gemmlowp_info, - hidden_gate_result, &_projection_weights_transposed, &_projection_eff_bias, - &_mm_projection_res, &_projection_outstage_res, projection_scale, - projection_mm_out_info, projection_outstage_info); + configure_mm(_mm_projection, _projection_outstage, gemmlowp_info, hidden_gate_result, + &_projection_weights_transposed, &_projection_eff_bias, &_mm_projection_res, + &_projection_outstage_res, projection_scale, projection_mm_out_info, projection_outstage_info); ITensor *accumulate_destination = output_state_out; - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_gate.allocator()->allocate(); _projection_accumulate_res.allocator()->init(*output_state_in->info()); @@ -638,30 +742,34 @@ void NEQLSTMLayer::configure(const ITensor *input, accumulate_destination = &_projection_accumulate_res; } - _accumulate_projection.configure(&_projection_outstage_res, accumulate_destination, accumulate_destination, ConvertPolicy::SATURATE); + _accumulate_projection.configure(&_projection_outstage_res, accumulate_destination, accumulate_destination, + ConvertPolicy::SATURATE); _projection_outstage_res.allocator()->allocate(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_accumulate_to_output_copy.configure(_projection_accumulate_res, *output_state_out); _projection_accumulate_res.allocator()->allocate(); } - int8_t quantized_projection_clip{ 0 }; - if(lstm_params.projection_clip() > 0.0f) + int8_t quantized_projection_clip{0}; + if (lstm_params.projection_clip() > 0.0f) { - quantized_projection_clip = utility::clamp(lstm_params.projection_clip() / qprojection.scale, -128, 127); + quantized_projection_clip = + utility::clamp(lstm_params.projection_clip() / qprojection.scale, -128, 127); } - if(quantized_projection_clip > 0) + if (quantized_projection_clip > 0) { - _projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, quantized_projection_clip)); + _projection_clip.configure(output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_projection_clip, quantized_projection_clip)); 
_has_projection_clipping = true; } } else { - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_to_output_copy.configure(_hidden_gate, *output_state_out); _hidden_gate.allocator()->allocate(); @@ -672,17 +780,27 @@ void NEQLSTMLayer::configure(const ITensor *input, _copy_output.configure(output_state_out, output); } -Status NEQLSTMLayer::validate(const ITensorInfo *input, - const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output, +Status NEQLSTMLayer::validate(const ITensorInfo *input, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out, + const ITensorInfo *output, const LSTMParams &lstm_params) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, - recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, - cell_state_out, output_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + cell_state_in, output_state_in, cell_state_out, output_state_out, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != 2, "Input must have exactly 2 dimensions"); @@ -694,22 +812,27 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2); ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->dimension(0) != input_size); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, input_to_cell_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, + input_to_cell_weights); ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2); ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->dimension(1) != num_units); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QASYMM8_SIGNED, DataType::QSYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights); + 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QASYMM8_SIGNED, + DataType::QSYMM8); // If the input_to_forget_weights data type is DataType::QSYMM8 then it can never match the other weights as they are all DataType::QASYMM8_SIGNED if (input_to_forget_weights->data_type() == DataType::QSYMM8) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights); } else { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); } ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1); ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->dimension(0) != num_units); @@ -728,20 +851,25 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_in); // Check whether peephole weights are all there or none - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, + DataType::QSYMM16); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->dimension(0) != num_units); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_output_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_output_weights()); - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_input_weights()); } } @@ -755,7 +883,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, // Calculate quantized parameters for clipping. 
int16_t quantized_cell_clip = 0; - if(lstm_params.cell_clip() > 0.0f) + if (lstm_params.cell_clip() > 0.0f) { quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in); } @@ -763,60 +891,90 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, // Precompute effective bias for optimizing the matmul computations. const TensorInfo eff_bias_info(TensorShape(num_units), 1, DataType::S32); const TensorInfo projection_eff_bias_info(TensorShape(output_size), 1, DataType::S32); - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, - -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, - -qoutput_state_in.offset, - true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + lstm_params.input_to_input_weights(), &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + lstm_params.recurrent_to_input_weights(), &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); } - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, - -qoutput_state_in.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, - true))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, - -qoutput_state_in.offset, true))); - if(lstm_params.has_projection()) + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + recurrent_to_forget_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + recurrent_to_cell_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, 
-qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + recurrent_to_output_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + if (lstm_params.has_projection()) { - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false, - lstm_params.hidden_state_zero(), - true))); - if(lstm_params.projection_bias() != nullptr) + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + lstm_params.projection_weights(), &projection_eff_bias_info, + GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true))); + if (lstm_params.projection_bias() != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.projection_bias(), 1, DataType::S32); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info, &projection_eff_bias_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info, + &projection_eff_bias_info, ConvertPolicy::SATURATE)); } } - const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_cell_weights->data_type(), input_to_cell_weights->quantization_info()); - const TensorInfo input_to_output_weights_transposed(TensorShape(num_units, input_size), 1, input_to_output_weights->data_type(), input_to_output_weights->quantization_info()); - const TensorInfo recurrent_to_forget_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info()); - const TensorInfo recurrent_to_cell_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_cell_weights->data_type(), recurrent_to_cell_weights->quantization_info()); - const TensorInfo recurrent_to_output_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_output_weights->data_type(), recurrent_to_output_weights->quantization_info()); - const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info()); + const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_cell_weights->data_type(), + input_to_cell_weights->quantization_info()); + const TensorInfo input_to_output_weights_transposed(TensorShape(num_units, input_size), 1, + input_to_output_weights->data_type(), + input_to_output_weights->quantization_info()); + const TensorInfo recurrent_to_forget_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_forget_weights->data_type(), + recurrent_to_forget_weights->quantization_info()); + const TensorInfo recurrent_to_cell_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_cell_weights->data_type(), + recurrent_to_cell_weights->quantization_info()); + const TensorInfo recurrent_to_output_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_output_weights->data_type(), + recurrent_to_output_weights->quantization_info()); 
+ const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_forget_weights->data_type(), + recurrent_to_forget_weights->quantization_info()); ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_cell_weights, &input_weights_transposed)); ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_output_weights, &input_to_output_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_forget_weights, &recurrent_to_forget_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_cell_weights, &recurrent_to_cell_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_output_weights, &recurrent_to_output_weights_transposed)); - if(!lstm_params.has_cifg_opt()) + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(recurrent_to_forget_weights, &recurrent_to_forget_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(recurrent_to_cell_weights, &recurrent_to_cell_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(recurrent_to_output_weights, &recurrent_to_output_weights_transposed)); + if (!lstm_params.has_cifg_opt()) { - const TensorInfo recurrent_to_input_weights_transposed(TensorShape(num_units, output_size), 1, - recurrent_to_forget_weights->data_type(), lstm_params.recurrent_to_input_weights()->quantization_info()); + const TensorInfo recurrent_to_input_weights_transposed( + TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), + lstm_params.recurrent_to_input_weights()->quantization_info()); const TensorInfo input_to_input_weights_transposed(TensorShape(num_units, input_size), 1, - lstm_params.input_to_input_weights()->data_type(), lstm_params.input_to_input_weights()->quantization_info()); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.input_to_input_weights(), &input_to_input_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_to_input_weights_transposed)); + lstm_params.input_to_input_weights()->data_type(), + lstm_params.input_to_input_weights()->quantization_info()); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(lstm_params.input_to_input_weights(), &input_to_input_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_to_input_weights_transposed)); } - if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { - const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info()); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed)); + const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, + lstm_params.projection_weights()->data_type(), + lstm_params.projection_weights()->quantization_info()); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed)); } GEMMLowpOutputStageInfo gemmlowp_info; @@ -829,28 +987,42 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, // Forget gate. 
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_intermediate_scale() == 0); - const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); + const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32); - const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_forget_scale, &mm_out_info, &forget_outstage_info)); + const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_forget_scale, &mm_out_info, &forget_outstage_info)); - const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info)); + const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, + &forget_outstage_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, + &forget_outstage_info, ConvertPolicy::SATURATE)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, + DataType::QSYMM16); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + const float cell_to_forget_scale = 
std::pow(2, cell_shift) * + lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / + lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, + &forget_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.forget_layer_norm_weights(); const ITensorInfo *b_info = forget_gate_bias; @@ -859,22 +1031,31 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, // Output quantization info of Sigmoid and Tanh activations const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0); - const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); + const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_outstage_info, &forget_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&forget_outstage_info, &forget_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Modulation gate. ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_intermediate_scale() == 0); - const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); - const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_cell_scale, &mm_out_info, &cell_outstage_info)); - - const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE)); - - if(has_layer_norm) + const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); + const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.cell_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_cell_scale, &mm_out_info, &cell_outstage_info)); + + const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, + &cell_outstage_info)); + + 
ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, + &cell_outstage_info, ConvertPolicy::SATURATE)); + + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.cell_layer_norm_weights(); const ITensorInfo *b_info = cell_bias; @@ -882,94 +1063,134 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, } const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_outstage_info, &cell_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&cell_outstage_info, &cell_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); // Input gate. const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used"); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, + "Input gate bias must not be present when CIFG is used"); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, + &forget_gate_info, ConvertPolicy::SATURATE)); } else { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), + lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); // If the input_to_forget_weights data type is DataType::QSYMM8 then it can never match the other weights as they are all DataType::QASYMM8_SIGNED if (input_to_forget_weights->data_type() == DataType::QSYMM8) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.input_to_input_weights(), + lstm_params.recurrent_to_input_weights()); } else { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, + lstm_params.input_to_input_weights(), + lstm_params.recurrent_to_input_weights()); } ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_forget_weights, lstm_params.input_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, lstm_params.recurrent_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, + lstm_params.recurrent_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_intermediate_scale() == 0); - const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); - const float input_to_input_scale = 
lstm_params.input_to_input_weights()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_input_scale, &mm_out_info, &input_outstage_info)); - - const float recurrent_to_input_scale = lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE)); - - if(lstm_params.has_peephole_opt()) + const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); + const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * + qinput.scale / lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_input_scale, &mm_out_info, &input_outstage_info)); + + const float recurrent_to_input_scale = + lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / + lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_input_scale, &mm_out_info, + &input_outstage_info)); + + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, + &input_outstage_info, ConvertPolicy::SATURATE)); + + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, + 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + const float cell_to_input_scale = std::pow(2, cell_shift) * + lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / + lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, + 
&input_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.input_layer_norm_weights(); const ITensorInfo *b_info = lstm_params.input_gate_bias(); ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(input_outstage_info, *w_info, *b_info)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_outstage_info, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&input_outstage_info, &input_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); } // Cell. - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE)); - if(quantized_cell_clip > 0) + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE)); + if (quantized_cell_clip > 0) { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, - quantized_cell_clip))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(cell_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_cell_clip, quantized_cell_clip))); } // Output gate. 
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_intermediate_scale() == 0); - const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); - const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_output_scale, &mm_out_info, &output_outstage_info)); - - const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE)); - if(lstm_params.has_peephole_opt()) + const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); + const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.output_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_output_scale, &mm_out_info, &output_outstage_info)); + + const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.output_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_output_scale, &mm_out_info, + &output_outstage_info)); + + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, + &output_outstage_info, ConvertPolicy::SATURATE)); + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, + DataType::QSYMM16); // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication // Here we are not using the output stage because all operations are done in float // const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale(); // ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); + 
ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, + &output_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.output_layer_norm_weights(); const ITensorInfo *b_info = output_gate_bias; @@ -977,85 +1198,103 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, } const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_outstage_info, &output_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&output_outstage_info, &output_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Hidden. - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(cell_state_out, &input_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32); const TensorInfo hidden_out_info(TensorShape(num_units, batch_size), 1, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.hidden_state_scale() == 0); const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true)); gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); gemmlowp_info.output_data_type = hidden_out_info.data_type(); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info)); const bool projection_tensor_copy_required = num_units != output_size; // Projection. 
- if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, lstm_params.projection_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, + lstm_params.projection_weights()); ARM_COMPUTE_RETURN_ERROR_ON(qoutput_state_in.scale == 0); - const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform(); - const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform(); + const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest(); gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max(); gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; const TensorInfo projection_outstage_info(*output_state_out); - const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info()); + const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, + lstm_params.projection_weights()->data_type(), + lstm_params.projection_weights()->quantization_info()); - TensorInfo projection_mm_out_info{ mm_out_info }; + TensorInfo projection_mm_out_info{mm_out_info}; projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, &projection_eff_bias_info, projection_scale, &projection_mm_out_info, + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, + &projection_eff_bias_info, projection_scale, &projection_mm_out_info, &projection_outstage_info)); - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { - ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, + ConvertPolicy::SATURATE)); - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { - ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out)); } - int8_t quantized_projection_clip{ 0 }; - if(lstm_params.projection_clip() > 0.0f) + int8_t quantized_projection_clip{0}; + if (lstm_params.projection_clip() > 0.0f) { quantized_projection_clip =
quantize_qasymm8_signed(lstm_params.projection_clip(), qprojection); } - if(quantized_projection_clip > 0) + if (quantized_projection_clip > 0) { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, - quantized_projection_clip))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_projection_clip, quantized_projection_clip))); } } else { - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(hidden_out_info, *output_state_out)); } } - if(cell_state_out->total_size() > 0) + if (cell_state_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(cell_state_in, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(cell_state_in, cell_state_out); } - if(output_state_out->total_size() > 0) + if (output_state_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output_state_in, output_state_out); @@ -1080,14 +1319,14 @@ void NEQLSTMLayer::run() _recurrent_to_forget_outstage.run(); _accumulate_input_recurrent_forget.run(); - if(_has_peephole) + if (_has_peephole) { _pixelwise_mul_cell_to_forget.run(); _cell_to_forget_outstage.run(); _accumulate_cell_forget.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Forget).get(), Window::DimY); } @@ -1102,7 +1341,7 @@ void NEQLSTMLayer::run() _recurrent_to_cell_outstage.run(); _accumulate_input_recurrent_modulation.run(); - if(_has_layer_norm) + if (_has_layer_norm) { NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Cell).get(), Window::DimY); } @@ -1110,7 +1349,7 @@ void NEQLSTMLayer::run() _cell_gate_tanh.run(); // Input gate - if(_has_cifg) + if (_has_cifg) { _input_gate_sub.run(); } @@ -1122,14 +1361,14 @@ void NEQLSTMLayer::run() _recurrent_to_input_outstage.run(); _accumulate_input_recurrent_input.run(); - if(_has_peephole) + if (_has_peephole) { _pixelwise_mul_cell_to_input.run(); _cell_to_input_outstage.run(); _accumulate_cell_input.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Input).get(), Window::DimY); } @@ -1142,7 +1381,7 @@ void NEQLSTMLayer::run() _pixelwise_mul_input_cell.run(); _add_forget_cell.run(); - if(_has_cell_clipping) + if (_has_cell_clipping) { _cell_clip.run(); } @@ -1153,14 +1392,14 @@ void NEQLSTMLayer::run() _mm_recurrent_to_output.run(); _recurrent_to_output_outstage.run(); _accumulate_input_recurrent_output.run(); - if(_has_peephole) + if (_has_peephole) { _pixelwise_mul_cell_to_output.run(); _cell_to_output_outstage.run(); _accumulate_cell_to_output.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Output).get(), Window::DimY); } @@ -1173,31 +1412,31 @@ void NEQLSTMLayer::run() _hidden_outstage.run(); // Projection. 
- if(_has_projection) + if (_has_projection) { _mm_projection.run(); _projection_outstage.run(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_output_to_accumulate_copy.run(); } _accumulate_projection.run(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_accumulate_to_output_copy.run(); } - if(_has_projection_clipping) + if (_has_projection_clipping) { _projection_clip.run(); } } else { - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_to_output_copy.run(); } @@ -1209,9 +1448,9 @@ void NEQLSTMLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { - if(_convert_input_to_forget_weights_to_qsymm8) + if (_convert_input_to_forget_weights_to_qsymm8) { _input_to_forget_weights_f32.allocator()->allocate(); _input_to_forget_weights_symm8.allocator()->allocate(); @@ -1234,28 +1473,25 @@ void NEQLSTMLayer::prepare() _transpose_recurrent_to_output_weights.run(); // Precompute effective biases - if(_has_cifg) + if (_has_cifg) { - std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 32767); + std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()), + _ones.info()->total_size() / _ones.info()->element_size(), 32767); } else { _input_to_input_eff_bias.allocator()->allocate(); _recurrent_to_input_eff_bias.allocator()->allocate(); - ITensorPack packII = - { - { TensorType::ACL_SRC, _input_to_input_weights }, - { TensorType::ACL_DST, &_input_to_input_eff_bias } - }; - NEScheduler::get().schedule_op(_input_to_input_reduction.get(), Window::DimY, _input_to_input_reduction->window(), packII); + ITensorPack packII = {{TensorType::ACL_SRC, _input_to_input_weights}, + {TensorType::ACL_DST, &_input_to_input_eff_bias}}; + NEScheduler::get().schedule_op(_input_to_input_reduction.get(), Window::DimY, + _input_to_input_reduction->window(), packII); - ITensorPack packRI = - { - { TensorType::ACL_SRC, _recurrent_to_input_weights }, - { TensorType::ACL_DST, &_recurrent_to_input_eff_bias } - }; - NEScheduler::get().schedule_op(_recurrent_to_input_reduction.get(), Window::DimY, _recurrent_to_input_reduction->window(), packRI); + ITensorPack packRI = {{TensorType::ACL_SRC, _recurrent_to_input_weights}, + {TensorType::ACL_DST, &_recurrent_to_input_eff_bias}}; + NEScheduler::get().schedule_op(_recurrent_to_input_reduction.get(), Window::DimY, + _recurrent_to_input_reduction->window(), packRI); _input_to_input_weights_transposed.allocator()->allocate(); _recurrent_to_input_weights_transposed.allocator()->allocate(); @@ -1271,58 +1507,44 @@ void NEQLSTMLayer::prepare() _input_to_output_eff_bias.allocator()->allocate(); _recurrent_to_output_eff_bias.allocator()->allocate(); - ITensorPack packIF = - { - { TensorType::ACL_SRC, _input_to_forget_weights }, - { TensorType::ACL_DST, &_input_to_forget_eff_bias } - }; - NEScheduler::get().schedule_op(_input_to_forget_reduction.get(), Window::DimY, _input_to_forget_reduction->window(), packIF); - - ITensorPack packRF = - { - { TensorType::ACL_SRC, _recurrent_to_forget_weights }, - { TensorType::ACL_DST, &_recurrent_to_forget_eff_bias } - }; - NEScheduler::get().schedule_op(_recurrent_to_forget_reduction.get(), Window::DimY, _recurrent_to_forget_reduction->window(), packRF); - - ITensorPack packIC = - { - { TensorType::ACL_SRC, _input_to_cell_weights }, - { TensorType::ACL_DST, &_input_to_cell_eff_bias } - }; - NEScheduler::get().schedule_op(_input_to_cell_reduction.get(), Window::DimY,
_input_to_cell_reduction->window(), packIC); - - ITensorPack packRC = - { - { TensorType::ACL_SRC, _recurrent_to_cell_weights }, - { TensorType::ACL_DST, &_recurrent_to_cell_eff_bias } - }; - NEScheduler::get().schedule_op(_recurrent_to_cell_reduction.get(), Window::DimY, _recurrent_to_cell_reduction->window(), packRC); - - ITensorPack packIO = - { - { TensorType::ACL_SRC, _input_to_output_weights }, - { TensorType::ACL_DST, &_input_to_output_eff_bias } - }; - NEScheduler::get().schedule_op(_input_to_output_reduction.get(), Window::DimY, _input_to_output_reduction->window(), packIO); - - ITensorPack packRO = - { - { TensorType::ACL_SRC, _recurrent_to_output_weights }, - { TensorType::ACL_DST, &_recurrent_to_output_eff_bias } - }; - NEScheduler::get().schedule_op(_recurrent_to_output_reduction.get(), Window::DimY, _recurrent_to_output_reduction->window(), packRO); - - if(_has_projection) + ITensorPack packIF = {{TensorType::ACL_SRC, _input_to_forget_weights}, + {TensorType::ACL_DST, &_input_to_forget_eff_bias}}; + NEScheduler::get().schedule_op(_input_to_forget_reduction.get(), Window::DimY, + _input_to_forget_reduction->window(), packIF); + + ITensorPack packRF = {{TensorType::ACL_SRC, _recurrent_to_forget_weights}, + {TensorType::ACL_DST, &_recurrent_to_forget_eff_bias}}; + NEScheduler::get().schedule_op(_recurrent_to_forget_reduction.get(), Window::DimY, + _recurrent_to_forget_reduction->window(), packRF); + + ITensorPack packIC = {{TensorType::ACL_SRC, _input_to_cell_weights}, + {TensorType::ACL_DST, &_input_to_cell_eff_bias}}; + NEScheduler::get().schedule_op(_input_to_cell_reduction.get(), Window::DimY, _input_to_cell_reduction->window(), + packIC); + + ITensorPack packRC = {{TensorType::ACL_SRC, _recurrent_to_cell_weights}, + {TensorType::ACL_DST, &_recurrent_to_cell_eff_bias}}; + NEScheduler::get().schedule_op(_recurrent_to_cell_reduction.get(), Window::DimY, + _recurrent_to_cell_reduction->window(), packRC); + + ITensorPack packIO = {{TensorType::ACL_SRC, _input_to_output_weights}, + {TensorType::ACL_DST, &_input_to_output_eff_bias}}; + NEScheduler::get().schedule_op(_input_to_output_reduction.get(), Window::DimY, + _input_to_output_reduction->window(), packIO); + + ITensorPack packRO = {{TensorType::ACL_SRC, _recurrent_to_output_weights}, + {TensorType::ACL_DST, &_recurrent_to_output_eff_bias}}; + NEScheduler::get().schedule_op(_recurrent_to_output_reduction.get(), Window::DimY, + _recurrent_to_output_reduction->window(), packRO); + + if (_has_projection) { _projection_eff_bias.allocator()->allocate(); - ITensorPack pack = - { - { TensorType::ACL_SRC, _projection_weights }, - { TensorType::ACL_DST, &_projection_eff_bias } - }; - NEScheduler::get().schedule_op(_projection_reduction.get(), Window::DimY, _projection_reduction->window(), pack); - if(_projection_bias != nullptr) + ITensorPack pack = {{TensorType::ACL_SRC, _projection_weights}, + {TensorType::ACL_DST, &_projection_eff_bias}}; + NEScheduler::get().schedule_op(_projection_reduction.get(), Window::DimY, _projection_reduction->window(), + pack); + if (_projection_bias != nullptr) { _projection_bias_add.run(); _projection_bias->mark_as_unused(); @@ -1332,7 +1554,7 @@ void NEQLSTMLayer::prepare() _transpose_projection_weights.run(); _projection_weights->mark_as_unused(); - if(!_projection_tensor_copy_required) + if (!_projection_tensor_copy_required) { _hidden_gate.mark_as_unused(); _projection_accumulate_res.mark_as_unused(); diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp 
b/src/runtime/NEON/functions/NEQuantizationLayer.cpp index dad246ac8937e874704cf2d4ab8370c3203000fb..9b72783c97394ee16708909cc89677c38dde82ee 100644 --- a/src/runtime/NEON/functions/NEQuantizationLayer.cpp +++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp @@ -26,19 +26,19 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Tensor.h" + #include "src/cpu/operators/CpuQuantize.h" namespace arm_compute { struct NEQuantizationLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<cpu::CpuQuantize> op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuQuantize> op{nullptr}; }; -NEQuantizationLayer::NEQuantizationLayer() - : _impl(std::make_unique<Impl>()) +NEQuantizationLayer::NEQuantizationLayer() : _impl(std::make_unique<Impl>()) { } NEQuantizationLayer::~NEQuantizationLayer() = default; diff --git a/src/runtime/NEON/functions/NERNNLayer.cpp b/src/runtime/NEON/functions/NERNNLayer.cpp index a66ef3d27a784d1c28d29de99a4b1d82d061f407..2824693800723d7b92cd0759a0c9cf1c46265186 100644 --- a/src/runtime/NEON/functions/NERNNLayer.cpp +++ b/src/runtime/NEON/functions/NERNNLayer.cpp @@ -27,9 +27,10 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" namespace arm_compute @@ -37,13 +38,26 @@ namespace arm_compute NERNNLayer::~NERNNLayer() = default; NERNNLayer::NERNNLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_f(), _activation(), _fully_connected(memory_manager), _copy_f(), _fully_connected_out(), _gemm_output(), _add_output(), + : _memory_group(std::move(memory_manager)), + _gemm_state_f(), + _add_f(), + _activation(), + _fully_connected(memory_manager), + _copy_f(), + _fully_connected_out(), + _gemm_output(), + _add_output(), _is_prepared(false) { } -Status NERNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state, - const ITensorInfo *output, const ActivationLayerInfo &info) +Status NERNNLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *recurrent_weights, + const ITensorInfo *bias, + const ITensorInfo *hidden_state, + const ITensorInfo *output, + const ActivationLayerInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); @@ -60,24 +74,34 @@ Status NERNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), hidden_state->tensor_shape()); - auto shape_info = TensorInfo(misc::shape_calculator::compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type()); + auto shape_info = + TensorInfo(misc::shape_calculator::compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, + input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, weights, bias, &shape_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&shape_info, &shape_info,
&shape_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&shape_info, &shape_info, info)); return Status{}; } -void NERNNLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights, const ITensor *bias, ITensor *hidden_state, ITensor *output, +void NERNNLayer::configure(const ITensor *input, + const ITensor *weights, + const ITensor *recurrent_weights, + const ITensor *bias, + ITensor *hidden_state, + ITensor *output, ActivationLayerInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); - ARM_COMPUTE_ERROR_THROW_ON(NERNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(NERNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), + bias->info(), hidden_state->info(), output->info(), info)); ARM_COMPUTE_LOG_PARAMS(input, weights, recurrent_weights, bias, hidden_state, output, info); const int idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); - TensorShape shape = misc::shape_calculator::compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height)); + TensorShape shape = misc::shape_calculator::compute_rnn_shape(recurrent_weights->info(), + hidden_state->info()->dimension(idx_height)); _is_prepared = false; @@ -125,7 +149,7 @@ void NERNNLayer::run() void NERNNLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _fully_connected.prepare(); _gemm_state_f.prepare(); diff --git a/src/runtime/NEON/functions/NEROIAlignLayer.cpp b/src/runtime/NEON/functions/NEROIAlignLayer.cpp index a9bdb50d95fab3c1d161e406a4c964954679294f..68bb5d5ef3e9e61b0e5416d14a8e4826bd59078b 100644 --- a/src/runtime/NEON/functions/NEROIAlignLayer.cpp +++ b/src/runtime/NEON/functions/NEROIAlignLayer.cpp @@ -29,14 +29,20 @@ namespace arm_compute { -Status NEROIAlignLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status NEROIAlignLayer::validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ON_ERROR(NEROIAlignLayerKernel::validate(input, rois, output, pool_info)); return Status{}; } -void NEROIAlignLayer::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info) +void NEROIAlignLayer::configure(const ITensor *input, + const ITensor *rois, + ITensor *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info); diff --git a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp index a24f2aac50e19245fa0fda1e22c2128d837cb4a5..babec4aa92b68b1fa0a554b61d0cc7834c745387 100644 --- a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp +++ b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp @@ -22,8 +22,10 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h" + #include "arm_compute/core/Helpers.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h" @@ -31,17 +33,22 @@ namespace arm_compute { NEROIPoolingLayer::~NEROIPoolingLayer() = default; -NEROIPoolingLayer::NEROIPoolingLayer() - : _roi_kernel() +NEROIPoolingLayer::NEROIPoolingLayer() : _roi_kernel() { } -Status NEROIPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status NEROIPoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *rois, + const ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { return NEROIPoolingLayerKernel::validate(input, rois, output, pool_info); } -void NEROIPoolingLayer::configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info) +void NEROIPoolingLayer::configure(const ITensor *input, + const ITensor *rois, + const ITensor *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info); @@ -53,4 +60,4 @@ void NEROIPoolingLayer::run() { NEScheduler::get().schedule(_roi_kernel.get(), Window::DimX); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NERange.cpp b/src/runtime/NEON/functions/NERange.cpp index a6f7be8be05d4b68a7de47c758331f06a321e80a..95492df126ff6e1c026267eac03c531b79d372f8 100644 --- a/src/runtime/NEON/functions/NERange.cpp +++ b/src/runtime/NEON/functions/NERange.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NERange.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NERangeKernel.h" @@ -31,8 +32,7 @@ namespace arm_compute { NERange::~NERange() = default; -NERange::NERange() - : _kernel() +NERange::NERange() : _kernel() { } @@ -52,4 +52,4 @@ void NERange::run() { NEScheduler::get().schedule(_kernel.get(), Window::DimX); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp index 9f964792951384a017d535bcc782c537bf6592dd..d37cf4a8d0c1f3078c556ebca11b8ebb5111cefb 100644 --- a/src/runtime/NEON/functions/NEReduceMean.cpp +++ b/src/runtime/NEON/functions/NEReduceMean.cpp @@ -25,21 +25,24 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/common/utils/Log.h" #include "src/core/CPP/Validate.h" -#include "src/core/NEON/kernels/NEReductionOperationKernel.h" #include "src/core/helpers/AutoConfiguration.h" +#include "src/core/NEON/kernels/NEReductionOperationKernel.h" namespace arm_compute { namespace { -Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) +Status +validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) { ARM_COMPUTE_UNUSED(keep_dims); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + 
DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1); ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); @@ -47,29 +50,29 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax const int input_dims = input->num_dimensions(); Coordinates axis_local = reduction_axis; - for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i) + for (unsigned int i = 0; i < axis_local.num_dimensions(); ++i) { //axis: The dimensions to reduce. Must be in the range [-rank(input_tensor), rank(input_tensor)). ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast<int>(input->num_dimensions()))); ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast<int>(input->num_dimensions())); } - if(output->tensor_shape().total_size() != 0) + if (output->tensor_shape().total_size() != 0) { // Only validate if not using auto_init for the output tensor TensorShape out_shape = input->tensor_shape(); // Validate output_shape only if not using auto_init convert_negative_axis(axis_local, input_dims); std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); - for(unsigned int i = 0; i < reduction_ops; ++i) + for (unsigned int i = 0; i < reduction_ops; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1); - if(output->total_size() > 0 && keep_dims) + if (output->total_size() > 0 && keep_dims) { ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); } - if(keep_dims) + if (keep_dims) { out_shape.set(axis_local[i], 1); } @@ -91,11 +94,19 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax NEReduceMean::~NEReduceMean() = default; NEReduceMean::NEReduceMean(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _reduction_ops(), _keep_dims() + : _memory_group(std::move(memory_manager)), + _reduction_kernels(), + _reduced_outs(), + _reshape(), + _reduction_ops(), + _keep_dims() { } -Status NEReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) +Status NEReduceMean::validate(const ITensorInfo *input, + const Coordinates &reduction_axis, + bool keep_dims, + const ITensorInfo *output) { return validate_config(input, reduction_axis, keep_dims, output); } @@ -107,7 +118,8 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(NEReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info())); // Output auto inizialitation if not yet initialized - const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims); auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); _reduction_ops = reduction_axis.num_dimensions(); @@ -124,37 +136,40 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, convert_negative_axis(axis_local, input_dims); // Perform reduction for every axis - for(int i = 0; i < _reduction_ops; ++i) + for (int i = 0; i < _reduction_ops; ++i) { - TensorShape out_shape = i == 0 ?
tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); + TensorShape out_shape = + i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); out_shape.set(axis_local[i], 1); auto in = (i == 0) ? tmp_input : (&_reduced_outs[i - 1]); - if(i == _reduction_ops - 1 && keep_dims) + if (i == _reduction_ops - 1 && keep_dims) { _reduction_kernels[i].configure(in, tmp_output, axis_local[i], ReductionOperation::MEAN_SUM); } else { - _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_output->info()->num_channels(), tmp_output->info()->data_type(), tmp_output->info()->quantization_info())); + _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_output->info()->num_channels(), + tmp_output->info()->data_type(), + tmp_output->info()->quantization_info())); _memory_group.manage(&_reduced_outs[i]); _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM); } } // Allocate intermediate tensors - for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i) + for (int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i) { _reduced_outs[i].allocator()->allocate(); } // Configure reshape layer if we want to drop the dimensions - if(!keep_dims) + if (!keep_dims) { TensorShape out_shape = tmp_input->info()->tensor_shape(); // We have to sort the reduction axis vectors in order for remove_dimension // to work properly std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); - for(int i = 0; i < _reduction_ops; ++i) + for (int i = 0; i < _reduction_ops; ++i) { out_shape.remove_dimension(axis_local[i] - i, false); } @@ -166,11 +181,11 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, void NEReduceMean::run() { MemoryGroupResourceScope scope_mg(_memory_group); - for(auto &kernel : _reduction_kernels) + for (auto &kernel : _reduction_kernels) { kernel.run(); } - if(!_keep_dims) + if (!_keep_dims) { _reshape.run(); } diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp index 9660347a1621b047f991d94ef63d12b0c5e28c7a..8540d750fc4945473aa2c81469bec897c6b90fa4 100644 --- a/src/runtime/NEON/functions/NEReductionOperation.cpp +++ b/src/runtime/NEON/functions/NEReductionOperation.cpp @@ -26,9 +26,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" -#include "src/core/NEON/kernels/NEReductionOperationKernel.h" #include "src/core/helpers/AutoConfiguration.h" +#include "src/core/NEON/kernels/NEReductionOperationKernel.h" namespace arm_compute { @@ -42,7 +43,7 @@ namespace */ size_t reduction_window_split_dimension(unsigned int axis) { - switch(axis) + switch (axis) { case 0: return Window::DimY; @@ -59,13 +60,21 @@ size_t reduction_window_split_dimension(unsigned int axis) NEReductionOperation::~NEReductionOperation() = default; NEReductionOperation::NEReductionOperation(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(memory_manager), _reduction_kernel(), _reshape(), _output_internal(), _window_split(0), _reduction_axis(), _is_reshape_required(false) + : _memory_group(memory_manager), + _reduction_kernel(), + _reshape(), + _output_internal(), + _window_split(0), + _reduction_axis(), + _is_reshape_required(false) { } -Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool
keep_dims) +Status NEReductionOperation::validate( + const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, + "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); const auto is_reshape_required = !keep_dims; @@ -74,9 +83,10 @@ Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInf TensorInfo info_before_reshape; - if(is_reshape_required) + if (is_reshape_required) { - const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims)); + const TensorInfo expected_output_shape = output->clone()->set_tensor_shape( + arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output); auto shape_before_reshape = input->tensor_shape(); @@ -84,17 +94,20 @@ Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInf const auto input_num_channles = input->num_channels(); const auto input_qinfo = input->quantization_info(); - const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN); - const auto output_data_type = is_arg_min_max ? DataType::S32 : output->data_type(); + const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN); + const auto output_data_type = is_arg_min_max ? 
DataType::S32 : output->data_type(); - info_before_reshape.set_data_type(output_data_type).set_tensor_shape(shape_before_reshape).set_num_channels(input_num_channles).set_quantization_info(input_qinfo); + info_before_reshape.set_data_type(output_data_type) + .set_tensor_shape(shape_before_reshape) + .set_num_channels(input_num_channles) + .set_quantization_info(input_qinfo); output_internal = &info_before_reshape; } ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output_internal, axis, op)); - if(is_reshape_required) + if (is_reshape_required) { ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(output_internal, output)); } @@ -102,7 +115,8 @@ Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInf return Status{}; } -void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) +void NEReductionOperation::configure( + ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, output, axis, op, keep_dims); @@ -112,19 +126,32 @@ void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned i auto *output_internal = output; const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN); - if(_is_reshape_required) + if (_is_reshape_required) { - const auto output_internal_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis); - const auto output_external_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); - const auto output_data_type = is_arg_min_max ? DataType::S32 : input->info()->data_type(); - const auto num_channels = input->info()->num_channels(); - const auto qinfo = input->info()->quantization_info(); - - _output_internal.allocator()->init(input->info()->clone()->set_data_type(output_data_type).set_tensor_shape(output_internal_shape).reset_padding().set_is_resizable(true).set_num_channels( - num_channels).set_quantization_info(qinfo)); + const auto output_internal_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis); + const auto output_external_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); + const auto output_data_type = is_arg_min_max ? 
DataType::S32 : input->info()->data_type(); + const auto num_channels = input->info()->num_channels(); + const auto qinfo = input->info()->quantization_info(); + + _output_internal.allocator()->init(input->info() + ->clone() + ->set_data_type(output_data_type) + .set_tensor_shape(output_internal_shape) + .reset_padding() + .set_is_resizable(true) + .set_num_channels(num_channels) + .set_quantization_info(qinfo)); _memory_group.manage(&_output_internal); output_internal = &_output_internal; - auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_data_type).set_tensor_shape(output_external_shape).reset_padding().set_is_resizable(true)); + auto_init_if_empty(*output->info(), input->info() + ->clone() + ->set_data_type(output_data_type) + .set_tensor_shape(output_external_shape) + .reset_padding() + .set_is_resizable(true)); } ARM_COMPUTE_ERROR_THROW_ON(NEReductionOperation::validate(input->info(), output->info(), axis, op, keep_dims)); @@ -135,7 +162,7 @@ void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned i _window_split = reduction_window_split_dimension(axis); _reduction_axis = axis; - if(_is_reshape_required) + if (_is_reshape_required) { _reshape.configure(output_internal, output); _output_internal.allocator()->allocate(); @@ -146,7 +173,7 @@ void NEReductionOperation::run() { MemoryGroupResourceScope scope_mg(_memory_group); NEScheduler::get().schedule(_reduction_kernel.get(), _window_split); - if(_is_reshape_required) + if (_is_reshape_required) { _reshape.run(); } diff --git a/src/runtime/NEON/functions/NEReorderLayer.cpp b/src/runtime/NEON/functions/NEReorderLayer.cpp index 427bf8c50193f1b174c54bb23538e04c986977d2..89cf575f38717d09030298e24744c7c0eb478f24 100644 --- a/src/runtime/NEON/functions/NEReorderLayer.cpp +++ b/src/runtime/NEON/functions/NEReorderLayer.cpp @@ -23,20 +23,24 @@ */ #if defined(__aarch64__) -#include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/NEON/functions/NEReorderLayer.h" + +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/core/NEON/kernels/NEReorderKernel.h" namespace arm_compute { NEReorderLayer::~NEReorderLayer() = default; -NEReorderLayer::NEReorderLayer() - : _reorder_kernel(std::make_unique()) +NEReorderLayer::NEReorderLayer() : _reorder_kernel(std::make_unique()) { } -void NEReorderLayer::configure(const ITensor *input, ITensor *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf) +void NEReorderLayer::configure(const ITensor *input, + ITensor *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf) { auto k = std::make_unique(); k->configure(input, output, input_wf, output_wf); @@ -49,11 +53,14 @@ void NEReorderLayer::run() NEScheduler::get().schedule(_reorder_kernel.get(), Window::DimX); } -Status NEReorderLayer::validate(const ITensorInfo *input, const ITensorInfo *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf) +Status NEReorderLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf) { return NEReorderKernel::validate(input, output, input_wf, output_wf); } } // namespace arm_compute -#endif // defined(__aarch64__) \ No newline at end of file +#endif // defined(__aarch64__) diff --git a/src/runtime/NEON/functions/NEReorgLayer.cpp b/src/runtime/NEON/functions/NEReorgLayer.cpp index 8ee73d739003ed8ffc5993e15032ce197ef9b883..14e41d6df477112c9e1925cae84146940cbb6ba5 100644 --- 
a/src/runtime/NEON/functions/NEReorgLayer.cpp +++ b/src/runtime/NEON/functions/NEReorgLayer.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEReorgLayer.h" -#include "src/core/NEON/kernels/NEReorgLayerKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEReorgLayerKernel.h" namespace arm_compute { diff --git a/src/runtime/NEON/functions/NEReshapeLayer.cpp b/src/runtime/NEON/functions/NEReshapeLayer.cpp index 3ccb42361e615bd7853f923575aac04068e5a6f4..bed70ff66cae4223e6f2884e84b75f91bddea798 100644 --- a/src/runtime/NEON/functions/NEReshapeLayer.cpp +++ b/src/runtime/NEON/functions/NEReshapeLayer.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuReshape.h" #include <utility> @@ -32,16 +33,15 @@ namespace arm_compute { struct NEReshapeLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<cpu::CpuReshape> op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuReshape> op{nullptr}; }; -NEReshapeLayer::NEReshapeLayer() - : _impl(std::make_unique<Impl>()) +NEReshapeLayer::NEReshapeLayer() : _impl(std::make_unique<Impl>()) { } -NEReshapeLayer::NEReshapeLayer(NEReshapeLayer &&) = default; +NEReshapeLayer::NEReshapeLayer(NEReshapeLayer &&) = default; NEReshapeLayer &NEReshapeLayer::operator=(NEReshapeLayer &&) = default; NEReshapeLayer::~NEReshapeLayer() = default; diff --git a/src/runtime/NEON/functions/NEReverse.cpp b/src/runtime/NEON/functions/NEReverse.cpp index d4ed2a9018c5b45dc8cfb648bb1d3ab41d4e5827..a90f8d2e76c22c587cd848a6e526bc71f88101b4 100644 --- a/src/runtime/NEON/functions/NEReverse.cpp +++ b/src/runtime/NEON/functions/NEReverse.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited.
* * SPDX-License-Identifier: MIT * @@ -23,23 +23,25 @@ */ #include "arm_compute/runtime/NEON/functions/NEReverse.h" -#include "src/core/NEON/kernels/NEReverseKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEReverseKernel.h" namespace arm_compute { -void NEReverse::configure(const ITensor *input, ITensor *output, const ITensor *axis) +void NEReverse::configure(const ITensor *input, ITensor *output, const ITensor *axis, bool use_inverted_axis) { ARM_COMPUTE_LOG_PARAMS(input, output, axis); auto k = std::make_unique<NEReverseKernel>(); - k->configure(input, output, axis); + k->configure(input, output, axis, use_inverted_axis); _kernel = std::move(k); } -Status NEReverse::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis) +Status NEReverse::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *axis, + bool use_inverted_axis) { - return NEReverseKernel::validate(input, output, axis); + return NEReverseKernel::validate(input, output, axis, use_inverted_axis); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp index 09f037334e9f04cdd3afb43833e337fe94f5cf8c..0d011064f6f537fac6970e38c83093732faa23f2 100644 --- a/src/runtime/NEON/functions/NEScale.cpp +++ b/src/runtime/NEON/functions/NEScale.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEScale.h" #include "arm_compute/runtime/Tensor.h" + #include "src/common/utils/Log.h" #include "src/core/utils/ScaleUtils.h" #include "src/cpu/operators/CpuScale.h" @@ -32,16 +33,16 @@ namespace arm_compute { struct NEScale::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - Tensor dx{ nullptr }; /**< Element's distance between the X real coordinate and the smallest X following integer */ - Tensor dy{ nullptr }; /**< Element's distance between the Y real coordinate and the smallest Y following integer */ - Tensor offsets{ nullptr }; /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */ - std::unique_ptr<cpu::CpuScale> op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + Tensor dx{nullptr}; /**< Element's distance between the X real coordinate and the smallest X following integer */ + Tensor dy{nullptr}; /**< Element's distance between the Y real coordinate and the smallest Y following integer */ + Tensor offsets{ + nullptr}; /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */ + std::unique_ptr<cpu::CpuScale> op{nullptr}; }; -NEScale::NEScale() - : _impl(std::make_unique<Impl>()) +NEScale::NEScale() : _impl(std::make_unique<Impl>()) { } NEScale::~NEScale() = default; @@ -57,25 +58,33 @@ void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo & // Configure for size of allocation of internal tensors // Get data layout and width/height indices - const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : info.data_layout; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const DataLayout data_layout = + info.data_layout == DataLayout::UNKNOWN ?
input->info()->data_layout() : info.data_layout; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); // Compute the ratio between source width/height and destination width/height - const bool is_align_corners_used = info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy); - const auto wr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_width), output->info()->dimension(idx_width), is_align_corners_used); - const auto hr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_height), output->info()->dimension(idx_height), is_align_corners_used); + const bool is_align_corners_used = + info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy); + const auto wr = arm_compute::scale_utils::calculate_resize_ratio( + input->info()->dimension(idx_width), output->info()->dimension(idx_width), is_align_corners_used); + const auto hr = arm_compute::scale_utils::calculate_resize_ratio( + input->info()->dimension(idx_height), output->info()->dimension(idx_height), is_align_corners_used); // Area interpolation behaves as Nearest Neighbour in case of up-sampling - InterpolationPolicy policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : info.interpolation_policy; + InterpolationPolicy policy_to_use = + (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) + ? InterpolationPolicy::NEAREST_NEIGHBOR + : info.interpolation_policy; // Get the tensor shape TensorShape shape(output->info()->dimension(idx_width)); shape.set(1, output->info()->dimension(idx_height), false); - bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required(data_layout, input->info()->data_type(), policy_to_use, info.border_mode); + bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required( + data_layout, input->info()->data_type(), policy_to_use, info.border_mode); - if(precompute_indices_weights) + if (precompute_indices_weights) { const TensorInfo tensor_info_dxdy(shape, Format::F32); const TensorInfo tensor_info_offsets(shape, Format::S32); @@ -83,7 +92,7 @@ void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo & _impl->dx.allocator()->init(tensor_info_dxdy); _impl->dy.allocator()->init(tensor_info_dxdy); _impl->offsets.allocator()->init(tensor_info_offsets); - switch(policy_to_use) + switch (policy_to_use) { case InterpolationPolicy::NEAREST_NEIGHBOR: { @@ -109,7 +118,8 @@ void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo & } else { - if(policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR && policy_to_use != InterpolationPolicy::BILINEAR && policy_to_use != InterpolationPolicy::AREA) + if (policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR && policy_to_use != InterpolationPolicy::BILINEAR && + policy_to_use != InterpolationPolicy::AREA) { ARM_COMPUTE_ERROR("Unsupported interpolation mode"); } diff --git a/src/runtime/NEON/functions/NESelect.cpp b/src/runtime/NEON/functions/NESelect.cpp index 26c2eb8fe9874ba105330d44ac5f18e9467a7353..55cad2202b6ff47faf662b14f3279de1dff368b8 100644 --- a/src/runtime/NEON/functions/NESelect.cpp +++ b/src/runtime/NEON/functions/NESelect.cpp @@ -24,6 +24,7 @@ #include 
"arm_compute/runtime/NEON/functions/NESelect.h" #include "arm_compute/core/Types.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NESelectKernel.h" diff --git a/src/runtime/NEON/functions/NESlice.cpp b/src/runtime/NEON/functions/NESlice.cpp index 4a8912bfe994c4503306f277a7c860330d6a0aad..12d43adc843ca7048cf81e08d4670c3322f01ae0 100644 --- a/src/runtime/NEON/functions/NESlice.cpp +++ b/src/runtime/NEON/functions/NESlice.cpp @@ -25,8 +25,9 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" +#include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEStridedSliceKernel.h" @@ -34,7 +35,10 @@ namespace arm_compute { namespace experimental { -void NESlice::configure(const ITensorInfo *input, ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +void NESlice::configure(const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends); @@ -47,15 +51,16 @@ void NESlice::configure(const ITensorInfo *input, ITensorInfo *output, const Coo _kernel = std::move(k); } -Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +Status NESlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); // Check start dimensions for being non-negative - ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) - { - return i < 0; - })); + ARM_COMPUTE_RETURN_ERROR_ON( + std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) { return i < 0; })); // Get absolute end coordinates const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends); @@ -66,20 +71,22 @@ Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, co struct NESlice::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NESlice::NESlice() - : _impl(std::make_unique()) +NESlice::NESlice() : _impl(std::make_unique()) { } -NESlice::NESlice(NESlice &&) = default; +NESlice::NESlice(NESlice &&) = default; NESlice &NESlice::operator=(NESlice &&) = default; NESlice::~NESlice() = default; -Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +Status NESlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { return experimental::NESlice::validate(input, output, starts, ends); } diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp index 0947ff94a67b08412a502e7725c8edb44c582187..e3c2012d059c03cbd2b62c935a00dd413b218933 100644 --- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp +++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp @@ -22,9 +22,11 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h" + #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/core/helpers/SoftmaxHelpers.h" #include "src/cpu/kernels/CpuSoftmaxKernel.h" @@ -35,10 +37,10 @@ namespace arm_compute template struct NESoftmaxLayerGeneric::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - Tensor max{ nullptr }; - std::unique_ptr> op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + Tensor max{nullptr}; + std::unique_ptr> op{nullptr}; MemoryGroup memory_group{}; ITensorPack run_pack{}; WorkspaceData workspace_tensors{}; @@ -53,9 +55,9 @@ NESoftmaxLayerGeneric::NESoftmaxLayerGeneric(std::shared_ptr NESoftmaxLayerGeneric::NESoftmaxLayerGeneric(NESoftmaxLayerGeneric &&) = default; -template +template NESoftmaxLayerGeneric &NESoftmaxLayerGeneric::operator=(NESoftmaxLayerGeneric &&) = default; -template +template NESoftmaxLayerGeneric::~NESoftmaxLayerGeneric() = default; template @@ -68,12 +70,13 @@ void NESoftmaxLayerGeneric::configure(ITensor *input, ITensor *output, f _impl->op = std::make_unique>(); _impl->op->configure(input->info(), output->info(), beta, axis); - _impl->run_pack = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST, _impl->dst } }; + _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST, _impl->dst}}; _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } template -Status NESoftmaxLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) +Status +NESoftmaxLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuSoftmaxGeneric::validate(input, output, beta, axis)); @@ -81,7 +84,7 @@ Status NESoftmaxLayerGeneric::validate(const ITensorInfo *input, const I } template -void NESoftmaxLayerGeneric::run() +void NESoftmaxLayerGeneric::run() { // Acquire all the temporaries MemoryGroupResourceScope scope_mg(_impl->memory_group); diff --git a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp index c4509510dcac1af224f2f0783adc4b41a489e52a..556ebdd800db5e3050b092c648d9db1b32d5c2fb 100644 --- a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp +++ b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp @@ -28,8 +28,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/NEON/functions/NEFill.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NESpaceToBatchLayerKernel.h" @@ -37,17 +38,19 @@ namespace arm_compute { NESpaceToBatchLayer::~NESpaceToBatchLayer() = default; -NESpaceToBatchLayer::NESpaceToBatchLayer() - : _space_to_batch_kernel(), _fill_f(), _has_padding(false) +NESpaceToBatchLayer::NESpaceToBatchLayer() : _space_to_batch_kernel(), _fill_f(), _has_padding(false) { } -void NESpaceToBatchLayer::configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output) +void NESpaceToBatchLayer::configure(const ITensor *input, + const ITensor *block_shape, + const ITensor *paddings, + ITensor *output) { 
ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); ARM_COMPUTE_LOG_PARAMS(input, block_shape, paddings, output); - if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; _fill_f = std::make_unique(); @@ -57,11 +60,16 @@ void NESpaceToBatchLayer::configure(const ITensor *input, const ITensor *block_s _space_to_batch_kernel->configure(input, block_shape, paddings, output); } -void NESpaceToBatchLayer::configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output) +void NESpaceToBatchLayer::configure(const ITensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ITensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; _fill_f = std::make_unique(); @@ -71,17 +79,25 @@ void NESpaceToBatchLayer::configure(const ITensor *input, const int block_shape_ _space_to_batch_kernel->configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output); } -Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output) +Status NESpaceToBatchLayer::validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(input, block_shape, paddings, output)); return Status{}; } -Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, +Status NESpaceToBatchLayer::validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); + ARM_COMPUTE_RETURN_ON_ERROR( + NESpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); return Status{}; } @@ -89,7 +105,7 @@ Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const int block_s void NESpaceToBatchLayer::run() { // Zero out output only if we have paddings - if(_has_padding) + if (_has_padding) { _fill_f->run(); } diff --git a/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp b/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp index b37bf0d20f120cd9a3a4a9a5565c903abe5283e1..846b619429940fa09bb0443c9b0d8aa23a9b21b5 100644 --- a/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp +++ b/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NESpaceToDepthLayerKernel.h" @@ -36,8 +37,7 @@ namespace arm_compute { NESpaceToDepthLayer::~NESpaceToDepthLayer() = default; -NESpaceToDepthLayer::NESpaceToDepthLayer() - : _space_to_depth_kernel() 
+NESpaceToDepthLayer::NESpaceToDepthLayer() : _space_to_depth_kernel() { } diff --git a/src/runtime/NEON/functions/NESplit.cpp b/src/runtime/NEON/functions/NESplit.cpp index db19bbb824f764da4ff9ccf8cd7b2a5e657db3bc..53b09e9ae59590f9a30f4e95764ab5154d4bd06d 100644 --- a/src/runtime/NEON/functions/NESplit.cpp +++ b/src/runtime/NEON/functions/NESplit.cpp @@ -34,7 +34,7 @@ namespace arm_compute { void NESplit::run() { - for(unsigned i = 0; i < _num_outputs; ++i) + for (unsigned i = 0; i < _num_outputs; ++i) { _slice_functions[i].run(); } diff --git a/src/runtime/NEON/functions/NEStackLayer.cpp b/src/runtime/NEON/functions/NEStackLayer.cpp index 68554e0931e29106567f686703b854f58d3d95b5..2f88ffca2ad4a3ab3acfdba4ff67318023ca4d7a 100644 --- a/src/runtime/NEON/functions/NEStackLayer.cpp +++ b/src/runtime/NEON/functions/NEStackLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -30,6 +30,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEStackLayerKernel.h" @@ -38,9 +39,7 @@ namespace arm_compute NEStackLayer::~NEStackLayer() = default; NEStackLayer::NEStackLayer() // NOLINT - : _input(), - _stack_kernels(), - _num_inputs(0) + : _stack_kernel(std::make_unique<NEStackLayerKernel>()), _is_prepared(false) { } @@ -48,17 +47,10 @@ void NEStackLayer::configure(const std::vector &input, int axis, ITen { ARM_COMPUTE_LOG_PARAMS(input, axis, output); - _num_inputs = input.size(); - _stack_kernels.resize(_num_inputs); - // Wrap around negative values const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1)); - for(unsigned int i = 0; i < _num_inputs; i++) - { - _stack_kernels[i] = std::make_unique<NEStackLayerKernel>(); - _stack_kernels[i]->configure(input[i], axis_u, i, _num_inputs, output); - } + _stack_kernel->configure(input, axis_u, output); } Status NEStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis, const ITensorInfo *output) @@ -70,24 +62,20 @@ Status NEStackLayer::validate(const std::vector &input, int axis, const size_t rank = input[0]->num_dimensions(); const unsigned int axis_u = wrap_around(axis, static_cast<int>(rank + 1)); - const unsigned int num_inputs = input.size(); - - for(unsigned int i = 0; i < num_inputs; i++) - { - // All the tensors must have the same rank - ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank); - // Validate Kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEStackLayerKernel::validate(input[i], axis_u, i, num_inputs, output)); - } + // Validate Kernel + ARM_COMPUTE_RETURN_ON_ERROR(NEStackLayerKernel::validate(input, axis_u, output)); return Status{}; } void NEStackLayer::run() { - for(unsigned i = 0; i < _num_inputs; i++) + if (!_is_prepared) { - NEScheduler::get().schedule(_stack_kernels[i].get(), Window::DimY); + _stack_kernel->prepare(); + _is_prepared = true; } + + NEScheduler::get().schedule(_stack_kernel.get(), _stack_kernel->get_split_dimension()); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEStridedSlice.cpp b/src/runtime/NEON/functions/NEStridedSlice.cpp index 4f50749a4f77c9a80cc10b17798d8eb1c5f835b4..6a3ac8be059a150bc60419236f675aaf3a5d4171 100644 --- a/src/runtime/NEON/functions/NEStridedSlice.cpp +++ b/src/runtime/NEON/functions/NEStridedSlice.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" + #include
"src/common/utils/Log.h" #include "src/core/NEON/kernels/NEStridedSliceKernel.h" @@ -32,9 +33,14 @@ namespace arm_compute { namespace experimental { -void NEStridedSlice::configure(const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void NEStridedSlice::configure(const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); @@ -43,9 +49,14 @@ void NEStridedSlice::configure(const ITensorInfo *input, ITensorInfo *output, _kernel = std::move(k); } -Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status NEStridedSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { return NEStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); } @@ -53,22 +64,26 @@ Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *out struct NEStridedSlice::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEStridedSlice::NEStridedSlice() - : _impl(std::make_unique()) +NEStridedSlice::NEStridedSlice() : _impl(std::make_unique()) { } -NEStridedSlice::NEStridedSlice(NEStridedSlice &&) = default; +NEStridedSlice::NEStridedSlice(NEStridedSlice &&) = default; NEStridedSlice &NEStridedSlice::operator=(NEStridedSlice &&) = default; NEStridedSlice::~NEStridedSlice() = default; -void NEStridedSlice::configure(const ITensor *input, ITensor *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void NEStridedSlice::configure(const ITensor *input, + ITensor *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { _impl->src = input; _impl->dst = output; @@ -84,10 +99,16 @@ void NEStridedSlice::run() _impl->op->run(pack); } -Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status NEStridedSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { - return experimental::NEStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + return experimental::NEStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, + shrink_axis_mask); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NETile.cpp b/src/runtime/NEON/functions/NETile.cpp index 
526603f1a35a3a60028a1f5662fa1901161adf94..d10b1c8e95ce05d5c8fbce2a8a9b3be9bcea8e91 100644 --- a/src/runtime/NEON/functions/NETile.cpp +++ b/src/runtime/NEON/functions/NETile.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NETile.h" -#include "src/core/NEON/kernels/NETileKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NETileKernel.h" namespace arm_compute { diff --git a/src/runtime/NEON/functions/NETranspose.cpp b/src/runtime/NEON/functions/NETranspose.cpp index 78c7ea202a9c775ddd7d41cf103754b984489cce..0144a85e8cb47729bbfd98fb2843e0a7bdadf3c7 100644 --- a/src/runtime/NEON/functions/NETranspose.cpp +++ b/src/runtime/NEON/functions/NETranspose.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NETranspose.h" #include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include "src/cpu/operators/CpuTranspose.h" @@ -31,13 +32,12 @@ namespace arm_compute { struct NETranspose::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NETranspose::NETranspose() - : _impl(std::make_unique()) +NETranspose::NETranspose() : _impl(std::make_unique()) { } diff --git a/src/runtime/NEON/functions/NEUnstack.cpp b/src/runtime/NEON/functions/NEUnstack.cpp index 0ffab5e92a6b0b90d430e74de9b3ea62362ecaf0..2f7ed2bb1f382dbb06014f38d8268256e1398635 100644 --- a/src/runtime/NEON/functions/NEUnstack.cpp +++ b/src/runtime/NEON/functions/NEUnstack.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/common/utils/Log.h" namespace arm_compute @@ -39,13 +40,15 @@ inline unsigned int wrap_axis(int axis, const ITensorInfo *const tensor) return wrap_around(axis, static_cast(tensor->num_dimensions())); } -inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions) +inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, + int32_t &slice_end_mask, + const unsigned int input_num_dimensions) { // Setups up coordinates to slice the input tensor: start coordinates to all 0s and the unstacking axis of both Start/End to slice just one 2d tensor at a time. 
Coordinates slice_end; slice_start.set_num_dimensions(input_num_dimensions); slice_end.set_num_dimensions(input_num_dimensions); - for(size_t k = 0; k < input_num_dimensions; ++k) + for (size_t k = 0; k < input_num_dimensions; ++k) { slice_start.set(k, 0); slice_end.set(k, -1); @@ -55,19 +58,19 @@ inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t & } // namespace NEUnstack::NEUnstack() // NOLINT - : _num_slices(0), - _strided_slice_vector() + : _num_slices(0), _strided_slice_vector() { } void NEUnstack::configure(const ITensor *input, const std::vector &output_vector, int axis) { std::vector outputs_vector_info(output_vector.size()); - std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ITensor * t) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(t); - return t->info(); - }); + std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), + [](ITensor *t) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(t); + return t->info(); + }); ARM_COMPUTE_ERROR_ON_NULLPTR(input); ARM_COMPUTE_ERROR_THROW_ON(NEUnstack::validate(input->info(), outputs_vector_info, axis)); @@ -81,11 +84,12 @@ void NEUnstack::configure(const ITensor *input, const std::vector &ou Coordinates slice_start; int32_t slice_end_mask; setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions()); - for(unsigned int slice = 0; slice < _num_slices; ++slice) + for (unsigned int slice = 0; slice < _num_slices; ++slice) { // Adjusts start and end coordinates to take a 2D slice at a time slice_start.set(axis_u, slice); - _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u)); + _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, + slice_end_mask, (1 << axis_u)); } } @@ -102,18 +106,20 @@ Status NEUnstack::validate(const ITensorInfo *input, const std::vectortensor_shape().num_dimensions()); - ARM_COMPUTE_RETURN_ON_ERROR(NEStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input)))); + ARM_COMPUTE_RETURN_ON_ERROR(NEStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), + BiStrides(), 0, slice_end_mask, + (1 << wrap_axis(axis, input)))); } return Status{}; } void NEUnstack::run() { - for(unsigned i = 0; i < _num_slices; ++i) + for (unsigned i = 0; i < _num_slices; ++i) { _strided_slice_vector[i].run(); } diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp index a8eded29ffd125831c58bdb2dfbb4b3645fc82af..8d77abcfc741929f207a0f3fa6f7e1eae60fbbc6 100644 --- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp @@ -26,15 +26,15 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CPP/Validate.h" #include "src/core/helpers/MemoryHelpers.h" +#include "src/core/NEON/kernels/convolution/common/utils.hpp" #include "src/cpu/kernels/CpuWinogradConv2dKernel.h" #include "src/cpu/operators/CpuWinogradConv2d.h" -#include "src/core/NEON/kernels/convolution/common/utils.hpp" - namespace arm_compute { using namespace 
arm_compute::experimental; @@ -42,14 +42,14 @@ using namespace arm_compute::experimental; struct NEWinogradConvolutionLayer::Impl { MemoryGroup memory_group{}; - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; ITensorPack prep_pack{}; WorkspaceData workspace{}; experimental::MemoryRequirements aux_mem_req{}; - const ITensor *original_weights{ nullptr }; - bool is_prepared{ false }; - bool is_activationlayer_enabled{ false }; + const ITensor *original_weights{nullptr}; + bool is_prepared{false}; + bool is_activationlayer_enabled{false}; DataLayout data_layout{}; }; @@ -61,17 +61,24 @@ NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(const std::shared_ptroriginal_weights = weights; _impl->op = std::make_unique(); - _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info, act_info, enable_fast_math); + _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + conv_info, act_info, enable_fast_math); _impl->aux_mem_req = _impl->op->workspace(); - _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } }; - _impl->prep_pack = { { ACL_SRC_1, weights }, { ACL_SRC_2, biases } }; - _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}}; + _impl->workspace = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } void NEWinogradConvolutionLayer::run() @@ -82,15 +89,20 @@ void NEWinogradConvolutionLayer::run() _impl->op->run(_impl->run_pack); } -Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { return cpu::CpuWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math); } void NEWinogradConvolutionLayer::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->prep_pack); _impl->original_weights->mark_as_unused(); diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp index b0a553212a21c42a0258ff0560484c16fb0aa23b..d4d6193fcee9f37196c4e7c9adeed963ff4406ba 100644 --- a/src/runtime/OMP/OMPScheduler.cpp +++ b/src/runtime/OMP/OMPScheduler.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" + #include namespace arm_compute @@ -63,7 +64,7 @@ void OMPScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const Win const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); const unsigned int num_threads = std::min(num_iterations, _num_threads); - if(!kernel->is_parallelisable() || num_threads == 1) + if (!kernel->is_parallelisable() || num_threads == 1) { ThreadInfo info; info.cpu_info = &cpu_info(); @@ -73,10 +74,10 @@ void OMPScheduler::schedule_op(ICPPKernel 
*kernel, const Hints &hints, const Win { const unsigned int num_windows = num_threads; std::vector workloads(num_windows); - for(unsigned int t = 0; t < num_windows; t++) + for (unsigned int t = 0; t < num_windows; t++) { //Capture 't' by copy, all the other variables by reference: - workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo & info) + workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo &info) { Window win = max_window.split_window(hints.split_dimension(), t, num_windows); win.validate(); @@ -92,7 +93,7 @@ void OMPScheduler::run_workloads(std::vector const unsigned int amount_of_work = static_cast(workloads.size()); const unsigned int num_threads_to_use = std::min(_num_threads, amount_of_work); - if(num_threads_to_use < 1) + if (num_threads_to_use < 1) { return; } @@ -100,8 +101,9 @@ void OMPScheduler::run_workloads(std::vector ThreadInfo info; info.cpu_info = &cpu_info(); info.num_threads = num_threads_to_use; - #pragma omp parallel for firstprivate(info) num_threads(num_threads_to_use) default(shared) proc_bind(close) schedule(static, 1) - for(unsigned int wid = 0; wid < amount_of_work; ++wid) +#pragma omp parallel for firstprivate(info) num_threads(num_threads_to_use) default(shared) proc_bind(close) \ + schedule(static, 1) + for (unsigned int wid = 0; wid < amount_of_work; ++wid) { const int tid = omp_get_thread_num(); diff --git a/src/runtime/OffsetLifetimeManager.cpp b/src/runtime/OffsetLifetimeManager.cpp index a47fa184fa2360fdc4fed7c414e422d92689b7e7..d746f618b5f71b1c422daeac95b0b3f24b0385f3 100644 --- a/src/runtime/OffsetLifetimeManager.cpp +++ b/src/runtime/OffsetLifetimeManager.cpp @@ -43,8 +43,7 @@ size_t align_offset(size_t offset, size_t alignment) return (remainder != 0U) ? offset + (alignment - remainder) : offset; } } // namespace -OffsetLifetimeManager::OffsetLifetimeManager() - : _blob(0) +OffsetLifetimeManager::OffsetLifetimeManager() : _blob(0) { } @@ -71,21 +70,22 @@ void OffsetLifetimeManager::update_blobs_and_mappings() // Update blob size size_t max_aggregated_size = 0; - std::for_each(std::begin(_free_blobs), std::end(_free_blobs), [&](const Blob & b) - { - max_aggregated_size += b.max_size; - _blob.alignment = std::max(_blob.alignment, b.max_alignment); - }); + std::for_each(std::begin(_free_blobs), std::end(_free_blobs), + [&](const Blob &b) + { + max_aggregated_size += b.max_size; + _blob.alignment = std::max(_blob.alignment, b.max_alignment); + }); max_aggregated_size += _free_blobs.size() * _blob.alignment; _blob.owners = std::max(_blob.owners, _free_blobs.size()); _blob.size = std::max(_blob.size, max_aggregated_size); // Calculate group mappings - auto &group_mappings = _active_group->mappings(); + auto &group_mappings = _active_group->mappings(); size_t offset = 0; - for(auto &free_blob : _free_blobs) + for (auto &free_blob : _free_blobs) { - for(auto &bound_element_id : free_blob.bound_elements) + for (auto &bound_element_id : free_blob.bound_elements) { ARM_COMPUTE_ERROR_ON(_active_elements.find(bound_element_id) == std::end(_active_elements)); Element &bound_element = _active_elements[bound_element_id]; diff --git a/src/runtime/OffsetMemoryPool.cpp b/src/runtime/OffsetMemoryPool.cpp index ffedf5586ca25de747b6d8be80a44144439bad0d..8f3c1a84bad617a92a61e5f82ac5749d656ea267 100644 --- a/src/runtime/OffsetMemoryPool.cpp +++ b/src/runtime/OffsetMemoryPool.cpp @@ -21,8 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include - #include "arm_compute/runtime/OffsetMemoryPool.h" #include "arm_compute/core/Error.h" @@ -31,6 +29,8 @@ #include "arm_compute/runtime/MemoryRegion.h" #include "arm_compute/runtime/Types.h" +#include + namespace arm_compute { OffsetMemoryPool::OffsetMemoryPool(IAllocator *allocator, BlobInfo blob_info) @@ -50,7 +50,7 @@ void OffsetMemoryPool::acquire(MemoryMappings &handles) ARM_COMPUTE_ERROR_ON(_blob == nullptr); // Set memory to handlers - for(auto &handle : handles) + for (auto &handle : handles) { ARM_COMPUTE_ERROR_ON(handle.first == nullptr); handle.first->set_owned_region(_blob->extract_subregion(handle.second, _blob_info.size - handle.second)); @@ -59,7 +59,7 @@ void OffsetMemoryPool::acquire(MemoryMappings &handles) void OffsetMemoryPool::release(MemoryMappings &handles) { - for(auto &handle : handles) + for (auto &handle : handles) { ARM_COMPUTE_ERROR_ON(handle.first == nullptr); handle.first->set_region(nullptr); diff --git a/src/runtime/OperatorTensor.cpp b/src/runtime/OperatorTensor.cpp index a8ad53da907d35053d7886a0c9e72ce81a640d99..19415b35cf2207b5cd47d5beebd89c0a711b23e7 100644 --- a/src/runtime/OperatorTensor.cpp +++ b/src/runtime/OperatorTensor.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/runtime/OperatorTensor.h" + #include "arm_compute/runtime/MemoryRegion.h" #include "support/Cast.h" @@ -47,7 +48,7 @@ ITensorInfo *OperatorTensor::info() uint8_t *OperatorTensor::buffer() const { - switch(_mem_type) + switch (_mem_type) { case MemoryType::CPU: return (uint8_t *)utils::cast::polymorphic_downcast(_memory->region())->buffer(); diff --git a/src/runtime/PoolManager.cpp b/src/runtime/PoolManager.cpp index 87376a71a4dcec5b8e8ec2ae7d802247e7cd9e5b..7fb9bd8000d49f413c6b8102f3844ed2d9a5b83b 100644 --- a/src/runtime/PoolManager.cpp +++ b/src/runtime/PoolManager.cpp @@ -31,8 +31,7 @@ using namespace arm_compute; -PoolManager::PoolManager() - : _free_pools(), _occupied_pools(), _sem(), _mtx() +PoolManager::PoolManager() : _free_pools(), _occupied_pools(), _sem(), _mtx() { } @@ -52,10 +51,8 @@ void PoolManager::unlock_pool(IMemoryPool *pool) ARM_COMPUTE_ERROR_ON_MSG(_free_pools.empty() && _occupied_pools.empty(), "Haven't setup any pools!"); arm_compute::lock_guard lock(_mtx); - auto it = std::find_if(std::begin(_occupied_pools), std::end(_occupied_pools), [pool](const std::unique_ptr &pool_it) - { - return pool_it.get() == pool; - }); + auto it = std::find_if(std::begin(_occupied_pools), std::end(_occupied_pools), + [pool](const std::unique_ptr &pool_it) { return pool_it.get() == pool; }); ARM_COMPUTE_ERROR_ON_MSG(it == std::end(_occupied_pools), "Pool to be unlocked couldn't be found!"); _free_pools.splice(std::begin(_free_pools), _occupied_pools, it); _sem->signal(); @@ -78,7 +75,7 @@ std::unique_ptr PoolManager::release_pool() arm_compute::lock_guard lock(_mtx); ARM_COMPUTE_ERROR_ON_MSG(!_occupied_pools.empty(), "All pools should be free in order to release one!"); - if(!_free_pools.empty()) + if (!_free_pools.empty()) { std::unique_ptr pool = std::move(_free_pools.front()); ARM_COMPUTE_ERROR_ON(_free_pools.front() != nullptr); diff --git a/src/runtime/RuntimeContext.cpp b/src/runtime/RuntimeContext.cpp index d1dea066e73d562093ab6bb4dbb3a455080db368..1de8d2abdbf8a2b3c4fd2a3f73e41ff509a1a236 100644 --- a/src/runtime/RuntimeContext.cpp +++ b/src/runtime/RuntimeContext.cpp @@ -28,8 +28,7 @@ namespace arm_compute { -RuntimeContext::RuntimeContext() - : _owned_scheduler(SchedulerFactory::create()), _scheduler(_owned_scheduler.get()) 
+RuntimeContext::RuntimeContext() : _owned_scheduler(SchedulerFactory::create()), _scheduler(_owned_scheduler.get()) { } diff --git a/src/runtime/Scheduler.cpp b/src/runtime/Scheduler.cpp index 0713b9a2ad3c17c9e8cd26b081c5b53e9ba149ec..e52fb59940b49a56cadee9edd6e26ce1d6cbd04f 100644 --- a/src/runtime/Scheduler.cpp +++ b/src/runtime/Scheduler.cpp @@ -76,7 +76,7 @@ void Scheduler::set(Type t) bool Scheduler::is_available(Type t) { - if(t == Type::CUSTOM) + if (t == Type::CUSTOM) { return _custom_scheduler != nullptr; } @@ -93,11 +93,12 @@ Scheduler::Type Scheduler::get_type() IScheduler &Scheduler::get() { - if(_scheduler_type == Type::CUSTOM) + if (_scheduler_type == Type::CUSTOM) { - if(_custom_scheduler == nullptr) + if (_custom_scheduler == nullptr) { - ARM_COMPUTE_ERROR("No custom scheduler has been setup. Call set(std::shared_ptr &scheduler) before Scheduler::get()"); + ARM_COMPUTE_ERROR("No custom scheduler has been setup. Call set(std::shared_ptr &scheduler) " + "before Scheduler::get()"); } else { @@ -106,13 +107,13 @@ IScheduler &Scheduler::get() } else { - if(_schedulers.empty()) + if (_schedulers.empty()) { _schedulers = init(); } auto it = _schedulers.find(_scheduler_type); - if(it != _schedulers.end()) + if (it != _schedulers.end()) { return *it->second; } diff --git a/src/runtime/SchedulerFactory.cpp b/src/runtime/SchedulerFactory.cpp index cc21d6263092665b9c45b7a1f474f9b96f4d57d4..4fb08d79f53a7fea1a8eabdca22c223f8b0963eb 100644 --- a/src/runtime/SchedulerFactory.cpp +++ b/src/runtime/SchedulerFactory.cpp @@ -48,7 +48,7 @@ const SchedulerFactory::Type SchedulerFactory::_default_type = SchedulerFactory: std::unique_ptr SchedulerFactory::create(Type type) { - switch(type) + switch (type) { case Type::ST: { diff --git a/src/runtime/SchedulerUtils.cpp b/src/runtime/SchedulerUtils.cpp index 6f9a32c879783fd0da645bdf142fdd8bedbe4356..74ee539fec178e9f055bb7f14c3f255019d4199d 100644 --- a/src/runtime/SchedulerUtils.cpp +++ b/src/runtime/SchedulerUtils.cpp @@ -47,35 +47,34 @@ std::pair split_2d(unsigned max_threads, std::size_t m, std: double ratio = m / static_cast(n); // nt = sqrt(max_threads * (m / n) ) - const unsigned adjusted = std::round( - std::sqrt(max_threads * ratio)); + const unsigned adjusted = std::round(std::sqrt(max_threads * ratio)); //find the nearest factor of max_threads - for(unsigned i = 0; i != adjusted; ++i) + for (unsigned i = 0; i != adjusted; ++i) { //try down const unsigned adj_down = adjusted - i; - if(max_threads % adj_down == 0) + if (max_threads % adj_down == 0) { - return { adj_down, max_threads / adj_down }; + return {adj_down, max_threads / adj_down}; } //try up const unsigned adj_up = adjusted + i; - if(max_threads % adj_up == 0) + if (max_threads % adj_up == 0) { - return { adj_up, max_threads / adj_up }; + return {adj_up, max_threads / adj_up}; } } //we didn't find anything so lets bail out with maxes biased to the largest dimension - if(m > n) + if (m > n) { - return { std::min(m, max_threads), 1 }; + return {std::min(m, max_threads), 1}; } else { - return { 1, std::min(n, max_threads) }; + return {1, std::min(n, max_threads)}; } } #endif /* #ifndef BARE_METAL */ diff --git a/src/runtime/SubTensor.cpp b/src/runtime/SubTensor.cpp index ae16c8be0a7537238922bf679cde82e2796b1aa5..f87256abb1dfdea5f349e9e720724ef5066ed01f 100644 --- a/src/runtime/SubTensor.cpp +++ b/src/runtime/SubTensor.cpp @@ -27,8 +27,7 @@ using namespace arm_compute; -SubTensor::SubTensor() - : _parent(nullptr), _info() +SubTensor::SubTensor() : _parent(nullptr), _info() { } diff 
--git a/src/runtime/Tensor.cpp b/src/runtime/Tensor.cpp index 6dcef9f0b5cc6432d12fd4ecbe4a1b3ab70c8c1e..f17e3236945ce4f069d5cb4130f97df0c6679efe 100644 --- a/src/runtime/Tensor.cpp +++ b/src/runtime/Tensor.cpp @@ -25,8 +25,7 @@ namespace arm_compute { -Tensor::Tensor(IRuntimeContext *) - : _allocator(this) +Tensor::Tensor(IRuntimeContext *) : _allocator(this) { } diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp index 4ae27c59fc9ba6fb37ae3d85259c25bf533bf7ed..372852bfead3f220dca8a41880b6281e02e1003d 100644 --- a/src/runtime/TensorAllocator.cpp +++ b/src/runtime/TensorAllocator.cpp @@ -43,13 +43,13 @@ bool validate_subtensor_shape(const TensorInfo &parent_info, const TensorInfo &c const size_t parent_dims = parent_info.num_dimensions(); const size_t child_dims = child_info.num_dimensions(); - if(child_dims <= parent_dims) + if (child_dims <= parent_dims) { - for(size_t num_dimensions = child_dims; num_dimensions > 0; --num_dimensions) + for (size_t num_dimensions = child_dims; num_dimensions > 0; --num_dimensions) { const size_t child_dim_size = coords[num_dimensions - 1] + child_shape[num_dimensions - 1]; - if((coords[num_dimensions - 1] < 0) || (child_dim_size > parent_shape[num_dimensions - 1])) + if ((coords[num_dimensions - 1] < 0) || (child_dim_size > parent_shape[num_dimensions - 1])) { is_valid = false; break; @@ -65,8 +65,7 @@ bool validate_subtensor_shape(const TensorInfo &parent_info, const TensorInfo &c } } // namespace -TensorAllocator::TensorAllocator(IMemoryManageable *owner) - : _owner(owner), _associated_memory_group(nullptr), _memory() +TensorAllocator::TensorAllocator(IMemoryManageable *owner) : _owner(owner), _associated_memory_group(nullptr), _memory() { } @@ -88,7 +87,7 @@ TensorAllocator::TensorAllocator(TensorAllocator &&o) noexcept TensorAllocator &TensorAllocator::operator=(TensorAllocator &&o) noexcept { - if(&o != this) + if (&o != this) { _owner = o._owner; o._owner = nullptr; @@ -117,8 +116,10 @@ void TensorAllocator::init(const TensorAllocator &allocator, const Coordinates & _memory = Memory(allocator._memory.region()); // Init tensor info with new dimensions - size_t total_size = parent_info.offset_element_in_bytes(coords) + sub_info.total_size() - sub_info.offset_first_element_in_bytes(); - sub_info.init(sub_info.tensor_shape(), sub_info.format(), parent_info.strides_in_bytes(), parent_info.offset_element_in_bytes(coords), total_size); + size_t total_size = + parent_info.offset_element_in_bytes(coords) + sub_info.total_size() - sub_info.offset_first_element_in_bytes(); + sub_info.init(sub_info.tensor_shape(), sub_info.format(), parent_info.strides_in_bytes(), + parent_info.offset_element_in_bytes(coords), total_size); // Set TensorInfo init(sub_info); @@ -133,7 +134,7 @@ void TensorAllocator::allocate() { // Align to 64-byte boundaries by default if alignment is not specified const size_t alignment_to_use = (alignment() != 0) ? 
alignment() : 64; - if(_associated_memory_group == nullptr) + if (_associated_memory_group == nullptr) { _memory.set_owned_region(std::make_unique(info().total_size(), alignment_to_use)); } diff --git a/src/runtime/Utils.cpp b/src/runtime/Utils.cpp index 15e9d43a4969119a1dc2e7fc69a3dab63ab564e4..a7f7b5f3cb5426d43d54581c6a5c3d5df7087421 100644 --- a/src/runtime/Utils.cpp +++ b/src/runtime/Utils.cpp @@ -41,20 +41,17 @@ static const std::string information = const std::string &string_from_scheduler_type(Scheduler::Type t) { - static std::map scheduler_type_map = - { - { Scheduler::Type::ST, "Single Thread" }, - { Scheduler::Type::CPP, "C++11 Threads" }, - { Scheduler::Type::OMP, "OpenMP Threads" }, - { Scheduler::Type::CUSTOM, "Custom" } - }; + static std::map scheduler_type_map = {{Scheduler::Type::ST, "Single Thread"}, + {Scheduler::Type::CPP, "C++11 Threads"}, + {Scheduler::Type::OMP, "OpenMP Threads"}, + {Scheduler::Type::CUSTOM, "Custom"}}; return scheduler_type_map[t]; } void schedule_kernel_on_ctx(IRuntimeContext *ctx, ICPPKernel *kernel, const IScheduler::Hints &hints) { - if(ctx) + if (ctx) { ARM_COMPUTE_ERROR_ON(ctx->scheduler() == nullptr); ctx->scheduler()->schedule(kernel, hints); @@ -68,7 +65,7 @@ void schedule_kernel_on_ctx(IRuntimeContext *ctx, ICPPKernel *kernel, const ISch unsigned int calculate_number_of_stages_only_x_axis(size_t input_x_dimension, unsigned int axis) { // We need only 1 stage for all axis except x-axis - if(axis != 0) + if (axis != 0) { return 1; } diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp index 1bfb8124e925b3baa3682cf9085d60a2532d54f3..aba32871d034fbd66518926745b201b63411e597 100644 --- a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp +++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include namespace arm_compute @@ -37,25 +38,27 @@ namespace cl_direct_conv { using namespace arm_compute::misc::shape_calculator; -ClDirectConvDefaultConfigBifrost::ClDirectConvDefaultConfigBifrost(GPUTarget gpu) - : IClDirectConvKernelConfig(gpu) +ClDirectConvDefaultConfigBifrost::ClDirectConvDefaultConfigBifrost(GPUTarget gpu) : IClDirectConvKernelConfig(gpu) { } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { - using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigBifrost::*)(const ITensorInfo * src, const ITensorInfo * wei, const PadStrideInfo & conv_info); + using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigBifrost::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - ClDirectConvConfigArray configs_G71(&ClDirectConvDefaultConfigBifrost::configure_G71_f32, - &ClDirectConvDefaultConfigBifrost::configure_G71_f16, - &ClDirectConvDefaultConfigBifrost::configure_G71_u8); + ClDirectConvConfigArray configs_G71( + &ClDirectConvDefaultConfigBifrost::configure_G71_f32, &ClDirectConvDefaultConfigBifrost::configure_G71_f16, + &ClDirectConvDefaultConfigBifrost::configure_G71_u8); - 
ClDirectConvConfigArray configs_default(&ClDirectConvDefaultConfigBifrost::configure_default_f32, - &ClDirectConvDefaultConfigBifrost::configure_default_f16, - &ClDirectConvDefaultConfigBifrost::configure_G71_u8); + ClDirectConvConfigArray configs_default( + &ClDirectConvDefaultConfigBifrost::configure_default_f32, + &ClDirectConvDefaultConfigBifrost::configure_default_f16, &ClDirectConvDefaultConfigBifrost::configure_G71_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G71: func = configs_G71.get_function(src->data_type()); @@ -69,18 +72,20 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure(const IT return (this->*func)(src, wei, conv_info); } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Get the output shape TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); desc.n0 = 4; - if(output_shape[0] > 16) + if (output_shape[0] > 16) { desc.m0 = 2; } @@ -93,18 +98,20 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f32( return desc; } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Get the output shape TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); desc.n0 = 4; - if(output_shape[0] > 16) + if (output_shape[0] > 16) { desc.m0 = 4; } @@ -117,18 +124,20 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f16( return desc; } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Get the output shape TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); desc.n0 = 4; - if(output_shape[0] > 16) + if (output_shape[0] > 16) { desc.m0 = 4; } @@ -141,18 +150,20 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_u8(c return desc; } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == 
DataLayout::NHWC) { // Get the output shape TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); desc.n0 = 4; - if(output_shape[0] > 16) + if (output_shape[0] > 16) { desc.m0 = 2; } @@ -165,18 +176,20 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_ return desc; } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Get the output shape TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); desc.n0 = 4; - if(output_shape[0] > 16) + if (output_shape[0] > 16) { desc.m0 = 4; } @@ -188,5 +201,5 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_ return desc; } -} // namespace opencl +} // namespace cl_direct_conv } // namespace arm_compute diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h index 6b60b2c007a8a6d01a10689a8d942453028ef772..ed6a4c3c685acc647acaa3bf5ad1ba4f168cd539 100644 --- a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h +++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h @@ -41,15 +41,21 @@ public: ClDirectConvDefaultConfigBifrost(GPUTarget gpu); // Inherited overridden method - DirectConvComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override; + DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override; private: - DirectConvComputeKernelInfo configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - DirectConvComputeKernelInfo configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - DirectConvComputeKernelInfo configure_G71_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - DirectConvComputeKernelInfo configure_default_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - DirectConvComputeKernelInfo configure_default_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G71_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_default_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_default_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); }; -} // namespace opencl +} // namespace cl_direct_conv } // namespace arm_compute #endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGBIFROST */ diff --git 
a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp index 8f2fd82412d70a0b218c567949104c852f7f95bd..4b7666d5aa0989c1a4c72da8ce3c9c02ffc90890 100644 --- a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp +++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include namespace arm_compute @@ -37,25 +38,27 @@ namespace cl_direct_conv { using namespace arm_compute::misc::shape_calculator; -ClDirectConvDefaultConfigValhall::ClDirectConvDefaultConfigValhall(GPUTarget gpu) - : IClDirectConvKernelConfig(gpu) +ClDirectConvDefaultConfigValhall::ClDirectConvDefaultConfigValhall(GPUTarget gpu) : IClDirectConvKernelConfig(gpu) { } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { - using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigValhall::*)(const ITensorInfo * src, const ITensorInfo * wei, const PadStrideInfo & conv_info); + using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigValhall::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - ClDirectConvConfigArray configs_G78(&ClDirectConvDefaultConfigValhall::configure_G78_f32, - &ClDirectConvDefaultConfigValhall::configure_G78_f16, - &ClDirectConvDefaultConfigValhall::configure_G78_u8); + ClDirectConvConfigArray configs_G78( + &ClDirectConvDefaultConfigValhall::configure_G78_f32, &ClDirectConvDefaultConfigValhall::configure_G78_f16, + &ClDirectConvDefaultConfigValhall::configure_G78_u8); - ClDirectConvConfigArray configs_G57(&ClDirectConvDefaultConfigValhall::configure_G57_f32, - &ClDirectConvDefaultConfigValhall::configure_G57_f16, - &ClDirectConvDefaultConfigValhall::configure_G78_u8); + ClDirectConvConfigArray configs_G57( + &ClDirectConvDefaultConfigValhall::configure_G57_f32, &ClDirectConvDefaultConfigValhall::configure_G57_f16, + &ClDirectConvDefaultConfigValhall::configure_G78_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G57: func = configs_G57.get_function(src->data_type()); @@ -70,15 +73,17 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure(const IT return (this->*func)(src, wei, conv_info); } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Get the output shape - const TensorShape wei_shape = wei->tensor_shape(); - const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const TensorShape wei_shape = wei->tensor_shape(); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, 
conv_info); const bool export_weights_to_cl_image = export_to_cl_image(wei); const int32_t ofm = dst_shape[0]; @@ -87,11 +92,11 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32( desc.export_weights_to_cl_image = export_weights_to_cl_image; - if(dst_shape[0] <= 4) + if (dst_shape[0] <= 4) { - if(is_pointwise) + if (is_pointwise) { - if(ofm == 4) + if (ofm == 4) { desc.m0 = 1; desc.n0 = 4; @@ -113,7 +118,7 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32( } else { - if(m < 64) + if (m < 64) { desc.m0 = 1; desc.n0 = 1; @@ -131,15 +136,17 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32( return desc; } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Get the output shape - const TensorShape wei_shape = wei->tensor_shape(); - const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const TensorShape wei_shape = wei->tensor_shape(); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); const bool export_weights_to_cl_image = export_to_cl_image(wei); const int32_t ofm = dst_shape[0]; @@ -149,15 +156,15 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16( desc.export_weights_to_cl_image = export_weights_to_cl_image; - if(dst_shape[0] <= 4) + if (dst_shape[0] <= 4) { // k0 should be as larger as possible. However, we should avoid // having left-over for loops that make the implementation slower. 
- if((k % 16) == 0) + if ((k % 16) == 0) { desc.k0 = 16; } - else if((k % 8) == 0) + else if ((k % 8) == 0) { desc.k0 = 8; } @@ -166,9 +173,9 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16( desc.k0 = 4; } - if(is_pointwise) + if (is_pointwise) { - if(ofm == 4) + if (ofm == 4) { desc.m0 = 1; desc.n0 = 4; @@ -187,15 +194,15 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16( } else { - if(m < 64) + if (m < 64) { desc.m0 = 1; desc.n0 = 1; - if((k % 16) == 0) + if ((k % 16) == 0) { desc.k0 = 16; } - else if((k % 8) == 0) + else if ((k % 8) == 0) { desc.k0 = 8; } @@ -206,9 +213,9 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16( } else { - if(ofm >= 16) + if (ofm >= 16) { - if(m / 6 > 24000) + if (m / 6 > 24000) { desc.m0 = 6; } @@ -223,11 +230,11 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16( { desc.m0 = 2; desc.n0 = 8; - if((k % 16) == 0) + if ((k % 16) == 0) { desc.k0 = 16; } - else if((k % 8) == 0) + else if ((k % 8) == 0) { desc.k0 = 8; } @@ -243,18 +250,20 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16( return desc; } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Get the output shape TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); desc.n0 = 4; - if(output_shape[0] > 16) + if (output_shape[0] > 16) { desc.m0 = 4; } @@ -267,15 +276,17 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_u8(c return desc; } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Get the output shape - const TensorShape wei_shape = wei->tensor_shape(); - const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const TensorShape wei_shape = wei->tensor_shape(); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); const bool export_weights_to_cl_image = export_to_cl_image(wei); const int32_t m = dst_shape[1] * dst_shape[2]; @@ -283,9 +294,9 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32( desc.export_weights_to_cl_image = export_weights_to_cl_image; - if(dst_shape[0] <= 4) + if (dst_shape[0] <= 4) { - if(is_pointwise) + if (is_pointwise) { desc.m0 = 1; desc.n0 = 1; @@ -300,9 +311,9 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32( } else { - if(m < 64) + if (m < 64) { - if(m == 1) + if (m == 1) { desc.m0 = 1; desc.n0 = 1; @@ -327,15 +338,17 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32( return desc; } -DirectConvComputeKernelInfo 
ClDirectConvDefaultConfigValhall::configure_G57_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Get the output shape - const TensorShape wei_shape = wei->tensor_shape(); - const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const TensorShape wei_shape = wei->tensor_shape(); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); const bool export_weights_to_cl_image = export_to_cl_image(wei); const int32_t ofm = dst_shape[0]; @@ -344,9 +357,9 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16( desc.export_weights_to_cl_image = export_weights_to_cl_image; - if(dst_shape[0] <= 4) + if (dst_shape[0] <= 4) { - if(is_pointwise) + if (is_pointwise) { desc.m0 = 2; desc.n0 = 1; @@ -361,9 +374,9 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16( } else { - if(m < 64) + if (m < 64) { - if(m == 1) + if (m == 1) { desc.m0 = 1; desc.n0 = 1; @@ -378,7 +391,7 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16( } else { - if(ofm > 16) + if (ofm > 16) { desc.m0 = 4; desc.n0 = 8; @@ -396,5 +409,5 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16( return desc; } -} // namespace opencl +} // namespace cl_direct_conv } // namespace arm_compute diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h index f9d5c5299e0bb59ad5a6c8954b7dcc2458e17607..efd879a5675c1391d6905dbcad1dc37ab252fde2 100644 --- a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h +++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h @@ -41,15 +41,21 @@ public: ClDirectConvDefaultConfigValhall(GPUTarget gpu); // Inherited overridden method - DirectConvComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override; + DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override; private: - DirectConvComputeKernelInfo configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - DirectConvComputeKernelInfo configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - DirectConvComputeKernelInfo configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - DirectConvComputeKernelInfo configure_G57_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - DirectConvComputeKernelInfo configure_G57_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + 
DirectConvComputeKernelInfo + configure_G57_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G57_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); }; -} // namespace opencl +} // namespace cl_direct_conv } // namespace arm_compute #endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGVALHALL */ diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h b/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h index 232167fc5978c30a46a930231d553f37c87470d9..2c2509f70b094484ed46e5591988cd8b5efb0b66 100644 --- a/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h +++ b/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h @@ -46,7 +46,7 @@ public: */ static std::unique_ptr create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch (get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: return std::make_unique(GPUTarget::G71); @@ -59,6 +59,6 @@ public: } } }; -} // namespace opencl +} // namespace cl_direct_conv } // namespace arm_compute #endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG */ diff --git a/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h b/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h index 6104d7359438b8d284be8025e41646305fa03559..e5b270c7203d212605cb6fd1ab61bebf1752b537 100644 --- a/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h +++ b/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h @@ -27,6 +27,7 @@ #include "arm_compute/core/GPUTarget.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" + #include "src/core/common/Macros.h" namespace arm_compute @@ -52,8 +53,7 @@ public: * @param[in] func_int8 Function to call for direct convolution Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL) * */ - ClDirectConvConfigArray(T func_f32, T func_f16, T func_int8) - : _configs{ func_f32, func_f16, func_int8 } + ClDirectConvConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8} { } @@ -65,7 +65,7 @@ public: */ T get_function(DataType data_type) { - switch(data_type) + switch (data_type) { case DataType::F32: return _configs.at(DT_F32); @@ -92,8 +92,7 @@ public: * * @param[in] arch GPU target */ - IClDirectConvKernelConfig(GPUTarget arch) - : _target(arch) + IClDirectConvKernelConfig(GPUTarget arch) : _target(arch) { } ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClDirectConvKernelConfig); @@ -105,11 +104,12 @@ public: * @param[in] wei Weights tensor * @param[in] conv_info Convolution info */ - virtual DirectConvComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0; + virtual DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0; protected: GPUTarget _target; }; -} // namespace opencl +} // namespace cl_direct_conv } // namespace arm_compute #endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_ICLDIRECTCONVKERNELCONFIG */ diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp index 5311fdcec3fc6e92b618e4a956532003bb6a7ff3..98ebf3ebbe6cf5a47403e1b40b08c99ffe6a9095 100644 --- a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp @@ -22,7 +22,6 @@ * SOFTWARE. 
*/ #include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h" -#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/GPUTarget.h" @@ -30,28 +29,34 @@ #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h" + namespace arm_compute { namespace cl_dwc { namespace { -DWCComputeKernelInfo configure_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier, bool is_g71) +DWCComputeKernelInfo configure_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier, + bool is_g71) { DWCComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { - const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); - const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); const TensorShape wei_shape = wei->tensor_shape(); const size_t kernel_c = wei_shape[idx_c]; const size_t kernel_w = wei_shape[idx_w]; desc.export_input_to_cl_image = false; - if(is_g71) + if (is_g71) { desc.export_weights_to_cl_image = false; } @@ -60,17 +65,17 @@ DWCComputeKernelInfo configure_f32(const ITensorInfo *src, const ITensorInfo *we desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); } - if(depth_multiplier == 1) + if (depth_multiplier == 1) { desc.n0 = 4; } else { - if((depth_multiplier % 4) == 0) + if ((depth_multiplier % 4) == 0) { desc.n0 = 4; } - else if((depth_multiplier % 2) == 0) + else if ((depth_multiplier % 2) == 0) { desc.n0 = 2; } @@ -81,14 +86,15 @@ DWCComputeKernelInfo configure_f32(const ITensorInfo *src, const ITensorInfo *we } // Note: If we reduce n0, export to cl_image must be false - ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true)); + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); desc.n0 = adjust_vec_size(desc.n0, kernel_c); // Set m0 only if stride_x == 1 and dilation_x == 1 - if(conv_info.stride().first == 1 && dilation.x() == 1) + if (conv_info.stride().first == 1 && dilation.x() == 1) { - if((kernel_w >= 9) || (kernel_w == 1)) + if ((kernel_w >= 9) || (kernel_w == 1)) { desc.m0 = 1; } @@ -106,16 +112,20 @@ DWCComputeKernelInfo configure_f32(const ITensorInfo *src, const ITensorInfo *we return desc; } -DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier, bool is_g71) +DWCComputeKernelInfo configure_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier, + bool is_g71) { DWCComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Src and weights have the same dimension indices - const size_t idx_c = 
get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); - const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); const TensorShape src_shape = src->tensor_shape(); const TensorShape wei_shape = wei->tensor_shape(); const size_t src_w = src_shape[idx_w]; @@ -124,7 +134,7 @@ DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *we desc.export_input_to_cl_image = false; - if(is_g71) + if (is_g71) { desc.export_weights_to_cl_image = false; } @@ -133,9 +143,9 @@ DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *we desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); } - if(depth_multiplier == 1) + if (depth_multiplier == 1) { - if(desc.export_weights_to_cl_image == false) + if (desc.export_weights_to_cl_image == false) { desc.n0 = 8; } @@ -146,11 +156,11 @@ DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *we } else { - if((depth_multiplier % 4) == 0) + if ((depth_multiplier % 4) == 0) { desc.n0 = 4; } - else if((depth_multiplier % 2) == 0) + else if ((depth_multiplier % 2) == 0) { desc.n0 = 2; } @@ -161,20 +171,21 @@ DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *we } // Note: If we reduce n0, export to cl_image must be false - ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true)); + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); desc.n0 = adjust_vec_size(desc.n0, kernel_c); // Set m0 only if stride_x == 1 and dilation_x == 1 - if(conv_info.stride().first == 1 && dilation.x() == 1) + if (conv_info.stride().first == 1 && dilation.x() == 1) { - if((kernel_w >= 9) || (kernel_w == 1)) + if ((kernel_w >= 9) || (kernel_w == 1)) { desc.m0 = 1; } else { - if((src_w % 5) == 0) + if ((src_w % 5) == 0) { desc.m0 = 5; } @@ -194,27 +205,30 @@ DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *we } } // namespace -ClDWCNativeDefaultConfigBifrost::ClDWCNativeDefaultConfigBifrost(GPUTarget gpu) - : IClDWCNativeKernelConfig(gpu) +ClDWCNativeDefaultConfigBifrost::ClDWCNativeDefaultConfigBifrost(GPUTarget gpu) : IClDWCNativeKernelConfig(gpu) { } -DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { - using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigBifrost::*)(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier); + using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigBifrost::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); - ClDWCNativeConfigArray configs_G71(&ClDWCNativeDefaultConfigBifrost::configure_G71_f32, - 
&ClDWCNativeDefaultConfigBifrost::configure_G71_f16, - &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8); + ClDWCNativeConfigArray configs_G71( + &ClDWCNativeDefaultConfigBifrost::configure_G71_f32, &ClDWCNativeDefaultConfigBifrost::configure_G71_f16, + &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8); - ClDWCNativeConfigArray configs_G7x(&ClDWCNativeDefaultConfigBifrost::configure_G7x_f32, - &ClDWCNativeDefaultConfigBifrost::configure_G7x_f16, - &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8); + ClDWCNativeConfigArray configs_G7x( + &ClDWCNativeDefaultConfigBifrost::configure_G7x_f32, &ClDWCNativeDefaultConfigBifrost::configure_G7x_f16, + &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G71: func = configs_G71.get_function(src->data_type()); @@ -228,43 +242,58 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure(const ITensorInf return (this->*func)(src, wei, conv_info, dilation, depth_multiplier); } -DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { return configure_f32(src, wei, conv_info, dilation, depth_multiplier, true); } -DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { return configure_f16(src, wei, conv_info, dilation, depth_multiplier, true); } -DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { return configure_f32(src, wei, conv_info, dilation, depth_multiplier, false); } -DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { return configure_f16(src, wei, conv_info, dilation, depth_multiplier, false); } -DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { ARM_COMPUTE_UNUSED(wei); DWCComputeKernelInfo desc; 
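
Editorial aside, not part of the patch: the hunks above only reformat, not change, how ClDWCNativeDefaultConfigBifrost::configure() selects a heuristic — a small array of member-function pointers is built per GPU target, indexed by data type, and invoked via (this->*func)(...). Below is a minimal, self-contained C++ sketch of that dispatch pattern, assuming illustrative names (ConfigSelector, cfg_f32, KernelInfo) and placeholder m0/n0 values; none of these are Compute Library APIs or tuned numbers.

    #include <array>
    #include <cstddef>
    #include <cstdio>

    enum class DataType  { F32, F16, QASYMM8 };
    enum class GPUTarget { G71, G77 };

    struct KernelInfo { int m0; int n0; };

    class ConfigSelector
    {
    public:
        explicit ConfigSelector(GPUTarget target) : _target(target) {}

        // Pick a member function by data type, then call it through the pointer,
        // mirroring "return (this->*func)(src, wei, conv_info, ...)" in the patch.
        KernelInfo configure(DataType dt, int ofm)
        {
            using Fn = KernelInfo (ConfigSelector::*)(int);
            // The real classes keep one config array per GPU target and
            // switch(_target) between them; one table is enough to show the idea.
            const std::array<Fn, 3> table = {&ConfigSelector::cfg_f32, &ConfigSelector::cfg_f16,
                                             &ConfigSelector::cfg_int8};
            Fn func = table[static_cast<std::size_t>(dt)];
            return (this->*func)(ofm);
        }

    private:
        // Placeholder block sizes, not the library's tuned heuristics.
        KernelInfo cfg_f32(int ofm)  { return {ofm > 16 ? 2 : 1, 4}; }
        KernelInfo cfg_f16(int ofm)  { return {ofm > 16 ? 4 : 1, _target == GPUTarget::G71 ? 4 : 8}; }
        KernelInfo cfg_int8(int ofm) { return {ofm > 16 ? 4 : 1, 4}; }

        GPUTarget _target;
    };

    int main()
    {
        ConfigSelector selector(GPUTarget::G77);
        const KernelInfo ki = selector.configure(DataType::F16, 32);
        std::printf("m0=%d n0=%d\n", ki.m0, ki.n0);
        return 0;
    }
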
- if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { desc.export_input_to_cl_image = false; desc.export_weights_to_cl_image = false; desc.n0 = (depth_multiplier == 1) ? 4 : 1; - if(conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1) + if (conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1) { desc.m0 = 2; } diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h index cec2cae5dd90a0c316ca3350210dc113c4a5b376..41d86c9c14638e4e5eada178013863d838b2435f 100644 --- a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h @@ -41,20 +41,38 @@ public: ClDWCNativeDefaultConfigBifrost(GPUTarget gpu); // Inherited overridden method - DWCComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) override; + DWCComputeKernelInfo configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) override; private: - DWCComputeKernelInfo configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier); - DWCComputeKernelInfo configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier); - DWCComputeKernelInfo configure_G7x_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier); - DWCComputeKernelInfo configure_G7x_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier); - DWCComputeKernelInfo configure_G7x_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G71_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G71_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G7x_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G7x_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G7x_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); }; } // namespace cl_dwc } // namespace arm_compute diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp index 51f3787875ff60bec3be6e330ae26236630ff0da..ef1bb3858c996c1b42b9e903df8fede6a9273edd 100644 --- a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp @@ 
-22,7 +22,6 @@ * SOFTWARE. */ #include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h" -#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/GPUTarget.h" @@ -30,31 +29,36 @@ #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h" + namespace arm_compute { namespace cl_dwc { -ClDWCNativeDefaultConfigValhall::ClDWCNativeDefaultConfigValhall(GPUTarget gpu) - : IClDWCNativeKernelConfig(gpu) +ClDWCNativeDefaultConfigValhall::ClDWCNativeDefaultConfigValhall(GPUTarget gpu) : IClDWCNativeKernelConfig(gpu) { } -DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { - using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigValhall::*)(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier); + using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigValhall::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); - ClDWCNativeConfigArray configs_G78(&ClDWCNativeDefaultConfigValhall::configure_G78_f32, - &ClDWCNativeDefaultConfigValhall::configure_G78_f16, - &ClDWCNativeDefaultConfigValhall::configure_G78_u8); + ClDWCNativeConfigArray configs_G78( + &ClDWCNativeDefaultConfigValhall::configure_G78_f32, &ClDWCNativeDefaultConfigValhall::configure_G78_f16, + &ClDWCNativeDefaultConfigValhall::configure_G78_u8); - ClDWCNativeConfigArray configs_G77(&ClDWCNativeDefaultConfigValhall::configure_G78_f32, - &ClDWCNativeDefaultConfigValhall::configure_G77_f16, - &ClDWCNativeDefaultConfigValhall::configure_G78_u8); + ClDWCNativeConfigArray configs_G77( + &ClDWCNativeDefaultConfigValhall::configure_G78_f32, &ClDWCNativeDefaultConfigValhall::configure_G77_f16, + &ClDWCNativeDefaultConfigValhall::configure_G78_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G77: func = configs_G77.get_function(src->data_type()); @@ -69,15 +73,18 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure(const ITensorInf return (this->*func)(src, wei, conv_info, dilation, depth_multiplier); } -DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { DWCComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { - const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); - const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_c = 
get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); const TensorShape wei_shape = wei->tensor_shape(); const size_t kernel_c = wei_shape[idx_c]; const size_t kernel_w = wei_shape[idx_w]; @@ -85,17 +92,17 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const IT desc.export_input_to_cl_image = false; desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); - if(depth_multiplier == 1) + if (depth_multiplier == 1) { desc.n0 = 4; } else { - if((depth_multiplier % 4) == 0) + if ((depth_multiplier % 4) == 0) { desc.n0 = 4; } - else if((depth_multiplier % 2) == 0) + else if ((depth_multiplier % 2) == 0) { desc.n0 = 2; } @@ -106,14 +113,15 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const IT } // Note: If we reduce n0, export to cl_image must be false - ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true)); + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); desc.n0 = adjust_vec_size(desc.n0, kernel_c); // Set m0 only if stride_x == 1 and dilation_x == 1 - if(conv_info.stride().first == 1 && dilation.x() == 1) + if (conv_info.stride().first == 1 && dilation.x() == 1) { - if((kernel_w >= 9) || (kernel_w == 1)) + if ((kernel_w >= 9) || (kernel_w == 1)) { desc.m0 = 1; } @@ -131,16 +139,19 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const IT return desc; } -DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { DWCComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Src and weights have the same dimension indices - const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); - const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); const TensorShape src_shape = src->tensor_shape(); const TensorShape wei_shape = wei->tensor_shape(); const size_t src_w = src_shape[idx_w]; @@ -150,9 +161,9 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const IT desc.export_input_to_cl_image = false; desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); - if(depth_multiplier == 1) + if (depth_multiplier == 1) { - if(desc.export_weights_to_cl_image == false) + if (desc.export_weights_to_cl_image == false) { desc.n0 = 8; } @@ -163,11 +174,11 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const IT } else { - if((depth_multiplier % 4) == 0) + if ((depth_multiplier % 4) == 0) { desc.n0 = 4; } - else if((depth_multiplier % 2) == 0) + else if ((depth_multiplier % 2) == 0) { desc.n0 = 2; } @@ -178,20 +189,21 @@ DWCComputeKernelInfo 
ClDWCNativeDefaultConfigValhall::configure_G78_f16(const IT } // Note: If we reduce n0, export to cl_image must be false - ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true)); + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); desc.n0 = adjust_vec_size(desc.n0, kernel_c); // Set m0 only if stride_x == 1 and dilation_x == 1 - if(conv_info.stride().first == 1 && dilation.x() == 1) + if (conv_info.stride().first == 1 && dilation.x() == 1) { - if((kernel_w >= 9) || (kernel_w == 1)) + if ((kernel_w >= 9) || (kernel_w == 1)) { desc.m0 = 1; } else { - if((src_w % 5) == 0) + if ((src_w % 5) == 0) { desc.m0 = 5; } @@ -210,19 +222,22 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const IT return desc; } -DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { ARM_COMPUTE_UNUSED(wei); DWCComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { desc.export_input_to_cl_image = false; desc.export_weights_to_cl_image = false; desc.n0 = (depth_multiplier == 1) ? 4 : 1; - if(conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1) + if (conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1) { desc.m0 = 2; } @@ -235,15 +250,18 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_u8(const ITe return desc; } -DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { DWCComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { - const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); - const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); const TensorShape wei_shape = wei->tensor_shape(); const size_t kernel_c = wei_shape[idx_c]; const size_t kernel_w = wei_shape[idx_w]; @@ -251,9 +269,9 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const IT desc.export_input_to_cl_image = false; desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); - if(depth_multiplier == 1) + if (depth_multiplier == 1) { - if(desc.export_weights_to_cl_image == false) + if (desc.export_weights_to_cl_image == false) { desc.n0 = 8; } @@ -264,11 +282,11 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const IT } else { - if((depth_multiplier % 4) == 0) + if ((depth_multiplier % 4) == 0) { desc.n0 = 4; } - 
else if((depth_multiplier % 2) == 0) + else if ((depth_multiplier % 2) == 0) { desc.n0 = 2; } @@ -279,14 +297,15 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const IT } // Note: If we reduce n0, export to cl_image must be false - ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true)); + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); desc.n0 = adjust_vec_size(desc.n0, kernel_c); // Set m0 only if stride_x == 1 and dilation_x == 1 - if(conv_info.stride().first == 1 && dilation.x() == 1) + if (conv_info.stride().first == 1 && dilation.x() == 1) { - if((kernel_w >= 9) || (kernel_w == 1)) + if ((kernel_w >= 9) || (kernel_w == 1)) { desc.m0 = 1; } diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h index 4d51fa668ca8d482eddc05d1146b82ca65a9d3e4..fabce77b54b9a023ace8efd8e8fa5bb14ea3d1fe 100644 --- a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h @@ -41,18 +41,33 @@ public: ClDWCNativeDefaultConfigValhall(GPUTarget gpu); // Inherited overridden method - DWCComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) override; + DWCComputeKernelInfo configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) override; private: - DWCComputeKernelInfo configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier); - DWCComputeKernelInfo configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier); - DWCComputeKernelInfo configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier); - DWCComputeKernelInfo configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G78_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G78_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G78_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G77_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); }; } // namespace cl_dwc } // namespace arm_compute diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp index 5593c6de6119ff54a208e360b1f860d4345280a0..c8b006c546a3ec34b42e20dcd0a33fbe72a89b29 100644 --- a/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp +++ 
b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp @@ -32,7 +32,7 @@ namespace cl_dwc bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_multiplier) { // Check whether we can use the cl image with the weights. - if(!export_to_cl_image(weights)) + if (!export_to_cl_image(weights)) { return false; } @@ -45,12 +45,12 @@ bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_mul // If we can use the cl image storage with the weights, we prefer to use the cl buffer storage in the following cases for performance reasons: // 1- When the kernel size is 1x1 // 2- When the depth multiplier is greater than 1 and not multiple of 4. - if((kernel_w == 1) && (kernel_h == 1)) + if ((kernel_w == 1) && (kernel_h == 1)) { return false; } - if((depth_multiplier > 1) && (depth_multiplier % 4) != 0) + if ((depth_multiplier > 1) && (depth_multiplier % 4) != 0) { return false; } @@ -58,4 +58,4 @@ bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_mul return true; } } // namespace cl_dwc -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h index c08053dcb309a30a86a27f10248f38d156f9cc03..49ce6ff4793cf5ae9fa49acf7300d545682ca2f1 100644 --- a/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h @@ -46,7 +46,7 @@ public: */ static std::unique_ptr create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch (get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: // The heuristic for Midgard is the same as the one used for Arm Mali-G71 diff --git a/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h index b5df132a126b6365227bcb11cf0bf47178c5507c..614a6622dfd5718a7a44b5c603acb1bde60d77a9 100644 --- a/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h +++ b/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h @@ -27,6 +27,7 @@ #include "arm_compute/core/GPUTarget.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" + #include "src/core/common/Macros.h" namespace arm_compute @@ -52,8 +53,7 @@ public: * @param[in] func_int8 Function to call for depthwise convolution Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL) * */ - ClDWCNativeConfigArray(T func_f32, T func_f16, T func_int8) - : _configs{ func_f32, func_f16, func_int8 } + ClDWCNativeConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8} { } @@ -65,7 +65,7 @@ public: */ T get_function(DataType data_type) { - switch(data_type) + switch (data_type) { case DataType::F32: return _configs.at(DT_F32); @@ -92,8 +92,7 @@ public: * * @param[in] arch GPU target */ - IClDWCNativeKernelConfig(GPUTarget arch) - : _target(arch) + IClDWCNativeKernelConfig(GPUTarget arch) : _target(arch) { } ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClDWCNativeKernelConfig); @@ -107,8 +106,11 @@ public: * @param[in] dilation Kernel dilation * @param[in] depth_multiplier Output feature maps multiplier */ - virtual DWCComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) = 0; + virtual DWCComputeKernelInfo configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const 
Size2D &dilation, + unsigned int depth_multiplier) = 0; protected: GPUTarget _target; diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp index 990f050112d6af6413c76b893aeb35e30a1fdfb4..3380d8f1b7c0f48fde1440c3f91b76fd0289d408 100644 --- a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp +++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp @@ -35,17 +35,19 @@ namespace cl_indirect_conv { using namespace arm_compute::misc::shape_calculator; -ClIndirectConvDefaultConfigValhall::ClIndirectConvDefaultConfigValhall(GPUTarget gpu) - : IClIndirectConvKernelConfig(gpu) +ClIndirectConvDefaultConfigValhall::ClIndirectConvDefaultConfigValhall(GPUTarget gpu) : IClIndirectConvKernelConfig(gpu) { } -DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { - using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClIndirectConvDefaultConfigValhall::*)(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClIndirectConvDefaultConfigValhall::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - ClIndirectConvConfigArray configs_G77(&ClIndirectConvDefaultConfigValhall::configure_G77_f32, - &ClIndirectConvDefaultConfigValhall::configure_G77_f16); + ClIndirectConvConfigArray configs_G77( + &ClIndirectConvDefaultConfigValhall::configure_G77_f32, &ClIndirectConvDefaultConfigValhall::configure_G77_f16); // Important note: Indirect convolution should not be used when the kernel size is 1x1 (pointwise). The reason is because the indirect buffer makes // indirect convolution less efficient than direct convolution or gemm. 
For this reason, the heuristic of indirect convolution has not been tuned @@ -57,22 +59,24 @@ DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure(const return (this->*func)(src, wei, conv_info); } -DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { - const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); const bool export_weights_to_cl_image = export_to_cl_image(wei); - const int32_t stride_x = conv_info.stride().first; - const int32_t stride_y = conv_info.stride().second; - const int32_t ofm = dst_shape[0]; - const int32_t m = (dst_shape[1]/ stride_x) * (dst_shape[2] / stride_y); + const int32_t stride_x = conv_info.stride().first; + const int32_t stride_y = conv_info.stride().second; + const int32_t ofm = dst_shape[0]; + const int32_t m = (dst_shape[1] / stride_x) * (dst_shape[2] / stride_y); desc.export_weights_to_cl_image = export_weights_to_cl_image; - if(ofm <= 4) + if (ofm <= 4) { desc.m0 = 1; desc.n0 = 2; @@ -82,7 +86,7 @@ DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f3 { // The 16000 threshold value has been identified as the right // one for using the biggest block size allowed on F32: 5x4x4 - if(m < 16000) + if (m < 16000) { desc.m0 = 4; desc.n0 = 4; @@ -100,31 +104,33 @@ DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f3 return desc; } -DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { - const TensorShape wei_shape = wei->tensor_shape(); - const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const TensorShape wei_shape = wei->tensor_shape(); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); const bool export_weights_to_cl_image = export_to_cl_image(wei); - const int32_t ofm = dst_shape[0]; - const int32_t m = dst_shape[1] * dst_shape[2]; - const int32_t k = wei_shape[0]; + const int32_t ofm = dst_shape[0]; + const int32_t m = dst_shape[1] * dst_shape[2]; + const int32_t k = wei_shape[0]; desc.export_weights_to_cl_image = export_weights_to_cl_image; - if(ofm <= 4) + if (ofm <= 4) { // k0 should be as larger as possible. However, we should avoid // having left-over for loops that make the implementation slower. 
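The if-chain reformatted in the next hunk implements the rule stated in that comment: pick the largest k0 that divides the innermost dimension so the OpenCL kernel never needs a tail loop. A standalone paraphrase is below; the final fallback value is an assumption, since that branch lies outside the visible hunk.

    #include <cstdint>

    // Illustrative paraphrase of the k0 heuristic in configure_G77_f16(); not library code.
    static int32_t choose_k0(int32_t k)
    {
        if ((k % 16) == 0)
        {
            return 16;
        }
        else if ((k % 8) == 0)
        {
            return 8;
        }
        return 4; // assumed fallback; the corresponding branch is not shown in this patch
    }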
- if((k % 16) == 0) + if ((k % 16) == 0) { desc.k0 = 16; } - else if((k % 8) == 0) + else if ((k % 8) == 0) { desc.k0 = 8; } @@ -140,11 +146,11 @@ DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f1 { // The 16000 threshold value has been identified as the right // one for using the biggest block size allowed on F16: 8x4 - if(m >= 16000 && k < 4) + if (m >= 16000 && k < 4) { desc.m0 = 8; desc.n0 = 4; - desc.k0 = 4; // k0 is clamped to k inside the kernel when k is less than 4 + desc.k0 = 4; // k0 is clamped to k inside the kernel when k is less than 4 } else { diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h index 68dca918859aa8380f9e059a06b98f7969f56b1c..bab808c66c9f4771dac214694471b7f7e5e25dc1 100644 --- a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h +++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h @@ -41,11 +41,14 @@ public: ClIndirectConvDefaultConfigValhall(GPUTarget gpu); // Inherited overridden method - DirectConvComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override; + DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override; private: - DirectConvComputeKernelInfo configure_G77_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - DirectConvComputeKernelInfo configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G77_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); }; } // namespace cl_indirect_conv } // namespace arm_compute diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h index 73fbb8756039e889ecd10ffe4a98fef549460094..dd614e1f68728881556b1c09b7f378bcb2936038 100644 --- a/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h +++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h @@ -45,7 +45,7 @@ public: */ static std::unique_ptr create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch (get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: case GPUTarget::BIFROST: diff --git a/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h index d2f4cde662e96de20ef584b9947b36fa8f4d79b7..d05da18b581e79f3efcc2a7956c52302e539d4b6 100644 --- a/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h +++ b/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h @@ -27,6 +27,7 @@ #include "arm_compute/core/GPUTarget.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" + #include "src/core/common/Macros.h" namespace arm_compute @@ -49,8 +50,7 @@ public: * @param[in] func_f16 Function to call for indirect convolution F16 * */ - ClIndirectConvConfigArray(T func_f32, T func_f16) - : _configs{ func_f32, func_f16} + ClIndirectConvConfigArray(T func_f32, T func_f16) : _configs{func_f32, func_f16} { } @@ -62,7 +62,7 @@ public: */ T get_function(DataType data_type) { - switch(data_type) + switch (data_type) { case 
DataType::F32: return _configs.at(DT_F32); @@ -85,8 +85,7 @@ public: * * @param[in] arch GPU target */ - IClIndirectConvKernelConfig(GPUTarget arch) - : _target(arch) + IClIndirectConvKernelConfig(GPUTarget arch) : _target(arch) { } ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClIndirectConvKernelConfig); @@ -98,7 +97,8 @@ public: * @param[in] wei Weights tensor * @param[in] conv_info Convolution info */ - virtual DirectConvComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0; + virtual DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0; protected: GPUTarget _target; diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp index 01102b3d602da5d8bb27d71c9bf2be4b9ed59040..4b923547c47e1aeb39c9823bc489a0f5e68d7577 100644 --- a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp @@ -28,31 +28,42 @@ #include "arm_compute/core/GPUTarget.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/TensorInfo.h" -#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" -#include +#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" #include "src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h" +#include + namespace arm_compute { namespace cl_matmul { -ClMatMulNativeDefaultConfigValhall::ClMatMulNativeDefaultConfigValhall(GPUTarget gpu) - : IClMatMulNativeKernelConfig(gpu) +ClMatMulNativeDefaultConfigValhall::ClMatMulNativeDefaultConfigValhall(GPUTarget gpu) : IClMatMulNativeKernelConfig(gpu) { } -MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info) +MatMulKernelInfo +ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info) { - using ConfigurationFunctionExecutorPtr = MatMulKernelInfo (ClMatMulNativeDefaultConfigValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo & info); + using ConfigurationFunctionExecutorPtr = MatMulKernelInfo (ClMatMulNativeDefaultConfigValhall::*)( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); - ClMatMulNativeConfigArray configs_G710(&ClMatMulNativeDefaultConfigValhall::configure_G710_f32, - &ClMatMulNativeDefaultConfigValhall::configure_G710_f16, - &ClMatMulNativeDefaultConfigValhall::configure_G710_u8); + ClMatMulNativeConfigArray configs_G710( + &ClMatMulNativeDefaultConfigValhall::configure_G710_f32, + &ClMatMulNativeDefaultConfigValhall::configure_G710_f16, + &ClMatMulNativeDefaultConfigValhall::configure_G710_u8); + + ClMatMulNativeConfigArray configs_G715( + &ClMatMulNativeDefaultConfigValhall::configure_G715_f32, + &ClMatMulNativeDefaultConfigValhall::configure_G715_f16, + &ClMatMulNativeDefaultConfigValhall::configure_G715_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { + case GPUTarget::G715: + func = configs_G715.get_function(lhs->data_type()); + break; case GPUTarget::G710: default: func = configs_G710.get_function(lhs->data_type()); @@ -67,7 +78,7 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo const bool is_batched = lhs_shape.num_dimensions() > 2; - 
if(is_batched == true) + if (is_batched == true) { lhs_shape.collapse_from(2); } @@ -81,103 +92,68 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo return (this->*func)(m, n, k, b, rhs->lock_paddings(), info); } -MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G715_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) { - const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = - { - { 3136, 64, 64, 36, 4, 4, 16, 1 }, - { 4096, 48, 32, 36, 4, 4, 4, 1 }, - { 688, 92, 68, 32, 2, 8, 4, 1 }, - { 24, 464, 412, 24, 2, 8, 4, 1 }, - { 112, 184, 144, 28, 4, 4, 16, 1 }, - { 5776, 64, 32, 36, 2, 4, 16, 1 }, - { 1568, 64, 40, 36, 2, 8, 8, 1 }, - { 2920, 64, 64, 24, 4, 4, 16, 1 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt = - { - { 3136, 64, 64, 36, 4, 4, 8, 0 }, - { 4096, 48, 32, 36, 4, 4, 8, 0 }, - { 688, 92, 68, 32, 5, 4, 4, 0 }, - { 24, 464, 412, 24, 6, 2, 8, 0 }, - { 112, 184, 144, 28, 6, 4, 4, 0 }, - { 5776, 64, 32, 36, 5, 4, 4, 0 }, - { 1568, 64, 40, 36, 4, 4, 8, 0 }, - { 2920, 64, 64, 24, 4, 4, 8, 0 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = - { - { 3136, 64, 64, 36, 4, 4, 4, 1 }, - { 4096, 48, 32, 36, 2, 2, 16, 1 }, - { 688, 92, 68, 32, 4, 4, 4, 1 }, - { 24, 464, 412, 24, 6, 2, 8, 1 }, - { 112, 184, 144, 28, 4, 2, 16, 1 }, - { 5776, 64, 32, 36, 4, 4, 4, 1 }, - { 1568, 64, 40, 36, 4, 4, 8, 1 }, - { 2920, 64, 64, 24, 4, 4, 4, 1 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t = - { - { 3136, 64, 64, 36, 5, 4, 4, 0 }, - { 4096, 48, 32, 36, 5, 4, 4, 0 }, - { 688, 92, 68, 32, 5, 4, 4, 0 }, - { 24, 464, 412, 24, 6, 2, 4, 0 }, - { 112, 184, 144, 28, 5, 4, 4, 0 }, - { 5776, 64, 32, 36, 5, 4, 4, 0 }, - { 1568, 64, 40, 36, 5, 4, 4, 0 }, - { 2920, 64, 64, 24, 6, 2, 4, 0 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = - { - { 3136, 64, 64, 36, 4, 4, 16, 1 }, - { 4096, 48, 32, 36, 4, 4, 4, 1 }, - { 688, 92, 68, 32, 2, 8, 4, 1 }, - { 24, 464, 412, 24, 2, 8, 4, 1 }, - { 112, 184, 144, 28, 4, 4, 16, 1 }, - { 5776, 64, 32, 36, 2, 8, 8, 1 }, - { 1568, 64, 40, 36, 4, 4, 8, 1 }, - { 2920, 64, 64, 24, 4, 4, 16, 1 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt = - { - { 3136, 64, 64, 36, 4, 4, 4, 0 }, - { 4096, 48, 32, 36, 4, 4, 4, 0 }, - { 688, 92, 68, 32, 4, 4, 4, 0 }, - { 24, 464, 412, 24, 4, 4, 4, 0 }, - { 112, 184, 144, 28, 4, 4, 4, 0 }, - { 5776, 64, 32, 36, 4, 4, 8, 0 }, - { 1568, 64, 40, 36, 4, 4, 4, 0 }, - { 2920, 64, 64, 24, 4, 4, 4, 0 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = - { - { 3136, 64, 64, 36, 4, 4, 4, 1 }, - { 4096, 48, 32, 36, 4, 4, 4, 1 }, - { 688, 92, 68, 32, 4, 4, 4, 1 }, - { 24, 464, 412, 24, 2, 2, 16, 1 }, - { 112, 184, 144, 28, 4, 4, 4, 1 }, - { 5776, 64, 32, 36, 4, 4, 4, 1 }, - { 1568, 64, 40, 36, 4, 4, 4, 1 }, - { 2920, 64, 64, 24, 4, 4, 4, 1 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t = - { - { 3136, 64, 64, 36, 4, 4, 4, 0 }, - { 4096, 48, 32, 36, 4, 4, 4, 0 }, - { 688, 92, 68, 32, 4, 4, 4, 0 }, - { 24, 464, 412, 24, 4, 2, 8, 0 }, - { 112, 184, 144, 28, 4, 4, 4, 0 }, - { 5776, 64, 32, 36, 4, 4, 4, 0 }, - { 1568, 64, 40, 36, 4, 4, 4, 0 }, - { 2920, 64, 64, 24, 4, 4, 4, 0 } - }; + ARM_COMPUTE_UNUSED(m, n, k, b, rhs_lock_padding); + 
return {info.adj_lhs(), info.adj_rhs(), /* m0 */ 1, /* n0 */ 4, /* k0 */ 1, /* export_to_cl_image */ false}; +} + +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G715_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +{ + return configure_G715_f32(m, n, k, b, rhs_lock_padding, info); +} + +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G715_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +{ + ARM_COMPUTE_UNUSED(m, n, k, b, rhs_lock_padding); + return {info.adj_lhs(), info.adj_rhs(), /* m0 */ 4, /* n0 */ 16, /* k0 */ 4, /* export_to_cl_image */ false}; +} + +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +{ + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 2, 8, 4, 1}, + {24, 464, 412, 24, 2, 8, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 2, 4, 16, 1}, + {1568, 64, 40, 36, 2, 8, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt = { + {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0}, {688, 92, 68, 32, 5, 4, 4, 0}, + {24, 464, 412, 24, 6, 2, 8, 0}, {112, 184, 144, 28, 6, 4, 4, 0}, {5776, 64, 32, 36, 5, 4, 4, 0}, + {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = { + {3136, 64, 64, 36, 4, 4, 4, 1}, {4096, 48, 32, 36, 2, 2, 16, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 6, 2, 8, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 4, 4, 4, 1}, + {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t = { + {3136, 64, 64, 36, 5, 4, 4, 0}, {4096, 48, 32, 36, 5, 4, 4, 0}, {688, 92, 68, 32, 5, 4, 4, 0}, + {24, 464, 412, 24, 6, 2, 4, 0}, {112, 184, 144, 28, 5, 4, 4, 0}, {5776, 64, 32, 36, 5, 4, 4, 0}, + {1568, 64, 40, 36, 5, 4, 4, 0}, {2920, 64, 64, 24, 6, 2, 4, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 2, 8, 4, 1}, + {24, 464, 412, 24, 2, 8, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 2, 8, 8, 1}, + {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt = { + {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 4, 0}, + {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 8, 0}, + {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = { + {3136, 64, 64, 36, 4, 4, 4, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 2, 2, 16, 1}, {112, 184, 144, 28, 4, 4, 4, 1}, {5776, 64, 32, 36, 4, 4, 4, 1}, + {1568, 64, 40, 36, 4, 4, 4, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t = { + {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 4, 0}, + {24, 464, 412, 24, 4, 2, 8, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 4, 0}, + {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}}; 
const bool adj_lhs = info.adj_lhs(); const bool adj_rhs = info.adj_rhs(); @@ -185,17 +161,17 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f32(unsigned const MatMulNativeConfigsMatrix *configs_best_to_use = nullptr; const MatMulNativeConfigsMatrix *configs_fallback_to_use = nullptr; - if((adj_lhs == false) && (adj_rhs == false)) + if ((adj_lhs == false) && (adj_rhs == false)) { configs_best_to_use = &configs_mnkb_best_nt_nt; configs_fallback_to_use = &configs_mnkb_fallback_nt_nt; } - else if((adj_lhs == false) && (adj_rhs == true)) + else if ((adj_lhs == false) && (adj_rhs == true)) { configs_best_to_use = &configs_mnkb_best_nt_t; configs_fallback_to_use = &configs_mnkb_fallback_nt_t; } - else if((adj_lhs == true) && (adj_rhs == false)) + else if ((adj_lhs == true) && (adj_rhs == false)) { configs_best_to_use = &configs_mnkb_best_t_nt; configs_fallback_to_use = &configs_mnkb_fallback_t_nt; @@ -209,108 +185,51 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f32(unsigned MatMulKernelInfo desc0 = find_info(*configs_best_to_use, adj_lhs, adj_rhs, m, n, k, b); MatMulKernelInfo desc1 = find_info(*configs_fallback_to_use, adj_lhs, adj_rhs, m, n, k, b); - return select_info(desc0, - desc1, - m, n, k, b, DataType::F32, rhs_lock_padding); + return select_info(desc0, desc1, m, n, k, b, DataType::F32, rhs_lock_padding); } -MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) { - const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = - { - { 3136, 64, 64, 36, 4, 4, 16, 1 }, - { 4096, 48, 32, 36, 4, 4, 8, 1 }, - { 688, 92, 68, 32, 4, 4, 16, 1 }, - { 24, 464, 412, 24, 4, 4, 4, 1 }, - { 112, 184, 144, 28, 4, 4, 16, 1 }, - { 5776, 64, 32, 36, 4, 4, 8, 1 }, - { 1568, 64, 40, 36, 4, 4, 8, 1 }, - { 2920, 64, 64, 24, 4, 4, 16, 1 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt = - { - { 3136, 64, 64, 36, 6, 4, 8, 0 }, - { 4096, 48, 32, 36, 6, 4, 8, 0 }, - { 688, 92, 68, 32, 6, 4, 8, 0 }, - { 24, 464, 412, 24, 4, 4, 8, 0 }, - { 112, 184, 144, 28, 6, 4, 8, 0 }, - { 5776, 64, 32, 36, 6, 4, 8, 0 }, - { 1568, 64, 40, 36, 6, 4, 8, 0 }, - { 2920, 64, 64, 24, 6, 4, 8, 0 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = - { - { 3136, 64, 64, 36, 6, 4, 8, 1 }, - { 4096, 48, 32, 36, 6, 4, 8, 1 }, - { 688, 92, 68, 32, 4, 4, 4, 1 }, - { 24, 464, 412, 24, 6, 2, 4, 1 }, - { 112, 184, 144, 28, 4, 2, 16, 1 }, - { 5776, 64, 32, 36, 6, 4, 8, 1 }, - { 1568, 64, 40, 36, 6, 4, 8, 1 }, - { 2920, 64, 64, 24, 6, 4, 8, 1 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t = - { - { 3136, 64, 64, 36, 6, 2, 16, 0 }, - { 4096, 48, 32, 36, 5, 4, 8, 0 }, - { 688, 92, 68, 32, 6, 2, 16, 0 }, - { 24, 464, 412, 24, 6, 2, 16, 0 }, - { 112, 184, 144, 28, 6, 2, 16, 0 }, - { 5776, 64, 32, 36, 5, 4, 8, 0 }, - { 1568, 64, 40, 36, 5, 4, 8, 0 }, - { 2920, 64, 64, 24, 6, 2, 16, 0 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = - { - { 3136, 64, 64, 36, 4, 4, 16, 1 }, - { 4096, 48, 32, 36, 4, 4, 4, 1 }, - { 688, 92, 68, 32, 4, 4, 4, 1 }, - { 24, 464, 412, 24, 4, 4, 4, 1 }, - { 112, 184, 144, 28, 4, 4, 4, 1 }, - { 5776, 64, 32, 36, 4, 4, 4, 1 }, - { 1568, 64, 40, 36, 4, 4, 4, 1 }, - { 2920, 64, 64, 24, 4, 4, 4, 1 } - 
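The G710 configuration functions above all follow the same best/fallback pattern: for each transpose combination (nt/nt, nt/t, t/nt, t/t) there is a "best" table whose entries may request the RHS in a cl_image (IMG_RHS = 1) and a "fallback" table that never does, and select_info() keeps the best entry only if ClMatMulNativeKernel::validate() accepts it. A sketch of that decision, using only the validate() call shown in ClMatMulNativeHelpers.cpp further below; the fallback return is assumed, as that branch of select_info() is outside the visible hunk.

    // Sketch of the best/fallback selection; includes mirror ClMatMulNativeHelpers.cpp in this patch.
    #include "arm_compute/core/KernelDescriptors.h"
    #include "arm_compute/core/TensorInfo.h"

    #include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"

    using namespace arm_compute;

    MatMulKernelInfo pick_config(const MatMulKernelInfo &best,
                                 const MatMulKernelInfo &fallback,
                                 const TensorInfo       &lhs,
                                 const TensorInfo       &rhs)
    {
        TensorInfo dst;
        // Keep the "best" entry (which may ask for the RHS in a cl_image) only if the kernel
        // accepts it for these shapes; otherwise use the buffer-only fallback.
        if (bool(opencl::kernels::ClMatMulNativeKernel::validate(&lhs, &rhs, nullptr, &dst, best)))
        {
            return best;
        }
        return fallback; // assumed: the fallback branch of select_info() is not visible in this hunk
    }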
}; - - const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt = - { - { 3136, 64, 64, 36, 4, 4, 4, 0 }, - { 4096, 48, 32, 36, 4, 4, 4, 0 }, - { 688, 92, 68, 32, 4, 4, 4, 0 }, - { 24, 464, 412, 24, 4, 4, 4, 0 }, - { 112, 184, 144, 28, 4, 4, 4, 0 }, - { 5776, 64, 32, 36, 4, 4, 4, 0 }, - { 1568, 64, 40, 36, 4, 4, 4, 0 }, - { 2920, 64, 64, 24, 4, 4, 4, 0 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = - { - { 3136, 64, 64, 36, 4, 4, 16, 1 }, - { 4096, 48, 32, 36, 4, 4, 8, 1 }, - { 688, 92, 68, 32, 4, 4, 4, 1 }, - { 24, 464, 412, 24, 4, 2, 8, 1 }, - { 112, 184, 144, 28, 4, 2, 16, 1 }, - { 5776, 64, 32, 36, 4, 4, 16, 1 }, - { 1568, 64, 40, 36, 4, 4, 8, 1 }, - { 2920, 64, 64, 24, 4, 4, 16, 1 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t = - { - { 3136, 64, 64, 36, 4, 4, 8, 0 }, - { 4096, 48, 32, 36, 4, 4, 8, 0 }, - { 688, 92, 68, 32, 4, 4, 8, 0 }, - { 24, 464, 412, 24, 4, 4, 8, 0 }, - { 112, 184, 144, 28, 4, 4, 8, 0 }, - { 5776, 64, 32, 36, 4, 4, 8, 0 }, - { 1568, 64, 40, 36, 4, 4, 8, 0 }, - { 2920, 64, 64, 24, 4, 4, 8, 0 } - }; + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 16, 1}, + {24, 464, 412, 24, 4, 4, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 4, 4, 8, 1}, + {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt = { + {3136, 64, 64, 36, 6, 4, 8, 0}, {4096, 48, 32, 36, 6, 4, 8, 0}, {688, 92, 68, 32, 6, 4, 8, 0}, + {24, 464, 412, 24, 4, 4, 8, 0}, {112, 184, 144, 28, 6, 4, 8, 0}, {5776, 64, 32, 36, 6, 4, 8, 0}, + {1568, 64, 40, 36, 6, 4, 8, 0}, {2920, 64, 64, 24, 6, 4, 8, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = { + {3136, 64, 64, 36, 6, 4, 8, 1}, {4096, 48, 32, 36, 6, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 6, 2, 4, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 6, 4, 8, 1}, + {1568, 64, 40, 36, 6, 4, 8, 1}, {2920, 64, 64, 24, 6, 4, 8, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t = { + {3136, 64, 64, 36, 6, 2, 16, 0}, {4096, 48, 32, 36, 5, 4, 8, 0}, {688, 92, 68, 32, 6, 2, 16, 0}, + {24, 464, 412, 24, 6, 2, 16, 0}, {112, 184, 144, 28, 6, 2, 16, 0}, {5776, 64, 32, 36, 5, 4, 8, 0}, + {1568, 64, 40, 36, 5, 4, 8, 0}, {2920, 64, 64, 24, 6, 2, 16, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 4, 4, 4, 1}, {112, 184, 144, 28, 4, 4, 4, 1}, {5776, 64, 32, 36, 4, 4, 4, 1}, + {1568, 64, 40, 36, 4, 4, 4, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt = { + {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 4, 0}, + {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 4, 0}, + {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 4, 2, 8, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 4, 4, 16, 1}, + {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t = { + {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0}, {688, 92, 68, 32, 4, 4, 8, 0}, 
+ {24, 464, 412, 24, 4, 4, 8, 0}, {112, 184, 144, 28, 4, 4, 8, 0}, {5776, 64, 32, 36, 4, 4, 8, 0}, + {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}}; const bool adj_lhs = info.adj_lhs(); const bool adj_rhs = info.adj_rhs(); @@ -318,17 +237,17 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f16(unsigned const MatMulNativeConfigsMatrix *configs_best_to_use = nullptr; const MatMulNativeConfigsMatrix *configs_fallback_to_use = nullptr; - if((adj_lhs == false) && (adj_rhs == false)) + if ((adj_lhs == false) && (adj_rhs == false)) { configs_best_to_use = &configs_mnkb_best_nt_nt; configs_fallback_to_use = &configs_mnkb_fallback_nt_nt; } - else if((adj_lhs == false) && (adj_rhs == true)) + else if ((adj_lhs == false) && (adj_rhs == true)) { configs_best_to_use = &configs_mnkb_best_nt_t; configs_fallback_to_use = &configs_mnkb_fallback_nt_t; } - else if((adj_lhs == true) && (adj_rhs == false)) + else if ((adj_lhs == true) && (adj_rhs == false)) { configs_best_to_use = &configs_mnkb_best_t_nt; configs_fallback_to_use = &configs_mnkb_fallback_t_nt; @@ -342,75 +261,46 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f16(unsigned MatMulKernelInfo desc0 = find_info(*configs_best_to_use, adj_lhs, adj_rhs, m, n, k, b); MatMulKernelInfo desc1 = find_info(*configs_fallback_to_use, adj_lhs, adj_rhs, m, n, k, b); - return select_info(desc0, - desc1, - m, n, k, b, DataType::F16, rhs_lock_padding); + return select_info(desc0, desc1, m, n, k, b, DataType::F16, rhs_lock_padding); } -MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) { ARM_COMPUTE_UNUSED(rhs_lock_padding); - const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = - { - { 3136, 64, 64, 36, 6, 4, 4, 0 }, - { 4096, 48, 32, 36, 6, 4, 4, 0 }, - { 688, 92, 68, 32, 2, 8, 4, 0 }, - { 24, 464, 412, 24, 4, 4, 4, 0 }, - { 112, 184, 144, 28, 6, 4, 4, 0 }, - { 5776, 64, 32, 36, 6, 4, 4, 0 }, - { 1568, 64, 40, 36, 6, 4, 4, 0 }, - { 2920, 64, 64, 24, 5, 4, 4, 0 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = - { - { 3136, 64, 64, 36, 4, 4, 16, 0 }, - { 4096, 48, 32, 36, 4, 4, 16, 0 }, - { 688, 92, 68, 32, 4, 4, 16, 0 }, - { 24, 464, 412, 24, 6, 2, 16, 0 }, - { 112, 184, 144, 28, 4, 4, 16, 0 }, - { 5776, 64, 32, 36, 4, 4, 16, 0 }, - { 1568, 64, 40, 36, 6, 4, 4, 0 }, - { 2920, 64, 64, 24, 4, 4, 16, 0 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = - { - { 3136, 64, 64, 36, 4, 4, 8, 0 }, - { 4096, 48, 32, 36, 4, 4, 8, 0 }, - { 688, 92, 68, 32, 4, 4, 4, 0 }, - { 24, 464, 412, 24, 4, 4, 4, 0 }, - { 112, 184, 144, 28, 4, 4, 8, 0 }, - { 5776, 64, 32, 36, 4, 4, 8, 0 }, - { 1568, 64, 40, 36, 4, 4, 8, 0 }, - { 2920, 64, 64, 24, 4, 4, 8, 0 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = - { - { 3136, 64, 64, 36, 4, 2, 16, 0 }, - { 4096, 48, 32, 36, 4, 4, 4, 0 }, - { 688, 92, 68, 32, 4, 4, 8, 0 }, - { 24, 464, 412, 24, 4, 2, 16, 0 }, - { 112, 184, 144, 28, 4, 2, 16, 0 }, - { 5776, 64, 32, 36, 4, 4, 4, 0 }, - { 1568, 64, 40, 36, 4, 4, 8, 0 }, - { 2920, 64, 64, 24, 4, 2, 16, 0 } - }; + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = { + {3136, 64, 64, 36, 6, 4, 4, 0}, {4096, 48, 32, 36, 6, 4, 4, 0}, {688, 92, 68, 32, 2, 8, 4, 0}, + 
{24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 6, 4, 4, 0}, {5776, 64, 32, 36, 6, 4, 4, 0}, + {1568, 64, 40, 36, 6, 4, 4, 0}, {2920, 64, 64, 24, 5, 4, 4, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = { + {3136, 64, 64, 36, 4, 4, 16, 0}, {4096, 48, 32, 36, 4, 4, 16, 0}, {688, 92, 68, 32, 4, 4, 16, 0}, + {24, 464, 412, 24, 6, 2, 16, 0}, {112, 184, 144, 28, 4, 4, 16, 0}, {5776, 64, 32, 36, 4, 4, 16, 0}, + {1568, 64, 40, 36, 6, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 16, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = { + {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0}, {688, 92, 68, 32, 4, 4, 4, 0}, + {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 8, 0}, {5776, 64, 32, 36, 4, 4, 8, 0}, + {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = { + {3136, 64, 64, 36, 4, 2, 16, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 8, 0}, + {24, 464, 412, 24, 4, 2, 16, 0}, {112, 184, 144, 28, 4, 2, 16, 0}, {5776, 64, 32, 36, 4, 4, 4, 0}, + {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 2, 16, 0}}; const bool adj_lhs = info.adj_lhs(); const bool adj_rhs = info.adj_rhs(); - if((adj_lhs == false) && (adj_rhs == false)) + if ((adj_lhs == false) && (adj_rhs == false)) { return find_info(configs_mnkb_best_nt_nt, adj_lhs, adj_rhs, m, n, k, b); } - else if((adj_lhs == false) && (adj_rhs == true)) + else if ((adj_lhs == false) && (adj_rhs == true)) { return find_info(configs_mnkb_best_nt_t, adj_lhs, adj_rhs, m, n, k, b); } - else if((adj_lhs == true) && (adj_rhs == false)) + else if ((adj_lhs == true) && (adj_rhs == false)) { return find_info(configs_mnkb_best_t_nt, adj_lhs, adj_rhs, m, n, k, b); } @@ -419,5 +309,5 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_u8(unsigned return find_info(configs_mnkb_best_t_t, adj_lhs, adj_rhs, m, n, k, b); } } -} // namespace opencl +} // namespace cl_matmul } // namespace arm_compute diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h index fe167d18dd521d099168bafec5e4e3b274af072a..5279871057343f8c2b3cfcc1303c771eceffadaa 100644 --- a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL -#define SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL +#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL_H +#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL_H #include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h" @@ -44,10 +44,19 @@ public: MatMulKernelInfo configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info) override; private: - MatMulKernelInfo configure_G710_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); - MatMulKernelInfo configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); - MatMulKernelInfo configure_G710_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + MatMulKernelInfo configure_G710_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + MatMulKernelInfo configure_G710_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + MatMulKernelInfo configure_G710_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + MatMulKernelInfo configure_G715_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + MatMulKernelInfo configure_G715_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + MatMulKernelInfo configure_G715_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); }; -} // namespace opencl +} // namespace cl_matmul } // namespace arm_compute -#endif /* SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL */ +#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL_H diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp index 1e06e84d4d79914731bb254544bb9ba6649c0e5f..89cad30214f8dc4c2c65665a39a977cbcda6eb6e 100644 --- a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/TensorShape.h" + #include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" #include @@ -37,22 +38,32 @@ namespace cl_matmul { MatMulKernelInfo select_info(const MatMulKernelInfo &info0, const MatMulKernelInfo &info1, - unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type, bool rhs_lock_padding) + unsigned int m, + unsigned int n, + unsigned int k, + unsigned int b, + DataType data_type, + bool rhs_lock_padding) { - ARM_COMPUTE_ERROR_ON_MSG(info1.export_rhs_to_cl_image == true, "The fallback MatMul configuration cannot have export_to_cl_image = true"); - ARM_COMPUTE_ERROR_ON_MSG(info0.adj_lhs != info1.adj_lhs, "The MatMul configurations must have the same adj_lhs value"); - ARM_COMPUTE_ERROR_ON_MSG(info0.adj_rhs != info1.adj_rhs, "The MatMul configurations must have the same adj_rhs value"); + 
ARM_COMPUTE_ERROR_ON_MSG(info1.export_rhs_to_cl_image == true, + "The fallback MatMul configuration cannot have export_to_cl_image = true"); + ARM_COMPUTE_ERROR_ON_MSG(info0.adj_lhs != info1.adj_lhs, + "The MatMul configurations must have the same adj_lhs value"); + ARM_COMPUTE_ERROR_ON_MSG(info0.adj_rhs != info1.adj_rhs, + "The MatMul configurations must have the same adj_rhs value"); const bool adj_lhs = info0.adj_lhs; const bool adj_rhs = info0.adj_rhs; - TensorInfo lhs_info = !adj_lhs ? TensorInfo(TensorShape(k, m, b), 1, data_type) : TensorInfo(TensorShape(m, k, b), 1, data_type); - TensorInfo rhs_info = !adj_rhs ? TensorInfo(TensorShape(n, k, b), 1, data_type) : TensorInfo(TensorShape(k, n, b), 1, data_type); + TensorInfo lhs_info = + !adj_lhs ? TensorInfo(TensorShape(k, m, b), 1, data_type) : TensorInfo(TensorShape(m, k, b), 1, data_type); + TensorInfo rhs_info = + !adj_rhs ? TensorInfo(TensorShape(n, k, b), 1, data_type) : TensorInfo(TensorShape(k, n, b), 1, data_type); TensorInfo dst_info; - if(rhs_lock_padding == false) + if (rhs_lock_padding == false) { - if(bool(opencl::kernels::ClMatMulNativeKernel::validate(&lhs_info, &rhs_info, nullptr, &dst_info, info0))) + if (bool(opencl::kernels::ClMatMulNativeKernel::validate(&lhs_info, &rhs_info, nullptr, &dst_info, info0))) { return info0; } @@ -67,7 +78,13 @@ MatMulKernelInfo select_info(const MatMulKernelInfo &info0, } } -MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, bool adj_lhs, bool adj_rhs, unsigned int m, unsigned int n, unsigned int k, unsigned int b) +MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, + bool adj_lhs, + bool adj_rhs, + unsigned int m, + unsigned int n, + unsigned int k, + unsigned int b) { size_t min_acc = std::numeric_limits::max(); size_t min_idx = 0; @@ -76,12 +93,13 @@ MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, bool adj_lh const size_t num_rows = configs.size(); const size_t num_cols = configs[0].size(); - ARM_COMPUTE_ERROR_ON_MSG(num_cols != 8U, "The entry should have 8 integer values representing: M, N, K, B, M0, N0. K0, IMG_RHS"); + ARM_COMPUTE_ERROR_ON_MSG(num_cols != 8U, + "The entry should have 8 integer values representing: M, N, K, B, M0, N0. 
K0, IMG_RHS"); ARM_COMPUTE_UNUSED(num_cols); // Find nearest GeMM workload // Note: the workload does not depend on the K dimension - for(size_t y = 0; y < num_rows; ++y) + for (size_t y = 0; y < num_rows; ++y) { size_t mc0 = static_cast(configs[y][0]); size_t nc0 = static_cast(configs[y][1]); @@ -94,7 +112,7 @@ MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, bool adj_lh acc += (k - kc0) * (k - kc0); acc += (b - bc0) * (b - bc0); acc = std::sqrt(acc); - if(acc < min_acc) + if (acc < min_acc) { min_acc = acc; min_idx = y; diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h index 3881617558a4eccf9480885946492878d441451d..a114fffa6845fdd6a7a1c5048cc7cb17766fd66b 100644 --- a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h @@ -52,7 +52,12 @@ using MatMulNativeConfigsMatrix = std::vector>; */ MatMulKernelInfo select_info(const MatMulKernelInfo &info0, const MatMulKernelInfo &info1, - unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type, bool rhs_lock_padding); + unsigned int m, + unsigned int n, + unsigned int k, + unsigned int b, + DataType data_type, + bool rhs_lock_padding); /** Find the preferred configurations for the MatMul Native kernel using the MatMulNativeConfigsMatrix provided by the user * @@ -66,7 +71,13 @@ MatMulKernelInfo select_info(const MatMulKernelInfo &info0, * * @return @ref MatMulKernelInfo */ -MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, bool adj_lhs, bool adj_rhs, unsigned int m, unsigned int n, unsigned int k, unsigned int b); +MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, + bool adj_lhs, + bool adj_rhs, + unsigned int m, + unsigned int n, + unsigned int k, + unsigned int b); } // namespace cl_matmul } // namespace arm_compute #endif /* SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS */ diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h index a2dbfc7dd5d3c0d474ea7bf5a7e2ba6ae77f806a..b10018a6d25623a13882b381c04643d91297f367 100644 --- a/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h @@ -45,7 +45,7 @@ public: */ static std::unique_ptr create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch (get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: case GPUTarget::BIFROST: @@ -56,6 +56,6 @@ public: } } }; -} // namespace opencl +} // namespace cl_matmul } // namespace arm_compute #endif /* SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG */ diff --git a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h index 4f548bd01d6df6b0948e53b13e59561c9f1a2489..b9b091100c7fdc94dd2cd1ff0adc2c25c9f89040 100644 --- a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h +++ b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h @@ -28,6 +28,7 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" #include "arm_compute/function_info/MatMulInfo.h" + #include "src/core/common/Macros.h" namespace arm_compute @@ -53,8 +54,7 @@ public: * @param[in] func_int8 Function to call for matmul native Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL) * */ - 
ClMatMulNativeConfigArray(T func_f32, T func_f16, T func_int8) - : _configs{ func_f32, func_f16, func_int8 } + ClMatMulNativeConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8} { } @@ -66,7 +66,7 @@ public: */ T get_function(DataType data_type) { - switch(data_type) + switch (data_type) { case DataType::F32: return _configs.at(DT_F32); @@ -93,8 +93,7 @@ public: * * @param[in] arch GPU target */ - IClMatMulNativeKernelConfig(GPUTarget arch) - : _target(arch) + IClMatMulNativeKernelConfig(GPUTarget arch) : _target(arch) { } ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClMatMulNativeKernelConfig); diff --git a/support/Bfloat16.h b/support/Bfloat16.h index e67c729a6cd18072f5546f0f0b6992848664f7ac..17013294e2d1ce53324f7324b262fb4fb8bef36e 100644 --- a/support/Bfloat16.h +++ b/support/Bfloat16.h @@ -43,18 +43,17 @@ inline uint16_t float_to_bf16(const float v) #if defined(ARM_COMPUTE_ENABLE_BF16) uint16_t res; - __asm __volatile( - "ldr s0, [%[fromptr]]\n" - ".inst 0x1e634000\n" // BFCVT h0, s0 - "str h0, [%[toptr]]\n" - : - : [fromptr] "r"(fromptr), [toptr] "r"(&res) - : "v0", "memory"); + __asm __volatile("ldr s0, [%[fromptr]]\n" + ".inst 0x1e634000\n" // BFCVT h0, s0 + "str h0, [%[toptr]]\n" + : + : [fromptr] "r"(fromptr), [toptr] "r"(&res) + : "v0", "memory"); #else /* defined(ARM_COMPUTE_ENABLE_BF16) */ uint16_t res = (*fromptr >> 16); const uint16_t error = (*fromptr & 0x0000ffff); uint16_t bf_l = res & 0x0001; - if((error > 0x8000) || ((error == 0x8000) && (bf_l != 0))) + if ((error > 0x8000) || ((error == 0x8000) && (bf_l != 0))) { res += 1; } @@ -75,23 +74,21 @@ inline float bf16_to_float(const uint16_t &v) memcpy(&fp, &lv, sizeof(lv)); return fp; } -} +} // namespace /** Brain floating point representation class */ class bfloat16 final { public: /** Default Constructor */ - bfloat16() - : value(0) + bfloat16() : value(0) { } /** Constructor * * @param[in] v Floating-point value */ - bfloat16(float v) - : value(float_to_bf16(v)) + bfloat16(float v) : value(float_to_bf16(v)) { } /** Assignment operator diff --git a/support/Cast.h b/support/Cast.h index 53d5f680653aa792752c6bcf98dccd9cd5385b9b..5fd763690b9e0ed37a87a3b168f3de09ad94e870 100644 --- a/support/Cast.h +++ b/support/Cast.h @@ -46,7 +46,7 @@ namespace cast template inline Target polymorphic_cast(Source *v) { - if(dynamic_cast(v) == nullptr) + if (dynamic_cast(v) == nullptr) { ARM_COMPUTE_THROW(std::bad_cast()); } @@ -86,7 +86,7 @@ inline Target polymorphic_downcast(Source *v) template std::unique_ptr polymorphic_cast_unique_ptr(std::unique_ptr &&v) { - if(dynamic_cast(v.get()) == nullptr) + if (dynamic_cast(v.get()) == nullptr) { ARM_COMPUTE_THROW(std::bad_cast()); } diff --git a/support/DeepCopy.h b/support/DeepCopy.h index 01178979017152905474522a08be8adaeb84dfaa..c0279284c0aa984bc9c508fb58d95f8ceca6cdfa 100644 --- a/support/DeepCopy.h +++ b/support/DeepCopy.h @@ -40,9 +40,8 @@ namespace template Base *default_polymorphic_copy(const Base *ptr) { - static_assert(std::is_base_of::value, - "Derived is not a specialization of Base"); - if(ptr == nullptr) + static_assert(std::is_base_of::value, "Derived is not a specialization of Base"); + if (ptr == nullptr) { return nullptr; } @@ -62,25 +61,18 @@ class deep_unique_ptr public: using CopyFunc = std::function; - deep_unique_ptr(std::nullptr_t val = nullptr) noexcept - : _val{ val }, - _copy{} + deep_unique_ptr(std::nullptr_t val = nullptr) noexcept : _val{val}, _copy{} { } template - deep_unique_ptr(Derived *value, const CopyFuncDerived ©) noexcept - : 
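The Bfloat16.h hunk above only reflows the conversion code: when ARM_COMPUTE_ENABLE_BF16 is defined the BFCVT instruction does the work, otherwise the software path keeps the top 16 bits of the float and rounds to nearest with ties to even. A stand-alone mirror of that fallback logic, with a small self-check (1.0f is exactly representable in bfloat16 as 0x3F80); this reproduces the patch's logic for illustration and is not the library code itself.

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Mirror of the non-BF16 fallback path of float_to_bf16() shown above.
    static uint16_t to_bf16_rne(float v)
    {
        uint32_t bits;
        std::memcpy(&bits, &v, sizeof(bits));
        uint16_t       res   = uint16_t(bits >> 16);    // keep the top 16 bits
        const uint16_t error = uint16_t(bits & 0xffffu); // discarded mantissa bits
        if ((error > 0x8000) || ((error == 0x8000) && ((res & 0x1) != 0)))
        {
            ++res; // round up on > half, or on an exact half when the kept LSB is odd (ties-to-even)
        }
        return res;
    }

    int main()
    {
        assert(to_bf16_rne(1.0f) == 0x3F80);
        return 0;
    }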
_val{ value }, - _copy{ std::move(copy) } + deep_unique_ptr(Derived *value, const CopyFuncDerived ©) noexcept : _val{value}, _copy{std::move(copy)} { - static_assert(std::is_base_of::value, - "Derived is not a specialization of Base"); - static_assert( - std::is_constructible::value, - "CopyFuncDerived is not valid for a copy functor"); + static_assert(std::is_base_of::value, "Derived is not a specialization of Base"); + static_assert(std::is_constructible::value, + "CopyFuncDerived is not valid for a copy functor"); } - deep_unique_ptr(const deep_unique_ptr &ptr) - : deep_unique_ptr(ptr.clone()) + deep_unique_ptr(const deep_unique_ptr &ptr) : deep_unique_ptr(ptr.clone()) { } deep_unique_ptr &operator=(const deep_unique_ptr &ptr) @@ -90,7 +82,7 @@ public: return *this; } - deep_unique_ptr(deep_unique_ptr &&ptr) = default; + deep_unique_ptr(deep_unique_ptr &&ptr) = default; deep_unique_ptr &operator=(deep_unique_ptr &&ptr) = default; ~deep_unique_ptr() = default; friend void swap(deep_unique_ptr &ptr0, deep_unique_ptr &ptr1) noexcept @@ -135,11 +127,11 @@ public: bool operator==(const deep_unique_ptr &rhs) const { - if(rhs.get() == nullptr && _val == nullptr) + if (rhs.get() == nullptr && _val == nullptr) { return true; } - else if(rhs.get() == nullptr || _val == nullptr) + else if (rhs.get() == nullptr || _val == nullptr) { return false; } @@ -152,9 +144,9 @@ public: private: deep_unique_ptr clone() const { - return { _copy(_val.get()), CopyFunc(_copy) }; + return {_copy(_val.get()), CopyFunc(_copy)}; } - std::unique_ptr _val{ nullptr }; + std::unique_ptr _val{nullptr}; CopyFunc _copy{}; }; @@ -170,34 +162,26 @@ private: template deep_unique_ptr make_deep_unique(Derived &&temp, CopyFunc copy) { - return - { - new Derived(std::move(temp)), - CopyFunc{ std::move(copy) } - }; + return {new Derived(std::move(temp)), CopyFunc{std::move(copy)}}; } template deep_unique_ptr make_deep_unique(Derived &&temp) { - static_assert(std::is_base_of::value, - "Derived is not a specialization of Base"); + static_assert(std::is_base_of::value, "Derived is not a specialization of Base"); - return make_deep_unique( - std::move(temp), default_polymorphic_copy); + return make_deep_unique(std::move(temp), default_polymorphic_copy); } template -deep_unique_ptr make_deep_unique(Args &&... args) +deep_unique_ptr make_deep_unique(Args &&...args) { - static_assert(std::is_constructible::value, - "Cannot instantiate Derived from arguments"); + static_assert(std::is_constructible::value, "Cannot instantiate Derived from arguments"); - return make_deep_unique( - std::move(Derived{ std::forward(args)... 
})); + return make_deep_unique(std::move(Derived{std::forward(args)...})); } } // namespace memory } // namespace utils } // namespace arm_compute -#endif // ARM_COMPUTE_MISC_ITERABLE_H \ No newline at end of file +#endif // ARM_COMPUTE_MISC_ITERABLE_H diff --git a/support/Half.h b/support/Half.h index 081da5ebc1a0d5b1d8b325ca9da4e7e9f03988a5..f5c27da2d3483f381a7c79c9dd2d22e556e32b6a 100644 --- a/support/Half.h +++ b/support/Half.h @@ -24,12 +24,12 @@ #ifndef __ARM_COMPUTE_HALF_H__ #define __ARM_COMPUTE_HALF_H__ -#if(BARE_METAL) +#if (BARE_METAL) #define HALF_ENABLE_CPP11_CMATH 0 #endif /* BARE_METAL */ // Set style to round to nearest -#define HALF_ROUND_STYLE 1 +#define HALF_ROUND_STYLE 1 #define HALF_ROUND_TIES_TO_EVEN 1 #include "half/half.hpp" diff --git a/support/Iterable.h b/support/Iterable.h index a0bafaf4ce883cce4b69a90f4e322fa4f2cafe33..8d99e7019682845ecd88014b01e57468e957f60d 100644 --- a/support/Iterable.h +++ b/support/Iterable.h @@ -44,8 +44,7 @@ public: * * @param[in] it Value to reverse iterate on */ - explicit reverse_iterable(T &it) - : _it(it) + explicit reverse_iterable(T &it) : _it(it) { } diff --git a/support/Mutex.h b/support/Mutex.h index 6e68fa5248c7dc788a0be613205273012228eb53..9c2b55c3ac4c08fc135103cfb4daead61e81f8ae 100644 --- a/support/Mutex.h +++ b/support/Mutex.h @@ -50,10 +50,10 @@ public: ~Mutex() = default; /** Lock */ - void lock() {}; + void lock(){}; /** Unlock */ - void unlock() {}; + void unlock(){}; /** Try the lock. * @@ -73,8 +73,7 @@ public: typedef Mutex mutex_type; public: - explicit lock_guard(Mutex &m_) - : m(m_) + explicit lock_guard(Mutex &m_) : m(m_) { } ~lock_guard() @@ -97,15 +96,14 @@ public: unique_lock() noexcept : m(nullptr) { } - explicit unique_lock(mutex_type &m) - : m(&m) + explicit unique_lock(mutex_type &m) : m(&m) { } - unique_lock(const unique_lock &) = delete; - unique_lock(unique_lock &&) = default; + unique_lock(const unique_lock &) = delete; + unique_lock(unique_lock &&) = default; unique_lock &operator=(const unique_lock &) = delete; - unique_lock &operator=(unique_lock &&) = default; - ~unique_lock() = default; + unique_lock &operator=(unique_lock &&) = default; + ~unique_lock() = default; void lock() { } @@ -121,5 +119,5 @@ private: mutex_type *m; }; #endif /* NO_MULTI_THREADING */ -} +} // namespace arm_compute #endif /* __ARM_COMPUTE_MUTEX_H__ */ diff --git a/support/Random.h b/support/Random.h index 7658e6d529122e29abc96a6b0b9e3c4d0f6623b4..1a804d3290be8c41304130acd2a6591e698fe150 100644 --- a/support/Random.h +++ b/support/Random.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_MISC_RANDOM_H #include "arm_compute/core/Error.h" + #include "utils/Utils.h" #include @@ -47,7 +48,9 @@ public: static constexpr bool is_fp_16bit = std::is_same::value || std::is_same::value; static constexpr bool is_integral = std::is_integral::value && !is_fp_16bit; - using fp_dist = typename std::conditional, std::uniform_real_distribution>::type; + using fp_dist = typename std::conditional, + std::uniform_real_distribution>::type; using DT = typename std::conditional, fp_dist>::type; using result_type = T; using range_pair = std::pair; @@ -62,7 +65,7 @@ public: : _distributions(), _selector() { result_type clow = low; - for(const auto &erange : exclude_ranges) + for (const auto &erange : exclude_ranges) { result_type epsilon = is_integral ? 
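For DeepCopy.h, the reformatted helpers above amount to a value-semantics smart pointer: copying a deep_unique_ptr clones the pointee through the stored copy functor, and make_deep_unique builds one from a Derived object using default_polymorphic_copy. A usage sketch follows; the <Base, Derived> template argument order is an assumption implied by the static_asserts, since the angle-bracket contents are not visible in this flattened patch.

    #include "support/DeepCopy.h"

    #include <iostream>

    struct Shape
    {
        virtual ~Shape()          = default;
        virtual int sides() const = 0;
    };

    struct Square : Shape
    {
        int sides() const override { return 4; }
    };

    void deep_copy_example()
    {
        using arm_compute::utils::memory::deep_unique_ptr;
        using arm_compute::utils::memory::make_deep_unique;

        // Assumed argument order: <Base, Derived>.
        deep_unique_ptr<Shape> a = make_deep_unique<Shape, Square>(Square{});
        deep_unique_ptr<Shape> b = a; // copy performs a deep clone via the stored copy functor
        std::cout << b.get()->sides() << '\n';
    }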
result_type(1) : result_type(std::numeric_limits::epsilon()); diff --git a/support/Rounding.h b/support/Rounding.h index e2732dc459d1bd76cf55aa637518c0579788337d..5691a6680b4f56673c4ecd668c58cf7e49fff54e 100644 --- a/support/Rounding.h +++ b/support/Rounding.h @@ -26,6 +26,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/utils/misc/Traits.h" + #include "support/AclRequires.h" #include "support/ToolchainSupport.h" @@ -153,10 +154,10 @@ inline T round_half_even(T value, T epsilon = std::numeric_limits::epsilon()) T ipart = 0; std::modf(positive_value, &ipart); // If 'value' is exactly halfway between two integers - if(std::abs(positive_value - (ipart + 0.5f)) < epsilon) + if (std::abs(positive_value - (ipart + 0.5f)) < epsilon) { // If 'ipart' is even then return 'ipart' - if(std::fmod(ipart, 2.f) < epsilon) + if (std::fmod(ipart, 2.f) < epsilon) { return support::cpp11::copysign(ipart, value); } @@ -179,7 +180,7 @@ inline T round_half_even(T value, T epsilon = std::numeric_limits::epsilon()) template ::value)> inline T round(T value, RoundingMode rounding_mode) { - switch(rounding_mode) + switch (rounding_mode) { case RoundingMode::TO_ZERO: return round_to_zero(value); diff --git a/support/SaturateCast.h b/support/SaturateCast.h index a9982d8e969c1a6fb6f8015f89b867dc0d72f031..7af9f983eded05264705d178ec944133b25d73ed 100644 --- a/support/SaturateCast.h +++ b/support/SaturateCast.h @@ -26,6 +26,7 @@ #include "arm_compute/core/utils/misc/Traits.h" #include "arm_compute/core/utils/misc/Utility.h" + #include "support/Rounding.h" namespace arm_compute diff --git a/support/Semaphore.h b/support/Semaphore.h index e182b53a2de051732d853bb6b0f5faa4defca089..f44179332b64dffe7a22500ca3c4343a93da6d37 100644 --- a/support/Semaphore.h +++ b/support/Semaphore.h @@ -24,8 +24,9 @@ #ifndef __ARM_COMPUTE_UTILS_SEMAMPHORE_H__ #define __ARM_COMPUTE_UTILS_SEMAMPHORE_H__ -#include "Mutex.h" #include "support/Mutex.h" + +#include "Mutex.h" #include namespace arm_compute @@ -39,8 +40,7 @@ public: * * @param[in] value Semaphore initial value */ - Semaphore(int value = 0) - : _value(value), _m(), _cv() + Semaphore(int value = 0) : _value(value), _m(), _cv() { } /** Signals a semaphore */ @@ -56,10 +56,7 @@ public: inline void wait() { std::unique_lock lock(_m); - _cv.wait(lock, [this]() - { - return _value > 0; - }); + _cv.wait(lock, [this]() { return _value > 0; }); --_value; } @@ -73,8 +70,7 @@ private: class Semaphore { public: - Semaphore(int value = 0) - : _value(value) + Semaphore(int value = 0) : _value(value) { (void)_value; } @@ -93,5 +89,5 @@ private: int _value; }; #endif /* NO_MULTI_THREADING */ -} // arm_compute +} // namespace arm_compute #endif /* __ARM_COMPUTE_UTILS_SEMAMPHORE_H__ */ diff --git a/support/StringSupport.h b/support/StringSupport.h index e8b3ca7ab316ca4abbc3a8d02a159485afd337d5..7d1b5e77786f074cd55e0c28e7d6598fa5822574 100644 --- a/support/StringSupport.h +++ b/support/StringSupport.h @@ -57,14 +57,14 @@ inline int stoi(const std::string &str, std::size_t *pos = 0, NumericBase base = assert(base == NumericBase::BASE_10 || base == NumericBase::BASE_16); unsigned int x; std::stringstream ss; - if(base == NumericBase::BASE_16) + if (base == NumericBase::BASE_16) { ss << std::hex; } ss << str; ss >> x; - if(pos) + if (pos) { std::string s; std::stringstream ss_p; @@ -93,14 +93,14 @@ inline unsigned long stoul(const std::string &str, std::size_t *pos = 0, Numeric assert(base == NumericBase::BASE_10 || base == NumericBase::BASE_16); std::stringstream stream; unsigned long 
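The Rounding.h hunk above is also a pure reflow of round_half_even(): halfway cases go to the nearest even integer and the sign is restored with copysign(). A few expected values as a quick illustration; the arm_compute::utils::rounding namespace is assumed here, as it is not visible in the hunk.

    #include <cassert>

    void round_half_even_examples()
    {
        using arm_compute::utils::rounding::round_half_even; // assumed namespace

        assert(round_half_even(2.5f) == 2.0f);   // tie rounds towards the even integer
        assert(round_half_even(3.5f) == 4.0f);
        assert(round_half_even(-2.5f) == -2.0f); // sign restored with copysign()
        assert(round_half_even(2.4f) == 2.0f);   // non-ties round to nearest as usual
    }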
value = 0; - if(base == NumericBase::BASE_16) + if (base == NumericBase::BASE_16) { stream << std::hex; } stream << str; stream >> value; - if(pos) + if (pos) { std::string s; std::stringstream ss_p; @@ -113,7 +113,7 @@ inline unsigned long stoul(const std::string &str, std::size_t *pos = 0, Numeric return value; } -#if(__ANDROID__ || BARE_METAL) +#if (__ANDROID__ || BARE_METAL) /** Convert integer and float values to string. * * @note This function implements the same behaviour as std::to_string. The @@ -124,7 +124,7 @@ inline unsigned long stoul(const std::string &str, std::size_t *pos = 0, Numeric * @return String representation of @p value. */ template ::type>::value, int>::type = 0> -inline std::string to_string(T && value) +inline std::string to_string(T &&value) { std::stringstream stream; stream << std::forward(value); @@ -186,7 +186,7 @@ inline std::string to_string(const std::string &value) * @return Float representation of input string. */ template -int stof(Ts &&... args) +int stof(Ts &&...args) { return ::std::stof(std::forward(args)...); } diff --git a/support/ToolchainSupport.h b/support/ToolchainSupport.h index 96826dad5ec4d2f7ae6dedcd151f9ba9340e7475..4d394889c392af8d15ea51b7ad35b026488cc5ed 100644 --- a/support/ToolchainSupport.h +++ b/support/ToolchainSupport.h @@ -24,6 +24,9 @@ #ifndef ARM_COMPUTE_SUPPORT_TOOLCHAINSUPPORT #define ARM_COMPUTE_SUPPORT_TOOLCHAINSUPPORT +#include "support/Bfloat16.h" +#include "support/Half.h" + #include #include #include @@ -33,9 +36,6 @@ #include #include -#include "support/Bfloat16.h" -#include "support/Half.h" - #ifndef M_PI #define M_PI (3.14159265358979323846) #endif // M_PI @@ -50,7 +50,7 @@ namespace support { namespace cpp11 { -#if(__ANDROID__ || BARE_METAL) +#if (__ANDROID__ || BARE_METAL) template inline T nearbyint(T value) { @@ -129,11 +129,12 @@ inline T copysign(T x, T y) * * @return Result floating point value equal to (x*y) + z.c */ -template < typename T, typename = typename std::enable_if < std::is_floating_point::value +template ::value #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - || std::is_same::value + || std::is_same::value #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - >::type > + >::type> inline T fma(T x, T y, T z) { return ::fma(x, y, z); @@ -151,7 +152,7 @@ inline T fma(T x, T y, T z) * if successful (not including the ending null character), or a negative value if an error occurred. */ template -inline int snprintf(char *s, size_t n, const char *fmt, Ts &&... args) +inline int snprintf(char *s, size_t n, const char *fmt, Ts &&...args) { return ::snprintf(s, n, fmt, std::forward(args)...); } @@ -244,11 +245,12 @@ inline T copysign(T x, T y) * * @return Result floating point value equal to (x*y) + z. */ -template < typename T, typename = typename std::enable_if < std::is_floating_point::value +template ::value #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - || std::is_same::value + || std::is_same::value #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - >::type > + >::type> inline T fma(T x, T y, T z) { return std::fma(x, y, z); @@ -266,7 +268,7 @@ inline T fma(T x, T y, T z) * if successful (not including the ending null character), or a negative value if an error occurred. */ template -inline int snprintf(char *s, std::size_t n, const char *fmt, Ts &&... 
args) +inline int snprintf(char *s, std::size_t n, const char *fmt, Ts &&...args) { return std::snprintf(s, n, fmt, std::forward(args)...); } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index c4b12e770a1044fa50dd0fd675341f82d42e5e8e..3f2223596f2395647be991e889992fa5fd749e40 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -67,7 +67,6 @@ target_sources( validation/reference/L2NormalizeLayer.cpp validation/reference/ActivationLayer.cpp validation/reference/SpaceToBatch.cpp - validation/reference/PostOps.cpp validation/reference/Im2Col.cpp validation/reference/DequantizationLayer.cpp validation/reference/DeconvolutionLayer.cpp diff --git a/tests/SConscript b/tests/SConscript index 33f709e454d8f8d4386f804e9d7e5b7d062e7fa9..21904899c017671f5f4f7eff71813652e418f5b0 100644 --- a/tests/SConscript +++ b/tests/SConscript @@ -79,7 +79,7 @@ load_whole_archive = '-Wl,--whole-archive' noload_whole_archive = '-Wl,--no-whole-archive' if 'macos' in test_env['os']: load_whole_archive = '-Wl,-force_load' - noload_whole_archive = '-Wl,-noall_load' + noload_whole_archive = '' if env['os'] in ['android', 'macos', 'bare_metal'] or env['standalone']: Import("arm_compute_a") diff --git a/tests/datasets/DepthwiseConvolutionLayerDataset.h b/tests/datasets/DepthwiseConvolutionLayerDataset.h index f88cb887fc54c14767df587762b44816917f429f..17e03368acc71d7f2f53903b16c0172b6f9348e8 100644 --- a/tests/datasets/DepthwiseConvolutionLayerDataset.h +++ b/tests/datasets/DepthwiseConvolutionLayerDataset.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_TEST_DEPTHWISE_CONVOLUTION_DATASET -#define ARM_COMPUTE_TEST_DEPTHWISE_CONVOLUTION_DATASET +#ifndef ACL_TESTS_DATASETS_DEPTHWISECONVOLUTIONLAYERDATASET_H +#define ACL_TESTS_DATASETS_DEPTHWISECONVOLUTIONLAYERDATASET_H #include "utils/TypePrinter.h" @@ -127,7 +127,7 @@ public: add_config(TensorShape(33U, 27U, 7U), Size2D(5U, 7U), PadStrideInfo(3, 2, 1, 1, 2, 0, DimensionRoundingType::FLOOR)); add_config(TensorShape(33U, 27U, 7U), Size2D(5U, 7U), PadStrideInfo(3, 2, 1, 1, 0, 2, DimensionRoundingType::FLOOR)); // Ceil rounding - add_config(TensorShape(7U, 8U, 5U, 9U), Size2D(8U, 6U), PadStrideInfo(2, 3, 1, 1, 1, 3, DimensionRoundingType::CEIL), Size2D(1U, 2U)); + add_config(TensorShape(7U, 8U, 5U, 9U), Size2D(8U, 6U), PadStrideInfo(2, 3, 1, 1, 1, 3, DimensionRoundingType::CEIL)); } }; @@ -141,12 +141,12 @@ public: add_config(TensorShape(17U, 31U, 2U), Size2D(13U, 9U), PadStrideInfo(1, 2, 1, 1)); add_config(TensorShape(23U, 27U, 5U), Size2D(11U, 3U), PadStrideInfo(1, 2, 0, 0)); add_config(TensorShape(17U, 31U, 2U, 3U), Size2D(5U, 9U), PadStrideInfo(1, 2, 1, 1)); - add_config(TensorShape(233U, 277U, 55U), Size2D(1U, 1U), PadStrideInfo(2, 1, 0, 0)); - add_config(TensorShape(333U, 277U, 77U), Size2D(1U, 1U), PadStrideInfo(3, 2, 1, 0)); - add_config(TensorShape(177U, 311U, 22U), Size2D(3U, 4U), PadStrideInfo(1, 2, 1, 1)); - add_config(TensorShape(233U, 277U, 55U), Size2D(3U, 4U), PadStrideInfo(1, 2, 0, 0)); - add_config(TensorShape(333U, 277U, 77U), Size2D(3U, 4U), PadStrideInfo(2, 3, 0, 1)); - add_config(TensorShape(177U, 311U, 22U), Size2D(3U, 4U), PadStrideInfo(2, 1, 1, 1)); + add_config(TensorShape(133U, 127U, 55U), Size2D(1U, 1U), PadStrideInfo(2, 1, 0, 0)); + add_config(TensorShape(233U, 109U, 77U), Size2D(1U, 1U), PadStrideInfo(3, 2, 1, 0)); + add_config(TensorShape(177U, 111U, 22U), Size2D(3U, 4U), PadStrideInfo(1, 2, 1, 1)); + add_config(TensorShape(233U, 87U, 55U), 
Size2D(3U, 4U), PadStrideInfo(1, 2, 0, 0)); + add_config(TensorShape(333U, 79U, 77U), Size2D(3U, 4U), PadStrideInfo(2, 3, 0, 1)); + add_config(TensorShape(67U, 211U, 22U), Size2D(3U, 4U), PadStrideInfo(2, 1, 1, 1)); // Asymmetric padding add_config(TensorShape(33U, 27U, 7U), Size2D(5U, 7U), PadStrideInfo(3, 2, 2, 1, 2, 0, DimensionRoundingType::FLOOR)); add_config(TensorShape(33U, 27U, 7U), Size2D(5U, 7U), PadStrideInfo(3, 2, 1, 3, 0, 2, DimensionRoundingType::FLOOR)); @@ -157,6 +157,24 @@ public: } }; +class LargeDepthwiseConvolutionLayerDatasetFp16Subset final : public DepthwiseConvolutionLayerDataset +{ +public: + LargeDepthwiseConvolutionLayerDatasetFp16Subset() + { + add_config(TensorShape(33U, 27U, 11U), Size2D(3U, 4U), PadStrideInfo(1, 2, 0, 1)); + add_config(TensorShape(17U, 31U, 2U, 3U), Size2D(5U, 9U), PadStrideInfo(1, 2, 1, 1)); + add_config(TensorShape(233U, 109U, 77U), Size2D(1U, 1U), PadStrideInfo(3, 2, 1, 0)); + add_config(TensorShape(177U, 111U, 22U), Size2D(3U, 4U), PadStrideInfo(1, 2, 1, 1)); + add_config(TensorShape(67U, 211U, 22U), Size2D(3U, 4U), PadStrideInfo(2, 1, 1, 1)); + // Asymmetric padding + add_config(TensorShape(33U, 27U, 7U), Size2D(5U, 7U), PadStrideInfo(3, 2, 1, 3, 0, 2, DimensionRoundingType::FLOOR)); + add_config(TensorShape(33U, 27U, 7U), Size2D(5U, 7U), PadStrideInfo(3, 2, 1, 0, 1, 0, DimensionRoundingType::FLOOR)); + // Padding greater than kernel size. + add_config(TensorShape(128, 56, 56), Size2D(4, 4), PadStrideInfo(2, 2, 0, 10, 0, 10, DimensionRoundingType::FLOOR)); + } +}; + /** Dataset containing large kernel size for generic depthwise convolution. */ class LargeKernelSizeDepthwiseConvolutionLayerNHWCDataset final : public DepthwiseConvolutionLayerDataset { @@ -198,21 +216,39 @@ class LargeDepthwiseConvolutionLayerDataset3x3 final : public DepthwiseConvoluti public: LargeDepthwiseConvolutionLayerDataset3x3() { - add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1)); add_config(TensorShape(21U, 31U, 9U, 4U), Size2D(3U, 3U), PadStrideInfo(1, 2, 1, 0)); - add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(1, 2, 0, 1)); - add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(1, 2, 1, 1)); add_config(TensorShape(21U, 31U, 9U, 4U), Size2D(3U, 3U), PadStrideInfo(2, 1, 1, 0)); + add_config(TensorShape(21U, 31U, 9U, 4U), Size2D(3U, 3U), PadStrideInfo(2, 2, 1, 2)); + + add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1)); + add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(1, 2, 1, 2)); + add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(2, 1, 0, 1)); + add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(2, 2, 2, 1)); + + add_config(TensorShape(77U, 209U, 22U), Size2D(3U, 3U), PadStrideInfo(1, 2, 1, 1)); + add_config(TensorShape(123U, 76U, 55U), Size2D(3U, 3U), PadStrideInfo(1, 2, 0, 0)); + add_config(TensorShape(133U, 277U, 77U), Size2D(3U, 3U), PadStrideInfo(2, 3, 0, 0)); + add_config(TensorShape(77U, 95U, 22U), Size2D(3U, 3U), PadStrideInfo(2, 1, 1, 1)); + + // Width and height are a multiple of the processing tile size + add_config(TensorShape(32U, 21U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(1, 1, 0, 1)); + } +}; + +class LargeDepthwiseConvolutionLayerDataset3x3Fp16Subset final : public DepthwiseConvolutionLayerDataset +{ +public: + LargeDepthwiseConvolutionLayerDataset3x3Fp16Subset() + { + add_config(TensorShape(21U, 31U, 9U, 4U), Size2D(3U, 3U), PadStrideInfo(2, 2, 1, 2)); + + 
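As a quick reference for these dataset entries: each add_config pairs an input TensorShape with a kernel Size2D and a PadStrideInfo, and the expected output spatial size follows the usual convolution arithmetic. A minimal sketch of that arithmetic is below; the helper name is illustrative only and is not a library function.

static inline unsigned int conv_out_dim(unsigned int in, unsigned int kernel,
                                        unsigned int pad_begin, unsigned int pad_end,
                                        unsigned int stride)
{
    // FLOOR rounding: unsigned integer division truncates.
    return (in + pad_begin + pad_end - kernel) / stride + 1U;
}
// For example, a 3x3 kernel with PadStrideInfo(1, 1, 1, 1) preserves the spatial size:
// conv_out_dim(33U, 3U, 1U, 1U, 1U) == 33U.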
add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(1, 2, 1, 2)); add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(2, 1, 0, 1)); - add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(2, 1, 1, 1)); - add_config(TensorShape(21U, 31U, 9U, 4U), Size2D(3U, 3U), PadStrideInfo(2, 2, 1, 0)); - add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 1)); - add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(2, 2, 1, 1)); - add_config(TensorShape(177U, 311U, 22U), Size2D(3U, 3U), PadStrideInfo(1, 2, 1, 1)); - add_config(TensorShape(233U, 277U, 55U), Size2D(3U, 3U), PadStrideInfo(1, 2, 0, 0)); - add_config(TensorShape(333U, 277U, 77U), Size2D(3U, 3U), PadStrideInfo(2, 3, 0, 0)); - add_config(TensorShape(177U, 311U, 22U), Size2D(3U, 3U), PadStrideInfo(2, 1, 1, 1)); - // Width and height are a multipile of the processing tile size + + add_config(TensorShape(123U, 76U, 55U), Size2D(3U, 3U), PadStrideInfo(1, 2, 0, 0)); + add_config(TensorShape(77U, 95U, 22U), Size2D(3U, 3U), PadStrideInfo(2, 1, 1, 1)); + + // Width and height are a multiple of the processing tile size add_config(TensorShape(32U, 21U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(1, 1, 0, 1)); } }; @@ -241,14 +277,14 @@ public: LargeOptimizedDepthwiseConvolutionLayerDataset3x3() { // Stride 1 - add_config(TensorShape(233U, 277U, 16U), Size2D(3U, 3U), PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::CEIL)); - add_config(TensorShape(233U, 7U, 16U), Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL)); + add_config(TensorShape(233U, 173U, 16U), Size2D(3U, 3U), PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::CEIL)); + add_config(TensorShape(133U, 7U, 16U), Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL)); add_config(TensorShape(7U, 7U, 21U), Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL)); add_config(TensorShape(28U, 28U, 16U), Size2D(3U, 3U), PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::CEIL)); add_config(TensorShape(28U, 28U, 16U), Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL)); // Stride 2 - add_config(TensorShape(233U, 277U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL)); - add_config(TensorShape(233U, 277U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 1, 1, 1, 1, DimensionRoundingType::CEIL)); + add_config(TensorShape(133U, 97U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL)); + add_config(TensorShape(153U, 77U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 1, 1, 1, 1, DimensionRoundingType::CEIL)); add_config(TensorShape(8U, 8U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::FLOOR)); add_config(TensorShape(8U, 8U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL)); add_config(TensorShape(8U, 8U, 33U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL)); @@ -296,4 +332,4 @@ public: } // namespace datasets } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_DEPTHWISE_CONVOLUTION_DATASET */ +#endif // ACL_TESTS_DATASETS_DEPTHWISECONVOLUTIONLAYERDATASET_H diff --git a/tests/datasets/DilatedDepthwiseConvolutionLayerDataset.h b/tests/datasets/DilatedDepthwiseConvolutionLayerDataset.h index 8ddc71cf5a30f73b8e95407c3ccd05c630512c85..a58650a5e47d47a9dffa2e1c211f012d6f0ae970 100644 --- a/tests/datasets/DilatedDepthwiseConvolutionLayerDataset.h +++ 
b/tests/datasets/DilatedDepthwiseConvolutionLayerDataset.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_TEST_DILATED_CONVOLUTION_LAYER_DATASET -#define ARM_COMPUTE_TEST_DILATED_CONVOLUTION_LAYER_DATASET +#ifndef ACL_TESTS_DATASETS_DILATEDDEPTHWISECONVOLUTIONLAYERDATASET_H +#define ACL_TESTS_DATASETS_DILATEDDEPTHWISECONVOLUTIONLAYERDATASET_H #include "utils/TypePrinter.h" @@ -48,6 +48,7 @@ public: add_config(TensorShape(7U, 7U, 1U), Size2D(3U, 2U), PadStrideInfo(1, 1, 0, 0), Size2D(2U, 1U)); add_config(TensorShape(7U, 7U, 1U), Size2D(3U, 2U), PadStrideInfo(2, 1, 0, 0), Size2D(2U, 2U)); add_config(TensorShape(7U, 7U, 1U), Size2D(3U, 2U), PadStrideInfo(2, 2, 0, 0), Size2D(1U, 2U)); + add_config(TensorShape(7U, 8U, 5U, 9U), Size2D(8U, 6U), PadStrideInfo(2, 3, 1, 1, 1, 3, DimensionRoundingType::CEIL), Size2D(1U, 2U)); add_config(TensorShape(7U, 8U, 1U), Size2D(2U, 3U), PadStrideInfo(1, 2, 0, 0), Size2D(2U, 2U)); add_config(TensorShape(23U, 27U, 5U), Size2D(3U, 5U), PadStrideInfo(2, 1, 0, 0), Size2D(2U, 1U)); @@ -96,15 +97,16 @@ public: LargeDepthwiseDilatedConvolutionLayerDataset() { add_config(TensorShape(33U, 27U, 11U), Size2D(3U, 3U), PadStrideInfo(1, 2, 0, 1), Size2D(2U, 1U)); - add_config(TensorShape(17U, 31U, 2U), Size2D(5U, 9U), PadStrideInfo(1, 2, 1, 1), Size2D(1U, 2U)); add_config(TensorShape(23U, 27U, 5U), Size2D(11U, 3U), PadStrideInfo(1, 2, 0, 0), Size2D(1U, 3U)); + add_config(TensorShape(17U, 31U, 2U), Size2D(5U, 9U), PadStrideInfo(1, 2, 1, 1), Size2D(1U, 2U)); add_config(TensorShape(17U, 31U, 2U, 3U), Size2D(5U, 9U), PadStrideInfo(1, 2, 1, 1), Size2D(2U, 2U)); - add_config(TensorShape(233U, 277U, 55U), Size2D(3U, 3U), PadStrideInfo(2, 1, 0, 0), Size2D(2U, 2U)); - add_config(TensorShape(333U, 277U, 77U), Size2D(3U, 3U), PadStrideInfo(3, 2, 1, 0), Size2D(3U, 2U)); - add_config(TensorShape(177U, 311U, 22U), Size2D(3U, 3U), PadStrideInfo(1, 2, 1, 1), Size2D(2U, 2U)); - add_config(TensorShape(233U, 277U, 55U), Size2D(3U, 3U), PadStrideInfo(1, 2, 0, 0), Size2D(5U, 2U)); - add_config(TensorShape(333U, 277U, 77U), Size2D(3U, 3U), PadStrideInfo(2, 3, 0, 1), Size2D(2U, 2U)); - add_config(TensorShape(177U, 311U, 22U), Size2D(3U, 3U), PadStrideInfo(2, 1, 1, 1), Size2D(2U, 5U)); + + add_config(TensorShape(133U, 177U, 55U), Size2D(3U, 3U), PadStrideInfo(2, 1, 0, 0), Size2D(2U, 2U)); + add_config(TensorShape(233U, 177U, 77U), Size2D(3U, 3U), PadStrideInfo(3, 2, 1, 0), Size2D(3U, 2U)); + add_config(TensorShape(77U, 211U, 22U), Size2D(3U, 3U), PadStrideInfo(1, 2, 1, 1), Size2D(2U, 2U)); + add_config(TensorShape(133U, 177U, 55U), Size2D(3U, 3U), PadStrideInfo(1, 2, 0, 0), Size2D(5U, 2U)); + add_config(TensorShape(233U, 177U, 77U), Size2D(3U, 3U), PadStrideInfo(2, 3, 0, 1), Size2D(2U, 2U)); + add_config(TensorShape(177U, 211U, 22U), Size2D(3U, 3U), PadStrideInfo(2, 1, 1, 1), Size2D(2U, 5U)); // Asymmetric padding add_config(TensorShape(33U, 27U, 7U), Size2D(5U, 7U), PadStrideInfo(3, 2, 2, 1, 2, 0, DimensionRoundingType::FLOOR), Size2D(3U, 2U)); add_config(TensorShape(33U, 27U, 7U), Size2D(5U, 7U), PadStrideInfo(3, 2, 1, 3, 0, 2, DimensionRoundingType::FLOOR), Size2D(4U, 4U)); @@ -113,6 +115,24 @@ public: } }; +class LargeDepthwiseDilatedConvolutionLayerDatasetFp16Subset final : public DepthwiseConvolutionLayerDataset +{ +public: + 
LargeDepthwiseDilatedConvolutionLayerDatasetFp16Subset() + { + add_config(TensorShape(33U, 27U, 11U), Size2D(3U, 3U), PadStrideInfo(1, 2, 0, 1), Size2D(2U, 1U)); + add_config(TensorShape(23U, 27U, 5U), Size2D(11U, 3U), PadStrideInfo(1, 2, 0, 0), Size2D(1U, 3U)); + add_config(TensorShape(17U, 31U, 2U, 3U), Size2D(5U, 9U), PadStrideInfo(1, 2, 1, 1), Size2D(2U, 2U)); + + add_config(TensorShape(77U, 211U, 22U), Size2D(3U, 3U), PadStrideInfo(1, 2, 1, 1), Size2D(2U, 2U)); + add_config(TensorShape(133U, 177U, 55U), Size2D(3U, 3U), PadStrideInfo(1, 2, 0, 0), Size2D(5U, 2U)); + add_config(TensorShape(177U, 211U, 22U), Size2D(3U, 3U), PadStrideInfo(2, 1, 1, 1), Size2D(2U, 5U)); + // Asymmetric padding + add_config(TensorShape(33U, 27U, 7U), Size2D(5U, 7U), PadStrideInfo(3, 2, 1, 3, 0, 2, DimensionRoundingType::FLOOR), Size2D(4U, 4U)); + add_config(TensorShape(33U, 27U, 7U), Size2D(5U, 7U), PadStrideInfo(3, 2, 1, 0, 1, 0, DimensionRoundingType::FLOOR), Size2D(2U, 2U)); + } +}; + /** Dataset containing large, 3x3 depthwise convolution shapes with dilation. */ class LargeDepthwiseDilatedConvolutionLayerDataset3x3 final : public DepthwiseConvolutionLayerDataset { @@ -120,23 +140,44 @@ public: LargeDepthwiseDilatedConvolutionLayerDataset3x3() { add_config(TensorShape(32U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(1, 1, 0, 1), Size2D(2U, 1U)); + add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1), Size2D(2U, 2U)); - add_config(TensorShape(21U, 31U, 9U, 4U), Size2D(3U, 3U), PadStrideInfo(1, 2, 1, 0), Size2D(2U, 2U)); add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(1, 2, 0, 1), Size2D(2U, 1U)); add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(1, 2, 1, 1), Size2D(2U, 3U)); - add_config(TensorShape(21U, 31U, 9U, 4U), Size2D(3U, 3U), PadStrideInfo(2, 1, 1, 0), Size2D(2U, 1U)); add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(2, 1, 0, 1), Size2D(3U, 3U)); add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(2, 1, 1, 1), Size2D(2U, 2U)); + add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 1), Size2D(4U, 4U)); + add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(2, 2, 1, 1), Size2D(2U, 5U)); + + add_config(TensorShape(21U, 31U, 9U, 4U), Size2D(3U, 3U), PadStrideInfo(1, 2, 1, 0), Size2D(2U, 2U)); + add_config(TensorShape(21U, 31U, 9U, 4U), Size2D(3U, 3U), PadStrideInfo(2, 1, 1, 0), Size2D(2U, 1U)); add_config(TensorShape(21U, 31U, 9U, 4U), Size2D(3U, 3U), PadStrideInfo(2, 2, 1, 0), Size2D(2U, 2U)); + + add_config(TensorShape(133U, 177U, 55U), Size2D(3U, 3U), PadStrideInfo(1, 2, 0, 0), Size2D(5U, 5U)); + add_config(TensorShape(233U, 77U, 77U), Size2D(3U, 3U), PadStrideInfo(2, 3, 0, 0), Size2D(4U, 4U)); + add_config(TensorShape(77U, 211U, 22U), Size2D(3U, 3U), PadStrideInfo(1, 2, 1, 1), Size2D(4U, 4U)); + add_config(TensorShape(77U, 111U, 22U), Size2D(3U, 3U), PadStrideInfo(2, 1, 1, 1), Size2D(3U, 3U)); + } +}; + +class LargeDepthwiseDilatedConvolutionLayerDataset3x3Fp16Subset final : public DepthwiseConvolutionLayerDataset +{ +public: + LargeDepthwiseDilatedConvolutionLayerDataset3x3Fp16Subset() + { + add_config(TensorShape(32U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(1, 1, 0, 1), Size2D(2U, 1U)); + + add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(1, 2, 0, 1), Size2D(2U, 1U)); + add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(2, 1, 0, 1), Size2D(3U, 3U)); 
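For the dilated configs, the trailing Size2D is the dilation, and the effective kernel extent becomes dilation * (kernel - 1) + 1, so the same output-size arithmetic applies with the enlarged kernel. A minimal sketch under that assumption; the helper name is illustrative, not part of the library.

static inline unsigned int dilated_conv_out_dim(unsigned int in, unsigned int kernel,
                                                unsigned int dilation, unsigned int pad_begin,
                                                unsigned int pad_end, unsigned int stride)
{
    const unsigned int effective_kernel = dilation * (kernel - 1U) + 1U;
    return (in + pad_begin + pad_end - effective_kernel) / stride + 1U;
}
// e.g. a 3x3 kernel with Size2D(3U, 3U) dilation behaves like a 7x7 kernel:
// dilated_conv_out_dim(33U, 3U, 3U, 0U, 0U, 2U) == 14U.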
add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 1), Size2D(4U, 4U)); add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(2, 2, 1, 1), Size2D(2U, 5U)); - add_config(TensorShape(177U, 311U, 22U), Size2D(3U, 3U), PadStrideInfo(1, 2, 1, 1), Size2D(4U, 4U)); - add_config(TensorShape(233U, 277U, 55U), Size2D(3U, 3U), PadStrideInfo(1, 2, 0, 0), Size2D(5U, 5U)); - add_config(TensorShape(333U, 277U, 77U), Size2D(3U, 3U), PadStrideInfo(2, 3, 0, 0), Size2D(4U, 4U)); - add_config(TensorShape(177U, 311U, 22U), Size2D(3U, 3U), PadStrideInfo(2, 1, 1, 1), Size2D(3U, 3U)); + + add_config(TensorShape(21U, 31U, 9U, 10U), Size2D(3U, 3U), PadStrideInfo(2, 2, 1, 0), Size2D(2U, 2U)); + + add_config(TensorShape(77U, 111U, 22U), Size2D(3U, 3U), PadStrideInfo(2, 1, 1, 1), Size2D(3U, 3U)); } }; } // namespace datasets } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_DILATED_CONVOLUTION_LAYER_DATASET */ \ No newline at end of file +#endif // ACL_TESTS_DATASETS_DILATEDDEPTHWISECONVOLUTIONLAYERDATASET_H diff --git a/tests/datasets/LargeConvolutionLayerDataset.h b/tests/datasets/LargeConvolutionLayerDataset.h index 1cffc9a2219919ef1eaec4d10c7cbae79d8755fe..72f73ba6d9f39f99f3d7937188895e5fc1c10027 100644 --- a/tests/datasets/LargeConvolutionLayerDataset.h +++ b/tests/datasets/LargeConvolutionLayerDataset.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_TEST_LARGE_CONVOLUTION_LAYER_DATASET -#define ARM_COMPUTE_TEST_LARGE_CONVOLUTION_LAYER_DATASET +#ifndef ACL_TESTS_DATASETS_LARGECONVOLUTIONLAYERDATASET_H +#define ACL_TESTS_DATASETS_LARGECONVOLUTIONLAYERDATASET_H #include "tests/datasets/ConvolutionLayerDataset.h" @@ -44,18 +44,31 @@ public: { // Kernel size 3 // Batch size 1 - add_config(TensorShape(224U, 222U, 64U), TensorShape(3U, 3U, 64U, 64U), TensorShape(64U), TensorShape(224U, 222U, 64U), PadStrideInfo(1, 1, 1, 1)); - add_config(TensorShape(112U, 113U, 64U), TensorShape(3U, 3U, 64U, 128U), TensorShape(128U), TensorShape(112U, 113U, 128U), PadStrideInfo(1, 1, 1, 1)); - add_config(TensorShape(112U, 112U, 128U), TensorShape(3U, 3U, 128U, 129U), TensorShape(129U), TensorShape(112U, 112U, 129U), PadStrideInfo(1, 1, 1, 1)); - add_config(TensorShape(53U, 56U, 125U), TensorShape(3U, 3U, 125U, 256U), TensorShape(256U), TensorShape(51U, 54U, 256U), PadStrideInfo(1, 1, 0, 0)); - add_config(TensorShape(56U, 56U, 256U), TensorShape(3U, 3U, 256U, 256U), TensorShape(256U), TensorShape(54U, 54U, 256U), PadStrideInfo(1, 1, 0, 0)); - add_config(TensorShape(28U, 28U, 257U), TensorShape(3U, 3U, 257U, 512U), TensorShape(512U), TensorShape(28U, 28U, 512U), PadStrideInfo(1, 1, 1, 1)); - add_config(TensorShape(28U, 28U, 512U), TensorShape(3U, 3U, 512U, 512U), TensorShape(512U), TensorShape(28U, 28U, 512U), PadStrideInfo(1, 1, 1, 1)); - add_config(TensorShape(14U, 14U, 512U), TensorShape(3U, 3U, 512U, 512U), TensorShape(512U), TensorShape(12U, 12U, 512U), PadStrideInfo(1, 1, 0, 0)); - // Batch size 3, 2 and 4 - add_config(TensorShape(224U, 222U, 64U, 3U), TensorShape(3U, 3U, 64U, 64U), TensorShape(64U), TensorShape(224U, 222U, 64U, 3U), PadStrideInfo(1, 1, 1, 1)); - add_config(TensorShape(112U, 113U, 64U, 2U), TensorShape(3U, 3U, 64U, 128U), TensorShape(128U), TensorShape(110U, 111U, 128U, 2U), PadStrideInfo(1, 1, 0, 0)); - 
add_config(TensorShape(111U, 112U, 127U, 4U), TensorShape(3U, 3U, 127U, 128U), TensorShape(128U), TensorShape(111U, 112U, 128U, 4U), PadStrideInfo(1, 1, 1, 1)); + add_config(TensorShape(224U, 222U, 32U), TensorShape(3U, 3U, 32U, 32U), TensorShape(32U), TensorShape(224U, 222U, 32U), PadStrideInfo(1, 1, 1, 1)); + add_config(TensorShape(112U, 113U, 32U), TensorShape(3U, 3U, 32U, 64U), TensorShape(64U), TensorShape(112U, 113U, 64U), PadStrideInfo(1, 1, 1, 1)); + add_config(TensorShape(112U, 112U, 64U), TensorShape(3U, 3U, 64U, 129U), TensorShape(129U), TensorShape(112U, 112U, 129U), PadStrideInfo(1, 1, 1, 1)); + add_config(TensorShape(53U, 56U, 125U), TensorShape(3U, 3U, 125U, 128U), TensorShape(128U), TensorShape(51U, 54U, 128U), PadStrideInfo(1, 1, 0, 0)); + add_config(TensorShape(56U, 56U, 128U), TensorShape(3U, 3U, 128U, 128U), TensorShape(128U), TensorShape(54U, 54U, 128U), PadStrideInfo(1, 1, 0, 0)); + add_config(TensorShape(28U, 28U, 257U), TensorShape(3U, 3U, 257U, 128U), TensorShape(128U), TensorShape(28U, 28U, 128U), PadStrideInfo(1, 1, 1, 1)); + + // Batch > 1 + add_config(TensorShape(111U, 112U, 127U, 4U), TensorShape(3U, 3U, 127U, 64U), TensorShape(64U), TensorShape(111U, 112U, 64U, 4U), PadStrideInfo(1, 1, 1, 1)); + } +}; + +class LargeWinogradConvolutionLayer3x3DatasetFp16Subset final : public ConvolutionLayerDataset +{ +public: + LargeWinogradConvolutionLayer3x3DatasetFp16Subset() + { + // Kernel size 3 + // Batch size 1 + add_config(TensorShape(224U, 222U, 32U), TensorShape(3U, 3U, 32U, 32U), TensorShape(32U), TensorShape(224U, 222U, 32U), PadStrideInfo(1, 1, 1, 1)); + add_config(TensorShape(112U, 112U, 64U), TensorShape(3U, 3U, 64U, 129U), TensorShape(129U), TensorShape(112U, 112U, 129U), PadStrideInfo(1, 1, 1, 1)); + add_config(TensorShape(56U, 56U, 128U), TensorShape(3U, 3U, 128U, 128U), TensorShape(128U), TensorShape(54U, 54U, 128U), PadStrideInfo(1, 1, 0, 0)); + + // Batch > 1 + add_config(TensorShape(111U, 112U, 127U, 4U), TensorShape(3U, 3U, 127U, 64U), TensorShape(64U), TensorShape(111U, 112U, 64U, 4U), PadStrideInfo(1, 1, 1, 1)); } }; @@ -66,18 +79,31 @@ public: { // Kernel size 3 // Batch size 1 - add_config(TensorShape(224U, 222U, 64U), TensorShape(3U, 1U, 64U, 64U), TensorShape(64U), TensorShape(224U, 222U, 64U), PadStrideInfo(1, 1, 1, 0)); - add_config(TensorShape(112U, 113U, 64U), TensorShape(3U, 1U, 64U, 128U), TensorShape(128U), TensorShape(112U, 113U, 128U), PadStrideInfo(1, 1, 1, 0)); - add_config(TensorShape(112U, 112U, 128U), TensorShape(3U, 1U, 128U, 129U), TensorShape(129U), TensorShape(112U, 112U, 129U), PadStrideInfo(1, 1, 1, 0)); - add_config(TensorShape(53U, 56U, 125U), TensorShape(3U, 1U, 125U, 256U), TensorShape(256U), TensorShape(51U, 56U, 256U), PadStrideInfo(1, 1, 0, 0)); - add_config(TensorShape(56U, 56U, 256U), TensorShape(3U, 1U, 256U, 256U), TensorShape(256U), TensorShape(56U, 56U, 256U), PadStrideInfo(1, 1, 1, 0)); - add_config(TensorShape(28U, 28U, 257U), TensorShape(3U, 1U, 257U, 512U), TensorShape(512U), TensorShape(26U, 28U, 512U), PadStrideInfo(1, 1, 0, 0)); - add_config(TensorShape(28U, 28U, 512U), TensorShape(3U, 1U, 512U, 512U), TensorShape(512U), TensorShape(28U, 28U, 512U), PadStrideInfo(1, 1, 1, 0)); - add_config(TensorShape(14U, 14U, 512U), TensorShape(3U, 1U, 512U, 512U), TensorShape(512U), TensorShape(12U, 14U, 512U), PadStrideInfo(1, 1, 0, 0)); - // Batch size 3, 2 and 4 - add_config(TensorShape(224U, 222U, 64U, 3U), TensorShape(3U, 1U, 64U, 64U), TensorShape(64U), TensorShape(224U, 222U, 64U, 3U), PadStrideInfo(1, 1, 1, 0)); 
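The Winograd configs additionally spell out the expected output shape alongside the input, weights and bias shapes; it follows the same pad/stride arithmetic. A quick, self-contained sanity check for the 3x1 kernels in this hunk (illustrative only, not part of the patch):

// Width: 3-tap kernel, pad 1 on each side, stride 1 -> size preserved.
static_assert((224U + 1U + 1U - 3U) / 1U + 1U == 224U, "3x1 kernel, pad (1, 0): width preserved");
// Height: 1-tap kernel, no padding, stride 1 -> size preserved.
static_assert((222U + 0U + 0U - 1U) / 1U + 1U == 222U, "3x1 kernel, pad (1, 0): height preserved");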
- add_config(TensorShape(112U, 113U, 64U, 2U), TensorShape(3U, 1U, 64U, 128U), TensorShape(128U), TensorShape(110U, 113U, 128U, 2U), PadStrideInfo(1, 1, 0, 0)); - add_config(TensorShape(111U, 112U, 127U, 4U), TensorShape(3U, 1U, 127U, 128U), TensorShape(128U), TensorShape(111U, 112U, 128U, 4U), PadStrideInfo(1, 1, 1, 0)); + add_config(TensorShape(224U, 222U, 32U), TensorShape(3U, 1U, 32U, 32U), TensorShape(32U), TensorShape(224U, 222U, 32U), PadStrideInfo(1, 1, 1, 0)); + add_config(TensorShape(112U, 113U, 32U), TensorShape(3U, 1U, 32U, 64U), TensorShape(64U), TensorShape(112U, 113U, 64U), PadStrideInfo(1, 1, 1, 0)); + add_config(TensorShape(112U, 112U, 64U), TensorShape(3U, 1U, 64U, 129U), TensorShape(129U), TensorShape(112U, 112U, 129U), PadStrideInfo(1, 1, 1, 0)); + add_config(TensorShape(53U, 56U, 125U), TensorShape(3U, 1U, 125U, 128U), TensorShape(128U), TensorShape(51U, 56U, 128U), PadStrideInfo(1, 1, 0, 0)); + add_config(TensorShape(56U, 56U, 128U), TensorShape(3U, 1U, 128U, 128U), TensorShape(128U), TensorShape(56U, 56U, 128U), PadStrideInfo(1, 1, 1, 0)); + add_config(TensorShape(28U, 28U, 257U), TensorShape(3U, 1U, 257U, 128U), TensorShape(128U), TensorShape(26U, 28U, 128U), PadStrideInfo(1, 1, 0, 0)); + + // Batch > 1 + add_config(TensorShape(111U, 112U, 127U, 4U), TensorShape(3U, 1U, 127U, 64U), TensorShape(64U), TensorShape(111U, 112U, 64U, 4U), PadStrideInfo(1, 1, 1, 0)); + } +}; + +class LargeWinogradConvolutionLayer3x1DatasetFp16Subset final : public ConvolutionLayerDataset +{ +public: + LargeWinogradConvolutionLayer3x1DatasetFp16Subset() + { + // Kernel size 3 + // Batch size 1 + add_config(TensorShape(112U, 113U, 32U), TensorShape(3U, 1U, 32U, 64U), TensorShape(64U), TensorShape(112U, 113U, 64U), PadStrideInfo(1, 1, 1, 0)); + add_config(TensorShape(53U, 56U, 125U), TensorShape(3U, 1U, 125U, 128U), TensorShape(128U), TensorShape(51U, 56U, 128U), PadStrideInfo(1, 1, 0, 0)); + add_config(TensorShape(28U, 28U, 257U), TensorShape(3U, 1U, 257U, 128U), TensorShape(128U), TensorShape(26U, 28U, 128U), PadStrideInfo(1, 1, 0, 0)); + + // Batch > 1 + add_config(TensorShape(111U, 112U, 127U, 4U), TensorShape(3U, 1U, 127U, 64U), TensorShape(64U), TensorShape(111U, 112U, 64U, 4U), PadStrideInfo(1, 1, 1, 0)); } }; @@ -88,18 +114,31 @@ public: { // Kernel size 3 // Batch size 1 - add_config(TensorShape(224U, 222U, 64U), TensorShape(1U, 3U, 64U, 64U), TensorShape(64U), TensorShape(224U, 222U, 64U), PadStrideInfo(1, 1, 0, 1)); - add_config(TensorShape(112U, 113U, 64U), TensorShape(1U, 3U, 64U, 128U), TensorShape(128U), TensorShape(112U, 113U, 128U), PadStrideInfo(1, 1, 0, 1)); - add_config(TensorShape(112U, 112U, 128U), TensorShape(1U, 3U, 128U, 129U), TensorShape(129U), TensorShape(112U, 110U, 129U), PadStrideInfo(1, 1, 0, 0)); - add_config(TensorShape(53U, 56U, 125U), TensorShape(1U, 3U, 125U, 256U), TensorShape(256U), TensorShape(53U, 56U, 256U), PadStrideInfo(1, 1, 0, 1)); - add_config(TensorShape(56U, 56U, 256U), TensorShape(1U, 3U, 256U, 256U), TensorShape(256U), TensorShape(56U, 54U, 256U), PadStrideInfo(1, 1, 0, 0)); - add_config(TensorShape(28U, 28U, 257U), TensorShape(1U, 3U, 257U, 512U), TensorShape(512U), TensorShape(28U, 28U, 512U), PadStrideInfo(1, 1, 0, 1)); - add_config(TensorShape(28U, 28U, 512U), TensorShape(1U, 3U, 512U, 512U), TensorShape(512U), TensorShape(28U, 28U, 512U), PadStrideInfo(1, 1, 0, 1)); - add_config(TensorShape(14U, 14U, 512U), TensorShape(1U, 3U, 512U, 512U), TensorShape(512U), TensorShape(14U, 12U, 512U), PadStrideInfo(1, 1, 0, 0)); - // Batch size 3, 2 
and 4 - add_config(TensorShape(224U, 222U, 64U, 3U), TensorShape(1U, 3U, 64U, 64U), TensorShape(64U), TensorShape(224U, 222U, 64U, 3U), PadStrideInfo(1, 1, 0, 1)); - add_config(TensorShape(112U, 113U, 64U, 2U), TensorShape(1U, 3U, 64U, 128U), TensorShape(128U), TensorShape(112U, 113U, 128U, 2U), PadStrideInfo(1, 1, 0, 1)); - add_config(TensorShape(111U, 112U, 127U, 4U), TensorShape(1U, 3U, 127U, 128U), TensorShape(128U), TensorShape(111U, 112U, 128U, 4U), PadStrideInfo(1, 1, 0, 1)); + add_config(TensorShape(224U, 222U, 32U), TensorShape(1U, 3U, 32U, 32U), TensorShape(32U), TensorShape(224U, 222U, 32U), PadStrideInfo(1, 1, 0, 1)); + add_config(TensorShape(112U, 113U, 32U), TensorShape(1U, 3U, 32U, 64U), TensorShape(64U), TensorShape(112U, 113U, 64U), PadStrideInfo(1, 1, 0, 1)); + add_config(TensorShape(112U, 112U, 64U), TensorShape(1U, 3U, 64U, 129U), TensorShape(129U), TensorShape(112U, 110U, 129U), PadStrideInfo(1, 1, 0, 0)); + add_config(TensorShape(53U, 56U, 125U), TensorShape(1U, 3U, 125U, 128U), TensorShape(128U), TensorShape(53U, 56U, 128U), PadStrideInfo(1, 1, 0, 1)); + add_config(TensorShape(56U, 56U, 128U), TensorShape(1U, 3U, 128U, 128U), TensorShape(128U), TensorShape(56U, 54U, 128U), PadStrideInfo(1, 1, 0, 0)); + add_config(TensorShape(28U, 28U, 257U), TensorShape(1U, 3U, 257U, 128U), TensorShape(128U), TensorShape(28U, 28U, 128U), PadStrideInfo(1, 1, 0, 1)); + + // Batch > 1 + add_config(TensorShape(111U, 112U, 127U, 4U), TensorShape(1U, 3U, 127U, 64U), TensorShape(64U), TensorShape(111U, 112U, 64U, 4U), PadStrideInfo(1, 1, 0, 1)); + } +}; + +class LargeWinogradConvolutionLayer1x3DatasetFp16Subset final : public ConvolutionLayerDataset +{ +public: + LargeWinogradConvolutionLayer1x3DatasetFp16Subset() + { + // Kernel size 3 + // Batch size 1 + add_config(TensorShape(112U, 112U, 64U), TensorShape(1U, 3U, 64U, 129U), TensorShape(129U), TensorShape(112U, 110U, 129U), PadStrideInfo(1, 1, 0, 0)); + add_config(TensorShape(53U, 56U, 125U), TensorShape(1U, 3U, 125U, 128U), TensorShape(128U), TensorShape(53U, 56U, 128U), PadStrideInfo(1, 1, 0, 1)); + add_config(TensorShape(28U, 28U, 257U), TensorShape(1U, 3U, 257U, 128U), TensorShape(128U), TensorShape(28U, 28U, 128U), PadStrideInfo(1, 1, 0, 1)); + + // Batch > 1 + add_config(TensorShape(111U, 112U, 127U, 4U), TensorShape(1U, 3U, 127U, 64U), TensorShape(64U), TensorShape(111U, 112U, 64U, 4U), PadStrideInfo(1, 1, 0, 1)); } }; @@ -110,15 +149,27 @@ public: { // Kernel size 5 // Batch size 1 - add_config(TensorShape(224U, 224U, 3U), TensorShape(5U, 5U, 3U, 64U), TensorShape(64U), TensorShape(220U, 220U, 64U), PadStrideInfo(1, 1, 0, 0)); - add_config(TensorShape(123U, 134U, 16U), TensorShape(5U, 5U, 16U, 7U), TensorShape(7U), TensorShape(123U, 134U, 7U), PadStrideInfo(1, 1, 2, 2)); + add_config(TensorShape(224U, 224U, 3U), TensorShape(5U, 5U, 3U, 32U), TensorShape(32U), TensorShape(220U, 220U, 32U), PadStrideInfo(1, 1, 0, 0)); add_config(TensorShape(181U, 152U, 42U), TensorShape(5U, 5U, 42U, 100U), TensorShape(100U), TensorShape(177U, 148U, 100U), PadStrideInfo(1, 1, 0, 0)); add_config(TensorShape(200U, 201U, 24U), TensorShape(5U, 5U, 24U, 61), TensorShape(61U), TensorShape(200U, 201U, 61), PadStrideInfo(1, 1, 2, 2)); - // Batch size 2, 3 and 4 - add_config(TensorShape(224U, 224U, 3U, 2U), TensorShape(5U, 5U, 3U, 64U), TensorShape(64U), TensorShape(220U, 220U, 64U, 2U), PadStrideInfo(1, 1, 0, 0)); + // Batch > 1 + add_config(TensorShape(123U, 134U, 16U, 3U), TensorShape(5U, 5U, 16U, 7U), TensorShape(7U), TensorShape(123U, 134U, 7U, 3U), 
PadStrideInfo(1, 1, 2, 2)); + } +}; + +class LargeWinogradConvolutionLayer5x5DatasetFp16Subset final : public ConvolutionLayerDataset +{ +public: + LargeWinogradConvolutionLayer5x5DatasetFp16Subset() + { + // Kernel size 5 + // Batch size 1 + add_config(TensorShape(181U, 152U, 42U), TensorShape(5U, 5U, 42U, 100U), TensorShape(100U), TensorShape(177U, 148U, 100U), PadStrideInfo(1, 1, 0, 0)); + add_config(TensorShape(200U, 201U, 24U), TensorShape(5U, 5U, 24U, 61), TensorShape(61U), TensorShape(200U, 201U, 61), PadStrideInfo(1, 1, 2, 2)); + + // Batch > 1 add_config(TensorShape(123U, 134U, 16U, 3U), TensorShape(5U, 5U, 16U, 7U), TensorShape(7U), TensorShape(123U, 134U, 7U, 3U), PadStrideInfo(1, 1, 2, 2)); - add_config(TensorShape(181U, 152U, 42U, 4U), TensorShape(5U, 5U, 42U, 100U), TensorShape(100U), TensorShape(177U, 148U, 100U, 4U), PadStrideInfo(1, 1, 0, 0)); } }; @@ -128,15 +179,26 @@ public: LargeWinogradConvolutionLayer5x1Dataset() { // Batch size 1 - add_config(TensorShape(224U, 224U, 3U), TensorShape(5U, 1U, 3U, 64U), TensorShape(64U), TensorShape(224U, 224U, 64U), PadStrideInfo(1, 1, 2, 0)); - add_config(TensorShape(123U, 134U, 16U), TensorShape(5U, 1U, 16U, 7U), TensorShape(7U), TensorShape(123U, 134U, 7U), PadStrideInfo(1, 1, 2, 0)); + add_config(TensorShape(224U, 224U, 3U), TensorShape(5U, 1U, 3U, 32U), TensorShape(32U), TensorShape(224U, 224U, 32U), PadStrideInfo(1, 1, 2, 0)); add_config(TensorShape(181U, 152U, 42U), TensorShape(5U, 1U, 42U, 100U), TensorShape(100U), TensorShape(177U, 152U, 100U), PadStrideInfo(1, 1, 0, 0)); add_config(TensorShape(200U, 201U, 24U), TensorShape(5U, 1U, 24U, 61), TensorShape(61U), TensorShape(200U, 201U, 61), PadStrideInfo(1, 1, 2, 0)); - // Batch size 2, 3 and 4 - add_config(TensorShape(224U, 224U, 3U, 2U), TensorShape(5U, 1U, 3U, 64U), TensorShape(64U), TensorShape(224U, 224U, 64U, 2U), PadStrideInfo(1, 1, 2, 0)); + // Batch > 1 + add_config(TensorShape(123U, 134U, 16U, 3U), TensorShape(5U, 1U, 16U, 7U), TensorShape(7U), TensorShape(123U, 134U, 7U, 3U), PadStrideInfo(1, 1, 2, 0)); + } +}; + +class LargeWinogradConvolutionLayer5x1DatasetFp16Subset final : public ConvolutionLayerDataset +{ +public: + LargeWinogradConvolutionLayer5x1DatasetFp16Subset() + { + // Batch size 1 + add_config(TensorShape(224U, 224U, 3U), TensorShape(5U, 1U, 3U, 32U), TensorShape(32U), TensorShape(224U, 224U, 32U), PadStrideInfo(1, 1, 2, 0)); + add_config(TensorShape(200U, 201U, 24U), TensorShape(5U, 1U, 24U, 61), TensorShape(61U), TensorShape(200U, 201U, 61), PadStrideInfo(1, 1, 2, 0)); + + // Batch > 1 add_config(TensorShape(123U, 134U, 16U, 3U), TensorShape(5U, 1U, 16U, 7U), TensorShape(7U), TensorShape(123U, 134U, 7U, 3U), PadStrideInfo(1, 1, 2, 0)); - add_config(TensorShape(181U, 152U, 42U, 4U), TensorShape(5U, 1U, 42U, 100U), TensorShape(100U), TensorShape(177U, 152U, 100U, 4U), PadStrideInfo(1, 1, 0, 0)); } }; @@ -146,15 +208,12 @@ public: LargeWinogradConvolutionLayer7x1Dataset() { // Batch size 1 - add_config(TensorShape(224U, 224U, 3U), TensorShape(7U, 1U, 3U, 64U), TensorShape(64U), TensorShape(218U, 224U, 64U), PadStrideInfo(1, 1, 0, 0)); - add_config(TensorShape(123U, 134U, 16U), TensorShape(7U, 1U, 16U, 7U), TensorShape(7U), TensorShape(123U, 134U, 7U), PadStrideInfo(1, 1, 3, 0)); + add_config(TensorShape(224U, 224U, 3U), TensorShape(7U, 1U, 3U, 32U), TensorShape(32U), TensorShape(218U, 224U, 32U), PadStrideInfo(1, 1, 0, 0)); add_config(TensorShape(181U, 152U, 42U), TensorShape(7U, 1U, 42U, 100U), TensorShape(100U), TensorShape(175U, 152U, 100U), 
PadStrideInfo(1, 1, 0, 0)); add_config(TensorShape(200U, 201U, 24U), TensorShape(7U, 1U, 24U, 61), TensorShape(61U), TensorShape(200U, 201U, 61), PadStrideInfo(1, 1, 3, 0)); - // Batch size 2, 3 and 4 - add_config(TensorShape(224U, 224U, 3U, 2U), TensorShape(7U, 1U, 3U, 64U), TensorShape(64U), TensorShape(224U, 224U, 64U, 2U), PadStrideInfo(1, 1, 3, 0)); + // Batch > 1 add_config(TensorShape(123U, 134U, 16U, 3U), TensorShape(7U, 1U, 16U, 7U), TensorShape(7U), TensorShape(123U, 134U, 7U, 3U), PadStrideInfo(1, 1, 3, 0)); - add_config(TensorShape(181U, 152U, 42U, 4U), TensorShape(7U, 1U, 42U, 100U), TensorShape(100U), TensorShape(175U, 152U, 100U, 4U), PadStrideInfo(1, 1, 0, 0)); } }; @@ -164,15 +223,26 @@ public: LargeWinogradConvolutionLayer1x7Dataset() { // Batch size 1 - add_config(TensorShape(224U, 224U, 3U), TensorShape(1U, 7U, 3U, 64U), TensorShape(64U), TensorShape(224U, 218U, 64U), PadStrideInfo(1, 1, 0, 0)); - add_config(TensorShape(123U, 134U, 16U), TensorShape(1U, 7U, 16U, 7U), TensorShape(7U), TensorShape(123U, 134U, 7U), PadStrideInfo(1, 1, 0, 3)); + add_config(TensorShape(224U, 224U, 3U), TensorShape(1U, 7U, 3U, 32U), TensorShape(32U), TensorShape(224U, 218U, 32U), PadStrideInfo(1, 1, 0, 0)); + add_config(TensorShape(181U, 152U, 42U), TensorShape(1U, 7U, 42U, 100U), TensorShape(100U), TensorShape(181U, 146U, 100U), PadStrideInfo(1, 1, 0, 0)); + add_config(TensorShape(200U, 201U, 24U), TensorShape(1U, 7U, 24U, 61), TensorShape(61U), TensorShape(200U, 201U, 61), PadStrideInfo(1, 1, 0, 3)); + + // Batch > 1 + add_config(TensorShape(123U, 134U, 16U, 3U), TensorShape(1U, 7U, 16U, 7U), TensorShape(7U), TensorShape(123U, 134U, 7U, 3U), PadStrideInfo(1, 1, 0, 3)); + } +}; + +class LargeWinogradConvolutionLayer1x7DatasetFp16Subset final : public ConvolutionLayerDataset +{ +public: + LargeWinogradConvolutionLayer1x7DatasetFp16Subset() + { + // Batch size 1 add_config(TensorShape(181U, 152U, 42U), TensorShape(1U, 7U, 42U, 100U), TensorShape(100U), TensorShape(181U, 146U, 100U), PadStrideInfo(1, 1, 0, 0)); add_config(TensorShape(200U, 201U, 24U), TensorShape(1U, 7U, 24U, 61), TensorShape(61U), TensorShape(200U, 201U, 61), PadStrideInfo(1, 1, 0, 3)); - // Batch size 2, 3 and 4 - add_config(TensorShape(224U, 224U, 3U, 2U), TensorShape(1U, 7U, 3U, 64U), TensorShape(64U), TensorShape(224U, 224U, 64U, 2U), PadStrideInfo(1, 1, 0, 3)); + // Batch > 1 add_config(TensorShape(123U, 134U, 16U, 3U), TensorShape(1U, 7U, 16U, 7U), TensorShape(7U), TensorShape(123U, 134U, 7U, 3U), PadStrideInfo(1, 1, 0, 3)); - add_config(TensorShape(181U, 152U, 42U, 4U), TensorShape(1U, 7U, 42U, 100U), TensorShape(100U), TensorShape(181U, 146U, 100U, 4U), PadStrideInfo(1, 1, 0, 0)); } }; @@ -182,15 +252,26 @@ public: LargeWinogradConvolutionLayer1x5Dataset() { // Batch size 1 - add_config(TensorShape(224U, 224U, 3U), TensorShape(1U, 5U, 3U, 64U), TensorShape(64U), TensorShape(224U, 224U, 64U), PadStrideInfo(1, 1, 0, 2)); - add_config(TensorShape(123U, 134U, 16U), TensorShape(1U, 5U, 16U, 7U), TensorShape(7U), TensorShape(123U, 130U, 7U), PadStrideInfo(1, 1, 0, 0)); + add_config(TensorShape(224U, 224U, 3U), TensorShape(1U, 5U, 3U, 32U), TensorShape(32U), TensorShape(224U, 224U, 32U), PadStrideInfo(1, 1, 0, 2)); add_config(TensorShape(181U, 152U, 42U), TensorShape(1U, 5U, 42U, 100U), TensorShape(100U), TensorShape(181U, 148U, 100U), PadStrideInfo(1, 1, 0, 0)); add_config(TensorShape(200U, 201U, 24U), TensorShape(1U, 5U, 24U, 61), TensorShape(61U), TensorShape(200U, 201U, 61), PadStrideInfo(1, 1, 0, 2)); - // Batch size 2, 
3 and 4 - add_config(TensorShape(224U, 224U, 3U, 2U), TensorShape(1U, 5U, 3U, 64U), TensorShape(64U), TensorShape(224U, 224U, 64U, 2U), PadStrideInfo(1, 1, 0, 2)); + // Batch size > 1 + add_config(TensorShape(123U, 134U, 16U, 3U), TensorShape(1U, 5U, 16U, 7U), TensorShape(7U), TensorShape(123U, 130U, 7U, 3U), PadStrideInfo(1, 1, 0, 0)); + } +}; + +class LargeWinogradConvolutionLayer1x5DatasetFp16Subset final : public ConvolutionLayerDataset +{ +public: + LargeWinogradConvolutionLayer1x5DatasetFp16Subset() + { + // Batch size 1 + add_config(TensorShape(224U, 224U, 3U), TensorShape(1U, 5U, 3U, 32U), TensorShape(32U), TensorShape(224U, 224U, 32U), PadStrideInfo(1, 1, 0, 2)); + add_config(TensorShape(181U, 152U, 42U), TensorShape(1U, 5U, 42U, 100U), TensorShape(100U), TensorShape(181U, 148U, 100U), PadStrideInfo(1, 1, 0, 0)); + + // Batch size > 1 add_config(TensorShape(123U, 134U, 16U, 3U), TensorShape(1U, 5U, 16U, 7U), TensorShape(7U), TensorShape(123U, 130U, 7U, 3U), PadStrideInfo(1, 1, 0, 0)); - add_config(TensorShape(181U, 152U, 42U, 4U), TensorShape(1U, 5U, 42U, 100U), TensorShape(100U), TensorShape(181U, 148U, 100U, 4U), PadStrideInfo(1, 1, 0, 0)); } }; @@ -233,4 +314,4 @@ public: } // namespace datasets } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_LARGE_CONVOLUTION_LAYER_DATASET */ +#endif // ACL_TESTS_DATASETS_LARGECONVOLUTIONLAYERDATASET_H diff --git a/tests/datasets/MatMulLowpMMULDataset.h b/tests/datasets/MatMulLowpMMULDataset.h new file mode 100644 index 0000000000000000000000000000000000000000..1b22e1061f70ecdc3e9af8faff5227ddf4349c0d --- /dev/null +++ b/tests/datasets/MatMulLowpMMULDataset.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ACL_TESTS_DATASETS_MATMULLOWPMMULDATASET_H +#define ACL_TESTS_DATASETS_MATMULLOWPMMULDATASET_H + +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/Types.h" +#include "tests/datasets/MatMulDataset.h" + +namespace arm_compute +{ +namespace test +{ +namespace datasets +{ +/** MatMulLowp MMUL shapes are similar to MatMul MMUL shapes except that K has to be a + * multiple of MMUL_K0 which is 16 (e.g. 
see src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp for the definition) + */ +class SmallMatMulLowpMMULDataset final : public MatMulDataset +{ +public: + SmallMatMulLowpMMULDataset() + { + add_config(TensorShape(16U, 4U), TensorShape(4U, 16U), TensorShape(4U, 4U)); // same as mmul block + add_config(TensorShape(96U, 1U), TensorShape(1U, 96U), TensorShape(1U, 1U)); // vector x vector + add_config(TensorShape(32U, 4U, 2U), TensorShape(16U, 32U, 2U), TensorShape(16U, 4U, 2U)); + add_config(TensorShape(48U, 2U), TensorShape(17U, 48U), TensorShape(17U, 2U)); + add_config(TensorShape(32U, 6U), TensorShape(7U, 32U), TensorShape(7U, 6U)); + } +}; + +// This dataset is for smaller number of tests that will still use small shapes +// e.g. not repeating everything for QASYMM8 while we're already testing for QASYMM8_SIGNED +class SmallMatMulLowpMMULDatasetSubset final : public MatMulDataset +{ +public: + SmallMatMulLowpMMULDatasetSubset() + { + add_config(TensorShape(32U, 4U, 2U), TensorShape(16U, 32U, 2U), TensorShape(16U, 4U, 2U)); + add_config(TensorShape(32U, 6U), TensorShape(7U, 32U), TensorShape(7U, 6U)); + } +}; + +class SmallMatMulLowpMMULWithBiasDataset final : public MatMulDataset +{ +public: + SmallMatMulLowpMMULWithBiasDataset() + { + add_config(TensorShape(32U, 4U, 2U, 2U), TensorShape(16U, 32U, 2U, 2U), TensorShape(16U, 4U, 2U, 2U)); + } +}; + +class LargeMatMulLowpMMULDataset final : public MatMulDataset +{ +public: + LargeMatMulLowpMMULDataset() + { + add_config(TensorShape(192U, 38U, 3U, 2U), TensorShape(21U, 192U, 3U, 2U), TensorShape(21U, 38U, 3U, 2U)); + } +}; + +class HighDimensionalMatMulLowpMMULDataset final : public MatMulDataset +{ +public: + HighDimensionalMatMulLowpMMULDataset() + { + add_config(TensorShape(16U, 5U, 2U, 2U, 2U, 2U), TensorShape(5U, 16U, 2U, 2U, 2U, 2U), TensorShape(5U, 5U, 2U, 2U, 2U, 2U)); // 6D tensor + } +}; + +} // namespace datasets +} // namespace test +} // namespace arm_compute + +#endif // ACL_TESTS_DATASETS_MATMULLOWPMMULDATASET_H diff --git a/tests/datasets/ReshapeLayerDataset.h b/tests/datasets/ReshapeLayerDataset.h index d1a1667683c3a47c5ea3388e9a14d1afef691b99..015f9157aad6a29477501b73ba175948b24a850b 100644 --- a/tests/datasets/ReshapeLayerDataset.h +++ b/tests/datasets/ReshapeLayerDataset.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 Arm Limited. + * Copyright (c) 2017-2018, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_TEST_RESHAPE_LAYER_DATASET -#define ARM_COMPUTE_TEST_RESHAPE_LAYER_DATASET +#ifndef ACL_TESTS_DATASETS_RESHAPELAYERDATASET_H +#define ACL_TESTS_DATASETS_RESHAPELAYERDATASET_H #include "utils/TypePrinter.h" @@ -111,9 +111,10 @@ public: add_config(TensorShape(17U, 3U, 12U), TensorShape(1U, 1U, 612U)); add_config(TensorShape(26U, 26U, 32U), TensorShape(13U, 13U, 128U)); add_config(TensorShape(31U, 23U, 4U, 7U), TensorShape(2U, 14U, 713U)); + add_config(TensorShape(8U, 8U, 8U), TensorShape(8U, 64U)); } }; } // namespace datasets } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_RESHAPE_LAYER_DATASET */ +#endif // ACL_TESTS_DATASETS_RESHAPELAYERDATASET_H diff --git a/tests/framework/datasets/CartesianProductDataset.h b/tests/framework/datasets/CartesianProductDataset.h index 19ac4f6666d626f3bff4b4e3c798d1f3ed05e986..7b3ff120474c7f22f327b3b661727a111278a1bc 100644 --- a/tests/framework/datasets/CartesianProductDataset.h +++ b/tests/framework/datasets/CartesianProductDataset.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 Arm Limited. + * Copyright (c) 2017-2018, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -182,6 +182,20 @@ CartesianProductDataset combine(T &&dataset1, U &&dataset2) return CartesianProductDataset(std::forward(dataset1), std::forward(dataset2)); } +/** Helper function to create a @ref CartesianProductDataset. + * + * @param[in] dataset1 First dataset. + * @param[in] dataset2 Second dataset. + * @param[in] datasets Subsequent dataset. + * + * @return A grid dataset. + */ +template +auto combine(T1 &&dataset1, T2 &&dataset2, Ts &&... datasets) -> decltype(combine(std::forward(dataset1), combine(std::forward(dataset2), std::forward(datasets)...))) +{ + return combine(std::forward(dataset1), combine(std::forward(dataset2), std::forward(datasets)...)); +} + /** Helper function to create a @ref CartesianProductDataset. * * @param[in] dataset1 First dataset. diff --git a/tests/framework/datasets/ZipDataset.h b/tests/framework/datasets/ZipDataset.h index ce1bb37cab7c10774b867dc2007d157e76aa768c..0b963484c5fcbb0871c7560a498fd11841444158 100644 --- a/tests/framework/datasets/ZipDataset.h +++ b/tests/framework/datasets/ZipDataset.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 Arm Limited. + * Copyright (c) 2017-2018, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -150,6 +150,20 @@ ZipDataset zip(T &&dataset1, U &&dataset2) { return ZipDataset(std::forward(dataset1), std::forward(dataset2)); } + +/** Helper function to create a @ref ZipDataset. + * + * @param[in] dataset1 First dataset. + * @param[in] dataset2 Second dataset. + * @param[in] datasets Subsequent datasets. + * + * @return A zip dataset. + */ +template +auto zip(T1 &&dataset1, T2 &&dataset2, Ts &&... datasets) -> decltype(zip(std::forward(dataset1), zip(std::forward(dataset2), std::forward(datasets)...))) +{ + return zip(std::forward(dataset1), zip(std::forward(dataset2), std::forward(datasets)...)); +} } // namespace dataset } // namespace framework } // namespace test diff --git a/tests/validate_examples/graph_validate_utils.h b/tests/validate_examples/graph_validate_utils.h index 91d62485d1004c185e2b7076a7ed8ab686018b36..c1a83d1f40b43ce4d908cf4f6a59871c566cda67 100644 --- a/tests/validate_examples/graph_validate_utils.h +++ b/tests/validate_examples/graph_validate_utils.h @@ -22,8 +22,8 @@ * SOFTWARE. 
*/ -#ifndef GRAPH_VALIDATE_UTILS_H -#define GRAPH_VALIDATE_UTILS_H +#ifndef ACL_TESTS_VALIDATE_EXAMPLES_GRAPH_VALIDATE_UTILS_H +#define ACL_TESTS_VALIDATE_EXAMPLES_GRAPH_VALIDATE_UTILS_H #include "arm_compute/graph.h" @@ -287,7 +287,6 @@ public: * @param[out] os Output stream. * @param[in] common_params Example parameters to output * - * @return None. */ virtual void print_parameters(::std::ostream &os, const ExampleParams &common_params) { @@ -394,7 +393,6 @@ public: * @param[out] bias The tensor with the bias data. * @param[in] tensor Tensor result of the actual operation passed into the Accessor. * - * @return None. */ virtual void create_tensors(arm_compute::test::SimpleTensor &src, arm_compute::test::SimpleTensor &weights, @@ -680,4 +678,4 @@ public: } // graph_validate_utils } // arm_compute -#endif //GRAPH_VALIDATE_UTILS_H +#endif // ACL_TESTS_VALIDATE_EXAMPLES_GRAPH_VALIDATE_UTILS_H diff --git a/tests/validation/CL/Comparisons.cpp b/tests/validation/CL/Comparisons.cpp index d015528b0ede028b3e884e1469fa56dcdcdf0638..dd3dbd8d59be21b9336b786edd61735d505be9d6 100644 --- a/tests/validation/CL/Comparisons.cpp +++ b/tests/validation/CL/Comparisons.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -59,7 +59,7 @@ TEST_SUITE(Comparison) DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Invalid output type TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Mismatching input types - TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Window shrink + TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Mismatching shapes TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), }), @@ -75,7 +75,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::U8), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), })), - framework::dataset::make("Expected", { false, false, false, false, true})), + framework::dataset::make("Expected", { false, false, true, false, true})), input1_info, input2_info, output_info, expected) { Status s = CLComparison::validate(&input1_info.clone()->set_is_resizable(false), diff --git a/tests/validation/CL/ConvolutionLayer.cpp b/tests/validation/CL/ConvolutionLayer.cpp index bced540d2abf8c7b4c14207c4dd29268713a4512..8820a6a31ed8d27a69d2df9380c78eb193f0d800 100644 --- a/tests/validation/CL/ConvolutionLayer.cpp +++ b/tests/validation/CL/ConvolutionLayer.cpp @@ -22,7 +22,6 @@ * SOFTWARE. 
*/ #include "arm_compute/core/Types.h" -#include "arm_compute/core/experimental/PostOps.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTensorAllocator.h" @@ -48,6 +47,7 @@ namespace test { namespace validation { +using framework::dataset::make; namespace { class SmallConvolutionLayerDatasetCases final : public datasets::ConvolutionLayerDataset @@ -66,56 +66,33 @@ constexpr AbsoluteTolerance tolerance_qasymm8(1); /**< T constexpr float tolerance_num = 0.07f; /**< Tolerance number */ /** CNN data types */ -const auto CNNDataTypes = framework::dataset::make("DataType", +const auto CNNDataTypes = make("DataType", { DataType::F16, - DataType::F32, - DataType::QASYMM8, - DataType::QASYMM8_SIGNED, + DataType::F32, + DataType::QASYMM8, + DataType::QASYMM8_SIGNED, }); /** Grouped CNN data types */ -const auto GroupedCNNDataTypes = framework::dataset::make("DataType", +const auto GroupedCNNDataTypes = make("DataType", { DataType::F16, - DataType::F32 + DataType::F32 }); -const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo", +const auto ActivationFunctionsDataset = make("ActivationInfo", { ActivationLayerInfo(), - ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU), - ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f), - ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f) + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f) }); -const auto ActivationFunctionsSmallDataset = framework::dataset::make("ActivationInfo", +const auto ActivationFunctionsSmallDataset = make("ActivationInfo", { ActivationLayerInfo(), - ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f) + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f) }); - -bool is_post_op_list_valid_in_gemmconv(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &output_shape, DataType data_type, DataLayout data_layout, - const PadStrideInfo &conv_info, const experimental::PostOpList &post_ops) -{ - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); - - const auto dilation = Size2D(1U, 1U); - const unsigned int num_groups = 1U; - - TensorInfo input_info(input_shape, 1, data_type, data_layout); - TensorInfo weights_info(weights_shape, 1, data_type, data_layout); - - TensorInfo output_info(output_shape, 1, data_type, data_layout); - - WeightsInfo w_info(false, weights_info.dimension(idx_width), weights_info.dimension(idx_height), weights_info.dimension(idx_kernels)); - - const auto status = CLGEMMConvolutionLayer::validate(&input_info.clone()->set_is_resizable(true), - &weights_info.clone()->set_is_resizable(true), nullptr, &output_info.clone()->set_is_resizable(true), - conv_info, w_info, dilation, ActivationLayerInfo(), num_groups, post_ops); - return bool(status); -} } // namespace TEST_SUITE(CL) @@ -124,7 +101,7 @@ TEST_SUITE(ConvolutionLayer) // *INDENT-OFF* // clang-format off DATA_TEST_CASE(ValidateConvolutionMethod, 
framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip( - framework::dataset::make("InputInfo", { TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32), // Select GEMM + make("InputInfo", { TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32), // Select GEMM TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32), // Select GEMM TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32), // Select GEMM TensorInfo(TensorShape(23U, 27U, 31U, 4U), 1, DataType::F32), // Select WINOGRAD @@ -134,7 +111,7 @@ DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(z TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32), // Select GEMM TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::QASYMM8_SIGNED), // Select GEMM }), - framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32), + make("WeightsInfo", { TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32), TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32), TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32), TensorInfo(TensorShape(3U, 3U, 31U, 21U), 1, DataType::F32), @@ -144,7 +121,7 @@ DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(z TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32), TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::QASYMM8_SIGNED), })), - framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32), + make("OutputInfo", { TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32), TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32), TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32), TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32), @@ -154,7 +131,7 @@ DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(z TensorInfo(TensorShape(17U, 31U, 19U), 1, DataType::F32), TensorInfo(TensorShape(17U, 31U, 19U), 1, DataType::QASYMM8_SIGNED), })), - framework::dataset::make("ConvInfo", { PadStrideInfo(1, 2, 1, 1), + make("ConvInfo", { PadStrideInfo(1, 2, 1, 1), PadStrideInfo(1, 2, 1, 1), PadStrideInfo(1, 1, 0, 0), PadStrideInfo(1, 1, 0, 0), @@ -164,7 +141,7 @@ DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(z PadStrideInfo(1, 1, 2, 2), PadStrideInfo(1, 1, 2, 2), })), - framework::dataset::make("GpuTarget", { GPUTarget::BIFROST, + make("GpuTarget", { GPUTarget::BIFROST, GPUTarget::MIDGARD, GPUTarget::G71, GPUTarget::G71, @@ -174,7 +151,7 @@ DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(z GPUTarget::BIFROST, GPUTarget::BIFROST, })), - framework::dataset::make("Dilation", { Size2D(1U, 1U), + make("Dilation", { Size2D(1U, 1U), Size2D(1U, 1U), Size2D(1U, 1U), Size2D(1U, 1U), @@ -184,8 +161,8 @@ DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(z Size2D(2U, 1U), Size2D(2U, 1U), })), - framework::dataset::make("EnableFastMath", { false, false, false, false, false, false, true, true, true })), - framework::dataset::make("Expected",{ ConvolutionMethod::GEMM, + make("EnableFastMath", { false, false, false, false, false, false, true, true, true })), + make("Expected",{ ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::WINOGRAD, @@ -207,72 +184,6 @@ DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(z enable_fast_math); ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); } - -DATA_TEST_CASE(ValidatePostOpSupportInConvolutionMethod, 
framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip( - framework::dataset::make("InputInfo", { TensorInfo(TensorShape(2U, 17U, 31U), 1, DataType::F32, DataLayout::NHWC), // Select GEMM - TensorInfo(TensorShape(17U, 31U, 32U), 1, DataType::F32, DataLayout::NCHW), // Select WINOGRAD - TensorInfo(TensorShape(27U, 27U, 48U), 1, DataType::F32, DataLayout::NCHW), // Select Direct - TensorInfo(TensorShape(27U, 27U, 48U), 1, DataType::F32, DataLayout::NCHW), // Select FFT - }), - framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(2U, 1U, 1U, 19U), 1, DataType::F32, DataLayout::NHWC), - TensorInfo(TensorShape(5U, 5U, 32U, 19U), 1, DataType::F32, DataLayout::NCHW), - TensorInfo(TensorShape(5U, 5U, 48U, 128U), 1, DataType::F32, DataLayout::NCHW), - TensorInfo(TensorShape(11U, 11U, 48U, 24), 1, DataType::F32, DataLayout::NCHW), - })), - framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(19U, 17U, 31U), 1, DataType::F32, DataLayout::NHWC), - TensorInfo(TensorShape(17U, 31U, 19U), 1, DataType::F32, DataLayout::NCHW), - TensorInfo(TensorShape(27U, 27U, 128U), 1, DataType::F32, DataLayout::NCHW), - TensorInfo(TensorShape(27U, 27U, 24U), 1, DataType::F32, DataLayout::NCHW), - })), - framework::dataset::make("ConvInfo", { PadStrideInfo(1U, 1U, 0U, 0U), - PadStrideInfo(1U, 1U, 2U, 2U), - PadStrideInfo(1U, 1U, 2U, 2U), - PadStrideInfo(1U, 1U, 5U, 5U), - })), - framework::dataset::make("EnableFastMath", { false, true, false, false})), - framework::dataset::make("ExpectedMethod",{ ConvolutionMethod::GEMM, - ConvolutionMethod::WINOGRAD, - ConvolutionMethod::DIRECT, - ConvolutionMethod::FFT, - })), - framework::dataset::make("PostOpSupported",{ true, false, false, false - })), - input_info, weights_info, output_info, conv_info, enable_fast_math, expected_method, post_op_supported) -{ - const int idx_width = get_data_layout_dimension_index(input_info.data_layout(), DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(input_info.data_layout(), DataLayoutDimension::HEIGHT); - const int idx_kernels = get_data_layout_dimension_index(input_info.data_layout(), DataLayoutDimension::BATCHES); - - const auto dilation = Size2D(1U, 1U); - const unsigned int num_groups = 1U; - - WeightsInfo w_info(false, weights_info.dimension(idx_width), weights_info.dimension(idx_height), weights_info.dimension(idx_kernels)); - - experimental::PostOpList post_ops{}; - post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F}); - - ConvolutionMethod actual_method = CLConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(true), - &weights_info.clone()->set_is_resizable(true), - &output_info.clone()->set_is_resizable(true), conv_info, - WeightsInfo(), - ActivationLayerInfo(), - GPUTarget::BIFROST, - dilation, - enable_fast_math); - ARM_COMPUTE_EXPECT(actual_method == expected_method, framework::LogLevel::ERRORS); - const auto is_valid = CLConvolutionLayer::validate(&input_info.clone()->set_is_resizable(true), - &weights_info.clone()->set_is_resizable(true), - nullptr, - &output_info.clone()->set_is_resizable(true), - conv_info, - w_info, - dilation, - ActivationLayerInfo(), - enable_fast_math, - num_groups, - post_ops); - ARM_COMPUTE_EXPECT( bool(is_valid) == post_op_supported, framework::LogLevel::ERRORS); -} // clang-format on // *INDENT-ON* TEST_SUITE_END() // ConvolutionLayer @@ -285,167 +196,11 @@ using CLGEMMConvolutionLayerMixedDataLayoutFixture = ConvolutionValidationFixtur template using 
CLConvolutionValidationWithPaddingFixture = ConvolutionValidationWithPaddingFixture; -TEST_SUITE(ValidateFusedPostOpsConfigs) -TEST_SUITE(Invalid) -TEST_CASE(UnsupportedPostOpSequence, framework::DatasetMode::ALL) -{ - const auto data_type = DataType::F32; - const auto data_layout = DataLayout::NHWC; - const auto conv_info = PadStrideInfo(1, 1, 0, 0); - const auto input_shape = TensorShape(16U, 14U, 12U, 2U); - const auto weights_shape = TensorShape(16U, 1U, 1U, 24U); - - const auto output_shape = misc::shape_calculator::compute_deep_convolution_shape(input_shape, data_layout, weights_shape, conv_info); - - const TensorShape post_op_arg0_shape(output_shape); - TensorInfo post_op_arg_info(post_op_arg0_shape, 1, data_type); - auto post_op_arg1_info = post_op_arg_info.clone(); - - // Unsupported sequence of post ops - experimental::PostOpList post_ops{}; - post_ops.push_back_op>( - &post_op_arg_info, - 1, - ConvertPolicy::SATURATE); - post_ops.push_back_op>( - post_op_arg1_info.get(), - 0, - ConvertPolicy::SATURATE); - - ARM_COMPUTE_EXPECT(is_post_op_list_valid_in_gemmconv(input_shape, weights_shape, output_shape, data_type, data_layout, conv_info, post_ops) == false, framework::LogLevel::ERRORS); -} -TEST_CASE(OnlyNHWCIsSupported, framework::DatasetMode::ALL) -{ - const auto data_type = DataType::F32; - const auto data_layout = DataLayout::NCHW; - const auto conv_info = PadStrideInfo(1, 1, 0, 0); - const auto input_shape = TensorShape(14U, 12U, 16U, 2U); - const auto weights_shape = TensorShape(1U, 1U, 16U, 24U); - - const auto output_shape = misc::shape_calculator::compute_deep_convolution_shape(input_shape, data_layout, weights_shape, conv_info); - - const TensorShape post_op_arg0_shape(output_shape); - TensorInfo post_op_arg_info(post_op_arg0_shape, 1, data_type); - - experimental::PostOpList post_ops{}; - post_ops.push_back_op>( - &post_op_arg_info, - 1, - ConvertPolicy::SATURATE); - - ARM_COMPUTE_EXPECT(is_post_op_list_valid_in_gemmconv(input_shape, weights_shape, output_shape, data_type, data_layout, conv_info, post_ops) == false, framework::LogLevel::ERRORS); -} -TEST_CASE(OnlyFloatingTypeIsSupported, framework::DatasetMode::ALL) -{ - const auto data_type = DataType::QASYMM8; - const auto data_layout = DataLayout::NHWC; - const auto conv_info = PadStrideInfo(1, 1, 0, 0); - const auto input_shape = TensorShape(16U, 14U, 12U, 2U); - const auto weights_shape = TensorShape(16U, 1U, 1U, 24U); - - const auto output_shape = misc::shape_calculator::compute_deep_convolution_shape(input_shape, data_layout, weights_shape, conv_info); - - const TensorShape post_op_arg0_shape(output_shape); - TensorInfo post_op_arg_info(post_op_arg0_shape, 1, data_type); - - experimental::PostOpList post_ops{}; - post_ops.push_back_op>( - &post_op_arg_info, - 1, - ConvertPolicy::SATURATE); - - ARM_COMPUTE_EXPECT(is_post_op_list_valid_in_gemmconv(input_shape, weights_shape, output_shape, data_type, data_layout, conv_info, post_ops) == false, framework::LogLevel::ERRORS); -} -TEST_CASE(OnlyConv1x1Stride1IsSupported_UnsupportedKernelSize, framework::DatasetMode::ALL) -{ - const auto data_type = DataType::F32; - const auto data_layout = DataLayout::NHWC; - const auto conv_info = PadStrideInfo(1, 1, 0, 0); - const auto input_shape = TensorShape(16U, 14U, 12U, 2U); - const auto weights_shape = TensorShape(16U, 3U, 3U, 24U); - - const auto output_shape = misc::shape_calculator::compute_deep_convolution_shape(input_shape, data_layout, weights_shape, conv_info); - - const TensorShape post_op_arg0_shape(output_shape); - 
TensorInfo post_op_arg_info(post_op_arg0_shape, 1, data_type); - - experimental::PostOpList post_ops{}; - post_ops.push_back_op>( - &post_op_arg_info, - 1, - ConvertPolicy::SATURATE); - - ARM_COMPUTE_EXPECT(is_post_op_list_valid_in_gemmconv(input_shape, weights_shape, output_shape, data_type, data_layout, conv_info, post_ops) == false, framework::LogLevel::ERRORS); -} -TEST_CASE(OnlyConv1x1Stride1IsSupported_UnsupportedStride, framework::DatasetMode::ALL) -{ - const auto data_type = DataType::F32; - const auto data_layout = DataLayout::NHWC; - const auto conv_info = PadStrideInfo(3, 3, 0, 0); - const auto input_shape = TensorShape(16U, 14U, 12U, 2U); - const auto weights_shape = TensorShape(16U, 1U, 1U, 24U); - - const auto output_shape = misc::shape_calculator::compute_deep_convolution_shape(input_shape, data_layout, weights_shape, conv_info); - - const TensorShape post_op_arg0_shape(output_shape); - TensorInfo post_op_arg_info(post_op_arg0_shape, 1, data_type); - - experimental::PostOpList post_ops{}; - post_ops.push_back_op>( - &post_op_arg_info, - 1, - ConvertPolicy::SATURATE); - - ARM_COMPUTE_EXPECT(is_post_op_list_valid_in_gemmconv(input_shape, weights_shape, output_shape, data_type, data_layout, conv_info, post_ops) == false, framework::LogLevel::ERRORS); -} -TEST_SUITE_END() // Invalid -TEST_SUITE(Valid) -TEST_CASE(EmptyPostOpList, framework::DatasetMode::ALL) -{ - const auto data_type = DataType::F32; - const auto data_layout = DataLayout::NHWC; - const auto conv_info = PadStrideInfo(1, 1, 0, 0); - const auto input_shape = TensorShape(16U, 14U, 12U, 2U); - const auto weights_shape = TensorShape(16U, 1U, 1U, 24U); - - const auto output_shape = misc::shape_calculator::compute_deep_convolution_shape(input_shape, data_layout, weights_shape, conv_info); - - experimental::PostOpList post_ops{}; - - ARM_COMPUTE_EXPECT(is_post_op_list_valid_in_gemmconv(input_shape, weights_shape, output_shape, data_type, data_layout, conv_info, post_ops) == true, framework::LogLevel::ERRORS); -} -TEST_CASE(SupportedPostOps, framework::DatasetMode::ALL) -{ - const auto data_type = DataType::F32; - const auto data_layout = DataLayout::NHWC; - const auto conv_info = PadStrideInfo(1, 1, 0, 0); - const auto input_shape = TensorShape(16U, 14U, 12U, 2U); - const auto weights_shape = TensorShape(16U, 1U, 1U, 24U); - - const auto output_shape = misc::shape_calculator::compute_deep_convolution_shape(input_shape, data_layout, weights_shape, conv_info); - - TensorShape post_op_arg0_shape(output_shape); - post_op_arg0_shape[1] = 1; // Broadcast in "Y" (second) dimension - TensorInfo post_op_arg_info(post_op_arg0_shape, 1, data_type); - - experimental::PostOpList post_ops{}; - post_ops.push_back_op>( - &post_op_arg_info, - 1, - ConvertPolicy::SATURATE); - - ARM_COMPUTE_EXPECT(is_post_op_list_valid_in_gemmconv(input_shape, weights_shape, output_shape, data_type, data_layout, conv_info, post_ops) == true, framework::LogLevel::ERRORS); -} -TEST_SUITE_END() // Valid -TEST_SUITE_END() // ValidateFusedPostOps TEST_SUITE(Float) TEST_SUITE(FP16) FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMConvolutionLayerFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(), - framework::dataset::make("ReshapeWeights", { true })), - framework::dataset::make("DataType", - DataType::F16)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + make("ReshapeWeights", { true })), make("DataType", DataType::F16)), make("DataLayout", { DataLayout::NCHW, 
DataLayout::NHWC })), ActivationFunctionsSmallDataset)) { // Validate output @@ -456,10 +211,7 @@ TEST_SUITE_END() // FP16 TEST_SUITE(FP32) FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMConvolutionLayerFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(), - framework::dataset::make("ReshapeWeights", { true })), - framework::dataset::make("DataType", - DataType::F32)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + make("ReshapeWeights", { true })), make("DataType", DataType::F32)), make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), ActivationFunctionsSmallDataset)) { // Validate output @@ -467,15 +219,15 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMConvolutionLayerFixture, framework } FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLGEMMConvolutionLayerMixedDataLayoutFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(combine(combine( - framework::dataset::make("Input", TensorShape(23U, 27U, 5U)), - framework::dataset::make("Weights", TensorShape(3U, 3U, 5U, 2U))), - framework::dataset::make("Bias", TensorShape(2U))), - framework::dataset::make("Output", TensorShape(11U, 25U, 2U))), - framework::dataset::make("PadStrideInfo", PadStrideInfo(2, 1, 0, 0))), - framework::dataset::make("Dilation", Size2D(1, 1))), - framework::dataset::make("ReshapeWeights", { true })), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + make("Input", TensorShape(23U, 27U, 5U)), + make("Weights", TensorShape(3U, 3U, 5U, 2U))), + make("Bias", TensorShape(2U))), + make("Output", TensorShape(11U, 25U, 2U))), + make("PadStrideInfo", PadStrideInfo(2, 1, 0, 0))), + make("Dilation", Size2D(1, 1))), + make("ReshapeWeights", { true })), + make("DataType", DataType::F32)), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), ActivationFunctionsSmallDataset)) { // Validate output @@ -483,11 +235,11 @@ FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLGEMMConvolutionLayerMixedDataLayout } FIXTURE_DATA_TEST_CASE(RunSmallWithPadding, CLConvolutionValidationWithPaddingFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerPrePaddingDataset(), - framework::dataset::make("ReshapeWeights", { true })), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - framework::dataset::make("ActivationInfo", { ActivationLayerInfo() })), -framework::dataset::make("PrePadLayer", { PaddingList({ { 1, 1 }, { 1, 1 } }) }))) + make("ReshapeWeights", { true })), + make("DataType", DataType::F32)), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + make("ActivationInfo", { ActivationLayerInfo() })), +make("PrePadLayer", { PaddingList({ { 1, 1 }, { 1, 1 } }) }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f32); @@ -503,64 +255,108 @@ using CLGEMMConvolutionLayerQuantizedMixedDataLayoutFixture = ConvolutionValidat template using CLGEMMConvolutionLayerQuantizedPerChannelFixture = ConvolutionValidationQuantizedPerChannelFixture; -const auto QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationInfo", -{ - ActivationLayerInfo(), - ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU), - ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f) -}); -const auto 
QuantizedActivationFunctionsSmallDataset = framework::dataset::make("ActivationInfo", -{ - ActivationLayerInfo(), - ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f) -}); - TEST_SUITE(Quantized) -const auto QuantizationData = framework::dataset::make("QuantizationInfo", +const auto QuantizationData = make("QuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(0.3f, 3), QuantizationInfo(1.1f, 10), }); + +/// @note: Every asymmetric quantized test has two versions, one with and one without activation, because the quantization info given +/// is ignored when there is no activation. Instead of using the same quantization information for all the tensors, the +/// fixture generates separate quantization info for each input and the output tensor. +/// Once dynamic quantization is also supported in the presence of an activation, these two versions should be merged +/// again, with the explicitly specified quantization info removed. +const auto NoActivation = make("ActivationInfo", ActivationLayerInfo()); + +const auto IgnoredQuantizationInfo = make("IgnoredQuantizationInfo", QuantizationInfo()); + +const auto QuantizedActivationFunctionsSmallDataset = make("ActivationInfo", +{ + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f) +}); + TEST_SUITE(QASYMM8) FIXTURE_DATA_TEST_CASE(RunSmallCases, CLGEMMConvolutionLayerQuantizedFixture, framework::DatasetMode::ALL, - combine(combine(combine(combine(combine(SmallConvolutionLayerDatasetCases(), - framework::dataset::make("ReshapeWeights", { true })), - framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - QuantizationData), - QuantizedActivationFunctionsSmallDataset)) + combine(SmallConvolutionLayerDatasetCases(), + make("ReshapeWeights", { true }), + make("DataType", DataType::QASYMM8), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }), + IgnoredQuantizationInfo, + NoActivation)) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} + +FIXTURE_DATA_TEST_CASE(RunSmallCasesWithActivation, CLGEMMConvolutionLayerQuantizedFixture, framework::DatasetMode::ALL, + combine(SmallConvolutionLayerDatasetCases(), + make("ReshapeWeights", { true }), + make("DataType", DataType::QASYMM8), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }), + QuantizationData, + QuantizedActivationFunctionsSmallDataset)) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8); } FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMConvolutionLayerQuantizedFixture, framework::DatasetMode::ALL, - combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(), - framework::dataset::make("ReshapeWeights", { true })), - framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - QuantizationData), - QuantizedActivationFunctionsSmallDataset)) + combine(datasets::SmallConvolutionLayerDataset(), + make("ReshapeWeights", { true }), + make("DataType", DataType::QASYMM8), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }), + IgnoredQuantizationInfo, + NoActivation)) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} + +FIXTURE_DATA_TEST_CASE(RunSmallWithActivation, CLGEMMConvolutionLayerQuantizedFixture, framework::DatasetMode::ALL, + combine(datasets::SmallConvolutionLayerDataset(), + make("ReshapeWeights", { true }), +
make("DataType", DataType::QASYMM8), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }), + QuantizationData, + QuantizedActivationFunctionsSmallDataset)) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8); } FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLGEMMConvolutionLayerQuantizedMixedDataLayoutFixture, framework::DatasetMode::ALL, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( - framework::dataset::make("Input", TensorShape(23U, 27U, 5U)), - framework::dataset::make("Weights", TensorShape(3U, 3U, 5U, 2U))), - framework::dataset::make("Bias", TensorShape(2U))), - framework::dataset::make("Output", TensorShape(11U, 25U, 2U))), - framework::dataset::make("PadStrideInfo", PadStrideInfo(2, 1, 0, 0))), - framework::dataset::make("Dilation", Size2D(1, 1))), - framework::dataset::make("ReshapeWeights", { true })), - framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - QuantizationData), - QuantizedActivationFunctionsSmallDataset)) + combine( + make("Input", TensorShape(23U, 27U, 5U)), + make("Weights", TensorShape(3U, 3U, 5U, 2U)), + make("Bias", TensorShape(2U)), + make("Output", TensorShape(11U, 25U, 2U)), + make("PadStrideInfo", PadStrideInfo(2, 1, 0, 0)), + make("Dilation", Size2D(1, 1)), + make("ReshapeWeights", { true }), + make("DataType", DataType::QASYMM8), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }), + IgnoredQuantizationInfo, + NoActivation)) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(RunMixedDataLayoutWithActivation, CLGEMMConvolutionLayerQuantizedMixedDataLayoutFixture, framework::DatasetMode::ALL, + combine( + make("Input", TensorShape(23U, 27U, 5U)), + make("Weights", TensorShape(3U, 3U, 5U, 2U)), + make("Bias", TensorShape(2U)), + make("Output", TensorShape(11U, 25U, 2U)), + make("PadStrideInfo", PadStrideInfo(2, 1, 0, 0)), + make("Dilation", Size2D(1, 1)), + make("ReshapeWeights", { true }), + make("DataType", DataType::QASYMM8), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }), + QuantizationData, + QuantizedActivationFunctionsSmallDataset)) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8); @@ -568,44 +364,78 @@ FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLGEMMConvolutionLayerQuantizedMixedD TEST_SUITE_END() // QASYMM8 TEST_SUITE(QASYMM8_SIGNED) FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMConvolutionLayerQuantizedFixture, framework::DatasetMode::ALL, - combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(), - framework::dataset::make("ReshapeWeights", { true })), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - QuantizationData), - QuantizedActivationFunctionsSmallDataset)) + combine(datasets::SmallConvolutionLayerDataset(), + make("ReshapeWeights", { true }), + make("DataType", DataType::QASYMM8_SIGNED), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }), + IgnoredQuantizationInfo, + NoActivation)) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(RunSmallWithActivation, CLGEMMConvolutionLayerQuantizedFixture, framework::DatasetMode::ALL, + combine(datasets::SmallConvolutionLayerDataset(), + make("ReshapeWeights", { true }), + make("DataType", DataType::QASYMM8_SIGNED), + 
make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }), + QuantizationData, + QuantizedActivationFunctionsSmallDataset)) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8); } FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLGEMMConvolutionLayerQuantizedMixedDataLayoutFixture, framework::DatasetMode::ALL, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( - framework::dataset::make("Input", TensorShape(23U, 27U, 5U)), - framework::dataset::make("Weights", TensorShape(3U, 3U, 5U, 2U))), - framework::dataset::make("Bias", TensorShape(2U))), - framework::dataset::make("Output", TensorShape(11U, 25U, 2U))), - framework::dataset::make("PadStrideInfo", PadStrideInfo(2, 1, 0, 0))), - framework::dataset::make("Dilation", Size2D(1, 1))), - framework::dataset::make("ReshapeWeights", { true })), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - QuantizationData), - QuantizedActivationFunctionsSmallDataset)) + combine( + make("Input", TensorShape(23U, 27U, 5U)), + make("Weights", TensorShape(3U, 3U, 5U, 2U)), + make("Bias", TensorShape(2U)), + make("Output", TensorShape(11U, 25U, 2U)), + make("PadStrideInfo", PadStrideInfo(2, 1, 0, 0)), + make("Dilation", Size2D(1, 1)), + make("ReshapeWeights", { true }), + make("DataType", DataType::QASYMM8_SIGNED), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }), + IgnoredQuantizationInfo, + NoActivation)) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(RunMixedDataLayoutWithActivation, CLGEMMConvolutionLayerQuantizedMixedDataLayoutFixture, framework::DatasetMode::ALL, + combine( + make("Input", TensorShape(23U, 27U, 5U)), + make("Weights", TensorShape(3U, 3U, 5U, 2U)), + make("Bias", TensorShape(2U)), + make("Output", TensorShape(11U, 25U, 2U)), + make("PadStrideInfo", PadStrideInfo(2, 1, 0, 0)), + make("Dilation", Size2D(1, 1)), + make("ReshapeWeights", { true }), + make("DataType", DataType::QASYMM8_SIGNED), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }), + QuantizationData, + QuantizedActivationFunctionsSmallDataset)) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8); } TEST_SUITE_END() // QASYMM8_SIGNED TEST_SUITE(QSYMM8_PER_CHANNEL) +const auto QuantizedActivationFunctionsSmallPerChannelDataset = make("ActivationInfo", +{ + ActivationLayerInfo(), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f) +}); + FIXTURE_DATA_TEST_CASE(RunSmallSigned, CLGEMMConvolutionLayerQuantizedPerChannelFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(), - framework::dataset::make("ReshapeWeights", { true })), - framework::dataset::make("DataType", { DataType::QASYMM8_SIGNED })), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + make("ReshapeWeights", { true })), + make("DataType", { DataType::QASYMM8_SIGNED })), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), QuantizationData), - QuantizedActivationFunctionsSmallDataset), - framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL }))) + QuantizedActivationFunctionsSmallPerChannelDataset), + make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8); @@ -613,12 +443,12 @@ 
FIXTURE_DATA_TEST_CASE(RunSmallSigned, CLGEMMConvolutionLayerQuantizedPerChannel FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMConvolutionLayerQuantizedPerChannelFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(), - framework::dataset::make("ReshapeWeights", { true })), - framework::dataset::make("DataType", { DataType::QASYMM8 })), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + make("ReshapeWeights", { true })), + make("DataType", { DataType::QASYMM8 })), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), QuantizationData), - QuantizedActivationFunctionsSmallDataset), - framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL }))) + QuantizedActivationFunctionsSmallPerChannelDataset), + make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8); @@ -637,9 +467,7 @@ TEST_SUITE(Float) TEST_SUITE(FP32) FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMGroupedConvolutionLayerFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallGroupedConvolutionLayerDataset(), - framework::dataset::make("ReshapeWeights", { true })), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", { DataLayout::NCHW })), + make("ReshapeWeights", { true })), make("DataType", DataType::F32)), make("DataLayout", { DataLayout::NCHW })), ActivationFunctionsSmallDataset)) { // Validate output @@ -648,9 +476,9 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMGroupedConvolutionLayerFixture, fr FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMGroupedConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeGroupedConvolutionLayerDataset(), - framework::dataset::make("ReshapeWeights", { true })), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", { DataLayout::NCHW })), + make("ReshapeWeights", { true })), + make("DataType", DataType::F32)), + make("DataLayout", { DataLayout::NCHW })), ActivationFunctionsDataset)) { // Validate output @@ -661,9 +489,7 @@ TEST_SUITE_END() // FP32 TEST_SUITE(FP16) FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMGroupedConvolutionLayerFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallGroupedConvolutionLayerDataset(), - framework::dataset::make("ReshapeWeights", { true })), - framework::dataset::make("DataType", DataType::F16)), - framework::dataset::make("DataLayout", { DataLayout::NCHW })), + make("ReshapeWeights", { true })), make("DataType", DataType::F16)), make("DataLayout", { DataLayout::NCHW })), ActivationFunctionsSmallDataset)) { // Validate output @@ -672,9 +498,9 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMGroupedConvolutionLayerFixture, fra FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMGroupedConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeGroupedConvolutionLayerDataset(), - framework::dataset::make("ReshapeWeights", { true })), - framework::dataset::make("DataType", DataType::F16)), - framework::dataset::make("DataLayout", { DataLayout::NCHW })), + make("ReshapeWeights", { true })), + make("DataType", DataType::F16)), + make("DataLayout", { DataLayout::NCHW })), ActivationFunctionsDataset)) { // Validate output diff --git a/tests/validation/CL/DeconvolutionLayer.cpp b/tests/validation/CL/DeconvolutionLayer.cpp index 
6b12fc01a10666c7c0c37e1ab77ed24a7550751d..d1508fd90221ecc382b267c13c2c698a724206cd 100644 --- a/tests/validation/CL/DeconvolutionLayer.cpp +++ b/tests/validation/CL/DeconvolutionLayer.cpp @@ -53,35 +53,29 @@ const auto data9x9_small_asymm = framework::dataset::make("InputShape", TensorSh *framework::dataset::make("PadLeft", 3) *framework::dataset::make("PadRight", 4) *framework::dataset::make("PadTop", 3) *framework::dataset::make("PadBottom", 4) *framework::dataset::make("NumKernels", { 1 }); -const auto data9x9_large_asymm = framework::dataset::make("InputShape", TensorShape{ 640U, 360U, 56U, 1U }) *framework::dataset::make("StrideX", 2) *framework::dataset::make("StrideY", - 2) - *framework::dataset::make("PadLeft", 3) - *framework::dataset::make("PadRight", 4) *framework::dataset::make("PadTop", 3) *framework::dataset::make("PadBottom", 4) *framework::dataset::make("NumKernels", { 1 }); - -const auto data4x4 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 4) * framework::dataset::make("StrideY", 1, 4) * framework::dataset::make("PadX", 0, 3) - * framework::dataset::make("PadY", 0, 3) * framework::dataset::make("NumKernels", { 3 }); - -const auto data3x3 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 4) * framework::dataset::make("StrideY", 1, 4) * framework::dataset::make("PadX", 0, 2) +const auto data4x4 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 4, 2) * framework::dataset::make("StrideY", 2, 4) * framework::dataset::make("PadX", 1, 3) * framework::dataset::make("PadY", 0, 2) * framework::dataset::make("NumKernels", { 3 }); +const auto data3x3 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 2, 4) * framework::dataset::make("StrideY", 1, 4, 2) * framework::dataset::make("PadX", 1, 2) + * framework::dataset::make("PadY", 1, 3) * framework::dataset::make("NumKernels", { 3 }); + const auto data3x3_asymm = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 2) * framework::dataset::make("StrideY", 1, 2) * framework::dataset::make("PadLeft", 0, 1) * framework::dataset::make("PadRight", 0, 1) * framework::dataset::make("PadTop", 0, 1) * framework::dataset::make("PadBottom", 0, 1) * framework::dataset::make("NumKernels", { 3 }); const auto data3x3_precommit = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 2) * framework::dataset::make("StrideY", 1, 2) * framework::dataset::make("PadX", 0, 2) * framework::dataset::make("PadY", 0, 2) * framework::dataset::make("NumKernels", { 3 }); -const auto data3x3_precommit_large_channels = datasets::SmallDeconvolutionShapesWithLargerChannels() * framework::dataset::make("StrideX", 2) * framework::dataset::make("StrideY", - 2) +const auto data3x3_precommit_large_channels = datasets::SmallDeconvolutionShapesWithLargerChannels() * framework::dataset::make("StrideX", 2) * framework::dataset::make("StrideY", 2) * framework::dataset::make("PadX", 1) * framework::dataset::make("PadY", 2) * framework::dataset::make("NumKernels", { 5 }); const auto data2x2_precommit = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 2) * framework::dataset::make("StrideY", 2) * framework::dataset::make("PadX", 1) * framework::dataset::make("PadY", 1) * framework::dataset::make("NumKernels", { 3 }); -const auto data1x1 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 4) * framework::dataset::make("StrideY", 1, 4) * 
framework::dataset::make("PadX", 0, 1) +const auto data1x1 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 4, 2) * framework::dataset::make("StrideY", 2, 4) * framework::dataset::make("PadX", 0, 1) * framework::dataset::make("PadY", 0, 1) * framework::dataset::make("NumKernels", { 3 }); -const auto data5x1 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 4) * framework::dataset::make("StrideY", 1, 4) * framework::dataset::make("PadX", 0, 1) +const auto data5x1 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 2, 4) * framework::dataset::make("StrideY", 1, 4, 2) * framework::dataset::make("PadX", 0, 1) * framework::dataset::make("PadY", 0, 1) * framework::dataset::make("NumKernels", { 3 }); const auto data_layouts_dataset = framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }); @@ -233,8 +227,8 @@ FIXTURE_DATA_TEST_CASE(RunAsymm, CLDeconvolutionLayerAsymmFixture3x3, fra validate(CLAccessor(_target), _reference, tolerance_fp32); } FIXTURE_DATA_TEST_CASE(RunLarge, CLDeconvolutionLayerFixture3x3, framework::DatasetMode::NIGHTLY, combine(combine(combine(data3x3, framework::dataset::make("DataType", DataType::F32)), - data_layouts_dataset), - add_bias_dataset)) + framework::dataset::make("DataLayout", { DataLayout::NHWC })), + framework::dataset::make("AddBias", { true }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_fp32); @@ -255,7 +249,7 @@ TEST_SUITE_END() // W2x2 TEST_SUITE(W1x1) FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerFixture1x1, framework::DatasetMode::NIGHTLY, combine(combine(combine(data1x1, framework::dataset::make("DataType", DataType::F32)), data_layouts_dataset), - add_bias_dataset)) + framework::dataset::make("AddBias", { true }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_fp32); @@ -275,7 +269,7 @@ TEST_SUITE_END() // W9x9 TEST_SUITE(W5x1) FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerFixture5x1, framework::DatasetMode::NIGHTLY, combine(combine(combine(data5x1, framework::dataset::make("DataType", DataType::F32)), data_layouts_dataset), - add_bias_dataset)) + framework::dataset::make("AddBias", { true }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_fp32); @@ -289,7 +283,7 @@ TEST_SUITE(FP16) TEST_SUITE(W4x4) FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerFixture4x4, framework::DatasetMode::NIGHTLY, combine(combine(combine(data4x4, framework::dataset::make("DataType", DataType::F16)), data_layouts_dataset), - add_bias_dataset)) + framework::dataset::make("AddBias", { true }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num); @@ -306,8 +300,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLDeconvolutionLayerFixture3x3, framework validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num); } FIXTURE_DATA_TEST_CASE(RunLarge, CLDeconvolutionLayerFixture3x3, framework::DatasetMode::NIGHTLY, combine(combine(combine(data3x3, framework::dataset::make("DataType", DataType::F16)), - data_layouts_dataset), - add_bias_dataset)) + framework::dataset::make("DataLayout", { DataLayout::NHWC })), + framework::dataset::make("AddBias", { true }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num); @@ -328,7 +322,7 @@ TEST_SUITE_END() // W2x2 TEST_SUITE(W1x1) FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerFixture1x1, framework::DatasetMode::NIGHTLY, combine(combine(combine(data1x1, 
framework::dataset::make("DataType", DataType::F16)), data_layouts_dataset), - add_bias_dataset)) + framework::dataset::make("AddBias", { true }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num); @@ -338,7 +332,7 @@ TEST_SUITE_END() // W1x1 TEST_SUITE(W5x1) FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerFixture5x1, framework::DatasetMode::NIGHTLY, combine(combine(combine(data5x1, framework::dataset::make("DataType", DataType::F16)), data_layouts_dataset), - add_bias_dataset)) + framework::dataset::make("AddBias", { true }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num); @@ -387,7 +381,7 @@ FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerQuantizedFixture4x4, fr data_layouts_dataset), framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 10), QuantizationInfo(2.f / 255.f, 5) })), framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 5), QuantizationInfo(4.f / 255.f, 10) })), - add_bias_dataset)) + framework::dataset::make("AddBias", { true }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8, tolerance_num); @@ -407,12 +401,11 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLDeconvolutionLayerQuantizedFixture3x3, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(data3x3, - framework::dataset::make("DataType", - DataType::QASYMM8)), - data_layouts_dataset), + framework::dataset::make("DataType", DataType::QASYMM8)), + framework::dataset::make("DataLayout", { DataLayout::NHWC })), framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 10), QuantizationInfo(2.f / 255.f, 128) })), framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 128), QuantizationInfo(4.f / 255.f, 128) })), - add_bias_dataset)) + framework::dataset::make("AddBias", { true }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8, tolerance_num); @@ -438,7 +431,7 @@ FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerQuantizedFixture1x1, fr data_layouts_dataset), framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 0), QuantizationInfo(2.f / 255.f, 0) })), framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 0), QuantizationInfo(4.f / 255.f, 0) })), - add_bias_dataset)) + framework::dataset::make("AddBias", { true }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8, tolerance_num); @@ -451,7 +444,7 @@ FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerQuantizedFixture5x1, fr data_layouts_dataset), framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 10), QuantizationInfo(2.f / 255.f, 5) })), framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 5), QuantizationInfo(4.f / 255.f, 10) })), - add_bias_dataset)) + framework::dataset::make("AddBias", { true }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8, tolerance_num); @@ -471,7 +464,7 @@ FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerQuantizedFixture4x4, fra data_layouts_dataset), framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 10), QuantizationInfo(2.f / 255.f, 5) })), framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 5), QuantizationInfo(4.f / 255.f, 10) })), - add_bias_dataset)) + framework::dataset::make("AddBias", { true }))) { // Validate output 
validate(CLAccessor(_target), _reference, tolerance_qasymm8, tolerance_num); @@ -493,12 +486,11 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLDeconvolutionLayerQuantizedFixture3x3 } FIXTURE_DATA_TEST_CASE(RunLarge, CLDeconvolutionLayerQuantizedFixture3x3, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(data3x3, - framework::dataset::make("DataType", - DataType::QASYMM8_SIGNED)), - data_layouts_dataset), + framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), + framework::dataset::make("DataLayout", { DataLayout::NHWC })), framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, -10), QuantizationInfo(2.f / 255.f, 127) })), framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 64), QuantizationInfo(4.f / 255.f, -128) })), - add_bias_dataset)) + framework::dataset::make("AddBias", { true }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8, tolerance_num); @@ -524,7 +516,7 @@ FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerQuantizedFixture1x1, fra data_layouts_dataset), framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 0), QuantizationInfo(2.f / 255.f, 0) })), framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 0), QuantizationInfo(4.f / 255.f, 0) })), - add_bias_dataset)) + framework::dataset::make("AddBias", { true }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8, tolerance_num); @@ -537,7 +529,7 @@ FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerQuantizedFixture5x1, fra data_layouts_dataset), framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 10), QuantizationInfo(2.f / 255.f, 5) })), framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 5), QuantizationInfo(4.f / 255.f, 10) })), - add_bias_dataset)) + framework::dataset::make("AddBias", { true }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8, tolerance_num); @@ -559,7 +551,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLDeconvolutionLayerQuantizedPerChannelFixture4 data_layouts_dataset), input_qinfo_dataset), output_qinfo_dataset), - add_bias_dataset), + framework::dataset::make("AddBias", { true })), framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL }))) { // Validate output @@ -570,7 +562,7 @@ FIXTURE_DATA_TEST_CASE(RunSmallSigned, CLDeconvolutionLayerQuantizedPerChannelFi data_layouts_dataset), input_signed_qinfo_dataset), output_signed_qinfo_dataset), - add_bias_dataset), + framework::dataset::make("AddBias", { true })), framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL }))) { // Validate output @@ -584,7 +576,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLDeconvolutionLayerQuantizedPerChannelFixture3 data_layouts_dataset), input_qinfo_dataset), output_qinfo_dataset), - add_bias_dataset), + framework::dataset::make("AddBias", { true })), framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL }))) { // Validate output @@ -595,7 +587,7 @@ FIXTURE_DATA_TEST_CASE(RunSmallSigned, CLDeconvolutionLayerQuantizedPerChannelFi data_layouts_dataset), input_signed_qinfo_dataset), output_signed_qinfo_dataset), - add_bias_dataset), + framework::dataset::make("AddBias", { true })), framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL }))) { // Validate output @@ -659,7 +651,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLDeconvolutionLayerQuantizedPerChannelFixture1 
data_layouts_dataset), input_qinfo_dataset), output_qinfo_dataset), - add_bias_dataset), + framework::dataset::make("AddBias", { false })), framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL }))) { // Validate output @@ -670,7 +662,7 @@ FIXTURE_DATA_TEST_CASE(RunSmallSigned, CLDeconvolutionLayerQuantizedPerChannelFi data_layouts_dataset), input_signed_qinfo_dataset), output_signed_qinfo_dataset), - add_bias_dataset), + framework::dataset::make("AddBias", { true })), framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL }))) { // Validate output @@ -684,7 +676,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLDeconvolutionLayerQuantizedPerChannelFixture5 data_layouts_dataset), input_qinfo_dataset), output_qinfo_dataset), - add_bias_dataset), + framework::dataset::make("AddBias", { true })), framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL }))) { // Validate output @@ -695,7 +687,7 @@ FIXTURE_DATA_TEST_CASE(RunSmallSigned, CLDeconvolutionLayerQuantizedPerChannelFi data_layouts_dataset), input_signed_qinfo_dataset), output_signed_qinfo_dataset), - add_bias_dataset), + framework::dataset::make("AddBias", { false })), framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL }))) { // Validate output diff --git a/tests/validation/CL/DepthwiseConvolutionLayer.cpp b/tests/validation/CL/DepthwiseConvolutionLayer.cpp index 5d5562f6785c0d6034af25d989e2e91a8e7c94ec..04612a689de314a01edcb5d4cf2bc9a6b14d14b1 100644 --- a/tests/validation/CL/DepthwiseConvolutionLayer.cpp +++ b/tests/validation/CL/DepthwiseConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -41,6 +41,9 @@ namespace test { namespace validation { + +using framework::dataset::make; + namespace { RelativeTolerance tolerance_f16(half_float::half(0.01)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */ @@ -48,14 +51,36 @@ constexpr RelativeTolerance tolerance_f32(0.01f); /**< constexpr AbsoluteTolerance tolerance_qasymm8(0); /**< Tolerance value for comparing reference's output against implementation's output for DataType::QASYMM8 */ constexpr float tolerance_num = 0.05f; /**< Tolerance number */ -const auto depth_multipliers = framework::dataset::make("DepthMultiplier", { 1, 4 }); -const auto large_depth_multipliers = framework::dataset::make("DepthMultiplier", { 1, 2, 5, 8 }); +const auto depth_multipliers = make("DepthMultiplier", { 1, 4 }); +const auto large_depth_multipliers = make("DepthMultiplier", { 2, 5, 8 }); //Activation Functions -const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo", +const auto ActivationFunctionsSmallDataset = make("ActivationInfo", { ActivationLayerInfo(), - ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f, 0.f) + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 2.f, 0.f) +}); + +const auto ActivationFunctionsDataset = make("ActivationInfo", +{ + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.8f, -0.5f), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SOFT_RELU), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ELU), + 
ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ABS), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SQUARE), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::HARD_SWISH), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 2.f, 1.f), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::GELU) +}); + +const auto ActivationFunctionsQuantizedDataset = make("ActivationInfo", +{ + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 2.3f, -1.5f), }); } // namespace @@ -64,85 +89,85 @@ TEST_SUITE(DepthwiseConvolutionLayer) // *INDENT-OFF* // clang-format off -DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip( - framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Mismatching data type input/weights - TensorInfo(TensorShape(27U, 13U, 3U), 1, DataType::F32), // Mismatching input feature maps - TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Mismatching depth multiplier - TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid biases size - TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid biases dimensions - TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid output size - TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32), // patch size bigger than input width - TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32), // dilation < 1 - TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32), - TensorInfo(TensorShape(32U, 13U, 8U), 1, DataType::QASYMM8), - }), - framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F16), - TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32), - TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32), - TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32), - TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32), - TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32), - TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32), - TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32), - TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32), - TensorInfo(TensorShape(3U, 3U, 24U), 1, DataType::QASYMM8), - })), - framework::dataset::make("BiasesInfo", { TensorInfo(TensorShape(2U), 1, DataType::F32), - TensorInfo(TensorShape(2U), 1, DataType::F32), - TensorInfo(TensorShape(2U), 1, DataType::F32), - TensorInfo(TensorShape(4U), 1, DataType::F32), - TensorInfo(TensorShape(2U, 2U), 1, DataType::F32), - TensorInfo(TensorShape(2U), 1, DataType::F32), - TensorInfo(TensorShape(16U), 1, DataType::F32), - TensorInfo(TensorShape(16U), 1, DataType::F32), - TensorInfo(TensorShape(16U), 1, DataType::F32), - TensorInfo(TensorShape(24U), 1, DataType::S32), - })), - framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32), - TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32), - TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32), - TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32), - TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32), - TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), - TensorInfo(TensorShape(25U, 11U, 16U), 1, DataType::F32), - TensorInfo(TensorShape(25U, 11U, 16U), 1, DataType::F32), - TensorInfo(TensorShape(25U, 11U, 
16U), 1, DataType::F32), - TensorInfo(TensorShape(32U, 11U, 24U), 1, DataType::QASYMM8), - })), - framework::dataset::make("ConvInfo", { PadStrideInfo(1, 1, 0, 0), - PadStrideInfo(1, 1, 0, 0), - PadStrideInfo(1, 1, 0, 0), - PadStrideInfo(1, 1, 0, 0), - PadStrideInfo(1, 1, 0, 0), - PadStrideInfo(1, 1, 0, 0), - PadStrideInfo(1, 1, 0, 0), - PadStrideInfo(1, 1, 0, 0), - PadStrideInfo(1, 1, 0, 0), - PadStrideInfo(1, 1, 1, 0), - })), - framework::dataset::make("DepthMultiplier", { 1, - 1, - 3, - 1, - 1, - 1, - 2, - 2, - 2, - 3, - })), - framework::dataset::make("Dilation", { Size2D(1U, 1U), - Size2D(1U, 1U), - Size2D(1U, 1U), - Size2D(1U, 1U), - Size2D(1U, 1U), - Size2D(1U, 1U), - Size2D(20U, 1U), - Size2D(0U, 1U), - Size2D(1U, 1U), - Size2D(1U, 1U), - })), - framework::dataset::make("Expected", { false, false, false, false, false, false, false, false, true, true })), +DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip( + make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Mismatching data type input/weights + TensorInfo(TensorShape(27U, 13U, 3U), 1, DataType::F32), // Mismatching input feature maps + TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Mismatching depth multiplier + TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid biases size + TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid biases dimensions + TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid output size + TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32), // patch size bigger than input width + TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32), // dilation < 1 + TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U, 8U), 1, DataType::QASYMM8), + }), + make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F16), + TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32), + TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32), + TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32), + TensorInfo(TensorShape(3U, 3U, 24U), 1, DataType::QASYMM8), + }), + make("BiasesInfo", { TensorInfo(TensorShape(2U), 1, DataType::F32), + TensorInfo(TensorShape(2U), 1, DataType::F32), + TensorInfo(TensorShape(2U), 1, DataType::F32), + TensorInfo(TensorShape(4U), 1, DataType::F32), + TensorInfo(TensorShape(2U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(2U), 1, DataType::F32), + TensorInfo(TensorShape(16U), 1, DataType::F32), + TensorInfo(TensorShape(16U), 1, DataType::F32), + TensorInfo(TensorShape(16U), 1, DataType::F32), + TensorInfo(TensorShape(24U), 1, DataType::S32), + }), + make("OutputInfo", { TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(25U, 11U, 16U), 1, DataType::F32), + TensorInfo(TensorShape(25U, 11U, 16U), 1, DataType::F32), + TensorInfo(TensorShape(25U, 11U, 16U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 11U, 24U), 1, DataType::QASYMM8), + }), + 
make("ConvInfo", { PadStrideInfo(1, 1, 0, 0), + PadStrideInfo(1, 1, 0, 0), + PadStrideInfo(1, 1, 0, 0), + PadStrideInfo(1, 1, 0, 0), + PadStrideInfo(1, 1, 0, 0), + PadStrideInfo(1, 1, 0, 0), + PadStrideInfo(1, 1, 0, 0), + PadStrideInfo(1, 1, 0, 0), + PadStrideInfo(1, 1, 0, 0), + PadStrideInfo(1, 1, 1, 0), + }), + make("DepthMultiplier", { 1, + 1, + 3, + 1, + 1, + 1, + 2, + 2, + 2, + 3, + }), + make("Dilation", { Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(20U, 1U), + Size2D(0U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + }), + make("Expected", { false, false, false, false, false, false, false, false, true, true })), input_info, weights_info, biases_info, output_info, conv_info, depth_multiplier, dilation, expected) { bool is_valid = bool(CLDepthwiseConvolutionLayer::validate(&input_info.clone()->set_is_resizable(true), &weights_info.clone()->set_is_resizable(true), &biases_info.clone()->set_is_resizable(true), &output_info.clone()->set_is_resizable(true), conv_info, depth_multiplier,ActivationLayerInfo(), dilation)); @@ -163,42 +188,24 @@ TEST_SUITE(FP16) TEST_SUITE(W3x3) TEST_SUITE(NCHW) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::ALL, - combine(combine(combine(combine(framework::dataset::concat(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), - datasets::SmallDepthwiseConvolutionLayerDataset3x3NCHW()), - depth_multipliers), - framework::dataset::make("DataType", - DataType::F16)), - framework::dataset::make("DataLayout", DataLayout::NCHW)), - ActivationFunctionsDataset)) -{ - validate(CLAccessor(_target), _reference, tolerance_f16); -} -FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(), - large_depth_multipliers), - framework::dataset::make("DataType", - DataType::F16)), - framework::dataset::make("DataLayout", DataLayout::NCHW)), - ActivationFunctionsDataset)) + combine( + framework::dataset::concat(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), + datasets::SmallDepthwiseConvolutionLayerDataset3x3NCHW()), + depth_multipliers, + make("DataType", DataType::F16), + make("DataLayout", DataLayout::NCHW), + ActivationFunctionsSmallDataset)) { validate(CLAccessor(_target), _reference, tolerance_f16); } TEST_SUITE(Dilation) -FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(), - depth_multipliers), - framework::dataset::make("DataType", - DataType::F16)), - framework::dataset::make("DataLayout", { DataLayout::NCHW })), - ActivationFunctionsDataset)) -{ - validate(CLAccessor(_target), _reference, tolerance_f16); -} -FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(), - large_depth_multipliers), - framework::dataset::make("DataType", - DataType::F16)), - framework::dataset::make("DataLayout", { DataLayout::NCHW })), - ActivationFunctionsDataset)) +FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::ALL, + combine( + datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(), + depth_multipliers, + make("DataType", DataType::F16), + make("DataLayout", { DataLayout::NCHW }), + 
ActivationFunctionsSmallDataset)) { validate(CLAccessor(_target), _reference, tolerance_f16); } @@ -207,41 +214,42 @@ TEST_SUITE_END() // NCHW TEST_SUITE(NHWC) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::ALL, - combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), - depth_multipliers), - framework::dataset::make("DataType", - DataType::F16)), - framework::dataset::make("DataLayout", DataLayout::NHWC)), - ActivationFunctionsDataset)) + combine( + datasets::SmallDepthwiseConvolutionLayerDataset3x3(), + depth_multipliers, + make("DataType", DataType::F16), + make("DataLayout", DataLayout::NHWC), + ActivationFunctionsSmallDataset)) { validate(CLAccessor(_target), _reference, tolerance_f16); } -FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(), - large_depth_multipliers), - framework::dataset::make("DataType", - DataType::F16)), - framework::dataset::make("DataLayout", DataLayout::NHWC)), - ActivationFunctionsDataset)) +FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, + combine( + datasets::LargeDepthwiseConvolutionLayerDataset3x3Fp16Subset(), + large_depth_multipliers, + make("DataType", DataType::F16), + make("DataLayout", DataLayout::NHWC), + make("ActivationInfo", ActivationLayerInfo()))) { validate(CLAccessor(_target), _reference, tolerance_f16); } TEST_SUITE(Dilation) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(), depth_multipliers), - framework::dataset::make("DataType", + make("DataType", DataType::F16)), - framework::dataset::make("DataLayout", { DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DataLayout", { DataLayout::NHWC })), + ActivationFunctionsSmallDataset)) { validate(CLAccessor(_target), _reference, tolerance_f16); } FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(), + combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3Fp16Subset(), large_depth_multipliers), - framework::dataset::make("DataType", + make("DataType", DataType::F16)), - framework::dataset::make("DataLayout", { DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", ActivationLayerInfo()))) { validate(CLAccessor(_target), _reference, tolerance_f16); } @@ -252,19 +260,33 @@ TEST_SUITE_END() // W3x3 TEST_SUITE(Generic) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(), depth_multipliers), - framework::dataset::make("DataType", + make("DataType", DataType::F16)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + ActivationFunctionsSmallDataset)) { validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num); } -FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, 
combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(), +FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDatasetFp16Subset(), large_depth_multipliers), - framework::dataset::make("DataType", + make("DataType", DataType::F16)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", ActivationLayerInfo()))) +{ + validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num); +} + +FIXTURE_DATA_TEST_CASE_NEW(RunActivations, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, + combine( + make("In", TensorShape(33U, 27U, 11U, 3U)), + make("Weights", Size2D(3U, 4U)), + make("Info", PadStrideInfo(1, 2, 0, 1)), + make("Dilation", Size2D(2U, 2U)), + make("DepthMultiplier", { 2 }), + make("DataType", DataType::F16), + make("DataLayout", { DataLayout::NHWC }), + ActivationFunctionsDataset)) { validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num); } @@ -272,20 +294,20 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture, f TEST_SUITE(Dilation) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(), depth_multipliers), - framework::dataset::make("DataType", + make("DataType", DataType::F16)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + ActivationFunctionsSmallDataset)) { validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num); } FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(), + combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDatasetFp16Subset(), large_depth_multipliers), - framework::dataset::make("DataType", + make("DataType", DataType::F16)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", ActivationLayerInfo()))) { validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num); } @@ -295,11 +317,11 @@ TEST_SUITE_END() // Generic TEST_SUITE(InPlace) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerInPlaceFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallInPlaceDepthwiseConvolutionLayerDataset(), - framework::dataset::make("DepthMultiplier", { 1 })), - framework::dataset::make("DataType", + make("DepthMultiplier", { 1 })), + make("DataType", DataType::F16)), - framework::dataset::make("DataLayout", { DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DataLayout", { DataLayout::NHWC })), + ActivationFunctionsSmallDataset)) { validate(CLAccessor(_src), _reference, tolerance_f16, tolerance_num); } @@ -313,72 +335,53 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture, combine(combine(combine(combine(framework::dataset::concat(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), datasets::SmallDepthwiseConvolutionLayerDataset3x3NCHW()), depth_multipliers), - 
framework::dataset::make("DataType", + make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", DataLayout::NCHW)), - ActivationFunctionsDataset)) -{ - validate(CLAccessor(_target), _reference, tolerance_f32); -} -FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(), - large_depth_multipliers), - framework::dataset::make("DataType", - DataType::F32)), - framework::dataset::make("DataLayout", DataLayout::NCHW)), - ActivationFunctionsDataset)) + make("DataLayout", DataLayout::NCHW)), + ActivationFunctionsSmallDataset)) { validate(CLAccessor(_target), _reference, tolerance_f32); } TEST_SUITE(Dilation) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(), depth_multipliers), - framework::dataset::make("DataType", + make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", DataLayout::NCHW)), - ActivationFunctionsDataset)) + make("DataLayout", DataLayout::NCHW)), + ActivationFunctionsSmallDataset)) { validate(CLAccessor(_target), _reference, tolerance_f32); } -FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(), - large_depth_multipliers), - framework::dataset::make("DataType", - DataType::F32)), - framework::dataset::make("DataLayout", DataLayout::NCHW)), - ActivationFunctionsDataset)) -{ - validate(CLAccessor(_target), _reference, tolerance_f32); -} - TEST_SUITE_END() // Dilation TEST_SUITE_END() // NCHW + TEST_SUITE(NHWC) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), depth_multipliers), - framework::dataset::make("DataType", + make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", DataLayout::NHWC)), - ActivationFunctionsDataset)) + make("DataLayout", DataLayout::NHWC)), + ActivationFunctionsSmallDataset)) { validate(CLAccessor(_target), _reference, tolerance_f32); } FIXTURE_DATA_TEST_CASE_NEW(RunMixedDataLayout, CLDepthwiseConvolutionLayerMixedDataLayoutFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), - framework::dataset::make("DepthMultiplier", { 2 })), - framework::dataset::make("DataType", + make("DepthMultiplier", { 2 })), + make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", DataLayout::NHWC)), - framework::dataset::make("ActivationInfo", ActivationLayerInfo()))) + make("DataLayout", DataLayout::NHWC)), + make("ActivationInfo", ActivationLayerInfo()))) { validate(CLAccessor(_target), _reference, tolerance_f32); } FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(), large_depth_multipliers), - framework::dataset::make("DataType", + make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", DataLayout::NHWC)), - ActivationFunctionsDataset)) + make("DataLayout", DataLayout::NHWC)), + make("ActivationInfo", ActivationLayerInfo()))) { validate(CLAccessor(_target), _reference, tolerance_f32); } @@ -388,20 +391,20 @@ 
TEST_SUITE(Dilation) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(), depth_multipliers), - framework::dataset::make("DataType", + make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", DataLayout::NHWC)), - ActivationFunctionsDataset)) + make("DataLayout", DataLayout::NHWC)), + ActivationFunctionsSmallDataset)) { validate(CLAccessor(_target), _reference, tolerance_f32); } FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(), large_depth_multipliers), - framework::dataset::make("DataType", + make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", DataLayout::NHWC)), - ActivationFunctionsDataset)) + make("DataLayout", DataLayout::NHWC)), + make("ActivationInfo", ActivationLayerInfo()))) { validate(CLAccessor(_target), _reference, tolerance_f32); } @@ -412,31 +415,45 @@ TEST_SUITE_END() // W3x3 TEST_SUITE(Generic) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(), depth_multipliers), - framework::dataset::make("DataType", + make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + ActivationFunctionsSmallDataset)) { validate(CLAccessor(_target), _reference, tolerance_f32); } FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(), large_depth_multipliers), - framework::dataset::make("DataType", + make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", ActivationLayerInfo()))) { validate(CLAccessor(_target), _reference, tolerance_f32); } FIXTURE_DATA_TEST_CASE_NEW(RunLargeKernelSize, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::LargeKernelSizeDepthwiseConvolutionLayerNHWCDataset(), - framework::dataset::make("DepthMultiplier", { 1 })), - framework::dataset::make("DataType", + make("DepthMultiplier", { 1 })), + make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", { DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DataLayout", { DataLayout::NHWC })), + ActivationFunctionsSmallDataset)) +{ + validate(CLAccessor(_target), _reference, tolerance_f32); +} + +FIXTURE_DATA_TEST_CASE_NEW(RunActivations, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, + combine( + make("In", TensorShape(33U, 27U, 11U, 3U)), + make("Weights", Size2D(3U, 4U)), + make("Info", PadStrideInfo(1, 2, 0, 1)), + make("Dilation", Size2D(2U, 2U)), + make("DepthMultiplier", { 2 }), + make("DataType", DataType::F32), + make("DataLayout", { DataLayout::NHWC }), + ActivationFunctionsDataset)) { validate(CLAccessor(_target), _reference, tolerance_f32); } @@ -444,20 +461,20 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLargeKernelSize, CLDepthwiseConvolutionLayerFixtur TEST_SUITE(Dilation) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, 
CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(), depth_multipliers), - framework::dataset::make("DataType", + make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + ActivationFunctionsSmallDataset)) { validate(CLAccessor(_target), _reference, tolerance_f32); } FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(), large_depth_multipliers), - framework::dataset::make("DataType", + make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", ActivationLayerInfo()))) { validate(CLAccessor(_target), _reference, tolerance_f32); } @@ -467,11 +484,11 @@ TEST_SUITE_END() // Generic TEST_SUITE(InPlace) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerInPlaceFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallInPlaceDepthwiseConvolutionLayerDataset(), - framework::dataset::make("DepthMultiplier", { 1 })), - framework::dataset::make("DataType", + make("DepthMultiplier", { 1 })), + make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", { DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DataLayout", { DataLayout::NHWC })), + ActivationFunctionsSmallDataset)) { validate(CLAccessor(_src), _reference, tolerance_f32); } @@ -492,22 +509,37 @@ TEST_SUITE(Generic) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(), depth_multipliers), - framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 128), QuantizationInfo(2.2f, 10) })), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(1.f, 128) })), - framework::dataset::make("DataLayout", { DataLayout::NHWC })), // NCHW is tested with int8 - ActivationFunctionsDataset)) + make("DataType", DataType::QASYMM8)), + make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 128), QuantizationInfo(2.2f, 10) })), + make("DstQuantizationInfo", { QuantizationInfo(1.f, 128) })), + make("DataLayout", { DataLayout::NHWC })), // NCHW is tested with int8 + ActivationFunctionsSmallDataset)) { validate(CLAccessor(_target), _reference, tolerance_qasymm8); } FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(), large_depth_multipliers), - framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(2.2f, 10) })), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.7f, 2) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DataType", DataType::QASYMM8)), + make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(2.2f, 10) })), + 
make("DstQuantizationInfo", { QuantizationInfo(0.7f, 2) })), + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", ActivationLayerInfo()))) +{ + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE_NEW(RunActivations, CLDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, + combine( + make("In", TensorShape(33U, 27U, 11U, 3U)), + make("Weights", Size2D(3U, 4U)), + make("Info", PadStrideInfo(1, 2, 0, 1)), + make("Dilation", Size2D(2U, 2U)), + make("DepthMultiplier", { 2U }), + make("DataType", DataType::QASYMM8), + make("SrcQuantizationInfo", { QuantizationInfo(2.2f, 10) }), + make("DstQuantizationInfo", { QuantizationInfo(0.1f, 128) }), + make("DataLayout", { DataLayout::NHWC }), + ActivationFunctionsQuantizedDataset)) { validate(CLAccessor(_target), _reference, tolerance_qasymm8); } @@ -515,22 +547,22 @@ TEST_SUITE(Dilation) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(), depth_multipliers), - framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(2.2f, 10) })), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.8, 1) })), - framework::dataset::make("DataLayout", { DataLayout::NHWC })), // NCHW is tested with int8 - ActivationFunctionsDataset)) + make("DataType", DataType::QASYMM8)), + make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(2.2f, 10) })), + make("DstQuantizationInfo", { QuantizationInfo(0.8, 1) })), + make("DataLayout", { DataLayout::NHWC })), // NCHW is tested with int8 + ActivationFunctionsSmallDataset)) { validate(CLAccessor(_target), _reference, tolerance_qasymm8); } FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(), large_depth_multipliers), - framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(1.3f, 10) })), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.9f, 11) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DataType", DataType::QASYMM8)), + make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(1.3f, 10) })), + make("DstQuantizationInfo", { QuantizationInfo(0.9f, 11) })), + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", ActivationLayerInfo()))) { validate(CLAccessor(_target), _reference, tolerance_qasymm8); } @@ -540,22 +572,22 @@ TEST_SUITE(W3x3) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), depth_multipliers), - framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10), QuantizationInfo(2.2f, 10) })), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DataType", 
DataType::QASYMM8)), + make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10), QuantizationInfo(2.2f, 10) })), + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + make("DataLayout", { DataLayout::NHWC })), + ActivationFunctionsSmallDataset)) { validate(CLAccessor(_target), _reference, tolerance_qasymm8); } FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(), large_depth_multipliers), - framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(2.2f, 10) })), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DataType", DataType::QASYMM8)), + make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(2.2f, 10) })), + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", ActivationLayerInfo()))) { validate(CLAccessor(_target), _reference, tolerance_qasymm8); } @@ -563,33 +595,33 @@ TEST_SUITE(Dilation) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(), depth_multipliers), - framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(2.2f, 10) })), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DataType", DataType::QASYMM8)), + make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(2.2f, 10) })), + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + ActivationFunctionsSmallDataset)) { validate(CLAccessor(_target), _reference, tolerance_qasymm8); } FIXTURE_DATA_TEST_CASE_NEW(RunMixedDataLayout, CLDepthwiseConvolutionLayerQuantizedMixedDataLayoutFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(), - framework::dataset::make("DepthMultiplier", { 2 })), - framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(2.2f, 10) })), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - framework::dataset::make("ActivationInfo", ActivationLayerInfo()))) + make("DepthMultiplier", { 2 })), + make("DataType", DataType::QASYMM8)), + make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(2.2f, 10) })), + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + make("ActivationInfo", ActivationLayerInfo()))) { validate(CLAccessor(_target), _reference, tolerance_qasymm8); } FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, 
combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(), large_depth_multipliers), - framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(2.2f, 10) })), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DataType", DataType::QASYMM8)), + make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(2.2f, 10) })), + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", ActivationLayerInfo()))) { validate(CLAccessor(_target), _reference, tolerance_qasymm8); } @@ -602,22 +634,37 @@ TEST_SUITE(Generic) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(), depth_multipliers), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), - framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10), QuantizationInfo(2.2f, 10) })), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW })), - ActivationFunctionsDataset)) + make("DataType", DataType::QASYMM8_SIGNED)), + make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10), QuantizationInfo(2.2f, 10) })), + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })), + make("DataLayout", { DataLayout::NCHW })), + ActivationFunctionsSmallDataset)) { validate(CLAccessor(_target), _reference, tolerance_qasymm8); } FIXTURE_DATA_TEST_CASE_NEW(RunMixedDataLayout, CLDepthwiseConvolutionLayerQuantizedMixedDataLayoutFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(), - framework::dataset::make("DepthMultiplier", { 2 })), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), - framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10), QuantizationInfo(2.2f, 10) })), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW })), - framework::dataset::make("ActivationInfo", ActivationLayerInfo()))) + make("DepthMultiplier", { 2 })), + make("DataType", DataType::QASYMM8_SIGNED)), + make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10), QuantizationInfo(2.2f, 10) })), + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })), + make("DataLayout", { DataLayout::NCHW })), + make("ActivationInfo", ActivationLayerInfo()))) +{ + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE_NEW(RunActivations, CLDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, + combine( + make("In", TensorShape(33U, 27U, 11U, 3U)), + make("Weights", Size2D(3U, 4U)), + make("Info", PadStrideInfo(1, 2, 0, 1)), + make("Dilation", Size2D(2U, 2U)), + make("DepthMultiplier", { 2U }), + make("DataType", DataType::QASYMM8_SIGNED), + make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) }), + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) }), + make("DataLayout", { DataLayout::NHWC }), + ActivationFunctionsQuantizedDataset)) { 
validate(CLAccessor(_target), _reference, tolerance_qasymm8); } @@ -625,11 +672,11 @@ TEST_SUITE(Dilation) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(), depth_multipliers), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), - framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(2.2f, 10) })), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.8, 1) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW })), - ActivationFunctionsDataset)) + make("DataType", DataType::QASYMM8_SIGNED)), + make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(2.2f, 10) })), + make("DstQuantizationInfo", { QuantizationInfo(0.8, 1) })), + make("DataLayout", { DataLayout::NCHW })), + ActivationFunctionsSmallDataset)) { validate(CLAccessor(_target), _reference, tolerance_qasymm8); } @@ -642,24 +689,40 @@ TEST_SUITE(Generic) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerQuantizedPerChannelFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(), depth_multipliers), - framework::dataset::make("SrcDataType", DataType::QASYMM8)), - framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), - framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) })), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("SrcDataType", DataType::QASYMM8)), + make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), + make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) })), + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + ActivationFunctionsSmallDataset)) { validate(CLAccessor(_target), _reference, tolerance_qasymm8); } FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerQuantizedPerChannelFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(), large_depth_multipliers), - framework::dataset::make("SrcDataType", DataType::QASYMM8)), - framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), - framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.7f, 2) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("SrcDataType", DataType::QASYMM8)), + make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), + make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + make("DstQuantizationInfo", { QuantizationInfo(0.7f, 2) })), + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", ActivationLayerInfo()))) +{ + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE_NEW(RunActivations, CLDepthwiseConvolutionLayerQuantizedPerChannelFixture, framework::DatasetMode::NIGHTLY, + combine( + make("In", TensorShape(33U, 27U, 11U, 3U)), + make("Weights", Size2D(3U, 4U)), + make("Info", PadStrideInfo(1, 2, 0, 1)), + make("Dilation", Size2D(2U, 2U)), + 
make("DepthMultiplier", { 2U }), + make("SrcDataType", DataType::QASYMM8_SIGNED), + make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL), + make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) }), + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) }), + make("DataLayout", { DataLayout::NHWC }), + ActivationFunctionsQuantizedDataset)) { validate(CLAccessor(_target), _reference, tolerance_qasymm8); } @@ -667,24 +730,24 @@ TEST_SUITE(Dilation) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerQuantizedPerChannelFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(), depth_multipliers), - framework::dataset::make("SrcDataType", DataType::QASYMM8)), - framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), - framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.8, 1) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("SrcDataType", DataType::QASYMM8)), + make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), + make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + make("DstQuantizationInfo", { QuantizationInfo(0.8, 1) })), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + ActivationFunctionsSmallDataset)) { validate(CLAccessor(_target), _reference, tolerance_qasymm8); } FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerQuantizedPerChannelFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(), large_depth_multipliers), - framework::dataset::make("SrcDataType", DataType::QASYMM8)), - framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), - framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.9f, 11) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("SrcDataType", DataType::QASYMM8)), + make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), + make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + make("DstQuantizationInfo", { QuantizationInfo(0.9f, 11) })), + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", ActivationLayerInfo()))) { validate(CLAccessor(_target), _reference, tolerance_qasymm8); } @@ -694,24 +757,24 @@ TEST_SUITE(W3x3) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerQuantizedPerChannelFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), depth_multipliers), - framework::dataset::make("SrcDataType", DataType::QASYMM8)), - framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), - framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) })), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("SrcDataType", DataType::QASYMM8)), + make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), + make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) })), + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })), 
+ make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + ActivationFunctionsSmallDataset)) { validate(CLAccessor(_target), _reference, tolerance_qasymm8); } FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerQuantizedPerChannelFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(), large_depth_multipliers), - framework::dataset::make("SrcDataType", DataType::QASYMM8)), - framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), - framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("SrcDataType", DataType::QASYMM8)), + make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), + make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", ActivationLayerInfo()))) { validate(CLAccessor(_target), _reference, tolerance_qasymm8); } @@ -719,24 +782,24 @@ TEST_SUITE(Dilation) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerQuantizedPerChannelFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(), depth_multipliers), - framework::dataset::make("SrcDataType", DataType::QASYMM8)), - framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), - framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("SrcDataType", DataType::QASYMM8)), + make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), + make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + ActivationFunctionsSmallDataset)) { validate(CLAccessor(_target), _reference, tolerance_qasymm8); } FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerQuantizedPerChannelFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(), large_depth_multipliers), - framework::dataset::make("SrcDataType", DataType::QASYMM8)), - framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), - framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("SrcDataType", DataType::QASYMM8)), + make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), + make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", ActivationLayerInfo()))) { validate(CLAccessor(_target), _reference, tolerance_qasymm8); } diff --git a/tests/validation/CL/DepthwiseConvolutionLayerNative.cpp 
b/tests/validation/CL/DepthwiseConvolutionLayerNative.cpp index 45047cab44c53e314f9189b7527fddae69866ffb..012018c0fcb9d47ee7736173849810d852a13a76 100644 --- a/tests/validation/CL/DepthwiseConvolutionLayerNative.cpp +++ b/tests/validation/CL/DepthwiseConvolutionLayerNative.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022 Arm Limited. + * Copyright (c) 2019-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -42,6 +42,7 @@ namespace test { namespace validation { +using framework::dataset::make; using namespace arm_compute::misc::shape_calculator; // Create function for CLDepthwiseConvolutionLayerNativeKernel @@ -62,76 +63,89 @@ RelativeTolerance rel_tolerance_f16(half_float::half(0.01f)); constexpr float abs_tolerance_f16(0.03f); /** Width values to test - Precommit */ -const auto width_values_precommit = framework::dataset::make("width", { 1U, 33U } ); +const auto width_values_precommit = make("width", { 1U, 33U } ); /** Width values to test - Nightly */ -const auto width_values_nightly = framework::dataset::make("width", { 53U, 47U } ); +const auto width_values_nightly = make("width", { 53U, 47U } ); /** Height values to test - Precommit */ -const auto height_values_precommit = framework::dataset::make("height", { 19U } ); +const auto height_values_precommit = make("height", { 19U } ); /** Height values to test - Nightly */ -const auto height_values_nightly = framework::dataset::make("height", { 39U, 43U } ); +const auto height_values_nightly = make("height", { 39U, 43U } ); /** Channel values to test - Precommit */ -const auto channel_values_precommit = framework::dataset::make("channels", { 15U }); +const auto channel_values_precommit = make("channels", { 15U }); /** Channel values to test - Nightly */ -const auto channel_values_nightly = framework::dataset::make("channels", { 33U, 19U }); +const auto channel_values_nightly = make("channels", { 33U, 19U }); /** Channel values to test with cl_image support - Precommit */ -const auto channel_values_export_to_cl_image_precommit = framework::dataset::make("channels", { 16U }); +const auto channel_values_export_to_cl_image_precommit = make("channels", { 16U }); /** Channel values to test with cl_image support - Nightly */ -const auto channel_values_export_to_cl_image_nightly = framework::dataset::make("channels", { 32U }); +const auto channel_values_export_to_cl_image_nightly = make("channels", { 32U }); /** Batch values to test - Precommit */ -const auto batch_values_precommit = framework::dataset::make("batch", { 1U, 2U }); +const auto batch_values_precommit = make("batch", { 1U, 2U }); /** Batch values to test - Nightly */ -const auto batch_values_nightly = framework::dataset::make("batch", { 1U, 3U }); +const auto batch_values_nightly = make("batch", { 3U }); /** Kernel size values to test - Precommit */ -const auto kernel_sz_values_precommit = framework::dataset::make("kernel_size", { Size2D(1U, 1U), Size2D(1U, 3U), Size2D(5U, 5U) }); +const auto kernel_sz_values_precommit = make("kernel_size", { Size2D(1U, 1U), Size2D(1U, 3U), Size2D(5U, 5U) }); /** Kernel size values to test - Nightly */ -const auto kernel_sz_values_nightly = framework::dataset::make("kernel_size", { Size2D(3U, 5U), Size2D(5U, 1U), Size2D(1U, 7U), Size2D(9U, 7U) }); +const auto kernel_sz_values_nightly = make("kernel_size", { Size2D(3U, 5U), Size2D(5U, 1U), Size2D(1U, 7U), Size2D(9U, 7U) }); /** Depth multiplier values to test - All */ -const auto depth_multiplier_values = framework::dataset::make("depth_multiplier", {3U}); +const auto depth_multiplier_values = 
make("depth_multiplier", {3U}); /** Dilation values to test - All */ -const auto dilation_values = framework::dataset::make("dilation", { Size2D(1U, 1U), Size2D(3U, 3U) }); +const auto dilation_values = make("dilation", { Size2D(1U, 1U), Size2D(3U, 3U) }); /** Stride values to test - All */ -const auto stride_values = framework::dataset::make("stride", { Size2D(1U, 1U), Size2D(3U, 2U) }); +const auto stride_values = make("stride", { Size2D(1U, 1U), Size2D(3U, 2U) }); -/** Padding values to test - All */ -const auto padding_valid_values = framework::dataset::make("padding_valid", { true, false }); +/** Padding values to test - Precommit */ +const auto padding_valid_values = make("padding_valid", { true, false }); -/** Data type values to test - All */ -const auto data_type_values = framework::dataset::make("data_type", { DataType::F32, DataType::F16 }); +/** Padding values to test - Nightly */ +const auto padding_valid_values_nightly = make("padding_valid", { false }); /** Data layout values to test - All */ -const auto data_layout_values = framework::dataset::make("data_layout", { DataLayout::NHWC }); +const auto data_layout_values = make("data_layout", { DataLayout::NHWC }); /** N0 values to test - Precommit */ -const auto n0_values_precommit = framework::dataset::make("N0", {2, 4}); +const auto n0_values_precommit = make("N0", {2, 4}); /** N0 values to test - Nightly */ -const auto n0_values_nightly = framework::dataset::make("N0", {3, 8}); +const auto n0_values_nightly = make("N0", {3, 8}); /** N0 values to test with cl_image support - Precommit */ -const auto n0_values_export_to_cl_image_precommit = framework::dataset::make("N0", {4}); +const auto n0_values_export_to_cl_image_precommit = make("N0", {4}); /** N0 values to test with cl_image support - Nightly */ -const auto n0_values_export_to_cl_image_nightly = framework::dataset::make("N0", {8}); +const auto n0_values_export_to_cl_image_nightly = make("N0", {8}); -/** Activation values to test */ -const auto act_values = framework::dataset::make("Activation", +/** Activation values to test in precommit */ +const auto act_values = make("Activation", { ActivationLayerInfo() }); + +const auto activations_rest = make("Activation", { - ActivationLayerInfo(), - ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.0f, 0.5f), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f, 0.5f), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.8f, -0.5f), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SOFT_RELU), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ELU), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ABS), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SQUARE), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::HARD_SWISH), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 2.f, 1.f), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::GELU) }); } // namespace @@ -147,15 +161,15 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture, framework::DatasetMode::ALL, + combine( + make("width", { 33U } ), + height_values_precommit, + 
channel_values_precommit, + make("batch", { 2U } ), + make("kernel_size", { Size2D(5U, 5U) }), + make("depth_multiplier", 1), + make("dilation", Size2D(3U, 3U)), + make("stride", Size2D(3U, 2U)), + padding_valid_values_nightly, + make("DataType", DataType::F32), + data_layout_values, + activations_rest, + n0_values_precommit, + make("ExportToCLImage", false))) { // Validate output validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); @@ -190,15 +225,15 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( - width_values_nightly, - height_values_nightly), - channel_values_nightly), + make("width", { 47U } ), + make("height", { 39U } )), + make("channels", { 19U } )), batch_values_nightly), - kernel_sz_values_nightly), - framework::dataset::make("depth_multiplier", 1)), - dilation_values), - stride_values), - padding_valid_values), - framework::dataset::make("DataType", DataType::F16)), + make("kernel_size", { Size2D(5U, 5U) })), + make("depth_multiplier", 1)), + make("dilation", { Size2D(3U, 3U) })), + make("stride", { Size2D(3U, 2U) })), + padding_valid_values_nightly), + make("DataType", DataType::F16)), data_layout_values), - act_values), + make("Activation", { ActivationLayerInfo() })), n0_values_nightly), - framework::dataset::make("ExportToCLImage", false))) + make("ExportToCLImage", false))) +{ + // Validate output + validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16); +} + +FIXTURE_DATA_TEST_CASE_NEW(RunActivations, CLDepthwiseConvolutionLayerNativeFixture, framework::DatasetMode::ALL, + combine( + make("width", { 33U } ), + height_values_precommit, + channel_values_precommit, + make("batch", { 2U } ), + make("kernel_size", { Size2D(5U, 5U) }), + make("depth_multiplier", 4), + make("dilation", Size2D(3U, 3U)), + make("stride", Size2D(3U, 2U)), + padding_valid_values_nightly, + make("DataType", DataType::F16), + data_layout_values, + activations_rest, + n0_values_precommit, + make("ExportToCLImage", false))) { // Validate output validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16); @@ -297,15 +352,15 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( - width_values_nightly, - height_values_nightly), + make("width", { 47U } ), + make("height", { 39U } )), channel_values_export_to_cl_image_nightly), batch_values_nightly), - kernel_sz_values_nightly), - framework::dataset::make("depth_multiplier", 1)), - dilation_values), - stride_values), - padding_valid_values), - framework::dataset::make("DataType", DataType::F16)), + make("kernel_size", { Size2D(5U, 5U) })), + make("depth_multiplier", 1)), + make("dilation", { Size2D(3U, 3U) })), + make("stride", { Size2D(3U, 2U) })), + padding_valid_values_nightly), + make("DataType", DataType::F16)), data_layout_values), - act_values), + make("Activation", { ActivationLayerInfo() })), n0_values_export_to_cl_image_nightly), - framework::dataset::make("ExportToCLImage", true))) + make("ExportToCLImage", true))) { // Validate output if(_validate_output) @@ -349,16 +403,16 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerNativeFixture, framework::DatasetMode::ALL, 
combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( - framework::dataset::make("width", { 33U } ), + make("width", { 33U } ), height_values_precommit), channel_values_precommit), batch_values_precommit), @@ -367,11 +421,11 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( - framework::dataset::make("width", { 53U } ), + make("width", { 53U } ), height_values_nightly), channel_values_nightly), batch_values_nightly), @@ -387,12 +441,12 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerNativeFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( - framework::dataset::make("width", { 33U } ), + make("width", { 33U } ), height_values_precommit), channel_values_precommit), batch_values_precommit), kernel_sz_values_precommit), - framework::dataset::make("depth_multiplier", 2)), + make("depth_multiplier", 2)), dilation_values), stride_values), padding_valid_values), - framework::dataset::make("DataType", DataType::F32)), + make("DataType", DataType::F32)), data_layout_values), act_values), - framework::dataset::make("N0", {2})), - framework::dataset::make("ExportToCLImage", false))) + make("N0", {2})), + make("ExportToCLImage", false))) { // Validate output validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); @@ -423,20 +477,20 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( - framework::dataset::make("width", { 33U } ), + make("width", { 33U } ), height_values_precommit), channel_values_precommit), batch_values_precommit), kernel_sz_values_precommit), - framework::dataset::make("depth_multiplier", 4)), + make("depth_multiplier", 4)), dilation_values), stride_values), padding_valid_values), - framework::dataset::make("DataType", DataType::F32)), + make("DataType", DataType::F32)), data_layout_values), act_values), - framework::dataset::make("N0", {4})), - framework::dataset::make("ExportToCLImage", true))) + make("N0", {4})), + make("ExportToCLImage", true))) { // Validate output if(_validate_output) @@ -457,7 +511,7 @@ TEST_SUITE_END() // FP32 TEST_SUITE(FP16) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( - framework::dataset::make("width", { 33U } ), + make("width", { 33U } ), height_values_precommit), channel_values_precommit), batch_values_precommit), @@ -466,11 +520,11 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( - framework::dataset::make("width", { 53U } ), + make("width", { 53U } ), height_values_nightly), channel_values_nightly), batch_values_nightly), @@ -486,12 +540,12 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerNativeFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( - framework::dataset::make("width", { 33U } ), + 
make("width", { 33U } ), height_values_precommit), channel_values_precommit), batch_values_precommit), kernel_sz_values_precommit), - framework::dataset::make("depth_multiplier", 2)), + make("depth_multiplier", 2)), dilation_values), stride_values), padding_valid_values), - framework::dataset::make("DataType", DataType::F16)), + make("DataType", DataType::F16)), data_layout_values), act_values), - framework::dataset::make("N0", {2})), - framework::dataset::make("ExportToCLImage", false))) + make("N0", {2})), + make("ExportToCLImage", false))) { // Validate output validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16); @@ -522,20 +576,20 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( - framework::dataset::make("width", { 33U } ), + make("width", { 33U } ), height_values_precommit), channel_values_precommit), batch_values_precommit), kernel_sz_values_precommit), - framework::dataset::make("depth_multiplier", 4)), + make("depth_multiplier", 4)), dilation_values), stride_values), padding_valid_values), - framework::dataset::make("DataType", DataType::F16)), + make("DataType", DataType::F16)), data_layout_values), act_values), - framework::dataset::make("N0", {4})), - framework::dataset::make("ExportToCLImage", true))) + make("N0", {4})), + make("ExportToCLImage", true))) { // Validate output if(_validate_output) diff --git a/tests/validation/CL/DilatedConvolutionLayer.cpp b/tests/validation/CL/DilatedConvolutionLayer.cpp index 9a9df2c7e4c98d92ffbac30d59492a72bc909af0..776bf3415194e866ba83ee331266c50610b701d9 100644 --- a/tests/validation/CL/DilatedConvolutionLayer.cpp +++ b/tests/validation/CL/DilatedConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -167,13 +167,18 @@ template using CLGEMMDilatedConvolutionLayerQuantizedFixture = ConvolutionValidationQuantizedFixture; TEST_SUITE(Quantized) +/// @note: Every asymmetric quantized test where there's no fused activation will have its quantization info ignored +/// This is because instead of using the same quantization information for all the tensors, the fixture generates +/// separate quantization info for each input and the output tensor. +/// When we can also support dynamic quantization with the presence of activation, we can remove the explicit +/// quantization info. 
TEST_SUITE(QASYMM8) FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMDilatedConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallDilatedConvolutionLayerDataset(), framework::dataset::make("ReshapeWeights", { true })), framework::dataset::make("DataType", DataType::QASYMM8)), framework::dataset::make("DataLayout", { DataLayout::NCHW })), - framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })), + framework::dataset::make("IgnoredQuantizationInfo", { QuantizationInfo() })), framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() }))) { // Validate output @@ -185,7 +190,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMDilatedConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(data_precommit, - framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10) })), - QuantizedActivationFunctionsDataset), +FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLDirectConvolutionLayerQuantizedMixedDataLayoutFixture, framework::DatasetMode::PRECOMMIT, combine(data_precommit, + framework::dataset::make("DataType", DataType::QASYMM8), + IgnoredQuantizationInfo, + NoActivation, + framework::dataset::make("DataLayout", { DataLayout::NCHW }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(RunMixedDataLayoutWithActivation, CLDirectConvolutionLayerQuantizedMixedDataLayoutFixture, framework::DatasetMode::PRECOMMIT, combine(data_precommit, + framework::dataset::make("DataType", DataType::QASYMM8), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10) }), + QuantizedActivationFunctionsDataset, + framework::dataset::make("DataLayout", { DataLayout::NCHW }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(data_precommit, + framework::dataset::make("DataType", DataType::QASYMM8), + IgnoredQuantizationInfo, + NoActivation, + framework::dataset::make("DataLayout", { DataLayout::NCHW }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(RunSmallWithActivation, CLDirectConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(data_precommit, + framework::dataset::make("DataType", DataType::QASYMM8), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10), QuantizationInfo(1.1f, 10) }), + QuantizedActivationFunctionsDataset, framework::dataset::make("DataLayout", { DataLayout::NCHW }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(data_precommit, - framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10), QuantizationInfo(1.1f, 10) })), - QuantizedActivationFunctionsDataset), +FIXTURE_DATA_TEST_CASE(RunSmall9x9, CLDirectConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(data_precommit_9x9, + framework::dataset::make("DataType", + DataType::QASYMM8), + IgnoredQuantizationInfo, + NoActivation, framework::dataset::make("DataLayout", { DataLayout::NCHW }))) { // 
Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunSmall9x9, CLDirectConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(data_precommit_9x9, +FIXTURE_DATA_TEST_CASE(RunSmall9x9WithActivation, CLDirectConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(data_precommit_9x9, framework::dataset::make("DataType", - DataType::QASYMM8)), - framework::dataset::make("QuantizationInfo", { QuantizationInfo(3.f / 255, 10), QuantizationInfo(1.1f, 10) })), - QuantizedActivationFunctionsDataset), + DataType::QASYMM8), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(3.f / 255, 10), QuantizationInfo(1.1f, 10) }), + QuantizedActivationFunctionsDataset, + framework::dataset::make("DataLayout", { DataLayout::NCHW }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(RunLarge, CLDirectConvolutionLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(data_nightly, framework::dataset::make("DataType", + DataType::QASYMM8), + IgnoredQuantizationInfo, + NoActivation, framework::dataset::make("DataLayout", { DataLayout::NCHW }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunLarge, CLDirectConvolutionLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(data_nightly, framework::dataset::make("DataType", - DataType::QASYMM8)), - framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10), QuantizationInfo(1.1f, 10) })), - QuantizedActivationFunctionsDataset), +FIXTURE_DATA_TEST_CASE(RunLargeWithActivation, CLDirectConvolutionLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(data_nightly, framework::dataset::make("DataType", + DataType::QASYMM8), + framework::dataset::make("QuantizationInfoIf", { QuantizationInfo(2.f / 255, 10), QuantizationInfo(1.1f, 10) }), + QuantizedActivationFunctionsDataset, framework::dataset::make("DataLayout", { DataLayout::NCHW }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunLarge9x9, CLDirectConvolutionLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(data_nightly_9x9, +FIXTURE_DATA_TEST_CASE(RunLarge9x9, CLDirectConvolutionLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(data_nightly_9x9, framework::dataset::make("DataType", - DataType::QASYMM8)), - framework::dataset::make("QuantizationInfo", { QuantizationInfo(3.f / 255, 10), QuantizationInfo(1.1f, 10) })), - QuantizedActivationFunctionsDataset), + DataType::QASYMM8), + IgnoredQuantizationInfo, + NoActivation, framework::dataset::make("DataLayout", { DataLayout::NCHW }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8); } - -TEST_SUITE_END() // QASYMM8 - -TEST_SUITE(QASYMM8_CustomDataset) -FIXTURE_DATA_TEST_CASE(Run, CLDirectConvolutionValidationWithTensorShapesQuantizedFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(combine(datasets::DirectConvolutionLayerDataset(), - framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 127), QuantizationInfo(1.1f, 10) })), - QuantizedActivationFunctionsDataset), +FIXTURE_DATA_TEST_CASE(RunLarge9x9WithActivation, CLDirectConvolutionLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, 
combine(data_nightly_9x9, + framework::dataset::make("DataType", + DataType::QASYMM8), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(3.f / 255, 10), QuantizationInfo(1.1f, 10) }), + QuantizedActivationFunctionsDataset, + framework::dataset::make("DataLayout", { DataLayout::NCHW }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(CustomDataset, CLDirectConvolutionValidationWithTensorShapesQuantizedFixture, framework::DatasetMode::NIGHTLY, + combine(datasets::DirectConvolutionLayerDataset(), + framework::dataset::make("DataType", DataType::QASYMM8), + IgnoredQuantizationInfo, + NoActivation, + framework::dataset::make("DataLayout", { DataLayout::NCHW }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(CustomDatasetWithActivation, CLDirectConvolutionValidationWithTensorShapesQuantizedFixture, framework::DatasetMode::NIGHTLY, + combine(datasets::DirectConvolutionLayerDataset(), + framework::dataset::make("DataType", DataType::QASYMM8), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 127), QuantizationInfo(1.1f, 10) }), + QuantizedActivationFunctionsDataset, framework::dataset::make("DataLayout", { DataLayout::NCHW }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8); } -TEST_SUITE_END() // QASYMM8_CustomDataset +TEST_SUITE_END() // QASYMM8 TEST_SUITE(QASYMM8_SIGNED) -FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolutionLayerQuantizedFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(data_precommit, framework::dataset::make("DataType", - DataType::QASYMM8_SIGNED)), - framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10), QuantizationInfo(1.1f, -10) })), - QuantizedActivationFunctionsDataset), +FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolutionLayerQuantizedFixture, framework::DatasetMode::ALL, combine(data_precommit, framework::dataset::make("DataType", + DataType::QASYMM8_SIGNED), + IgnoredQuantizationInfo, + NoActivation, framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLDirectConvolutionLayerQuantizedMixedDataLayoutFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(data_precommit, +FIXTURE_DATA_TEST_CASE(RunSmallWithActivation, CLDirectConvolutionLayerQuantizedFixture, framework::DatasetMode::ALL, combine(data_precommit, framework::dataset::make("DataType", + DataType::QASYMM8_SIGNED), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10), QuantizationInfo(1.1f, -10) }), + QuantizedActivationFunctionsDataset, + framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLDirectConvolutionLayerQuantizedMixedDataLayoutFixture, framework::DatasetMode::ALL, combine(data_precommit, framework::dataset::make("DataType", - DataType::QASYMM8_SIGNED)), - framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.1f, -10) })), - QuantizedActivationFunctionsDataset), + DataType::QASYMM8_SIGNED), + IgnoredQuantizationInfo, + NoActivation, framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(CLAccessor(_target), _reference, 
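The DirectConvolutionLayer hunks above all follow one pattern: the nested combine(combine(...)) chains become a single variadic combine(), and each quantized case is split into a plain variant (IgnoredQuantizationInfo + NoActivation, so the fixture chooses the quantization parameters itself) and a *WithActivation variant that keeps explicit QuantizationInfo values. A minimal sketch of the before/after dataset construction, assuming the IgnoredQuantizationInfo and NoActivation helper datasets defined earlier in the test file:

// Before: one nested combine() per additional axis.
const auto old_set = combine(combine(combine(combine(data_precommit,
                             framework::dataset::make("DataType", DataType::QASYMM8)),
                             framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10) })),
                             QuantizedActivationFunctionsDataset),
                             framework::dataset::make("DataLayout", { DataLayout::NCHW }));

// After: a single variadic combine(); the "plain" cases swap in the helper datasets so that
// quantization is picked by the fixture (dynamic quantization) and no activation is fused.
const auto new_set = combine(data_precommit,
                             framework::dataset::make("DataType", DataType::QASYMM8),
                             IgnoredQuantizationInfo,
                             NoActivation,
                             framework::dataset::make("DataLayout", { DataLayout::NCHW }));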
tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunSmall9x9, CLDirectConvolutionLayerQuantizedFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(data_precommit_9x9, +FIXTURE_DATA_TEST_CASE(RunMixedDataLayoutWithActivation, CLDirectConvolutionLayerQuantizedMixedDataLayoutFixture, framework::DatasetMode::ALL, combine(data_precommit, framework::dataset::make("DataType", - DataType::QASYMM8_SIGNED)), - framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10), QuantizationInfo(1.1f, 10) })), - QuantizedActivationFunctionsDataset), + DataType::QASYMM8_SIGNED), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.1f, -10) }), + QuantizedActivationFunctionsDataset, + framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(RunSmall9x9, CLDirectConvolutionLayerQuantizedFixture, framework::DatasetMode::ALL, combine(data_precommit_9x9, + framework::dataset::make("DataType", + DataType::QASYMM8_SIGNED), + IgnoredQuantizationInfo, + NoActivation, + framework::dataset::make("DataLayout", { DataLayout::NCHW }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(RunSmall9x9WithActivation, CLDirectConvolutionLayerQuantizedFixture, framework::DatasetMode::ALL, combine(data_precommit_9x9, + framework::dataset::make("DataType", + DataType::QASYMM8_SIGNED), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10), QuantizationInfo(1.1f, 10) }), + QuantizedActivationFunctionsDataset, framework::dataset::make("DataLayout", { DataLayout::NCHW }))) { // Validate output @@ -709,10 +803,21 @@ FIXTURE_DATA_TEST_CASE(RunSmall9x9, CLDirectConvolutionLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(combine(datasets::DirectConvolutionLayerDataset(), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), - framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 127), QuantizationInfo(1.1f, 10) })), - QuantizedActivationFunctionsDataset), + combine(datasets::DirectConvolutionLayerDataset(), + framework::dataset::make("DataType", DataType::QASYMM8_SIGNED), + IgnoredQuantizationInfo, + NoActivation, + framework::dataset::make("DataLayout", { DataLayout::NCHW }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} + +FIXTURE_DATA_TEST_CASE(RunCustomDatasetWithActivation, CLDirectConvolutionValidationWithTensorShapesQuantizedFixture, framework::DatasetMode::NIGHTLY, + combine(datasets::DirectConvolutionLayerDataset(), + framework::dataset::make("DataType", DataType::QASYMM8_SIGNED), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 127), QuantizationInfo(1.1f, 10) }), + QuantizedActivationFunctionsDataset, framework::dataset::make("DataLayout", { DataLayout::NCHW }))) { // Validate output diff --git a/tests/validation/CL/FullyConnectedLayer.cpp b/tests/validation/CL/FullyConnectedLayer.cpp index 474a87dd1cbfdd74a1a0c025f27b8d121e708e5e..2f0c86499b7924a285bc8b1738be7150ed941959 100644 --- a/tests/validation/CL/FullyConnectedLayer.cpp +++ b/tests/validation/CL/FullyConnectedLayer.cpp @@ -40,6 +40,7 @@ namespace test { namespace validation { +using framework::dataset::make; namespace { /** Tolerance for float operations */ @@ -51,15 +52,20 @@ constexpr float tolerance_num = 0.07f; /**< Tolerance n /** Tolerance for quantized 
asymmetric operations */ constexpr AbsoluteTolerance tolerance_qasymm8(1); -const auto FullyConnectedParameters = combine(framework::dataset::make("TransposeWeights", { false, true }), framework::dataset::make("ReshapeWeights", { false, true })); +const auto FullyConnectedParameters = combine(make("TransposeWeights", { false, true }), make("ReshapeWeights", { false, true })); -const auto QuantizationData = framework::dataset::make("QuantizationInfo", +const auto QuantizationData = make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 10), QuantizationInfo(1.1f, 10), }); -const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo", +const auto IgnoredQuantizationData = make("IgnoredQuantizationInfo", +{ + QuantizationInfo(), +}); + +const auto ActivationFunctionsDataset = make("ActivationInfo", { ActivationLayerInfo(), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU), @@ -68,11 +74,16 @@ const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH) }); -const auto ActivationFunctionsQuantizedDataset = concat(concat(concat( - framework::dataset::make("ActivationInfo", ActivationLayerInfo()), - framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))), - framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f))), - framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.75f, 0.25f))); +// This dataset case only runs with dynamic quantization +const auto NoActivationFunctionsQuantizedDataset = make("ActivationInfo", +{ + ActivationLayerInfo() +}); + +const auto ActivationFunctionsQuantizedDataset = concat(concat( + make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)), + make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f))), + make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.75f, 0.25f))); } // namespace TEST_SUITE(CL) @@ -81,33 +92,33 @@ TEST_SUITE(FullyConnectedLayer) // *INDENT-OFF* // clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip( - framework::dataset::make("InputInfo", { TensorInfo(TensorShape(9U, 5U, 7U, 3U), 1, DataType::F32), // Mismatching data types + make("InputInfo", { TensorInfo(TensorShape(9U, 5U, 7U, 3U), 1, DataType::F32), // Mismatching data types TensorInfo(TensorShape(8U, 4U, 6U, 4U), 1, DataType::F32), TensorInfo(TensorShape(8U, 4U, 6U, 4U), 1, DataType::F32), TensorInfo(TensorShape(9U, 5U, 7U, 3U), 1, DataType::F32), // Invalid weights dimensions TensorInfo(TensorShape(9U, 5U, 7U, 3U), 1, DataType::F32), // Wrongly reshaped weights }), - framework::dataset::make("WeightsInfo",{ TensorInfo(TensorShape(315U, 271U), 1, DataType::F16), + make("WeightsInfo",{ TensorInfo(TensorShape(315U, 271U), 1, DataType::F16), TensorInfo(TensorShape(192U, 192U), 1, DataType::F32), TensorInfo(TensorShape(192U, 192U), 1, DataType::F32), TensorInfo(TensorShape(217U, 231U), 1, DataType::F32), TensorInfo(TensorShape(217U, 315U), 1, DataType::F32), })), - framework::dataset::make("BiasInfo",{ TensorInfo(TensorShape(271U), 1, DataType::F32), + make("BiasInfo",{ TensorInfo(TensorShape(271U), 1, DataType::F32), TensorInfo(TensorShape(192U), 1, DataType::F32), TensorInfo(TensorShape(192U), 1, 
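FullyConnectedLayer.cpp prepares the same split at the dataset level: the identity activation is pulled out of ActivationFunctionsQuantizedDataset into its own NoActivationFunctionsQuantizedDataset, and IgnoredQuantizationData is introduced for the cases where the fixture derives the quantization itself. Condensed, the helpers added by this hunk amount to:

using framework::dataset::make;

// Placeholder for test cases that exercise dynamic quantization: the value is not used,
// the fixture generates its own QuantizationInfo.
const auto IgnoredQuantizationData = make("IgnoredQuantizationInfo", { QuantizationInfo() });

// The no-activation case now only runs with dynamic quantization...
const auto NoActivationFunctionsQuantizedDataset = make("ActivationInfo", { ActivationLayerInfo() });

// ...so the quantized activation dataset keeps only the real activation functions.
const auto ActivationFunctionsQuantizedDataset = concat(concat(
    make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)),
    make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f))),
    make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.75f, 0.25f)));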
DataType::F32), TensorInfo(TensorShape(271U), 1, DataType::F32), TensorInfo(TensorShape(271U), 1, DataType::F32), })), - framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(271U, 3U), 1, DataType::F32), + make("OutputInfo",{ TensorInfo(TensorShape(271U, 3U), 1, DataType::F32), TensorInfo(TensorShape(192U, 4U), 1, DataType::F32), TensorInfo(TensorShape(192U, 4U), 1, DataType::F32), TensorInfo(TensorShape(271U, 3U), 1, DataType::F32), TensorInfo(TensorShape(271U, 3U), 1, DataType::F32), })), - framework::dataset::make("TransposeWeights",{ true, true, false, true, true })), - framework::dataset::make("ReshapedWeights",{ false, false, false, false, false})), - framework::dataset::make("Expected", { false, true, true, false, false })), + make("TransposeWeights",{ true, true, false, true, true })), + make("ReshapedWeights",{ false, false, false, false, false})), + make("Expected", { false, true, true, false, false })), input_info, weights_info, bias_info, output_info, transpose_weights, reshaped_weights, expected) { // Create Fully Connected layer info @@ -136,64 +147,64 @@ using CLFullyConnectedNoBiasFixture = FullyConnectedDynamicNoBiasFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(), - FullyConnectedParameters), - framework::dataset::make("DataType", DataType::F16)), +FIXTURE_DATA_TEST_CASE(RunSmall, CLFullyConnectedLayerFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(), + FullyConnectedParameters, + make("DataType", DataType::F16), ActivationFunctionsDataset)) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num); } -FIXTURE_DATA_TEST_CASE(RunLarge, CLFullyConnectedLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeFullyConnectedLayerDataset(), - FullyConnectedParameters), - framework::dataset::make("DataType", DataType::F16)), +FIXTURE_DATA_TEST_CASE(RunLarge, CLFullyConnectedLayerFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeFullyConnectedLayerDataset(), + FullyConnectedParameters, + make("DataType", DataType::F16), ActivationFunctionsDataset)) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num); } -FIXTURE_DATA_TEST_CASE(RunDynamicWeights, CLFullyConnectedLayerDynamicWeightsFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(), - framework::dataset::make("DataType", DataType::F16)), - framework::dataset::make("ActivationInfo", ActivationLayerInfo())), - framework::dataset::make("WeightsReshaped", { false, true }))) +FIXTURE_DATA_TEST_CASE(RunDynamicWeights, CLFullyConnectedLayerDynamicWeightsFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(), + make("DataType", DataType::F16), + make("ActivationInfo", ActivationLayerInfo()), + make("WeightsReshaped", { false, true }))) { } TEST_SUITE_END() TEST_SUITE(FP32) -FIXTURE_DATA_TEST_CASE(RunSmall, CLFullyConnectedLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(), FullyConnectedParameters), - framework::dataset::make("DataType", DataType::F32)), +FIXTURE_DATA_TEST_CASE(RunSmall, CLFullyConnectedLayerFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(), FullyConnectedParameters, + make("DataType", DataType::F32), ActivationFunctionsDataset)) { // Validate output 
validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0, abs_tolerance_f32); } -FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLFullyConnectedLayerMixedDataLayoutFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine( - framework::dataset::make("Input", TensorShape(9U, 5U, 7U)), - framework::dataset::make("Weights", TensorShape(315U, 271U))), - framework::dataset::make("Biases", TensorShape(271U))), - framework::dataset::make("Output", TensorShape(271U))), - FullyConnectedParameters), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)))) +FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLFullyConnectedLayerMixedDataLayoutFixture, framework::DatasetMode::PRECOMMIT, combine( + make("Input", TensorShape(9U, 5U, 7U)), + make("Weights", TensorShape(315U, 271U)), + make("Biases", TensorShape(271U)), + make("Output", TensorShape(271U)), + FullyConnectedParameters, + make("DataType", DataType::F32), + make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)))) { // Validate output validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0, abs_tolerance_f32); } -FIXTURE_DATA_TEST_CASE(RunDynamicWeights, CLFullyConnectedLayerDynamicWeightsFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("ActivationInfo", ActivationLayerInfo())), - framework::dataset::make("WeightsReshaped", { false, true }))) +FIXTURE_DATA_TEST_CASE(RunDynamicWeights, CLFullyConnectedLayerDynamicWeightsFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(), + make("DataType", DataType::F32), + make("ActivationInfo", ActivationLayerInfo()), + make("WeightsReshaped", { false, true }))) { } -FIXTURE_DATA_TEST_CASE(RunDynamicNoBias, CLFullyConnectedNoBiasFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("ActivationInfo", { ActivationLayerInfo(), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) })), - framework::dataset::make("WeightsReshaped", { false }))) +FIXTURE_DATA_TEST_CASE(RunDynamicNoBias, CLFullyConnectedNoBiasFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(), + make("DataType", DataType::F32), + make("ActivationInfo", { ActivationLayerInfo(), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) }), + make("WeightsReshaped", { false }))) { } -FIXTURE_DATA_TEST_CASE(RunLarge, CLFullyConnectedLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeFullyConnectedLayerDataset(), FullyConnectedParameters), - framework::dataset::make("DataType", DataType::F32)), +FIXTURE_DATA_TEST_CASE(RunLarge, CLFullyConnectedLayerFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeFullyConnectedLayerDataset(), FullyConnectedParameters, + make("DataType", DataType::F32), ActivationFunctionsDataset)) { // Validate output @@ -209,73 +220,130 @@ using CLFullyConnectedLayerQuantizedMixedDataLayoutFixture = FullyConnectedLayer TEST_SUITE(Quantized) TEST_SUITE(QASYMM8) -FIXTURE_DATA_TEST_CASE(RunSmall, CLFullyConnectedLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, - 
combine(combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(), FullyConnectedParameters), framework::dataset::make("DataType", DataType::QASYMM8)), QuantizationData), +FIXTURE_DATA_TEST_CASE(RunSmallWithActivation, CLFullyConnectedLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, + combine(datasets::SmallFullyConnectedLayerDataset(), FullyConnectedParameters, make("DataType", DataType::QASYMM8), QuantizationData, ActivationFunctionsQuantizedDataset)) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLFullyConnectedLayerQuantizedMixedDataLayoutFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(combine(combine(combine(combine( - framework::dataset::make("Input", TensorShape(9U, 5U, 7U)), - framework::dataset::make("Weights", TensorShape(315U, 271U))), - framework::dataset::make("Biases", TensorShape(271U))), - framework::dataset::make("Output", TensorShape(271U))), - FullyConnectedParameters), - framework::dataset::make("DataType", DataType::QASYMM8)), - QuantizationData), - framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)))) +FIXTURE_DATA_TEST_CASE(RunMixedDataLayoutWithActivation, CLFullyConnectedLayerQuantizedMixedDataLayoutFixture, framework::DatasetMode::PRECOMMIT, + combine( + make("Input", TensorShape(9U, 5U, 7U)), + make("Weights", TensorShape(315U, 271U)), + make("Biases", TensorShape(271U)), + make("Output", TensorShape(271U)), + FullyConnectedParameters, + make("DataType", DataType::QASYMM8), + QuantizationData, + make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunLarge, CLFullyConnectedLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(combine(datasets::LargeFullyConnectedLayerDataset(), FullyConnectedParameters), framework::dataset::make("DataType", DataType::QASYMM8)), QuantizationData), +FIXTURE_DATA_TEST_CASE(RunLargeWithActivation, CLFullyConnectedLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, + combine(datasets::LargeFullyConnectedLayerDataset(), FullyConnectedParameters, make("DataType", DataType::QASYMM8), QuantizationData, ActivationFunctionsQuantizedDataset)) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunDynamicWeights, CLFullyConnectedLayerDynamicWeightsFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(), - framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("ActivationInfo", ActivationLayerInfo())), - framework::dataset::make("WeightsReshaped", { false /* COMPMID-6000: Support FullyConnected with quantized dynamic weights already reshaped */ }))) + +// Dynamic Quantization Tests +FIXTURE_DATA_TEST_CASE(RunSmall, CLFullyConnectedLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, + combine(datasets::SmallFullyConnectedLayerDataset(), FullyConnectedParameters, make("DataType", DataType::QASYMM8), IgnoredQuantizationData, + NoActivationFunctionsQuantizedDataset)) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(RunLarge, CLFullyConnectedLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, + combine(datasets::LargeFullyConnectedLayerDataset(), FullyConnectedParameters, 
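The new RunSmall/RunLarge/RunMixedDataLayout cases above hand the fixture an ignored QuantizationInfo and no activation, leaving the quantization to be generated inside the fixture. The fixture-side logic is not part of this patch; purely as an illustration of what "dynamic quantization" means here, one plausible scheme (a hypothetical helper, not library code) derives an asymmetric 8-bit QuantizationInfo from the range of the generated data:

#include <cmath>

// Hypothetical sketch: map [min_val, max_val] onto the full unsigned 8-bit range.
QuantizationInfo derive_qinfo_from_range(float min_val, float max_val)
{
    const float qrange = 255.f;
    const float scale  = (max_val > min_val) ? (max_val - min_val) / qrange : 1.f;
    const int   offset = static_cast<int>(std::lround(-min_val / scale));
    return QuantizationInfo(scale, offset);
}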
make("DataType", DataType::QASYMM8), IgnoredQuantizationData, + NoActivationFunctionsQuantizedDataset)) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(RunDynamicWeights, CLFullyConnectedLayerDynamicWeightsFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(), + make("DataType", DataType::QASYMM8), + NoActivationFunctionsQuantizedDataset, + make("WeightsReshaped", { false /* COMPMID-6000: Support FullyConnected with quantized dynamic weights already reshaped */ }))) { } + +FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLFullyConnectedLayerQuantizedMixedDataLayoutFixture, framework::DatasetMode::PRECOMMIT, + combine( + make("Input", TensorShape(9U, 5U, 7U)), + make("Weights", TensorShape(315U, 271U)), + make("Biases", TensorShape(271U)), + make("Output", TensorShape(271U)), + FullyConnectedParameters, + make("DataType", DataType::QASYMM8), + IgnoredQuantizationData, + NoActivationFunctionsQuantizedDataset)) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} + TEST_SUITE_END() /* QASYMM8 */ TEST_SUITE(QASYMM8_SIGNED) -FIXTURE_DATA_TEST_CASE(RunSmall, CLFullyConnectedLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(), FullyConnectedParameters), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), QuantizationData), +FIXTURE_DATA_TEST_CASE(RunSmallWithActivation, CLFullyConnectedLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, + combine(datasets::SmallFullyConnectedLayerDataset(), FullyConnectedParameters, make("DataType", DataType::QASYMM8_SIGNED), QuantizationData, ActivationFunctionsQuantizedDataset)) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8); } +FIXTURE_DATA_TEST_CASE(RunMixedDataLayoutWithActivation, CLFullyConnectedLayerQuantizedMixedDataLayoutFixture, framework::DatasetMode::PRECOMMIT, + combine( + make("Input", TensorShape(9U, 5U, 7U)), + make("Weights", TensorShape(315U, 271U)), + make("Biases", TensorShape(271U)), + make("Output", TensorShape(271U)), + FullyConnectedParameters, + make("DataType", DataType::QASYMM8_SIGNED), + QuantizationData, + make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} + +// Dynamic Quantization tests below +FIXTURE_DATA_TEST_CASE(RunSmall, CLFullyConnectedLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, + combine(datasets::SmallFullyConnectedLayerDataset(), FullyConnectedParameters, make("DataType", DataType::QASYMM8_SIGNED), IgnoredQuantizationData, + NoActivationFunctionsQuantizedDataset)) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} + FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLFullyConnectedLayerQuantizedMixedDataLayoutFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(combine(combine(combine(combine( - framework::dataset::make("Input", TensorShape(9U, 5U, 7U)), - framework::dataset::make("Weights", TensorShape(315U, 271U))), - framework::dataset::make("Biases", TensorShape(271U))), - framework::dataset::make("Output", TensorShape(271U))), - FullyConnectedParameters), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), - QuantizationData), - framework::dataset::make("ActivationInfo", 
ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)))) + combine( + make("Input", TensorShape(9U, 5U, 7U)), + make("Weights", TensorShape(315U, 271U)), + make("Biases", TensorShape(271U)), + make("Output", TensorShape(271U)), + FullyConnectedParameters, + make("DataType", DataType::QASYMM8_SIGNED), + IgnoredQuantizationData, + NoActivationFunctionsQuantizedDataset)) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunDynamicWeights, CLFullyConnectedLayerDynamicWeightsFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), - framework::dataset::make("ActivationInfo", ActivationLayerInfo())), - framework::dataset::make("WeightsReshaped", { false /* COMPMID-6000: Support FullyConnected with quantized dynamic weights already reshaped */ }))) + +FIXTURE_DATA_TEST_CASE(RunDynamicWeights, CLFullyConnectedLayerDynamicWeightsFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(), + make("DataType", DataType::QASYMM8_SIGNED), + make("ActivationInfo", ActivationLayerInfo()), + make("WeightsReshaped", { false /* COMPMID-6000: Support FullyConnected with quantized dynamic weights already reshaped */ }))) { } -FIXTURE_DATA_TEST_CASE(RunDynamicNoBias, CLFullyConnectedNoBiasFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), - framework::dataset::make("ActivationInfo", ActivationLayerInfo())), - framework::dataset::make("WeightsReshaped", { false /* COMPMID-6000: Support FullyConnected with quantized dynamic weights already reshaped */ }))) +FIXTURE_DATA_TEST_CASE(RunDynamicNoBias, CLFullyConnectedNoBiasFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(), + make("DataType", DataType::QASYMM8_SIGNED), + make("ActivationInfo", ActivationLayerInfo()), + make("WeightsReshaped", { false /* COMPMID-6000: Support FullyConnected with quantized dynamic weights already reshaped */ }))) { } TEST_SUITE_END() // QASYMM8_SIGNED diff --git a/tests/validation/CL/GEMMMatrixMultiplyNative.cpp b/tests/validation/CL/GEMMMatrixMultiplyNative.cpp index 7f63a03371741b4979acc7394271cd557b1b4a83..0ddf43766f78ec7b495215f0e9d916618f2fee78 100644 --- a/tests/validation/CL/GEMMMatrixMultiplyNative.cpp +++ b/tests/validation/CL/GEMMMatrixMultiplyNative.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022 Arm Limited. + * Copyright (c) 2019-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -53,11 +53,6 @@ using CLGEMMMatrixMultiplyNative = CLSynthetizeOperator using CLGEMMMatrixMultiplyNativeFixture = GEMMMatrixMultiplyNativeValidationFixture; -// Fixture for CLGEMMMatrixMultiplyNative with post ops -template -using CLGEMMMatrixMultiplyNativeWithPostOpsFixture = - GEMMMatrixMultiplyNativeWithPostOpsValidationFixture; - // Fixture for CLGEMMMatrixMultiplyNative3D template using CLGEMMMatrixMultiplyNative3DFixture = GEMMMatrixMultiplyNative3DValidationFixture; @@ -146,105 +141,6 @@ const auto boundary_handling_cases = combine(combine(combine(combine(combine(com broadcast_bias_values), framework::dataset::make("Activation", ActivationLayerInfo())); -/** Post Ops */ -using PostOpArgBroadcast = CLGEMMMatrixMultiplyNativeWithPostOpsFixture::PostOpArgBroadcast; -experimental::PostOpList post_ops_1() -{ - experimental::PostOpList post_ops{}; - post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F}); - post_ops.push_back_op>( - std::make_tuple(true, true, false), // If broadcast in dims 0, 1 and 2 - 0, - ConvertPolicy::SATURATE); - post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); - return post_ops; -} -experimental::PostOpList post_ops_2() -{ - experimental::PostOpList post_ops{}; - post_ops.push_back_op>( - std::make_tuple(false, true, true), // If broadcast in dims 0, 1 and 2 - 1, - ConvertPolicy::SATURATE); - post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); - return post_ops; -} -experimental::PostOpList post_ops_3() -{ - experimental::PostOpList post_ops{}; - // post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); - post_ops.push_back_op>( - std::make_tuple(false, false, false), // If broadcast in dims 0, 1 and 2 - 1, - ConvertPolicy::SATURATE); - return post_ops; -} -// To test that the output of the main op is the first parameter in prelu post op -experimental::PostOpList post_ops_4() -{ - experimental::PostOpList post_ops{}; - post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F}); - post_ops.push_back_op>( - std::make_tuple(false, false, true), // If true, broadcast in corresponding dim: 0, 1 or 2 - 0, - ConvertPolicy::SATURATE); - post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); - return post_ops; -} -// To test that the output of the main op is the second parameter in prelu post op i.e. 
it is the alpha_param -experimental::PostOpList post_ops_5() -{ - experimental::PostOpList post_ops{}; - post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F}); - post_ops.push_back_op>( - std::make_tuple(false, false, false), // If true, broadcast in corresponding dim: 0, 1 or 2 - 1, - ConvertPolicy::SATURATE); - post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); - return post_ops; -} -/** Different Post Op Lists */ -const auto post_op_lists = framework::dataset::make("post_op_lists", { - post_ops_1(), - post_ops_2(), - post_ops_3(), - post_ops_4(), - post_ops_5() - } ); - -bool is_post_op_list_valid(unsigned int m, unsigned int n, unsigned int k, unsigned int batch, DataType data_type, const experimental::PostOpList& post_ops) -{ - const auto lhs_info = GEMMLHSMatrixInfo(4,4,1,false,true); - const auto rhs_info = GEMMRHSMatrixInfo(4,4,1,true,true,false); - - // Create TensorInfo for post op arguments - TensorInfo input0_info(TensorShape(k, m, batch), 1, data_type); - TensorInfo input1_info(TensorShape(n, k, batch), 1, data_type); - TensorInfo input2_info(TensorShape(n), 1, data_type); - TensorInfo output_info(TensorShape(n, m, batch), 1, data_type); - - GEMMKernelInfo gemm_info(m, n, k, 0 /**< Depth of the output tensor in case is reinterpreted as 3D */, - false /**< reinterpret the input as 3D */, - true /**< Flag used to broadcast the bias addition */, - false /**< wider accumm */, - false /**< has pad y */, - ActivationLayerInfo::ActivationFunction::IDENTITY, - 1 /**< Multiplication factor for the width of the 1xW transposed block */, - 1 /**< Multiplication factor for the height of the 4x4 interleaved block */, - lhs_info, - rhs_info, - 0 /**< Offset to be added to each element of the matrix A */, - 0 /**< Offset to be added to each element of the matrix B */, - post_ops); - return bool(ClGemmMatrixMultiplyNativeKernel::validate(&input0_info.clone()->set_is_resizable(true), - &input1_info.clone()->set_is_resizable(true), - &input2_info.clone()->set_is_resizable(true), - &output_info.clone()->set_is_resizable(true),1.f,1.f, - lhs_info, - rhs_info, - gemm_info)); -} - /** Configuration test */ void validate_configuration(unsigned int m_value, unsigned int n_value, unsigned int k_value, unsigned int b_value, unsigned int m0_value, unsigned int n0_value, unsigned int k0_value, bool broadcast_bias, DataType data_type, const ActivationLayerInfo &act_info) { @@ -295,119 +191,6 @@ void validate_configuration(unsigned int m_value, unsigned int n_value, unsigned TEST_SUITE(CL) TEST_SUITE(GEMMMatrixMultiplyNative) -TEST_SUITE(ValidateFusedPostOpsConfigs) -TEST_SUITE(Invalid) -TEST_CASE(UnsupportedPostOpSequence, framework::DatasetMode::ALL) -{ - const auto data_type = DataType::F32; - const unsigned int m = 17; - const unsigned int n = 1; - const unsigned int k = 13; - const unsigned int batch = 2; - TensorShape post_op_arg0_shape(n, m, batch); - TensorInfo post_op_arg_info(post_op_arg0_shape, 1, data_type); - auto post_op_arg1_info = post_op_arg_info.clone(); - - // Unsupported sequence of post ops - experimental::PostOpList post_ops{}; - post_ops.push_back_op>( - &post_op_arg_info, - 1, - ConvertPolicy::SATURATE); - post_ops.push_back_op>( - post_op_arg1_info.get(), - 0, - ConvertPolicy::SATURATE); - - ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS); -} -TEST_CASE(OutputWidened, framework::DatasetMode::ALL) -{ - // Invalid 
broadcast: post op tensors "widen" the output tensor - const auto data_type = DataType::F32; - const unsigned int m = 1; - const unsigned int n = 18; - const unsigned int k = 13; - const unsigned int batch = 2; - TensorShape post_op_arg_shape(n, m + 1, batch); // output's Y dimension (m) is "widened", which is not allowed - TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type); - experimental::PostOpList post_ops{}; - post_ops.push_back_op>( &post_op_arg_info, 0, ConvertPolicy::SATURATE); - - ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS); -} -TEST_CASE(BroadcastInXDimOnly, framework::DatasetMode::ALL) -{ - // Invalid broadcast: post op tensors broadcast in the first dimension (X) only - const auto data_type = DataType::F32; - const unsigned int m = 22; - const unsigned int n = 16; - const unsigned int k = 15; - const unsigned int batch = 3; - TensorShape post_op_arg_shape(1, m, batch); - TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type); - experimental::PostOpList post_ops{}; - post_ops.push_back_op>( &post_op_arg_info, 0, ConvertPolicy::SATURATE); - - ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS); -} -TEST_SUITE_END() // Invalid -TEST_SUITE(Valid) -TEST_CASE(EmptyPostOpList, framework::DatasetMode::ALL) -{ - const auto data_type = DataType::F32; - const unsigned int m = 22; - const unsigned int n = 16; - const unsigned int k = 15; - const unsigned int batch = 3; - experimental::PostOpList post_ops{}; - - ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS); -} -TEST_CASE(BroadcastInYDimOnly, framework::DatasetMode::ALL) -{ - const auto data_type = DataType::F32; - const unsigned int m = 22; - const unsigned int n = 16; - const unsigned int k = 15; - const unsigned int batch = 3; - TensorShape post_op_arg_shape(n, 1, batch); - TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type); - experimental::PostOpList post_ops{}; - post_ops.push_back_op>( &post_op_arg_info, 0, ConvertPolicy::SATURATE); - - ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS); -} -TEST_CASE(BroadcastInBothXandYDims, framework::DatasetMode::ALL) -{ - const auto data_type = DataType::F32; - const unsigned int m = 22; - const unsigned int n = 16; - const unsigned int k = 15; - const unsigned int batch = 3; - TensorShape post_op_arg_shape(1, 1, batch); - TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type); - experimental::PostOpList post_ops{}; - post_ops.push_back_op>( &post_op_arg_info, 0, ConvertPolicy::SATURATE); - - ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS); -} -TEST_CASE(BroadcastInAllDims, framework::DatasetMode::ALL) -{ - const auto data_type = DataType::F32; - const unsigned int m = 22; - const unsigned int n = 16; - const unsigned int k = 15; - const unsigned int batch = 3; - TensorShape post_op_arg_shape(1, 1, 1); - TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type); - experimental::PostOpList post_ops{}; - post_ops.push_back_op>( &post_op_arg_info, 0, ConvertPolicy::SATURATE); - - ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS); -} -TEST_SUITE_END() // Valid -TEST_SUITE_END() // ValidateFusedPostOps TEST_SUITE(Float) TEST_SUITE(FP32) DATA_TEST_CASE(Configuration, 
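For reference, the deleted ValidateFusedPostOpsConfigs cases encoded a small set of shape rules for fused post-op arguments. A condensed paraphrase of what those Valid/Invalid cases asserted (this predicate is a summary written for this note, not library code):

// A post-op argument of shape (x, y, z) fused onto a GEMM output of shape (n, m, batch) was
// accepted only if it never widened the output and never broadcast in the X dimension alone.
bool post_op_arg_shape_was_valid(unsigned int x, unsigned int y, unsigned int z,
                                 unsigned int n, unsigned int m, unsigned int batch)
{
    const bool widens_output    = (x > n) || (y > m) || (z > batch);
    const bool broadcast_x_only = (x == 1) && (n > 1) && (y == m) && (z == batch);
    return !widens_output && !broadcast_x_only;
}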
framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(combine( @@ -541,31 +324,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyNative3DFixture, f validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); } -TEST_SUITE(FusedPostOps) - -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyNativeWithPostOpsFixture, framework::DatasetMode::ALL, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( - m_values, - n_values), - k_values), - b_values), - framework::dataset::make("M0", { 4 })), - n0_values_precommit), - k0_values_precommit), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("alpha", {1.0f} )), - framework::dataset::make("beta", {1.0f} )), - framework::dataset::make("broadcast_bias", { false, true } )), - framework::dataset::make("Activation", { ActivationLayerInfo() })), - post_op_lists) - ) -{ - // Validate output - validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); -} - -TEST_SUITE_END() // FusedPostOps - TEST_SUITE_END() // FP32 TEST_SUITE_END() // Float TEST_SUITE_END() // GEMMMatrixMulipltyNative diff --git a/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp b/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp index cdd89670fa71d2502673f8cd38a0b003516f8595..b06e4bf213de37a636cba813ecd884d0493d4731 100644 --- a/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp +++ b/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022 Arm Limited. + * Copyright (c) 2018-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,7 +23,6 @@ */ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/experimental/PostOps.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTensorAllocator.h" @@ -62,21 +61,11 @@ using CLGEMMMatrixMultiplyReshaped = CLSynthetizeOperator using CLGEMMMatrixMultiplyReshapedFixture = GEMMMatrixMultiplyReshapedValidationFixture; -// Fixture for CLGEMMMatrixMultiplyReshaped with post ops -template -using CLGEMMMatrixMultiplyReshapedWithPostOpsFixture = - GEMMMatrixMultiplyReshapedWithPostOpsValidationFixture; - // Fixture for CLGEMMMatrixMultiplyReshaped mixed precision template using CLGEMMMatrixMultiplyReshapedMixedPrecisionFixture = GEMMMatrixMultiplyReshapedValidationFixture; -// Fixture for CLGEMMMatrixMultiplyReshaped mixed precision with post ops -template -using CLGEMMMatrixMultiplyReshapedMixedPrecisionWithPostOpsFixture = - GEMMMatrixMultiplyReshapedWithPostOpsValidationFixture; - // Fixture for CLGEMMMatrixMultiplyReshaped3D template using CLGEMMMatrixMultiplyReshaped3DFixture = GEMMMatrixMultiplyReshaped3DValidationFixture; @@ -184,108 +173,6 @@ const auto broadcast_bias_values = framework::dataset::make("broadcast_bias", { /** LHS transposed values */ const auto lhs_transpose_values = framework::dataset::make("lhs_transpose", { false, true } ); -/** Post Ops */ -using PostOpArgBroadcast = CLGEMMMatrixMultiplyReshapedWithPostOpsFixture::PostOpArgBroadcast; -experimental::PostOpList post_ops_1() -{ - experimental::PostOpList post_ops{}; - post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F}); - post_ops.push_back_op>( - std::make_tuple(true, true, false), // If broadcast in dims 0, 1 and 2 - 0, - ConvertPolicy::SATURATE); - 
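With the experimental post-op plumbing removed, the validate-only checks in these GEMM suites reduce to plain kernel validation. A hedged sketch of what such a check looks like after this patch, reusing the tensor setup of the removed is_post_op_list_valid() helper (the exact GEMMKernelInfo constructor arguments and the example sizes are assumptions):

// Example problem sizes; any small GEMM shape works for a validate-only check.
const unsigned int m = 37, n = 31, k = 23, batch = 2;
const DataType     data_type = DataType::F32;

const auto lhs_info = GEMMLHSMatrixInfo(4, 4, 1, false, true);
const auto rhs_info = GEMMRHSMatrixInfo(4, 4, 1, true, true, false);

TensorInfo input0_info(TensorShape(k, m, batch), 1, data_type);
TensorInfo input1_info(TensorShape(n, k, batch), 1, data_type);
TensorInfo input2_info(TensorShape(n), 1, data_type);
TensorInfo output_info(TensorShape(n, m, batch), 1, data_type);

// Same arguments as before, minus the trailing PostOpList.
GEMMKernelInfo gemm_info(m, n, k, 0, false, true, false, false,
                         ActivationLayerInfo::ActivationFunction::IDENTITY, 1, 1,
                         lhs_info, rhs_info, 0, 0);

const bool ok = bool(ClGemmMatrixMultiplyNativeKernel::validate(&input0_info, &input1_info, &input2_info,
                                                                &output_info, 1.f, 1.f, lhs_info, rhs_info, gemm_info));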
post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); - return post_ops; -} -experimental::PostOpList post_ops_2() -{ - experimental::PostOpList post_ops{}; - post_ops.push_back_op>( - std::make_tuple(false, true, true), // If broadcast in dims 0, 1 and 2 - 1, - ConvertPolicy::SATURATE); - post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); - return post_ops; -} -experimental::PostOpList post_ops_3() -{ - experimental::PostOpList post_ops{}; - post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); - post_ops.push_back_op>( - std::make_tuple(false, false, true), // If broadcast in dims 0, 1 and 2 - 1, - ConvertPolicy::SATURATE); - return post_ops; -} -// To test that the output of the main op is the first parameter in prelu post op -experimental::PostOpList post_ops_4() -{ - experimental::PostOpList post_ops{}; - post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F}); - post_ops.push_back_op>( - std::make_tuple(false, false, true), // If true, broadcast in corresponding dim: 0, 1 or 2 - 0, - ConvertPolicy::SATURATE); - post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); - return post_ops; -} -// To test that the output of the main op is the second parameter in prelu post op i.e. it is the alpha_param -experimental::PostOpList post_ops_5() -{ - experimental::PostOpList post_ops{}; - post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F}); - post_ops.push_back_op>( - std::make_tuple(false, false, false), // If true, broadcast in corresponding dim: 0, 1 or 2 - 1, - ConvertPolicy::SATURATE); - post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); - return post_ops; -} -/** Different Post Op Lists */ -const auto post_op_lists = framework::dataset::make("post_op_lists", { - post_ops_1(), - post_ops_2(), - post_ops_3(), - post_ops_4(), - post_ops_5() - } ); - -bool is_post_op_list_valid(unsigned int m, unsigned int n, unsigned int k, unsigned int batch, DataType data_type, const experimental::PostOpList& post_ops) -{ - const auto lhs_info = GEMMLHSMatrixInfo(4,4,1,false,true); - const auto rhs_info = GEMMRHSMatrixInfo(4,4,1,true,true,false); - - // Create TensorInfo for post op arguments - TensorInfo input0_info(TensorShape(k, m, batch), 1, data_type); - TensorInfo input1_info(TensorShape(n, k, batch), 1, data_type); - TensorInfo input2_info(TensorShape(n), 1, data_type); - TensorInfo output_info(TensorShape(n, m, batch), 1, data_type); - - const TensorInfo reshaped_input0_info = input0_info.clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(input0_info, lhs_info)); - const TensorInfo reshaped_input1_info = input1_info.clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(input1_info, rhs_info)); - - GEMMKernelInfo gemm_info(m, n, k, 0 /**< Depth of the output tensor in case is reinterpreted as 3D */, - false /**< reinterpret the input as 3D */, - true /**< Flag used to broadcast the bias addition */, - false /**< wider accumm */, - false /**< has pad y */, - ActivationLayerInfo::ActivationFunction::IDENTITY, - 1 /**< Multiplication factor for the width of the 1xW transposed block */, - 1 /**< Multiplication factor for the height of the 4x4 interleaved block */, - lhs_info, - rhs_info, - 0 /**< Offset to 
be added to each element of the matrix A */, - 0 /**< Offset to be added to each element of the matrix B */, - post_ops); - return bool(ClGemmMatrixMultiplyReshapedKernel::validate(&reshaped_input0_info.clone()->set_is_resizable(true), - &reshaped_input1_info.clone()->set_is_resizable(true), - &input2_info.clone()->set_is_resizable(true), - &output_info.clone()->set_is_resizable(true),1.f,1.f, - lhs_info, - rhs_info, - gemm_info)); -} - } // namespace TEST_SUITE(CL) @@ -450,119 +337,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi rhs_info, gemm_info)) == expected, framework::LogLevel::ERRORS); } -TEST_SUITE(ValidateFusedPostOpsConfigs) -TEST_SUITE(Invalid) -TEST_CASE(UnsupportedPostOpSequence, framework::DatasetMode::ALL) -{ - const auto data_type = DataType::F32; - const unsigned int m = 17; - const unsigned int n = 1; - const unsigned int k = 13; - const unsigned int batch = 2; - TensorShape post_op_arg0_shape(n, m, batch); - TensorInfo post_op_arg_info(post_op_arg0_shape, 1, data_type); - auto post_op_arg1_info = post_op_arg_info.clone(); - - // Unsupported sequence of post ops - experimental::PostOpList post_ops{}; - post_ops.push_back_op>( - &post_op_arg_info, - 1, - ConvertPolicy::SATURATE); - post_ops.push_back_op>( - post_op_arg1_info.get(), - 0, - ConvertPolicy::SATURATE); - - ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS); -} -TEST_CASE(OutputWidened, framework::DatasetMode::ALL) -{ - // Invalid broadcast: post op tensors "widen" the output tensor - const auto data_type = DataType::F32; - const unsigned int m = 17; - const unsigned int n = 1; - const unsigned int k = 13; - const unsigned int batch = 2; - TensorShape post_op_arg_shape(n + 4, m, batch); // output's X dimension (n) is "widened", which is not allowed - TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type); - experimental::PostOpList post_ops{}; - post_ops.push_back_op>( &post_op_arg_info, 0, ConvertPolicy::SATURATE); - - ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS); -} -TEST_CASE(BroadcastInXDimOnly, framework::DatasetMode::ALL) -{ - // Invalid broadcast: post op tensors broadcast in the first dimension (X) only - const auto data_type = DataType::F32; - const unsigned int m = 22; - const unsigned int n = 16; - const unsigned int k = 15; - const unsigned int batch = 3; - TensorShape post_op_arg_shape(1, m, batch); - TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type); - experimental::PostOpList post_ops{}; - post_ops.push_back_op>( &post_op_arg_info, 0, ConvertPolicy::SATURATE); - - ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS); -} -TEST_SUITE_END() // Invalid -TEST_SUITE(Valid) -TEST_CASE(EmptyPostOpList, framework::DatasetMode::ALL) -{ - const auto data_type = DataType::F32; - const unsigned int m = 22; - const unsigned int n = 16; - const unsigned int k = 15; - const unsigned int batch = 3; - experimental::PostOpList post_ops{}; - - ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS); -} -TEST_CASE(BroadcastInYDimOnly, framework::DatasetMode::ALL) -{ - const auto data_type = DataType::F32; - const unsigned int m = 22; - const unsigned int n = 16; - const unsigned int k = 15; - const unsigned int batch = 3; - TensorShape post_op_arg_shape(n, 1, batch); - TensorInfo 
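The only detail specific to the reshaped variants in the helper being deleted here is that validation runs against the reshaped LHS/RHS tensor infos. In isolation, and reusing the tensor infos and GEMM matrix infos from the sketch above, that step looks roughly like:

// Reshape the logical LHS/RHS TensorInfos the way the reshaped GEMM kernels expect them.
const TensorInfo reshaped_input0_info = input0_info.clone()->set_tensor_shape(
    misc::shape_calculator::compute_lhs_reshaped_shape(input0_info, lhs_info));
const TensorInfo reshaped_input1_info = input1_info.clone()->set_tensor_shape(
    misc::shape_calculator::compute_rhs_reshaped_shape(input1_info, rhs_info));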
post_op_arg_info(post_op_arg_shape, 1, data_type); - experimental::PostOpList post_ops{}; - post_ops.push_back_op>( &post_op_arg_info, 0, ConvertPolicy::SATURATE); - - ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS); -} -TEST_CASE(BroadcastInBothXandYDims, framework::DatasetMode::ALL) -{ - const auto data_type = DataType::F32; - const unsigned int m = 22; - const unsigned int n = 16; - const unsigned int k = 15; - const unsigned int batch = 3; - TensorShape post_op_arg_shape(1, 1, batch); - TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type); - experimental::PostOpList post_ops{}; - post_ops.push_back_op>( &post_op_arg_info, 0, ConvertPolicy::SATURATE); - - ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS); -} -TEST_CASE(BroadcastInAllDims, framework::DatasetMode::ALL) -{ - const auto data_type = DataType::F32; - const unsigned int m = 22; - const unsigned int n = 16; - const unsigned int k = 15; - const unsigned int batch = 3; - TensorShape post_op_arg_shape(1, 1, 1); - TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type); - experimental::PostOpList post_ops{}; - post_ops.push_back_op>( &post_op_arg_info, 0, ConvertPolicy::SATURATE); - - ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS); -} -TEST_SUITE_END() // Valid -TEST_SUITE_END() // ValidateFusedPostOps + TEST_SUITE(Float) TEST_SUITE(FP32) @@ -697,44 +472,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture, framework::ARM_COMPUTE_PRINT_INFO(); } } -TEST_SUITE(FusedPostOps) - -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedWithPostOpsFixture, framework::DatasetMode::ALL, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( - m_values, - n_values), - k_values), - b_values), - m0_values_precommit), - n0_values_precommit), - k0_values_precommit), - v0_values_precommit), - h0_values_precommit), - framework::dataset::make("interleave_lhs", { false })), - framework::dataset::make("interleave_rhs", { false })), - framework::dataset::make("export_to_cl_image_rhs", false)), - framework::dataset::make("DataType", DataType::F32)), - a_values_precommit), - beta_values_precommit), - framework::dataset::make("broadcast_bias", { true } )), - lhs_transpose_values), - act_values), - post_op_lists) - ) -{ - // Validate output - if(validate_result) - { - validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); - } - else - { - ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. 
TEST skipped"); - framework::ARM_COMPUTE_PRINT_INFO(); - } -} - -TEST_SUITE_END() // FusedPostOps TEST_SUITE(ExportToCLImage) DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip( @@ -1002,44 +739,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture, framework::ARM_COMPUTE_PRINT_INFO(); } } -TEST_SUITE(FusedPostOps) - -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedWithPostOpsFixture, framework::DatasetMode::ALL, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( - m_values, - n_values), - k_values), - b_values), - m0_values_precommit), - n0_values_precommit), - k0_values_precommit), - v0_values_precommit), - h0_values_precommit), - framework::dataset::make("interleave_lhs", { false })), - framework::dataset::make("interleave_rhs", { false })), - framework::dataset::make("export_to_cl_image_rhs", true)), - framework::dataset::make("DataType", DataType::F32)), - a_values_precommit), - beta_values_precommit), - framework::dataset::make("broadcast_bias", { true } )), - lhs_transpose_values), - act_values), - post_op_lists) - ) -{ - // Validate output only if validate() is successful - if(validate_result) - { - validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); - } - else - { - ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped"); - framework::ARM_COMPUTE_PRINT_INFO(); - } -} - -TEST_SUITE_END() // FusedPostOps TEST_SUITE_END() // ExportToCLImage TEST_SUITE_END() // FP32 @@ -1178,45 +877,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture, } } -TEST_SUITE(FusedPostOps) - -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedWithPostOpsFixture, framework::DatasetMode::ALL, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( - m_values, - n_values), - k_values), - b_values), - m0_values_precommit), - n0_values_precommit), - k0_values_precommit), - v0_values_precommit), - h0_values_precommit), - framework::dataset::make("interleave_lhs", { false })), - framework::dataset::make("interleave_rhs", { false })), - framework::dataset::make("export_to_cl_image_rhs", false)), - framework::dataset::make("DataType", DataType::F16)), - a_values_precommit), - beta_values_precommit), - framework::dataset::make("broadcast_bias", { true } )), - lhs_transpose_values), - act_values), - post_op_lists) - ) -{ - // Validate output - if(validate_result) - { - validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16); - } - else - { - ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. 
TEST skipped"); - framework::ARM_COMPUTE_PRINT_INFO(); - } -} - -TEST_SUITE_END() // FusedPostOps - TEST_SUITE(ExportToCLImage) DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip( framework::dataset::make("Input0Info", { TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F16), // OK or incorrect if cl_khr_image2d_from_buffer not supported @@ -1483,44 +1143,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture, framework::ARM_COMPUTE_PRINT_INFO(); } } -TEST_SUITE(FusedPostOps) - -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedWithPostOpsFixture, framework::DatasetMode::ALL, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( - m_values, - n_values), - k_values), - b_values), - m0_values_precommit), - n0_values_precommit), - k0_values_precommit), - v0_values_precommit), - h0_values_precommit), - framework::dataset::make("interleave_lhs", { false })), - framework::dataset::make("interleave_rhs", { false })), - framework::dataset::make("export_to_cl_image_rhs", true)), - framework::dataset::make("DataType", DataType::F16)), - a_values_precommit), - beta_values_precommit), - framework::dataset::make("broadcast_bias", { true } )), - lhs_transpose_values), - act_values), - post_op_lists) - ) -{ - // Validate output only if validate() is successful - if(validate_result) - { - validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16); - } - else - { - ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped"); - framework::ARM_COMPUTE_PRINT_INFO(); - } -} - -TEST_SUITE_END() // FusedPostOps TEST_SUITE_END() // ExportToCLImage TEST_SUITE_END() // FP16 @@ -1659,45 +1281,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DMixedPrecisionF } } -TEST_SUITE(FusedPostOps) - -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedMixedPrecisionWithPostOpsFixture, framework::DatasetMode::ALL, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( - m_values, - n_values), - k_values), - b_values), - m0_values_precommit), - n0_values_precommit), - k0_values_precommit), - v0_values_precommit), - h0_values_precommit), - framework::dataset::make("interleave_lhs", { false })), - framework::dataset::make("interleave_rhs", { false })), - framework::dataset::make("export_to_cl_image_rhs", { true, false })), - framework::dataset::make("DataType", DataType::F16)), - a_values_precommit), - beta_values_precommit), - framework::dataset::make("broadcast_bias", { true } )), - lhs_transpose_values), - act_values), - post_op_lists) - ) -{ - // Validate output - if(validate_result) - { - validate(CLAccessor(_target), _reference, rel_tolerance_f16_mixed_precision, 0.f, abs_tolerance_f16_mixed_precision); - } - else - { - ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. 
TEST skipped"); - framework::ARM_COMPUTE_PRINT_INFO(); - } -} - -TEST_SUITE_END() // FusedPostOps - TEST_SUITE_END() // MixedPrecision TEST_SUITE_END() // Float TEST_SUITE_END() // GEMMMatrixMultiplyReshaped diff --git a/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp b/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp index 53038c8177ea9007dba2e3b040d8f4ea63384015..dafc8dc5ecdc8ec0a9bad972d9527da4d251fc2c 100644 --- a/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp +++ b/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022 Arm Limited. + * Copyright (c) 2019-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,7 +23,6 @@ */ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/experimental/PostOps.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTensorAllocator.h" @@ -62,11 +61,6 @@ using CLGEMMMatrixMultiplyReshapedOnlyRHSFixture = GEMMMatrixMultiplyReshapedOnl template using CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixture = GEMMMatrixMultiplyReshapedOnlyRHS3DValidationFixture; -// Fixture for CLGEMMMatrixMultiplyReshapedOnlyRHS with post ops -template -using CLGEMMMatrixMultiplyReshapedOnlyRHSWithPostOpsFixture = - GEMMMatrixMultiplyReshapedOnlyRHSWithPostOpsValidationFixture; - namespace { // *INDENT-OFF* @@ -164,106 +158,6 @@ const auto boundary_handling_cases = combine(combine(combine(combine(combine(com broadcast_bias_values), framework::dataset::make("Activation", ActivationLayerInfo())); -/** Post Ops */ -using PostOpArgBroadcast = CLGEMMMatrixMultiplyReshapedOnlyRHSWithPostOpsFixture::PostOpArgBroadcast; -experimental::PostOpList post_ops_1() -{ - experimental::PostOpList post_ops{}; - post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F}); - post_ops.push_back_op>( - std::make_tuple(true, true, false), // If broadcast in dims 0, 1 and 2 - 0, - ConvertPolicy::SATURATE); - post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); - return post_ops; -} -experimental::PostOpList post_ops_2() -{ - experimental::PostOpList post_ops{}; - post_ops.push_back_op>( - std::make_tuple(false, true, true), // If broadcast in dims 0, 1 and 2 - 1, - ConvertPolicy::SATURATE); - post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); - return post_ops; -} -experimental::PostOpList post_ops_3() -{ - experimental::PostOpList post_ops{}; - post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); - post_ops.push_back_op>( - std::make_tuple(false, false, true), // If broadcast in dims 0, 1 and 2 - 1, - ConvertPolicy::SATURATE); - return post_ops; -} -// To test that the output of the main op is the first parameter in prelu post op -experimental::PostOpList post_ops_4() -{ - experimental::PostOpList post_ops{}; - post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F}); - post_ops.push_back_op>( - std::make_tuple(false, false, true), // If true, broadcast in corresponding dim: 0, 1 or 2 - 0, - ConvertPolicy::SATURATE); - post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); - return post_ops; -} -// To test that the output of the main op is the second parameter in prelu post op i.e. 
it is the alpha_param -experimental::PostOpList post_ops_5() -{ - experimental::PostOpList post_ops{}; - post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F}); - post_ops.push_back_op>( - std::make_tuple(false, false, false), // If true, broadcast in corresponding dim: 0, 1 or 2 - 1, - ConvertPolicy::SATURATE); - post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); - return post_ops; -} -/** Different Post Op Lists */ -const auto post_op_lists = framework::dataset::make("post_op_lists", { - post_ops_1(), - post_ops_2(), - post_ops_3(), - post_ops_4(), - post_ops_5() - } ); - - bool is_post_op_list_valid(unsigned int m, unsigned int n, unsigned int k, unsigned int batch, DataType data_type, const experimental::PostOpList& post_ops) -{ - const auto lhs_info = GEMMLHSMatrixInfo(4,4,1,false,true); - const auto rhs_info = GEMMRHSMatrixInfo(4,4,1,true,true,false); - - // Create TensorInfo for post op arguments - TensorInfo input0_info(TensorShape(k, m, batch), 1, data_type); - TensorInfo input1_info(TensorShape(n, k, batch), 1, data_type); - TensorInfo input2_info(TensorShape(n), 1, data_type); - TensorInfo output_info(TensorShape(n, m, batch), 1, data_type); - - const TensorInfo reshaped_input1_info = input1_info.clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(input1_info, rhs_info)); - - GEMMKernelInfo gemm_info(m, n, k, 0 /**< Depth of the output tensor in case is reinterpreted as 3D */, - false /**< reinterpret the input as 3D */, - true /**< Flag used to broadcast the bias addition */, - false /**< wider accumm */, - false /**< has pad y */, - ActivationLayerInfo::ActivationFunction::IDENTITY, - 1 /**< Multiplication factor for the width of the 1xW transposed block */, - 1 /**< Multiplication factor for the height of the 4x4 interleaved block */, - lhs_info, - rhs_info, - 0 /**< Offset to be added to each element of the matrix A */, - 0 /**< Offset to be added to each element of the matrix B */, - post_ops); - return bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(&input0_info.clone()->set_is_resizable(true), - &reshaped_input1_info.clone()->set_is_resizable(true), - &input2_info.clone()->set_is_resizable(true), - &output_info.clone()->set_is_resizable(true),1.f,1.f, - lhs_info, - rhs_info, - gemm_info)); -} /** Configuration test */ bool validate_configuration(unsigned int m_value, unsigned int n_value, unsigned int k_value, unsigned int b_value, unsigned int m0_value, unsigned int n0_value, unsigned int k0_value, unsigned int h0_value, @@ -370,119 +264,6 @@ b_value, m0_value, n0_value, k0_value, broadcast_bias, input_as_3d, depth_output ARM_COMPUTE_EXPECT(status == expected_value, framework::LogLevel::ERRORS); } -TEST_SUITE(ValidateFusedPostOpsConfigs) -TEST_SUITE(Invalid) -TEST_CASE(UnsupportedPostOpSequence, framework::DatasetMode::ALL) -{ - const auto data_type = DataType::F32; - const unsigned int m = 17; - const unsigned int n = 1; - const unsigned int k = 13; - const unsigned int batch = 2; - TensorShape post_op_arg0_shape(n, m, batch); - TensorInfo post_op_arg_info(post_op_arg0_shape, 1, data_type); - auto post_op_arg1_info = post_op_arg_info.clone(); - - // Unsupported sequence of post ops - experimental::PostOpList post_ops{}; - post_ops.push_back_op>( - &post_op_arg_info, - 1, - ConvertPolicy::SATURATE); - post_ops.push_back_op>( - post_op_arg1_info.get(), - 0, - ConvertPolicy::SATURATE); - - ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, 
batch, data_type, post_ops) == false, framework::LogLevel::ERRORS); -} -TEST_CASE(OutputWidened, framework::DatasetMode::ALL) -{ - // Invalid broadcast: post op tensors "widen" the output tensor - const auto data_type = DataType::F32; - const unsigned int m = 17; - const unsigned int n = 1; - const unsigned int k = 1; - const unsigned int batch = 1; - TensorShape post_op_arg_shape(n, m, batch + 4); // output's batch dimension is "widened", which is not allowed - TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type); - experimental::PostOpList post_ops{}; - post_ops.push_back_op>( &post_op_arg_info, 0, ConvertPolicy::SATURATE); - - ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS); -} -TEST_CASE(BroadcastInXDimOnly, framework::DatasetMode::ALL) -{ - // Invalid broadcast: post op tensors broadcast in the first dimension (X) only - const auto data_type = DataType::F32; - const unsigned int m = 22; - const unsigned int n = 16; - const unsigned int k = 15; - const unsigned int batch = 3; - TensorShape post_op_arg_shape(1, m, batch); - TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type); - experimental::PostOpList post_ops{}; - post_ops.push_back_op>( &post_op_arg_info, 0, ConvertPolicy::SATURATE); - - ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS); -} -TEST_SUITE_END() // Invalid -TEST_SUITE(Valid) -TEST_CASE(EmptyPostOpList, framework::DatasetMode::ALL) -{ - const auto data_type = DataType::F32; - const unsigned int m = 22; - const unsigned int n = 16; - const unsigned int k = 15; - const unsigned int batch = 3; - experimental::PostOpList post_ops{}; - - ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS); -} -TEST_CASE(BroadcastInYDimOnly, framework::DatasetMode::ALL) -{ - const auto data_type = DataType::F32; - const unsigned int m = 22; - const unsigned int n = 16; - const unsigned int k = 15; - const unsigned int batch = 3; - TensorShape post_op_arg_shape(n, 1, batch); - TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type); - experimental::PostOpList post_ops{}; - post_ops.push_back_op>( &post_op_arg_info, 0, ConvertPolicy::SATURATE); - - ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS); -} -TEST_CASE(BroadcastInBothXandYDims, framework::DatasetMode::ALL) -{ - const auto data_type = DataType::F32; - const unsigned int m = 22; - const unsigned int n = 16; - const unsigned int k = 15; - const unsigned int batch = 3; - TensorShape post_op_arg_shape(1, 1, batch); - TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type); - experimental::PostOpList post_ops{}; - post_ops.push_back_op>( &post_op_arg_info, 0, ConvertPolicy::SATURATE); - - ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS); -} -TEST_CASE(BroadcastInAllDims, framework::DatasetMode::ALL) -{ - const auto data_type = DataType::F32; - const unsigned int m = 22; - const unsigned int n = 16; - const unsigned int k = 15; - const unsigned int batch = 3; - TensorShape post_op_arg_shape(1, 1, 1); - TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type); - experimental::PostOpList post_ops{}; - post_ops.push_back_op>( &post_op_arg_info, 0, ConvertPolicy::SATURATE); - - ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, 
framework::LogLevel::ERRORS); -} -TEST_SUITE_END() // Valid -TEST_SUITE_END() // ValidateFusedPostOps TEST_SUITE(Float) TEST_SUITE(FP32) @@ -684,43 +465,6 @@ FIXTURE_DATA_TEST_CASE(RunNightly3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixtur } } -TEST_SUITE(FusedPostOps) - -FIXTURE_DATA_TEST_CASE(RunPrecommit, CLGEMMMatrixMultiplyReshapedOnlyRHSWithPostOpsFixture, framework::DatasetMode::ALL, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( - m_values, - n_values), - k_values), - b_values), - m0_values_precommit), - n0_values_precommit), - k0_values_precommit), - framework::dataset::make("H0", {1})), - framework::dataset::make("interleave_rhs", { true })), - t_values_rhs), - framework::dataset::make("export_to_cl_image_rhs", {false, true})), - framework::dataset::make("DataType", DataType::F32)), - a_values), - beta_values), - framework::dataset::make("broadcast_bias", { false } )), - act_values), - post_op_lists) - ) -{ - // Validate output only if the target platform supports the OpenCL cl_khr_image2d_from_buffer extension - if(validate_result) - { - validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); - } - else - { - ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped"); - framework::ARM_COMPUTE_PRINT_INFO(); - } -} - -TEST_SUITE_END() // FusedPostOps - TEST_SUITE_END() // FP32 TEST_SUITE(FP16) @@ -849,42 +593,6 @@ FIXTURE_DATA_TEST_CASE(RunNightly3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixtur framework::ARM_COMPUTE_PRINT_INFO(); } } -TEST_SUITE(FusedPostOps) - -FIXTURE_DATA_TEST_CASE(RunPrecommit, CLGEMMMatrixMultiplyReshapedOnlyRHSWithPostOpsFixture, framework::DatasetMode::ALL, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( - m_values, - n_values), - k_values), - b_values), - m0_values_precommit), - n0_values_precommit), - k0_values_precommit), - framework::dataset::make("H0", {1})), - framework::dataset::make("interleave_rhs", { true })), - t_values_rhs), - framework::dataset::make("export_to_cl_image_rhs", true)), - framework::dataset::make("DataType", DataType::F16)), - a_values), - beta_values), - framework::dataset::make("broadcast_bias", { false } )), - act_values), - post_op_lists) - ) -{ - // Validate output only if the target platform supports the OpenCL cl_khr_image2d_from_buffer extension - if(validate_result) - { - validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16); - } - else - { - ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped"); - framework::ARM_COMPUTE_PRINT_INFO(); - } -} - -TEST_SUITE_END() // FusedPostOps TEST_SUITE_END() // FP16 diff --git a/tests/validation/CL/MatMulLowpNativeMMULKernel.cpp b/tests/validation/CL/MatMulLowpNativeMMULKernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ac46b67c9e0b0e960ae69974ee0b1ccf3615872c --- /dev/null +++ b/tests/validation/CL/MatMulLowpNativeMMULKernel.cpp @@ -0,0 +1,394 @@ +/* + * Copyright (c) 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/CL/CLTensor.h" + +#include "src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h" + +#include "tests/datasets/MatMulLowpMMULDataset.h" +#include "tests/framework/Macros.h" +#include "tests/framework/datasets/Datasets.h" +#include "tests/validation/Validation.h" +#include "tests/validation/fixtures/MatMulKernelFixture.h" +#include "tests/validation/reference/Permute.h" + +#include + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +namespace +{ +constexpr AbsoluteTolerance tolerance_quant(1); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */ +} +using framework::dataset::make; + +template +using CLMatMulLowpNativeMMULKernelFixture = MatMulKernelValidationFixture; + +template +using CLMatMulLowpNativeMMULKernelWithBiasFixture = MatMulKernelWithBiasValidation; + +/** M0 values to test --precommit*/ +const auto m0_values_precommit = framework::dataset::make("M0", { 1, 3 }); + +/** N0 values to test --precommit*/ +const auto n0_values_precommit = framework::dataset::make("N0", { 2, 4 }); + +/** M0 values to test --nightly*/ +const auto m0_values_nightly_lhs_nt = framework::dataset::make("M0", { 2, 4, 5, 8 }); +const auto m0_values_nightly_lhs_t = framework::dataset::make("M0", { 2, 4, 8 }); + +/** N0 values to test --nightly*/ +const auto n0_values_nightly = framework::dataset::make("N0", { 1, 3, 8, 16 }); + +TEST_SUITE(CL) +TEST_SUITE(MatMulLowpNativeMMULKernel) +TEST_SUITE(Validate) + +TEST_CASE(SupportedKernelConfigurations, framework::DatasetMode::ALL) +{ + using MatMulConfigurationPair = std::pair; + + const std::vector supported_block_sizes = + { + // MatMulKernelInfo(adj_lhs, adj_rhs, M0, N0, K0, export_rhs_to_cl_image = false) + { MatMulKernelInfo(false, false, 0, 1, 4), false }, // M0 should be > 0 + { MatMulKernelInfo(false, true, 3, 5, 4), false }, // N0 not in {1, 2, 3, 4, 8, 16} + { MatMulKernelInfo(false, false, 3, 6, 4), false }, // N0 not in {1, 2, 3, 4, 8, 16} + { MatMulKernelInfo(false, false, 3, 3, 8), false }, // K0 not in 4 + { MatMulKernelInfo(true, false, 5, 3, 4), false }, // M0 not in {1, 2, 3, 4, 8, 16} when Lhs is transposed + { MatMulKernelInfo(false, false, 9, 1, 4), true }, + { MatMulKernelInfo(false, true, 3, 16, 4), true }, + { MatMulKernelInfo(false, false, 7, 3, 4), true }, + { MatMulKernelInfo(true, false, 8, 3, 4), true }, + { 
MatMulKernelInfo(true, true, 4, 3, 4), true }, + { MatMulKernelInfo(false, false, 7, 3, 4, true), false }, // export to CLImage is unsupported for quantized types + }; + + // Set big enough shapes so that block sizes are not truncated. Also, set all dimensions equal + // so that it doesn't fail for different NT/T configurations. We aim to test the block sizes here, + // not the shapes themselves. + const TensorInfo lhs_info = TensorInfo(TensorShape(64U, 64U), 1, DataType::QASYMM8_SIGNED); + const TensorInfo rhs_info = TensorInfo(TensorShape(64U, 64U), 1, DataType::QASYMM8_SIGNED); + + for(auto &pair : supported_block_sizes) + { + TensorInfo output_info; + Status status = ClMatMulLowpNativeMMULKernel::validate(&lhs_info, &rhs_info, nullptr, &output_info, pair.first); + const bool expected = (pair.second && arm_matrix_multiply_supported(CLKernelLibrary::get().get_device())); + + ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS); + } +} + +TEST_CASE(ValidateInputShapes, framework::DatasetMode::ALL) +{ + // Configurations are assumed to be Nt/Nt, but will be transposed inside the test to test other configurations + using ShapeConfigurationTuple = std::tuple; + const std::vector shape_configurations = + { + { TensorShape(32U, 1U), TensorShape(3U, 32U), TensorShape(3U), true }, + { TensorShape(16U, 12U), TensorShape(3U, 16U), TensorShape(3U), true }, + { TensorShape(64U, 4U), TensorShape(2U, 64U), TensorShape(2U), true }, + { TensorShape(16U, 4U), TensorShape(2U, 32U), TensorShape(2U), false }, // Mismatch in the K dimension + { TensorShape(16U, 0U), TensorShape(2U, 16U), TensorShape(2U), false }, // Invalid dimension + { TensorShape(32U, 4U, 3U, 4U, 5U, 6U), TensorShape(2U, 32U, 3U, 4U, 5U, 6U), TensorShape(2U), true }, + { TensorShape(32U, 4U, 3U, 4U, 5U, 1U), TensorShape(2U, 32U, 3U, 4U, 5U, 6U), TensorShape(2U), false }, // no batch broadcasting + { TensorShape(32U, 4U, 3U, 4U, 9U, 6U), TensorShape(2U, 32U, 3U, 4U, 5U, 6U), TensorShape(2U), false }, // mismatch in batch dimension + { TensorShape(32U, 1U), TensorShape(3U, 32U), TensorShape(1U), false }, // invalid broadcast of bias + { TensorShape(32U, 1U), TensorShape(3U, 32U), TensorShape(3U, 3U), false }, // 2d bias is invalid + { TensorShape(12U, 12U), TensorShape(3U, 12U), TensorShape(3U), false }, // K must be multiple of 16 + }; + + for(auto &tuple : shape_configurations) + { + const bool expected = (std::get<3>(tuple) && arm_matrix_multiply_supported(CLKernelLibrary::get().get_device())); + + for(bool adj_lhs : + { + false, true + }) + { + for(bool adj_rhs : + { + false, true + }) + { + TensorShape lhs_shape = std::get<0>(tuple); + TensorShape rhs_shape = std::get<1>(tuple); + TensorShape bia_shape = std::get<2>(tuple); + + if(adj_lhs) + { + permute(lhs_shape, PermutationVector(1U, 0U)); + } + + if(adj_rhs) + { + permute(rhs_shape, PermutationVector(1U, 0U)); + } + + const TensorInfo lhs_info = TensorInfo(lhs_shape, 1, DataType::QASYMM8_SIGNED); + const TensorInfo rhs_info = TensorInfo(rhs_shape, 1, DataType::QASYMM8_SIGNED); + const TensorInfo bia_info = TensorInfo(bia_shape, 1, DataType::S32); + TensorInfo output_info; + + MatMulKernelInfo matmul_kernel_info{ adj_lhs, adj_rhs, 1, 1, 4, false /* export_rhs_to_cl_image */ }; + + Status status = ClMatMulLowpNativeMMULKernel::validate(&lhs_info, &rhs_info, &bia_info, &output_info, matmul_kernel_info); + ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS); + } + } + } +} + +TEST_CASE(ValidateDataTypes, framework::DatasetMode::ALL) +{ + using 
DataTypeConfigurationTuple = std::tuple; + const std::vector data_type_configurations = + { + { DataType::F32, DataType::F32, DataType::F32, DataType::F32, false }, // no floating point types + { DataType::F16, DataType::F16, DataType::F16, DataType::F16, false }, // no floating point types + { DataType::F64, DataType::F64, DataType::F64, DataType::F64, false }, // no double precision + { DataType::QASYMM8, DataType::QASYMM8, DataType::S32, DataType::QASYMM8, true }, + { DataType::QASYMM8_SIGNED, DataType::QASYMM8_SIGNED, DataType::S32, DataType::QASYMM8_SIGNED, true }, + { DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8_PER_CHANNEL, DataType::S32, DataType::QSYMM8_PER_CHANNEL, false }, // only qasymm8/qasymm8_signed is supported + { DataType::QASYMM16, DataType::QASYMM16, DataType::S32, DataType::QASYMM16, false }, // only qasymm8/qasymm8_signed is supported + { DataType::QSYMM16, DataType::QSYMM16, DataType::S32, DataType::QSYMM16, false }, // only qasymm8/qasymm8_signed is supported + { DataType::QSYMM8, DataType::QSYMM8, DataType::S32, DataType::QSYMM8, false }, // only qasymm8/qasymm8_signed is supported + { DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::QASYMM8, false }, // no mixed data types + { DataType::S64, DataType::S64, DataType::S64, DataType::S64, false }, // no integral types + { DataType::S32, DataType::S32, DataType::S32, DataType::S32, false }, // no integral types + { DataType::S16, DataType::S16, DataType::S16, DataType::S16, false }, // no integral types + { DataType::S8, DataType::S8, DataType::S8, DataType::S8, false }, // no integral types + { DataType::U64, DataType::U64, DataType::U64, DataType::U64, false }, // no integral types + { DataType::U32, DataType::U32, DataType::U32, DataType::U32, false }, // no integral types + { DataType::U16, DataType::U16, DataType::U16, DataType::U16, false }, // no integral types + { DataType::U8, DataType::U8, DataType::U8, DataType::U8, false }, // no integral types + { DataType::QASYMM8, DataType::QASYMM8, DataType::F32, DataType::QASYMM8, false } // Only S32 bias is supported + }; + + // It's enough to test a single shape and block size configuration while checking data types + const TensorShape shape = TensorShape(48U, 48U); + const TensorShape bia_shape = TensorShape(48U); + const MatMulKernelInfo matmul_kernel_info{ false, false, 1, 1, 4, false }; + for(auto &tuple : data_type_configurations) + { + const bool expected = (std::get<4>(tuple) && arm_matrix_multiply_supported(CLKernelLibrary::get().get_device())); + + const TensorInfo lhs_info(shape, 1, std::get<0>(tuple)); + const TensorInfo rhs_info(shape, 1, std::get<1>(tuple)); + const TensorInfo bia_info(bia_shape, 1, std::get<2>(tuple)); + TensorInfo output_info(shape, 1, std::get<3>(tuple)); + + Status status = ClMatMulLowpNativeMMULKernel::validate(&lhs_info, &rhs_info, &bia_info, &output_info, matmul_kernel_info); + + ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS); + } +} + +TEST_SUITE_END() // Validate + +TEST_SUITE(Quantized) +TEST_SUITE(QASYMM8_SIGNED) + +FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulLowpNativeMMULKernelFixture, + framework::DatasetMode::ALL, + combine(datasets::SmallMatMulLowpMMULDataset(), + make("TransposeA", { false, true }), + make("TransposeB", { false, true }), + m0_values_precommit, + n0_values_precommit, + make("K0", { 4 }), + make("ExportRhsToCLImage", { false }), + make("DataType", DataType::QASYMM8_SIGNED))) +{ + if(_device_supports_mmul) + { + // Validate output + 
validate(CLAccessor(_target), _reference, tolerance_quant); + } +} + +FIXTURE_DATA_TEST_CASE(RunWithBias, CLMatMulLowpNativeMMULKernelWithBiasFixture, + framework::DatasetMode::ALL, + combine(datasets::SmallMatMulLowpMMULWithBiasDataset(), + make("TransposeA", { false, true }), + make("TransposeB", { false, true }), + m0_values_precommit, + n0_values_precommit, + make("K0", { 4 }), + make("ExportRhsToCLImage", { false }), + make("DataType", DataType::QASYMM8_SIGNED))) +{ + if(_device_supports_mmul) + { + // Validate output + validate(CLAccessor(_target), _reference, tolerance_quant); + } +} + +FIXTURE_DATA_TEST_CASE(RunLargeLhsNotTransposed, CLMatMulLowpNativeMMULKernelFixture, + framework::DatasetMode::NIGHTLY, + combine(datasets::LargeMatMulLowpMMULDataset(), + make("TransposeA", { false }), + make("TransposeB", { false, true }), + m0_values_nightly_lhs_nt, + n0_values_nightly, + make("K0", { 4 }), + make("ExportRhsToCLImage", { false }), + make("DataType", DataType::QASYMM8_SIGNED))) +{ + if(_device_supports_mmul) + { + // Validate output + validate(CLAccessor(_target), _reference, tolerance_quant); + } +} + +FIXTURE_DATA_TEST_CASE(RunLargeLhsTransposed, CLMatMulLowpNativeMMULKernelFixture, + framework::DatasetMode::NIGHTLY, + combine(datasets::LargeMatMulLowpMMULDataset(), + make("TransposeA", { true }), + make("TransposeB", { false, true }), + m0_values_nightly_lhs_t, + n0_values_nightly, + make("K0", { 4 }), + make("ExportRhsToCLImage", { false }), + make("DataType", DataType::QASYMM8_SIGNED))) +{ + if(_device_supports_mmul) + { + // Validate output + validate(CLAccessor(_target), _reference, tolerance_quant); + } +} + +// Running High Dimensional test is enough for qasymm8_signed, because we're stressing the number of dimensions, not data type or M0/N0/K0 +// It's a good idea to test for each Lhs/Rhs T/NT combinations because they're different CL kernels +FIXTURE_DATA_TEST_CASE(RunHighDimensional, CLMatMulLowpNativeMMULKernelFixture, + framework::DatasetMode::ALL, + combine(datasets::HighDimensionalMatMulLowpMMULDataset(), + make("TransposeA", { false, true }), + make("TransposeB", { false, true }), + make("M0", { 2 }), + make("N0", { 2 }), + make("K0", { 4 }), + make("ExportRhsToCLImage", { false }), + make("DataType", DataType::QASYMM8_SIGNED))) +{ + if(_device_supports_mmul) + { + // Validate output + validate(CLAccessor(_target), _reference, tolerance_quant); + } +} + +TEST_SUITE_END() // QASYMM8_SIGNED + +TEST_SUITE(QASYMM8) + +FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulLowpNativeMMULKernelFixture, + framework::DatasetMode::ALL, + combine(datasets::SmallMatMulLowpMMULDatasetSubset(), + make("TransposeA", { false, true }), + make("TransposeB", { false, true }), + m0_values_precommit, + n0_values_precommit, + make("K0", { 4 }), + make("ExportRhsToCLImage", { false }), + make("DataType", DataType::QASYMM8))) +{ + if(_device_supports_mmul) + { + // Validate output + validate(CLAccessor(_target), _reference, tolerance_quant); + } +} + +FIXTURE_DATA_TEST_CASE(RunWithBias, CLMatMulLowpNativeMMULKernelWithBiasFixture, + framework::DatasetMode::ALL, + combine(datasets::SmallMatMulLowpMMULWithBiasDataset(), + make("TransposeA", { false, true }), + make("TransposeB", { false, true }), + m0_values_precommit, + n0_values_precommit, + make("K0", { 4 }), + make("ExportRhsToCLImage", { false }), + make("DataType", DataType::QASYMM8))) +{ + if(_device_supports_mmul) + { + // Validate output + validate(CLAccessor(_target), _reference, tolerance_quant); + } +} + 
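The constraint comments in the new MatMulLowpNativeMMULKernel tests above (M0 > 0; M0 restricted to {1, 2, 3, 4, 8, 16} when the LHS is transposed; N0 in {1, 2, 3, 4, 8, 16}; K0 fixed to 4; no CL-image export for quantized types; K a multiple of 16; only QASYMM8/QASYMM8_SIGNED inputs with an S32 bias) can be read together as one predicate. The sketch below covers only the block-size and shape part of those rules and assumes nothing beyond the test comments: it is illustration, not ClMatMulLowpNativeMMULKernel::validate(), and the struct and function names are hypothetical.

// Illustrative sketch only: mirrors the validity rules stated in the test comments above.
// KernelCfg, MatMulDims and lowp_mmul_config_is_valid are hypothetical names,
// not Arm Compute Library API.
#include <initializer_list>
#include <iostream>

struct KernelCfg  { bool adj_lhs; bool adj_rhs; int m0; int n0; int k0; bool export_rhs_to_cl_image; };
struct MatMulDims { int m; int n; int k; int batch; };

static bool value_in(int v, std::initializer_list<int> allowed)
{
    for(int a : allowed) { if(v == a) { return true; } }
    return false;
}

static bool lowp_mmul_config_is_valid(const KernelCfg &cfg, const MatMulDims &dims)
{
    if(cfg.m0 <= 0) { return false; }                                             // M0 should be > 0
    if(cfg.adj_lhs && !value_in(cfg.m0, {1, 2, 3, 4, 8, 16})) { return false; }   // M0 restricted when LHS is transposed
    if(!value_in(cfg.n0, {1, 2, 3, 4, 8, 16})) { return false; }                  // N0 must be in {1, 2, 3, 4, 8, 16}
    if(cfg.k0 != 4) { return false; }                                             // only K0 == 4 is accepted
    if(cfg.export_rhs_to_cl_image) { return false; }                              // CL-image export unsupported for quantized types
    if(dims.k <= 0 || (dims.k % 16) != 0) { return false; }                       // K must be a multiple of 16
    return true;
}

int main()
{
    std::cout << lowp_mmul_config_is_valid({false, false, 3, 2, 4, false}, {12, 3, 32, 1}) << '\n'; // 1: valid
    std::cout << lowp_mmul_config_is_valid({false, false, 3, 2, 8, false}, {12, 3, 32, 1}) << '\n'; // 0: K0 != 4
    std::cout << lowp_mmul_config_is_valid({false, false, 3, 2, 4, false}, {12, 3, 12, 1}) << '\n'; // 0: K not a multiple of 16
    return 0;
}

The tests themselves additionally gate every expectation on arm_matrix_multiply_supported(), since the MMUL kernels are only valid on devices exposing that extension.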
+FIXTURE_DATA_TEST_CASE(RunLargeLhsNotTransposed, CLMatMulLowpNativeMMULKernelFixture, + framework::DatasetMode::NIGHTLY, + combine(datasets::LargeMatMulLowpMMULDataset(), + make("TransposeA", { false }), + make("TransposeB", { false, true }), + m0_values_nightly_lhs_nt, + n0_values_nightly, + make("K0", { 4 }), + make("ExportRhsToCLImage", { false }), + make("DataType", DataType::QASYMM8))) +{ + if(_device_supports_mmul) + { + // Validate output + validate(CLAccessor(_target), _reference, tolerance_quant); + } +} + +FIXTURE_DATA_TEST_CASE(RunLargeLhsTransposed, CLMatMulLowpNativeMMULKernelFixture, + framework::DatasetMode::NIGHTLY, + combine(datasets::LargeMatMulLowpMMULDataset(), + make("TransposeA", { true }), + make("TransposeB", { false, true }), + m0_values_nightly_lhs_t, + n0_values_nightly, + make("K0", { 4 }), + make("ExportRhsToCLImage", { false }), + make("DataType", DataType::QASYMM8))) +{ + if(_device_supports_mmul) + { + // Validate output + validate(CLAccessor(_target), _reference, tolerance_quant); + } +} + +TEST_SUITE_END() // QASYMM8 +TEST_SUITE_END() // Quantized +TEST_SUITE_END() // MatMulLowpNativeMMULKernel +TEST_SUITE_END() // CL +} // namespace validation +} // namespace test +} // namespace arm_compute diff --git a/tests/validation/CL/MatMulNativeMMULKernel.cpp b/tests/validation/CL/MatMulNativeMMULKernel.cpp index 70c80985db1528be9d7ac9efb26274c5986be502..655dd354dcb74b3c4015e1a2a1dfd734ed8dab80 100644 --- a/tests/validation/CL/MatMulNativeMMULKernel.cpp +++ b/tests/validation/CL/MatMulNativeMMULKernel.cpp @@ -46,8 +46,8 @@ RelativeTolerance tolerance_f32(0.001f); /**< Tolerance value for compari constexpr float abs_tolerance_f32( 0.0001f); /**< Absolute tolerance value for comparing reference's output against implementation's output for floating point data types in case using relative tolerance fails because of small values */ constexpr float abs_tolerance_f16( - 0.001f); /**< Absolute tolerance value for comparing reference's output against implementation's output for fp16 data types in case using relative tolerance fails because of small values */ -RelativeTolerance tolerance_f16(half(0.01)); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */ + 0.02f); /**< Absolute tolerance value for comparing reference's output against implementation's output for fp16 data types in case using relative tolerance fails because of small values */ +RelativeTolerance tolerance_f16(half(0.02)); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */ } // namespace /** M0 values to test --precommit*/ diff --git a/tests/validation/CL/Reverse.cpp b/tests/validation/CL/Reverse.cpp index 11df0e78035e8118d03d86aec3a726d9c63c42a4..82effc2136b56ce66bf9b01245721b7dc4031052 100644 --- a/tests/validation/CL/Reverse.cpp +++ b/tests/validation/CL/Reverse.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2020, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -41,9 +41,10 @@ namespace test { namespace validation { +using framework::dataset::make; namespace { -auto run_small_dataset = combine(datasets::SmallShapes(), datasets::Tiny1DShapes()); +auto run_small_dataset = combine(datasets::Small3DShapes(), datasets::Tiny1DShapes()); auto run_large_dataset = combine(datasets::LargeShapes(), datasets::Tiny1DShapes()); } // namespace @@ -53,33 +54,34 @@ TEST_SUITE(Reverse) // *INDENT-OFF* // clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( - framework::dataset::make("InputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S8), // Invalid axis datatype + make("InputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S8), // Invalid axis datatype TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Invalid axis shape TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Invalid axis length (> 4) TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Mismatching shapes TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), TensorInfo(TensorShape(2U), 1, DataType::U8), }), - framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S8), + make("OutputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S8), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), TensorInfo(TensorShape(2U, 13U, 2U), 1, DataType::U8), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), TensorInfo(TensorShape(2U), 1, DataType::U8), })), - framework::dataset::make("AxisInfo",{ TensorInfo(TensorShape(3U), 1, DataType::U8), + make("AxisInfo",{ TensorInfo(TensorShape(3U), 1, DataType::U8), TensorInfo(TensorShape(2U, 10U), 1, DataType::U32), TensorInfo(TensorShape(8U), 1, DataType::U32), TensorInfo(TensorShape(2U), 1, DataType::U32), TensorInfo(TensorShape(2U), 1, DataType::U32), TensorInfo(TensorShape(2U), 1, DataType::U32), })), - framework::dataset::make("Expected", { false, false, false, false, true, true})), + make("Expected", { false, false, false, false, true, true})), src_info, dst_info, axis_info, expected) { Status s = CLReverse::validate(&src_info.clone()->set_is_resizable(false), &dst_info.clone()->set_is_resizable(false), - &axis_info.clone()->set_is_resizable(false)); + &axis_info.clone()->set_is_resizable(false), + false); ARM_COMPUTE_EXPECT(bool(s) == expected, framework::LogLevel::ERRORS); } // clang-format on @@ -93,7 +95,11 @@ TEST_SUITE(F16) FIXTURE_DATA_TEST_CASE(RunSmall, CLReverseFixture, framework::DatasetMode::PRECOMMIT, - combine(run_small_dataset, framework::dataset::make("DataType", DataType::F16))) + combine( + run_small_dataset, + make("DataType", DataType::F16), + make("use_negative_axis", { true, false }), + make("use_inverted_axis", { true, false }))) { // Validate output validate(CLAccessor(_target), _reference); @@ -102,7 +108,11 @@ FIXTURE_DATA_TEST_CASE(RunSmall, FIXTURE_DATA_TEST_CASE(RunLarge, CLReverseFixture, framework::DatasetMode::NIGHTLY, - combine(run_large_dataset, framework::dataset::make("DataType", DataType::F16))) + combine( + run_large_dataset, + make("DataType", DataType::F16), + make("use_negative_axis", { true, false }), + make("use_inverted_axis", { true, false }))) { // Validate output validate(CLAccessor(_target), _reference); @@ -113,7 +123,11 @@ TEST_SUITE(FP32) FIXTURE_DATA_TEST_CASE(RunSmall, CLReverseFixture, framework::DatasetMode::PRECOMMIT, - combine(run_small_dataset, framework::dataset::make("DataType", 
DataType::F32))) + combine( + run_small_dataset, + make("DataType", DataType::F32), + make("use_negative_axis", { true, false }), + make("use_inverted_axis", { true, false }))) { // Validate output validate(CLAccessor(_target), _reference); @@ -122,7 +136,11 @@ FIXTURE_DATA_TEST_CASE(RunSmall, FIXTURE_DATA_TEST_CASE(RunLarge, CLReverseFixture, framework::DatasetMode::NIGHTLY, - combine(run_large_dataset, framework::dataset::make("DataType", DataType::F32))) + combine( + run_large_dataset, + make("DataType", DataType::F32), + make("use_negative_axis", { true, false }), + make("use_inverted_axis", { true, false }))) { // Validate output validate(CLAccessor(_target), _reference); @@ -135,7 +153,11 @@ TEST_SUITE(QASYMM8) FIXTURE_DATA_TEST_CASE(RunSmall, CLReverseFixture, framework::DatasetMode::PRECOMMIT, - combine(run_small_dataset, framework::dataset::make("DataType", DataType::QASYMM8))) + combine( + run_small_dataset, + make("DataType", DataType::QASYMM8), + make("use_negative_axis", { true, false }), + make("use_inverted_axis", { true, false }))) { // Validate output validate(CLAccessor(_target), _reference); @@ -144,7 +166,11 @@ FIXTURE_DATA_TEST_CASE(RunSmall, FIXTURE_DATA_TEST_CASE(RunLarge, CLReverseFixture, framework::DatasetMode::NIGHTLY, - combine(run_large_dataset, framework::dataset::make("DataType", DataType::QASYMM8))) + combine( + run_large_dataset, + make("DataType", DataType::QASYMM8), + make("use_negative_axis", { true, false }), + make("use_inverted_axis", { true, false }))) { // Validate output validate(CLAccessor(_target), _reference); diff --git a/tests/validation/CL/Transpose.cpp b/tests/validation/CL/Transpose.cpp index 943534058bf45ebde7d3d4d0138d8cdece71f4ea..6cf5fe853787c8bb81503808be1df78310c0e5d1 100644 --- a/tests/validation/CL/Transpose.cpp +++ b/tests/validation/CL/Transpose.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -50,12 +50,14 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip( framework::dataset::make("InputInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::U16), // Invalid shape TensorInfo(TensorShape(20U, 13U), 1, DataType::U8), // Wrong data type TensorInfo(TensorShape(20U, 16U), 1, DataType::U32), // Valid + TensorInfo(TensorShape(20U, 16U, 3U, 3U), 1, DataType::U16), // Transpose only first two dimensions }), framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(13U, 20U), 1, DataType::U32), TensorInfo(TensorShape(31U, 20U), 1, DataType::U16), TensorInfo(TensorShape(16U, 20U), 1, DataType::U32), + TensorInfo(TensorShape(16U, 20U, 3U, 3U), 1, DataType::U16), })), - framework::dataset::make("Expected", { false, false, true })), + framework::dataset::make("Expected", { false, false, true, true })), a_info, output_info, expected) { // Lock tensors @@ -80,6 +82,16 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLTransposeFixture, framework::Dataset // Validate output validate(CLAccessor(_target), _reference); } +FIXTURE_DATA_TEST_CASE(RunLargeHighDimensional, + CLTransposeFixture, + framework::DatasetMode::NIGHTLY, + combine(concat(concat(datasets::Large3DShapes(), datasets::Large4DShapes()), + datasets::Large5dShapes()), + framework::dataset::make("DataType", DataType::U8))) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} TEST_SUITE_END() // U8 TEST_SUITE(U16) @@ -106,6 +118,15 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLTransposeFixture, framework::Datase // Validate output validate(CLAccessor(_target), _reference); } +FIXTURE_DATA_TEST_CASE(RunSmallHighDimensional, + CLTransposeFixture, + framework::DatasetMode::PRECOMMIT, + combine(concat(datasets::Small3DShapes(), datasets::Small4DShapes()), + framework::dataset::make("DataType", DataType::U32))) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} FIXTURE_DATA_TEST_CASE(RunLarge, CLTransposeFixture, framework::DatasetMode::NIGHTLY, combine(concat(datasets::Large1DShapes(), datasets::Large2DShapes()), framework::dataset::make("DataType", DataType::U32))) { diff --git a/tests/validation/CL/Winograd.cpp b/tests/validation/CL/Winograd.cpp index 6ac37d14755ef00475de652d00e0cd40689b6364..196e7edb8c369e568ec3528d038bd43911ce4dc5 100644 --- a/tests/validation/CL/Winograd.cpp +++ b/tests/validation/CL/Winograd.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -30,6 +30,7 @@ #include "tests/CL/CLAccessor.h" #include "tests/CL/Helper.h" #include "tests/PaddingCalculator.h" +#include "tests/datasets/ActivationFunctionsDataset.h" #include "tests/datasets/LargeConvolutionLayerDataset.h" #include "tests/datasets/ShapeDatasets.h" #include "tests/datasets/SmallConvolutionLayerDataset.h" @@ -47,6 +48,7 @@ namespace test { namespace validation { +using framework::dataset::make; namespace { // *INDENT-OFF* @@ -57,108 +59,232 @@ const AbsoluteTolerance tolerance_convolution_layer_f16(half(0.4f)); RelativeTolerance rel_tolerance_f16(half(0.2)); /**< Tolerance value for comparing reference's output against implementation's output for FP16 data types */ constexpr float tolerance_num = 0.05f; /**< Tolerance number */ constexpr float abs_tolerance_convolution_layer_f16 = 2.5f; /**< Tolerance number */ -constexpr float tolerance_num_f16 = 0.15f; /**< Tolerance number */ +constexpr float tolerance_num_f16 = 0.15f; /**< Tolerance number */ -//Activation Functions -const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo", +const auto ActivationFunctionsDataset = make("ActivationInfo", { - ActivationLayerInfo(), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU), - ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU), - ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU), - ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.8f), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SOFT_RELU), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ELU), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ABS), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SQUARE), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::HARD_SWISH), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 2.f, 1.f), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::GELU) }); -const auto ActivationFunctionsSmallDataset = framework::dataset::make("ActivationInfo", + +const auto ActivationFunctionsSmallDataset = make("ActivationInfo", { ActivationLayerInfo(), - ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU), - ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU), - ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SOFT_RELU) + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.8f, -0.5f) }); } // namespace using namespace arm_compute::misc::shape_calculator; +/* + Testing Strategy of CL Winograd: + - For nchw and nhwc and for each kernel size, we have a dedicated OpenCL kernel. + (except 1xN and Nx1 uses NxN under the hood). Therefore, test cases should be + stressed for each of these configurations. + - Fp32 and Fp16 kernels are the same. Only the DATA_TYPE build option changes + between these two. Because the same kernel is stressed thoroughly for both + small and large shapes for Fp32 data type, Fp16 kernels are run on a subset + of the shapes, because we get diminishing returns by exhaustively testing the + same kernel. 
+ - Activations only affect the output stage and it's calculated on the output tile. + Exhaustively testing all activations with all the shapes does not provide much + value but increases the testing time quite significantly. Therefore, all activations + are tested in a subset of the shapes, and for all MxM kernels and data layouts as + they represent different OpenCL kernels. (1xM and Mx1 kernels use MxM under the hood). +*/ TEST_SUITE(CL) TEST_SUITE(Winograd) TEST_SUITE(ConvolutionLayer) -DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( - framework::dataset::make("InputInfo", { - TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F16), // Insufficient padding - TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32), // Datatype mismatch - TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32), // Stride y not supported - TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32), // Padding needed - TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32) // Kernel size not supported - }), - framework::dataset::make("WeightsInfo", { - TensorInfo(TensorShape(3U, 3U, 2U, 19U), 1, DataType::F16), - TensorInfo(TensorShape(3U, 3U, 2U, 19U), 1, DataType::QASYMM8), - TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32), - TensorInfo(TensorShape(3U, 3U, 8U, 16U), 1, DataType::F32), - TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16) - })), - framework::dataset::make("BiasesInfo", { - TensorInfo(TensorShape(19U), 1, DataType::F16), - TensorInfo(TensorShape(19U), 1, DataType::F32), - TensorInfo(TensorShape(21U), 1, DataType::F32), - TensorInfo(TensorShape(16U), 1, DataType::F32), - TensorInfo(TensorShape(16U), 1, DataType::F32) - })), - framework::dataset::make("OutputInfo", { - TensorInfo(TensorShape(17U, 31U, 19U), 1, DataType::F16), - TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32), - TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32), - TensorInfo(TensorShape(16U, 16U, 16U), 1, DataType::F32), - TensorInfo(TensorShape(11U, 12U, 16U, 4U), 1, DataType::F32) - })), - framework::dataset::make("ConvInfo", { - PadStrideInfo(1, 1, 1, 1), - PadStrideInfo(1, 1, 1, 1), - PadStrideInfo(1, 2, 0, 0), - PadStrideInfo(1, 1, 1, 1), - PadStrideInfo(1, 1, 1, 0) - })), - framework::dataset::make("Expected", { false, false, false, false, false })), - input_info, weights_info, bias_info, output_info, conv_info, expected) +DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip( + make("InputInfo", { + TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F16), // Insufficient padding + TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32), // Datatype mismatch + TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32), // Stride y not supported + TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32), // Padding needed + TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32) // Kernel size not supported + }), + make("WeightsInfo", { + TensorInfo(TensorShape(3U, 3U, 2U, 19U), 1, DataType::F16), + TensorInfo(TensorShape(3U, 3U, 2U, 19U), 1, DataType::QASYMM8), + TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32), + TensorInfo(TensorShape(3U, 3U, 8U, 16U), 1, DataType::F32), + TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16) + }), + make("BiasesInfo", { + TensorInfo(TensorShape(19U), 1, DataType::F16), + TensorInfo(TensorShape(19U), 1, DataType::F32), + TensorInfo(TensorShape(21U), 1, DataType::F32), + TensorInfo(TensorShape(16U), 1, DataType::F32), + TensorInfo(TensorShape(16U), 1, DataType::F32) + }), + make("OutputInfo", 
{ + TensorInfo(TensorShape(17U, 31U, 19U), 1, DataType::F16), + TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32), + TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32), + TensorInfo(TensorShape(16U, 16U, 16U), 1, DataType::F32), + TensorInfo(TensorShape(11U, 12U, 16U, 4U), 1, DataType::F32) + }), + make("ConvInfo", { + PadStrideInfo(1, 1, 1, 1), + PadStrideInfo(1, 1, 1, 1), + PadStrideInfo(1, 2, 0, 0), + PadStrideInfo(1, 1, 1, 1), + PadStrideInfo(1, 1, 1, 0) + }), + make("Expected", { false, false, false, false, false })), + input_info, weights_info, bias_info, output_info, conv_info, expected) { ARM_COMPUTE_EXPECT(bool(CLWinogradConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &bias_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info)) == expected, framework::LogLevel::ERRORS); } +DATA_TEST_CASE(SupportedKernels, framework::DatasetMode::ALL, zip( + make("WeightsInfo", { + // Shapes are always in NCHW format. When layout is NHWC, the shape is permuted + + // Fp32/16, NCHW + // 3x1, 1x3, 3x3 --> all TRUE + TensorInfo(TensorShape(3U, 3U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW), + TensorInfo(TensorShape(1U, 3U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW), + TensorInfo(TensorShape(3U, 1U, 2U, 8U), 1, DataType::F16, DataLayout::NCHW), + + // 5x1, 1x5, 5x5 --> all TRUE + TensorInfo(TensorShape(5U, 5U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW), + TensorInfo(TensorShape(1U, 5U, 2U, 8U), 1, DataType::F16, DataLayout::NCHW), + TensorInfo(TensorShape(5U, 1U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW), + + // 7x1, 1x7, 7x7 + // nchw does not support kernels with size 7 --> all FALSE + TensorInfo(TensorShape(7U, 7U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW), + TensorInfo(TensorShape(1U, 7U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW), + TensorInfo(TensorShape(7U, 1U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW), + + // unsupported kernel sizes + TensorInfo(TensorShape(2U, 2U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW), + TensorInfo(TensorShape(5U, 2U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW), + TensorInfo(TensorShape(3U, 6U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW), + + // Fp32/16, NHWC + // 7x1, 1x7, 7x7 --> all TRUE + TensorInfo(TensorShape(7U, 7U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(1U, 7U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC), + TensorInfo(TensorShape(7U, 1U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC), + + // 3x1, 1x3, 3x3 --> all TRUE + TensorInfo(TensorShape(3U, 3U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC), + TensorInfo(TensorShape(1U, 3U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(3U, 1U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC), + + // 5x1, 1x5, 5x5 --> all TRUE + TensorInfo(TensorShape(5U, 5U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(1U, 5U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(5U, 1U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC), + + // unsupported kernel sizes + TensorInfo(TensorShape(2U, 2U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(5U, 2U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(3U, 6U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC), + + }), + make("Expected", { + true, true, true, // nchw, 3x3, 1x3, 3x1 + true, true, true, // nchw, 5x5, 1x5, 5x1 + false, false, false, // nchw, 7x7, 1x7, 7x1 + false, false, false, // nchw, random 
unsupported kernels + true, true, true, // nhwc, 7x7, 1x7, 7x1 + true, true, true, // nhwc, 3x3, 1x3, 3x1 + true, true, true, // nhwc, 5x5, 1x5, 5x1 + false, false, false, // nchw, random unsupported kernels + })), + weights_info_const, expected) +{ + DataType data_type = weights_info_const.data_type(); + DataLayout data_layout = weights_info_const.data_layout(); + + TensorInfo input_info = TensorInfo(TensorShape(17U, 31U, 2U), 1, data_type); + TensorInfo bias_info = TensorInfo(TensorShape(8U), 1, data_type); + TensorInfo weights_info = weights_info_const; + + if(data_layout == DataLayout::NHWC) + { + // Convert to NHWC + PermutationVector perm = PermutationVector(2U, 0U, 1U); + + TensorShape input_shape = input_info.tensor_shape(); + TensorShape weights_shape = weights_info.tensor_shape(); + permute(input_shape, perm); + permute(weights_shape, perm); + + input_info.set_tensor_shape(input_shape); + weights_info.set_tensor_shape(weights_shape); + + input_info.set_data_layout(data_layout); + weights_info.set_data_layout(data_layout); + bias_info.set_data_layout(data_layout); + } + + PadStrideInfo conv_info(1, 1, 0, 0); + + TensorShape output_shape = compute_deep_convolution_shape(input_info, weights_info, conv_info); + TensorInfo output_info = TensorInfo(output_shape, 1, data_type, data_layout); + + Status status = CLWinogradConvolutionLayer::validate( + &input_info, + &weights_info, + &bias_info, + &output_info, + conv_info, + ActivationLayerInfo(), + true /* fast math */); + + ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS); +} + TEST_SUITE(FP32) using CLWinogradConvolutionLayerFastMathFixture = WinogradConvolutionLayerFastMathValidationFixture; using CLWinogradConvolutionLayerFastMathMixedDataLayoutFixture = WinogradConvolutionLayerFastMathValidationFixture; TEST_SUITE(Conv3x3) FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(), - framework::dataset::make("DataType", { DataType::F32 })), - ActivationFunctionsSmallDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(), + make("DataType", { DataType::F32 }), + ActivationFunctionsSmallDataset, + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32); } -FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLWinogradConvolutionLayerFastMathMixedDataLayoutFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(combine(combine(combine(combine(combine( - framework::dataset::make("Input", TensorShape(8U, 8U, 32U)), - framework::dataset::make("Weight", TensorShape(1U, 3U, 32U, 1U))), - framework::dataset::make("Bias", TensorShape(1U))), - framework::dataset::make("Output", TensorShape(8U, 6U, 1U))), - framework::dataset::make("PadStrideInfo", PadStrideInfo(1, 1, 0, 0))), - framework::dataset::make("Dilation", Size2D(1U, 1U))), - framework::dataset::make("DataType", { DataType::F32 })), - ActivationFunctionsSmallDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) +FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::NIGHTLY, + combine(datasets::LargeWinogradConvolutionLayer3x3Dataset(), + make("DataType", { DataType::F32 }), + make("ActivationInfo", { ActivationLayerInfo() }), + make("DataLayout", { 
DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32); } -FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::LargeWinogradConvolutionLayer3x3Dataset(), - framework::dataset::make("DataType", { DataType::F32 })), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + +FIXTURE_DATA_TEST_CASE(RunActivations, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::NIGHTLY, + combine( + make("Input", TensorShape(8U, 8U, 32U)), + make("Weight", TensorShape(3U, 3U, 32U, 4U)), + make("Bias", TensorShape(4U)), + make("Output", TensorShape(6U, 6U, 4U)), + make("PadStrideInfo", PadStrideInfo(1, 1, 0, 0)), + make("Dilation", Size2D(1U, 1U)), + make("DataType", { DataType::F32 }), + ActivationFunctionsDataset, + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32); @@ -167,20 +293,20 @@ TEST_SUITE_END() // Conv3x3 TEST_SUITE(Conv3x1) FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::SmallWinogradConvolutionLayer3x1Dataset(), - framework::dataset::make("DataType", { DataType::F32 })), - ActivationFunctionsSmallDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::SmallWinogradConvolutionLayer3x1Dataset(), + make("DataType", { DataType::F32 }), + ActivationFunctionsSmallDataset, + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32); } FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::LargeWinogradConvolutionLayer3x1Dataset(), - framework::dataset::make("DataType", { DataType::F32 })), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::LargeWinogradConvolutionLayer3x1Dataset(), + make("DataType", { DataType::F32 }), + make("ActivationInfo", { ActivationLayerInfo() }), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32); @@ -189,20 +315,36 @@ TEST_SUITE_END() // Conv3x1 TEST_SUITE(Conv1x3) FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::SmallWinogradConvolutionLayer1x3Dataset(), - framework::dataset::make("DataType", { DataType::F32 })), - ActivationFunctionsSmallDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::SmallWinogradConvolutionLayer1x3Dataset(), + make("DataType", { DataType::F32 }), + ActivationFunctionsSmallDataset, + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32); +} + +FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLWinogradConvolutionLayerFastMathMixedDataLayoutFixture, framework::DatasetMode::PRECOMMIT, + combine( + make("Input", TensorShape(8U, 8U, 32U)), + make("Weight", TensorShape(1U, 3U, 32U, 1U)), + make("Bias", TensorShape(1U)), + 
make("Output", TensorShape(8U, 6U, 1U)), + make("PadStrideInfo", PadStrideInfo(1, 1, 0, 0)), + make("Dilation", Size2D(1U, 1U)), + make("DataType", { DataType::F32 }), + ActivationFunctionsSmallDataset, + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32); } FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::LargeWinogradConvolutionLayer1x3Dataset(), - framework::dataset::make("DataType", { DataType::F32 })), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::LargeWinogradConvolutionLayer1x3Dataset(), + make("DataType", { DataType::F32 }), + make("ActivationInfo", { ActivationLayerInfo() }), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32); @@ -211,10 +353,10 @@ TEST_SUITE_END() // Conv1x3 TEST_SUITE(Conv5x5) FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::SmallWinogradConvolutionLayer5x5Dataset(), - framework::dataset::make("DataType", { DataType::F32 })), - ActivationFunctionsSmallDataset ), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::SmallWinogradConvolutionLayer5x5Dataset(), + make("DataType", { DataType::F32 }), + ActivationFunctionsSmallDataset, + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output @@ -222,11 +364,27 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, fram } FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::LargeWinogradConvolutionLayer5x5Dataset(), - framework::dataset::make("DataType", { DataType::F32 })), - ActivationFunctionsDataset ), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::LargeWinogradConvolutionLayer5x5Dataset(), + make("DataType", { DataType::F32 }), + make("ActivationInfo", { ActivationLayerInfo() }), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32); +} +FIXTURE_DATA_TEST_CASE(RunActivations, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::NIGHTLY, + combine( + make("Input", TensorShape(13U, 13U, 32U)), + make("Weight", TensorShape(5U, 5U, 32U, 4U)), + make("Bias", TensorShape(4U)), + make("Output", TensorShape(9U, 9U, 4U)), + make("PadStrideInfo", PadStrideInfo(1, 1, 0, 0)), + make("Dilation", Size2D(1U, 1U)), + make("DataType", { DataType::F32 }), + ActivationFunctionsDataset, + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32); @@ -235,10 +393,10 @@ TEST_SUITE_END() // Conv5x5 TEST_SUITE(Conv5x1) FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::SmallWinogradConvolutionLayer5x1Dataset(), - framework::dataset::make("DataType", { DataType::F32 })), - ActivationFunctionsSmallDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + 
combine(datasets::SmallWinogradConvolutionLayer5x1Dataset(), + make("DataType", { DataType::F32 }), + ActivationFunctionsSmallDataset, + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output @@ -246,10 +404,10 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, fram } FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::LargeWinogradConvolutionLayer5x1Dataset(), - framework::dataset::make("DataType", { DataType::F32 })), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::LargeWinogradConvolutionLayer5x1Dataset(), + make("DataType", { DataType::F32 }), + make("ActivationInfo", { ActivationLayerInfo() }), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output @@ -259,10 +417,10 @@ TEST_SUITE_END() // Conv5x1 TEST_SUITE(Conv1x5) FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::SmallWinogradConvolutionLayer1x5Dataset(), - framework::dataset::make("DataType", { DataType::F32 })), - ActivationFunctionsSmallDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::SmallWinogradConvolutionLayer1x5Dataset(), + make("DataType", { DataType::F32 }), + ActivationFunctionsSmallDataset, + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output @@ -270,16 +428,63 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, fram } FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::LargeWinogradConvolutionLayer1x5Dataset(), - framework::dataset::make("DataType", { DataType::F32 })), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::LargeWinogradConvolutionLayer1x5Dataset(), + make("DataType", { DataType::F32 }), + make("ActivationInfo", { ActivationLayerInfo() }), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32); } TEST_SUITE_END() // Conv1x5 + +TEST_SUITE(Conv1x7) +FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::PRECOMMIT, + combine(datasets::SmallWinogradConvolutionLayer1x7Dataset(), + make("DataType", { DataType::F32 }), + ActivationFunctionsSmallDataset, + make("DataLayout", { DataLayout::NHWC }))) + +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32); +} + +FIXTURE_DATA_TEST_CASE(RunActivations, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::NIGHTLY, + combine( + make("Input", TensorShape(13U, 13U, 32U)), + make("Weight", TensorShape(1U, 7U, 32U, 4U)), + make("Bias", TensorShape(4U)), + make("Output", TensorShape(13U, 11U, 4U)), + make("PadStrideInfo", PadStrideInfo(1, 1, 0, 2)), + make("Dilation", Size2D(1U, 1U)), + make("DataType", { DataType::F32 }), + ActivationFunctionsDataset, + make("DataLayout", { DataLayout::NHWC }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32); +} +TEST_SUITE_END() // Conv1x7 + +TEST_SUITE(Conv7x1) +FIXTURE_DATA_TEST_CASE(RunSmall, 
CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::PRECOMMIT, + combine(datasets::SmallWinogradConvolutionLayer7x1Dataset(), + make("DataType", { DataType::F32 }), + ActivationFunctionsSmallDataset, + make("DataLayout", { DataLayout::NHWC }))) + +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32); +} +TEST_SUITE_END() // Conv7x1 + +/** @note: Although 7x7 is in the kernels, reference implementation + * does not support it. So, it remains as a "test gap". + */ + TEST_SUITE_END() // FP32 @@ -288,20 +493,36 @@ TEST_SUITE(FP16) using CLWinogradConvolutionLayerFastMathFixture16 = WinogradConvolutionLayerFastMathValidationFixture; TEST_SUITE(Conv3x3) FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(), - framework::dataset::make("DataType", { DataType::F16 })), - ActivationFunctionsSmallDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(), + make("DataType", { DataType::F16 }), + ActivationFunctionsSmallDataset, + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f16, tolerance_num_f16); } FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::LargeWinogradConvolutionLayer3x3Dataset(), - framework::dataset::make("DataType", { DataType::F16 })), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::LargeWinogradConvolutionLayer3x3DatasetFp16Subset(), + make("DataType", { DataType::F16 }), + make("ActivationInfo", { ActivationLayerInfo() }), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_convolution_layer_f16); +} + +FIXTURE_DATA_TEST_CASE(RunActivations, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::NIGHTLY, + combine( + make("Input", TensorShape(8U, 8U, 32U)), + make("Weight", TensorShape(3U, 3U, 32U, 6U)), + make("Bias", TensorShape(6U)), + make("Output", TensorShape(6U, 6U, 6U)), + make("PadStrideInfo", PadStrideInfo(1, 1, 0, 0)), + make("Dilation", Size2D(1U, 1U)), + make("DataType", { DataType::F16 }), + ActivationFunctionsDataset, + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(CLAccessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_convolution_layer_f16); @@ -310,20 +531,20 @@ TEST_SUITE_END() // Conv3x3 TEST_SUITE(Conv3x1) FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::SmallWinogradConvolutionLayer3x1Dataset(), - framework::dataset::make("DataType", { DataType::F16 })), - ActivationFunctionsSmallDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::SmallWinogradConvolutionLayer3x1Dataset(), + make("DataType", { DataType::F16 }), + ActivationFunctionsSmallDataset, + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f16, tolerance_num_f16); } 
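The recurring change in these hunks is mechanical: nested `combine(combine(combine(...)))` dataset builders are collapsed into a single variadic `combine(...)` call, with `make` pulled in via `using framework::dataset::make`. A minimal standalone sketch of how a variadic combine can be layered on top of a binary one, assuming nothing about the framework's actual implementation:

```cpp
#include <utility>

// Stand-in for the framework's binary dataset combine; here it simply pairs values.
template <typename L, typename R>
auto combine2(L &&lhs, R &&rhs)
{
    return std::make_pair(std::forward<L>(lhs), std::forward<R>(rhs));
}

// Variadic front-end: left-fold the arguments through the binary version.
template <typename T>
auto combine(T &&t)
{
    return std::forward<T>(t);
}

template <typename T0, typename T1, typename... Ts>
auto combine(T0 &&t0, T1 &&t1, Ts &&...rest)
{
    return combine(combine2(std::forward<T0>(t0), std::forward<T1>(t1)), std::forward<Ts>(rest)...);
}

int main()
{
    // combine(a, b, c, d) behaves like combine2(combine2(combine2(a, b), c), d)
    auto joined = combine(1, 2.5, 'x', "dataset");
    (void)joined;
    return 0;
}
```

Written this way, adding or dropping one test axis (data type, activation, layout) changes a single argument rather than another level of nesting, which is what makes the rewritten fixtures above easier to scan.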
FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::LargeWinogradConvolutionLayer3x1Dataset(), - framework::dataset::make("DataType", { DataType::F16 })), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::LargeWinogradConvolutionLayer3x1DatasetFp16Subset(), + make("DataType", { DataType::F16 }), + make("ActivationInfo", { ActivationLayerInfo() }), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(CLAccessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_convolution_layer_f16); @@ -332,20 +553,20 @@ TEST_SUITE_END() // Conv3x1 TEST_SUITE(Conv1x3) FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::SmallWinogradConvolutionLayer1x3Dataset(), - framework::dataset::make("DataType", { DataType::F16 })), - ActivationFunctionsSmallDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::SmallWinogradConvolutionLayer1x3Dataset(), + make("DataType", { DataType::F16 }), + ActivationFunctionsSmallDataset, + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f16, tolerance_num_f16); } FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::LargeWinogradConvolutionLayer1x3Dataset(), - framework::dataset::make("DataType", { DataType::F16 })), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::LargeWinogradConvolutionLayer1x3DatasetFp16Subset(), + make("DataType", { DataType::F16 }), + make("ActivationInfo", { ActivationLayerInfo() }), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(CLAccessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_convolution_layer_f16); @@ -354,10 +575,10 @@ TEST_SUITE_END() // Conv1x3 TEST_SUITE(Conv5x5) FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::SmallWinogradConvolutionLayer5x5Dataset(), - framework::dataset::make("DataType", { DataType::F16 })), - ActivationFunctionsSmallDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::SmallWinogradConvolutionLayer5x5Dataset(), + make("DataType", { DataType::F16 }), + ActivationFunctionsSmallDataset, + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output @@ -365,23 +586,39 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, fr } FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::LargeWinogradConvolutionLayer5x5Dataset(), - framework::dataset::make("DataType", { DataType::F16 })), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::LargeWinogradConvolutionLayer5x5DatasetFp16Subset(), + make("DataType", { DataType::F16 }), + make("ActivationInfo", { ActivationLayerInfo() }), + make("DataLayout", { DataLayout::NCHW, 
DataLayout::NHWC }))) { // Validate output validate(CLAccessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_convolution_layer_f16); } + +FIXTURE_DATA_TEST_CASE(RunActivations, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::NIGHTLY, + combine( + make("Input", TensorShape(13U, 13U, 32U)), + make("Weight", TensorShape(5U, 5U, 32U, 6U)), + make("Bias", TensorShape(6U)), + make("Output", TensorShape(9U, 9U, 6U)), + make("PadStrideInfo", PadStrideInfo(1, 1, 0, 0)), + make("Dilation", Size2D(1U, 1U)), + make("DataType", { DataType::F16 }), + ActivationFunctionsDataset, + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_convolution_layer_f16); +} TEST_SUITE_END() // Conv5x5 TEST_SUITE(Conv5x1) FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::SmallWinogradConvolutionLayer5x1Dataset(), - framework::dataset::make("DataType", { DataType::F16 })), - ActivationFunctionsSmallDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::SmallWinogradConvolutionLayer5x1Dataset(), + make("DataType", { DataType::F16 }), + ActivationFunctionsSmallDataset, + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output @@ -389,10 +626,10 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, fr } FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::LargeWinogradConvolutionLayer5x1Dataset(), - framework::dataset::make("DataType", { DataType::F16 })), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::LargeWinogradConvolutionLayer5x1DatasetFp16Subset(), + make("DataType", { DataType::F16 }), + make("ActivationInfo", { ActivationLayerInfo() }), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output @@ -402,10 +639,10 @@ TEST_SUITE_END() // Conv5x1 TEST_SUITE(Conv1x5) FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::SmallWinogradConvolutionLayer1x5Dataset(), - framework::dataset::make("DataType", { DataType::F16 })), - ActivationFunctionsSmallDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::SmallWinogradConvolutionLayer1x5Dataset(), + make("DataType", { DataType::F16 }), + ActivationFunctionsSmallDataset, + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output @@ -413,10 +650,10 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, fr } FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::LargeWinogradConvolutionLayer1x5Dataset(), - framework::dataset::make("DataType", { DataType::F16 })), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::LargeWinogradConvolutionLayer1x5DatasetFp16Subset(), + make("DataType", { DataType::F16 }), + make("ActivationInfo", { ActivationLayerInfo() }), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate 
output @@ -426,10 +663,10 @@ TEST_SUITE_END() // Conv1x5 TEST_SUITE(Conv1x7) FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::SmallWinogradConvolutionLayer1x7Dataset(), - framework::dataset::make("DataType", { DataType::F16 })), - ActivationFunctionsSmallDataset), - framework::dataset::make("DataLayout", { DataLayout::NHWC }))) + combine(datasets::SmallWinogradConvolutionLayer1x7Dataset(), + make("DataType", { DataType::F16 }), + ActivationFunctionsSmallDataset, + make("DataLayout", { DataLayout::NHWC }))) { // Validate output @@ -437,16 +674,46 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, fr } FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::LargeWinogradConvolutionLayer1x7Dataset(), - framework::dataset::make("DataType", { DataType::F16 })), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NHWC }))) + combine(datasets::LargeWinogradConvolutionLayer1x7DatasetFp16Subset(), + make("DataType", { DataType::F16 }), + make("ActivationInfo", { ActivationLayerInfo() }), + make("DataLayout", { DataLayout::NHWC }))) + +{ + // Validate output + validate(CLAccessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_convolution_layer_f16); +} +FIXTURE_DATA_TEST_CASE(RunActivations, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::NIGHTLY, + combine( + make("Input", TensorShape(13U, 13U, 32U)), + make("Weight", TensorShape(1U, 7U, 32U, 6U)), + make("Bias", TensorShape(6U)), + make("Output", TensorShape(13U, 7U, 6U)), + make("PadStrideInfo", PadStrideInfo(1, 1, 0, 0)), + make("Dilation", Size2D(1U, 1U)), + make("DataType", { DataType::F16 }), + ActivationFunctionsDataset, + make("DataLayout", { DataLayout::NHWC }))) { // Validate output validate(CLAccessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_convolution_layer_f16); } TEST_SUITE_END() // Conv1x7 + +TEST_SUITE(Conv7x1) +FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::PRECOMMIT, + combine(datasets::SmallWinogradConvolutionLayer7x1Dataset(), + make("DataType", { DataType::F16 }), + ActivationFunctionsSmallDataset, + make("DataLayout", { DataLayout::NHWC }))) + +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f16, tolerance_num_f16); +} +TEST_SUITE_END() // Conv7x1 + TEST_SUITE_END() // FP16 TEST_SUITE_END() // ConvolutionLayer TEST_SUITE_END() // Winograd diff --git a/tests/validation/Helpers.cpp b/tests/validation/Helpers.cpp index 110325c5a0d98760cf3e26ffaa02309272f00ea6..560460fd330be43d5dec765fd808345f612d103e 100644 --- a/tests/validation/Helpers.cpp +++ b/tests/validation/Helpers.cpp @@ -26,6 +26,8 @@ #include #include +#include +#include namespace arm_compute { @@ -350,33 +352,82 @@ void add_padding_x(std::initializer_list tensors, const DataLayout &d } } -void add_padding_y(std::initializer_list tensors, const DataLayout &data_layout) +QuantizationHint suggest_conv_dst_q_info_and_bias(const QuantizationInfo &in_q_info, + const QuantizationInfo &weight_q_info, + int32_t height, + int32_t width, + int32_t channels, + DataType data_type, + float bias_fraction) { - if(data_layout == DataLayout::NHWC) - { - constexpr unsigned int lower = 1U; - constexpr unsigned int upper = 4U; - - std::uniform_int_distribution 
distribution(lower, upper); - size_t seed_offset = 0; + /** Quantization Setup of convolution + * + * Just like any other multiply-accummulate, convolution (2D) operation + * multiplies and accumulates the input and weight tensors. This operation + * takes place in three dimensions: height, width and channels. All of them + * belong to the weight tensor. + * + * The formula for simple convolution can be written as: + * C = sum_h sum_w sum_c(I[h_offset + h, w_offset + w, c] * W[h, w, c]) + * + * Here, h_offset and w_offset are the starting positions in the image. Effects + * of paddings are ignored. This accumulation reduces to something like + * + * C = sum_m(I_index * W_hwc) + * where m is height x width x channels. + * + * Non-unit strides and/or dilations do not change the probabilistic nature of + * this sum because we always iterate as the size of the weight tensor. + * + * Paddings may affect this summation, but it's a boundary condition and so is + * neglected for brevity. + */ - for(ITensor *tensor : tensors) - { - ARM_COMPUTE_ERROR_ON(!tensor->info()->is_resizable()); + return suggest_mac_dst_q_info_and_bias(in_q_info, weight_q_info, height * width * channels, data_type, bias_fraction); +} - std::mt19937 gen(library->seed() + seed_offset++); +QuantizationHint suggest_matmul_dst_q_info_and_bias(const QuantizationInfo &lhs_q_info, + const QuantizationInfo &rhs_q_info, + int32_t m, int32_t n, int32_t k, DataType data_type, + float bias_fraction) +{ + ARM_COMPUTE_UNUSED(m, n); - const unsigned int top = distribution(gen); - const unsigned int bottom = distribution(gen); + /** Quantization Setup of matrix multiplication + * + * We have a matrix multiplication of the form C = A * B + D + * where A is (m X k), B is (k x n) and C is therefore (m x n). + * The bias, D is (1 x n). + * + * If we have some distributional statistics of A, B and D, i.e. mean and variance, + * we can estimate the mean and variance of a single value in C matrix and pick + * good scale and offset values for the output and have non-saturated tests. + * + * Each element in the output matrix can be calculated as follows: + * C_ij = sum_k(A_ik * B_kj) + D_j + * + * Note: All possible A_ik, B_kj, D_j random variables are assumed mutually independent. + * Note: In quantized operators, bias is an integer. But, its quantization scale is + * assumed to be equal to lhs_scale * rhs_scale, and offset equal to 0. + * Note: Since, bias is an integer that should be given as input, we need to pick responsible + * values when adding it on top of the summation. This is where "bias_fraction" comes + * into play. Based on the fraction given, we also return suggested bias range (min/max) + * for not saturating the output. + * + * Because all random variables are mutually independent, any C_ij has the same statistics, + * which is why we return a single destination quantization info object; which is why we can + * resort to a more general calculation explained in suggest_mac_dst_q_info_and_bias(). 
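The comment above reduces the matrix multiplication to a per-element statement: every C_ij is one length-k multiply-accumulate plus a single bias term, which is why a single destination quantization can serve the whole output. A float-domain sketch of that reduction with made-up values (not library code):

```cpp
#include <array>
#include <cstdio>

int main()
{
    constexpr int m = 2, n = 2, k = 3;
    const std::array<float, m * k> A = {1, 2, 3, 4, 5, 6};    // row-major m x k
    const std::array<float, k * n> B = {7, 8, 9, 10, 11, 12}; // row-major k x n
    const std::array<float, n>     D = {0.5f, -0.5f};         // per-column bias

    for(int i = 0; i < m; ++i)
    {
        for(int j = 0; j < n; ++j)
        {
            float c_ij = D[j]; // C_ij = sum_k(A_ik * B_kj) + D_j, i.e. a length-k MAC plus bias
            for(int kk = 0; kk < k; ++kk)
            {
                c_ij += A[i * k + kk] * B[kk * n + j];
            }
            std::printf("C[%d][%d] = %f\n", i, j, c_ij);
        }
    }
    return 0;
}
```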
+ * + * From a probabilistic perspective, the above calculation reduces to + * c = sum_k (a_k * b_k) + d + */ - tensor->info()->extend_padding(PaddingSize(top, 0U, bottom, 0U)); - } - } + return suggest_mac_dst_q_info_and_bias(lhs_q_info, rhs_q_info, k, data_type, bias_fraction); } -QuantizationInfo calculate_mat_mul_dst_q_info(const QuantizationInfo &a_q_info, const QuantizationInfo &b_q_info, int m, int n, int k, DataType data_type) +QuantizationHint suggest_mac_dst_q_info_and_bias( + const QuantizationInfo &a_q_info, const QuantizationInfo &b_q_info, int32_t K, DataType data_type, float bias_fraction, int num_sd) { - ARM_COMPUTE_UNUSED(m, n); QuantizationInfo c_q_info; ARM_COMPUTE_ASSERT(data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED); @@ -384,21 +435,13 @@ QuantizationInfo calculate_mat_mul_dst_q_info(const QuantizationInfo &a_q_info, const int32_t t_max = static_cast(data_type == DataType::QASYMM8 ? std::numeric_limits::max() : std::numeric_limits::max()); const int32_t t_min = static_cast(data_type == DataType::QASYMM8 ? std::numeric_limits::min() : std::numeric_limits::min()); - /** Quantization Setup of matrix multiplication + /** Quantization Setup of multiply-accummulate * - * We have a matrix multiplication of the form C = A * B - * where A is (M X K), B is (K x N) and C is therefore (M x N). - * - * If we have some distributions statistics of A and B, i.e. mean and variance, - * we can estimate the mean and variance of a single value in C matrix and - * pick good scale and offset values for the output and have non-saturated tests. - * - * Each element in the output matrix can be calculated as follows: - * C_ij = sum_k(A_ik * B_kj) + * Expression (in float): + * C = sum_k ( A_k * B_k ) + D * - * All values are float above. - * - * Note: All possible A_ik, B_kj random variables are assumed mutually independent. + * Lemma: An affine transformation (i.e. aX + b) to a discrete uniform random variable + * creates another discrete uniform random variable. * * Terminology: * E[X]: Mean of the random variable X (sometimes referred as mu_x) @@ -406,26 +449,58 @@ QuantizationInfo calculate_mat_mul_dst_q_info(const QuantizationInfo &a_q_info, * std(X): sqrt(var(X)), standard deviation of X * * 1) Calculate the mean: - * E[C_ij] = sum_k( E[A_ik] * E[B_kj] ) = K * mean_a * mean_b + * E[C] = sum_k( E[A_k] * E[B_k] ) + D = K * mean_a * mean_b + mean_d * * Since elements of A and B are uniformly distributed random variables, we have * mean_a = (max_a + min_a) / 2, mean_b = (max_b + min_b ) / 2 * max_a and min_a can be calculated with the scale_a/b and offset_a/b * by replacing data type minimum and maximums in the equations * + * We don't know mean_d because we have to choose it based on bias_fraction. If we call + * the summation as M_int, similar to above, we have: + * + * E[C_int] = sum_k( E[A_k_int] * E[B_k_int] ) + E[D_int] = K * mean_a_int * mean_b_int + mean_d_int + * \___________________________/ + * E[M_int] + * + * We choose a bias mean proportional to the integer summation. This proportion is "bias_fraction". + * So, we have D_int = f * M_int (f: fraction), and + * E[D_int] = mean_d_int = f * E[M_int] + * + * This also means, for floating point value of D, the following: + * E[D] = mean_d = E[D_int] * a_scale * b_scale + * * 2) Calculate the variance: - * var(C_ij) = sum_k( var(A_ik * B_kj) ) - * = sum_k ( E[A_ik^2 * B_kj^2] - E[A_ik]^2E[B_kj^2] ) + * var(C) = sum_k( var(A_k * B_k) ) + var(D) + * = sum_k ( E[A_k^2 * B_k^2] - E[A_k]^2E[B_k^2] ) * = ... 
- * = K * (var_a * var_b + var_a * mean^2_b + var_b * mean^2_a) + * = K * (var_a * var_b + var_a * mean^2_b + var_b * mean^2_a) + var_d * * Similarly, due to uniform random variable properties, we have * var_a = (max_a - min_a)^2 / 12 * var_b = (max_b - min_b)^2 / 12 * + * Again, we don't know var_d as we don't know the bias. As set out in the previous section, we have + * var(D_int) = var(f * M_int) = f^2 * var(M_int) + * + * Using the same expression, we can find var(M_int): + * var(C_int) = sum_k( var(A_k_int * B_k_int) ) + var(D_int) + * = sum_k ( E[A_k_int^2 * B_k_int^2] - E[A_k_int]^2E[B_k_int^2] ) + * = ... + * = K * (var_a_int * var_b_int + var_a_int * mean^2_b_int + var_b_int * mean^2_a_int) + var_d_int + * \_______________________________________________________________________________/ + * var(M_int) + * - * 3) Now, we have an idea of what would an average C_ij will like and how much deviation - * is present around it. The exact distribution of C is not easy to come up with dependent on K. + * Now that we know the mean and variance of D_int, we can return a suitable bias range of + * [mean_d_int +/- 2 * std_d_int] + * + * This also means, for floating point value of D, the following: + * var(D) = var_d = var(D_int) * a_scale^2 * b_scale^2 + * + * E[D] and var(D) calculated in steps (1) and (2) can be substituted into E[C] and var(C) calculations. + * + * 3) Now, we have an idea of what an average C will look like and how much deviation + * is present around it. The exact distribution of C is difficult to derive as it depends on K. + * But, as K increases, due to Central Limit Theorem, it'll look more like a bell-shaped figure, + * approaching normal distribution. + * @@ -448,6 +523,12 @@ QuantizationInfo calculate_mat_mul_dst_q_info(const QuantizationInfo &a_q_info, const int32_t a_offset = a_q_info.uniform().offset; const float a_scale = a_q_info.uniform().scale; const int32_t b_offset = b_q_info.uniform().offset; const float b_scale = b_q_info.uniform().scale; + // Integer value statistics.
Valid for both Lhs/A and Rhs/B + const float mean_a_int = (t_max + t_min) / 2.f; + constexpr float var_a_int = (256 * 256 - 1) / 12.f; // Discrete uniform RV variance + const float mean_b_int = mean_a_int; // A_int and B_int have the same stats + constexpr float var_b_int = var_a_int; + // Lhs/A stats const float max_a = (t_max - a_offset) * a_scale; const float min_a = (t_min - a_offset) * a_scale; @@ -460,17 +541,34 @@ QuantizationInfo calculate_mat_mul_dst_q_info(const QuantizationInfo &a_q_info, const float mean_b = (max_b + min_b) / 2; const float var_b = (max_b - min_b) * (max_b - min_b) / 12; - // Output stats - const float mean_out = k * mean_a * mean_b; - const float var_out = k * (var_a * var_b + var_a * mean_b * mean_b + var_b * mean_a * mean_a); + // Integer multiplication output/M stats + const float mean_m_int = K * mean_a_int * mean_b_int; + const float var_m_int = K * (var_a_int * var_b_int + var_a_int * mean_b_int * mean_b_int + var_b_int * mean_a_int * mean_a_int); + const float std_m_int = sqrt(var_m_int); + + // Bias/D both Int and Float statistics + const float mean_d_int = bias_fraction * mean_m_int; + const float std_d_int = bias_fraction * std_m_int; + const float mean_d = a_scale * b_scale * mean_d_int; + const float std_d = a_scale * b_scale * std_d_int; + const float var_d = std_d * std_d; + + // Also calculate the suggested bias range + const int32_t min_bias = mean_d_int - (num_sd * std_d_int); + const int32_t max_bias = mean_d_int + (num_sd * std_d_int); + + // Output/C stats + const float mean_out = K * mean_a * mean_b + mean_d; + const float var_out = K * (var_a * var_b + var_a * mean_b * mean_b + var_b * mean_a * mean_a) + var_d; const float std_out = sqrt(var_out); // Output quantization setup - const float scale_out = 4 * std_out / 255; - const int32_t offset_out = static_cast<int32_t>(t_min - (mean_out - 2.f * std_out) / scale_out); + const float scale_out = (2 * num_sd) * std_out / 255; + const int32_t offset_out = static_cast<int32_t>(t_min - (mean_out - (num_sd * std_out)) / scale_out); c_q_info = QuantizationInfo(scale_out, offset_out); - return c_q_info; + + return { c_q_info, min_bias, max_bias }; } template void get_tile(const SimpleTensor &in, SimpleTensor &roi, const Coordinates &coord); diff --git a/tests/validation/Helpers.h b/tests/validation/Helpers.h index df3b08ba536b26744adc97b503b5257ee714a174..647adcdb6920ba264b5792022054b718acd5ac08 100644 --- a/tests/validation/Helpers.h +++ b/tests/validation/Helpers.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ACL_TESTS_VALIDATION_HELPERS -#define ACL_TESTS_VALIDATION_HELPERS +#ifndef ACL_TESTS_VALIDATION_HELPERS_H +#define ACL_TESTS_VALIDATION_HELPERS_H #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" @@ -31,7 +31,8 @@ #include "tests/Globals.h" #include "tests/SimpleTensor.h" -#include +#include +#include #include #include #include @@ -52,6 +53,19 @@ struct is_floating_point : public std::true_type { }; +/** Helper struct to store the hints for + * - destination quantization info + * - minimum bias value + * - maximum bias value + * in quantized test construction. + */ +struct QuantizationHint +{ + QuantizationInfo q_info; + int32_t bias_min; + int32_t bias_max; +}; + /** Helper function to get the testing range for each activation layer. * * @param[in] activation Activation function to test.
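A standalone numeric sketch of the statistics propagation implemented above: derive an output scale/offset that keeps roughly num_sd standard deviations of the accumulation inside the 8-bit range, and size the bias from the integer accumulator. The scales, offsets, K and bias_fraction below are made up for illustration; in the test suite this work is done by suggest_mac_dst_q_info_and_bias().

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>

int main()
{
    // Example inputs (illustrative only)
    const float   a_scale = 0.5f, b_scale = 0.3f;
    const int32_t a_offset = 10, b_offset = 3;
    const int32_t t_min = 0, t_max = 255;     // QASYMM8 representable range
    const int32_t K             = 3 * 3 * 32; // e.g. a 3x3 kernel over 32 channels
    const float   bias_fraction = 0.5f;
    const int     num_sd        = 2;

    // Float-domain statistics of the uniformly distributed inputs
    const float max_a = (t_max - a_offset) * a_scale, min_a = (t_min - a_offset) * a_scale;
    const float max_b = (t_max - b_offset) * b_scale, min_b = (t_min - b_offset) * b_scale;
    const float mean_a = (max_a + min_a) / 2, var_a = (max_a - min_a) * (max_a - min_a) / 12;
    const float mean_b = (max_b + min_b) / 2, var_b = (max_b - min_b) * (max_b - min_b) / 12;

    // Integer-domain accumulator statistics, used to size the bias
    const float mean_int = (t_max + t_min) / 2.f;
    const float var_int  = (256.f * 256.f - 1.f) / 12.f;
    const float mean_m   = K * mean_int * mean_int;
    const float std_m    = std::sqrt(K * (var_int * var_int + 2.f * var_int * mean_int * mean_int));

    const float   mean_d_int = bias_fraction * mean_m;
    const float   std_d_int  = bias_fraction * std_m;
    const int32_t bias_min   = static_cast<int32_t>(mean_d_int - num_sd * std_d_int);
    const int32_t bias_max   = static_cast<int32_t>(mean_d_int + num_sd * std_d_int);

    // Bias statistics mapped back to the float domain
    const float mean_d = a_scale * b_scale * mean_d_int;
    const float std_d  = a_scale * b_scale * std_d_int;

    // Output statistics and the suggested destination quantization
    const float mean_out = K * mean_a * mean_b + mean_d;
    const float var_out  = K * (var_a * var_b + var_a * mean_b * mean_b + var_b * mean_a * mean_a) + std_d * std_d;
    const float std_out  = std::sqrt(var_out);

    const float   scale_out  = (2 * num_sd) * std_out / 255.f;
    const int32_t offset_out = static_cast<int32_t>(t_min - (mean_out - num_sd * std_out) / scale_out);

    std::printf("dst scale=%f offset=%d, bias in [%d, %d]\n",
                scale_out, static_cast<int>(offset_out), static_cast<int>(bias_min), static_cast<int>(bias_max));
    return 0;
}
```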
@@ -112,23 +126,6 @@ std::pair get_activation_layer_test_bounds(ActivationLayerInfo::Activation return bounds; } -/** Calculate output tensor shape give a vector of input tensor to concatenate - * - * @param[in] input_shapes Shapes of the tensors to concatenate across depth. - * - * @return The shape of output concatenated tensor. - */ -TensorShape calculate_depth_concatenate_shape(const std::vector &input_shapes); - -/** Calculate output tensor shape for the concatenate operation along a given axis - * - * @param[in] input_shapes Shapes of the tensors to concatenate across width. - * @param[in] axis Axis to use for the concatenate operation - * - * @return The shape of output concatenated tensor. - */ -TensorShape calculate_concatenate_shape(const std::vector &input_shapes, size_t axis); - /** Convert an asymmetric quantized simple tensor into float using tensor quantization information. * * @param[in] src Quantized tensor. @@ -243,21 +240,60 @@ std::pair get_symm_quantized_per_channel_bounds(const QuantizationInfo */ void add_padding_x(std::initializer_list tensors, const DataLayout &data_layout = DataLayout::NHWC, bool only_right_pad = false); -/** Add random padding along the Y axis (between 1 and 4 rows per side) to all the input tensors. - * This is used in our validation suite in order to simulate implicit padding addition after configuring, but before allocating. +/** For 2d convolution, given the Lhs/Rhs matrix quantization informations and the convolution dimension, + * calculate a suitable output quantization and suggested bias range for obtaining non-saturated outputs with high probability. * - * @param[in] tensors List of tensors to add padding to - * @param[in] data_layout (Optional) Data layout of the operator + * @param[in] in_q_info Input matrix quantization info + * @param[in] weight_q_info Weights matrix quantization info + * @param[in] height Height of the weights tensor + * @param[in] width Width of the weights tensors + * @param[in] channels Number of input channels + * @param[in] data_type data type, only QASYMM8, QASYMM8_SIGNED are supported + * @param[in] bias_fraction see @ref suggest_mac_dst_q_info_and_bias() for explanation * - * @note This function adds padding to the input tensors only if data_layout == DataLayout::NHWC + * @return QuantizationHint object containing the suggested output quantization info and min/max bias range + */ +QuantizationHint suggest_conv_dst_q_info_and_bias(const QuantizationInfo &in_q_info, + const QuantizationInfo &weight_q_info, + int32_t height, + int32_t width, + int32_t channels, + DataType data_type, + float bias_fraction); + +/** For a matrix multiplication, given the Lhs/Rhs matrix quantization informations and the matrix multiplication dimensions, + * calculate a suitable output quantization and suggested bias range for obtaining non-saturated outputs with high probability. 
+ * + * @param[in] lhs_q_info Lhs matrix quantization info + * @param[in] rhs_q_info Rhs matrix quantization info + * @param[in] m Number of rows of Lhs matrix + * @param[in] n Number of columns of Rhs matrix + * @param[in] k Number of rows/columns of Rhs/Lhs matrix + * @param[in] data_type data type, only QASYMM8, QASYMM8_SIGNED are supported + * @param[in] bias_fraction see @ref suggest_mac_dst_q_info_and_bias() for explanation + * + * @return QuantizationHint object containing the suggested output quantization info and min/max bias range */ -void add_padding_y(std::initializer_list tensors, const DataLayout &data_layout = DataLayout::NHWC); +QuantizationHint suggest_matmul_dst_q_info_and_bias(const QuantizationInfo &lhs_q_info, + const QuantizationInfo &rhs_q_info, int32_t m, int32_t n, int32_t k, DataType data_type, + float bias_fraction); -/** For MatMulLowp, given the Lhs/Rhs matrix quantization informations and the matrix multiplication dimensions, - * calculate a suitable output quantization for obtaining non-saturated outputs with high probability. +/** For a multiply-accumulate (mac), given the Lhs/Rhs vector quantization information and the dot product dimensions, + * calculate a suitable output quantization and suggested bias range for obtaining non-saturated outputs with high probability. + * + * @param[in] lhs_q_info Lhs matrix quantization info + * @param[in] rhs_q_info Rhs matrix quantization info + * @param[in] k number of accumulations taking place in the sum, i.e. c = sum_k(a_k * b_k) + * @param[in] data_type data type, only QASYMM8, QASYMM8_SIGNED are supported + * @param[in] bias_fraction the fraction of bias amplitude compared to integer accumulation. + * @param[in] num_sd (Optional) number of standard deviations we allow from the mean. Default value is 2.
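A hypothetical caller-side sketch of how a quantized fixture might consume these helpers once this header is included. The helper signature and the QuantizationHint fields come from the declarations in this file; the scales, offsets, kernel size and bias_fraction are invented for the example:

```cpp
#include "tests/validation/Helpers.h" // QuantizationHint and the suggest_* helpers declared here

#include <random>

using namespace arm_compute;
using namespace arm_compute::test::validation;

// Pick a destination quantization and a bias value for a quantized 3x3
// convolution over 32 input channels (all concrete numbers are illustrative).
QuantizationInfo pick_dst_q_info_and_bias(std::mt19937 &gen, int32_t &bias_value)
{
    const QuantizationInfo in_q(0.5f, 10);
    const QuantizationInfo weight_q(0.3f, 3);

    const QuantizationHint hint = suggest_conv_dst_q_info_and_bias(
        in_q, weight_q, 3 /* height */, 3 /* width */, 32 /* channels */,
        DataType::QASYMM8, 0.5f /* bias_fraction */);

    // Sample the bias from the suggested non-saturating range
    std::uniform_int_distribution<int32_t> bias_dist(hint.bias_min, hint.bias_max);
    bias_value = bias_dist(gen);

    // hint.q_info becomes the destination tensor's QuantizationInfo
    return hint.q_info;
}
```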
+ * + * @return QuantizationHint object containing the suggested output quantization info and min/max bias range */ -QuantizationInfo calculate_mat_mul_dst_q_info(const QuantizationInfo &lhs_q_info, const QuantizationInfo &rhs_q_info, int m, int n, int k, DataType data_type); +QuantizationHint suggest_mac_dst_q_info_and_bias(const QuantizationInfo &lhs_q_info, + const QuantizationInfo &rhs_q_info, int32_t k, DataType data_type, float bias_fraction, + int num_sd = 2); } // namespace validation } // namespace test } // namespace arm_compute -#endif /* ACL_TESTS_VALIDATION_HELPERS */ +#endif // ACL_TESTS_VALIDATION_HELPERS_H diff --git a/tests/validation/NEON/ArgMinMax.cpp b/tests/validation/NEON/ArgMinMax.cpp index 2e21a7db7b20f6e07aef4fea693e6579bf1a2f01..91b8128dea5bd52976fd38ce63f36610aa789296 100644 --- a/tests/validation/NEON/ArgMinMax.cpp +++ b/tests/validation/NEON/ArgMinMax.cpp @@ -97,6 +97,10 @@ using NEArgMinMaxValidationFixture = ArgMinMaxValidationFixture; using NEArgMinMaxValidationFixture_F16_S32 = NEArgMinMaxValidationFixture; using NEArgMinMaxValidationFixture_F32_S32 = NEArgMinMaxValidationFixture; +#ifdef __aarch64__ +using NEArgMinMaxValidationFixture_F32_S64 = NEArgMinMaxValidationFixture; +#endif // __aarch64__ + TEST_SUITE(S32) FIXTURE_DATA_TEST_CASE(RunSmallAxis0, NEArgMinMaxValidationFixture_S32_S32, @@ -182,6 +186,21 @@ FIXTURE_DATA_TEST_CASE(RunSmall, validate(Accessor(_target), _reference); } +#ifdef __aarch64__ +FIXTURE_DATA_TEST_CASE(RunSmall_F32_S64, + NEArgMinMaxValidationFixture_F32_S64, + framework::DatasetMode::PRECOMMIT, + combine(combine(combine(combine(ArgMinMaxSmallDataset(), + framework::dataset::make("DataTypeIn", DataType::F32)), + framework::dataset::make("DataTypeOut", DataType::S64)), + AxisDataset), + OpsDataset)) +{ + // Validate output + validate(Accessor(_target), _reference); +} +#endif // __aarch64__ + FIXTURE_DATA_TEST_CASE(RunLarge, NEArgMinMaxValidationFixture_F32_S32, framework::DatasetMode::NIGHTLY, diff --git a/tests/validation/NEON/Cast.cpp b/tests/validation/NEON/Cast.cpp index a1ddcc9cad2dad3090d7d961a22efe58a0d87636..b56594546b4609539c3838b373d5ddf77097c508 100644 --- a/tests/validation/NEON/Cast.cpp +++ b/tests/validation/NEON/Cast.cpp @@ -217,7 +217,6 @@ DATA_TEST_CASE(KernelSelectionDstFP16, framework::DatasetMode::ALL, DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::BFLOAT16, })), cpu_ext, data_type) { @@ -226,21 +225,9 @@ cpu_ext, data_type) cpuinfo::CpuIsaInfo cpu_isa{}; cpu_isa.neon = (cpu_ext == "NEON"); + cpu_isa.fp16 = true; - cpu_isa.bf16 = (data_type == DataType::BFLOAT16); - - /* bf16 cast is different from all the others being converted to fp32 and not to fp16 */ - if(cpu_isa.bf16) - { - cpu_isa.fp16 = false; - selected_impl = CpuCastKernel::get_implementation(CastDataTypeISASelectorData{ data_type, DataType::F32, cpu_isa }, cpu::KernelSelectionType::Preferred); - } - else - { - cpu_isa.fp16 = true; - selected_impl = CpuCastKernel::get_implementation(CastDataTypeISASelectorData{ data_type, DataType::F16, cpu_isa }, cpu::KernelSelectionType::Preferred); - } - + selected_impl = CpuCastKernel::get_implementation(CastDataTypeISASelectorData{ data_type, DataType::F16, cpu_isa }, cpu::KernelSelectionType::Preferred); ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl); std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_cast"; @@ -254,7 +241,6 @@ DATA_TEST_CASE(KernelSelectionSrcFP32, framework::DatasetMode::ALL, framework::dataset::make("DataType", { DataType::F16, - 
DataType::BFLOAT16, })), cpu_ext, data_type) { @@ -263,7 +249,6 @@ cpu_ext, data_type) cpuinfo::CpuIsaInfo cpu_isa{}; cpu_isa.neon = (cpu_ext == "NEON"); cpu_isa.fp16 = (data_type == DataType::F16); - cpu_isa.bf16 = (data_type == DataType::BFLOAT16); const auto *selected_impl = CpuCastKernel::get_implementation(CastDataTypeISASelectorData{ DataType::F32, data_type, cpu_isa }, cpu::KernelSelectionType::Preferred); diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp index 06fe9f78037748bd7961eea3fd415d8c91a531e8..98a5be54849c0b505ec49b6fe21b8f17d775e75a 100644 --- a/tests/validation/NEON/ConvolutionLayer.cpp +++ b/tests/validation/NEON/ConvolutionLayer.cpp @@ -28,15 +28,16 @@ #include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h" #include "arm_compute/runtime/Tensor.h" #include "arm_compute/runtime/TensorAllocator.h" + +#include "src/core/CPP/Validate.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuGemmConv2d.h" #include "src/cpu/operators/CpuGemmDirectConv2d.h" #include "src/cpu/operators/CpuWinogradConv2d.h" + #include "tests/NEON/Accessor.h" -#include "tests/PaddingCalculator.h" #include "tests/datasets/LargeConvolutionLayerDataset.h" #include "tests/datasets/SmallConvolutionLayerDataset.h" -#include "tests/datasets/TinyConvolutionLayerDataset.h" #include "tests/framework/Asserts.h" #include "tests/framework/Macros.h" #include "tests/framework/datasets/Datasets.h" @@ -50,6 +51,8 @@ namespace test { namespace validation { +using framework::dataset::make; + namespace detail { template <> @@ -85,13 +88,13 @@ constexpr float tolerance_num = 0.07f; #ifdef ARM_COMPUTE_ENABLE_SME // TODO(COMPMID-6011): SME kernels and the reference model use different rounding mode. // Temporarily increase the tolerance for quantized data. 
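A small illustration of why a one-LSB absolute tolerance can be needed when the kernel and the reference requantize with different rounding modes: a value that lands exactly halfway between two integers rounds differently under ties-away-from-zero and ties-to-even. This is illustrative only and does not model the SME kernels themselves.

```cpp
#include <cfenv>
#include <cmath>
#include <cstdio>

int main()
{
    // A requantized accumulator that lands exactly halfway between two integers
    const float acc = 2.5f;

    const long half_away = std::lround(acc); // rounds halfway cases away from zero -> 3

    std::fesetround(FE_TONEAREST);           // round-to-nearest, ties-to-even
    const long half_even = std::lrint(acc);  // -> 2

    std::printf("away=%ld even=%ld difference=%ld LSB\n", half_away, half_even, half_away - half_even);
    return 0;
}
```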
-constexpr AbsoluteTolerance tolerance_qasymm8(1.0); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */ -#else // ARM_COMPUTE_ENABLE_SME -constexpr AbsoluteTolerance tolerance_qasymm8(0.0); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */ -#endif // ARM_COMPUTE_ENABLE_SME +constexpr AbsoluteTolerance tolerance_qasymm8(1.0); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */ +#else // ARM_COMPUTE_ENABLE_SME +constexpr AbsoluteTolerance tolerance_qasymm8(0.0); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */ +#endif // ARM_COMPUTE_ENABLE_SME /** CNN data types */ -const auto CNNDataTypes = framework::dataset::make("DataType", +const auto CNNDataTypes = make("DataType", { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC DataType::F16, @@ -99,14 +102,36 @@ const auto CNNDataTypes = framework::dataset::make("DataType", DataType::F32, DataType::QASYMM8, }); -const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo", +const auto ActivationFunctionsDataset = make("ActivationInfo", { ActivationLayerInfo(), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f) }); -const auto QuantizationData = framework::dataset::make("QuantizationInfo", +const auto ActivationFunctionsDatasetNightly = make("ActivationInfo", +{ + ActivationLayerInfo(), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f), + + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f, -0.5f), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SOFT_RELU), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ELU), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ABS), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SQUARE), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SWISH), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::HARD_SWISH), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 2.f, 1.f), +#ifdef __aarch64__ + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::GELU), +#endif // __aarch64__ +}); + +const auto QuantizationData = make("QuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(0.3f, 3), @@ -121,32 +146,32 @@ TEST_SUITE(ConvolutionLayer) // *INDENT-OFF* // clang-format off DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( - framework::dataset::make("InputInfo", { TensorInfo(TensorShape(18U, 18U, 32U), 1, DataType::F32), + make("InputInfo", { TensorInfo(TensorShape(18U, 18U, 32U), 1, DataType::F32), TensorInfo(TensorShape(23U, 27U, 32U, 4U), 1, DataType::F32), TensorInfo(TensorShape(3U, 3U, 2U, 1U), 1, DataType::F32), TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32) }), - framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 32U, 21U), 1, DataType::F32), + make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 32U, 21U), 
1, DataType::F32), TensorInfo(TensorShape(5U, 5U, 32U, 21U), 1, DataType::F32), TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32), TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16) })), - framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(16U, 16U, 21U), 1, DataType::F32), + make("OutputInfo", { TensorInfo(TensorShape(16U, 16U, 21U), 1, DataType::F32), TensorInfo(TensorShape(19U, 23U, 21U, 4U), 1, DataType::F32), TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32), TensorInfo(TensorShape(11U, 12U, 16U, 4U), 1, DataType::F32) })), - framework::dataset::make("ConvInfo", { PadStrideInfo(1, 1, 0, 0), + make("ConvInfo", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 1, 0, 0), PadStrideInfo(3, 2, 1, 0) })), - framework::dataset::make("FastMath", { true, + make("FastMath", { true, true, false, false })), - framework::dataset::make("Expected", { ConvolutionMethod::WINOGRAD, ConvolutionMethod::WINOGRAD, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM })), + make("Expected", { ConvolutionMethod::WINOGRAD, ConvolutionMethod::WINOGRAD, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM })), input_info, weights_info, output_info, conv_info, fast_math, expected) { ConvolutionMethod is_valid = NEConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(true), @@ -158,6 +183,14 @@ DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(z // *INDENT-ON* TEST_SUITE_END() // ConvolutionLayer +/* + Testing Strategy of Neon Winograd: + - There is no need to thoroughly test nchw cases because winograd kernels accept + nhwc and the tensors are permuted before and after if they're nchw. + - Except relu and bounded relu, testing activations for a single input + combination is enough because activation is not fused into winograd and called + separately. +*/ TEST_SUITE(WinogradLayer) template using NEWinogradConvolutionLayerFixture = WinogradConvolutionLayerFastMathValidationFixture; @@ -269,38 +302,148 @@ TEST_CASE(MultipleExecutionWithConfigure, framework::DatasetMode::ALL) } } +DATA_TEST_CASE(SupportedKernels, framework::DatasetMode::ALL, zip( + make("WeightsInfo", +{ + // Shapes are always in NCHW format. 
When layout is NHWC, the shape is permuted + + // Fp32, NCHW/NHWC (layout does not matter as it's ) + // 3x1, 1x3, 3x3 --> all TRUE + TensorInfo(TensorShape(3U, 3U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(1U, 3U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(3U, 1U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW), + + // 5x1, 1x5, 5x5 --> all TRUE + TensorInfo(TensorShape(5U, 5U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW), + TensorInfo(TensorShape(1U, 5U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(5U, 1U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW), + + // 7x1, 1x7, 7x7 + // --> all FALSE + TensorInfo(TensorShape(7U, 7U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW), + TensorInfo(TensorShape(1U, 7U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(7U, 1U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC), + + // unsupported kernel sizes + TensorInfo(TensorShape(2U, 2U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(5U, 2U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(3U, 6U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW), + + // Fp16 + TensorInfo(TensorShape(3U, 3U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC), + TensorInfo(TensorShape(1U, 3U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC), + TensorInfo(TensorShape(3U, 1U, 2U, 8U), 1, DataType::F16, DataLayout::NCHW), + + // 5x1, 1x5, 5x5 --> all TRUE + TensorInfo(TensorShape(5U, 5U, 2U, 8U), 1, DataType::F16, DataLayout::NCHW), + TensorInfo(TensorShape(1U, 5U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC), + TensorInfo(TensorShape(5U, 1U, 2U, 8U), 1, DataType::F16, DataLayout::NCHW), + + // 7x1, 1x7, 7x7 + // --> all FALSE + TensorInfo(TensorShape(7U, 7U, 2U, 8U), 1, DataType::F16, DataLayout::NCHW), + TensorInfo(TensorShape(1U, 7U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC), + TensorInfo(TensorShape(7U, 1U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC), + + // unsupported kernel sizes + TensorInfo(TensorShape(2U, 2U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC), + TensorInfo(TensorShape(5U, 2U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC), + TensorInfo(TensorShape(3U, 6U, 2U, 8U), 1, DataType::F16, DataLayout::NCHW), + +}), +make("Expected", +{ + // fp32 + true, true, true, // 3x3, 1x3, 3x1 + true, true, true, // 5x5, 1x5, 5x1 + false, true, true, // 7x7, 1x7, 7x1 + false, false, false, // random unsupported kernels + + // fp16 + true, false, false, // 3x3, 1x3, 3x1 + false, false, false, // 5x5, 1x5, 5x1 + false, false, false, // 7x7, 1x7, 7x1 + false, false, false, // random unsupported kernels +})), +weights_info_const, expected_const) +{ + DataType data_type = weights_info_const.data_type(); + DataLayout data_layout = weights_info_const.data_layout(); + + TensorInfo input_info = TensorInfo(TensorShape(17U, 31U, 2U), 1, data_type); + TensorInfo bias_info = TensorInfo(TensorShape(8U), 1, data_type); + TensorInfo weights_info = weights_info_const; + + if(data_layout == DataLayout::NHWC) + { + // Convert to NHWC + PermutationVector perm = PermutationVector(2U, 0U, 1U); + + TensorShape input_shape = input_info.tensor_shape(); + TensorShape weights_shape = weights_info.tensor_shape(); + permute(input_shape, perm); + permute(weights_shape, perm); + + input_info.set_tensor_shape(input_shape); + weights_info.set_tensor_shape(weights_shape); + + input_info.set_data_layout(data_layout); + weights_info.set_data_layout(data_layout); + bias_info.set_data_layout(data_layout); + } + + PadStrideInfo 
conv_info(1, 1, 0, 0); + + TensorShape output_shape = compute_deep_convolution_shape(input_info, weights_info, conv_info); + TensorInfo output_info = TensorInfo(output_shape, 1, data_type, data_layout); + + Status status = NEWinogradConvolutionLayer::validate( + &input_info, + &weights_info, + &bias_info, + &output_info, + conv_info, + ActivationLayerInfo(), + true /* fast math */); + + Status fp16_supported = ::arm_compute::error_on_unsupported_cpu_fp16("N/A", "N/A", 0, &input_info); + bool expected = expected_const && static_cast(fp16_supported); + + ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS); +} + TEST_SUITE(FP32) TEST_SUITE(Conv1x3) FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::SmallWinogradConvolutionLayer1x3Dataset(), - framework::dataset::make("DataType", { DataType::F32 })), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::SmallWinogradConvolutionLayer1x3Dataset(), + make("DataType", { DataType::F32 }), + ActivationFunctionsDataset, + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(Accessor(_target), _reference, abs_tolerance_f32); } FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEWinogradConvolutionLayerMixedDataLayoutFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(combine(combine(combine(combine(combine( - framework::dataset::make("Input", TensorShape(8U, 8U, 32U)), - framework::dataset::make("Weight", TensorShape(1U, 3U, 32U, 1U))), - framework::dataset::make("Bias", TensorShape(1U))), - framework::dataset::make("Output", TensorShape(8U, 6U, 1U))), - framework::dataset::make("PadStrideInfo", PadStrideInfo(1, 1, 0, 0))), - framework::dataset::make("Dilation", Size2D(1U, 1U))), - framework::dataset::make("DataType", { DataType::F32 })), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine( + make("Input", TensorShape(8U, 8U, 32U)), + make("Weight", TensorShape(1U, 3U, 32U, 1U)), + make("Bias", TensorShape(1U)), + make("Output", TensorShape(8U, 6U, 1U)), + make("PadStrideInfo", PadStrideInfo(1, 1, 0, 0)), + make("Dilation", Size2D(1U, 1U)), + make("DataType", { DataType::F32 }), + ActivationFunctionsDataset, + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(Accessor(_target), _reference, abs_tolerance_f32); } FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::LargeWinogradConvolutionLayer1x3Dataset(), - framework::dataset::make("DataType", { DataType::F32 })), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::LargeWinogradConvolutionLayer1x3Dataset(), + make("DataType", { DataType::F32 }), + make("ActivationInfo", { ActivationLayerInfo() }), + make("DataLayout", { DataLayout::NHWC }))) { // Validate output validate(Accessor(_target), _reference, abs_tolerance_1xN_f32); @@ -310,19 +453,19 @@ TEST_SUITE_END() // Conv1x3 TEST_SUITE(Conv3x1) FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::SmallWinogradConvolutionLayer3x1Dataset(), - framework::dataset::make("DataType", { DataType::F32 })), - ActivationFunctionsDataset), - 
framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::SmallWinogradConvolutionLayer3x1Dataset(), + make("DataType", { DataType::F32 }), + ActivationFunctionsDataset, + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(Accessor(_target), _reference, abs_tolerance_f32); } FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::LargeWinogradConvolutionLayer3x1Dataset(), - framework::dataset::make("DataType", { DataType::F32 })), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::LargeWinogradConvolutionLayer3x1Dataset(), + make("DataType", { DataType::F32 }), + make("ActivationInfo", { ActivationLayerInfo() }), + make("DataLayout", { DataLayout::NHWC }))) { // Validate output validate(Accessor(_target), _reference, abs_tolerance_1xN_f32); @@ -332,19 +475,19 @@ TEST_SUITE_END() // Conv3x1 TEST_SUITE(Conv1x5) FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::SmallWinogradConvolutionLayer1x5Dataset(), - framework::dataset::make("DataType", { DataType::F32 })), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::SmallWinogradConvolutionLayer1x5Dataset(), + make("DataType", { DataType::F32 }), + ActivationFunctionsDataset, + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(Accessor(_target), _reference, abs_tolerance_f32); } FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::LargeWinogradConvolutionLayer1x5Dataset(), - framework::dataset::make("DataType", { DataType::F32 })), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::LargeWinogradConvolutionLayer1x5Dataset(), + make("DataType", { DataType::F32 }), + make("ActivationInfo", { ActivationLayerInfo() }), + make("DataLayout", { DataLayout::NHWC }))) { // Validate output validate(Accessor(_target), _reference, abs_tolerance_1xN_f32); @@ -354,19 +497,19 @@ TEST_SUITE_END() // Conv1x5 TEST_SUITE(Conv5x1) FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::SmallWinogradConvolutionLayer5x1Dataset(), - framework::dataset::make("DataType", { DataType::F32 })), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::SmallWinogradConvolutionLayer5x1Dataset(), + make("DataType", { DataType::F32 }), + ActivationFunctionsDataset, + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(Accessor(_target), _reference, abs_tolerance_f32); } FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::LargeWinogradConvolutionLayer5x1Dataset(), - framework::dataset::make("DataType", { DataType::F32 })), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::LargeWinogradConvolutionLayer5x1Dataset(), + make("DataType", { DataType::F32 }), + make("ActivationInfo", { ActivationLayerInfo() }), 
+ make("DataLayout", { DataLayout::NHWC }))) { // Validate output validate(Accessor(_target), _reference, abs_tolerance_1xN_f32); @@ -376,10 +519,10 @@ TEST_SUITE_END() // Conv5x1 TEST_SUITE(Conv7x1) FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::SmallWinogradConvolutionLayer7x1Dataset(), - framework::dataset::make("DataType", { DataType::F32 })), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::SmallWinogradConvolutionLayer7x1Dataset(), + make("DataType", { DataType::F32 }), + ActivationFunctionsDataset, + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(Accessor(_target), _reference, abs_tolerance_f32); @@ -387,9 +530,9 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture, frame FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeWinogradConvolutionLayer7x1Dataset(), - framework::dataset::make("DataType", { DataType::F32 })), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + make("DataType", { DataType::F32 })), + make("ActivationInfo", { ActivationLayerInfo() })), + make("DataLayout", { DataLayout::NHWC }))) { // Validate output validate(Accessor(_target), _reference, abs_tolerance_1xN_f32); @@ -398,20 +541,20 @@ TEST_SUITE_END() // Conv7x1 TEST_SUITE(Conv1x7) FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::SmallWinogradConvolutionLayer1x7Dataset(), - framework::dataset::make("DataType", { DataType::F32 })), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::SmallWinogradConvolutionLayer1x7Dataset(), + make("DataType", { DataType::F32 }), + ActivationFunctionsDataset, + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(Accessor(_target), _reference, abs_tolerance_f32); } FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::LargeWinogradConvolutionLayer7x1Dataset(), - framework::dataset::make("DataType", { DataType::F32 })), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::LargeWinogradConvolutionLayer7x1Dataset(), + make("DataType", { DataType::F32 }), + make("ActivationInfo", { ActivationLayerInfo() }), + make("DataLayout", { DataLayout::NHWC }))) { // Validate output validate(Accessor(_target), _reference, abs_tolerance_1xN_f32); @@ -420,20 +563,40 @@ TEST_SUITE_END() // Conv1x7 TEST_SUITE(Conv3x3) FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(), - framework::dataset::make("DataType", { DataType::F32 })), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(), + make("DataType", { DataType::F32 }), + ActivationFunctionsDataset, + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + +{ + // Validate output + validate(Accessor(_target), _reference, abs_tolerance_f32); 
+} +/// It's enough to run the activations for a single weight/input combination and data type because +/// activation function is called on top of the winograd output as a separate operator +/// TODO: Enable after COMPMID-6573 is resolved +FIXTURE_DATA_TEST_CASE(RunActivations, NEWinogradConvolutionLayerFixture, framework::DatasetMode::DISABLED, + combine( + make("Input", TensorShape(3U, 3U, 32U)), + make("Weight", TensorShape(3U, 3U, 32U, 4U)), + make("Bias", TensorShape(4U)), + make("Output", TensorShape(1U, 1U, 4U)), + make("PadStrideInfo", PadStrideInfo(1, 1, 0, 0)), + make("Dilation", Size2D(1U, 1U)), + make("DataType", { DataType::F32 }), + ActivationFunctionsDatasetNightly, + make("DataLayout", { DataLayout::NHWC }))) { // Validate output validate(Accessor(_target), _reference, abs_tolerance_f32); } + FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::LargeWinogradConvolutionLayer3x3Dataset(), - framework::dataset::make("DataType", { DataType::F32 })), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::LargeWinogradConvolutionLayer3x3Dataset(), + make("DataType", { DataType::F32 }), + make("ActivationInfo", { ActivationLayerInfo() }), + make("DataLayout", { DataLayout::NHWC }))) { // Validate output @@ -444,20 +607,20 @@ TEST_SUITE_END() // Conv3x3 TEST_SUITE(Conv5x5) FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::SmallWinogradConvolutionLayer5x5Dataset(), - framework::dataset::make("DataType", { DataType::F32 })), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::SmallWinogradConvolutionLayer5x5Dataset(), + make("DataType", { DataType::F32 }), + ActivationFunctionsDataset, + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(Accessor(_target), _reference, abs_tolerance_f32); } FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::LargeWinogradConvolutionLayer5x5Dataset(), - framework::dataset::make("DataType", { DataType::F32 })), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::LargeWinogradConvolutionLayer5x5Dataset(), + make("DataType", { DataType::F32 }), + make("ActivationInfo", { ActivationLayerInfo() }), + make("DataLayout", { DataLayout::NHWC }))) { // Validate output @@ -467,12 +630,12 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture, frame TEST_SUITE_END() // Conv5x5 FIXTURE_DATA_TEST_CASE(RunSmallNoBias, NEWinogradConvolutionLayerNoBiasFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(framework::dataset::concat(datasets::SmallWinogradConvolutionLayer3x3Dataset(), - datasets::SmallWinogradConvolutionLayer5x5Dataset()), - framework::dataset::make("DataType", { DataType::F32 })), - ActivationFunctionsDataset), - - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(framework::dataset::concat( + datasets::SmallWinogradConvolutionLayer3x3Dataset(), + datasets::SmallWinogradConvolutionLayer5x5Dataset()), + make("DataType", { DataType::F32 }), + ActivationFunctionsDataset, + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output 
validate(Accessor(_target), _reference, abs_tolerance_f32); @@ -484,24 +647,26 @@ TEST_SUITE_END() // FP32 TEST_SUITE(FP16) using CLWinogradConvolutionLayerFastMathFixture16 = WinogradConvolutionLayerFastMathValidationFixture; -DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( - framework::dataset::make("InputInfo", { TensorInfo(TensorShape(18U, 18U, 32U), 1, DataType::F16), - TensorInfo(TensorShape(18U, 18U, 32U), 1, DataType::F16) - }), - framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 32U, 21U), 1, DataType::F16), - TensorInfo(TensorShape(3U, 3U, 32U, 21U), 1, DataType::F16) - })), - framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(16U, 16U, 21U), 1, DataType::F32), - TensorInfo(TensorShape(16U, 16U, 21U), 1, DataType::F16) - })), - framework::dataset::make("ConvInfo", { PadStrideInfo(1, 1, 0, 0), - PadStrideInfo(1, 1, 0, 0) - })), - framework::dataset::make("FastMath", { false, // case fp16 and fast_math False then disable Winograd - true // case fp16 and fast_math True then enable Winograd - })), - framework::dataset::make("Expected", { ConvolutionMethod::GEMM, ConvolutionMethod::WINOGRAD })), - input_info, weights_info, output_info, conv_info, fast_math, expected) +DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip( + make("InputInfo", { TensorInfo(TensorShape(18U, 18U, 32U), 1, DataType::F16), + TensorInfo(TensorShape(18U, 18U, 32U), 1, DataType::F16) + }), + make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 32U, 21U), 1, DataType::F16), + TensorInfo(TensorShape(3U, 3U, 32U, 21U), 1, DataType::F16) + }), + make("OutputInfo", { TensorInfo(TensorShape(16U, 16U, 21U), 1, DataType::F32), + TensorInfo(TensorShape(16U, 16U, 21U), 1, DataType::F16) + }), + make("ConvInfo", { PadStrideInfo(1, 1, 0, 0), + PadStrideInfo(1, 1, 0, 0) + }), + make("FastMath", +{ + false, // case fp16 and fast_math False then disable Winograd + true // case fp16 and fast_math True then enable Winograd +}), +make("Expected", { ConvolutionMethod::GEMM, ConvolutionMethod::WINOGRAD })), +input_info, weights_info, output_info, conv_info, fast_math, expected) { ConvolutionMethod is_valid = NEConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(true), &weights_info.clone()->set_is_resizable(true), @@ -511,10 +676,10 @@ DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(z TEST_SUITE(Conv3x3) FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(), - framework::dataset::make("DataType", { DataType::F16 })), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(), + make("DataType", { DataType::F16 }), + ActivationFunctionsDataset, + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output @@ -522,10 +687,10 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, fr } FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::LargeWinogradConvolutionLayer3x3Dataset(), - framework::dataset::make("DataType", { DataType::F16 })), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + 
combine(datasets::LargeWinogradConvolutionLayer3x3Dataset(),
+                               make("DataType", { DataType::F16 }),
+                               make("ActivationInfo", { ActivationLayerInfo() }),
+                               make("DataLayout", { DataLayout::NHWC })))
 {
     // Validate output
@@ -867,6 +1032,8 @@ TEST_SUITE(GEMMConvolutionLayer)
 template
 using NEGEMMConvolutionLayerFixture = ConvolutionValidationFixture;
 template
+using NEGEMMConvolutionLayerPaddedWeightsFixture = ConvolutionValidationPaddedWeightsFixture;
+template
 using NEGEMMConvolutionLayerMixedDataLayoutFixture = ConvolutionValidationFixture;

 /** Test case for memory injection in @ref cpu::CpuGemmConv2d.
@@ -968,7 +1135,9 @@ TEST_SUITE(Float)
 #if defined(ARM_COMPUTE_ENABLE_BF16)
 TEST_SUITE(BFLOAT16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
-                       framework::dataset::make("ReshapeWeights", { true })), framework::dataset::make("DataType", DataType::BFLOAT16)), framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                       framework::dataset::make("ReshapeWeights", { true })),
+                       framework::dataset::make("DataType", DataType::BFLOAT16)),
+                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
                        ActivationFunctionsDataset))
 {
     // Validate output
@@ -980,7 +1149,10 @@ TEST_SUITE_END() // BFLOAT16
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
-                       framework::dataset::make("ReshapeWeights", { true })), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("DataLayout", { DataLayout::NCHW })), ActivationFunctionsDataset))
+                       framework::dataset::make("ReshapeWeights", { true })),
+                       framework::dataset::make("DataType", DataType::F16)),
+                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                       ActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16);
@@ -990,7 +1162,9 @@ TEST_SUITE_END() // FP16
 TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
-                       framework::dataset::make("ReshapeWeights", { true })), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                       framework::dataset::make("ReshapeWeights", { true })),
+                       framework::dataset::make("DataType", DataType::F32)),
+                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                        ActivationFunctionsDataset))
 {
     // Validate output
@@ -1012,9 +1186,25 @@ FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEGEMMConvolutionLayerMixedDataLayout
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
 }
+/** Padded weights
+ * CpuGemmConv2d uses two different paths for reshaping the weights, depending on whether the weight tensor has holes
+ * (a common way to end up with "holes" in a tensor is via extended padding).
+ *
+ * We only need to test the padded-weight path here on a single floating-point data type and a single layout, because the fallback path is agnostic of them.
+ */
+FIXTURE_DATA_TEST_CASE(RunPaddedWeights, NEGEMMConvolutionLayerPaddedWeightsFixture, framework::DatasetMode::ALL, combine(datasets::SmallConvolutionLayerDataset(),
+                       framework::dataset::make("ReshapeWeights", { true }),
+                       framework::dataset::make("DataType", DataType::F32),
+                       framework::dataset::make("DataLayout", { DataLayout::NHWC })
+                       ))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
+}
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
+// TODO: COMPMID-6596 Extend quantized tests with at least one suite where the weight is padded (the legacy case, see floating point's RunPaddedWeights)
 template
 using NEGEMMConvolutionLayerQuantizedFixture = ConvolutionValidationQuantizedFixture;
 template
@@ -1030,10 +1220,18 @@ const auto QuantizedActivationFunctionsDataset = framework::dataset::make("Activ
                                                                            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f) });
 TEST_SUITE(Quantized)
+/// @note: Every asymmetric quantized test where there's no fused activation will have its quantization info ignored.
+/// This is because, instead of using the same quantization information for all the tensors, the fixture generates
+/// separate quantization info for each input and for the output tensor.
+/// Once dynamic quantization is also supported in the presence of an activation, these two versions should be merged
+/// again, with the explicitly specified quantization info removed.
 TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerQuantizedFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
-                       framework::dataset::make("ReshapeWeights", { true })), framework::dataset::make("DataType", DataType::QASYMM8)), framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })), QuantizedActivationFunctionsDataset))
+                       framework::dataset::make("ReshapeWeights", { true })),
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                       framework::dataset::make("QuantizationInfoIfActivationEnabled", { QuantizationInfo(2.f / 255.f, 10) })),
+                       QuantizedActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -1049,7 +1247,7 @@ FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEGEMMConvolutionLayerQuantizedFixtur
                        framework::dataset::make("ReshapeWeights", { true })),
                        framework::dataset::make("DataType", DataType::QASYMM8)),
                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                       framework::dataset::make("QuantizationInfoIfActivationEnabled", { QuantizationInfo(2.f / 255.f, 10) })),
                        QuantizedActivationFunctionsDataset))
 {
     // Validate output
@@ -1059,8 +1257,11 @@ TEST_SUITE_END() // QASYMM8
 TEST_SUITE(QASYMM8_SIGNED)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerQuantizedFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
-                       framework::dataset::make("ReshapeWeights", { true })), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.01f, -10) })), QuantizedActivationFunctionsDataset))
+                       framework::dataset::make("ReshapeWeights", { true })),
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC
})), + framework::dataset::make("QuantizationInfoIfActivationEnabled", { QuantizationInfo(0.01f, -10) })), + QuantizedActivationFunctionsDataset)) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); @@ -1076,7 +1277,7 @@ FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEGEMMConvolutionLayerQuantizedFixtur framework::dataset::make("ReshapeWeights", { true })), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })), + framework::dataset::make("QuantizationInfoIfActivationEnabled", { QuantizationInfo(2.f / 255.f, 10) })), QuantizedActivationFunctionsDataset)) { // Validate output @@ -1214,7 +1415,10 @@ TEST_CASE(MultipleExecutionWithConfigure, framework::DatasetMode::ALL) TEST_SUITE(Float) TEST_SUITE(FP32) FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectGEMMConv2dLayerFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(), - framework::dataset::make("ReshapeWeights", { true })), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("DataLayout", { DataLayout::NHWC })), ActivationFunctionsDataset)) + framework::dataset::make("ReshapeWeights", { true })), + framework::dataset::make("DataType", DataType::F32)), + framework::dataset::make("DataLayout", { DataLayout::NHWC })), + ActivationFunctionsDataset)) { // Validate output validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32)); @@ -1238,8 +1442,11 @@ const auto QuantizedActivationFunctionsDataset = framework::dataset::make("Activ TEST_SUITE(Quantized) TEST_SUITE(QASYMM8) FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectGEMMConv2dLayerQuantizedFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(), - framework::dataset::make("ReshapeWeights", { true })), framework::dataset::make("DataType", DataType::QASYMM8)), framework::dataset::make("DataLayout", { DataLayout::NHWC })), - framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })), QuantizedActivationFunctionsDataset)) + framework::dataset::make("ReshapeWeights", { true })), + framework::dataset::make("DataType", DataType::QASYMM8)), + framework::dataset::make("DataLayout", { DataLayout::NHWC })), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })), + QuantizedActivationFunctionsDataset)) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); @@ -1248,8 +1455,11 @@ TEST_SUITE_END() // QASYMM8 TEST_SUITE(QASYMM8_SIGNED) FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectGEMMConv2dLayerQuantizedFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(), - framework::dataset::make("ReshapeWeights", { true })), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), framework::dataset::make("DataLayout", { DataLayout::NHWC })), - framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.01f, -10) })), QuantizedActivationFunctionsDataset)) + framework::dataset::make("ReshapeWeights", { true })), + framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), + framework::dataset::make("DataLayout", { DataLayout::NHWC })), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.01f, -10) })), + QuantizedActivationFunctionsDataset)) { // Validate output 
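+    // Hedged illustration (not part of the original change; the names and values below are hypothetical): as noted
+    // at the start of TEST_SUITE(Quantized), when no activation is fused the "QuantizationInfoIfActivationEnabled"
+    // value above is ignored and the fixture generates independent quantization parameters for each input and for
+    // the output tensor, conceptually along these lines:
+    const QuantizationInfo hypothetical_src_qinfo(0.5f / 255.f, -3); // hypothetical per-tensor source quantization
+    const QuantizationInfo hypothetical_dst_qinfo(1.0f / 128.f, 5);  // hypothetical output quantization
+    (void)hypothetical_src_qinfo;                                    // the sketch is intentionally side-effect free
+    (void)hypothetical_dst_qinfo;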
validate(Accessor(_target), _reference, tolerance_qasymm8); diff --git a/tests/validation/NEON/DeconvolutionLayer.cpp b/tests/validation/NEON/DeconvolutionLayer.cpp index d26d26adf7e94d4bb40b3cc8d80dabc1feabf365..b4c049f6f931ba358e333210282591cc4c60da7e 100644 --- a/tests/validation/NEON/DeconvolutionLayer.cpp +++ b/tests/validation/NEON/DeconvolutionLayer.cpp @@ -52,54 +52,81 @@ constexpr float tolerance_num_fp16 = 0.02f; constexpr float tolerance_num_quant = 0.07f; /**< Tolerance number for quantized types */ const auto data4x4 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 4) * framework::dataset::make("StrideY", 1, 4) * framework::dataset::make("PadX", 0, 3) - * framework::dataset::make("PadY", 0, 3) * framework::dataset::make("NumKernels", { 3 }); + * framework::dataset::make("PadY", 0, 3) * framework::dataset::make("NumKernels", +{ + 3 +}); const auto data3x3 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 4) * framework::dataset::make("StrideY", 1, 4) * framework::dataset::make("PadX", 0, 2) - * framework::dataset::make("PadY", 0, 2) * framework::dataset::make("NumKernels", { 3 }); + * framework::dataset::make("PadY", 0, 2) * framework::dataset::make("NumKernels", +{ + 3 +}); const auto data3x3_asymm = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 2) * framework::dataset::make("StrideY", 1, 2) * framework::dataset::make("PadLeft", 0, 1) - * framework::dataset::make("PadRight", 0, 1) * framework::dataset::make("PadTop", 0, 1) * framework::dataset::make("PadBottom", 0, 1) * framework::dataset::make("NumKernels", { 3 }); + * framework::dataset::make("PadRight", 0, 1) * framework::dataset::make("PadTop", 0, 1) * framework::dataset::make("PadBottom", 0, 1) * framework::dataset::make("NumKernels", +{ + 3 +}); -const auto data9x9_small_asymm = framework::dataset::make("InputShape", TensorShape{ 10U, 10U, 1U, 1U }) *framework::dataset::make("StrideX", 2) *framework::dataset::make("StrideY", - 2) - *framework::dataset::make("PadLeft", 3) - *framework::dataset::make("PadRight", 4) *framework::dataset::make("PadTop", 3) *framework::dataset::make("PadBottom", 4) *framework::dataset::make("NumKernels", { 1 }); +const auto data9x9_small_asymm = framework::dataset::make("InputShape", TensorShape +{ + 10U, 10U, 1U, 1U +}) +*framework::dataset::make("StrideX", 2) *framework::dataset::make("StrideY", 2) *framework::dataset::make("PadLeft", 3) *framework::dataset::make("PadRight", 4) *framework::dataset::make("PadTop", + 3) *framework::dataset::make("PadBottom", 4) *framework::dataset::make("NumKernels", { 1 }); -const auto data9x9_large_asymm = framework::dataset::make("InputShape", TensorShape{ 640U, 360U, 56U, 1U }) *framework::dataset::make("StrideX", 2) *framework::dataset::make("StrideY", - 2) - *framework::dataset::make("PadLeft", 3) - *framework::dataset::make("PadRight", 4) *framework::dataset::make("PadTop", 3) *framework::dataset::make("PadBottom", 4) *framework::dataset::make("NumKernels", { 1 }); +const auto data9x9_large_asymm = framework::dataset::make("InputShape", TensorShape +{ + 640U, 360U, 56U, 1U +}) +*framework::dataset::make("StrideX", 2) *framework::dataset::make("StrideY", 2) *framework::dataset::make("PadLeft", 3) *framework::dataset::make("PadRight", 4) *framework::dataset::make("PadTop", + 3) *framework::dataset::make("PadBottom", 4) *framework::dataset::make("NumKernels", { 1 }); const auto data3x3_precommit = datasets::SmallDeconvolutionShapes() * 
framework::dataset::make("StrideX", 1, 2) * framework::dataset::make("StrideY", 1, 2) * framework::dataset::make("PadX", 0, 2) - * framework::dataset::make("PadY", 0, 2) * framework::dataset::make("NumKernels", { 3 }); + * framework::dataset::make("PadY", 0, 2) * framework::dataset::make("NumKernels", +{ + 3 +}); const auto data1x1 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 4) * framework::dataset::make("StrideY", 1, 4) * framework::dataset::make("PadX", 0, 1) - * framework::dataset::make("PadY", 0, 1) * framework::dataset::make("NumKernels", { 3 }); + * framework::dataset::make("PadY", 0, 1) * framework::dataset::make("NumKernels", +{ + 3 +}); const auto data5x1 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 4) * framework::dataset::make("StrideY", 1, 4) * framework::dataset::make("PadX", 0, 1) - * framework::dataset::make("PadY", 0, 1) * framework::dataset::make("NumKernels", { 3 }); + * framework::dataset::make("PadY", 0, 1) * framework::dataset::make("NumKernels", +{ + 3 +}); -const auto data_layouts_dataset = framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }); +const auto data_layouts_dataset = framework::dataset::make("DataLayout", +{ + DataLayout::NCHW, DataLayout::NHWC +}); -const auto add_bias_dataset = framework::dataset::make("AddBias", { true, false }); +const auto add_bias_dataset = framework::dataset::make("AddBias", +{ + true, false +}); const auto input_qinfo_dataset = framework::dataset::make("InputQInfo", { QuantizationInfo(1.f / 255.f, 0), - QuantizationInfo(2.f, 0), + QuantizationInfo(2.f, 0), }); const auto output_qinfo_dataset = framework::dataset::make("OutputQInfo", { QuantizationInfo(3.f / 255.f, 0), - QuantizationInfo(4.f, 0), + QuantizationInfo(4.f, 0), }); } // namespace TEST_SUITE(NEON) TEST_SUITE(DeconvolutionLayer) - // *INDENT-OFF* // clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( @@ -109,6 +136,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid bias shape TensorInfo(TensorShape(13U, 11U, 4U, 3U), 1, DataType::F32), // Window shrink TensorInfo(TensorShape(32U, 16U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(2U,2U,1U,1U), 1, DataType::F32), // Small shape no padding + TensorInfo(TensorShape(3U,26U,26U,1U), 1, DataType::F32), // Negative padding }), framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F16), TensorInfo(TensorShape(3U, 3U, 2U, 4U), 1, DataType::F32), @@ -116,6 +145,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( TensorInfo(TensorShape(3U, 2U, 2U, 2U), 1, DataType::F32), TensorInfo(TensorShape(3U, 3U, 4U), 1, DataType::F32), TensorInfo(TensorShape(1U, 1U, 2U, 4U), 1, DataType::F32), + TensorInfo(TensorShape(3U,3U,1U,1U), 1, DataType::F32), + TensorInfo(TensorShape(1U,1U,26U,88U), 1, DataType::F32), })), framework::dataset::make("BiasInfo", { TensorInfo(TensorShape(1U), 1, DataType::F16), TensorInfo(TensorShape(1U), 1, DataType::F32), @@ -123,6 +154,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( TensorInfo(TensorShape(25U, 11U), 1, DataType::F32), TensorInfo(TensorShape(1U), 1, DataType::F32), TensorInfo(TensorShape(4U), 1, DataType::F32), + TensorInfo(TensorShape(1U), 1, DataType::F32), + TensorInfo(TensorShape(88U), 1, DataType::F32), })), framework::dataset::make("OutputInfo",{ 
TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F16), TensorInfo(TensorShape(25U, 10U, 2U), 1, DataType::F32), @@ -130,6 +163,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( TensorInfo(TensorShape(13U, 13U, 2U), 1, DataType::F32), TensorInfo(TensorShape(11U, 9U, 1U, 3U), 1, DataType::F32), TensorInfo(TensorShape(32U, 16U, 4U), 1, DataType::F32), + TensorInfo(TensorShape(4U,4U,1U,1U), 1, DataType::F32), + TensorInfo(TensorShape(1U,78U,88U,1U), 1, DataType::F32), })), framework::dataset::make("PadStrideInfo", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(1, 1, 0, 0), @@ -137,8 +172,10 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( PadStrideInfo(1, 1, 0, 0), PadStrideInfo(1, 1, 1, 1), PadStrideInfo(1, 1, 0, 0), + PadStrideInfo(1, 1, 0, 0), + PadStrideInfo(2, 3, 3, 1), })), - framework::dataset::make("Expected", { false, false, false, false, false, true })), + framework::dataset::make("Expected", { false, false, false, false, false, true,true, false })), input_info, weights_info, bias_info, output_info, pad_info, expected) { bool is_valid = bool(NEDeconvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &bias_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), pad_info)); @@ -452,10 +489,22 @@ TEST_SUITE_END() // W5x1 TEST_SUITE_END() // QASYMM8_SIGNED -const auto input_qinfo_per_channel_dataset = framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 10) }); -const auto output_qinfo_per_channel_dataset = framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 0) }); -const auto input_signed_qinfo_per_channel_dataset = framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, -10) }); -const auto output_signed_qinfo_per_channel_dataset = framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 10) }); +const auto input_qinfo_per_channel_dataset = framework::dataset::make("InputQuantizationInfo", +{ + QuantizationInfo(1.f / 255.f, 10) +}); +const auto output_qinfo_per_channel_dataset = framework::dataset::make("OutputQuantizationInfo", +{ + QuantizationInfo(3.f / 255.f, 0) +}); +const auto input_signed_qinfo_per_channel_dataset = framework::dataset::make("InputQuantizationInfo", +{ + QuantizationInfo(1.f / 255.f, -10) +}); +const auto output_signed_qinfo_per_channel_dataset = framework::dataset::make("OutputQuantizationInfo", +{ + QuantizationInfo(3.f / 255.f, 10) +}); TEST_SUITE(QSYMM8_PER_CHANNEL) diff --git a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp index ab49ee1962348570365a9dee27acd992054721b4..a5d7a31cfc12d483a6dca703239d8dcfc35ffea9 100644 --- a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp +++ b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -42,6 +42,7 @@ namespace test { namespace validation { +using framework::dataset::make; using namespace arm_compute::misc::shape_calculator; namespace @@ -53,17 +54,42 @@ RelativeTolerance tolerance_f16(half_float::half(0.02)); /**< constexpr float tolerance_num = 0.05f; /**< Tolerance number */ #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -const auto depth_multipliers = framework::dataset::make("DepthMultiplier", { 1, 2, 8 }); -const auto large_depth_multipliers = framework::dataset::make("DepthMultiplier", { 1, 2, 5, 32 }); +const auto depth_multipliers = make("DepthMultiplier", { 1, 2, 8 }); +const auto large_depth_multipliers = make("DepthMultiplier", { 5, 32 }); //Activation Functions -const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo", +const auto ActivationFunctionsDataset = make("ActivationInfo", { ActivationLayerInfo(), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) }); -const auto input_qinfo_dataset = framework::dataset::make("InputQInfo", +const auto ActivationFunctionsDatasetNightly = make("ActivationInfo", +{ + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f, -0.5f), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SOFT_RELU), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ELU), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ABS), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SQUARE), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SWISH), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::HARD_SWISH), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 2.f, 1.f), +#ifdef __aarch64__ + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::GELU), +#endif // __aarch64__ +}); + +const auto ActivationFunctionsQuantizedDataset = make("ActivationInfo", +{ + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f, -0.5f), +}); + +const auto input_qinfo_dataset = make("InputQInfo", { QuantizationInfo(0.3f, 10), QuantizationInfo(2.2f, 10), @@ -76,7 +102,7 @@ TEST_SUITE(DepthwiseConvolutionLayer) // *INDENT-OFF* // clang-format off DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip( - framework::dataset::make("InputInfo", { TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32), // Mismatching data type input/weights + make("InputInfo", { TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32), // Mismatching data type input/weights TensorInfo(TensorShape(32U, 18U, 3U), 1, DataType::F32), // Mismatching input feature maps TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32), // Unsupported weights dimensions TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32), // Mismatching depth multiplier @@ -88,7 +114,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // dilation < 1 TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), }), - framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 
2U, 2U), 1, DataType::F16), + make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F16), TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F32), TensorInfo(TensorShape(5U, 5U, 2U, 2U), 1, DataType::F32), TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F32), @@ -100,7 +126,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F32), TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F32), })), - framework::dataset::make("BiasesInfo", { TensorInfo(TensorShape(2U), 1, DataType::F32), + make("BiasesInfo", { TensorInfo(TensorShape(2U), 1, DataType::F32), TensorInfo(TensorShape(2U), 1, DataType::F32), TensorInfo(TensorShape(2U), 1, DataType::F32), TensorInfo(TensorShape(2U), 1, DataType::F32), @@ -112,7 +138,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip TensorInfo(TensorShape(2U), 1, DataType::F32), TensorInfo(TensorShape(2U), 1, DataType::F32), })), - framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32), + make("OutputInfo", { TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32), TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32), TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32), TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32), @@ -124,7 +150,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32), TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32), })), - framework::dataset::make("ConvInfo", { PadStrideInfo(1, 1, 0, 0), + make("ConvInfo", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(1, 1, 0, 0), PadStrideInfo(1, 1, 0, 0), PadStrideInfo(1, 1, 0, 0), @@ -136,7 +162,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip PadStrideInfo(1, 1, 0, 0), PadStrideInfo(1, 1, 0, 0), })), - framework::dataset::make("DepthMultiplier", { 1, + make("DepthMultiplier", { 1, 1, 1, 3, @@ -148,7 +174,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip 1, 1, })), - framework::dataset::make("Dilation", { Size2D(1U, 1U), + make("Dilation", { Size2D(1U, 1U), Size2D(1U, 1U), Size2D(1U, 1U), Size2D(1U, 1U), @@ -160,7 +186,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip Size2D(0U, 1U), Size2D(1U, 1U), })), - framework::dataset::make("Expected", { false, false, false, false, false, false, false, false, false, false, true })), + make("Expected", { false, false, false, false, false, false, false, false, false, false, true })), input_info, weights_info, biases_info, output_info, conv_info, depth_multiplier,dilation, expected) { bool is_valid = bool(NEDepthwiseConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), @@ -169,7 +195,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip } DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip( - framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Mismatching data type input/weights + make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Mismatching data type input/weights TensorInfo(TensorShape(27U, 13U, 3U), 1, DataType::F32), // Mismatching input feature maps TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Mismatching depth multiplier TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid 
biases size @@ -178,7 +204,7 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32), // Patch size bigger than input width TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32), // Dilation < 1 }), - framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F16), + make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F16), TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32), TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32), TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32), @@ -187,7 +213,7 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32), TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32), })), - framework::dataset::make("BiasesInfo", { TensorInfo(TensorShape(2U), 1, DataType::F32), + make("BiasesInfo", { TensorInfo(TensorShape(2U), 1, DataType::F32), TensorInfo(TensorShape(2U), 1, DataType::F32), TensorInfo(TensorShape(2U), 1, DataType::F32), TensorInfo(TensorShape(4U), 1, DataType::F32), @@ -196,7 +222,7 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip TensorInfo(TensorShape(16U), 1, DataType::F32), TensorInfo(TensorShape(16U), 1, DataType::F32), })), - framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32), + make("OutputInfo", { TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32), TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32), TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32), TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32), @@ -205,7 +231,7 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip TensorInfo(TensorShape(25U, 11U, 16U), 1, DataType::F32), TensorInfo(TensorShape(25U, 11U, 16U), 1, DataType::F32), })), - framework::dataset::make("ConvInfo", { PadStrideInfo(1, 1, 0, 0), + make("ConvInfo", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(1, 1, 0, 0), PadStrideInfo(1, 1, 0, 0), PadStrideInfo(1, 1, 0, 0), @@ -214,7 +240,7 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip PadStrideInfo(1, 1, 0, 0), PadStrideInfo(1, 1, 0, 0), })), - framework::dataset::make("DepthMultiplier", { 1, + make("DepthMultiplier", { 1, 1, 3, 1, @@ -223,7 +249,7 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip 2, 2, })), - framework::dataset::make("Dilation", { Size2D(1U, 1U), + make("Dilation", { Size2D(1U, 1U), Size2D(1U, 1U), Size2D(1U, 1U), Size2D(1U, 1U), @@ -232,7 +258,7 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip Size2D(25U, 1U), Size2D(0U, 1U), })), - framework::dataset::make("Expected", { false, false, false, false, false, false, false, false})), + make("Expected", { false, false, false, false, false, false, false, false})), input_info, weights_info, biases_info, output_info, conv_info, depth_multiplier,dilation, expected) { bool is_valid = bool(NEDepthwiseConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &biases_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info, depth_multiplier, ActivationLayerInfo(), dilation)); @@ -249,40 +275,53 @@ using NEDepthwiseConvolutionLayerVariableWeightsFixture = DepthwiseConvolutionLa TEST_SUITE(Float) TEST_SUITE(F32) + +FIXTURE_DATA_TEST_CASE_NEW(RunActivations, 
NEDepthwiseConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, + combine( + make("In", TensorShape(33U, 27U, 11U, 3U)), + make("Weights", Size2D(3U, 4U)), + make("Info", PadStrideInfo(1, 2, 0, 1)), + make("Dilation", Size2D(2U, 2U)), + make("DepthMultiplier", { 5 }), + make("DataType", DataType::F32), + make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW }), + ActivationFunctionsDatasetNightly)) +{ + validate(Accessor(_target), _reference, tolerance_f32); +} + TEST_SUITE(Generic) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(), depth_multipliers), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + make("DataType", DataType::F32)), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), ActivationFunctionsDataset)) { validate(Accessor(_target), _reference, tolerance_f32); } FIXTURE_DATA_TEST_CASE_NEW(RunMixedDataLayout, NEDepthwiseConvolutionLayerMixedDataLayoutFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(), - framework::dataset::make("DepthMultiplier", { 2 })), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - framework::dataset::make("ActivationInfo", ActivationLayerInfo()))) + make("DepthMultiplier", { 2 })), + make("DataType", DataType::F32)), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + make("ActivationInfo", ActivationLayerInfo()))) { validate(Accessor(_target), _reference, tolerance_f32); } FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(), large_depth_multipliers), - framework::dataset::make("DataType", - DataType::F32)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DataType", DataType::F32)), + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", { ActivationLayerInfo() }))) { validate(Accessor(_target), _reference, tolerance_f32); } - TEST_SUITE(Dilation) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(), depth_multipliers), - framework::dataset::make("DataType", + make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), ActivationFunctionsDataset)) { validate(Accessor(_target), _reference, tolerance_f32); @@ -290,10 +329,10 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture, FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(), large_depth_multipliers), - framework::dataset::make("DataType", + make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", { ActivationLayerInfo() }))) { validate(Accessor(_target), _reference, tolerance_f32); } 
@@ -303,9 +342,9 @@ TEST_SUITE_END() // Generic TEST_SUITE(W3x3) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), depth_multipliers), - framework::dataset::make("DataType", + make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), ActivationFunctionsDataset)) { validate(Accessor(_target), _reference, tolerance_f32); @@ -313,10 +352,10 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture, FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(), large_depth_multipliers), - framework::dataset::make("DataType", + make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", { ActivationLayerInfo() }))) { validate(Accessor(_target), _reference, tolerance_f32); } @@ -324,9 +363,9 @@ TEST_SUITE(Dilation) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(), depth_multipliers), - framework::dataset::make("DataType", + make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), ActivationFunctionsDataset)) { validate(Accessor(_target), _reference, tolerance_f32); @@ -334,10 +373,10 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture, FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(), large_depth_multipliers), - framework::dataset::make("DataType", + make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", { ActivationLayerInfo() }))) { validate(Accessor(_target), _reference, tolerance_f32); } @@ -348,70 +387,70 @@ TEST_SUITE_END() // W3x3 TEST_SUITE(Optimized) FIXTURE_DATA_TEST_CASE_NEW(RunSmall3x3, NEDepthwiseConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(), - framework::dataset::make("DepthMultiplier", 1)), - framework::dataset::make("DataType", + make("DepthMultiplier", 1)), + make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), ActivationFunctionsDataset)) { validate(Accessor(_target), _reference, tolerance_f32); } FIXTURE_DATA_TEST_CASE_NEW(RunVariableWeightsSmall3x3, NEDepthwiseConvolutionLayerVariableWeightsFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(), - framework::dataset::make("DepthMultiplier", 1)), - framework::dataset::make("DataType", + make("DepthMultiplier", 1)), + make("DataType", DataType::F32)), - 
framework::dataset::make("DataLayout", { DataLayout::NHWC })), + make("DataLayout", { DataLayout::NHWC })), ActivationFunctionsDataset)) { validate(Accessor(_target), _reference, tolerance_f32); } FIXTURE_DATA_TEST_CASE_NEW(RunMixedDataLayout3x3, NEDepthwiseConvolutionLayerMixedDataLayoutFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(), - framework::dataset::make("DepthMultiplier", 1)), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - framework::dataset::make("ActivationInfo", ActivationLayerInfo()))) + make("DepthMultiplier", 1)), + make("DataType", DataType::F32)), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + make("ActivationInfo", ActivationLayerInfo()))) { validate(Accessor(_target), _reference, tolerance_f32); } FIXTURE_DATA_TEST_CASE_NEW(RunSmall5x5, NEDepthwiseConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(), - framework::dataset::make("DepthMultiplier", 1)), - framework::dataset::make("DataType", + make("DepthMultiplier", 1)), + make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), ActivationFunctionsDataset)) { validate(Accessor(_target), _reference, tolerance_f32); } FIXTURE_DATA_TEST_CASE_NEW(RunVariableWeightsSmall5x5, NEDepthwiseConvolutionLayerVariableWeightsFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(), - framework::dataset::make("DepthMultiplier", 1)), - framework::dataset::make("DataType", + make("DepthMultiplier", 1)), + make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", { DataLayout::NHWC })), + make("DataLayout", { DataLayout::NHWC })), ActivationFunctionsDataset)) { validate(Accessor(_target), _reference, tolerance_f32); } FIXTURE_DATA_TEST_CASE_NEW(RunLarge3x3, NEDepthwiseConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(), - framework::dataset::make("DepthMultiplier", 1)), - framework::dataset::make("DataType", + make("DepthMultiplier", 1)), + make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", { ActivationLayerInfo() }))) { validate(Accessor(_target), _reference, tolerance_f32); } FIXTURE_DATA_TEST_CASE_NEW(RunVariableWeightsLarge3x3, NEDepthwiseConvolutionLayerVariableWeightsFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(), - framework::dataset::make("DepthMultiplier", 1)), - framework::dataset::make("DataType", + make("DepthMultiplier", 1)), + make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", { DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", { ActivationLayerInfo() }))) { validate(Accessor(_target), _reference, tolerance_f32); } @@ -420,22 +459,37 @@ TEST_SUITE_END() // F32 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC TEST_SUITE(F16) + +FIXTURE_DATA_TEST_CASE_NEW(RunActivations, 
NEDepthwiseConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, + combine( + make("In", TensorShape(33U, 27U, 11U, 3U)), + make("Weights", Size2D(3U, 4U)), + make("Info", PadStrideInfo(1, 2, 0, 1)), + make("Dilation", Size2D(2U, 2U)), + make("DepthMultiplier", { 5 }), + make("DataType", DataType::F16), + make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW }), + ActivationFunctionsDatasetNightly)) +{ + validate(Accessor(_target), _reference, tolerance_f16, tolerance_num); +} + TEST_SUITE(Generic) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(), depth_multipliers), - framework::dataset::make("DataType", + make("DataType", DataType::F16)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), ActivationFunctionsDataset)) { validate(Accessor(_target), _reference, tolerance_f16, tolerance_num); } FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(), large_depth_multipliers), - framework::dataset::make("DataType", + make("DataType", DataType::F16)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", { ActivationLayerInfo() }))) { validate(Accessor(_target), _reference, tolerance_f16, tolerance_num); } @@ -444,9 +498,8 @@ TEST_SUITE(Dilation) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(), depth_multipliers), - framework::dataset::make("DataType", - DataType::F16)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + make("DataType", DataType::F16)), + make("DataLayout", { DataLayout::NHWC })), ActivationFunctionsDataset)) { validate(Accessor(_target), _reference, tolerance_f16, tolerance_num); @@ -454,10 +507,9 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture, f FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(), large_depth_multipliers), - framework::dataset::make("DataType", - DataType::F16)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DataType", DataType::F16)), + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", { ActivationLayerInfo() }))) { validate(Accessor(_target), _reference, tolerance_f16, tolerance_num); } @@ -469,9 +521,9 @@ using NEDepthwiseConvolutionLayerFixture = DepthwiseConvolutionLayerValidationFi TEST_SUITE(W3x3) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), depth_multipliers), - framework::dataset::make("DataType", + make("DataType", DataType::F16)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), ActivationFunctionsDataset)) { validate(Accessor(_target), _reference, tolerance_f16); @@ 
-479,10 +531,10 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture, f FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(), large_depth_multipliers), - framework::dataset::make("DataType", + make("DataType", DataType::F16)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", { ActivationLayerInfo() }))) { validate(Accessor(_target), _reference, tolerance_f16); } @@ -492,9 +544,9 @@ TEST_SUITE(Dilation) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(), depth_multipliers), - framework::dataset::make("DataType", + make("DataType", DataType::F16)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), ActivationFunctionsDataset)) { validate(Accessor(_target), _reference, tolerance_f16); @@ -502,10 +554,10 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture, f FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(), large_depth_multipliers), - framework::dataset::make("DataType", + make("DataType", DataType::F16)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", { ActivationLayerInfo() }))) { validate(Accessor(_target), _reference, tolerance_f16); } @@ -516,31 +568,31 @@ TEST_SUITE_END() // W3x3 TEST_SUITE(Optimized) FIXTURE_DATA_TEST_CASE_NEW(RunSmallW3x3, NEDepthwiseConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(), - framework::dataset::make("DepthMultiplier", 1)), - framework::dataset::make("DataType", + make("DepthMultiplier", 1)), + make("DataType", DataType::F16)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), ActivationFunctionsDataset)) { validate(Accessor(_target), _reference, tolerance_f16); } FIXTURE_DATA_TEST_CASE_NEW(RunSmallW5x5, NEDepthwiseConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(), - framework::dataset::make("DepthMultiplier", 1)), - framework::dataset::make("DataType", + make("DepthMultiplier", 1)), + make("DataType", DataType::F16)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), ActivationFunctionsDataset)) { validate(Accessor(_target), _reference, tolerance_f16); } FIXTURE_DATA_TEST_CASE_NEW(RunLargeW3x3, NEDepthwiseConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(), - framework::dataset::make("DepthMultiplier", 1)), - framework::dataset::make("DataType", + make("DepthMultiplier", 1)), + make("DataType", DataType::F16)), - 
framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", { ActivationLayerInfo() }))) { validate(Accessor(_target), _reference, tolerance_f16); } @@ -558,26 +610,43 @@ using NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture = Depthwise TEST_SUITE(Quantized) TEST_SUITE(QASYMM8) + +FIXTURE_DATA_TEST_CASE_NEW(RunActivations, NEDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, + combine( + make("In", TensorShape(33U, 27U, 11U, 3U)), + make("Weights", Size2D(3U, 4U)), + make("Info", PadStrideInfo(1, 2, 0, 1)), + make("Dilation", Size2D(2U, 2U)), + make("DepthMultiplier", { 5 }), + make("DataType", DataType::QASYMM8), + make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) }), + make("DstQuantizationInfo", { QuantizationInfo(0.05f, 4) }), + make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW }), + ActivationFunctionsQuantizedDataset)) +{ + validate(Accessor(_target), _reference, tolerance_qasymm8); +} + TEST_SUITE(Generic) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(), depth_multipliers), - framework::dataset::make("DataType", DataType::QASYMM8)), + make("DataType", DataType::QASYMM8)), input_qinfo_dataset), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })), - framework::dataset::make("DataLayout", { DataLayout::NHWC })), + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })), + make("DataLayout", { DataLayout::NHWC })), ActivationFunctionsDataset)) { validate(Accessor(_target), _reference, tolerance_qasymm8); } FIXTURE_DATA_TEST_CASE_NEW(RunMixedDataLayout, NEDepthwiseConvolutionLayerQuantizedMixedDataLayoutFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(), - framework::dataset::make("DepthMultiplier", { 2 })), - framework::dataset::make("DataType", DataType::QASYMM8)), + make("DepthMultiplier", { 2 })), + make("DataType", DataType::QASYMM8)), input_qinfo_dataset), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })), - framework::dataset::make("DataLayout", { DataLayout::NHWC })), - framework::dataset::make("ActivationInfo", ActivationLayerInfo()))) + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })), + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", ActivationLayerInfo()))) { validate(Accessor(_target), _reference, tolerance_qasymm8); } @@ -585,10 +654,10 @@ TEST_SUITE(Dilation) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(), depth_multipliers), - framework::dataset::make("DataType", DataType::QASYMM8)), + make("DataType", DataType::QASYMM8)), input_qinfo_dataset), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.8f, 1) })), - framework::dataset::make("DataLayout", { DataLayout::NHWC })), + make("DstQuantizationInfo", { QuantizationInfo(0.8f, 1) })), + make("DataLayout", { DataLayout::NHWC })), ActivationFunctionsDataset)) { validate(Accessor(_target), _reference, tolerance_qasymm8); @@ -596,11 +665,11 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, 
NEDepthwiseConvolutionLayerQuantizedFixture FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(), large_depth_multipliers), - framework::dataset::make("DataType", DataType::QASYMM8)), + make("DataType", DataType::QASYMM8)), input_qinfo_dataset), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.9f, 11) })), - framework::dataset::make("DataLayout", { DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DstQuantizationInfo", { QuantizationInfo(0.9f, 11) })), + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", { ActivationLayerInfo() }))) { validate(Accessor(_target), _reference, tolerance_qasymm8); } @@ -609,10 +678,10 @@ TEST_SUITE_END() // Generic TEST_SUITE(W3x3) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), depth_multipliers), - framework::dataset::make("DataType", DataType::QASYMM8)), + make("DataType", DataType::QASYMM8)), input_qinfo_dataset), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), - framework::dataset::make("DataLayout", { DataLayout::NHWC })), + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + make("DataLayout", { DataLayout::NHWC })), ActivationFunctionsDataset)) { validate(Accessor(_target), _reference, tolerance_qasymm8); @@ -620,11 +689,11 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(), large_depth_multipliers), - framework::dataset::make("DataType", DataType::QASYMM8)), + make("DataType", DataType::QASYMM8)), input_qinfo_dataset), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), - framework::dataset::make("DataLayout", { DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", { ActivationLayerInfo() }))) { validate(Accessor(_target), _reference, tolerance_qasymm8); } @@ -633,10 +702,10 @@ TEST_SUITE(Dilation) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(), depth_multipliers), - framework::dataset::make("DataType", DataType::QASYMM8)), + make("DataType", DataType::QASYMM8)), input_qinfo_dataset), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.7f, 10) })), - framework::dataset::make("DataLayout", { DataLayout::NHWC })), + make("DstQuantizationInfo", { QuantizationInfo(0.7f, 10) })), + make("DataLayout", { DataLayout::NHWC })), ActivationFunctionsDataset)) { validate(Accessor(_target), _reference, tolerance_qasymm8); @@ -644,11 +713,11 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, 
combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(), large_depth_multipliers), - framework::dataset::make("DataType", DataType::QASYMM8)), + make("DataType", DataType::QASYMM8)), input_qinfo_dataset), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), - framework::dataset::make("DataLayout", { DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", { ActivationLayerInfo() }))) { validate(Accessor(_target), _reference, tolerance_qasymm8); } @@ -658,47 +727,47 @@ TEST_SUITE_END() // W3x3 TEST_SUITE(Optimized) FIXTURE_DATA_TEST_CASE_NEW(RunSmall3x3, NEDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(), - framework::dataset::make("DepthMultiplier", 1)), - framework::dataset::make("DataType", DataType::QASYMM8)), + make("DepthMultiplier", 1)), + make("DataType", DataType::QASYMM8)), input_qinfo_dataset), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), - framework::dataset::make("DataLayout", { DataLayout::NHWC })), + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + make("DataLayout", { DataLayout::NHWC })), ActivationFunctionsDataset)) { validate(Accessor(_target), _reference, tolerance_qasymm8); } FIXTURE_DATA_TEST_CASE_NEW(RunMixedDataLayout3x3, NEDepthwiseConvolutionLayerQuantizedMixedDataLayoutFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(), - framework::dataset::make("DepthMultiplier", 1)), - framework::dataset::make("DataType", DataType::QASYMM8)), + make("DepthMultiplier", 1)), + make("DataType", DataType::QASYMM8)), input_qinfo_dataset), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), - framework::dataset::make("DataLayout", { DataLayout::NHWC })), - framework::dataset::make("ActivationInfo", ActivationLayerInfo()))) + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", ActivationLayerInfo()))) { validate(Accessor(_target), _reference, tolerance_qasymm8); } FIXTURE_DATA_TEST_CASE_NEW(RunSmall5x5, NEDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(), - framework::dataset::make("DepthMultiplier", 1)), - framework::dataset::make("DataType", + make("DepthMultiplier", 1)), + make("DataType", DataType::QASYMM8)), input_qinfo_dataset), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), - framework::dataset::make("DataLayout", { DataLayout::NHWC })), + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + make("DataLayout", { DataLayout::NHWC })), ActivationFunctionsDataset)) { validate(Accessor(_target), _reference, tolerance_qasymm8); } FIXTURE_DATA_TEST_CASE_NEW(RunLarge3x3, NEDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(), - framework::dataset::make("DepthMultiplier", 1)), - framework::dataset::make("DataType", + make("DepthMultiplier", 1)), + make("DataType", 
DataType::QASYMM8)), input_qinfo_dataset), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), - framework::dataset::make("DataLayout", { DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", { ActivationLayerInfo() }))) { validate(Accessor(_target), _reference, tolerance_qasymm8); } @@ -706,14 +775,31 @@ TEST_SUITE_END() // Optimized TEST_SUITE_END() // QASYMM8 TEST_SUITE(QASYMM8_SIGNED) + +FIXTURE_DATA_TEST_CASE_NEW(RunActivations, NEDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, + combine( + make("In", TensorShape(33U, 27U, 11U, 3U)), + make("Weights", Size2D(3U, 4U)), + make("Info", PadStrideInfo(1, 2, 0, 1)), + make("Dilation", Size2D(2U, 2U)), + make("DepthMultiplier", { 5 }), + make("DataType", DataType::QASYMM8_SIGNED), + make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) }), + make("DstQuantizationInfo", { QuantizationInfo(0.05f, 4) }), + make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW }), + ActivationFunctionsQuantizedDataset)) +{ + validate(Accessor(_target), _reference, tolerance_qasymm8); +} + TEST_SUITE(Generic) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(), depth_multipliers), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), + make("DataType", DataType::QASYMM8_SIGNED)), input_qinfo_dataset), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW })), + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), ActivationFunctionsDataset)) { validate(Accessor(_target), _reference, tolerance_qasymm8); @@ -723,10 +809,10 @@ TEST_SUITE(Dilation) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(), depth_multipliers), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), + make("DataType", DataType::QASYMM8_SIGNED)), input_qinfo_dataset), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.8f, 1) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW })), + make("DstQuantizationInfo", { QuantizationInfo(0.8f, 1) })), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), ActivationFunctionsDataset)) { validate(Accessor(_target), _reference, tolerance_qasymm8); @@ -734,11 +820,11 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(), large_depth_multipliers), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), + make("DataType", DataType::QASYMM8_SIGNED)), input_qinfo_dataset), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.9f, 11) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW })), - ActivationFunctionsDataset)) + make("DstQuantizationInfo", { QuantizationInfo(0.9f, 11) })), + make("DataLayout", { DataLayout::NCHW })), + 
make("ActivationInfo", { ActivationLayerInfo() }))) { validate(Accessor(_target), _reference, tolerance_qasymm8); } @@ -748,10 +834,10 @@ TEST_SUITE_END() // Generic TEST_SUITE(W3x3) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), depth_multipliers), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), + make("DataType", DataType::QASYMM8_SIGNED)), input_qinfo_dataset), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW })), + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), ActivationFunctionsDataset)) { validate(Accessor(_target), _reference, tolerance_qasymm8); @@ -759,11 +845,11 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(), large_depth_multipliers), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), + make("DataType", DataType::QASYMM8_SIGNED)), input_qinfo_dataset), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW })), - ActivationFunctionsDataset)) + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + make("DataLayout", { DataLayout::NCHW })), + make("ActivationInfo", { ActivationLayerInfo() }))) { validate(Accessor(_target), _reference, tolerance_qasymm8); } @@ -771,10 +857,10 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture TEST_SUITE(Dilation) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(), depth_multipliers), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), + make("DataType", DataType::QASYMM8_SIGNED)), input_qinfo_dataset), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.7f, 10) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW })), + make("DstQuantizationInfo", { QuantizationInfo(0.7f, 10) })), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), ActivationFunctionsDataset)) { validate(Accessor(_target), _reference, tolerance_qasymm8); @@ -782,11 +868,11 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(), large_depth_multipliers), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), + make("DataType", DataType::QASYMM8_SIGNED)), input_qinfo_dataset), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW })), - ActivationFunctionsDataset)) + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + make("DataLayout", { DataLayout::NCHW })), + make("ActivationInfo", { ActivationLayerInfo() }))) { validate(Accessor(_target), 
_reference, tolerance_qasymm8); } @@ -796,37 +882,37 @@ TEST_SUITE_END() // W3x3 TEST_SUITE(Optimized) FIXTURE_DATA_TEST_CASE_NEW(RunSmall3x3, NEDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(), - framework::dataset::make("DepthMultiplier", 1)), - framework::dataset::make("DataType", + make("DepthMultiplier", 1)), + make("DataType", DataType::QASYMM8_SIGNED)), input_qinfo_dataset), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW })), + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), ActivationFunctionsDataset)) { validate(Accessor(_target), _reference, tolerance_qasymm8); } FIXTURE_DATA_TEST_CASE_NEW(RunSmall5x5, NEDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(), - framework::dataset::make("DepthMultiplier", 1)), - framework::dataset::make("DataType", + make("DepthMultiplier", 1)), + make("DataType", DataType::QASYMM8_SIGNED)), input_qinfo_dataset), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW })), + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), ActivationFunctionsDataset)) { validate(Accessor(_target), _reference, tolerance_qasymm8); } FIXTURE_DATA_TEST_CASE_NEW(RunLarge3x3, NEDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(), - framework::dataset::make("DepthMultiplier", 1)), - framework::dataset::make("DataType", + make("DepthMultiplier", 1)), + make("DataType", DataType::QASYMM8_SIGNED)), input_qinfo_dataset), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW })), - ActivationFunctionsDataset)) + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + make("DataLayout", { DataLayout::NCHW })), + make("ActivationInfo", { ActivationLayerInfo() }))) { validate(Accessor(_target), _reference, tolerance_qasymm8); } @@ -834,15 +920,33 @@ TEST_SUITE_END() // Optimized TEST_SUITE_END() // QASYMM8_SIGNED TEST_SUITE(QSYMM8_PER_CHANNEL) + +FIXTURE_DATA_TEST_CASE_NEW(RunActivations, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::NIGHTLY, + combine( + make("In", TensorShape(33U, 27U, 11U, 3U)), + make("Weights", Size2D(3U, 4U)), + make("Info", PadStrideInfo(1, 2, 0, 1)), + make("Dilation", Size2D(2U, 2U)), + make("DepthMultiplier", { 5 }), + make("InputDataType", DataType::QASYMM8), + make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL), + make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) }), + make("DstQuantizationInfo", { QuantizationInfo(0.05f, 4) }), + make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW }), + ActivationFunctionsQuantizedDataset)) +{ + validate(Accessor(_target), _reference, tolerance_qasymm8); +} + TEST_SUITE(Generic) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::PRECOMMIT, 
combine(combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(), depth_multipliers), - framework::dataset::make("InputDataType", DataType::QASYMM8)), - framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), + make("InputDataType", DataType::QASYMM8)), + make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), input_qinfo_dataset), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), ActivationFunctionsDataset)) { validate(Accessor(_target), _reference, tolerance_qasymm8); @@ -852,11 +956,11 @@ TEST_SUITE(Dilation) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(), depth_multipliers), - framework::dataset::make("InputDataType", DataType::QASYMM8)), - framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), + make("InputDataType", DataType::QASYMM8)), + make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), input_qinfo_dataset), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), ActivationFunctionsDataset)) { validate(Accessor(_target), _reference, tolerance_qasymm8); @@ -864,12 +968,12 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedSymmetr FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(), depth_multipliers), - framework::dataset::make("InputDataType", DataType::QASYMM8)), - framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), + make("InputDataType", DataType::QASYMM8)), + make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), input_qinfo_dataset), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })), + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", { ActivationLayerInfo() }))) { validate(Accessor(_target), _reference, tolerance_qasymm8); } @@ -879,25 +983,25 @@ TEST_SUITE_END() // Generic TEST_SUITE(Optimized) FIXTURE_DATA_TEST_CASE_NEW(RunSmall3x3, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(), - framework::dataset::make("DepthMultiplier", 1)), - framework::dataset::make("InputDataType", DataType::QASYMM8)), - framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), + make("DepthMultiplier", 1)), + make("InputDataType", DataType::QASYMM8)), + make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), input_qinfo_dataset), - framework::dataset::make("DstQuantizationInfo", { 
QuantizationInfo(0.5f, 4) })), - framework::dataset::make("DataLayout", { DataLayout::NHWC })), + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })), + make("DataLayout", { DataLayout::NHWC })), ActivationFunctionsDataset)) { validate(Accessor(_target), _reference, tolerance_qasymm8); } FIXTURE_DATA_TEST_CASE_NEW(RunLarge3x3, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(), - framework::dataset::make("DepthMultiplier", 1)), - framework::dataset::make("InputDataType", DataType::QASYMM8)), - framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), + make("DepthMultiplier", 1)), + make("InputDataType", DataType::QASYMM8)), + make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), input_qinfo_dataset), - framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })), - framework::dataset::make("DataLayout", { DataLayout::NHWC })), - ActivationFunctionsDataset)) + make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })), + make("DataLayout", { DataLayout::NHWC })), + make("ActivationInfo", { ActivationLayerInfo() }))) { validate(Accessor(_target), _reference, tolerance_qasymm8); } diff --git a/tests/validation/NEON/DilatedConvolutionLayer.cpp b/tests/validation/NEON/DilatedConvolutionLayer.cpp index 30bf6904569503e1f41f76b1ea31ca9c7602e52f..2ede4fac4f5ee56319526b1ca397eb45e3cb8bfa 100644 --- a/tests/validation/NEON/DilatedConvolutionLayer.cpp +++ b/tests/validation/NEON/DilatedConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -162,13 +162,18 @@ template using NEGEMMDilatedConvolutionLayerQuantizedFixture = ConvolutionValidationQuantizedFixture; TEST_SUITE(Quantized) +/// @note: Every asymmetric quantized test where there's no fused activation will have its quantization info ignored +/// This is because instead of using the same quantization information for all the tensors, the fixture generates +/// separate quantization info for each input and the output tensor. +/// When we can also support dynamic quantization with the presence of activation, we can remove the explicit +/// quantization info. 
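// The note above says the fixture now generates quantization info per tensor
// ("dynamic quantization") instead of reusing one explicit QuantizationInfo.
// Below is a minimal, self-contained sketch of that idea, assuming an
// asymmetric 8-bit mapping; SimpleQuantInfo and choose_asymm8_qinfo are
// hypothetical stand-ins, not the fixture's or the library's actual API.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

struct SimpleQuantInfo
{
    float        scale{1.f};
    std::int32_t offset{0};
};

// Pick scale/offset so that [min(values), max(values)] (always including 0)
// maps onto the representable range [0, 255].
inline SimpleQuantInfo choose_asymm8_qinfo(const std::vector<float> &values)
{
    SimpleQuantInfo qi{};
    if (values.empty())
    {
        return qi;
    }
    const auto  mm    = std::minmax_element(values.begin(), values.end());
    const float min_v = std::min(0.f, *mm.first);
    const float max_v = std::max(0.f, *mm.second);
    if (max_v > min_v)
    {
        qi.scale  = (max_v - min_v) / 255.f;
        qi.offset = static_cast<std::int32_t>(std::lround(-min_v / qi.scale));
    }
    return qi;
}
// Under this scheme each tensor (source, weights, destination) would receive its
// own SimpleQuantInfo derived from its data, which is why the explicit
// quantization values are ignored in the no-activation cases below.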
TEST_SUITE(QASYMM8) FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMDilatedConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallDilatedConvolutionLayerDataset(), framework::dataset::make("ReshapeWeights", { true })), framework::dataset::make("DataType", DataType::QASYMM8)), framework::dataset::make("DataLayout", { DataLayout::NCHW })), - framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })), + framework::dataset::make("IgnoredQuantizationInfo", { QuantizationInfo() })), framework::dataset::make("ActivationLayerInfo", ActivationLayerInfo()))) { // Validate output @@ -179,7 +184,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMDilatedConvolutionLayerQuantizedFixture tolerance_qasymm8(1); constexpr AbsoluteTolerance tolerance_qasymm8_signed(1); /** CNN data types */ -const auto CNNDataTypes = framework::dataset::make("DataType", +const auto CNNDataTypes = make("DataType", { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC DataType::F16, @@ -66,18 +67,25 @@ const auto CNNDataTypes = framework::dataset::make("DataType", DataType::F32, }); -const auto FullyConnectedParameters = combine(framework::dataset::make("TransposeWeights", { false, true }), framework::dataset::make("ReshapeWeights", { false, true })); +const auto FullyConnectedParameters = combine(make("TransposeWeights", { false, true }), make("ReshapeWeights", { false, true })); -const auto QuantizationData = framework::dataset::make("QuantizationInfo", +const auto QuantizationData = make("QuantizationInfo", { QuantizationInfo(1.f / 256.f, 10), QuantizationInfo(1.1f, 10), }); -const auto EmptyActivationFunctionDataset = framework::dataset::make("ActivationInfo", + +const auto IgnoredQuantizationData = make("IgnoredQuantizationInfo", +{ + QuantizationInfo(), +}); + +const auto NoActivationFunctionDataset = make("ActivationInfo", { ActivationLayerInfo(), }); -const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo", + +const auto ActivationFunctionsDataset = make("ActivationInfo", { ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f), @@ -85,7 +93,7 @@ const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH), }); -const auto ActivationFunctionsQuantizedDataset = framework::dataset::make("ActivationInfo", +const auto ActivationFunctionsQuantizedDataset = make("ActivationInfo", { ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f), @@ -242,37 +250,37 @@ TEST_CASE(Quant8_Signed_Mult_gt_1, framework::DatasetMode::ALL) // *INDENT-OFF* // clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip( - framework::dataset::make("InputInfo", { TensorInfo(TensorShape(9U, 5U, 7U, 3U), 1, DataType::F32), // Mismatching data types + make("InputInfo", { TensorInfo(TensorShape(9U, 5U, 7U, 3U), 1, DataType::F32), // Mismatching data types TensorInfo(TensorShape(8U, 4U, 6U, 4U), 1, DataType::F32), TensorInfo(TensorShape(8U, 4U, 6U, 4U), 1, DataType::F32), TensorInfo(TensorShape(9U, 5U, 7U, 3U), 1, DataType::F32), // Invalid weights dimensions TensorInfo(TensorShape(9U, 5U, 7U, 3U), 1, DataType::F32), // Wrongly reshaped weights TensorInfo(TensorShape(8U, 4U, 6U, 4U), 1, DataType::F32), }), - framework::dataset::make("WeightsInfo",{ 
TensorInfo(TensorShape(315U, 271U), 1, DataType::F16), + make("WeightsInfo",{ TensorInfo(TensorShape(315U, 271U), 1, DataType::F16), TensorInfo(TensorShape(192U, 192U), 1, DataType::F32), TensorInfo(TensorShape(192U, 192U), 1, DataType::F32), TensorInfo(TensorShape(217U, 315U), 1, DataType::F32), TensorInfo(TensorShape(217U, 315U), 1, DataType::F32), TensorInfo(TensorShape(192U, 192U), 1, DataType::F32), })), - framework::dataset::make("BiasInfo",{ TensorInfo(TensorShape(271U), 1, DataType::F32), + make("BiasInfo",{ TensorInfo(TensorShape(271U), 1, DataType::F32), TensorInfo(TensorShape(192U), 1, DataType::F32), TensorInfo(TensorShape(192U), 1, DataType::F32), TensorInfo(TensorShape(271U), 1, DataType::F32), TensorInfo(TensorShape(271U), 1, DataType::F32), TensorInfo(TensorShape(192U), 1, DataType::F32), })), - framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(271U, 3U), 1, DataType::F32), + make("OutputInfo",{ TensorInfo(TensorShape(271U, 3U), 1, DataType::F32), TensorInfo(TensorShape(192U, 4U), 1, DataType::F32), TensorInfo(TensorShape(192U, 4U), 1, DataType::F32), TensorInfo(TensorShape(271U, 3U), 1, DataType::F32), TensorInfo(TensorShape(271U, 3U), 1, DataType::F32), TensorInfo(TensorShape(192U, 4U), 1, DataType::F32), })), - framework::dataset::make("TransposeWeights",{ true, true, false, true, true, true })), - framework::dataset::make("ReshapedWeights",{ false, false, false, false, false , false})), - framework::dataset::make("Expected", { false, true, true, false, false, true })), + make("TransposeWeights",{ true, true, false, true, true, true })), + make("ReshapedWeights",{ false, false, false, false, false , false})), + make("Expected", { false, true, true, false, false, true })), input_info, weights_info, bias_info, output_info, transpose_weights, reshaped_weights, expected) { // Create Fully Connected layer info @@ -298,80 +306,79 @@ using NEFullyConnectedLayerDynamicBiasFixture = FullyConnectedWithDynamicBiasFix TEST_SUITE(Float) #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC TEST_SUITE(FP16) -FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(), - FullyConnectedParameters), - framework::dataset::make("DataType", DataType::F16)), - EmptyActivationFunctionDataset)) +FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(), + FullyConnectedParameters, + make("DataType", DataType::F16), + NoActivationFunctionDataset)) { // Validate output validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num_f16, abs_tolerance_f16); } -FIXTURE_DATA_TEST_CASE(RunWithActivation, NEFullyConnectedLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine( +FIXTURE_DATA_TEST_CASE(RunWithActivation, NEFullyConnectedLayerFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::FullyConnectedLayerWithActivationDataset(), - FullyConnectedParameters), - framework::dataset::make("DataType", DataType::F16)), + FullyConnectedParameters, + make("DataType", DataType::F16), ActivationFunctionsDataset)) { // Validate output validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num_f16, abs_tolerance_f16); } -FIXTURE_DATA_TEST_CASE(RunLarge, NEFullyConnectedLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeFullyConnectedLayerDataset(), - FullyConnectedParameters), - framework::dataset::make("DataType", DataType::F16)), - 
EmptyActivationFunctionDataset)) +FIXTURE_DATA_TEST_CASE(RunLarge, NEFullyConnectedLayerFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeFullyConnectedLayerDataset(), + FullyConnectedParameters, + make("DataType", DataType::F16), + NoActivationFunctionDataset)) { // Validate output validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num_f16, abs_tolerance_f16); } -FIXTURE_DATA_TEST_CASE(RunDynamicWeights, NEFullyConnectedLayerDynamicWeightsFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(), - framework::dataset::make("DataType", DataType::F16)), - framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))), - framework::dataset::make("WeightsReshaped", { false, true }))) +FIXTURE_DATA_TEST_CASE(RunDynamicWeights, NEFullyConnectedLayerDynamicWeightsFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(), + make("DataType", DataType::F16), + make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)), + make("WeightsReshaped", { false, true }))) { } TEST_SUITE_END() #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ TEST_SUITE(FP32) -FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(), FullyConnectedParameters), - framework::dataset::make("DataType", DataType::F32)), - EmptyActivationFunctionDataset)) +FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(), FullyConnectedParameters, + make("DataType", DataType::F32), + NoActivationFunctionDataset)) { // Validate output validate(Accessor(_target), _reference, rel_tolerance_f32, 0, abs_tolerance_f32); } -FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEFullyConnectedLayerMixedDataLayoutFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine( - framework::dataset::make("Input", TensorShape(9U, 5U, 7U)), - framework::dataset::make("Weights", TensorShape(315U, 271U))), - framework::dataset::make("Biases", TensorShape(271U))), - framework::dataset::make("Output", TensorShape(271U))), - FullyConnectedParameters), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)))) +FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEFullyConnectedLayerMixedDataLayoutFixture, framework::DatasetMode::PRECOMMIT, combine( + make("Input", TensorShape(9U, 5U, 7U)), + make("Weights", TensorShape(315U, 271U)), + make("Biases", TensorShape(271U)), + make("Output", TensorShape(271U)), + FullyConnectedParameters, + make("DataType", DataType::F32), + make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)))) { // Validate output validate(Accessor(_target), _reference, rel_tolerance_f32, 0, abs_tolerance_f32); } -FIXTURE_DATA_TEST_CASE(RunWithActivation, NEFullyConnectedLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine( - combine(datasets::FullyConnectedLayerWithActivationDataset(), - FullyConnectedParameters), - framework::dataset::make("DataType", DataType::F32)), +FIXTURE_DATA_TEST_CASE(RunWithActivation, NEFullyConnectedLayerFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::FullyConnectedLayerWithActivationDataset(), + FullyConnectedParameters, + 
make("DataType", DataType::F32), ActivationFunctionsDataset)) { // Validate output validate(Accessor(_target), _reference, rel_tolerance_f32, 0, abs_tolerance_f32); } -FIXTURE_DATA_TEST_CASE(RunLarge, NEFullyConnectedLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeFullyConnectedLayerDataset(), FullyConnectedParameters), - framework::dataset::make("DataType", DataType::F32)), - EmptyActivationFunctionDataset)) +FIXTURE_DATA_TEST_CASE(RunLarge, NEFullyConnectedLayerFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeFullyConnectedLayerDataset(), FullyConnectedParameters, + make("DataType", DataType::F32), + NoActivationFunctionDataset)) { // Validate output validate(Accessor(_target), _reference, rel_tolerance_f32, 0, abs_tolerance_f32); } -FIXTURE_DATA_TEST_CASE(RunDynamicWeights, NEFullyConnectedLayerDynamicWeightsFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))), - framework::dataset::make("WeightsReshaped", { false, true }))) +FIXTURE_DATA_TEST_CASE(RunDynamicWeights, NEFullyConnectedLayerDynamicWeightsFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(), + make("DataType", DataType::F32), + make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)), + make("WeightsReshaped", { false, true }))) { } TEST_SUITE_END() @@ -384,103 +391,150 @@ using NEFullyConnectedLayerQuantizedMixedDataLayoutFixture = FullyConnectedLayer TEST_SUITE(Quantized) TEST_SUITE(QASYMM8) -FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine( - combine(datasets::SmallFullyConnectedLayerDataset(), - FullyConnectedParameters), - framework::dataset::make("DataType", DataType::QASYMM8)), - QuantizationData), - EmptyActivationFunctionDataset)) +FIXTURE_DATA_TEST_CASE(RunMixedDataLayoutWithActivation, NEFullyConnectedLayerQuantizedMixedDataLayoutFixture, framework::DatasetMode::PRECOMMIT, + combine( + make("Input", TensorShape(9U, 5U, 7U)), + make("Weights", TensorShape(315U, 271U)), + make("Biases", TensorShape(271U)), + make("Output", TensorShape(271U)), + FullyConnectedParameters, + make("DataType", DataType::QASYMM8), + QuantizationData, + make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEFullyConnectedLayerQuantizedMixedDataLayoutFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(combine(combine(combine(combine( - framework::dataset::make("Input", TensorShape(9U, 5U, 7U)), - framework::dataset::make("Weights", TensorShape(315U, 271U))), - framework::dataset::make("Biases", TensorShape(271U))), - framework::dataset::make("Output", TensorShape(271U))), - FullyConnectedParameters), - framework::dataset::make("DataType", DataType::QASYMM8)), - QuantizationData), - framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)))) -{ - // Validate output - validate(Accessor(_target), _reference, tolerance_qasymm8); -} -FIXTURE_DATA_TEST_CASE(RunWithActivation, NEFullyConnectedLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine( 
+FIXTURE_DATA_TEST_CASE(RunSmallWithActivation, NEFullyConnectedLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::FullyConnectedLayerWithActivationDataset(), - FullyConnectedParameters), - framework::dataset::make("DataType", DataType::QASYMM8)), - QuantizationData), + FullyConnectedParameters, + make("DataType", DataType::QASYMM8), + QuantizationData, ActivationFunctionsQuantizedDataset)) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); } +FIXTURE_DATA_TEST_CASE(RunDynamicWeightsWithActivation, NEFullyConnectedLayerDynamicWeightsFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(), + make("DataType", DataType::QASYMM8), + make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)), + make("WeightsReshaped", { false }))) +{ +} +FIXTURE_DATA_TEST_CASE(RunDynamicBiasWithActivation, NEFullyConnectedLayerDynamicBiasFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(), + make("DataType", DataType::QASYMM8), + make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)))) +{ +} -FIXTURE_DATA_TEST_CASE(RunLarge, NEFullyConnectedLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine( - combine(datasets::LargeFullyConnectedLayerDataset(), - FullyConnectedParameters), - framework::dataset::make("DataType", DataType::QASYMM8)), - QuantizationData), - EmptyActivationFunctionDataset)) +// Dynamic Quantization Tests here +FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, + combine(datasets::SmallFullyConnectedLayerDataset(), + FullyConnectedParameters, + make("DataType", DataType::QASYMM8), + IgnoredQuantizationData, + NoActivationFunctionDataset)) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); } - -FIXTURE_DATA_TEST_CASE(RunDynamicBias, NEFullyConnectedLayerDynamicBiasFixture, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallFullyConnectedLayerDataset(), - framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)))) +FIXTURE_DATA_TEST_CASE(RunLarge, NEFullyConnectedLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine( + datasets::LargeFullyConnectedLayerDataset(), + FullyConnectedParameters, + framework::dataset::make("DataType", DataType::QASYMM8), + QuantizationData, + NoActivationFunctionDataset)) { + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunDynamicWeights, NEFullyConnectedLayerDynamicWeightsFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(), - framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))), - framework::dataset::make("WeightsReshaped", { false }))) +FIXTURE_DATA_TEST_CASE(RunDynamicBias, NEFullyConnectedLayerDynamicBiasFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(), + make("DataType", DataType::QASYMM8), + NoActivationFunctionDataset)) { } -TEST_SUITE_END() -TEST_SUITE(QASYMM8_SIGNED) -FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine( - 
combine(datasets::SmallFullyConnectedLayerDataset(), - FullyConnectedParameters), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), - QuantizationData), - EmptyActivationFunctionDataset)) +FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEFullyConnectedLayerQuantizedMixedDataLayoutFixture, framework::DatasetMode::PRECOMMIT, + combine( + make("Input", TensorShape(9U, 5U, 7U)), + make("Weights", TensorShape(315U, 271U)), + make("Biases", TensorShape(271U)), + make("Output", TensorShape(271U)), + FullyConnectedParameters, + make("DataType", DataType::QASYMM8), + IgnoredQuantizationData, + NoActivationFunctionDataset)) { // Validate output - validate(Accessor(_target), _reference, tolerance_qasymm8_signed); + validate(Accessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEFullyConnectedLayerQuantizedMixedDataLayoutFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(combine(combine(combine(combine( - framework::dataset::make("Input", TensorShape(9U, 5U, 7U)), - framework::dataset::make("Weights", TensorShape(315U, 271U))), - framework::dataset::make("Biases", TensorShape(271U))), - framework::dataset::make("Output", TensorShape(271U))), - FullyConnectedParameters), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), - QuantizationData), - framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)))) +FIXTURE_DATA_TEST_CASE(RunDynamicWeights, NEFullyConnectedLayerDynamicWeightsFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(), + make("DataType", DataType::QASYMM8), + NoActivationFunctionDataset, + make("WeightsReshaped", { false }))) +{ +} +TEST_SUITE_END() // QASYMM8 +TEST_SUITE(QASYMM8_SIGNED) +FIXTURE_DATA_TEST_CASE(RunMixedDataLayoutWithActivation, NEFullyConnectedLayerQuantizedMixedDataLayoutFixture, framework::DatasetMode::PRECOMMIT, + combine( + make("Input", TensorShape(9U, 5U, 7U)), + make("Weights", TensorShape(315U, 271U)), + make("Biases", TensorShape(271U)), + make("Output", TensorShape(271U)), + FullyConnectedParameters, + make("DataType", DataType::QASYMM8_SIGNED), + QuantizationData, + make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunWithActivation, NEFullyConnectedLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine( +FIXTURE_DATA_TEST_CASE(RunWithActivation, NEFullyConnectedLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::FullyConnectedLayerWithActivationDataset(), - FullyConnectedParameters), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), - QuantizationData), + FullyConnectedParameters, + make("DataType", DataType::QASYMM8_SIGNED), + QuantizationData, ActivationFunctionsQuantizedDataset)) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8_signed); } -FIXTURE_DATA_TEST_CASE(RunDynamicWeights, NEFullyConnectedLayerDynamicWeightsFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), - framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))), - framework::dataset::make("WeightsReshaped", { false }))) +FIXTURE_DATA_TEST_CASE(RunDynamicWeightsWithActivation, 
NEFullyConnectedLayerDynamicWeightsFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(), + make("DataType", DataType::QASYMM8_SIGNED), + make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)), + make("WeightsReshaped", { false }))) +{ +} + +// Dynamic Quantization tests +FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine( + datasets::SmallFullyConnectedLayerDataset(), + FullyConnectedParameters, + make("DataType", DataType::QASYMM8_SIGNED), + IgnoredQuantizationData, + NoActivationFunctionDataset)) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8_signed); +} +FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEFullyConnectedLayerQuantizedMixedDataLayoutFixture, framework::DatasetMode::PRECOMMIT, + combine( + make("Input", TensorShape(9U, 5U, 7U)), + make("Weights", TensorShape(315U, 271U)), + make("Biases", TensorShape(271U)), + make("Output", TensorShape(271U)), + FullyConnectedParameters, + make("DataType", DataType::QASYMM8_SIGNED), + QuantizationData, + NoActivationFunctionDataset)) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(RunDynamicWeights, NEFullyConnectedLayerDynamicWeightsFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(), + make("DataType", DataType::QASYMM8_SIGNED), + NoActivationFunctionDataset, + make("WeightsReshaped", { false }))) { } TEST_SUITE_END() // QASYMM8_SIGNED diff --git a/tests/validation/NEON/MatMul.cpp b/tests/validation/NEON/MatMul.cpp index 0a20d1849006f8594b5f6db457621bbf9d4565d4..8cc20211f27d28e82b4ee4206dd11b11d5641432 100644 --- a/tests/validation/NEON/MatMul.cpp +++ b/tests/validation/NEON/MatMul.cpp @@ -40,6 +40,8 @@ namespace test { namespace validation { +using framework::dataset::make; + TEST_SUITE(NEON) TEST_SUITE(MatMul) @@ -53,42 +55,46 @@ constexpr AbsoluteTolerance tolerance_qasymm8_signed(0); // clang-format off // *INDENT-OFF* // Validation Tests -DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( - framework::dataset::make("InputAInfo", { TensorInfo(TensorShape(9U, 6U), 1, DataType::F32), // Mismatching datatype - TensorInfo(TensorShape(9U, 6U), 1, DataType::S32), // Unsupported datatypes - TensorInfo(TensorShape(9U, 6U, 2U), 1, DataType::F32), // Broadcasting in batch dimension not supported - TensorInfo(TensorShape(9U, 6U), 1, DataType::F32), // Invalid shape for multiplication - TensorInfo(TensorShape(9U, 6U), 1, DataType::F32), - TensorInfo(TensorShape(9U, 6U , 12U) , 1 , DataType::F32), - TensorInfo(TensorShape(9U, 6U , 12U) , 1 , DataType::F32), // Tensors are not dynamic - TensorInfo(TensorShape(9U, 6U), 1, DataType::QASYMM8), - TensorInfo(TensorShape(9U, 6U), 1, DataType::QASYMM8_SIGNED), - TensorInfo(TensorShape(9U, 6U), 1, DataType::QASYMM8_SIGNED), // Mismatching data type - }), - framework::dataset::make("InputBInfo",{ TensorInfo(TensorShape(5U, 9U), 1, DataType::QASYMM8), - TensorInfo(TensorShape(5U, 9U), 1, DataType::S32), - TensorInfo(TensorShape(5U, 9U, 1U), 1, DataType::F32), - TensorInfo(TensorShape(5U, 12U), 1, DataType::F32), - TensorInfo(TensorShape(5U, 9U), 1, DataType::F32), - TensorInfo(TensorShape(5U, 9U, 12U), 1, DataType::F32), - TensorInfo(TensorShape(5U, 9U, 12U), 1, DataType::F32), - TensorInfo(TensorShape(5U, 9U), 1, DataType::QASYMM8), - TensorInfo(TensorShape(5U, 9U), 1, DataType::QASYMM8_SIGNED), - 
TensorInfo(TensorShape(5U, 9U), 1, DataType::QASYMM8_SIGNED), - })), - framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(5U, 6U), 1, DataType::F32), - TensorInfo(TensorShape(5U, 6U), 1, DataType::S32), - TensorInfo(TensorShape(5U, 6U, 2U), 1, DataType::F32), - TensorInfo(TensorShape(5U, 6U), 1, DataType::F32), - TensorInfo(TensorShape(5U, 6U), 1, DataType::F32), - TensorInfo(TensorShape(5U, 6U, 12U) , 1, DataType::F32), - TensorInfo(TensorShape(5U, 6U, 12U) , 1, DataType::F32), - TensorInfo(TensorShape(5U, 6U), 1, DataType::QASYMM8), - TensorInfo(TensorShape(5U, 6U), 1, DataType::QASYMM8_SIGNED), - TensorInfo(TensorShape(5U, 6U), 1, DataType::QASYMM8), - })), - framework::dataset::make( "TensorIsConst", {false, false, false, false, false , false, true, false, false, false} )), - framework::dataset::make("Expected", { false, false, false, false, true, true, false, true, true, false })), +DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, + zip( + make("InputAInfo", { + TensorInfo(TensorShape(9U, 6U), 1, DataType::F32), // Mismatching datatype + TensorInfo(TensorShape(9U, 6U), 1, DataType::S32), // Unsupported datatypes + TensorInfo(TensorShape(9U, 6U, 2U), 1, DataType::F32), // Broadcasting in batch dimension not supported + TensorInfo(TensorShape(9U, 6U), 1, DataType::F32), // Invalid shape for multiplication + TensorInfo(TensorShape(9U, 6U), 1, DataType::F32), + TensorInfo(TensorShape(9U, 6U , 12U) , 1 , DataType::F32), + TensorInfo(TensorShape(9U, 6U , 12U) , 1 , DataType::F32), // Tensors are not dynamic + TensorInfo(TensorShape(9U, 6U), 1, DataType::QASYMM8), + TensorInfo(TensorShape(9U, 6U), 1, DataType::QASYMM8_SIGNED), + TensorInfo(TensorShape(9U, 6U), 1, DataType::QASYMM8_SIGNED), // Mismatching data type + }), + make("InputBInfo", { + TensorInfo(TensorShape(5U, 9U), 1, DataType::QASYMM8), + TensorInfo(TensorShape(5U, 9U), 1, DataType::S32), + TensorInfo(TensorShape(5U, 9U, 1U), 1, DataType::F32), + TensorInfo(TensorShape(5U, 12U), 1, DataType::F32), + TensorInfo(TensorShape(5U, 9U), 1, DataType::F32), + TensorInfo(TensorShape(5U, 9U, 12U), 1, DataType::F32), + TensorInfo(TensorShape(5U, 9U, 12U), 1, DataType::F32), + TensorInfo(TensorShape(5U, 9U), 1, DataType::QASYMM8), + TensorInfo(TensorShape(5U, 9U), 1, DataType::QASYMM8_SIGNED), + TensorInfo(TensorShape(5U, 9U), 1, DataType::QASYMM8_SIGNED), + }), + make("OutputInfo", { + TensorInfo(TensorShape(5U, 6U), 1, DataType::F32), + TensorInfo(TensorShape(5U, 6U), 1, DataType::S32), + TensorInfo(TensorShape(5U, 6U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(5U, 6U), 1, DataType::F32), + TensorInfo(TensorShape(5U, 6U), 1, DataType::F32), + TensorInfo(TensorShape(5U, 6U, 12U) , 1, DataType::F32), + TensorInfo(TensorShape(5U, 6U, 12U) , 1, DataType::F32), + TensorInfo(TensorShape(5U, 6U), 1, DataType::QASYMM8), + TensorInfo(TensorShape(5U, 6U), 1, DataType::QASYMM8_SIGNED), + TensorInfo(TensorShape(5U, 6U), 1, DataType::QASYMM8), + }), + make("TensorIsConst", {false, false, false, false, false , false, true, false, false, false}), + make("Expected", { false, false, false, false, true, true, false, true, true, false })), a_info, b_info, output_info, are_tensors_const, expected) { TensorInfo a{a_info}; @@ -121,40 +127,48 @@ using NEQuantizedMatMulFixture = QuantizedMatMulValidationFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallMatMulDataset(), - framework::dataset::make("TransposeA", { false, true })), - framework::dataset::make("TransposeB", { false, true })), - 
framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("ActivationInfo", { ActivationLayerInfo(), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) }))) +FIXTURE_DATA_TEST_CASE(RunSmall, NEMatMulFixture, framework::DatasetMode::PRECOMMIT, + combine( + datasets::SmallMatMulDataset(), + make("TransposeA", { false, true }), + make("TransposeB", { false, true }), + make("DataType", DataType::F32), + make("ActivationInfo", { ActivationLayerInfo(), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) }))) { // Validate output validate(Accessor(_target), _reference, tolerance_fp32); } -FIXTURE_DATA_TEST_CASE(RunLarge, NEMatMulFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeMatMulDataset(), - framework::dataset::make("TransposeA", { false, true })), - framework::dataset::make("TransposeB", { false, true })), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("ActivationInfo", { ActivationLayerInfo(), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) }))) +FIXTURE_DATA_TEST_CASE(RunLarge, NEMatMulFixture, framework::DatasetMode::NIGHTLY, + combine( + datasets::LargeMatMulDataset(), + make("TransposeA", { false, true }), + make("TransposeB", { false, true }), + make("DataType", DataType::F32), + make("ActivationInfo", { ActivationLayerInfo(), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) }))) { // Validate output validate(Accessor(_target), _reference, tolerance_fp32); } -FIXTURE_DATA_TEST_CASE(RunHighDimensions, NEMatMulFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::HighDimensionalMatMulDataset(), - framework::dataset::make("TransposeA", { false, true })), - framework::dataset::make("TransposeB", { false, true })), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("ActivationInfo", { ActivationLayerInfo(), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) }))) +FIXTURE_DATA_TEST_CASE(RunHighDimensions, NEMatMulFixture, framework::DatasetMode::NIGHTLY, + combine( + datasets::HighDimensionalMatMulDataset(), + make("TransposeA", { false, true }), + make("TransposeB", { false, true }), + make("DataType", DataType::F32), + make("ActivationInfo", { ActivationLayerInfo(), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) }))) { // Validate output validate(Accessor(_target), _reference, tolerance_fp32); } -FIXTURE_DATA_TEST_CASE(RunStressDynamicTensors, NEMatMulDynamicTensorsFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallMatMulDataset(), - framework::dataset::make("TransposeA", { false, true })), - framework::dataset::make("TransposeB", { false, true })), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("ActivationInfo", { ActivationLayerInfo(), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) })), - framework::dataset::make("NumberOfRuns", 5))) +FIXTURE_DATA_TEST_CASE(RunStressDynamicTensors, NEMatMulDynamicTensorsFixture, framework::DatasetMode::PRECOMMIT, + combine( + datasets::SmallMatMulDataset(), + make("TransposeA", { false, true }), + make("TransposeB", { false, true }), + make("DataType", DataType::F32), + make("ActivationInfo", { ActivationLayerInfo(), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) }), + make("NumberOfRuns", 5))) { // Validate output validate(Accessor(_target), _reference, tolerance_fp32); 
@@ -165,17 +179,18 @@ TEST_SUITE_END() // FP32 /* Note : MatMul BF16 is enabled by specifying FP32 datatype and enabling the fast math setting */ constexpr AbsoluteTolerance tolerance_bf16(0.001f); TEST_SUITE(BF16) -FIXTURE_DATA_TEST_CASE(RunSmall, NEMatMulFastMathFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(combine(combine( - datasets::SmallMatMulDataset(), - framework::dataset::make("TransposeA", { false, true })), - framework::dataset::make("TransposeB", { false, true })), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("ActivationInfo", { ActivationLayerInfo() })), - framework::dataset::make("RunTimes", { 0 })), - framework::dataset::make("Settings", { CpuMatMulSettings().fast_math(true) })), - framework::dataset::make("LhsQInfo", { QuantizationInfo() })), - framework::dataset::make("RhsQInfo", { QuantizationInfo() })), - framework::dataset::make("OutQInfo", { QuantizationInfo() })) +FIXTURE_DATA_TEST_CASE(RunSmall, NEMatMulFastMathFixture, framework::DatasetMode::PRECOMMIT, + combine( + datasets::SmallMatMulDataset(), + make("TransposeA", { false, true }), + make("TransposeB", { false, true }), + make("DataType", DataType::F32), + make("ActivationInfo", { ActivationLayerInfo() }), + make("RunTimes", { 0 }), + make("Settings", { CpuMatMulSettings().fast_math(true) }), + make("LhsQInfo", { QuantizationInfo() }), + make("RhsQInfo", { QuantizationInfo() }), + make("OutQInfo", { QuantizationInfo() })) ) { // Validate output @@ -186,30 +201,36 @@ TEST_SUITE_END() // BF16 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC TEST_SUITE(FP16) -FIXTURE_DATA_TEST_CASE(RunSmall, NEMatMulFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallMatMulDataset(), - framework::dataset::make("TransposeA", { false, true })), - framework::dataset::make("TransposeB", { false, true })), - framework::dataset::make("DataType", DataType::F16)), - framework::dataset::make("ActivationInfo", { ActivationLayerInfo(), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) }))) +FIXTURE_DATA_TEST_CASE(RunSmall, NEMatMulFixture, framework::DatasetMode::PRECOMMIT, + combine( + datasets::SmallMatMulDataset(), + make("TransposeA", { false, true }), + make("TransposeB", { false, true }), + make("DataType", DataType::F16), + make("ActivationInfo", { ActivationLayerInfo(), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) }))) { // Validate output validate(Accessor(_target), _reference, tolerance_fp16); } -FIXTURE_DATA_TEST_CASE(RunLarge, NEMatMulFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeMatMulDataset(), - framework::dataset::make("TransposeA", { false, true })), - framework::dataset::make("TransposeB", { false, true })), - framework::dataset::make("DataType", DataType::F16)), - framework::dataset::make("ActivationInfo", { ActivationLayerInfo(), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) }))) +FIXTURE_DATA_TEST_CASE(RunLarge, NEMatMulFixture, framework::DatasetMode::NIGHTLY, + combine( + datasets::LargeMatMulDataset(), + make("TransposeA", { false, true }), + make("TransposeB", { false, true }), + make("DataType", DataType::F16), + make("ActivationInfo", { ActivationLayerInfo(), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) }))) { // Validate output validate(Accessor(_target), _reference, tolerance_fp16); } -FIXTURE_DATA_TEST_CASE(RunStressDynamicTensors, NEMatMulDynamicTensorsFixture, 
framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallMatMulDataset(), - framework::dataset::make("TransposeA", { false, true })), - framework::dataset::make("TransposeB", { false, true })), - framework::dataset::make("DataType", DataType::F16)), - framework::dataset::make("ActivationInfo", { ActivationLayerInfo(), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) })), - framework::dataset::make("NumberOfRuns", 5))) +FIXTURE_DATA_TEST_CASE(RunStressDynamicTensors, NEMatMulDynamicTensorsFixture, framework::DatasetMode::PRECOMMIT, + combine( + datasets::SmallMatMulDataset(), + make("TransposeA", { false, true }), + make("TransposeB", { false, true }), + make("DataType", DataType::F16), + make("ActivationInfo", { ActivationLayerInfo(), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) }), + make("NumberOfRuns", 5))) { // Validate output validate(Accessor(_target), _reference, tolerance_fp16); @@ -224,48 +245,51 @@ TEST_SUITE(Quantized) TEST_SUITE(QASYMM8) -FIXTURE_DATA_TEST_CASE(RunSmall, NEQuantizedMatMulFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(combine( - datasets::SmallMatMulDataset(), - framework::dataset::make("TransposeA", { false, true })), - framework::dataset::make("TransposeB", { false, true })), - framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("ActivationInfo", { ActivationLayerInfo(), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) })), - framework::dataset::make("NumberOfExtraRuns", { 0, 1 })), - framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 50, 1) })), - framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 30, -1) })), - framework::dataset::make("OutQInfo", { QuantizationInfo(1.f, 2) })) +FIXTURE_DATA_TEST_CASE(RunSmall, NEQuantizedMatMulFixture, framework::DatasetMode::PRECOMMIT, + combine( + datasets::SmallMatMulDataset(), + make("TransposeA", { false, true }), + make("TransposeB", { false, true }), + make("DataType", DataType::QASYMM8), + make("ActivationInfo", { ActivationLayerInfo(), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) }), + make("NumberOfExtraRuns", { 0, 1 }), + make("LhsQInfo", { QuantizationInfo(1.f / 50, 1) }), + make("RhsQInfo", { QuantizationInfo(1.f / 30, -1) }), + make("OutQInfo", { QuantizationInfo(1.f, 2) })) ) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunSmallExtraActivation, NEQuantizedMatMulFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine( - datasets::SmallerMatMulDataset(), - framework::dataset::make("TransposeA", { false, true })), - framework::dataset::make("TransposeB", { false, true })), - framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("ActivationInfo", { ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) })), - framework::dataset::make("NumberOfExtraRuns", { 0, 1 })), - framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 50, 1) })), - framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 30, -1) })), - framework::dataset::make("OutQInfo", { QuantizationInfo(1.f, 2) })) +FIXTURE_DATA_TEST_CASE(RunSmallExtraActivation, NEQuantizedMatMulFixture, framework::DatasetMode::NIGHTLY, + combine( + datasets::SmallerMatMulDataset(), + make("TransposeA", { 
false, true }), + make("TransposeB", { false, true }), + make("DataType", DataType::QASYMM8), + make("ActivationInfo", { ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) }), + make("NumberOfExtraRuns", { 0, 1 }), + make("LhsQInfo", { QuantizationInfo(1.f / 50, 1) }), + make("RhsQInfo", { QuantizationInfo(1.f / 30, -1) }), + make("OutQInfo", { QuantizationInfo(1.f, 2) })) ) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunLarge, NEQuantizedMatMulFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine( - datasets::LargeMatMulDataset(), - framework::dataset::make("TransposeA", { false, true })), - framework::dataset::make("TransposeB", { false, true })), - framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("ActivationInfo", { ActivationLayerInfo(), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) })), - framework::dataset::make("NumberOfExtraRuns", { 0, 1 })), - framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 100, 1) })), - framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 200, -1) })), - framework::dataset::make("OutQInfo", { QuantizationInfo(1.f, 2) })) +FIXTURE_DATA_TEST_CASE(RunLarge, NEQuantizedMatMulFixture, framework::DatasetMode::NIGHTLY, + combine( + datasets::LargeMatMulDataset(), + make("TransposeA", { false, true }), + make("TransposeB", { false, true }), + make("DataType", DataType::QASYMM8), + make("ActivationInfo", { ActivationLayerInfo(), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) }), + make("NumberOfExtraRuns", { 0, 1 }), + make("LhsQInfo", { QuantizationInfo(1.f / 100, 1) }), + make("RhsQInfo", { QuantizationInfo(1.f / 200, -1) }), + make("OutQInfo", { QuantizationInfo(1.f, 2) })) ) { // Validate output @@ -276,48 +300,51 @@ TEST_SUITE_END() // QASYMM8 TEST_SUITE(QASYMM8_SIGNED) -FIXTURE_DATA_TEST_CASE(RunSmall, NEQuantizedMatMulFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(combine( - datasets::SmallMatMulDataset(), - framework::dataset::make("TransposeA", { false, true })), - framework::dataset::make("TransposeB", { false, true })), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), - framework::dataset::make("ActivationInfo", { ActivationLayerInfo(), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) })), - framework::dataset::make("NumberOfExtraRuns", { 0, 1 })), - framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 40, -2) })), - framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 50, 1) })), - framework::dataset::make("OutQInfo", { QuantizationInfo(1.f, 1) })) +FIXTURE_DATA_TEST_CASE(RunSmall, NEQuantizedMatMulFixture, framework::DatasetMode::PRECOMMIT, + combine( + datasets::SmallMatMulDataset(), + make("TransposeA", { false, true }), + make("TransposeB", { false, true }), + make("DataType", DataType::QASYMM8_SIGNED), + make("ActivationInfo", { ActivationLayerInfo(), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) }), + make("NumberOfExtraRuns", { 0, 1 }), + make("LhsQInfo", { QuantizationInfo(1.f / 40, -2) }), + make("RhsQInfo", { QuantizationInfo(1.f / 50, 1) }), + make("OutQInfo", { QuantizationInfo(1.f, 1) })) ) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8_signed); } 
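The hunks above replace the old nested combine(combine(...)) chains with the variadic combine(...) overload and shorten framework::dataset::make to the local alias make. A minimal before/after sketch of that call pattern, using only dataset helpers that already appear in these hunks; the variable names old_style and new_style are illustrative, and the snippet is meant to be read in the context of one of these test translation units rather than as a standalone program:

    // Before: one extra nested combine() per named dataset
    const auto old_style = combine(combine(datasets::SmallMatMulDataset(),
                                           framework::dataset::make("TransposeA", { false, true })),
                                   framework::dataset::make("DataType", DataType::QASYMM8));

    // After: a single variadic combine() plus the shorter make() alias,
    // as used by the rewritten test cases in this file
    using framework::dataset::make;
    const auto new_style = combine(datasets::SmallMatMulDataset(),
                                   make("TransposeA", { false, true }),
                                   make("DataType", DataType::QASYMM8));

Both forms describe the same Cartesian product of parameter values; only the call shape changes.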
-FIXTURE_DATA_TEST_CASE(RunSmallExtraActivation, NEQuantizedMatMulFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine( - datasets::SmallerMatMulDataset(), - framework::dataset::make("TransposeA", { false, true })), - framework::dataset::make("TransposeB", { false, true })), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), - framework::dataset::make("ActivationInfo", { ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) })), - framework::dataset::make("NumberOfExtraRuns", { 0, 1 })), - framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 40, -2) })), - framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 50, 1) })), - framework::dataset::make("OutQInfo", { QuantizationInfo(1.f, 1) })) +FIXTURE_DATA_TEST_CASE(RunSmallExtraActivation, NEQuantizedMatMulFixture, framework::DatasetMode::NIGHTLY, + combine( + datasets::SmallerMatMulDataset(), + make("TransposeA", { false, true }), + make("TransposeB", { false, true }), + make("DataType", DataType::QASYMM8_SIGNED), + make("ActivationInfo", { ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) }), + make("NumberOfExtraRuns", { 0, 1 }), + make("LhsQInfo", { QuantizationInfo(1.f / 40, -2) }), + make("RhsQInfo", { QuantizationInfo(1.f / 50, 1) }), + make("OutQInfo", { QuantizationInfo(1.f, 1) })) ) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8_signed); } -FIXTURE_DATA_TEST_CASE(RunLarge, NEQuantizedMatMulFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine( - datasets::LargeMatMulDataset(), - framework::dataset::make("TransposeA", { false, true })), - framework::dataset::make("TransposeB", { false, true })), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), - framework::dataset::make("ActivationInfo", { ActivationLayerInfo(), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) })), - framework::dataset::make("NumberOfExtraRuns", { 0, 1 })), - framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 150, -2) })), - framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 250, 1) })), - framework::dataset::make("OutQInfo", { QuantizationInfo(1.f, 1) })) +FIXTURE_DATA_TEST_CASE(RunLarge, NEQuantizedMatMulFixture, framework::DatasetMode::NIGHTLY, + combine( + datasets::LargeMatMulDataset(), + make("TransposeA", { false, true }), + make("TransposeB", { false, true }), + make("DataType", DataType::QASYMM8_SIGNED), + make("ActivationInfo", { ActivationLayerInfo(), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) }), + make("NumberOfExtraRuns", { 0, 1 }), + make("LhsQInfo", { QuantizationInfo(1.f / 150, -2) }), + make("RhsQInfo", { QuantizationInfo(1.f / 250, 1) }), + make("OutQInfo", { QuantizationInfo(1.f, 1) })) ) { // Validate output diff --git a/tests/validation/NEON/ReshapeLayer.cpp b/tests/validation/NEON/ReshapeLayer.cpp index bf39c399a5acf6d1a9a4694cc640d34f7410f44b..e9f114d491be161621e86b164957fadc7dd1ea1f 100644 --- a/tests/validation/NEON/ReshapeLayer.cpp +++ b/tests/validation/NEON/ReshapeLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 Arm Limited. + * Copyright (c) 2017-2018, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -77,6 +77,9 @@ input_info, output_info, expected) template using NEReshapeLayerFixture = ReshapeLayerValidationFixture; +template +using NEReshapeLayerPaddedFixture = ReshapeLayerPaddedValidationFixture; + TEST_SUITE(Float) TEST_SUITE(F32) FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerFixture, framework::DatasetMode::ALL, combine(datasets::SmallReshapeLayerDataset(), framework::dataset::make("DataType", DataType::F32))) @@ -84,8 +87,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerFixture, framework::Datase // Validate output validate(Accessor(_target), _reference); } -TEST_SUITE_END() -TEST_SUITE_END() +TEST_SUITE_END() //F32 +TEST_SUITE_END() //Float TEST_SUITE(Integer) TEST_SUITE(S8) @@ -94,7 +97,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerFixture, framework::Datas // Validate output validate(Accessor(_target), _reference); } -TEST_SUITE_END() +TEST_SUITE_END() //S8 TEST_SUITE(S16) FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerFixture, framework::DatasetMode::ALL, combine(datasets::SmallReshapeLayerDataset(), framework::dataset::make("DataType", DataType::S16))) @@ -102,11 +105,41 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerFixture, framework::Data // Validate output validate(Accessor(_target), _reference); } -TEST_SUITE_END() -TEST_SUITE_END() +TEST_SUITE_END() //S16 +TEST_SUITE_END() //Integer + +TEST_SUITE(Padded) +TEST_SUITE(Float) +TEST_SUITE(F32) +FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerPaddedFixture, framework::DatasetMode::ALL, combine(datasets::SmallReshapeLayerDataset(), framework::dataset::make("DataType", DataType::F32))) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() //S32 +TEST_SUITE_END() //Float + +TEST_SUITE(Integer) +TEST_SUITE(S8) +FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerPaddedFixture, framework::DatasetMode::ALL, combine(datasets::SmallReshapeLayerDataset(), framework::dataset::make("DataType", DataType::S8))) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() //S8 + +TEST_SUITE(S16) +FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerPaddedFixture, framework::DatasetMode::ALL, combine(datasets::SmallReshapeLayerDataset(), framework::dataset::make("DataType", DataType::S16))) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() //S16 +TEST_SUITE_END() //Integer +TEST_SUITE_END() //Padded -TEST_SUITE_END() -TEST_SUITE_END() +TEST_SUITE_END() //ReshapeLayer +TEST_SUITE_END() //NEON } // namespace validation } // namespace test } // namespace arm_compute diff --git a/tests/validation/NEON/Reverse.cpp b/tests/validation/NEON/Reverse.cpp index 3dc3eeee805a0b2f9b4f8f294d09afe581871fa9..7b5337f14b42bd5dcc26f6db21e37168dfffd400 100644 --- a/tests/validation/NEON/Reverse.cpp +++ b/tests/validation/NEON/Reverse.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -43,7 +43,8 @@ namespace validation { namespace { -auto run_small_dataset = combine(datasets::SmallShapes(), datasets::Tiny1DShapes()); +using framework::dataset::make; +auto run_small_dataset = combine(datasets::Small3DShapes(), datasets::Tiny1DShapes()); auto run_large_dataset = combine(datasets::LargeShapes(), datasets::Tiny1DShapes()); } // namespace @@ -53,28 +54,31 @@ TEST_SUITE(Reverse) // *INDENT-OFF* // clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( - framework::dataset::make("InputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S8), // Invalid axis datatype + make("InputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S8), // Invalid axis datatype TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Invalid axis shape TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Invalid axis length (> 4) TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Mismatching shapes + TensorInfo(TensorShape(32U, 13U, 17U, 3U, 2U), 1, DataType::U8), // Unsupported source dimensions (>4) TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), TensorInfo(TensorShape(2U), 1, DataType::U8), }), - framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S8), + make("OutputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S8), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), TensorInfo(TensorShape(2U, 13U, 2U), 1, DataType::U8), + TensorInfo(TensorShape(32U, 13U, 17U, 3U, 2U), 1, DataType::U8), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), TensorInfo(TensorShape(2U), 1, DataType::U8), })), - framework::dataset::make("AxisInfo", { TensorInfo(TensorShape(3U), 1, DataType::U8), + make("AxisInfo", { TensorInfo(TensorShape(3U), 1, DataType::U8), TensorInfo(TensorShape(2U, 10U), 1, DataType::U32), TensorInfo(TensorShape(8U), 1, DataType::U32), TensorInfo(TensorShape(2U), 1, DataType::U32), TensorInfo(TensorShape(2U), 1, DataType::U32), TensorInfo(TensorShape(2U), 1, DataType::U32), + TensorInfo(TensorShape(2U), 1, DataType::U32), })), - framework::dataset::make("Expected", { false, false, false, false, true, true})), + make("Expected", { false, false, false, false, false, true, true})), src_info, dst_info, axis_info, expected) { Status s = NEReverse::validate(&src_info.clone()->set_is_resizable(false), @@ -95,7 +99,11 @@ TEST_SUITE(F16) FIXTURE_DATA_TEST_CASE(RunSmall, NEReverseFixture, framework::DatasetMode::PRECOMMIT, - combine(run_small_dataset, framework::dataset::make("DataType", DataType::F16))) + combine( + run_small_dataset, + make("DataType", DataType::F16), + make("use_negative_axis", { true, false }), + make("use_inverted_axis", { true, false }))) { // Validate output validate(Accessor(_target), _reference); @@ -104,7 +112,11 @@ FIXTURE_DATA_TEST_CASE(RunSmall, FIXTURE_DATA_TEST_CASE(RunLarge, NEReverseFixture, framework::DatasetMode::NIGHTLY, - combine(run_large_dataset, framework::dataset::make("DataType", DataType::F16))) + combine( + run_large_dataset, + make("DataType", DataType::F16), + make("use_negative_axis", { true, false }), + make("use_inverted_axis", { true, false }))) { // Validate output validate(Accessor(_target), _reference); @@ -116,7 +128,11 @@ TEST_SUITE(FP32) FIXTURE_DATA_TEST_CASE(RunSmall, NEReverseFixture, framework::DatasetMode::PRECOMMIT, - combine(run_small_dataset, framework::dataset::make("DataType", DataType::F32))) + combine( + 
run_small_dataset, + make("DataType", DataType::F32), + make("use_negative_axis", { true, false }), + make("use_inverted_axis", { true, false }))) { // Validate output validate(Accessor(_target), _reference); @@ -125,7 +141,11 @@ FIXTURE_DATA_TEST_CASE(RunSmall, FIXTURE_DATA_TEST_CASE(RunLarge, NEReverseFixture, framework::DatasetMode::NIGHTLY, - combine(run_large_dataset, framework::dataset::make("DataType", DataType::F32))) + combine( + run_large_dataset, + make("DataType", DataType::F32), + make("use_negative_axis", { true, false }), + make("use_inverted_axis", { true, false }))) { // Validate output validate(Accessor(_target), _reference); @@ -138,7 +158,11 @@ TEST_SUITE(QASYMM8) FIXTURE_DATA_TEST_CASE(RunSmall, NEReverseFixture, framework::DatasetMode::PRECOMMIT, - combine(run_small_dataset, framework::dataset::make("DataType", DataType::QASYMM8))) + combine( + run_small_dataset, + make("DataType", DataType::QASYMM8), + make("use_negative_axis", { true, false }), + make("use_inverted_axis", { true, false }))) { // Validate output validate(Accessor(_target), _reference); @@ -147,7 +171,11 @@ FIXTURE_DATA_TEST_CASE(RunSmall, FIXTURE_DATA_TEST_CASE(RunLarge, NEReverseFixture, framework::DatasetMode::NIGHTLY, - combine(run_large_dataset, framework::dataset::make("DataType", DataType::QASYMM8))) + combine( + run_large_dataset, + make("DataType", DataType::QASYMM8), + make("use_negative_axis", { true, false }), + make("use_inverted_axis", { true, false }))) { // Validate output validate(Accessor(_target), _reference); diff --git a/tests/validation/NEON/StackLayer.cpp b/tests/validation/NEON/StackLayer.cpp index d88f713ccd5006240f42f1d42a5a5d4ff01378de..3828010c7b302e615d845f47bf89c2a996aa080f 100644 --- a/tests/validation/NEON/StackLayer.cpp +++ b/tests/validation/NEON/StackLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -44,69 +44,74 @@ namespace test { namespace validation { + +using framework::dataset::make; namespace { // *INDENT-OFF* // clang-format off /** Data types */ -const auto data_types = framework::dataset::make("DataType", { DataType::QASYMM8, DataType::F16, DataType::F32 }); +const auto data_types = make("DataType", { DataType::QASYMM8, DataType::F16, DataType::F32 }); /** Num tensors values to test */ -const auto n_values = framework::dataset::make("NumTensors", { 3, 4 }); +const auto n_values = make("NumTensors", { 3, 4 }); /** Shapes 1D to test */ -const auto shapes_1d_small = combine(datasets::Small1DShapes(), framework::dataset::make("Axis", -1, 2)); +const auto shapes_1d_small = combine(datasets::Small1DShapes(), make("Axis", -1, 2)); /** Shapes 2D to test */ -const auto shapes_2d_small = combine(datasets::Small2DShapes(), framework::dataset::make("Axis", -2, 3)); +const auto shapes_2d_small = combine(datasets::Small2DShapes(), make("Axis", -2, 3)); /** Shapes 3D to test */ -const auto shapes_3d_small = combine(datasets::Small3DShapes(), framework::dataset::make("Axis", -3, 4)); +const auto shapes_3d_small = combine(datasets::Small3DShapes(), make("Axis", -3, 4)); /** Shapes 4D to test */ -const auto shapes_4d_small = combine(datasets::Small4DShapes(), framework::dataset::make("Axis", -4, 5)); +const auto shapes_4d_small = combine(datasets::Small4DShapes(), make("Axis", -4, 5)); /** Shapes 1D to test */ -const auto shapes_1d_large = combine(datasets::Large1DShapes(), framework::dataset::make("Axis", -1, 2)); +const auto shapes_1d_large = combine(datasets::Large1DShapes(), make("Axis", -1, 2)); /** Shapes 2D to test */ -const auto shapes_2d_large = combine(datasets::Medium2DShapes(), framework::dataset::make("Axis", -2, 3)); +const auto shapes_2d_large = combine(datasets::Medium2DShapes(), make("Axis", -2, 3)); /** Shapes 3D to test */ -const auto shapes_3d_large = combine(datasets::Medium3DShapes(), framework::dataset::make("Axis", -3, 4)); +const auto shapes_3d_large = combine(datasets::Medium3DShapes(), make("Axis", -3, 4)); /** Shapes 4D to test */ -const auto shapes_4d_large = combine(datasets::Medium4DShapes(), framework::dataset::make("Axis", -4, 5)); +const auto shapes_4d_large = combine(datasets::Medium4DShapes(), make("Axis", -4, 5)); } // namespace /** Fixture to use */ template using NEStackLayerFixture = StackLayerValidationFixture; +template +using NEStackLayerWithPaddingFixture = StackLayerWithPaddingValidationFixture; + using namespace arm_compute::misc::shape_calculator; TEST_SUITE(NEON) TEST_SUITE(StackLayer) -DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( - framework::dataset::make("InputInfo", +DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip( +make("InputInfo", { std::vector{ TensorInfo(TensorShape(9U, 8U), 1, DataType::U8) }, - std::vector{ TensorInfo(TensorShape(1U, 2U), 1, DataType::U8) , TensorInfo(TensorShape(1U, 2U), 1, DataType::U8), TensorInfo(TensorShape(1U, 2U), 1, DataType::U8)}, + std::vector{ TensorInfo(TensorShape(1U, 2U), 1, DataType::U8) , TensorInfo(TensorShape(1U, 2U), 1, DataType::U8), TensorInfo(TensorShape(1U, 2U), 1, DataType::U8)}, std::vector{ TensorInfo(TensorShape(2U, 3U), 1, DataType::S32) }, - std::vector{ TensorInfo(TensorShape(7U, 5U, 3U, 8U, 2U), 1, DataType::S32), TensorInfo(TensorShape(7U, 5U, 3U, 8U, 2U), 1, DataType::S32)}, + std::vector{ TensorInfo(TensorShape(7U, 5U, 3U, 8U, 2U), 1, DataType::S32), TensorInfo(TensorShape(7U, 5U, 3U, 8U, 2U), 1, DataType::S32)}, 
std::vector{ TensorInfo(TensorShape(9U, 8U), 1, DataType::S32) }, }), -framework::dataset::make("OutputInfo", +make("OutputInfo", { TensorInfo(TensorShape(1U, 9U, 8U), 1, DataType::U8), // Passes, stack 1 tensor on x axis TensorInfo(TensorShape(1U, 3U, 2U), 1, DataType::U8), // Passes, stack 3 tensors on y axis TensorInfo(TensorShape(1U, 2U, 3U), 1, DataType::S32), // fails axis < (- input's rank) TensorInfo(TensorShape(3U, 7U, 5U), 1, DataType::S32), // fails, input dimensions > 4 TensorInfo(TensorShape(1U, 2U, 3U), 1, DataType::U8), // fails mismatching data types -})), -framework::dataset::make("Axis", { -3, 1, -4, -3, 1 })), -framework::dataset::make("Expected", { true, true, false, false, false })), +}), +make("Axis", { -3, 1, -4, -3, 1 }), +make("Expected", { true, true, false, false, false })), input_info, output_info, axis, expected) { std::vector ti(input_info); @@ -121,18 +126,18 @@ input_info, output_info, axis, expected) TEST_SUITE(Shapes1D) TEST_SUITE(S32) FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture, framework::DatasetMode::ALL, - combine(combine(shapes_1d_small, - framework::dataset::make("DataType", { DataType::S32 })), - n_values)) + combine(shapes_1d_small, + make("DataType", { DataType::S32 }), + n_values)) { // Validate output validate(Accessor(_target), _reference); } FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture, framework::DatasetMode::NIGHTLY, - combine(combine(shapes_1d_large, - framework::dataset::make("DataType", { DataType::S32 })), - n_values)) + combine(shapes_1d_large, + make("DataType", { DataType::S32 }), + n_values)) { // Validate output validate(Accessor(_target), _reference); @@ -141,18 +146,18 @@ TEST_SUITE_END() // S32 TEST_SUITE(S16) FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture, framework::DatasetMode::ALL, - combine(combine(shapes_1d_small, - framework::dataset::make("DataType", { DataType::S16 })), - n_values)) + combine(shapes_1d_small, + make("DataType", { DataType::S16 }), + n_values)) { // Validate output validate(Accessor(_target), _reference); } FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture, framework::DatasetMode::NIGHTLY, - combine(combine(shapes_1d_large, - framework::dataset::make("DataType", { DataType::S16 })), - n_values)) + combine(shapes_1d_large, + make("DataType", { DataType::S16 }), + n_values)) { // Validate output validate(Accessor(_target), _reference); @@ -161,18 +166,18 @@ TEST_SUITE_END() // S16 TEST_SUITE(S8) FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture, framework::DatasetMode::ALL, - combine(combine(shapes_1d_small, - framework::dataset::make("DataType", { DataType::S8 })), - n_values)) + combine(shapes_1d_small, + make("DataType", { DataType::S8 }), + n_values)) { // Validate output validate(Accessor(_target), _reference); } FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture, framework::DatasetMode::NIGHTLY, - combine(combine(shapes_1d_large, - framework::dataset::make("DataType", { DataType::S8 })), - n_values)) + combine(shapes_1d_large, + make("DataType", { DataType::S8 }), + n_values)) { // Validate output validate(Accessor(_target), _reference); @@ -183,18 +188,18 @@ TEST_SUITE_END() // Shapes1D TEST_SUITE(Shapes2D) TEST_SUITE(S32) FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture, framework::DatasetMode::ALL, - combine(combine(shapes_2d_small, - framework::dataset::make("DataType", { DataType::S32 })), - n_values)) + combine(shapes_2d_small, + make("DataType", { DataType::S32 }), + n_values)) { // Validate output validate(Accessor(_target), _reference); } 
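The shapes_*_small and shapes_*_large datasets defined earlier in this file's hunks pair each shape rank R with make("Axis", -R, R + 1). Assuming the two-integer form of make(name, start, end) enumerates the half-open integer range [start, end), as the framework's range datasets conventionally do, each rank therefore exercises the axis values -R through R. A small self-contained sketch of the values this would produce for the 2D case; the rank variable and the loop are illustrative only and not part of the test framework:

    #include <cstdio>

    int main()
    {
        // Stand-in for make("Axis", -R, R + 1) with R = 2: assuming a half-open
        // integer range, this enumerates the axis values -2, -1, 0, 1, 2.
        const int rank = 2;
        for(int axis = -rank; axis < rank + 1; ++axis)
        {
            std::printf("axis %d\n", axis);
        }
        return 0;
    }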
FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture, framework::DatasetMode::NIGHTLY, - combine(combine(shapes_2d_large, - framework::dataset::make("DataType", { DataType::S32 })), - n_values)) + combine(shapes_2d_large, + make("DataType", { DataType::S32 }), + n_values)) { // Validate output validate(Accessor(_target), _reference); @@ -203,18 +208,18 @@ TEST_SUITE_END() // S32 TEST_SUITE(S16) FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture, framework::DatasetMode::ALL, - combine(combine(shapes_2d_small, - framework::dataset::make("DataType", { DataType::S16 })), - n_values)) + combine(shapes_2d_small, + make("DataType", { DataType::S16 }), + n_values)) { // Validate output validate(Accessor(_target), _reference); } FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture, framework::DatasetMode::NIGHTLY, - combine(combine(shapes_2d_large, - framework::dataset::make("DataType", { DataType::S16 })), - n_values)) + combine(shapes_2d_large, + make("DataType", { DataType::S16 }), + n_values)) { // Validate output validate(Accessor(_target), _reference); @@ -223,18 +228,18 @@ TEST_SUITE_END() // S16 TEST_SUITE(S8) FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture, framework::DatasetMode::ALL, - combine(combine(shapes_2d_small, - framework::dataset::make("DataType", { DataType::S8 })), - n_values)) + combine(shapes_2d_small, + make("DataType", { DataType::S8 }), + n_values)) { // Validate output validate(Accessor(_target), _reference); } FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture, framework::DatasetMode::NIGHTLY, - combine(combine(shapes_2d_large, - framework::dataset::make("DataType", { DataType::S8 })), - n_values)) + combine(shapes_2d_large, + make("DataType", { DataType::S8 }), + n_values)) { // Validate output validate(Accessor(_target), _reference); @@ -245,18 +250,18 @@ TEST_SUITE_END() // Shapes2D TEST_SUITE(Shapes3D) TEST_SUITE(S32) FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture, framework::DatasetMode::ALL, - combine(combine(shapes_3d_small, - framework::dataset::make("DataType", { DataType::S32 })), - n_values)) + combine(shapes_3d_small, + make("DataType", { DataType::S32 }), + n_values)) { // Validate output validate(Accessor(_target), _reference); } FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture, framework::DatasetMode::NIGHTLY, - combine(combine(shapes_3d_large, - framework::dataset::make("DataType", { DataType::S32 })), - n_values)) + combine(shapes_3d_large, + make("DataType", { DataType::S32 }), + n_values)) { // Validate output validate(Accessor(_target), _reference); @@ -265,18 +270,18 @@ TEST_SUITE_END() // S32 TEST_SUITE(S16) FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture, framework::DatasetMode::ALL, - combine(combine(shapes_3d_small, - framework::dataset::make("DataType", { DataType::S16 })), - n_values)) + combine(shapes_3d_small, + make("DataType", { DataType::S16 }), + n_values)) { // Validate output validate(Accessor(_target), _reference); } FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture, framework::DatasetMode::NIGHTLY, - combine(combine(shapes_3d_large, - framework::dataset::make("DataType", { DataType::S16 })), - n_values)) + combine(shapes_3d_large, + make("DataType", { DataType::S16 }), + n_values)) { // Validate output validate(Accessor(_target), _reference); @@ -285,18 +290,18 @@ TEST_SUITE_END() // S16 TEST_SUITE(S8) FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture, framework::DatasetMode::ALL, - combine(combine(shapes_3d_small, - framework::dataset::make("DataType", { DataType::S8 })), - n_values)) + 
combine(shapes_3d_small, + make("DataType", { DataType::S8 }), + n_values)) { // Validate output validate(Accessor(_target), _reference); } FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture, framework::DatasetMode::NIGHTLY, - combine(combine(shapes_3d_large, - framework::dataset::make("DataType", { DataType::S8 })), - n_values)) + combine(shapes_3d_large, + make("DataType", { DataType::S8 }), + n_values)) { // Validate output validate(Accessor(_target), _reference); @@ -307,18 +312,29 @@ TEST_SUITE_END() // Shapes3D TEST_SUITE(Shapes4D) TEST_SUITE(S32) FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture, framework::DatasetMode::ALL, - combine(combine(shapes_4d_small, - framework::dataset::make("DataType", { DataType::S32 })), - n_values)) + combine(shapes_4d_small, + make("DataType", { DataType::S32 }), + n_values)) +{ + // Validate output + validate(Accessor(_target), _reference); +} + +// Testing the case with padding for only 4d shapes and for one data type. This is because the underlying code +// path depends only on the padding, which isn't affected by the shapes or data types. +FIXTURE_DATA_TEST_CASE(RunSmallWithPadding, NEStackLayerWithPaddingFixture, framework::DatasetMode::ALL, + combine(shapes_4d_small, + make("DataType", { DataType::S32 }), + n_values)) { // Validate output validate(Accessor(_target), _reference); } FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture, framework::DatasetMode::NIGHTLY, - combine(combine(shapes_4d_large, - framework::dataset::make("DataType", { DataType::S32 })), - n_values)) + combine(shapes_4d_large, + make("DataType", { DataType::S32 }), + n_values)) { // Validate output validate(Accessor(_target), _reference); @@ -327,18 +343,18 @@ TEST_SUITE_END() // S32 TEST_SUITE(S16) FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture, framework::DatasetMode::ALL, - combine(combine(shapes_4d_small, - framework::dataset::make("DataType", { DataType::S16 })), - n_values)) + combine(shapes_4d_small, + make("DataType", { DataType::S16 }), + n_values)) { // Validate output validate(Accessor(_target), _reference); } FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture, framework::DatasetMode::NIGHTLY, - combine(combine(shapes_4d_large, - framework::dataset::make("DataType", { DataType::S16 })), - n_values)) + combine(shapes_4d_large, + make("DataType", { DataType::S16 }), + n_values)) { // Validate output validate(Accessor(_target), _reference); @@ -347,24 +363,37 @@ TEST_SUITE_END() // S16 TEST_SUITE(S8) FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture, framework::DatasetMode::ALL, - combine(combine(shapes_4d_small, - framework::dataset::make("DataType", { DataType::S8 })), - n_values)) + combine(shapes_4d_small, + make("DataType", { DataType::S8 }), + n_values)) { // Validate output validate(Accessor(_target), _reference); } FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture, framework::DatasetMode::NIGHTLY, - combine(combine(shapes_4d_large, - framework::dataset::make("DataType", { DataType::S8 })), - n_values)) + combine(shapes_4d_large, + make("DataType", { DataType::S8 }), + n_values)) { // Validate output validate(Accessor(_target), _reference); } TEST_SUITE_END() // S8 TEST_SUITE_END() // Shapes4D + +TEST_SUITE(HighDimensional) +// The Cpu implementation supports tensors of size 4D+, but reference implementation does not. 
+FIXTURE_DATA_TEST_CASE(RunHighDimensional, NEStackLayerFixture, framework::DatasetMode::DISABLED, + combine(make("Shape", { TensorShape{2U, 3U, 4U, 5U, 3U} }), + make("Axis", { 5, 0, -3, 2 }), + make("DataType", { DataType::S8 }), + make("NumTensors", { 3 }))) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // HighDimensional TEST_SUITE_END() // StackLayer TEST_SUITE_END() // Neon } // namespace validation diff --git a/tests/validation/dynamic_fusion/gpu/cl/Add.cpp b/tests/validation/dynamic_fusion/gpu/cl/Add.cpp index d9a3d9533ce286da2434c594f01bf7a899141891..09a8f3fe399ea738304f028b0ec16500232d2fc9 100644 --- a/tests/validation/dynamic_fusion/gpu/cl/Add.cpp +++ b/tests/validation/dynamic_fusion/gpu/cl/Add.cpp @@ -22,6 +22,9 @@ * SOFTWARE. */ +// TODO: Fix testing of CKW Elementwise Binary (COMPMID-6530) +#ifndef ACL_INTERNAL_TEST_CKW_IN_DF + #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h" @@ -258,3 +261,4 @@ TEST_SUITE_END() // CL } // namespace validation } // namespace test } // namespace arm_compute +#endif // ACL_INTERNAL_TEST_CKW_IN_DF diff --git a/tests/validation/dynamic_fusion/gpu/cl/DepthwiseConv2d.cpp b/tests/validation/dynamic_fusion/gpu/cl/DepthwiseConv2d.cpp index 71b0114225197873233734d656d0f1314a2f46ae..aec1306a31cd376151b352dca5387c940b52af69 100644 --- a/tests/validation/dynamic_fusion/gpu/cl/DepthwiseConv2d.cpp +++ b/tests/validation/dynamic_fusion/gpu/cl/DepthwiseConv2d.cpp @@ -22,7 +22,6 @@ * SOFTWARE. */ -#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Do not include this test if ACL_INTERNAL_TEST_CKW_IN_DF and the op has not been ported to ckw #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h" #include "tests/CL/CLAccessor.h" @@ -284,6 +283,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionGpuDepthwiseConv2dFixture, f { validate(CLAccessor(_target), _reference, tolerance_f16); } +#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Do not include this test as dilation not supported yet in DepthwiseConv2d CKW kernel TEST_SUITE(Dilation) FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuDepthwiseConv2dFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(), depth_multipliers), @@ -301,6 +301,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionGpuDepthwiseConv2dFixture, f validate(CLAccessor(_target), _reference, tolerance_f16); } TEST_SUITE_END() // Dilation +#endif // ACL_INTERNAL_TEST_CKW_IN_DF TEST_SUITE_END() // W3x3 TEST_SUITE(Generic) @@ -318,7 +319,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionGpuDepthwiseConv2dFixture, f { validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num); } - +#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Do not include this test as dilation not supported yet in DepthwiseConv2d CKW kernel TEST_SUITE(Dilation) FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuDepthwiseConv2dFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(), depth_multipliers), @@ -336,6 +337,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionGpuDepthwiseConv2dFixture, f validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num); } TEST_SUITE_END() // Dilation +#endif // ACL_INTERNAL_TEST_CKW_IN_DF TEST_SUITE_END() // Generic TEST_SUITE_END() // FP16 @@ -358,6 +360,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionGpuDepthwiseConv2dFixture, validate(CLAccessor(_target), 
_reference, tolerance_f32); } +#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Do not include this test as dilation not supported yet in DepthwiseConv2d CKW kernel TEST_SUITE(Dilation) FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuDepthwiseConv2dFixture, framework::DatasetMode::ALL, @@ -377,6 +380,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionGpuDepthwiseConv2dFixture, validate(CLAccessor(_target), _reference, tolerance_f32); } TEST_SUITE_END() // Dilation +#endif // ACL_INTERNAL_TEST_CKW_IN_DF TEST_SUITE_END() // W3x3 TEST_SUITE(Generic) @@ -407,6 +411,7 @@ FIXTURE_DATA_TEST_CASE(RunLargeKernelSize, DynamicFusionGpuDepthwiseConv2dFixtur validate(CLAccessor(_target), _reference, tolerance_f32); } +#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Do not include this test as dilation not supported yet in DepthwiseConv2d CKW kernel TEST_SUITE(Dilation) FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuDepthwiseConv2dFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(), depth_multipliers), @@ -424,6 +429,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionGpuDepthwiseConv2dFixture, validate(CLAccessor(_target), _reference, tolerance_f32); } TEST_SUITE_END() // Dilation +#endif // ACL_INTERNAL_TEST_CKW_IN_DF TEST_SUITE_END() // Generic TEST_SUITE_END() // FP32 TEST_SUITE_END() // Float @@ -433,5 +439,3 @@ TEST_SUITE_END() // CL } // namespace validation } // namespace test } // namespace arm_compute - -#endif // ACL_INTERNAL_TEST_CKW_IN_DF diff --git a/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp b/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp index f27a1796c98ecf26931ecbe592494bb7bc913680..bae8cbf868d8ff756eacd32dffbbfc3d23952910 100644 --- a/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp +++ b/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp @@ -58,7 +58,7 @@ TEST_SUITE(DYNAMIC_FUSION) * No quantized tests | Not supported yet * No grouped CNN tests | Not supported yet * No mixed layout tests | Not needed; only NHWC is supported - * No activation/post op tests | Not needed in fusion + * No activation | Not needed in fusion * No ValidateConvolutionMethod | Only a single method (direct conv2d) is supported * No ReshapeWeights = true tests | Not applicable yet. 
This parameter only concerns gemm-based conv2d * No RunSmallWithPadding tests | Padding is removed @@ -70,9 +70,7 @@ template using DynamicFusionGpuConv2dFixture = DynamicFusionGpuConv2dValidationFixture; TEST_SUITE(FP32) FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuConv2dFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallConvolutionLayerDataset(), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("DataLayout", { DataLayout::NHWC })), - framework::dataset::make("QuantizationInfo", QuantizationInfo()))) + framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("DataLayout", { DataLayout::NHWC })), framework::dataset::make("QuantizationInfo", QuantizationInfo()))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f32); @@ -81,9 +79,7 @@ TEST_SUITE_END() // FP32 TEST_SUITE(FP16) FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuConv2dFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallConvolutionLayerDataset(), - framework::dataset::make("DataType", DataType::F16)), - framework::dataset::make("DataLayout", { DataLayout::NHWC })), - framework::dataset::make("QuantizationInfo", QuantizationInfo()))) + framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("DataLayout", { DataLayout::NHWC })), framework::dataset::make("QuantizationInfo", QuantizationInfo()))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num); diff --git a/tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp b/tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp new file mode 100644 index 0000000000000000000000000000000000000000..38c3a0ca0ea30fbdf3529ad02d04d75af3187237 --- /dev/null +++ b/tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp @@ -0,0 +1,350 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef ACL_INTERNAL_TEST_CKW_IN_DF +#include "tests/AssetsLibrary.h" +#include "tests/CL/CLAccessor.h" +#include "tests/framework/Fixture.h" +#include "tests/framework/Macros.h" +#include "tests/framework/datasets/Datasets.h" +#include "tests/datasets/LargeMatMulDataset.h" +#include "tests/datasets/SmallMatMulDataset.h" +#include "tests/validation/Validation.h" +#include "tests/validation/reference/Permute.h" +#include "tests/validation/reference/GEMM.h" + +#include "tests/validation/fixtures/dynamic_fusion/gpu/cl/MatMulKernelFixture.h" + +#include + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +namespace +{ + RelativeTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */ +constexpr float abs_tolerance_f32( + 0.0001f); /**< Absolute tolerance value for comparing reference's output against implementation's output for floating point data types in case using relative tolerance fails because of small values */ +constexpr float abs_tolerance_f16( + 0.001f); /**< Absolute tolerance value for comparing reference's output against implementation's output for fp16 data types in case using relative tolerance fails because of small values */ + RelativeTolerance tolerance_f16(half(0.02)); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */ +} + +/** M0 values to test --precommit*/ +const auto m0_values_precommit = framework::dataset::make("M0", { 1, 3 }); + +/** N0 values to test --precommit*/ +const auto n0_values_precommit = framework::dataset::make("N0", { 1, 2, 4 }); + +/** K0 values to test --precommit*/ +const auto k0_values_precommit = framework::dataset::make("K0", { 1, 2, 3 }); + +/** M0 values to test --nightly*/ +const auto m0_values_nightly_lhs_nt = framework::dataset::make("M0", { 1, 2, 3, 4, 5, 6, 7, 8 }); +const auto m0_values_nightly_lhs_t = framework::dataset::make("M0", { 1, 2, 3, 4, 8 }); + +/** N0 values to test --nightly*/ +const auto n0_values_nightly_rhs_nt = framework::dataset::make("N0", { 1, 2, 3, 4, 8, 16 }); +const auto n0_values_nightly_rhs_t = framework::dataset::make("N0", { 1, 2, 3, 4, 8 }); + +/** K0 values to test --nightly*/ +const auto k0_values_nightly_lhs_nt_rhs_nt = framework::dataset::make("K0", { 1, 2, 3, 4, 8, 16 }); +const auto k0_values_nightly_rhs_t = framework::dataset::make("K0", { 1, 2, 3, 4, 8 }); +const auto k0_values_nightly_lhs_t_rhs_nt = framework::dataset::make("K0", { 1, 2, 3, 4, 5, 6, 7, 8 }); + +TEST_SUITE(CL) +TEST_SUITE(DYNAMIC_FUSION) + +TEST_SUITE(MatMul) + +TEST_SUITE(Validate) +TEST_CASE(SupportedBlockSizes, framework::DatasetMode::ALL) +{ + using MatMulConfigurationPair = std::pair; + + const std::vector supported_block_sizes = + { + // MatMulKernelInfo(adj_lhs, adj_rhs, M0, N0, K0, export_rhs_to_cl_image = false) + + // Lhs not-transposed, Rhs transposed + { MatMulKernelInfo(false, true, 0, 1, 1), false }, // M0 should be > 0 + { MatMulKernelInfo(false, true, 3, 11, 1), false }, // N0 not in {1, 2, 3, 4, 8, 16} + { MatMulKernelInfo(false, true, 3, 7, 1), false }, // N0 not in {1, 2, 3, 4, 8, 16} + { MatMulKernelInfo(false, true, 3, 3, 12), false }, // K0 not in {1, 2, 3, 4, 8, 16} + { MatMulKernelInfo(false, true, 3, 3, 6), false }, // K0 not in {1, 2, 3, 4, 8, 16} + { MatMulKernelInfo(false, true, 5, 1, 2), true }, + { MatMulKernelInfo(false, true, 3, 3, 3), true }, + { MatMulKernelInfo(false, true, 2, 4, 8), true }, + + }; + + // Create a new 
workload sketch + auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context(); + auto context = GpuWorkloadContext{ &cl_compile_ctx }; + GpuWorkloadSketch sketch{ &context }; + + // Set big enough shapes so that block sizes are not truncated. Also, set all dimensions equal + // so that it doesn't fail for different NT/T configurations. We aim to test the block sizes here, + // not the shapes themselves. + const TensorInfo lhs_info = context.create_tensor_info(TensorInfo(TensorShape(100U, 100U), 1, DataType::F32)); + const TensorInfo rhs_info = context.create_tensor_info(TensorInfo(TensorShape(100U, 100U), 1, DataType::F32)); + + for(auto &pair : supported_block_sizes) + { + MatMulAttributes matmul_attr {}; + matmul_attr.adj_lhs(pair.first.adj_lhs); + matmul_attr.adj_rhs(pair.first.adj_rhs); + + GpuMatMulSettings matmul_settings {}; + matmul_settings.m0(pair.first.m0); + matmul_settings.n0(pair.first.n0); + matmul_settings.k0(pair.first.k0); + + Status status = GpuMatMul::validate_op(sketch, &lhs_info, &rhs_info, matmul_attr, matmul_settings); + ARM_COMPUTE_EXPECT(bool(status) == pair.second, framework::LogLevel::ERRORS); + } +} + +TEST_CASE(ValidateInputShapes, framework::DatasetMode::ALL) +{ + // Create a sketch + auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context(); + auto context = GpuWorkloadContext{ &cl_compile_ctx }; + GpuWorkloadSketch sketch{ &context }; + + // Configurations are assumed to be Nt/Nt, but will be transposed inside the test to test other configurations + using ShapeConfigurationTuple = std::tuple; + const std::vector shape_configurations = + { + { TensorShape(5U, 1U), TensorShape(3U, 5U), true }, + { TensorShape(10U, 12U), TensorShape(3U, 10U), true }, + { TensorShape(8U, 4U), TensorShape(2U, 8U), true }, + { TensorShape(8U, 4U), TensorShape(2U, 5U), false }, // Mismatch in the K dimension + { TensorShape(5U, 0U), TensorShape(2U, 5U), false }, // Invalid dimension + { TensorShape(5U, 4U, 3U, 4U, 5U, 6U), TensorShape(2U, 5U, 3U, 4U, 5U, 6U), true }, + { TensorShape(5U, 4U, 3U, 4U, 5U, 1U), TensorShape(2U, 5U, 3U, 4U, 5U, 6U), false }, // no batch broadcasting + { TensorShape(5U, 4U, 3U, 4U, 9U, 6U), TensorShape(2U, 5U, 3U, 4U, 5U, 6U), false }, // mismatch in batch dimension + }; + + for(auto &tuple : shape_configurations) + { + const bool expected = std::get<2>(tuple); + + for(bool adj_lhs : + { + false + }) + { + for(bool adj_rhs : + { + true + }) + { + TensorShape lhs_shape = std::get<0>(tuple); + TensorShape rhs_shape = std::get<1>(tuple); + + if(adj_lhs) + { + permute(lhs_shape, PermutationVector(1U, 0U)); + } + + if(adj_rhs) + { + permute(rhs_shape, PermutationVector(1U, 0U)); + } + + const TensorInfo lhs_info = context.create_tensor_info(TensorInfo(lhs_shape, 1, DataType::F32)); + const TensorInfo rhs_info = context.create_tensor_info(TensorInfo(rhs_shape, 1, DataType::F32)); + + MatMulAttributes matmul_attr {}; + matmul_attr.adj_lhs(adj_lhs); + matmul_attr.adj_rhs(adj_rhs); + + GpuMatMulSettings matmul_settings {}; + matmul_settings.m0(1); + matmul_settings.n0(1); + matmul_settings.k0(1); + + Status status = GpuMatMul::validate_op(sketch, &lhs_info, &rhs_info, matmul_attr, matmul_settings); + ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS); + } + } + } +} + + +TEST_CASE(ValidateDataTypes, framework::DatasetMode::ALL) +{ + // Configurations are assumed to be Nt/Nt, but will be transposed inside the test to test other configurations + using DataTypeConfigurationTuple = std::tuple; + const std::vector 
data_type_configurations = + { + { DataType::F32, DataType::F32, DataType::F32, true }, + { DataType::F16, DataType::F16, DataType::F16, true }, + { DataType::F16, DataType::F32, DataType::F32, false }, // no mixed precision + { DataType::F64, DataType::F64, DataType::F64, false }, // no double precision + { DataType::QASYMM8, DataType::QASYMM8, DataType::QASYMM8, false }, // no quantized types + { DataType::QASYMM8_SIGNED, DataType::QASYMM8_SIGNED, DataType::QASYMM8_SIGNED, false }, // no quantized types + { DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8_PER_CHANNEL, false }, // no quantized types + { DataType::QASYMM16, DataType::QASYMM16, DataType::QASYMM16, false }, // no quantized types + { DataType::QSYMM16, DataType::QSYMM16, DataType::QSYMM16, false }, // no quantized types + { DataType::QSYMM8, DataType::QSYMM8, DataType::QSYMM8, false }, // no quantized types + { DataType::S64, DataType::S64, DataType::S64, false }, // no integral types + { DataType::S32, DataType::S32, DataType::S32, false }, // no integral types + { DataType::S16, DataType::S16, DataType::S16, false }, // no integral types + { DataType::S8, DataType::S8, DataType::S8, false }, // no integral types + { DataType::U64, DataType::U64, DataType::U64, false }, // no integral types + { DataType::U32, DataType::U32, DataType::U32, false }, // no integral types + { DataType::U16, DataType::U16, DataType::U16, false }, // no integral types + { DataType::U8, DataType::U8, DataType::U8, false }, // no integral types + }; + // Create a sketch + auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context(); + auto context = GpuWorkloadContext{ &cl_compile_ctx }; + GpuWorkloadSketch sketch{ &context }; + + const TensorShape shape = TensorShape(10U, 10U); + MatMulAttributes matmul_attr {}; + matmul_attr.adj_lhs(false); + matmul_attr.adj_rhs(false); + GpuMatMulSettings matmul_settings {}; + matmul_settings.m0(1); + matmul_settings.n0(1); + matmul_settings.k0(1); + + for(auto &tuple : data_type_configurations) + { + const bool expected = std::get<3>(tuple); + + const TensorInfo lhs_info = context.create_tensor_info(TensorInfo(shape, 1, std::get<0>(tuple))); + const TensorInfo rhs_info = context.create_tensor_info(TensorInfo(shape, 1, std::get<1>(tuple))); + + Status status = GpuMatMul::validate_op(sketch, &lhs_info, &rhs_info, matmul_attr, matmul_settings); + ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS); + } +} + +TEST_SUITE_END() // Validate + +template +using DynamicFusionGpuMatmulFixture = DynamicFusionGpuMatMulValidationFixture; + +TEST_SUITE(Float) +TEST_SUITE(FP32) + +FIXTURE_DATA_TEST_CASE(RunTiny, DynamicFusionGpuMatmulFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::TinyMatMulDataset(), + framework::dataset::make("TransposeA", { false })), + framework::dataset::make("TransposeB", { true })), + m0_values_precommit), + n0_values_precommit), + k0_values_precommit), + framework::dataset::make("ExportRhsToCLImage", { false })), + framework::dataset::make("DataType", DataType::F32))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32); +} + +FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuMatmulFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallMatMulDataset(), + framework::dataset::make("TransposeA", { false })), + framework::dataset::make("TransposeB", { true })), + m0_values_precommit), + 
n0_values_precommit), + k0_values_precommit), + framework::dataset::make("ExportRhsToCLImage", { false })), + framework::dataset::make("DataType", DataType::F32))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32); +} + +FIXTURE_DATA_TEST_CASE(RunLargeRhsTransposed, DynamicFusionGpuMatmulFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulDataset(), + framework::dataset::make("TransposeA", { false })), + framework::dataset::make("TransposeB", { true })), + m0_values_nightly_lhs_nt), + n0_values_nightly_rhs_t), + k0_values_nightly_rhs_t), + framework::dataset::make("ExportRhsToCLImage", { false })), + framework::dataset::make("DataType", DataType::F32))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32); +} + +// Running High Dimensional test is enough for FP32, because we're stressing the number of dimensions, not data type or M0/N0/K0 +FIXTURE_DATA_TEST_CASE(RunHighDimensional, DynamicFusionGpuMatmulFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::HighDimensionalMatMulDataset(), + framework::dataset::make("TransposeA", { false })), + framework::dataset::make("TransposeB", { true })), + framework::dataset::make("M0", { 2 })), + framework::dataset::make("N0", { 2 })), + framework::dataset::make("K0", { 2 })), + framework::dataset::make("ExportRhsToCLImage", { false })), + framework::dataset::make("DataType", DataType::F32))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32); +} +TEST_SUITE_END() // FP32 + +TEST_SUITE(FP16) + +FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuMatmulFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallMatMulDataset(), + framework::dataset::make("TransposeA", { false })), + framework::dataset::make("TransposeB", { true })), + m0_values_precommit), + n0_values_precommit), + k0_values_precommit), + framework::dataset::make("ExportRhsToCLImage", { false })), + framework::dataset::make("DataType", DataType::F16))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16); +} + + +FIXTURE_DATA_TEST_CASE(RunLargeRhsTransposed, DynamicFusionGpuMatmulFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulDataset(), + framework::dataset::make("TransposeA", { false })), + framework::dataset::make("TransposeB", { true })), + m0_values_nightly_lhs_nt), + n0_values_nightly_rhs_t), + k0_values_nightly_rhs_t), + framework::dataset::make("ExportRhsToCLImage", { false })), + framework::dataset::make("DataType", DataType::F16))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16); +} + +TEST_SUITE_END() // FP16 + +TEST_SUITE_END() // Float +TEST_SUITE_END() // MatMul +TEST_SUITE_END() // DYNAMIC_FUSION +TEST_SUITE_END() // CL +} // namespace validation +} // namespace test +} // namespace arm_compute +#endif // ACL_INTERNAL_TEST_CKW_IN_DF diff --git a/tests/validation/dynamic_fusion/gpu/cl/Mul.cpp b/tests/validation/dynamic_fusion/gpu/cl/Mul.cpp index 2da2b9eabd4bfd5e61aff2d33bcf8bd676a0d246..b69479fb7e591c083d1451170fc83f3b133a375f 100644 --- a/tests/validation/dynamic_fusion/gpu/cl/Mul.cpp +++ b/tests/validation/dynamic_fusion/gpu/cl/Mul.cpp @@ -22,6 +22,9 @@ * 
SOFTWARE. */ +// TODO: Fix testing of CKW Elementwise Binary (COMPMID-6530) +#ifndef ACL_INTERNAL_TEST_CKW_IN_DF + #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h" @@ -221,3 +224,4 @@ TEST_SUITE_END() // CL } // namespace validation } // namespace test } // namespace arm_compute +#endif // ACL_INTERNAL_TEST_CKW_IN_DF diff --git a/tests/validation/dynamic_fusion/gpu/cl/Pool2d.cpp b/tests/validation/dynamic_fusion/gpu/cl/Pool2d.cpp index 7f5efd662ab08dc77d4b8912f6d5067af8a3f3d0..411e31b32b178130888ce4d2ffbb5c3850c0731c 100644 --- a/tests/validation/dynamic_fusion/gpu/cl/Pool2d.cpp +++ b/tests/validation/dynamic_fusion/gpu/cl/Pool2d.cpp @@ -21,8 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ - -#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Do not include this test if ACL_INTERNAL_TEST_CKW_IN_DF and the op has not been ported to ckw +#ifdef ACL_INTERNAL_TEST_CKW_IN_DF #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h" #include "tests/CL/CLAccessor.h" @@ -65,40 +64,22 @@ using DFPoolMixedPrecisionFixture = DynamicFusionGpuPool2dMixedPrecisionValidati // *INDENT-OFF* // clang-format off -DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( - framework::dataset::make("InputInfo", { TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC), // Mismatching data type - TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC), // Invalid pad/size combination - TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC), // Invalid pad/size combination - TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::QASYMM8, DataLayout::NHWC), // Invalid parameters, unsupported pooling +DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip( + framework::dataset::make("InputInfo", { TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::QASYMM8, DataLayout::NHWC), // Invalid parameters, unsupported pooling TensorInfo(TensorShape(5U, 15U, 13U), 1, DataType::F32, DataLayout::NHWC), // Valid Non-rectangular Global Pooling - TensorInfo(TensorShape(5U, 13U, 13U), 1, DataType::F32, DataLayout::NHWC), // Invalid output Global Pooling TensorInfo(TensorShape(5U, 13U, 13U), 1, DataType::QASYMM8, DataLayout::NHWC), // Invalid - Quantized not supported. 
TensorInfo(TensorShape(5U, 13U, 13U), 1, DataType::F32, DataLayout::NHWC), // Valid global pooling TensorInfo(TensorShape(13U, 13U, 5U), 1, DataType::F32, DataLayout::NCHW), // Unsupported data layout }), - framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(2U, 25U, 11U), 1, DataType::F16, DataLayout::NHWC), - TensorInfo(TensorShape(2U, 30U, 11U), 1, DataType::F32, DataLayout::NHWC), - TensorInfo(TensorShape(2U, 25U, 16U), 1, DataType::F32, DataLayout::NHWC), - TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::QASYMM8, DataLayout::NHWC), - TensorInfo(TensorShape(5U, 1U, 1U), 1, DataType::F32, DataLayout::NHWC), - TensorInfo(TensorShape(5U, 2U, 2U), 1, DataType::F32, DataLayout::NHWC), - TensorInfo(TensorShape(5U, 12U, 12U), 1, DataType::QASYMM8, DataLayout::NHWC), - TensorInfo(TensorShape(5U, 1U, 1U), 1, DataType::F32, DataLayout::NHWC), - TensorInfo(TensorShape(1U, 1U, 5U), 1, DataType::F32, DataLayout::NHWC), - })), framework::dataset::make("Pool2dAttributes", { - Pool2dAttributes().pool_type(PoolingType::AVG).pool_size(Size2D(3,3)).pad(Padding2D(0,0,0,0)).stride(Size2D(1,1)), - Pool2dAttributes().pool_type(PoolingType::AVG).pool_size(Size2D(2,2)).pad(Padding2D(2,2,0,0)).stride(Size2D(1,1)), - Pool2dAttributes().pool_type(PoolingType::AVG).pool_size(Size2D(2,2)).pad(Padding2D(0,0,2,2)).stride(Size2D(1,1)), Pool2dAttributes().pool_type(PoolingType::L2).pool_size(Size2D(3,3)).pad(Padding2D(0,0,0,0)).stride(Size2D(1,1)), Pool2dAttributes().pool_type(PoolingType::AVG).pool_size(Size2D(15U, 13U)), - Pool2dAttributes().pool_type(PoolingType::MAX).pool_size(Size2D(13U, 13U)), Pool2dAttributes().pool_type(PoolingType::AVG).pool_size(Size2D(2,2)).pad(Padding2D()).stride(Size2D(1,1)), Pool2dAttributes().pool_type(PoolingType::AVG).pool_size(Size2D(13U,13U)), Pool2dAttributes().pool_type(PoolingType::AVG).pool_size(Size2D(13U,13U)), })), - framework::dataset::make("Expected", { false, false, false, false, true, false, false, true, false })), - input_info, output_info, pool2d_attr, expected) + framework::dataset::make("Expected", { false, true, false, true, false })), + input_info, pool2d_attr, expected) { // Create a new workload sketch auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context(); @@ -110,8 +91,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( // Validate Pool2d Configuration auto src_info = context.create_tensor_info(input_info); - auto dst_info = context.create_tensor_info(output_info); - bool res = bool(GpuPool2d::validate_op(sketch, &src_info, &dst_info, pool2d_attr, settings)); + bool res = bool(GpuPool2d::validate_op(sketch, &src_info, pool2d_attr, settings)); ARM_COMPUTE_EXPECT(res == expected, framework::LogLevel::ERRORS); } @@ -232,5 +212,4 @@ TEST_SUITE_END() // CL } } } - #endif // ACL_INTERNAL_TEST_CKW_IN_DF diff --git a/tests/validation/dynamic_fusion/gpu/cl/Resize.cpp b/tests/validation/dynamic_fusion/gpu/cl/Resize.cpp index 9ca1c5f0da8206f12190b9c3d4ebe983ae89e60f..5f99cd6d7835471e0117465c5572d6680b30dab7 100644 --- a/tests/validation/dynamic_fusion/gpu/cl/Resize.cpp +++ b/tests/validation/dynamic_fusion/gpu/cl/Resize.cpp @@ -22,7 +22,6 @@ * SOFTWARE. 
*/ -#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Do not include this test if ACL_INTERNAL_TEST_CKW_IN_DF and the op has not been ported to ckw #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuResize.h" #include "tests/CL/CLAccessor.h" @@ -518,5 +517,3 @@ TEST_SUITE_END() // CL } // namespace validation } // namespace test } // namespace arm_compute - -#endif // ACL_INTERNAL_TEST_CKW_IN_DF diff --git a/tests/validation/dynamic_fusion/gpu/cl/Sub.cpp b/tests/validation/dynamic_fusion/gpu/cl/Sub.cpp index 0bb05c296164095edd12424979b9fed95cbb3e2c..022c9b46a8790d365100b7e53fb800bbeebd45af 100644 --- a/tests/validation/dynamic_fusion/gpu/cl/Sub.cpp +++ b/tests/validation/dynamic_fusion/gpu/cl/Sub.cpp @@ -22,6 +22,9 @@ * SOFTWARE. */ +// TODO: Fix testing of CKW Elementwise Binary (COMPMID-6530) +#ifndef ACL_INTERNAL_TEST_CKW_IN_DF + #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h" @@ -257,3 +260,4 @@ TEST_SUITE_END() // CL } // namespace validation } // namespace test } // namespace arm_compute +#endif // ACL_INTERNAL_TEST_CKW_IN_DF diff --git a/tests/validation/fixtures/AddMulAddFixture.h b/tests/validation/fixtures/AddMulAddFixture.h index a3a761bff1d95e52baf3137f545567a1aa1e12e9..d13fef2f02d37f850e03de925bfe7e94d17adfca 100644 --- a/tests/validation/fixtures/AddMulAddFixture.h +++ b/tests/validation/fixtures/AddMulAddFixture.h @@ -22,8 +22,8 @@ * SOFTWARE. */ -#ifndef TESTS_VALIDATION_FIXTURES_ADDMULADDFIXTURE -#define TESTS_VALIDATION_FIXTURES_ADDMULADDFIXTURE +#ifndef ACL_TESTS_VALIDATION_FIXTURES_ADDMULADDFIXTURE_H +#define ACL_TESTS_VALIDATION_FIXTURES_ADDMULADDFIXTURE_H #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/Types.h" @@ -86,7 +86,12 @@ protected: // Create and configure function FunctionType add_mul_add; - add_mul_add.configure(&input1, &input2, &bn_mul, &bn_add, interm_out ? &add_output : nullptr, &final_output, ConvertPolicy::SATURATE, act_info); + ARM_COMPUTE_ERROR_THROW_ON(add_mul_add.validate(input1.info(), input2.info(), bn_mul.info(), + bn_add.info(), interm_out ? add_output.info() : nullptr, final_output.info(), + ConvertPolicy::SATURATE, act_info)); + + add_mul_add.configure(&input1, &input2, &bn_mul, &bn_add, interm_out ? &add_output : nullptr, + &final_output, ConvertPolicy::SATURATE, act_info); // Allocate tensors input1.allocator()->allocate(); @@ -262,4 +267,4 @@ public: } // namespace test } // namespace arm_compute -#endif /* TESTS_VALIDATION_FIXTURES_ADDMULADDFIXTURE */ +#endif // ACL_TESTS_VALIDATION_FIXTURES_ADDMULADDFIXTURE_H diff --git a/tests/validation/fixtures/ConvolutionLayerFixture.h b/tests/validation/fixtures/ConvolutionLayerFixture.h index 8562955b79925488312c695ca01e0e4f7d055bd2..0622e5e6f0f2a64b01168dc0def7f2f35b961a39 100644 --- a/tests/validation/fixtures/ConvolutionLayerFixture.h +++ b/tests/validation/fixtures/ConvolutionLayerFixture.h @@ -21,8 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
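The AddMulAddFixture hunk above inserts a validate() call ahead of configure(), so an unsupported tensor combination fails the test with a readable Status message instead of tripping an assert deep inside configure(). A fragment of that idiom, assuming the fixture's tensors and FunctionType (for example the Neon AddMulAdd runtime function) are in scope:

    FunctionType add_mul_add;

    // Ask the operator up front whether this exact combination of tensor infos is supported;
    // ARM_COMPUTE_ERROR_THROW_ON turns a failing Status into a test failure carrying its message.
    ARM_COMPUTE_ERROR_THROW_ON(add_mul_add.validate(input1.info(), input2.info(), bn_mul.info(),
                                                    bn_add.info(), interm_out ? add_output.info() : nullptr,
                                                    final_output.info(), ConvertPolicy::SATURATE, act_info));

    // Only configure (and later run) once validation has passed.
    add_mul_add.configure(&input1, &input2, &bn_mul, &bn_add, interm_out ? &add_output : nullptr,
                          &final_output, ConvertPolicy::SATURATE, act_info);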
*/ -#ifndef ARM_COMPUTE_TEST_CONVOLUTION_LAYER_FIXTURE -#define ARM_COMPUTE_TEST_CONVOLUTION_LAYER_FIXTURE + +#ifndef ACL_TESTS_VALIDATION_FIXTURES_CONVOLUTIONLAYERFIXTURE_H +#define ACL_TESTS_VALIDATION_FIXTURES_CONVOLUTIONLAYERFIXTURE_H #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/Types.h" @@ -91,23 +92,66 @@ public: || std::is_same::type, int8_t>::value, int32_t, T >::type; + void setup_quantization(TensorShape input_shape, TensorShape weights_shape, QuantizationInfo &input_q_info, + QuantizationInfo &weights_q_info, DataType data_type) + { + const int32_t t_max = static_cast(std::numeric_limits::max()); + const int32_t t_min = static_cast(std::numeric_limits::min()); + + std::mt19937 generator(library->seed() + _hash); + std::uniform_real_distribution distribution_float(-5.0f, 3.0f); + std::uniform_int_distribution distribution_t(t_min, t_max); + + const float scale_lhs = pow(2, distribution_float(generator)); // [2^-5, 2^3] + const float scale_rhs = pow(2, distribution_float(generator)); // [2^-5, 2^3] + + const int32_t offset_lhs = distribution_t(generator); + const int32_t offset_rhs = distribution_t(generator); + + _quantization_info = QuantizationInfo(scale_lhs, offset_lhs); + _weight_quantization_info = QuantizationInfo(scale_rhs, offset_rhs); + + QuantizationHint q_hint = suggest_conv_dst_q_info_and_bias(input_q_info, weights_q_info, + weights_shape.y() /* heights */, weights_shape.x() /* width */, input_shape.z() /* channels */, + data_type, 0.5f /* bias_fraction */); + + _dst_q_info = q_hint.q_info; + _min_bias = q_hint.bias_min; + _max_bias = q_hint.bias_max; + } + public: void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, bool reshape_weights, DataType data_type, DataType weights_data_type, DataLayout data_layout, QuantizationInfo quantization_info, QuantizationInfo weight_quantization_info, ActivationLayerInfo act_info, - bool mixed_layout = false, PaddingList pre_pad_layer = PaddingList({})) + bool mixed_layout = false, PaddingList pre_pad_layer = PaddingList({}), bool padded_weights = false) { + // This hash is used by random generators. There may be hash collisions but + // this is intentional as it's a very easy way to make the the current + // random generation process almost different for many test configurations, + // which were using the same set of values before. + _hash = input_shape[0] + input_shape[1] + input_shape[2] + input_shape[3] + + + weights_shape[0] + weights_shape[1] + weights_shape[2] + weights_shape[3] + + mixed_layout + (data_type == DataType::QASYMM8_SIGNED) + (data_layout == DataLayout::NHWC); + _mixed_layout = mixed_layout; _data_type = data_type; _weights_data_type = weights_data_type; - _is_quantized = is_data_type_quantized_asymmetric(data_type); + const bool is_quantized = is_data_type_quantized(weights_data_type); _is_bfloat16 = data_type == DataType::BFLOAT16; - _bias_data_type = _is_quantized ? DataType::S32 : (_is_bfloat16 ? DataType::F32 : data_type); + _bias_data_type = is_quantized ? DataType::S32 : (_is_bfloat16 ? DataType::F32 : data_type); _output_data_type = _is_bfloat16 ? 
DataType::F32 : data_type; _quantization_info = quantization_info; _weight_quantization_info = weight_quantization_info; _data_layout = data_layout; + _dst_q_info = quantization_info; - _target = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, reshape_weights, dilation, act_info, pre_pad_layer); + if(is_quantized && !is_data_type_quantized_symmetric(weights_data_type) && (!act_info.enabled() || act_info.activation() == ActivationFunction::IDENTITY)) + { + setup_quantization(input_shape, weights_shape, _quantization_info, _weight_quantization_info, data_type); + _use_dynamic_output_quant = true; + } + + _target = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, reshape_weights, dilation, act_info, pre_pad_layer, padded_weights); _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, dilation, act_info, pre_pad_layer); } @@ -142,16 +186,34 @@ protected: { case DataType::QASYMM8: { - std::pair bounds = get_quantized_bounds(tensor.quantization_info(), -1.0f, 1.0f); - std::uniform_int_distribution distribution(bounds.first, bounds.second); - library->fill(tensor, distribution, i); + if(_use_dynamic_output_quant) + { + std::uniform_int_distribution distribution(0, 255); + library->fill(tensor, distribution, i); + } + else + { + // Legacy initialization in case the output quantization info can't be reliably estimated + std::pair bounds = get_quantized_bounds(tensor.quantization_info(), -1.0f, 1.0f); + std::uniform_int_distribution distribution(bounds.first, bounds.second); + library->fill(tensor, distribution, i); + } break; } case DataType::QASYMM8_SIGNED: { - std::pair bounds = get_quantized_qasymm8_signed_bounds(tensor.quantization_info(), -1.0f, 1.0f); - std::uniform_int_distribution distribution(bounds.first, bounds.second); - library->fill(tensor, distribution, i); + if(_use_dynamic_output_quant) + { + std::uniform_int_distribution distribution(-128, 127); + library->fill(tensor, distribution, i); + } + else + { + // Legacy initialization in case the output quantization info can't be reliably estimated + std::pair bounds = get_quantized_qasymm8_signed_bounds(tensor.quantization_info(), -1.0f, 1.0f); + std::uniform_int_distribution distribution(bounds.first, bounds.second); + library->fill(tensor, distribution, i); + } break; } case DataType::QSYMM8_PER_CHANNEL: @@ -176,7 +238,7 @@ protected: } case DataType::S32: { - std::uniform_int_distribution distribution(-100, 100); + std::uniform_int_distribution distribution(_min_bias, _max_bias); library->fill(tensor, distribution, i); break; } @@ -205,7 +267,7 @@ protected: // given input is IN nchw format TensorType compute_target(TensorShape input_shape, TensorShape weights_shape, const TensorShape &bias_shape, TensorShape output_shape, const PadStrideInfo &info, - bool reshape_weights, const Size2D &dilation, const ActivationLayerInfo act_info, PaddingList pre_pad_layer = PaddingList({})) + bool reshape_weights, const Size2D &dilation, const ActivationLayerInfo act_info, PaddingList pre_pad_layer = PaddingList({}), bool padded_weights = false) { ARM_COMPUTE_ERROR_ON((input_shape[2] % weights_shape[2]) != 0); @@ -239,8 +301,8 @@ protected: // Create tensors TensorType src = create_tensor(input_shape, _data_type, 1, _quantization_info, _data_layout); TensorType weights = create_tensor(reshaped_weights_shape, _weights_data_type, 1, _weight_quantization_info, _data_layout); - TensorType bias = create_tensor(bias_shape, _bias_data_type, 1, _quantization_info, 
_data_layout); - TensorType dst = create_tensor(output_shape, _output_data_type, 1, _quantization_info, _data_layout); + TensorType bias = create_tensor(bias_shape, _bias_data_type, 1, QuantizationInfo() /*bias is not a quantized type*/, _data_layout); + TensorType dst = create_tensor(output_shape, _output_data_type, 1, _dst_q_info, _data_layout); // Create and configure function FunctionType conv; @@ -273,8 +335,13 @@ protected: ARM_COMPUTE_ASSERT(weights.info()->is_resizable()); ARM_COMPUTE_ASSERT(bias.info()->is_resizable()); ARM_COMPUTE_ASSERT(dst.info()->is_resizable()); - - add_padding_x({ &src, &weights, &bias, &dst }, _data_layout); + // Test "add padding after configure" behavior. This behavior should not affect the correctness + add_padding_x({ &src, &bias, &dst }, _data_layout); + // Padding weights may affect code path in some backends + if (padded_weights) + { + add_padding_x({ &weights }, _data_layout); + } // Allocate tensors src.allocator()->allocate(); @@ -288,9 +355,9 @@ protected: ARM_COMPUTE_ASSERT(!dst.info()->is_resizable()); // Fill tensors - fill(AccessorType(src), 0); - fill(AccessorType(weights), 1); - fill(AccessorType(bias), 2); + fill(AccessorType(src), 0 + _hash); + fill(AccessorType(weights), 1 + _hash); + fill(AccessorType(bias), 2 + _hash); if(_mixed_layout) { @@ -322,9 +389,9 @@ protected: SimpleTensor weights{ weights_shape, weights_dt, 1, _weight_quantization_info }; SimpleTensor bias{ bias_shape, bias_dt, 1, _quantization_info }; - fill(src, 0); - fill(weights, 1); - fill(bias, 2); + fill(src, 0 + _hash); + fill(weights, 1 + _hash); + fill(bias, 2 + _hash); // Fill with bfloat16 to perform the conversion and reduce the mismatches in the output if(_is_bfloat16) @@ -338,9 +405,9 @@ protected: src = reference::pad_layer(src, pre_pad_layer, PixelValue(0), PaddingMode::CONSTANT); } - return (act_info.enabled()) ? reference::activation_layer(reference::convolution_layer(src, weights, bias, output_shape, info, dilation, num_groups), + return (act_info.enabled()) ? 
reference::activation_layer(reference::convolution_layer(src, weights, bias, output_shape, info, dilation, num_groups, _dst_q_info), act_info) : - reference::convolution_layer(src, weights, bias, output_shape, info, dilation, num_groups); + reference::convolution_layer(src, weights, bias, output_shape, info, dilation, num_groups, _dst_q_info); } TensorType _target{}; @@ -352,9 +419,13 @@ protected: DataLayout _data_layout{}; QuantizationInfo _quantization_info{}; QuantizationInfo _weight_quantization_info{}; - bool _is_quantized = false; + QuantizationInfo _dst_q_info{}; bool _is_bfloat16 = false; bool _mixed_layout = false; + bool _use_dynamic_output_quant{false}; + int32_t _hash{0}; + int32_t _min_bias{-100}; + int32_t _max_bias{100}; }; template @@ -370,6 +441,19 @@ public: } }; +template +class ConvolutionValidationPaddedWeightsFixture : public ConvolutionValidationGenericFixture +{ +public: + void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, bool reshape_weights, DataType data_type, + DataLayout data_layout) + { + ConvolutionValidationGenericFixture::setup(input_shape, weights_shape, bias_shape, output_shape, info, dilation, reshape_weights, + data_type, data_type, data_layout, + QuantizationInfo(), QuantizationInfo(), ActivationLayerInfo(), mixed_layout, PaddingList({}), true); + } +}; + template class ConvolutionValidationWithPaddingFixture : public ConvolutionValidationGenericFixture { @@ -415,6 +499,7 @@ public: } }; + #ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS inline TensorInfo prepare_weights(const TensorInfo tensor_info, const arm_compute::WeightFormat weight_format) { @@ -707,7 +792,7 @@ public: const WeightsInfo weights_info(false, 3U, 3U, 64U, false, query_weight_format); _kernel_found = bool(ConvolutionClass::has_opt_impl(_computed_weight_format, &src_info, &weight_info, &bias_info, &dst_info, conv_info, weights_info, - /*dilation*/ Size2D(1U, 1U), /*act_info*/ ActivationLayerInfo(), enable_fast_math)); + Size2D(1U, 1U) /*dilation*/, ActivationLayerInfo() /*act_info*/, enable_fast_math)); } protected: @@ -719,4 +804,5 @@ protected: } // namespace validation } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_CONVOLUTION_LAYER_FIXTURE */ + +#endif // ACL_TESTS_VALIDATION_FIXTURES_CONVOLUTIONLAYERFIXTURE_H diff --git a/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h b/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h index 5a00c7c2d6bd42978735d9edf6418d05e2c4b0a7..d291a7736a98a6f61ee2042ce2b42d7281e6c6fa 100644 --- a/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h +++ b/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h @@ -342,7 +342,6 @@ public: add_padding_x({ &_src, &_biases, &_target }, _data_layout); add_padding_x({ &_weights }, _data_layout, true); - add_padding_y({ &_src, &_target }, _data_layout); // Create Depthwise Convolution configure function const ConvolutionInfo info diff --git a/tests/validation/fixtures/DirectConvolution3DFixture.h b/tests/validation/fixtures/DirectConvolution3DFixture.h index 6aad5490d90b6fabe5250f85fe5d8fe4d4b01023..e80ad2f54ff90f3c79ae647c3d272a2577e09359 100644 --- a/tests/validation/fixtures/DirectConvolution3DFixture.h +++ b/tests/validation/fixtures/DirectConvolution3DFixture.h @@ -21,7 +21,12 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
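The convolution fixture above now requests extra x-padding only after configure(), and pads the weights only in the dedicated PaddedWeights variant, because padded weights can select a different code path in some backends. A fragment of that logic, assuming the fixture's tensors and the add_padding_x test helper used throughout these fixtures:

    // Test "add padding after configure" behaviour: correctness must not depend on the
    // padding the kernels saw at configure time.
    add_padding_x({ &src, &bias, &dst }, _data_layout);
    if (padded_weights)
    {
        // Exercised only by ConvolutionValidationPaddedWeightsFixture.
        add_padding_x({ &weights }, _data_layout);
    }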
*/ + +#ifndef ACL_TESTS_VALIDATION_FIXTURES_DIRECTCONVOLUTION3DFIXTURE_H +#define ACL_TESTS_VALIDATION_FIXTURES_DIRECTCONVOLUTION3DFIXTURE_H + #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "tests/framework/Asserts.h" // Required for ARM_COMPUTE_ASSERT #include "tests/framework/Fixture.h" #include "tests/validation/reference/ActivationLayer.h" #include "tests/validation/reference/Conv3D.h" @@ -180,3 +185,5 @@ public: } // namespace validation } // namespace test } // namespace arm_compute + +#endif // ACL_TESTS_VALIDATION_FIXTURES_DIRECTCONVOLUTION3DFIXTURE_H diff --git a/tests/validation/fixtures/DirectConvolutionLayerFixture.h b/tests/validation/fixtures/DirectConvolutionLayerFixture.h index a666ff96a5da3010f575f1ecfefd2327d79153cc..6f204642ca03e53823afd0ad6bb2103a1a118175 100644 --- a/tests/validation/fixtures/DirectConvolutionLayerFixture.h +++ b/tests/validation/fixtures/DirectConvolutionLayerFixture.h @@ -21,6 +21,10 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ + +#ifndef ACL_TESTS_VALIDATION_FIXTURES_DIRECTCONVOLUTIONLAYERFIXTURE_H +#define ACL_TESTS_VALIDATION_FIXTURES_DIRECTCONVOLUTIONLAYERFIXTURE_H + #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/Types.h" @@ -51,10 +55,52 @@ class DirectConvolutionValidationGenericFixture : public framework::Fixture public: using TBias = typename std::conditional < std::is_same::value || std::is_same::value, int32_t, T >::type; + void setup_quantization(const TensorShape &input_shape, const TensorShape &weights_shape, QuantizationInfo &input_q_info, + QuantizationInfo &weights_q_info, DataType data_type) + { + const int32_t t_max = static_cast(std::numeric_limits::max()); + const int32_t t_min = static_cast(std::numeric_limits::min()); + + std::mt19937 generator(library->seed() + _hash); + std::uniform_real_distribution distribution_float(-5.0f, 3.0f); + std::uniform_int_distribution distribution_t(t_min, t_max); + + const float scale_lhs = pow(2, distribution_float(generator)); // [2^-5, 2^3] + const float scale_rhs = pow(2, distribution_float(generator)); // [2^-5, 2^3] + + const int32_t offset_lhs = distribution_t(generator); + const int32_t offset_rhs = distribution_t(generator); + + input_q_info = QuantizationInfo(scale_lhs, offset_lhs); + weights_q_info = QuantizationInfo(scale_rhs, offset_rhs); + + QuantizationHint q_hint = suggest_conv_dst_q_info_and_bias(input_q_info, weights_q_info, + weights_shape.y() /* heights */, weights_shape.x() /* width */, input_shape.z() /* channels */, + data_type, 0.5f /* bias_fraction */); + + _dst_q_info = q_hint.q_info; + _min_bias = q_hint.bias_min; + _max_bias = q_hint.bias_max; + + // Do not change here as these limits are the natural limits of the associated data types and + // are embeded in the computation of the dst quantization info. + _min_u8 = 0; + _max_u8 = 255; + _min_s8 = -128; + _max_s8 = 127; + } + void setup(TensorShape input_shape, int stride_x, int stride_y, int pad_x, int pad_y, unsigned int kernel_size, unsigned int num_kernels, DataType data_type, QuantizationInfo quantization_info, ActivationLayerInfo act_info, DataLayout data_layout, bool mixed_layout = false) { - _quantization_info = quantization_info; + // This hash is used by random generators. 
There may be hash collisions but + // this is intentional as it's a very easy way to make the the current + // random generation process almost different for many test configurations, + // which were using the same set of values before. + _hash = input_shape[0] + input_shape[1] + input_shape[2] + input_shape[3] + + stride_x + stride_y + pad_x + pad_y + kernel_size + num_kernels + mixed_layout + + (data_layout == DataLayout::NHWC); + _data_type = data_type; _mixed_layout = mixed_layout; @@ -68,8 +114,17 @@ public: const TensorShape output_shape = compute_deep_convolution_shape(input_info, weights_info, info); - _target = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, quantization_info, act_info, data_layout); - _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, quantization_info, act_info); + QuantizationInfo input_q_info = quantization_info; + QuantizationInfo weights_q_info = quantization_info; + _dst_q_info = quantization_info; + + if(is_data_type_quantized(data_type) && (!act_info.enabled() || act_info.activation() == ActivationFunction::IDENTITY)) + { + setup_quantization(input_shape, weights_shape, input_q_info, weights_q_info, data_type); + } + + _target = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, input_q_info, weights_q_info, act_info, data_layout); + _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, input_q_info, weights_q_info, act_info); } void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, @@ -78,13 +133,29 @@ public: ARM_COMPUTE_ERROR_ON(data_layout == DataLayout::UNKNOWN); ARM_COMPUTE_UNUSED(dilation); - _quantization_info = quantization_info; + // This hash is used by random generators. There may be hash collisions but + // this is intentional as it's a very easy way to make the the current + // random generation process almost different for many test configurations, + // which were using the same set of values before. + _hash = input_shape[0] + input_shape[1] + input_shape[2] + input_shape[3] + + weights_shape[0] + weights_shape[1] + weights_shape[2] + weights_shape[3] + dilation.x() + + dilation.y() + info.pad_bottom() + info.pad_left() + info.pad_right() + info.pad_top(); + _data_type = data_type; const DataType bias_data_type = is_data_type_quantized_asymmetric(data_type) ? 
DataType::S32 : data_type; - _target = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, quantization_info, act_info, data_layout); - _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, quantization_info, act_info); + QuantizationInfo input_q_info = quantization_info; + QuantizationInfo weights_q_info = quantization_info; + _dst_q_info = quantization_info; + + if(is_data_type_quantized(data_type) && (!act_info.enabled() || act_info.activation() == ActivationFunction::IDENTITY)) + { + setup_quantization(input_shape, weights_shape, input_q_info, weights_q_info, data_type); + } + + _target = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, input_q_info, weights_q_info, act_info, data_layout); + _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, input_q_info, weights_q_info, act_info); } protected: @@ -110,14 +181,14 @@ protected: { case DataType::QASYMM8: { - std::uniform_int_distribution distribution(0, 50); + std::uniform_int_distribution distribution(_min_u8, _max_u8); library->fill(tensor, distribution, i); break; } case DataType::QASYMM8_SIGNED: { // Use small input range to avoid all the test results being saturated at the end. - std::uniform_int_distribution distribution(-25, 25); + std::uniform_int_distribution distribution(_min_s8, _max_s8); library->fill(tensor, distribution, i); break; } @@ -135,7 +206,7 @@ protected: } case DataType::S32: { - std::uniform_int_distribution distribution(-5, 5); + std::uniform_int_distribution distribution(_min_bias, _max_bias); library->fill(tensor, distribution, i); break; } @@ -145,7 +216,7 @@ protected: } TensorType compute_target(TensorShape input_shape, TensorShape weights_shape, const TensorShape &bias_shape, TensorShape output_shape, const PadStrideInfo &info, - DataType data_type, DataType bias_data_type, QuantizationInfo quantization_info, ActivationLayerInfo act_info, const DataLayout &data_layout) + DataType data_type, DataType bias_data_type, QuantizationInfo input_q_info, QuantizationInfo weights_q_info, ActivationLayerInfo act_info, const DataLayout &data_layout) { if(data_layout == DataLayout::NHWC) { @@ -155,10 +226,10 @@ protected: } // Create tensors - TensorType src = create_tensor(input_shape, data_type, 1, quantization_info, data_layout); - TensorType weights = create_tensor(weights_shape, data_type, 1, quantization_info, data_layout); - TensorType bias = create_tensor(bias_shape, bias_data_type, 1, quantization_info); - TensorType dst = create_tensor(output_shape, data_type, 1, quantization_info, data_layout); + TensorType src = create_tensor(input_shape, data_type, 1, input_q_info, data_layout); + TensorType weights = create_tensor(weights_shape, data_type, 1, weights_q_info, data_layout); + TensorType bias = create_tensor(bias_shape, bias_data_type, 1, QuantizationInfo()); + TensorType dst = create_tensor(output_shape, data_type, 1, _dst_q_info, data_layout); add_padding_x({ &src, &bias, &dst }, data_layout); add_padding_x({ &weights }, data_layout, input_shape[0] % 4 == 0); // Don't add left padding if cl image will be used @@ -184,9 +255,9 @@ protected: ARM_COMPUTE_ASSERT(!dst.info()->is_resizable()); // Fill tensors - fill(AccessorType(src), 0); - fill(AccessorType(weights), 1); - fill(AccessorType(bias), 2); + fill(AccessorType(src), 0 + _hash); + fill(AccessorType(weights), 1 + _hash); + 
fill(AccessorType(bias), 2 + _hash); if(_mixed_layout) { @@ -202,26 +273,39 @@ protected: } SimpleTensor compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info, - DataType data_type, DataType bias_data_type, QuantizationInfo quantization_info, ActivationLayerInfo act_info) + DataType data_type, DataType bias_data_type, QuantizationInfo input_q_info, QuantizationInfo weights_q_info, ActivationLayerInfo act_info) { // Create reference - SimpleTensor src{ input_shape, data_type, 1, quantization_info }; - SimpleTensor weights{ weights_shape, data_type, 1, quantization_info }; - SimpleTensor bias{ bias_shape, bias_data_type, 1, quantization_info }; + SimpleTensor src{ input_shape, data_type, 1, input_q_info }; + SimpleTensor weights{ weights_shape, data_type, 1, weights_q_info }; + SimpleTensor bias{ bias_shape, bias_data_type, 1, QuantizationInfo() }; // Fill reference - fill(src, 0); - fill(weights, 1); - fill(bias, 2); - - SimpleTensor dst = reference::convolution_layer(src, weights, bias, output_shape, info); - return (act_info.enabled()) ? reference::activation_layer(dst, act_info) : dst; + fill(src, 0 + _hash); + fill(weights, 1 + _hash); + fill(bias, 2 + _hash); + + SimpleTensor dst = reference::convolution_layer(src, weights, bias, output_shape, info, + Size2D(1U, 1U) /* dilation */, 1 /* num_groups */, _dst_q_info); + SimpleTensor dst2 = (act_info.enabled()) ? reference::activation_layer(dst, act_info) : dst; + return dst2; } TensorType _target{}; SimpleTensor _reference{}; - QuantizationInfo _quantization_info{}; + QuantizationInfo _dst_q_info{}; DataType _data_type{}; bool _mixed_layout{ false }; + int32_t _hash{0}; + + // Random initialization limits + // Default values are previously handcrafted limits + // that sould be used when we don't use dynamic quantization + int32_t _min_bias{-5}; + int32_t _max_bias{5}; + int32_t _min_u8{0}; + int32_t _max_u8{50}; + int32_t _min_s8{-25}; + int32_t _max_s8{25}; }; template @@ -275,3 +359,5 @@ public: } // namespace validation } // namespace test } // namespace arm_compute + +#endif // ACL_TESTS_VALIDATION_FIXTURES_DIRECTCONVOLUTIONLAYERFIXTURE_H diff --git a/tests/validation/fixtures/ElementwiseOperationsFixture.h b/tests/validation/fixtures/ElementwiseOperationsFixture.h index b5547b67628b7d1f3075839ba4a031da22c18958..f36a1f75b7bfe2839af4534252c660f06e79ad57 100644 --- a/tests/validation/fixtures/ElementwiseOperationsFixture.h +++ b/tests/validation/fixtures/ElementwiseOperationsFixture.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
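Several fixtures above derive a small _hash from the test configuration (shape dimensions, layout and data-type flags) and add it to every fill seed, so configurations that previously drew identical random tensors now diverge; hash collisions are tolerated by design. A standalone sketch of the scheme (names here are illustrative, not the library's):

    #include <cstdint>
    #include <cstdio>
    #include <random>
    #include <vector>

    // Cheap configuration "hash": the sum of the shape dimensions plus a few flags.
    static int32_t config_hash(const std::vector<int> &input_shape, const std::vector<int> &weights_shape, bool nhwc)
    {
        int32_t h = 0;
        for (int d : input_shape)   h += d;
        for (int d : weights_shape) h += d;
        return h + static_cast<int32_t>(nhwc);
    }

    // Seed with base_seed + tensor_index + hash, mirroring fill(tensor, i + _hash) in the fixtures.
    static void fill_u8(std::vector<uint8_t> &t, uint32_t base_seed, int tensor_idx, int32_t hash)
    {
        std::mt19937 gen(base_seed + static_cast<uint32_t>(tensor_idx + hash));
        std::uniform_int_distribution<int> dist(0, 255);
        for (auto &v : t) v = static_cast<uint8_t>(dist(gen));
    }

    int main()
    {
        const int32_t hash = config_hash({ 1, 23, 27, 2 }, { 2, 2, 3, 3 }, true);
        std::vector<uint8_t> src(16), weights(16), bias(16);
        fill_u8(src, 1234u, 0, hash);     // like fill(AccessorType(src), 0 + _hash)
        fill_u8(weights, 1234u, 1, hash); // like fill(AccessorType(weights), 1 + _hash)
        fill_u8(bias, 1234u, 2, hash);    // like fill(AccessorType(bias), 2 + _hash)
        std::printf("hash=%d first src byte=%u\n", static_cast<int>(hash), static_cast<unsigned>(src[0]));
        return 0;
    }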
*/ -#ifndef ARM_COMPUTE_TEST_ELEMENTWISE_OPERATIONS_FIXTURE -#define ARM_COMPUTE_TEST_ELEMENTWISE_OPERATIONS_FIXTURE +#ifndef ACL_TESTS_VALIDATION_FIXTURES_ELEMENTWISEOPERATIONSFIXTURE_H +#define ACL_TESTS_VALIDATION_FIXTURES_ELEMENTWISEOPERATIONSFIXTURE_H #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/Types.h" @@ -303,7 +303,7 @@ public: { ArithmeticOperationsGenericFixture::setup(ArithmeticOperation::DIV, shape0, shape1, data_type0, data_type1, output_data_type, - QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), is_inplace, true); + QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), is_inplace, true /* use_dynamic_shape */); } }; @@ -315,7 +315,7 @@ public: { ArithmeticOperationsGenericFixture::setup(ArithmeticOperation::DIV, shape, shape, data_type0, data_type1, output_data_type, - QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), is_inplace); + QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), is_inplace, true /* use_dynamic_shape */); } }; @@ -700,4 +700,4 @@ public: } // namespace validation } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_ARITHMETIC_OPERATIONS_FIXTURE */ +#endif // ACL_TESTS_VALIDATION_FIXTURES_ELEMENTWISEOPERATIONSFIXTURE_H diff --git a/tests/validation/fixtures/FullyConnectedLayerFixture.h b/tests/validation/fixtures/FullyConnectedLayerFixture.h index fb1cb4dcb62f9cbbd892d69b30a5e7f57725dc7e..05f20ac12b63da9069a33cd850d43994d0f40b76 100644 --- a/tests/validation/fixtures/FullyConnectedLayerFixture.h +++ b/tests/validation/fixtures/FullyConnectedLayerFixture.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_TEST_FULLY_CONNECTED_LAYER_FIXTURE -#define ARM_COMPUTE_TEST_FULLY_CONNECTED_LAYER_FIXTURE +#ifndef ACL_TESTS_VALIDATION_FIXTURES_FULLYCONNECTEDLAYERFIXTURE_H +#define ACL_TESTS_VALIDATION_FIXTURES_FULLYCONNECTEDLAYERFIXTURE_H #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/Types.h" @@ -34,6 +34,7 @@ #include "tests/framework/Asserts.h" #include "tests/framework/Fixture.h" #include "tests/validation/Helpers.h" +#include "tests/validation/Validation.h" #include "tests/validation/reference/ActivationLayer.h" #include "tests/validation/reference/FullyConnectedLayer.h" #include "tests/validation/reference/Utils.h" @@ -54,6 +55,40 @@ public: using TBias = typename std::conditional < (std::is_same::value || std::is_same::value), int32_t, T >::type; public: + void setup_quantization(TensorShape weights_shape, TensorShape output_shape, QuantizationInfo &input_q_info, QuantizationInfo &weights_q_info, DataType data_type) + { + _hash = weights_shape[0] + weights_shape[1] + output_shape[0] + output_shape[1]; + const int32_t t_max = static_cast(std::numeric_limits::max()); + const int32_t t_min = static_cast(std::numeric_limits::min()); + + std::mt19937 generator(library->seed() + _hash); + std::uniform_real_distribution distribution_float(-5.0f, 3.0f); + std::uniform_int_distribution distribution_t(t_min, t_max); + + const float scale_lhs = pow(2, distribution_float(generator)); // [2^-5, 2^3] + const float scale_rhs = pow(2, distribution_float(generator)); // [2^-5, 2^3] + const int32_t offset_lhs = distribution_t(generator); + const int32_t offset_rhs = distribution_t(generator); + + input_q_info = QuantizationInfo(scale_lhs, offset_lhs); + weights_q_info = QuantizationInfo(scale_rhs, offset_rhs); + + + const int k = weights_shape.x(); + QuantizationHint q_hint = 
suggest_mac_dst_q_info_and_bias(input_q_info, weights_q_info, k, data_type, 0.1f /* bias_fraction */, 4 /* number of standard deviations*/); + + _dst_q_info = q_hint.q_info; + _min_bias = q_hint.bias_min; + _max_bias = q_hint.bias_max; + + // Do not change here as these limits are the natural limits of the associated data types and + // are embedded in the computation of the dst quantization info. + _min_u8 = 0; + _max_u8 = 255; + _min_s8 = -128; + _max_s8 = 127; + } + void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, bool transpose_weights, bool reshape_weights, DataType data_type, QuantizationInfo quantization_info, ActivationLayerInfo activation_info, bool mixed_layout = false) { @@ -63,7 +98,20 @@ public: _mixed_layout = mixed_layout; _data_type = data_type; _bias_data_type = is_data_type_quantized_asymmetric(data_type) ? DataType::S32 : data_type; - _quantization_info = quantization_info; + + // Note : Quantization Info parameter from setup function is only used when quant datatype and activation function is not enabled or is identity. + if(is_data_type_quantized(data_type) && (!activation_info.enabled() || activation_info.activation() == ActivationFunction::IDENTITY)) + { + // Initialises quantization info with appropriate scale and offset for given input shapes. + setup_quantization(weights_shape, output_shape,_input_q_info, _weight_q_info, data_type); + } + else + { + _input_q_info = quantization_info; + _weight_q_info = quantization_info; + _dst_q_info = quantization_info; + } + _activation_info = activation_info; _target = compute_target(input_shape, weights_shape, bias_shape, output_shape, transpose_weights, reshape_weights); @@ -91,17 +139,17 @@ protected: { if(_data_type == DataType::QASYMM8) { - std::uniform_int_distribution distribution(0, 30); + std::uniform_int_distribution distribution(_min_u8, _max_u8); library->fill(tensor, distribution, i); } else if(_data_type == DataType::QASYMM8_SIGNED) { - std::uniform_int_distribution distribution(-15, 15); + std::uniform_int_distribution distribution(_min_s8, _max_s8); library->fill(tensor, distribution, i); } else if(_data_type == DataType::S32) { - std::uniform_int_distribution distribution(-50, 50); + std::uniform_int_distribution distribution(_min_bias, _max_bias); library->fill(tensor, distribution, i); } else if(_data_type == DataType::F16) @@ -143,10 +191,10 @@ protected: } // Create tensors - TensorType src = create_tensor(input_shape, _data_type, 1, _quantization_info); - TensorType weights = create_tensor(reshaped_weights_shape, _data_type, 1, _quantization_info); - TensorType bias = create_tensor(bias_shape, _bias_data_type, 1, _quantization_info); - TensorType dst = create_tensor(output_shape, _data_type, 1, _quantization_info); + TensorType src = create_tensor(input_shape, _data_type, 1, _input_q_info); + TensorType weights = create_tensor(reshaped_weights_shape, _data_type, 1, _weight_q_info); + TensorType bias = create_tensor(bias_shape, _bias_data_type, 1); + TensorType dst = create_tensor(output_shape, _data_type, 1, _dst_q_info); // Create Fully Connected layer info FullyConnectedLayerInfo fc_info; @@ -177,8 +225,8 @@ protected: ARM_COMPUTE_ASSERT(!dst.info()->is_resizable()); // Fill tensors - fill(AccessorType(src), 0); - fill(AccessorType(bias), 2); + fill(AccessorType(src), 0 + _hash); + fill(AccessorType(bias), 2 + _hash); if(!reshape_weights || !transpose_weights) { @@ -186,7 +234,7 @@ protected: RawTensor tmp(tmp_shape, _data_type, 1); // Fill 
with original shape - fill(tmp, 1); + fill(tmp, 1 + _hash); // Transpose elementwise tmp = transpose(tmp); @@ -203,7 +251,7 @@ protected: } else { - fill(AccessorType(weights), 1); + fill(AccessorType(weights), 1 + _hash); } if(_mixed_layout) @@ -222,16 +270,16 @@ protected: SimpleTensor compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape) { // Create reference - SimpleTensor src{ input_shape, _data_type, 1, _quantization_info }; - SimpleTensor weights{ weights_shape, _data_type, 1, _quantization_info }; - SimpleTensor bias{ bias_shape, _bias_data_type, 1, _quantization_info }; + SimpleTensor src{ input_shape, _data_type, 1, _input_q_info }; + SimpleTensor weights{ weights_shape, _data_type, 1, _weight_q_info }; + SimpleTensor bias{ bias_shape, _bias_data_type, 1, QuantizationInfo() }; // Fill reference - fill(src, 0); - fill(weights, 1); - fill(bias, 2); + fill(src, 0 + _hash); + fill(weights, 1 + _hash); + fill(bias, 2 + _hash); - return reference::activation_layer(reference::fully_connected_layer(src, weights, bias, output_shape, _quantization_info), _activation_info, _quantization_info); + return reference::activation_layer(reference::fully_connected_layer(src, weights, bias, output_shape, _dst_q_info), _activation_info, _dst_q_info); } TensorType _target{}; @@ -239,8 +287,22 @@ protected: DataType _data_type{}; DataType _bias_data_type{}; bool _mixed_layout{ false }; - QuantizationInfo _quantization_info{}; + QuantizationInfo _input_q_info{}; + QuantizationInfo _weight_q_info{}; + QuantizationInfo _dst_q_info{}; ActivationLayerInfo _activation_info{}; + + // Random initialization limits + // Default values are previously handcrafted limits + // that sould be used when we don't use dynamic quantization + int32_t _min_bias{-50}; + int32_t _max_bias{50}; + + int32_t _min_u8{0}; + int32_t _max_u8{30}; + int32_t _min_s8{-15}; + int32_t _max_s8{15}; + int _hash{0}; }; template @@ -288,12 +350,17 @@ private: } else if(_data_type == DataType::QASYMM8) { - std::uniform_int_distribution distribution(0, 30); + std::uniform_int_distribution distribution(_min_u8, _max_u8); + library->fill(tensor, distribution, i); + } + else if(_data_type == DataType::QASYMM8_SIGNED) + { + std::uniform_int_distribution distribution(_min_s8, _max_s8); library->fill(tensor, distribution, i); } else if(_data_type == DataType::S32) { - std::uniform_int_distribution distribution(-50, 50); + std::uniform_int_distribution distribution(_min_bias, _max_bias); library->fill(tensor, distribution, i); } else @@ -351,6 +418,40 @@ private: validate(AccessorType(target), ref, tolerance_qasymm8_signed); } + void setup_quantization(TensorShape weights_shape, TensorShape output_shape, QuantizationInfo &input_q_info, QuantizationInfo &weights_q_info, DataType data_type) + { + _hash = weights_shape[0] + weights_shape[1] + output_shape[0] + output_shape[1]; + + const int32_t t_max = static_cast(std::numeric_limits::max()); + const int32_t t_min = static_cast(std::numeric_limits::min()); + + std::mt19937 generator(library->seed() + _hash); + std::uniform_real_distribution distribution_float(-5.0f, 3.0f); + std::uniform_int_distribution distribution_t(t_min, t_max); + + const float scale_lhs = pow(2, distribution_float(generator)); // [2^-5, 2^3] + const float scale_rhs = pow(2, distribution_float(generator)); // [2^-5, 2^3] + const int32_t offset_lhs = distribution_t(generator); + const int32_t offset_rhs = distribution_t(generator); + + 
input_q_info = QuantizationInfo(scale_lhs, offset_lhs); + weights_q_info = QuantizationInfo(scale_rhs, offset_rhs); + + const int k = weights_shape.x(); + QuantizationHint q_hint = suggest_mac_dst_q_info_and_bias(input_q_info, weights_q_info, k, data_type, 0.1f /* bias_fraction */, 4 /* number of standard deviations*/); + + _dst_q_info = q_hint.q_info; + _min_bias = q_hint.bias_min; + _max_bias = q_hint.bias_max; + + // Do not change here as these limits are the natural limits of the associated data types and + // are embedded in the computation of the dst quantization info. + _min_u8 = 0; + _max_u8 = 255; + _min_s8 = -128; + _max_s8 = 127; + } + public: using TDecay = typename std::decay::type; using TBias = typename std::conditional < (std::is_same::value || std::is_same::value), int32_t, T >::type; @@ -363,15 +464,22 @@ public: const bool is_quantized = is_data_type_quantized(data_type); const DataType bias_data_type = (is_quantized) ? DataType::S32 : data_type; - const QuantizationInfo src_qinfo = is_quantized ? QuantizationInfo(0.1f, 10) : QuantizationInfo(); - const QuantizationInfo weights_qinfo = is_quantized ? QuantizationInfo(0.3f, 20) : QuantizationInfo(); - const QuantizationInfo dst_qinfo = is_quantized ? QuantizationInfo(0.2f, 5) : QuantizationInfo(); + if (is_quantized && (!activation_info.enabled() || activation_info.activation() == ActivationFunction::IDENTITY)) + { + setup_quantization(weights_shape, dst_shape, _src_q_info, _weights_q_info, data_type); + } + else + { + _src_q_info = QuantizationInfo(0.1f, 10); + _dst_q_info = QuantizationInfo(0.3f, 20); + _weights_q_info = QuantizationInfo(0.2f, 5); + } // Configure TensorInfo Objects - const TensorInfo src_info(src_shape, 1, data_type, src_qinfo); - const TensorInfo dst_info(dst_shape, 1, data_type, dst_qinfo); + const TensorInfo src_info(src_shape, 1, data_type, _src_q_info); + const TensorInfo dst_info(dst_shape, 1, data_type, _dst_q_info); TensorInfo bias_info(bias_shape, 1, bias_data_type); - TensorInfo wei_info(weights_shape, 1, data_type, weights_qinfo); + TensorInfo wei_info(weights_shape, 1, data_type, _weights_q_info); if(!constant_weights && weights_reshaped) { @@ -411,20 +519,20 @@ public: int randomizer_offset = 0; // Create reference tensors - SimpleTensor src{ src_shape, data_type, 1, src_qinfo }; - SimpleTensor weights{ weights_shape, data_type, 1, weights_qinfo }; + SimpleTensor src{ src_shape, data_type, 1, _src_q_info }; + SimpleTensor weights{ weights_shape, data_type, 1, _weights_q_info }; SimpleTensor bias{ bias_shape, bias_data_type }; // Fill weights and/or bias if they remain constant if(constant_weights) { - fill(AccessorType(_weights), 1); - fill(weights, 1); + fill(AccessorType(_weights), 1 + _hash); + fill(weights, 1 + _hash); } if(constant_bias && !remove_bias) { - fill(AccessorType(_bias), 2); - fill(bias, 2); + fill(AccessorType(_bias), 2 + _hash); + fill(bias, 2 + _hash); } // To remove bias, fill with 0 if(remove_bias && is_quantized) @@ -445,16 +553,16 @@ public: { if(weights_reshaped) { - fill_transposed_weights(_weights, weights_shape, randomizer_offset + 1); + fill_transposed_weights(_weights, weights_shape, randomizer_offset + 1 + _hash); } else { - fill(AccessorType(_weights), randomizer_offset + 1); + fill(AccessorType(_weights), randomizer_offset + 1 +_hash); } } if(!constant_bias && !remove_bias) { - fill(AccessorType(_bias), randomizer_offset + 2); + fill(AccessorType(_bias), randomizer_offset + 2 + _hash); } fc.run(); @@ -466,14 +574,14 @@ public: fill(src, randomizer_offset); 
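The setup_quantization() helpers above all follow the same recipe: draw the input and weight scales as 2^u with u uniform in [-5, 3], draw the offsets from the full range of the quantized type, then let the library's suggest_conv_dst_q_info_and_bias / suggest_mac_dst_q_info_and_bias helpers pick the destination QuantizationInfo and the bias bounds. A standalone sketch of the scale/offset generation only (the destination hint is not reimplemented here; names are illustrative):

    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <limits>
    #include <random>
    #include <utility>

    struct QInfo { float scale; int32_t offset; };

    // QT is the quantized element type, e.g. uint8_t (QASYMM8) or int8_t (QASYMM8_SIGNED).
    template <typename QT>
    static std::pair<QInfo, QInfo> make_lhs_rhs_qinfo(uint32_t seed)
    {
        std::mt19937 gen(seed); // the fixtures seed with library->seed() + _hash
        std::uniform_real_distribution<float>  exp_dist(-5.0f, 3.0f);
        std::uniform_int_distribution<int32_t> off_dist(std::numeric_limits<QT>::min(),
                                                        std::numeric_limits<QT>::max());

        const QInfo lhs{ std::pow(2.0f, exp_dist(gen)), off_dist(gen) }; // scale in [2^-5, 2^3]
        const QInfo rhs{ std::pow(2.0f, exp_dist(gen)), off_dist(gen) };
        return { lhs, rhs };
    }

    int main()
    {
        const auto q = make_lhs_rhs_qinfo<uint8_t>(42u);
        std::printf("lhs scale=%f offset=%d, rhs scale=%f offset=%d\n",
                    q.first.scale, q.first.offset, q.second.scale, q.second.offset);
        return 0;
    }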
if(!constant_weights) { - fill(weights, randomizer_offset + 1); + fill(weights, randomizer_offset + 1 + _hash); } if(!constant_bias && !remove_bias) { - fill(bias, randomizer_offset + 2); + fill(bias, randomizer_offset + 2 + _hash); } - auto dst = reference::activation_layer(reference::fully_connected_layer(src, weights, bias, dst_shape, dst_qinfo), activation_info, dst_qinfo); + auto dst = reference::activation_layer(reference::fully_connected_layer(src, weights, bias, dst_shape, _dst_q_info), activation_info, _dst_q_info); // Validate validate_with_tolerance(_dst, dst); @@ -486,6 +594,22 @@ private: TensorType _src{}, _weights{}, _bias{}, _dst{}; DataType _data_type{ DataType::UNKNOWN }; + + QuantizationInfo _src_q_info{}; + QuantizationInfo _weights_q_info{}; + QuantizationInfo _dst_q_info{}; + + // Random initialization limits + // Default values are previously handcrafted limits + // that should be used when we don't use dynamic quantization + int32_t _min_bias{-50}; + int32_t _max_bias{50}; + + int32_t _min_u8{0}; + int32_t _max_u8{30}; + int32_t _min_s8{-15}; + int32_t _max_s8{15}; + int _hash{0}; }; template @@ -520,10 +644,10 @@ public: DataType data_type, ActivationLayerInfo activation_info) { FullyConnectedWithDynamicTensorsFixture::setup(src_shape, weights_shape, bias_shape, - dst_shape, data_type, activation_info, true, false, false, false /* weights_reshaped (not used) */); + dst_shape, data_type, activation_info, true, false, false, false); } }; } // namespace validation } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_FULLY_CONNECTED_LAYER_FIXTURE */ +#endif // ACL_TESTS_VALIDATION_FIXTURES_FULLYCONNECTEDLAYERFIXTURE_H diff --git a/tests/validation/fixtures/GEMMFixture.h b/tests/validation/fixtures/GEMMFixture.h index f1e0ee9150f5fd43737b191dc04cf479bba43955..afde3d80679d938e99d61387426ab9cad7366fed 100644 --- a/tests/validation/fixtures/GEMMFixture.h +++ b/tests/validation/fixtures/GEMMFixture.h @@ -21,14 +21,12 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
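All of the include-guard renames in these fixture headers follow the same convention: ACL_ plus the upper-cased file path plus _H, with a line comment rather than the old block comment on the closing #endif. For example:

    #ifndef ACL_TESTS_VALIDATION_FIXTURES_FULLYCONNECTEDLAYERFIXTURE_H
    #define ACL_TESTS_VALIDATION_FIXTURES_FULLYCONNECTEDLAYERFIXTURE_H

    // ... fixture definitions ...

    #endif // ACL_TESTS_VALIDATION_FIXTURES_FULLYCONNECTEDLAYERFIXTURE_H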
*/ -#ifndef ARM_COMPUTE_TEST_GEMM_FIXTURE -#define ARM_COMPUTE_TEST_GEMM_FIXTURE +#ifndef ACL_TESTS_VALIDATION_FIXTURES_GEMMFIXTURE_H +#define ACL_TESTS_VALIDATION_FIXTURES_GEMMFIXTURE_H #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/experimental/IPostOp.h" -#include "src/core/experimental/PostOpUtils.h" #include "tests/AssetsLibrary.h" #include "tests/Globals.h" #include "tests/IAccessor.h" @@ -38,7 +36,6 @@ #include "tests/validation/reference/ActivationLayer.h" #include "tests/validation/reference/ElementwiseOperations.h" #include "tests/validation/reference/GEMM.h" -#include "tests/validation/reference/PostOps.h" #include @@ -304,8 +301,7 @@ protected: ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs }, { ACL_SRC_2, &bias }, - { ACL_DST, &dst } - }); + { ACL_DST, &dst } }); gemm.run(gemm_pack); return dst; @@ -426,8 +422,7 @@ protected: ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs }, { ACL_SRC_2, &bias }, - { ACL_DST, &dst } - }); + { ACL_DST, &dst } }); gemm.run(gemm_pack); return dst; @@ -580,8 +575,7 @@ protected: ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped }, { ACL_SRC_1, &rhs_reshaped }, { ACL_SRC_2, &bias }, - { ACL_DST, &dst } - }); + { ACL_DST, &dst } }); gemm.run(gemm_pack); return dst; @@ -734,8 +728,7 @@ protected: ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped }, { ACL_SRC_1, &rhs_reshaped }, { ACL_SRC_2, &bias }, - { ACL_DST, &dst } - }); + { ACL_DST, &dst } }); gemm.run(gemm_pack); return dst; @@ -908,8 +901,7 @@ protected: ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped }, { ACL_SRC_1, &rhs_reshaped }, { ACL_SRC_2, &bias }, - { ACL_DST, &dst } - }); + { ACL_DST, &dst } }); gemm.run(gemm_pack); return dst; @@ -960,471 +952,40 @@ protected: SimpleTensor _reference{}; }; -/** (EXPERIMENTAL_POST_OPS)*/ -template -class GEMMMatrixMultiplyReshapedWithPostOpsValidationFixture : public framework::Fixture -{ -public: - using PostOpArgBroadcast = std::tuple; // Instruct fixture if we need broadcasting in dimension 0, 1, 2 of each PostOp argument -public: - void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0, bool interleave_lhs, - bool interleave_rhs, bool export_to_cl_image, DataType data_type, float alpha, float beta, bool broadcast_bias, bool lhs_transpose, const ActivationLayerInfo &act_info, - const experimental::PostOpList &post_ops) - { - GEMMLHSMatrixInfo lhs_info; - lhs_info.m0 = m0; - lhs_info.k0 = k0; - lhs_info.v0 = v0; - lhs_info.interleave = interleave_lhs; - lhs_info.transpose = lhs_transpose; - - GEMMRHSMatrixInfo rhs_info; - rhs_info.n0 = n0; - rhs_info.k0 = k0; - rhs_info.h0 = h0; - rhs_info.interleave = interleave_rhs; - rhs_info.transpose = !lhs_transpose; - rhs_info.export_to_cl_image = export_to_cl_image; - - // Set the tensor shapes for LHS and RHS matrices - const TensorShape lhs_shape(k, m, batch_size); - const TensorShape rhs_shape(n, k, batch_size); - const TensorShape bias_shape(n, - broadcast_bias ? 1 : m, - broadcast_bias ? 1 : batch_size); - auto post_ops_with_shapes = experimental::transform_post_op_list_arguments(post_ops, - [ = ](auto broadcast) - { - return TensorShape - { - std::get<0>(broadcast) ? 1 : n, - std::get<1>(broadcast) ? 1 : m, - std::get<2>(broadcast) ? 
1 : batch_size, - }; - }); - - _target = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes); - if(validate_result) - { - _reference = compute_reference(lhs_shape, rhs_shape, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes); - } - } - -protected: - template - void fill(U &&tensor, int i) - { - static_assert(std::is_floating_point::value || std::is_same::value, "Only floating point data types supported."); - using DistributionType = typename std::conditional::value, arm_compute::utils::uniform_real_distribution_16bit, std::uniform_real_distribution>::type; - - DistributionType distribution{ T(-1.0f), T(1.0f) }; - library->fill(tensor, distribution, i); - - // Fill border with infinity in order to check the presence of NaN values (i.e. inf * 0) - DistributionType distribution_inf{ T(std::numeric_limits::infinity()), T(std::numeric_limits::infinity()) }; - library->fill_borders_with_garbage(tensor, distribution_inf, i); - } - - TensorType compute_target(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const TensorShape &bias_shape, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - DataType data_type, float alpha, float beta, bool broadcast_bias, const ActivationLayerInfo &act_info, const experimental::PostOpList &post_ops) - { - // Create tensors - TensorType lhs = create_tensor(lhs_shape, data_type, 1); - TensorType rhs = create_tensor(rhs_shape, data_type, 1); - TensorType bias = create_tensor(bias_shape, data_type, 1); - - // Create post op tensors and populate post op with them - std::vector post_op_tensors_holder{}; - auto populated_post_ops = experimental::transform_post_op_list_arguments(post_ops, - [&post_op_tensors_holder, &data_type](auto shape) - { - auto t = create_tensor(shape, data_type, 1); - post_op_tensors_holder.push_back(std::move(t)); - return post_op_tensors_holder.back().info(); - }); - TensorType lhs_reshaped; - TensorType rhs_reshaped; - TensorType dst; - - const unsigned int M = lhs_shape[1]; - const unsigned int N = rhs_shape[0]; - const unsigned int K = lhs_shape[0]; - GEMMKernelInfo kernel_info; - kernel_info.m = M; - kernel_info.n = N; - kernel_info.k = K; - kernel_info.depth_output_gemm3d = 0; - kernel_info.reinterpret_input_as_3d = false; - kernel_info.broadcast_bias = broadcast_bias; - kernel_info.activation_info = act_info; - kernel_info.fp_mixed_precision = fp_mixed_precision; - kernel_info.post_ops = populated_post_ops; - - // The output tensor will be auto-initialized within the function - - // Create and configure function - ReshapeLHSOperatorType reshape_lhs; - ReshapeRHSOperatorType reshape_rhs; - GEMMOperatorType gemm; - - validate_result = bool(reshape_rhs.validate(rhs.info(), rhs_reshaped.info(), rhs_info)); - validate_result = validate_result || !rhs_info.export_to_cl_image; - if(!validate_result) - { - return nullptr; - } - - reshape_lhs.configure(lhs.info(), lhs_reshaped.info(), lhs_info); - reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info); - gemm.configure(lhs_reshaped.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info); - - ARM_COMPUTE_ASSERT(lhs.info()->is_resizable()); - ARM_COMPUTE_ASSERT(rhs.info()->is_resizable()); - ARM_COMPUTE_ASSERT(bias.info()->is_resizable()); - for(const auto &tensor : post_op_tensors_holder) - { - ARM_COMPUTE_ASSERT(tensor.info()->is_resizable()); - } - - // We do not pad when using image as it needs to comply to 
strict pitch alignment restrictions - if(!rhs_info.export_to_cl_image) - { - add_padding_x({ &lhs, &rhs, &lhs_reshaped, &rhs_reshaped, &bias, &dst }); - for(auto &tensor : post_op_tensors_holder) - { - add_padding_x({ &tensor }); - } - } - - // Allocate tensors - lhs.allocator()->allocate(); - rhs.allocator()->allocate(); - lhs_reshaped.allocator()->allocate(); - rhs_reshaped.allocator()->allocate(); - bias.allocator()->allocate(); - dst.allocator()->allocate(); - for(auto &tensor : post_op_tensors_holder) - { - tensor.allocator()->allocate(); - } - - ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable()); - ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable()); - ARM_COMPUTE_ASSERT(!bias.info()->is_resizable()); - ARM_COMPUTE_ASSERT(!lhs_reshaped.info()->is_resizable()); - ARM_COMPUTE_ASSERT(!rhs_reshaped.info()->is_resizable()); - ARM_COMPUTE_ASSERT(!dst.info()->is_resizable()); - for(const auto &tensor : post_op_tensors_holder) - { - ARM_COMPUTE_ASSERT(!tensor.info()->is_resizable()); - } - - // Fill tensors - fill(AccessorType(lhs), 0); - fill(AccessorType(rhs), 1); - fill(AccessorType(bias), 2); - for(size_t i = 0; i < post_op_tensors_holder.size(); ++i) - { - fill(AccessorType(post_op_tensors_holder.at(i)), 3 + i); - } - - // Compute GEMM - ITensorPack reshape_lhs_pack = { { ACL_SRC, &lhs }, { ACL_DST, &lhs_reshaped } }; - reshape_lhs.run(reshape_lhs_pack); - ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } }; - reshape_rhs.run(reshape_rhs_pack); - ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped }, - { ACL_SRC_1, &rhs_reshaped }, - { ACL_SRC_2, &bias }, - { ACL_DST, &dst } - }); - for(size_t i = 0; i < post_op_tensors_holder.size(); ++i) - { - gemm_pack.add_tensor(experimental::get_post_op_arg_type(i), &post_op_tensors_holder.at(i)); - } - gemm.run(gemm_pack); - - return dst; - } - - SimpleTensor compute_reference(const TensorShape &lhs_shape, const TensorShape &rhs_shape, DataType data_type, float alpha, float beta, bool broadcast_bias, - const ActivationLayerInfo &act_info, const experimental::PostOpList &post_ops) - { - TensorShape dst_shape = lhs_shape; - dst_shape[0] = rhs_shape[0]; - dst_shape[1] = lhs_shape[1]; - - // Create reference - SimpleTensor lhs{ lhs_shape, data_type, 1 }; - SimpleTensor rhs{ rhs_shape, data_type, 1 }; - SimpleTensor bias{ dst_shape, data_type, 1 }; - // Create post op tensors and populate post op with them - auto populated_post_ops = experimental::transform_post_op_list_arguments>(post_ops, [&data_type](auto shape) - { - return SimpleTensor { shape, data_type, 1 }; - }); - - const int n = rhs_shape[0]; - const int m = lhs_shape[1]; - const int batch_size = lhs_shape[2]; - - // Fill reference - int tensor_idx = 0; - fill(lhs, tensor_idx++); - fill(rhs, tensor_idx++); - fill(bias, tensor_idx++); - for(auto &op : populated_post_ops.get_list()) - { - for(auto tensor : op->arguments()) - { - fill(*tensor, tensor_idx++); - } - } - - if(broadcast_bias) - { - // In case of broadcast, we need to simply copy the first into the following "M" ones - for(int i = 1; i < m * batch_size; i++) - { - memcpy(bias.data() + i * n, bias.data(), n * sizeof(T)); - } - } - - SimpleTensor out; - if(fp_mixed_precision) - { - out = reference::gemm_mixed_precision(lhs, rhs, bias, alpha, beta); - } - else - { - out = reference::gemm(lhs, rhs, bias, alpha, beta); - } - // Ignore activation info if post ops are used instead - if(populated_post_ops.size() > 0) - { - out = reference::post_ops(out, populated_post_ops); - } - else - { - out = 
reference::activation_layer(out, act_info); - } - return out; - } - - bool validate_result = true; - TensorType _target{}; - SimpleTensor _reference{}; -}; - template class GEMMMatrixMultiplyReshaped3DValidationFixture : public framework::Fixture { public: - void setup(unsigned int m_w, unsigned int m_h, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0, - bool interleave_lhs, bool interleave_rhs, bool export_to_cl_image, DataType data_type, float alpha, float beta, bool lhs_transpose, const ActivationLayerInfo &act_info) - { - GEMMLHSMatrixInfo lhs_info; - lhs_info.m0 = m0; - lhs_info.k0 = k0; - lhs_info.v0 = v0; - lhs_info.interleave = interleave_lhs; - lhs_info.transpose = lhs_transpose; - - GEMMRHSMatrixInfo rhs_info; - rhs_info.n0 = n0; - rhs_info.k0 = k0; - rhs_info.h0 = h0; - rhs_info.interleave = interleave_rhs; - rhs_info.transpose = !lhs_transpose; - rhs_info.export_to_cl_image = export_to_cl_image; - - // In case of GEMM3D, m is the product between m_w and m_h - const unsigned int m = m_w * m_h; - - // Set the tensor shapes for LHS and RHS matrices - const TensorShape lhs_shape(k, m, batch_size); - const TensorShape rhs_shape(n, k, batch_size); - const TensorShape bias_shape(n, 1, 1); - - _target = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, alpha, beta, m_h, act_info); - if(validate_result) - { - _reference = compute_reference(lhs_shape, rhs_shape, data_type, alpha, beta, m_h, act_info); - } - } - -protected: - template - void fill(U &&tensor, int i) - { - static_assert(std::is_floating_point::value || std::is_same::value, "Only floating point data types supported."); - using DistributionType = typename std::conditional::value, arm_compute::utils::uniform_real_distribution_16bit, std::uniform_real_distribution>::type; - - DistributionType distribution{ T(-1.0f), T(1.0f) }; - library->fill(tensor, distribution, i); - } - - TensorType compute_target(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const TensorShape &bias_shape, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - DataType data_type, float alpha, float beta, unsigned int m_h, const ActivationLayerInfo &act_info) - { - // Create tensors - TensorType lhs = create_tensor(lhs_shape, data_type, 1); - TensorType rhs = create_tensor(rhs_shape, data_type, 1); - TensorType bias = create_tensor(bias_shape, data_type, 1); - TensorType lhs_reshaped; - TensorType rhs_reshaped; - TensorType dst; - - const unsigned int M = lhs_shape[1]; - const unsigned int N = rhs_shape[0]; - const unsigned int K = lhs_shape[0]; - GEMMKernelInfo kernel_info; - kernel_info.m = M; - kernel_info.n = N; - kernel_info.k = K; - kernel_info.depth_output_gemm3d = m_h; - kernel_info.reinterpret_input_as_3d = false; - kernel_info.broadcast_bias = true; - kernel_info.activation_info = act_info; - kernel_info.fp_mixed_precision = fp_mixed_precision; - - // The output tensor will be auto-initialized within the function - - // Create and configure function - ReshapeLHSOperatorType reshape_lhs; - ReshapeRHSOperatorType reshape_rhs; - GEMMOperatorType gemm; - - validate_result = bool(reshape_rhs.validate(rhs.info(), rhs_reshaped.info(), rhs_info)); - validate_result = validate_result || !rhs_info.export_to_cl_image; - if(!validate_result) - { - return nullptr; - } - - reshape_lhs.configure(lhs.info(), lhs_reshaped.info(), lhs_info); - reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info); - 
gemm.configure(lhs_reshaped.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info); - - ARM_COMPUTE_ASSERT(lhs.info()->is_resizable()); - ARM_COMPUTE_ASSERT(rhs.info()->is_resizable()); - ARM_COMPUTE_ASSERT(bias.info()->is_resizable()); - - // We do not pad when using image as it needs to comply to strict pitch alignment restrictions - if(!rhs_info.export_to_cl_image) - { - add_padding_x({ &lhs, &rhs, &lhs_reshaped, &rhs_reshaped, &bias, &dst }); - } - - // Allocate tensors - lhs.allocator()->allocate(); - rhs.allocator()->allocate(); - lhs_reshaped.allocator()->allocate(); - rhs_reshaped.allocator()->allocate(); - bias.allocator()->allocate(); - dst.allocator()->allocate(); - - ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable()); - ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable()); - ARM_COMPUTE_ASSERT(!lhs_reshaped.info()->is_resizable()); - ARM_COMPUTE_ASSERT(!rhs_reshaped.info()->is_resizable()); - ARM_COMPUTE_ASSERT(!bias.info()->is_resizable()); - ARM_COMPUTE_ASSERT(!dst.info()->is_resizable()); - - // Fill tensors - fill(AccessorType(lhs), 0); - fill(AccessorType(rhs), 1); - fill(AccessorType(bias), 2); - - // Compute GEMM - ITensorPack reshape_lhs_pack = { { ACL_SRC, &lhs }, { ACL_DST, &lhs_reshaped } }; - reshape_lhs.run(reshape_lhs_pack); - ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } }; - reshape_rhs.run(reshape_rhs_pack); - ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped }, - { ACL_SRC_1, &rhs_reshaped }, - { ACL_SRC_2, &bias }, - { ACL_DST, &dst } - }); - gemm.run(gemm_pack); - - return dst; - } - - SimpleTensor compute_reference(const TensorShape &lhs_shape, const TensorShape &rhs_shape, DataType data_type, float alpha, float beta, unsigned int m_h, - const ActivationLayerInfo &act_info) - { - TensorShape dst_shape = lhs_shape; - dst_shape.set(0, rhs_shape[0]); - dst_shape.set(1, lhs_shape[1] / m_h); - dst_shape.set(2, m_h); - dst_shape.set(3, lhs_shape[2]); - - // Create reference - SimpleTensor lhs{ lhs_shape, data_type, 1 }; - SimpleTensor rhs{ rhs_shape, data_type, 1 }; - SimpleTensor bias{ dst_shape, data_type, 1 }; - - const int n = rhs_shape[0]; - const int m = lhs_shape[1]; - const int batch_size = lhs_shape[2]; - - // Fill reference - fill(lhs, 0); - fill(rhs, 1); - fill(bias, 2); - - // In case of broadcast, we need to simply copy the first into the following "M" ones - for(int i = 1; i < m * batch_size; i++) - { - memcpy(bias.data() + i * n, bias.data(), n * sizeof(T)); - } - - if(fp_mixed_precision) - { - return reference::activation_layer(reference::gemm_mixed_precision(lhs, rhs, bias, alpha, beta), act_info); - } - else - { - return reference::activation_layer(reference::gemm(lhs, rhs, bias, alpha, beta), act_info); - } - } - - bool validate_result = true; - TensorType _target{}; - SimpleTensor _reference{}; -}; - -template -class GEMMMatrixMultiplyReshapedOnlyRHSValidationFixture : public framework::Fixture -{ -public: - void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int h0, - bool interleave_rhs, bool transpose_rhs, bool export_to_cl_image, DataType data_type, float alpha, float beta, bool broadcast_bias, const ActivationLayerInfo &act_info) + void setup(unsigned int m_w, unsigned int m_h, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0, + bool interleave_lhs, bool interleave_rhs, bool 
export_to_cl_image, DataType data_type, float alpha, float beta, bool lhs_transpose, const ActivationLayerInfo &act_info) { GEMMLHSMatrixInfo lhs_info; - lhs_info.m0 = m0; - lhs_info.k0 = k0; + lhs_info.m0 = m0; + lhs_info.k0 = k0; + lhs_info.v0 = v0; + lhs_info.interleave = interleave_lhs; + lhs_info.transpose = lhs_transpose; GEMMRHSMatrixInfo rhs_info; rhs_info.n0 = n0; rhs_info.k0 = k0; rhs_info.h0 = h0; rhs_info.interleave = interleave_rhs; - rhs_info.transpose = transpose_rhs; + rhs_info.transpose = !lhs_transpose; rhs_info.export_to_cl_image = export_to_cl_image; + // In case of GEMM3D, m is the product between m_w and m_h + const unsigned int m = m_w * m_h; + // Set the tensor shapes for LHS and RHS matrices const TensorShape lhs_shape(k, m, batch_size); const TensorShape rhs_shape(n, k, batch_size); - const TensorShape bias_shape(n, - broadcast_bias ? 1 : m, - broadcast_bias ? 1 : batch_size); + const TensorShape bias_shape(n, 1, 1); - _target = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, alpha, beta, broadcast_bias, act_info); + _target = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, alpha, beta, m_h, act_info); if(validate_result) { - _reference = compute_reference(lhs_shape, rhs_shape, data_type, alpha, beta, broadcast_bias, act_info); + _reference = compute_reference(lhs_shape, rhs_shape, data_type, alpha, beta, m_h, act_info); } } @@ -1437,19 +998,16 @@ protected: DistributionType distribution{ T(-1.0f), T(1.0f) }; library->fill(tensor, distribution, i); - - // Fill border with infinity in order to check the presence of NaN values (i.e. inf * 0) - DistributionType distribution_inf{ T(std::numeric_limits::infinity()), T(std::numeric_limits::infinity()) }; - library->fill_borders_with_garbage(tensor, distribution_inf, i); } TensorType compute_target(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const TensorShape &bias_shape, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - DataType data_type, float alpha, float beta, bool broadcast_bias, const ActivationLayerInfo &act_info) + DataType data_type, float alpha, float beta, unsigned int m_h, const ActivationLayerInfo &act_info) { // Create tensors TensorType lhs = create_tensor(lhs_shape, data_type, 1); TensorType rhs = create_tensor(rhs_shape, data_type, 1); TensorType bias = create_tensor(bias_shape, data_type, 1); + TensorType lhs_reshaped; TensorType rhs_reshaped; TensorType dst; @@ -1460,14 +1018,16 @@ protected: kernel_info.m = M; kernel_info.n = N; kernel_info.k = K; - kernel_info.depth_output_gemm3d = 0; + kernel_info.depth_output_gemm3d = m_h; kernel_info.reinterpret_input_as_3d = false; - kernel_info.broadcast_bias = broadcast_bias; + kernel_info.broadcast_bias = true; kernel_info.activation_info = act_info; + kernel_info.fp_mixed_precision = fp_mixed_precision; // The output tensor will be auto-initialized within the function // Create and configure function + ReshapeLHSOperatorType reshape_lhs; ReshapeRHSOperatorType reshape_rhs; GEMMOperatorType gemm; @@ -1478,8 +1038,9 @@ protected: return nullptr; } + reshape_lhs.configure(lhs.info(), lhs_reshaped.info(), lhs_info); reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info); - gemm.configure(lhs.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info); + gemm.configure(lhs_reshaped.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info); 
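// Editor's note: a minimal, self-contained sketch (not part of the patch) of the
// bias broadcast done above with memcpy - the first row of N values is replicated
// into every remaining row of the flat (N x M x batches) reference buffer. The
// function name and the use of std::vector here are illustrative only.
#include <cstring>
#include <vector>

static void broadcast_first_row(std::vector<float> &bias, int n, int m, int batches)
{
    // bias is laid out row-major: m * batches rows of n elements each
    for(int i = 1; i < m * batches; ++i)
    {
        std::memcpy(bias.data() + i * n, bias.data(), n * sizeof(float));
    }
}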
ARM_COMPUTE_ASSERT(lhs.info()->is_resizable()); ARM_COMPUTE_ASSERT(rhs.info()->is_resizable()); @@ -1488,18 +1049,20 @@ protected: // We do not pad when using image as it needs to comply to strict pitch alignment restrictions if(!rhs_info.export_to_cl_image) { - add_padding_x({ &lhs, &rhs, &rhs_reshaped, &bias, &dst }); + add_padding_x({ &lhs, &rhs, &lhs_reshaped, &rhs_reshaped, &bias, &dst }); } // Allocate tensors lhs.allocator()->allocate(); rhs.allocator()->allocate(); + lhs_reshaped.allocator()->allocate(); rhs_reshaped.allocator()->allocate(); bias.allocator()->allocate(); dst.allocator()->allocate(); ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable()); ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!lhs_reshaped.info()->is_resizable()); ARM_COMPUTE_ASSERT(!rhs_reshaped.info()->is_resizable()); ARM_COMPUTE_ASSERT(!bias.info()->is_resizable()); ARM_COMPUTE_ASSERT(!dst.info()->is_resizable()); @@ -1510,24 +1073,27 @@ protected: fill(AccessorType(bias), 2); // Compute GEMM + ITensorPack reshape_lhs_pack = { { ACL_SRC, &lhs }, { ACL_DST, &lhs_reshaped } }; + reshape_lhs.run(reshape_lhs_pack); ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } }; reshape_rhs.run(reshape_rhs_pack); - ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, + ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped }, { ACL_SRC_1, &rhs_reshaped }, { ACL_SRC_2, &bias }, - { ACL_DST, &dst } - }); + { ACL_DST, &dst } }); gemm.run(gemm_pack); return dst; } - SimpleTensor compute_reference(const TensorShape &lhs_shape, const TensorShape &rhs_shape, DataType data_type, float alpha, float beta, bool broadcast_bias, + SimpleTensor compute_reference(const TensorShape &lhs_shape, const TensorShape &rhs_shape, DataType data_type, float alpha, float beta, unsigned int m_h, const ActivationLayerInfo &act_info) { TensorShape dst_shape = lhs_shape; - dst_shape[0] = rhs_shape[0]; - dst_shape[1] = lhs_shape[1]; + dst_shape.set(0, rhs_shape[0]); + dst_shape.set(1, lhs_shape[1] / m_h); + dst_shape.set(2, m_h); + dst_shape.set(3, lhs_shape[2]); // Create reference SimpleTensor lhs{ lhs_shape, data_type, 1 }; @@ -1543,16 +1109,20 @@ protected: fill(rhs, 1); fill(bias, 2); - if(broadcast_bias) + // In case of broadcast, we need to simply copy the first into the following "M" ones + for(int i = 1; i < m * batch_size; i++) { - // In case of broadcast, we need to simply copy the first into the following "M" ones - for(int i = 1; i < m * batch_size; i++) - { - memcpy(bias.data() + i * n, bias.data(), n * sizeof(T)); - } + memcpy(bias.data() + i * n, bias.data(), n * sizeof(T)); } - return reference::activation_layer(reference::gemm(lhs, rhs, bias, alpha, beta), act_info); + if(fp_mixed_precision) + { + return reference::activation_layer(reference::gemm_mixed_precision(lhs, rhs, bias, alpha, beta), act_info); + } + else + { + return reference::activation_layer(reference::gemm(lhs, rhs, bias, alpha, beta), act_info); + } } bool validate_result = true; @@ -1560,15 +1130,12 @@ protected: SimpleTensor _reference{}; }; -/** (EXPERIMENTAL_POST_OPS)*/ template -class GEMMMatrixMultiplyReshapedOnlyRHSWithPostOpsValidationFixture : public framework::Fixture +class GEMMMatrixMultiplyReshapedOnlyRHSValidationFixture : public framework::Fixture { public: - using PostOpArgBroadcast = std::tuple; // Instruct fixture if we need broadcasting in dimension 0, 1, 2 of each PostOp argument void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, 
unsigned int h0, - bool interleave_rhs, bool transpose_rhs, bool export_to_cl_image, DataType data_type, float alpha, float beta, bool broadcast_bias, const ActivationLayerInfo &act_info, - const experimental::PostOpList &post_ops) + bool interleave_rhs, bool transpose_rhs, bool export_to_cl_image, DataType data_type, float alpha, float beta, bool broadcast_bias, const ActivationLayerInfo &act_info) { GEMMLHSMatrixInfo lhs_info; lhs_info.m0 = m0; @@ -1588,21 +1155,11 @@ public: const TensorShape bias_shape(n, broadcast_bias ? 1 : m, broadcast_bias ? 1 : batch_size); - auto post_ops_with_shapes = experimental::transform_post_op_list_arguments(post_ops, - [ = ](auto broadcast) - { - return TensorShape - { - std::get<0>(broadcast) ? 1 : n, - std::get<1>(broadcast) ? 1 : m, - std::get<2>(broadcast) ? 1 : batch_size, - }; - }); - _target = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes); + _target = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, alpha, beta, broadcast_bias, act_info); if(validate_result) { - _reference = compute_reference(lhs_shape, rhs_shape, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes); + _reference = compute_reference(lhs_shape, rhs_shape, data_type, alpha, beta, broadcast_bias, act_info); } } @@ -1622,7 +1179,7 @@ protected: } TensorType compute_target(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const TensorShape &bias_shape, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - DataType data_type, float alpha, float beta, bool broadcast_bias, const ActivationLayerInfo &act_info, const experimental::PostOpList &post_ops) + DataType data_type, float alpha, float beta, bool broadcast_bias, const ActivationLayerInfo &act_info) { // Create tensors TensorType lhs = create_tensor(lhs_shape, data_type, 1); @@ -1630,15 +1187,6 @@ protected: TensorType bias = create_tensor(bias_shape, data_type, 1); TensorType rhs_reshaped; TensorType dst; - // Create post op tensors and populate post op with them - std::vector post_op_tensors_holder{}; - auto populated_post_ops = experimental::transform_post_op_list_arguments(post_ops, - [&post_op_tensors_holder, &data_type](auto shape) - { - auto t = create_tensor(shape, data_type, 1); - post_op_tensors_holder.push_back(std::move(t)); - return post_op_tensors_holder.back().info(); - }); const unsigned int M = lhs_shape[1]; const unsigned int N = rhs_shape[0]; @@ -1651,7 +1199,6 @@ protected: kernel_info.reinterpret_input_as_3d = false; kernel_info.broadcast_bias = broadcast_bias; kernel_info.activation_info = act_info; - kernel_info.post_ops = populated_post_ops; // The output tensor will be auto-initialized within the function @@ -1672,19 +1219,11 @@ protected: ARM_COMPUTE_ASSERT(lhs.info()->is_resizable()); ARM_COMPUTE_ASSERT(rhs.info()->is_resizable()); ARM_COMPUTE_ASSERT(bias.info()->is_resizable()); - for(const auto &tensor : post_op_tensors_holder) - { - ARM_COMPUTE_ASSERT(tensor.info()->is_resizable()); - } // We do not pad when using image as it needs to comply to strict pitch alignment restrictions if(!rhs_info.export_to_cl_image) { add_padding_x({ &lhs, &rhs, &rhs_reshaped, &bias, &dst }); - for(auto &tensor : post_op_tensors_holder) - { - add_padding_x({ &tensor }); - } } // Allocate tensors @@ -1693,29 +1232,17 @@ protected: rhs_reshaped.allocator()->allocate(); bias.allocator()->allocate(); dst.allocator()->allocate(); - for(auto &tensor : 
post_op_tensors_holder) - { - tensor.allocator()->allocate(); - } ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable()); ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable()); ARM_COMPUTE_ASSERT(!rhs_reshaped.info()->is_resizable()); ARM_COMPUTE_ASSERT(!bias.info()->is_resizable()); ARM_COMPUTE_ASSERT(!dst.info()->is_resizable()); - for(const auto &tensor : post_op_tensors_holder) - { - ARM_COMPUTE_ASSERT(!tensor.info()->is_resizable()); - } // Fill tensors fill(AccessorType(lhs), 0); fill(AccessorType(rhs), 1); fill(AccessorType(bias), 2); - for(size_t i = 0; i < post_op_tensors_holder.size(); ++i) - { - fill(AccessorType(post_op_tensors_holder.at(i)), 3 + i); - } // Compute GEMM ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } }; @@ -1723,19 +1250,14 @@ protected: ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs_reshaped }, { ACL_SRC_2, &bias }, - { ACL_DST, &dst } - }); - for(size_t i = 0; i < post_op_tensors_holder.size(); ++i) - { - gemm_pack.add_tensor(experimental::get_post_op_arg_type(i), &post_op_tensors_holder.at(i)); - } + { ACL_DST, &dst } }); gemm.run(gemm_pack); return dst; } SimpleTensor compute_reference(const TensorShape &lhs_shape, const TensorShape &rhs_shape, DataType data_type, float alpha, float beta, bool broadcast_bias, - const ActivationLayerInfo &act_info, const experimental::PostOpList &post_ops) + const ActivationLayerInfo &act_info) { TensorShape dst_shape = lhs_shape; dst_shape[0] = rhs_shape[0]; @@ -1745,28 +1267,15 @@ protected: SimpleTensor lhs{ lhs_shape, data_type, 1 }; SimpleTensor rhs{ rhs_shape, data_type, 1 }; SimpleTensor bias{ dst_shape, data_type, 1 }; - // Create post op tensors and populate post op with them - auto populated_post_ops = experimental::transform_post_op_list_arguments>(post_ops, [&data_type](auto shape) - { - return SimpleTensor { shape, data_type, 1 }; - }); const int n = rhs_shape[0]; const int m = lhs_shape[1]; const int batch_size = lhs_shape[2]; // Fill reference - int tensor_idx = 0; - fill(lhs, tensor_idx++); - fill(rhs, tensor_idx++); - fill(bias, tensor_idx++); - for(auto &op : populated_post_ops.get_list()) - { - for(auto tensor : op->arguments()) - { - fill(*tensor, tensor_idx++); - } - } + fill(lhs, 0); + fill(rhs, 1); + fill(bias, 2); if(broadcast_bias) { @@ -1777,18 +1286,7 @@ protected: } } - SimpleTensor out; - out = reference::gemm(lhs, rhs, bias, alpha, beta); - // Ignore activation info if post ops are used instead - if(populated_post_ops.size() > 0) - { - out = reference::post_ops(out, populated_post_ops); - } - else - { - out = reference::activation_layer(out, act_info); - } - return out; + return reference::activation_layer(reference::gemm(lhs, rhs, bias, alpha, beta), act_info); } bool validate_result = true; @@ -1921,8 +1419,7 @@ protected: ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs_reshaped }, { ACL_SRC_2, &bias }, - { ACL_DST, &dst } - }); + { ACL_DST, &dst } }); gemm.run(gemm_pack); return dst; @@ -2057,8 +1554,7 @@ protected: ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs }, { ACL_SRC_2, &bias }, - { ACL_DST, &dst } - }); + { ACL_DST, &dst } }); gemm.run(gemm_pack); return dst; @@ -2101,212 +1597,6 @@ protected: SimpleTensor _reference{}; }; -template -class GEMMMatrixMultiplyNativeWithPostOpsValidationFixture : public framework::Fixture -{ -public: - using PostOpArgBroadcast = std::tuple; // Instruct fixture if we need broadcasting in dimension 0, 1, 2 of each PostOp argument -public: - void setup(unsigned int m, unsigned int n, unsigned 
int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, DataType data_type, float alpha, float beta, bool broadcast_bias, - const ActivationLayerInfo &act_info, const experimental::PostOpList &post_ops) - { - GEMMLHSMatrixInfo lhs_info; - lhs_info.m0 = m0; - lhs_info.k0 = k0; - - GEMMRHSMatrixInfo rhs_info; - rhs_info.n0 = n0; - rhs_info.k0 = k0; - - // Set the tensor shapes for LHS and RHS matrices - const TensorShape lhs_shape(k, m, batch_size); - const TensorShape rhs_shape(n, k, batch_size); - const TensorShape bias_shape(n, - broadcast_bias ? 1 : m, - broadcast_bias ? 1 : batch_size); - const auto post_ops_with_shapes = experimental::transform_post_op_list_arguments(post_ops, - [ = ](auto broadcast) - { - return TensorShape - { - std::get<0>(broadcast) ? 1 : n, - std::get<1>(broadcast) ? 1 : m, - std::get<2>(broadcast) ? 1 : batch_size, - }; - }); - - _target = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes); - _reference = compute_reference(lhs_shape, rhs_shape, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes); - } - -protected: - template - void fill(U &&tensor, int i) - { - static_assert(std::is_floating_point::value || std::is_same::value, "Only floating point data types supported."); - using DistributionType = typename std::conditional::value, arm_compute::utils::uniform_real_distribution_16bit, std::uniform_real_distribution>::type; - - DistributionType distribution{ T(-1.0f), T(1.0f) }; - library->fill(tensor, distribution, i); - - // Fill border with infinity in order to check the presence of NaN values (i.e. inf * 0) - DistributionType distribution_inf{ T(std::numeric_limits::infinity()), T(std::numeric_limits::infinity()) }; - library->fill_borders_with_garbage(tensor, distribution_inf, i); - } - - TensorType compute_target(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const TensorShape &bias_shape, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - DataType data_type, float alpha, float beta, bool broadcast_bias, const ActivationLayerInfo &act_info, const experimental::PostOpList &post_ops) - { - // Create tensors - TensorType lhs = create_tensor(lhs_shape, data_type, 1); - TensorType rhs = create_tensor(rhs_shape, data_type, 1); - TensorType bias = create_tensor(bias_shape, data_type, 1); - TensorType dst; - // Create post op tensors and populate post op with them - std::vector post_op_tensors_holder{}; - auto populated_post_ops = experimental::transform_post_op_list_arguments(post_ops, - [&post_op_tensors_holder, &data_type](auto shape) - { - auto t = create_tensor(shape, data_type, 1); - post_op_tensors_holder.push_back(std::move(t)); - return post_op_tensors_holder.back().info(); - }); - - const unsigned int M = lhs_shape[1]; - const unsigned int N = rhs_shape[0]; - const unsigned int K = lhs_shape[0]; - GEMMKernelInfo kernel_info; - kernel_info.m = M; - kernel_info.n = N; - kernel_info.k = K; - kernel_info.depth_output_gemm3d = 0; - kernel_info.reinterpret_input_as_3d = false; - kernel_info.broadcast_bias = broadcast_bias; - kernel_info.activation_info = act_info; - kernel_info.post_ops = populated_post_ops; - - // Create and configure function - GEMMOperatorType gemm; - gemm.configure(lhs.info(), rhs.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info); - - ARM_COMPUTE_ASSERT(lhs.info()->is_resizable()); - ARM_COMPUTE_ASSERT(rhs.info()->is_resizable()); - 
ARM_COMPUTE_ASSERT(bias.info()->is_resizable()); - for(const auto &tensor : post_op_tensors_holder) - { - ARM_COMPUTE_ASSERT(tensor.info()->is_resizable()); - } - - add_padding_x({ &lhs, &rhs, &bias, &dst }); - for(auto &tensor : post_op_tensors_holder) - { - add_padding_x({ &tensor }); - } - - // Allocate tensors - lhs.allocator()->allocate(); - rhs.allocator()->allocate(); - bias.allocator()->allocate(); - dst.allocator()->allocate(); - for(auto &tensor : post_op_tensors_holder) - { - tensor.allocator()->allocate(); - } - - ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable()); - ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable()); - ARM_COMPUTE_ASSERT(!bias.info()->is_resizable()); - ARM_COMPUTE_ASSERT(!dst.info()->is_resizable()); - for(const auto &tensor : post_op_tensors_holder) - { - ARM_COMPUTE_ASSERT(!tensor.info()->is_resizable()); - } - - // Fill tensors - fill(AccessorType(lhs), 0); - fill(AccessorType(rhs), 1); - fill(AccessorType(bias), 2); - for(size_t i = 0; i < post_op_tensors_holder.size(); ++i) - { - fill(AccessorType(post_op_tensors_holder.at(i)), 3 + i); - } - - // Compute GEMM - ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, - { ACL_SRC_1, &rhs }, - { ACL_SRC_2, &bias }, - { ACL_DST, &dst } - }); - for(size_t i = 0; i < post_op_tensors_holder.size(); ++i) - { - gemm_pack.add_tensor(experimental::get_post_op_arg_type(i), &post_op_tensors_holder.at(i)); - } - gemm.run(gemm_pack); - - return dst; - } - - SimpleTensor compute_reference(const TensorShape &lhs_shape, const TensorShape &rhs_shape, DataType data_type, float alpha, float beta, bool broadcast_bias, - const ActivationLayerInfo &act_info, const experimental::PostOpList &post_ops) - { - TensorShape dst_shape = lhs_shape; - dst_shape[0] = rhs_shape[0]; - dst_shape[1] = lhs_shape[1]; - - // Create reference - SimpleTensor lhs{ lhs_shape, data_type, 1 }; - SimpleTensor rhs{ rhs_shape, data_type, 1 }; - SimpleTensor bias{ dst_shape, data_type, 1 }; - // Create post op tensors and populate post op with them - auto populated_post_ops = experimental::transform_post_op_list_arguments>(post_ops, [&data_type](auto shape) - { - return SimpleTensor { shape, data_type, 1 }; - }); - - const int n = rhs_shape[0]; - const int m = lhs_shape[1]; - const int batch_size = lhs_shape[2]; - - // Fill reference - int tensor_idx = 0; - fill(lhs, tensor_idx++); - fill(rhs, tensor_idx++); - fill(bias, tensor_idx++); - for(auto &op : populated_post_ops.get_list()) - { - for(auto tensor : op->arguments()) - { - fill(*tensor, tensor_idx++); - } - } - - if(broadcast_bias) - { - // In case of broadcast, we need to simply copy the first into the following "M" ones - for(int i = 1; i < m * batch_size; i++) - { - memcpy(bias.data() + i * n, bias.data(), n * sizeof(T)); - } - } - - SimpleTensor out; - out = reference::gemm(lhs, rhs, bias, alpha, beta); - // Ignore activation info if post ops are used instead - if(populated_post_ops.size() > 0) - { - out = reference::post_ops(out, populated_post_ops); - } - else - { - out = reference::activation_layer(out, act_info); - } - return out; - } - - TensorType _target{}; - SimpleTensor _reference{}; -}; - template class GEMMMatrixMultiplyNative3DValidationFixture : public framework::Fixture { @@ -2398,8 +1688,7 @@ protected: ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs }, { ACL_SRC_2, &bias }, - { ACL_DST, &dst } - }); + { ACL_DST, &dst } }); gemm.run(gemm_pack); return dst; @@ -2557,8 +1846,7 @@ protected: ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs_reshaped }, { ACL_SRC_2, &bias 
}, - { ACL_DST, &dst } - }); + { ACL_DST, &dst } }); gemm.run(gemm_pack); return dst; @@ -2608,4 +1896,4 @@ protected: } // namespace validation } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_GEMM_FIXTURE */ +#endif // ACL_TESTS_VALIDATION_FIXTURES_GEMMFIXTURE_H diff --git a/tests/validation/fixtures/MatMulFixture.h b/tests/validation/fixtures/MatMulFixture.h index e6b28f5766fe0cdd894d53a7602c6e635340beeb..2e79612a3773a40211a74af0b2874d8c2d1c0cc0 100644 --- a/tests/validation/fixtures/MatMulFixture.h +++ b/tests/validation/fixtures/MatMulFixture.h @@ -21,14 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ACL_TESTS_VALIDATION_FIXTURES_MATMULFIXTURE -#define ACL_TESTS_VALIDATION_FIXTURES_MATMULFIXTURE +#ifndef ACL_TESTS_VALIDATION_FIXTURES_MATMULFIXTURE_H +#define ACL_TESTS_VALIDATION_FIXTURES_MATMULFIXTURE_H #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "src/core/utils/quantization/AsymmHelpers.h" +#include "tests/framework/Asserts.h" // Required for ARM_COMPUTE_ASSERT #include "tests/framework/Fixture.h" +#include "tests/validation/Validation.h" #include "tests/validation/reference/ActivationLayer.h" #include "tests/validation/reference/GEMM.h" #include "tests/validation/reference/GEMMLowp.h" @@ -338,4 +340,4 @@ public: } // namespace validation } // namespace test } // namespace arm_compute -#endif /* ACL_TESTS_VALIDATION_FIXTURES_MATMULFIXTURE */ +#endif // ACL_TESTS_VALIDATION_FIXTURES_MATMULFIXTURE_H diff --git a/tests/validation/fixtures/MatMulKernelFixture.h b/tests/validation/fixtures/MatMulKernelFixture.h index 91ac77d5af0487a58e07a51b9851a31973a61678..26072dff65637c2faee204bfa56811a9717b11c7 100644 --- a/tests/validation/fixtures/MatMulKernelFixture.h +++ b/tests/validation/fixtures/MatMulKernelFixture.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ACL_TESTS_VALIDATION_FIXTURES_MATMULKERNELFIXTURE -#define ACL_TESTS_VALIDATION_FIXTURES_MATMULKERNELFIXTURE +#ifndef ACL_TESTS_VALIDATION_FIXTURES_MATMULKERNELFIXTURE_H +#define ACL_TESTS_VALIDATION_FIXTURES_MATMULKERNELFIXTURE_H #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Utils.h" @@ -30,8 +30,10 @@ #include "tests/CL/CLAccessor.h" #include "tests/CL/Helper.h" +#include "tests/framework/Asserts.h" // Required for ARM_COMPUTE_ASSERT #include "tests/framework/Fixture.h" #include "tests/validation/Helpers.h" +#include "tests/validation/Validation.h" #include "tests/validation/reference/GEMM.h" #include "tests/validation/reference/GEMMLowp.h" #include "tests/validation/reference/Permute.h" @@ -54,6 +56,12 @@ public: void setup(TensorShape shape_a, TensorShape shape_b, TensorShape output_shape, bool pretranspose_a, bool pretranspose_b, int M0, int N0, int K0, bool export_rhs_to_cl_image, DataType data_type, bool enable_bias) { + // This hash is used by random generators. There may be hash collisions but + // this is intentional as it's a very easy way to make the the current + // random generation process almost different for many test configurations, + // which were using the same set of values before. 
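// Editor's note: a small standalone sketch (not part of the patch) of the seeding
// scheme introduced above - a cheap hash of the test parameters is added to the
// library's base seed so that different test configurations stop drawing identical
// random sequences; occasional hash collisions are acceptable. The helper name and
// its parameter list are illustrative only.
#include <cstdint>
#include <random>

static std::mt19937 make_config_generator(uint32_t base_seed, int m0, int n0, int k0,
                                           int width_a, int height_a, int width_b, int height_b)
{
    // Mirrors the idea of _hash = M0 + N0 + K0 + shape dims + flags in the fixture
    const uint32_t hash = static_cast<uint32_t>(m0 + n0 + k0 + width_a + height_a + width_b + height_b);
    return std::mt19937(base_seed + hash);
}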
+ _hash = M0 + N0 + K0 + shape_a[0] + shape_a[1] + shape_b[0] + shape_b[1] + enable_bias + export_rhs_to_cl_image; + // Flag to create a bias _enable_bias = enable_bias; @@ -67,7 +75,7 @@ public: const int32_t t_max = static_cast(std::numeric_limits::max()); const int32_t t_min = static_cast(std::numeric_limits::min()); - std::mt19937 generator(library->seed()); + std::mt19937 generator(library->seed() + _hash); std::uniform_real_distribution distribution_float(-5.0f, 3.0f); std::uniform_int_distribution distribution_t(t_min, t_max); @@ -84,7 +92,12 @@ public: const int n = shape_b.x(); const int k = shape_a.x(); - dst_q_info = calculate_mat_mul_dst_q_info(lhs_q_info, rhs_q_info, m, n, k, data_type); + const float bias_fraction = enable_bias ? 0.5f : 0.f; + + QuantizationHint q_hint = suggest_matmul_dst_q_info_and_bias(lhs_q_info, rhs_q_info, m, n, k, data_type, bias_fraction); + dst_q_info = q_hint.q_info; + _min_bias = q_hint.bias_min; + _max_bias = q_hint.bias_max; } if(pretranspose_a) @@ -142,12 +155,9 @@ protected: } template - void fill_bias_s32(U &&tensor, int i, const UniformQuantizationInfo &q_info) + void fill_bias_s32(U &&tensor, int i, int32_t min, int32_t max) { - // For quantized cases, fill the S32 bias according to the following to avoid saturation of test cases. - // The following code limits size of bias values to within expected range of output quantization. - const unsigned int bound = std::abs(q_info.scale * 256); // 256 is size of 8 bit datatype - std::uniform_int_distribution distribution(-(bound / 10), (bound / 10)); + std::uniform_int_distribution distribution(min, max); library->fill(tensor, distribution, i); } @@ -192,8 +202,8 @@ protected: ARM_COMPUTE_ASSERT(!dst.info()->is_resizable()); // Fill tensors - fill(CLAccessor(a), 0); - fill(CLAccessor(b), 1); + fill(CLAccessor(a), _hash + 1); + fill(CLAccessor(b), _hash + 2); // Compute matMul kernel ITensorPack tensors_pack({ { ACL_SRC_0, &a }, @@ -207,11 +217,11 @@ protected: bias.allocator()->allocate(); if(is_quantized) { - fill_bias_s32(CLAccessor(bias), 2, dst_q_info.uniform()); + fill_bias_s32(CLAccessor(bias), _hash + 3, _min_bias, _max_bias); } else { - fill(CLAccessor(bias), 2); + fill(CLAccessor(bias), _hash + 3); } tensors_pack.add_tensor(ACL_SRC_2, &bias); } @@ -236,8 +246,8 @@ protected: SimpleTensor c{ output_shape_collapsed, data_type, 1, dst_q_info }; // Fill reference - fill(a, 0); - fill(b, 1); + fill(a, _hash + 1); + fill(b, _hash + 2); /* Note: Assuming the usual batch matmul dimensions A = (B x M x K), B = (B x K x N), if pretranspose_A is set to true, then A is assumed to be (B x K x M), therefore, A must be pre-transposed before passing it to the fixture. 
And, we transpose A again in the fixture to make it (B x M x K) @@ -288,7 +298,7 @@ protected: // of bias tensor from shape [dst.dimension(0)] to [dst.tensor_shape()] in target kernel if(_enable_bias) { - fill(c, 2); + fill(c, _hash + 3); const int n = c.shape().x(); const int other_dims = c.shape().collapsed_from(1)[1]; for(int i = 1; i < other_dims; ++i) // For all data, copy first n elements into remaining batches @@ -323,7 +333,7 @@ protected: if(_enable_bias) { // Identical to float implementation, fill and copy values of bias first dimension - fill_bias_s32(bias, 2, cq); + fill_bias_s32(bias, _hash + 3, _min_bias, _max_bias); const int n = bias.shape().x(); const int other_dims = bias.shape().collapsed_from(1)[1]; const unsigned int dt_size = sizeof(int32_t); @@ -348,6 +358,9 @@ protected: bool _enable_bias{ false }; bool _device_supports_export_to_cl_image{ true }; bool _device_supports_mmul{ true }; + int32_t _min_bias{ 0 }; + int32_t _max_bias{ 0 }; + int32_t _hash{ 0 }; }; template @@ -374,4 +387,4 @@ public: } // namespace validation } // namespace test } // namespace arm_compute -#endif /* ACL_TESTS_VALIDATION_FIXTURES_MATMULKERNELFIXTURE */ +#endif // ACL_TESTS_VALIDATION_FIXTURES_MATMULKERNELFIXTURE_H diff --git a/tests/validation/fixtures/ReshapeLayerFixture.h b/tests/validation/fixtures/ReshapeLayerFixture.h index b4c3a9fa1b82ea748ab591022d876d7757e7f350..5be431f8cf4b1794a8cf2550ec7aeb60f00fd483 100644 --- a/tests/validation/fixtures/ReshapeLayerFixture.h +++ b/tests/validation/fixtures/ReshapeLayerFixture.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_TEST_RESHAPE_LAYER_FIXTURE -#define ARM_COMPUTE_TEST_RESHAPE_LAYER_FIXTURE +#ifndef ACL_TESTS_VALIDATION_FIXTURES_RESHAPELAYERFIXTURE_H +#define ACL_TESTS_VALIDATION_FIXTURES_RESHAPELAYERFIXTURE_H #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/Types.h" @@ -31,6 +31,7 @@ #include "tests/IAccessor.h" #include "tests/framework/Asserts.h" #include "tests/framework/Fixture.h" +#include "tests/validation/Helpers.h" #include "tests/validation/reference/ReshapeLayer.h" namespace arm_compute @@ -41,12 +42,12 @@ namespace validation { /** [ReshapeLayer fixture] **/ template -class ReshapeLayerValidationFixture : public framework::Fixture +class ReshapeLayerGenericValidationFixture : public framework::Fixture { public: - void setup(TensorShape input_shape, TensorShape output_shape, DataType data_type) + void setup(TensorShape input_shape, TensorShape output_shape, DataType data_type, bool add_x_padding = false) { - _target = compute_target(input_shape, output_shape, data_type); + _target = compute_target(input_shape, output_shape, data_type, add_x_padding); _reference = compute_reference(input_shape, output_shape, data_type); } @@ -57,7 +58,7 @@ protected: library->fill_tensor_uniform(tensor, i); } - TensorType compute_target(const TensorShape &input_shape, const TensorShape &output_shape, DataType data_type) + TensorType compute_target(const TensorShape &input_shape, const TensorShape &output_shape, DataType data_type, bool add_x_padding = false) { // Check if indeed the input shape can be reshape to the output one ARM_COMPUTE_ASSERT(input_shape.total_size() == output_shape.total_size()); @@ -74,6 +75,12 @@ protected: ARM_COMPUTE_ASSERT(src.info()->is_resizable()); ARM_COMPUTE_ASSERT(dst.info()->is_resizable()); + if(add_x_padding) + { + // Add random padding in x dimension + add_padding_x({ &src, &dst }); + } + // Allocate 
tensors src.allocator()->allocate(); dst.allocator()->allocate(); @@ -104,8 +111,27 @@ protected: TensorType _target{}; SimpleTensor _reference{}; }; + +template +class ReshapeLayerValidationFixture : public ReshapeLayerGenericValidationFixture +{ +public: + void setup(TensorShape input_shape, TensorShape output_shape, DataType data_type) + { + ReshapeLayerGenericValidationFixture::setup(input_shape, output_shape, data_type); + } +}; +template +class ReshapeLayerPaddedValidationFixture : public ReshapeLayerGenericValidationFixture +{ +public: + void setup(TensorShape input_shape, TensorShape output_shape, DataType data_type) + { + ReshapeLayerGenericValidationFixture::setup(input_shape, output_shape, data_type, true /* add_x_padding */); + } +}; /** [ReshapeLayer fixture] **/ } // namespace validation } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_RESHAPE_LAYER_FIXTURE */ +#endif // ACL_TESTS_VALIDATION_FIXTURES_RESHAPELAYERFIXTURE_H diff --git a/tests/validation/fixtures/ReverseFixture.h b/tests/validation/fixtures/ReverseFixture.h index 509fd93abf3a164790d9b6e64be5734381f601ae..856bff7b1212e38c0486837fc160a359f339130f 100644 --- a/tests/validation/fixtures/ReverseFixture.h +++ b/tests/validation/fixtures/ReverseFixture.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_TEST_REVERSE_FIXTURE -#define ARM_COMPUTE_TEST_REVERSE_FIXTURE +#ifndef ACL_TESTS_VALIDATION_FIXTURES_REVERSEFIXTURE_H +#define ACL_TESTS_VALIDATION_FIXTURES_REVERSEFIXTURE_H #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorShape.h" @@ -45,10 +45,11 @@ template fill_tensor_uniform(tensor, 0); } - std::vector generate_random_axis() + std::vector generate_random_axis(bool use_negative = false) { - std::vector axis_v = { 0, 1, 2, 3 }; - std::mt19937 g(0); + std::vector axis_v; + if(use_negative) + { + axis_v = { -1, -2, -3, -4 }; + } + else + { + axis_v = { 0, 1, 2, 3 }; + } + axis_v = std::vector(axis_v.begin(), axis_v.begin() + _num_dims); + std::mt19937 g(library->seed()); std::shuffle(axis_v.begin(), axis_v.end(), g); return axis_v; } - TensorType compute_target(const TensorShape &shape, const TensorShape &axis_shape, DataType data_type) + TensorType compute_target(const TensorShape &shape, const TensorShape &axis_shape, DataType data_type, bool use_negative_axis, bool use_inverted_axis = false) { // Create tensors TensorType src = create_tensor(shape, data_type, 1); @@ -75,7 +85,7 @@ protected: // Create and configure function FunctionType reverse_func; - reverse_func.configure(&src, &dst, &axis); + reverse_func.configure(&src, &dst, &axis, use_inverted_axis); ARM_COMPUTE_ASSERT(src.info()->is_resizable()); ARM_COMPUTE_ASSERT(axis.info()->is_resizable()); @@ -94,8 +104,8 @@ protected: fill(AccessorType(src)); { auto axis_data = AccessorType(axis); - auto axis_v = generate_random_axis(); - std::copy(axis_v.begin(), axis_v.begin() + axis_shape.x(), static_cast(axis_data.data())); + auto axis_v = generate_random_axis(use_negative_axis); + std::copy(axis_v.begin(), axis_v.begin() + axis_shape.total_size(), static_cast(axis_data.data())); } // Compute function @@ -104,24 +114,25 @@ protected: return dst; } - SimpleTensor compute_reference(const TensorShape &shape, const TensorShape &axis_shape, DataType data_type) + SimpleTensor compute_reference(const TensorShape &shape, const TensorShape &axis_shape, DataType data_type, bool use_negative_axis, bool use_inverted_axis = false) { // Create 
reference - SimpleTensor src{ shape, data_type }; - SimpleTensor axis{ axis_shape, DataType::U32 }; + SimpleTensor src{ shape, data_type }; + SimpleTensor axis{ axis_shape, DataType::S32 }; // Fill reference fill(src); - auto axis_v = generate_random_axis(); - std::copy(axis_v.begin(), axis_v.begin() + axis_shape.x(), axis.data()); + auto axis_v = generate_random_axis(use_negative_axis); + std::copy(axis_v.begin(), axis_v.begin() + axis_shape.total_size(), axis.data()); - return reference::reverse(src, axis); + return reference::reverse(src, axis, use_inverted_axis); } TensorType _target{}; SimpleTensor _reference{}; + unsigned int _num_dims{}; }; } // namespace validation } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_REVERSE_FIXTURE */ +#endif // ACL_TESTS_VALIDATION_FIXTURES_REVERSEFIXTURE_H diff --git a/tests/validation/fixtures/ScaleFixture.h b/tests/validation/fixtures/ScaleFixture.h index b00a86f6dce60405dcd3cc23efc1ee3ae2efa82a..86d89d71f7f01860c6d4abfd28789cb279fb7e00 100644 --- a/tests/validation/fixtures/ScaleFixture.h +++ b/tests/validation/fixtures/ScaleFixture.h @@ -21,9 +21,10 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_TEST_SCALE_FIXTURE -#define ARM_COMPUTE_TEST_SCALE_FIXTURE +#ifndef ACL_TESTS_VALIDATION_FIXTURES_SCALEFIXTURE_H +#define ACL_TESTS_VALIDATION_FIXTURES_SCALEFIXTURE_H +#include "tests/framework/Asserts.h" // Required for ARM_COMPUTE_ASSERT #include "tests/framework/Fixture.h" #include "tests/validation/reference/Permute.h" #include "tests/validation/reference/Scale.h" @@ -266,4 +267,4 @@ public: } // namespace validation } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_SCALE_FIXTURE */ +#endif // ACL_TESTS_VALIDATION_FIXTURES_SCALEFIXTURE_H diff --git a/tests/validation/fixtures/StackLayerFixture.h b/tests/validation/fixtures/StackLayerFixture.h index 7320a032bdfb3f22b0285f506074d2bae3578d96..7dd8fe47dcc4d2aabd1c5c2a7d0c9f7d2f5dffe7 100644 --- a/tests/validation/fixtures/StackLayerFixture.h +++ b/tests/validation/fixtures/StackLayerFixture.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_TEST_STACK_LAYER_FIXTURE -#define ARM_COMPUTE_TEST_STACK_LAYER_FIXTURE +#ifndef ACL_TESTS_VALIDATION_FIXTURES_STACKLAYERFIXTURE_H +#define ACL_TESTS_VALIDATION_FIXTURES_STACKLAYERFIXTURE_H #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorShape.h" @@ -54,7 +54,7 @@ class StackLayerValidationFixture : public framework::Fixture public: void setup(TensorShape shape_src, int axis, DataType data_type, int num_tensors) { - _target = compute_target(shape_src, axis, data_type, num_tensors); + _target = compute_target(shape_src, axis, data_type, num_tensors, false /* add_x_padding */); _reference = compute_reference(shape_src, axis, data_type, num_tensors); } @@ -65,7 +65,7 @@ protected: library->fill_tensor_uniform(tensor, i); } - TensorType compute_target(TensorShape shape_src, int axis, DataType data_type, int num_tensors) + TensorType compute_target(TensorShape shape_src, int axis, DataType data_type, int num_tensors, bool add_x_padding) { std::vector tensors(num_tensors); std::vector src(num_tensors); @@ -90,6 +90,11 @@ protected: // Allocate and fill the input tensors for(int i = 0; i < num_tensors; ++i) { + if(add_x_padding) + { + add_padding_x({&tensors[i]}, DataLayout::NHWC); + } + ARM_COMPUTE_ASSERT(tensors[i].info()->is_resizable()); tensors[i].allocator()->allocate(); ARM_COMPUTE_ASSERT(!tensors[i].info()->is_resizable()); @@ -98,6 +103,11 @@ protected: fill(AccessorType(tensors[i]), i); } + if(add_x_padding) + { + add_padding_x({&dst}, DataLayout::NHWC); + } + // Allocate output tensor dst.allocator()->allocate(); @@ -131,7 +141,21 @@ protected: TensorType _target{}; SimpleTensor _reference{}; }; + +template +class StackLayerWithPaddingValidationFixture : + public StackLayerValidationFixture +{ +public: + using Parent = StackLayerValidationFixture; + + void setup(TensorShape shape_src, int axis, DataType data_type, int num_tensors) + { + Parent::_target = Parent::compute_target(shape_src, axis, data_type, num_tensors, true /* add_x_padding */); + Parent::_reference = Parent::compute_reference(shape_src, axis, data_type, num_tensors); + } +}; } // namespace validation } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_STACK_LAYER_FIXTURE */ +#endif // ACL_TESTS_VALIDATION_FIXTURES_STACKLAYERFIXTURE_H diff --git a/tests/validation/fixtures/TransposeFixture.h b/tests/validation/fixtures/TransposeFixture.h index 92eb9af0c1618d89a5c8b914b38c649baf977eb2..212c76cc9afb3475bb96980fb0a1969dafcb884c 100644 --- a/tests/validation/fixtures/TransposeFixture.h +++ b/tests/validation/fixtures/TransposeFixture.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_TEST_TRANSPOSE_FIXTURE -#define ARM_COMPUTE_TEST_TRANSPOSE_FIXTURE +#ifndef ACL_TESTS_VALIDATION_FIXTURES_TRANSPOSEFIXTURE_H +#define ACL_TESTS_VALIDATION_FIXTURES_TRANSPOSEFIXTURE_H #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/Types.h" @@ -32,7 +32,7 @@ #include "tests/IAccessor.h" #include "tests/framework/Asserts.h" #include "tests/framework/Fixture.h" -#include "tests/validation/reference/Transpose.h" +#include "tests/validation/reference/Permute.h" namespace arm_compute { @@ -97,7 +97,7 @@ protected: // Fill reference fill(src); - return reference::transpose(src); + return reference::permute(src, PermutationVector(1U, 0U)); } TensorType _target{}; @@ -106,4 +106,4 @@ protected: } // namespace validation } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_TRANSPOSE_FIXTURE */ +#endif // ACL_TESTS_VALIDATION_FIXTURES_TRANSPOSEFIXTURE_H diff --git a/tests/validation/fixtures/dynamic_fusion/gpu/cl/MatMulKernelFixture.h b/tests/validation/fixtures/dynamic_fusion/gpu/cl/MatMulKernelFixture.h new file mode 100644 index 0000000000000000000000000000000000000000..c6ac4b91db8c0a8adf30fd30f7375b11734d906f --- /dev/null +++ b/tests/validation/fixtures/dynamic_fusion/gpu/cl/MatMulKernelFixture.h @@ -0,0 +1,267 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_MATMULKERNELFIXTURE_H +#define ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_MATMULKERNELFIXTURE_H + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h" +#include "arm_compute/dynamic_fusion/sketch/attributes/MatMulAttributes.h" +#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMatMul.h" +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h" + +#include "tests/CL/CLAccessor.h" +#include "tests/framework/Fixture.h" +#include "tests/framework/Macros.h" +#include "tests/validation/Helpers.h" +#include "tests/validation/Validation.h" +#include "tests/validation/reference/GEMM.h" +#include "tests/validation/reference/Permute.h" +#include "tests/validation/reference/ReshapeLayer.h" + +using namespace arm_compute::experimental::dynamic_fusion; + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +namespace +{ +template +void fill(U &&tensor, int i) +{ + switch(tensor.data_type()) + { + case DataType::F16: + { + arm_compute::utils::uniform_real_distribution_16bit distribution{ -1.0f, 1.0f }; + library->fill(tensor, distribution, i); + break; + } + case DataType::F32: + { + std::uniform_real_distribution distribution(-1.0f, 1.0f); + library->fill(tensor, distribution, i); + break; + } + default: + library->fill_tensor_uniform(tensor, i); + } +} + +} // namespace +template +class DynamicFusionGpuMatMulValidationGenericFixture : public framework::Fixture +{ + +public: + void setup(TensorShape lhs_shape, TensorShape rhs_shape, TensorShape output_shape, bool transpose_a, bool transpose_b, + int M0, int N0, int K0, bool export_rhs_to_cl_image, DataType data_type) + { + //For brevity, the input shapes are assumed to be not-transposed for both a and b matrices. + if(transpose_a) + { + permute(lhs_shape, PermutationVector(1U, 0U)); + } + if(transpose_b) + { + permute(rhs_shape, PermutationVector(1U, 0U)); + } + + // Skip configurations unsupported by the device. + _device_supports_export_to_cl_image = image2d_from_buffer_supported(CLKernelLibrary::get().get_device()); + if(!_device_supports_export_to_cl_image && export_rhs_to_cl_image) + { + ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped"); + framework::ARM_COMPUTE_PRINT_INFO(); + return; // Note: Also need to skip the validate in corresponding FIXTURE_DATA_TEST_CASEs. 
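// Editor's note: a minimal sketch (not part of the patch) of the early-exit guard
// used just above - when the device lacks the required OpenCL capability the
// fixture returns before computing _target/_reference, so the corresponding test
// case must also skip its validate step. Names below are illustrative only.
#include <functional>

static void run_or_skip(bool device_supported, const std::function<void()> &run_test)
{
    if(!device_supported)
    {
        return; // skipped: nothing is computed, so nothing should be validated
    }
    run_test();
}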
+ } + + _target = compute_target(lhs_shape, rhs_shape, transpose_a, transpose_b, M0, N0, K0, export_rhs_to_cl_image, data_type); + _reference = compute_reference(lhs_shape, rhs_shape, output_shape, transpose_a, transpose_b, data_type); + } + +protected: + TensorType compute_target(TensorShape &shape_a, TensorShape &shape_b, bool transpose_a, bool transpose_b, int M0, int N0, int K0, bool export_rhs_to_cl_image, DataType data_type) + { + ARM_COMPUTE_UNUSED(export_rhs_to_cl_image); + CLScheduler::get().default_reinit(); + + // Create a new workload sketch + auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context(); + auto context = GpuWorkloadContext{ &cl_compile_ctx }; + GpuWorkloadSketch sketch{ &context }; + + // Create sketch tensors + TensorInfo lhs_info = context.create_tensor_info(TensorInfo(shape_a, 1, data_type)); + TensorInfo rhs_info = context.create_tensor_info(TensorInfo(shape_b, 1, data_type)); + TensorInfo dst_info = context.create_tensor_info(); + + MatMulAttributes matmul_attr {}; + matmul_attr.adj_lhs(transpose_a); + matmul_attr.adj_rhs(transpose_b); + + GpuMatMulSettings matmul_settings {}; + matmul_settings.m0(M0); + matmul_settings.n0(N0); + matmul_settings.k0(K0); + + ITensorInfo *ans_info = FunctionType::create_op(sketch, &lhs_info, &rhs_info, matmul_attr, matmul_settings); + GpuOutput::create_op(sketch, ans_info, &dst_info); + + // Configure runtime + ClWorkloadRuntime runtime; + runtime.configure(sketch); + + for(auto &data : runtime.get_auxiliary_tensors()) + { + CLTensor *tensor = std::get<0>(data); + TensorInfo info = std::get<1>(data); + AuxMemoryInfo aux_mem_req = std::get<2>(data); + tensor->allocator()->init(info, aux_mem_req.alignment); + tensor->allocator()->allocate(); // Use ACL allocated memory + } + + // Construct user tensors + TensorType t_lhs{}; + TensorType t_rhs{}; + TensorType t_dst{}; + + // Initialize user tensors + t_lhs.allocator()->init(lhs_info); + t_rhs.allocator()->init(rhs_info); + t_dst.allocator()->init(dst_info); + + ARM_COMPUTE_ASSERT(t_lhs.info()->is_resizable()); + ARM_COMPUTE_ASSERT(t_rhs.info()->is_resizable()); + ARM_COMPUTE_ASSERT(t_dst.info()->is_resizable()); + + // Allocate and fill user tensors + t_lhs.allocator()->allocate(); + t_rhs.allocator()->allocate(); + t_dst.allocator()->allocate(); + + ARM_COMPUTE_ASSERT(!t_lhs.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!t_rhs.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!t_dst.info()->is_resizable()); + + fill(AccessorType(t_lhs), 0); + fill(AccessorType(t_rhs), 1); + + // Run runtime + runtime.run({ &t_lhs, &t_rhs, &t_dst }); + + return t_dst; + } + + SimpleTensor compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &output_shape, bool pretranspose_a, bool pretranspose_b, DataType data_type) + { + // We collapse dimensions > 3 onto dimension 3, i.e. 
5D+ tensors will look like 4D + // This is necessary unless we choose to extend gemm reference for 5D+ tensors + TensorShape output_shape_collapsed = output_shape.collapsed_from(Window::DimZ); + TensorShape shape_a_collapsed = shape_a.collapsed_from(Window::DimZ); + TensorShape shape_b_collapsed = shape_b.collapsed_from(Window::DimZ); + + // Create reference + SimpleTensor a{ shape_a_collapsed, data_type, 1 }; + SimpleTensor b{ shape_b_collapsed, data_type, 1 }; + SimpleTensor c{ output_shape_collapsed, data_type, 1 }; + + // Fill reference + fill(a, 0); + fill(b, 1); + + /* Note: Assuming the usual batch matmul dimensions A = (B x M x K), B = (B x K x N), if pretranspose_A is set to true, then A is assumed to be (B x K x M), + therefore, A must be pre-transposed before passing it to the fixture. And, we transpose A again in the fixture to make it (B x M x K) + in order to be able to call reference implementation that works with (B x M x K) input. + Similarly, if pretranspose_B is set to true, then B is assumed to be (B x N x K), B must be pre-transposed before passing it to the fixture. */ + + // Define transposed shapes + TensorShape a_transposed_shape(a.shape()); + a_transposed_shape.set(0, a.shape().y()); + a_transposed_shape.set(1, a.shape().x()); + + TensorShape b_transposed_shape(b.shape()); + b_transposed_shape.set(0, b.shape().y()); + b_transposed_shape.set(1, b.shape().x()); + + // Define transposed tensors + SimpleTensor a_transposed{ a_transposed_shape, data_type }; + SimpleTensor b_transposed{ b_transposed_shape, data_type }; + + //pretranspose a if necessary + if(pretranspose_a) + { + a_transposed = reference::permute(a, PermutationVector(1U, 0U)); + } + + // pretranspose b if necessary + if(pretranspose_b) + { + b_transposed = reference::permute(b, PermutationVector(1U, 0U)); + } + + // Use transposed tensors if boolean enabled else use original tensors + SimpleTensor result = reference::gemm((pretranspose_a) ? a_transposed : a, (pretranspose_b) ? 
b_transposed : b, c, 1.0f, 0.f); + + + // We reshape the gemm output back if the tensor is high dimensional + if(output_shape_collapsed != output_shape) + { + // std::cout << "called reshape: \n"; + result = reference::reshape_layer(result, output_shape); + } + + return result; + } + + CLTensor _target{}; + SimpleTensor _reference{}; + bool _device_supports_export_to_cl_image{ false }; + bool _device_supports_mmul{ false }; +}; + +template +class DynamicFusionGpuMatMulValidationFixture : public DynamicFusionGpuMatMulValidationGenericFixture +{ + public: + void setup(TensorShape lhs_shape, TensorShape rhs_shape, TensorShape output_shape, bool transpose_a, bool transpose_b, + int M0, int N0, int K0, bool export_rhs_to_cl_image, DataType data_type) + { + ARM_COMPUTE_UNUSED(export_rhs_to_cl_image); + DynamicFusionGpuMatMulValidationGenericFixture::setup(lhs_shape, rhs_shape, output_shape, transpose_a, transpose_b, M0, + N0, K0, false /* export_rhs_to_cl_image bias */, data_type); + } +}; + +} // namespace validation +} // namespace test +} // namespace arm_compute +#endif // ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_MATMULKERNELFIXTURE_H diff --git a/tests/validation/fixtures/dynamic_fusion/gpu/cl/Pool2dFixture.h b/tests/validation/fixtures/dynamic_fusion/gpu/cl/Pool2dFixture.h index 0efb761967b251d59b7fd268a3c22e7f4094cee9..34f2647741c9cec5b015064405c4a65712d5565e 100644 --- a/tests/validation/fixtures/dynamic_fusion/gpu/cl/Pool2dFixture.h +++ b/tests/validation/fixtures/dynamic_fusion/gpu/cl/Pool2dFixture.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_POOL2DFIXTURE -#define TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_POOL2DFIXTURE +#ifndef ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_POOL2DFIXTURE_H +#define ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_POOL2DFIXTURE_H #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/TensorInfo.h" @@ -33,6 +33,7 @@ #include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h" #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h" +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h" #include "src/dynamic_fusion/utils/Utils.h" #include "tests/CL/CLAccessor.h" @@ -100,7 +101,8 @@ protected: // Create Pool2dSettings GpuPool2dSettings pool_settings = GpuPool2dSettings().mixed_precision(mixed_precision); - FunctionType::create_op(sketch, &input_info, &dst_info, pool_attr, pool_settings); + ITensorInfo *ans_info = FunctionType::create_op(sketch, &input_info, pool_attr, pool_settings); + GpuOutput::create_op(sketch, ans_info, &dst_info); // Configure runtime ClWorkloadRuntime runtime; @@ -184,4 +186,4 @@ public: } // namespace test } // namespace arm_compute -#endif /* TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_POOL2DFIXTURE */ +#endif // ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_POOL2DFIXTURE_H diff --git a/tests/validation/reference/DFT.cpp b/tests/validation/reference/DFT.cpp index fd126c7d73cf408bda5b9cb30860dbefdcf34789..2b03c270acb61941304cbae02c375573d07bbbc5 100644 --- a/tests/validation/reference/DFT.cpp +++ b/tests/validation/reference/DFT.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2020, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -400,10 +400,10 @@ SimpleTensor conv2d_dft(const SimpleTensor &src, const SimpleTensor &w, auto padded_src = pad_layer(src, padding_in); // Flip weights - std::vector axis_v = { 0, 1 }; - SimpleTensor axis{ TensorShape(2U), DataType::U32 }; + std::vector axis_v = { 0, 1 }; + SimpleTensor axis{ TensorShape(2U), DataType::S32 }; std::copy(axis_v.begin(), axis_v.begin() + axis.shape().x(), axis.data()); - auto flipped_w = reverse(w, axis); + auto flipped_w = reverse(w, axis, /* use_inverted_axis */ false); // Pad weights to have the same size as input const PaddingList paddings_w = { { 0, src.shape()[0] - 1 }, { 0, src.shape()[1] - 1 } }; diff --git a/tests/validation/reference/PostOps.cpp b/tests/validation/reference/PostOps.cpp deleted file mode 100644 index ecfed4c48f5301509752a93da5f49a21bf62ed29..0000000000000000000000000000000000000000 --- a/tests/validation/reference/PostOps.cpp +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "PostOps.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/experimental/PostOps.h" -#include "support/Cast.h" -#include "tests/validation/reference/ActivationLayer.h" -#include "tests/validation/reference/ElementwiseOperations.h" - -namespace arm_compute -{ -namespace test -{ -namespace validation -{ -namespace reference -{ -template ::value, int>::type> -SimpleTensor post_ops(const SimpleTensor &a, experimental::PostOpList> post_ops) -{ - // Create reference - SimpleTensor dst{ a }; - - for(auto &post_op : post_ops.get_list()) - { - switch(post_op->type()) - { - case experimental::PostOpType::Activation: - { - const auto _post_op = utils::cast::polymorphic_downcast> *>(post_op.get()); - dst = reference::activation_layer(dst, _post_op->_act_info); - break; - } - case experimental::PostOpType::Eltwise_Add: - { - const auto _post_op = utils::cast::polymorphic_downcast> *>(post_op.get()); - dst = reference::arithmetic_operation(ArithmeticOperation::ADD, dst, _post_op->_addend, dst, _post_op->_policy); - break; - } - case experimental::PostOpType::Eltwise_PRelu: - { - const auto _post_op = utils::cast::polymorphic_downcast> *>(post_op.get()); - - // If previous main operation output is the the first pRelu argument, then pass it as src1 parameter of the arithmetic operation - if(_post_op->_prev_dst_pos == 0) - { - dst = reference::arithmetic_operation(ArithmeticOperation::PRELU, dst, _post_op->_alpha_param, dst, _post_op->_policy); - } - // If previous main operation output is the the second pRelu argument, then pass it as src2 parameter of the arithmetic operation - else if(_post_op->_prev_dst_pos == 1) - { - dst = reference::arithmetic_operation(ArithmeticOperation::PRELU, _post_op->_alpha_param, dst, dst, _post_op->_policy); - } - break; - } - default: - { - ARM_COMPUTE_ERROR("Unsupported PostOpType"); - } - } - } - return dst; -} - -template SimpleTensor post_ops(const SimpleTensor &a, experimental::PostOpList> post_ops); -template SimpleTensor post_ops(const SimpleTensor &a, experimental::PostOpList> post_ops); -} // namespace reference -} // namespace validation -} // namespace test -} // namespace arm_compute \ No newline at end of file diff --git a/tests/validation/reference/Reverse.cpp b/tests/validation/reference/Reverse.cpp index c6c461427883224a6bc8a80a83867f388968b53e..7924f900d127f3f0df3fc1f205796cb3c950e8f8 100644 --- a/tests/validation/reference/Reverse.cpp +++ b/tests/validation/reference/Reverse.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -35,8 +35,9 @@ namespace validation namespace reference { template -SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis) +SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis, bool use_inverted_axis) { + ARM_COMPUTE_ERROR_ON(src.shape().num_dimensions() > 4); ARM_COMPUTE_ERROR_ON(axis.shape().num_dimensions() > 1); ARM_COMPUTE_ERROR_ON(axis.shape().x() > 4); @@ -48,10 +49,32 @@ SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor const unsigned int depth = src.shape()[2]; const unsigned int batches = src.shape()[3]; + const int rank = src.shape().num_dimensions(); + std::array to_reverse = { { false, false, false, false } }; for(int i = 0; i < axis.num_elements(); ++i) { - to_reverse[axis[i]] = true; + int axis_i = axis[i]; + + // The values of axis tensor must be between [-rank, rank-1]. 
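// A minimal standalone sketch (not part of the patch) of the axis handling added above:
// out-of-range axes are rejected, negative axes wrap around, and use_inverted_axis applies the
// (rank - 1) - axis mapping described in the comment above. normalize_axis is a hypothetical
// helper; the reference implementation performs the same steps inline.
#include <cassert>

int normalize_axis(int axis, int rank, bool use_inverted_axis)
{
    assert(axis >= -rank && axis < rank); // values outside [-rank, rank-1] are rejected
    if (axis < 0)
    {
        axis = rank + axis; // e.g. axis -1 with rank 4 becomes 3
    }
    if (use_inverted_axis)
    {
        axis = (rank - 1) - axis; // re-map axes given in the opposite (framework-style) numbering
    }
    return axis;
}

int main()
{
    assert(normalize_axis(-1, 4, false) == 3);
    assert(normalize_axis(0, 4, true) == 3);
    return 0;
}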
+ if((axis_i < -rank) || (axis_i >= rank)) + { + ARM_COMPUTE_ERROR("the values of the axis tensor must be within [-rank, rank-1]."); + } + + // In case of negative axis value i.e targeted axis(i) = rank + axis(i) + if(axis_i < 0) + { + axis_i = rank + axis_i; + } + + // Reverse ACL axis indices convention i.e. (inverted)axis = (tensor_rank - 1) - axis + if(use_inverted_axis) + { + axis_i = (rank - 1) - axis_i; + } + + to_reverse[axis_i] = true; } const uint32_t num_elements = src.num_elements(); @@ -73,9 +96,9 @@ SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor return dst; } -template SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis); -template SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis); -template SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis); +template SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis, bool use_inverted_axis); +template SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis, bool use_inverted_axis); +template SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis, bool use_inverted_axis); } // namespace reference } // namespace validation } // namespace test diff --git a/tests/validation/reference/Reverse.h b/tests/validation/reference/Reverse.h index 4a28da7270469b7deaae23a42a67c61e50026542..30926b05a5aaa2ec747d8709f810127f49f46b52 100644 --- a/tests/validation/reference/Reverse.h +++ b/tests/validation/reference/Reverse.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2019, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_TEST_REVERSE_H -#define ARM_COMPUTE_TEST_REVERSE_H +#ifndef ACL_TESTS_VALIDATION_REFERENCE_REVERSE_H +#define ACL_TESTS_VALIDATION_REFERENCE_REVERSE_H #include "tests/SimpleTensor.h" @@ -35,9 +35,9 @@ namespace validation namespace reference { template -SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis); +SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis, bool use_inverted_axis = false); } // namespace reference } // namespace validation } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_REVERSE_H */ +#endif // ACL_TESTS_VALIDATION_REFERENCE_REVERSE_H diff --git a/utils/BUILD.bazel b/utils/BUILD.bazel index 32d6a61638b423b73b72f6c3eabe04bf39bb6df5..2be7ee193ba5cacef9bf451b3b9e7364a2c0d987 100644 --- a/utils/BUILD.bazel +++ b/utils/BUILD.bazel @@ -22,7 +22,15 @@ cc_library( name = "utils", - srcs = glob(["**/*.cpp"]), + srcs = glob( + [ + "**/*.cpp" + ], + exclude = glob( + [ + "CommonGraphOptions.cpp", + ]), + ), hdrs = glob(["**/*.h"]), visibility = ["//visibility:public"], deps = [ @@ -33,3 +41,8 @@ cc_library( "//support", ], ) + +exports_files( + ["CommonGraphOptions.cpp"], + visibility = ["//visibility:public"] +) diff --git a/utils/CommonGraphOptions.cpp b/utils/CommonGraphOptions.cpp index c0270726da788d0b8f5d1864da5dd50936f2c957..42524d802d1652489f2ec1d6fece4ab2eb9d150a 100644 --- a/utils/CommonGraphOptions.cpp +++ b/utils/CommonGraphOptions.cpp @@ -37,15 +37,15 @@ namespace { std::pair parse_validation_range(const std::string &validation_range) { - std::pair range = { 0, std::numeric_limits::max() }; - if(!validation_range.empty()) + std::pair range = {0, std::numeric_limits::max()}; + if (!validation_range.empty()) { std::string str; std::stringstream 
stream(validation_range); // Get first value std::getline(stream, str, ','); - if(stream.fail()) + if (stream.fail()) { return range; } @@ -56,7 +56,7 @@ std::pair parse_validation_range(const std::string & // Get second value std::getline(stream, str); - if(stream.fail()) + if (stream.fail()) { range.second = range.first; return range; @@ -88,24 +88,26 @@ namespace utils os << "Tuner mode : " << common_params.tuner_mode << std::endl; os << "Tuner file : " << common_params.tuner_file << std::endl; os << "MLGO file : " << common_params.mlgo_file << std::endl; - os << "Fast math enabled? : " << (common_params.fast_math_hint == FastMathHint::Enabled ? true_str : false_str) << std::endl; - if(!common_params.data_path.empty()) + os << "Fast math enabled? : " << (common_params.fast_math_hint == FastMathHint::Enabled ? true_str : false_str) + << std::endl; + if (!common_params.data_path.empty()) { os << "Data path : " << common_params.data_path << std::endl; } - if(!common_params.image.empty()) + if (!common_params.image.empty()) { os << "Image file : " << common_params.image << std::endl; } - if(!common_params.labels.empty()) + if (!common_params.labels.empty()) { os << "Labels file : " << common_params.labels << std::endl; } - if(!common_params.validation_file.empty()) + if (!common_params.validation_file.empty()) { - os << "Validation range : " << common_params.validation_range_start << "-" << common_params.validation_range_end << std::endl; + os << "Validation range : " << common_params.validation_range_start << "-" << common_params.validation_range_end + << std::endl; os << "Validation file : " << common_params.validation_file << std::endl; - if(!common_params.validation_path.empty()) + if (!common_params.validation_path.empty()) { os << "Validation path : " << common_params.validation_path << std::endl; } @@ -134,33 +136,25 @@ CommonGraphOptions::CommonGraphOptions(CommandLineParser &parser) tuner_file(parser.add_option>("tuner-file")), mlgo_file(parser.add_option>("mlgo-file")) { - std::set supported_targets - { + std::set supported_targets{ Target::NEON, Target::CL, Target::CLVK, }; - std::set supported_data_types - { + std::set supported_data_types{ DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED, }; - std::set supported_data_layouts - { + std::set supported_data_layouts{ DataLayout::NHWC, DataLayout::NCHW, }; - const std::set supported_tuner_modes - { - CLTunerMode::EXHAUSTIVE, - CLTunerMode::NORMAL, - CLTunerMode::RAPID - }; + const std::set supported_tuner_modes{CLTunerMode::EXHAUSTIVE, CLTunerMode::NORMAL, CLTunerMode::RAPID}; target = parser.add_option>("target", supported_targets, Target::NEON); data_type = parser.add_option>("type", supported_data_types, DataType::F32); @@ -175,11 +169,10 @@ CommonGraphOptions::CommonGraphOptions(CommandLineParser &parser) data_layout->set_help("Data layout to use"); enable_tuner->set_help("Enable OpenCL dynamic tuner"); enable_cl_cache->set_help("Enable OpenCL program caches"); - tuner_mode->set_help( - "Configures the time taken by the tuner to tune. " - "Exhaustive: slowest but produces the most performant LWS configuration. " - "Normal: slow but produces the LWS configurations on par with Exhaustive most of the time. " - "Rapid: fast but produces less performant LWS configurations"); + tuner_mode->set_help("Configures the time taken by the tuner to tune. " + "Exhaustive: slowest but produces the most performant LWS configuration. 
" + "Normal: slow but produces the LWS configurations on par with Exhaustive most of the time. " + "Rapid: fast but produces less performant LWS configurations"); fast_math_hint->set_help("Enable fast math"); data_path->set_help("Path where graph parameters reside"); image->set_help("Input image for the graph"); @@ -193,8 +186,9 @@ CommonGraphOptions::CommonGraphOptions(CommandLineParser &parser) CommonGraphParams consume_common_graph_parameters(CommonGraphOptions &options) { - FastMathHint fast_math_hint_value = options.fast_math_hint->value() ? FastMathHint::Enabled : FastMathHint::Disabled; - auto validation_range = parse_validation_range(options.validation_range->value()); + FastMathHint fast_math_hint_value = + options.fast_math_hint->value() ? FastMathHint::Enabled : FastMathHint::Disabled; + auto validation_range = parse_validation_range(options.validation_range->value()); CommonGraphParams common_params; common_params.help = options.help->is_set() ? options.help->value() : false; @@ -202,19 +196,21 @@ CommonGraphParams consume_common_graph_parameters(CommonGraphOptions &options) common_params.batches = options.batches->value(); common_params.target = options.target->value(); common_params.data_type = options.data_type->value(); - if(options.data_layout->is_set()) + if (options.data_layout->is_set()) { common_params.data_layout = options.data_layout->value(); } - common_params.enable_tuner = options.enable_tuner->is_set() ? options.enable_tuner->value() : false; - common_params.enable_cl_cache = common_params.target == arm_compute::graph::Target::NEON ? false : (options.enable_cl_cache->is_set() ? options.enable_cl_cache->value() : true); - common_params.tuner_mode = options.tuner_mode->value(); - common_params.fast_math_hint = options.fast_math_hint->is_set() ? fast_math_hint_value : FastMathHint::Disabled; - common_params.data_path = options.data_path->value(); - common_params.image = options.image->value(); - common_params.labels = options.labels->value(); - common_params.validation_file = options.validation_file->value(); - common_params.validation_path = options.validation_path->value(); + common_params.enable_tuner = options.enable_tuner->is_set() ? options.enable_tuner->value() : false; + common_params.enable_cl_cache = common_params.target == arm_compute::graph::Target::NEON + ? false + : (options.enable_cl_cache->is_set() ? options.enable_cl_cache->value() : true); + common_params.tuner_mode = options.tuner_mode->value(); + common_params.fast_math_hint = options.fast_math_hint->is_set() ? 
fast_math_hint_value : FastMathHint::Disabled; + common_params.data_path = options.data_path->value(); + common_params.image = options.image->value(); + common_params.labels = options.labels->value(); + common_params.validation_file = options.validation_file->value(); + common_params.validation_path = options.validation_path->value(); common_params.validation_range_start = validation_range.first; common_params.validation_range_end = validation_range.second; common_params.tuner_file = options.tuner_file->value(); diff --git a/utils/CommonGraphOptions.h b/utils/CommonGraphOptions.h index afdb78b1beb839777214feca19e76d6b7c80fb29..c42e06cb84313d8c6086d28c681514cc238df828 100644 --- a/utils/CommonGraphOptions.h +++ b/utils/CommonGraphOptions.h @@ -24,13 +24,13 @@ #ifndef ARM_COMPUTE_EXAMPLES_UTILS_COMMON_GRAPH_OPTIONS #define ARM_COMPUTE_EXAMPLES_UTILS_COMMON_GRAPH_OPTIONS -#include "utils/command_line/CommandLineOptions.h" -#include "utils/command_line/CommandLineParser.h" - #include "arm_compute/graph/TypeLoader.h" #include "arm_compute/graph/TypePrinter.h" #include "arm_compute/runtime/CL/CLTunerTypes.h" +#include "utils/command_line/CommandLineOptions.h" +#include "utils/command_line/CommandLineParser.h" + namespace arm_compute { namespace utils @@ -92,16 +92,16 @@ namespace utils /** Structure holding all the common graph parameters */ struct CommonGraphParams { - bool help{ false }; - int threads{ 0 }; - int batches{ 1 }; - arm_compute::graph::Target target{ arm_compute::graph::Target::NEON }; - arm_compute::DataType data_type{ DataType::F32 }; - arm_compute::DataLayout data_layout{ DataLayout::NHWC }; - bool enable_tuner{ false }; - bool enable_cl_cache{ false }; - arm_compute::CLTunerMode tuner_mode{ CLTunerMode::NORMAL }; - arm_compute::graph::FastMathHint fast_math_hint{ arm_compute::graph::FastMathHint::Disabled }; + bool help{false}; + int threads{0}; + int batches{1}; + arm_compute::graph::Target target{arm_compute::graph::Target::NEON}; + arm_compute::DataType data_type{DataType::F32}; + arm_compute::DataLayout data_layout{DataLayout::NHWC}; + bool enable_tuner{false}; + bool enable_cl_cache{false}; + arm_compute::CLTunerMode tuner_mode{CLTunerMode::NORMAL}; + arm_compute::graph::FastMathHint fast_math_hint{arm_compute::graph::FastMathHint::Disabled}; std::string data_path{}; std::string image{}; std::string labels{}; @@ -109,8 +109,8 @@ struct CommonGraphParams std::string validation_path{}; std::string tuner_file{}; std::string mlgo_file{}; - unsigned int validation_range_start{ 0 }; - unsigned int validation_range_end{ std::numeric_limits::max() }; + unsigned int validation_range_start{0}; + unsigned int validation_range_end{std::numeric_limits::max()}; }; /** Formatted output of the CommonGraphParams type diff --git a/utils/GraphUtils.cpp b/utils/GraphUtils.cpp index c3f71299f6fd90ece327cbffa21b156972775507..ca8e14abba7a1f313c53b3acfa9297882c68c91a 100644 --- a/utils/GraphUtils.cpp +++ b/utils/GraphUtils.cpp @@ -43,18 +43,21 @@ using namespace arm_compute::graph_utils; namespace { -std::pair compute_permutation_parameters(const arm_compute::TensorShape &shape, - arm_compute::DataLayout data_layout) +std::pair +compute_permutation_parameters(const arm_compute::TensorShape &shape, arm_compute::DataLayout data_layout) { // Set permutation parameters if needed arm_compute::TensorShape permuted_shape = shape; arm_compute::PermutationVector perm; // Permute only if num_dimensions greater than 2 - if(shape.num_dimensions() > 2) + if (shape.num_dimensions() > 2) { - perm = 
(data_layout == arm_compute::DataLayout::NHWC) ? arm_compute::PermutationVector(2U, 0U, 1U) : arm_compute::PermutationVector(1U, 2U, 0U); + perm = (data_layout == arm_compute::DataLayout::NHWC) ? arm_compute::PermutationVector(2U, 0U, 1U) + : arm_compute::PermutationVector(1U, 2U, 0U); - arm_compute::PermutationVector perm_shape = (data_layout == arm_compute::DataLayout::NCHW) ? arm_compute::PermutationVector(2U, 0U, 1U) : arm_compute::PermutationVector(1U, 2U, 0U); + arm_compute::PermutationVector perm_shape = (data_layout == arm_compute::DataLayout::NCHW) + ? arm_compute::PermutationVector(2U, 0U, 1U) + : arm_compute::PermutationVector(1U, 2U, 0U); arm_compute::permute(permuted_shape, perm_shape); } @@ -62,17 +65,16 @@ std::pair compute_perm } } // namespace -TFPreproccessor::TFPreproccessor(float min_range, float max_range) - : _min_range(min_range), _max_range(max_range) +TFPreproccessor::TFPreproccessor(float min_range, float max_range) : _min_range(min_range), _max_range(max_range) { } void TFPreproccessor::preprocess(ITensor &tensor) { - if(tensor.info()->data_type() == DataType::F32) + if (tensor.info()->data_type() == DataType::F32) { preprocess_typed(tensor); } - else if(tensor.info()->data_type() == DataType::F16) + else if (tensor.info()->data_type() == DataType::F16) { preprocess_typed(tensor); } @@ -89,19 +91,20 @@ void TFPreproccessor::preprocess_typed(ITensor &tensor) window.use_tensor_dimensions(tensor.info()->tensor_shape()); const float range = _max_range - _min_range; - execute_window_loop(window, [&](const Coordinates & id) - { - const T value = *reinterpret_cast(tensor.ptr_to_element(id)); - float res = value / 255.f; // Normalize to [0, 1] - res = res * range + _min_range; // Map to [min_range, max_range] - *reinterpret_cast(tensor.ptr_to_element(id)) = res; - }); + execute_window_loop(window, + [&](const Coordinates &id) + { + const T value = *reinterpret_cast(tensor.ptr_to_element(id)); + float res = value / 255.f; // Normalize to [0, 1] + res = res * range + _min_range; // Map to [min_range, max_range] + *reinterpret_cast(tensor.ptr_to_element(id)) = res; + }); } CaffePreproccessor::CaffePreproccessor(std::array mean, bool bgr, float scale) : _mean(mean), _bgr(bgr), _scale(scale) { - if(_bgr) + if (_bgr) { std::swap(_mean[0], _mean[2]); } @@ -109,11 +112,11 @@ CaffePreproccessor::CaffePreproccessor(std::array mean, bool bgr, floa void CaffePreproccessor::preprocess(ITensor &tensor) { - if(tensor.info()->data_type() == DataType::F32) + if (tensor.info()->data_type() == DataType::F32) { preprocess_typed(tensor); } - else if(tensor.info()->data_type() == DataType::F16) + else if (tensor.info()->data_type() == DataType::F16) { preprocess_typed(tensor); } @@ -130,15 +133,16 @@ void CaffePreproccessor::preprocess_typed(ITensor &tensor) window.use_tensor_dimensions(tensor.info()->tensor_shape()); const int channel_idx = get_data_layout_dimension_index(tensor.info()->data_layout(), DataLayoutDimension::CHANNEL); - execute_window_loop(window, [&](const Coordinates & id) - { - const T value = *reinterpret_cast(tensor.ptr_to_element(id)) - T(_mean[id[channel_idx]]); - *reinterpret_cast(tensor.ptr_to_element(id)) = value * T(_scale); - }); + execute_window_loop(window, + [&](const Coordinates &id) + { + const T value = + *reinterpret_cast(tensor.ptr_to_element(id)) - T(_mean[id[channel_idx]]); + *reinterpret_cast(tensor.ptr_to_element(id)) = value * T(_scale); + }); } -PPMWriter::PPMWriter(std::string name, unsigned int maximum) - : _name(std::move(name)), _iterator(0), 
_maximum(maximum) +PPMWriter::PPMWriter(std::string name, unsigned int maximum) : _name(std::move(name)), _iterator(0), _maximum(maximum) { } @@ -150,15 +154,14 @@ bool PPMWriter::access_tensor(ITensor &tensor) arm_compute::utils::save_to_ppm(tensor, ss.str()); _iterator++; - if(_maximum == 0) + if (_maximum == 0) { return true; } return _iterator < _maximum; } -DummyAccessor::DummyAccessor(unsigned int maximum) - : _iterator(0), _maximum(maximum) +DummyAccessor::DummyAccessor(unsigned int maximum) : _iterator(0), _maximum(maximum) { } @@ -171,7 +174,7 @@ bool DummyAccessor::access_tensor(ITensor &tensor) { ARM_COMPUTE_UNUSED(tensor); bool ret = _maximum == 0 || _iterator < _maximum; - if(_iterator == _maximum) + if (_iterator == _maximum) { _iterator = 0; } @@ -182,7 +185,8 @@ bool DummyAccessor::access_tensor(ITensor &tensor) return ret; } -NumPyAccessor::NumPyAccessor(std::string npy_path, TensorShape shape, DataType data_type, DataLayout data_layout, std::ostream &output_stream) +NumPyAccessor::NumPyAccessor( + std::string npy_path, TensorShape shape, DataType data_type, DataLayout data_layout, std::ostream &output_stream) : _npy_tensor(), _filename(std::move(npy_path)), _output_stream(output_stream) { NumPyBinLoader loader(_filename, data_layout); @@ -203,8 +207,10 @@ void NumPyAccessor::access_numpy_tensor(ITensor &tensor, T tolerance) int num_mismatches = utils::compare_tensor(tensor, _npy_tensor, tolerance); float percentage_mismatches = static_cast(num_mismatches) / num_elements; - _output_stream << "Results: " << 100.f - (percentage_mismatches * 100) << " % matches with the provided output[" << _filename << "]." << std::endl; - _output_stream << " " << num_elements - num_mismatches << " out of " << num_elements << " matches with the provided output[" << _filename << "]." << std::endl + _output_stream << "Results: " << 100.f - (percentage_mismatches * 100) << " % matches with the provided output[" + << _filename << "]." << std::endl; + _output_stream << " " << num_elements - num_mismatches << " out of " << num_elements + << " matches with the provided output[" << _filename << "]." 
<< std::endl << std::endl; } @@ -213,7 +219,7 @@ bool NumPyAccessor::access_tensor(ITensor &tensor) ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&tensor, 1, DataType::F32, DataType::QASYMM8); ARM_COMPUTE_ERROR_ON(_npy_tensor.info()->dimension(0) != tensor.info()->dimension(0)); - switch(tensor.info()->data_type()) + switch (tensor.info()->data_type()) { case DataType::QASYMM8: access_numpy_tensor(tensor, 0); @@ -262,7 +268,7 @@ ImageAccessor::ImageAccessor(std::string filename, bool bgr, std::unique_ptrtensor_shape(); arm_compute::PermutationVector perm; - if(tensor.info()->data_layout() != DataLayout::NCHW) + if (tensor.info()->data_layout() != DataLayout::NCHW) { - std::tie(permuted_shape, perm) = compute_permutation_parameters(tensor.info()->tensor_shape(), tensor.info()->data_layout()); + std::tie(permuted_shape, perm) = + compute_permutation_parameters(tensor.info()->tensor_shape(), tensor.info()->data_layout()); } #ifdef __arm__ - ARM_COMPUTE_EXIT_ON_MSG_VAR(image_loader->width() != permuted_shape.x() || image_loader->height() != permuted_shape.y(), - "Failed to load image file: dimensions [%d,%d] not correct, expected [%" PRIu32 ",%" PRIu32 "].", - image_loader->width(), image_loader->height(), permuted_shape.x(), permuted_shape.y()); + ARM_COMPUTE_EXIT_ON_MSG_VAR( + image_loader->width() != permuted_shape.x() || image_loader->height() != permuted_shape.y(), + "Failed to load image file: dimensions [%d,%d] not correct, expected [%" PRIu32 ",%" PRIu32 "].", + image_loader->width(), image_loader->height(), permuted_shape.x(), permuted_shape.y()); #else // __arm__ - ARM_COMPUTE_EXIT_ON_MSG_VAR(image_loader->width() != permuted_shape.x() || image_loader->height() != permuted_shape.y(), - "Failed to load image file: dimensions [%d,%d] not correct, expected [%" PRIu64 ",%" PRIu64 "].", - image_loader->width(), image_loader->height(), - static_cast(permuted_shape.x()), static_cast(permuted_shape.y())); + ARM_COMPUTE_EXIT_ON_MSG_VAR( + image_loader->width() != permuted_shape.x() || image_loader->height() != permuted_shape.y(), + "Failed to load image file: dimensions [%d,%d] not correct, expected [%" PRIu64 ",%" PRIu64 "].", + image_loader->width(), image_loader->height(), static_cast(permuted_shape.x()), + static_cast(permuted_shape.y())); #endif // __arm__ // Fill the tensor with the PPM content (BGR) image_loader->fill_planar_tensor(tensor, _bgr); // Preprocess tensor - if(_preprocessor) + if (_preprocessor) { _preprocessor->preprocess(tensor); } @@ -310,7 +319,12 @@ ValidationInputAccessor::ValidationInputAccessor(const std::string & unsigned int start, unsigned int end, std::ostream &output_stream) - : _path(std::move(images_path)), _images(), _preprocessor(std::move(preprocessor)), _bgr(bgr), _offset(0), _output_stream(output_stream) + : _path(std::move(images_path)), + _images(), + _preprocessor(std::move(preprocessor)), + _bgr(bgr), + _offset(0), + _output_stream(output_stream) { ARM_COMPUTE_EXIT_ON_MSG(start > end, "Invalid validation range!"); @@ -322,10 +336,10 @@ ValidationInputAccessor::ValidationInputAccessor(const std::string & // Parse image names unsigned int counter = 0; - for(std::string line; !std::getline(ifs, line).fail() && counter <= end; ++counter) + for (std::string line; !std::getline(ifs, line).fail() && counter <= end; ++counter) { // Add image to process if withing range - if(counter >= start) + if (counter >= start) { std::stringstream linestream(line); std::string image_name; @@ -335,7 +349,7 @@ ValidationInputAccessor::ValidationInputAccessor(const 
std::string & } } } - catch(const std::ifstream::failure &e) + catch (const std::ifstream::failure &e) { ARM_COMPUTE_ERROR_VAR("Accessing %s: %s", image_list.c_str(), e.what()); } @@ -344,7 +358,7 @@ ValidationInputAccessor::ValidationInputAccessor(const std::string & bool ValidationInputAccessor::access_tensor(arm_compute::ITensor &tensor) { bool ret = _offset < _images.size(); - if(ret) + if (ret) { utils::JPEGLoader jpeg; @@ -356,28 +370,30 @@ bool ValidationInputAccessor::access_tensor(arm_compute::ITensor &tensor) // Get permutated shape and permutation parameters TensorShape permuted_shape = tensor.info()->tensor_shape(); arm_compute::PermutationVector perm; - if(tensor.info()->data_layout() != DataLayout::NCHW) + if (tensor.info()->data_layout() != DataLayout::NCHW) { - std::tie(permuted_shape, perm) = compute_permutation_parameters(tensor.info()->tensor_shape(), - tensor.info()->data_layout()); + std::tie(permuted_shape, perm) = + compute_permutation_parameters(tensor.info()->tensor_shape(), tensor.info()->data_layout()); } #ifdef __arm__ ARM_COMPUTE_EXIT_ON_MSG_VAR(jpeg.width() != permuted_shape.x() || jpeg.height() != permuted_shape.y(), - "Failed to load image file: dimensions [%d,%d] not correct, expected [%" PRIu32 ",%" PRIu32 "].", + "Failed to load image file: dimensions [%d,%d] not correct, expected [%" PRIu32 + ",%" PRIu32 "].", jpeg.width(), jpeg.height(), permuted_shape.x(), permuted_shape.y()); #else // __arm__ ARM_COMPUTE_EXIT_ON_MSG_VAR(jpeg.width() != permuted_shape.x() || jpeg.height() != permuted_shape.y(), - "Failed to load image file: dimensions [%d,%d] not correct, expected [%" PRIu64 ",%" PRIu64 "].", - jpeg.width(), jpeg.height(), - static_cast(permuted_shape.x()), static_cast(permuted_shape.y())); + "Failed to load image file: dimensions [%d,%d] not correct, expected [%" PRIu64 + ",%" PRIu64 "].", + jpeg.width(), jpeg.height(), static_cast(permuted_shape.x()), + static_cast(permuted_shape.y())); #endif // __arm__ // Fill the tensor with the JPEG content (BGR) jpeg.fill_planar_tensor(tensor, _bgr); // Preprocess tensor - if(_preprocessor) + if (_preprocessor) { _preprocessor->preprocess(tensor); } @@ -402,10 +418,10 @@ ValidationOutputAccessor::ValidationOutputAccessor(const std::string &image_list // Parse image correctly classified labels unsigned int counter = 0; - for(std::string line; !std::getline(ifs, line).fail() && counter <= end; ++counter) + for (std::string line; !std::getline(ifs, line).fail() && counter <= end; ++counter) { // Add label if within range - if(counter >= start) + if (counter >= start) { std::stringstream linestream(line); std::string image_name; @@ -416,7 +432,7 @@ ValidationOutputAccessor::ValidationOutputAccessor(const std::string &image_list } } } - catch(const std::ifstream::failure &e) + catch (const std::ifstream::failure &e) { ARM_COMPUTE_ERROR_VAR("Accessing %s: %s", image_list.c_str(), e.what()); } @@ -432,11 +448,11 @@ void ValidationOutputAccessor::reset() bool ValidationOutputAccessor::access_tensor(arm_compute::ITensor &tensor) { bool ret = _offset < _results.size(); - if(ret) + if (ret) { // Get results std::vector tensor_results; - switch(tensor.info()->data_type()) + switch (tensor.info()->data_type()) { case DataType::QASYMM8: tensor_results = access_predictions_tensor(tensor); @@ -459,7 +475,7 @@ bool ValidationOutputAccessor::access_tensor(arm_compute::ITensor &tensor) } // Report top_n accuracy - if(_offset >= _results.size()) + if (_offset >= _results.size()) { report_top_n(1, _results.size(), 
_positive_samples_top1); report_top_n(5, _results.size(), _positive_samples_top5); @@ -481,23 +497,19 @@ std::vector ValidationOutputAccessor::access_predictions_tensor(arm_comp // Sort results std::iota(std::begin(index), std::end(index), static_cast(0)); - std::sort(std::begin(index), std::end(index), - [&](size_t a, size_t b) - { - return output_net[a] > output_net[b]; - }); + std::sort(std::begin(index), std::end(index), [&](size_t a, size_t b) { return output_net[a] > output_net[b]; }); return index; } -void ValidationOutputAccessor::aggregate_sample(const std::vector &res, size_t &positive_samples, size_t top_n, size_t correct_label) +void ValidationOutputAccessor::aggregate_sample(const std::vector &res, + size_t &positive_samples, + size_t top_n, + size_t correct_label) { - auto is_valid_label = [correct_label](size_t label) - { - return label == correct_label; - }; + auto is_valid_label = [correct_label](size_t label) { return label == correct_label; }; - if(std::any_of(std::begin(res), std::begin(res) + top_n, is_valid_label)) + if (std::any_of(std::begin(res), std::begin(res) + top_n, is_valid_label)) { ++positive_samples; } @@ -508,14 +520,15 @@ void ValidationOutputAccessor::report_top_n(size_t top_n, size_t total_samples, size_t negative_samples = total_samples - positive_samples; float accuracy = positive_samples / static_cast(total_samples); - _output_stream << "----------Top " << top_n << " accuracy ----------" << std::endl - << std::endl; + _output_stream << "----------Top " << top_n << " accuracy ----------" << std::endl << std::endl; _output_stream << "Positive samples : " << positive_samples << std::endl; _output_stream << "Negative samples : " << negative_samples << std::endl; _output_stream << "Accuracy : " << accuracy << std::endl; } -DetectionOutputAccessor::DetectionOutputAccessor(const std::string &labels_path, std::vector &imgs_tensor_shapes, std::ostream &output_stream) +DetectionOutputAccessor::DetectionOutputAccessor(const std::string &labels_path, + std::vector &imgs_tensor_shapes, + std::ostream &output_stream) : _labels(), _tensor_shapes(std::move(imgs_tensor_shapes)), _output_stream(output_stream) { _labels.clear(); @@ -527,12 +540,12 @@ DetectionOutputAccessor::DetectionOutputAccessor(const std::string &labels_path, ifs.exceptions(std::ifstream::badbit); ifs.open(labels_path, std::ios::in | std::ios::binary); - for(std::string line; !std::getline(ifs, line).fail();) + for (std::string line; !std::getline(ifs, line).fail();) { _labels.emplace_back(line); } } - catch(const std::ifstream::failure &e) + catch (const std::ifstream::failure &e) { ARM_COMPUTE_ERROR_VAR("Accessing %s: %s", labels_path.c_str(), e.what()); } @@ -542,26 +555,24 @@ template void DetectionOutputAccessor::access_predictions_tensor(ITensor &tensor) { const size_t num_detection = tensor.info()->valid_region().shape.y(); - const auto output_prt = reinterpret_cast(tensor.buffer() + tensor.info()->offset_first_element_in_bytes()); + const auto output_prt = reinterpret_cast(tensor.buffer() + tensor.info()->offset_first_element_in_bytes()); - if(num_detection > 0) + if (num_detection > 0) { - _output_stream << "---------------------- Detections ----------------------" << std::endl - << std::endl; + _output_stream << "---------------------- Detections ----------------------" << std::endl << std::endl; - _output_stream << std::left << std::setprecision(4) << std::setw(8) << "Image | " << std::setw(8) << "Label | " << std::setw(12) << "Confidence | " + _output_stream << std::left << 
std::setprecision(4) << std::setw(8) << "Image | " << std::setw(8) << "Label | " + << std::setw(12) << "Confidence | " << "[ xmin, ymin, xmax, ymax ]" << std::endl; - for(size_t i = 0; i < num_detection; ++i) + for (size_t i = 0; i < num_detection; ++i) { auto im = static_cast(output_prt[i * 7]); - _output_stream << std::setw(8) << im << std::setw(8) - << _labels[output_prt[i * 7 + 1]] << std::setw(12) << output_prt[i * 7 + 2] - << " [" << (output_prt[i * 7 + 3] * _tensor_shapes[im].x()) - << ", " << (output_prt[i * 7 + 4] * _tensor_shapes[im].y()) - << ", " << (output_prt[i * 7 + 5] * _tensor_shapes[im].x()) - << ", " << (output_prt[i * 7 + 6] * _tensor_shapes[im].y()) - << "]" << std::endl; + _output_stream << std::setw(8) << im << std::setw(8) << _labels[output_prt[i * 7 + 1]] << std::setw(12) + << output_prt[i * 7 + 2] << " [" << (output_prt[i * 7 + 3] * _tensor_shapes[im].x()) << ", " + << (output_prt[i * 7 + 4] * _tensor_shapes[im].y()) << ", " + << (output_prt[i * 7 + 5] * _tensor_shapes[im].x()) << ", " + << (output_prt[i * 7 + 6] * _tensor_shapes[im].y()) << "]" << std::endl; } } else @@ -574,7 +585,7 @@ bool DetectionOutputAccessor::access_tensor(ITensor &tensor) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&tensor, 1, DataType::F32); - switch(tensor.info()->data_type()) + switch (tensor.info()->data_type()) { case DataType::F32: access_predictions_tensor(tensor); @@ -586,7 +597,9 @@ bool DetectionOutputAccessor::access_tensor(ITensor &tensor) return false; } -TopNPredictionsAccessor::TopNPredictionsAccessor(const std::string &labels_path, size_t top_n, std::ostream &output_stream) +TopNPredictionsAccessor::TopNPredictionsAccessor(const std::string &labels_path, + size_t top_n, + std::ostream &output_stream) : _labels(), _output_stream(output_stream), _top_n(top_n) { _labels.clear(); @@ -598,12 +611,12 @@ TopNPredictionsAccessor::TopNPredictionsAccessor(const std::string &labels_path, ifs.exceptions(std::ifstream::badbit); ifs.open(labels_path, std::ios::in | std::ios::binary); - for(std::string line; !std::getline(ifs, line).fail();) + for (std::string line; !std::getline(ifs, line).fail();) { _labels.emplace_back(line); } } - catch(const std::ifstream::failure &e) + catch (const std::ifstream::failure &e) { ARM_COMPUTE_ERROR_VAR("Accessing %s: %s", labels_path.c_str(), e.what()); } @@ -627,18 +640,13 @@ void TopNPredictionsAccessor::access_predictions_tensor(ITensor &tensor) // Sort results std::iota(std::begin(index), std::end(index), static_cast(0)); std::sort(std::begin(index), std::end(index), - [&](size_t a, size_t b) - { - return classes_prob[a] > classes_prob[b]; - }); + [&](size_t a, size_t b) { return classes_prob[a] > classes_prob[b]; }); - _output_stream << "---------- Top " << _top_n << " predictions ----------" << std::endl - << std::endl; - for(size_t i = 0; i < _top_n; ++i) + _output_stream << "---------- Top " << _top_n << " predictions ----------" << std::endl << std::endl; + for (size_t i = 0; i < _top_n; ++i) { - _output_stream << std::fixed << std::setprecision(4) - << +classes_prob[index.at(i)] - << " - [id = " << index.at(i) << "]" + _output_stream << std::fixed << std::setprecision(4) << +classes_prob[index.at(i)] << " - [id = " << index.at(i) + << "]" << ", " << _labels[index.at(i)] << std::endl; } } @@ -648,7 +656,7 @@ bool TopNPredictionsAccessor::access_tensor(ITensor &tensor) ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&tensor, 1, DataType::F32, DataType::QASYMM8); ARM_COMPUTE_ERROR_ON(_labels.size() != tensor.info()->dimension(0)); - 
switch(tensor.info()->data_type()) + switch (tensor.info()->data_type()) { case DataType::QASYMM8: access_predictions_tensor(tensor); @@ -673,9 +681,9 @@ void RandomAccessor::fill(ITensor &tensor, D &&distribution) { std::mt19937 gen(_seed); - if(tensor.info()->padding().empty() && (dynamic_cast(&tensor) == nullptr)) + if (tensor.info()->padding().empty() && (dynamic_cast(&tensor) == nullptr)) { - for(size_t offset = 0; offset < tensor.info()->total_size(); offset += tensor.info()->element_size()) + for (size_t offset = 0; offset < tensor.info()->total_size(); offset += tensor.info()->element_size()) { const auto value = static_cast(distribution(gen)); *reinterpret_cast(tensor.buffer() + offset) = value; @@ -687,17 +695,18 @@ void RandomAccessor::fill(ITensor &tensor, D &&distribution) Window window; window.use_tensor_dimensions(tensor.info()->tensor_shape()); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto value = static_cast(distribution(gen)); - *reinterpret_cast(tensor.ptr_to_element(id)) = value; - }); + execute_window_loop(window, + [&](const Coordinates &id) + { + const auto value = static_cast(distribution(gen)); + *reinterpret_cast(tensor.ptr_to_element(id)) = value; + }); } } bool RandomAccessor::access_tensor(ITensor &tensor) { - switch(tensor.info()->data_type()) + switch (tensor.info()->data_type()) { case DataType::QASYMM8: case DataType::U8: @@ -750,7 +759,8 @@ bool RandomAccessor::access_tensor(ITensor &tensor) } case DataType::F16: { - arm_compute::utils::uniform_real_distribution_16bit distribution_f16(_lower.get(), _upper.get()); + arm_compute::utils::uniform_real_distribution_16bit distribution_f16(_lower.get(), + _upper.get()); fill(tensor, distribution_f16); break; } @@ -779,7 +789,7 @@ NumPyBinLoader::NumPyBinLoader(std::string filename, DataLayout file_layout) bool NumPyBinLoader::access_tensor(ITensor &tensor) { - if(!_already_loaded) + if (!_already_loaded) { utils::NPYLoader loader; loader.open(_filename, _file_layout); diff --git a/utils/GraphUtils.h b/utils/GraphUtils.h index 80055acc0f66f6e4623e08a4870b4f46206f0252..b48300bd01e5d072c11b8566c5e5bb06da57c277 100644 --- a/utils/GraphUtils.h +++ b/utils/GraphUtils.h @@ -66,7 +66,7 @@ public: * @param[in] bgr Boolean specifying if the preprocessing should assume BGR format * @param[in] scale Scale value */ - CaffePreproccessor(std::array mean = std::array { { 0, 0, 0 } }, bool bgr = true, float scale = 1.f); + CaffePreproccessor(std::array mean = std::array{{0, 0, 0}}, bool bgr = true, float scale = 1.f); void preprocess(ITensor &tensor) override; private: @@ -74,8 +74,8 @@ private: void preprocess_typed(ITensor &tensor); std::array _mean; - bool _bgr; - float _scale; + bool _bgr; + float _scale; }; /** TF preproccessor */ @@ -155,7 +155,11 @@ public: * @param[in] data_layout (Optional) DataLayout of the numpy tensor data. * @param[out] output_stream (Optional) Output stream */ - NumPyAccessor(std::string npy_path, TensorShape shape, DataType data_type, DataLayout data_layout = DataLayout::NCHW, std::ostream &output_stream = std::cout); + NumPyAccessor(std::string npy_path, + TensorShape shape, + DataType data_type, + DataLayout data_layout = DataLayout::NCHW, + std::ostream &output_stream = std::cout); /** Allow instances of this class to be move constructed */ NumPyAccessor(NumPyAccessor &&) = default; /** Prevent instances of this class from being copied (As this class contains pointers) */ @@ -353,7 +357,9 @@ public: * @param[in] imgs_tensor_shapes Network input images tensor shapes. 
* @param[out] output_stream (Optional) Output stream */ - DetectionOutputAccessor(const std::string &labels_path, std::vector &imgs_tensor_shapes, std::ostream &output_stream = std::cout); + DetectionOutputAccessor(const std::string &labels_path, + std::vector &imgs_tensor_shapes, + std::ostream &output_stream = std::cout); /** Allow instances of this class to be move constructed */ DetectionOutputAccessor(DetectionOutputAccessor &&) = default; /** Prevent instances of this class from being copied (As this class contains pointers) */ @@ -422,7 +428,7 @@ public: private: template - void fill(ITensor &tensor, D &&distribution); + void fill(ITensor &tensor, D &&distribution); PixelValue _lower; PixelValue _upper; std::random_device::result_type _seed; @@ -458,7 +464,8 @@ private: * * @return A ramdom accessor */ -inline std::unique_ptr get_random_accessor(PixelValue lower, PixelValue upper, const std::random_device::result_type seed = 0) +inline std::unique_ptr +get_random_accessor(PixelValue lower, PixelValue upper, const std::random_device::result_type seed = 0) { return std::make_unique(lower, upper, seed); } @@ -473,11 +480,10 @@ inline std::unique_ptr get_random_accessor(PixelValue lo * * @return An appropriate tensor accessor */ -inline std::unique_ptr get_weights_accessor(const std::string &path, - const std::string &data_file, - DataLayout file_layout = DataLayout::NCHW) +inline std::unique_ptr +get_weights_accessor(const std::string &path, const std::string &data_file, DataLayout file_layout = DataLayout::NCHW) { - if(path.empty()) + if (path.empty()) { return std::make_unique(); } @@ -495,30 +501,28 @@ inline std::unique_ptr get_weights_accessor(const std::s * * @return An appropriate tensor accessor */ -inline std::unique_ptr get_input_accessor(const arm_compute::utils::CommonGraphParams &graph_parameters, - std::unique_ptr preprocessor = nullptr, - bool bgr = true) +inline std::unique_ptr +get_input_accessor(const arm_compute::utils::CommonGraphParams &graph_parameters, + std::unique_ptr preprocessor = nullptr, + bool bgr = true) { - if(!graph_parameters.validation_file.empty()) + if (!graph_parameters.validation_file.empty()) { - return std::make_unique(graph_parameters.validation_file, - graph_parameters.validation_path, - std::move(preprocessor), - bgr, - graph_parameters.validation_range_start, - graph_parameters.validation_range_end); + return std::make_unique( + graph_parameters.validation_file, graph_parameters.validation_path, std::move(preprocessor), bgr, + graph_parameters.validation_range_start, graph_parameters.validation_range_end); } else { const std::string &image_file = graph_parameters.image; const std::string &image_file_lower = lower_string(image_file); - if(arm_compute::utility::endswith(image_file_lower, ".npy")) + if (arm_compute::utility::endswith(image_file_lower, ".npy")) { return std::make_unique(image_file, graph_parameters.data_layout); } - else if(arm_compute::utility::endswith(image_file_lower, ".jpeg") - || arm_compute::utility::endswith(image_file_lower, ".jpg") - || arm_compute::utility::endswith(image_file_lower, ".ppm")) + else if (arm_compute::utility::endswith(image_file_lower, ".jpeg") || + arm_compute::utility::endswith(image_file_lower, ".jpg") || + arm_compute::utility::endswith(image_file_lower, ".ppm")) { return std::make_unique(image_file, bgr, std::move(preprocessor)); } @@ -541,20 +545,20 @@ inline std::unique_ptr get_input_accessor(const arm_comp * * @return An appropriate tensor accessor */ -inline std::unique_ptr 
get_output_accessor(const arm_compute::utils::CommonGraphParams &graph_parameters, - size_t top_n = 5, - bool is_validation = false, - std::ostream &output_stream = std::cout) +inline std::unique_ptr +get_output_accessor(const arm_compute::utils::CommonGraphParams &graph_parameters, + size_t top_n = 5, + bool is_validation = false, + std::ostream &output_stream = std::cout) { ARM_COMPUTE_UNUSED(is_validation); - if(!graph_parameters.validation_file.empty()) + if (!graph_parameters.validation_file.empty()) { - return std::make_unique(graph_parameters.validation_file, - output_stream, + return std::make_unique(graph_parameters.validation_file, output_stream, graph_parameters.validation_range_start, graph_parameters.validation_range_end); } - else if(graph_parameters.labels.empty()) + else if (graph_parameters.labels.empty()) { return std::make_unique(0); } @@ -575,20 +579,20 @@ inline std::unique_ptr get_output_accessor(const arm_com * * @return An appropriate tensor accessor */ -inline std::unique_ptr get_detection_output_accessor(const arm_compute::utils::CommonGraphParams &graph_parameters, - std::vector tensor_shapes, - bool is_validation = false, - std::ostream &output_stream = std::cout) +inline std::unique_ptr +get_detection_output_accessor(const arm_compute::utils::CommonGraphParams &graph_parameters, + std::vector tensor_shapes, + bool is_validation = false, + std::ostream &output_stream = std::cout) { ARM_COMPUTE_UNUSED(is_validation); - if(!graph_parameters.validation_file.empty()) + if (!graph_parameters.validation_file.empty()) { - return std::make_unique(graph_parameters.validation_file, - output_stream, + return std::make_unique(graph_parameters.validation_file, output_stream, graph_parameters.validation_range_start, graph_parameters.validation_range_end); } - else if(graph_parameters.labels.empty()) + else if (graph_parameters.labels.empty()) { return std::make_unique(0); } @@ -609,10 +613,13 @@ inline std::unique_ptr get_detection_output_accessor(con * * @return An appropriate tensor accessor */ -inline std::unique_ptr get_npy_output_accessor(const std::string &npy_path, TensorShape shape, DataType data_type, DataLayout data_layout = DataLayout::NCHW, +inline std::unique_ptr get_npy_output_accessor(const std::string &npy_path, + TensorShape shape, + DataType data_type, + DataLayout data_layout = DataLayout::NCHW, std::ostream &output_stream = std::cout) { - if(npy_path.empty()) + if (npy_path.empty()) { return std::make_unique(0); } @@ -631,9 +638,10 @@ inline std::unique_ptr get_npy_output_accessor(const std * * @return An appropriate tensor accessor */ -inline std::unique_ptr get_save_npy_output_accessor(const std::string &npy_name, const bool is_fortran = false) +inline std::unique_ptr get_save_npy_output_accessor(const std::string &npy_name, + const bool is_fortran = false) { - if(npy_name.empty()) + if (npy_name.empty()) { return std::make_unique(0); } @@ -664,9 +672,11 @@ inline std::unique_ptr get_print_output_accessor(std::os */ inline TensorShape permute_shape(TensorShape tensor_shape, DataLayout in_data_layout, DataLayout out_data_layout) { - if(in_data_layout != out_data_layout) + if (in_data_layout != out_data_layout) { - arm_compute::PermutationVector perm_vec = (in_data_layout == DataLayout::NCHW) ? arm_compute::PermutationVector(2U, 0U, 1U) : arm_compute::PermutationVector(1U, 2U, 0U); + arm_compute::PermutationVector perm_vec = (in_data_layout == DataLayout::NCHW) + ? 
arm_compute::PermutationVector(2U, 0U, 1U) + : arm_compute::PermutationVector(1U, 2U, 0U); arm_compute::permute(tensor_shape, perm_vec); } return tensor_shape; @@ -681,7 +691,7 @@ inline TensorShape permute_shape(TensorShape tensor_shape, DataLayout in_data_la inline graph::Target set_target_hint(int target) { ARM_COMPUTE_ERROR_ON_MSG(target > 2, "Invalid target. Target must be 0 (NEON), 1 (OpenCL), 2 (OpenCL + Tuner)"); - if((target == 1 || target == 2)) + if ((target == 1 || target == 2)) { return graph::Target::CL; } diff --git a/utils/ImageLoader.h b/utils/ImageLoader.h index aab0f5e770e38c86983f0603c91a14111d86de5f..2ae1a416e2df747af84de928039d084463223ac0 100644 --- a/utils/ImageLoader.h +++ b/utils/ImageLoader.h @@ -68,8 +68,7 @@ public: * * @param[in] fs Image file stream */ - FileImageFeeder(std::ifstream &fs) - : _fs(fs) + FileImageFeeder(std::ifstream &fs) : _fs(fs) { } // Inherited overridden methods @@ -94,8 +93,7 @@ public: * * @param[in] data Pointer to data */ - MemoryImageFeeder(const uint8_t *data) - : _data(data) + MemoryImageFeeder(const uint8_t *data) : _data(data) { } /** Prevent instances of this class from being copied (As this class contains pointers) */ @@ -127,8 +125,7 @@ class IImageLoader { public: /** Default Constructor */ - IImageLoader() - : _feeder(nullptr), _width(0), _height(0) + IImageLoader() : _feeder(nullptr), _width(0), _height(0) { } /** Virtual base destructor */ @@ -188,7 +185,7 @@ public: // Validate feeding data validate_info(image.info()); - switch(image.info()->format()) + switch (image.info()->format()) { case Format::U8: { @@ -204,15 +201,17 @@ public: unsigned char green = 0; unsigned char blue = 0; - execute_window_loop(window, [&](const Coordinates &) - { - red = _feeder->get(); - green = _feeder->get(); - blue = _feeder->get(); + execute_window_loop( + window, + [&](const Coordinates &) + { + red = _feeder->get(); + green = _feeder->get(); + blue = _feeder->get(); - *out.ptr() = 0.2126f * red + 0.7152f * green + 0.0722f * blue; - }, - out); + *out.ptr() = 0.2126f * red + 0.7152f * green + 0.0722f * blue; + }, + out); break; } @@ -226,11 +225,8 @@ public: Iterator out(&image, window); size_t row_size = _width * image.info()->element_size(); - execute_window_loop(window, [&](const Coordinates &) - { - _feeder->get_row(out.ptr(), row_size); - }, - out); + execute_window_loop( + window, [&](const Coordinates &) { _feeder->get_row(out.ptr(), row_size); }, out); break; } @@ -241,7 +237,7 @@ public: // Unmap buffer if creating a CLTensor unmap(image); } - catch(const std::ifstream::failure &e) + catch (const std::ifstream::failure &e) { ARM_COMPUTE_ERROR_VAR("Loading image file: %s", e.what()); } @@ -257,15 +253,19 @@ public: void fill_planar_tensor(T &tensor, bool bgr = false) { ARM_COMPUTE_ERROR_ON(!is_open()); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&tensor, 1, DataType::U8, DataType::QASYMM8, DataType::F32, DataType::F16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&tensor, 1, DataType::U8, DataType::QASYMM8, DataType::F32, + DataType::F16); const DataLayout data_layout = tensor.info()->data_layout(); const TensorShape tensor_shape = tensor.info()->tensor_shape(); ARM_COMPUTE_UNUSED(tensor_shape); - ARM_COMPUTE_ERROR_ON(tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)] != _width); - ARM_COMPUTE_ERROR_ON(tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT)] != _height); - ARM_COMPUTE_ERROR_ON(tensor_shape[get_data_layout_dimension_index(data_layout, 
DataLayoutDimension::CHANNEL)] != 3); + ARM_COMPUTE_ERROR_ON(tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)] != + _width); + ARM_COMPUTE_ERROR_ON(tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT)] != + _height); + ARM_COMPUTE_ERROR_ON(tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)] != + 3); ARM_COMPUTE_ERROR_ON(_feeder.get() == nullptr); @@ -282,7 +282,7 @@ public: // Iterate through every pixel of the image Window window; - if(data_layout == DataLayout::NCHW) + if (data_layout == DataLayout::NCHW) { window.set(Window::DimX, Window::Dimension(0, _width, 1)); window.set(Window::DimY, Window::Dimension(0, _height, 1)); @@ -303,48 +303,50 @@ public: unsigned char green = 0; unsigned char blue = 0; - execute_window_loop(window, [&](const Coordinates &) - { - red = _feeder->get(); - green = _feeder->get(); - blue = _feeder->get(); - - switch(tensor.info()->data_type()) + execute_window_loop( + window, + [&](const Coordinates &) { - case DataType::U8: - case DataType::QASYMM8: - { - *(out.ptr() + 0 * stride_z) = bgr ? blue : red; - *(out.ptr() + 1 * stride_z) = green; - *(out.ptr() + 2 * stride_z) = bgr ? red : blue; - break; - } - case DataType::F32: - { - *reinterpret_cast(out.ptr() + 0 * stride_z) = static_cast(bgr ? blue : red); - *reinterpret_cast(out.ptr() + 1 * stride_z) = static_cast(green); - *reinterpret_cast(out.ptr() + 2 * stride_z) = static_cast(bgr ? red : blue); - break; - } - case DataType::F16: - { - *reinterpret_cast(out.ptr() + 0 * stride_z) = static_cast(bgr ? blue : red); - *reinterpret_cast(out.ptr() + 1 * stride_z) = static_cast(green); - *reinterpret_cast(out.ptr() + 2 * stride_z) = static_cast(bgr ? red : blue); - break; - } - default: + red = _feeder->get(); + green = _feeder->get(); + blue = _feeder->get(); + + switch (tensor.info()->data_type()) { - ARM_COMPUTE_ERROR("Unsupported data type"); + case DataType::U8: + case DataType::QASYMM8: + { + *(out.ptr() + 0 * stride_z) = bgr ? blue : red; + *(out.ptr() + 1 * stride_z) = green; + *(out.ptr() + 2 * stride_z) = bgr ? red : blue; + break; + } + case DataType::F32: + { + *reinterpret_cast(out.ptr() + 0 * stride_z) = static_cast(bgr ? blue : red); + *reinterpret_cast(out.ptr() + 1 * stride_z) = static_cast(green); + *reinterpret_cast(out.ptr() + 2 * stride_z) = static_cast(bgr ? red : blue); + break; + } + case DataType::F16: + { + *reinterpret_cast(out.ptr() + 0 * stride_z) = static_cast(bgr ? blue : red); + *reinterpret_cast(out.ptr() + 1 * stride_z) = static_cast(green); + *reinterpret_cast(out.ptr() + 2 * stride_z) = static_cast(bgr ? 
red : blue); + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported data type"); + } } - } - }, - out); + }, + out); // Unmap buffer if creating a CLTensor unmap(tensor); } - catch(const std::ifstream::failure &e) + catch (const std::ifstream::failure &e) { ARM_COMPUTE_ERROR_VAR("Loading image file: %s", e.what()); } @@ -368,8 +370,7 @@ class PPMLoader : public IImageLoader { public: /** Default Constructor */ - PPMLoader() - : IImageLoader(), _fs() + PPMLoader() : IImageLoader(), _fs() { } @@ -386,7 +387,7 @@ public: _fs.exceptions(std::ifstream::failbit | std::ifstream::badbit); _fs.open(filename, std::ios::in | std::ios::binary); - unsigned int max_val = 0; + unsigned int max_val = 0; std::tie(_width, _height, max_val) = parse_ppm_header(_fs); ARM_COMPUTE_ERROR_ON_MSG_VAR(max_val >= 256, "2 bytes per colour channel not supported in file %s", @@ -394,14 +395,14 @@ public: _feeder = std::make_unique(_fs); } - catch(std::runtime_error &e) + catch (std::runtime_error &e) { ARM_COMPUTE_ERROR_VAR("Accessing %s: %s", filename.c_str(), e.what()); } } void close() override { - if(is_open()) + if (is_open()) { _fs.close(); _feeder = nullptr; @@ -443,8 +444,7 @@ private: public: /** Default Constructor */ - JPEGLoader() - : IImageLoader(), _is_loaded(false), _data(nullptr) + JPEGLoader() : IImageLoader(), _is_loaded(false), _data(nullptr) { } @@ -457,7 +457,7 @@ public: { int bpp, width, height; uint8_t *rgb_image = stbi_load(filename.c_str(), &width, &height, &bpp, 3); - if(rgb_image == NULL) + if (rgb_image == NULL) { ARM_COMPUTE_ERROR_VAR("Accessing %s failed", filename.c_str()); } @@ -472,7 +472,7 @@ public: } void close() override { - if(is_open()) + if (is_open()) { _width = 0; _height = 0; @@ -483,7 +483,7 @@ public: /** Explicitly Releases the memory of the loaded data */ void release() { - if(_is_loaded) + if (_is_loaded) { _data.reset(); _is_loaded = false; @@ -492,7 +492,7 @@ public: } private: - bool _is_loaded; + bool _is_loaded; std::unique_ptr _data; }; @@ -509,7 +509,7 @@ public: static std::unique_ptr create(const std::string &filename) { ImageType type = arm_compute::utils::get_image_type_from_file(filename); - switch(type) + switch (type) { case ImageType::PPM: return std::make_unique(); diff --git a/utils/TypePrinter.h b/utils/TypePrinter.h index 4bc326b574978da6faae956a54960fb65d97ff78..4f14d985aff740878d43b2a45f14ef3ea3c26d44 100644 --- a/utils/TypePrinter.h +++ b/utils/TypePrinter.h @@ -21,8 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef __ARM_COMPUTE_TYPE_PRINTER_H__ -#define __ARM_COMPUTE_TYPE_PRINTER_H__ + +#ifndef ACL_UTILS_TYPEPRINTER_H +#define ACL_UTILS_TYPEPRINTER_H #ifdef ARM_COMPUTE_OPENCL_ENABLED #include "arm_compute/core/CL/ICLTensor.h" @@ -36,8 +37,6 @@ #include "arm_compute/core/Strides.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/experimental/IPostOp.h" -#include "arm_compute/core/experimental/PostOps.h" #include "arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h" #include "arm_compute/dynamic_fusion/sketch/attributes/ClampAttributes.h" #include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h" @@ -52,11 +51,13 @@ #include "arm_compute/function_info/MatMulInfo.h" #include "arm_compute/runtime/CL/CLTunerTypes.h" #include "arm_compute/runtime/CL/CLTypes.h" +#include "arm_compute/runtime/common/LSTMParams.h" #include "arm_compute/runtime/FunctionDescriptors.h" #include "arm_compute/runtime/NEON/functions/NEMatMul.h" -#include "arm_compute/runtime/common/LSTMParams.h" + #include "support/Cast.h" #include "support/StringSupport.h" + #include #include #include @@ -72,7 +73,7 @@ namespace arm_compute template std::string to_string_if_not_null(T *arg) { - if(arg == nullptr) + if (arg == nullptr) { return "nullptr"; } @@ -112,13 +113,13 @@ template os << "["; bool first = true; size_t i; - for(i = 0; i < args.size(); ++i) + for (i = 0; i < args.size(); ++i) { - if(i == max_print_size) + if (i == max_print_size) { break; } - if(first) + if (first) { first = false; } @@ -128,7 +129,7 @@ template } os << to_string(args[i]); } - if(i < args.size()) + if (i < args.size()) { os << ", ..."; } @@ -150,144 +151,6 @@ std::string to_string(const std::vector &args) return str.str(); } -/** @name (EXPERIMENTAL_POST_OPS) - * @{ - */ -/** Formmated output of the @ref experimental::PostOpType type - * - * @param[out] os Output stream. - * @param[in] post_op_type Type to output. - * - * @return Modified output stream. - */ -inline ::std::ostream &operator<<(::std::ostream &os, experimental::PostOpType post_op_type) -{ - os << "type="; - switch(post_op_type) - { - case experimental::PostOpType::Activation: - { - os << "Activation"; - break; - } - case experimental::PostOpType::Eltwise_Add: - { - os << "Eltwise_Add"; - break; - } - case experimental::PostOpType::Eltwise_PRelu: - { - os << "Eltwise_PRelu"; - break; - } - default: - { - ARM_COMPUTE_ERROR("Unsupported PostOpType"); - break; - } - } - return os; -} -/** Converts a @ref experimental::PostOpType to string - * - * @param[in] post_op_type PostOpType value to be converted - * - * @return String representing the corresponding PostOpType - */ -inline std::string to_string(experimental::PostOpType post_op_type) -{ - std::stringstream str; - str << post_op_type; - return str.str(); -} -/** Formatted output of the @ref experimental::IPostOp type. - * - * @param[out] os Output stream. - * @param[in] post_op Type to output. - * - * @return Modified output stream. 
- */ -template -inline ::std::ostream &operator<<(::std::ostream &os, const experimental::IPostOp &post_op) -{ - os << "<"; - os << post_op.type() << ","; - os << "prev_dst_pos=" << post_op.prev_dst_pos() << ","; - switch(post_op.type()) - { - case experimental::PostOpType::Activation: - { - const auto _post_op = utils::cast::polymorphic_downcast *>(&post_op); - os << "act_info=" << &(_post_op->_act_info); - break; - } - case experimental::PostOpType::Eltwise_Add: - { - const auto _post_op = utils::cast::polymorphic_downcast *>(&post_op); - os << "convert_policy=" << _post_op->_policy; - break; - } - case experimental::PostOpType::Eltwise_PRelu: - { - const auto _post_op = utils::cast::polymorphic_downcast *>(&post_op); - os << "convert_policy=" << _post_op->_policy; - break; - } - default: - { - ARM_COMPUTE_ERROR("Unsupported PostOpType"); - break; - } - } - os << ">"; - return os; -} -/** Converts an @ref experimental::IPostOp to string - * - * @param[in] post_op IPostOp value to be converted - * - * @return String representing the corresponding IPostOp - */ -template -inline std::string to_string(const experimental::IPostOp &post_op) -{ - std::stringstream str; - str << post_op; - return str.str(); -} -/** Formatted output of the @ref experimental::PostOpList type. - * - * @param[out] os Output stream. - * @param[in] post_ops Type to output. - * - * @return Modified output stream. - */ -template -inline ::std::ostream &operator<<(::std::ostream &os, const experimental::PostOpList &post_ops) -{ - os << "["; - for(const auto &post_op : post_ops.get_list()) - { - os << *post_op << ","; - } - os << "]"; - return os; -} -/** Converts a @ref experimental::PostOpList to string - * - * @param[in] post_ops PostOpList value to be converted - * - * @return String representing the corresponding PostOpList - */ -template -inline std::string to_string(const experimental::PostOpList &post_ops) -{ - std::stringstream str; - str << post_ops; - return str.str(); -} -/** @} */ // end of group (EXPERIMENTAL_POST_OPS) - /** Formatted output of the Dimensions type. * * @param[out] os Output stream. 
@@ -298,11 +161,11 @@ inline std::string to_string(const experimental::PostOpList &post_ops) template inline ::std::ostream &operator<<(::std::ostream &os, const Dimensions &dimensions) { - if(dimensions.num_dimensions() > 0) + if (dimensions.num_dimensions() > 0) { os << dimensions[0]; - for(unsigned int d = 1; d < dimensions.num_dimensions(); ++d) + for (unsigned int d = 1; d < dimensions.num_dimensions(); ++d) { os << "," << dimensions[d]; } @@ -320,7 +183,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const Dimensions &dimen */ inline ::std::ostream &operator<<(::std::ostream &os, const RoundingPolicy &rounding_policy) { - switch(rounding_policy) + switch (rounding_policy) { case RoundingPolicy::TO_ZERO: os << "TO_ZERO"; @@ -348,7 +211,8 @@ inline ::std::ostream &operator<<(::std::ostream &os, const RoundingPolicy &roun inline ::std::ostream &operator<<(::std::ostream &os, const WeightsInfo &weights_info) { os << weights_info.are_reshaped() << ";"; - os << weights_info.num_kernels() << ";" << weights_info.kernel_size().first << "," << weights_info.kernel_size().second; + os << weights_info.num_kernels() << ";" << weights_info.kernel_size().first << "," + << weights_info.kernel_size().second; return os; } @@ -399,7 +263,6 @@ inline ::std::ostream &operator<<(::std::ostream &os, const GEMMKernelInfo &gemm os << " mult_interleave4x4_height=" << gemm_info.mult_interleave4x4_height; os << " a_offset=" << gemm_info.a_offset; os << " b_offset=" << gemm_info.b_offset; - os << "post_ops=" << gemm_info.post_ops; os << ")"; return os; } @@ -413,7 +276,8 @@ inline ::std::ostream &operator<<(::std::ostream &os, const GEMMKernelInfo &gemm */ inline ::std::ostream &operator<<(::std::ostream &os, const GEMMLHSMatrixInfo &gemm_info) { - os << "( m0=" << (unsigned int)gemm_info.m0 << " k0=" << gemm_info.k0 << " v0=" << gemm_info.v0 << " trans=" << gemm_info.transpose << " inter=" << gemm_info.interleave << "})"; + os << "( m0=" << (unsigned int)gemm_info.m0 << " k0=" << gemm_info.k0 << " v0=" << gemm_info.v0 + << " trans=" << gemm_info.transpose << " inter=" << gemm_info.interleave << "})"; return os; } @@ -426,8 +290,9 @@ inline ::std::ostream &operator<<(::std::ostream &os, const GEMMLHSMatrixInfo &g */ inline ::std::ostream &operator<<(::std::ostream &os, const GEMMRHSMatrixInfo &gemm_info) { - os << "( n0=" << (unsigned int)gemm_info.n0 << " k0=" << gemm_info.k0 << " h0=" << gemm_info.h0 << " trans=" << gemm_info.transpose << " inter=" << gemm_info.interleave << " exp_img=" << - gemm_info.export_to_cl_image << "})"; + os << "( n0=" << (unsigned int)gemm_info.n0 << " k0=" << gemm_info.k0 << " h0=" << gemm_info.h0 + << " trans=" << gemm_info.transpose << " inter=" << gemm_info.interleave + << " exp_img=" << gemm_info.export_to_cl_image << "})"; return os; } @@ -480,8 +345,8 @@ inline std::string to_string(const GEMMKernelInfo &gemm_info) inline ::std::ostream &operator<<(::std::ostream &os, const BoundingBoxTransformInfo &bbox_info) { auto weights = bbox_info.weights(); - os << "(" << bbox_info.img_width() << "x" << bbox_info.img_height() << ")~" << bbox_info.scale() << "(weights={" << weights[0] << ", " << weights[1] << ", " << weights[2] << ", " << weights[3] << - "})"; + os << "(" << bbox_info.img_width() << "x" << bbox_info.img_height() << ")~" << bbox_info.scale() << "(weights={" + << weights[0] << ", " << weights[1] << ", " << weights[2] << ", " << weights[3] << "})"; return os; } @@ -594,7 +459,7 @@ inline std::string to_string(const QuantizationInfo &quantization_info) */ inline 
::std::ostream &operator<<(::std::ostream &os, const ActivationLayerInfo::ActivationFunction &act_function) { - switch(act_function) + switch (act_function) { case ActivationLayerInfo::ActivationFunction::ABS: os << "ABS"; @@ -661,7 +526,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const ActivationLayerInfo: inline std::string to_string(const arm_compute::ActivationLayerInfo &info) { std::stringstream str; - if(info.enabled()) + if (info.enabled()) { str << info.activation(); } @@ -677,9 +542,9 @@ inline std::string to_string(const arm_compute::ActivationLayerInfo &info) */ inline ::std::ostream &operator<<(::std::ostream &os, const ActivationLayerInfo *info) { - if(info != nullptr) + if (info != nullptr) { - if(info->enabled()) + if (info->enabled()) { os << info->activation(); os << "("; @@ -721,7 +586,7 @@ inline std::string to_string(const arm_compute::ActivationLayerInfo::ActivationF */ inline ::std::ostream &operator<<(::std::ostream &os, const NormType &norm_type) { - switch(norm_type) + switch (norm_type) { case NormType::CROSS_MAP: os << "CROSS_MAP"; @@ -774,7 +639,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const NormalizationLayerIn */ inline ::std::ostream &operator<<(::std::ostream &os, const PoolingType &pool_type) { - switch(pool_type) + switch (pool_type) { case PoolingType::AVG: os << "AVG"; @@ -829,7 +694,7 @@ inline std::string to_string(const RoundingPolicy &rounding_policy) */ inline ::std::ostream &operator<<(::std::ostream &os, const DataLayout &data_layout) { - switch(data_layout) + switch (data_layout) { case DataLayout::UNKNOWN: os << "UNKNOWN"; @@ -876,7 +741,7 @@ inline std::string to_string(const arm_compute::DataLayout &data_layout) */ inline ::std::ostream &operator<<(::std::ostream &os, const DataLayoutDimension &data_layout_dim) { - switch(data_layout_dim) + switch (data_layout_dim) { case DataLayoutDimension::WIDTH: os << "WIDTH"; @@ -908,7 +773,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const DataLayoutDimension */ inline ::std::ostream &operator<<(::std::ostream &os, const DataType &data_type) { - switch(data_type) + switch (data_type) { case DataType::UNKNOWN: os << "UNKNOWN"; @@ -999,7 +864,7 @@ inline std::string to_string(const arm_compute::DataType &data_type) */ inline ::std::ostream &operator<<(::std::ostream &os, const Format &format) { - switch(format) + switch (format) { case Format::UNKNOWN: os << "UNKNOWN"; @@ -1081,7 +946,7 @@ inline std::string to_string(const Format &format) */ inline ::std::ostream &operator<<(::std::ostream &os, const Channel &channel) { - switch(channel) + switch (channel) { case Channel::UNKNOWN: os << "UNKNOWN"; @@ -1148,7 +1013,7 @@ inline std::string to_string(const Channel &channel) */ inline ::std::ostream &operator<<(::std::ostream &os, const BorderMode &mode) { - switch(mode) + switch (mode) { case BorderMode::UNDEFINED: os << "UNDEFINED"; @@ -1175,10 +1040,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const BorderMode &mode) */ inline ::std::ostream &operator<<(::std::ostream &os, const BorderSize &border) { - os << border.top << "," - << border.right << "," - << border.bottom << "," - << border.left; + os << border.top << "," << border.right << "," << border.bottom << "," << border.left; return os; } @@ -1193,7 +1055,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const BorderSize &border) inline ::std::ostream &operator<<(::std::ostream &os, const PaddingList &padding) { os << "{"; - for(auto const &p : padding) + for (auto const &p : 
padding) { os << "{" << p.first << "," << p.second << "}"; } @@ -1211,7 +1073,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const PaddingList &padding inline ::std::ostream &operator<<(::std::ostream &os, const Multiples &multiples) { os << "("; - for(size_t i = 0; i < multiples.size() - 1; i++) + for (size_t i = 0; i < multiples.size() - 1; i++) { os << multiples[i] << ", "; } @@ -1228,7 +1090,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const Multiples &multiples */ inline ::std::ostream &operator<<(::std::ostream &os, const InterpolationPolicy &policy) { - switch(policy) + switch (policy) { case InterpolationPolicy::NEAREST_NEIGHBOR: os << "NEAREST_NEIGHBOR"; @@ -1255,7 +1117,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const InterpolationPolicy */ inline ::std::ostream &operator<<(::std::ostream &os, const SamplingPolicy &policy) { - switch(policy) + switch (policy) { case SamplingPolicy::CENTER: os << "CENTER"; @@ -1286,18 +1148,16 @@ inline ::std::ostream &operator<<(std::ostream &os, const ITensorInfo *info) << "DataLayout=" << string_from_data_layout(data_layout) << "," << "DataType=" << string_from_data_type(data_type); - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { const QuantizationInfo qinfo = info->quantization_info(); const auto scales = qinfo.scale(); const auto offsets = qinfo.offset(); os << ", QuantizationInfo={" - << "scales.size=" << scales.size() - << ", scale(s)=" << scales << ", "; + << "scales.size=" << scales.size() << ", scale(s)=" << scales << ", "; - os << "offsets.size=" << offsets.size() - << ", offset(s)=" << offsets << "}"; + os << "offsets.size=" << offsets.size() << ", offset(s)=" << offsets << "}"; } return os; } @@ -1350,7 +1210,7 @@ inline std::string to_string(const ITensorInfo &info) inline std::string to_string(const ITensorInfo *info) { std::string ret_str = "nullptr"; - if(info != nullptr) + if (info != nullptr) { std::stringstream str; str << info; @@ -1379,7 +1239,7 @@ inline std::string to_string(ITensorInfo *info) inline std::string to_string(const ITensor *tensor) { std::string ret_str = "nullptr"; - if(tensor != nullptr) + if (tensor != nullptr) { std::stringstream str; str << "ITensor->info(): " << tensor->info(); @@ -1422,7 +1282,7 @@ inline std::string to_string(ITensor &tensor) inline std::string to_string(const ICLTensor *cl_tensor) { std::string ret_str = "nullptr"; - if(cl_tensor != nullptr) + if (cl_tensor != nullptr) { std::stringstream str; str << "ICLTensor->info(): " << cl_tensor->info(); @@ -1451,11 +1311,7 @@ inline std::string to_string(ICLTensor *cl_tensor) */ inline ::std::ostream &operator<<(::std::ostream &os, const cl::NDRange &nd_range) { - os << "{" - << nd_range[0] << "," - << nd_range[1] << "," - << nd_range[2] - << "}"; + os << "{" << nd_range[0] << "," << nd_range[1] << "," << nd_range[2] << "}"; return os; } @@ -1563,7 +1419,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const GEMMInfo &info) os << "fp_mixed_precision=" << info.fp_mixed_precision() << ","; os << "broadcast_bias=" << info.broadcast_bias() << ","; os << "pretranspose_B=" << info.pretranspose_B() << ","; - os << "post_ops=" << info.post_ops() << "}"; + os << "}"; return os; } @@ -1591,9 +1447,9 @@ inline ::std::ostream &operator<<(::std::ostream &os, const Window::Dimension &d inline ::std::ostream &operator<<(::std::ostream &os, const Window &win) { os << "{"; - for(unsigned int i = 0; i < Coordinates::num_max_dimensions; i++) + for (unsigned int i = 0; i < 
Coordinates::num_max_dimensions; i++) { - if(i > 0) + if (i > 0) { os << ", "; } @@ -1677,7 +1533,7 @@ inline std::string to_string(const Window &win) inline std::string to_string(Window *win) { std::string ret_str = "nullptr"; - if(win != nullptr) + if (win != nullptr) { std::stringstream str; str << *win; @@ -1710,7 +1566,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const Rectangle &rect) */ inline ::std::ostream &operator<<(::std::ostream &os, const PaddingMode &mode) { - switch(mode) + switch (mode) { case PaddingMode::CONSTANT: os << "CONSTANT"; @@ -1752,8 +1608,8 @@ inline ::std::ostream &operator<<(::std::ostream &os, const PadStrideInfo &pad_s { os << pad_stride_info.stride().first << "," << pad_stride_info.stride().second; os << ";"; - os << pad_stride_info.pad_left() << "," << pad_stride_info.pad_right() << "," - << pad_stride_info.pad_top() << "," << pad_stride_info.pad_bottom(); + os << pad_stride_info.pad_left() << "," << pad_stride_info.pad_right() << "," << pad_stride_info.pad_top() << "," + << pad_stride_info.pad_bottom(); return os; } @@ -1858,7 +1714,7 @@ inline std::string to_string(const SamplingPolicy &policy) */ inline ::std::ostream &operator<<(::std::ostream &os, const ConvertPolicy &policy) { - switch(policy) + switch (policy) { case ConvertPolicy::WRAP: os << "WRAP"; @@ -1889,7 +1745,7 @@ inline std::string to_string(const ConvertPolicy &policy) */ inline ::std::ostream &operator<<(::std::ostream &os, const ArithmeticOperation &op) { - switch(op) + switch (op) { case ArithmeticOperation::ADD: os << "ADD"; @@ -1944,7 +1800,7 @@ inline std::string to_string(const ArithmeticOperation &op) */ inline ::std::ostream &operator<<(::std::ostream &os, const ReductionOperation &op) { - switch(op) + switch (op) { case ReductionOperation::SUM: os << "SUM"; @@ -1999,7 +1855,7 @@ inline std::string to_string(const ReductionOperation &op) */ inline ::std::ostream &operator<<(::std::ostream &os, const ComparisonOperation &op) { - switch(op) + switch (op) { case ComparisonOperation::Equal: os << "Equal"; @@ -2035,7 +1891,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const ComparisonOperation */ inline ::std::ostream &operator<<(::std::ostream &os, const ElementWiseUnary &op) { - switch(op) + switch (op) { case ElementWiseUnary::RSQRT: os << "RSQRT"; @@ -2132,7 +1988,7 @@ inline std::string to_string(const PoolingLayerInfo &info) str << "{Type=" << info.pool_type << "," << "DataLayout=" << info.data_layout << "," << "IsGlobalPooling=" << info.is_global_pooling; - if(!info.is_global_pooling) + if (!info.is_global_pooling) { str << "," << "PoolSize=" << info.pool_size.width << "," << info.pool_size.height << "," @@ -2178,8 +2034,7 @@ inline std::string to_string(const Size3D &type) */ inline ::std::ostream &operator<<(::std::ostream &os, const Padding3D &padding3d) { - os << padding3d.left << "," << padding3d.right << "," - << padding3d.top << "," << padding3d.bottom << "," + os << padding3d.left << "," << padding3d.right << "," << padding3d.top << "," << padding3d.bottom << "," << padding3d.front << "," << padding3d.back; return os; } @@ -2206,7 +2061,7 @@ inline std::string to_string(const Padding3D &padding3d) */ inline ::std::ostream &operator<<(::std::ostream &os, const DimensionRoundingType &rounding_type) { - switch(rounding_type) + switch (rounding_type) { case DimensionRoundingType::CEIL: os << "CEIL"; @@ -2231,7 +2086,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const Pooling3dLayerInfo & { os << "{Type=" << info.pool_type << "," << 
"IsGlobalPooling=" << info.is_global_pooling; - if(!info.is_global_pooling) + if (!info.is_global_pooling) { os << "," << "PoolSize=" << info.pool_size << ", " @@ -2268,16 +2123,10 @@ inline std::string to_string(const PriorBoxLayerInfo &info) { std::stringstream str; str << "{"; - str << "Clip:" << info.clip() - << "Flip:" << info.flip() - << "StepX:" << info.steps()[0] - << "StepY:" << info.steps()[1] - << "MinSizes:" << info.min_sizes().size() - << "MaxSizes:" << info.max_sizes().size() - << "ImgSizeX:" << info.img_size().x - << "ImgSizeY:" << info.img_size().y - << "Offset:" << info.offset() - << "Variances:" << info.variances().size(); + str << "Clip:" << info.clip() << "Flip:" << info.flip() << "StepX:" << info.steps()[0] + << "StepY:" << info.steps()[1] << "MinSizes:" << info.min_sizes().size() + << "MaxSizes:" << info.max_sizes().size() << "ImgSizeX:" << info.img_size().x + << "ImgSizeY:" << info.img_size().y << "Offset:" << info.offset() << "Variances:" << info.variances().size(); str << "}"; return str.str(); } @@ -2318,7 +2167,7 @@ inline std::string to_string(const Size2D &type) */ inline ::std::ostream &operator<<(::std::ostream &os, const ConvolutionMethod &conv_method) { - switch(conv_method) + switch (conv_method) { case ConvolutionMethod::GEMM: os << "GEMM"; @@ -2364,7 +2213,7 @@ inline std::string to_string(const ConvolutionMethod &conv_method) */ inline ::std::ostream &operator<<(::std::ostream &os, const GPUTarget &gpu_target) { - switch(gpu_target) + switch (gpu_target) { case GPUTarget::GPU_ARCH_MASK: os << "GPU_ARCH_MASK"; @@ -2498,7 +2347,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const DetectionWindow &det */ inline ::std::ostream &operator<<(::std::ostream &os, const DetectionOutputLayerCodeType &detection_code) { - switch(detection_code) + switch (detection_code) { case DetectionOutputLayerCodeType::CENTER_SIZE: os << "CENTER_SIZE"; @@ -2550,8 +2399,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const DetectionOutputLayer << "BackgroundLabelId=" << detection_info.background_label_id() << "," << "ConfidenceThreshold=" << detection_info.confidence_threshold() << "," << "TopK=" << detection_info.top_k() << "," - << "NumLocClasses=" << detection_info.num_loc_classes() - << "}"; + << "NumLocClasses=" << detection_info.num_loc_classes() << "}"; return os; } @@ -2587,8 +2435,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const DetectionPostProcess << "ScaleValue_h=" << detection_info.scale_value_h() << "," << "ScaleValue_w=" << detection_info.scale_value_w() << "," << "UseRegularNms=" << detection_info.use_regular_nms() << "," - << "DetectionPerClass=" << detection_info.detection_per_class() - << "}"; + << "DetectionPerClass=" << detection_info.detection_per_class() << "}"; return os; } @@ -2628,16 +2475,9 @@ inline std::string to_string(const DetectionWindow &detection_window) */ inline ::std::ostream &operator<<(::std::ostream &os, const PriorBoxLayerInfo &info) { - os << "Clip:" << info.clip() - << "Flip:" << info.flip() - << "StepX:" << info.steps()[0] - << "StepY:" << info.steps()[1] - << "MinSizes:" << info.min_sizes() - << "MaxSizes:" << info.max_sizes() - << "ImgSizeX:" << info.img_size().x - << "ImgSizeY:" << info.img_size().y - << "Offset:" << info.offset() - << "Variances:" << info.variances(); + os << "Clip:" << info.clip() << "Flip:" << info.flip() << "StepX:" << info.steps()[0] << "StepY:" << info.steps()[1] + << "MinSizes:" << info.min_sizes() << "MaxSizes:" << info.max_sizes() << "ImgSizeX:" << info.img_size().x + 
<< "ImgSizeY:" << info.img_size().y << "Offset:" << info.offset() << "Variances:" << info.variances(); return os; } @@ -2668,7 +2508,7 @@ inline std::string to_string(const WinogradInfo &type) */ inline std::string to_string(const CLTunerMode val) { - switch(val) + switch (val) { case CLTunerMode::EXHAUSTIVE: { @@ -2697,7 +2537,7 @@ inline std::string to_string(const CLTunerMode val) */ inline std::string to_string(CLGEMMKernelType val) { - switch(val) + switch (val) { case CLGEMMKernelType::NATIVE: { @@ -2800,7 +2640,7 @@ inline std::string to_string(const FullyConnectedLayerInfo &info) */ inline ::std::ostream &operator<<(::std::ostream &os, const GEMMLowpOutputStageType &gemm_type) { - switch(gemm_type) + switch (gemm_type) { case GEMMLowpOutputStageType::NONE: os << "NONE"; @@ -2883,7 +2723,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const Conv2dInfo &conv_inf << "act_info=" << to_string(conv_info.act_info) << ", " << "enable_fast_math=" << conv_info.enable_fast_math << ", " << "num_groups=" << conv_info.num_groups << "," - << "post_ops=" << conv_info.post_ops << "}"; + << "}"; return os; } @@ -2967,7 +2807,7 @@ inline std::string to_string(const ScaleKernelInfo &scale_info) */ inline ::std::ostream &operator<<(::std::ostream &os, const FFTDirection &fft_dir) { - switch(fft_dir) + switch (fft_dir) { case FFTDirection::Forward: os << "Forward"; @@ -3085,7 +2925,7 @@ inline std::string to_string(const Coordinates2D &coord_2d) */ inline ::std::ostream &operator<<(::std::ostream &os, const FuseBatchNormalizationType &fuse_type) { - switch(fuse_type) + switch (fuse_type) { case FuseBatchNormalizationType::CONVOLUTION: os << "CONVOLUTION"; @@ -3213,7 +3053,7 @@ inline std::string to_string(const uint8_t num) */ inline ::std::ostream &operator<<(::std::ostream &os, const NMSType &nms_type) { - switch(nms_type) + switch (nms_type) { case NMSType::LINEAR: os << "LINEAR"; @@ -3336,46 +3176,46 @@ inline std::string to_string(const Conv3dInfo &conv3d_info) inline std::string to_string(const WeightFormat wf) { #define __CASE_WEIGHT_FORMAT(wf) \ -case WeightFormat::wf: \ - return #wf; - switch(wf) + case WeightFormat::wf: \ + return #wf; + switch (wf) { - __CASE_WEIGHT_FORMAT(UNSPECIFIED) - __CASE_WEIGHT_FORMAT(ANY) - __CASE_WEIGHT_FORMAT(OHWI) - __CASE_WEIGHT_FORMAT(OHWIo2) - __CASE_WEIGHT_FORMAT(OHWIo4) - __CASE_WEIGHT_FORMAT(OHWIo8) - __CASE_WEIGHT_FORMAT(OHWIo16) - __CASE_WEIGHT_FORMAT(OHWIo32) - __CASE_WEIGHT_FORMAT(OHWIo64) - __CASE_WEIGHT_FORMAT(OHWIo128) - __CASE_WEIGHT_FORMAT(OHWIo4i2) - __CASE_WEIGHT_FORMAT(OHWIo4i2_bf16) - __CASE_WEIGHT_FORMAT(OHWIo8i2) - __CASE_WEIGHT_FORMAT(OHWIo8i2_bf16) - __CASE_WEIGHT_FORMAT(OHWIo16i2) - __CASE_WEIGHT_FORMAT(OHWIo16i2_bf16) - __CASE_WEIGHT_FORMAT(OHWIo32i2) - __CASE_WEIGHT_FORMAT(OHWIo32i2_bf16) - __CASE_WEIGHT_FORMAT(OHWIo64i2) - __CASE_WEIGHT_FORMAT(OHWIo64i2_bf16) - __CASE_WEIGHT_FORMAT(OHWIo4i4) - __CASE_WEIGHT_FORMAT(OHWIo4i4_bf16) - __CASE_WEIGHT_FORMAT(OHWIo8i4) - __CASE_WEIGHT_FORMAT(OHWIo8i4_bf16) - __CASE_WEIGHT_FORMAT(OHWIo16i4) - __CASE_WEIGHT_FORMAT(OHWIo16i4_bf16) - __CASE_WEIGHT_FORMAT(OHWIo32i4) - __CASE_WEIGHT_FORMAT(OHWIo32i4_bf16) - __CASE_WEIGHT_FORMAT(OHWIo64i4) - __CASE_WEIGHT_FORMAT(OHWIo64i4_bf16) - __CASE_WEIGHT_FORMAT(OHWIo2i8) - __CASE_WEIGHT_FORMAT(OHWIo4i8) - __CASE_WEIGHT_FORMAT(OHWIo8i8) - __CASE_WEIGHT_FORMAT(OHWIo16i8) - __CASE_WEIGHT_FORMAT(OHWIo32i8) - __CASE_WEIGHT_FORMAT(OHWIo64i8) + __CASE_WEIGHT_FORMAT(UNSPECIFIED) + __CASE_WEIGHT_FORMAT(ANY) + __CASE_WEIGHT_FORMAT(OHWI) + 
__CASE_WEIGHT_FORMAT(OHWIo2) + __CASE_WEIGHT_FORMAT(OHWIo4) + __CASE_WEIGHT_FORMAT(OHWIo8) + __CASE_WEIGHT_FORMAT(OHWIo16) + __CASE_WEIGHT_FORMAT(OHWIo32) + __CASE_WEIGHT_FORMAT(OHWIo64) + __CASE_WEIGHT_FORMAT(OHWIo128) + __CASE_WEIGHT_FORMAT(OHWIo4i2) + __CASE_WEIGHT_FORMAT(OHWIo4i2_bf16) + __CASE_WEIGHT_FORMAT(OHWIo8i2) + __CASE_WEIGHT_FORMAT(OHWIo8i2_bf16) + __CASE_WEIGHT_FORMAT(OHWIo16i2) + __CASE_WEIGHT_FORMAT(OHWIo16i2_bf16) + __CASE_WEIGHT_FORMAT(OHWIo32i2) + __CASE_WEIGHT_FORMAT(OHWIo32i2_bf16) + __CASE_WEIGHT_FORMAT(OHWIo64i2) + __CASE_WEIGHT_FORMAT(OHWIo64i2_bf16) + __CASE_WEIGHT_FORMAT(OHWIo4i4) + __CASE_WEIGHT_FORMAT(OHWIo4i4_bf16) + __CASE_WEIGHT_FORMAT(OHWIo8i4) + __CASE_WEIGHT_FORMAT(OHWIo8i4_bf16) + __CASE_WEIGHT_FORMAT(OHWIo16i4) + __CASE_WEIGHT_FORMAT(OHWIo16i4_bf16) + __CASE_WEIGHT_FORMAT(OHWIo32i4) + __CASE_WEIGHT_FORMAT(OHWIo32i4_bf16) + __CASE_WEIGHT_FORMAT(OHWIo64i4) + __CASE_WEIGHT_FORMAT(OHWIo64i4_bf16) + __CASE_WEIGHT_FORMAT(OHWIo2i8) + __CASE_WEIGHT_FORMAT(OHWIo4i8) + __CASE_WEIGHT_FORMAT(OHWIo8i8) + __CASE_WEIGHT_FORMAT(OHWIo16i8) + __CASE_WEIGHT_FORMAT(OHWIo32i8) + __CASE_WEIGHT_FORMAT(OHWIo64i8) default: return "invalid value"; } @@ -3422,8 +3262,7 @@ inline std::string to_string(const std::tuple::max(), '\n'); } @@ -71,11 +71,11 @@ void discard_comments(std::ifstream &fs) */ void discard_comments_and_spaces(std::ifstream &fs) { - while(true) + while (true) { discard_comments(fs); - if(isspace(fs.peek()) == 0) + if (isspace(fs.peek()) == 0) { break; } @@ -88,13 +88,12 @@ void discard_comments_and_spaces(std::ifstream &fs) #ifndef BENCHMARK_EXAMPLES int run_example(int argc, char **argv, std::unique_ptr example) { - std::cout << "\n" - << argv[0] << "\n\n"; + std::cout << "\n" << argv[0] << "\n\n"; try { bool status = example->do_setup(argc, argv); - if(!status) + if (!status) { return 1; } @@ -105,19 +104,17 @@ int run_example(int argc, char **argv, std::unique_ptr example) return 0; } #ifdef ARM_COMPUTE_CL - catch(cl::Error &err) + catch (cl::Error &err) { std::cerr << "!!!!!!!!!!!!!!!!!!!!!!!!!!!" << std::endl; - std::cerr << std::endl - << "ERROR " << err.what() << "(" << err.err() << ")" << std::endl; + std::cerr << std::endl << "ERROR " << err.what() << "(" << err.err() << ")" << std::endl; std::cerr << "!!!!!!!!!!!!!!!!!!!!!!!!!!!" << std::endl; } #endif /* ARM_COMPUTE_CL */ - catch(std::runtime_error &err) + catch (std::runtime_error &err) { std::cerr << "!!!!!!!!!!!!!!!!!!!!!!!!!!!" << std::endl; - std::cerr << std::endl - << "ERROR " << err.what() << " " << (errno ? strerror(errno) : "") << std::endl; + std::cerr << std::endl << "ERROR " << err.what() << " " << (errno ? strerror(errno) : "") << std::endl; std::cerr << "!!!!!!!!!!!!!!!!!!!!!!!!!!!" 
<< std::endl; } @@ -131,13 +128,15 @@ void draw_detection_rectangle(ITensor *tensor, const DetectionWindow &rect, uint { ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(tensor, Format::RGB888); - uint8_t *top = tensor->info()->offset_element_in_bytes(Coordinates(rect.x, rect.y)) + tensor->buffer(); - uint8_t *bottom = tensor->info()->offset_element_in_bytes(Coordinates(rect.x, rect.y + rect.height)) + tensor->buffer(); - uint8_t *left = top; - uint8_t *right = tensor->info()->offset_element_in_bytes(Coordinates(rect.x + rect.width, rect.y)) + tensor->buffer(); - size_t stride = tensor->info()->strides_in_bytes()[Window::DimY]; + uint8_t *top = tensor->info()->offset_element_in_bytes(Coordinates(rect.x, rect.y)) + tensor->buffer(); + uint8_t *bottom = + tensor->info()->offset_element_in_bytes(Coordinates(rect.x, rect.y + rect.height)) + tensor->buffer(); + uint8_t *left = top; + uint8_t *right = + tensor->info()->offset_element_in_bytes(Coordinates(rect.x + rect.width, rect.y)) + tensor->buffer(); + size_t stride = tensor->info()->strides_in_bytes()[Window::DimY]; - for(size_t x = 0; x < rect.width; ++x) + for (size_t x = 0; x < rect.width; ++x) { top[0] = r; top[1] = g; @@ -150,7 +149,7 @@ void draw_detection_rectangle(ITensor *tensor, const DetectionWindow &rect, uint bottom += 3; } - for(size_t y = 0; y < rect.height; ++y) + for (size_t y = 0; y < rect.height; ++y) { left[0] = r; left[1] = g; @@ -176,22 +175,22 @@ ImageType get_image_type_from_file(const std::string &filename) fs.open(filename, std::ios::in | std::ios::binary); // Identify type from magic number - std::array magic_number{ { 0 } }; + std::array magic_number{{0}}; fs >> magic_number[0] >> magic_number[1]; // PPM check - if(static_cast(magic_number[0]) == 'P' && static_cast(magic_number[1]) == '6') + if (static_cast(magic_number[0]) == 'P' && static_cast(magic_number[1]) == '6') { type = ImageType::PPM; } - else if(magic_number[0] == 0xFF && magic_number[1] == 0xD8) + else if (magic_number[0] == 0xFF && magic_number[1] == 0xD8) { type = ImageType::JPEG; } fs.close(); } - catch(std::runtime_error &e) + catch (std::runtime_error &e) { ARM_COMPUTE_ERROR_VAR("Accessing %s: %s", filename.c_str(), e.what()); } @@ -202,7 +201,7 @@ ImageType get_image_type_from_file(const std::string &filename) std::tuple parse_ppm_header(std::ifstream &fs) { // Check the PPM magic number is valid - std::array magic_number{ { 0 } }; + std::array magic_number{{0}}; fs >> magic_number[0] >> magic_number[1]; ARM_COMPUTE_ERROR_ON_MSG(magic_number[0] != 'P' || magic_number[1] != '6', "Invalid file type"); ARM_COMPUTE_UNUSED(magic_number); @@ -238,12 +237,12 @@ npy::header_t parse_npy_header(std::ifstream &fs) //NOLINT // Parse header npy::header_t header = npy::parse_header(header_s); - bool fortran_order = false; - std::vector shape = header.shape; + bool fortran_order = false; + std::vector shape = header.shape; std::reverse(shape.begin(), shape.end()); - return npy::header_t{ header.dtype, fortran_order, shape }; + return npy::header_t{header.dtype, fortran_order, shape}; } /** This function returns the amount of memory free reading from /proc/meminfo @@ -255,15 +254,15 @@ uint64_t get_mem_free_from_meminfo() std::string line_attribute; std::ifstream file_meminfo("/proc/meminfo"); - if(file_meminfo.is_open()) + if (file_meminfo.is_open()) { - while(!(file_meminfo >> line_attribute).fail()) + while (!(file_meminfo >> line_attribute).fail()) { //Test if is the line containing MemFree - if(line_attribute == "MemFree:") + if (line_attribute == "MemFree:") { uint64_t 
mem_available; - if(!(file_meminfo >> mem_available).fail()) + if (!(file_meminfo >> mem_available).fail()) { return mem_available; } diff --git a/utils/Utils.h b/utils/Utils.h index d181022ffe0cfba1d868f0734829b43c30776669..626cbcf07fbbd7c6b17b3f5e30b42325662a4ed8 100644 --- a/utils/Utils.h +++ b/utils/Utils.h @@ -87,9 +87,9 @@ public: return true; }; /** Run the example. */ - virtual void do_run() {}; + virtual void do_run(){}; /** Teardown the example. */ - virtual void do_teardown() {}; + virtual void do_teardown(){}; /** Default destructor. */ virtual ~Example() = default; @@ -117,7 +117,8 @@ int run_example(int argc, char **argv) * @param[in] g Green colour to use * @param[in] b Blue colour to use */ -void draw_detection_rectangle(arm_compute::ITensor *tensor, const arm_compute::DetectionWindow &rect, uint8_t r, uint8_t g, uint8_t b); +void draw_detection_rectangle( + arm_compute::ITensor *tensor, const arm_compute::DetectionWindow &rect, uint8_t r, uint8_t g, uint8_t b); /** Gets image type given a file * @@ -157,7 +158,7 @@ inline std::string get_typestring(DataType data_type) const unsigned int i = 1; const char *c = reinterpret_cast(&i); std::string endianness; - if(*c == 1) + if (*c == 1) { endianness = std::string("<"); } @@ -167,7 +168,7 @@ inline std::string get_typestring(DataType data_type) } const std::string no_endianness("|"); - switch(data_type) + switch (data_type) { case DataType::U8: case DataType::QASYMM8: @@ -253,7 +254,8 @@ inline void unmap(CLTensor &tensor) template class uniform_real_distribution_16bit { - static_assert(std::is_same::value || std::is_same::value, "Only half and bfloat16 data types supported"); + static_assert(std::is_same::value || std::is_same::value, + "Only half and bfloat16 data types supported"); public: using result_type = T; @@ -262,8 +264,7 @@ public: * @param[in] min Minimum value of the distribution * @param[in] max Maximum value of the distribution */ - explicit uniform_real_distribution_16bit(float min = 0.f, float max = 1.0) - : dist(min, max) + explicit uniform_real_distribution_16bit(float min = 0.f, float max = 1.0) : dist(min, max) { } @@ -285,8 +286,7 @@ class NPYLoader { public: /** Default constructor */ - NPYLoader() - : _fs(), _shape(), _fortran_order(false), _typestring(), _file_layout(DataLayout::NCHW) + NPYLoader() : _fs(), _shape(), _fortran_order(false), _typestring(), _file_layout(DataLayout::NCHW) { } @@ -310,7 +310,7 @@ public: _fortran_order = header.fortran_order; _typestring = header.dtype.str(); } - catch(const std::ifstream::failure &e) + catch (const std::ifstream::failure &e) { ARM_COMPUTE_ERROR_VAR("Accessing %s: %s", npy_filename.c_str(), e.what()); } @@ -341,10 +341,10 @@ public: // Use the size of the input NPY tensor TensorShape shape; shape.set_num_dimensions(_shape.size()); - for(size_t i = 0; i < _shape.size(); ++i) + for (size_t i = 0; i < _shape.size(); ++i) { size_t src = i; - if(_fortran_order) + if (_fortran_order) { src = _shape.size() - 1 - i; } @@ -365,7 +365,8 @@ public: void fill_tensor(T &tensor) { ARM_COMPUTE_ERROR_ON(!is_open()); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(&tensor, arm_compute::DataType::QASYMM8, arm_compute::DataType::S32, arm_compute::DataType::F32, arm_compute::DataType::F16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(&tensor, arm_compute::DataType::QASYMM8, arm_compute::DataType::S32, + arm_compute::DataType::F32, arm_compute::DataType::F16); try { // Map buffer if creating a CLTensor @@ -377,7 +378,8 @@ public: const size_t end_position = _fs.tellg(); 
_fs.seekg(current_position, std::ios_base::beg); - ARM_COMPUTE_ERROR_ON_MSG((end_position - current_position) < tensor.info()->tensor_shape().total_size() * tensor.info()->element_size(), + ARM_COMPUTE_ERROR_ON_MSG((end_position - current_position) < + tensor.info()->tensor_shape().total_size() * tensor.info()->element_size(), "Not enough data in file"); ARM_COMPUTE_UNUSED(end_position); @@ -385,12 +387,12 @@ public: std::string expect_typestr = get_typestring(tensor.info()->data_type()); bool enable_f32_to_f16_conversion = false; - if(_typestring != expect_typestr) + if (_typestring != expect_typestr) { const std::string f32_typestring = "data_layout()); // Correct dimensions (Needs to match TensorShape dimension corrections) - if(_shape.size() != tensor.info()->tensor_shape().num_dimensions()) + if (_shape.size() != tensor.info()->tensor_shape().num_dimensions()) { - for(int i = static_cast(_shape.size()) - 1; i > 0; --i) + for (int i = static_cast(_shape.size()) - 1; i > 0; --i) { - if(_shape[i] == 1) + if (_shape[i] == 1) { _shape.pop_back(); } @@ -419,22 +421,28 @@ public: TensorShape permuted_shape = tensor.info()->tensor_shape(); arm_compute::PermutationVector perm; - if(are_layouts_different && tensor.info()->tensor_shape().num_dimensions() > 2) + if (are_layouts_different && tensor.info()->tensor_shape().num_dimensions() > 2) { - perm = (tensor.info()->data_layout() == arm_compute::DataLayout::NHWC) ? arm_compute::PermutationVector(2U, 0U, 1U) : arm_compute::PermutationVector(1U, 2U, 0U); - arm_compute::PermutationVector perm_vec = (tensor.info()->data_layout() == arm_compute::DataLayout::NCHW) ? arm_compute::PermutationVector(2U, 0U, 1U) : arm_compute::PermutationVector(1U, 2U, 0U); + perm = (tensor.info()->data_layout() == arm_compute::DataLayout::NHWC) + ? arm_compute::PermutationVector(2U, 0U, 1U) + : arm_compute::PermutationVector(1U, 2U, 0U); + arm_compute::PermutationVector perm_vec = + (tensor.info()->data_layout() == arm_compute::DataLayout::NCHW) + ? arm_compute::PermutationVector(2U, 0U, 1U) + : arm_compute::PermutationVector(1U, 2U, 0U); arm_compute::permute(permuted_shape, perm_vec); } // Validate tensor shape - ARM_COMPUTE_ERROR_ON_MSG(_shape.size() != tensor.info()->tensor_shape().num_dimensions(), "Tensor ranks mismatch"); - for(size_t i = 0; i < _shape.size(); ++i) + ARM_COMPUTE_ERROR_ON_MSG(_shape.size() != tensor.info()->tensor_shape().num_dimensions(), + "Tensor ranks mismatch"); + for (size_t i = 0; i < _shape.size(); ++i) { ARM_COMPUTE_ERROR_ON_MSG(permuted_shape[i] != _shape[i], "Tensor dimensions mismatch"); } - switch(tensor.info()->data_type()) + switch (tensor.info()->data_type()) { case arm_compute::DataType::QASYMM8: case arm_compute::DataType::S32: @@ -442,7 +450,8 @@ public: case arm_compute::DataType::F16: { // Read data - if(!are_layouts_different && !_fortran_order && tensor.info()->padding().empty() && !enable_f32_to_f16_conversion) + if (!are_layouts_different && !_fortran_order && tensor.info()->padding().empty() && + !enable_f32_to_f16_conversion) { // If tensor has no padding read directly from stream. _fs.read(reinterpret_cast(tensor.buffer()), tensor.info()->total_size()); @@ -452,19 +461,19 @@ public: // If tensor has padding or is in fortran order accessing tensor elements through execution window. 
Window window; const unsigned int num_dims = _shape.size(); - if(_fortran_order) + if (_fortran_order) { - for(unsigned int dim = 0; dim < num_dims; dim++) + for (unsigned int dim = 0; dim < num_dims; dim++) { permuted_shape.set(dim, _shape[num_dims - dim - 1]); perm.set(dim, num_dims - dim - 1); } - if(are_layouts_different) + if (are_layouts_different) { // Permute only if num_dimensions greater than 2 - if(num_dims > 2) + if (num_dims > 2) { - if(_file_layout == DataLayout::NHWC) // i.e destination is NCHW --> permute(1,2,0) + if (_file_layout == DataLayout::NHWC) // i.e destination is NCHW --> permute(1,2,0) { arm_compute::permute(perm, arm_compute::PermutationVector(1U, 2U, 0U)); } @@ -477,22 +486,25 @@ public: } window.use_tensor_dimensions(permuted_shape); - execute_window_loop(window, [&](const Coordinates & id) - { - Coordinates dst(id); - arm_compute::permute(dst, perm); - if(enable_f32_to_f16_conversion) - { - float f32_val = 0; - _fs.read(reinterpret_cast(&f32_val), 4u); - half f16_val = half_float::half_cast(f32_val); - *(reinterpret_cast(tensor.ptr_to_element(dst))) = f16_val; - } - else - { - _fs.read(reinterpret_cast(tensor.ptr_to_element(dst)), tensor.info()->element_size()); - } - }); + execute_window_loop(window, + [&](const Coordinates &id) + { + Coordinates dst(id); + arm_compute::permute(dst, perm); + if (enable_f32_to_f16_conversion) + { + float f32_val = 0; + _fs.read(reinterpret_cast(&f32_val), 4u); + half f16_val = + half_float::half_cast(f32_val); + *(reinterpret_cast(tensor.ptr_to_element(dst))) = f16_val; + } + else + { + _fs.read(reinterpret_cast(tensor.ptr_to_element(dst)), + tensor.info()->element_size()); + } + }); } break; @@ -504,7 +516,7 @@ public: // Unmap buffer if creating a CLTensor unmap(tensor); } - catch(const std::ifstream::failure &e) + catch (const std::ifstream::failure &e) { ARM_COMPUTE_ERROR_VAR("Loading NPY file: %s", e.what()); } @@ -543,13 +555,12 @@ void save_to_ppm(T &tensor, const std::string &ppm_filename) const unsigned int width = tensor.info()->tensor_shape()[0]; const unsigned int height = tensor.info()->tensor_shape()[1]; - fs << "P6\n" - << width << " " << height << " 255\n"; + fs << "P6\n" << width << " " << height << " 255\n"; // Map buffer if creating a CLTensor map(tensor, true); - switch(tensor.info()->format()) + switch (tensor.info()->format()) { case arm_compute::Format::U8: { @@ -559,13 +570,15 @@ void save_to_ppm(T &tensor, const std::string &ppm_filename) arm_compute::Iterator in(&tensor, window); - arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates &) - { - const unsigned char value = *in.ptr(); + arm_compute::execute_window_loop( + window, + [&](const arm_compute::Coordinates &) + { + const unsigned char value = *in.ptr(); - fs << value << value << value; - }, - in); + fs << value << value << value; + }, + in); break; } @@ -577,11 +590,13 @@ void save_to_ppm(T &tensor, const std::string &ppm_filename) arm_compute::Iterator in(&tensor, window); - arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates &) - { - fs.write(reinterpret_cast(in.ptr()), width * tensor.info()->element_size()); - }, - in); + arm_compute::execute_window_loop( + window, + [&](const arm_compute::Coordinates &) { + fs.write(reinterpret_cast(in.ptr()), + width * tensor.info()->element_size()); + }, + in); break; } @@ -592,7 +607,7 @@ void save_to_ppm(T &tensor, const std::string &ppm_filename) // Unmap buffer if creating a CLTensor unmap(tensor); } - catch(const std::ofstream::failure &e) + catch (const 
std::ofstream::failure &e) { ARM_COMPUTE_ERROR_VAR("Writing %s: (%s)", ppm_filename.c_str(), e.what()); } @@ -620,7 +635,7 @@ void save_to_npy(T &tensor, const std::string &npy_filename, bool fortran_order) std::vector shape(tensor.info()->num_dimensions()); - for(unsigned int i = 0, j = tensor.info()->num_dimensions() - 1; i < tensor.info()->num_dimensions(); ++i, --j) + for (unsigned int i = 0, j = tensor.info()->num_dimensions() - 1; i < tensor.info()->num_dimensions(); ++i, --j) { shape[i] = tensor.info()->tensor_shape()[!fortran_order ? j : i]; } @@ -634,7 +649,7 @@ void save_to_npy(T &tensor, const std::string &npy_filename, bool fortran_order) const npy::dtype_t dtype = npy::dtype_map.at(std::type_index(typeid(tmp))); std::ofstream stream(npy_filename, std::ofstream::binary); - npy::header_t header{ dtype, fortran_order, shape }; + npy::header_t header{dtype, fortran_order, shape}; npy::write_header(stream, header); arm_compute::Window window; @@ -642,16 +657,16 @@ void save_to_npy(T &tensor, const std::string &npy_filename, bool fortran_order) arm_compute::Iterator in(&tensor, window); - arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates &) - { - stream.write(reinterpret_cast(in.ptr()), sizeof(typestring_type)); - }, - in); + arm_compute::execute_window_loop( + window, + [&](const arm_compute::Coordinates &) + { stream.write(reinterpret_cast(in.ptr()), sizeof(typestring_type)); }, + in); // Unmap buffer if creating a CLTensor unmap(tensor); } - catch(const std::ofstream::failure &e) + catch (const std::ofstream::failure &e) { ARM_COMPUTE_ERROR_VAR("Writing %s: (%s)", npy_filename.c_str(), e.what()); } @@ -675,7 +690,7 @@ void load_trained_data(T &tensor, const std::string &filename) // Open file fs.open(filename, std::ios::in | std::ios::binary); - if(!fs.good()) + if (!fs.good()) { throw std::runtime_error("Could not load binary data: " + filename); } @@ -687,23 +702,26 @@ void load_trained_data(T &tensor, const std::string &filename) window.set(arm_compute::Window::DimX, arm_compute::Window::Dimension(0, 1, 1)); - for(unsigned int d = 1; d < tensor.info()->num_dimensions(); ++d) + for (unsigned int d = 1; d < tensor.info()->num_dimensions(); ++d) { window.set(d, Window::Dimension(0, tensor.info()->tensor_shape()[d], 1)); } arm_compute::Iterator in(&tensor, window); - execute_window_loop(window, [&](const Coordinates &) - { - fs.read(reinterpret_cast(in.ptr()), tensor.info()->tensor_shape()[0] * tensor.info()->element_size()); - }, - in); + execute_window_loop( + window, + [&](const Coordinates &) + { + fs.read(reinterpret_cast(in.ptr()), + tensor.info()->tensor_shape()[0] * tensor.info()->element_size()); + }, + in); // Unmap buffer if creating a CLTensor unmap(tensor); } - catch(const std::ofstream::failure &e) + catch (const std::ofstream::failure &e) { ARM_COMPUTE_ERROR_VAR("Writing %s: (%s)", filename.c_str(), e.what()); } @@ -718,11 +736,8 @@ void fill_tensor_value(TensorType &tensor, T value) window.use_tensor_dimensions(tensor.info()->tensor_shape()); Iterator it_tensor(&tensor, window); - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast(it_tensor.ptr()) = value; - }, - it_tensor); + execute_window_loop( + window, [&](const Coordinates &) { *reinterpret_cast(it_tensor.ptr()) = value; }, it_tensor); unmap(tensor); } @@ -745,22 +760,23 @@ void fill_tensor_vector(TensorType &tensor, std::vector vec) int i = 0; Iterator it_tensor(&tensor, window); - execute_window_loop(window, [&](const Coordinates &) - { - 
*reinterpret_cast(it_tensor.ptr()) = vec.at(i++); - }, - it_tensor); + execute_window_loop( + window, [&](const Coordinates &) { *reinterpret_cast(it_tensor.ptr()) = vec.at(i++); }, it_tensor); unmap(tensor); } template -void fill_random_tensor(TensorType &tensor, std::random_device::result_type seed, T lower_bound = std::numeric_limits::lowest(), T upper_bound = std::numeric_limits::max()) +void fill_random_tensor(TensorType &tensor, + std::random_device::result_type seed, + T lower_bound = std::numeric_limits::lowest(), + T upper_bound = std::numeric_limits::max()) { constexpr bool is_fp_16bit = std::is_same::value || std::is_same::value; constexpr bool is_integral = std::is_integral::value && !is_fp_16bit; - using fp_dist_type = typename std::conditional, std::uniform_real_distribution>::type; + using fp_dist_type = typename std::conditional, + std::uniform_real_distribution>::type; using dist_type = typename std::conditional, fp_dist_type>::type; std::mt19937 gen(seed); @@ -772,17 +788,16 @@ void fill_random_tensor(TensorType &tensor, std::random_device::result_type seed window.use_tensor_dimensions(tensor.info()->tensor_shape()); Iterator it(&tensor, window); - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast(it.ptr()) = dist(gen); - }, - it); + execute_window_loop( + window, [&](const Coordinates &) { *reinterpret_cast(it.ptr()) = dist(gen); }, it); unmap(tensor); } template -void fill_random_tensor(TensorType &tensor, T lower_bound = std::numeric_limits::lowest(), T upper_bound = std::numeric_limits::max()) +void fill_random_tensor(TensorType &tensor, + T lower_bound = std::numeric_limits::lowest(), + T upper_bound = std::numeric_limits::max()) { std::random_device rd; fill_random_tensor(tensor, rd(), lower_bound, upper_bound); @@ -791,7 +806,8 @@ void fill_random_tensor(TensorType &tensor, T lower_bound = std::numeric_limits< template void init_sgemm_output(T &dst, T &src0, T &src1, arm_compute::DataType dt) { - dst.allocator()->init(TensorInfo(TensorShape(src1.info()->dimension(0), src0.info()->dimension(1), src0.info()->dimension(2)), 1, dt)); + dst.allocator()->init(TensorInfo( + TensorShape(src1.info()->dimension(0), src0.info()->dimension(1), src0.info()->dimension(2)), 1, dt)); } /** This function returns the amount of memory free reading from /proc/meminfo * @@ -823,14 +839,16 @@ int compare_tensor(ITensor &tensor1, ITensor &tensor2, T tolerance) Iterator itensor1(&tensor1, window); Iterator itensor2(&tensor2, window); - execute_window_loop(window, [&](const Coordinates &) - { - if(std::abs(*reinterpret_cast(itensor1.ptr()) - *reinterpret_cast(itensor2.ptr())) > tolerance) + execute_window_loop( + window, + [&](const Coordinates &) { - ++num_mismatches; - } - }, - itensor1, itensor2); + if (std::abs(*reinterpret_cast(itensor1.ptr()) - *reinterpret_cast(itensor2.ptr())) > tolerance) + { + ++num_mismatches; + } + }, + itensor1, itensor2); unmap(itensor1); unmap(itensor2); diff --git a/utils/command_line/CommandLineParser.h b/utils/command_line/CommandLineParser.h index 523f25e8a1f6011eb0e351f8aab283c9d929226d..57796bce73243ce12ac84e36cfb0298388ae610f 100644 --- a/utils/command_line/CommandLineParser.h +++ b/utils/command_line/CommandLineParser.h @@ -24,14 +24,13 @@ #ifndef ARM_COMPUTE_UTILS_COMMANDLINEPARSER #define ARM_COMPUTE_UTILS_COMMANDLINEPARSER -#include "Option.h" #include "arm_compute/core/utils/misc/Utility.h" +#include "Option.h" #include #include #include #include -#include #include #include #include @@ -56,7 +55,7 @@ public: * @return 
Pointer to the option. The option is owned by the parser. */ template - T *add_option(const std::string &name, As &&... args); + T *add_option(const std::string &name, As &&...args); /** Function to add a new positional argument to the parser. * @@ -65,7 +64,7 @@ public: * @return Pointer to the option. The option is owned by the parser. */ template - T *add_positional_option(As &&... args); + T *add_positional_option(As &&...args); /** Parses the command line arguments and updates the options accordingly. * @@ -101,14 +100,14 @@ private: }; template -inline T *CommandLineParser::add_option(const std::string &name, As &&... args) +inline T *CommandLineParser::add_option(const std::string &name, As &&...args) { auto result = _options.emplace(name, std::make_unique(name, std::forward(args)...)); return static_cast(result.first->second.get()); } template -inline T *CommandLineParser::add_positional_option(As &&... args) +inline T *CommandLineParser::add_positional_option(As &&...args) { _positional_options.emplace_back(std::make_unique(std::forward(args)...)); return static_cast(_positional_options.back().get()); @@ -116,11 +115,11 @@ inline T *CommandLineParser::add_positional_option(As &&... args) inline void CommandLineParser::parse(int argc, char **argv) { - const std::regex option_regex{ "--((?:no-)?)([^=]+)(?:=(.*))?" }; + const std::regex option_regex{"--((?:no-)?)([^=]+)(?:=(.*))?"}; - const auto set_option = [&](const std::string & option, const std::string & name, const std::string & value) + const auto set_option = [&](const std::string &option, const std::string &name, const std::string &value) { - if(_options.find(name) == _options.end()) + if (_options.find(name) == _options.end()) { _unknown_options.push_back(option); return; @@ -128,7 +127,7 @@ inline void CommandLineParser::parse(int argc, char **argv) const bool success = _options[name]->parse(value); - if(!success) + if (!success) { _invalid_options.push_back(option); } @@ -136,26 +135,27 @@ inline void CommandLineParser::parse(int argc, char **argv) unsigned int positional_index = 0; - for(int i = 1; i < argc; ++i) + for (int i = 1; i < argc; ++i) { - std::string mixed_case_opt{ argv[i] }; + std::string mixed_case_opt{argv[i]}; int equal_sign = mixed_case_opt.find('='); int pos = (equal_sign == -1) ? strlen(argv[i]) : equal_sign; - const std::string option = arm_compute::utility::tolower(mixed_case_opt.substr(0, pos)) + mixed_case_opt.substr(pos); - std::smatch option_matches; + const std::string option = + arm_compute::utility::tolower(mixed_case_opt.substr(0, pos)) + mixed_case_opt.substr(pos); + std::smatch option_matches; - if(std::regex_match(option, option_matches, option_regex)) + if (std::regex_match(option, option_matches, option_regex)) { // Boolean option - if(option_matches.str(3).empty()) + if (option_matches.str(3).empty()) { set_option(option, option_matches.str(2), option_matches.str(1).empty() ? 
"true" : "false"); } else { // Can't have "no-" and a value - if(!option_matches.str(1).empty()) + if (!option_matches.str(1).empty()) { _invalid_options.emplace_back(option); } @@ -167,7 +167,7 @@ inline void CommandLineParser::parse(int argc, char **argv) } else { - if(positional_index >= _positional_options.size()) + if (positional_index >= _positional_options.size()) { _invalid_options.push_back(mixed_case_opt); } @@ -184,30 +184,30 @@ inline bool CommandLineParser::validate() const { bool is_valid = true; - for(const auto &option : _options) + for (const auto &option : _options) { - if(option.second->is_required() && !option.second->is_set()) + if (option.second->is_required() && !option.second->is_set()) { is_valid = false; std::cerr << "ERROR: Option '" << option.second->name() << "' is required but not given!\n"; } } - for(const auto &option : _positional_options) + for (const auto &option : _positional_options) { - if(option->is_required() && !option->is_set()) + if (option->is_required() && !option->is_set()) { is_valid = false; std::cerr << "ERROR: Option '" << option->name() << "' is required but not given!\n"; } } - for(const auto &option : _unknown_options) + for (const auto &option : _unknown_options) { std::cerr << "WARNING: Skipping unknown option '" << option << "'!\n"; } - for(const auto &option : _invalid_options) + for (const auto &option : _invalid_options) { std::cerr << "WARNING: Skipping invalid option '" << option << "'!\n"; } @@ -219,19 +219,19 @@ inline void CommandLineParser::print_help(const std::string &program_name) const { std::cout << "usage: " << program_name << " \n"; - for(const auto &option : _options) + for (const auto &option : _options) { std::cout << option.second->help() << "\n"; } - for(const auto &option : _positional_options) + for (const auto &option : _positional_options) { std::string help_to_print; // Extract help sub-string const std::string help_str = option->help(); const size_t help_pos = help_str.find(" - "); - if(help_pos != std::string::npos) + if (help_pos != std::string::npos) { help_to_print = help_str.substr(help_pos); } diff --git a/utils/command_line/EnumListOption.h b/utils/command_line/EnumListOption.h index f4ee2835286dec5f72330ab75b266ee3a8a5aa4f..6c4146fa7544462f55ff48c74b59c5dfd659446b 100644 --- a/utils/command_line/EnumListOption.h +++ b/utils/command_line/EnumListOption.h @@ -25,7 +25,6 @@ #define ARM_COMPUTE_UTILS_ENUMLISTOPTION #include "Option.h" - #include #include #include @@ -57,7 +56,7 @@ public: */ EnumListOption(std::string name, std::set allowed_values, std::initializer_list &&default_values); - bool parse(std::string value) override; + bool parse(std::string value) override; std::string help() const override; /** Get the values of the option. 
@@ -73,13 +72,17 @@ private: template inline EnumListOption::EnumListOption(std::string name, std::set allowed_values) - : Option{ std::move(name) }, _allowed_values{ std::move(allowed_values) } + : Option{std::move(name)}, _allowed_values{std::move(allowed_values)} { } template -inline EnumListOption::EnumListOption(std::string name, std::set allowed_values, std::initializer_list &&default_values) - : Option{ std::move(name), false, true }, _values{ std::forward>(default_values) }, _allowed_values{ std::move(allowed_values) } +inline EnumListOption::EnumListOption(std::string name, + std::set allowed_values, + std::initializer_list &&default_values) + : Option{std::move(name), false, true}, + _values{std::forward>(default_values)}, + _allowed_values{std::move(allowed_values)} { } @@ -90,10 +93,10 @@ bool EnumListOption::parse(std::string value) _values.clear(); _is_set = true; - std::stringstream stream{ value }; + std::stringstream stream{value}; std::string item; - while(!std::getline(stream, item, ',').fail()) + while (!std::getline(stream, item, ',').fail()) { try { @@ -102,9 +105,9 @@ bool EnumListOption::parse(std::string value) item_stream >> typed_value; - if(!item_stream.fail()) + if (!item_stream.fail()) { - if(_allowed_values.count(typed_value) == 0) + if (_allowed_values.count(typed_value) == 0) { _is_set = false; continue; @@ -115,7 +118,7 @@ bool EnumListOption::parse(std::string value) _is_set = _is_set && !item_stream.fail(); } - catch(const std::invalid_argument &) + catch (const std::invalid_argument &) { _is_set = false; } @@ -130,7 +133,7 @@ std::string EnumListOption::help() const std::stringstream msg; msg << "--" + name() + "={"; - for(const auto &value : _allowed_values) + for (const auto &value : _allowed_values) { msg << value << ","; } diff --git a/utils/command_line/EnumOption.h b/utils/command_line/EnumOption.h index 6bcfe5f14e50364ce5214828a756394646fd73e5..eb43b6c54e4c235f8e5dfba03b223151bc28a2cf 100644 --- a/utils/command_line/EnumOption.h +++ b/utils/command_line/EnumOption.h @@ -25,7 +25,6 @@ #define ARM_COMPUTE_UTILS_ENUMOPTION #include "SimpleOption.h" - #include #include #include @@ -55,7 +54,7 @@ public: */ EnumOption(std::string name, std::set allowed_values, T default_value); - bool parse(std::string value) override; + bool parse(std::string value) override; std::string help() const override; /** Get the selected value. 
@@ -70,13 +69,13 @@ private: template inline EnumOption::EnumOption(std::string name, std::set allowed_values) - : SimpleOption{ std::move(name) }, _allowed_values{ std::move(allowed_values) } + : SimpleOption{std::move(name)}, _allowed_values{std::move(allowed_values)} { } template inline EnumOption::EnumOption(std::string name, std::set allowed_values, T default_value) - : SimpleOption{ std::move(name), std::move(default_value) }, _allowed_values{ std::move(allowed_values) } + : SimpleOption{std::move(name), std::move(default_value)}, _allowed_values{std::move(allowed_values)} { } @@ -85,14 +84,14 @@ bool EnumOption::parse(std::string value) { try { - std::stringstream stream{ value }; + std::stringstream stream{value}; T typed_value{}; stream >> typed_value; - if(!stream.fail()) + if (!stream.fail()) { - if(_allowed_values.count(typed_value) == 0) + if (_allowed_values.count(typed_value) == 0) { return false; } @@ -104,7 +103,7 @@ bool EnumOption::parse(std::string value) return false; } - catch(const std::invalid_argument &) + catch (const std::invalid_argument &) { return false; } @@ -116,7 +115,7 @@ std::string EnumOption::help() const std::stringstream msg; msg << "--" + this->name() + "={"; - for(const auto &value : _allowed_values) + for (const auto &value : _allowed_values) { msg << value << ","; } diff --git a/utils/command_line/ListOption.h b/utils/command_line/ListOption.h index b290191e084bec561b385357f04dad55f98e984f..f318e1646a3e9afd281ea2e29cff105ec83f5519 100644 --- a/utils/command_line/ListOption.h +++ b/utils/command_line/ListOption.h @@ -25,7 +25,6 @@ #define ARM_COMPUTE_UTILS_LISTOPTION #include "Option.h" - #include #include #include @@ -50,7 +49,7 @@ public: */ ListOption(std::string name, std::initializer_list &&default_values); - bool parse(std::string value) override; + bool parse(std::string value) override; std::string help() const override; /** Get the list of option values. 
@@ -65,7 +64,7 @@ private:
 
 template <typename T>
 inline ListOption<T>::ListOption(std::string name, std::initializer_list<T> &&default_values)
-    : Option{ std::move(name), false, true }, _values{ std::forward<std::initializer_list<T>>(default_values) }
+    : Option{std::move(name), false, true}, _values{std::forward<std::initializer_list<T>>(default_values)}
 {
 }
 
@@ -76,17 +75,17 @@ bool ListOption<T>::parse(std::string value)
 
     try
     {
-        std::stringstream stream{ value };
+        std::stringstream stream{value};
         std::string       item;
 
-        while(!std::getline(stream, item, ',').fail())
+        while (!std::getline(stream, item, ',').fail())
         {
             std::stringstream item_stream(item);
             T                 typed_value{};
 
             item_stream >> typed_value;
 
-            if(!item_stream.fail())
+            if (!item_stream.fail())
             {
                 _values.emplace_back(typed_value);
             }
@@ -96,7 +95,7 @@ bool ListOption<T>::parse(std::string value)
 
         return _is_set;
     }
-    catch(const std::invalid_argument &)
+    catch (const std::invalid_argument &)
     {
         return false;
     }
diff --git a/utils/command_line/Option.h b/utils/command_line/Option.h
index c845e5499fab9bc7367898e7535eaa4a72862df9..b4288538b03d7084595e99c3c91e8e8f782e1a3d 100644
--- a/utils/command_line/Option.h
+++ b/utils/command_line/Option.h
@@ -97,18 +97,17 @@ public:
 
 protected:
     std::string _name;
-    bool        _is_required{ false };
-    bool        _is_set{ false };
+    bool        _is_required{false};
+    bool        _is_set{false};
     std::string _help{};
 };
 
-inline Option::Option(std::string name)
-    : _name{ std::move(name) }
+inline Option::Option(std::string name) : _name{std::move(name)}
 {
 }
 
 inline Option::Option(std::string name, bool is_required, bool is_set)
-    : _name{ std::move(name) }, _is_required{ is_required }, _is_set{ is_set }
+    : _name{std::move(name)}, _is_required{is_required}, _is_set{is_set}
 {
 }
 
diff --git a/utils/command_line/SimpleOption.h b/utils/command_line/SimpleOption.h
index d76797375de43d12be32f1c45852a31cf8f18b7d..f6329c1790a8dee60435141f9116b5321fc321e8 100644
--- a/utils/command_line/SimpleOption.h
+++ b/utils/command_line/SimpleOption.h
@@ -25,7 +25,6 @@
 #define ARM_COMPUTE_UTILS_SIMPLEOPTION
 
 #include "Option.h"
-
 #include <sstream>
 #include <stdexcept>
 #include <string>
@@ -74,7 +73,7 @@ protected:
 
 template <typename T>
 inline SimpleOption<T>::SimpleOption(std::string name, T default_value)
-    : Option{ std::move(name), false, true }, _value{ std::move(default_value) }
+    : Option{std::move(name), false, true}, _value{std::move(default_value)}
 {
 }
 
@@ -83,12 +82,12 @@ bool SimpleOption<T>::parse(std::string value)
 {
     try
     {
-        std::stringstream stream{ std::move(value) };
+        std::stringstream stream{std::move(value)};
         stream >> _value;
         _is_set = !stream.fail();
         return _is_set;
     }
-    catch(const std::invalid_argument &)
+    catch (const std::invalid_argument &)
     {
         return false;
     }
diff --git a/utils/command_line/ToggleOption.h b/utils/command_line/ToggleOption.h
index d3c68663b591b9da04835cfe838e2b8ab71e7bd3..694b7bb9e6059479579fd3985ffb160502c12750 100644
--- a/utils/command_line/ToggleOption.h
+++ b/utils/command_line/ToggleOption.h
@@ -25,7 +25,6 @@
 #define ARM_COMPUTE_UTILS_TOGGLEOPTION
 
 #include "SimpleOption.h"
-
 #include <string>
 
 namespace arm_compute
@@ -45,26 +44,23 @@ public:
      */
     ToggleOption(std::string name, bool default_value);
 
-    bool parse(std::string value) override;
+    bool        parse(std::string value) override;
     std::string help() const override;
 };
 
 inline ToggleOption::ToggleOption(std::string name, bool default_value)
-    : SimpleOption<bool>
-{
-    std::move(name), default_value
-}
+    : SimpleOption<bool>{std::move(name), default_value}
 {
 }
 
 inline bool ToggleOption::parse(std::string value)
 {
-    if(value == "true")
+    if (value == "true")
     {
         _value  = true;
         _is_set = true;
     }
-    else if(value == "false")
+    else if (value == "false")
     {
         _value  = false;
         _is_set = true;